PyPI - tico - Versions diffs - 0.2.0.dev260411__tar.gz → 0.2.0.dev260415__tar.gz - Mend

tico 0.2.0.dev260411__tar.gz → 0.2.0.dev260415__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (333) hide show

{tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: tico
-Version: 0.2.0.dev260411
+Version: 0.2.0.dev260415
 Summary: Convert Exported Torch Module To Circle
 License: This file provides full text of licenses used in this project

tico-0.2.0.dev260415/tico/_version.py ADDED Viewed

@@ -0,0 +1 @@
+__version__ = "0.2.0.dev260415"

{tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/algorithm/gptq/quantizer.py RENAMED Viewed

@@ -28,6 +28,11 @@ from tico.quantization.algorithm.gptq.utils import (
 from tico.quantization.config.gptq import GPTQConfig
 from tico.quantization.quantizer import BaseQuantizer
 from tico.quantization.quantizer_registry import register_quantizer
+from tico.utils.utils import move_to_device
+def move_to_cpu(obj):
+    return move_to_device(obj, "cpu")
 class StopForward(Exception):
@@ -118,12 +123,12 @@ class GPTQQuantizer(BaseQuantizer):
             for idx, item in enumerate(args):
                 if (idx + 1) > len(self.cache_args):
                     self.cache_args.append([])
-                self.cache_args[idx].append(item)
+                self.cache_args[idx].append(move_to_cpu(item))
             # Store keyword args
             for k, v in kwargs.items():
                 if self.cache_kwargs.get(k, None) is None:
                     self.cache_kwargs[k] = []
-                self.cache_kwargs[k].append(v)
+                self.cache_kwargs[k].append(move_to_cpu(v))
             self.num_batches += 1
             raise StopForward  # stop after the first layer
@@ -280,6 +285,7 @@ class GPTQQuantizer(BaseQuantizer):
                 # Run layer forward over all cached batches to build Hessian/statistics
                 batch_num = self.num_batches
+                device = next(model.parameters()).device
                 for batch_idx in tqdm(
                     range(batch_num),
                     desc=f"[L{l_idx}] collecting",
@@ -290,9 +296,13 @@ class GPTQQuantizer(BaseQuantizer):
                     cache_args_batch = gather_single_batch_from_list(
                         self.cache_args, batch_idx
                     )
+                    cache_args_batch = move_to_device(cache_args_batch, device)
                     cache_kwargs_batch = gather_single_batch_from_dict(
                         self.cache_kwargs, batch_idx
                     )
+                    cache_kwargs_batch = move_to_device(cache_kwargs_batch, device)
                     layer(*cache_args_batch, **cache_kwargs_batch)
                 # Remove handles
@@ -314,6 +324,7 @@ class GPTQQuantizer(BaseQuantizer):
                     gptq[name].free()
             # 4) After quantization, re-run the layer to produce outputs for the next layer
+            device = next(model.parameters()).device
             for batch_idx in tqdm(
                 range(batch_num),
                 desc=f"[L{l_idx}] re-forward",
@@ -324,9 +335,13 @@ class GPTQQuantizer(BaseQuantizer):
                 cache_args_batch = gather_single_batch_from_list(
                     self.cache_args, batch_idx
                 )
+                cache_args_batch = move_to_device(cache_args_batch, device)
                 cache_kwargs_batch = gather_single_batch_from_dict(
                     self.cache_kwargs, batch_idx
                 )
+                cache_kwargs_batch = move_to_device(cache_kwargs_batch, device)
                 outs = layer(*cache_args_batch, **cache_kwargs_batch)
                 # LLaMA's decoder layer return type differs across Transformers versions:
                 # some return a tuple (hidden_states, ...), others return just a tensor.
@@ -334,7 +349,14 @@ class GPTQQuantizer(BaseQuantizer):
                 outs = outs[0] if isinstance(outs, tuple) else outs
                 # Update inputs for next iteration.
                 if len(self.cache_args) > 0:
-                    self.cache_args[0][batch_idx] = outs
+                    if hasattr(outs, "to") and hasattr(
+                        self.cache_args[0][batch_idx], "device"
+                    ):
+                        self.cache_args[0][batch_idx] = outs.to(
+                            self.cache_args[0][batch_idx].device
+                        )
+                    else:
+                        self.cache_args[0][batch_idx] = outs
             if torch.cuda.is_available():
                 torch.cuda.empty_cache()

tico-0.2.0.dev260415/tico/quantization/wrapq/examples/evaluate_fk_llama_model.py ADDED Viewed

@@ -0,0 +1,156 @@
+# Copyright (c) 2026 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+import torch
+from lm_eval.utils import make_table
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from tico.quantization.evaluation.script.llm_tasks_eval import evaluate_llm_on_tasks
+DTYPE_MAP = {
+    "float32": torch.float32,
+    # TODO Support more dtypes
+    # "bfloat16": torch.bfloat16,
+    # "float16": torch.float16,
+}
+def main():
+    parser = argparse.ArgumentParser(
+        description="Evaluate a fake-quantized Llama model"
+    )
+    parser.add_argument(
+        "--model", type=str, required=True, help="HF repo name or local path."
+    )
+    parser.add_argument(
+        "--device",
+        type=str,
+        default="cuda" if torch.cuda.is_available() else "cpu",
+        help="Device to run on (cuda|cpu|mps).",
+    )
+    parser.add_argument(
+        "--dtype",
+        choices=list(DTYPE_MAP.keys()),
+        default="float32",
+        help="Model dtype for load.",
+    )
+    parser.add_argument(
+        "--hf-token",
+        type=str,
+        default=None,
+        help="Optional HF token for gated/private repos.",
+    )
+    parser.add_argument(
+        "--trust-remote-code",
+        action="store_true",
+        help="Enable only if you trust the model repo code.",
+    )
+    parser.add_argument(
+        "--cache_dir",
+        type=str,
+        default=None,
+        help="cache_dir for using model/datasets loading",
+    )
+    parser.add_argument(
+        "--fk_model_path", type=str, required=True, help="Path to fake_quantized model"
+    )
+    parser.add_argument(
+        "--eval_tasks",
+        type=str,
+        default=None,
+        help="tasks to be evaluated using lm_eval, e.g. `winogrande,arc_easy,arc_challenge,openbookqa,mmlu_pro,ifeval,bbh`",
+    )
+    parser.add_argument(
+        "--skip_fp_eval",
+        action="store_true",
+        help="Skip original model evaluation.",
+    )
+    args = parser.parse_args()
+    print(args)
+    # -------------------------------------------------------------------------
+    # Basic setup
+    # -------------------------------------------------------------------------
+    device = torch.device(args.device)
+    dtype = DTYPE_MAP[args.dtype]
+    print("=== Config ===")
+    print(f"Model            : {args.model}")
+    print(f"Device           : {device.type}")
+    print(f"DType            : {args.dtype}")
+    print(f"fk_model_path    : {args.fk_model_path}")
+    print()
+    tokenizer = AutoTokenizer.from_pretrained(
+        args.model,
+        trust_remote_code=args.trust_remote_code,
+        token=args.hf_token,
+        cache_dir=args.cache_dir,
+    )
+    if not args.skip_fp_eval:
+        # -------------------------------------------------------------------------
+        # FP model evaluation
+        # -------------------------------------------------------------------------
+        print("Loading FP model …")
+        model = (
+            AutoModelForCausalLM.from_pretrained(
+                args.model,
+                dtype=dtype,
+                trust_remote_code=args.trust_remote_code,
+                token=args.hf_token,
+                cache_dir=args.cache_dir,
+            )
+            .cpu()
+            .eval()
+        )
+        if args.eval_tasks is not None:
+            config = model.config
+            max_seq_len = config.max_position_embeddings
+            results = evaluate_llm_on_tasks(
+                model, tokenizer, args.eval_tasks, max_length=max_seq_len
+            )
+            print("Original RESULTS ARE:")
+            print(make_table(results))
+        model = model.cpu()
+        if device.type == "cuda" and torch.cuda.is_available():
+            torch.cuda.empty_cache()
+    # -------------------------------------------------------------------------
+    # FK model evaluation
+    # -------------------------------------------------------------------------
+    print("Loading fake quantized model …")
+    fk_model = torch.load(args.fk_model_path, weights_only=False).eval().to(args.device)
+    if args.eval_tasks is not None:
+        config = fk_model.wrapped.config
+        max_seq_len = config.max_position_embeddings
+        results = evaluate_llm_on_tasks(
+            fk_model, tokenizer, args.eval_tasks, max_length=max_seq_len
+        )
+        print("Quantized RESULTS ARE:")
+        print(make_table(results))
+if __name__ == "__main__":
+    main()

{tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/wrapq/examples/quantize_full_qmodel_with_gptq.py RENAMED Viewed

@@ -217,9 +217,43 @@ def evaluate(q_m, tokenizer, dataset_test, args):
         print(make_table(results))
+def get_sensitivities_info_name(model, dataset, seed, n_samples):
+    model_name = model.config.name_or_path.replace("/", "_")
+    name = (
+        "."
+        + "/sensitivities_for_"
+        + model_name
+        + "_"
+        + dataset
+        + "_"
+        + str(n_samples)
+        + "_"
+        + str(seed)
+        + ".pt"
+    )
+    return name
+def get_ptq_model_name(model, args):
+    model_name = model.config.name_or_path.replace("/", "_")
+    name = (
+        f"PTQ_{model_name}_"
+        + ("SpinQuant_" if args.no_spinquant is False else "")
+        + ("GPTQ_" if args.no_GPTQ is False else "")
+        + (f"{args.gptq_mse}_" if args.no_GPTQ is False else "")
+        + str(args.nsamples_for_qcalibration)
+        + "_"
+        + str(args.seed)
+        + ".pt"
+    )
+    return name
 def main():
     parser = argparse.ArgumentParser(
-        description="GPTQ+PTQ pipeline (weight-only + activation)"
+        description="GPTQ+PTQ pipeline (weight-only + activation)",
     )
     parser.add_argument(
         "--model", type=str, required=True, help="HF repo name or local path."
@@ -270,16 +304,17 @@ def main():
         help="Leave model float",
     )
     parser.add_argument(
-        "--save_circle_to_folder",
+        "--output_dir",
         type=str,
         default=None,
-        help="Save the whole model to the folder specified",
+        help="Save specified artifacts to output_dir",
     )
     parser.add_argument(
-        "--save_layers_to_folder",
+        "--save",
+        nargs="*",
         type=str,
-        default=None,
-        help="Save all layers to the folder specified",
+        choices=["circle_full", "circle_per_layer", "ptq_checkpoint", "sensitivity"],
+        help="which artifacts should be saved to output_dir",
     )
     parser.add_argument(
         "--cache_dir",
@@ -439,6 +474,13 @@ def main():
             else:
                 calibrator = SensitivityCalibrator(model, calib_inputs)
                 sens = calibrator.compute_sensitivity_info()
+                if args.output_dir is not None and "sensitivity" in args.save:
+                    save_name = get_sensitivities_info_name(
+                        model, "wikitext", args.seed, len(calib_inputs)
+                    )
+                    save_path = pathlib.Path(args.output_dir, save_name)
+                    print(f"Saving calibrated_sensitivities to {save_path}")
+                    torch.save(sens, save_path)
         gptq_config = GPTQConfig(
             weight_bits=args.linear_weight_bits,
@@ -461,15 +503,21 @@ def main():
     if not args.no_PTQ:
         q_m = quantize_using_PTQ(q_m, calib_inputs, args)
+        if args.output_dir is not None and "ptq_checkpoint" in args.save:
+            save_name = get_ptq_model_name(model, args)
+            save_path = pathlib.Path(args.output_dir, save_name)
+            print(f"Saving PTQ model to {save_path}")
+            torch.save(q_m, save_path)
     # after PTQ quantizer only fixed-length input sequences are valid
     evaluate(q_m, tokenizer, dataset_test, args)
-    if args.save_layers_to_folder is not None:
-        save_layers_to(q_m, args.max_seq_len, args.save_layers_to_folder)
+    if args.output_dir is not None and "circle_per_layer" in args.save:
+        save_layers_to(q_m, args.max_seq_len, args.output_dir)
-    if args.save_circle_to_folder is not None:
+    if args.output_dir is not None and "circle_full" in args.save:
         calib_inputs = list(torch.stack(calib_inputs).reshape(-1, 1, args.max_seq_len))
-        save_model_to(q_m, calib_inputs, args.save_circle_to_folder)
+        save_model_to(q_m, calib_inputs, args.output_dir)
 if __name__ == "__main__":

tico 0.2.0.dev260411__tar.gz → 0.2.0.dev260415__tar.gz

tico 0.2.0.dev260411__tar.gz → 0.2.0.dev260415__tar.gz