tico 0.1.0.dev250924__py3-none-any.whl → 0.1.0.dev251109__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of tico might be problematic.
- tico/__init__.py +1 -1
- tico/quantization/__init__.py +6 -0
- tico/quantization/algorithm/fpi_gptq/fpi_gptq.py +161 -0
- tico/quantization/algorithm/fpi_gptq/quantizer.py +179 -0
- tico/{experimental/quantization → quantization}/algorithm/gptq/gptq.py +24 -3
- tico/{experimental/quantization → quantization}/algorithm/gptq/quantizer.py +12 -6
- tico/{experimental/quantization → quantization}/algorithm/pt2e/annotation/annotator.py +6 -8
- tico/{experimental/quantization → quantization}/algorithm/pt2e/annotation/op/adaptive_avg_pool2d.py +4 -6
- tico/{experimental/quantization → quantization}/algorithm/pt2e/annotation/op/add.py +4 -6
- tico/{experimental/quantization → quantization}/algorithm/pt2e/annotation/op/conv2d.py +4 -6
- tico/{experimental/quantization → quantization}/algorithm/pt2e/annotation/op/div.py +4 -6
- tico/{experimental/quantization → quantization}/algorithm/pt2e/annotation/op/linear.py +4 -6
- tico/{experimental/quantization → quantization}/algorithm/pt2e/annotation/op/mean.py +4 -6
- tico/{experimental/quantization → quantization}/algorithm/pt2e/annotation/op/mul.py +4 -6
- tico/{experimental/quantization → quantization}/algorithm/pt2e/annotation/op/relu6.py +4 -6
- tico/{experimental/quantization → quantization}/algorithm/pt2e/annotation/op/rsqrt.py +4 -6
- tico/{experimental/quantization → quantization}/algorithm/pt2e/annotation/op/sub.py +4 -6
- tico/{experimental/quantization → quantization}/algorithm/pt2e/annotation/spec.py +1 -3
- tico/{experimental/quantization → quantization}/algorithm/pt2e/annotation/utils.py +1 -1
- tico/{experimental/quantization → quantization}/algorithm/pt2e/quantizer.py +4 -4
- tico/{experimental/quantization → quantization}/algorithm/pt2e/utils.py +1 -3
- tico/{experimental/quantization → quantization}/algorithm/smoothquant/quantizer.py +6 -10
- tico/quantization/config/fpi_gptq.py +29 -0
- tico/{experimental/quantization → quantization}/config/gptq.py +1 -1
- tico/{experimental/quantization → quantization}/config/pt2e.py +1 -1
- tico/{experimental/quantization/ptq/quant_config.py → quantization/config/ptq.py} +18 -10
- tico/{experimental/quantization → quantization}/config/smoothquant.py +1 -1
- tico/{experimental/quantization → quantization}/evaluation/evaluate.py +6 -12
- tico/{experimental/quantization → quantization}/evaluation/executor/circle_executor.py +1 -3
- tico/{experimental/quantization → quantization}/evaluation/executor/triv24_executor.py +2 -4
- tico/{experimental/quantization → quantization}/evaluation/utils.py +1 -1
- tico/{experimental/quantization → quantization}/public_interface.py +7 -7
- tico/{experimental/quantization → quantization}/quantizer.py +1 -1
- tico/{experimental/quantization → quantization}/quantizer_registry.py +11 -10
- tico/{experimental/quantization/ptq → quantization/wrapq}/examples/compare_ppl.py +8 -19
- tico/{experimental/quantization/ptq → quantization/wrapq}/examples/debug_quant_outputs.py +9 -24
- tico/{experimental/quantization/ptq → quantization/wrapq}/examples/quantize_linear.py +11 -10
- tico/{experimental/quantization/ptq → quantization/wrapq}/examples/quantize_llama_attn.py +10 -12
- tico/{experimental/quantization/ptq → quantization/wrapq}/examples/quantize_llama_decoder_layer.py +10 -9
- tico/{experimental/quantization/ptq → quantization/wrapq}/examples/quantize_llama_mlp.py +13 -13
- tico/{experimental/quantization/ptq → quantization/wrapq}/examples/quantize_with_gptq.py +14 -35
- tico/{experimental/quantization/ptq → quantization/wrapq}/observers/affine_base.py +3 -3
- tico/{experimental/quantization/ptq → quantization/wrapq}/observers/base.py +2 -2
- tico/{experimental/quantization/ptq → quantization/wrapq}/observers/ema.py +2 -2
- tico/{experimental/quantization/ptq → quantization/wrapq}/observers/identity.py +1 -1
- tico/{experimental/quantization/ptq → quantization/wrapq}/observers/minmax.py +2 -2
- tico/{experimental/quantization/ptq → quantization/wrapq}/observers/mx.py +1 -1
- tico/quantization/wrapq/quantizer.py +179 -0
- tico/{experimental/quantization/ptq → quantization/wrapq}/utils/introspection.py +3 -5
- tico/{experimental/quantization/ptq → quantization/wrapq}/utils/metrics.py +3 -2
- tico/{experimental/quantization/ptq → quantization/wrapq}/wrappers/fairseq/__init__.py +1 -1
- tico/{experimental/quantization/ptq → quantization/wrapq}/wrappers/fairseq/quant_decoder.py +6 -8
- tico/{experimental/quantization/ptq → quantization/wrapq}/wrappers/fairseq/quant_decoder_layer.py +6 -8
- tico/{experimental/quantization/ptq → quantization/wrapq}/wrappers/fairseq/quant_encoder.py +6 -8
- tico/{experimental/quantization/ptq → quantization/wrapq}/wrappers/fairseq/quant_encoder_layer.py +6 -8
- tico/{experimental/quantization/ptq → quantization/wrapq}/wrappers/fairseq/quant_mha.py +5 -7
- tico/{experimental/quantization/ptq → quantization/wrapq}/wrappers/llama/quant_attn.py +5 -7
- tico/{experimental/quantization/ptq → quantization/wrapq}/wrappers/llama/quant_decoder_layer.py +8 -12
- tico/{experimental/quantization/ptq → quantization/wrapq}/wrappers/llama/quant_mlp.py +5 -7
- tico/quantization/wrapq/wrappers/nn/__init__.py +1 -0
- tico/{experimental/quantization/ptq → quantization/wrapq}/wrappers/nn/quant_layernorm.py +6 -7
- tico/{experimental/quantization/ptq → quantization/wrapq}/wrappers/nn/quant_linear.py +7 -8
- tico/{experimental/quantization/ptq → quantization/wrapq}/wrappers/nn/quant_silu.py +8 -9
- tico/{experimental/quantization/ptq → quantization/wrapq}/wrappers/ptq_wrapper.py +4 -6
- tico/{experimental/quantization/ptq → quantization/wrapq}/wrappers/quant_elementwise.py +55 -17
- tico/{experimental/quantization/ptq → quantization/wrapq}/wrappers/quant_module_base.py +10 -9
- tico/{experimental/quantization/ptq → quantization/wrapq}/wrappers/registry.py +17 -16
- tico/utils/convert.py +9 -14
- {tico-0.1.0.dev250924.dist-info → tico-0.1.0.dev251109.dist-info}/METADATA +48 -2
- {tico-0.1.0.dev250924.dist-info → tico-0.1.0.dev251109.dist-info}/RECORD +113 -108
- tico/experimental/quantization/__init__.py +0 -6
- /tico/{experimental/quantization → quantization}/algorithm/__init__.py +0 -0
- /tico/{experimental/quantization/algorithm/gptq → quantization/algorithm/fpi_gptq}/__init__.py +0 -0
- /tico/{experimental/quantization/algorithm/pt2e → quantization/algorithm/gptq}/__init__.py +0 -0
- /tico/{experimental/quantization → quantization}/algorithm/gptq/quant.py +0 -0
- /tico/{experimental/quantization → quantization}/algorithm/gptq/utils.py +0 -0
- /tico/{experimental/quantization/algorithm/pt2e/annotation → quantization/algorithm/pt2e}/__init__.py +0 -0
- /tico/{experimental/quantization/algorithm/pt2e/transformation → quantization/algorithm/pt2e/annotation}/__init__.py +0 -0
- /tico/{experimental/quantization → quantization}/algorithm/pt2e/annotation/config.py +0 -0
- /tico/{experimental/quantization → quantization}/algorithm/pt2e/annotation/op/__init__.py +0 -0
- /tico/{experimental/quantization/algorithm/smoothquant → quantization/algorithm/pt2e/transformation}/__init__.py +0 -0
- /tico/{experimental/quantization → quantization}/algorithm/pt2e/transformation/convert_scalars_to_attrs.py +0 -0
- /tico/{experimental/quantization/config → quantization/algorithm/smoothquant}/__init__.py +0 -0
- /tico/{experimental/quantization → quantization}/algorithm/smoothquant/observer.py +0 -0
- /tico/{experimental/quantization → quantization}/algorithm/smoothquant/smooth_quant.py +0 -0
- /tico/{experimental/quantization/evaluation → quantization/config}/__init__.py +0 -0
- /tico/{experimental/quantization → quantization}/config/base.py +0 -0
- /tico/{experimental/quantization/evaluation/executor → quantization/evaluation}/__init__.py +0 -0
- /tico/{experimental/quantization → quantization}/evaluation/backend.py +0 -0
- /tico/{experimental/quantization/passes → quantization/evaluation/executor}/__init__.py +0 -0
- /tico/{experimental/quantization → quantization}/evaluation/executor/backend_executor.py +0 -0
- /tico/{experimental/quantization → quantization}/evaluation/metric.py +0 -0
- /tico/{experimental/quantization/ptq → quantization/passes}/__init__.py +0 -0
- /tico/{experimental/quantization → quantization}/passes/fold_quant_ops.py +0 -0
- /tico/{experimental/quantization → quantization}/passes/insert_quantize_on_dtype_mismatch.py +0 -0
- /tico/{experimental/quantization → quantization}/passes/propagate_qparam_backward.py +0 -0
- /tico/{experimental/quantization → quantization}/passes/propagate_qparam_forward.py +0 -0
- /tico/{experimental/quantization → quantization}/passes/quantize_bias.py +0 -0
- /tico/{experimental/quantization → quantization}/passes/remove_weight_dequant_op.py +0 -0
- /tico/{experimental/quantization/ptq/examples → quantization/wrapq}/__init__.py +0 -0
- /tico/{experimental/quantization/ptq → quantization/wrapq}/dtypes.py +0 -0
- /tico/{experimental/quantization/ptq/observers → quantization/wrapq/examples}/__init__.py +0 -0
- /tico/{experimental/quantization/ptq → quantization/wrapq}/mode.py +0 -0
- /tico/{experimental/quantization/ptq/utils → quantization/wrapq/observers}/__init__.py +0 -0
- /tico/{experimental/quantization/ptq → quantization/wrapq}/qscheme.py +0 -0
- /tico/{experimental/quantization/ptq/wrappers → quantization/wrapq/utils}/__init__.py +0 -0
- /tico/{experimental/quantization/ptq → quantization/wrapq}/utils/reduce_utils.py +0 -0
- /tico/{experimental/quantization/ptq/wrappers/llama → quantization/wrapq/wrappers}/__init__.py +0 -0
- /tico/{experimental/quantization/ptq → quantization/wrapq}/wrappers/fairseq/decoder_export_single_step.py +0 -0
- /tico/{experimental/quantization/ptq/wrappers/nn → quantization/wrapq/wrappers/llama}/__init__.py +0 -0
- {tico-0.1.0.dev250924.dist-info → tico-0.1.0.dev251109.dist-info}/LICENSE +0 -0
- {tico-0.1.0.dev250924.dist-info → tico-0.1.0.dev251109.dist-info}/WHEEL +0 -0
- {tico-0.1.0.dev250924.dist-info → tico-0.1.0.dev251109.dist-info}/entry_points.txt +0 -0
- {tico-0.1.0.dev250924.dist-info → tico-0.1.0.dev251109.dist-info}/top_level.txt +0 -0
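The common thread in the renames above is that the quantization package moves out of `tico.experimental` and the PTQ helpers move from `ptq` to `wrapq`. A minimal before/after import sketch, assuming only paths that appear in this diff (the old-layout lines are shown commented out):

```python
# Old layout (0.1.0.dev250924), removed in this release:
# from tico.experimental.quantization.ptq.quant_config import QuantConfig
# from tico.experimental.quantization.ptq.wrappers.ptq_wrapper import PTQWrapper

# New layout (0.1.0.dev251109):
from tico.quantization import convert, prepare
from tico.quantization.config.ptq import PTQConfig  # replaces QuantConfig
from tico.quantization.wrapq.wrappers.ptq_wrapper import PTQWrapper
```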
@@ -18,13 +18,13 @@ import torch
 
 from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e
 
-from tico.experimental.quantization.algorithm.pt2e.annotation.annotator import (
+from tico.quantization.algorithm.pt2e.annotation.annotator import (
     get_asymmetric_quantization_config,
     PT2EAnnotator,
 )
-from tico.experimental.quantization.config.pt2e import PT2EConfig
-from tico.experimental.quantization.quantizer import BaseQuantizer
-from tico.experimental.quantization.quantizer_registry import register_quantizer
+from tico.quantization.config.pt2e import PT2EConfig
+from tico.quantization.quantizer import BaseQuantizer
+from tico.quantization.quantizer_registry import register_quantizer
 
 
 @register_quantizer(PT2EConfig)
@@ -20,9 +20,7 @@ import torch
 from torch.ao.quantization.quantizer import QuantizationSpec
 from torch.ao.quantization.quantizer.utils import _get_module_name_filter
 
-from tico.experimental.quantization.algorithm.pt2e.annotation.config import (
-    QuantizationConfig,
-)
+from tico.quantization.algorithm.pt2e.annotation.config import QuantizationConfig
 
 
 def get_module_type_filter(tp: Callable):
@@ -16,16 +16,12 @@ from typing import Any, Dict, Optional
 
 import torch
 
-from tico.experimental.quantization.algorithm.smoothquant.observer import (
-    ChannelwiseMaxActsObserver,
-)
-
-from tico.experimental.quantization.algorithm.smoothquant.smooth_quant import (
-    apply_smoothing,
-)
-from tico.experimental.quantization.config.smoothquant import SmoothQuantConfig
-from tico.experimental.quantization.quantizer import BaseQuantizer
-from tico.experimental.quantization.quantizer_registry import register_quantizer
+from tico.quantization.algorithm.smoothquant.observer import ChannelwiseMaxActsObserver
+
+from tico.quantization.algorithm.smoothquant.smooth_quant import apply_smoothing
+from tico.quantization.config.smoothquant import SmoothQuantConfig
+from tico.quantization.quantizer import BaseQuantizer
+from tico.quantization.quantizer_registry import register_quantizer
 
 
 @register_quantizer(SmoothQuantConfig)
@@ -0,0 +1,29 @@
+# Copyright (c) 2025 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from tico.quantization.config.gptq import GPTQConfig
+
+
+class FPIGPTQConfig(GPTQConfig):
+    """
+    Configuration for FPIGPTQ (Fixed Point Iteration).
+    """
+
+    def __init__(self, verbose: bool = False, show_progress: bool = True):
+        self.verbose = verbose
+        self.show_progress = show_progress
+
+    @property
+    def name(self) -> str:
+        return "fpi_gptq"
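For reference, a small usage sketch of the new config added above; the import path follows the file location shown in the file list, and the model/calibration objects are placeholders rather than anything from this diff:

```python
from tico.quantization.config.fpi_gptq import FPIGPTQConfig

cfg = FPIGPTQConfig(verbose=True)   # subclass of GPTQConfig
assert cfg.name == "fpi_gptq"       # the name used by the lazy quantizer lookup

# A model would then go through the usual flow (placeholders, not from this diff):
#   prepared = tico.quantization.prepare(model, cfg)
#   ...run calibration data through `prepared`...
#   quantized = tico.quantization.convert(prepared)
```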
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from tico.experimental.quantization.config.base import BaseConfig
+from tico.quantization.config.base import BaseConfig
 
 
 class GPTQConfig(BaseConfig):
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from tico.experimental.quantization.config.base import BaseConfig
+from tico.quantization.config.base import BaseConfig
 
 
 class PT2EConfig(BaseConfig):
@@ -15,14 +15,15 @@
 from dataclasses import dataclass, field
 from typing import Any, Dict, Mapping, Type
 
-from tico.experimental.quantization.ptq.dtypes import DType
-from tico.experimental.quantization.ptq.observers.base import ObserverBase
-from tico.experimental.quantization.ptq.observers.minmax import MinMaxObserver
-from tico.experimental.quantization.ptq.qscheme import QScheme
+from tico.quantization.config.base import BaseConfig
+from tico.quantization.wrapq.dtypes import DType
+from tico.quantization.wrapq.observers.base import ObserverBase
+from tico.quantization.wrapq.observers.minmax import MinMaxObserver
+from tico.quantization.wrapq.qscheme import QScheme
 
 
 @dataclass
-class QuantConfig:
+class PTQConfig(BaseConfig):
     """
     One object describes the quantization preferences for a single wrapper
     and its descendants.

@@ -54,9 +55,9 @@ class QuantConfig:
     Example
     -------
     ```python
-    from ptq.observers import PercentileObserver
+    from wrapq.observers import PercentileObserver
 
-    cfg = QuantConfig(
+    cfg = PTQConfig(
         default_dtype = DType.uint(8),
         default_qscheme = QScheme.PER_TENSOR_SYMM,  # <- global scheme
         default_observer = PercentileObserver,      # <- global algorithm

@@ -74,6 +75,12 @@ class QuantConfig:
     default_observer: Type[ObserverBase] = MinMaxObserver
     default_qscheme: QScheme = QScheme.PER_TENSOR_ASYMM
     overrides: Mapping[str, Mapping[str, Any]] = field(default_factory=dict)
+    # If True, any module that cannot be wrapped will raise.
+    strict_wrap: bool = True
+
+    @property
+    def name(self) -> str:
+        return "ptq"
 
     def get_kwargs(self, obs_name: str) -> Dict[str, Any]:
         """

@@ -87,7 +94,7 @@ class QuantConfig:
         """
         return dict(self.overrides.get(obs_name, {}))
 
-    def child(self, scope: str) -> "QuantConfig":
+    def child(self, scope: str) -> "PTQConfig":
         """
         Produce a *view* for a child wrapper.
 

@@ -100,12 +107,13 @@ class QuantConfig:
         Other scopes remain invisible to the child.
         """
         sub_overrides = self.overrides.get(scope, {})
-        return QuantConfig(
+        return PTQConfig(
             self.default_dtype,
             self.default_observer,
             default_qscheme=self.default_qscheme,
             overrides=sub_overrides,
+            strict_wrap=self.strict_wrap,
         )
 
     def __repr__(self):
-        return f"QuantConfig(default_dtype={self.default_dtype}, default_observer={self.default_observer}, default_qscheme={self.default_qscheme}, overrides={dict(self.overrides)})"
+        return f"PTQConfig(default_dtype={self.default_dtype}, default_observer={self.default_observer}, default_qscheme={self.default_qscheme}, overrides={dict(self.overrides)}, strict_wrap={self.strict_wrap})"
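Putting the renamed config together, a sketch built only from the fields and imports visible in the hunks above (the override scope name "layer0" is illustrative):

```python
from tico.quantization.config.ptq import PTQConfig
from tico.quantization.wrapq.dtypes import DType
from tico.quantization.wrapq.qscheme import QScheme

cfg = PTQConfig(
    default_dtype=DType.uint(8),
    default_qscheme=QScheme.PER_TENSOR_SYMM,
    strict_wrap=True,              # new field: raise on modules that cannot be wrapped
)
layer_cfg = cfg.child("layer0")    # scoped view handed to a child wrapper
print(cfg.name)                    # "ptq" -> selects tico.quantization.wrapq.quantizer
```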
@@ -20,18 +20,12 @@ import torch
 from circle_schema import circle
 from torch.utils import _pytree as pytree
 
-from tico.experimental.quantization.evaluation.backend import BACKEND
-from tico.experimental.quantization.evaluation.executor.backend_executor import (
-    BackendExecutor,
-)
-from tico.experimental.quantization.evaluation.executor.circle_executor import (
-    CircleExecutor,
-)
-from tico.experimental.quantization.evaluation.executor.triv24_executor import (
-    Triv24Executor,
-)
-from tico.experimental.quantization.evaluation.metric import MetricCalculator
-from tico.experimental.quantization.evaluation.utils import (
+from tico.quantization.evaluation.backend import BACKEND
+from tico.quantization.evaluation.executor.backend_executor import BackendExecutor
+from tico.quantization.evaluation.executor.circle_executor import CircleExecutor
+from tico.quantization.evaluation.executor.triv24_executor import Triv24Executor
+from tico.quantization.evaluation.metric import MetricCalculator
+from tico.quantization.evaluation.utils import (
     ensure_list,
     find_invalid_types,
     get_graph_input_output,
@@ -19,9 +19,7 @@ from typing import List
 import numpy as np
 import torch
 
-from tico.experimental.quantization.evaluation.executor.backend_executor import (
-    BackendExecutor,
-)
+from tico.quantization.evaluation.executor.backend_executor import BackendExecutor
 from tico.utils.model import CircleModel
 from tico.utils.utils import run_bash_cmd
 
@@ -20,10 +20,8 @@ import numpy as np
 import torch
 from circle_schema import circle
 
-from tico.experimental.quantization.evaluation.executor.backend_executor import (
-    BackendExecutor,
-)
-from tico.experimental.quantization.evaluation.utils import (
+from tico.quantization.evaluation.executor.backend_executor import BackendExecutor
+from tico.quantization.evaluation.utils import (
     dequantize,
     get_graph_input_output,
     quantize,
@@ -44,7 +44,7 @@ def quantize(
     data = np.array(data)
     # Perfrom quantization
     if not scale:
-        logger.warn("WARNING: scale value is 0. 1e-7 will be used instead.")
+        logger.warning("WARNING: scale value is 0. 1e-7 will be used instead.")
         scale = 1e-7
     rescaled = np.round(data / scale) + zero_point
     # Clamp the values
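The arithmetic in the `quantize()` helper above is plain affine quantization; a standalone NumPy illustration with arbitrarily chosen values:

```python
import numpy as np

data = np.array([-0.5, 0.0, 0.5, 1.0], dtype=np.float32)
scale, zero_point = 0.0078125, 128                # example uint8 qparams

# round(x / scale) + zero_point, then clamp to the integer range (cf. the hunk above)
q = np.clip(np.round(data / scale) + zero_point, 0, 255).astype(np.uint8)
deq = (q.astype(np.float32) - zero_point) * scale
print(q)    # 64, 128, 192, 255  (1.0 saturates at 255)
print(deq)  # -0.5, 0.0, 0.5, 0.9921875
```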
@@ -17,11 +17,11 @@ from typing import Any, Dict, Optional
 
 import torch
 
-from tico.experimental.quantization.algorithm.gptq.quantizer import GPTQQuantizer
-from tico.experimental.quantization.algorithm.pt2e.quantizer import PT2EQuantizer
-from tico.experimental.quantization.config.base import BaseConfig
-from tico.experimental.quantization.quantizer import BaseQuantizer
-from tico.experimental.quantization.quantizer_registry import get_quantizer
+from tico.quantization.algorithm.gptq.quantizer import GPTQQuantizer
+from tico.quantization.algorithm.pt2e.quantizer import PT2EQuantizer
+from tico.quantization.config.base import BaseConfig
+from tico.quantization.quantizer import BaseQuantizer
+from tico.quantization.quantizer_registry import get_quantizer
 
 
 QUANTIZER_ATTRIBUTE_NAME = "tico_quantizer"

@@ -32,7 +32,7 @@ def prepare(
     quant_config: BaseConfig,
     args: Optional[Any] = None,
     kwargs: Optional[Dict[str, Any]] = None,
-    inplace: Optional[bool] = False,
+    inplace: Optional[bool] = True,
 ):
     """
     Prepare the model for quantization using the provided configuration.

@@ -68,7 +68,7 @@ def prepare(
     return model
 
 
-def convert(model, inplace: Optional[bool] = False):
+def convert(model, inplace: Optional[bool] = True):
     """
     Convert the prepared model to a quantized model using the provided configuration.
 
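With in-place preparation and conversion now the default, the flow used by the example scripts in this release looks roughly as follows; the toy module is illustrative only, and only `prepare`, `convert`, and `PTQConfig` come from this diff:

```python
import torch

from tico.quantization import convert, prepare
from tico.quantization.config.ptq import PTQConfig

model = torch.nn.Sequential(torch.nn.Linear(16, 16), torch.nn.ReLU())  # stand-in model
model[0] = prepare(model[0], PTQConfig())   # wrap the Linear for calibration

with torch.no_grad():                       # single calibration pass collects ranges
    for _ in range(16):
        model(torch.randn(4, 16))

convert(model[0])                           # freeze (scale, zero-point)
```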
@@ -15,8 +15,8 @@
 import importlib
 from typing import Dict, Optional, Type, TypeVar
 
-from tico.experimental.quantization.config.base import BaseConfig
-from tico.experimental.quantization.quantizer import BaseQuantizer
+from tico.quantization.config.base import BaseConfig
+from tico.quantization.quantizer import BaseQuantizer
 
 TQ = TypeVar("TQ", bound=BaseQuantizer)
 

@@ -53,14 +53,15 @@ def get_quantizer(cfg: BaseConfig) -> BaseQuantizer:
     # Lazy import by naming convention
     name = getattr(cfg, "name", None)
     if name:
-        try:
-            importlib.import_module(
-                f"tico.experimental.quantization.algorithm.{name}.quantizer"
-            )
-        except Exception as e:
-            raise RuntimeError(
-                f"Failed to import quantizer module for config name='{name}': {e}"
-            )
+        if name == "ptq":
+            importlib.import_module(f"tico.quantization.wrapq.quantizer")
+        else:
+            try:
+                importlib.import_module(f"tico.quantization.algorithm.{name}.quantizer")
+            except Exception as e:
+                raise RuntimeError(
+                    f"Failed to import quantizer module for config name='{name}': {e}"
+                )
 
     qcls = _lookup(cfg)
     if qcls is not None:
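The lookup above resolves a quantizer module purely from the config's `name`; the naming convention reduces to the mapping below (the helper function is only illustrative, not part of tico):

```python
# Sketch of the convention in the hunk above: "ptq" is special-cased, every other
# config name maps to tico.quantization.algorithm.<name>.quantizer.
def quantizer_module_for(name: str) -> str:
    if name == "ptq":
        return "tico.quantization.wrapq.quantizer"
    return f"tico.quantization.algorithm.{name}.quantizer"

print(quantizer_module_for("ptq"))       # tico.quantization.wrapq.quantizer
print(quantizer_module_for("fpi_gptq"))  # tico.quantization.algorithm.fpi_gptq.quantizer
```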
@@ -22,16 +22,15 @@
 
 import argparse
 import sys
-from typing import Optional
 
 import torch
 import tqdm
 from datasets import load_dataset
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
-from tico.experimental.quantization.ptq.quant_config import QuantConfig
-from tico.experimental.quantization.ptq.utils.metrics import perplexity
-from tico.experimental.quantization.ptq.wrappers.ptq_wrapper import PTQWrapper
+from tico.quantization import convert, prepare
+from tico.quantization.config.ptq import PTQConfig
+from tico.quantization.wrapq.utils.metrics import perplexity
 
 
 # Token-budget presets for activation calibration

@@ -165,13 +164,8 @@ def main():
     # ---------------------------------------------------------------------
     # 2. Wrap every Transformer layer with PTQWrapper
     # ---------------------------------------------------------------------
-    qcfg = QuantConfig()
-
-    wrapped_layers = torch.nn.ModuleList()
-    for idx, layer in enumerate(uint8_model.model.layers):
-        layer_cfg = qcfg.child(f"layer{idx}")
-        wrapped_layers.append(PTQWrapper(layer, qcfg=layer_cfg))
-    uint8_model.model.layers = wrapped_layers
+    qcfg = PTQConfig()  # all-uint8 defaults
+    prepare(uint8_model, qcfg)
 
     # ---------------------------------------------------------------------
     # 3. Single-pass activation calibration

@@ -182,11 +176,7 @@ def main():
     )[:CALIB_TOKENS]
     ids = tokenizer(calib_txt, return_tensors="pt").input_ids.to(device)
 
-    # (a) Enable CALIB mode on every QuantModuleBase
-    for l in uint8_model.model.layers:
-        l.enable_calibration()
-
-    # (b) run inference to collect ranges
+    # Run inference to collect ranges
     iterator = range(0, ids.size(1) - 1, args.stride)
     if not args.no_tqdm:
         iterator = tqdm.tqdm(iterator, desc="Calibration")

@@ -194,9 +184,8 @@
     for i in iterator:
         uint8_model(ids[:, i : i + args.stride])
 
-    # Freeze (scale, zero-point)
-    for l in uint8_model.model.layers:
-        l.freeze_qparams()
+    # Freeze (scale, zero-point)
+    convert(uint8_model)
 
     # -------------------------------------------------------------------------
     # 4. Evaluate perplexity
@@ -38,13 +38,14 @@ import tqdm
 from datasets import load_dataset
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
-from tico.experimental.quantization.ptq.quant_config import QuantConfig
-from tico.experimental.quantization.ptq.utils.introspection import (
+from tico.quantization import convert, prepare
+from tico.quantization.config.ptq import PTQConfig
+from tico.quantization.wrapq.utils.introspection import (
     build_fqn_map,
     compare_layer_outputs,
     save_fp_outputs,
 )
-from tico.experimental.quantization.ptq.wrappers.ptq_wrapper import PTQWrapper
+from tico.quantization.wrapq.wrappers.ptq_wrapper import PTQWrapper
 
 # Token-budget presets for activation calibration
 TOKENS: dict[str, int] = {

@@ -176,19 +177,8 @@ def main():
     # 2. Wrap every layer with PTQWrapper (UINT-8 activations)
     # -------------------------------------------------------------------------
     print("Wrapping layers with PTQWrapper …")
-    qcfg = QuantConfig()
-
-    new_layers = torch.nn.ModuleList()
-    for idx, fp_layer in enumerate(model.model.layers):
-        layer_cfg = qcfg.child(f"layer{idx}")
-        q_layer = PTQWrapper(
-            fp_layer,
-            qcfg=layer_cfg,
-            fp_name=m_to_fqn.get(fp_layer),
-        )
-        new_layers.append(q_layer)
-
-    model.model.layers = new_layers  # swap in quant wrappers
+    qcfg = PTQConfig()  # default: per-tensor UINT8
+    prepare(model, qcfg)
 
     # -------------------------------------------------------------------------
     # 3. Activation calibration plus FP-vs-UINT8 diffing

@@ -197,10 +187,6 @@ def main():
     calib_txt = " ".join(dataset["text"])[:CALIB_TOKENS]
     ids = tokenizer(calib_txt, return_tensors="pt").input_ids.to(device)
 
-    # (a) Enable CALIB mode on every QuantModuleBase
-    for l in model.model.layers:
-        l.enable_calibration()
-
     # Save reference FP activations before observers clamp/quantize
     save_handles, act_cache = save_fp_outputs(model)
 

@@ -216,11 +202,10 @@
     for h in save_handles:
         h.remove()
 
-    # Freeze (scale, zero-point) after calibration
-    for l in model.model.layers:
-        l.freeze_qparams()
+    # Freeze (scale, zero-point) after calibration
+    convert(model)
 
-    # Register diff hooks
+    # Register diff hooks and measure per-layer deltas
     cmp_handles = compare_layer_outputs(model, act_cache, metrics=["diff", "peir"])
     # Use same inputs for comparison.
     with torch.no_grad():
@@ -29,13 +29,15 @@ import pathlib
 import torch
 import torch.nn as nn
 
-from tico.experimental.quantization.evaluation.metric import compute_peir
-from tico.experimental.quantization.evaluation.utils import plot_two_outputs
-from tico.experimental.quantization.ptq.mode import Mode
-from tico.experimental.quantization.ptq.quant_config import QuantConfig
-from tico.experimental.quantization.ptq.wrappers.nn.quant_linear import QuantLinear
+from tico.quantization import convert, prepare
+from tico.quantization.config.ptq import PTQConfig
+from tico.quantization.evaluation.metric import compute_peir
+from tico.quantization.evaluation.utils import plot_two_outputs
+from tico.quantization.wrapq.mode import Mode
+from tico.quantization.wrapq.wrappers.nn.quant_linear import QuantLinear
 from tico.utils.utils import SuppressWarning
 
+
 # -------------------------------------------------------------------------
 # 0. Define a toy model (1 Linear layer only)
 # -------------------------------------------------------------------------

@@ -60,20 +62,19 @@ fp32_layer = model.fc
 # -------------------------------------------------------------------------
 # 1. Replace the Linear with QuantLinear wrapper
 # -------------------------------------------------------------------------
-model.fc = QuantLinear(fp32_layer)
-# model.fc = PTQWrapper(fp32_layer) (Wrapping helper class)
+model.fc = prepare(fp32_layer, PTQConfig())  # type: ignore[assignment]
 qlayer = model.fc  # alias for brevity
 
 # -------------------------------------------------------------------------
 # 2. Single-pass calibration (collect activation ranges)
 # -------------------------------------------------------------------------
-assert isinstance(qlayer, QuantLinear)
+assert isinstance(qlayer.wrapped, QuantLinear)
 with torch.no_grad():
-    qlayer.enable_calibration()
     for _ in range(16):  # small toy batch
         x = torch.randn(4, 16)  # (batch=4, features=16)
         _ = model(x)
-    qlayer.freeze_qparams()
+
+convert(qlayer)
 
 assert qlayer._mode is Mode.QUANT, "Quantization mode should be active now."
 
@@ -17,13 +17,12 @@ import pathlib
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
-from tico.experimental.quantization.evaluation.metric import compute_peir
-from tico.experimental.quantization.evaluation.utils import plot_two_outputs
-from tico.experimental.quantization.ptq.mode import Mode
-from tico.experimental.quantization.ptq.quant_config import QuantConfig
-from tico.experimental.quantization.ptq.wrappers.llama.quant_attn import (
-    QuantLlamaAttention,
-)
+from tico.quantization import convert, prepare
+from tico.quantization.config.ptq import PTQConfig
+from tico.quantization.evaluation.metric import compute_peir
+from tico.quantization.evaluation.utils import plot_two_outputs
+from tico.quantization.wrapq.mode import Mode
+from tico.quantization.wrapq.wrappers.llama.quant_attn import QuantLlamaAttention
 from tico.utils.utils import SuppressWarning
 
 name = "Maykeye/TinyLLama-v0"

@@ -34,12 +33,11 @@ tokenizer = AutoTokenizer.from_pretrained(name)
 # 1. Replace layer-0’s MLP with QuantLlamaMLP
 # -------------------------------------------------------------------------
 orig_attn = model.model.layers[0].self_attn
-model.model.layers[0].self_attn = QuantLlamaAttention(
-    orig_attn
-)  # PTQWrapper(orig_attn) is also fine
+model.model.layers[0].self_attn = prepare(orig_attn, PTQConfig())
 model.eval()
 
 attn_q = model.model.layers[0].self_attn  # quant wrapper
+assert isinstance(attn_q.wrapped, QuantLlamaAttention)
 rotary = model.model.rotary_emb
 
 # -------------------------------------------------------------------------

@@ -55,7 +53,6 @@ PROMPTS = [
 ]
 
 with torch.no_grad():
-    attn_q.enable_calibration()
     for prompt in PROMPTS:
         ids = tokenizer(prompt, return_tensors="pt")
         embeds = model.model.embed_tokens(ids["input_ids"])

@@ -63,7 +60,8 @@ with torch.no_grad():
         S = cos_sin[0].shape[1]
         float_mask = torch.zeros(1, 1, S, S)
         _ = attn_q(embeds, cos_sin)  # observers collect
-    attn_q.freeze_qparams()
+
+convert(attn_q)
 
 assert attn_q._mode is Mode.QUANT, "Quantization mode should be active now."
 
tico/{experimental/quantization/ptq → quantization/wrapq}/examples/quantize_llama_decoder_layer.py
RENAMED
@@ -31,10 +31,12 @@ import pathlib
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
-from tico.experimental.quantization.evaluation.metric import compute_peir
-from tico.experimental.quantization.evaluation.utils import plot_two_outputs
-from tico.experimental.quantization.ptq.mode import Mode
-from tico.experimental.quantization.ptq.wrappers.llama.quant_decoder_layer import (
+from tico.quantization import convert, prepare
+from tico.quantization.config.ptq import PTQConfig
+from tico.quantization.evaluation.metric import compute_peir
+from tico.quantization.evaluation.utils import plot_two_outputs
+from tico.quantization.wrapq.mode import Mode
+from tico.quantization.wrapq.wrappers.llama.quant_decoder_layer import (
     QuantLlamaDecoderLayer,
 )
 from tico.utils.utils import SuppressWarning

@@ -50,12 +52,11 @@ rotary = model.model.rotary_emb # RoPE helper
 # 1. Swap in the quant wrapper
 # -------------------------------------------------------------------------
 fp32_layer = model.model.layers[0]  # keep a reference for diff check
-model.model.layers[0] = QuantLlamaDecoderLayer(
-    fp32_layer
-)  # PTQWrapper(fp32_layer) is also fine
+model.model.layers[0] = prepare(fp32_layer, PTQConfig())
 model.eval()
 
 qlayer = model.model.layers[0]  # alias for brevity
+assert isinstance(qlayer.wrapped, QuantLlamaDecoderLayer)
 
 # -------------------------------------------------------------------------
 # 2. Single-pass calibration (gather activation ranges)

@@ -70,7 +71,6 @@ PROMPTS = [
 ]
 
 with torch.no_grad():
-    qlayer.enable_calibration()
     for prompt in PROMPTS:
         ids = tokenizer(prompt, return_tensors="pt")
         hidden = model.model.embed_tokens(ids["input_ids"])

@@ -78,7 +78,8 @@ with torch.no_grad():
         S = pos[0].shape[1]
         attn_mask = torch.zeros(1, 1, S, S)  # causal-mask placeholder
         _ = qlayer(hidden, attention_mask=attn_mask, position_embeddings=pos)
-    qlayer.freeze_qparams()
+
+convert(qlayer)
 
 assert qlayer._mode is Mode.QUANT, "Quantization mode should be active now."
 
@@ -18,13 +18,14 @@ import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
 import tico
-from tico.experimental.quantization.evaluation.metric import compute_peir
-from tico.experimental.quantization.evaluation.utils import plot_two_outputs
-from tico.experimental.quantization.ptq.dtypes import INT16
-from tico.experimental.quantization.ptq.mode import Mode
-from tico.experimental.quantization.ptq.qscheme import QScheme
-from tico.experimental.quantization.ptq.quant_config import QuantConfig
-from tico.experimental.quantization.ptq.wrappers.llama.quant_mlp import QuantLlamaMLP
+from tico.quantization import convert, prepare
+from tico.quantization.config.ptq import PTQConfig
+from tico.quantization.evaluation.metric import compute_peir
+from tico.quantization.evaluation.utils import plot_two_outputs
+from tico.quantization.wrapq.dtypes import INT16
+from tico.quantization.wrapq.mode import Mode
+from tico.quantization.wrapq.qscheme import QScheme
+from tico.quantization.wrapq.wrappers.llama.quant_mlp import QuantLlamaMLP
 from tico.utils.utils import SuppressWarning
 
 name = "Maykeye/TinyLLama-v0"

@@ -36,13 +37,13 @@ model.eval()
 # 1. Replace layer-0’s MLP with QuantLlamaMLP
 # -------------------------------------------------------------------------
 fp32_mlp = model.model.layers[0].mlp
-model.model.layers[0].mlp = QuantLlamaMLP(
-    fp32_mlp,
-    qcfg=QuantConfig(default_dtype=INT16, default_qscheme=QScheme.PER_TENSOR_SYMM),
-)  # PTQWrapper(fp32_mlp) is also fine
+model.model.layers[0].mlp = prepare(
+    fp32_mlp, PTQConfig(default_dtype=INT16, default_qscheme=QScheme.PER_TENSOR_SYMM)
+)
 model.eval()
 
 mlp_q = model.model.layers[0].mlp
+assert isinstance(mlp_q.wrapped, QuantLlamaMLP)
 
 # -------------------------------------------------------------------------
 # 2. Single-pass calibration

@@ -57,13 +58,12 @@ PROMPTS = [
 ]
 
 with torch.no_grad():
-    mlp_q.enable_calibration()
     for prompt in PROMPTS:
         enc = tokenizer(prompt, return_tensors="pt")
         emb = model.model.embed_tokens(enc["input_ids"])
         _ = mlp_q(emb)
 
-mlp_q.freeze_qparams()
+convert(mlp_q)
 
 assert mlp_q._mode is Mode.QUANT, "Quantization mode should be active now."
 