tico 0.2.0.dev260411__tar.gz → 0.2.0.dev260415__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/PKG-INFO +1 -1
- tico-0.2.0.dev260415/tico/_version.py +1 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/algorithm/gptq/quantizer.py +25 -3
- tico-0.2.0.dev260415/tico/quantization/wrapq/examples/evaluate_fk_llama_model.py +156 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/wrapq/examples/quantize_full_qmodel_with_gptq.py +58 -10
- tico-0.2.0.dev260415/tico/quantization/wrapq/examples/qwen/trace_qwen.py +1159 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/wrapq/wrappers/qwen_vl/quant_model.py +66 -6
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/utils/utils.py +21 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico.egg-info/PKG-INFO +1 -1
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico.egg-info/SOURCES.txt +2 -0
- tico-0.2.0.dev260411/tico/_version.py +0 -1
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/LICENSE +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/README.md +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/pyproject.toml +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/setup.cfg +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/__init__.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/config/__init__.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/config/base.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/config/factory.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/config/v1.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/experimental/__init__.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/interpreter/__init__.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/interpreter/infer.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/interpreter/interpreter.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/passes/__init__.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/passes/cast_aten_where_arg_type.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/passes/cast_clamp_mixed_type_args.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/passes/cast_mixed_type_args.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/passes/const_prop_pass.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/passes/convert_conv1d_to_conv2d.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/passes/convert_conv3d_to_conv2d.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/passes/convert_expand_to_slice_cat.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/passes/convert_layout_op_to_reshape.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/passes/convert_matmul_to_linear.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/passes/convert_repeat_to_expand_copy.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/passes/convert_sym_size_to_circle_shape.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/passes/convert_to_relu6.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/passes/decompose_addmm.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/passes/decompose_batch_norm.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/passes/decompose_fake_quantize.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/passes/decompose_fake_quantize_tensor_qparams.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/passes/decompose_group_norm.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/passes/decompose_grouped_conv2d.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/passes/decompose_slice_scatter.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/passes/extract_dtype_kwargs.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/passes/fill_meta_val.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/passes/fuse_leading_unsqueeze_reshape.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/passes/fuse_redundant_reshape_to_mean.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/passes/legalize_causal_mask_value.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/passes/legalize_predefined_layout_operators.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/passes/lower_copy.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/passes/lower_pow2_to_mul.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/passes/lower_to_resize_nearest_neighbor.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/passes/lower_to_slice.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/passes/merge_consecutive_cat.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/passes/ops.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/passes/remove_nop.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/passes/remove_redundant_assert_nodes.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/passes/remove_redundant_expand.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/passes/remove_redundant_permute.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/passes/remove_redundant_reshape.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/passes/remove_redundant_slice.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/passes/remove_redundant_to_copy.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/passes/restore_linear.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/passes/segment_index_select.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/pt2_to_circle.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/__init__.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/algorithm/__init__.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/algorithm/fpi_gptq/__init__.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/algorithm/fpi_gptq/fpi_gptq.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/algorithm/fpi_gptq/quantizer.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/algorithm/gptq/__init__.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/algorithm/gptq/gptq.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/algorithm/gptq/quant.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/algorithm/gptq/utils.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/algorithm/qwen3_vl_gptq/__init__.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/algorithm/qwen3_vl_gptq/gptq.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/algorithm/qwen3_vl_gptq/quantizer.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/algorithm/qwen3_vl_gptq/utils.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/algorithm/smoothquant/__init__.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/algorithm/smoothquant/observer.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/algorithm/smoothquant/quantizer.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/algorithm/smoothquant/smooth_quant.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/algorithm/spinquant/__init__.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/algorithm/spinquant/fuse_norm_utils.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/algorithm/spinquant/hadamard_utils.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/algorithm/spinquant/quantizer.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/algorithm/spinquant/rotation_utils.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/algorithm/spinquant/spin_llama.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/config/__init__.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/config/base.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/config/builders.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/config/fpi_gptq.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/config/gptq.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/config/ptq.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/config/qwen3_vl_gptq.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/config/smoothquant.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/config/spinquant.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/config/utils.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/evaluation/__init__.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/evaluation/backend.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/evaluation/evaluate.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/evaluation/executor/__init__.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/evaluation/executor/backend_executor.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/evaluation/executor/circle_executor.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/evaluation/executor/triv24_executor.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/evaluation/metric.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/evaluation/script/llm_tasks_eval.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/evaluation/script/mini_vqa_eval.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/evaluation/utils.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/evaluation/vlm_eval_utils.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/passes/__init__.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/passes/fold_quant_ops.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/passes/insert_quantize_on_dtype_mismatch.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/passes/propagate_qparam_backward.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/passes/propagate_qparam_forward.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/passes/quantize_bias.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/passes/remove_weight_dequant_op.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/public_interface.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/quantizer.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/quantizer_registry.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/wrapq/__init__.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/wrapq/dtypes.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/wrapq/examples/__init__.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/wrapq/examples/compare_ppl.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/wrapq/examples/debug_quant_outputs.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/wrapq/examples/llama/__init__.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/wrapq/examples/llama/quantize_attn_decode.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/wrapq/examples/llama/quantize_attn_prefill.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/wrapq/examples/llama/quantize_decoder_layer_decode.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/wrapq/examples/llama/quantize_decoder_layer_prefill.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/wrapq/examples/llama/quantize_mlp.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/wrapq/examples/nn/__init__.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/wrapq/examples/nn/quantize_conv3d.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/wrapq/examples/nn/quantize_conv3d_special_case.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/wrapq/examples/nn/quantize_linear.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/wrapq/examples/quantize_full_vlm_model_with_gptq.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/wrapq/examples/quantize_qwen3_vl_with_gptq.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/wrapq/examples/quantize_with_gptq.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/wrapq/examples/qwen/__init__.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/wrapq/examples/qwen/quantize_for_conditional_generation.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/wrapq/examples/qwen/quantize_model.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/wrapq/examples/qwen/quantize_text_attn.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/wrapq/examples/qwen/quantize_text_decoder_layer.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/wrapq/examples/qwen/quantize_text_mlp.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/wrapq/examples/qwen/quantize_text_model.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/wrapq/examples/qwen/quantize_vision_attn.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/wrapq/examples/qwen/quantize_vision_block.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/wrapq/examples/qwen/quantize_vision_mlp.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/wrapq/examples/qwen/quantize_vision_model.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/wrapq/examples/qwen/quantize_vision_patch_embed.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/wrapq/examples/qwen/quantize_vision_patch_merger.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/wrapq/examples/static_llama_layer_runtime.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/wrapq/mode.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/wrapq/observers/__init__.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/wrapq/observers/affine_base.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/wrapq/observers/base.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/wrapq/observers/ema.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/wrapq/observers/identity.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/wrapq/observers/minmax.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/wrapq/observers/mx.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/wrapq/qscheme.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/wrapq/quantizer.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/wrapq/utils/__init__.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/wrapq/utils/check_missing_qparam.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/wrapq/utils/introspection.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/wrapq/utils/metrics.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/wrapq/utils/reduce_utils.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/wrapq/utils/version.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/wrapq/wrap_helper.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/wrapq/wrappers/__init__.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/wrapq/wrappers/fairseq/__init__.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/wrapq/wrappers/fairseq/decoder_export_single_step.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/wrapq/wrappers/fairseq/quant_decoder.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/wrapq/wrappers/fairseq/quant_decoder_layer.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/wrapq/wrappers/fairseq/quant_encoder.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/wrapq/wrappers/fairseq/quant_encoder_layer.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/wrapq/wrappers/fairseq/quant_mha.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/wrapq/wrappers/llama/__init__.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/wrapq/wrappers/llama/export_adapters.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/wrapq/wrappers/llama/quant_attention.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/wrapq/wrappers/llama/quant_decoder_layer.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/wrapq/wrappers/llama/quant_mlp.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/wrapq/wrappers/llama/quant_model.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/wrapq/wrappers/llama/quant_model_for_causal_lm.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/wrapq/wrappers/nn/__init__.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/wrapq/wrappers/nn/quant_conv3d.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/wrapq/wrappers/nn/quant_conv3d_decomposed.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/wrapq/wrappers/nn/quant_embedding.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/wrapq/wrappers/nn/quant_layernorm.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/wrapq/wrappers/nn/quant_linear.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/wrapq/wrappers/nn/quant_silu.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/wrapq/wrappers/ops/__init__.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/wrapq/wrappers/ops/quant_rmsnorm.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/wrapq/wrappers/ptq_wrapper.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/wrapq/wrappers/quant_elementwise.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/wrapq/wrappers/quant_module_base.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/wrapq/wrappers/qwen_vl/quant_for_conditional_generation.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/wrapq/wrappers/qwen_vl/quant_text_attn.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/wrapq/wrappers/qwen_vl/quant_text_decoder_layer.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/wrapq/wrappers/qwen_vl/quant_text_mlp.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/wrapq/wrappers/qwen_vl/quant_text_model.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/wrapq/wrappers/qwen_vl/quant_vision_attn.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/wrapq/wrappers/qwen_vl/quant_vision_block.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/wrapq/wrappers/qwen_vl/quant_vision_mlp.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/wrapq/wrappers/qwen_vl/quant_vision_model.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/wrapq/wrappers/qwen_vl/quant_vision_patch_embed.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/wrapq/wrappers/qwen_vl/quant_vision_patch_merger.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/quantization/wrapq/wrappers/registry.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/serialize/__init__.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/serialize/circle_graph.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/serialize/circle_mapping.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/serialize/circle_serializer.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/serialize/operators/__init__.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/serialize/operators/adapters/__init__.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/serialize/operators/adapters/llama_rmsnorm.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/serialize/operators/adapters/onert/__init__.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/serialize/operators/adapters/onert/llama_attention.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/serialize/operators/hashable_opcode.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/serialize/operators/node_visitor.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/serialize/operators/op_abs.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/serialize/operators/op_add.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/serialize/operators/op_alias_copy.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/serialize/operators/op_any.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/serialize/operators/op_arange_start_step.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/serialize/operators/op_argmax.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/serialize/operators/op_attention.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/serialize/operators/op_avg_pool2d.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/serialize/operators/op_bmm.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/serialize/operators/op_cat.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/serialize/operators/op_circle_shape.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/serialize/operators/op_clamp.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/serialize/operators/op_clone.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/serialize/operators/op_constant_pad_nd.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/serialize/operators/op_conv2d.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/serialize/operators/op_cos.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/serialize/operators/op_cumsum.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/serialize/operators/op_depthwise_conv2d.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/serialize/operators/op_dequantize_per_channel.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/serialize/operators/op_dequantize_per_tensor.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/serialize/operators/op_div.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/serialize/operators/op_embedding.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/serialize/operators/op_eq.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/serialize/operators/op_exp.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/serialize/operators/op_expand.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/serialize/operators/op_full.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/serialize/operators/op_full_like.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/serialize/operators/op_ge.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/serialize/operators/op_gelu.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/serialize/operators/op_gt.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/serialize/operators/op_index.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/serialize/operators/op_index_select.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/serialize/operators/op_instance_norm.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/serialize/operators/op_le.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/serialize/operators/op_leaky_relu.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/serialize/operators/op_linear.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/serialize/operators/op_log.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/serialize/operators/op_log1p.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/serialize/operators/op_logical_and.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/serialize/operators/op_logical_not.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/serialize/operators/op_lt.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/serialize/operators/op_max_dim.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/serialize/operators/op_max_pool2d_with_indices.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/serialize/operators/op_maximum.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/serialize/operators/op_mean.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/serialize/operators/op_minimum.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/serialize/operators/op_mm.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/serialize/operators/op_mul.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/serialize/operators/op_ne.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/serialize/operators/op_neg.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/serialize/operators/op_permute.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/serialize/operators/op_pow.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/serialize/operators/op_prelu.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/serialize/operators/op_quantize_per_tensor.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/serialize/operators/op_reciprocal.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/serialize/operators/op_relu.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/serialize/operators/op_relu6.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/serialize/operators/op_repeat.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/serialize/operators/op_reshape.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/serialize/operators/op_resize_nearest_neighbor.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/serialize/operators/op_rmsnorm.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/serialize/operators/op_round.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/serialize/operators/op_rsqrt.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/serialize/operators/op_scalar_tensor.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/serialize/operators/op_select_copy.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/serialize/operators/op_sigmoid.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/serialize/operators/op_sin.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/serialize/operators/op_slice.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/serialize/operators/op_softmax.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/serialize/operators/op_split_with_sizes.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/serialize/operators/op_sqrt.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/serialize/operators/op_squeeze.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/serialize/operators/op_sub.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/serialize/operators/op_sum.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/serialize/operators/op_tanh.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/serialize/operators/op_to_copy.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/serialize/operators/op_transpose_conv.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/serialize/operators/op_unsqueeze.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/serialize/operators/op_view.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/serialize/operators/op_where.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/serialize/operators/utils.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/serialize/pack.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/serialize/quant_param.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/utils/__init__.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/utils/compat/__init__.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/utils/compat/torch.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/utils/compat/transformers.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/utils/convert.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/utils/define.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/utils/diff_graph.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/utils/dtype.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/utils/errors.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/utils/graph.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/utils/installed_packages.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/utils/logging.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/utils/model.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/utils/mx/__init__.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/utils/mx/elemwise_ops.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/utils/mx/formats.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/utils/mx/mx_ops.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/utils/padding.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/utils/passes.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/utils/pytree_utils.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/utils/record_input.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/utils/register_custom_op.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/utils/serialize.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/utils/signature.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/utils/trace_decorators.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico/utils/validate_args_kwargs.py +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico.egg-info/dependency_links.txt +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico.egg-info/entry_points.txt +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico.egg-info/requires.txt +0 -0
- {tico-0.2.0.dev260411 → tico-0.2.0.dev260415}/tico.egg-info/top_level.txt +0 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "0.2.0.dev260415"
|
|
@@ -28,6 +28,11 @@ from tico.quantization.algorithm.gptq.utils import (
|
|
|
28
28
|
from tico.quantization.config.gptq import GPTQConfig
|
|
29
29
|
from tico.quantization.quantizer import BaseQuantizer
|
|
30
30
|
from tico.quantization.quantizer_registry import register_quantizer
|
|
31
|
+
from tico.utils.utils import move_to_device
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def move_to_cpu(obj):
|
|
35
|
+
return move_to_device(obj, "cpu")
|
|
31
36
|
|
|
32
37
|
|
|
33
38
|
class StopForward(Exception):
|
|
@@ -118,12 +123,12 @@ class GPTQQuantizer(BaseQuantizer):
|
|
|
118
123
|
for idx, item in enumerate(args):
|
|
119
124
|
if (idx + 1) > len(self.cache_args):
|
|
120
125
|
self.cache_args.append([])
|
|
121
|
-
self.cache_args[idx].append(item)
|
|
126
|
+
self.cache_args[idx].append(move_to_cpu(item))
|
|
122
127
|
# Store keyword args
|
|
123
128
|
for k, v in kwargs.items():
|
|
124
129
|
if self.cache_kwargs.get(k, None) is None:
|
|
125
130
|
self.cache_kwargs[k] = []
|
|
126
|
-
self.cache_kwargs[k].append(v)
|
|
131
|
+
self.cache_kwargs[k].append(move_to_cpu(v))
|
|
127
132
|
|
|
128
133
|
self.num_batches += 1
|
|
129
134
|
raise StopForward # stop after the first layer
|
|
@@ -280,6 +285,7 @@ class GPTQQuantizer(BaseQuantizer):
|
|
|
280
285
|
|
|
281
286
|
# Run layer forward over all cached batches to build Hessian/statistics
|
|
282
287
|
batch_num = self.num_batches
|
|
288
|
+
device = next(model.parameters()).device
|
|
283
289
|
for batch_idx in tqdm(
|
|
284
290
|
range(batch_num),
|
|
285
291
|
desc=f"[L{l_idx}] collecting",
|
|
@@ -290,9 +296,13 @@ class GPTQQuantizer(BaseQuantizer):
|
|
|
290
296
|
cache_args_batch = gather_single_batch_from_list(
|
|
291
297
|
self.cache_args, batch_idx
|
|
292
298
|
)
|
|
299
|
+
cache_args_batch = move_to_device(cache_args_batch, device)
|
|
300
|
+
|
|
293
301
|
cache_kwargs_batch = gather_single_batch_from_dict(
|
|
294
302
|
self.cache_kwargs, batch_idx
|
|
295
303
|
)
|
|
304
|
+
cache_kwargs_batch = move_to_device(cache_kwargs_batch, device)
|
|
305
|
+
|
|
296
306
|
layer(*cache_args_batch, **cache_kwargs_batch)
|
|
297
307
|
|
|
298
308
|
# Remove handles
|
|
@@ -314,6 +324,7 @@ class GPTQQuantizer(BaseQuantizer):
|
|
|
314
324
|
gptq[name].free()
|
|
315
325
|
|
|
316
326
|
# 4) After quantization, re-run the layer to produce outputs for the next layer
|
|
327
|
+
device = next(model.parameters()).device
|
|
317
328
|
for batch_idx in tqdm(
|
|
318
329
|
range(batch_num),
|
|
319
330
|
desc=f"[L{l_idx}] re-forward",
|
|
@@ -324,9 +335,13 @@ class GPTQQuantizer(BaseQuantizer):
|
|
|
324
335
|
cache_args_batch = gather_single_batch_from_list(
|
|
325
336
|
self.cache_args, batch_idx
|
|
326
337
|
)
|
|
338
|
+
cache_args_batch = move_to_device(cache_args_batch, device)
|
|
339
|
+
|
|
327
340
|
cache_kwargs_batch = gather_single_batch_from_dict(
|
|
328
341
|
self.cache_kwargs, batch_idx
|
|
329
342
|
)
|
|
343
|
+
cache_kwargs_batch = move_to_device(cache_kwargs_batch, device)
|
|
344
|
+
|
|
330
345
|
outs = layer(*cache_args_batch, **cache_kwargs_batch)
|
|
331
346
|
# LLaMA's decoder layer return type differs across Transformers versions:
|
|
332
347
|
# some return a tuple (hidden_states, ...), others return just a tensor.
|
|
@@ -334,7 +349,14 @@ class GPTQQuantizer(BaseQuantizer):
|
|
|
334
349
|
outs = outs[0] if isinstance(outs, tuple) else outs
|
|
335
350
|
# Update inputs for next iteration.
|
|
336
351
|
if len(self.cache_args) > 0:
|
|
337
|
-
|
|
352
|
+
if hasattr(outs, "to") and hasattr(
|
|
353
|
+
self.cache_args[0][batch_idx], "device"
|
|
354
|
+
):
|
|
355
|
+
self.cache_args[0][batch_idx] = outs.to(
|
|
356
|
+
self.cache_args[0][batch_idx].device
|
|
357
|
+
)
|
|
358
|
+
else:
|
|
359
|
+
self.cache_args[0][batch_idx] = outs
|
|
338
360
|
|
|
339
361
|
if torch.cuda.is_available():
|
|
340
362
|
torch.cuda.empty_cache()
|
|
@@ -0,0 +1,156 @@
|
|
|
1
|
+
# Copyright (c) 2026 Samsung Electronics Co., Ltd. All Rights Reserved
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
import argparse
|
|
16
|
+
|
|
17
|
+
import torch
|
|
18
|
+
|
|
19
|
+
from lm_eval.utils import make_table
|
|
20
|
+
|
|
21
|
+
from transformers import AutoModelForCausalLM, AutoTokenizer
|
|
22
|
+
|
|
23
|
+
from tico.quantization.evaluation.script.llm_tasks_eval import evaluate_llm_on_tasks
|
|
24
|
+
|
|
25
|
+
DTYPE_MAP = {
|
|
26
|
+
"float32": torch.float32,
|
|
27
|
+
# TODO Support more dtypes
|
|
28
|
+
# "bfloat16": torch.bfloat16,
|
|
29
|
+
# "float16": torch.float16,
|
|
30
|
+
}
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def main():
|
|
34
|
+
parser = argparse.ArgumentParser(
|
|
35
|
+
description="Evaluate a fake-quantized Llama model"
|
|
36
|
+
)
|
|
37
|
+
parser.add_argument(
|
|
38
|
+
"--model", type=str, required=True, help="HF repo name or local path."
|
|
39
|
+
)
|
|
40
|
+
parser.add_argument(
|
|
41
|
+
"--device",
|
|
42
|
+
type=str,
|
|
43
|
+
default="cuda" if torch.cuda.is_available() else "cpu",
|
|
44
|
+
help="Device to run on (cuda|cpu|mps).",
|
|
45
|
+
)
|
|
46
|
+
parser.add_argument(
|
|
47
|
+
"--dtype",
|
|
48
|
+
choices=list(DTYPE_MAP.keys()),
|
|
49
|
+
default="float32",
|
|
50
|
+
help="Model dtype for load.",
|
|
51
|
+
)
|
|
52
|
+
parser.add_argument(
|
|
53
|
+
"--hf-token",
|
|
54
|
+
type=str,
|
|
55
|
+
default=None,
|
|
56
|
+
help="Optional HF token for gated/private repos.",
|
|
57
|
+
)
|
|
58
|
+
parser.add_argument(
|
|
59
|
+
"--trust-remote-code",
|
|
60
|
+
action="store_true",
|
|
61
|
+
help="Enable only if you trust the model repo code.",
|
|
62
|
+
)
|
|
63
|
+
parser.add_argument(
|
|
64
|
+
"--cache_dir",
|
|
65
|
+
type=str,
|
|
66
|
+
default=None,
|
|
67
|
+
help="cache_dir for using model/datasets loading",
|
|
68
|
+
)
|
|
69
|
+
parser.add_argument(
|
|
70
|
+
"--fk_model_path", type=str, required=True, help="Path to fake_quantized model"
|
|
71
|
+
)
|
|
72
|
+
parser.add_argument(
|
|
73
|
+
"--eval_tasks",
|
|
74
|
+
type=str,
|
|
75
|
+
default=None,
|
|
76
|
+
help="tasks to be evaluated using lm_eval, e.g. `winogrande,arc_easy,arc_challenge,openbookqa,mmlu_pro,ifeval,bbh`",
|
|
77
|
+
)
|
|
78
|
+
parser.add_argument(
|
|
79
|
+
"--skip_fp_eval",
|
|
80
|
+
action="store_true",
|
|
81
|
+
help="Skip original model evaluation.",
|
|
82
|
+
)
|
|
83
|
+
|
|
84
|
+
args = parser.parse_args()
|
|
85
|
+
print(args)
|
|
86
|
+
|
|
87
|
+
# -------------------------------------------------------------------------
|
|
88
|
+
# Basic setup
|
|
89
|
+
# -------------------------------------------------------------------------
|
|
90
|
+
device = torch.device(args.device)
|
|
91
|
+
dtype = DTYPE_MAP[args.dtype]
|
|
92
|
+
|
|
93
|
+
print("=== Config ===")
|
|
94
|
+
print(f"Model : {args.model}")
|
|
95
|
+
print(f"Device : {device.type}")
|
|
96
|
+
print(f"DType : {args.dtype}")
|
|
97
|
+
print(f"fk_model_path : {args.fk_model_path}")
|
|
98
|
+
print()
|
|
99
|
+
|
|
100
|
+
tokenizer = AutoTokenizer.from_pretrained(
|
|
101
|
+
args.model,
|
|
102
|
+
trust_remote_code=args.trust_remote_code,
|
|
103
|
+
token=args.hf_token,
|
|
104
|
+
cache_dir=args.cache_dir,
|
|
105
|
+
)
|
|
106
|
+
|
|
107
|
+
if not args.skip_fp_eval:
|
|
108
|
+
|
|
109
|
+
# -------------------------------------------------------------------------
|
|
110
|
+
# FP model evaluation
|
|
111
|
+
# -------------------------------------------------------------------------
|
|
112
|
+
print("Loading FP model …")
|
|
113
|
+
model = (
|
|
114
|
+
AutoModelForCausalLM.from_pretrained(
|
|
115
|
+
args.model,
|
|
116
|
+
dtype=dtype,
|
|
117
|
+
trust_remote_code=args.trust_remote_code,
|
|
118
|
+
token=args.hf_token,
|
|
119
|
+
cache_dir=args.cache_dir,
|
|
120
|
+
)
|
|
121
|
+
.cpu()
|
|
122
|
+
.eval()
|
|
123
|
+
)
|
|
124
|
+
|
|
125
|
+
if args.eval_tasks is not None:
|
|
126
|
+
config = model.config
|
|
127
|
+
max_seq_len = config.max_position_embeddings
|
|
128
|
+
results = evaluate_llm_on_tasks(
|
|
129
|
+
model, tokenizer, args.eval_tasks, max_length=max_seq_len
|
|
130
|
+
)
|
|
131
|
+
print("Original RESULTS ARE:")
|
|
132
|
+
print(make_table(results))
|
|
133
|
+
|
|
134
|
+
model = model.cpu()
|
|
135
|
+
if device.type == "cuda" and torch.cuda.is_available():
|
|
136
|
+
torch.cuda.empty_cache()
|
|
137
|
+
|
|
138
|
+
# -------------------------------------------------------------------------
|
|
139
|
+
# FK model evaluation
|
|
140
|
+
# -------------------------------------------------------------------------
|
|
141
|
+
print("Loading fake quantized model …")
|
|
142
|
+
fk_model = torch.load(args.fk_model_path, weights_only=False).eval().to(args.device)
|
|
143
|
+
|
|
144
|
+
if args.eval_tasks is not None:
|
|
145
|
+
config = fk_model.wrapped.config
|
|
146
|
+
max_seq_len = config.max_position_embeddings
|
|
147
|
+
|
|
148
|
+
results = evaluate_llm_on_tasks(
|
|
149
|
+
fk_model, tokenizer, args.eval_tasks, max_length=max_seq_len
|
|
150
|
+
)
|
|
151
|
+
print("Quantized RESULTS ARE:")
|
|
152
|
+
print(make_table(results))
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
if __name__ == "__main__":
|
|
156
|
+
main()
|
|
@@ -217,9 +217,43 @@ def evaluate(q_m, tokenizer, dataset_test, args):
|
|
|
217
217
|
print(make_table(results))
|
|
218
218
|
|
|
219
219
|
|
|
220
|
+
def get_sensitivities_info_name(model, dataset, seed, n_samples):
|
|
221
|
+
model_name = model.config.name_or_path.replace("/", "_")
|
|
222
|
+
|
|
223
|
+
name = (
|
|
224
|
+
"."
|
|
225
|
+
+ "/sensitivities_for_"
|
|
226
|
+
+ model_name
|
|
227
|
+
+ "_"
|
|
228
|
+
+ dataset
|
|
229
|
+
+ "_"
|
|
230
|
+
+ str(n_samples)
|
|
231
|
+
+ "_"
|
|
232
|
+
+ str(seed)
|
|
233
|
+
+ ".pt"
|
|
234
|
+
)
|
|
235
|
+
return name
|
|
236
|
+
|
|
237
|
+
|
|
238
|
+
def get_ptq_model_name(model, args):
|
|
239
|
+
model_name = model.config.name_or_path.replace("/", "_")
|
|
240
|
+
|
|
241
|
+
name = (
|
|
242
|
+
f"PTQ_{model_name}_"
|
|
243
|
+
+ ("SpinQuant_" if args.no_spinquant is False else "")
|
|
244
|
+
+ ("GPTQ_" if args.no_GPTQ is False else "")
|
|
245
|
+
+ (f"{args.gptq_mse}_" if args.no_GPTQ is False else "")
|
|
246
|
+
+ str(args.nsamples_for_qcalibration)
|
|
247
|
+
+ "_"
|
|
248
|
+
+ str(args.seed)
|
|
249
|
+
+ ".pt"
|
|
250
|
+
)
|
|
251
|
+
return name
|
|
252
|
+
|
|
253
|
+
|
|
220
254
|
def main():
|
|
221
255
|
parser = argparse.ArgumentParser(
|
|
222
|
-
description="GPTQ+PTQ pipeline (weight-only + activation)"
|
|
256
|
+
description="GPTQ+PTQ pipeline (weight-only + activation)",
|
|
223
257
|
)
|
|
224
258
|
parser.add_argument(
|
|
225
259
|
"--model", type=str, required=True, help="HF repo name or local path."
|
|
@@ -270,16 +304,17 @@ def main():
|
|
|
270
304
|
help="Leave model float",
|
|
271
305
|
)
|
|
272
306
|
parser.add_argument(
|
|
273
|
-
"--
|
|
307
|
+
"--output_dir",
|
|
274
308
|
type=str,
|
|
275
309
|
default=None,
|
|
276
|
-
help="Save
|
|
310
|
+
help="Save specified artifacts to output_dir",
|
|
277
311
|
)
|
|
278
312
|
parser.add_argument(
|
|
279
|
-
"--
|
|
313
|
+
"--save",
|
|
314
|
+
nargs="*",
|
|
280
315
|
type=str,
|
|
281
|
-
|
|
282
|
-
help="
|
|
316
|
+
choices=["circle_full", "circle_per_layer", "ptq_checkpoint", "sensitivity"],
|
|
317
|
+
help="which artifacts should be saved to output_dir",
|
|
283
318
|
)
|
|
284
319
|
parser.add_argument(
|
|
285
320
|
"--cache_dir",
|
|
@@ -439,6 +474,13 @@ def main():
|
|
|
439
474
|
else:
|
|
440
475
|
calibrator = SensitivityCalibrator(model, calib_inputs)
|
|
441
476
|
sens = calibrator.compute_sensitivity_info()
|
|
477
|
+
if args.output_dir is not None and "sensitivity" in args.save:
|
|
478
|
+
save_name = get_sensitivities_info_name(
|
|
479
|
+
model, "wikitext", args.seed, len(calib_inputs)
|
|
480
|
+
)
|
|
481
|
+
save_path = pathlib.Path(args.output_dir, save_name)
|
|
482
|
+
print(f"Saving calibrated_sensitivities to {save_path}")
|
|
483
|
+
torch.save(sens, save_path)
|
|
442
484
|
|
|
443
485
|
gptq_config = GPTQConfig(
|
|
444
486
|
weight_bits=args.linear_weight_bits,
|
|
@@ -461,15 +503,21 @@ def main():
|
|
|
461
503
|
if not args.no_PTQ:
|
|
462
504
|
q_m = quantize_using_PTQ(q_m, calib_inputs, args)
|
|
463
505
|
|
|
506
|
+
if args.output_dir is not None and "ptq_checkpoint" in args.save:
|
|
507
|
+
save_name = get_ptq_model_name(model, args)
|
|
508
|
+
save_path = pathlib.Path(args.output_dir, save_name)
|
|
509
|
+
print(f"Saving PTQ model to {save_path}")
|
|
510
|
+
torch.save(q_m, save_path)
|
|
511
|
+
|
|
464
512
|
# after PTQ quantizer only fixed-length input sequences are valid
|
|
465
513
|
evaluate(q_m, tokenizer, dataset_test, args)
|
|
466
514
|
|
|
467
|
-
if args.
|
|
468
|
-
save_layers_to(q_m, args.max_seq_len, args.
|
|
515
|
+
if args.output_dir is not None and "circle_per_layer" in args.save:
|
|
516
|
+
save_layers_to(q_m, args.max_seq_len, args.output_dir)
|
|
469
517
|
|
|
470
|
-
if args.
|
|
518
|
+
if args.output_dir is not None and "circle_full" in args.save:
|
|
471
519
|
calib_inputs = list(torch.stack(calib_inputs).reshape(-1, 1, args.max_seq_len))
|
|
472
|
-
save_model_to(q_m, calib_inputs, args.
|
|
520
|
+
save_model_to(q_m, calib_inputs, args.output_dir)
|
|
473
521
|
|
|
474
522
|
|
|
475
523
|
if __name__ == "__main__":
|