tico 0.2.0.dev260326__tar.gz → 0.2.0.dev260331__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/PKG-INFO +1 -1
- tico-0.2.0.dev260331/tico/_version.py +1 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/config/builders.py +2 -1
- tico-0.2.0.dev260331/tico/quantization/wrapq/examples/static_llama_layer_runtime.py +604 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico.egg-info/PKG-INFO +1 -1
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico.egg-info/SOURCES.txt +1 -0
- tico-0.2.0.dev260326/tico/_version.py +0 -1
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/LICENSE +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/README.md +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/pyproject.toml +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/setup.cfg +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/__init__.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/config/__init__.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/config/base.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/config/factory.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/config/v1.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/experimental/__init__.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/interpreter/__init__.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/interpreter/infer.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/interpreter/interpreter.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/passes/__init__.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/passes/cast_aten_where_arg_type.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/passes/cast_clamp_mixed_type_args.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/passes/cast_mixed_type_args.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/passes/const_prop_pass.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/passes/convert_conv1d_to_conv2d.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/passes/convert_conv3d_to_conv2d.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/passes/convert_expand_to_slice_cat.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/passes/convert_layout_op_to_reshape.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/passes/convert_matmul_to_linear.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/passes/convert_repeat_to_expand_copy.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/passes/convert_sym_size_to_circle_shape.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/passes/convert_to_relu6.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/passes/decompose_addmm.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/passes/decompose_batch_norm.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/passes/decompose_fake_quantize.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/passes/decompose_fake_quantize_tensor_qparams.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/passes/decompose_group_norm.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/passes/decompose_grouped_conv2d.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/passes/decompose_slice_scatter.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/passes/extract_dtype_kwargs.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/passes/fill_meta_val.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/passes/fuse_leading_unsqueeze_reshape.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/passes/fuse_redundant_reshape_to_mean.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/passes/legalize_causal_mask_value.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/passes/legalize_predefined_layout_operators.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/passes/lower_copy.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/passes/lower_pow2_to_mul.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/passes/lower_to_resize_nearest_neighbor.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/passes/lower_to_slice.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/passes/merge_consecutive_cat.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/passes/ops.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/passes/remove_nop.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/passes/remove_redundant_assert_nodes.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/passes/remove_redundant_expand.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/passes/remove_redundant_permute.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/passes/remove_redundant_reshape.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/passes/remove_redundant_slice.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/passes/remove_redundant_to_copy.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/passes/restore_linear.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/passes/segment_index_select.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/pt2_to_circle.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/__init__.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/algorithm/__init__.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/algorithm/fpi_gptq/__init__.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/algorithm/fpi_gptq/fpi_gptq.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/algorithm/fpi_gptq/quantizer.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/algorithm/gptq/__init__.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/algorithm/gptq/gptq.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/algorithm/gptq/quant.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/algorithm/gptq/quantizer.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/algorithm/gptq/utils.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/algorithm/qwen3_vl_gptq/__init__.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/algorithm/qwen3_vl_gptq/gptq.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/algorithm/qwen3_vl_gptq/quantizer.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/algorithm/qwen3_vl_gptq/utils.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/algorithm/smoothquant/__init__.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/algorithm/smoothquant/observer.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/algorithm/smoothquant/quantizer.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/algorithm/smoothquant/smooth_quant.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/config/__init__.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/config/base.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/config/fpi_gptq.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/config/gptq.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/config/ptq.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/config/qwen3_vl_gptq.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/config/smoothquant.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/evaluation/__init__.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/evaluation/backend.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/evaluation/evaluate.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/evaluation/executor/__init__.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/evaluation/executor/backend_executor.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/evaluation/executor/circle_executor.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/evaluation/executor/triv24_executor.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/evaluation/metric.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/evaluation/script/llm_tasks_eval.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/evaluation/script/mini_vqa_eval.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/evaluation/utils.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/evaluation/vlm_eval_utils.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/passes/__init__.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/passes/fold_quant_ops.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/passes/insert_quantize_on_dtype_mismatch.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/passes/propagate_qparam_backward.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/passes/propagate_qparam_forward.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/passes/quantize_bias.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/passes/remove_weight_dequant_op.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/public_interface.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/quantizer.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/quantizer_registry.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/wrapq/__init__.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/wrapq/dtypes.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/wrapq/examples/__init__.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/wrapq/examples/compare_ppl.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/wrapq/examples/debug_quant_outputs.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/wrapq/examples/llama/__init__.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/wrapq/examples/llama/quantize_attn_decode.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/wrapq/examples/llama/quantize_attn_prefill.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/wrapq/examples/llama/quantize_decoder_layer_decode.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/wrapq/examples/llama/quantize_decoder_layer_prefill.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/wrapq/examples/llama/quantize_mlp.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/wrapq/examples/nn/__init__.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/wrapq/examples/nn/quantize_conv3d.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/wrapq/examples/nn/quantize_conv3d_special_case.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/wrapq/examples/nn/quantize_linear.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/wrapq/examples/quantize_full_qmodel_with_gptq.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/wrapq/examples/quantize_full_vlm_model_with_gptq.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/wrapq/examples/quantize_qwen3_vl_with_gptq.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/wrapq/examples/quantize_with_gptq.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/wrapq/examples/qwen/__init__.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/wrapq/examples/qwen/quantize_text_attn.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/wrapq/examples/qwen/quantize_text_decoder_layer.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/wrapq/examples/qwen/quantize_text_mlp.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/wrapq/examples/qwen/quantize_text_model.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/wrapq/examples/qwen/quantize_vision_attn.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/wrapq/examples/qwen/quantize_vision_block.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/wrapq/examples/qwen/quantize_vision_mlp.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/wrapq/examples/qwen/quantize_vision_model.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/wrapq/examples/qwen/quantize_vision_patch_embed.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/wrapq/examples/qwen/quantize_vision_patch_merger.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/wrapq/mode.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/wrapq/observers/__init__.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/wrapq/observers/affine_base.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/wrapq/observers/base.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/wrapq/observers/ema.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/wrapq/observers/identity.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/wrapq/observers/minmax.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/wrapq/observers/mx.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/wrapq/qscheme.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/wrapq/quantizer.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/wrapq/utils/__init__.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/wrapq/utils/check_missing_qparam.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/wrapq/utils/introspection.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/wrapq/utils/metrics.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/wrapq/utils/reduce_utils.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/wrapq/utils/version.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/wrapq/wrap_helper.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/wrapq/wrappers/__init__.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/wrapq/wrappers/fairseq/__init__.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/wrapq/wrappers/fairseq/decoder_export_single_step.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/wrapq/wrappers/fairseq/quant_decoder.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/wrapq/wrappers/fairseq/quant_decoder_layer.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/wrapq/wrappers/fairseq/quant_encoder.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/wrapq/wrappers/fairseq/quant_encoder_layer.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/wrapq/wrappers/fairseq/quant_mha.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/wrapq/wrappers/llama/__init__.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/wrapq/wrappers/llama/quant_attn_decode.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/wrapq/wrappers/llama/quant_attn_prefill.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/wrapq/wrappers/llama/quant_decoder_layer_decode.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/wrapq/wrappers/llama/quant_decoder_layer_prefill.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/wrapq/wrappers/llama/quant_mlp.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/wrapq/wrappers/llama/quant_model.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/wrapq/wrappers/llama/quant_model_for_causal_lm.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/wrapq/wrappers/nn/__init__.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/wrapq/wrappers/nn/quant_conv3d.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/wrapq/wrappers/nn/quant_conv3d_decomposed.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/wrapq/wrappers/nn/quant_embedding.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/wrapq/wrappers/nn/quant_layernorm.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/wrapq/wrappers/nn/quant_linear.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/wrapq/wrappers/nn/quant_silu.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/wrapq/wrappers/ops/__init__.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/wrapq/wrappers/ops/quant_rmsnorm.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/wrapq/wrappers/ptq_wrapper.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/wrapq/wrappers/quant_elementwise.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/wrapq/wrappers/quant_module_base.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/wrapq/wrappers/qwen_vl/quant_text_attn.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/wrapq/wrappers/qwen_vl/quant_text_decoder_layer.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/wrapq/wrappers/qwen_vl/quant_text_mlp.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/wrapq/wrappers/qwen_vl/quant_text_model.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/wrapq/wrappers/qwen_vl/quant_vision_attn.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/wrapq/wrappers/qwen_vl/quant_vision_block.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/wrapq/wrappers/qwen_vl/quant_vision_mlp.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/wrapq/wrappers/qwen_vl/quant_vision_model.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/wrapq/wrappers/qwen_vl/quant_vision_patch_embed.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/wrapq/wrappers/qwen_vl/quant_vision_patch_merger.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/quantization/wrapq/wrappers/registry.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/serialize/__init__.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/serialize/circle_graph.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/serialize/circle_mapping.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/serialize/circle_serializer.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/serialize/operators/__init__.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/serialize/operators/adapters/__init__.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/serialize/operators/adapters/llama_rmsnorm.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/serialize/operators/adapters/onert/__init__.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/serialize/operators/adapters/onert/llama_attention.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/serialize/operators/hashable_opcode.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/serialize/operators/node_visitor.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/serialize/operators/op_abs.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/serialize/operators/op_add.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/serialize/operators/op_alias_copy.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/serialize/operators/op_any.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/serialize/operators/op_arange_start_step.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/serialize/operators/op_argmax.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/serialize/operators/op_attention.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/serialize/operators/op_avg_pool2d.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/serialize/operators/op_bmm.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/serialize/operators/op_cat.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/serialize/operators/op_circle_shape.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/serialize/operators/op_clamp.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/serialize/operators/op_clone.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/serialize/operators/op_constant_pad_nd.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/serialize/operators/op_conv2d.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/serialize/operators/op_cos.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/serialize/operators/op_cumsum.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/serialize/operators/op_depthwise_conv2d.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/serialize/operators/op_dequantize_per_channel.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/serialize/operators/op_dequantize_per_tensor.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/serialize/operators/op_div.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/serialize/operators/op_embedding.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/serialize/operators/op_eq.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/serialize/operators/op_exp.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/serialize/operators/op_expand.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/serialize/operators/op_full.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/serialize/operators/op_full_like.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/serialize/operators/op_ge.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/serialize/operators/op_gelu.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/serialize/operators/op_gt.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/serialize/operators/op_index.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/serialize/operators/op_index_select.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/serialize/operators/op_instance_norm.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/serialize/operators/op_le.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/serialize/operators/op_leaky_relu.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/serialize/operators/op_linear.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/serialize/operators/op_log.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/serialize/operators/op_log1p.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/serialize/operators/op_logical_and.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/serialize/operators/op_logical_not.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/serialize/operators/op_lt.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/serialize/operators/op_max_dim.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/serialize/operators/op_max_pool2d_with_indices.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/serialize/operators/op_maximum.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/serialize/operators/op_mean.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/serialize/operators/op_minimum.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/serialize/operators/op_mm.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/serialize/operators/op_mul.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/serialize/operators/op_ne.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/serialize/operators/op_neg.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/serialize/operators/op_permute.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/serialize/operators/op_pow.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/serialize/operators/op_prelu.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/serialize/operators/op_quantize_per_tensor.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/serialize/operators/op_reciprocal.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/serialize/operators/op_relu.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/serialize/operators/op_relu6.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/serialize/operators/op_repeat.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/serialize/operators/op_reshape.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/serialize/operators/op_resize_nearest_neighbor.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/serialize/operators/op_rmsnorm.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/serialize/operators/op_round.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/serialize/operators/op_rsqrt.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/serialize/operators/op_scalar_tensor.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/serialize/operators/op_select_copy.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/serialize/operators/op_sigmoid.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/serialize/operators/op_sin.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/serialize/operators/op_slice.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/serialize/operators/op_softmax.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/serialize/operators/op_split_with_sizes.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/serialize/operators/op_sqrt.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/serialize/operators/op_squeeze.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/serialize/operators/op_sub.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/serialize/operators/op_sum.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/serialize/operators/op_tanh.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/serialize/operators/op_to_copy.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/serialize/operators/op_transpose_conv.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/serialize/operators/op_unsqueeze.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/serialize/operators/op_view.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/serialize/operators/op_where.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/serialize/operators/utils.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/serialize/pack.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/serialize/quant_param.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/utils/__init__.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/utils/compat/__init__.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/utils/compat/torch.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/utils/compat/transformers.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/utils/convert.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/utils/define.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/utils/diff_graph.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/utils/dtype.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/utils/errors.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/utils/graph.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/utils/installed_packages.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/utils/logging.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/utils/model.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/utils/mx/__init__.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/utils/mx/elemwise_ops.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/utils/mx/formats.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/utils/mx/mx_ops.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/utils/padding.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/utils/passes.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/utils/pytree_utils.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/utils/record_input.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/utils/register_custom_op.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/utils/serialize.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/utils/signature.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/utils/trace_decorators.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/utils/utils.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico/utils/validate_args_kwargs.py +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico.egg-info/dependency_links.txt +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico.egg-info/entry_points.txt +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico.egg-info/requires.txt +0 -0
- {tico-0.2.0.dev260326 → tico-0.2.0.dev260331}/tico.egg-info/top_level.txt +0 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "0.2.0.dev260331"
|
|
@@ -12,6 +12,7 @@
|
|
|
12
12
|
# See the License for the specific language governing permissions and
|
|
13
13
|
# limitations under the License.
|
|
14
14
|
|
|
15
|
+
import copy
|
|
15
16
|
from typing import Any, Dict, Optional, Tuple
|
|
16
17
|
|
|
17
18
|
from tico.quantization.config.ptq import PTQConfig, WrapperVariant
|
|
@@ -113,7 +114,7 @@ def _set_nested_override(
|
|
|
113
114
|
current = root
|
|
114
115
|
for key in path[:-1]:
|
|
115
116
|
current = current.setdefault(key, {})
|
|
116
|
-
current[path[-1]] = value
|
|
117
|
+
current[path[-1]] = copy.deepcopy(value)
|
|
117
118
|
|
|
118
119
|
|
|
119
120
|
def _build_weight_override(weight_dtype: Optional[DType]) -> Dict[str, Any]:
|
|
@@ -0,0 +1,604 @@
|
|
|
1
|
+
# Copyright (c) 2026 Samsung Electronics Co., Ltd. All Rights Reserved
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
import argparse
|
|
16
|
+
import copy
|
|
17
|
+
from dataclasses import dataclass
|
|
18
|
+
from typing import List, Literal, Optional, Sequence, Tuple
|
|
19
|
+
|
|
20
|
+
import torch
|
|
21
|
+
import torch.nn as nn
|
|
22
|
+
from transformers import AutoModelForCausalLM, AutoTokenizer
|
|
23
|
+
|
|
24
|
+
from tico.quantization import prepare
|
|
25
|
+
from tico.quantization.config.ptq import PTQConfig
|
|
26
|
+
from tico.quantization.evaluation.metric import compute_peir
|
|
27
|
+
from tico.quantization.wrapq.wrappers.llama.quant_decoder_layer_decode import (
|
|
28
|
+
QuantLlamaDecoderLayerDecode,
|
|
29
|
+
)
|
|
30
|
+
from tico.quantization.wrapq.wrappers.llama.quant_decoder_layer_prefill import (
|
|
31
|
+
QuantLlamaDecoderLayerPrefill,
|
|
32
|
+
)
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
@dataclass
class LayerCache:
    """
    Key/value cache buffers held by the runtime for a single decoder layer.
    """

    # Cached key states of the tokens processed so far.
    past_k: torch.Tensor
    # Cached value states of the tokens processed so far.
    past_v: torch.Tensor
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def parse_args(argv: Optional[Sequence[str]] = None):
    """
    Parse the command-line options of the static Llama layer runtime example.

    Args:
        argv: Argument strings to parse. When None (the default) argparse
            falls back to `sys.argv[1:]`, so the original zero-argument call
            keeps working unchanged. Passing an explicit list makes the
            parser usable programmatically.

    Returns:
        An `argparse.Namespace` with the attributes `model`, `max_seq`,
        `device`, `prompt`, `verify_steps`, and `gen_steps`.
    """
    parser = argparse.ArgumentParser(
        description="Static-shape Llama layer runtime with prefill/decode wrappers."
    )
    parser.add_argument(
        "--model",
        type=str,
        default="Maykeye/TinyLLama-v0",
        help="HF model name or local model path.",
    )
    parser.add_argument(
        "--max-seq",
        type=int,
        default=256,
        help="Static maximum sequence length for decode runtime.",
    )
    parser.add_argument(
        "--device",
        type=str,
        default="cpu",
        help="Execution device, e.g. cpu or cuda.",
    )
    parser.add_argument(
        "--prompt",
        type=str,
        default="The capital of France is",
        help="Prompt used for verification and greedy generation.",
    )
    parser.add_argument(
        "--verify-steps",
        type=int,
        default=6,
        help="Number of decode steps for reference verification.",
    )
    parser.add_argument(
        "--gen-steps",
        type=int,
        default=16,
        help="Maximum number of new tokens for greedy generation.",
    )
    return parser.parse_args(argv)
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def _clone_layer_with_variant(
    layer: nn.Module, variant: Literal["common", "prefill", "decode"]
) -> nn.Module:
    """
    Wrap a decoder layer for one runtime phase.

    The layer is handed to `prepare` together with a `PTQConfig` whose
    `wrapper_variant` is set to `variant`. The result is expected to be a
    PTQWrapper whose `.wrapped` attribute is a QuantLlamaDecoderLayerPrefill
    or a QuantLlamaDecoderLayerDecode, depending on the variant.
    """
    phase_config = PTQConfig(wrapper_variant=variant)
    return prepare(layer, phase_config)
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def _build_rope_templates_from_config(
|
|
101
|
+
config,
|
|
102
|
+
max_seq: int,
|
|
103
|
+
device: torch.device,
|
|
104
|
+
dtype: torch.dtype = torch.float32,
|
|
105
|
+
) -> Tuple[torch.Tensor, torch.Tensor]:
|
|
106
|
+
"""
|
|
107
|
+
Build full RoPE tables using the same simplified logic as the wrappers.
|
|
108
|
+
|
|
109
|
+
Output shapes:
|
|
110
|
+
cos: (1, max_seq, head_dim)
|
|
111
|
+
sin: (1, max_seq, head_dim)
|
|
112
|
+
"""
|
|
113
|
+
head_dim = getattr(config, "head_dim", None) or (
|
|
114
|
+
config.hidden_size // config.num_attention_heads
|
|
115
|
+
)
|
|
116
|
+
|
|
117
|
+
rope_params = getattr(config, "rope_parameters", None)
|
|
118
|
+
if (
|
|
119
|
+
rope_params is not None
|
|
120
|
+
and isinstance(rope_params, dict)
|
|
121
|
+
and "rope_theta" in rope_params
|
|
122
|
+
):
|
|
123
|
+
base = float(rope_params["rope_theta"])
|
|
124
|
+
else:
|
|
125
|
+
base = float(getattr(config, "rope_theta", 10000.0))
|
|
126
|
+
|
|
127
|
+
inv_freq = 1.0 / (
|
|
128
|
+
base
|
|
129
|
+
** (torch.arange(0, head_dim, 2, dtype=torch.float32, device=device) / head_dim)
|
|
130
|
+
)
|
|
131
|
+
|
|
132
|
+
pos = torch.arange(max_seq, dtype=torch.float32, device=device)
|
|
133
|
+
freqs = torch.outer(pos, inv_freq)
|
|
134
|
+
emb = torch.cat([freqs, freqs], dim=-1)
|
|
135
|
+
|
|
136
|
+
cos = emb.cos()
|
|
137
|
+
sin = emb.sin()
|
|
138
|
+
|
|
139
|
+
half_dim = head_dim // 2
|
|
140
|
+
sin[..., :half_dim] = -sin[..., :half_dim]
|
|
141
|
+
|
|
142
|
+
cos = cos.unsqueeze(0).to(dtype=dtype)
|
|
143
|
+
sin = sin.unsqueeze(0).to(dtype=dtype)
|
|
144
|
+
return cos, sin
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
def _slice_rope(
|
|
148
|
+
rope_cos: torch.Tensor,
|
|
149
|
+
rope_sin: torch.Tensor,
|
|
150
|
+
position: int,
|
|
151
|
+
batch_size: int,
|
|
152
|
+
device: torch.device,
|
|
153
|
+
dtype: torch.dtype,
|
|
154
|
+
) -> Tuple[torch.Tensor, torch.Tensor]:
|
|
155
|
+
"""
|
|
156
|
+
Slice one-step RoPE tensors for decode.
|
|
157
|
+
|
|
158
|
+
Output shapes:
|
|
159
|
+
cos: (B, 1, head_dim)
|
|
160
|
+
sin: (B, 1, head_dim)
|
|
161
|
+
"""
|
|
162
|
+
cos = rope_cos[:, position : position + 1, :].to(device=device, dtype=dtype)
|
|
163
|
+
sin = rope_sin[:, position : position + 1, :].to(device=device, dtype=dtype)
|
|
164
|
+
|
|
165
|
+
if batch_size != 1:
|
|
166
|
+
cos = cos.expand(batch_size, -1, -1).contiguous()
|
|
167
|
+
sin = sin.expand(batch_size, -1, -1).contiguous()
|
|
168
|
+
|
|
169
|
+
return cos, sin
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
def _build_decode_attention_mask(
|
|
173
|
+
batch_size: int,
|
|
174
|
+
past_len: int,
|
|
175
|
+
max_seq: int,
|
|
176
|
+
device: torch.device,
|
|
177
|
+
dtype: torch.dtype,
|
|
178
|
+
mask_value: float = -120.0,
|
|
179
|
+
) -> torch.Tensor:
|
|
180
|
+
"""
|
|
181
|
+
Build a fully static decode mask.
|
|
182
|
+
|
|
183
|
+
Layout assumption:
|
|
184
|
+
- past KV occupies the first `past_len` slots inside the static past buffer
|
|
185
|
+
- padded past slots are masked
|
|
186
|
+
- current token is appended internally by the attention module at the last slot
|
|
187
|
+
|
|
188
|
+
Returned shape:
|
|
189
|
+
(B, 1, max_seq)
|
|
190
|
+
|
|
191
|
+
Valid columns:
|
|
192
|
+
[0, 1, ..., past_len - 1, max_seq - 1]
|
|
193
|
+
Masked columns:
|
|
194
|
+
[past_len, ..., max_seq - 2]
|
|
195
|
+
"""
|
|
196
|
+
mask = torch.full((batch_size, 1, max_seq), mask_value, device=device, dtype=dtype)
|
|
197
|
+
|
|
198
|
+
if past_len > 0:
|
|
199
|
+
mask[:, :, :past_len] = 0.0
|
|
200
|
+
|
|
201
|
+
mask[:, :, max_seq - 1] = 0.0
|
|
202
|
+
return mask
|
|
203
|
+
|
|
204
|
+
|
|
205
|
+
class StaticLlamaLayerRuntime:
    """
    Hybrid runtime that uses:
    - wrapped decoder layers for prefill and decode
    - original embedding / final norm / lm_head on CPU or a chosen device

    This runtime enforces static decode shapes:
        hidden_states: (B, 1, D)
        attention_mask: (B, 1, max_seq)
        past_key_value: (B, n_kv, max_seq - 1, head_dim)
        position_embeddings: (B, 1, head_dim)

    Typical use: call `prefill()` once on the prompt, then `decode_one()` for
    every new token. `generate_greedy()` and `verify_against_reference()`
    drive that sequence end to end.
    """

    def __init__(
        self,
        model: AutoModelForCausalLM,
        tokenizer: AutoTokenizer,
        max_seq: int,
        device: str = "cpu",
        prefill_layers: Optional[Sequence[nn.Module]] = None,
        decode_layers: Optional[Sequence[nn.Module]] = None,
    ):
        """
        Build the runtime around a causal LM.

        Args:
            model: Reference model; it is switched to eval mode and moved to
                `device`. Its embedding, final norm, lm_head and decoder
                layers are reused by the runtime.
            tokenizer: Tokenizer used by the prompt-based helper methods.
            max_seq: Static sequence length of the decode window.
            device: Device string for all runtime tensors and modules.
            prefill_layers: Optional pre-built prefill wrappers. When None,
                every reference layer is wrapped with the "prefill" variant.
            decode_layers: Optional pre-built decode wrappers. When None,
                every reference layer is wrapped with the "decode" variant.

        Raises:
            AssertionError: If a prefill/decode layer has no `.wrapped`
                attribute or wraps an unexpected module type.
        """
        self.model = model.eval().to(device)
        self.tokenizer = tokenizer
        self.max_seq = max_seq
        self.device = torch.device(device)

        self.embed_tokens = self.model.model.embed_tokens
        self.final_norm = self.model.model.norm
        self.lm_head = self.model.lm_head
        self.layers_ref = self.model.model.layers

        if prefill_layers is None:
            self.prefill_layers = nn.ModuleList(
                [
                    _clone_layer_with_variant(layer, "prefill")
                    for layer in self.layers_ref
                ]
            ).to(self.device)
        else:
            self.prefill_layers = nn.ModuleList(prefill_layers).to(self.device)

        if decode_layers is None:
            self.decode_layers = nn.ModuleList(
                [
                    _clone_layer_with_variant(layer, "decode")
                    for layer in self.layers_ref
                ]
            ).to(self.device)
        else:
            self.decode_layers = nn.ModuleList(decode_layers).to(self.device)

        # Fail early if the supplied / generated wrappers are of the wrong kind.
        for layer in self.prefill_layers:
            assert hasattr(layer, "wrapped")
            assert isinstance(layer.wrapped, QuantLlamaDecoderLayerPrefill)

        for layer in self.decode_layers:
            assert hasattr(layer, "wrapped")
            assert isinstance(layer.wrapped, QuantLlamaDecoderLayerDecode)

        self.config = self.model.config
        self.hidden_size = self.config.hidden_size
        self.num_hidden_layers = self.config.num_hidden_layers
        self.num_kv_heads = self.config.num_key_value_heads
        self.head_dim = getattr(self.config, "head_dim", None) or (
            self.hidden_size // self.config.num_attention_heads
        )

        # Full-length RoPE tables; decode slices one position out per step.
        self.rope_cos, self.rope_sin = _build_rope_templates_from_config(
            self.config,
            max_seq=self.max_seq,
            device=self.device,
            dtype=torch.float32,
        )

        self.layer_caches: List[LayerCache] = []
        self.past_len = 0

    def reset_cache(self) -> None:
        """
        Reset all runtime KV caches.

        Drops the per-layer buffers and sets the number of cached tokens
        back to zero; `prefill()` must be called again before decoding.
        """
        self.layer_caches = []
        self.past_len = 0

    def _allocate_empty_cache(
        self, batch_size: int, dtype: torch.dtype
    ) -> List[LayerCache]:
        """
        Allocate external static KV buffers for all layers.

        The runtime stores only past tokens in these buffers.
        The current token is always produced as a delta by the decode wrapper.

        Returns:
            One zero-filled `LayerCache` per hidden layer, each buffer shaped
            (batch_size, num_kv_heads, max_seq - 1, head_dim).
        """
        caches = []
        for _ in range(self.num_hidden_layers):
            past_k = torch.zeros(
                batch_size,
                self.num_kv_heads,
                self.max_seq - 1,
                self.head_dim,
                device=self.device,
                dtype=dtype,
            )
            past_v = torch.zeros_like(past_k)
            caches.append(LayerCache(past_k=past_k, past_v=past_v))
        return caches

    @torch.no_grad()
    def prefill(self, input_ids: torch.LongTensor) -> torch.Tensor:
        """
        Run the prompt through all prefill layers and initialize static decode caches.

        Input:
            input_ids: (B, L)

        Returns:
            logits_last: (B, vocab_size)

        Raises:
            AssertionError: If `input_ids` is not 2-D, if L >= max_seq, or if
                a layer returns key/value states whose sequence length is
                not L.
        """
        assert (
            input_ids.dim() == 2
        ), f"Expected input_ids as (B, L), got {tuple(input_ids.shape)}"
        batch_size, prompt_len = input_ids.shape
        assert prompt_len < self.max_seq, (
            f"Prompt length must be < max_seq so that decode still has one current slot. "
            f"Got prompt_len={prompt_len}, max_seq={self.max_seq}"
        )

        hidden_states = self.embed_tokens(input_ids.to(self.device))
        runtime_dtype = hidden_states.dtype

        self.layer_caches = self._allocate_empty_cache(batch_size, runtime_dtype)

        for layer_idx, layer in enumerate(self.prefill_layers):
            out = layer(
                hidden_states=hidden_states,
                attention_mask=None,
                position_embeddings=None,
                use_cache=True,
            )

            hidden_states, present_key_value = out
            present_k, present_v = present_key_value

            assert present_k.size(2) == prompt_len
            assert present_v.size(2) == prompt_len

            # Copy the prompt KV into the leading slots of the static buffers.
            self.layer_caches[layer_idx].past_k[:, :, :prompt_len, :] = present_k
            self.layer_caches[layer_idx].past_v[:, :, :prompt_len, :] = present_v

        self.past_len = prompt_len

        hidden_states = self.final_norm(hidden_states)
        logits = self.lm_head(hidden_states)
        logits_last = logits[:, -1, :]
        return logits_last

    @torch.no_grad()
    def decode_one(self, input_ids: torch.LongTensor) -> torch.Tensor:
        """
        Run one decode step with strict static input shapes.

        Input:
            input_ids: (B, 1)

        Returns:
            logits_last: (B, vocab_size)

        Side effects:
            Writes the new key/value states into every layer cache at slot
            `past_len` and then increments `past_len`.

        Raises:
            AssertionError: If `input_ids` is not (B, 1), if `prefill()` has
                not been called, or if `past_len` has reached `max_seq`.
        """
        assert (
            input_ids.dim() == 2 and input_ids.size(1) == 1
        ), f"Decode expects input_ids as (B, 1), got {tuple(input_ids.shape)}"
        assert (
            len(self.layer_caches) == self.num_hidden_layers
        ), "Caches are not initialized. Call prefill() first."
        assert (
            self.past_len < self.max_seq
        ), f"Decode position overflow: past_len={self.past_len}, max_seq={self.max_seq}"

        batch_size = input_ids.size(0)
        hidden_states = self.embed_tokens(input_ids.to(self.device))

        attention_mask = _build_decode_attention_mask(
            batch_size=batch_size,
            past_len=self.past_len,
            max_seq=self.max_seq,
            device=self.device,
            dtype=hidden_states.dtype,
        )

        # The current token sits at absolute position `past_len`.
        position_embeddings = _slice_rope(
            self.rope_cos,
            self.rope_sin,
            position=self.past_len,
            batch_size=batch_size,
            device=self.device,
            dtype=hidden_states.dtype,
        )

        for layer_idx, layer in enumerate(self.decode_layers):
            cache = self.layer_caches[layer_idx]

            out = layer(
                hidden_states=hidden_states,
                attention_mask=attention_mask,
                past_key_value=(cache.past_k, cache.past_v),
                position_embeddings=position_embeddings,
                use_cache=True,
            )

            hidden_states, present_key_value = out
            new_k, new_v = present_key_value

            # Append the one-token delta behind the already cached tokens.
            cache.past_k[:, :, self.past_len : self.past_len + 1, :] = new_k
            cache.past_v[:, :, self.past_len : self.past_len + 1, :] = new_v

        self.past_len += 1

        hidden_states = self.final_norm(hidden_states)
        logits = self.lm_head(hidden_states)
        logits_last = logits[:, -1, :]
        return logits_last

    @torch.no_grad()
    def generate_greedy(
        self,
        prompt: str,
        max_new_tokens: int,
        eos_token_id: Optional[int] = None,
    ) -> torch.LongTensor:
        """
        Greedy generation using prefill once and then decode-only static steps.

        Args:
            prompt: Text to tokenize and prefill.
            max_new_tokens: Upper bound on the number of generated tokens.
            eos_token_id: Token id that ends generation; defaults to the
                tokenizer's `eos_token_id`.

        Returns:
            The prompt token ids followed by the generated token ids.

        Generation stops early when every sequence emits the EOS token or
        when the static decode window is exhausted (`past_len == max_seq`),
        instead of tripping the overflow assertion in `decode_one()`.
        """
        batch = self.tokenizer(prompt, return_tensors="pt", add_special_tokens=True)
        input_ids = batch["input_ids"].to(self.device)

        if eos_token_id is None:
            eos_token_id = self.tokenizer.eos_token_id

        self.reset_cache()
        logits = self.prefill(input_ids)

        generated = input_ids.clone()

        for _ in range(max_new_tokens):
            next_token = torch.argmax(logits, dim=-1, keepdim=True)
            generated = torch.cat([generated, next_token], dim=1)

            if eos_token_id is not None and torch.all(next_token == eos_token_id):
                break

            # No decode position is left; another step would overflow.
            if self.past_len >= self.max_seq:
                break

            logits = self.decode_one(next_token)

        return generated

    @torch.no_grad()
    def verify_against_reference(
        self,
        prompt: str,
        steps: int = 8,
        verbose: bool = True,
    ) -> None:
        """
        Compare runtime logits step-by-step against the full reference model.

        This verifies runtime correctness, not export correctness.
        If the wrapped layers are still FP-like, the mismatch should be tiny.
        If they were converted to quantized mode, some quantization error is expected.

        Args:
            prompt: Text to tokenize and feed to both paths.
            steps: Maximum number of decode steps to compare after prefill.
            verbose: When True, print mean/max absolute difference and PEIR
                for every step.

        The loop ends early once the generated sequence reaches `max_seq`.
        """
        batch = self.tokenizer(prompt, return_tensors="pt", add_special_tokens=True)
        input_ids = batch["input_ids"].to(self.device)

        self.reset_cache()

        logits_rt = self.prefill(input_ids)
        ref_out = self.model(input_ids=input_ids)
        logits_ref = ref_out.logits[:, -1, :]

        diff = (logits_rt - logits_ref).abs()
        mean_diff = diff.mean().item()
        max_diff = diff.max().item()

        if verbose:
            print("=" * 100)
            print("Step 0: prefill last-token logits")
            print(f"mean|diff| = {mean_diff:.8f}")
            print(f" max|diff| = {max_diff:.8f}")
            print(f"PEIR = {compute_peir(logits_rt, logits_ref) * 100:.6f} %")

        # Both paths follow the runtime's greedy choice so they see the same ids.
        generated = input_ids.clone()
        next_token = torch.argmax(logits_rt, dim=-1, keepdim=True)
        generated = torch.cat([generated, next_token], dim=1)

        for step in range(1, steps + 1):
            logits_rt = self.decode_one(next_token)

            # The reference model re-processes the full sequence every step.
            ref_out = self.model(input_ids=generated)
            logits_ref = ref_out.logits[:, -1, :]

            diff = (logits_rt - logits_ref).abs()
            mean_diff = diff.mean().item()
            max_diff = diff.max().item()

            if verbose:
                print("-" * 100)
                print(f"Step {step}: decode logits")
                print(f"sequence length = {generated.size(1)}")
                print(f"mean|diff| = {mean_diff:.8f}")
                print(f" max|diff| = {max_diff:.8f}")
                print(f"PEIR = {compute_peir(logits_rt, logits_ref) * 100:.6f} %")

            next_token = torch.argmax(logits_rt, dim=-1, keepdim=True)
            generated = torch.cat([generated, next_token], dim=1)

            if generated.size(1) >= self.max_seq:
                if verbose:
                    print("-" * 100)
                    print("Stopped because the static decode window is full.")
                break

        if verbose:
            print("=" * 100)
            print("Verification finished.")

    @torch.no_grad()
    def dump_decode_inputs(
        self,
        input_id: int,
    ) -> Tuple[torch.Tensor, torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
        """
        Prepare one-step decode inputs without running the layers.

        This is useful when debugging export/runtime parity.

        Args:
            input_id: Single token id; it is embedded as a batch of size 1.

        Returns:
            (hidden_states, attention_mask, (cos, sin)) built for the current
            `past_len`. The runtime state is not modified.
        """
        x = torch.tensor([[input_id]], device=self.device, dtype=torch.long)
        hidden_states = self.embed_tokens(x)

        attention_mask = _build_decode_attention_mask(
            batch_size=1,
            past_len=self.past_len,
            max_seq=self.max_seq,
            device=self.device,
            dtype=hidden_states.dtype,
        )

        position_embeddings = _slice_rope(
            self.rope_cos,
            self.rope_sin,
            position=self.past_len,
            batch_size=1,
            device=self.device,
            dtype=hidden_states.dtype,
        )

        return hidden_states, attention_mask, position_embeddings
|
|
559
|
+
|
|
560
|
+
|
|
561
|
+
def main():
    """
    Entry point of the example script.

    Loads the model and tokenizer named on the command line, builds a
    StaticLlamaLayerRuntime, prints the step-by-step comparison against the
    reference model, and finally prints a greedy completion of the prompt.
    """
    args = parse_args()
    torch.set_grad_enabled(False)

    model = AutoModelForCausalLM.from_pretrained(args.model, dtype=torch.float32)
    model = model.to(args.device)
    tokenizer = AutoTokenizer.from_pretrained(args.model, legacy=False)

    # Fall back to the EOS token when the tokenizer defines no pad token.
    if tokenizer.pad_token_id is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Align the model's configured context length with the static window.
    model.config.max_position_embeddings = args.max_seq

    runtime = StaticLlamaLayerRuntime(
        model=model,
        tokenizer=tokenizer,
        max_seq=args.max_seq,
        device=args.device,
    )

    runtime.verify_against_reference(
        prompt=args.prompt, steps=args.verify_steps, verbose=True
    )

    out_ids = runtime.generate_greedy(
        prompt=args.prompt,
        max_new_tokens=args.gen_steps,
        eos_token_id=tokenizer.eos_token_id,
    )

    print("=" * 100)
    print("Generated text:")
    generated_text = tokenizer.decode(out_ids[0], skip_special_tokens=True)
    print(generated_text)
|
|
601
|
+
|
|
602
|
+
|
|
603
|
+
if __name__ == "__main__":
|
|
604
|
+
main()
|
|
@@ -120,6 +120,7 @@ tico/quantization/wrapq/examples/quantize_full_qmodel_with_gptq.py
|
|
|
120
120
|
tico/quantization/wrapq/examples/quantize_full_vlm_model_with_gptq.py
|
|
121
121
|
tico/quantization/wrapq/examples/quantize_qwen3_vl_with_gptq.py
|
|
122
122
|
tico/quantization/wrapq/examples/quantize_with_gptq.py
|
|
123
|
+
tico/quantization/wrapq/examples/static_llama_layer_runtime.py
|
|
123
124
|
tico/quantization/wrapq/examples/llama/__init__.py
|
|
124
125
|
tico/quantization/wrapq/examples/llama/quantize_attn_decode.py
|
|
125
126
|
tico/quantization/wrapq/examples/llama/quantize_attn_prefill.py
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
__version__ = "0.2.0.dev260326"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|