PyPI - ai-edge-torch-nightly - Versions diffs - 0.7.0.dev20251007__py3-none-any.whl → 0.8.0.dev20251225__py3-none-any.whl - Mend

ai-edge-torch-nightly 0.7.0.dev20251007__py3-none-any.whl → 0.8.0.dev20251225__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of ai-edge-torch-nightly might be problematic. Click here for more details.

Files changed (42) hide show

ai_edge_torch/generative/quantize/quant_recipe_utils.py CHANGED Viewed

@@ -32,23 +32,29 @@ from ai_edge_torch.generative.quantize import quant_attrs
 from ai_edge_torch.generative.quantize import quant_recipe
-def create_layer_quant_int8_dynamic() -> quant_recipe.LayerQuantRecipe:
+def create_layer_quant_dynamic(
+    weight_dtype: quant_attrs.Dtype = quant_attrs.Dtype.INT8,
+    granularity: quant_attrs.Granularity = quant_attrs.Granularity.CHANNELWISE,
+) -> quant_recipe.LayerQuantRecipe:
   return quant_recipe.LayerQuantRecipe(
       activation_dtype=quant_attrs.Dtype.FP32,
-      weight_dtype=quant_attrs.Dtype.INT8,
+      weight_dtype=weight_dtype,
       mode=quant_attrs.Mode.DYNAMIC_RANGE,
       algorithm=quant_attrs.Algorithm.MIN_MAX,
-      granularity=quant_attrs.Granularity.CHANNELWISE,
+      granularity=granularity,
   )
-def create_layer_quant_int8_weight_only() -> quant_recipe.LayerQuantRecipe:
+def create_layer_quant_weight_only(
+    weight_dtype: quant_attrs.Dtype = quant_attrs.Dtype.INT8,
+    granularity: quant_attrs.Granularity = quant_attrs.Granularity.CHANNELWISE,
+) -> quant_recipe.LayerQuantRecipe:
   return quant_recipe.LayerQuantRecipe(
       activation_dtype=quant_attrs.Dtype.FP32,
-      weight_dtype=quant_attrs.Dtype.INT8,
+      weight_dtype=weight_dtype,
       mode=quant_attrs.Mode.WEIGHT_ONLY,
       algorithm=quant_attrs.Algorithm.MIN_MAX,
-      granularity=quant_attrs.Granularity.CHANNELWISE,
+      granularity=granularity,
   )
@@ -60,16 +66,3 @@ def create_layer_quant_fp16() -> quant_recipe.LayerQuantRecipe:
       algorithm=quant_attrs.Algorithm.FLOAT_CAST,
       granularity=quant_attrs.Granularity.NONE,
   )
-def create_layer_quant_int4_dynamic_block(
-    block_size: int,
-) -> quant_recipe.LayerQuantRecipe:
-  return quant_recipe.LayerQuantRecipe(
-      activation_dtype=quant_attrs.Dtype.FP32,
-      weight_dtype=quant_attrs.Dtype.INT4,
-      mode=quant_attrs.Mode.DYNAMIC_RANGE,
-      algorithm=quant_attrs.Algorithm.MIN_MAX,
-      granularity=quant_attrs.Granularity.BLOCKWISE,
-      block_size=block_size,
-  )

ai_edge_torch/generative/quantize/quant_recipes.py CHANGED Viewed

@@ -29,35 +29,44 @@ Typical usage example:
 from typing import Optional
 from ai_edge_torch.generative.layers import model_config
+from ai_edge_torch.generative.quantize import quant_attrs
 from ai_edge_torch.generative.quantize import quant_recipe
 from ai_edge_torch.generative.quantize import quant_recipe_utils
 from ai_edge_torch.quantize import quant_config
-def full_int8_dynamic_recipe(
-    mcfg: Optional[model_config.ModelConfig] = None,
+def full_dynamic_recipe(
+    mcfg: model_config.ModelConfig | None = None,
+    weight_dtype: quant_attrs.Dtype = quant_attrs.Dtype.INT8,
+    granularity: quant_attrs.Granularity = quant_attrs.Granularity.CHANNELWISE,
 ) -> quant_config.QuantConfig:
   return quant_config.QuantConfig(
       generative_recipe=quant_recipe.GenerativeQuantRecipe(
-          default=quant_recipe_utils.create_layer_quant_int8_dynamic(),
+          default=quant_recipe_utils.create_layer_quant_dynamic(
+              weight_dtype, granularity
+          ),
           _model_config=mcfg,
       )
   )
-def full_int8_weight_only_recipe(
-    mcfg: Optional[model_config.ModelConfig] = None,
+def full_weight_only_recipe(
+    mcfg: model_config.ModelConfig | None = None,
+    weight_dtype: quant_attrs.Dtype = quant_attrs.Dtype.INT8,
+    granularity: quant_attrs.Granularity = quant_attrs.Granularity.CHANNELWISE,
 ) -> quant_config.QuantConfig:
   return quant_config.QuantConfig(
       generative_recipe=quant_recipe.GenerativeQuantRecipe(
-          default=quant_recipe_utils.create_layer_quant_int8_weight_only(),
+          default=quant_recipe_utils.create_layer_quant_weight_only(
+              weight_dtype, granularity
+          ),
           _model_config=mcfg,
       )
   )
 def full_fp16_recipe(
-    mcfg: Optional[model_config.ModelConfig] = None,
+    mcfg: model_config.ModelConfig | None = None,
 ) -> quant_config.QuantConfig:
   return quant_config.QuantConfig(
       generative_recipe=quant_recipe.GenerativeQuantRecipe(
@@ -65,17 +74,3 @@ def full_fp16_recipe(
           _model_config=mcfg,
       )
   )
-def all_supported_int4_dynamic_block_recipe(
-    block_size: int,
-    mcfg: Optional[model_config.ModelConfig] = None,
-) -> quant_config.QuantConfig:
-  return quant_config.QuantConfig(
-      generative_recipe=quant_recipe.GenerativeQuantRecipe(
-          default=quant_recipe_utils.create_layer_quant_int4_dynamic_block(
-              block_size
-          ),
-          _model_config=mcfg,
-      )
-  )

ai_edge_torch/generative/quantize/supported_schemes.py CHANGED Viewed

@@ -29,5 +29,8 @@ def get_supported_layer_schemes():
       (_t.FP32, _t.INT8, _m.DYNAMIC_RANGE, _a.MIN_MAX, _g.CHANNELWISE),
       (_t.FP32, _t.INT8, _m.WEIGHT_ONLY, _a.MIN_MAX, _g.CHANNELWISE),
       (_t.FP32, _t.FP16, _m.WEIGHT_ONLY, _a.FLOAT_CAST, _g.NONE),
-      (_t.FP32, _t.INT4, _m.DYNAMIC_RANGE, _a.MIN_MAX, _g.BLOCKWISE),
+      (_t.FP32, _t.INT4, _m.DYNAMIC_RANGE, _a.MIN_MAX, _g.BLOCKWISE_32),
+      (_t.FP32, _t.INT4, _m.DYNAMIC_RANGE, _a.MIN_MAX, _g.BLOCKWISE_64),
+      (_t.FP32, _t.INT4, _m.DYNAMIC_RANGE, _a.MIN_MAX, _g.BLOCKWISE_128),
+      (_t.FP32, _t.INT4, _m.DYNAMIC_RANGE, _a.MIN_MAX, _g.BLOCKWISE_256),
   ]

ai_edge_torch/generative/test/test_kv_cache.py CHANGED Viewed

@@ -41,6 +41,20 @@ class TestKVLayers(googletest.TestCase):
     )
     return config
+  def _assert_kv_cache_entry_equal(self, kv1, kv2):
+    self.assertIsInstance(kv1, kv_utils.KVCacheEntry)
+    self.assertIsInstance(kv2, kv_utils.KVCacheEntry)
+    self.assertEqual(kv1.kv_layout, kv2.kv_layout)
+    self.assertTrue(torch.equal(kv1.k_cache, kv2.k_cache))
+    self.assertTrue(torch.equal(kv1.v_cache, kv2.v_cache))
+  def _assert_kv_cache_equal(self, kv1, kv2):
+    self.assertIsInstance(kv1, kv_utils.KVCache)
+    self.assertIsInstance(kv2, kv_utils.KVCache)
+    self.assertEqual(len(kv1.caches), len(kv2.caches))
+    for kv1_entry, kv2_entry in zip(kv1.caches, kv2.caches):
+      self._assert_kv_cache_entry_equal(kv1_entry, kv2_entry)
   def test_cache_udpate(self):
     N = 1
     HEAD_DIM = 2
@@ -118,7 +132,7 @@ class TestKVLayers(googletest.TestCase):
     flat, treespec = pytree.tree_flatten(kv)
     self.assertLen(flat, NUM_LAYERS * 2)
     kv_unflat = pytree.tree_unflatten(flat, treespec)
-    self.assertEqual(kv, kv_unflat)
+    self._assert_kv_cache_equal(kv, kv_unflat)
   def test_pytree_roundtrip_kv_cache_derived(self):
     NUM_LAYERS = 4
@@ -134,7 +148,7 @@ class TestKVLayers(googletest.TestCase):
     flat, treespec = pytree.tree_flatten(kv)
     self.assertLen(flat, NUM_LAYERS * 2)
     kv_unflat = pytree.tree_unflatten(flat, treespec)
-    self.assertEqual(kv, kv_unflat)
+    self._assert_kv_cache_equal(kv, kv_unflat)
   def test_pytree_roundtrip_kv_entry(self):
     attn_config = cfg.AttentionConfig(
@@ -144,8 +158,7 @@ class TestKVLayers(googletest.TestCase):
     flat, treespec = pytree.tree_flatten(kv)
     self.assertLen(flat, 2)
     kv_unflat = pytree.tree_unflatten(flat, treespec)
-    self.assertEqual(kv, kv_unflat)
-    self.assertIsInstance(kv_unflat, kv_utils.KVCacheEntry)
+    self._assert_kv_cache_entry_equal(kv, kv_unflat)
   def test_pytree_roundtrip_kv_entry_derived(self):
     attn_config = cfg.AttentionConfig(
@@ -157,8 +170,7 @@ class TestKVLayers(googletest.TestCase):
     flat, treespec = pytree.tree_flatten(kv)
     self.assertLen(flat, 2)
     kv_unflat = pytree.tree_unflatten(flat, treespec)
-    self.assertEqual(kv, kv_unflat)
-    self.assertIsInstance(kv_unflat, kv_utils.KVCacheEntry)
+    self._assert_kv_cache_entry_equal(kv, kv_unflat)
 if __name__ == "__main__":

ai_edge_torch/generative/test/test_quantize.py CHANGED Viewed

@@ -79,18 +79,18 @@ class TestVerifyRecipes(parameterized.TestCase):
           Dtype.INT4,
           Mode.DYNAMIC_RANGE,
           Algorithm.MIN_MAX,
-          Granularity.BLOCKWISE,
-          32,
+          Granularity.BLOCKWISE_32,
+      ),
+      (
+          Dtype.FP32,
+          Dtype.INT4,
+          Mode.DYNAMIC_RANGE,
+          Algorithm.MIN_MAX,
+          Granularity.BLOCKWISE_128,
       ),
   ])
   def test_verify_valid_recipes(
-      self,
-      activation,
-      weight,
-      mode,
-      algo,
-      granularity,
-      block_size=None,
+      self, activation, weight, mode, algo, granularity
   ):
     quant_recipe.LayerQuantRecipe(
         activation, weight, mode, algo, granularity
@@ -108,21 +108,21 @@ class TestQuantizeConvert(parameterized.TestCase):
   def _attention_int8_dynamic_recipe() -> quant_config.QuantConfig:
     return quant_config.QuantConfig(
         generative_recipe=quant_recipe.GenerativeQuantRecipe(
-            attention=quant_recipe_utils.create_layer_quant_int8_dynamic(),
+            attention=quant_recipe_utils.create_layer_quant_dynamic(),
         )
     )
   def _feedforward_int8_dynamic_recipe() -> quant_config.QuantConfig:
     return quant_config.QuantConfig(
         generative_recipe=quant_recipe.GenerativeQuantRecipe(
-            feedforward=quant_recipe_utils.create_layer_quant_int8_dynamic(),
+            feedforward=quant_recipe_utils.create_layer_quant_dynamic(),
         )
     )
   @parameterized.parameters([
       (quant_recipes.full_fp16_recipe()),
-      (quant_recipes.full_int8_dynamic_recipe()),
-      (quant_recipes.full_int8_weight_only_recipe()),
+      (quant_recipes.full_dynamic_recipe()),
+      (quant_recipes.full_weight_only_recipe()),
       (_attention_int8_dynamic_recipe()),
       (_feedforward_int8_dynamic_recipe()),
   ])
@@ -148,7 +148,7 @@ class TestQuantizeConvert(parameterized.TestCase):
     idx = torch.unsqueeze(torch.arange(0, 100, dtype=torch.int), 0)
     input_pos = torch.arange(0, 100, dtype=torch.int)
-    quant_config = quant_recipes.full_int8_dynamic_recipe()
+    quant_config = quant_recipes.full_dynamic_recipe()
     quantized_model = ai_edge_torch.convert(
         pytorch_model, (idx, input_pos), quant_config=quant_config
     )
@@ -164,7 +164,9 @@ class TestQuantizeConvert(parameterized.TestCase):
     pytorch_model = toy_model.ToySingleLayerModel(config)
     idx = torch.unsqueeze(torch.arange(0, 100, dtype=torch.int), 0)
     input_pos = torch.arange(0, 100, dtype=torch.int)
-    quant_config = quant_recipes.all_supported_int4_dynamic_block_recipe(32)
+    quant_config = quant_recipes.full_dynamic_recipe(
+        weight_dtype=Dtype.INT4, granularity=Granularity.BLOCKWISE_32
+    )
     quantized_model = ai_edge_torch.convert(
         pytorch_model, (idx, input_pos), quant_config=quant_config
     )
@@ -175,17 +177,6 @@ class TestQuantizeConvert(parameterized.TestCase):
         "Quantized model isn't smaller than F32 model.",
     )
-  def test_unsupported_block_size(self):
-    config = toy_model.get_model_config()
-    pytorch_model = toy_model.ToySingleLayerModel(config)
-    idx = torch.unsqueeze(torch.arange(0, 100, dtype=torch.int), 0)
-    input_pos = torch.arange(0, 100, dtype=torch.int)
-    self.assertRaises(
-        ValueError,
-        quant_recipes.all_supported_int4_dynamic_block_recipe,
-        36,
-    )
   def test_quantize_convert_compare_toy(self):
     self.skipTest("b/338288901")
     config = toy_model_with_kv_cache.get_model_config()

ai_edge_torch/generative/utilities/converter.py CHANGED Viewed

@@ -25,6 +25,7 @@ from ai_edge_torch._convert import converter as converter_utils
 from ai_edge_torch.generative.layers import kv_cache as kv_utils
 from ai_edge_torch.generative.layers import lora as lora_utils
 import ai_edge_torch.generative.layers.model_config as cfg
+from ai_edge_torch.generative.quantize import quant_attrs
 from ai_edge_torch.generative.quantize import quant_recipes
 from ai_edge_torch.generative.utilities import export_config as export_config_lib
 from ai_edge_torch.generative.utilities import litertlm_builder
@@ -143,9 +144,23 @@ def define_conversion_flags(
       '`prefill_seq_lens` as the maximum of kv_cache size and prefill lengths '
       'in the graph.',
   )
+  flags.DEFINE_bool(
+      'export_gpu_dynamic_shape_verifications',
+      False,
+      'If true, the conversion script will export signatures used only for '
+      'verification of GPU dynamic shapes.',
+  )
   return flags
+# Context length for verifying GPU dynamic shapes.
+_CONTEXT_LENGTH_TO_VERIFY_MAGIC_NUMBERS = 1280
+# Long prefill length for verifying GPU dynamic shapes.
+_LONG_PREFILL_LENGTH_TO_VERIFY_MAGIC_NUMBERS = 1024
+# Short prefill length for verifying GPU dynamic shapes.
+_SHORT_PREFILL_LENGTH_TO_VERIFY_MAGIC_NUMBERS = 64
 def is_magic_number_(num: int) -> bool:
   """Returns true if the number is a magic number, i.e. prime number > 10."""
   if num < 10:
@@ -193,18 +208,22 @@ def get_quant_recipe_from_flag(
     case QuantizationName.NONE:
       return None
     case QuantizationName.DYNAMIC_INT8:
-      return quant_recipes.full_int8_dynamic_recipe(mcfg=model_config)
+      return quant_recipes.full_dynamic_recipe(mcfg=model_config)
     case QuantizationName.WEIGHT_ONLY_INT8:
-      return quant_recipes.full_int8_weight_only_recipe(mcfg=model_config)
+      return quant_recipes.full_weight_only_recipe(mcfg=model_config)
     case QuantizationName.FP16:
       return quant_recipes.full_fp16_recipe()
     case QuantizationName.DYNAMIC_INT4_BLOCK32:
-      return quant_recipes.all_supported_int4_dynamic_block_recipe(
-          32, mcfg=model_config
+      return quant_recipes.full_dynamic_recipe(
+          mcfg=model_config,
+          weight_dtype=quant_attrs.Dtype.INT4,
+          granularity=quant_attrs.Granularity.BLOCKWISE_32,
       )
     case QuantizationName.DYNAMIC_INT4_BLOCK128:
-      return quant_recipes.all_supported_int4_dynamic_block_recipe(
-          128, mcfg=model_config
+      return quant_recipes.full_dynamic_recipe(
+          mcfg=model_config,
+          weight_dtype=quant_attrs.Dtype.INT4,
+          granularity=quant_attrs.Granularity.BLOCKWISE_128,
       )
     case _:
       raise ValueError(f'Unsupported quantization flag: {quantize}')
@@ -263,6 +282,10 @@ def convert_to_tflite(
     config: cfg.ModelConfig = None,
     lora_ranks: Optional[list[int]] = None,
     export_config: ExportConfig = None,
+    extra_model: torch.nn.Module = None,
+    extra_prefill_seq_lens: list[int] = None,
+    extra_kv_cache_max_len: int = 0,
+    extra_signature_prefix: str = '',
 ):
   """Converts a nn.Module model to multi-signature tflite model.
@@ -315,6 +338,15 @@ def convert_to_tflite(
         no LoRA signatures will be added.
       export_config (ExportConfig, optional): The export configuration. If None,
         it uses the default export configuration.
+      extra_model (torch.nn.Module, optional): PyTorch model to export in
+        addition to the pytorch_model. This model can have different
+        prefill_seq_lens and kv_cache_max_len.
+      extra_prefill_seq_lens (list[int], optional): The prefill sequence
+        lengths for extra_model. Meaningful only when extra_model is not None.
+      extra_kv_cache_max_len (int, optional): The maximum size of KV cache
+        buffer for extra_model. Meaningful only when extra_model is not None.
+      extra_signature_prefix (str, optional): The prefix of the extra model
+        signatures. Meaningful only when extra_model is not None.
   """
   # pylint: disable=protected-access
   torch._dynamo.config.cache_size_limit = 64
@@ -353,32 +385,51 @@ def convert_to_tflite(
   )
   output_file = os.path.join(output_path, output_filename)
-  _export_helper(
+  converter = converter_utils.Converter()
+  _add_signatures(
+      converter,
       pytorch_model,
-      output_file,
       prefill_seq_lens,
       kv_cache_max_len,
       pixel_values_size,
       pixel_seq_len,
-      quantize,
       config,
       loras,
       export_config,
   )
+  if extra_model is not None and extra_prefill_seq_lens:
+    _add_signatures(
+        converter,
+        extra_model,
+        extra_prefill_seq_lens,
+        extra_kv_cache_max_len,
+        pixel_values_size,
+        pixel_seq_len,
+        config,
+        loras,
+        export_config,
+        signature_prefix=extra_signature_prefix,
+    )
+  edge_model = converter.convert(
+      quant_config=get_quant_recipe_from_flag(quantize, config),
+  )
+  edge_model.export(output_file)
   return output_file
-def _export_helper(
+def _add_signatures(
+    converter: converter_utils.Converter,
     pytorch_model: torch.nn.Module,
-    output_file: str,
     prefill_seq_lens: list[int],
     kv_cache_max_len: int,
     pixel_values_size: torch.Size,
     pixel_seq_len: int,
-    quantize: str,
     config: cfg.ModelConfig,
     loras: list[None | lora_utils.LoRA],
     export_config: ExportConfig,
+    signature_prefix: str = '',
 ):
   """Helper function to export a model to tflite."""
   prefill_tokens_list = []
@@ -423,17 +474,14 @@ def _export_helper(
       kv_layout=export_config.kvcache_layout,
   )
-  quant_config = get_quant_recipe_from_flag(quantize, config)
   # For export, we create a module that captures any non-exportable,
   # arugments, e.g. the generation config object.
   mod = ExportableModule(pytorch_model, export_config=export_config).eval()
-  converter = converter_utils.Converter()
   for lora in loras:
     for i in range(len(prefill_seq_lens)):
       prefill_seq_len = prefill_seq_lens[i]
-      prefill_signature_name = f'prefill_{prefill_seq_len}'
+      prefill_signature_name = f'{signature_prefix}prefill_{prefill_seq_len}'
       sample_kwargs = {
           'tokens': prefill_tokens_list[i],
@@ -488,17 +536,15 @@ def _export_helper(
     if lora is not None:
       sample_kwargs['lora'] = lora
+    decode_signature_name = f'{signature_prefix}decode'
+    if lora is not None:
+      decode_signature_name += f'_lora_r{lora.get_rank()}'
     converter.add_signature(
-        'decode' if lora is None else f'decode_lora_r{lora.get_rank()}',
+        decode_signature_name,
         mod,
         sample_kwargs=sample_kwargs,
     )
-  edge_model = converter.convert(
-      quant_config=quant_config,
-  )
-  edge_model.export(output_file)
 def build_and_convert_to_tflite_from_flags(
     model_builder: Callable[
@@ -521,11 +567,36 @@ def build_and_convert_to_tflite_from_flags(
       get_mask_cache_size_from_flags(),
   )
+  # Extra model for GPU dynamic shape verification if needed.
+  extra_model = None
+  extra_prefill_seq_lens = None
+  extra_kv_cache_max_len = 0
   if flags.FLAGS.gpu_dynamic_shapes:
     prefill_seq_lens = [
         get_magic_number_for(l) for l in flags.FLAGS.prefill_seq_lens
     ]
     kv_cache_max_len = get_magic_number_for(flags.FLAGS.kv_cache_max_len)
+    if flags.FLAGS.export_gpu_dynamic_shape_verifications:
+      extra_kv_cache_max_len = _CONTEXT_LENGTH_TO_VERIFY_MAGIC_NUMBERS
+      if extra_kv_cache_max_len > flags.FLAGS.kv_cache_max_len:
+        extra_kv_cache_max_len = flags.FLAGS.kv_cache_max_len
+      extra_model = model_builder(
+          checkpoint_path,
+          loader.maybe_get_custom_loader(
+              checkpoint_path, flags.FLAGS.custom_checkpoint_loader
+          ),
+          extra_kv_cache_max_len,
+      )
+      extra_prefill_seq_lens = []
+      if extra_kv_cache_max_len > _SHORT_PREFILL_LENGTH_TO_VERIFY_MAGIC_NUMBERS:
+        extra_prefill_seq_lens.append(
+            _SHORT_PREFILL_LENGTH_TO_VERIFY_MAGIC_NUMBERS
+        )
+      if extra_kv_cache_max_len > _LONG_PREFILL_LENGTH_TO_VERIFY_MAGIC_NUMBERS:
+        extra_prefill_seq_lens.append(
+            _LONG_PREFILL_LENGTH_TO_VERIFY_MAGIC_NUMBERS
+        )
   else:
     prefill_seq_lens = flags.FLAGS.prefill_seq_lens
     kv_cache_max_len = flags.FLAGS.kv_cache_max_len
@@ -539,6 +610,10 @@ def build_and_convert_to_tflite_from_flags(
       quantize=flags.FLAGS.quantize,
       lora_ranks=flags.FLAGS.lora_ranks,
       export_config=export_config_lib.get_from_flags(),
+      extra_model=extra_model,
+      extra_prefill_seq_lens=extra_prefill_seq_lens,
+      extra_kv_cache_max_len=extra_kv_cache_max_len,
+      extra_signature_prefix='test_' if extra_model is not None else '',
   )

ai_edge_torch/generative/utilities/litertlm_builder.py CHANGED Viewed

@@ -18,16 +18,19 @@
 import os
 import pathlib
+from google.protobuf import text_format
 try:
   # pylint: disable=g-import-not-at-top
   from ai_edge_litert.internal import llm_metadata_pb2
   from ai_edge_litert.internal import litertlm_builder
+  from ai_edge_litert.internal import llm_model_type_pb2
   # pylint: enable=g-import-not-at-top
   _litertlm_builder_available = True
 except ImportError:
   llm_metadata_pb2 = None
+  llm_model_type_pb2 = None
   litertlm_builder = None
   _litertlm_builder_available = False
@@ -41,16 +44,19 @@ def build_litertlm(
     workdir: str,
     output_path: str,
     context_length: int,
-    model_prompt_prefix: str | None,
-    model_prompt_suffix: str | None,
-    user_prompt_prefix: str | None,
-    user_prompt_suffix: str | None,
-    tokenizer_model_path: str | None,
-    hf_tokenizer_model_path: str | None,
+    model_prompt_prefix: str | None = None,
+    model_prompt_suffix: str | None = None,
+    user_prompt_prefix: str | None = None,
+    user_prompt_suffix: str | None = None,
+    tokenizer_model_path: str | None = None,
+    hf_tokenizer_model_path: str | None = None,
     start_token: str | None = None,
     start_token_id: int | None = None,
     stop_tokens: str | list[str] | None = None,
     stop_token_ids: list[int] | None = None,
+    llm_model_type: str = 'generic',
+    jinja_prompt_template: str | None = None,
+    base_llm_metadata_path: str | None = None,
     **kwargs,
 ):
   """Builds a LiteRT-LM file from a TFlite model."""
@@ -58,10 +64,22 @@ def build_litertlm(
   if not is_litertlm_builder_available():
     raise ValueError('LiteRT-LM builder is not available.')
-  assert llm_metadata_pb2 is not None
   assert litertlm_builder is not None
+  assert llm_metadata_pb2 is not None
+  assert llm_model_type_pb2 is not None
   llm_metadata = llm_metadata_pb2.LlmMetadata()
+  if base_llm_metadata_path:
+    if base_llm_metadata_path.endswith('.pb'):
+      with open(base_llm_metadata_path, 'rb') as f:
+        llm_metadata.ParseFromString(f.read())
+    elif base_llm_metadata_path.endswith('.textproto'):
+      with open(base_llm_metadata_path, 'r') as f:
+        text_format.Parse(f.read(), llm_metadata, allow_unknown_field=True)
+    else:
+      raise ValueError(
+          'Base LLM metadata path must be a binary or text proto file.'
+      )
   if start_token_id is not None:
     llm_metadata.start_token.token_ids.ids.append(start_token_id)
@@ -96,7 +114,42 @@ def build_litertlm(
   llm_metadata.max_num_tokens = context_length
-  llm_metadata_path = os.path.join(workdir, 'llm_metadata.pb')
+  mdl_type = llm_metadata.llm_model_type.WhichOneof('model_type')
+  if not mdl_type or mdl_type == 'generic_model':
+    match llm_model_type:
+      case litertlm_builder.LlmModelType.GENERIC:
+        llm_metadata.llm_model_type.CopyFrom(
+            llm_model_type_pb2.LlmModelType(
+                generic_model=llm_model_type_pb2.GenericModel()
+            )
+        )
+      case litertlm_builder.LlmModelType.GEMMA3N:
+        llm_metadata.llm_model_type.CopyFrom(
+            llm_model_type_pb2.LlmModelType(
+                gemma3n=llm_model_type_pb2.Gemma3N()
+            )
+        )
+      case litertlm_builder.LlmModelType.GEMMA3:
+        llm_metadata.llm_model_type.CopyFrom(
+            llm_model_type_pb2.LlmModelType(gemma3=llm_model_type_pb2.Gemma3())
+        )
+      case litertlm_builder.LlmModelType.QWEN3:
+        llm_metadata.llm_model_type.CopyFrom(
+            llm_model_type_pb2.LlmModelType(qwen3=llm_model_type_pb2.Qwen3())
+        )
+      case litertlm_builder.LlmModelType.QWEN2P5:
+        llm_metadata.llm_model_type.CopyFrom(
+            llm_model_type_pb2.LlmModelType(
+                qwen2p5=llm_model_type_pb2.Qwen2p5()
+            )
+        )
+      case _:
+        raise ValueError(f'Unsupported LLM model type: {llm_model_type}')
+  if jinja_prompt_template is not None:
+    llm_metadata.jinja_prompt_template = jinja_prompt_template
+  llm_metadata_path = os.path.join(workdir, 'llm_metadata_final.pb')
   with open(llm_metadata_path, 'wb') as f:
     f.write(llm_metadata.SerializeToString())

ai_edge_torch/generative/utilities/loader.py CHANGED Viewed

@@ -135,7 +135,8 @@ def load_pytorch_statedict(full_path: str):
   tensors = {}
   for file in files:
-    this_file_tensors = torch.load(file)
+    map_location = "cpu" if not torch.cuda.is_available() else None
+    this_file_tensors = torch.load(file, map_location=map_location)
     for k in this_file_tensors:
       assert k not in tensors
     tensors.update(this_file_tensors)

ai_edge_torch/lowertools/translate_recipe.py CHANGED Viewed

@@ -80,8 +80,14 @@ def _get_granularity(
     return _QuantGranularity.CHANNELWISE
   if granularity == quant_attrs.Granularity.NONE:
     return _QuantGranularity.TENSORWISE
-  if granularity == quant_attrs.Granularity.BLOCKWISE:
-    return _QuantGranularity.BLOCKWISE
+  if granularity == quant_attrs.Granularity.BLOCKWISE_32:
+    return _QuantGranularity.BLOCKWISE_32
+  if granularity == quant_attrs.Granularity.BLOCKWISE_64:
+    return _QuantGranularity.BLOCKWISE_64
+  if granularity == quant_attrs.Granularity.BLOCKWISE_128:
+    return _QuantGranularity.BLOCKWISE_128
+  if granularity == quant_attrs.Granularity.BLOCKWISE_256:
+    return _QuantGranularity.BLOCKWISE_256
   raise ValueError('Unimplemented granularity')
@@ -108,7 +114,6 @@ def _set_quant_config(
               symmetric=True,
               granularity=_get_granularity(layer_recipe.granularity),
               dtype=_get_dtype_from_dtype(layer_recipe.weight_dtype),
-              block_size=layer_recipe.block_size,
           ),
           compute_precision=_get_compute_precision_from_mode(layer_recipe.mode),
           explicit_dequantize=_get_explicit_dequant_from_mode(

ai_edge_torch/odml_torch/experimental/__init__.py ADDED Viewed

@@ -0,0 +1,14 @@
+# Copyright 2024 The AI Edge Torch Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================

ai-edge-torch-nightly 0.7.0.dev20251007__py3-none-any.whl → 0.8.0.dev20251225__py3-none-any.whl

Potentially problematic release.

ai-edge-torch-nightly 0.7.0.dev20251007__py3-none-any.whl → 0.8.0.dev20251225__py3-none-any.whl