ai-edge-torch-nightly 0.7.0.dev20250929__py3-none-any.whl → 0.8.0.dev20251206__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
Potentially problematic release. This version of ai-edge-torch-nightly has been flagged as potentially problematic; see the registry page for details.
- ai_edge_torch/_convert/conversion.py +2 -1
- ai_edge_torch/fx_infra/_safe_run_decompositions.py +36 -1
- ai_edge_torch/generative/examples/amd_llama_135m/convert_to_tflite.py +1 -20
- ai_edge_torch/generative/examples/deepseek/convert_to_tflite.py +1 -20
- ai_edge_torch/generative/examples/gemma/convert_gemma1_to_tflite.py +1 -20
- ai_edge_torch/generative/examples/gemma/convert_gemma2_to_tflite.py +1 -20
- ai_edge_torch/generative/examples/gemma3/convert_gemma3_to_tflite.py +3 -27
- ai_edge_torch/generative/examples/hammer/convert_to_tflite.py +1 -20
- ai_edge_torch/generative/examples/llama/convert_to_tflite.py +1 -20
- ai_edge_torch/generative/examples/openelm/convert_to_tflite.py +1 -20
- ai_edge_torch/generative/examples/phi/convert_phi3_to_tflite.py +1 -20
- ai_edge_torch/generative/examples/phi/convert_phi4_to_tflite.py +1 -20
- ai_edge_torch/generative/examples/phi/convert_to_tflite.py +1 -20
- ai_edge_torch/generative/examples/qwen/convert_to_tflite.py +1 -20
- ai_edge_torch/generative/examples/qwen/convert_v3_to_tflite.py +1 -20
- ai_edge_torch/generative/examples/smollm/convert_to_tflite.py +1 -30
- ai_edge_torch/generative/examples/smollm/convert_v2_to_tflite.py +1 -30
- ai_edge_torch/generative/examples/stable_diffusion/convert_to_tflite.py +1 -3
- ai_edge_torch/generative/examples/stable_diffusion/samplers/k_lms.py +1 -1
- ai_edge_torch/generative/examples/tiny_llama/convert_to_tflite.py +1 -20
- ai_edge_torch/generative/layers/attention.py +25 -2
- ai_edge_torch/generative/layers/attention_test.py +13 -1
- ai_edge_torch/generative/layers/attention_utils.py +62 -1
- ai_edge_torch/generative/layers/attention_utils_test.py +20 -0
- ai_edge_torch/generative/layers/builder.py +4 -2
- ai_edge_torch/generative/layers/model_config.py +5 -0
- ai_edge_torch/generative/layers/normalization.py +8 -2
- ai_edge_torch/generative/layers/scaled_dot_product_attention.py +35 -5
- ai_edge_torch/generative/layers/sdpa_with_kv_update.py +8 -3
- ai_edge_torch/generative/quantize/example.py +1 -1
- ai_edge_torch/generative/quantize/quant_attrs.py +8 -1
- ai_edge_torch/generative/quantize/quant_recipe.py +0 -13
- ai_edge_torch/generative/quantize/quant_recipe_utils.py +12 -19
- ai_edge_torch/generative/quantize/quant_recipes.py +16 -21
- ai_edge_torch/generative/quantize/supported_schemes.py +4 -1
- ai_edge_torch/generative/test/test_kv_cache.py +18 -6
- ai_edge_torch/generative/test/test_quantize.py +17 -26
- ai_edge_torch/generative/utilities/converter.py +183 -28
- ai_edge_torch/generative/utilities/export_config.py +2 -0
- ai_edge_torch/generative/utilities/litertlm_builder.py +61 -8
- ai_edge_torch/generative/utilities/loader.py +2 -1
- ai_edge_torch/lowertools/translate_recipe.py +8 -3
- ai_edge_torch/odml_torch/experimental/__init__.py +14 -0
- ai_edge_torch/odml_torch/experimental/torch_tfl/__init__.py +20 -0
- ai_edge_torch/odml_torch/experimental/torch_tfl/_decomps.py +438 -0
- ai_edge_torch/odml_torch/experimental/torch_tfl/_lowerings.py +728 -0
- ai_edge_torch/odml_torch/experimental/torch_tfl/_ops.py +371 -0
- ai_edge_torch/odml_torch/experimental/torch_tfl/torch_library_utils.py +37 -0
- ai_edge_torch/odml_torch/export.py +24 -7
- ai_edge_torch/odml_torch/lowerings/_basic.py +155 -0
- ai_edge_torch/odml_torch/lowerings/_jax_lowerings.py +255 -5
- ai_edge_torch/version.py +1 -1
- {ai_edge_torch_nightly-0.7.0.dev20250929.dist-info → ai_edge_torch_nightly-0.8.0.dev20251206.dist-info}/METADATA +15 -3
- {ai_edge_torch_nightly-0.7.0.dev20250929.dist-info → ai_edge_torch_nightly-0.8.0.dev20251206.dist-info}/RECORD +57 -51
- {ai_edge_torch_nightly-0.7.0.dev20250929.dist-info → ai_edge_torch_nightly-0.8.0.dev20251206.dist-info}/WHEEL +1 -1
- {ai_edge_torch_nightly-0.7.0.dev20250929.dist-info → ai_edge_torch_nightly-0.8.0.dev20251206.dist-info/licenses}/LICENSE +0 -0
- {ai_edge_torch_nightly-0.7.0.dev20250929.dist-info → ai_edge_torch_nightly-0.8.0.dev20251206.dist-info}/top_level.txt +0 -0
ai_edge_torch/generative/test/test_kv_cache.py

@@ -41,6 +41,20 @@ class TestKVLayers(googletest.TestCase):
     )
     return config
 
+  def _assert_kv_cache_entry_equal(self, kv1, kv2):
+    self.assertIsInstance(kv1, kv_utils.KVCacheEntry)
+    self.assertIsInstance(kv2, kv_utils.KVCacheEntry)
+    self.assertEqual(kv1.kv_layout, kv2.kv_layout)
+    self.assertTrue(torch.equal(kv1.k_cache, kv2.k_cache))
+    self.assertTrue(torch.equal(kv1.v_cache, kv2.v_cache))
+
+  def _assert_kv_cache_equal(self, kv1, kv2):
+    self.assertIsInstance(kv1, kv_utils.KVCache)
+    self.assertIsInstance(kv2, kv_utils.KVCache)
+    self.assertEqual(len(kv1.caches), len(kv2.caches))
+    for kv1_entry, kv2_entry in zip(kv1.caches, kv2.caches):
+      self._assert_kv_cache_entry_equal(kv1_entry, kv2_entry)
+
   def test_cache_udpate(self):
     N = 1
     HEAD_DIM = 2
@@ -118,7 +132,7 @@ class TestKVLayers(googletest.TestCase):
     flat, treespec = pytree.tree_flatten(kv)
     self.assertLen(flat, NUM_LAYERS * 2)
     kv_unflat = pytree.tree_unflatten(flat, treespec)
-    self.
+    self._assert_kv_cache_equal(kv, kv_unflat)
 
   def test_pytree_roundtrip_kv_cache_derived(self):
     NUM_LAYERS = 4
@@ -134,7 +148,7 @@ class TestKVLayers(googletest.TestCase):
     flat, treespec = pytree.tree_flatten(kv)
     self.assertLen(flat, NUM_LAYERS * 2)
     kv_unflat = pytree.tree_unflatten(flat, treespec)
-    self.
+    self._assert_kv_cache_equal(kv, kv_unflat)
 
   def test_pytree_roundtrip_kv_entry(self):
     attn_config = cfg.AttentionConfig(
@@ -144,8 +158,7 @@ class TestKVLayers(googletest.TestCase):
     flat, treespec = pytree.tree_flatten(kv)
     self.assertLen(flat, 2)
     kv_unflat = pytree.tree_unflatten(flat, treespec)
-    self.
-    self.assertIsInstance(kv_unflat, kv_utils.KVCacheEntry)
+    self._assert_kv_cache_entry_equal(kv, kv_unflat)
 
   def test_pytree_roundtrip_kv_entry_derived(self):
     attn_config = cfg.AttentionConfig(
@@ -157,8 +170,7 @@ class TestKVLayers(googletest.TestCase):
     flat, treespec = pytree.tree_flatten(kv)
     self.assertLen(flat, 2)
     kv_unflat = pytree.tree_unflatten(flat, treespec)
-    self.
-    self.assertIsInstance(kv_unflat, kv_utils.KVCacheEntry)
+    self._assert_kv_cache_entry_equal(kv, kv_unflat)
 
 
 if __name__ == "__main__":
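The new helpers assert tensor-level equality after a pytree flatten/unflatten round trip instead of only checking types. A minimal standalone sketch of the pattern under test (plain torch; the dict merely stands in for a KVCacheEntry):

    import torch
    import torch.utils._pytree as pytree

    kv = {"k_cache": torch.zeros(1, 4, 2), "v_cache": torch.zeros(1, 4, 2)}
    flat, treespec = pytree.tree_flatten(kv)           # leaves plus structure
    kv_unflat = pytree.tree_unflatten(flat, treespec)  # rebuild the container
    assert all(
        torch.equal(a, b)
        for a, b in zip(flat, pytree.tree_flatten(kv_unflat)[0])
    )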
ai_edge_torch/generative/test/test_quantize.py

@@ -79,18 +79,18 @@ class TestVerifyRecipes(parameterized.TestCase):
           Dtype.INT4,
           Mode.DYNAMIC_RANGE,
           Algorithm.MIN_MAX,
-          Granularity.
-
+          Granularity.BLOCKWISE_32,
+      ),
+      (
+          Dtype.FP32,
+          Dtype.INT4,
+          Mode.DYNAMIC_RANGE,
+          Algorithm.MIN_MAX,
+          Granularity.BLOCKWISE_128,
       ),
   ])
   def test_verify_valid_recipes(
-      self,
-      activation,
-      weight,
-      mode,
-      algo,
-      granularity,
-      block_size=None,
+      self, activation, weight, mode, algo, granularity
   ):
     quant_recipe.LayerQuantRecipe(
         activation, weight, mode, algo, granularity
@@ -108,21 +108,21 @@ class TestQuantizeConvert(parameterized.TestCase):
  def _attention_int8_dynamic_recipe() -> quant_config.QuantConfig:
    return quant_config.QuantConfig(
        generative_recipe=quant_recipe.GenerativeQuantRecipe(
-            attention=quant_recipe_utils.
+            attention=quant_recipe_utils.create_layer_quant_dynamic(),
        )
    )
 
  def _feedforward_int8_dynamic_recipe() -> quant_config.QuantConfig:
    return quant_config.QuantConfig(
        generative_recipe=quant_recipe.GenerativeQuantRecipe(
-            feedforward=quant_recipe_utils.
+            feedforward=quant_recipe_utils.create_layer_quant_dynamic(),
        )
    )
 
  @parameterized.parameters([
      (quant_recipes.full_fp16_recipe()),
-      (quant_recipes.
-      (quant_recipes.
+      (quant_recipes.full_dynamic_recipe()),
+      (quant_recipes.full_weight_only_recipe()),
      (_attention_int8_dynamic_recipe()),
      (_feedforward_int8_dynamic_recipe()),
  ])
@@ -148,7 +148,7 @@ class TestQuantizeConvert(parameterized.TestCase):
    idx = torch.unsqueeze(torch.arange(0, 100, dtype=torch.int), 0)
    input_pos = torch.arange(0, 100, dtype=torch.int)
 
-    quant_config = quant_recipes.
+    quant_config = quant_recipes.full_dynamic_recipe()
    quantized_model = ai_edge_torch.convert(
        pytorch_model, (idx, input_pos), quant_config=quant_config
    )
@@ -164,7 +164,9 @@ class TestQuantizeConvert(parameterized.TestCase):
    pytorch_model = toy_model.ToySingleLayerModel(config)
    idx = torch.unsqueeze(torch.arange(0, 100, dtype=torch.int), 0)
    input_pos = torch.arange(0, 100, dtype=torch.int)
-    quant_config = quant_recipes.
+    quant_config = quant_recipes.full_dynamic_recipe(
+        weight_dtype=Dtype.INT4, granularity=Granularity.BLOCKWISE_32
+    )
    quantized_model = ai_edge_torch.convert(
        pytorch_model, (idx, input_pos), quant_config=quant_config
    )
@@ -175,17 +177,6 @@ class TestQuantizeConvert(parameterized.TestCase):
        "Quantized model isn't smaller than F32 model.",
    )
 
-  def test_unsupported_block_size(self):
-    config = toy_model.get_model_config()
-    pytorch_model = toy_model.ToySingleLayerModel(config)
-    idx = torch.unsqueeze(torch.arange(0, 100, dtype=torch.int), 0)
-    input_pos = torch.arange(0, 100, dtype=torch.int)
-    self.assertRaises(
-        ValueError,
-        quant_recipes.all_supported_int4_dynamic_block_recipe,
-        36,
-    )
-
  def test_quantize_convert_compare_toy(self):
    self.skipTest("b/338288901")
    config = toy_model_with_kv_cache.get_model_config()
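The removed test exercised an arbitrary block size (36), which the Granularity enum now makes unrepresentable, so the failure mode no longer exists. A hedged sketch of the consolidated recipe entry points exercised above (call names as they appear in this diff; the surrounding conversion code is illustrative):

    from ai_edge_torch.generative.quantize import quant_recipes
    from ai_edge_torch.generative.quantize.quant_attrs import Dtype, Granularity

    # Default int8 dynamic-range quantization.
    quant_config = quant_recipes.full_dynamic_recipe()

    # Int4 weights with block size 32, expressed as a granularity enum
    # (BLOCKWISE_32/_64/_128/_256) instead of a free-form block_size argument.
    quant_config_int4 = quant_recipes.full_dynamic_recipe(
        weight_dtype=Dtype.INT4, granularity=Granularity.BLOCKWISE_32
    )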
ai_edge_torch/generative/utilities/converter.py

@@ -19,15 +19,17 @@ import enum
 import os
 import pathlib
 import tempfile
-from typing import
+from typing import Callable, Dict, Optional, Union
 from absl import flags
 from ai_edge_torch._convert import converter as converter_utils
 from ai_edge_torch.generative.layers import kv_cache as kv_utils
 from ai_edge_torch.generative.layers import lora as lora_utils
 import ai_edge_torch.generative.layers.model_config as cfg
+from ai_edge_torch.generative.quantize import quant_attrs
 from ai_edge_torch.generative.quantize import quant_recipes
 from ai_edge_torch.generative.utilities import export_config as export_config_lib
 from ai_edge_torch.generative.utilities import litertlm_builder
+from ai_edge_torch.generative.utilities import loader
 from ai_edge_torch.quantize import quant_config as qcfg
 import torch
 
@@ -94,6 +96,11 @@ def define_conversion_flags(
      (8, 64, 128, 256, 512, 1024),
      'List of the maximum sizes of prefill input tensors.',
  )
+  flags.DEFINE_integer(
+      'decode_batch_size',
+      1,
+      'The batch size for the decode signature.',
+  )
  flags.DEFINE_integer(
      'kv_cache_max_len',
      1280,
@@ -102,14 +109,14 @@ def define_conversion_flags(
  flags.DEFINE_string(
      'quantize',
      'dynamic_int8',
-      'How the model should be quantized. Set to "none" to disable'
-      '
+      'How the model should be quantized. Set to "none" to disable '
+      'quantization. See `QuantizationName` for supported quantization types.',
  )
  flags.DEFINE_multi_integer(
      'lora_ranks',
      None,
-      'If set, the model will be converted with the provided list of LoRA'
-      '
+      'If set, the model will be converted with the provided list of LoRA '
+      'ranks.',
  )
  flags.DEFINE_bool(
      'mask_as_input',
@@ -125,15 +132,61 @@ def define_conversion_flags(
  flags.DEFINE_bool(
      'custom_checkpoint_loader',
      False,
-      'If true, the conversion script will use a custom checkpoint loader
-      ' will read a checkpoint from a remote source.',
+      'If true, the conversion script will use a custom checkpoint loader '
+      'which will read a checkpoint from a remote source.',
+  )
+  flags.DEFINE_bool(
+      'gpu_dynamic_shapes',
+      False,
+      'It is to support dynamic shapes on GPU effectively. If true, the graph '
+      'sets the actual kv_cache size and prefill lengths when the graph is '
+      'initialized for inference based on the flags, `kv_cache_max_len` and '
+      '`prefill_seq_lens` as the maximum of kv_cache size and prefill lengths '
+      'in the graph.',
+  )
+  flags.DEFINE_bool(
+      'export_gpu_dynamic_shape_verifications',
+      False,
+      'If true, the conversion script will export signatures used only for '
+      'verification of GPU dynamic shapes.',
  )
  return flags
 
 
+# Context length for verifying GPU dynamic shapes.
+_CONTEXT_LENGTH_TO_VERIFY_MAGIC_NUMBERS = 1280
+# Long prefill length for verifying GPU dynamic shapes.
+_LONG_PREFILL_LENGTH_TO_VERIFY_MAGIC_NUMBERS = 1024
+# Short prefill length for verifying GPU dynamic shapes.
+_SHORT_PREFILL_LENGTH_TO_VERIFY_MAGIC_NUMBERS = 64
+
+
+def is_magic_number_(num: int) -> bool:
+  """Returns true if the number is a magic number, i.e. prime number > 10."""
+  if num < 10:
+    return False
+  if num % 2 == 0:
+    return False
+  for i in range(3, int(num / 2), 2):
+    if num % i == 0:
+      return False
+  return True
+
+
+def get_magic_number_for(org_number: int) -> int:
+  """Returns the magic number for the given original number."""
+  while not is_magic_number_(org_number):
+    org_number += 1
+  return org_number
+
+
 def get_mask_cache_size_from_flags() -> int:
   """Returns the mask cache size according to the flags."""
-
+  if flags.FLAGS.mask_as_input:
+    return 0
+  if flags.FLAGS.gpu_dynamic_shapes:
+    return get_magic_number_for(flags.FLAGS.kv_cache_max_len)
+  return flags.FLAGS.kv_cache_max_len
 
 
 def get_quant_recipe_from_flag(
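Here a "magic number" is an odd prime greater than 10; presumably the distinctive prime constants are easy to locate and swap for real sizes when the graph is initialized, per the gpu_dynamic_shapes flag text above. A worked check of the rounding behavior (standalone restatement for illustration, not an exported API):

    # Mirrors is_magic_number_/get_magic_number_for from the hunk above.
    def is_magic(num: int) -> bool:
      """True for odd primes greater than 10 (trial division)."""
      if num < 10 or num % 2 == 0:
        return False
      return all(num % i for i in range(3, num // 2, 2))

    def next_magic(n: int) -> int:
      while not is_magic(n):
        n += 1
      return n

    assert next_magic(64) == 67      # 65 = 5 * 13, 66 even, 67 prime
    assert next_magic(1280) == 1283  # 1283 is prime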
@@ -155,18 +208,22 @@ def get_quant_recipe_from_flag(
    case QuantizationName.NONE:
      return None
    case QuantizationName.DYNAMIC_INT8:
-      return quant_recipes.
+      return quant_recipes.full_dynamic_recipe(mcfg=model_config)
    case QuantizationName.WEIGHT_ONLY_INT8:
-      return quant_recipes.
+      return quant_recipes.full_weight_only_recipe(mcfg=model_config)
    case QuantizationName.FP16:
      return quant_recipes.full_fp16_recipe()
    case QuantizationName.DYNAMIC_INT4_BLOCK32:
-      return quant_recipes.
-
+      return quant_recipes.full_dynamic_recipe(
+          mcfg=model_config,
+          weight_dtype=quant_attrs.Dtype.INT4,
+          granularity=quant_attrs.Granularity.BLOCKWISE_32,
      )
    case QuantizationName.DYNAMIC_INT4_BLOCK128:
-      return quant_recipes.
-
+      return quant_recipes.full_dynamic_recipe(
+          mcfg=model_config,
+          weight_dtype=quant_attrs.Dtype.INT4,
+          granularity=quant_attrs.Granularity.BLOCKWISE_128,
      )
    case _:
      raise ValueError(f'Unsupported quantization flag: {quantize}')
@@ -225,6 +282,10 @@ def convert_to_tflite(
    config: cfg.ModelConfig = None,
    lora_ranks: Optional[list[int]] = None,
    export_config: ExportConfig = None,
+    extra_model: torch.nn.Module = None,
+    extra_prefill_seq_lens: list[int] = None,
+    extra_kv_cache_max_len: int = 0,
+    extra_signature_prefix: str = '',
 ):
   """Converts a nn.Module model to multi-signature tflite model.
 
@@ -277,6 +338,15 @@ def convert_to_tflite(
      no LoRA signatures will be added.
    export_config (ExportConfig, optional): The export configuration. If None,
      it uses the default export configuration.
+    extra_model (torch.nn.Module, optional): PyTorch model to export in
+      addition to the pytorch_model. This model can have different
+      prefill_seq_lens and kv_cache_max_len.
+    extra_prefill_seq_lens (list[int], optional): The prefill sequence
+      lengths for extra_model. Meaningful only when extra_model is not None.
+    extra_kv_cache_max_len (int, optional): The maximum size of KV cache
+      buffer for extra_model. Meaningful only when extra_model is not None.
+    extra_signature_prefix (str, optional): The prefix of the extra model
+      signatures. Meaningful only when extra_model is not None.
   """
   # pylint: disable=protected-access
   torch._dynamo.config.cache_size_limit = 64
@@ -315,32 +385,51 @@ def convert_to_tflite(
  )
  output_file = os.path.join(output_path, output_filename)
 
-
+  converter = converter_utils.Converter()
+  _add_signatures(
+      converter,
      pytorch_model,
-      output_file,
      prefill_seq_lens,
      kv_cache_max_len,
      pixel_values_size,
      pixel_seq_len,
-      quantize,
      config,
      loras,
      export_config,
  )
+
+  if extra_model is not None and extra_prefill_seq_lens:
+    _add_signatures(
+        converter,
+        extra_model,
+        extra_prefill_seq_lens,
+        extra_kv_cache_max_len,
+        pixel_values_size,
+        pixel_seq_len,
+        config,
+        loras,
+        export_config,
+        signature_prefix=extra_signature_prefix,
+    )
+
+  edge_model = converter.convert(
+      quant_config=get_quant_recipe_from_flag(quantize, config),
+  )
+  edge_model.export(output_file)
  return output_file
 
 
-def
+def _add_signatures(
+    converter: converter_utils.Converter,
    pytorch_model: torch.nn.Module,
-    output_file: str,
    prefill_seq_lens: list[int],
    kv_cache_max_len: int,
    pixel_values_size: torch.Size,
    pixel_seq_len: int,
-    quantize: str,
    config: cfg.ModelConfig,
    loras: list[None | lora_utils.LoRA],
    export_config: ExportConfig,
+    signature_prefix: str = '',
 ):
   """Helper function to export a model to tflite."""
   prefill_tokens_list = []
@@ -385,17 +474,14 @@ def _export_helper(
      kv_layout=export_config.kvcache_layout,
  )
 
-  quant_config = get_quant_recipe_from_flag(quantize, config)
-
  # For export, we create a module that captures any non-exportable,
  # arugments, e.g. the generation config object.
  mod = ExportableModule(pytorch_model, export_config=export_config).eval()
 
-  converter = converter_utils.Converter()
  for lora in loras:
    for i in range(len(prefill_seq_lens)):
      prefill_seq_len = prefill_seq_lens[i]
-      prefill_signature_name = f'prefill_{prefill_seq_len}'
+      prefill_signature_name = f'{signature_prefix}prefill_{prefill_seq_len}'
 
      sample_kwargs = {
          'tokens': prefill_tokens_list[i],
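Because prefill and decode names now both carry signature_prefix, one .tflite can hold two models' signatures side by side. With gpu_dynamic_shapes the main model's dimensions are rounded up to primes while the verification model keeps the fixed 64/1024 lengths, so an exported graph might expose names like these (derived from the f-strings above; the exact set depends on flags):

    prefill_67, prefill_1031, decode                 # main model, magic-number dims
    test_prefill_64, test_prefill_1024, test_decode  # extra verification model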
@@ -450,16 +536,85 @@ def _export_helper(
    if lora is not None:
      sample_kwargs['lora'] = lora
 
+    decode_signature_name = f'{signature_prefix}decode'
+    if lora is not None:
+      decode_signature_name += f'_lora_r{lora.get_rank()}'
    converter.add_signature(
-
+        decode_signature_name,
        mod,
        sample_kwargs=sample_kwargs,
    )
 
-
-
+
+def build_and_convert_to_tflite_from_flags(
+    model_builder: Callable[
+        [str, Callable[[str], Dict[str, torch.Tensor]], int], torch.nn.Module
+    ],
+    checkpoint_path: str = None,
+    output_name_prefix: str = None,
+):
+  """Builds a nn.Module model and converts it according to the flags."""
+  if checkpoint_path is None:
+    checkpoint_path = flags.FLAGS.checkpoint_path
+  if output_name_prefix is None:
+    output_name_prefix = flags.FLAGS.output_name_prefix
+
+  pytorch_model = model_builder(
+      checkpoint_path,
+      loader.maybe_get_custom_loader(
+          checkpoint_path, flags.FLAGS.custom_checkpoint_loader
+      ),
+      get_mask_cache_size_from_flags(),
+  )
+
+  # Extra model for GPU dynamic shape verification if needed.
+  extra_model = None
+  extra_prefill_seq_lens = None
+  extra_kv_cache_max_len = 0
+  if flags.FLAGS.gpu_dynamic_shapes:
+    prefill_seq_lens = [
+        get_magic_number_for(l) for l in flags.FLAGS.prefill_seq_lens
+    ]
+    kv_cache_max_len = get_magic_number_for(flags.FLAGS.kv_cache_max_len)
+
+    if flags.FLAGS.export_gpu_dynamic_shape_verifications:
+      extra_kv_cache_max_len = _CONTEXT_LENGTH_TO_VERIFY_MAGIC_NUMBERS
+      if extra_kv_cache_max_len > flags.FLAGS.kv_cache_max_len:
+        extra_kv_cache_max_len = flags.FLAGS.kv_cache_max_len
+      extra_model = model_builder(
+          checkpoint_path,
+          loader.maybe_get_custom_loader(
+              checkpoint_path, flags.FLAGS.custom_checkpoint_loader
+          ),
+          extra_kv_cache_max_len,
+      )
+      extra_prefill_seq_lens = []
+      if extra_kv_cache_max_len > _SHORT_PREFILL_LENGTH_TO_VERIFY_MAGIC_NUMBERS:
+        extra_prefill_seq_lens.append(
+            _SHORT_PREFILL_LENGTH_TO_VERIFY_MAGIC_NUMBERS
+        )
+      if extra_kv_cache_max_len > _LONG_PREFILL_LENGTH_TO_VERIFY_MAGIC_NUMBERS:
+        extra_prefill_seq_lens.append(
+            _LONG_PREFILL_LENGTH_TO_VERIFY_MAGIC_NUMBERS
+        )
+  else:
+    prefill_seq_lens = flags.FLAGS.prefill_seq_lens
+    kv_cache_max_len = flags.FLAGS.kv_cache_max_len
+
+  convert_to_tflite(
+      pytorch_model,
+      output_path=flags.FLAGS.output_path,
+      output_name_prefix=output_name_prefix,
+      prefill_seq_len=prefill_seq_lens,
+      kv_cache_max_len=kv_cache_max_len,
+      quantize=flags.FLAGS.quantize,
+      lora_ranks=flags.FLAGS.lora_ranks,
+      export_config=export_config_lib.get_from_flags(),
+      extra_model=extra_model,
+      extra_prefill_seq_lens=extra_prefill_seq_lens,
+      extra_kv_cache_max_len=extra_kv_cache_max_len,
+      extra_signature_prefix='test_' if extra_model is not None else '',
  )
-  edge_model.export(output_file)
 
 
 def convert_to_litert(
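The per-model convert_to_tflite.py examples in the file list above each drop roughly 20 lines because this helper now owns the flag plumbing. A hedged sketch of the resulting example-script shape (the qwen module, its builder name, and the define_conversion_flags argument are illustrative, not taken from this diff):

    from absl import app
    from ai_edge_torch.generative.utilities import converter

    # Hypothetical model module; each real example passes its own builder.
    from ai_edge_torch.generative.examples.qwen import qwen

    converter.define_conversion_flags('qwen')

    def main(_):
      converter.build_and_convert_to_tflite_from_flags(qwen.build_model)

    if __name__ == '__main__':
      app.run(main)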
ai_edge_torch/generative/utilities/export_config.py

@@ -56,5 +56,7 @@ def get_from_flags() -> ExportConfig:
    export_config.kvcache_layout = kv_utils.KV_LAYOUT_TRANSPOSED
  if flags.FLAGS.mask_as_input:
    export_config.mask_as_input = flags.FLAGS.mask_as_input
+  if flags.FLAGS.decode_batch_size:
+    export_config.decode_batch_size = flags.FLAGS.decode_batch_size
 
  return export_config
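A hedged example invocation tying the new flags together (the script path comes from the file list above; the checkpoint path and the exact --quantize spelling for QuantizationName.DYNAMIC_INT4_BLOCK32 are assumptions):

    python ai_edge_torch/generative/examples/qwen/convert_to_tflite.py \
        --checkpoint_path=/tmp/qwen_checkpoint \
        --quantize=dynamic_int4_block32 \
        --decode_batch_size=1 \
        --gpu_dynamic_shapes \
        --export_gpu_dynamic_shape_verifications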
ai_edge_torch/generative/utilities/litertlm_builder.py

@@ -18,16 +18,19 @@
 
 import os
 import pathlib
+from google.protobuf import text_format
 
 try:
   # pylint: disable=g-import-not-at-top
   from ai_edge_litert.internal import llm_metadata_pb2
   from ai_edge_litert.internal import litertlm_builder
+  from ai_edge_litert.internal import llm_model_type_pb2
   # pylint: enable=g-import-not-at-top
 
   _litertlm_builder_available = True
 except ImportError:
   llm_metadata_pb2 = None
+  llm_model_type_pb2 = None
   litertlm_builder = None
   _litertlm_builder_available = False
 
@@ -41,16 +44,19 @@ def build_litertlm(
    workdir: str,
    output_path: str,
    context_length: int,
-    model_prompt_prefix: str | None,
-    model_prompt_suffix: str | None,
-    user_prompt_prefix: str | None,
-    user_prompt_suffix: str | None,
-    tokenizer_model_path: str | None,
-    hf_tokenizer_model_path: str | None,
+    model_prompt_prefix: str | None = None,
+    model_prompt_suffix: str | None = None,
+    user_prompt_prefix: str | None = None,
+    user_prompt_suffix: str | None = None,
+    tokenizer_model_path: str | None = None,
+    hf_tokenizer_model_path: str | None = None,
    start_token: str | None = None,
    start_token_id: int | None = None,
    stop_tokens: str | list[str] | None = None,
    stop_token_ids: list[int] | None = None,
+    llm_model_type: str = 'generic',
+    jinja_prompt_template: str | None = None,
+    base_llm_metadata_path: str | None = None,
    **kwargs,
 ):
   """Builds a LiteRT-LM file from a TFlite model."""
@@ -58,10 +64,22 @@ def build_litertlm(
 
  if not is_litertlm_builder_available():
    raise ValueError('LiteRT-LM builder is not available.')
-  assert llm_metadata_pb2 is not None
  assert litertlm_builder is not None
+  assert llm_metadata_pb2 is not None
+  assert llm_model_type_pb2 is not None
 
  llm_metadata = llm_metadata_pb2.LlmMetadata()
+  if base_llm_metadata_path:
+    if base_llm_metadata_path.endswith('.pb'):
+      with open(base_llm_metadata_path, 'rb') as f:
+        llm_metadata.ParseFromString(f.read())
+    elif base_llm_metadata_path.endswith('.textproto'):
+      with open(base_llm_metadata_path, 'r') as f:
+        text_format.Parse(f.read(), llm_metadata, allow_unknown_field=True)
+    else:
+      raise ValueError(
+          'Base LLM metadata path must be a binary or text proto file.'
+      )
 
  if start_token_id is not None:
    llm_metadata.start_token.token_ids.ids.append(start_token_id)
@@ -96,7 +114,42 @@ def build_litertlm(
 
  llm_metadata.max_num_tokens = context_length
 
-
+  mdl_type = llm_metadata.llm_model_type.WhichOneof('model_type')
+  if not mdl_type or mdl_type == 'generic_model':
+    match llm_model_type:
+      case litertlm_builder.LlmModelType.GENERIC:
+        llm_metadata.llm_model_type.CopyFrom(
+            llm_model_type_pb2.LlmModelType(
+                generic_model=llm_model_type_pb2.GenericModel()
+            )
+        )
+      case litertlm_builder.LlmModelType.GEMMA3N:
+        llm_metadata.llm_model_type.CopyFrom(
+            llm_model_type_pb2.LlmModelType(
+                gemma3n=llm_model_type_pb2.Gemma3N()
+            )
+        )
+      case litertlm_builder.LlmModelType.GEMMA3:
+        llm_metadata.llm_model_type.CopyFrom(
+            llm_model_type_pb2.LlmModelType(gemma3=llm_model_type_pb2.Gemma3())
+        )
+      case litertlm_builder.LlmModelType.QWEN3:
+        llm_metadata.llm_model_type.CopyFrom(
+            llm_model_type_pb2.LlmModelType(qwen3=llm_model_type_pb2.Qwen3())
+        )
+      case litertlm_builder.LlmModelType.QWEN2P5:
+        llm_metadata.llm_model_type.CopyFrom(
+            llm_model_type_pb2.LlmModelType(
+                qwen2p5=llm_model_type_pb2.Qwen2p5()
+            )
+        )
+      case _:
+        raise ValueError(f'Unsupported LLM model type: {llm_model_type}')
+
+  if jinja_prompt_template is not None:
+    llm_metadata.jinja_prompt_template = jinja_prompt_template
+
+  llm_metadata_path = os.path.join(workdir, 'llm_metadata_final.pb')
  with open(llm_metadata_path, 'wb') as f:
    f.write(llm_metadata.SerializeToString())
 
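The base_llm_metadata_path hook means a model family can ship a hand-written textproto and let the builder fill in the rest. A hedged sketch of preparing such a file (assumes ai_edge_litert's internal proto is importable; the parse mirrors the builder's own lenient parse above):

    from google.protobuf import text_format
    from ai_edge_litert.internal import llm_metadata_pb2

    llm_metadata = llm_metadata_pb2.LlmMetadata()
    with open('llm_metadata.textproto') as f:
      # allow_unknown_field tolerates fields from newer schema revisions.
      text_format.Parse(f.read(), llm_metadata, allow_unknown_field=True)
    with open('llm_metadata.pb', 'wb') as f:
      f.write(llm_metadata.SerializeToString())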
ai_edge_torch/generative/utilities/loader.py

@@ -135,7 +135,8 @@ def load_pytorch_statedict(full_path: str):
 
  tensors = {}
  for file in files:
-
+    map_location = "cpu" if not torch.cuda.is_available() else None
+    this_file_tensors = torch.load(file, map_location=map_location)
    for k in this_file_tensors:
      assert k not in tensors
    tensors.update(this_file_tensors)
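The map_location guard keeps CPU-only hosts from failing when a checkpoint was saved on GPU. Standalone illustration of the same idiom (the file name is a placeholder):

    import torch

    # Tensors saved from CUDA deserialize onto CPU when CUDA is unavailable;
    # map_location=None preserves the saved device placement otherwise.
    map_location = "cpu" if not torch.cuda.is_available() else None
    state_dict = torch.load("pytorch_model.bin", map_location=map_location)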
ai_edge_torch/lowertools/translate_recipe.py

@@ -80,8 +80,14 @@ def _get_granularity(
    return _QuantGranularity.CHANNELWISE
  if granularity == quant_attrs.Granularity.NONE:
    return _QuantGranularity.TENSORWISE
-  if granularity == quant_attrs.Granularity.
-    return _QuantGranularity.
+  if granularity == quant_attrs.Granularity.BLOCKWISE_32:
+    return _QuantGranularity.BLOCKWISE_32
+  if granularity == quant_attrs.Granularity.BLOCKWISE_64:
+    return _QuantGranularity.BLOCKWISE_64
+  if granularity == quant_attrs.Granularity.BLOCKWISE_128:
+    return _QuantGranularity.BLOCKWISE_128
+  if granularity == quant_attrs.Granularity.BLOCKWISE_256:
+    return _QuantGranularity.BLOCKWISE_256
  raise ValueError('Unimplemented granularity')
 
 
@@ -108,7 +114,6 @@ def _set_quant_config(
        symmetric=True,
        granularity=_get_granularity(layer_recipe.granularity),
        dtype=_get_dtype_from_dtype(layer_recipe.weight_dtype),
-        block_size=layer_recipe.block_size,
    ),
    compute_precision=_get_compute_precision_from_mode(layer_recipe.mode),
    explicit_dequantize=_get_explicit_dequant_from_mode(
ai_edge_torch/odml_torch/experimental/__init__.py (new file)

@@ -0,0 +1,14 @@
+# Copyright 2024 The AI Edge Torch Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
ai_edge_torch/odml_torch/experimental/torch_tfl/__init__.py (new file)

@@ -0,0 +1,20 @@
+# Copyright 2025 The AI Edge Torch Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Torch-TFL ops definitions, decompositions, and lowerings."""
+from ai_edge_torch.odml_torch.experimental.torch_tfl import _decomps
+from ai_edge_torch.odml_torch.experimental.torch_tfl import _lowerings
+from ai_edge_torch.odml_torch.experimental.torch_tfl import _ops
+
+decomps = _decomps.decomps