gimlet-api 0.0.9__py3-none-any.whl → 0.0.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {gimlet_api-0.0.9.dist-info → gimlet_api-0.0.10.dist-info}/METADATA +4 -2
- {gimlet_api-0.0.9.dist-info → gimlet_api-0.0.10.dist-info}/RECORD +19 -17
- gml/client.py +27 -18
- gml/compile.py +48 -2
- gml/hf.py +232 -38
- gml/model.py +37 -0
- gml/preprocessing.py +17 -3
- gml/proto/opentelemetry/proto/metrics/v1/metrics_pb2.py +39 -38
- gml/proto/src/api/corepb/v1/compiled_pipeline_pb2.py +64 -0
- gml/proto/src/api/corepb/v1/controlplane_pb2.py +35 -9
- gml/proto/src/api/corepb/v1/cp_edge_pb2.py +37 -35
- gml/proto/src/api/corepb/v1/deployed_pipeline_pb2.py +37 -0
- gml/proto/src/api/corepb/v1/device_info_pb2.py +19 -11
- gml/proto/src/api/corepb/v1/gem_config_pb2.py +17 -13
- gml/proto/src/api/corepb/v1/mediastream_pb2.py +42 -39
- gml/proto/src/api/corepb/v1/model_exec_pb2.py +129 -109
- gml/proto/src/controlplane/compiler/cpb/v1/cpb_pb2.py +20 -8
- gml/proto/src/controlplane/logicalpipeline/lppb/v1/lppb_pb2.py +25 -23
- {gimlet_api-0.0.9.dist-info → gimlet_api-0.0.10.dist-info}/WHEEL +0 -0
{gimlet_api-0.0.9.dist-info → gimlet_api-0.0.10.dist-info}/METADATA
CHANGED
@@ -9,11 +9,13 @@ Classifier: Typing :: Typed
 Requires-Python: >=3
 Requires-Dist: protobuf
 Requires-Dist: grpcio
-Requires-Dist: torch>=2.
+Requires-Dist: torch>=2.6.0
 Requires-Dist: torch-mlir-gml
 Requires-Dist: numpy<2.0.0
+Requires-Dist: rich
 Requires-Dist: transformers>=4.43.3
+Requires-Dist: tokenizers>=0.21.0
 Requires-Dist: safetensors-mlir
-Version: 0.0.9
+Version: 0.0.10
 
 UNKNOWN
{gimlet_api-0.0.9.dist-info → gimlet_api-0.0.10.dist-info}/RECORD
CHANGED
@@ -1,14 +1,14 @@
 gml/__init__.py,sha256=H3WQZ_RaN7VNeb__qeHEbKLEwkaG7gpL5FQ8s1IotUA,773
 gml/_utils.py,sha256=mSCWHhCdzcUvHqmJIB2FS215K1LMgJCWcZ6e6FWK3hQ,1184
 gml/asset_manager.py,sha256=VnbqUZHPOgPrAh6ri9C0EuNhS8tAHIrbUyJPAJuD9po,2053
-gml/client.py,sha256=
-gml/compile.py,sha256=
+gml/client.py,sha256=AcnG5mniHOfq-He-uCph2-xQ39cZwmXZePaUEed87b8,14378
+gml/compile.py,sha256=3L5fpD8DK45RLiywj1b5NuDlbsxpzRxI87k1GahlMpc,9851
 gml/device.py,sha256=Iw71NnuLcgjY32ZMXHlnlPkosTuHEmL9E98utmNChlM,2650
-gml/hf.py,sha256=
-gml/model.py,sha256=
+gml/hf.py,sha256=Kv2yffy8omTRQDPnoIZocG2EOyfhr7UvLFIvTmRxw0g,36170
+gml/model.py,sha256=8fIYlLRduTsUZfYJr_YVPNxbEVIzr7_yaaTe4T-TZ2Y,8429
 gml/model_utils.py,sha256=vZvE5cHZIDkUkeZ4Pk4hhV-zOYMiREluv4b8kdqQ3Ig,1375
 gml/pipelines.py,sha256=LKj_lh5I5HzyUUIPG4CImiqBnQPrJsj0CHPKhLiOOGo,8374
-gml/preprocessing.py,sha256=
+gml/preprocessing.py,sha256=YPcxwBOdx0h0ADzoloYbFw9qUGFbi167E8HA4Zwn7Pk,3928
 gml/proto/gogoproto/gogo_pb2.py,sha256=WVMIAR8K--mCUkTPM7mEeeXGpQlRRtt_kco10iP3CZs,15728
 gml/proto/mediapipe/framework/calculator_contract_test_pb2.py,sha256=hNjyZCBz3RYa6rN4xR3FOCZKA24gq_LsJ3EMegl5wK4,2031
 gml/proto/mediapipe/framework/calculator_options_pb2.py,sha256=Nq1BQRtLdsIgfkw7ymD3eg2p2_RSlZhiHS7YbDhNHR0,1563
@@ -23,29 +23,31 @@ gml/proto/mediapipe/framework/stream_handler_pb2.py,sha256=kNo-2Fdua_CeyJInI3q5r
 gml/proto/mediapipe/framework/test_calculators_pb2.py,sha256=tXF25VpGtHGArffRqFmjD6FO7xmuCPd5j9UYON2SVSM,2230
 gml/proto/mediapipe/framework/thread_pool_executor_pb2.py,sha256=9TJ66fqSo1BiJmEAQesK0fnVe55zcJpOqVip6HotgyE,2345
 gml/proto/opentelemetry/proto/common/v1/common_pb2.py,sha256=wQjeDti-C8JiNwRn-z5M5p-Fqxm-SmnbPaoitJcSK-4,2860
-gml/proto/opentelemetry/proto/metrics/v1/metrics_pb2.py,sha256=
+gml/proto/opentelemetry/proto/metrics/v1/metrics_pb2.py,sha256=k8oW5tmFlJK2574Ky6kDc0JmNNQCLroRwCCGyxDd7JA,9968
 gml/proto/opentelemetry/proto/resource/v1/resource_pb2.py,sha256=cbNmE12Nm3PjW4NXU7-Z-9m_0Zs3Ab8R1xLkDnvclCg,1730
-gml/proto/src/api/corepb/v1/
-gml/proto/src/api/corepb/v1/
-gml/proto/src/api/corepb/v1/
-gml/proto/src/api/corepb/v1/
-gml/proto/src/api/corepb/v1/
-gml/proto/src/api/corepb/v1/
+gml/proto/src/api/corepb/v1/compiled_pipeline_pb2.py,sha256=g3MxBqshtwaM9_Nrbvwo995_XWq-maXGP6mDeiEzZKo,7529
+gml/proto/src/api/corepb/v1/controlplane_pb2.py,sha256=DylHEVXr36Deh5p-WK8aRwQF-uGW5mJ2mo8pJ3qg7KA,13213
+gml/proto/src/api/corepb/v1/cp_edge_pb2.py,sha256=H0WgAgv6-qaf7wnnKALmSBpD_czmUNHNYpsnE3Tmcrs,14988
+gml/proto/src/api/corepb/v1/deployed_pipeline_pb2.py,sha256=cZjoJuZ3fpCiw2Ox7bcHCXYqRTebb08n-aodwjE-xKI,3053
+gml/proto/src/api/corepb/v1/device_info_pb2.py,sha256=pTZGPjfglje-Wu_-R4qiwPtewXNJIGq5Kedme9SHiaU,6713
+gml/proto/src/api/corepb/v1/gem_config_pb2.py,sha256=vC0g3k9hDv-LhiV6LwaYCly6x00Xx_YA0i2AZSwCo_I,5396
+gml/proto/src/api/corepb/v1/mediastream_pb2.py,sha256=mgi5-prV7Lz0XJ2wo04jGLSvbnDGtdmduSv_6d6I9oA,8368
+gml/proto/src/api/corepb/v1/model_exec_pb2.py,sha256=_TXJvHSxkX1Il6xEVEiFIfei_ZV4KhdL3cSKaMgIYIw,33548
 gml/proto/src/common/typespb/jwt_pb2.py,sha256=lxy-bqbyg96i9n_xr2JbkuWX-ldnoJavXPMnApzVSio,5580
 gml/proto/src/common/typespb/status_pb2.py,sha256=IbBJnbsAlvsuTtyT285ZuW6k5VaPfl5kRSOnBxD_H8M,2109
 gml/proto/src/common/typespb/uuid_pb2.py,sha256=5Fm3jYpCPX7sMrP6RhRYsF0SnuZNIBEQJk9f0jwZ2Rw,1188
-gml/proto/src/controlplane/compiler/cpb/v1/cpb_pb2.py,sha256=
+gml/proto/src/controlplane/compiler/cpb/v1/cpb_pb2.py,sha256=4mp1QWV7FOzF_nC3RDKZ9vTA-ezMhukcjBEt1lcjGmM,4933
 gml/proto/src/controlplane/compiler/cpb/v1/cpb_pb2_grpc.py,sha256=l-gTK9nYpTlVb7QGAckSQXlHhkRdKe2-nrxXc8NQavY,2912
 gml/proto/src/controlplane/directory/directorypb/v1/directory_pb2.py,sha256=KgoUT8ccF-yJPe1r4otQjAPQoKBaQzdBlHoIUSkk0yE,11445
 gml/proto/src/controlplane/directory/directorypb/v1/directory_pb2_grpc.py,sha256=p3OpT8-hfNHu4-29qr-ZahRwO-LoCYM9Q4jomAHTXGA,24572
 gml/proto/src/controlplane/filetransfer/ftpb/v1/ftpb_pb2.py,sha256=r8mbJNTq45_c0amPnTr8OFZasCk7XWu2YS_eu7GfWJg,7050
 gml/proto/src/controlplane/filetransfer/ftpb/v1/ftpb_pb2_grpc.py,sha256=XlE4R2PJaOmzQocx7y6SKJvuqt8tYBGzBuhajvzG0cc,12919
-gml/proto/src/controlplane/logicalpipeline/lppb/v1/lppb_pb2.py,sha256=
+gml/proto/src/controlplane/logicalpipeline/lppb/v1/lppb_pb2.py,sha256=2s2p6dURKJLboaR965m2-rGTo_63Bi1cXsA90Hz9u-M,6632
 gml/proto/src/controlplane/logicalpipeline/lppb/v1/lppb_pb2_grpc.py,sha256=-snjW7n6JveUzJVPFcm25XlL19kowPSKgd61l_jPnHA,9541
 gml/proto/src/controlplane/model/mpb/v1/mpb_pb2.py,sha256=RVedXkNYu2iF5OHiXoYyRw9AGRCUWG7qNyY-5QY71Go,3762
 gml/proto/src/controlplane/model/mpb/v1/mpb_pb2_grpc.py,sha256=KSdb6V04qUHDsb1R2o3wixwTyZgrhwnPYobjnRgWX4I,4735
 gml/register_submodules.py,sha256=U8IwjVygX2vxNi_aK6ljHOD4mmrOhbyVczvy4wwulqU,5027
 gml/tensor.py,sha256=aPLm3I3qkYNDcJmntaUycqqN5rsZmcj8ql0EkupJudY,14977
-gimlet_api-0.0.
-gimlet_api-0.0.
-gimlet_api-0.0.
+gimlet_api-0.0.10.dist-info/WHEEL,sha256=sobxWSyDDkdg_rinUth-jxhXHqoNqlmNMJY3aTZn2Us,91
+gimlet_api-0.0.10.dist-info/METADATA,sha256=i3n2dnjznNFL6XFsj1bL0T544E0FmMVQySLgiBkUW04,586
+gimlet_api-0.0.10.dist-info/RECORD,,
gml/client.py
CHANGED
@@ -18,8 +18,12 @@ import os
 import uuid
 from pathlib import Path
 from typing import BinaryIO, List, Optional, TextIO, Union
+from urllib.parse import quote
 
 import grpc
+from rich.progress import (
+    Console,
+)
 
 import gml.proto.src.api.corepb.v1.model_exec_pb2 as modelexecpb
 import gml.proto.src.common.typespb.uuid_pb2 as uuidpb
@@ -39,6 +43,7 @@ from gml.model import Model
 from gml.pipelines import Pipeline
 
 DEFAULT_CONTROLPLANE_ADDR = "app.gimletlabs.ai"
+console = Console()
 
 
 class _ChannelFactory:
@@ -282,31 +287,28 @@ class Client:
     def create_model(self, model: Model) -> modelexecpb.Model:
         existing_model = self._get_model_if_exists(model.name)
         if existing_model is not None:
-            print(
-                'warning: model "{}" already exists and will not be uploaded.'
-                    model.name
-            )
+            console.print(
+                f'[yellow]warning:[/yellow] model "{model.name}" already exists and will not be uploaded.'
             )
             return existing_model
-
         model_info = model.to_proto()
-        with
-
-
-            file
-
-            sha256 = sha256sum(file)
+        with console.status(f'Creating model "{model.name}"...'):
+            with model.collect_assets() as model_assets:
+                for asset_name, file in model_assets.items():
+                    if isinstance(file, Path) or isinstance(file, str):
+                        file = open(file, "rb")
 
-
-            if asset_name:
-                upload_name += ":" + asset_name
-            print(f"Uploading {upload_name}...")
+                    sha256 = sha256sum(file)
 
-
+                    upload_name = model.name
+                    if asset_name:
+                        upload_name += ":" + asset_name
+                    file_info = self._upload_file_if_not_exists(sha256, file, sha256)
+                    console.print(f"Uploaded {upload_name}.")
 
-
+                    model_info.file_assets[asset_name].MergeFrom(file_info.file_id)
 
-
+                    file.close()
 
         return self._create_model(model_info)
 
@@ -331,6 +333,8 @@ class Client:
         else:
             raise ValueError("must specify one of 'pipeline_file' or 'pipeline'")
 
+        console.print(f'Uploading pipeline "{name}" to {self._org_name}...')
+
         for model in models:
             self.create_model(model)
 
@@ -343,6 +347,11 @@ class Client:
         resp: lppb.CreateLogicalPipelineResponse = stub.CreateLogicalPipeline(
             req, metadata=self._get_request_metadata(idempotent=True)
         )
+
+        url = f"https://{os.getenv('GML_CONTROLPLANE_ADDR')}/orgs/{quote(self._org_name)}/pipelines/{quote(name)}"
+        console.print(
+            f"[green]Pipeline upload complete![/green]\nView your pipeline at: [cyan]{url}[/cyan]"
+        )
         return resp.id
 
     def check_compile(
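The client.py changes replace plain `print` output with a module-level rich `Console`: model creation runs inside a `console.status(...)` spinner and the final pipeline message uses console markup. The sketch below is not part of the package; it only illustrates the rich calls the new code relies on, with a hypothetical `upload_model` helper standing in for the client logic (the package imports `Console` via `rich.progress`, while `rich.console` is the canonical path used here).

```python
# Illustrative only: the rich Console APIs used by the new client.py code.
from rich.console import Console

console = Console()


def upload_model(name: str) -> None:
    # status() shows a spinner while the body runs, as create_model now does.
    with console.status(f'Creating model "{name}"...'):
        console.print(f"Uploaded {name}.")
    # Markup tags style the final message, mirroring the pipeline URL output.
    console.print(f"[green]Upload complete![/green] Model: [cyan]{name}[/cyan]")


if __name__ == "__main__":
    upload_model("example-model")
```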
gml/compile.py
CHANGED
@@ -16,10 +16,11 @@
 
 import contextlib
 import functools
-from typing import Any, Dict, List, Optional, Sequence, Union
+from typing import Any, Dict, Iterable, List, Optional, Sequence, Union
 
 import safetensors_mlir
 import torch
+import torch.utils._pytree
 import torch_mlir
 from mlir.ir import (
     BF16Type,
@@ -28,6 +29,7 @@ from mlir.ir import (
     F16Type,
     F32Type,
     F64Type,
+    Float8E4M3FNType,
     IntegerType,
     Operation,
     RankedTensorType,
@@ -40,6 +42,7 @@ from torch_mlir.dialects import torch as torch_d
 from torch_mlir.extras.fx_decomp_util import get_decomposition_table
 from torch_mlir.extras.fx_importer import FxImporter, FxImporterHooks, InputInfo
 from torch_mlir.fx import export_and_import
+from transformers import DynamicCache
 
 from gml.asset_manager import AssetManager
 from gml.register_submodules import submodule_registration_workarounds
@@ -53,6 +56,45 @@ def _default_decomposition_denylist():
     ]
 
 
+_registered_dynamic_cache_pytree_node = False
+
+
+def register_dynamic_cache_pytree_node():
+    """
+    Registers flattening/unflattening for transformers.DynamicCache
+    Pytree is a representation of tensor collections used inside torch.export.
+    """
+
+    global _registered_dynamic_cache_pytree_node
+    if _registered_dynamic_cache_pytree_node:
+        return
+    _registered_dynamic_cache_pytree_node = True
+
+    def flatten_cache_with_keys(dynamic_cache: DynamicCache):
+        return [
+            (
+                torch.utils._pytree.MappingKey(i),
+                list(value),
+            )
+            for i, value in enumerate(dynamic_cache.to_legacy_cache())
+        ], None
+
+    def flatten_cache(dynamic_cache: DynamicCache):
+        flattened, ctx = flatten_cache_with_keys(dynamic_cache)
+        return [v for _, v in flattened], ctx
+
+    def unflatten_cache(flattened: Iterable[Any], context: Any):
+        return DynamicCache.from_legacy_cache(flattened)
+
+    torch.utils._pytree.register_pytree_node(
+        DynamicCache,
+        flatten_cache,
+        unflatten_cache,
+        serialized_type_name=f"{DynamicCache.__module__}.{DynamicCache.__name__}",
+        flatten_with_keys_fn=flatten_cache_with_keys,
+    )
+
+
 @contextlib.contextmanager
 def _patch_aot_export_module():
     """This contextmanager prevents PyTorch dispatch from running when calling aot_export_module.
@@ -91,6 +133,8 @@ _torch_dtype_to_builtin_element_type = {
     torch.complex32: lambda: ComplexType.get(F16Type.get()),
     torch.complex64: lambda: ComplexType.get(F32Type.get()),
     torch.complex128: lambda: ComplexType.get(F64Type.get()),
+    # Quantized types.
+    torch.float8_e4m3fn: lambda: Float8E4M3FNType.get(),
 }
 
 
@@ -179,6 +223,7 @@ def to_torch_mlir(
     ] = None,
     decomposition_denylist: Optional[List[torch._ops.OperatorBase]] = None,
     weight_manager: Optional[AssetManager] = None,
+    export_predispatch: bool = False,
 ):
     if dynamic_shapes is not None:
         for shape in dynamic_shapes:
@@ -205,10 +250,11 @@ def to_torch_mlir(
         # Ignore errors running the model. This can happen when the model has data dependent branches.
         pass
 
+    register_dynamic_cache_pytree_node()
     prog = _export(
         model,
         tuple(example_inputs),
-        pre_dispatch=
+        pre_dispatch=export_predispatch,
         strict=False,
         dynamic_shapes=dynamic_shapes,
     )
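compile.py now registers `transformers.DynamicCache` as a pytree node so that `torch.export` can flatten the KV cache into plain tensors. The sketch below is not gimlet-api code; it shows the same registration mechanism on a made-up `KVPair` container, using `torch.utils._pytree.register_pytree_node` as the diff does (a private but long-standing PyTorch API).

```python
# Illustrative only: registering a custom container as a pytree node,
# analogous to register_dynamic_cache_pytree_node for DynamicCache.
from typing import Any, List, Tuple

import torch
import torch.utils._pytree as pytree


class KVPair:
    """Hypothetical container holding one layer's key/value tensors."""

    def __init__(self, key: torch.Tensor, value: torch.Tensor):
        self.key = key
        self.value = value


def _flatten(pair: KVPair) -> Tuple[List[torch.Tensor], Any]:
    # Return the tensor children plus an (unused) context object.
    return [pair.key, pair.value], None


def _unflatten(children: List[torch.Tensor], context: Any) -> KVPair:
    key, value = children
    return KVPair(key, value)


# After registration, tree utilities (and torch.export tracing) treat KVPair
# like a tuple of tensors instead of an opaque object.
pytree.register_pytree_node(
    KVPair,
    _flatten,
    _unflatten,
    serialized_type_name=f"{KVPair.__module__}.{KVPair.__name__}",
)

leaves, spec = pytree.tree_flatten(KVPair(torch.zeros(2, 4), torch.ones(2, 4)))
rebuilt = pytree.tree_unflatten(leaves, spec)
assert torch.equal(rebuilt.value, torch.ones(2, 4))
```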
gml/hf.py
CHANGED
@@ -24,8 +24,10 @@ from typing import Any, BinaryIO, Dict, List, Optional, TextIO, Tuple
 
 import torch
 import transformers
+from rich.progress import Console
 from transformers import (
     BaseImageProcessor,
+    DynamicCache,
     Pipeline,
     PreTrainedModel,
     PreTrainedTokenizer,
@@ -49,6 +51,7 @@ from gml.tensor import (
     DetectionNumCandidatesDimension,
     DetectionOutputDimension,
     DimensionSemantics,
+    EmbeddingDimension,
     ImageChannelDimension,
     ImageHeightDimension,
     ImageWidthDimension,
@@ -60,6 +63,11 @@ from gml.tensor import (
 
 FALLBACK_RESIZE_SIZE = 512
 
+# Set dynamic dimension max size to less than the int64 max, leaving leeway for the size to be ~4x by the model.
+MAX_DYNAMIC_VAL = 2**61
+
+console = Console()
+
 
 class HuggingFaceTokenizer(Model):
     def __init__(self, tokenizer: PreTrainedTokenizer, name: Optional[str] = None):
@@ -105,7 +113,6 @@ def flatten(items):
 
 
 class WrapWithFunctionalCache(torch.nn.Module):
-
     def __init__(self, model: transformers.PreTrainedModel):
         super().__init__()
         self.model = model
@@ -128,6 +135,8 @@ class HuggingFaceTextGenerationPipeline:
         name: Optional[str] = None,
         tokenizer_name: Optional[str] = None,
         dynamic_seqlen: bool = False,
+        dynamic_batch: bool = False,
+        export_predispatch: bool = False,
     ):
         self.pipeline = pipeline
         self.tokenizer_model = HuggingFaceTokenizer(pipeline.tokenizer, tokenizer_name)
@@ -139,13 +148,20 @@ class HuggingFaceTextGenerationPipeline:
             self.model = self.model.to(torch.float16)
         self.model = WrapWithFunctionalCache(pipeline.model)
 
+        self.dynamic_batch = dynamic_batch
+        self.batch_size = 1
+        if self.dynamic_batch:
+            # dynamic tracing fails for dimensions of size 1.
+            self.batch_size = 2
+
         self.language_model = TorchModel(
             name,
             torch_module=self.model,
+            export_predispatch=export_predispatch,
             **self._guess_model_spec(dynamic_seqlen),
         )
 
-    def _initialize_key_value_cache(self):
+    def _initialize_key_value_cache(self) -> DynamicCache:
         cache = []
         config = self.pipeline.model.config
         head_dim = (
@@ -158,7 +174,12 @@ class HuggingFaceTextGenerationPipeline:
             if config.num_key_value_heads is None
             else config.num_key_value_heads
        )
-        cache_shape = (
+        cache_shape = (
+            self.batch_size,
+            num_key_value_heads,
+            self._cache_length_for_tracing,
+            head_dim,
+        )
         for _ in range(config.num_hidden_layers):
             cache.append(
                 [
@@ -166,7 +187,67 @@ class HuggingFaceTextGenerationPipeline:
                     torch.zeros(cache_shape).to(torch.float16),
                 ]
             )
-        return cache
+        return DynamicCache.from_legacy_cache(cache)
+
+    def _parse_transformer_config(
+        self, model: transformers.PreTrainedModel
+    ) -> modelexecpb.TransformerConfig:
+        # Only non-default rope config set the rope_scaling parameter
+        attention_head_size = getattr(
+            model.config,
+            "attention_head_size",
+            model.config.hidden_size // model.config.num_attention_heads,
+        )
+        partial_rotary_factor = getattr(model.config, "partial_rotary_factor", 1.0)
+        rotary_embedding_dim = getattr(
+            model.config,
+            "rotary_dim",
+            int(attention_head_size * partial_rotary_factor),
+        )
+        if (
+            hasattr(model.config, "rope_scaling")
+            and model.config.rope_scaling is not None
+        ):
+            rope_scaling = model.config.rope_scaling
+            rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", None))
+            if not rope_type == "llama3":
+                raise NotImplementedError(
+                    "rope scaling type {} is not supported".format(rope_type)
+                )
+            # LLAMA 3 example config: https://huggingface.co/meta-llama/Llama-3.2-1B/blob/main/config.json
+            llama3_config = modelexecpb.Llama3RopeConfig()
+            llama3_config.theta = model.config.rope_theta
+            llama3_config.rotary_embedding_dim = rotary_embedding_dim
+            llama3_config.max_position_embeddings = model.config.max_position_embeddings
+
+            llama3_config.factor = rope_scaling["factor"]
+            llama3_config.high_freq_factor = rope_scaling["high_freq_factor"]
+            llama3_config.low_freq_factor = rope_scaling["low_freq_factor"]
+            llama3_config.original_max_position_embeddings = rope_scaling[
+                "original_max_position_embeddings"
+            ]
+            return modelexecpb.TransformerConfig(
+                position_embedding_config=modelexecpb.PositionEmbeddingConfig(
+                    kind=modelexecpb.PositionEmbeddingKind.POSITION_EMBEDDING_KIND_ROPE_LLAMA3,
+                    llama3_rope_config=llama3_config,
+                ),
+            )
+        # Default rope configs:
+        # 1. Llama-2: https://huggingface.co/NousResearch/Llama-2-7b-hf/blob/main/config.json
+        # 2. Qwen2.5: https://huggingface.co/Qwen/Qwen2.5-14B-Instruct-1M/blob/main/config.json
+        # 3. Mixtral: https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1/blob/main/config.json
+        default_rope_config = modelexecpb.DefaultRopeConfig()
+        default_rope_config.theta = model.config.rope_theta
+        default_rope_config.max_position_embeddings = (
+            model.config.max_position_embeddings
+        )
+        default_rope_config.rotary_embedding_dim = rotary_embedding_dim
+        return modelexecpb.TransformerConfig(
+            position_embedding_config=modelexecpb.PositionEmbeddingConfig(
+                kind=modelexecpb.PositionEmbeddingKind.POSITION_EMBEDDING_KIND_ROPE_DEFAULT,
+                default_rope_config=default_rope_config,
+            ),
+        )
 
     def _guess_model_spec(self, dynamic_seqlen: bool) -> Dict:
         input_dict = self.pipeline.preprocess("this is a prompt! Test test test?")
@@ -179,7 +260,7 @@ class HuggingFaceTextGenerationPipeline:
         input_tensor_semantics = []
 
         # This currently assumes that all HF language models have inputs that are [B, NUM_TOKENS].
-        inputs.append(input_dict["input_ids"])
+        inputs.append(torch.tile(input_dict["input_ids"], [self.batch_size, 1]))
         input_tensor_semantics.append(
             TensorSemantics(
                 dimensions=[
@@ -192,7 +273,7 @@ class HuggingFaceTextGenerationPipeline:
         # Assume that the model supports a KeyValue cache.
         cache_values = self._initialize_key_value_cache()
         inputs.append(cache_values)
-        for _ in cache_values:
+        for _ in range(len(cache_values)):
             input_tensor_semantics.append(AttentionKeyValueCacheTensorSemantics())
             input_tensor_semantics.append(AttentionKeyValueCacheTensorSemantics())
 
@@ -209,7 +290,7 @@ class HuggingFaceTextGenerationPipeline:
             if (
                 not found_logits
                 and len(tensor.shape) == 3
-                and tensor.shape[0] ==
+                and tensor.shape[0] == self.batch_size
                 and tensor.shape[1] == seqlen
             ):
                 # This should be the logits tensor.
@@ -226,14 +307,38 @@ class HuggingFaceTextGenerationPipeline:
             else:
                 output_tensor_semantics.append(AttentionKeyValueCacheTensorSemantics())
 
+        if not found_logits:
+            raise ValueError(
+                "could not determine output logits tensor for text generation model"
+            )
+
+        num_experts_per_tok = (
+            1
+            if not hasattr(self.pipeline.model.config, "num_experts_per_tok")
+            else self.pipeline.model.config.num_experts_per_tok
+        )
+
         dynamic_shapes = None
-
+        # Set range to half of seqlen to account for # of tokens per expert.
+        # pytorch export creates a constraint on the number of possible tokens
+        # sent to each expert. That value is num_experts * seqlen. If we don't divide
+        # by number of experts, the tracing creates an integer value that exceeds the valid int64
+        # range and will throw a hard to decipher error message.
+        seqlen = torch.export.Dim(
+            "seqlen", min=2, max=MAX_DYNAMIC_VAL // num_experts_per_tok
+        )
 
-        cache_length = torch.export.Dim("cache_length", min=2, max=
+        cache_length = torch.export.Dim("cache_length", min=2, max=MAX_DYNAMIC_VAL)
         dynamic_shapes = [
             {1: seqlen},
-            [[{2: cache_length}, {2: cache_length}] for _ in cache_values],
+            [[{2: cache_length}, {2: cache_length}] for _ in range(len(cache_values))],
         ]
+        if self.dynamic_batch:
+            batch = torch.export.Dim("batch")
+            dynamic_shapes[0][0] = batch
+            for i in range(len(cache_values)):
+                dynamic_shapes[1][i][0][0] = batch
+                dynamic_shapes[1][i][1][0] = batch
 
         return {
             "example_inputs": inputs,
@@ -241,6 +346,7 @@ class HuggingFaceTextGenerationPipeline:
             "input_tensor_semantics": input_tensor_semantics,
             "output_tensor_semantics": output_tensor_semantics,
             "generation_config": HuggingFaceGenerationConfig(self.pipeline.model),
+            "transformer_config": self._parse_transformer_config(self.pipeline.model),
         }
 
     def models(self) -> List[Model]:
@@ -695,8 +801,8 @@ class HuggingFaceZeroShotObjectDetectionPipeline:
 
         spec["dynamic_shapes"].extend(
             [
-                {0: "num_labels"},
-                {0: "num_labels"},
+                {0: torch.export.Dim("num_labels", max=MAX_DYNAMIC_VAL)},
+                {0: torch.export.Dim("num_labels", max=MAX_DYNAMIC_VAL)},
             ]
         )
 
@@ -762,33 +868,121 @@ class HuggingFaceDepthEstimationPipeline:
         return [self.model]
 
 
-
-
-
-
-
+class HuggingFaceFeatureExtractionPipeline:
+    def __init__(self, pipeline: Pipeline, name: Optional[str] = None):
+        self.pipeline = pipeline
+        if name is None:
+            name = pipeline.model.name_or_path
+
+        self.tokenizer_model = HuggingFaceTokenizer(self.pipeline.tokenizer)
+
+        self.model = TorchModel(
+            name=name,
+            torch_module=self.pipeline.model,
+            **self._guess_model_spec(),
+        )
+
+    def _guess_model_spec(self) -> Dict:
+        spec = {
+            "example_inputs": [],
+            "input_tensor_semantics": [],
+            "output_tensor_semantics": [],
+            "dynamic_shapes": [],
+        }
+
+        input_dict = self.pipeline.preprocess("this is a prompt! Test test test?")
+        if "input_ids" not in input_dict:
+            raise ValueError(
+                'HuggingFaceFeatureExtractionPipeline expects preprocessed inputs to have an "input_ids" tensor'
             )
+
+        spec["example_inputs"].append(input_dict["input_ids"])
+        spec["input_tensor_semantics"].extend(
+            [
+                TensorSemantics(
+                    dimensions=[
+                        BatchDimension(),
+                        TokensDimension(),
+                    ]
+                ),
+            ]
+        )
+
+        spec["output_tensor_semantics"].extend(
+            [
+                TensorSemantics(
+                    dimensions=[
+                        BatchDimension(),
+                        TokensDimension(),
+                        EmbeddingDimension(),
+                    ],
+                ),
+                TensorSemantics(
+                    dimensions=[
+                        BatchDimension(),
+                        EmbeddingDimension(),
+                    ],
+                ),
+            ]
         )
 
-
-
-
-
-    elif pipeline.task == "object-detection":
-        return HuggingFaceObjectDetectionPipeline(pipeline, **kwargs).models()
-    elif pipeline.task == "zero-shot-object-detection":
-        return HuggingFaceZeroShotObjectDetectionPipeline(pipeline, **kwargs).models()
-    elif pipeline.task == "depth-estimation":
-        return HuggingFaceDepthEstimationPipeline(pipeline, **kwargs).models()
-    raise ValueError(
-        "unimplemented: hugging face pipeline task: {} (supported tasks: [{}])".format(
-            pipeline.task,
+        max_seqlen = (
+            getattr(self.pipeline.model.config, "max_position_embeddings", 500) - 1
+        )
+        spec["dynamic_shapes"].extend(
             [
-
-
-
-
-
-
-
-
+                {
+                    1: torch.export.Dim(
+                        "seqlen",
+                        max=max_seqlen,
+                    )
+                },
+            ]
+        )
+        return spec
+
+    def models(self) -> List[Model]:
+        return [self.model, self.tokenizer_model]
+
+
+def import_huggingface_pipeline(pipeline: Pipeline, **kwargs) -> List[Model]:
+    with console.status(
+        f'Importing HuggingFace pipeline: "{pipeline.model.name_or_path}"'
+    ):
+        if pipeline.framework != "pt":
+            raise ValueError(
+                "unimplemented: hugging face pipeline framework: {}".format(
+                    pipeline.framework
+                )
+            )
+
+        if pipeline.task == "text-generation":
+            result = HuggingFaceTextGenerationPipeline(pipeline, **kwargs).models()
+        elif pipeline.task == "image-segmentation":
+            result = HuggingFaceImageSegmentationPipeline(pipeline, **kwargs).models()
+        elif pipeline.task == "object-detection":
+            result = HuggingFaceObjectDetectionPipeline(pipeline, **kwargs).models()
+        elif pipeline.task == "zero-shot-object-detection":
+            result = HuggingFaceZeroShotObjectDetectionPipeline(
+                pipeline, **kwargs
+            ).models()
+        elif pipeline.task == "depth-estimation":
+            result = HuggingFaceDepthEstimationPipeline(pipeline, **kwargs).models()
+        elif pipeline.task == "feature-extraction":
+            result = HuggingFaceFeatureExtractionPipeline(pipeline, **kwargs).models()
+        else:
+            raise ValueError(
+                "unimplemented: hugging face pipeline task: {} (supported tasks: [{}])".format(
                    pipeline.task,
+                    [
+                        "text-generation",
+                        "image-segmentation",
+                        "object-detection",
+                        "zero-shot-object-detection",
+                        "depth-estimation",
+                        "feature-extraction",
+                    ],
+                )
+            )
+    console.print(f'Imported HuggingFace pipeline: "{pipeline.model.name_or_path}".')
+    return result