gimlet-api 0.0.13__py3-none-any.whl → 0.0.15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -13,10 +13,10 @@ Requires-Dist: torch>=2.6.0
  Requires-Dist: torch-mlir-gml
  Requires-Dist: numpy<2.0.0
  Requires-Dist: rich
- Requires-Dist: transformers>=4.43.3
+ Requires-Dist: transformers>=4.53.0
  Requires-Dist: tokenizers>=0.21.0
  Requires-Dist: safetensors-mlir
  Requires-Dist: packaging
- Version: 0.0.13
+ Version: 0.0.15

  UNKNOWN
@@ -1,13 +1,13 @@
  gml/__init__.py,sha256=H3WQZ_RaN7VNeb__qeHEbKLEwkaG7gpL5FQ8s1IotUA,773
  gml/_utils.py,sha256=mSCWHhCdzcUvHqmJIB2FS215K1LMgJCWcZ6e6FWK3hQ,1184
  gml/asset_manager.py,sha256=VnbqUZHPOgPrAh6ri9C0EuNhS8tAHIrbUyJPAJuD9po,2053
- gml/client.py,sha256=ztYvImrY_o8tR-_RJMXdDPVkqzjfVsF8jjM3CUhnFOY,14441
- gml/compile.py,sha256=WnsVgZTaiW7Uh-D_ObkX1ee9p4_8PDAF2KIQCiHAbFA,10744
- gml/device.py,sha256=Iw71NnuLcgjY32ZMXHlnlPkosTuHEmL9E98utmNChlM,2650
- gml/hf.py,sha256=Kv2yffy8omTRQDPnoIZocG2EOyfhr7UvLFIvTmRxw0g,36170
+ gml/client.py,sha256=3rqVTMSv7QUlSrMqSQDzXuXDvfm3WTF3G8enK8on9zU,15861
+ gml/compile.py,sha256=1zM0ihwbrptZ4FphyvtvfKJyVFuVVfyBnDnnEeJ12fg,10397
+ gml/device.py,sha256=9Z7dsBfpvTHShd1OWSi1Pvn85EFYDmn1dszWV8YHIJI,2648
+ gml/hf.py,sha256=4tU2c3Th_mc__78Odetg2g3b16eZM4oSevUtMr27H_k,37811
  gml/model.py,sha256=8fIYlLRduTsUZfYJr_YVPNxbEVIzr7_yaaTe4T-TZ2Y,8429
- gml/model_utils.py,sha256=vZvE5cHZIDkUkeZ4Pk4hhV-zOYMiREluv4b8kdqQ3Ig,1375
- gml/pipelines.py,sha256=hjsh7yNICDsjyKB8gQh9rtpwmfSk2q6otGogc932eLA,8454
+ gml/model_utils.py,sha256=Kw08MIPmwIOocoQXfjlqjn78mVerCQ2uzleT0H_zcck,1821
+ gml/pipelines.py,sha256=Nif9FNqXqjYM6iL3RZzkX-KBsZamSu7xh8HwjNqsB5A,10220
  gml/preprocessing.py,sha256=YPcxwBOdx0h0ADzoloYbFw9qUGFbi167E8HA4Zwn7Pk,3928
  gml/proto/gogoproto/gogo_pb2.py,sha256=WVMIAR8K--mCUkTPM7mEeeXGpQlRRtt_kco10iP3CZs,15728
  gml/proto/mediapipe/framework/calculator_contract_test_pb2.py,sha256=hNjyZCBz3RYa6rN4xR3FOCZKA24gq_LsJ3EMegl5wK4,2031
@@ -22,33 +22,36 @@ gml/proto/mediapipe/framework/status_handler_pb2.py,sha256=dgiW2ohm-ho07z1k4TM_X
  gml/proto/mediapipe/framework/stream_handler_pb2.py,sha256=kNo-2Fdua_CeyJInI3q5r9IoAUanjhk9jh01Z1KXu6Q,2043
  gml/proto/mediapipe/framework/test_calculators_pb2.py,sha256=tXF25VpGtHGArffRqFmjD6FO7xmuCPd5j9UYON2SVSM,2230
  gml/proto/mediapipe/framework/thread_pool_executor_pb2.py,sha256=9TJ66fqSo1BiJmEAQesK0fnVe55zcJpOqVip6HotgyE,2345
- gml/proto/opentelemetry/proto/common/v1/common_pb2.py,sha256=wQjeDti-C8JiNwRn-z5M5p-Fqxm-SmnbPaoitJcSK-4,2860
+ gml/proto/opentelemetry/proto/common/v1/common_pb2.py,sha256=2l9c_xGfUvShLFkzofChHmbgpa7I0-u-FJ_J2Wv3lvs,3168
  gml/proto/opentelemetry/proto/metrics/v1/metrics_pb2.py,sha256=k8oW5tmFlJK2574Ky6kDc0JmNNQCLroRwCCGyxDd7JA,9968
- gml/proto/opentelemetry/proto/resource/v1/resource_pb2.py,sha256=cbNmE12Nm3PjW4NXU7-Z-9m_0Zs3Ab8R1xLkDnvclCg,1730
- gml/proto/src/api/corepb/v1/compiled_pipeline_pb2.py,sha256=g3MxBqshtwaM9_Nrbvwo995_XWq-maXGP6mDeiEzZKo,7529
- gml/proto/src/api/corepb/v1/controlplane_pb2.py,sha256=r0tM1XFlorKCCv5hCYEq8-LXo7moJ4C8PXFGQl-GqiU,14694
- gml/proto/src/api/corepb/v1/cp_edge_pb2.py,sha256=KuLwOZprktJ6aFnUM_OkHucQXSKQ8yg-3lRHOZidWeI,21014
+ gml/proto/opentelemetry/proto/resource/v1/resource_pb2.py,sha256=08f2F5overFnGlyNyZHb5rUyv7-G9pC15c4xDCccIPY,1831
+ gml/proto/src/api/corepb/v1/actor_net_pb2.py,sha256=XV3UZbyxvHcXRG-kFdA5LImaTAnSVRw2scw8Cz5Mn6Q,11009
+ gml/proto/src/api/corepb/v1/compiled_pipeline_pb2.py,sha256=K2xhqSAxQ3w_3VFC43XpeqXm9_EDmVz6Fb8PrA_jxOY,7609
+ gml/proto/src/api/corepb/v1/controlplane_pb2.py,sha256=n6LOgoQCAJZ2LJivb9DffmzLpYY7e9K1EDT8pQsIbno,14705
+ gml/proto/src/api/corepb/v1/cp_dp_pb2.py,sha256=cHJkgfehbgnohipFXMtoHSzR5mGfVZ9oKwOVFjEMRc4,10358
+ gml/proto/src/api/corepb/v1/cp_edge_pb2.py,sha256=Ruw7_GcoElKzrhEoOMYkUezJDvrjwA29cVMtBhfV8I8,23294
+ gml/proto/src/api/corepb/v1/dataplane_pb2.py,sha256=D3nA4c8624Irh1cIWM8rbvUBUmr29CJ0lQlPko2BgMU,1966
  gml/proto/src/api/corepb/v1/deployed_pipeline_pb2.py,sha256=XbppBI1fQ-FazD2in1o6Z9_BIPRBArCE5dVUF7iUn3Y,6649
- gml/proto/src/api/corepb/v1/device_info_pb2.py,sha256=hcnU9CSZjTa0liXMGPLOos1oSKvF3jQdUaAgXZSqFS0,6760
+ gml/proto/src/api/corepb/v1/device_info_pb2.py,sha256=lXFF04AkL_Y3Tcg9mXAvgo-w3lmAMiuHQZT5yLpyO4s,7029
  gml/proto/src/api/corepb/v1/gem_config_pb2.py,sha256=vC0g3k9hDv-LhiV6LwaYCly6x00Xx_YA0i2AZSwCo_I,5396
- gml/proto/src/api/corepb/v1/mediastream_pb2.py,sha256=LB5YJNw_MMfFa4hgfWhpqp4yG2rTzxKZa4L3vzsB_lU,9838
- gml/proto/src/api/corepb/v1/model_exec_pb2.py,sha256=Z4y7P6nyO_6dwhEkv7qhsYKEyAJVEB4nS41LdO1NpYA,34465
- gml/proto/src/common/typespb/jwt_pb2.py,sha256=lxy-bqbyg96i9n_xr2JbkuWX-ldnoJavXPMnApzVSio,5580
+ gml/proto/src/api/corepb/v1/mediastream_pb2.py,sha256=fAB7s7w4soBtaWJXwni5OI--lWapnLM1LeqZzIBWnlo,10359
+ gml/proto/src/api/corepb/v1/model_exec_pb2.py,sha256=eh6-tQPq5GEhsuRM6IgJRc2PvKhMlQADu_Lj7FZN5O8,37754
+ gml/proto/src/common/typespb/jwt_pb2.py,sha256=JxBZr8JU1mBoo1PKPClXr3SdfjZynRYRlQ-JHZRjqhE,6134
  gml/proto/src/common/typespb/status_pb2.py,sha256=IbBJnbsAlvsuTtyT285ZuW6k5VaPfl5kRSOnBxD_H8M,2109
  gml/proto/src/common/typespb/uuid_pb2.py,sha256=5Fm3jYpCPX7sMrP6RhRYsF0SnuZNIBEQJk9f0jwZ2Rw,1188
- gml/proto/src/controlplane/compiler/cpb/v1/cpb_pb2.py,sha256=4mp1QWV7FOzF_nC3RDKZ9vTA-ezMhukcjBEt1lcjGmM,4933
+ gml/proto/src/controlplane/compiler/cpb/v1/cpb_pb2.py,sha256=MIizns1dezQjocpJjeNJ4Z7BFWNweKrRpJ070L9IaCk,5203
  gml/proto/src/controlplane/compiler/cpb/v1/cpb_pb2_grpc.py,sha256=l-gTK9nYpTlVb7QGAckSQXlHhkRdKe2-nrxXc8NQavY,2912
  gml/proto/src/controlplane/directory/directorypb/v1/directory_pb2.py,sha256=S3OzKYO34BRuYs3rSKbLfjAgm3LQb6wQFS-sfFdQSfk,11496
  gml/proto/src/controlplane/directory/directorypb/v1/directory_pb2_grpc.py,sha256=p3OpT8-hfNHu4-29qr-ZahRwO-LoCYM9Q4jomAHTXGA,24572
  gml/proto/src/controlplane/filetransfer/ftpb/v1/ftpb_pb2.py,sha256=r8mbJNTq45_c0amPnTr8OFZasCk7XWu2YS_eu7GfWJg,7050
  gml/proto/src/controlplane/filetransfer/ftpb/v1/ftpb_pb2_grpc.py,sha256=XlE4R2PJaOmzQocx7y6SKJvuqt8tYBGzBuhajvzG0cc,12919
- gml/proto/src/controlplane/logicalpipeline/lppb/v1/lppb_pb2.py,sha256=1-d46lvO8c80Rj7rWNaExSQsNeH9CioHilP9wW_o6I8,7985
- gml/proto/src/controlplane/logicalpipeline/lppb/v1/lppb_pb2_grpc.py,sha256=cX7p2xLe01WzYTaB2TzqhePUUhaZkTE4iOAZzHaklmQ,11634
+ gml/proto/src/controlplane/logicalpipeline/lppb/v1/lppb_pb2.py,sha256=CaRhKtLOcz9AhIw9Rxws0ALqmFAktqkQyRlgmoA6OF0,8976
+ gml/proto/src/controlplane/logicalpipeline/lppb/v1/lppb_pb2_grpc.py,sha256=NA_Ud55lnbJfgOw729j97MqfKuylCopiimt6pryNJoU,13740
  gml/proto/src/controlplane/model/mpb/v1/mpb_pb2.py,sha256=IryUZ-TlpQKvU52-XFRKvuAfiH-0EkXrTzwvmmK7Fmk,4591
  gml/proto/src/controlplane/model/mpb/v1/mpb_pb2_grpc.py,sha256=ZABewZxEthfQx2pEfvBfLc4M8JoKazheQOH181CegjY,6586
  gml/register_submodules.py,sha256=U8IwjVygX2vxNi_aK6ljHOD4mmrOhbyVczvy4wwulqU,5027
- gml/tensor.py,sha256=aPLm3I3qkYNDcJmntaUycqqN5rsZmcj8ql0EkupJudY,14977
+ gml/tensor.py,sha256=ojRlfMEf5wsLyOHuAVl0wuZlcaqO0KF4EDYdtEju6hk,15229
  gml/version_utils.py,sha256=ouCemolnoDm71NiQRcfpa5k5bETTLaFCH6lrEyivGNY,1626
- gimlet_api-0.0.13.dist-info/WHEEL,sha256=sobxWSyDDkdg_rinUth-jxhXHqoNqlmNMJY3aTZn2Us,91
- gimlet_api-0.0.13.dist-info/METADATA,sha256=iU7mWRms-_xioOe-DyeOgOE4oTfJ5c1f5NDFSp2Ew90,611
- gimlet_api-0.0.13.dist-info/RECORD,,
+ gimlet_api-0.0.15.dist-info/WHEEL,sha256=sobxWSyDDkdg_rinUth-jxhXHqoNqlmNMJY3aTZn2Us,91
+ gimlet_api-0.0.15.dist-info/METADATA,sha256=E-gLFjOe2M6B_KiEyg41boQKRbEw7ej_FwV-pJaop1k,611
+ gimlet_api-0.0.15.dist-info/RECORD,,
gml/client.py CHANGED
@@ -16,6 +16,7 @@

  import os
  import uuid
+ import warnings
  from pathlib import Path
  from typing import BinaryIO, List, Optional, TextIO, Union
  from urllib.parse import quote
@@ -24,6 +25,8 @@ import grpc
  from rich.progress import (
      Console,
  )
+ from tqdm import TqdmExperimentalWarning
+ from tqdm.rich import tqdm

  import gml.proto.src.api.corepb.v1.model_exec_pb2 as modelexecpb
  import gml.proto.src.common.typespb.uuid_pb2 as uuidpb
@@ -42,6 +45,9 @@ from gml.device import DeviceCapabilities
  from gml.model import Model
  from gml.pipelines import Pipeline

+ # Filter out tqdm experimental warnings for the rich progress bar.
+ warnings.filterwarnings("ignore", category=TqdmExperimentalWarning)
+
  DEFAULT_CONTROLPLANE_ADDR = "app.gimletlabs.ai"
  console = Console()

@@ -206,15 +212,27 @@ class Client:
          file_id: uuidpb.UUID,
          sha256: str,
          file: TextIO | BinaryIO,
+         display_name: str,
          chunk_size=1024 * 1024,
      ):
          def chunked_requests():
+             file.seek(0, os.SEEK_END)
+             total_size = file.tell()
+
              file.seek(0)
-             for chunk in chunk_file(file, chunk_size):
-                 req = ftpb.UploadFileRequest(
-                     file_id=file_id, sha256sum=sha256, chunk=chunk
-                 )
-                 yield req
+
+             with tqdm(
+                 total=total_size,
+                 desc=f"Uploading {display_name}",
+                 unit="B",
+                 unit_scale=True,
+             ) as pbar:
+                 for chunk in chunk_file(file, chunk_size):
+                     pbar.update(len(chunk))
+                     req = ftpb.UploadFileRequest(
+                         file_id=file_id, sha256sum=sha256, chunk=chunk
+                     )
+                     yield req

          stub = self._fts_stub()
          resp: ftpb.UploadFileResponse = stub.UploadFile(
@@ -226,6 +244,7 @@ class Client:
          self,
          name: str,
          file: TextIO | BinaryIO,
+         display_name: str,
          sha256: Optional[str] = None,
          chunk_size=1024 * 1024,
      ) -> ftpb.FileInfo:
@@ -233,18 +252,30 @@ class Client:

          if sha256 is None:
              sha256 = sha256sum(file)
-         self._upload_created_file(file_info.file_id, sha256, file, chunk_size)
+         self._upload_created_file(
+             file_id=file_info.file_id,
+             sha256=sha256,
+             file=file,
+             display_name=display_name,
+             chunk_size=chunk_size,
+         )
          return self._file_info_by_name(name)

      def _upload_file_if_not_exists(
          self,
-         name: str,
+         name: str,  # name is what is stored in the file service, typically a sha256 of the file.
          file: TextIO | BinaryIO,
+         display_name: str,
          sha256: Optional[str] = None,
      ) -> ftpb.FileInfo:
          file_info: Optional[ftpb.FileInfo] = None
          try:
-             file_info = self.upload_file(name, file, sha256)
+             file_info = self.upload_file(
+                 name=name,
+                 file=file,
+                 display_name=display_name,
+                 sha256=sha256,
+             )
          except FileAlreadyExists:
              file_info = self._file_info_by_name(name)

@@ -252,7 +283,12 @@ class Client:
              case ftpb.FILE_STATUS_READY:
                  pass
              case ftpb.FILE_STATUS_CREATED:
-                 self._upload_created_file(file_info.file_id, sha256, file)
+                 self._upload_created_file(
+                     file_id=file_info.file_id,
+                     sha256=sha256,
+                     file=file,
+                     display_name=display_name,
+                 )
                  file_info = self._file_info_by_name(name)
              case _:
                  raise Exception("file status is deleted or unknown, cannot re-upload")
@@ -292,23 +328,32 @@ class Client:
              )
              return existing_model
          model_info = model.to_proto()
-         with console.status(f'Creating model "{model.name}"...'):
-             with model.collect_assets() as model_assets:
-                 for asset_name, file in model_assets.items():
-                     if isinstance(file, Path) or isinstance(file, str):
-                         file = open(file, "rb")
-
-                     sha256 = sha256sum(file)
-
-                     upload_name = model.name
-                     if asset_name:
-                         upload_name += ":" + asset_name
-                     file_info = self._upload_file_if_not_exists(sha256, file, sha256)
-                     console.print(f"Uploaded {upload_name}.")
+         # Don't use context lib because we want to show progress for asset collection, then will have a separate
+         # progress bar for file upload.
+         status = console.status(f'Tracing model "{model.name}"...')
+         status.start()
+         with model.collect_assets() as model_assets:
+             status.stop()
+             for asset_name, file in model_assets.items():
+                 if isinstance(file, Path) or isinstance(file, str):
+                     file = open(file, "rb")
+
+                 sha256 = sha256sum(file)
+
+                 display_name = model.name
+                 if asset_name:
+                     display_name += ":" + asset_name
+                 file_info = self._upload_file_if_not_exists(
+                     name=sha256,
+                     file=file,
+                     display_name=display_name,
+                     sha256=sha256,
+                 )
+                 console.print(f"Uploaded {display_name}.")

-                     model_info.file_assets[asset_name].MergeFrom(file_info.file_id)
+                 model_info.file_assets[asset_name].MergeFrom(file_info.file_id)

-                     file.close()
+                 file.close()

          return self._create_model(model_info)

@@ -350,7 +395,7 @@ class Client:
              req, metadata=self._get_request_metadata(idempotent=True)
          )

-         url = f"https://{self._controlplane_addr}/orgs/{quote(self._org_name)}/pipeline/{quote(name)}"
+         url = f"https://{self._controlplane_addr}/orgs/{quote(self._org_name)}/workloads/{quote(name, safe='')}"
          console.print(
              f"[green]Pipeline upload complete![/green]\nView your pipeline at: [cyan]{url}[/cyan]"
          )
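
Note on the upload changes above: the new `_upload_created_file` drives a rich-backed tqdm progress bar from the same generator that feeds the gRPC streaming upload. The following is a minimal, self-contained sketch of that pattern, assuming only the tqdm package; `chunked_with_progress` and the in-memory buffer are illustrative stand-ins, not part of the gimlet-api API.

import io
import os
import warnings

from tqdm import TqdmExperimentalWarning
from tqdm.rich import tqdm

# tqdm's rich front-end is experimental; silence its warning, as the diff does.
warnings.filterwarnings("ignore", category=TqdmExperimentalWarning)


def chunked_with_progress(file, display_name, chunk_size=1024 * 1024):
    # Size the bar by seeking to the end, then rewind before reading chunks.
    file.seek(0, os.SEEK_END)
    total_size = file.tell()
    file.seek(0)

    with tqdm(
        total=total_size, desc=f"Uploading {display_name}", unit="B", unit_scale=True
    ) as pbar:
        while chunk := file.read(chunk_size):
            pbar.update(len(chunk))
            # The real client wraps each chunk in an UploadFileRequest and
            # yields it to the gRPC streaming stub; here we just yield bytes.
            yield chunk


# Stand-in for a model asset; any seekable binary file object works.
buf = io.BytesIO(b"\x00" * (3 * 1024 * 1024))
for _ in chunked_with_progress(buf, "example-asset"):
    pass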
gml/compile.py CHANGED
@@ -253,18 +253,7 @@ def to_torch_mlir(
      if decomposition_denylist is None:
          decomposition_denylist = _default_decomposition_denylist()

-     model = model.eval().to("cpu")
-
      submodule_registration_workarounds(model)
-
-     try:
-         # Running the model a few times on the inputs, leads to more consistent compiled results.
-         for _ in range(2):
-             _ = model(*example_inputs)
-     except: # noqa
-         # Ignore errors running the model. This can happen when the model has data dependent branches.
-         pass
-
      register_dynamic_cache_pytree_node()
      prog = _export(
          model,
gml/device.py CHANGED
@@ -57,8 +57,8 @@ def _runtime_str_to_runtime_protos(
              return deviceinfopb.ModelRuntimeType.MODEL_RUNTIME_TYPE_TENSORRT
          case "openvino":
              return deviceinfopb.ModelRuntimeType.MODEL_RUNTIME_TYPE_OPENVINO
-         case "hailort":
-             return deviceinfopb.ModelRuntimeType.MODEL_RUNTIME_TYPE_HAILORT
+         case "habana":
+             return deviceinfopb.ModelRuntimeType.MODEL_RUNTIME_TYPE_HABANA
          case _:
              raise ValueError("invalid runtime: {}".format(runtime))

gml/hf.py CHANGED
@@ -27,6 +27,7 @@ import transformers
  from rich.progress import Console
  from transformers import (
      BaseImageProcessor,
+     Cache,
      DynamicCache,
      Pipeline,
      PreTrainedModel,
@@ -52,9 +53,11 @@ from gml.tensor import (
      DetectionOutputDimension,
      DimensionSemantics,
      EmbeddingDimension,
+     IgnoreDimension,
      ImageChannelDimension,
      ImageHeightDimension,
      ImageWidthDimension,
+     PositionIDsDimension,
      SegmentationMaskChannel,
      TensorSemantics,
      TokensDimension,
@@ -117,10 +120,18 @@ class WrapWithFunctionalCache(torch.nn.Module):
          super().__init__()
          self.model = model

-     def forward(self, input_ids, cache):
+     def forward(
+         self,
+         input_ids: torch.LongTensor,
+         past_key_values: Cache,
+         attention_mask: Optional[torch.Tensor] = None,
+         position_ids: Optional[torch.LongTensor] = None,
+     ):
          outputs = self.model(
              input_ids=input_ids,
-             past_key_values=cache,
+             attention_mask=attention_mask,
+             position_ids=position_ids,
+             past_key_values=past_key_values,
              return_dict=True,
              use_cache=True,
          )
@@ -134,7 +145,7 @@ class HuggingFaceTextGenerationPipeline:
          pipeline: Pipeline,
          name: Optional[str] = None,
          tokenizer_name: Optional[str] = None,
-         dynamic_seqlen: bool = False,
+         trace_w_attn_mask_and_pos_ids: bool = False,
          dynamic_batch: bool = False,
          export_predispatch: bool = False,
      ):
@@ -158,7 +169,7 @@ class HuggingFaceTextGenerationPipeline:
              name,
              torch_module=self.model,
              export_predispatch=export_predispatch,
-             **self._guess_model_spec(dynamic_seqlen),
+             **self._guess_model_spec(trace_w_attn_mask_and_pos_ids),
          )

      def _initialize_key_value_cache(self) -> DynamicCache:
@@ -249,7 +260,13 @@ class HuggingFaceTextGenerationPipeline:
              ),
          )

-     def _guess_model_spec(self, dynamic_seqlen: bool) -> Dict:
+     def _guess_model_spec(self, trace_w_attn_mask_and_pos_ids: bool) -> Dict:
+         num_experts_per_tok = (
+             1
+             if not hasattr(self.pipeline.model.config, "num_experts_per_tok")
+             else self.pipeline.model.config.num_experts_per_tok
+         )
+
          input_dict = self.pipeline.preprocess("this is a prompt! Test test test?")
          if "input_ids" not in input_dict:
              raise ValueError(
@@ -258,9 +275,22 @@ class HuggingFaceTextGenerationPipeline:

          inputs = []
          input_tensor_semantics = []
+         dynamic_shapes = []
+
+         # Set range to half of seq_length to account for # of tokens per expert.
+         # pytorch export creates a constraint on the number of possible tokens
+         # sent to each expert. That value is num_experts * seq_length. If we don't divide
+         # by number of experts, the tracing creates an integer value that exceeds the valid int64
+         # range and will throw a hard to decipher error message.
+         seq_length = torch.export.Dim(
+             "seq_length", min=2, max=MAX_DYNAMIC_VAL // num_experts_per_tok
+         )
+         batch_shape = {0: torch.export.Dim("batch_size")} if self.dynamic_batch else {}

          # This currently assumes that all HF language models have inputs that are [B, NUM_TOKENS].
-         inputs.append(torch.tile(input_dict["input_ids"], [self.batch_size, 1]))
+         inputs.append(
+             torch.tile(input_dict["input_ids"].to(torch.int32), [self.batch_size, 1])
+         )
          input_tensor_semantics.append(
              TensorSemantics(
                  dimensions=[
@@ -269,76 +299,84 @@ class HuggingFaceTextGenerationPipeline:
                  ],
              )
          )
+         dynamic_shapes.append({1: seq_length} | batch_shape)
+
+         cache_length = torch.export.Dim("cache_length", min=2, max=MAX_DYNAMIC_VAL)

          # Assume that the model supports a KeyValue cache.
          cache_values = self._initialize_key_value_cache()
+         cache_shapes = []
          inputs.append(cache_values)
          for _ in range(len(cache_values)):
              input_tensor_semantics.append(AttentionKeyValueCacheTensorSemantics())
              input_tensor_semantics.append(AttentionKeyValueCacheTensorSemantics())
-
-         outputs = self.model(*inputs)
-
-         # Determine output semantics.
-         output_tensor_semantics = []
-         seqlen = inputs[0].shape[1]
-         found_logits = False
-         for tensor in flatten(outputs):
-             if not isinstance(tensor, torch.Tensor):
-                 continue
-
-             if (
-                 not found_logits
-                 and len(tensor.shape) == 3
-                 and tensor.shape[0] == self.batch_size
-                 and tensor.shape[1] == seqlen
-             ):
-                 # This should be the logits tensor.
-                 output_tensor_semantics.append(
-                     TensorSemantics(
-                         dimensions=[
-                             BatchDimension(),
-                             TokensDimension(),
-                             VocabLogitsDimension(),
-                         ],
+             cache_shapes.append(
+                 [{2: cache_length} | batch_shape, {2: cache_length} | batch_shape]
+             )
+         dynamic_shapes.append(cache_shapes)
+
+         if trace_w_attn_mask_and_pos_ids:
+             input_len = input_dict["input_ids"].shape[1]
+             # Assume that the model supports a 4D attention mask.
+             # This is typically an optional input and not specifying it means we treat it as a causal mask,
+             # however in scenarios where we have padded inputs or KV caches, this may be explicitly set.
+             inputs.append(
+                 torch.triu(
+                     torch.ones(
+                         (input_len, input_len + self._cache_length_for_tracing),
+                         dtype=torch.float16,
                      )
+                     * (-float("inf")),
+                     diagonal=1,
+                 ).expand(self.batch_size, 1, -1, -1)
+             )
+             input_tensor_semantics.append(
+                 TensorSemantics(
+                     dimensions=[
+                         BatchDimension(),
+                         IgnoreDimension(),
+                         AttentionMaskDimension(),
+                         AttentionMaskDimension(),
+                     ],
                  )
-                 found_logits = True
-             else:
-                 output_tensor_semantics.append(AttentionKeyValueCacheTensorSemantics())
-
-         if not found_logits:
-             raise ValueError(
-                 "could not determine output logits tensor for text generation model"
+             )
+             seq_and_cache_length = torch.export.Dim(
+                 "seq_and_cache_length",
+                 min=4,
+                 max=MAX_DYNAMIC_VAL + MAX_DYNAMIC_VAL // num_experts_per_tok,
+             )
+             dynamic_shapes.append(
+                 {2: seq_length, 3: seq_and_cache_length} | batch_shape
              )

-         num_experts_per_tok = (
-             1
-             if not hasattr(self.pipeline.model.config, "num_experts_per_tok")
-             else self.pipeline.model.config.num_experts_per_tok
-         )
-
-         dynamic_shapes = None
-         # Set range to half of seqlen to account for # of tokens per expert.
-         # pytorch export creates a constraint on the number of possible tokens
-         # sent to each expert. That value is num_experts * seqlen. If we don't divide
-         # by number of experts, the tracing creates an integer value that exceeds the valid int64
-         # range and will throw a hard to decipher error message.
-         seqlen = torch.export.Dim(
-             "seqlen", min=2, max=MAX_DYNAMIC_VAL // num_experts_per_tok
-         )
+             # Assume that the model supports position ids.
+             inputs.append(
+                 torch.arange(
+                     self._cache_length_for_tracing,
+                     self._cache_length_for_tracing + input_len,
+                     dtype=torch.int32,
+                 ).expand(self.batch_size, -1)
+             )
+             input_tensor_semantics.append(
+                 TensorSemantics(
+                     dimensions=[BatchDimension(), PositionIDsDimension()],
+                 )
+             )
+             dynamic_shapes.append({1: seq_length} | batch_shape)

-         cache_length = torch.export.Dim("cache_length", min=2, max=MAX_DYNAMIC_VAL)
-         dynamic_shapes = [
-             {1: seqlen},
-             [[{2: cache_length}, {2: cache_length}] for _ in range(len(cache_values))],
+         # Since we wrap the model with WrapWithFunctionalCache, the outputs are well defined.
+         output_tensor_semantics = [
+             TensorSemantics(
+                 dimensions=[
+                     BatchDimension(),
+                     TokensDimension(),
+                     VocabLogitsDimension(),
+                 ],
+             ),
+         ] + [
+             AttentionKeyValueCacheTensorSemantics()
+             for _ in range(len(cache_values) * 2)
          ]
-         if self.dynamic_batch:
-             batch = torch.export.Dim("batch")
-             dynamic_shapes[0][0] = batch
-             for i in range(len(cache_values)):
-                 dynamic_shapes[1][i][0][0] = batch
-                 dynamic_shapes[1][i][1][0] = batch

          return {
              "example_inputs": inputs,
gml/model_utils.py CHANGED
@@ -15,11 +15,13 @@
  # SPDX-License-Identifier: Apache-2.0


- def prepare_ultralytics_yolo(model):
+ def prepare_ultralytics_yolo(model, example_inputs, num_iters=2):
      """Prepares an ultralytics YOLO model for export.

      Ultralytics YOLO models requires setting `export=True` on some of the torch modules for exporting to work properly.
      This function handles setting that value on the necessary modules.
+
+     This also runs forward passes on the model to stabilize the exported weights.
      """
      if not hasattr(model, "model"):
          raise ValueError(
@@ -33,3 +35,10 @@ def prepare_ultralytics_yolo(model):
              m.export = True
              # YOLOv8 requires setting `format` when `export = True`
              m.format = "custom"
+
+     # Run a couple of forward passes as a warmup since the exported weights seem to change
+     # after a forward run.
+     # See https://github.com/ultralytics/yolov5/blob/2540fd4c1c2d9186126a71b3eb681d3a0a11861e/models/yolo.py#L118
+     model.model.eval().to("cpu")
+     for _ in range(num_iters):
+         model.model(*example_inputs)
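
For reference, a hypothetical call site for the updated `prepare_ultralytics_yolo`. It assumes the `ultralytics` package and a local `yolov8n.pt` checkpoint, neither of which is part of this diff.

import torch
from ultralytics import YOLO

from gml.model_utils import prepare_ultralytics_yolo

yolo = YOLO("yolov8n.pt")  # assumed checkpoint
example_inputs = (torch.zeros(1, 3, 640, 640),)

# Flags the detection head for export and runs the warmup forward passes
# added above before the model is traced.
prepare_ultralytics_yolo(yolo, example_inputs)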