corvic-engine 0.3.0rc81__cp38-abi3-win_amd64.whl → 0.3.0rc83__cp38-abi3-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30) hide show
  1. corvic/{model → emodel}/__init__.py +40 -37
  2. corvic/emodel/_base_model.py +161 -0
  3. corvic/{model → emodel}/_completion_model.py +10 -8
  4. corvic/{model → emodel}/_feature_type.py +1 -1
  5. corvic/{model → emodel}/_feature_view.py +9 -7
  6. corvic/{model → emodel}/_pipeline.py +5 -5
  7. corvic/{model → emodel}/_proto_orm_convert.py +56 -54
  8. corvic/{model → emodel}/_resource.py +4 -4
  9. corvic/{model → emodel}/_room.py +4 -4
  10. corvic/{model → emodel}/_source.py +7 -7
  11. corvic/{model → emodel}/_space.py +9 -9
  12. corvic/engine/_native.pyd +0 -0
  13. corvic/op_graph/ops.py +6 -2
  14. corvic/system/__init__.py +10 -6
  15. corvic/system/_embedder.py +3 -0
  16. corvic/system/_image_embedder.py +50 -20
  17. corvic/system/in_memory_executor.py +6 -1
  18. corvic/transfer/__init__.py +43 -0
  19. corvic/transfer/_common_transformations.py +37 -0
  20. corvic/{model/_base_model.py → transfer/_orm_backed_proto.py} +116 -109
  21. corvic/transfer/py.typed +0 -0
  22. {corvic_engine-0.3.0rc81.dist-info → corvic_engine-0.3.0rc83.dist-info}/METADATA +2 -2
  23. {corvic_engine-0.3.0rc81.dist-info → corvic_engine-0.3.0rc83.dist-info}/RECORD +30 -26
  24. {corvic_engine-0.3.0rc81.dist-info → corvic_engine-0.3.0rc83.dist-info}/WHEEL +1 -1
  25. corvic_generated/orm/v1/agent_pb2.py +8 -8
  26. corvic_generated/orm/v1/agent_pb2.pyi +8 -4
  27. /corvic/{model → emodel}/_defaults.py +0 -0
  28. /corvic/{model → emodel}/_errors.py +0 -0
  29. /corvic/{model → emodel}/py.typed +0 -0
  30. {corvic_engine-0.3.0rc81.dist-info → corvic_engine-0.3.0rc83.dist-info}/licenses/LICENSE +0 -0
@@ -14,14 +14,14 @@ import sqlalchemy.orm as sa_orm
14
14
  from sqlalchemy.orm.interfaces import LoaderOption
15
15
 
16
16
  from corvic import eorm, op_graph, system
17
- from corvic.model._base_model import BelongsToRoomModel
18
- from corvic.model._defaults import Defaults
19
- from corvic.model._proto_orm_convert import (
17
+ from corvic.emodel._base_model import StandardModel
18
+ from corvic.emodel._defaults import Defaults
19
+ from corvic.emodel._proto_orm_convert import (
20
20
  source_delete_orms,
21
21
  source_orm_to_proto,
22
22
  source_proto_to_orm,
23
23
  )
24
- from corvic.model._resource import Resource, ResourceID
24
+ from corvic.emodel._resource import Resource, ResourceID
25
25
  from corvic.result import InvalidArgumentError, NotFoundError, Ok
26
26
  from corvic.table import Table
27
27
  from corvic_generated.model.v1alpha import models_pb2
@@ -45,7 +45,7 @@ def foreign_key(
45
45
  )
46
46
 
47
47
 
48
- class Source(BelongsToRoomModel[SourceID, models_pb2.Source, eorm.Source]):
48
+ class Source(StandardModel[SourceID, models_pb2.Source, eorm.Source]):
49
49
  """Sources describe how resources should be treated.
50
50
 
51
51
  Example:
@@ -261,8 +261,8 @@ class Source(BelongsToRoomModel[SourceID, models_pb2.Source, eorm.Source]):
261
261
  Example:
262
262
  >>> with_feature_types(
263
263
  >>> {
264
- >>> "id": corvic.model.feature_type.primary_key(),
265
- >>> "customer_id": corvic.model.feature_type.foreign_key(
264
+ >>> "id": corvic.emodel.feature_type.primary_key(),
265
+ >>> "customer_id": corvic.emodel.feature_type.foreign_key(
266
266
  >>> customer_source.id
267
267
  >>> ),
268
268
  >>> },
@@ -14,10 +14,10 @@ import sqlalchemy as sa
14
14
  from sqlalchemy import orm as sa_orm
15
15
 
16
16
  from corvic import eorm, op_graph, system
17
- from corvic.model._base_model import BelongsToRoomModel
18
- from corvic.model._defaults import Defaults
19
- from corvic.model._feature_view import FeatureView, FeatureViewEdgeTableMetadata
20
- from corvic.model._proto_orm_convert import (
17
+ from corvic.emodel._base_model import StandardModel
18
+ from corvic.emodel._defaults import Defaults
19
+ from corvic.emodel._feature_view import FeatureView, FeatureViewEdgeTableMetadata
20
+ from corvic.emodel._proto_orm_convert import (
21
21
  space_delete_orms,
22
22
  space_orm_to_proto,
23
23
  space_proto_to_orm,
@@ -53,13 +53,13 @@ name_to_proto_embedding_model = {
53
53
  def image_model_proto_to_name(image_model: embedding_models_pb2.ImageModel):
54
54
  match image_model:
55
55
  case embedding_models_pb2.IMAGE_MODEL_CUSTOM:
56
- return Ok("random")
56
+ return Ok(system.RandomImageEmbedder.model_name())
57
57
  case embedding_models_pb2.IMAGE_MODEL_CLIP:
58
- return Ok("openai/clip-vit-base-patch32")
58
+ return Ok(system.Clip.model_name())
59
59
  case embedding_models_pb2.IMAGE_MODEL_IDENTITY:
60
- return Ok("identity")
60
+ return Ok(system.IdentityImageEmbedder.model_name())
61
61
  case embedding_models_pb2.IMAGE_MODEL_SIGLIP2:
62
- return Ok("google/siglip2-base-patch16-512")
62
+ return Ok(system.SigLIP2.model_name())
63
63
  case embedding_models_pb2.IMAGE_MODEL_UNSPECIFIED:
64
64
  return Ok("")
65
65
  case _:
@@ -114,7 +114,7 @@ name_to_proto_image_model = {
114
114
  }
115
115
 
116
116
 
117
- class Space(BelongsToRoomModel[SpaceID, models_pb2.Space, eorm.Space]):
117
+ class Space(StandardModel[SpaceID, models_pb2.Space, eorm.Space]):
118
118
  """Spaces apply embedding methods to FeatureViews.
119
119
 
120
120
  Example:
corvic/engine/_native.pyd CHANGED
Binary file
corvic/op_graph/ops.py CHANGED
@@ -1260,7 +1260,9 @@ class _Base(OneofProtoWrapper[table_pb2.TableComputeOp], ABC):
1260
1260
  column_name=column_name,
1261
1261
  )
1262
1262
 
1263
- field = column.to_frame().to_arrow().schema.field(column_name)
1263
+ # TODO(aneesh): See https://github.com/pola-rs/polars/issues/23111 for
1264
+ # details, and remove the rechunk eventually.
1265
+ field = column.to_frame().rechunk().to_arrow().schema.field(column_name)
1264
1266
  dtype = field.type
1265
1267
 
1266
1268
  if ftype is None:
@@ -1268,8 +1270,10 @@ class _Base(OneofProtoWrapper[table_pb2.TableComputeOp], ABC):
1268
1270
 
1269
1271
  # Convert array to record batch with dummy column to use pa_scalar and then
1270
1272
  # remove the dummy column.
1273
+ # TODO(aneesh): See https://github.com/pola-rs/polars/issues/23111 for
1274
+ # details, and remove the rechunk eventually.
1271
1275
  value_batch = pa.record_batch(
1272
- [column.to_arrow()], schema=pa.schema([field]), metadata=None
1276
+ [column.rechunk().to_arrow()], schema=pa.schema([field]), metadata=None
1273
1277
  )
1274
1278
  structs = pa_scalar.batch_to_structs(value_batch)
1275
1279
  literal_values = [
corvic/system/__init__.py CHANGED
@@ -14,6 +14,7 @@ from corvic.system._embedder import (
14
14
  EmbedTextContext,
15
15
  EmbedTextResult,
16
16
  ImageEmbedder,
17
+ SigLIP2Text,
17
18
  TextEmbedder,
18
19
  )
19
20
  from corvic.system._image_embedder import (
@@ -21,6 +22,7 @@ from corvic.system._image_embedder import (
21
22
  CombinedImageEmbedder,
22
23
  IdentityImageEmbedder,
23
24
  RandomImageEmbedder,
25
+ SigLIP2,
24
26
  image_from_bytes,
25
27
  )
26
28
  from corvic.system._planner import OpGraphPlanner, ValidateFirstExecutor
@@ -68,23 +70,27 @@ __all__ = [
68
70
  "Client",
69
71
  "Clip",
70
72
  "ClipText",
73
+ "CombinedImageEmbedder",
71
74
  "DEFAULT_VECTOR_COLUMN_NAMES_TO_SIZES",
72
75
  "DataMisplacedError",
73
76
  "DimensionReducer",
77
+ "EmbedImageContext",
78
+ "EmbedImageResult",
74
79
  "EmbedTextContext",
75
80
  "EmbedTextResult",
76
81
  "ExecutionContext",
77
82
  "ExecutionResult",
78
- "EmbedImageContext",
79
- "EmbedImageResult",
83
+ "IdentityImageEmbedder",
84
+ "IdentityTextEmbedder",
80
85
  "ImageEmbedder",
81
86
  "InMemoryExecutionResult",
82
87
  "InMemoryExecutor",
83
88
  "OpGraphExecutor",
84
89
  "OpGraphPlanner",
85
90
  "RandomImageEmbedder",
86
- "CombinedImageEmbedder",
87
91
  "RandomTextEmbedder",
92
+ "SigLIP2",
93
+ "SigLIP2Text",
88
94
  "StagingDB",
89
95
  "StorageManager",
90
96
  "TableComputeContext",
@@ -97,9 +103,7 @@ __all__ = [
97
103
  "VectorSimilarityMetric",
98
104
  "get_polars_embedding",
99
105
  "get_polars_embedding_length",
106
+ "image_from_bytes",
100
107
  "make_dict_bytes_human_readable",
101
108
  "make_list_bytes_human_readable",
102
- "image_from_bytes",
103
- "IdentityTextEmbedder",
104
- "IdentityImageEmbedder",
105
109
  ]
@@ -71,6 +71,9 @@ class EmbedImageResult:
71
71
  class ImageEmbedder(Protocol):
72
72
  """Use a model to embed images."""
73
73
 
74
+ @classmethod
75
+ def model_name(cls) -> str: ...
76
+
74
77
  def embed(
75
78
  self, context: EmbedImageContext
76
79
  ) -> Ok[EmbedImageResult] | InvalidArgumentError | InternalError: ...
@@ -27,6 +27,10 @@ class RandomImageEmbedder(ImageEmbedder):
27
27
  Useful for testing.
28
28
  """
29
29
 
30
+ @classmethod
31
+ def model_name(cls) -> str:
32
+ return "random"
33
+
30
34
  def embed(
31
35
  self, context: EmbedImageContext
32
36
  ) -> Ok[EmbedImageResult] | InvalidArgumentError | InternalError:
@@ -82,6 +86,10 @@ class LoadedModels:
82
86
  class HFModelImageEmbedder(ImageEmbedder):
83
87
  """Generic image embedder from hugging face models."""
84
88
 
89
+ @classmethod
90
+ @abc.abstractmethod
91
+ def model_revision(cls) -> str: ...
92
+
85
93
  @abc.abstractmethod
86
94
  def _load_models(self) -> LoadedModels: ...
87
95
 
@@ -165,6 +173,14 @@ class Clip(HFModelImageEmbedder):
165
173
  overcoming several major challenges in computer vision.
166
174
  """
167
175
 
176
+ @classmethod
177
+ def model_name(cls) -> str:
178
+ return "openai/clip-vit-base-patch32"
179
+
180
+ @classmethod
181
+ def model_revision(cls) -> str:
182
+ return "5812e510083bb2d23fa43778a39ac065d205ed4d"
183
+
168
184
  def _load_models(self) -> LoadedModels:
169
185
  from transformers.models.clip import (
170
186
  CLIPModel,
@@ -174,15 +190,15 @@ class Clip(HFModelImageEmbedder):
174
190
  model = cast(
175
191
  AutoModel,
176
192
  CLIPModel.from_pretrained( # pyright: ignore[reportUnknownMemberType]
177
- pretrained_model_name_or_path="openai/clip-vit-base-patch32",
178
- revision="5812e510083bb2d23fa43778a39ac065d205ed4d",
193
+ pretrained_model_name_or_path=self.model_name(),
194
+ revision=self.model_revision(),
179
195
  ),
180
196
  )
181
197
  processor = cast(
182
198
  AutoProcessor,
183
199
  CLIPProcessor.from_pretrained( # pyright: ignore[reportUnknownMemberType]
184
- pretrained_model_name_or_path="openai/clip-vit-base-patch32",
185
- revision="5812e510083bb2d23fa43778a39ac065d205ed4d",
200
+ pretrained_model_name_or_path=self.model_name(),
201
+ revision=self.model_revision(),
186
202
  use_fast=False,
187
203
  ),
188
204
  )
@@ -192,6 +208,14 @@ class Clip(HFModelImageEmbedder):
192
208
  class SigLIP2(HFModelImageEmbedder):
193
209
  """SigLIP2 image embedder."""
194
210
 
211
+ @classmethod
212
+ def model_name(cls) -> str:
213
+ return "google/siglip2-base-patch16-512"
214
+
215
+ @classmethod
216
+ def model_revision(cls) -> str:
217
+ return "a89f5c5093f902bf39d3cd4d81d2c09867f0724b"
218
+
195
219
  def _load_models(self):
196
220
  from transformers.models.auto.modeling_auto import AutoModel
197
221
  from transformers.models.auto.processing_auto import AutoProcessor
@@ -199,16 +223,16 @@ class SigLIP2(HFModelImageEmbedder):
199
223
  model = cast(
200
224
  AutoModel,
201
225
  AutoModel.from_pretrained( # pyright: ignore[reportUnknownMemberType]
202
- pretrained_model_name_or_path="google/siglip2-base-patch16-512",
203
- revision="a89f5c5093f902bf39d3cd4d81d2c09867f0724b",
226
+ pretrained_model_name_or_path=self.model_name(),
227
+ revision=self.model_revision(),
204
228
  device_map="auto",
205
229
  ),
206
230
  )
207
231
  processor = cast(
208
232
  AutoProcessor,
209
233
  AutoProcessor.from_pretrained( # pyright: ignore[reportUnknownMemberType]
210
- pretrained_model_name_or_path="google/siglip2-base-patch16-512",
211
- revision="a89f5c5093f902bf39d3cd4d81d2c09867f0724b",
234
+ pretrained_model_name_or_path=self.model_name(),
235
+ revision=self.model_revision(),
212
236
  use_fast=True,
213
237
  ),
214
238
  )
@@ -216,23 +240,25 @@ class SigLIP2(HFModelImageEmbedder):
216
240
 
217
241
 
218
242
  class CombinedImageEmbedder(ImageEmbedder):
243
+ @classmethod
244
+ def model_name(cls) -> str:
245
+ raise InvalidArgumentError(
246
+ "CombinedImageEmbedder does not have a specific model name"
247
+ )
248
+
219
249
  def __init__(self):
220
- self._clip_embedder = Clip()
221
- self._siglip2_embedder = SigLIP2()
222
- self._random_embedder = RandomImageEmbedder()
250
+ self._embedders = {
251
+ emb.model_name(): emb()
252
+ for emb in [Clip, SigLIP2, RandomImageEmbedder, IdentityImageEmbedder]
253
+ }
223
254
 
224
255
  def embed(
225
256
  self, context: EmbedImageContext
226
257
  ) -> Ok[EmbedImageResult] | InvalidArgumentError | InternalError:
227
- match context.model_name:
228
- case "random":
229
- return self._random_embedder.embed(context)
230
- case "clip":
231
- return self._clip_embedder.embed(context)
232
- case "siglip2":
233
- return self._siglip2_embedder.embed(context)
234
- case _:
235
- return InvalidArgumentError(f"Unknown model name {context.model_name}")
258
+ embedder = self._embedders.get(context.model_name, None)
259
+ if not embedder:
260
+ return InvalidArgumentError(f"Unknown model name {context.model_name}")
261
+ return embedder.embed(context)
236
262
 
237
263
  async def aembed(
238
264
  self,
@@ -254,6 +280,10 @@ class IdentityImageEmbedder(ImageEmbedder):
254
280
  - The resulting list is truncated or padded to match the expected vector length.
255
281
  """
256
282
 
283
+ @classmethod
284
+ def model_name(cls) -> str:
285
+ return "identity"
286
+
257
287
  def _image_to_embedding(
258
288
  self, image: "Image.Image", vector_length: int, *, normalization: bool = False
259
289
  ) -> list[float]:
@@ -215,7 +215,12 @@ class _SchemaAndBatches:
215
215
  and not len(dataframe)
216
216
  ):
217
217
  return cls(expected_schema, [], metrics)
218
- table = dataframe.to_arrow()
218
+ # TODO(aneesh): without this rechunk, conversion to arrow will
219
+ # occasionally fail and complain about mismatched child array lengths.
220
+ # This should probably be fixed internally in polars (note that this
221
+ # still currently happens on polars 1.30.0 - the latest release).
222
+ # See https://github.com/pola-rs/polars/issues/23111.
223
+ table = dataframe.rechunk().to_arrow()
219
224
  schema = table.schema
220
225
  return cls(schema, table.to_batches(), metrics)
221
226
 
@@ -0,0 +1,43 @@
1
+ """Common machinery for using protocol buffers as transfer objects."""
2
+
3
+ from corvic.transfer._common_transformations import (
4
+ UNCOMMITTED_ID_PREFIX,
5
+ OrmIdT,
6
+ generate_uncommitted_id_str,
7
+ non_empty_timestamp_to_datetime,
8
+ translate_orm_id,
9
+ )
10
+ from corvic.transfer._orm_backed_proto import (
11
+ HasIdOrmBackedProto,
12
+ HasProtoSelf,
13
+ OrmBackedProto,
14
+ OrmHasIdModel,
15
+ OrmHasIdT,
16
+ OrmModel,
17
+ OrmT,
18
+ ProtoHasIdModel,
19
+ ProtoHasIdT,
20
+ ProtoModel,
21
+ ProtoT,
22
+ UsesOrmID,
23
+ )
24
+
25
+ __all__ = [
26
+ "UNCOMMITTED_ID_PREFIX",
27
+ "generate_uncommitted_id_str",
28
+ "OrmIdT",
29
+ "OrmModel",
30
+ "UsesOrmID",
31
+ "OrmT",
32
+ "ProtoT",
33
+ "HasProtoSelf",
34
+ "ProtoModel",
35
+ "ProtoHasIdT",
36
+ "OrmBackedProto",
37
+ "ProtoHasIdModel",
38
+ "OrmHasIdT",
39
+ "OrmHasIdModel",
40
+ "HasIdOrmBackedProto",
41
+ "translate_orm_id",
42
+ "non_empty_timestamp_to_datetime",
43
+ ]
@@ -0,0 +1,37 @@
1
+ import datetime
2
+ import uuid
3
+ from typing import Any, TypeVar
4
+
5
+ from google.protobuf import timestamp_pb2
6
+
7
+ from corvic import orm
8
+ from corvic.result import Ok
9
+
10
+ OrmIdT = TypeVar("OrmIdT", bound=orm.BaseID[Any])
11
+
12
+ UNCOMMITTED_ID_PREFIX = "__uncommitted_object-"
13
+
14
+
15
+ def generate_uncommitted_id_str():
16
+ return f"{UNCOMMITTED_ID_PREFIX}{uuid.uuid4()}"
17
+
18
+
19
+ def translate_orm_id(
20
+ obj_id: str, id_class: type[OrmIdT]
21
+ ) -> Ok[OrmIdT | None] | orm.InvalidORMIdentifierError:
22
+ if obj_id.startswith(UNCOMMITTED_ID_PREFIX):
23
+ return Ok(None)
24
+ parsed_obj_id = id_class(obj_id)
25
+ match parsed_obj_id.to_db():
26
+ case orm.InvalidORMIdentifierError() as err:
27
+ return err
28
+ case Ok():
29
+ return Ok(parsed_obj_id)
30
+
31
+
32
+ def non_empty_timestamp_to_datetime(
33
+ timestamp: timestamp_pb2.Timestamp,
34
+ ) -> datetime.datetime | None:
35
+ if timestamp != timestamp_pb2.Timestamp():
36
+ return timestamp.ToDatetime(tzinfo=datetime.UTC)
37
+ return None