corvic-engine 0.3.0rc81__cp38-abi3-win_amd64.whl → 0.3.0rc83__cp38-abi3-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- corvic/{model → emodel}/__init__.py +40 -37
- corvic/emodel/_base_model.py +161 -0
- corvic/{model → emodel}/_completion_model.py +10 -8
- corvic/{model → emodel}/_feature_type.py +1 -1
- corvic/{model → emodel}/_feature_view.py +9 -7
- corvic/{model → emodel}/_pipeline.py +5 -5
- corvic/{model → emodel}/_proto_orm_convert.py +56 -54
- corvic/{model → emodel}/_resource.py +4 -4
- corvic/{model → emodel}/_room.py +4 -4
- corvic/{model → emodel}/_source.py +7 -7
- corvic/{model → emodel}/_space.py +9 -9
- corvic/engine/_native.pyd +0 -0
- corvic/op_graph/ops.py +6 -2
- corvic/system/__init__.py +10 -6
- corvic/system/_embedder.py +3 -0
- corvic/system/_image_embedder.py +50 -20
- corvic/system/in_memory_executor.py +6 -1
- corvic/transfer/__init__.py +43 -0
- corvic/transfer/_common_transformations.py +37 -0
- corvic/{model/_base_model.py → transfer/_orm_backed_proto.py} +116 -109
- corvic/transfer/py.typed +0 -0
- {corvic_engine-0.3.0rc81.dist-info → corvic_engine-0.3.0rc83.dist-info}/METADATA +2 -2
- {corvic_engine-0.3.0rc81.dist-info → corvic_engine-0.3.0rc83.dist-info}/RECORD +30 -26
- {corvic_engine-0.3.0rc81.dist-info → corvic_engine-0.3.0rc83.dist-info}/WHEEL +1 -1
- corvic_generated/orm/v1/agent_pb2.py +8 -8
- corvic_generated/orm/v1/agent_pb2.pyi +8 -4
- /corvic/{model → emodel}/_defaults.py +0 -0
- /corvic/{model → emodel}/_errors.py +0 -0
- /corvic/{model → emodel}/py.typed +0 -0
- {corvic_engine-0.3.0rc81.dist-info → corvic_engine-0.3.0rc83.dist-info}/licenses/LICENSE +0 -0
@@ -14,14 +14,14 @@ import sqlalchemy.orm as sa_orm
|
|
14
14
|
from sqlalchemy.orm.interfaces import LoaderOption
|
15
15
|
|
16
16
|
from corvic import eorm, op_graph, system
|
17
|
-
from corvic.
|
18
|
-
from corvic.
|
19
|
-
from corvic.
|
17
|
+
from corvic.emodel._base_model import StandardModel
|
18
|
+
from corvic.emodel._defaults import Defaults
|
19
|
+
from corvic.emodel._proto_orm_convert import (
|
20
20
|
source_delete_orms,
|
21
21
|
source_orm_to_proto,
|
22
22
|
source_proto_to_orm,
|
23
23
|
)
|
24
|
-
from corvic.
|
24
|
+
from corvic.emodel._resource import Resource, ResourceID
|
25
25
|
from corvic.result import InvalidArgumentError, NotFoundError, Ok
|
26
26
|
from corvic.table import Table
|
27
27
|
from corvic_generated.model.v1alpha import models_pb2
|
@@ -45,7 +45,7 @@ def foreign_key(
|
|
45
45
|
)
|
46
46
|
|
47
47
|
|
48
|
-
class Source(
|
48
|
+
class Source(StandardModel[SourceID, models_pb2.Source, eorm.Source]):
|
49
49
|
"""Sources describe how resources should be treated.
|
50
50
|
|
51
51
|
Example:
|
@@ -261,8 +261,8 @@ class Source(BelongsToRoomModel[SourceID, models_pb2.Source, eorm.Source]):
|
|
261
261
|
Example:
|
262
262
|
>>> with_feature_types(
|
263
263
|
>>> {
|
264
|
-
>>> "id": corvic.
|
265
|
-
>>> "customer_id": corvic.
|
264
|
+
>>> "id": corvic.emodel.feature_type.primary_key(),
|
265
|
+
>>> "customer_id": corvic.emodel.feature_type.foreign_key(
|
266
266
|
>>> customer_source.id
|
267
267
|
>>> ),
|
268
268
|
>>> },
|
@@ -14,10 +14,10 @@ import sqlalchemy as sa
|
|
14
14
|
from sqlalchemy import orm as sa_orm
|
15
15
|
|
16
16
|
from corvic import eorm, op_graph, system
|
17
|
-
from corvic.
|
18
|
-
from corvic.
|
19
|
-
from corvic.
|
20
|
-
from corvic.
|
17
|
+
from corvic.emodel._base_model import StandardModel
|
18
|
+
from corvic.emodel._defaults import Defaults
|
19
|
+
from corvic.emodel._feature_view import FeatureView, FeatureViewEdgeTableMetadata
|
20
|
+
from corvic.emodel._proto_orm_convert import (
|
21
21
|
space_delete_orms,
|
22
22
|
space_orm_to_proto,
|
23
23
|
space_proto_to_orm,
|
@@ -53,13 +53,13 @@ name_to_proto_embedding_model = {
|
|
53
53
|
def image_model_proto_to_name(image_model: embedding_models_pb2.ImageModel):
|
54
54
|
match image_model:
|
55
55
|
case embedding_models_pb2.IMAGE_MODEL_CUSTOM:
|
56
|
-
return Ok(
|
56
|
+
return Ok(system.RandomImageEmbedder.model_name())
|
57
57
|
case embedding_models_pb2.IMAGE_MODEL_CLIP:
|
58
|
-
return Ok(
|
58
|
+
return Ok(system.Clip.model_name())
|
59
59
|
case embedding_models_pb2.IMAGE_MODEL_IDENTITY:
|
60
|
-
return Ok(
|
60
|
+
return Ok(system.IdentityImageEmbedder.model_name())
|
61
61
|
case embedding_models_pb2.IMAGE_MODEL_SIGLIP2:
|
62
|
-
return Ok(
|
62
|
+
return Ok(system.SigLIP2.model_name())
|
63
63
|
case embedding_models_pb2.IMAGE_MODEL_UNSPECIFIED:
|
64
64
|
return Ok("")
|
65
65
|
case _:
|
@@ -114,7 +114,7 @@ name_to_proto_image_model = {
|
|
114
114
|
}
|
115
115
|
|
116
116
|
|
117
|
-
class Space(
|
117
|
+
class Space(StandardModel[SpaceID, models_pb2.Space, eorm.Space]):
|
118
118
|
"""Spaces apply embedding methods to FeatureViews.
|
119
119
|
|
120
120
|
Example:
|
corvic/engine/_native.pyd
CHANGED
Binary file
|
corvic/op_graph/ops.py
CHANGED
@@ -1260,7 +1260,9 @@ class _Base(OneofProtoWrapper[table_pb2.TableComputeOp], ABC):
|
|
1260
1260
|
column_name=column_name,
|
1261
1261
|
)
|
1262
1262
|
|
1263
|
-
|
1263
|
+
# TODO(aneesh): See https://github.com/pola-rs/polars/issues/23111 for
|
1264
|
+
# and remove the rechunk eventually.
|
1265
|
+
field = column.to_frame().rechunk().to_arrow().schema.field(column_name)
|
1264
1266
|
dtype = field.type
|
1265
1267
|
|
1266
1268
|
if ftype is None:
|
@@ -1268,8 +1270,10 @@ class _Base(OneofProtoWrapper[table_pb2.TableComputeOp], ABC):
|
|
1268
1270
|
|
1269
1271
|
# Convert array to record batch with dummy column to use pa_scalar and then
|
1270
1272
|
# remove the dummy column.
|
1273
|
+
# TODO(aneesh): See https://github.com/pola-rs/polars/issues/23111 for
|
1274
|
+
# and remove the rechunk eventually.
|
1271
1275
|
value_batch = pa.record_batch(
|
1272
|
-
[column.to_arrow()], schema=pa.schema([field]), metadata=None
|
1276
|
+
[column.rechunk().to_arrow()], schema=pa.schema([field]), metadata=None
|
1273
1277
|
)
|
1274
1278
|
structs = pa_scalar.batch_to_structs(value_batch)
|
1275
1279
|
literal_values = [
|
corvic/system/__init__.py
CHANGED
@@ -14,6 +14,7 @@ from corvic.system._embedder import (
|
|
14
14
|
EmbedTextContext,
|
15
15
|
EmbedTextResult,
|
16
16
|
ImageEmbedder,
|
17
|
+
SigLIP2Text,
|
17
18
|
TextEmbedder,
|
18
19
|
)
|
19
20
|
from corvic.system._image_embedder import (
|
@@ -21,6 +22,7 @@ from corvic.system._image_embedder import (
|
|
21
22
|
CombinedImageEmbedder,
|
22
23
|
IdentityImageEmbedder,
|
23
24
|
RandomImageEmbedder,
|
25
|
+
SigLIP2,
|
24
26
|
image_from_bytes,
|
25
27
|
)
|
26
28
|
from corvic.system._planner import OpGraphPlanner, ValidateFirstExecutor
|
@@ -68,23 +70,27 @@ __all__ = [
|
|
68
70
|
"Client",
|
69
71
|
"Clip",
|
70
72
|
"ClipText",
|
73
|
+
"CombinedImageEmbedder",
|
71
74
|
"DEFAULT_VECTOR_COLUMN_NAMES_TO_SIZES",
|
72
75
|
"DataMisplacedError",
|
73
76
|
"DimensionReducer",
|
77
|
+
"EmbedImageContext",
|
78
|
+
"EmbedImageResult",
|
74
79
|
"EmbedTextContext",
|
75
80
|
"EmbedTextResult",
|
76
81
|
"ExecutionContext",
|
77
82
|
"ExecutionResult",
|
78
|
-
"
|
79
|
-
"
|
83
|
+
"IdentityImageEmbedder",
|
84
|
+
"IdentityTextEmbedder",
|
80
85
|
"ImageEmbedder",
|
81
86
|
"InMemoryExecutionResult",
|
82
87
|
"InMemoryExecutor",
|
83
88
|
"OpGraphExecutor",
|
84
89
|
"OpGraphPlanner",
|
85
90
|
"RandomImageEmbedder",
|
86
|
-
"CombinedImageEmbedder",
|
87
91
|
"RandomTextEmbedder",
|
92
|
+
"SigLIP2",
|
93
|
+
"SigLIP2Text",
|
88
94
|
"StagingDB",
|
89
95
|
"StorageManager",
|
90
96
|
"TableComputeContext",
|
@@ -97,9 +103,7 @@ __all__ = [
|
|
97
103
|
"VectorSimilarityMetric",
|
98
104
|
"get_polars_embedding",
|
99
105
|
"get_polars_embedding_length",
|
106
|
+
"image_from_bytes",
|
100
107
|
"make_dict_bytes_human_readable",
|
101
108
|
"make_list_bytes_human_readable",
|
102
|
-
"image_from_bytes",
|
103
|
-
"IdentityTextEmbedder",
|
104
|
-
"IdentityImageEmbedder",
|
105
109
|
]
|
corvic/system/_embedder.py
CHANGED
@@ -71,6 +71,9 @@ class EmbedImageResult:
|
|
71
71
|
class ImageEmbedder(Protocol):
|
72
72
|
"""Use a model to embed text."""
|
73
73
|
|
74
|
+
@classmethod
|
75
|
+
def model_name(cls) -> str: ...
|
76
|
+
|
74
77
|
def embed(
|
75
78
|
self, context: EmbedImageContext
|
76
79
|
) -> Ok[EmbedImageResult] | InvalidArgumentError | InternalError: ...
|
corvic/system/_image_embedder.py
CHANGED
@@ -27,6 +27,10 @@ class RandomImageEmbedder(ImageEmbedder):
|
|
27
27
|
Useful for testing.
|
28
28
|
"""
|
29
29
|
|
30
|
+
@classmethod
|
31
|
+
def model_name(cls) -> str:
|
32
|
+
return "random"
|
33
|
+
|
30
34
|
def embed(
|
31
35
|
self, context: EmbedImageContext
|
32
36
|
) -> Ok[EmbedImageResult] | InvalidArgumentError | InternalError:
|
@@ -82,6 +86,10 @@ class LoadedModels:
|
|
82
86
|
class HFModelImageEmbedder(ImageEmbedder):
|
83
87
|
"""Generic image embedder from hugging face models."""
|
84
88
|
|
89
|
+
@classmethod
|
90
|
+
@abc.abstractmethod
|
91
|
+
def model_revision(cls) -> str: ...
|
92
|
+
|
85
93
|
@abc.abstractmethod
|
86
94
|
def _load_models(self) -> LoadedModels: ...
|
87
95
|
|
@@ -165,6 +173,14 @@ class Clip(HFModelImageEmbedder):
|
|
165
173
|
overcoming several major challenges in computer vision.
|
166
174
|
"""
|
167
175
|
|
176
|
+
@classmethod
|
177
|
+
def model_name(cls) -> str:
|
178
|
+
return "openai/clip-vit-base-patch32"
|
179
|
+
|
180
|
+
@classmethod
|
181
|
+
def model_revision(cls) -> str:
|
182
|
+
return "5812e510083bb2d23fa43778a39ac065d205ed4d"
|
183
|
+
|
168
184
|
def _load_models(self) -> LoadedModels:
|
169
185
|
from transformers.models.clip import (
|
170
186
|
CLIPModel,
|
@@ -174,15 +190,15 @@ class Clip(HFModelImageEmbedder):
|
|
174
190
|
model = cast(
|
175
191
|
AutoModel,
|
176
192
|
CLIPModel.from_pretrained( # pyright: ignore[reportUnknownMemberType]
|
177
|
-
pretrained_model_name_or_path=
|
178
|
-
revision=
|
193
|
+
pretrained_model_name_or_path=self.model_name(),
|
194
|
+
revision=self.model_revision(),
|
179
195
|
),
|
180
196
|
)
|
181
197
|
processor = cast(
|
182
198
|
AutoProcessor,
|
183
199
|
CLIPProcessor.from_pretrained( # pyright: ignore[reportUnknownMemberType]
|
184
|
-
pretrained_model_name_or_path=
|
185
|
-
revision=
|
200
|
+
pretrained_model_name_or_path=self.model_name(),
|
201
|
+
revision=self.model_revision(),
|
186
202
|
use_fast=False,
|
187
203
|
),
|
188
204
|
)
|
@@ -192,6 +208,14 @@ class Clip(HFModelImageEmbedder):
|
|
192
208
|
class SigLIP2(HFModelImageEmbedder):
|
193
209
|
"""SigLIP2 image embedder."""
|
194
210
|
|
211
|
+
@classmethod
|
212
|
+
def model_name(cls) -> str:
|
213
|
+
return "google/siglip2-base-patch16-512"
|
214
|
+
|
215
|
+
@classmethod
|
216
|
+
def model_revision(cls) -> str:
|
217
|
+
return "a89f5c5093f902bf39d3cd4d81d2c09867f0724b"
|
218
|
+
|
195
219
|
def _load_models(self):
|
196
220
|
from transformers.models.auto.modeling_auto import AutoModel
|
197
221
|
from transformers.models.auto.processing_auto import AutoProcessor
|
@@ -199,16 +223,16 @@ class SigLIP2(HFModelImageEmbedder):
|
|
199
223
|
model = cast(
|
200
224
|
AutoModel,
|
201
225
|
AutoModel.from_pretrained( # pyright: ignore[reportUnknownMemberType]
|
202
|
-
pretrained_model_name_or_path=
|
203
|
-
revision=
|
226
|
+
pretrained_model_name_or_path=self.model_name(),
|
227
|
+
revision=self.model_revision(),
|
204
228
|
device_map="auto",
|
205
229
|
),
|
206
230
|
)
|
207
231
|
processor = cast(
|
208
232
|
AutoProcessor,
|
209
233
|
AutoProcessor.from_pretrained( # pyright: ignore[reportUnknownMemberType]
|
210
|
-
pretrained_model_name_or_path=
|
211
|
-
revision=
|
234
|
+
pretrained_model_name_or_path=self.model_name(),
|
235
|
+
revision=self.model_revision(),
|
212
236
|
use_fast=True,
|
213
237
|
),
|
214
238
|
)
|
@@ -216,23 +240,25 @@ class SigLIP2(HFModelImageEmbedder):
|
|
216
240
|
|
217
241
|
|
218
242
|
class CombinedImageEmbedder(ImageEmbedder):
|
243
|
+
@classmethod
|
244
|
+
def model_name(cls) -> str:
|
245
|
+
raise InvalidArgumentError(
|
246
|
+
"CombinedImageEmbedder does not have a specific model name"
|
247
|
+
)
|
248
|
+
|
219
249
|
def __init__(self):
|
220
|
-
self.
|
221
|
-
|
222
|
-
|
250
|
+
self._embedders = {
|
251
|
+
emb.model_name(): emb()
|
252
|
+
for emb in [Clip, SigLIP2, RandomImageEmbedder, IdentityImageEmbedder]
|
253
|
+
}
|
223
254
|
|
224
255
|
def embed(
|
225
256
|
self, context: EmbedImageContext
|
226
257
|
) -> Ok[EmbedImageResult] | InvalidArgumentError | InternalError:
|
227
|
-
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
return self._clip_embedder.embed(context)
|
232
|
-
case "siglip2":
|
233
|
-
return self._siglip2_embedder.embed(context)
|
234
|
-
case _:
|
235
|
-
return InvalidArgumentError(f"Unknown model name {context.model_name}")
|
258
|
+
embedder = self._embedders.get(context.model_name, None)
|
259
|
+
if not embedder:
|
260
|
+
return InvalidArgumentError(f"Unknown model name {context.model_name}")
|
261
|
+
return embedder.embed(context)
|
236
262
|
|
237
263
|
async def aembed(
|
238
264
|
self,
|
@@ -254,6 +280,10 @@ class IdentityImageEmbedder(ImageEmbedder):
|
|
254
280
|
- The resulting list is truncated or padded to match the expected vector length.
|
255
281
|
"""
|
256
282
|
|
283
|
+
@classmethod
|
284
|
+
def model_name(cls) -> str:
|
285
|
+
return "identity"
|
286
|
+
|
257
287
|
def _image_to_embedding(
|
258
288
|
self, image: "Image.Image", vector_length: int, *, normalization: bool = False
|
259
289
|
) -> list[float]:
|
@@ -215,7 +215,12 @@ class _SchemaAndBatches:
|
|
215
215
|
and not len(dataframe)
|
216
216
|
):
|
217
217
|
return cls(expected_schema, [], metrics)
|
218
|
-
|
218
|
+
# TODO(aneesh): without this rechunk, conversion to arrow will
|
219
|
+
# occasionally fail and complain about mismatched child array lengths.
|
220
|
+
# This should probably be fixed internally in polars (note that this
|
221
|
+
# still currently happens on polars 1.30.0 - the latest release).
|
222
|
+
# See https://github.com/pola-rs/polars/issues/23111.
|
223
|
+
table = dataframe.rechunk().to_arrow()
|
219
224
|
schema = table.schema
|
220
225
|
return cls(schema, table.to_batches(), metrics)
|
221
226
|
|
@@ -0,0 +1,43 @@
|
|
1
|
+
"""Common machinery for using protocol buffers as transfer objects."""
|
2
|
+
|
3
|
+
from corvic.transfer._common_transformations import (
|
4
|
+
UNCOMMITTED_ID_PREFIX,
|
5
|
+
OrmIdT,
|
6
|
+
generate_uncommitted_id_str,
|
7
|
+
non_empty_timestamp_to_datetime,
|
8
|
+
translate_orm_id,
|
9
|
+
)
|
10
|
+
from corvic.transfer._orm_backed_proto import (
|
11
|
+
HasIdOrmBackedProto,
|
12
|
+
HasProtoSelf,
|
13
|
+
OrmBackedProto,
|
14
|
+
OrmHasIdModel,
|
15
|
+
OrmHasIdT,
|
16
|
+
OrmModel,
|
17
|
+
OrmT,
|
18
|
+
ProtoHasIdModel,
|
19
|
+
ProtoHasIdT,
|
20
|
+
ProtoModel,
|
21
|
+
ProtoT,
|
22
|
+
UsesOrmID,
|
23
|
+
)
|
24
|
+
|
25
|
+
__all__ = [
|
26
|
+
"UNCOMMITTED_ID_PREFIX",
|
27
|
+
"generate_uncommitted_id_str",
|
28
|
+
"OrmIdT",
|
29
|
+
"OrmModel",
|
30
|
+
"UsesOrmID",
|
31
|
+
"OrmT",
|
32
|
+
"ProtoT",
|
33
|
+
"HasProtoSelf",
|
34
|
+
"ProtoModel",
|
35
|
+
"ProtoHasIdT",
|
36
|
+
"OrmBackedProto",
|
37
|
+
"ProtoHasIdModel",
|
38
|
+
"OrmHasIdT",
|
39
|
+
"OrmHasIdModel",
|
40
|
+
"HasIdOrmBackedProto",
|
41
|
+
"translate_orm_id",
|
42
|
+
"non_empty_timestamp_to_datetime",
|
43
|
+
]
|
@@ -0,0 +1,37 @@
|
|
1
|
+
import datetime
|
2
|
+
import uuid
|
3
|
+
from typing import Any, TypeVar
|
4
|
+
|
5
|
+
from google.protobuf import timestamp_pb2
|
6
|
+
|
7
|
+
from corvic import orm
|
8
|
+
from corvic.result import Ok
|
9
|
+
|
10
|
+
OrmIdT = TypeVar("OrmIdT", bound=orm.BaseID[Any])
|
11
|
+
|
12
|
+
UNCOMMITTED_ID_PREFIX = "__uncommitted_object-"
|
13
|
+
|
14
|
+
|
15
|
+
def generate_uncommitted_id_str():
|
16
|
+
return f"{UNCOMMITTED_ID_PREFIX}{uuid.uuid4()}"
|
17
|
+
|
18
|
+
|
19
|
+
def translate_orm_id(
|
20
|
+
obj_id: str, id_class: type[OrmIdT]
|
21
|
+
) -> Ok[OrmIdT | None] | orm.InvalidORMIdentifierError:
|
22
|
+
if obj_id.startswith(UNCOMMITTED_ID_PREFIX):
|
23
|
+
return Ok(None)
|
24
|
+
parsed_obj_id = id_class(obj_id)
|
25
|
+
match parsed_obj_id.to_db():
|
26
|
+
case orm.InvalidORMIdentifierError() as err:
|
27
|
+
return err
|
28
|
+
case Ok():
|
29
|
+
return Ok(parsed_obj_id)
|
30
|
+
|
31
|
+
|
32
|
+
def non_empty_timestamp_to_datetime(
|
33
|
+
timestamp: timestamp_pb2.Timestamp,
|
34
|
+
) -> datetime.datetime | None:
|
35
|
+
if timestamp != timestamp_pb2.Timestamp():
|
36
|
+
return timestamp.ToDatetime(tzinfo=datetime.UTC)
|
37
|
+
return None
|