corvic-engine 0.3.0rc43__cp38-abi3-win_amd64.whl → 0.3.0rc45__cp38-abi3-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- corvic/embedding_metric/embeddings.py +30 -6
- corvic/engine/_native.pyd +0 -0
- corvic/model/__init__.py +7 -1
- corvic/model/_base_model.py +35 -42
- corvic/model/_feature_view.py +43 -42
- corvic/model/_pipeline.py +2 -14
- corvic/model/_proto_orm_convert.py +245 -227
- corvic/model/_resource.py +2 -7
- corvic/model/_source.py +14 -16
- corvic/model/_space.py +330 -362
- corvic/orm/__init__.py +59 -178
- corvic/system/in_memory_executor.py +6 -4
- {corvic_engine-0.3.0rc43.dist-info → corvic_engine-0.3.0rc45.dist-info}/METADATA +19 -19
- {corvic_engine-0.3.0rc43.dist-info → corvic_engine-0.3.0rc45.dist-info}/RECORD +22 -22
- {corvic_engine-0.3.0rc43.dist-info → corvic_engine-0.3.0rc45.dist-info}/WHEEL +1 -1
- corvic_generated/model/v1alpha/models_pb2.py +28 -28
- corvic_generated/model/v1alpha/models_pb2.pyi +24 -20
- corvic_generated/platform/v1/platform_pb2.py +11 -5
- corvic_generated/platform/v1/platform_pb2.pyi +10 -0
- corvic_generated/platform/v1/platform_pb2_grpc.py +33 -0
- corvic_generated/platform/v1/platform_pb2_grpc.pyi +14 -0
- {corvic_engine-0.3.0rc43.dist-info → corvic_engine-0.3.0rc45.dist-info}/licenses/LICENSE +0 -0
corvic/model/_space.py
CHANGED
@@ -3,15 +3,13 @@
|
|
3
3
|
from __future__ import annotations
|
4
4
|
|
5
5
|
import abc
|
6
|
-
import copy
|
7
|
-
import dataclasses
|
8
6
|
import datetime
|
9
7
|
import uuid
|
10
8
|
from collections.abc import Iterable, Mapping, Sequence
|
11
9
|
from typing import Final, Literal, TypeAlias
|
12
10
|
|
13
11
|
import pyarrow as pa
|
14
|
-
|
12
|
+
import sqlalchemy as sa
|
15
13
|
from sqlalchemy import orm as sa_orm
|
16
14
|
from typing_extensions import Self
|
17
15
|
|
@@ -24,12 +22,10 @@ from corvic.model._proto_orm_convert import (
|
|
24
22
|
space_orm_to_proto,
|
25
23
|
space_proto_to_orm,
|
26
24
|
)
|
27
|
-
from corvic.model._source import Source
|
28
25
|
from corvic.result import InvalidArgumentError, NotFoundError, Ok
|
29
26
|
from corvic.table import Table
|
30
27
|
from corvic_generated.algorithm.graph.v1 import graph_pb2
|
31
28
|
from corvic_generated.embedding.v1 import models_pb2 as embedding_models_pb2
|
32
|
-
from corvic_generated.feature.v2 import space_pb2 as feature_space_pb2
|
33
29
|
from corvic_generated.model.v1alpha import models_pb2
|
34
30
|
from corvic_generated.orm.v1 import space_pb2
|
35
31
|
|
@@ -72,17 +68,6 @@ class Space(BaseModel[SpaceID, models_pb2.Space, orm.Space]):
|
|
72
68
|
>>> space = Space.node2vec(feature_view, dim=10, walk_length=10, window=10)
|
73
69
|
"""
|
74
70
|
|
75
|
-
_feature_view: FeatureView | None
|
76
|
-
|
77
|
-
def __init__(
|
78
|
-
self,
|
79
|
-
client: system.Client,
|
80
|
-
proto_self: models_pb2.Space,
|
81
|
-
feature_view: FeatureView | None = None,
|
82
|
-
):
|
83
|
-
super().__init__(client, proto_self)
|
84
|
-
self._feature_view = feature_view
|
85
|
-
|
86
71
|
@classmethod
|
87
72
|
def orm_class(cls):
|
88
73
|
return orm.Space
|
@@ -107,43 +92,14 @@ class Space(BaseModel[SpaceID, models_pb2.Space, orm.Space]):
|
|
107
92
|
) -> Ok[None] | InvalidArgumentError:
|
108
93
|
return space_delete_orms(ids, session)
|
109
94
|
|
110
|
-
@
|
111
|
-
def
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
@property
|
119
|
-
def node2vec_parameters(self) -> graph_pb2.Node2VecParameters | None:
|
120
|
-
if self.proto_self.space_parameters.HasField("node2vec_parameters"):
|
121
|
-
return self.proto_self.space_parameters.node2vec_parameters
|
122
|
-
return None
|
123
|
-
|
124
|
-
@property
|
125
|
-
def concat_and_embed_parameters(
|
126
|
-
self,
|
127
|
-
) -> embedding_models_pb2.ConcatAndEmbedParameters | None:
|
128
|
-
if self.proto_self.space_parameters.HasField("concat_and_embed_parameters"):
|
129
|
-
return self.proto_self.space_parameters.concat_and_embed_parameters
|
130
|
-
return None
|
131
|
-
|
132
|
-
@property
|
133
|
-
def embed_and_concat_parameters(
|
134
|
-
self,
|
135
|
-
) -> embedding_models_pb2.EmbedAndConcatParameters | None:
|
136
|
-
if self.proto_self.space_parameters.HasField("embed_and_concat_parameters"):
|
137
|
-
return self.proto_self.space_parameters.embed_and_concat_parameters
|
138
|
-
return None
|
139
|
-
|
140
|
-
@property
|
141
|
-
def embed_image_parameters(
|
142
|
-
self,
|
143
|
-
) -> embedding_models_pb2.EmbedImageParameters | None:
|
144
|
-
if self.proto_self.space_parameters.HasField("embed_image_parameters"):
|
145
|
-
return self.proto_self.space_parameters.embed_image_parameters
|
146
|
-
return None
|
95
|
+
@classmethod
|
96
|
+
def orm_load_options(cls) -> list[sa.LoaderOption]:
|
97
|
+
return [
|
98
|
+
sa_orm.selectinload(orm.Space.feature_view)
|
99
|
+
.selectinload(orm.FeatureView.feature_view_sources)
|
100
|
+
.selectinload(orm.FeatureViewSource.source)
|
101
|
+
.selectinload(orm.Source.pipeline_ref)
|
102
|
+
]
|
147
103
|
|
148
104
|
@property
|
149
105
|
def name(self):
|
@@ -158,46 +114,79 @@ class Space(BaseModel[SpaceID, models_pb2.Space, orm.Space]):
|
|
158
114
|
return self.proto_self.description
|
159
115
|
|
160
116
|
@property
|
161
|
-
def
|
162
|
-
return self.proto_self.
|
163
|
-
|
164
|
-
@property
|
165
|
-
def feature_view_id(self):
|
166
|
-
return FeatureViewID(self.proto_self.feature_view_id)
|
117
|
+
def feature_view(self) -> FeatureView:
|
118
|
+
return FeatureView.from_proto(self.proto_self.feature_view, self.client)
|
167
119
|
|
168
120
|
@property
|
169
121
|
def auto_sync(self):
|
170
122
|
return self.proto_self.auto_sync
|
171
123
|
|
172
|
-
@property
|
173
|
-
def space_type(self):
|
174
|
-
if self.node2vec_parameters:
|
175
|
-
return feature_space_pb2.SPACE_TYPE_RELATIONAL
|
176
|
-
if self.concat_and_embed_parameters:
|
177
|
-
return feature_space_pb2.SPACE_TYPE_SEMANTIC
|
178
|
-
if self.embed_and_concat_parameters:
|
179
|
-
return feature_space_pb2.SPACE_TYPE_TABULAR
|
180
|
-
if self.embed_image_parameters:
|
181
|
-
return feature_space_pb2.SPACE_TYPE_IMAGE
|
182
|
-
return feature_space_pb2.SPACE_TYPE_UNSPECIFIED
|
183
|
-
|
184
124
|
def with_auto_sync(self, *, auto_sync: bool):
|
185
125
|
self.proto_self.auto_sync = auto_sync
|
186
126
|
return self
|
187
127
|
|
188
|
-
def feature_view(self):
|
189
|
-
if self._feature_view:
|
190
|
-
return Ok(self._feature_view)
|
191
|
-
match FeatureView.load_proto_for(self.feature_view_id, self.client):
|
192
|
-
case Ok(feature_view):
|
193
|
-
return Ok(FeatureView.from_proto(feature_view, self.client))
|
194
|
-
case NotFoundError() as err:
|
195
|
-
return err
|
196
|
-
|
197
128
|
@abc.abstractmethod
|
198
129
|
def embeddings_tables(self) -> Ok[Mapping[str, Table]] | InvalidArgumentError:
|
199
130
|
"""Generate per-output-source embeddings tables for this space."""
|
200
131
|
|
132
|
+
@classmethod
|
133
|
+
def create_specific(
|
134
|
+
cls,
|
135
|
+
name: str,
|
136
|
+
description: str,
|
137
|
+
feature_view: FeatureView,
|
138
|
+
parameters: SpecificSpaceParameters,
|
139
|
+
client: system.Client | None = None,
|
140
|
+
room_id: RoomID | None = None,
|
141
|
+
*,
|
142
|
+
auto_sync: bool = False,
|
143
|
+
) -> Ok[SpecificSpace] | InvalidArgumentError:
|
144
|
+
client = client or feature_view.client
|
145
|
+
room_id = room_id or feature_view.room_id
|
146
|
+
if room_id != feature_view.room_id:
|
147
|
+
return InvalidArgumentError("room id must match feature_view room id")
|
148
|
+
match parameters:
|
149
|
+
case Node2VecParameters():
|
150
|
+
return RelationalSpace.create(
|
151
|
+
name,
|
152
|
+
description,
|
153
|
+
feature_view,
|
154
|
+
parameters,
|
155
|
+
client,
|
156
|
+
room_id,
|
157
|
+
auto_sync=auto_sync,
|
158
|
+
)
|
159
|
+
case ConcatAndEmbedParameters():
|
160
|
+
return SemanticSpace.create(
|
161
|
+
name,
|
162
|
+
description,
|
163
|
+
feature_view,
|
164
|
+
parameters,
|
165
|
+
client,
|
166
|
+
room_id,
|
167
|
+
auto_sync=auto_sync,
|
168
|
+
)
|
169
|
+
case EmbedAndConcatParameters():
|
170
|
+
return TabularSpace.create(
|
171
|
+
name,
|
172
|
+
description,
|
173
|
+
feature_view,
|
174
|
+
parameters,
|
175
|
+
client,
|
176
|
+
room_id,
|
177
|
+
auto_sync=auto_sync,
|
178
|
+
)
|
179
|
+
case EmbedImageParameters():
|
180
|
+
return ImageSpace.create(
|
181
|
+
name,
|
182
|
+
description,
|
183
|
+
feature_view,
|
184
|
+
parameters,
|
185
|
+
client,
|
186
|
+
room_id,
|
187
|
+
auto_sync=auto_sync,
|
188
|
+
)
|
189
|
+
|
201
190
|
@classmethod
|
202
191
|
def from_id(
|
203
192
|
cls,
|
@@ -258,46 +247,109 @@ class Space(BaseModel[SpaceID, models_pb2.Space, orm.Space]):
|
|
258
247
|
class UnknownSpace(Space):
|
259
248
|
"""A space that this version of the code doesn't know what to do with."""
|
260
249
|
|
250
|
+
@classmethod
|
251
|
+
def create(cls, feature_view: FeatureView, client: system.Client | None = None):
|
252
|
+
client = client or feature_view.client
|
253
|
+
return cls(
|
254
|
+
client,
|
255
|
+
models_pb2.Space(
|
256
|
+
feature_view=feature_view.proto_self,
|
257
|
+
),
|
258
|
+
)
|
259
|
+
|
261
260
|
def embeddings_tables(self) -> Ok[Mapping[str, Table]] | InvalidArgumentError:
|
262
261
|
"""Generate per-ouput-source embeddings tables for this space."""
|
263
262
|
return Ok({})
|
264
263
|
|
265
264
|
|
266
|
-
@dataclasses.dataclass(frozen=True)
|
267
265
|
class Node2VecParameters:
|
268
|
-
|
269
|
-
|
270
|
-
|
271
|
-
|
272
|
-
|
273
|
-
|
274
|
-
|
275
|
-
|
276
|
-
|
277
|
-
|
278
|
-
|
279
|
-
|
280
|
-
|
281
|
-
|
282
|
-
|
283
|
-
|
284
|
-
|
285
|
-
|
286
|
-
|
287
|
-
|
288
|
-
|
266
|
+
proto_self: Final[graph_pb2.Node2VecParameters]
|
267
|
+
|
268
|
+
def __init__(self, proto_self: graph_pb2.Node2VecParameters):
|
269
|
+
self.proto_self = proto_self
|
270
|
+
|
271
|
+
@classmethod
|
272
|
+
def create( # noqa: PLR0913
|
273
|
+
cls,
|
274
|
+
dim: int = 10,
|
275
|
+
walk_length: int = 10,
|
276
|
+
window: int = 10,
|
277
|
+
p: float = 1.0,
|
278
|
+
q: float = 1.0,
|
279
|
+
alpha: float = 0.025,
|
280
|
+
min_alpha: float = 0.0001,
|
281
|
+
negative: int = 5,
|
282
|
+
epochs: int = 10,
|
283
|
+
):
|
284
|
+
return cls(
|
285
|
+
graph_pb2.Node2VecParameters(
|
286
|
+
ndim=dim,
|
287
|
+
walk_length=walk_length,
|
288
|
+
window=window,
|
289
|
+
p=p,
|
290
|
+
q=q,
|
291
|
+
alpha=alpha,
|
292
|
+
min_alpha=min_alpha,
|
293
|
+
negative=negative,
|
294
|
+
epochs=epochs,
|
295
|
+
)
|
289
296
|
)
|
290
297
|
|
298
|
+
@property
|
299
|
+
def dim(self) -> int:
|
300
|
+
return self.proto_self.ndim
|
301
|
+
|
302
|
+
@property
|
303
|
+
def walk_length(self) -> int:
|
304
|
+
return self.proto_self.walk_length
|
305
|
+
|
306
|
+
@property
|
307
|
+
def window(self) -> int:
|
308
|
+
return self.proto_self.window
|
309
|
+
|
310
|
+
@property
|
311
|
+
def p(self) -> float:
|
312
|
+
return self.proto_self.p
|
313
|
+
|
314
|
+
@property
|
315
|
+
def q(self) -> float:
|
316
|
+
return self.proto_self.q
|
317
|
+
|
318
|
+
@property
|
319
|
+
def alpha(self) -> float:
|
320
|
+
return self.proto_self.alpha
|
321
|
+
|
322
|
+
@property
|
323
|
+
def min_alpha(self) -> float:
|
324
|
+
return self.proto_self.min_alpha
|
325
|
+
|
326
|
+
@property
|
327
|
+
def negative(self) -> int:
|
328
|
+
return self.proto_self.negative
|
329
|
+
|
330
|
+
@property
|
331
|
+
def epochs(self) -> int:
|
332
|
+
return self.proto_self.epochs
|
333
|
+
|
291
334
|
|
292
335
|
class RelationalSpace(Space):
|
293
336
|
"""Spaces for embeddings that encode relationships."""
|
294
337
|
|
338
|
+
@property
|
339
|
+
def parameters(self) -> Node2VecParameters:
|
340
|
+
return Node2VecParameters(self.proto_self.space_parameters.node2vec_parameters)
|
341
|
+
|
295
342
|
@classmethod
|
296
343
|
def create(
|
297
344
|
cls,
|
345
|
+
name: str,
|
346
|
+
description: str,
|
298
347
|
feature_view: FeatureView,
|
348
|
+
parameters: Node2VecParameters,
|
299
349
|
client: system.Client | None = None,
|
300
350
|
room_id: orm.RoomID | None = None,
|
351
|
+
*,
|
352
|
+
auto_sync: bool = False,
|
301
353
|
) -> Ok[RelationalSpace] | InvalidArgumentError:
|
302
354
|
if not feature_view.relationships:
|
303
355
|
return InvalidArgumentError(
|
@@ -310,34 +362,25 @@ class RelationalSpace(Space):
|
|
310
362
|
client = client or feature_view.client
|
311
363
|
room_id = room_id or Defaults.get_default_room_id(client)
|
312
364
|
proto_self = models_pb2.Space(
|
313
|
-
|
365
|
+
name=name,
|
366
|
+
description=description,
|
367
|
+
auto_sync=auto_sync,
|
368
|
+
feature_view=feature_view.proto_self,
|
369
|
+
room_id=str(room_id),
|
370
|
+
space_parameters=space_pb2.SpaceParameters(
|
371
|
+
node2vec_parameters=parameters.proto_self
|
372
|
+
),
|
314
373
|
)
|
315
374
|
|
316
375
|
return Ok(
|
317
376
|
RelationalSpace(
|
318
377
|
client,
|
319
378
|
proto_self,
|
320
|
-
feature_view,
|
321
379
|
)
|
322
380
|
)
|
323
381
|
|
324
|
-
def with_node2vec(self, params: Node2VecParameters):
|
325
|
-
proto_self = copy.deepcopy(self.proto_self)
|
326
|
-
proto_self.space_parameters.CopyFrom(
|
327
|
-
space_pb2.SpaceParameters(node2vec_parameters=params.to_proto())
|
328
|
-
)
|
329
|
-
return RelationalSpace(
|
330
|
-
self.client, proto_self, self.feature_view().unwrap_or_raise()
|
331
|
-
)
|
332
|
-
|
333
382
|
def legacy_embeddings_table(self) -> Ok[Table] | InvalidArgumentError:
|
334
|
-
|
335
|
-
return InvalidArgumentError("space was not configured")
|
336
|
-
match self.feature_view():
|
337
|
-
case Ok(feature_view):
|
338
|
-
pass
|
339
|
-
case NotFoundError() as err:
|
340
|
-
return InvalidArgumentError.from_(err)
|
383
|
+
feature_view = self.feature_view
|
341
384
|
|
342
385
|
def gen_edge_list_tables():
|
343
386
|
for edge_table in feature_view.output_edge_tables():
|
@@ -361,7 +404,7 @@ class RelationalSpace(Space):
|
|
361
404
|
|
362
405
|
return op_graph.op.embed_node2vec_from_edge_lists(
|
363
406
|
edge_list_tables=edge_list_tables,
|
364
|
-
params=self.
|
407
|
+
params=self.parameters.proto_self,
|
365
408
|
).map(
|
366
409
|
lambda t: Table(
|
367
410
|
self.client,
|
@@ -377,11 +420,7 @@ class RelationalSpace(Space):
|
|
377
420
|
return err
|
378
421
|
case Ok(embeddings_table):
|
379
422
|
pass
|
380
|
-
|
381
|
-
case Ok(feature_view):
|
382
|
-
pass
|
383
|
-
case NotFoundError() as err:
|
384
|
-
return InvalidArgumentError.from_(err)
|
423
|
+
feature_view = self.feature_view
|
385
424
|
id_fields = [
|
386
425
|
field
|
387
426
|
for field in embeddings_table.schema
|
@@ -392,8 +431,7 @@ class RelationalSpace(Space):
|
|
392
431
|
dtype_to_id_field = {field.dtype: field.name for field in id_fields[:-1]}
|
393
432
|
|
394
433
|
tables: Mapping[str, Table] = {}
|
395
|
-
for
|
396
|
-
source = feature_view.source_id_to_feature_view_source[source_id].source
|
434
|
+
for source in feature_view.output_sources:
|
397
435
|
primary_key_field = source.table.schema.get_primary_key()
|
398
436
|
if primary_key_field is None:
|
399
437
|
return InvalidArgumentError(
|
@@ -416,7 +454,7 @@ class RelationalSpace(Space):
|
|
416
454
|
)
|
417
455
|
)
|
418
456
|
.and_then(
|
419
|
-
lambda t, source_id=
|
457
|
+
lambda t, source_id=source.id: t.add_literal_column(
|
420
458
|
"source_id",
|
421
459
|
str(source_id),
|
422
460
|
pa.string(),
|
@@ -442,126 +480,93 @@ class RelationalSpace(Space):
|
|
442
480
|
)
|
443
481
|
|
444
482
|
|
445
|
-
@dataclasses.dataclass
|
446
483
|
class ConcatAndEmbedParameters:
|
447
|
-
|
448
|
-
model_name: str
|
449
|
-
tokenizer_name: str
|
450
|
-
expected_vector_length: int
|
484
|
+
proto_self: Final[embedding_models_pb2.ConcatAndEmbedParameters]
|
451
485
|
|
486
|
+
def __init__(self, proto_self: embedding_models_pb2.ConcatAndEmbedParameters):
|
487
|
+
self.proto_self = proto_self
|
452
488
|
|
453
|
-
|
454
|
-
|
489
|
+
@classmethod
|
490
|
+
def create(
|
491
|
+
cls, column_names: Sequence[str], model_name: str, expected_vector_length: int
|
492
|
+
):
|
493
|
+
return cls(
|
494
|
+
embedding_models_pb2.ConcatAndEmbedParameters(
|
495
|
+
column_names=column_names,
|
496
|
+
model_parameters=embedding_models_pb2.Parameters(
|
497
|
+
model=name_to_proto_embedding_model.get(
|
498
|
+
model_name, embedding_models_pb2.MODEL_UNSPECIFIED
|
499
|
+
),
|
500
|
+
ndim=expected_vector_length,
|
501
|
+
),
|
502
|
+
)
|
503
|
+
)
|
455
504
|
|
456
|
-
|
505
|
+
@property
|
506
|
+
def model_name(self) -> str:
|
507
|
+
return embedding_model_proto_to_name[self.proto_self.model_parameters.model]
|
457
508
|
|
458
|
-
|
459
|
-
|
460
|
-
|
461
|
-
proto_self: models_pb2.Space,
|
462
|
-
feature_view: FeatureView | None = None,
|
463
|
-
output_sources: list[Source] | None = None,
|
464
|
-
):
|
465
|
-
super().__init__(client, proto_self, feature_view)
|
466
|
-
self._output_sources = output_sources
|
467
|
-
|
468
|
-
def output_sources(self):
|
469
|
-
if self._output_sources is not None:
|
470
|
-
return Ok(self._output_sources)
|
471
|
-
match self.feature_view():
|
472
|
-
case Ok(feature_view):
|
473
|
-
return Ok(
|
474
|
-
[
|
475
|
-
feature_view.source_id_to_feature_view_source[
|
476
|
-
output_source
|
477
|
-
].source
|
478
|
-
for output_source in feature_view.output_sources
|
479
|
-
]
|
480
|
-
)
|
481
|
-
case NotFoundError() as err:
|
482
|
-
return err
|
509
|
+
@property
|
510
|
+
def column_names(self) -> Sequence[str]:
|
511
|
+
return self.proto_self.column_names
|
483
512
|
|
484
513
|
@property
|
485
|
-
def
|
486
|
-
return
|
514
|
+
def expected_vector_length(self) -> int:
|
515
|
+
return self.proto_self.model_parameters.ndim
|
516
|
+
|
517
|
+
|
518
|
+
class SemanticSpace(Space):
|
519
|
+
"""Spaces for embedding source properties."""
|
487
520
|
|
488
521
|
@property
|
489
|
-
def
|
490
|
-
return
|
522
|
+
def parameters(self) -> ConcatAndEmbedParameters:
|
523
|
+
return ConcatAndEmbedParameters(
|
524
|
+
self.proto_self.space_parameters.concat_and_embed_parameters
|
525
|
+
)
|
491
526
|
|
492
527
|
@property
|
493
|
-
def
|
494
|
-
|
495
|
-
return embedding_model_proto_to_name[
|
496
|
-
self.concat_and_embed_parameters.model_parameters.model
|
497
|
-
]
|
498
|
-
return None
|
528
|
+
def expected_coordinate_bitwidth(self) -> Literal[32]:
|
529
|
+
return 32
|
499
530
|
|
500
531
|
@classmethod
|
501
532
|
def create(
|
502
533
|
cls,
|
534
|
+
name: str,
|
535
|
+
description: str,
|
503
536
|
feature_view: FeatureView,
|
537
|
+
parameters: ConcatAndEmbedParameters,
|
504
538
|
client: system.Client | None = None,
|
505
539
|
room_id: orm.RoomID | None = None,
|
540
|
+
*,
|
541
|
+
auto_sync: bool = False,
|
506
542
|
) -> Ok[SemanticSpace] | InvalidArgumentError:
|
507
543
|
client = client or feature_view.client
|
508
544
|
if len(feature_view.output_sources) == 0:
|
509
545
|
return InvalidArgumentError(
|
510
546
|
"feature view must have at least one output source"
|
511
547
|
)
|
512
|
-
sources = [
|
513
|
-
feature_view.source_id_to_feature_view_source[output_source].source
|
514
|
-
for output_source in feature_view.output_sources
|
515
|
-
]
|
516
|
-
|
517
548
|
room_id = room_id or Defaults.get_default_room_id(client)
|
518
549
|
proto_self = models_pb2.Space(
|
519
|
-
|
550
|
+
name=name,
|
551
|
+
description=description,
|
552
|
+
auto_sync=auto_sync,
|
553
|
+
feature_view=feature_view.proto_self,
|
554
|
+
room_id=str(room_id),
|
555
|
+
space_parameters=space_pb2.SpaceParameters(
|
556
|
+
concat_and_embed_parameters=parameters.proto_self
|
557
|
+
),
|
520
558
|
)
|
521
559
|
return Ok(
|
522
560
|
SemanticSpace(
|
523
561
|
client,
|
524
562
|
proto_self,
|
525
|
-
feature_view,
|
526
|
-
sources,
|
527
563
|
)
|
528
564
|
)
|
529
565
|
|
530
|
-
def with_concat_and_embed(self, params: ConcatAndEmbedParameters) -> SemanticSpace:
|
531
|
-
proto_self = copy.deepcopy(self.proto_self)
|
532
|
-
proto_self.space_parameters.CopyFrom(
|
533
|
-
space_pb2.SpaceParameters(
|
534
|
-
concat_and_embed_parameters=embedding_models_pb2.ConcatAndEmbedParameters(
|
535
|
-
column_names=params.column_names,
|
536
|
-
model_parameters=embedding_models_pb2.Parameters(
|
537
|
-
model=name_to_proto_embedding_model.get(
|
538
|
-
params.model_name, embedding_models_pb2.MODEL_UNSPECIFIED
|
539
|
-
),
|
540
|
-
ndim=params.expected_vector_length,
|
541
|
-
),
|
542
|
-
)
|
543
|
-
)
|
544
|
-
)
|
545
|
-
return SemanticSpace(
|
546
|
-
self.client,
|
547
|
-
proto_self,
|
548
|
-
self.feature_view().unwrap_or_raise(),
|
549
|
-
output_sources=self.output_sources().unwrap_or_raise(),
|
550
|
-
)
|
551
|
-
|
552
566
|
def embeddings_tables(self) -> Ok[Mapping[str, Table]] | InvalidArgumentError:
|
553
|
-
|
554
|
-
|
555
|
-
|
556
|
-
"space was not configured to produce embeddings"
|
557
|
-
)
|
558
|
-
match self.output_sources():
|
559
|
-
case Ok(output_sources):
|
560
|
-
pass
|
561
|
-
case NotFoundError() as err:
|
562
|
-
return InvalidArgumentError.from_(err)
|
563
|
-
|
564
|
-
params = self.concat_and_embed_parameters
|
567
|
+
params = self.parameters
|
568
|
+
model_name = params.model_name
|
569
|
+
output_sources = self.feature_view.output_sources
|
565
570
|
combined_column_tmp_name = f"__concat-{uuid.uuid4()}"
|
566
571
|
embedding_column_tmp_name = f"__embed-{uuid.uuid4()}"
|
567
572
|
|
@@ -590,8 +595,8 @@ class SemanticSpace(Space):
|
|
590
595
|
combined_column_tmp_name,
|
591
596
|
embedding_column_tmp_name,
|
592
597
|
model_name,
|
593
|
-
|
594
|
-
params.
|
598
|
+
"",
|
599
|
+
params.expected_vector_length,
|
595
600
|
self.expected_coordinate_bitwidth,
|
596
601
|
)
|
597
602
|
)
|
@@ -632,99 +637,68 @@ class SemanticSpace(Space):
|
|
632
637
|
return Ok(tables)
|
633
638
|
|
634
639
|
|
635
|
-
@dataclasses.dataclass
|
636
640
|
class EmbedAndConcatParameters:
|
637
|
-
|
641
|
+
proto_self: Final[embedding_models_pb2.EmbedAndConcatParameters]
|
642
|
+
|
643
|
+
def __init__(self, proto_self: embedding_models_pb2.EmbedAndConcatParameters):
|
644
|
+
self.proto_self = proto_self
|
645
|
+
|
646
|
+
@classmethod
|
647
|
+
def create(cls, expected_vector_length: int):
|
648
|
+
return cls(
|
649
|
+
embedding_models_pb2.EmbedAndConcatParameters(ndim=expected_vector_length)
|
650
|
+
)
|
651
|
+
|
652
|
+
@property
|
653
|
+
def expected_vector_length(self) -> int:
|
654
|
+
return self.proto_self.ndim
|
638
655
|
|
639
656
|
|
640
657
|
class TabularSpace(Space):
|
641
658
|
"""Spaces for embedding source properties."""
|
642
659
|
|
643
|
-
|
644
|
-
|
645
|
-
|
646
|
-
|
647
|
-
|
648
|
-
proto_self: models_pb2.Space,
|
649
|
-
feature_view: FeatureView | None = None,
|
650
|
-
output_sources: list[Source] | None = None,
|
651
|
-
):
|
652
|
-
super().__init__(client, proto_self, feature_view)
|
653
|
-
self._output_sources = output_sources
|
654
|
-
|
655
|
-
def output_sources(self):
|
656
|
-
if self._output_sources is not None:
|
657
|
-
return Ok(self._output_sources)
|
658
|
-
match self.feature_view():
|
659
|
-
case Ok(feature_view):
|
660
|
-
return Ok(
|
661
|
-
[
|
662
|
-
feature_view.source_id_to_feature_view_source[
|
663
|
-
output_source
|
664
|
-
].source
|
665
|
-
for output_source in feature_view.output_sources
|
666
|
-
]
|
667
|
-
)
|
668
|
-
case NotFoundError() as err:
|
669
|
-
return err
|
660
|
+
@property
|
661
|
+
def parameters(self) -> EmbedAndConcatParameters:
|
662
|
+
return EmbedAndConcatParameters(
|
663
|
+
self.proto_self.space_parameters.embed_and_concat_parameters
|
664
|
+
)
|
670
665
|
|
671
666
|
@classmethod
|
672
667
|
def create(
|
673
668
|
cls,
|
669
|
+
name: str,
|
670
|
+
description: str,
|
674
671
|
feature_view: FeatureView,
|
672
|
+
parameters: EmbedAndConcatParameters,
|
675
673
|
client: system.Client | None = None,
|
676
674
|
room_id: orm.RoomID | None = None,
|
675
|
+
*,
|
676
|
+
auto_sync: bool = False,
|
677
677
|
) -> Ok[Self] | InvalidArgumentError:
|
678
678
|
client = client or feature_view.client
|
679
679
|
if len(feature_view.output_sources) == 0:
|
680
680
|
return InvalidArgumentError(
|
681
681
|
"feature view must have at least one output source"
|
682
682
|
)
|
683
|
-
sources = [
|
684
|
-
feature_view.source_id_to_feature_view_source[output_source].source
|
685
|
-
for output_source in feature_view.output_sources
|
686
|
-
]
|
687
683
|
|
688
684
|
room_id = room_id or Defaults.get_default_room_id(client)
|
689
685
|
proto_self = models_pb2.Space(
|
690
|
-
|
691
|
-
|
692
|
-
|
693
|
-
|
694
|
-
|
695
|
-
|
696
|
-
|
697
|
-
|
698
|
-
)
|
699
|
-
)
|
700
|
-
|
701
|
-
def with_embed_and_concat(self, params: EmbedAndConcatParameters) -> TabularSpace:
|
702
|
-
proto_self = copy.deepcopy(self.proto_self)
|
703
|
-
proto_self.space_parameters.CopyFrom(
|
704
|
-
space_pb2.SpaceParameters(
|
705
|
-
embed_and_concat_parameters=embedding_models_pb2.EmbedAndConcatParameters(
|
706
|
-
ndim=params.expected_vector_length,
|
707
|
-
)
|
708
|
-
)
|
709
|
-
)
|
710
|
-
return TabularSpace(
|
711
|
-
self.client,
|
712
|
-
proto_self,
|
713
|
-
self.feature_view().unwrap_or_raise(),
|
714
|
-
output_sources=self.output_sources().unwrap_or_raise(),
|
686
|
+
name=name,
|
687
|
+
description=description,
|
688
|
+
auto_sync=auto_sync,
|
689
|
+
feature_view=feature_view.proto_self,
|
690
|
+
room_id=str(room_id),
|
691
|
+
space_parameters=space_pb2.SpaceParameters(
|
692
|
+
embed_and_concat_parameters=parameters.proto_self
|
693
|
+
),
|
715
694
|
)
|
695
|
+
return Ok(cls(client, proto_self))
|
716
696
|
|
717
|
-
def embeddings_tables(
|
718
|
-
|
719
|
-
|
720
|
-
|
721
|
-
|
722
|
-
match self.output_sources():
|
723
|
-
case Ok(output_sources):
|
724
|
-
pass
|
725
|
-
case NotFoundError() as err:
|
726
|
-
return InvalidArgumentError.from_(err)
|
727
|
-
parameters = self.embed_and_concat_parameters
|
697
|
+
def embeddings_tables( # noqa: C901
|
698
|
+
self,
|
699
|
+
) -> Ok[Mapping[str, Table]] | InvalidArgumentError:
|
700
|
+
output_sources = self.feature_view.output_sources
|
701
|
+
parameters = self.parameters
|
728
702
|
|
729
703
|
tables: Mapping[str, Table] = {}
|
730
704
|
first_schema = output_sources[0].table.schema
|
@@ -783,7 +757,9 @@ class TabularSpace(Space):
|
|
783
757
|
embedding_column_tmp_name = f"__embed-{uuid.uuid4()}"
|
784
758
|
|
785
759
|
# Avoid 0 padding for spaces with small numbers of columns
|
786
|
-
target_list_length = min(
|
760
|
+
target_list_length = min(
|
761
|
+
parameters.expected_vector_length, len(embedding_column_tmp_names)
|
762
|
+
)
|
787
763
|
|
788
764
|
def reduce_dimension(
|
789
765
|
op: op_graph.Op,
|
@@ -856,40 +832,58 @@ class TabularSpace(Space):
|
|
856
832
|
return Ok(tables)
|
857
833
|
|
858
834
|
|
859
|
-
@dataclasses.dataclass
|
860
835
|
class EmbedImageParameters:
|
861
|
-
|
862
|
-
|
863
|
-
|
836
|
+
proto_self: Final[embedding_models_pb2.EmbedImageParameters]
|
837
|
+
|
838
|
+
def __init__(self, proto_self: embedding_models_pb2.EmbedImageParameters):
|
839
|
+
self.proto_self = proto_self
|
840
|
+
|
841
|
+
@classmethod
|
842
|
+
def create(
|
843
|
+
cls, column_name: str, model_name: str, expected_vector_length: int
|
844
|
+
) -> Self:
|
845
|
+
return cls(
|
846
|
+
embedding_models_pb2.EmbedImageParameters(
|
847
|
+
column_name=column_name,
|
848
|
+
model_parameters=embedding_models_pb2.ImageModelParameters(
|
849
|
+
model=name_to_proto_image_model.get(
|
850
|
+
model_name,
|
851
|
+
embedding_models_pb2.IMAGE_MODEL_UNSPECIFIED,
|
852
|
+
),
|
853
|
+
ndim=expected_vector_length,
|
854
|
+
),
|
855
|
+
)
|
856
|
+
)
|
857
|
+
|
858
|
+
@property
|
859
|
+
def column_name(self) -> str:
|
860
|
+
return self.proto_self.column_name
|
861
|
+
|
862
|
+
@property
|
863
|
+
def model_name(self) -> str:
|
864
|
+
return image_model_proto_to_name[self.proto_self.model_parameters.model]
|
865
|
+
|
866
|
+
@property
|
867
|
+
def model(self) -> embedding_models_pb2.ImageModel:
|
868
|
+
return self.proto_self.model_parameters.model
|
869
|
+
|
870
|
+
@property
|
871
|
+
def expected_vector_length(self) -> int:
|
872
|
+
return self.proto_self.model_parameters.ndim
|
864
873
|
|
865
874
|
|
866
875
|
class ImageSpace(Space):
|
867
876
|
"""Spaces for embedding images."""
|
868
877
|
|
869
|
-
|
870
|
-
|
871
|
-
|
872
|
-
|
873
|
-
|
874
|
-
proto_self: models_pb2.Space,
|
875
|
-
feature_view: FeatureView | None = None,
|
876
|
-
output_source: Source | None = None,
|
877
|
-
):
|
878
|
-
super().__init__(client, proto_self, feature_view)
|
879
|
-
self._output_source = output_source
|
878
|
+
@property
|
879
|
+
def parameters(self) -> EmbedImageParameters:
|
880
|
+
return EmbedImageParameters(
|
881
|
+
self.proto_self.space_parameters.embed_image_parameters
|
882
|
+
)
|
880
883
|
|
884
|
+
@property
|
881
885
|
def output_source(self):
|
882
|
-
|
883
|
-
return Ok(self._output_source)
|
884
|
-
match self.feature_view():
|
885
|
-
case Ok(feature_view):
|
886
|
-
return Ok(
|
887
|
-
feature_view.source_id_to_feature_view_source[
|
888
|
-
first(iter(feature_view.output_sources))
|
889
|
-
].source
|
890
|
-
)
|
891
|
-
case NotFoundError() as err:
|
892
|
-
return err
|
886
|
+
return self.feature_view.output_sources[0]
|
893
887
|
|
894
888
|
def _sub_orm_objects(self, orm_object: orm.Space) -> Iterable[orm.Base]:
|
895
889
|
return []
|
@@ -898,78 +892,45 @@ class ImageSpace(Space):
|
|
898
892
|
def expected_coordinate_bitwidth(self) -> Literal[32]:
|
899
893
|
return 32
|
900
894
|
|
901
|
-
@property
|
902
|
-
def model_name(self) -> str | None:
|
903
|
-
if self.embed_image_parameters:
|
904
|
-
return image_model_proto_to_name[
|
905
|
-
self.embed_image_parameters.model_parameters.model
|
906
|
-
]
|
907
|
-
return None
|
908
|
-
|
909
895
|
@classmethod
|
910
896
|
def create(
|
911
897
|
cls,
|
898
|
+
name: str,
|
899
|
+
description: str,
|
912
900
|
feature_view: FeatureView,
|
901
|
+
parameters: EmbedImageParameters,
|
913
902
|
client: system.Client | None = None,
|
914
903
|
room_id: orm.RoomID | None = None,
|
904
|
+
*,
|
905
|
+
auto_sync: bool = False,
|
915
906
|
) -> Ok[Self] | InvalidArgumentError:
|
916
907
|
client = client or feature_view.client
|
917
908
|
if len(feature_view.output_sources) != 1:
|
918
909
|
return InvalidArgumentError(
|
919
910
|
"feature view must have exactly one output source"
|
920
911
|
)
|
921
|
-
source = feature_view.source_id_to_feature_view_source[
|
922
|
-
first(iter(feature_view.output_sources))
|
923
|
-
].source
|
924
|
-
|
925
912
|
room_id = room_id or Defaults.get_default_room_id(client)
|
926
913
|
proto_self = models_pb2.Space(
|
927
|
-
|
914
|
+
name=name,
|
915
|
+
description=description,
|
916
|
+
auto_sync=auto_sync,
|
917
|
+
feature_view=feature_view.proto_self,
|
918
|
+
room_id=str(room_id),
|
919
|
+
space_parameters=space_pb2.SpaceParameters(
|
920
|
+
embed_image_parameters=parameters.proto_self
|
921
|
+
),
|
928
922
|
)
|
929
923
|
return Ok(
|
930
924
|
cls(
|
931
925
|
client,
|
932
926
|
proto_self,
|
933
|
-
feature_view,
|
934
|
-
source,
|
935
927
|
)
|
936
928
|
)
|
937
929
|
|
938
|
-
def with_embed_and_image(self, params: EmbedImageParameters) -> ImageSpace:
|
939
|
-
proto_self = copy.deepcopy(self.proto_self)
|
940
|
-
proto_self.space_parameters.CopyFrom(
|
941
|
-
space_pb2.SpaceParameters(
|
942
|
-
embed_image_parameters=embedding_models_pb2.EmbedImageParameters(
|
943
|
-
column_name=params.column_name,
|
944
|
-
model_parameters=embedding_models_pb2.ImageModelParameters(
|
945
|
-
model=name_to_proto_image_model.get(
|
946
|
-
params.model_name,
|
947
|
-
embedding_models_pb2.IMAGE_MODEL_UNSPECIFIED,
|
948
|
-
),
|
949
|
-
ndim=params.expected_vector_length,
|
950
|
-
),
|
951
|
-
)
|
952
|
-
)
|
953
|
-
)
|
954
|
-
return ImageSpace(
|
955
|
-
self.client,
|
956
|
-
proto_self,
|
957
|
-
self.feature_view().unwrap_or_raise(),
|
958
|
-
output_source=self.output_source().unwrap_or_raise(),
|
959
|
-
)
|
960
|
-
|
961
930
|
def embeddings_tables(self) -> Ok[Mapping[str, Table]] | InvalidArgumentError:
|
962
|
-
|
963
|
-
|
964
|
-
|
965
|
-
"space was not configured to produce embeddings"
|
966
|
-
)
|
967
|
-
match self.output_source():
|
968
|
-
case Ok(output_source):
|
969
|
-
pass
|
970
|
-
case NotFoundError() as err:
|
971
|
-
return InvalidArgumentError.from_(err)
|
972
|
-
params = self.embed_image_parameters
|
931
|
+
params = self.parameters
|
932
|
+
model_name = params.model_name
|
933
|
+
output_source = self.output_source
|
973
934
|
pk_field = output_source.table.schema.get_primary_key()
|
974
935
|
if not pk_field:
|
975
936
|
return InvalidArgumentError("output source must have a primary key")
|
@@ -981,7 +942,7 @@ class ImageSpace(Space):
|
|
981
942
|
column_name=params.column_name,
|
982
943
|
embedding_column_name=embedding_column_tmp_name,
|
983
944
|
model_name=model_name,
|
984
|
-
expected_vector_length=params.
|
945
|
+
expected_vector_length=params.expected_vector_length,
|
985
946
|
expected_coordinate_bitwidth=self.expected_coordinate_bitwidth,
|
986
947
|
)
|
987
948
|
.and_then(
|
@@ -1006,3 +967,10 @@ class ImageSpace(Space):
|
|
1006
967
|
SpecificSpace: TypeAlias = (
|
1007
968
|
RelationalSpace | SemanticSpace | TabularSpace | ImageSpace | UnknownSpace
|
1008
969
|
)
|
970
|
+
|
971
|
+
SpecificSpaceParameters: TypeAlias = (
|
972
|
+
Node2VecParameters
|
973
|
+
| ConcatAndEmbedParameters
|
974
|
+
| EmbedAndConcatParameters
|
975
|
+
| EmbedImageParameters
|
976
|
+
)
|