corvic-engine 0.3.0rc54__cp38-abi3-win_amd64.whl → 0.3.0rc56__cp38-abi3-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- corvic/embed/node2vec.py +2 -2
- corvic/engine/_native.pyd +0 -0
- corvic/model/__init__.py +25 -1
- corvic/model/_agent.py +2 -6
- corvic/model/_base_model.py +43 -21
- corvic/model/_completion_model.py +22 -6
- corvic/model/_feature_view.py +4 -6
- corvic/model/_pipeline.py +2 -6
- corvic/model/_proto_orm_convert.py +127 -164
- corvic/model/_resource.py +2 -6
- corvic/model/_room.py +2 -2
- corvic/model/_source.py +10 -6
- corvic/model/_space.py +35 -8
- corvic/op_graph/encoders.py +48 -1
- corvic/orm/__init__.py +29 -16
- corvic/orm/base.py +1 -1
- corvic/system/in_memory_executor.py +151 -7
- {corvic_engine-0.3.0rc54.dist-info → corvic_engine-0.3.0rc56.dist-info}/METADATA +1 -1
- {corvic_engine-0.3.0rc54.dist-info → corvic_engine-0.3.0rc56.dist-info}/RECORD +25 -25
- {corvic_engine-0.3.0rc54.dist-info → corvic_engine-0.3.0rc56.dist-info}/WHEEL +1 -1
- corvic_generated/model/v1alpha/models_pb2.py +16 -16
- corvic_generated/model/v1alpha/models_pb2.pyi +8 -4
- corvic_generated/orm/v1/table_pb2.py +89 -85
- corvic_generated/orm/v1/table_pb2.pyi +14 -2
- {corvic_engine-0.3.0rc54.dist-info → corvic_engine-0.3.0rc56.dist-info}/licenses/LICENSE +0 -0
corvic/op_graph/encoders.py
CHANGED
@@ -67,7 +67,19 @@ def from_proto(
|
|
67
67
|
) -> StandardScaler: ...
|
68
68
|
|
69
69
|
|
70
|
+
@overload
|
71
|
+
def from_proto(
|
72
|
+
proto: table_pb2.TimestampEncoder,
|
73
|
+
) -> TimestampEncoder: ...
|
74
|
+
|
75
|
+
|
76
|
+
@overload
|
70
77
|
def from_proto(
|
78
|
+
proto: table_pb2.TextEncoder,
|
79
|
+
) -> TextEncoder: ...
|
80
|
+
|
81
|
+
|
82
|
+
def from_proto( # noqa: C901
|
71
83
|
proto: (
|
72
84
|
table_pb2.Encoder
|
73
85
|
| table_pb2.OneHotEncoder
|
@@ -78,6 +90,8 @@ def from_proto(
|
|
78
90
|
| table_pb2.Binarizer
|
79
91
|
| table_pb2.MaxAbsScaler
|
80
92
|
| table_pb2.StandardScaler
|
93
|
+
| table_pb2.TimestampEncoder
|
94
|
+
| table_pb2.TextEncoder
|
81
95
|
),
|
82
96
|
) -> Encoder:
|
83
97
|
"""Create an Encoder wrapper around an Encoder protobuf message."""
|
@@ -100,6 +114,10 @@ def from_proto(
|
|
100
114
|
return MaxAbsScaler(table_pb2.Encoder(max_abs_scaler=proto))
|
101
115
|
case table_pb2.StandardScaler():
|
102
116
|
return StandardScaler(table_pb2.Encoder(standard_scaler=proto))
|
117
|
+
case table_pb2.TimestampEncoder():
|
118
|
+
return TimestampEncoder(table_pb2.Encoder(timestamp_encoder=proto))
|
119
|
+
case table_pb2.TextEncoder():
|
120
|
+
return TextEncoder(table_pb2.Encoder(text_encoder=proto))
|
103
121
|
|
104
122
|
|
105
123
|
def _from_encoder_type(proto: table_pb2.Encoder):
|
@@ -249,6 +267,22 @@ class StandardScaler(_Base):
|
|
249
267
|
return self._proto.standard_scaler.with_std
|
250
268
|
|
251
269
|
|
270
|
+
class TimestampEncoder(_Base):
|
271
|
+
"""Encode timestamp features into a numeric array."""
|
272
|
+
|
273
|
+
@property
|
274
|
+
def output_dtype(self) -> DataType:
|
275
|
+
return pl.List(pl.Float32())
|
276
|
+
|
277
|
+
|
278
|
+
class TextEncoder(_Base):
|
279
|
+
"""Encode text features into a numeric vector."""
|
280
|
+
|
281
|
+
@property
|
282
|
+
def output_dtype(self) -> DataType:
|
283
|
+
return pl.List(pl.Float32())
|
284
|
+
|
285
|
+
|
252
286
|
def one_hot_encoder():
|
253
287
|
"""Build a OneHotEncoder Encoder."""
|
254
288
|
return from_proto(table_pb2.OneHotEncoder())
|
@@ -316,6 +350,16 @@ def standard_scaler(
|
|
316
350
|
)
|
317
351
|
|
318
352
|
|
353
|
+
def timestamp_encoder():
|
354
|
+
"""Build a TimestampEncoder Encoder."""
|
355
|
+
return from_proto(table_pb2.TimestampEncoder())
|
356
|
+
|
357
|
+
|
358
|
+
def text_encoder():
|
359
|
+
"""Build a TextEncoder Encoder."""
|
360
|
+
return from_proto(table_pb2.TextEncoder())
|
361
|
+
|
362
|
+
|
319
363
|
Encoder = (
|
320
364
|
OneHotEncoder
|
321
365
|
| MinMaxScaler
|
@@ -325,9 +369,10 @@ Encoder = (
|
|
325
369
|
| Binarizer
|
326
370
|
| MaxAbsScaler
|
327
371
|
| StandardScaler
|
372
|
+
| TimestampEncoder
|
373
|
+
| TextEncoder
|
328
374
|
)
|
329
375
|
|
330
|
-
|
331
376
|
_ENCODER_NAME_TO_ENCODER_TYPE: Final = {
|
332
377
|
"one_hot_encoder": OneHotEncoder,
|
333
378
|
"min_max_scaler": MinMaxScaler,
|
@@ -337,6 +382,8 @@ _ENCODER_NAME_TO_ENCODER_TYPE: Final = {
|
|
337
382
|
"binarizer": Binarizer,
|
338
383
|
"max_abs_scaler": MaxAbsScaler,
|
339
384
|
"standard_scaler": StandardScaler,
|
385
|
+
"timestamp_encoder": TimestampEncoder,
|
386
|
+
"text_encoder": TextEncoder,
|
340
387
|
}
|
341
388
|
|
342
389
|
_ENCODER_TYPE_TO_ENCODER_FIELD_NAME: Final[dict[type[Any], str]] = {
|
corvic/orm/__init__.py
CHANGED
@@ -2,6 +2,8 @@
|
|
2
2
|
|
3
3
|
from __future__ import annotations
|
4
4
|
|
5
|
+
from datetime import datetime
|
6
|
+
|
5
7
|
import sqlalchemy as sa
|
6
8
|
from sqlalchemy import orm as sa_orm
|
7
9
|
|
@@ -37,6 +39,7 @@ from corvic.orm.keys import (
|
|
37
39
|
ForeignKey,
|
38
40
|
primary_key_foreign_column,
|
39
41
|
primary_key_identity_column,
|
42
|
+
primary_key_uuid_column,
|
40
43
|
)
|
41
44
|
from corvic.orm.mixins import (
|
42
45
|
BelongsToOrgMixin,
|
@@ -64,11 +67,11 @@ from corvic_generated.status.v1 import event_pb2
|
|
64
67
|
# and if sub-orm-model updates are required they are explicit.
|
65
68
|
|
66
69
|
|
67
|
-
class Org(SoftDeleteMixin, OrgBase):
|
70
|
+
class Org(SoftDeleteMixin, OrgBase, kw_only=True):
|
68
71
|
"""An organization is a top-level grouping of resources."""
|
69
72
|
|
70
73
|
|
71
|
-
class Room(BelongsToOrgMixin, SoftDeleteMixin, Base):
|
74
|
+
class Room(BelongsToOrgMixin, SoftDeleteMixin, Base, kw_only=True):
|
72
75
|
"""A Room is a logical collection of Documents."""
|
73
76
|
|
74
77
|
__tablename__ = "room"
|
@@ -86,15 +89,17 @@ class BelongsToRoomMixin(sa_orm.MappedAsDataclass):
|
|
86
89
|
room_id: sa_orm.Mapped[RoomID | None] = sa_orm.mapped_column(
|
87
90
|
ForeignKey(Room).make(ondelete="CASCADE"),
|
88
91
|
nullable=True,
|
92
|
+
default=None,
|
89
93
|
)
|
90
94
|
|
91
95
|
|
92
|
-
class DefaultObjects(Base):
|
96
|
+
class DefaultObjects(Base, kw_only=True):
|
93
97
|
"""Holds the identifiers for default objects."""
|
94
98
|
|
95
99
|
__tablename__ = "default_objects"
|
96
|
-
default_org: sa_orm.Mapped[OrgID] = sa_orm.mapped_column(
|
97
|
-
ForeignKey(Org).make(ondelete="CASCADE")
|
100
|
+
default_org: sa_orm.Mapped[OrgID | None] = sa_orm.mapped_column(
|
101
|
+
ForeignKey(Org).make(ondelete="CASCADE"),
|
102
|
+
nullable=False,
|
98
103
|
)
|
99
104
|
default_room: sa_orm.Mapped[RoomID | None] = sa_orm.mapped_column(
|
100
105
|
ForeignKey(Room).make(ondelete="CASCADE"), nullable=True, default=None
|
@@ -102,7 +107,7 @@ class DefaultObjects(Base):
|
|
102
107
|
version: sa_orm.Mapped[int | None] = primary_key_identity_column(type_=INT_PK_TYPE)
|
103
108
|
|
104
109
|
|
105
|
-
class Resource(BelongsToOrgMixin, BelongsToRoomMixin, Base):
|
110
|
+
class Resource(BelongsToOrgMixin, BelongsToRoomMixin, Base, kw_only=True):
|
106
111
|
"""A Resource is a reference to some durably stored file.
|
107
112
|
|
108
113
|
E.g., a document could be a PDF file, an image, or a text transcript of a
|
@@ -127,7 +132,7 @@ class Resource(BelongsToOrgMixin, BelongsToRoomMixin, Base):
|
|
127
132
|
)
|
128
133
|
|
129
134
|
|
130
|
-
class Source(BelongsToOrgMixin, BelongsToRoomMixin, Base):
|
135
|
+
class Source(BelongsToOrgMixin, BelongsToRoomMixin, Base, kw_only=True):
|
131
136
|
"""A source."""
|
132
137
|
|
133
138
|
__tablename__ = "source"
|
@@ -150,7 +155,7 @@ class Source(BelongsToOrgMixin, BelongsToRoomMixin, Base):
|
|
150
155
|
return self.name
|
151
156
|
|
152
157
|
|
153
|
-
class Pipeline(BelongsToOrgMixin, BelongsToRoomMixin, Base):
|
158
|
+
class Pipeline(BelongsToOrgMixin, BelongsToRoomMixin, Base, kw_only=True):
|
154
159
|
"""A resource to source pipeline."""
|
155
160
|
|
156
161
|
__tablename__ = "pipeline"
|
@@ -170,7 +175,7 @@ class Pipeline(BelongsToOrgMixin, BelongsToRoomMixin, Base):
|
|
170
175
|
)
|
171
176
|
|
172
177
|
|
173
|
-
class PipelineInput(BelongsToOrgMixin, BelongsToRoomMixin, Base):
|
178
|
+
class PipelineInput(BelongsToOrgMixin, BelongsToRoomMixin, Base, kw_only=True):
|
174
179
|
"""Pipeline input resources."""
|
175
180
|
|
176
181
|
__tablename__ = "pipeline_input"
|
@@ -188,7 +193,7 @@ class PipelineInput(BelongsToOrgMixin, BelongsToRoomMixin, Base):
|
|
188
193
|
)
|
189
194
|
|
190
195
|
|
191
|
-
class PipelineOutput(BelongsToOrgMixin, BelongsToRoomMixin, Base):
|
196
|
+
class PipelineOutput(BelongsToOrgMixin, BelongsToRoomMixin, Base, kw_only=True):
|
192
197
|
"""Objects for tracking pipeline output sources."""
|
193
198
|
|
194
199
|
__tablename__ = "pipeline_output"
|
@@ -206,7 +211,9 @@ class PipelineOutput(BelongsToOrgMixin, BelongsToRoomMixin, Base):
|
|
206
211
|
)
|
207
212
|
|
208
213
|
|
209
|
-
class FeatureView(
|
214
|
+
class FeatureView(
|
215
|
+
SoftDeleteMixin, BelongsToOrgMixin, BelongsToRoomMixin, Base, kw_only=True
|
216
|
+
):
|
210
217
|
"""A FeatureView is a logical collection of sources used by various spaces."""
|
211
218
|
|
212
219
|
__tablename__ = "feature_view"
|
@@ -231,7 +238,7 @@ class FeatureView(SoftDeleteMixin, BelongsToOrgMixin, BelongsToRoomMixin, Base):
|
|
231
238
|
)
|
232
239
|
|
233
240
|
|
234
|
-
class FeatureViewSource(BelongsToOrgMixin, BelongsToRoomMixin, Base):
|
241
|
+
class FeatureViewSource(BelongsToOrgMixin, BelongsToRoomMixin, Base, kw_only=True):
|
235
242
|
"""A source inside of a feature view."""
|
236
243
|
|
237
244
|
__tablename__ = "feature_view_source"
|
@@ -255,7 +262,7 @@ class FeatureViewSource(BelongsToOrgMixin, BelongsToRoomMixin, Base):
|
|
255
262
|
)
|
256
263
|
|
257
264
|
|
258
|
-
class Space(BelongsToOrgMixin, BelongsToRoomMixin, Base):
|
265
|
+
class Space(BelongsToOrgMixin, BelongsToRoomMixin, Base, kw_only=True):
|
259
266
|
"""A space is a named evaluation of space parameters."""
|
260
267
|
|
261
268
|
__tablename__ = "space"
|
@@ -320,7 +327,7 @@ class SpaceRun(BelongsToOrgMixin, BelongsToRoomMixin, Base, kw_only=True):
|
|
320
327
|
)
|
321
328
|
|
322
329
|
|
323
|
-
class Agent(SoftDeleteMixin, BelongsToOrgMixin, BelongsToRoomMixin, Base):
|
330
|
+
class Agent(SoftDeleteMixin, BelongsToOrgMixin, BelongsToRoomMixin, Base, kw_only=True):
|
324
331
|
"""An Agent."""
|
325
332
|
|
326
333
|
__tablename__ = "agent"
|
@@ -342,7 +349,7 @@ class Agent(SoftDeleteMixin, BelongsToOrgMixin, BelongsToRoomMixin, Base):
|
|
342
349
|
)
|
343
350
|
|
344
351
|
|
345
|
-
class AgentSpaceAssociation(BelongsToOrgMixin, BelongsToRoomMixin, Base):
|
352
|
+
class AgentSpaceAssociation(BelongsToOrgMixin, BelongsToRoomMixin, Base, kw_only=True):
|
346
353
|
__tablename__ = "agent_space_association"
|
347
354
|
|
348
355
|
space_run_id: sa_orm.Mapped[SpaceRunID | None] = sa_orm.mapped_column(
|
@@ -356,7 +363,7 @@ class AgentSpaceAssociation(BelongsToOrgMixin, BelongsToRoomMixin, Base):
|
|
356
363
|
)
|
357
364
|
|
358
365
|
|
359
|
-
class CompletionModel(SoftDeleteMixin, BelongsToOrgMixin, Base):
|
366
|
+
class CompletionModel(SoftDeleteMixin, BelongsToOrgMixin, Base, kw_only=True):
|
360
367
|
"""A customer's custom completion model definition."""
|
361
368
|
|
362
369
|
__tablename__ = "completion_model"
|
@@ -369,6 +376,11 @@ class CompletionModel(SoftDeleteMixin, BelongsToOrgMixin, Base):
|
|
369
376
|
sa_orm.mapped_column(default=None)
|
370
377
|
)
|
371
378
|
secret_api_key: sa_orm.Mapped[str] = sa_orm.mapped_column(sa.Text, default=None)
|
379
|
+
last_validation_time: sa_orm.Mapped[datetime | None] = sa_orm.mapped_column(
|
380
|
+
sa.DateTime(timezone=True),
|
381
|
+
server_default=None,
|
382
|
+
default=None,
|
383
|
+
)
|
372
384
|
|
373
385
|
@property
|
374
386
|
def model_key(self):
|
@@ -434,6 +446,7 @@ __all__ = [
|
|
434
446
|
"UserMessageID",
|
435
447
|
"primary_key_foreign_column",
|
436
448
|
"primary_key_identity_column",
|
449
|
+
"primary_key_uuid_column",
|
437
450
|
"ProtoMessageDecorator",
|
438
451
|
"IntIDDecorator",
|
439
452
|
]
|
corvic/orm/base.py
CHANGED
@@ -178,7 +178,7 @@ class OrgBase(Base):
|
|
178
178
|
# overriding table_args is the recommended way of defining these base model types
|
179
179
|
__table_args__: ClassVar[Any] = ({"extend_existing": True},)
|
180
180
|
|
181
|
-
id: sa_orm.Mapped[OrgID] = primary_key_uuid_column()
|
181
|
+
id: sa_orm.Mapped[OrgID | None] = primary_key_uuid_column()
|
182
182
|
|
183
183
|
@property
|
184
184
|
def name(self) -> str:
|
corvic/system/in_memory_executor.py
CHANGED
@@ -5,6 +5,7 @@ from __future__ import annotations
|
|
5
5
|
import dataclasses
|
6
6
|
import datetime
|
7
7
|
import functools
|
8
|
+
import math
|
8
9
|
from collections.abc import MutableMapping
|
9
10
|
from contextlib import nullcontext
|
10
11
|
from typing import Any, Final, cast
|
@@ -46,6 +47,9 @@ from corvic_generated.orm.v1 import table_pb2
|
|
46
47
|
|
47
48
|
_logger = structlog.get_logger()
|
48
49
|
|
50
|
+
"""Reference and Maximum number of years for normalizing year in Datetime encoder"""
|
51
|
+
REFERENCE_YEAR: Final = 1900
|
52
|
+
MAX_NUMBER_OF_YEARS: Final = 200
|
49
53
|
|
50
54
|
_MIN_EMBEDDINGS_FOR_EMBEDDINGS_SUMMARY: Final = 3
|
51
55
|
|
@@ -815,7 +819,119 @@ class InMemoryExecutor(OpGraphExecutor):
|
|
815
819
|
)
|
816
820
|
)
|
817
821
|
|
818
|
-
|
822
|
+
@staticmethod
|
823
|
+
def get_cyclic_encoding(
|
824
|
+
series: pl.Series,
|
825
|
+
period: int,
|
826
|
+
) -> tuple[pl.Series, pl.Series]:
|
827
|
+
sine_series = (2 * math.pi * series / period).sin().alias(f"{series.name}_sine")
|
828
|
+
cosine_series = (
|
829
|
+
(2 * math.pi * series / period).cos().alias(f"{series.name}_cosine")
|
830
|
+
)
|
831
|
+
return sine_series, cosine_series
|
832
|
+
|
833
|
+
@staticmethod
|
834
|
+
def encode_datetime(series: pl.Series) -> pl.Series:
|
835
|
+
match series.dtype:
|
836
|
+
case pl.Date | pl.Time:
|
837
|
+
pass
|
838
|
+
case pl.Datetime:
|
839
|
+
series = series.dt.replace_time_zone("UTC")
|
840
|
+
case _:
|
841
|
+
raise ValueError("Invalid arguments, expected a datetime series")
|
842
|
+
|
843
|
+
if series.is_null().all():
|
844
|
+
zero_vector = pl.zeros(11, dtype=pl.Float32, eager=True)
|
845
|
+
return pl.Series([zero_vector] * len(series), dtype=pl.List(pl.Float32))
|
846
|
+
|
847
|
+
n = len(series)
|
848
|
+
year_norm = pl.zeros(n, dtype=pl.Float32, eager=True).alias("year")
|
849
|
+
month_sine = pl.zeros(n, dtype=pl.Float32, eager=True).alias("month_sine")
|
850
|
+
month_cosine = pl.zeros(n, dtype=pl.Float32, eager=True).alias("month_cosine")
|
851
|
+
day_sine = pl.zeros(n, dtype=pl.Float32, eager=True).alias("day_sine")
|
852
|
+
day_cosine = pl.zeros(n, dtype=pl.Float32, eager=True).alias("day_cosine")
|
853
|
+
hour_sine = pl.zeros(n, dtype=pl.Float32, eager=True).alias("hour_sine")
|
854
|
+
hour_cosine = pl.zeros(n, dtype=pl.Float32, eager=True).alias("hour_cosine")
|
855
|
+
minute_sine = pl.zeros(n, dtype=pl.Float32, eager=True).alias("minute_sine")
|
856
|
+
minute_cosine = pl.zeros(n, dtype=pl.Float32, eager=True).alias("minute_cosine")
|
857
|
+
second_sine = pl.zeros(n, dtype=pl.Float32, eager=True).alias("second_sine")
|
858
|
+
second_cosine = pl.zeros(n, dtype=pl.Float32, eager=True).alias("second_cosine")
|
859
|
+
|
860
|
+
if series.dtype in [pl.Date, pl.Datetime]:
|
861
|
+
try:
|
862
|
+
year = series.dt.year().cast(pl.Float32).alias("year")
|
863
|
+
month = series.dt.month().cast(pl.Float32).alias("month")
|
864
|
+
day = series.dt.day().cast(pl.Float32).alias("day")
|
865
|
+
|
866
|
+
year_norm = (year - REFERENCE_YEAR) / MAX_NUMBER_OF_YEARS
|
867
|
+
month_sine, month_cosine = InMemoryExecutor.get_cyclic_encoding(
|
868
|
+
month, 12
|
869
|
+
)
|
870
|
+
day_sine, day_cosine = InMemoryExecutor.get_cyclic_encoding(day, 31)
|
871
|
+
except pl.exceptions.PanicException as e:
|
872
|
+
_logger.exception("Error extracting datetime", exc_info=e)
|
873
|
+
|
874
|
+
if series.dtype in [pl.Time, pl.Datetime]:
|
875
|
+
try:
|
876
|
+
hour = series.dt.hour().cast(pl.Float32).alias("hour")
|
877
|
+
minute = series.dt.minute().cast(pl.Float32).alias("minute")
|
878
|
+
second = series.dt.second().cast(pl.Float32).alias("second")
|
879
|
+
|
880
|
+
hour_sine, hour_cosine = InMemoryExecutor.get_cyclic_encoding(hour, 24)
|
881
|
+
minute_sine, minute_cosine = InMemoryExecutor.get_cyclic_encoding(
|
882
|
+
minute, 60
|
883
|
+
)
|
884
|
+
second_sine, second_cosine = InMemoryExecutor.get_cyclic_encoding(
|
885
|
+
second, 60
|
886
|
+
)
|
887
|
+
except pl.exceptions.PanicException as e:
|
888
|
+
_logger.exception("Error extracting datetime", exc_info=e)
|
889
|
+
|
890
|
+
return pl.DataFrame(
|
891
|
+
[
|
892
|
+
year_norm.fill_null(0.0),
|
893
|
+
month_sine.fill_null(0.0),
|
894
|
+
month_cosine.fill_null(0.0),
|
895
|
+
day_sine.fill_null(0.0),
|
896
|
+
day_cosine.fill_null(0.0),
|
897
|
+
hour_sine.fill_null(0.0),
|
898
|
+
hour_cosine.fill_null(0.0),
|
899
|
+
minute_sine.fill_null(0.0),
|
900
|
+
minute_cosine.fill_null(0.0),
|
901
|
+
second_sine.fill_null(0.0),
|
902
|
+
second_cosine.fill_null(0.0),
|
903
|
+
]
|
904
|
+
).select(pl.concat_list(pl.all()).alias(series.name))[series.name]
|
905
|
+
|
906
|
+
@staticmethod
|
907
|
+
def encode_duration(series: pl.Series) -> pl.Series:
|
908
|
+
if series.dtype != pl.Duration:
|
909
|
+
raise ValueError("Invalid arguments, expected a duration series")
|
910
|
+
if series.is_null().all():
|
911
|
+
return pl.zeros(len(series), dtype=pl.Float32, eager=True)
|
912
|
+
|
913
|
+
return series.dt.total_seconds().cast(pl.Float32).fill_null(0.0)
|
914
|
+
|
915
|
+
@staticmethod
|
916
|
+
def encode_text(series: pl.Series) -> pl.Series:
|
917
|
+
match series.dtype:
|
918
|
+
case pl.String:
|
919
|
+
pass
|
920
|
+
case pl.Binary:
|
921
|
+
series = series.map_elements(
|
922
|
+
lambda x: x.decode("utf-8", errors="replace"),
|
923
|
+
return_dtype=pl.String,
|
924
|
+
)
|
925
|
+
case _:
|
926
|
+
raise ValueError("Invalid arguments, expected a string series")
|
927
|
+
series = series.fill_null(" ").replace("", " ")
|
928
|
+
return pl.Series(
|
929
|
+
series.name,
|
930
|
+
[[1 / (len(doc) + 1)] for doc in series],
|
931
|
+
pl.List(pl.Float32),
|
932
|
+
)
|
933
|
+
|
934
|
+
def _execute_encode_columns( # noqa: C901, PLR0915
|
819
935
|
self, op: op_graph.op.EncodeColumns, context: _InMemoryExecutionContext
|
820
936
|
):
|
821
937
|
match self._execute(op.source, context):
|
@@ -903,9 +1019,12 @@ class InMemoryExecutor(OpGraphExecutor):
|
|
903
1019
|
from sklearn.preprocessing import MaxAbsScaler
|
904
1020
|
|
905
1021
|
encoder = MaxAbsScaler()
|
906
|
-
|
907
|
-
|
908
|
-
|
1022
|
+
try:
|
1023
|
+
encoded = encoder.fit_transform(
|
1024
|
+
np.nan_to_num(to_encode.to_numpy()).reshape(-1, 1)
|
1025
|
+
).flatten()
|
1026
|
+
except ValueError:
|
1027
|
+
encoded = np.array([])
|
909
1028
|
|
910
1029
|
case op_graph.encoder.StandardScaler():
|
911
1030
|
from sklearn.preprocessing import StandardScaler
|
@@ -917,6 +1036,28 @@ class InMemoryExecutor(OpGraphExecutor):
|
|
917
1036
|
encoded = encoder.fit_transform(
|
918
1037
|
to_encode.to_numpy().reshape(-1, 1)
|
919
1038
|
).flatten()
|
1039
|
+
|
1040
|
+
case op_graph.encoder.TimestampEncoder():
|
1041
|
+
if to_encode.dtype == pl.datatypes.Duration:
|
1042
|
+
encoded = self.encode_duration(to_encode)
|
1043
|
+
else:
|
1044
|
+
encoded = self.encode_datetime(to_encode)
|
1045
|
+
source_df = source_df.with_columns(
|
1046
|
+
encoded.rename(encoder_arg.encoded_column_name).cast(
|
1047
|
+
encoder_arg.encoder.output_dtype
|
1048
|
+
)
|
1049
|
+
)
|
1050
|
+
continue
|
1051
|
+
|
1052
|
+
case op_graph.encoder.TextEncoder():
|
1053
|
+
encoded = self.encode_text(to_encode)
|
1054
|
+
source_df = source_df.with_columns(
|
1055
|
+
encoded.rename(encoder_arg.encoded_column_name).cast(
|
1056
|
+
encoder_arg.encoder.output_dtype
|
1057
|
+
)
|
1058
|
+
)
|
1059
|
+
continue
|
1060
|
+
|
920
1061
|
source_df = source_df.with_columns(
|
921
1062
|
pl.Series(
|
922
1063
|
name=encoder_arg.encoded_column_name,
|
@@ -1158,9 +1299,12 @@ class InMemoryExecutor(OpGraphExecutor):
|
|
1158
1299
|
case err:
|
1159
1300
|
return err
|
1160
1301
|
source_df = _as_df(source_batches)
|
1161
|
-
|
1162
|
-
|
1163
|
-
|
1302
|
+
if len(source_df):
|
1303
|
+
existing_length = get_polars_embedding_length(
|
1304
|
+
source_df, op.column_name
|
1305
|
+
).unwrap_or_raise()
|
1306
|
+
else:
|
1307
|
+
existing_length = 0
|
1164
1308
|
head_length = (
|
1165
1309
|
op.target_column_length
|
1166
1310
|
if existing_length >= op.target_column_length
|
{corvic_engine-0.3.0rc54.dist-info → corvic_engine-0.3.0rc56.dist-info}/RECORD
CHANGED
@@ -1,9 +1,9 @@
|
|
1
|
-
corvic_engine-0.3.
|
2
|
-
corvic_engine-0.3.
|
3
|
-
corvic_engine-0.3.
|
1
|
+
corvic_engine-0.3.0rc56.dist-info/METADATA,sha256=kGcTUywSpPxHTYF5tu-LPySqBXMZgKxvNVgkOssAn0A,1876
|
2
|
+
corvic_engine-0.3.0rc56.dist-info/WHEEL,sha256=hKPP3BCTWtTwj6SFaSI--T5aOGqh_llYfbZ_BsqivwA,94
|
3
|
+
corvic_engine-0.3.0rc56.dist-info/licenses/LICENSE,sha256=DSS1OD0oIgssKOmAzkMRBv5jvvVuZQbrIv8lpl9DXY8,1035
|
4
4
|
corvic/context/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
5
5
|
corvic/context/__init__.py,sha256=zBnPiP-tStGSVMG_0-G_0ay6-yIX2aerW_oYRzAex74,1702
|
6
|
-
corvic/embed/node2vec.py,sha256=
|
6
|
+
corvic/embed/node2vec.py,sha256=XIJjFDdT-JnmZ43lgP-K-dLgnR17L_uaJqBPAYlsPsk,11148
|
7
7
|
corvic/embed/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
8
8
|
corvic/embed/__init__.py,sha256=cZZSrRXmezJuTafcQgrB1rbitqXZTVY1B5ryRzAlvgs,144
|
9
9
|
corvic/embedding_metric/embeddings.py,sha256=5jvSY0cg5P-Wg_KN7DsrcPo5AfJ_1-XKdErx_dNN5B8,14082
|
@@ -13,22 +13,22 @@ corvic/engine/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
13
13
|
corvic/engine/_native.pyi,sha256=KYMPtvXqHZ-jMgZohLf4se3rr-rBpCihmjANcr6s8ag,1390
|
14
14
|
corvic/engine/__init__.py,sha256=XL4Vg7rNcBi29ccVelpeFizR9oJtGYXDn84W9zok9d4,975
|
15
15
|
corvic/model/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
16
|
-
corvic/model/_agent.py,sha256=
|
17
|
-
corvic/model/_base_model.py,sha256=
|
18
|
-
corvic/model/_completion_model.py,sha256=
|
16
|
+
corvic/model/_agent.py,sha256=8tle_IGhy0LTPd1nNXDfBypnzF3CI7S9fhZbDsVxnZc,4737
|
17
|
+
corvic/model/_base_model.py,sha256=WYBBPa1TeU9wchh-UBFlMwuQMDY5cKHBlDznhLbnSHA,8989
|
18
|
+
corvic/model/_completion_model.py,sha256=f_ud3xW1iFXSijGMo0WYDmLfmM6mQayAcjTW37AM3q8,7337
|
19
19
|
corvic/model/_defaults.py,sha256=yoKPPSmYJCE5YAD5jLTEmT4XNf_zXoggNK-uyG8MfVs,1524
|
20
20
|
corvic/model/_errors.py,sha256=Ctlq04SDwHzJPvLaL1rzqzwVqf2b50EILfW3cH4vnh8,261
|
21
21
|
corvic/model/_feature_type.py,sha256=Y-_-wa9fv7XaCAkxfjjoCLxxK2Ftfba-PMefD7bNXzs,917
|
22
|
-
corvic/model/_feature_view.py,sha256=
|
23
|
-
corvic/model/_pipeline.py,sha256=
|
24
|
-
corvic/model/_proto_orm_convert.py,sha256
|
25
|
-
corvic/model/_resource.py,sha256=
|
26
|
-
corvic/model/_room.py,sha256=
|
27
|
-
corvic/model/_source.py,sha256=
|
28
|
-
corvic/model/_space.py,sha256=
|
29
|
-
corvic/model/__init__.py,sha256=
|
22
|
+
corvic/model/_feature_view.py,sha256=gdcXzsMuxpJ7vwbIGYgZlLYNxi2zvdZXvFsb36x6lKg,49694
|
23
|
+
corvic/model/_pipeline.py,sha256=c16ap3yHQXqBmjG_2bMzz8hBYJCr14V2WxwlAYOw5Zw,16279
|
24
|
+
corvic/model/_proto_orm_convert.py,sha256=jmzmaaUkSxeHB5OMef92AyGw7sorJ6pP4ylbeKXoHvA,26120
|
25
|
+
corvic/model/_resource.py,sha256=w5m6mmD8KrHJ8efPTfRV0JKaCmkDRaxlGeuRMmVbw10,7773
|
26
|
+
corvic/model/_room.py,sha256=36mXngZ38L4mr6_LgUm-QgsUUaoGMiYQRfvXLV_jd-4,2914
|
27
|
+
corvic/model/_source.py,sha256=A1Jk4r5mB0f-Y3L8esaQFCUAu7CCTlwAm7f4qSnvjsM,9603
|
28
|
+
corvic/model/_space.py,sha256=13ggLTCQMNTYYpP5PldMqtJiKp3sWOVRhQcktmoHefA,35590
|
29
|
+
corvic/model/__init__.py,sha256=Lb-yC04t17Hr2TlnGfn5Ewzd2h1nH4hb9tKdMNAak9s,3075
|
30
30
|
corvic/op_graph/aggregation.py,sha256=8X6vqXD7dLHrhYJU0BqmhUsWGbzD1zSP5Db5VHdIru4,6187
|
31
|
-
corvic/op_graph/encoders.py,sha256=
|
31
|
+
corvic/op_graph/encoders.py,sha256=93wYoBCn_us5lRCkqvjaP0LTg3LBB3yEfhzICv06bB0,10460
|
32
32
|
corvic/op_graph/errors.py,sha256=I4NE5053d0deGm5xx5EmyP4f98qx42xnIsW1IA-2hy4,163
|
33
33
|
corvic/op_graph/feature_types.py,sha256=ZE6onUGW4Xa7tPL4XgRVQ1Tvj5FVJJ66di3ShDTR0Ak,9623
|
34
34
|
corvic/op_graph/ops.py,sha256=G2bDIK_hlKJxqOX5Xu9hEoLBkdqi-TsZSn6tTagqjgg,109823
|
@@ -40,7 +40,7 @@ corvic/op_graph/sample_strategy.py,sha256=DrbtJ3ORkIRfyIE_FdlOh_UMnCW_K9jL1LeonV
|
|
40
40
|
corvic/op_graph/_schema.py,sha256=STbxY5PIqIA6xkSDeK8k72Nutsxq5jGe7e_aT35aznI,5733
|
41
41
|
corvic/op_graph/_transformations.py,sha256=L9Au_GcciPynww4ZXojMtNdPJ36Qboc9gn0bVzXLifU,9445
|
42
42
|
corvic/op_graph/__init__.py,sha256=1DMrQfuuS3FkLa9DXYDjSDLurdxxpG5H1jB2ctaa9xo,1444
|
43
|
-
corvic/orm/base.py,sha256=
|
43
|
+
corvic/orm/base.py,sha256=95nkqycCZ1FaWAhTsa7zbZ0YuwNFkMUW7Wk8yhtYau8,8824
|
44
44
|
corvic/orm/errors.py,sha256=uFhFXpVG6pby1lndJZHGHxv3Y0Fbt0RiaZ-CqDfuY1o,545
|
45
45
|
corvic/orm/func/utc_func.py,sha256=-FC6w9wBWXejMv1AICT2Gg7tdkSo7gqL2dFT-YKPGQ4,4518
|
46
46
|
corvic/orm/func/uuid_func.py,sha256=oXPjDGAl3mvlNtvcvBrLmRRHPJgtKffShIPbHm-EswA,1152
|
@@ -50,7 +50,7 @@ corvic/orm/keys.py,sha256=Ag6Xbpvxev-VByT1KJ8ChUn9vKVEzkkMXxrjvtADCtY,2182
|
|
50
50
|
corvic/orm/mixins.py,sha256=HfmzJ7LblHtddbbkDmv7nNWURL87Bnj8NeOnNbfmSN4,17794
|
51
51
|
corvic/orm/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
52
52
|
corvic/orm/_proto_columns.py,sha256=tcOu92UjFJFYZLasS6sWJQBDRK26yrnmpTii_LDY4iw,913
|
53
|
-
corvic/orm/__init__.py,sha256=
|
53
|
+
corvic/orm/__init__.py,sha256=Yzfn_GyCGHzf-wt-CmtamW15PyuZ7tHI7IqQw-3aPmQ,14827
|
54
54
|
corvic/pa_scalar/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
55
55
|
corvic/pa_scalar/_const.py,sha256=1nk6w3Y7crd3J5jSCq7DRVa1lcGk4H1RUr1l4NjnlzE,868
|
56
56
|
corvic/pa_scalar/_from_value.py,sha256=fS3TNPcPI3jAKGmcUIhn8rdqdQEAwgTLEneVxFUeK6M,27531
|
@@ -68,7 +68,7 @@ corvic/sql/parse_ops.py,sha256=1ZXVlDzIzqwW_KP0mwMxaY91tLSXqpeaUHyrGJkh56o,29444
|
|
68
68
|
corvic/sql/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
69
69
|
corvic/sql/__init__.py,sha256=kZ1a39KVZ08P8Bg6XuXDLD_dTQX0k620u4nwxZF4SnY,303
|
70
70
|
corvic/system/client.py,sha256=hGhZX8RtHrFEOlOmJNlUHktOZrutOwNYUY_a1htQSrg,821
|
71
|
-
corvic/system/in_memory_executor.py,sha256=
|
71
|
+
corvic/system/in_memory_executor.py,sha256=t5zYx2SC7SXlG_iGa5gKnaTgOKUoTS6FQUL9FYVFex0,67586
|
72
72
|
corvic/system/op_graph_executor.py,sha256=gXFnVkemS5EwNegJdU-xVAfMLPULqMFPF7d3EG3AD_U,3482
|
73
73
|
corvic/system/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
74
74
|
corvic/system/staging.py,sha256=K5P5moiuAMfPx7lxK4mArxeURBwKoyB6x9HGu9JJ16E,1846
|
@@ -156,7 +156,7 @@ corvic_generated/ingest/v2/table_pb2.py,sha256=aTJHaliZm5DMtp7gslNxyn9uDagz-2-_e
|
|
156
156
|
corvic_generated/ingest/v2/table_pb2_grpc.py,sha256=tVs7wMWyAfvHcCQEiUOHLwaptKxgMFG6E7Ki9vNmmvQ,8151
|
157
157
|
corvic_generated/model/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
158
158
|
corvic_generated/model/v1alpha/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
159
|
-
corvic_generated/model/v1alpha/models_pb2.py,sha256=
|
159
|
+
corvic_generated/model/v1alpha/models_pb2.py,sha256=Jvw4rYuekrbjI7sx0QPcLnTDL5aXI3l0drMiM7dy4ac,8703
|
160
160
|
corvic_generated/model/v1alpha/models_pb2_grpc.py,sha256=_bXoS025FcWrXR1E_3Mh4GHB1RMvgz8lIpit-Awnf-s,163
|
161
161
|
corvic_generated/orm/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
162
162
|
corvic_generated/orm/v1/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
@@ -172,7 +172,7 @@ corvic_generated/orm/v1/pipeline_pb2.py,sha256=_J1kHAe2CQT_epXows4eOZKdQFLTLqZgz
|
|
172
172
|
corvic_generated/orm/v1/pipeline_pb2_grpc.py,sha256=_bXoS025FcWrXR1E_3Mh4GHB1RMvgz8lIpit-Awnf-s,163
|
173
173
|
corvic_generated/orm/v1/space_pb2.py,sha256=grI4123GBbA-iHnbtbK8xyfIv1lZL1hDl3q43vXGta8,2147
|
174
174
|
corvic_generated/orm/v1/space_pb2_grpc.py,sha256=_bXoS025FcWrXR1E_3Mh4GHB1RMvgz8lIpit-Awnf-s,163
|
175
|
-
corvic_generated/orm/v1/table_pb2.py,sha256=
|
175
|
+
corvic_generated/orm/v1/table_pb2.py,sha256=QQwgItxOncMAM_BG9tWX4K3_vYgtlsGoOXocExcQTJI,40062
|
176
176
|
corvic_generated/orm/v1/table_pb2_grpc.py,sha256=ixBOrA7wwNxEQCRT1kO2N_LayeFYEdFJjVRkkhesWbY,4558
|
177
177
|
corvic_generated/platform/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
178
178
|
corvic_generated/platform/v1/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
@@ -220,7 +220,7 @@ corvic_generated/ingest/v2/source_pb2.pyi,sha256=k7FdbgurQLk0JA1WiTUerznzxLv8b50
|
|
220
220
|
corvic_generated/ingest/v2/source_pb2_grpc.pyi,sha256=VG5gpql2SREHgqMC_ycT-QJBVpPeSYKOYS2COgGrZa4,6195
|
221
221
|
corvic_generated/ingest/v2/table_pb2.pyi,sha256=p22F8kv0HfM-9OzGP88bLofxmUtxfLR5eVN0HOxXiEo,4382
|
222
222
|
corvic_generated/ingest/v2/table_pb2_grpc.pyi,sha256=AEXYNtrU4xyENumcCrkD2FmFV7T1UVidxxeZ5pyE4Qc,4554
|
223
|
-
corvic_generated/model/v1alpha/models_pb2.pyi,sha256=
|
223
|
+
corvic_generated/model/v1alpha/models_pb2.pyi,sha256=K8clNf_M36tu0DEOb4Lo4l_fh4DA2IOD3c1uTI07Wgo,11513
|
224
224
|
corvic_generated/model/v1alpha/models_pb2_grpc.pyi,sha256=H9-ADaiKR9iyVZvmnXutZqWwRRCDxjUIktkfJrJFIHg,417
|
225
225
|
corvic_generated/orm/v1/agent_pb2.pyi,sha256=AxcZC0AJqiOyu_5quSMR-E0MjVhDY7b5ym4uZa7WFug,4670
|
226
226
|
corvic_generated/orm/v1/agent_pb2_grpc.pyi,sha256=H9-ADaiKR9iyVZvmnXutZqWwRRCDxjUIktkfJrJFIHg,417
|
@@ -234,7 +234,7 @@ corvic_generated/orm/v1/pipeline_pb2.pyi,sha256=i2VWx5wqO5GR8OGGSUUr3w7n-TqSo_UP
|
|
234
234
|
corvic_generated/orm/v1/pipeline_pb2_grpc.pyi,sha256=H9-ADaiKR9iyVZvmnXutZqWwRRCDxjUIktkfJrJFIHg,417
|
235
235
|
corvic_generated/orm/v1/space_pb2.pyi,sha256=qKMymobwu_qQAlFxayifiUkQBpjrK9tAgoQOB2VL7Rc,1748
|
236
236
|
corvic_generated/orm/v1/space_pb2_grpc.pyi,sha256=H9-ADaiKR9iyVZvmnXutZqWwRRCDxjUIktkfJrJFIHg,417
|
237
|
-
corvic_generated/orm/v1/table_pb2.pyi,sha256=
|
237
|
+
corvic_generated/orm/v1/table_pb2.pyi,sha256=FsWPyzsP-1Ocj4rEyFHPHHCLmgX-135lH9zHLcVB33I,50224
|
238
238
|
corvic_generated/orm/v1/table_pb2_grpc.pyi,sha256=K4hyNndkiKpxt9PYxcn_98RTpb4yxET3Um2rDe3VJTI,2499
|
239
239
|
corvic_generated/platform/v1/platform_pb2.pyi,sha256=y6kR7rBuar5cFgn7vTaAVTETOsmki-fC4I-4Y1M8JrQ,5627
|
240
240
|
corvic_generated/platform/v1/platform_pb2_grpc.pyi,sha256=wlnUjgSOjBJzyG6ubpMv9H9XD_jJUQyUsgYIyx_hx20,7652
|
@@ -244,5 +244,5 @@ corvic_generated/status/v1/event_pb2.pyi,sha256=eU-ibrYpvEAJSIDlSa62-bC96AQU1ykF
|
|
244
244
|
corvic_generated/status/v1/event_pb2_grpc.pyi,sha256=H9-ADaiKR9iyVZvmnXutZqWwRRCDxjUIktkfJrJFIHg,417
|
245
245
|
corvic_generated/status/v1/service_pb2.pyi,sha256=iXLR2FOKQJpBgvBzpD2kVwcYOCksP2aRwK4JYaI9CBw,558
|
246
246
|
corvic_generated/status/v1/service_pb2_grpc.pyi,sha256=OoAnaZ64FD0UTzPoRhYvQU8ecoilhHj3ySjSfHbVDaU,1501
|
247
|
-
corvic/engine/_native.pyd,sha256=
|
248
|
-
corvic_engine-0.3.
|
247
|
+
corvic/engine/_native.pyd,sha256=XkU3bVVXAk3up15IfaE0ih1d0_Lo8jRl_mJp1ZwbBls,438272
|
248
|
+
corvic_engine-0.3.0rc56.dist-info/RECORD,,
|