corvic-engine 0.3.0rc56__cp38-abi3-win_amd64.whl → 0.3.0rc57__cp38-abi3-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- corvic/engine/_native.pyd +0 -0
- corvic/model/_space.py +11 -4
- corvic/op_graph/_schema.py +2 -4
- corvic/op_graph/ops.py +30 -12
- corvic/system/in_memory_executor.py +42 -12
- corvic/table/table.py +9 -3
- {corvic_engine-0.3.0rc56.dist-info → corvic_engine-0.3.0rc57.dist-info}/METADATA +1 -1
- {corvic_engine-0.3.0rc56.dist-info → corvic_engine-0.3.0rc57.dist-info}/RECORD +10 -10
- {corvic_engine-0.3.0rc56.dist-info → corvic_engine-0.3.0rc57.dist-info}/WHEEL +0 -0
- {corvic_engine-0.3.0rc56.dist-info → corvic_engine-0.3.0rc57.dist-info}/licenses/LICENSE +0 -0
corvic/engine/_native.pyd
CHANGED
Binary file
|
corvic/model/_space.py
CHANGED
@@ -836,13 +836,20 @@ class TabularSpace(Space):
|
|
836
836
|
pa.string(),
|
837
837
|
)
|
838
838
|
|
839
|
-
|
840
|
-
op.concat_list(
|
839
|
+
if len(embedding_column_tmp_names):
|
840
|
+
op = op.concat_list(
|
841
841
|
column_names=embedding_column_tmp_names,
|
842
842
|
concat_list_column_name=embedding_column_tmp_name,
|
843
|
+
).and_then(reduce_dimension)
|
844
|
+
else:
|
845
|
+
op = op.add_literal_column(
|
846
|
+
column_name=embedding_column_tmp_name,
|
847
|
+
literal=[],
|
848
|
+
dtype=pa.large_list(pa.float32()),
|
849
|
+
ftype=op_graph.feature_type.embedding(),
|
843
850
|
)
|
844
|
-
|
845
|
-
.and_then(select_columns)
|
851
|
+
op = (
|
852
|
+
op.and_then(select_columns)
|
846
853
|
.and_then(update_feature_types)
|
847
854
|
.and_then(rename_columns)
|
848
855
|
.and_then(add_literal_column)
|
corvic/op_graph/_schema.py
CHANGED
@@ -2,8 +2,6 @@
|
|
2
2
|
|
3
3
|
from __future__ import annotations
|
4
4
|
|
5
|
-
from collections.abc import Sequence
|
6
|
-
|
7
5
|
import polars as pl
|
8
6
|
import pyarrow as pa
|
9
7
|
|
@@ -14,7 +12,7 @@ from corvic.op_graph.feature_types import FeatureType
|
|
14
12
|
|
15
13
|
def _infer_possible_feature_types(
|
16
14
|
data_type: pa.DataType,
|
17
|
-
) ->
|
15
|
+
) -> list[FeatureType]:
|
18
16
|
if pa.types.is_integer(data_type):
|
19
17
|
# TODO(thunt): Add multi-categorical in future versions
|
20
18
|
return [
|
@@ -168,6 +166,6 @@ class Field:
|
|
168
166
|
def rename(self, new_name: str) -> Field:
|
169
167
|
return Field(new_name, self.dtype, self.ftype)
|
170
168
|
|
171
|
-
def possible_feature_types(self) ->
|
169
|
+
def possible_feature_types(self) -> list[FeatureType]:
|
172
170
|
"""Infer possible feature types given the data type."""
|
173
171
|
return _infer_possible_feature_types(self.dtype)
|
corvic/op_graph/ops.py
CHANGED
@@ -59,7 +59,7 @@ from corvic.op_graph.row_filters import RowFilter
|
|
59
59
|
from corvic.op_graph.row_filters import from_proto as row_filters_from_proto
|
60
60
|
from corvic.op_graph.sample_strategy import SampleStrategy
|
61
61
|
from corvic.op_graph.sample_strategy import from_proto as sample_strategy_from_proto
|
62
|
-
from corvic.pa_scalar import PyValue, from_value, to_value
|
62
|
+
from corvic.pa_scalar import PyValue, Scalar, from_value, to_value
|
63
63
|
from corvic.proto_wrapper import OneofProtoWrapper
|
64
64
|
from corvic.result import InternalError, InvalidArgumentError, Ok
|
65
65
|
from corvic_generated.algorithm.graph.v1 import graph_pb2
|
@@ -1275,7 +1275,7 @@ class _Base(OneofProtoWrapper[table_pb2.TableComputeOp], ABC):
|
|
1275
1275
|
def add_literal_column(
|
1276
1276
|
self,
|
1277
1277
|
column_name: str,
|
1278
|
-
literal: struct_pb2.Value |
|
1278
|
+
literal: struct_pb2.Value | Scalar | PyValue,
|
1279
1279
|
dtype: pa.DataType,
|
1280
1280
|
ftype: FeatureType | None = None,
|
1281
1281
|
) -> Ok[AddLiteralColumn] | InvalidArgumentError:
|
@@ -1313,8 +1313,13 @@ class _Base(OneofProtoWrapper[table_pb2.TableComputeOp], ABC):
|
|
1313
1313
|
separator: str,
|
1314
1314
|
) -> Ok[CombineColumns] | InvalidArgumentError:
|
1315
1315
|
for col in column_names:
|
1316
|
-
|
1316
|
+
field = self.schema.get(col)
|
1317
|
+
if not field:
|
1317
1318
|
return InvalidArgumentError("no column with given name", given_name=col)
|
1319
|
+
if pa.types.is_binary(field.dtype) | pa.types.is_large_binary(field.dtype):
|
1320
|
+
return InvalidArgumentError(
|
1321
|
+
"cannot concat binary columns", given_name=col
|
1322
|
+
)
|
1318
1323
|
|
1319
1324
|
if self.schema.has_column(combined_column_name):
|
1320
1325
|
return InvalidArgumentError("name given for combined column already exists")
|
@@ -1698,6 +1703,9 @@ class _Base(OneofProtoWrapper[table_pb2.TableComputeOp], ABC):
|
|
1698
1703
|
Returns:
|
1699
1704
|
An AddDecisionTreeSummaryOp.
|
1700
1705
|
"""
|
1706
|
+
if max_depth < 0:
|
1707
|
+
return InvalidArgumentError("max_depth must be strictly positive")
|
1708
|
+
|
1701
1709
|
return Ok(
|
1702
1710
|
from_proto(
|
1703
1711
|
table_pb2.AddDecisionTreeSummaryOp(
|
@@ -2938,16 +2946,26 @@ def _make_schema_for_combine_columns(op: CombineColumns):
|
|
2938
2946
|
if column.ftype != ftype:
|
2939
2947
|
ftype = feature_type.unknown()
|
2940
2948
|
|
2941
|
-
|
2942
|
-
|
2943
|
-
|
2944
|
-
|
2949
|
+
if op.column_names:
|
2950
|
+
dtype = (
|
2951
|
+
pl.DataFrame(schema=schema.to_polars())
|
2952
|
+
.with_columns(
|
2953
|
+
pl.concat_list(*op.column_names).alias(op.combined_column_name)
|
2954
|
+
)
|
2955
|
+
.to_arrow()
|
2956
|
+
.schema.field(op.combined_column_name)
|
2957
|
+
.type
|
2958
|
+
)
|
2959
|
+
else:
|
2960
|
+
dtype = (
|
2961
|
+
pl.DataFrame(schema=schema.to_polars())
|
2962
|
+
.with_columns(
|
2963
|
+
pl.Series(op.combined_column_name, [], pl.List(pl.Float32))
|
2964
|
+
)
|
2965
|
+
.to_arrow()
|
2966
|
+
.schema.field(op.combined_column_name)
|
2967
|
+
.type
|
2945
2968
|
)
|
2946
|
-
.to_arrow()
|
2947
|
-
.schema.field(op.combined_column_name)
|
2948
|
-
.type
|
2949
|
-
)
|
2950
|
-
|
2951
2969
|
return Schema(
|
2952
2970
|
[
|
2953
2971
|
*op.source.schema,
|
@@ -12,6 +12,7 @@ from typing import Any, Final, cast
|
|
12
12
|
|
13
13
|
import numpy as np
|
14
14
|
import polars as pl
|
15
|
+
import polars.selectors as cs
|
15
16
|
import pyarrow as pa
|
16
17
|
import pyarrow.parquet as pq
|
17
18
|
import structlog
|
@@ -778,9 +779,14 @@ class InMemoryExecutor(OpGraphExecutor):
|
|
778
779
|
)
|
779
780
|
|
780
781
|
case op_graph.ConcatList():
|
781
|
-
|
782
|
-
|
783
|
-
|
782
|
+
if op.column_names:
|
783
|
+
result_df = source_df.with_columns(
|
784
|
+
pl.concat_list(*op.column_names).alias(op.combined_column_name)
|
785
|
+
)
|
786
|
+
else:
|
787
|
+
result_df = source_df.with_columns(
|
788
|
+
pl.Series(op.combined_column_name, [])
|
789
|
+
)
|
784
790
|
|
785
791
|
return Ok(_SchemaAndBatches.from_dataframe(result_df, source_batches.metrics))
|
786
792
|
|
@@ -1394,17 +1400,41 @@ class InMemoryExecutor(OpGraphExecutor):
|
|
1394
1400
|
pass
|
1395
1401
|
case err:
|
1396
1402
|
return err
|
1403
|
+
|
1397
1404
|
df_input = _as_df(source_batches)
|
1405
|
+
dataframe = df_input[list({*op.feature_column_names, op.label_column_name})]
|
1406
|
+
boolean_columns = [
|
1407
|
+
name
|
1408
|
+
for name, dtype in dataframe.schema.items()
|
1409
|
+
if dtype == pl.Boolean() and name in op.feature_column_names
|
1410
|
+
]
|
1398
1411
|
|
1399
|
-
|
1400
|
-
|
1412
|
+
# Drop Nan and Null and infinite rows as not supported by decision tree
|
1413
|
+
dataframe = dataframe.with_columns(
|
1414
|
+
*[pl.col(col).cast(pl.Float32) for col in op.feature_column_names]
|
1415
|
+
)
|
1416
|
+
dataframe = dataframe.drop_nans().drop_nulls()
|
1417
|
+
try:
|
1418
|
+
# is_infinite is not implemented for all datatypes
|
1419
|
+
dataframe = dataframe.filter(~pl.any_horizontal(cs.numeric().is_infinite()))
|
1420
|
+
except pl.exceptions.InvalidOperationError as err:
|
1421
|
+
return InvalidArgumentError.from_(err)
|
1422
|
+
|
1423
|
+
if not len(dataframe):
|
1424
|
+
return InvalidArgumentError(
|
1425
|
+
"a minimum of 1 sample is required by DecisionTreeClassifier"
|
1426
|
+
)
|
1427
|
+
features = dataframe[op.feature_column_names]
|
1428
|
+
classes = dataframe[op.label_column_name]
|
1401
1429
|
max_depth = op.max_depth
|
1402
1430
|
|
1403
|
-
binary_columns = [
|
1404
|
-
name for name, dtype in features.schema.items() if dtype == pl.Boolean()
|
1405
|
-
]
|
1406
|
-
|
1407
1431
|
from sklearn.tree import DecisionTreeClassifier, export_graphviz, export_text
|
1432
|
+
from sklearn.utils.multiclass import check_classification_targets
|
1433
|
+
|
1434
|
+
try:
|
1435
|
+
check_classification_targets(classes)
|
1436
|
+
except ValueError as err:
|
1437
|
+
return InvalidArgumentError.from_(err)
|
1408
1438
|
|
1409
1439
|
decision_tree = DecisionTreeClassifier(random_state=0, max_depth=max_depth)
|
1410
1440
|
try:
|
@@ -1426,11 +1456,11 @@ class InMemoryExecutor(OpGraphExecutor):
|
|
1426
1456
|
max_depth=max_depth,
|
1427
1457
|
)
|
1428
1458
|
|
1429
|
-
for
|
1459
|
+
for boolean_column in boolean_columns:
|
1430
1460
|
tree_str = tree_str.replace(
|
1431
|
-
f"{
|
1461
|
+
f"{boolean_column} <= 0.50", f"NOT {boolean_column}"
|
1432
1462
|
)
|
1433
|
-
tree_str = tree_str.replace(f"{
|
1463
|
+
tree_str = tree_str.replace(f"{boolean_column} > 0.50", boolean_column)
|
1434
1464
|
|
1435
1465
|
metrics = source_batches.metrics.copy()
|
1436
1466
|
metrics[op.output_metric_key] = table_pb2.DecisionTreeSummary(
|
corvic/table/table.py
CHANGED
@@ -412,12 +412,18 @@ class Table:
|
|
412
412
|
case _:
|
413
413
|
return more_itertools.flatten(map(cls._get_staging_ops, op.sources()))
|
414
414
|
|
415
|
-
def head(self) -> InvalidArgumentError | Ok[Table]:
|
416
|
-
"""Get
|
417
|
-
return self.op_graph.limit_rows(num_rows=
|
415
|
+
def head(self, num_rows: int = 10) -> InvalidArgumentError | Ok[Table]:
|
416
|
+
"""Get the first `num_rows` rows of the table."""
|
417
|
+
return self.op_graph.limit_rows(num_rows=num_rows).map(
|
418
418
|
lambda op: Table(self.client, op)
|
419
419
|
)
|
420
420
|
|
421
|
+
def sample_rows(self, num_rows: int = 10) -> InvalidArgumentError | Ok[Table]:
|
422
|
+
"""Get a sample of `num_rows` rows of the table."""
|
423
|
+
return self.op_graph.sample_rows(
|
424
|
+
sample_strategy=op_graph.sample_strategy.uniform_random(), num_rows=num_rows
|
425
|
+
).map(lambda op: Table(self.client, op))
|
426
|
+
|
421
427
|
def distinct_rows(self) -> Table:
|
422
428
|
return Table(
|
423
429
|
self.client,
|
@@ -1,6 +1,6 @@
|
|
1
|
-
corvic_engine-0.3.
|
2
|
-
corvic_engine-0.3.
|
3
|
-
corvic_engine-0.3.
|
1
|
+
corvic_engine-0.3.0rc57.dist-info/METADATA,sha256=T40bDZQGa5aI1lYDYYWrtEwMrowLcANHpje9gUdPpsM,1876
|
2
|
+
corvic_engine-0.3.0rc57.dist-info/WHEEL,sha256=hKPP3BCTWtTwj6SFaSI--T5aOGqh_llYfbZ_BsqivwA,94
|
3
|
+
corvic_engine-0.3.0rc57.dist-info/licenses/LICENSE,sha256=DSS1OD0oIgssKOmAzkMRBv5jvvVuZQbrIv8lpl9DXY8,1035
|
4
4
|
corvic/context/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
5
5
|
corvic/context/__init__.py,sha256=zBnPiP-tStGSVMG_0-G_0ay6-yIX2aerW_oYRzAex74,1702
|
6
6
|
corvic/embed/node2vec.py,sha256=XIJjFDdT-JnmZ43lgP-K-dLgnR17L_uaJqBPAYlsPsk,11148
|
@@ -25,19 +25,19 @@ corvic/model/_proto_orm_convert.py,sha256=jmzmaaUkSxeHB5OMef92AyGw7sorJ6pP4ylbeK
|
|
25
25
|
corvic/model/_resource.py,sha256=w5m6mmD8KrHJ8efPTfRV0JKaCmkDRaxlGeuRMmVbw10,7773
|
26
26
|
corvic/model/_room.py,sha256=36mXngZ38L4mr6_LgUm-QgsUUaoGMiYQRfvXLV_jd-4,2914
|
27
27
|
corvic/model/_source.py,sha256=A1Jk4r5mB0f-Y3L8esaQFCUAu7CCTlwAm7f4qSnvjsM,9603
|
28
|
-
corvic/model/_space.py,sha256=
|
28
|
+
corvic/model/_space.py,sha256=ZljalsBDrcnsx2sUOpJd6qQO2nFYDFttNoJMiLdGTBM,35922
|
29
29
|
corvic/model/__init__.py,sha256=Lb-yC04t17Hr2TlnGfn5Ewzd2h1nH4hb9tKdMNAak9s,3075
|
30
30
|
corvic/op_graph/aggregation.py,sha256=8X6vqXD7dLHrhYJU0BqmhUsWGbzD1zSP5Db5VHdIru4,6187
|
31
31
|
corvic/op_graph/encoders.py,sha256=93wYoBCn_us5lRCkqvjaP0LTg3LBB3yEfhzICv06bB0,10460
|
32
32
|
corvic/op_graph/errors.py,sha256=I4NE5053d0deGm5xx5EmyP4f98qx42xnIsW1IA-2hy4,163
|
33
33
|
corvic/op_graph/feature_types.py,sha256=ZE6onUGW4Xa7tPL4XgRVQ1Tvj5FVJJ66di3ShDTR0Ak,9623
|
34
|
-
corvic/op_graph/ops.py,sha256=
|
34
|
+
corvic/op_graph/ops.py,sha256=1YOFnnN6WgzBajkXRM9UgdMLd-NEfa4tRmIQVj5cyeo,110637
|
35
35
|
corvic/op_graph/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
36
36
|
corvic/op_graph/row_filters/_jsonlogic.py,sha256=tBd-wOwE6AIx9XEkuSVdBx9iB08nsdHJdvNzEmWzrB0,6432
|
37
37
|
corvic/op_graph/row_filters/_row_filters.py,sha256=d7oUbB-vThi-Kn5GupGnEwr5UNlsGFgCgR3Q7NR_tkI,9554
|
38
38
|
corvic/op_graph/row_filters/__init__.py,sha256=1sibH_kLw7t_9bpRccnEGWqdCiN0VaUh9LMMIMCRyL8,575
|
39
39
|
corvic/op_graph/sample_strategy.py,sha256=DrbtJ3ORkIRfyIE_FdlOh_UMnCW_K9jL1LeonVYb3bU,3007
|
40
|
-
corvic/op_graph/_schema.py,sha256=
|
40
|
+
corvic/op_graph/_schema.py,sha256=7Uuun9e6PRrtOeJLsFD8VzkwWeUpbnBcD37NpMKOcmQ,5685
|
41
41
|
corvic/op_graph/_transformations.py,sha256=L9Au_GcciPynww4ZXojMtNdPJ36Qboc9gn0bVzXLifU,9445
|
42
42
|
corvic/op_graph/__init__.py,sha256=1DMrQfuuS3FkLa9DXYDjSDLurdxxpG5H1jB2ctaa9xo,1444
|
43
43
|
corvic/orm/base.py,sha256=95nkqycCZ1FaWAhTsa7zbZ0YuwNFkMUW7Wk8yhtYau8,8824
|
@@ -68,7 +68,7 @@ corvic/sql/parse_ops.py,sha256=1ZXVlDzIzqwW_KP0mwMxaY91tLSXqpeaUHyrGJkh56o,29444
|
|
68
68
|
corvic/sql/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
69
69
|
corvic/sql/__init__.py,sha256=kZ1a39KVZ08P8Bg6XuXDLD_dTQX0k620u4nwxZF4SnY,303
|
70
70
|
corvic/system/client.py,sha256=hGhZX8RtHrFEOlOmJNlUHktOZrutOwNYUY_a1htQSrg,821
|
71
|
-
corvic/system/in_memory_executor.py,sha256=
|
71
|
+
corvic/system/in_memory_executor.py,sha256=tRYzoVCNHemlpPfYRaVM_Nc3uFsLYaOFof1nVR-6hGc,68943
|
72
72
|
corvic/system/op_graph_executor.py,sha256=gXFnVkemS5EwNegJdU-xVAfMLPULqMFPF7d3EG3AD_U,3482
|
73
73
|
corvic/system/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
74
74
|
corvic/system/staging.py,sha256=K5P5moiuAMfPx7lxK4mArxeURBwKoyB6x9HGu9JJ16E,1846
|
@@ -86,7 +86,7 @@ corvic/system_sqlite/rdbms_blob_store.py,sha256=gTP_tQfTVb3wzZkzo8ys1zaz0rSrERzb
|
|
86
86
|
corvic/system_sqlite/staging.py,sha256=P6XdWhjpgcpOZkYxKEjpsTxaAdBKOeSVfARjqt4_xJA,16948
|
87
87
|
corvic/system_sqlite/__init__.py,sha256=F4UN9vFsXiDY2AKk1jYZPuWWJpSugKHS7ghXeZYlbZs,390
|
88
88
|
corvic/table/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
89
|
-
corvic/table/table.py,sha256=
|
89
|
+
corvic/table/table.py,sha256=v3MTV_nHaSAXFjPurn0Gp9Pe4UVL8RhYUHhxR6MVfmE,25396
|
90
90
|
corvic/table/__init__.py,sha256=Gj0IR8BQF5PZK92Us7PP0ZigMsVyrfWJupzH8TgzRQk,588
|
91
91
|
corvic/version/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
92
92
|
corvic/version/__init__.py,sha256=JlkRLvKXsu3zIxhdynO_0Ub5NfQOvGjfwCRkNnaOu9U,1125
|
@@ -244,5 +244,5 @@ corvic_generated/status/v1/event_pb2.pyi,sha256=eU-ibrYpvEAJSIDlSa62-bC96AQU1ykF
|
|
244
244
|
corvic_generated/status/v1/event_pb2_grpc.pyi,sha256=H9-ADaiKR9iyVZvmnXutZqWwRRCDxjUIktkfJrJFIHg,417
|
245
245
|
corvic_generated/status/v1/service_pb2.pyi,sha256=iXLR2FOKQJpBgvBzpD2kVwcYOCksP2aRwK4JYaI9CBw,558
|
246
246
|
corvic_generated/status/v1/service_pb2_grpc.pyi,sha256=OoAnaZ64FD0UTzPoRhYvQU8ecoilhHj3ySjSfHbVDaU,1501
|
247
|
-
corvic/engine/_native.pyd,sha256
|
248
|
-
corvic_engine-0.3.
|
247
|
+
corvic/engine/_native.pyd,sha256=-QxbxEBeQo7SFPLJlh84nSQEIcAM-zOfcmElul0VQ1U,438272
|
248
|
+
corvic_engine-0.3.0rc57.dist-info/RECORD,,
|
File without changes
|
File without changes
|