corvic-engine 0.3.0rc56__cp38-abi3-win_amd64.whl → 0.3.0rc57__cp38-abi3-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
corvic/engine/_native.pyd CHANGED
Binary file
corvic/model/_space.py CHANGED
@@ -836,13 +836,20 @@ class TabularSpace(Space):
836
836
  pa.string(),
837
837
  )
838
838
 
839
- op = (
840
- op.concat_list(
839
+ if len(embedding_column_tmp_names):
840
+ op = op.concat_list(
841
841
  column_names=embedding_column_tmp_names,
842
842
  concat_list_column_name=embedding_column_tmp_name,
843
+ ).and_then(reduce_dimension)
844
+ else:
845
+ op = op.add_literal_column(
846
+ column_name=embedding_column_tmp_name,
847
+ literal=[],
848
+ dtype=pa.large_list(pa.float32()),
849
+ ftype=op_graph.feature_type.embedding(),
843
850
  )
844
- .and_then(reduce_dimension)
845
- .and_then(select_columns)
851
+ op = (
852
+ op.and_then(select_columns)
846
853
  .and_then(update_feature_types)
847
854
  .and_then(rename_columns)
848
855
  .and_then(add_literal_column)
@@ -2,8 +2,6 @@
2
2
 
3
3
  from __future__ import annotations
4
4
 
5
- from collections.abc import Sequence
6
-
7
5
  import polars as pl
8
6
  import pyarrow as pa
9
7
 
@@ -14,7 +12,7 @@ from corvic.op_graph.feature_types import FeatureType
14
12
 
15
13
  def _infer_possible_feature_types(
16
14
  data_type: pa.DataType,
17
- ) -> Sequence[FeatureType]:
15
+ ) -> list[FeatureType]:
18
16
  if pa.types.is_integer(data_type):
19
17
  # TODO(thunt): Add multi-categorical in future versions
20
18
  return [
@@ -168,6 +166,6 @@ class Field:
168
166
  def rename(self, new_name: str) -> Field:
169
167
  return Field(new_name, self.dtype, self.ftype)
170
168
 
171
- def possible_feature_types(self) -> Sequence[FeatureType]:
169
+ def possible_feature_types(self) -> list[FeatureType]:
172
170
  """Infer possible feature types given the data type."""
173
171
  return _infer_possible_feature_types(self.dtype)
corvic/op_graph/ops.py CHANGED
@@ -59,7 +59,7 @@ from corvic.op_graph.row_filters import RowFilter
59
59
  from corvic.op_graph.row_filters import from_proto as row_filters_from_proto
60
60
  from corvic.op_graph.sample_strategy import SampleStrategy
61
61
  from corvic.op_graph.sample_strategy import from_proto as sample_strategy_from_proto
62
- from corvic.pa_scalar import PyValue, from_value, to_value
62
+ from corvic.pa_scalar import PyValue, Scalar, from_value, to_value
63
63
  from corvic.proto_wrapper import OneofProtoWrapper
64
64
  from corvic.result import InternalError, InvalidArgumentError, Ok
65
65
  from corvic_generated.algorithm.graph.v1 import graph_pb2
@@ -1275,7 +1275,7 @@ class _Base(OneofProtoWrapper[table_pb2.TableComputeOp], ABC):
1275
1275
  def add_literal_column(
1276
1276
  self,
1277
1277
  column_name: str,
1278
- literal: struct_pb2.Value | float | str | bool,
1278
+ literal: struct_pb2.Value | Scalar | PyValue,
1279
1279
  dtype: pa.DataType,
1280
1280
  ftype: FeatureType | None = None,
1281
1281
  ) -> Ok[AddLiteralColumn] | InvalidArgumentError:
@@ -1313,8 +1313,13 @@ class _Base(OneofProtoWrapper[table_pb2.TableComputeOp], ABC):
1313
1313
  separator: str,
1314
1314
  ) -> Ok[CombineColumns] | InvalidArgumentError:
1315
1315
  for col in column_names:
1316
- if not self.schema.has_column(col):
1316
+ field = self.schema.get(col)
1317
+ if not field:
1317
1318
  return InvalidArgumentError("no column with given name", given_name=col)
1319
+ if pa.types.is_binary(field.dtype) | pa.types.is_large_binary(field.dtype):
1320
+ return InvalidArgumentError(
1321
+ "cannot concat binary columns", given_name=col
1322
+ )
1318
1323
 
1319
1324
  if self.schema.has_column(combined_column_name):
1320
1325
  return InvalidArgumentError("name given for combined column already exists")
@@ -1698,6 +1703,9 @@ class _Base(OneofProtoWrapper[table_pb2.TableComputeOp], ABC):
1698
1703
  Returns:
1699
1704
  An AddDecisionTreeSummaryOp.
1700
1705
  """
1706
+ if max_depth < 0:
1707
+ return InvalidArgumentError("max_depth must be strictly positive")
1708
+
1701
1709
  return Ok(
1702
1710
  from_proto(
1703
1711
  table_pb2.AddDecisionTreeSummaryOp(
@@ -2938,16 +2946,26 @@ def _make_schema_for_combine_columns(op: CombineColumns):
2938
2946
  if column.ftype != ftype:
2939
2947
  ftype = feature_type.unknown()
2940
2948
 
2941
- dtype = (
2942
- pl.DataFrame(schema=schema.to_polars())
2943
- .with_columns(
2944
- pl.concat_list(*op.column_names).alias(op.combined_column_name)
2949
+ if op.column_names:
2950
+ dtype = (
2951
+ pl.DataFrame(schema=schema.to_polars())
2952
+ .with_columns(
2953
+ pl.concat_list(*op.column_names).alias(op.combined_column_name)
2954
+ )
2955
+ .to_arrow()
2956
+ .schema.field(op.combined_column_name)
2957
+ .type
2958
+ )
2959
+ else:
2960
+ dtype = (
2961
+ pl.DataFrame(schema=schema.to_polars())
2962
+ .with_columns(
2963
+ pl.Series(op.combined_column_name, [], pl.List(pl.Float32))
2964
+ )
2965
+ .to_arrow()
2966
+ .schema.field(op.combined_column_name)
2967
+ .type
2945
2968
  )
2946
- .to_arrow()
2947
- .schema.field(op.combined_column_name)
2948
- .type
2949
- )
2950
-
2951
2969
  return Schema(
2952
2970
  [
2953
2971
  *op.source.schema,
@@ -12,6 +12,7 @@ from typing import Any, Final, cast
12
12
 
13
13
  import numpy as np
14
14
  import polars as pl
15
+ import polars.selectors as cs
15
16
  import pyarrow as pa
16
17
  import pyarrow.parquet as pq
17
18
  import structlog
@@ -778,9 +779,14 @@ class InMemoryExecutor(OpGraphExecutor):
778
779
  )
779
780
 
780
781
  case op_graph.ConcatList():
781
- result_df = source_df.with_columns(
782
- pl.concat_list(*op.column_names).alias(op.combined_column_name)
783
- )
782
+ if op.column_names:
783
+ result_df = source_df.with_columns(
784
+ pl.concat_list(*op.column_names).alias(op.combined_column_name)
785
+ )
786
+ else:
787
+ result_df = source_df.with_columns(
788
+ pl.Series(op.combined_column_name, [])
789
+ )
784
790
 
785
791
  return Ok(_SchemaAndBatches.from_dataframe(result_df, source_batches.metrics))
786
792
 
@@ -1394,17 +1400,41 @@ class InMemoryExecutor(OpGraphExecutor):
1394
1400
  pass
1395
1401
  case err:
1396
1402
  return err
1403
+
1397
1404
  df_input = _as_df(source_batches)
1405
+ dataframe = df_input[list({*op.feature_column_names, op.label_column_name})]
1406
+ boolean_columns = [
1407
+ name
1408
+ for name, dtype in dataframe.schema.items()
1409
+ if dtype == pl.Boolean() and name in op.feature_column_names
1410
+ ]
1398
1411
 
1399
- features = df_input[op.feature_column_names]
1400
- classes = df_input[op.label_column_name]
1412
+ # Drop Nan and Null and infinite rows as not supported by decision tree
1413
+ dataframe = dataframe.with_columns(
1414
+ *[pl.col(col).cast(pl.Float32) for col in op.feature_column_names]
1415
+ )
1416
+ dataframe = dataframe.drop_nans().drop_nulls()
1417
+ try:
1418
+ # is_infinite is not implemented for all datatypes
1419
+ dataframe = dataframe.filter(~pl.any_horizontal(cs.numeric().is_infinite()))
1420
+ except pl.exceptions.InvalidOperationError as err:
1421
+ return InvalidArgumentError.from_(err)
1422
+
1423
+ if not len(dataframe):
1424
+ return InvalidArgumentError(
1425
+ "a minimum of 1 sample is required by DecisionTreeClassifier"
1426
+ )
1427
+ features = dataframe[op.feature_column_names]
1428
+ classes = dataframe[op.label_column_name]
1401
1429
  max_depth = op.max_depth
1402
1430
 
1403
- binary_columns = [
1404
- name for name, dtype in features.schema.items() if dtype == pl.Boolean()
1405
- ]
1406
-
1407
1431
  from sklearn.tree import DecisionTreeClassifier, export_graphviz, export_text
1432
+ from sklearn.utils.multiclass import check_classification_targets
1433
+
1434
+ try:
1435
+ check_classification_targets(classes)
1436
+ except ValueError as err:
1437
+ return InvalidArgumentError.from_(err)
1408
1438
 
1409
1439
  decision_tree = DecisionTreeClassifier(random_state=0, max_depth=max_depth)
1410
1440
  try:
@@ -1426,11 +1456,11 @@ class InMemoryExecutor(OpGraphExecutor):
1426
1456
  max_depth=max_depth,
1427
1457
  )
1428
1458
 
1429
- for binary_column in binary_columns:
1459
+ for boolean_column in boolean_columns:
1430
1460
  tree_str = tree_str.replace(
1431
- f"{binary_column} <= 0.50", f"NOT {binary_column}"
1461
+ f"{boolean_column} <= 0.50", f"NOT {boolean_column}"
1432
1462
  )
1433
- tree_str = tree_str.replace(f"{binary_column} > 0.50", binary_column)
1463
+ tree_str = tree_str.replace(f"{boolean_column} > 0.50", boolean_column)
1434
1464
 
1435
1465
  metrics = source_batches.metrics.copy()
1436
1466
  metrics[op.output_metric_key] = table_pb2.DecisionTreeSummary(
corvic/table/table.py CHANGED
@@ -412,12 +412,18 @@ class Table:
412
412
  case _:
413
413
  return more_itertools.flatten(map(cls._get_staging_ops, op.sources()))
414
414
 
415
- def head(self) -> InvalidArgumentError | Ok[Table]:
416
- """Get up to the first 10 rows of the table."""
417
- return self.op_graph.limit_rows(num_rows=10).map(
415
+ def head(self, num_rows: int = 10) -> InvalidArgumentError | Ok[Table]:
416
+ """Get the first `num_rows` rows of the table."""
417
+ return self.op_graph.limit_rows(num_rows=num_rows).map(
418
418
  lambda op: Table(self.client, op)
419
419
  )
420
420
 
421
+ def sample_rows(self, num_rows: int = 10) -> InvalidArgumentError | Ok[Table]:
422
+ """Get a sample of `num_rows` rows of the table."""
423
+ return self.op_graph.sample_rows(
424
+ sample_strategy=op_graph.sample_strategy.uniform_random(), num_rows=num_rows
425
+ ).map(lambda op: Table(self.client, op))
426
+
421
427
  def distinct_rows(self) -> Table:
422
428
  return Table(
423
429
  self.client,
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: corvic-engine
3
- Version: 0.3.0rc56
3
+ Version: 0.3.0rc57
4
4
  Classifier: Environment :: Console
5
5
  Classifier: License :: Other/Proprietary License
6
6
  Classifier: Programming Language :: Python :: Implementation :: CPython
@@ -1,6 +1,6 @@
1
- corvic_engine-0.3.0rc56.dist-info/METADATA,sha256=kGcTUywSpPxHTYF5tu-LPySqBXMZgKxvNVgkOssAn0A,1876
2
- corvic_engine-0.3.0rc56.dist-info/WHEEL,sha256=hKPP3BCTWtTwj6SFaSI--T5aOGqh_llYfbZ_BsqivwA,94
3
- corvic_engine-0.3.0rc56.dist-info/licenses/LICENSE,sha256=DSS1OD0oIgssKOmAzkMRBv5jvvVuZQbrIv8lpl9DXY8,1035
1
+ corvic_engine-0.3.0rc57.dist-info/METADATA,sha256=T40bDZQGa5aI1lYDYYWrtEwMrowLcANHpje9gUdPpsM,1876
2
+ corvic_engine-0.3.0rc57.dist-info/WHEEL,sha256=hKPP3BCTWtTwj6SFaSI--T5aOGqh_llYfbZ_BsqivwA,94
3
+ corvic_engine-0.3.0rc57.dist-info/licenses/LICENSE,sha256=DSS1OD0oIgssKOmAzkMRBv5jvvVuZQbrIv8lpl9DXY8,1035
4
4
  corvic/context/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
5
  corvic/context/__init__.py,sha256=zBnPiP-tStGSVMG_0-G_0ay6-yIX2aerW_oYRzAex74,1702
6
6
  corvic/embed/node2vec.py,sha256=XIJjFDdT-JnmZ43lgP-K-dLgnR17L_uaJqBPAYlsPsk,11148
@@ -25,19 +25,19 @@ corvic/model/_proto_orm_convert.py,sha256=jmzmaaUkSxeHB5OMef92AyGw7sorJ6pP4ylbeK
25
25
  corvic/model/_resource.py,sha256=w5m6mmD8KrHJ8efPTfRV0JKaCmkDRaxlGeuRMmVbw10,7773
26
26
  corvic/model/_room.py,sha256=36mXngZ38L4mr6_LgUm-QgsUUaoGMiYQRfvXLV_jd-4,2914
27
27
  corvic/model/_source.py,sha256=A1Jk4r5mB0f-Y3L8esaQFCUAu7CCTlwAm7f4qSnvjsM,9603
28
- corvic/model/_space.py,sha256=13ggLTCQMNTYYpP5PldMqtJiKp3sWOVRhQcktmoHefA,35590
28
+ corvic/model/_space.py,sha256=ZljalsBDrcnsx2sUOpJd6qQO2nFYDFttNoJMiLdGTBM,35922
29
29
  corvic/model/__init__.py,sha256=Lb-yC04t17Hr2TlnGfn5Ewzd2h1nH4hb9tKdMNAak9s,3075
30
30
  corvic/op_graph/aggregation.py,sha256=8X6vqXD7dLHrhYJU0BqmhUsWGbzD1zSP5Db5VHdIru4,6187
31
31
  corvic/op_graph/encoders.py,sha256=93wYoBCn_us5lRCkqvjaP0LTg3LBB3yEfhzICv06bB0,10460
32
32
  corvic/op_graph/errors.py,sha256=I4NE5053d0deGm5xx5EmyP4f98qx42xnIsW1IA-2hy4,163
33
33
  corvic/op_graph/feature_types.py,sha256=ZE6onUGW4Xa7tPL4XgRVQ1Tvj5FVJJ66di3ShDTR0Ak,9623
34
- corvic/op_graph/ops.py,sha256=G2bDIK_hlKJxqOX5Xu9hEoLBkdqi-TsZSn6tTagqjgg,109823
34
+ corvic/op_graph/ops.py,sha256=1YOFnnN6WgzBajkXRM9UgdMLd-NEfa4tRmIQVj5cyeo,110637
35
35
  corvic/op_graph/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
36
36
  corvic/op_graph/row_filters/_jsonlogic.py,sha256=tBd-wOwE6AIx9XEkuSVdBx9iB08nsdHJdvNzEmWzrB0,6432
37
37
  corvic/op_graph/row_filters/_row_filters.py,sha256=d7oUbB-vThi-Kn5GupGnEwr5UNlsGFgCgR3Q7NR_tkI,9554
38
38
  corvic/op_graph/row_filters/__init__.py,sha256=1sibH_kLw7t_9bpRccnEGWqdCiN0VaUh9LMMIMCRyL8,575
39
39
  corvic/op_graph/sample_strategy.py,sha256=DrbtJ3ORkIRfyIE_FdlOh_UMnCW_K9jL1LeonVYb3bU,3007
40
- corvic/op_graph/_schema.py,sha256=STbxY5PIqIA6xkSDeK8k72Nutsxq5jGe7e_aT35aznI,5733
40
+ corvic/op_graph/_schema.py,sha256=7Uuun9e6PRrtOeJLsFD8VzkwWeUpbnBcD37NpMKOcmQ,5685
41
41
  corvic/op_graph/_transformations.py,sha256=L9Au_GcciPynww4ZXojMtNdPJ36Qboc9gn0bVzXLifU,9445
42
42
  corvic/op_graph/__init__.py,sha256=1DMrQfuuS3FkLa9DXYDjSDLurdxxpG5H1jB2ctaa9xo,1444
43
43
  corvic/orm/base.py,sha256=95nkqycCZ1FaWAhTsa7zbZ0YuwNFkMUW7Wk8yhtYau8,8824
@@ -68,7 +68,7 @@ corvic/sql/parse_ops.py,sha256=1ZXVlDzIzqwW_KP0mwMxaY91tLSXqpeaUHyrGJkh56o,29444
68
68
  corvic/sql/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
69
69
  corvic/sql/__init__.py,sha256=kZ1a39KVZ08P8Bg6XuXDLD_dTQX0k620u4nwxZF4SnY,303
70
70
  corvic/system/client.py,sha256=hGhZX8RtHrFEOlOmJNlUHktOZrutOwNYUY_a1htQSrg,821
71
- corvic/system/in_memory_executor.py,sha256=t5zYx2SC7SXlG_iGa5gKnaTgOKUoTS6FQUL9FYVFex0,67586
71
+ corvic/system/in_memory_executor.py,sha256=tRYzoVCNHemlpPfYRaVM_Nc3uFsLYaOFof1nVR-6hGc,68943
72
72
  corvic/system/op_graph_executor.py,sha256=gXFnVkemS5EwNegJdU-xVAfMLPULqMFPF7d3EG3AD_U,3482
73
73
  corvic/system/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
74
74
  corvic/system/staging.py,sha256=K5P5moiuAMfPx7lxK4mArxeURBwKoyB6x9HGu9JJ16E,1846
@@ -86,7 +86,7 @@ corvic/system_sqlite/rdbms_blob_store.py,sha256=gTP_tQfTVb3wzZkzo8ys1zaz0rSrERzb
86
86
  corvic/system_sqlite/staging.py,sha256=P6XdWhjpgcpOZkYxKEjpsTxaAdBKOeSVfARjqt4_xJA,16948
87
87
  corvic/system_sqlite/__init__.py,sha256=F4UN9vFsXiDY2AKk1jYZPuWWJpSugKHS7ghXeZYlbZs,390
88
88
  corvic/table/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
89
- corvic/table/table.py,sha256=wgA4d1QslrGu7rvzyiob21hLd7esyjgO_QntaSYl-4Q,25038
89
+ corvic/table/table.py,sha256=v3MTV_nHaSAXFjPurn0Gp9Pe4UVL8RhYUHhxR6MVfmE,25396
90
90
  corvic/table/__init__.py,sha256=Gj0IR8BQF5PZK92Us7PP0ZigMsVyrfWJupzH8TgzRQk,588
91
91
  corvic/version/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
92
92
  corvic/version/__init__.py,sha256=JlkRLvKXsu3zIxhdynO_0Ub5NfQOvGjfwCRkNnaOu9U,1125
@@ -244,5 +244,5 @@ corvic_generated/status/v1/event_pb2.pyi,sha256=eU-ibrYpvEAJSIDlSa62-bC96AQU1ykF
244
244
  corvic_generated/status/v1/event_pb2_grpc.pyi,sha256=H9-ADaiKR9iyVZvmnXutZqWwRRCDxjUIktkfJrJFIHg,417
245
245
  corvic_generated/status/v1/service_pb2.pyi,sha256=iXLR2FOKQJpBgvBzpD2kVwcYOCksP2aRwK4JYaI9CBw,558
246
246
  corvic_generated/status/v1/service_pb2_grpc.pyi,sha256=OoAnaZ64FD0UTzPoRhYvQU8ecoilhHj3ySjSfHbVDaU,1501
247
- corvic/engine/_native.pyd,sha256=XkU3bVVXAk3up15IfaE0ih1d0_Lo8jRl_mJp1ZwbBls,438272
248
- corvic_engine-0.3.0rc56.dist-info/RECORD,,
247
+ corvic/engine/_native.pyd,sha256=-QxbxEBeQo7SFPLJlh84nSQEIcAM-zOfcmElul0VQ1U,438272
248
+ corvic_engine-0.3.0rc57.dist-info/RECORD,,