chalkpy 2.89.22__py3-none-any.whl → 2.95.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- chalk/__init__.py +2 -1
- chalk/_gen/chalk/arrow/v1/arrow_pb2.py +7 -5
- chalk/_gen/chalk/arrow/v1/arrow_pb2.pyi +6 -0
- chalk/_gen/chalk/artifacts/v1/chart_pb2.py +36 -33
- chalk/_gen/chalk/artifacts/v1/chart_pb2.pyi +41 -1
- chalk/_gen/chalk/artifacts/v1/cron_query_pb2.py +8 -7
- chalk/_gen/chalk/artifacts/v1/cron_query_pb2.pyi +5 -0
- chalk/_gen/chalk/common/v1/offline_query_pb2.py +19 -13
- chalk/_gen/chalk/common/v1/offline_query_pb2.pyi +37 -0
- chalk/_gen/chalk/common/v1/online_query_pb2.py +54 -54
- chalk/_gen/chalk/common/v1/online_query_pb2.pyi +13 -1
- chalk/_gen/chalk/common/v1/script_task_pb2.py +13 -11
- chalk/_gen/chalk/common/v1/script_task_pb2.pyi +19 -1
- chalk/_gen/chalk/dataframe/__init__.py +0 -0
- chalk/_gen/chalk/dataframe/v1/__init__.py +0 -0
- chalk/_gen/chalk/dataframe/v1/dataframe_pb2.py +48 -0
- chalk/_gen/chalk/dataframe/v1/dataframe_pb2.pyi +123 -0
- chalk/_gen/chalk/dataframe/v1/dataframe_pb2_grpc.py +4 -0
- chalk/_gen/chalk/dataframe/v1/dataframe_pb2_grpc.pyi +4 -0
- chalk/_gen/chalk/graph/v1/graph_pb2.py +150 -149
- chalk/_gen/chalk/graph/v1/graph_pb2.pyi +25 -0
- chalk/_gen/chalk/graph/v1/sources_pb2.py +94 -84
- chalk/_gen/chalk/graph/v1/sources_pb2.pyi +56 -0
- chalk/_gen/chalk/kubernetes/v1/horizontalpodautoscaler_pb2.py +79 -0
- chalk/_gen/chalk/kubernetes/v1/horizontalpodautoscaler_pb2.pyi +377 -0
- chalk/_gen/chalk/kubernetes/v1/horizontalpodautoscaler_pb2_grpc.py +4 -0
- chalk/_gen/chalk/kubernetes/v1/horizontalpodautoscaler_pb2_grpc.pyi +4 -0
- chalk/_gen/chalk/kubernetes/v1/scaledobject_pb2.py +43 -7
- chalk/_gen/chalk/kubernetes/v1/scaledobject_pb2.pyi +252 -2
- chalk/_gen/chalk/protosql/v1/sql_service_pb2.py +54 -27
- chalk/_gen/chalk/protosql/v1/sql_service_pb2.pyi +131 -3
- chalk/_gen/chalk/protosql/v1/sql_service_pb2_grpc.py +45 -0
- chalk/_gen/chalk/protosql/v1/sql_service_pb2_grpc.pyi +14 -0
- chalk/_gen/chalk/python/v1/types_pb2.py +14 -14
- chalk/_gen/chalk/python/v1/types_pb2.pyi +8 -0
- chalk/_gen/chalk/server/v1/benchmark_pb2.py +76 -0
- chalk/_gen/chalk/server/v1/benchmark_pb2.pyi +156 -0
- chalk/_gen/chalk/server/v1/benchmark_pb2_grpc.py +258 -0
- chalk/_gen/chalk/server/v1/benchmark_pb2_grpc.pyi +84 -0
- chalk/_gen/chalk/server/v1/billing_pb2.py +40 -38
- chalk/_gen/chalk/server/v1/billing_pb2.pyi +17 -1
- chalk/_gen/chalk/server/v1/branches_pb2.py +45 -0
- chalk/_gen/chalk/server/v1/branches_pb2.pyi +80 -0
- chalk/_gen/chalk/server/v1/branches_pb2_grpc.pyi +36 -0
- chalk/_gen/chalk/server/v1/builder_pb2.py +372 -272
- chalk/_gen/chalk/server/v1/builder_pb2.pyi +479 -12
- chalk/_gen/chalk/server/v1/builder_pb2_grpc.py +360 -0
- chalk/_gen/chalk/server/v1/builder_pb2_grpc.pyi +96 -0
- chalk/_gen/chalk/server/v1/chart_pb2.py +10 -10
- chalk/_gen/chalk/server/v1/chart_pb2.pyi +18 -2
- chalk/_gen/chalk/server/v1/clickhouse_pb2.py +42 -0
- chalk/_gen/chalk/server/v1/clickhouse_pb2.pyi +17 -0
- chalk/_gen/chalk/server/v1/clickhouse_pb2_grpc.py +78 -0
- chalk/_gen/chalk/server/v1/clickhouse_pb2_grpc.pyi +38 -0
- chalk/_gen/chalk/server/v1/cloud_components_pb2.py +153 -107
- chalk/_gen/chalk/server/v1/cloud_components_pb2.pyi +146 -4
- chalk/_gen/chalk/server/v1/cloud_components_pb2_grpc.py +180 -0
- chalk/_gen/chalk/server/v1/cloud_components_pb2_grpc.pyi +48 -0
- chalk/_gen/chalk/server/v1/cloud_credentials_pb2.py +11 -3
- chalk/_gen/chalk/server/v1/cloud_credentials_pb2.pyi +20 -0
- chalk/_gen/chalk/server/v1/cloud_credentials_pb2_grpc.py +45 -0
- chalk/_gen/chalk/server/v1/cloud_credentials_pb2_grpc.pyi +12 -0
- chalk/_gen/chalk/server/v1/dataplanejobqueue_pb2.py +59 -35
- chalk/_gen/chalk/server/v1/dataplanejobqueue_pb2.pyi +127 -1
- chalk/_gen/chalk/server/v1/dataplanejobqueue_pb2_grpc.py +135 -0
- chalk/_gen/chalk/server/v1/dataplanejobqueue_pb2_grpc.pyi +36 -0
- chalk/_gen/chalk/server/v1/dataplaneworkflows_pb2.py +90 -0
- chalk/_gen/chalk/server/v1/dataplaneworkflows_pb2.pyi +264 -0
- chalk/_gen/chalk/server/v1/dataplaneworkflows_pb2_grpc.py +170 -0
- chalk/_gen/chalk/server/v1/dataplaneworkflows_pb2_grpc.pyi +62 -0
- chalk/_gen/chalk/server/v1/datasets_pb2.py +36 -24
- chalk/_gen/chalk/server/v1/datasets_pb2.pyi +71 -2
- chalk/_gen/chalk/server/v1/datasets_pb2_grpc.py +45 -0
- chalk/_gen/chalk/server/v1/datasets_pb2_grpc.pyi +12 -0
- chalk/_gen/chalk/server/v1/deploy_pb2.py +9 -3
- chalk/_gen/chalk/server/v1/deploy_pb2.pyi +12 -0
- chalk/_gen/chalk/server/v1/deploy_pb2_grpc.py +45 -0
- chalk/_gen/chalk/server/v1/deploy_pb2_grpc.pyi +12 -0
- chalk/_gen/chalk/server/v1/deployment_pb2.py +20 -15
- chalk/_gen/chalk/server/v1/deployment_pb2.pyi +25 -0
- chalk/_gen/chalk/server/v1/environment_pb2.py +25 -15
- chalk/_gen/chalk/server/v1/environment_pb2.pyi +93 -1
- chalk/_gen/chalk/server/v1/eventbus_pb2.py +44 -0
- chalk/_gen/chalk/server/v1/eventbus_pb2.pyi +64 -0
- chalk/_gen/chalk/server/v1/eventbus_pb2_grpc.py +4 -0
- chalk/_gen/chalk/server/v1/eventbus_pb2_grpc.pyi +4 -0
- chalk/_gen/chalk/server/v1/files_pb2.py +65 -0
- chalk/_gen/chalk/server/v1/files_pb2.pyi +167 -0
- chalk/_gen/chalk/server/v1/files_pb2_grpc.py +4 -0
- chalk/_gen/chalk/server/v1/files_pb2_grpc.pyi +4 -0
- chalk/_gen/chalk/server/v1/graph_pb2.py +41 -3
- chalk/_gen/chalk/server/v1/graph_pb2.pyi +191 -0
- chalk/_gen/chalk/server/v1/graph_pb2_grpc.py +92 -0
- chalk/_gen/chalk/server/v1/graph_pb2_grpc.pyi +32 -0
- chalk/_gen/chalk/server/v1/incident_pb2.py +57 -0
- chalk/_gen/chalk/server/v1/incident_pb2.pyi +165 -0
- chalk/_gen/chalk/server/v1/incident_pb2_grpc.py +4 -0
- chalk/_gen/chalk/server/v1/incident_pb2_grpc.pyi +4 -0
- chalk/_gen/chalk/server/v1/indexing_job_pb2.py +44 -0
- chalk/_gen/chalk/server/v1/indexing_job_pb2.pyi +38 -0
- chalk/_gen/chalk/server/v1/indexing_job_pb2_grpc.py +78 -0
- chalk/_gen/chalk/server/v1/indexing_job_pb2_grpc.pyi +38 -0
- chalk/_gen/chalk/server/v1/integrations_pb2.py +11 -9
- chalk/_gen/chalk/server/v1/integrations_pb2.pyi +34 -2
- chalk/_gen/chalk/server/v1/kube_pb2.py +29 -19
- chalk/_gen/chalk/server/v1/kube_pb2.pyi +28 -0
- chalk/_gen/chalk/server/v1/kube_pb2_grpc.py +45 -0
- chalk/_gen/chalk/server/v1/kube_pb2_grpc.pyi +12 -0
- chalk/_gen/chalk/server/v1/log_pb2.py +21 -3
- chalk/_gen/chalk/server/v1/log_pb2.pyi +68 -0
- chalk/_gen/chalk/server/v1/log_pb2_grpc.py +90 -0
- chalk/_gen/chalk/server/v1/log_pb2_grpc.pyi +24 -0
- chalk/_gen/chalk/server/v1/metadataplanejobqueue_pb2.py +73 -0
- chalk/_gen/chalk/server/v1/metadataplanejobqueue_pb2.pyi +212 -0
- chalk/_gen/chalk/server/v1/metadataplanejobqueue_pb2_grpc.py +217 -0
- chalk/_gen/chalk/server/v1/metadataplanejobqueue_pb2_grpc.pyi +74 -0
- chalk/_gen/chalk/server/v1/model_registry_pb2.py +10 -10
- chalk/_gen/chalk/server/v1/model_registry_pb2.pyi +4 -1
- chalk/_gen/chalk/server/v1/monitoring_pb2.py +84 -75
- chalk/_gen/chalk/server/v1/monitoring_pb2.pyi +1 -0
- chalk/_gen/chalk/server/v1/monitoring_pb2_grpc.py +136 -0
- chalk/_gen/chalk/server/v1/monitoring_pb2_grpc.pyi +38 -0
- chalk/_gen/chalk/server/v1/offline_queries_pb2.py +32 -10
- chalk/_gen/chalk/server/v1/offline_queries_pb2.pyi +73 -0
- chalk/_gen/chalk/server/v1/offline_queries_pb2_grpc.py +90 -0
- chalk/_gen/chalk/server/v1/offline_queries_pb2_grpc.pyi +24 -0
- chalk/_gen/chalk/server/v1/plandebug_pb2.py +53 -0
- chalk/_gen/chalk/server/v1/plandebug_pb2.pyi +86 -0
- chalk/_gen/chalk/server/v1/plandebug_pb2_grpc.py +168 -0
- chalk/_gen/chalk/server/v1/plandebug_pb2_grpc.pyi +60 -0
- chalk/_gen/chalk/server/v1/queries_pb2.py +76 -48
- chalk/_gen/chalk/server/v1/queries_pb2.pyi +155 -2
- chalk/_gen/chalk/server/v1/queries_pb2_grpc.py +180 -0
- chalk/_gen/chalk/server/v1/queries_pb2_grpc.pyi +48 -0
- chalk/_gen/chalk/server/v1/scheduled_query_pb2.py +4 -2
- chalk/_gen/chalk/server/v1/scheduled_query_pb2_grpc.py +45 -0
- chalk/_gen/chalk/server/v1/scheduled_query_pb2_grpc.pyi +12 -0
- chalk/_gen/chalk/server/v1/scheduled_query_run_pb2.py +12 -6
- chalk/_gen/chalk/server/v1/scheduled_query_run_pb2.pyi +75 -2
- chalk/_gen/chalk/server/v1/scheduler_pb2.py +24 -12
- chalk/_gen/chalk/server/v1/scheduler_pb2.pyi +61 -1
- chalk/_gen/chalk/server/v1/scheduler_pb2_grpc.py +90 -0
- chalk/_gen/chalk/server/v1/scheduler_pb2_grpc.pyi +24 -0
- chalk/_gen/chalk/server/v1/script_tasks_pb2.py +26 -14
- chalk/_gen/chalk/server/v1/script_tasks_pb2.pyi +33 -3
- chalk/_gen/chalk/server/v1/script_tasks_pb2_grpc.py +90 -0
- chalk/_gen/chalk/server/v1/script_tasks_pb2_grpc.pyi +24 -0
- chalk/_gen/chalk/server/v1/sql_interface_pb2.py +75 -0
- chalk/_gen/chalk/server/v1/sql_interface_pb2.pyi +142 -0
- chalk/_gen/chalk/server/v1/sql_interface_pb2_grpc.py +349 -0
- chalk/_gen/chalk/server/v1/sql_interface_pb2_grpc.pyi +114 -0
- chalk/_gen/chalk/server/v1/sql_queries_pb2.py +48 -0
- chalk/_gen/chalk/server/v1/sql_queries_pb2.pyi +150 -0
- chalk/_gen/chalk/server/v1/sql_queries_pb2_grpc.py +123 -0
- chalk/_gen/chalk/server/v1/sql_queries_pb2_grpc.pyi +52 -0
- chalk/_gen/chalk/server/v1/team_pb2.py +156 -137
- chalk/_gen/chalk/server/v1/team_pb2.pyi +56 -10
- chalk/_gen/chalk/server/v1/team_pb2_grpc.py +90 -0
- chalk/_gen/chalk/server/v1/team_pb2_grpc.pyi +24 -0
- chalk/_gen/chalk/server/v1/topic_pb2.py +5 -3
- chalk/_gen/chalk/server/v1/topic_pb2.pyi +10 -1
- chalk/_gen/chalk/server/v1/trace_pb2.py +50 -28
- chalk/_gen/chalk/server/v1/trace_pb2.pyi +121 -0
- chalk/_gen/chalk/server/v1/trace_pb2_grpc.py +135 -0
- chalk/_gen/chalk/server/v1/trace_pb2_grpc.pyi +42 -0
- chalk/_gen/chalk/server/v1/webhook_pb2.py +9 -3
- chalk/_gen/chalk/server/v1/webhook_pb2.pyi +18 -0
- chalk/_gen/chalk/server/v1/webhook_pb2_grpc.py +45 -0
- chalk/_gen/chalk/server/v1/webhook_pb2_grpc.pyi +12 -0
- chalk/_gen/chalk/streaming/v1/debug_service_pb2.py +62 -0
- chalk/_gen/chalk/streaming/v1/debug_service_pb2.pyi +75 -0
- chalk/_gen/chalk/streaming/v1/debug_service_pb2_grpc.py +221 -0
- chalk/_gen/chalk/streaming/v1/debug_service_pb2_grpc.pyi +88 -0
- chalk/_gen/chalk/streaming/v1/simple_streaming_service_pb2.py +19 -7
- chalk/_gen/chalk/streaming/v1/simple_streaming_service_pb2.pyi +96 -3
- chalk/_gen/chalk/streaming/v1/simple_streaming_service_pb2_grpc.py +48 -0
- chalk/_gen/chalk/streaming/v1/simple_streaming_service_pb2_grpc.pyi +20 -0
- chalk/_gen/chalk/utils/v1/field_change_pb2.py +32 -0
- chalk/_gen/chalk/utils/v1/field_change_pb2.pyi +42 -0
- chalk/_gen/chalk/utils/v1/field_change_pb2_grpc.py +4 -0
- chalk/_gen/chalk/utils/v1/field_change_pb2_grpc.pyi +4 -0
- chalk/_lsp/error_builder.py +11 -0
- chalk/_monitoring/Chart.py +1 -3
- chalk/_version.py +1 -1
- chalk/cli.py +5 -10
- chalk/client/client.py +178 -64
- chalk/client/client_async.py +154 -0
- chalk/client/client_async_impl.py +22 -0
- chalk/client/client_grpc.py +738 -112
- chalk/client/client_impl.py +541 -136
- chalk/client/dataset.py +27 -6
- chalk/client/models.py +99 -2
- chalk/client/serialization/model_serialization.py +126 -10
- chalk/config/project_config.py +1 -1
- chalk/df/LazyFramePlaceholder.py +1154 -0
- chalk/df/ast_parser.py +2 -10
- chalk/features/_class_property.py +7 -0
- chalk/features/_embedding/embedding.py +1 -0
- chalk/features/_embedding/sentence_transformer.py +1 -1
- chalk/features/_encoding/converter.py +83 -2
- chalk/features/_encoding/pyarrow.py +20 -4
- chalk/features/_encoding/rich.py +1 -3
- chalk/features/_tensor.py +1 -2
- chalk/features/dataframe/_filters.py +14 -5
- chalk/features/dataframe/_impl.py +91 -36
- chalk/features/dataframe/_validation.py +11 -7
- chalk/features/feature_field.py +40 -30
- chalk/features/feature_set.py +1 -2
- chalk/features/feature_set_decorator.py +1 -0
- chalk/features/feature_wrapper.py +42 -3
- chalk/features/hooks.py +81 -12
- chalk/features/inference.py +65 -10
- chalk/features/resolver.py +338 -56
- chalk/features/tag.py +1 -3
- chalk/features/underscore_features.py +2 -1
- chalk/functions/__init__.py +456 -21
- chalk/functions/holidays.py +1 -3
- chalk/gitignore/gitignore_parser.py +5 -1
- chalk/importer.py +186 -74
- chalk/ml/__init__.py +6 -2
- chalk/ml/model_hooks.py +368 -51
- chalk/ml/model_reference.py +68 -10
- chalk/ml/model_version.py +34 -21
- chalk/ml/utils.py +143 -40
- chalk/operators/_utils.py +14 -3
- chalk/parsed/_proto/export.py +22 -0
- chalk/parsed/duplicate_input_gql.py +4 -0
- chalk/parsed/expressions.py +1 -3
- chalk/parsed/json_conversions.py +21 -14
- chalk/parsed/to_proto.py +16 -4
- chalk/parsed/user_types_to_json.py +31 -10
- chalk/parsed/validation_from_registries.py +182 -0
- chalk/queries/named_query.py +16 -6
- chalk/queries/scheduled_query.py +13 -1
- chalk/serialization/parsed_annotation.py +25 -12
- chalk/sql/__init__.py +221 -0
- chalk/sql/_internal/integrations/athena.py +6 -1
- chalk/sql/_internal/integrations/bigquery.py +22 -2
- chalk/sql/_internal/integrations/databricks.py +61 -18
- chalk/sql/_internal/integrations/mssql.py +281 -0
- chalk/sql/_internal/integrations/postgres.py +11 -3
- chalk/sql/_internal/integrations/redshift.py +4 -0
- chalk/sql/_internal/integrations/snowflake.py +11 -2
- chalk/sql/_internal/integrations/util.py +2 -1
- chalk/sql/_internal/sql_file_resolver.py +55 -10
- chalk/sql/_internal/sql_source.py +36 -2
- chalk/streams/__init__.py +1 -3
- chalk/streams/_kafka_source.py +5 -1
- chalk/streams/_windows.py +16 -4
- chalk/streams/types.py +1 -2
- chalk/utils/__init__.py +1 -3
- chalk/utils/_otel_version.py +13 -0
- chalk/utils/async_helpers.py +14 -5
- chalk/utils/df_utils.py +2 -2
- chalk/utils/duration.py +1 -3
- chalk/utils/job_log_display.py +538 -0
- chalk/utils/missing_dependency.py +5 -4
- chalk/utils/notebook.py +255 -2
- chalk/utils/pl_helpers.py +190 -37
- chalk/utils/pydanticutil/pydantic_compat.py +1 -2
- chalk/utils/storage_client.py +246 -0
- chalk/utils/threading.py +1 -3
- chalk/utils/tracing.py +194 -86
- {chalkpy-2.89.22.dist-info → chalkpy-2.95.3.dist-info}/METADATA +53 -21
- {chalkpy-2.89.22.dist-info → chalkpy-2.95.3.dist-info}/RECORD +268 -198
- {chalkpy-2.89.22.dist-info → chalkpy-2.95.3.dist-info}/WHEEL +0 -0
- {chalkpy-2.89.22.dist-info → chalkpy-2.95.3.dist-info}/entry_points.txt +0 -0
- {chalkpy-2.89.22.dist-info → chalkpy-2.95.3.dist-info}/top_level.txt +0 -0
chalk/df/ast_parser.py
CHANGED
|
@@ -93,9 +93,7 @@ def parse_dataframe_getitem():
|
|
|
93
93
|
)
|
|
94
94
|
assert isinstance(func_node, ast.Subscript)
|
|
95
95
|
slc = func_node.slice
|
|
96
|
-
|
|
97
|
-
slc = slc.value # type: ignore
|
|
98
|
-
assert isinstance(slc, ast.expr)
|
|
96
|
+
assert isinstance(slc, ast.expr)
|
|
99
97
|
converted_slice = convert_slice(slc)
|
|
100
98
|
return eval_converted_expr(converted_slice, glbs=func_frame.f_globals, lcls=func_frame.f_locals)
|
|
101
99
|
|
|
@@ -227,13 +225,7 @@ def _convert_maybe_tuple(slc: ast.expr):
|
|
|
227
225
|
return _convert_ops(slc)
|
|
228
226
|
|
|
229
227
|
|
|
230
|
-
def convert_slice(slc:
|
|
231
|
-
if isinstance(slc, ast.Index):
|
|
232
|
-
# Index is deprecated in Python 3.9+
|
|
233
|
-
slc = slc.value # type: ignore
|
|
234
|
-
assert isinstance(slc, ast.expr)
|
|
235
|
-
slc = _convert_maybe_tuple(slc)
|
|
236
|
-
return ast.Index(value=slc) # pyright: ignore[reportCallIssue]
|
|
228
|
+
def convert_slice(slc: ast.expr):
|
|
237
229
|
return _convert_maybe_tuple(slc)
|
|
238
230
|
|
|
239
231
|
|
|
@@ -5,6 +5,8 @@ import functools
|
|
|
5
5
|
from typing import Any, Callable, List, Type, TypeVar, cast
|
|
6
6
|
|
|
7
7
|
from chalk._lsp.error_builder import FeatureClassErrorBuilder
|
|
8
|
+
from chalk.features.feature_wrapper import UnresolvedFeature
|
|
9
|
+
from chalk.utils.notebook import is_notebook
|
|
8
10
|
|
|
9
11
|
T = TypeVar("T")
|
|
10
12
|
V = TypeVar("V")
|
|
@@ -54,6 +56,11 @@ def classproperty_support(cls: Type[T]) -> Type[T]:
|
|
|
54
56
|
if (res := self.__chalk_notebook_feature_expressions__.get(item)) is not None:
|
|
55
57
|
return res
|
|
56
58
|
|
|
59
|
+
# If in notebook, fallback to constructing FQN string instead of raising error
|
|
60
|
+
if is_notebook():
|
|
61
|
+
fqn = f"{self.namespace}.{item}"
|
|
62
|
+
return UnresolvedFeature(fqn)
|
|
63
|
+
|
|
57
64
|
builder: FeatureClassErrorBuilder = self.__chalk_error_builder__
|
|
58
65
|
builder.invalid_attribute(
|
|
59
66
|
root_feature_str=self.namespace,
|
|
@@ -25,6 +25,7 @@ from chalk.utils.collections import ensure_tuple
|
|
|
25
25
|
SUPPORTED_LOCAL_MODELS = {
|
|
26
26
|
"all-MiniLM-L6-v2", # https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2
|
|
27
27
|
"sample-bert", # For internal Chalk use
|
|
28
|
+
"sample-linear-nn", # For internal Chalk use
|
|
28
29
|
}
|
|
29
30
|
|
|
30
31
|
# This will eventually be included in SUPPORTED_LOCAL_MODELS
|
|
@@ -111,7 +111,7 @@ class SentenceTransformerProvider(EmbeddingProvider):
|
|
|
111
111
|
raise ValueError(
|
|
112
112
|
f"Expected to find an embedding for input at position {idx}, but the response data was exhausted."
|
|
113
113
|
)
|
|
114
|
-
yield create_fixedsize_with_nulls(
|
|
114
|
+
yield create_fixedsize_with_nulls(values_with_nulls, self.dimensions)
|
|
115
115
|
|
|
116
116
|
def get_vector_class(self) -> Type[Vector]:
|
|
117
117
|
return Vector[self.dimensions]
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
+
import io
|
|
3
4
|
import json
|
|
4
5
|
import types
|
|
5
6
|
import typing
|
|
@@ -50,6 +51,7 @@ from chalk.features._encoding.pyarrow import (
|
|
|
50
51
|
rich_to_pyarrow,
|
|
51
52
|
)
|
|
52
53
|
from chalk.features._encoding.rich import structure_primitive_to_rich, unstructure_rich_to_primitive
|
|
54
|
+
from chalk.features.feature_wrapper import UnresolvedFeature
|
|
53
55
|
from chalk.utils.collections import unwrap_annotated_if_needed, unwrap_optional_and_annotated_if_needed
|
|
54
56
|
from chalk.utils.df_utils import pa_array_to_pl_series
|
|
55
57
|
from chalk.utils.json import JSON, TJSON, is_pyarrow_json_type, pyarrow_json_type
|
|
@@ -870,6 +872,79 @@ class PrimitiveFeatureConverter(Generic[_TPrim]):
|
|
|
870
872
|
else:
|
|
871
873
|
raise TypeError(f"Could not convert the pyarrow dtype {dtype} to a protobuf message")
|
|
872
874
|
|
|
875
|
+
@classmethod
|
|
876
|
+
def convert_pa_field_to_proto_field(cls, field: pa.Field) -> pb.Field:
|
|
877
|
+
"""Convert a PyArrow Field to proto Field."""
|
|
878
|
+
field_proto = pb.Field(
|
|
879
|
+
name=field.name, arrow_type=cls.convert_pa_dtype_to_proto_dtype(field.type), nullable=field.nullable
|
|
880
|
+
)
|
|
881
|
+
|
|
882
|
+
if field.metadata:
|
|
883
|
+
# field.metadata is of types dict[bytes, bytes]
|
|
884
|
+
for k, v in field.metadata.items():
|
|
885
|
+
field_proto.metadata[k.decode("utf-8")] = v.decode("utf-8")
|
|
886
|
+
|
|
887
|
+
return field_proto
|
|
888
|
+
|
|
889
|
+
@classmethod
|
|
890
|
+
def convert_proto_field_to_pa_field(cls, proto_field: pb.Field) -> pa.Field:
|
|
891
|
+
"""Convert a proto Field to PyArrow Field."""
|
|
892
|
+
arrow_type = cls.convert_proto_dtype_to_pa_dtype(proto_field.arrow_type)
|
|
893
|
+
|
|
894
|
+
# don't have to convert back to dict[bytes, bytes] as can initialize with dict[str, str]
|
|
895
|
+
metadata = dict(proto_field.metadata) if proto_field.metadata else None
|
|
896
|
+
|
|
897
|
+
return pa.field(
|
|
898
|
+
name=proto_field.name,
|
|
899
|
+
type=arrow_type,
|
|
900
|
+
nullable=proto_field.nullable,
|
|
901
|
+
metadata=metadata,
|
|
902
|
+
)
|
|
903
|
+
|
|
904
|
+
@classmethod
|
|
905
|
+
def convert_pa_schema_to_proto_schema(cls, schema: pa.Schema) -> pb.Schema:
|
|
906
|
+
schema_proto = pb.Schema(
|
|
907
|
+
columns=[cls.convert_pa_field_to_proto_field(field) for field in schema],
|
|
908
|
+
)
|
|
909
|
+
|
|
910
|
+
if schema.metadata:
|
|
911
|
+
# schema.metadata is of types dict[bytes, bytes]
|
|
912
|
+
for k, v in schema.metadata.items():
|
|
913
|
+
schema_proto.metadata[k.decode("utf-8")] = v.decode("utf-8")
|
|
914
|
+
|
|
915
|
+
return schema_proto
|
|
916
|
+
|
|
917
|
+
@classmethod
|
|
918
|
+
def convert_proto_schema_to_pa_schema(cls, proto_schema: pb.Schema) -> pa.Schema:
|
|
919
|
+
fields = [cls.convert_proto_field_to_pa_field(proto_field) for proto_field in proto_schema.columns]
|
|
920
|
+
|
|
921
|
+
# don't have to convert back to dict[bytes, bytes] as can initialize with dict[str, str]
|
|
922
|
+
metadata = dict(proto_schema.metadata) if proto_schema.metadata else None
|
|
923
|
+
|
|
924
|
+
return pa.schema(fields, metadata=metadata)
|
|
925
|
+
|
|
926
|
+
@staticmethod
|
|
927
|
+
def convert_arrow_table_to_proto(table: pa.Table | pa.RecordBatch) -> pb.TableParquetBytes:
|
|
928
|
+
if isinstance(table, pa.RecordBatch):
|
|
929
|
+
table = pa.Table.from_batches([table])
|
|
930
|
+
elif isinstance(table, pa.Table):
|
|
931
|
+
pass
|
|
932
|
+
else:
|
|
933
|
+
raise TypeError(f"expected pa.Table or pa.RecordBatch, got {type(table)!r}")
|
|
934
|
+
|
|
935
|
+
sink = io.BytesIO()
|
|
936
|
+
import pyarrow.parquet
|
|
937
|
+
|
|
938
|
+
pyarrow.parquet.write_table(table, sink)
|
|
939
|
+
return pb.TableParquetBytes(encoded_parquet_bytes=sink.getvalue())
|
|
940
|
+
|
|
941
|
+
@staticmethod
|
|
942
|
+
def convert_arrow_table_from_proto(proto: pb.TableParquetBytes) -> pa.Table:
|
|
943
|
+
import pyarrow.parquet
|
|
944
|
+
|
|
945
|
+
pf = pyarrow.parquet.ParquetFile(io.BytesIO(proto.encoded_parquet_bytes))
|
|
946
|
+
return pyarrow.parquet.read_table(pf)
|
|
947
|
+
|
|
873
948
|
@staticmethod
|
|
874
949
|
def _serialize_pa_decimal_to_pb(value: Union[pa.Decimal128Scalar, pa.Decimal256Scalar]) -> pb.ScalarValue:
|
|
875
950
|
dec_val = value.as_py()
|
|
@@ -1183,8 +1258,14 @@ class FeatureConverter(PrimitiveFeatureConverter[_TPrim], Generic[_TPrim, _TRich
|
|
|
1183
1258
|
# because it is also used for error handling inside of `from_rich_to_primitive`.
|
|
1184
1259
|
self._name = name
|
|
1185
1260
|
if rich_default != ...:
|
|
1186
|
-
#
|
|
1187
|
-
|
|
1261
|
+
# In notebook environments, UnresolvedFeature may be used as a placeholder
|
|
1262
|
+
# for features that can't be resolved due to a stale registry.
|
|
1263
|
+
# Treat these as missing defaults since they're not concrete values.
|
|
1264
|
+
if isinstance(rich_default, UnresolvedFeature):
|
|
1265
|
+
rich_default = ...
|
|
1266
|
+
else:
|
|
1267
|
+
# The missing value strategy doesn't really matter because rich_default is not missing
|
|
1268
|
+
primitive_default = self.from_rich_to_primitive(rich_default, missing_value_strategy="allow")
|
|
1188
1269
|
super().__init__(
|
|
1189
1270
|
name, is_nullable=is_nullable, pyarrow_dtype=pyarrow_dtype, primitive_default=primitive_default
|
|
1190
1271
|
)
|
|
@@ -8,12 +8,28 @@ import ipaddress
|
|
|
8
8
|
import typing
|
|
9
9
|
import uuid
|
|
10
10
|
from datetime import date, datetime, time, timedelta
|
|
11
|
-
from typing import
|
|
11
|
+
from typing import (
|
|
12
|
+
TYPE_CHECKING,
|
|
13
|
+
Annotated,
|
|
14
|
+
Any,
|
|
15
|
+
Dict,
|
|
16
|
+
FrozenSet,
|
|
17
|
+
List,
|
|
18
|
+
Literal,
|
|
19
|
+
Mapping,
|
|
20
|
+
Set,
|
|
21
|
+
Tuple,
|
|
22
|
+
Type,
|
|
23
|
+
TypeGuard,
|
|
24
|
+
cast,
|
|
25
|
+
get_args,
|
|
26
|
+
get_origin,
|
|
27
|
+
is_typeddict,
|
|
28
|
+
)
|
|
12
29
|
|
|
13
30
|
import attrs
|
|
14
31
|
import google.protobuf.message
|
|
15
32
|
import pyarrow as pa
|
|
16
|
-
from typing_extensions import Annotated, Literal, TypeGuard, get_args, get_origin, is_typeddict
|
|
17
33
|
|
|
18
34
|
from chalk.features._encoding.http import HttpResponse, get_http_response_as_pyarrow
|
|
19
35
|
from chalk.features._encoding.primitive import ChalkStructType, TPrimitive
|
|
@@ -24,7 +40,7 @@ from chalk.utils.collections import is_namedtuple, is_optional, unwrap_optional_
|
|
|
24
40
|
from chalk.utils.enum import get_enum_value_type
|
|
25
41
|
from chalk.utils.json import JSON, is_pyarrow_json_type
|
|
26
42
|
from chalk.utils.missing_dependency import missing_dependency_exception
|
|
27
|
-
from chalk.utils.pl_helpers import is_new_polars
|
|
43
|
+
from chalk.utils.pl_helpers import is_new_polars, pl_array
|
|
28
44
|
from chalk.utils.pydanticutil.pydantic_compat import is_pydantic_basemodel
|
|
29
45
|
|
|
30
46
|
if TYPE_CHECKING:
|
|
@@ -418,7 +434,7 @@ def pyarrow_to_polars(
|
|
|
418
434
|
underlying = pa_type.value_type
|
|
419
435
|
if is_new_polars and use_fixed_size_list:
|
|
420
436
|
# pl.Array is only available in polars >=0.18
|
|
421
|
-
return
|
|
437
|
+
return pl_array(inner=pyarrow_to_polars(underlying, name=f"{name}[]"), size=pa_type.list_size)
|
|
422
438
|
else:
|
|
423
439
|
return pl.List(pyarrow_to_polars(underlying, name=f"{name}[]"))
|
|
424
440
|
if pa.types.is_struct(pa_type):
|
chalk/features/_encoding/rich.py
CHANGED
|
@@ -8,7 +8,7 @@ import enum
|
|
|
8
8
|
import ipaddress
|
|
9
9
|
import uuid
|
|
10
10
|
from datetime import date, datetime, time, timedelta
|
|
11
|
-
from typing import Any, FrozenSet, List, Set, Tuple, Type, TypeVar, Union, cast
|
|
11
|
+
from typing import Any, FrozenSet, List, Set, Tuple, Type, TypeVar, Union, cast, get_args, get_origin, is_typeddict
|
|
12
12
|
|
|
13
13
|
import attrs
|
|
14
14
|
import cattrs
|
|
@@ -23,8 +23,6 @@ try:
|
|
|
23
23
|
except ImportError:
|
|
24
24
|
V1BaseModel = None
|
|
25
25
|
|
|
26
|
-
from typing_extensions import get_args, get_origin, is_typeddict
|
|
27
|
-
|
|
28
26
|
from chalk.features._encoding.primitive import ChalkStructType, TPrimitive
|
|
29
27
|
from chalk.utils.cached_type_hints import cached_get_type_hints
|
|
30
28
|
from chalk.utils.collections import is_namedtuple, unwrap_optional_and_annotated_if_needed
|
chalk/features/_tensor.py
CHANGED
|
@@ -1,11 +1,10 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
3
|
from enum import Enum
|
|
4
|
-
from typing import Any, Tuple, Type, Union, overload
|
|
4
|
+
from typing import Any, Tuple, Type, TypeGuard, Union, overload
|
|
5
5
|
|
|
6
6
|
import numpy as np
|
|
7
7
|
import pyarrow as pa
|
|
8
|
-
from typing_extensions import TypeGuard
|
|
9
8
|
|
|
10
9
|
TensorDimension = Union[int, str]
|
|
11
10
|
|
|
@@ -4,16 +4,17 @@ import collections.abc
|
|
|
4
4
|
import datetime
|
|
5
5
|
import enum
|
|
6
6
|
import functools
|
|
7
|
-
from typing import TYPE_CHECKING, Any, List, Mapping, Optional, Protocol, Sequence, TypeVar, Union, cast
|
|
7
|
+
from typing import TYPE_CHECKING, Any, List, Mapping, Optional, Protocol, Sequence, TypeGuard, TypeVar, Union, cast
|
|
8
8
|
|
|
9
9
|
import pyarrow as pa
|
|
10
|
-
from typing_extensions import Self
|
|
10
|
+
from typing_extensions import Self
|
|
11
11
|
|
|
12
12
|
from chalk.features._encoding.converter import pyarrow_to_polars
|
|
13
13
|
from chalk.features.feature_field import Feature
|
|
14
14
|
from chalk.features.feature_wrapper import FeatureWrapper, unwrap_feature
|
|
15
15
|
from chalk.features.filter import Filter, TimeDelta, get_filter_now
|
|
16
16
|
from chalk.utils.collections import ensure_tuple
|
|
17
|
+
from chalk.utils.pl_helpers import polars_lazy_frame_collect_schema
|
|
17
18
|
|
|
18
19
|
if TYPE_CHECKING:
|
|
19
20
|
import polars as pl
|
|
@@ -442,7 +443,7 @@ class _PolarsStructAdapter(StructAdapter["pl.Expr"]):
|
|
|
442
443
|
|
|
443
444
|
def filter_data_frame(
|
|
444
445
|
item: Any,
|
|
445
|
-
underlying:
|
|
446
|
+
underlying: pl.LazyFrame,
|
|
446
447
|
namespace: Optional[str],
|
|
447
448
|
) -> Union[pl.DataFrame, pl.LazyFrame]:
|
|
448
449
|
|
|
@@ -463,7 +464,10 @@ def filter_data_frame(
|
|
|
463
464
|
)
|
|
464
465
|
now = get_filter_now()
|
|
465
466
|
if len(projections) > 0:
|
|
466
|
-
key_error_or_none = dataframe_missing_key_error(
|
|
467
|
+
key_error_or_none = dataframe_missing_key_error(
|
|
468
|
+
projections,
|
|
469
|
+
(underlying.collect_schema().names() if polars_lazy_frame_collect_schema else underlying.columns),
|
|
470
|
+
)
|
|
467
471
|
if key_error_or_none is not None:
|
|
468
472
|
raise key_error_or_none
|
|
469
473
|
# now = datetime.datetime.now(tz=datetime.timezone.utc)
|
|
@@ -472,7 +476,12 @@ def filter_data_frame(
|
|
|
472
476
|
timestamp_feature = (
|
|
473
477
|
None if namespace is None else CURRENT_FEATURE_REGISTRY.get().get_feature_sets()[namespace].__chalk_ts__
|
|
474
478
|
)
|
|
475
|
-
pl_expr = convert_filters_to_pl_expr(
|
|
479
|
+
pl_expr = convert_filters_to_pl_expr(
|
|
480
|
+
filters,
|
|
481
|
+
(underlying.collect_schema() if polars_lazy_frame_collect_schema else underlying.schema),
|
|
482
|
+
timestamp_feature,
|
|
483
|
+
now,
|
|
484
|
+
)
|
|
476
485
|
df = underlying
|
|
477
486
|
if pl_expr is not None:
|
|
478
487
|
df = df.filter(pl_expr)
|
|
@@ -32,7 +32,6 @@ from typing import (
|
|
|
32
32
|
overload,
|
|
33
33
|
)
|
|
34
34
|
|
|
35
|
-
import packaging.version
|
|
36
35
|
import pyarrow as pa
|
|
37
36
|
|
|
38
37
|
from chalk.features._chalkop import Aggregation
|
|
@@ -56,6 +55,13 @@ from chalk.utils.df_utils import (
|
|
|
56
55
|
)
|
|
57
56
|
from chalk.utils.duration import Duration, parse_chalk_duration
|
|
58
57
|
from chalk.utils.missing_dependency import missing_dependency_exception
|
|
58
|
+
from chalk.utils.pl_helpers import (
|
|
59
|
+
polars_group_by_instead_of_groupby,
|
|
60
|
+
polars_lazy_frame_collect_schema,
|
|
61
|
+
polars_name_dot_suffix_instead_of_suffix,
|
|
62
|
+
polars_uses_schema_overrides,
|
|
63
|
+
schema_compat,
|
|
64
|
+
)
|
|
59
65
|
from chalk.utils.pydanticutil.pydantic_compat import is_pydantic_basemodel
|
|
60
66
|
|
|
61
67
|
if TYPE_CHECKING:
|
|
@@ -473,7 +479,12 @@ class DataFrame(metaclass=DataFrameMeta):
|
|
|
473
479
|
raise ValueError(f"Unable to convert data of type {type(data).__name__} into a DataFrame")
|
|
474
480
|
# Rename / validate that all column names are root fqns
|
|
475
481
|
if self._pydantic_model is None:
|
|
476
|
-
self.columns = tuple(
|
|
482
|
+
self.columns = tuple(
|
|
483
|
+
Feature.from_root_fqn(str(c))
|
|
484
|
+
for c in (
|
|
485
|
+
underlying.collect_schema().names() if polars_lazy_frame_collect_schema else underlying.columns
|
|
486
|
+
)
|
|
487
|
+
)
|
|
477
488
|
else:
|
|
478
489
|
self.columns = ()
|
|
479
490
|
|
|
@@ -505,7 +516,13 @@ class DataFrame(metaclass=DataFrameMeta):
|
|
|
505
516
|
"""
|
|
506
517
|
import polars as pl
|
|
507
518
|
|
|
508
|
-
rename_map = {
|
|
519
|
+
rename_map = {
|
|
520
|
+
x: Distance.fqn
|
|
521
|
+
for x in (
|
|
522
|
+
underlying.collect_schema().names() if polars_lazy_frame_collect_schema else underlying.columns
|
|
523
|
+
) # pyright: ignore
|
|
524
|
+
if x in self._distance_feature_fqns
|
|
525
|
+
}
|
|
509
526
|
|
|
510
527
|
underlying = underlying.rename(rename_map)
|
|
511
528
|
if len(rename_map) > 0:
|
|
@@ -620,7 +637,14 @@ class DataFrame(metaclass=DataFrameMeta):
|
|
|
620
637
|
)
|
|
621
638
|
elif all(isinstance(col, str) for col in ensure_tuple(item)):
|
|
622
639
|
# Select the columns with `.select()` since they're by name.
|
|
623
|
-
key_error_or_none = dataframe_missing_key_error(
|
|
640
|
+
key_error_or_none = dataframe_missing_key_error(
|
|
641
|
+
ensure_tuple(item),
|
|
642
|
+
(
|
|
643
|
+
self._underlying.collect_schema().names()
|
|
644
|
+
if polars_lazy_frame_collect_schema
|
|
645
|
+
else self._underlying.columns
|
|
646
|
+
),
|
|
647
|
+
)
|
|
624
648
|
if key_error_or_none is not None:
|
|
625
649
|
raise key_error_or_none
|
|
626
650
|
materialized = self._materialize()
|
|
@@ -700,7 +724,7 @@ class DataFrame(metaclass=DataFrameMeta):
|
|
|
700
724
|
if len(operation.filters) > 0:
|
|
701
725
|
f = convert_filters_to_pl_expr(
|
|
702
726
|
operation.filters,
|
|
703
|
-
self._underlying
|
|
727
|
+
schema_compat(self._underlying),
|
|
704
728
|
timestamp_feature,
|
|
705
729
|
now,
|
|
706
730
|
)
|
|
@@ -711,10 +735,10 @@ class DataFrame(metaclass=DataFrameMeta):
|
|
|
711
735
|
|
|
712
736
|
data = self._underlying.lazy()
|
|
713
737
|
|
|
714
|
-
if
|
|
715
|
-
data = data.groupby(groupby)
|
|
716
|
-
else:
|
|
738
|
+
if polars_group_by_instead_of_groupby:
|
|
717
739
|
data = data.group_by(groupby)
|
|
740
|
+
else:
|
|
741
|
+
data = data.groupby(groupby) # pyright: ignore
|
|
718
742
|
|
|
719
743
|
data = data.agg(cols).collect()
|
|
720
744
|
|
|
@@ -778,7 +802,7 @@ class DataFrame(metaclass=DataFrameMeta):
|
|
|
778
802
|
|
|
779
803
|
col_str = str(column)
|
|
780
804
|
|
|
781
|
-
col_dtype = self._underlying
|
|
805
|
+
col_dtype = schema_compat(self._underlying)[col_str]
|
|
782
806
|
underlying = self._underlying
|
|
783
807
|
if col_dtype != pl.Float64() and col_dtype != pl.Float32():
|
|
784
808
|
underlying = underlying.select(pl.col(col_str).cast(pl.Float32))
|
|
@@ -965,7 +989,7 @@ class DataFrame(metaclass=DataFrameMeta):
|
|
|
965
989
|
if len(operation.filters) > 0:
|
|
966
990
|
f = convert_filters_to_pl_expr(
|
|
967
991
|
operation.filters,
|
|
968
|
-
self._underlying
|
|
992
|
+
schema_compat(self._underlying),
|
|
969
993
|
timestamp_feature,
|
|
970
994
|
now,
|
|
971
995
|
)
|
|
@@ -974,22 +998,40 @@ class DataFrame(metaclass=DataFrameMeta):
|
|
|
974
998
|
|
|
975
999
|
cols.append(operation.fn(c).alias(str(alias)))
|
|
976
1000
|
|
|
977
|
-
|
|
978
|
-
|
|
979
|
-
|
|
980
|
-
|
|
981
|
-
|
|
982
|
-
|
|
983
|
-
|
|
984
|
-
|
|
985
|
-
|
|
986
|
-
|
|
1001
|
+
if polars_group_by_instead_of_groupby:
|
|
1002
|
+
return DataFrame(
|
|
1003
|
+
self._underlying.lazy()
|
|
1004
|
+
.sort(str(index), descending=False)
|
|
1005
|
+
.group_by_dynamic(
|
|
1006
|
+
index_column=str(index),
|
|
1007
|
+
group_by=groupby,
|
|
1008
|
+
offset=offset,
|
|
1009
|
+
every=every,
|
|
1010
|
+
period=period,
|
|
1011
|
+
start_by=start_by,
|
|
1012
|
+
)
|
|
1013
|
+
.agg(cols)
|
|
1014
|
+
.collect(),
|
|
1015
|
+
convert_dtypes=self._convert_dtypes,
|
|
1016
|
+
pydantic_model=self._pydantic_model,
|
|
1017
|
+
)
|
|
1018
|
+
else:
|
|
1019
|
+
return DataFrame(
|
|
1020
|
+
self._underlying.lazy()
|
|
1021
|
+
.sort(str(index), descending=False)
|
|
1022
|
+
.groupby_dynamic( # pyright: ignore
|
|
1023
|
+
index_column=str(index),
|
|
1024
|
+
by=groupby,
|
|
1025
|
+
offset=offset,
|
|
1026
|
+
every=every,
|
|
1027
|
+
period=period,
|
|
1028
|
+
start_by=start_by,
|
|
1029
|
+
)
|
|
1030
|
+
.agg(cols)
|
|
1031
|
+
.collect(),
|
|
1032
|
+
convert_dtypes=self._convert_dtypes,
|
|
1033
|
+
pydantic_model=self._pydantic_model,
|
|
987
1034
|
)
|
|
988
|
-
.agg(cols)
|
|
989
|
-
.collect(),
|
|
990
|
-
convert_dtypes=self._convert_dtypes,
|
|
991
|
-
pydantic_model=self._pydantic_model,
|
|
992
|
-
)
|
|
993
1035
|
|
|
994
1036
|
def join(
|
|
995
1037
|
self,
|
|
@@ -1200,7 +1242,7 @@ class DataFrame(metaclass=DataFrameMeta):
|
|
|
1200
1242
|
if len(operation.filters) > 0:
|
|
1201
1243
|
f = convert_filters_to_pl_expr(
|
|
1202
1244
|
operation.filters,
|
|
1203
|
-
self._underlying
|
|
1245
|
+
schema_compat(self._underlying),
|
|
1204
1246
|
timestamp_feature,
|
|
1205
1247
|
now,
|
|
1206
1248
|
)
|
|
@@ -1424,15 +1466,25 @@ class DataFrame(metaclass=DataFrameMeta):
|
|
|
1424
1466
|
else:
|
|
1425
1467
|
cols_to_select, dtypes, new_columns = cls._parse_columns(columns)
|
|
1426
1468
|
|
|
1427
|
-
# 'dtypes' deprecated for 'schema_overrides' in polars 0.20
|
|
1428
|
-
|
|
1429
|
-
|
|
1430
|
-
|
|
1431
|
-
|
|
1432
|
-
|
|
1433
|
-
|
|
1434
|
-
|
|
1435
|
-
|
|
1469
|
+
# 'dtypes' deprecated for 'schema_overrides' in polars 0.20.31+
|
|
1470
|
+
if polars_uses_schema_overrides:
|
|
1471
|
+
data = pl.read_csv(
|
|
1472
|
+
source=path,
|
|
1473
|
+
has_header=has_header,
|
|
1474
|
+
columns=cols_to_select,
|
|
1475
|
+
schema_overrides=dtypes, # pyright: ignore[reportCallIssue]
|
|
1476
|
+
new_columns=new_columns,
|
|
1477
|
+
storage_options=DataFrame._get_storage_options(),
|
|
1478
|
+
)
|
|
1479
|
+
else:
|
|
1480
|
+
data = pl.read_csv(
|
|
1481
|
+
source=path,
|
|
1482
|
+
has_header=has_header,
|
|
1483
|
+
columns=cols_to_select,
|
|
1484
|
+
dtypes=dtypes, # pyright: ignore[reportCallIssue]
|
|
1485
|
+
new_columns=new_columns,
|
|
1486
|
+
storage_options=DataFrame._get_storage_options(),
|
|
1487
|
+
)
|
|
1436
1488
|
return cls(data)
|
|
1437
1489
|
|
|
1438
1490
|
@classmethod
|
|
@@ -1922,7 +1974,10 @@ class DataFrame(metaclass=DataFrameMeta):
|
|
|
1922
1974
|
raise ValueError("DataFrame dimensions do not match")
|
|
1923
1975
|
|
|
1924
1976
|
suffix = "__POLARS_CMP_OTHER"
|
|
1925
|
-
|
|
1977
|
+
if polars_name_dot_suffix_instead_of_suffix:
|
|
1978
|
+
other_renamed = other.select(pl.all().name.suffix(suffix))
|
|
1979
|
+
else:
|
|
1980
|
+
other_renamed = other.select(pl.all().suffix(suffix)) # pyright: ignore
|
|
1926
1981
|
combined = pl.concat([materialized, other_renamed], how="horizontal")
|
|
1927
1982
|
|
|
1928
1983
|
if op == "eq":
|
|
@@ -8,6 +8,7 @@ import isodate
|
|
|
8
8
|
from chalk.features._encoding.missing_value import MissingValueStrategy
|
|
9
9
|
from chalk.features.feature_field import Feature, FeatureNotFoundException
|
|
10
10
|
from chalk.utils.collections import get_unique_item
|
|
11
|
+
from chalk.utils.pl_helpers import apply_compat, schema_compat, str_json_decode_compat
|
|
11
12
|
|
|
12
13
|
if TYPE_CHECKING:
|
|
13
14
|
import polars as pl
|
|
@@ -67,7 +68,7 @@ def validate_df_schema(underlying: Union[pl.DataFrame, pl.LazyFrame]):
|
|
|
67
68
|
# This is called from within DataFrame.__init__, which validates that polars is installed
|
|
68
69
|
import polars as pl
|
|
69
70
|
|
|
70
|
-
for root_fqn, actual_dtype in underlying.
|
|
71
|
+
for root_fqn, actual_dtype in schema_compat(underlying).items():
|
|
71
72
|
feature = Feature.from_root_fqn(root_fqn)
|
|
72
73
|
if feature.is_has_one or feature.is_has_many:
|
|
73
74
|
continue
|
|
@@ -87,7 +88,7 @@ def validate_df_schema(underlying: Union[pl.DataFrame, pl.LazyFrame]):
|
|
|
87
88
|
isinstance(expected_dtype, pl.List)
|
|
88
89
|
and actual_dtype == pl.Utf8 # pyright: ignore[reportUnnecessaryComparison]
|
|
89
90
|
):
|
|
90
|
-
col = pl.col(root_fqn)
|
|
91
|
+
col = str_json_decode_compat(pl.col(root_fqn), expected_dtype)
|
|
91
92
|
try:
|
|
92
93
|
underlying = underlying.with_columns(col.cast(expected_dtype))
|
|
93
94
|
except (Exception, pl.PolarsPanicError) as e:
|
|
@@ -123,21 +124,24 @@ def validate_df_schema(underlying: Union[pl.DataFrame, pl.LazyFrame]):
|
|
|
123
124
|
if isinstance(expected_dtype, pl.Datetime):
|
|
124
125
|
# tzinfo = None if expected_dtype.time_zone is None else zoneinfo.ZoneInfo(expected_dtype.time_zone)
|
|
125
126
|
underlying = underlying.with_columns(pl.col(root_fqn).str.strptime(pl.Datetime).alias(root_fqn))
|
|
126
|
-
if cast(pl.Datetime, underlying
|
|
127
|
+
if cast(pl.Datetime, schema_compat(underlying)[root_fqn]).time_zone is not None:
|
|
127
128
|
assert expected_dtype.time_zone is not None
|
|
128
129
|
cast_expr = pl.col(root_fqn).dt.convert_time_zone(expected_dtype.time_zone)
|
|
129
130
|
else:
|
|
130
131
|
cast_expr = pl.col(root_fqn).dt.replace_time_zone(expected_dtype.time_zone)
|
|
131
132
|
elif expected_dtype == pl.Date:
|
|
132
|
-
cast_expr =
|
|
133
|
+
cast_expr = apply_compat(
|
|
134
|
+
pl.col(root_fqn),
|
|
133
135
|
lambda x: None if x is None else isodate.parse_date(x),
|
|
134
136
|
)
|
|
135
137
|
elif expected_dtype == pl.Time:
|
|
136
|
-
cast_expr =
|
|
138
|
+
cast_expr = apply_compat(
|
|
139
|
+
pl.col(root_fqn),
|
|
137
140
|
lambda x: None if x is None else isodate.parse_time(x),
|
|
138
141
|
)
|
|
139
142
|
elif expected_dtype == pl.Duration:
|
|
140
|
-
cast_expr =
|
|
143
|
+
cast_expr = apply_compat(
|
|
144
|
+
pl.col(root_fqn),
|
|
141
145
|
lambda x: None if x is None else isodate.parse_duration(x),
|
|
142
146
|
)
|
|
143
147
|
else:
|
|
@@ -168,7 +172,7 @@ def validate_nulls(
|
|
|
168
172
|
|
|
169
173
|
if isinstance(underlying, pl.LazyFrame):
|
|
170
174
|
underlying = underlying.collect()
|
|
171
|
-
schema = underlying
|
|
175
|
+
schema = schema_compat(underlying)
|
|
172
176
|
null_count_rows = underlying.null_count().to_dicts()
|
|
173
177
|
if len(null_count_rows) == 0:
|
|
174
178
|
return underlying # Empty dataframe
|