chalkruntime 3.32.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (132) hide show
  1. _chalk_shared_public/__init__.py +0 -0
  2. _chalk_shared_public/arrow_type_promotion.py +279 -0
  3. _chalk_shared_public/chalk_function_registry.py +1650 -0
  4. _chalk_shared_public/py.typed +0 -0
  5. chalkdf/__init__.py +107 -0
  6. chalkdf/_chalk_import.py +152 -0
  7. chalkdf/_display.py +317 -0
  8. chalkdf/_libchalk_bootstrap.py +365 -0
  9. chalkdf/_metaclass.py +149 -0
  10. chalkdf/catalog.py +180 -0
  11. chalkdf/config.py +319 -0
  12. chalkdf/dataframe.py +2596 -0
  13. chalkdf/dataframe_methods/__init__.py +1 -0
  14. chalkdf/dataframe_methods/aggregations.py +260 -0
  15. chalkdf/dataframe_methods/joins.py +254 -0
  16. chalkdf/dataframe_methods/utils.py +43 -0
  17. chalkdf/debug.py +35 -0
  18. chalkdf/exceptions.py +24 -0
  19. chalkdf/functions.py +94 -0
  20. chalkdf/lazyframe.py +20 -0
  21. chalkdf/materialized_dataframe.py +517 -0
  22. chalkdf/performance_summary.py +407 -0
  23. chalkdf/remote.py +473 -0
  24. chalkdf/remote_run.py +68 -0
  25. chalkdf/schema.py +103 -0
  26. chalkdf/series.py +73 -0
  27. chalkdf/sql.py +510 -0
  28. chalkdf/testing.py +185 -0
  29. chalkdf/underscore_conversion/__init__.py +0 -0
  30. chalkdf/underscore_conversion/convert_underscore_to_expr.py +704 -0
  31. chalkdf/util.py +31 -0
  32. chalkdf_onnx_runtime/__init__.py +1 -0
  33. chalkdf_onnx_runtime/setup.py +46 -0
  34. chalkruntime/__init__.py +0 -0
  35. chalkruntime/constants.py +48 -0
  36. chalkruntime/dataframe/__init__.py +4 -0
  37. chalkruntime/dataframe/dataframe.py +18 -0
  38. chalkruntime/dataframe/lazyframe.py +7 -0
  39. chalkruntime/exc/__init__.py +0 -0
  40. chalkruntime/exc/failed_argument.py +7 -0
  41. chalkruntime/exc/resolver_errors.py +546 -0
  42. chalkruntime/exc/wrapped_resolver_exception.py +5 -0
  43. chalkruntime/graph/__init__.py +0 -0
  44. chalkruntime/graph/chalk_overload.py +25 -0
  45. chalkruntime/graph/convert_chalkpy_underscore.py +1961 -0
  46. chalkruntime/graph/feature.py +3611 -0
  47. chalkruntime/graph/filter_conversion.py +296 -0
  48. chalkruntime/graph/global_graph.py +16 -0
  49. chalkruntime/graph/graph.py +285 -0
  50. chalkruntime/graph/graph_impl.py +932 -0
  51. chalkruntime/graph/graph_proxy.py +117 -0
  52. chalkruntime/graph/graph_state.py +81 -0
  53. chalkruntime/graph/jinja_parser.py +235 -0
  54. chalkruntime/graph/materializations.py +263 -0
  55. chalkruntime/graph/maybe_named_collection.py +101 -0
  56. chalkruntime/graph/named_query.py +160 -0
  57. chalkruntime/graph/nearest_neighbor.py +106 -0
  58. chalkruntime/graph/overlay_graph.py +155 -0
  59. chalkruntime/graph/prompt_service.py +9 -0
  60. chalkruntime/graph/protograph_deserializer.py +2367 -0
  61. chalkruntime/graph/protograph_serializer.py +202 -0
  62. chalkruntime/graph/resolver.py +905 -0
  63. chalkruntime/graph/singletons.py +257 -0
  64. chalkruntime/graph/sklearn_model_parser.py +240 -0
  65. chalkruntime/graph/stream_resolver.py +410 -0
  66. chalkruntime/graph/underscore.py +1600 -0
  67. chalkruntime/graph/underscore_codec_info.py +59 -0
  68. chalkruntime/graph/underscore_operation_registry.py +285 -0
  69. chalkruntime/graph/variables.py +75 -0
  70. chalkruntime/heaptrack_launcher.py +40 -0
  71. chalkruntime/incrementalization/__init__.py +0 -0
  72. chalkruntime/incrementalization/group_incrementalizer.py +61 -0
  73. chalkruntime/incrementalization/incrementalizer.py +269 -0
  74. chalkruntime/invoker/__init__.py +0 -0
  75. chalkruntime/invoker/batch_result_collector.py +1058 -0
  76. chalkruntime/invoker/bound_invoker.py +108 -0
  77. chalkruntime/invoker/bound_invoker_cache.py +156 -0
  78. chalkruntime/invoker/general_bound_invoker.py +1037 -0
  79. chalkruntime/invoker/no_arg_scalar_invoker.py +202 -0
  80. chalkruntime/invoker/one_to_one_invoker.py +1022 -0
  81. chalkruntime/invoker/overlay_features.py +307 -0
  82. chalkruntime/invoker/parse_external_resolver.py +221 -0
  83. chalkruntime/invoker/partition_batch.py +184 -0
  84. chalkruntime/invoker/query_execution_parameters.py +76 -0
  85. chalkruntime/invoker/resolver_args_builder.py +283 -0
  86. chalkruntime/invoker/resolver_input.py +139 -0
  87. chalkruntime/invoker/resolver_input_upload.py +195 -0
  88. chalkruntime/invoker/resolver_output_metadata.py +261 -0
  89. chalkruntime/invoker/resolver_raw_output_parsing.py +407 -0
  90. chalkruntime/invoker/resolver_result.py +51 -0
  91. chalkruntime/invoker/resolver_runner.py +764 -0
  92. chalkruntime/invoker/sample.py +425 -0
  93. chalkruntime/invoker/validator.py +281 -0
  94. chalkruntime/invoker/vectorized_hasmany_sampler.py +1534 -0
  95. chalkruntime/loader/__init__.py +0 -0
  96. chalkruntime/loader/converter.py +1616 -0
  97. chalkruntime/loader/importer.py +234 -0
  98. chalkruntime/memray_launcher.py +31 -0
  99. chalkruntime/metadata.py +286 -0
  100. chalkruntime/py.typed +0 -0
  101. chalkruntime/server/__init__.py +0 -0
  102. chalkruntime/server/config.py +189 -0
  103. chalkruntime/server/entrypoint.py +793 -0
  104. chalkruntime/server/env_helper.py +81 -0
  105. chalkruntime/server/remote_python_function_registry_client.py +87 -0
  106. chalkruntime/server/service.py +465 -0
  107. chalkruntime/sql_rewriter/__init__.py +0 -0
  108. chalkruntime/sql_rewriter/composed_rewriter.py +14 -0
  109. chalkruntime/sql_rewriter/contextual_query_rewriter.py +101 -0
  110. chalkruntime/sql_rewriter/filter_query_rewriter.py +1170 -0
  111. chalkruntime/sql_rewriter/identity_rewriter.py +19 -0
  112. chalkruntime/sql_rewriter/query_rewriter.py +11 -0
  113. chalkruntime/sql_rewriter/query_rewriter_helper.py +207 -0
  114. chalkruntime/streaming/__init__.py +0 -0
  115. chalkruntime/streaming/converter_utils.py +54 -0
  116. chalkruntime/streaming/exc.py +14 -0
  117. chalkruntime/streaming/message_parsing.py +601 -0
  118. chalkruntime/streaming/resolver_utils.py +63 -0
  119. chalkruntime/streaming/types.py +237 -0
  120. chalkruntime/streaming/window_keys.py +9 -0
  121. chalkruntime/utils/__init__.py +0 -0
  122. chalkruntime/utils/async_helpers.py +63 -0
  123. chalkruntime/utils/contextvars.py +14 -0
  124. chalkruntime/utils/datadog.py +44 -0
  125. chalkruntime/utils/internal_pl_utils.py +146 -0
  126. chalkruntime/utils/tracing.py +171 -0
  127. chalkruntime/utils/viztracer_profiling.py +174 -0
  128. chalkruntime/valgrind_launcher.py +43 -0
  129. chalkruntime-3.32.1.dist-info/METADATA +47 -0
  130. chalkruntime-3.32.1.dist-info/RECORD +132 -0
  131. chalkruntime-3.32.1.dist-info/WHEEL +5 -0
  132. chalkruntime-3.32.1.dist-info/top_level.txt +4 -0
File without changes
@@ -0,0 +1,279 @@
1
+ from typing import Any, Callable, Sequence, TypeVar, cast
2
+
3
+ import pyarrow as pa
4
+ from chalk.utils.collections import OrderedSet
5
+
6
+ from libchalk.chalkfunction import (
7
+ ArgumentType,
8
+ CallbackType,
9
+ DataFrameParameterType,
10
+ default_arrow_type_promoter,
11
+ )
12
+
13
# Arrow scalar types keyed by bit width. The numeric promotion code picks the
# widest operand by looking its bit width up in one of these tables.
pa_int_types = {
    8: pa.int8(),
    16: pa.int16(),
    32: pa.int32(),
    64: pa.int64(),
}
pa_uint_types = {
    8: pa.uint8(),
    16: pa.uint16(),
    32: pa.uint32(),
    64: pa.uint64(),
}
pa_float_types = {
    16: pa.float16(),
    32: pa.float32(),
    64: pa.float64(),
}
pa_date_types = {
    32: pa.date32(),
    64: pa.date64(),
}

# Temporal types used by the promotion rules, all at microsecond resolution.
datetime_type = pa.timestamp("us", tz="UTC")
# The tz-naive variant is only used to detect and reject timestamps without a timezone.
datetime_type_no_tz = pa.timestamp("us")
duration_type = pa.duration("us")
20
+
21
+
22
def _most_precise_type_lists_arrow(
    *,
    types: Sequence[pa.DataType],
    minimum_type: pa.DataType | None,
    enforce_equal_list_size: bool = False,
) -> pa.DataType:
    """
    Return the promoted *element* type shared by a group of list types.

    The ``value_type`` of every operand is collected and promoted with
    ``promote_types_from_arrow``. The result is the element type only; wrapping it
    back into a list type is left to the caller.

    Args:
        types: List-typed operands (fixed-size, regular, or large lists).
        minimum_type: Optional lower bound for the result. When it is itself a list
            type, its element type is used as the lower bound for the elements, so
            that a list type is never compared against scalar element types. Any
            other value is forwarded unchanged.
        enforce_equal_list_size: If True, every operand must be a
            ``FixedSizeListType`` and all operands must have the same list size.

    Raises:
        ValueError: If an operand is not a list type, or if
            ``enforce_equal_list_size`` is set and an operand is not a fixed-size
            list or the list sizes differ. Errors raised while promoting the
            element types propagate as well.
    """
    list_type_classes = (pa.FixedSizeListType, pa.ListType, pa.LargeListType)

    element_types: list[pa.DataType] = []
    list_sizes: list[int] = []
    for input_type in types:
        if not isinstance(input_type, list_type_classes):
            raise ValueError(f"Expected scalar list type, got operand type '{input_type}'")

        if enforce_equal_list_size:
            if not isinstance(input_type, pa.FixedSizeListType):
                raise ValueError(f"Expected scalar fixed size list type, got operand type '{input_type}'")
            list_sizes.append(input_type.list_size)

        element_types.append(input_type.value_type)

    if enforce_equal_list_size and len(set(list_sizes)) > 1:
        raise ValueError("Expected all operands of input to be of the same length")

    # The lower bound has to live at the same nesting level as the element types.
    # Forwarding a list-typed minimum as-is would make the element promotion compare
    # a list against scalars. NOTE: the list size of `minimum_type` is not validated
    # here; only its element type takes part in the promotion.
    element_minimum = minimum_type
    if isinstance(minimum_type, list_type_classes):
        element_minimum = minimum_type.value_type

    return promote_types_from_arrow(types=element_types, minimum_type=element_minimum)
52
+
53
+
54
def _most_precise_type_structs_arrow(
    *,
    types: Sequence[pa.DataType],
    minimum_type: pa.DataType | None,
    enforce_complete_structs: bool = False,
) -> pa.DataType:
    """
    Merge several struct types into one struct whose fields are promoted per name.

    Field types are grouped by field name across all operands (and across
    ``minimum_type``, when given), and each group is promoted independently with
    ``promote_types_from_arrow``.

    If `enforce_complete_structs` is True, every field name seen in the operands
    must appear in every operand. Fields of ``minimum_type`` are added after that
    check and are therefore not subject to it.

    Raises ValueError when an operand or ``minimum_type`` is not a struct type, or
    when the completeness check fails.
    """
    types_by_field: dict[str, list[pa.DataType]] = {}

    for struct_type in types:
        if not isinstance(struct_type, pa.StructType):
            raise ValueError(f"Expected struct type, got operand type '{struct_type}'")
        for member in struct_type:
            types_by_field.setdefault(member.name, []).append(member.type)

    if enforce_complete_structs:
        operand_count = len(types)
        for field_name, seen_types in types_by_field.items():
            if len(seen_types) != operand_count:
                raise ValueError(f"Expected all struct operands of input to contain field '{field_name}'")

    if minimum_type is not None:
        if not isinstance(minimum_type, pa.StructType):
            raise ValueError(f"Expected minimum type to be struct type, got operand type '{minimum_type}'")
        for member in minimum_type:
            types_by_field.setdefault(member.name, []).append(member.type)

    # The minimum type's fields are already part of each group, so no separate
    # lower bound is passed down.
    promoted_fields = {}
    for name, seen_types in types_by_field.items():
        promoted_fields[name] = promote_types_from_arrow(types=seen_types, minimum_type=None)
    return pa.struct(promoted_fields)
92
+
93
+
94
def most_precise_numeric_type_from_arrow(
    *,
    types: Sequence[pa.DataType],
    minimum_type: pa.DataType | None = None,
) -> pa.DataType:
    """
    Return the numeric/temporal Arrow type that results from combining ``types``.

    ``minimum_type``, when given, is treated as one more operand. The rules are
    applied in this order:

    1. all fixed-size lists of floats -> a fixed-size list rebuilt from the first
       operand's element type and list size
    2. all floats -> the widest float among the operands
    3. all unsigned ints -> the widest unsigned int among the operands
    4. all dates, or all UTC timestamps -> ``duration("us")``
    5. exactly ``[UTC timestamp, duration]`` -> ``timestamp("us", "UTC")``
    6. any other mix involving a UTC timestamp or a date -> ValueError
    7. any float -> ``float64``
    8. otherwise -> the signed int with the largest bit width among the operands

    Raises:
        ValueError: If there are no operands, if an operand is a tz-naive
            microsecond timestamp, if an operand is not one of the supported
            int/uint/float/date/timestamp/duration/float-list types, or if the
            combination is unsupported.
    """
    if minimum_type is not None:
        types = [*types, minimum_type]

    if len(types) == 0:
        # Previously this surfaced as an incidental ValueError from max() on an empty sequence.
        raise ValueError("Expected at least one type to promote")

    int_types = pa_int_types.values()
    uint_types = pa_uint_types.values()
    float_types = pa_float_types.values()
    date_types = pa_date_types.values()

    for t in types:
        # Check for a tz-naive timestamp first: it also fails the generic numeric
        # check below, which would otherwise hide this more actionable message.
        if t == datetime_type_no_tz:
            raise ValueError("UTC Timezone must be specified on your timestamp objects")
        if (
            t not in int_types
            and t not in uint_types
            and t not in float_types
            and t not in date_types
            and t != datetime_type
            and t != duration_type
            and not (pa.types.is_fixed_size_list(t) and t.value_type in float_types)
        ):
            raise ValueError(f"Expected numeric type, got {t}")

    if all(pa.types.is_fixed_size_list(t) and t.value_type in float_types for t in types):
        first_type = types[0]
        # Included to help pyright understand types.
        assert pa.types.is_fixed_size_list(first_type), f"Expected FixedSizeListType but got {first_type}"
        fixed_t: pa.FixedSizeListType = first_type
        return pa.list_(fixed_t.value_type, fixed_t.list_size)

    if all(t in float_types for t in types):
        return pa_float_types[max(t.bit_width for t in types)]

    elif all(t in uint_types for t in types):
        return pa_uint_types[max(t.bit_width for t in types)]

    elif all(t in date_types for t in types):
        # same as chalk/features/_encoding/pyarrow.py::rich_to_pyarrow timedelta
        return pa.duration("us")

    elif all(t == datetime_type for t in types):
        # same as chalk/features/_encoding/pyarrow.py::rich_to_pyarrow timedelta
        return pa.duration("us")

    elif len(types) == 2 and types[0] == datetime_type and types[1] == duration_type:
        # TODO: need to check that the operation type is subtraction
        # Currently errors downstream, so not urgent
        return pa.timestamp("us", "UTC")

    elif any(t == datetime_type for t in types):
        raise ValueError("Only subtraction is supported on datetime types")

    elif any(t in date_types for t in types):
        raise ValueError("Only subtraction is supported on date types")

    elif any(t in float_types for t in types):
        return pa.float64()

    elif (max_int_type := max(t.bit_width for t in types)) in pa_int_types:
        return pa_int_types[max_int_type]

    else:
        raise ValueError(
            (
                f"Unsupported numeric type for {types}. "
                f"Expected int with bit width in {tuple(pa_int_types.keys())}, got {max_int_type}"
            )
        )
164
+
165
+
166
T = TypeVar("T")


def cast_elements_to_arrow_type(
    *,
    types: Sequence[T],
    target_types: Sequence[ArgumentType],
    cast_fn: Callable[[T, pa.DataType], T],
    extract_dtype: Callable[[T], pa.DataType | CallbackType],
    lit: Callable[[Any, pa.DataType], T],
) -> list[T]:
    """
    Align each operand with the type its overload slot expects.

    Operands and targets are paired positionally. For every pair:

    * callback and DataFrame-parameter targets leave the operand untouched;
    * an operand whose dtype is Arrow ``null`` is replaced by ``lit(None, target)``;
    * an operand whose dtype already equals the target is kept as is;
    * anything else is passed through ``cast_fn(operand, target)``.

    Args:
        types: The operands to align (despite the name, these are the elements
            themselves, not their dtypes).
        target_types: The expected type for each operand, same length as ``types``.
        cast_fn: Builds a casted version of an operand.
        extract_dtype: Returns the dtype of an operand.
        lit: Builds a literal operand of the given type.

    Raises:
        ValueError: If ``types`` and ``target_types`` differ in length.
    """
    if len(types) != len(target_types):
        raise ValueError(
            f"Length of types and target_types must be equal. Got {len(types)} input types and {len(target_types)} target types"
        )

    aligned: list[T] = []
    for operand, target in zip(types, target_types):
        # Callback and DataFrame parameter arguments cannot be promoted.
        if isinstance(target, (CallbackType, DataFrameParameterType)):
            aligned.append(operand)
            continue

        # Null inputs are always replaced by the literal null of the target type.
        if extract_dtype(operand) == pa.null():
            aligned.append(lit(None, target))
            continue

        # Keep the operand when it already has the overload's type; otherwise an
        # implicit cast is required before the function is called.
        already_matches = extract_dtype(operand) == target
        aligned.append(operand if already_matches else cast_fn(operand, target))

    return aligned
201
+
202
+
203
def promote_types_from_arrow(
    types: Sequence[pa.DataType],
    minimum_type: pa.DataType | None,
) -> pa.DataType:
    """
    Return a single Arrow type that all of ``types`` can be promoted to.

    ``minimum_type``, when given, takes part in the decision as an extra operand.
    The rules are tried in this order:

    1. exactly one distinct non-null type -> that type
    2. exactly one operand in total -> that operand
    3. all fixed-size lists -> fixed-size list of the promoted element type, using
       the first operand's list size
    4. all list-like types -> ``large_list`` of the promoted element type
    5. all structs -> struct with per-field promoted types
    6. any tz-naive microsecond timestamp -> ValueError
    7. all strings -> ``large_utf8``; strings mixed with anything else -> ValueError
    8. all binaries -> ``large_binary``; binaries mixed with anything else -> ValueError
    9. all booleans -> ``bool_``
    10. all dates -> ``date64``; dates mixed with anything else -> ValueError
    11. all UTC timestamps -> UTC timestamp; mixed with anything else -> ValueError
    12. otherwise -> ``most_precise_numeric_type_from_arrow``

    Raises:
        ValueError: If there is nothing to promote (no operands and no
            ``minimum_type``), or if the operands cannot be combined.
    """
    types_without_minimum = types
    types_with_minimum: list[pa.DataType] = list(types)
    if minimum_type is not None:
        types_with_minimum.append(minimum_type)
    # Remove the ambiguous name so the code below must pick one of the two views explicitly.
    del types

    if not types_with_minimum:
        # Without this guard the vacuous `all(...)` in the fixed-size-list check
        # succeeds for an empty input, and the list helper calls back into this
        # function with an empty list again, recursing until RecursionError.
        raise ValueError("Cannot promote an empty sequence of types")

    non_null_underlying_input_types = list(OrderedSet([x for x in types_with_minimum if not pa.types.is_null(x)]))

    if len(non_null_underlying_input_types) == 1:
        return non_null_underlying_input_types[0]
    if len(types_with_minimum) == 1:
        return types_with_minimum[0]
    if all(pa.types.is_fixed_size_list(t) for t in types_with_minimum):
        most_precise_list_type = _most_precise_type_lists_arrow(
            types=types_without_minimum,
            minimum_type=minimum_type,
            enforce_equal_list_size=True,
        )
        list_size = cast(pa.FixedSizeListType, types_with_minimum[0]).list_size
        return pa.list_(most_precise_list_type, list_size)
    if all(
        pa.types.is_list(t) or pa.types.is_large_list(t) or pa.types.is_fixed_size_list(t) for t in types_with_minimum
    ):
        most_precise_list_type = _most_precise_type_lists_arrow(
            types=types_without_minimum,
            minimum_type=minimum_type,
            enforce_equal_list_size=False,
        )
        return pa.large_list(most_precise_list_type)
    if all(pa.types.is_struct(t) for t in types_with_minimum):
        return _most_precise_type_structs_arrow(
            types=types_without_minimum,
            minimum_type=minimum_type,
            enforce_complete_structs=False,
        )

    if any(t == datetime_type_no_tz for t in types_with_minimum):
        raise ValueError("UTC Timezone must be specified on your timestamp objects")

    if all(pa.types.is_string(t) or pa.types.is_large_string(t) for t in types_with_minimum):
        return pa.large_utf8()
    elif any(pa.types.is_string(t) or pa.types.is_large_string(t) for t in types_with_minimum):
        other_types = [t for t in types_with_minimum if not pa.types.is_string(t) and not pa.types.is_large_string(t)]
        raise ValueError(f"Cannot implicitly convert between string and other type(s): {other_types}")

    if all(pa.types.is_binary(t) or pa.types.is_large_binary(t) for t in types_with_minimum):
        return pa.large_binary()
    elif any(pa.types.is_binary(t) or pa.types.is_large_binary(t) for t in types_with_minimum):
        other_types = [t for t in types_with_minimum if not pa.types.is_binary(t) and not pa.types.is_large_binary(t)]
        raise ValueError(f"Cannot implicitly convert between binary and other type(s): {other_types}")

    if all(pa.types.is_boolean(t) for t in types_with_minimum):
        return pa.bool_()

    if all(t in pa_date_types.values() for t in types_with_minimum):
        # same as chalk/features/_encoding/pyarrow.py::rich_to_pyarrow date
        return pa.date64()
    elif any(t in pa_date_types.values() for t in types_with_minimum):
        raise ValueError("Only subtraction is supported on date types")

    if all(t == datetime_type for t in types_with_minimum):
        return datetime_type
    elif any(t == datetime_type for t in types_with_minimum):
        raise ValueError("Only subtraction is supported on datetime types")

    # Everything else is expected to be numeric; the numeric helper validates that.
    return most_precise_numeric_type_from_arrow(
        types=types_without_minimum,
        minimum_type=minimum_type,
    )
275
+
276
+
277
def can_promote_by_casting(src: ArgumentType, target: ArgumentType) -> bool:
    """Return whether ``default_arrow_type_promoter`` reports that ``src`` can be promoted to ``target``."""
    # TODO inline?
    promoter = default_arrow_type_promoter
    return promoter.can_promote(from_type=src, to_type=target)