pixeltable 0.4.0rc3__py3-none-any.whl → 0.4.20__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pixeltable might be problematic. Click here for more details.
- pixeltable/__init__.py +23 -5
- pixeltable/_version.py +1 -0
- pixeltable/catalog/__init__.py +5 -3
- pixeltable/catalog/catalog.py +1318 -404
- pixeltable/catalog/column.py +186 -115
- pixeltable/catalog/dir.py +1 -2
- pixeltable/catalog/globals.py +11 -43
- pixeltable/catalog/insertable_table.py +167 -79
- pixeltable/catalog/path.py +61 -23
- pixeltable/catalog/schema_object.py +9 -10
- pixeltable/catalog/table.py +626 -308
- pixeltable/catalog/table_metadata.py +101 -0
- pixeltable/catalog/table_version.py +713 -569
- pixeltable/catalog/table_version_handle.py +37 -6
- pixeltable/catalog/table_version_path.py +42 -29
- pixeltable/catalog/tbl_ops.py +50 -0
- pixeltable/catalog/update_status.py +191 -0
- pixeltable/catalog/view.py +108 -94
- pixeltable/config.py +128 -22
- pixeltable/dataframe.py +188 -100
- pixeltable/env.py +407 -136
- pixeltable/exceptions.py +6 -0
- pixeltable/exec/__init__.py +3 -0
- pixeltable/exec/aggregation_node.py +7 -8
- pixeltable/exec/cache_prefetch_node.py +83 -110
- pixeltable/exec/cell_materialization_node.py +231 -0
- pixeltable/exec/cell_reconstruction_node.py +135 -0
- pixeltable/exec/component_iteration_node.py +4 -3
- pixeltable/exec/data_row_batch.py +8 -65
- pixeltable/exec/exec_context.py +16 -4
- pixeltable/exec/exec_node.py +13 -36
- pixeltable/exec/expr_eval/evaluators.py +7 -6
- pixeltable/exec/expr_eval/expr_eval_node.py +27 -12
- pixeltable/exec/expr_eval/globals.py +8 -5
- pixeltable/exec/expr_eval/row_buffer.py +1 -2
- pixeltable/exec/expr_eval/schedulers.py +190 -30
- pixeltable/exec/globals.py +32 -0
- pixeltable/exec/in_memory_data_node.py +18 -18
- pixeltable/exec/object_store_save_node.py +293 -0
- pixeltable/exec/row_update_node.py +16 -9
- pixeltable/exec/sql_node.py +206 -101
- pixeltable/exprs/__init__.py +1 -1
- pixeltable/exprs/arithmetic_expr.py +27 -22
- pixeltable/exprs/array_slice.py +3 -3
- pixeltable/exprs/column_property_ref.py +34 -30
- pixeltable/exprs/column_ref.py +92 -96
- pixeltable/exprs/comparison.py +5 -5
- pixeltable/exprs/compound_predicate.py +5 -4
- pixeltable/exprs/data_row.py +152 -55
- pixeltable/exprs/expr.py +62 -43
- pixeltable/exprs/expr_dict.py +3 -3
- pixeltable/exprs/expr_set.py +17 -10
- pixeltable/exprs/function_call.py +75 -37
- pixeltable/exprs/globals.py +1 -2
- pixeltable/exprs/in_predicate.py +4 -4
- pixeltable/exprs/inline_expr.py +10 -27
- pixeltable/exprs/is_null.py +1 -3
- pixeltable/exprs/json_mapper.py +8 -8
- pixeltable/exprs/json_path.py +56 -22
- pixeltable/exprs/literal.py +5 -5
- pixeltable/exprs/method_ref.py +2 -2
- pixeltable/exprs/object_ref.py +2 -2
- pixeltable/exprs/row_builder.py +127 -53
- pixeltable/exprs/rowid_ref.py +8 -12
- pixeltable/exprs/similarity_expr.py +50 -25
- pixeltable/exprs/sql_element_cache.py +4 -4
- pixeltable/exprs/string_op.py +5 -5
- pixeltable/exprs/type_cast.py +3 -5
- pixeltable/func/__init__.py +1 -0
- pixeltable/func/aggregate_function.py +8 -8
- pixeltable/func/callable_function.py +9 -9
- pixeltable/func/expr_template_function.py +10 -10
- pixeltable/func/function.py +18 -20
- pixeltable/func/function_registry.py +6 -7
- pixeltable/func/globals.py +2 -3
- pixeltable/func/mcp.py +74 -0
- pixeltable/func/query_template_function.py +20 -18
- pixeltable/func/signature.py +43 -16
- pixeltable/func/tools.py +23 -13
- pixeltable/func/udf.py +18 -20
- pixeltable/functions/__init__.py +6 -0
- pixeltable/functions/anthropic.py +93 -33
- pixeltable/functions/audio.py +114 -10
- pixeltable/functions/bedrock.py +13 -6
- pixeltable/functions/date.py +1 -1
- pixeltable/functions/deepseek.py +20 -9
- pixeltable/functions/fireworks.py +2 -2
- pixeltable/functions/gemini.py +28 -11
- pixeltable/functions/globals.py +13 -13
- pixeltable/functions/groq.py +108 -0
- pixeltable/functions/huggingface.py +1046 -23
- pixeltable/functions/image.py +9 -18
- pixeltable/functions/llama_cpp.py +23 -8
- pixeltable/functions/math.py +3 -4
- pixeltable/functions/mistralai.py +4 -15
- pixeltable/functions/ollama.py +16 -9
- pixeltable/functions/openai.py +104 -82
- pixeltable/functions/openrouter.py +143 -0
- pixeltable/functions/replicate.py +2 -2
- pixeltable/functions/reve.py +250 -0
- pixeltable/functions/string.py +21 -28
- pixeltable/functions/timestamp.py +13 -14
- pixeltable/functions/together.py +4 -6
- pixeltable/functions/twelvelabs.py +92 -0
- pixeltable/functions/util.py +6 -1
- pixeltable/functions/video.py +1388 -106
- pixeltable/functions/vision.py +7 -7
- pixeltable/functions/whisper.py +15 -7
- pixeltable/functions/whisperx.py +179 -0
- pixeltable/{ext/functions → functions}/yolox.py +2 -4
- pixeltable/globals.py +332 -105
- pixeltable/index/base.py +13 -22
- pixeltable/index/btree.py +23 -22
- pixeltable/index/embedding_index.py +32 -44
- pixeltable/io/__init__.py +4 -2
- pixeltable/io/datarows.py +7 -6
- pixeltable/io/external_store.py +49 -77
- pixeltable/io/fiftyone.py +11 -11
- pixeltable/io/globals.py +29 -28
- pixeltable/io/hf_datasets.py +17 -9
- pixeltable/io/label_studio.py +70 -66
- pixeltable/io/lancedb.py +3 -0
- pixeltable/io/pandas.py +12 -11
- pixeltable/io/parquet.py +13 -93
- pixeltable/io/table_data_conduit.py +71 -47
- pixeltable/io/utils.py +3 -3
- pixeltable/iterators/__init__.py +2 -1
- pixeltable/iterators/audio.py +21 -11
- pixeltable/iterators/document.py +116 -55
- pixeltable/iterators/image.py +5 -2
- pixeltable/iterators/video.py +293 -13
- pixeltable/metadata/__init__.py +4 -2
- pixeltable/metadata/converters/convert_18.py +2 -2
- pixeltable/metadata/converters/convert_19.py +2 -2
- pixeltable/metadata/converters/convert_20.py +2 -2
- pixeltable/metadata/converters/convert_21.py +2 -2
- pixeltable/metadata/converters/convert_22.py +2 -2
- pixeltable/metadata/converters/convert_24.py +2 -2
- pixeltable/metadata/converters/convert_25.py +2 -2
- pixeltable/metadata/converters/convert_26.py +2 -2
- pixeltable/metadata/converters/convert_29.py +4 -4
- pixeltable/metadata/converters/convert_34.py +2 -2
- pixeltable/metadata/converters/convert_36.py +2 -2
- pixeltable/metadata/converters/convert_37.py +15 -0
- pixeltable/metadata/converters/convert_38.py +39 -0
- pixeltable/metadata/converters/convert_39.py +124 -0
- pixeltable/metadata/converters/convert_40.py +73 -0
- pixeltable/metadata/converters/util.py +13 -12
- pixeltable/metadata/notes.py +4 -0
- pixeltable/metadata/schema.py +79 -42
- pixeltable/metadata/utils.py +74 -0
- pixeltable/mypy/__init__.py +3 -0
- pixeltable/mypy/mypy_plugin.py +123 -0
- pixeltable/plan.py +274 -223
- pixeltable/share/__init__.py +1 -1
- pixeltable/share/packager.py +259 -129
- pixeltable/share/protocol/__init__.py +34 -0
- pixeltable/share/protocol/common.py +170 -0
- pixeltable/share/protocol/operation_types.py +33 -0
- pixeltable/share/protocol/replica.py +109 -0
- pixeltable/share/publish.py +213 -57
- pixeltable/store.py +238 -175
- pixeltable/type_system.py +104 -63
- pixeltable/utils/__init__.py +2 -3
- pixeltable/utils/arrow.py +108 -13
- pixeltable/utils/av.py +298 -0
- pixeltable/utils/azure_store.py +305 -0
- pixeltable/utils/code.py +3 -3
- pixeltable/utils/console_output.py +4 -1
- pixeltable/utils/coroutine.py +6 -23
- pixeltable/utils/dbms.py +31 -5
- pixeltable/utils/description_helper.py +4 -5
- pixeltable/utils/documents.py +5 -6
- pixeltable/utils/exception_handler.py +7 -30
- pixeltable/utils/filecache.py +6 -6
- pixeltable/utils/formatter.py +4 -6
- pixeltable/utils/gcs_store.py +283 -0
- pixeltable/utils/http_server.py +2 -3
- pixeltable/utils/iceberg.py +1 -2
- pixeltable/utils/image.py +17 -0
- pixeltable/utils/lancedb.py +88 -0
- pixeltable/utils/local_store.py +316 -0
- pixeltable/utils/misc.py +5 -0
- pixeltable/utils/object_stores.py +528 -0
- pixeltable/utils/pydantic.py +60 -0
- pixeltable/utils/pytorch.py +5 -6
- pixeltable/utils/s3_store.py +392 -0
- pixeltable-0.4.20.dist-info/METADATA +587 -0
- pixeltable-0.4.20.dist-info/RECORD +218 -0
- {pixeltable-0.4.0rc3.dist-info → pixeltable-0.4.20.dist-info}/WHEEL +1 -1
- pixeltable-0.4.20.dist-info/entry_points.txt +2 -0
- pixeltable/__version__.py +0 -3
- pixeltable/ext/__init__.py +0 -17
- pixeltable/ext/functions/__init__.py +0 -11
- pixeltable/ext/functions/whisperx.py +0 -77
- pixeltable/utils/media_store.py +0 -77
- pixeltable/utils/s3.py +0 -17
- pixeltable/utils/sample.py +0 -25
- pixeltable-0.4.0rc3.dist-info/METADATA +0 -435
- pixeltable-0.4.0rc3.dist-info/RECORD +0 -189
- pixeltable-0.4.0rc3.dist-info/entry_points.txt +0 -3
- {pixeltable-0.4.0rc3.dist-info → pixeltable-0.4.20.dist-info/licenses}/LICENSE +0 -0
pixeltable/type_system.py
CHANGED
|
@@ -5,10 +5,14 @@ import datetime
|
|
|
5
5
|
import enum
|
|
6
6
|
import io
|
|
7
7
|
import json
|
|
8
|
+
import types
|
|
8
9
|
import typing
|
|
9
10
|
import urllib.parse
|
|
10
11
|
import urllib.request
|
|
11
|
-
from
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
from typing import Any, ClassVar, Iterable, Literal, Mapping, Sequence, Union
|
|
14
|
+
|
|
15
|
+
from typing import _GenericAlias # type: ignore[attr-defined] # isort: skip
|
|
12
16
|
|
|
13
17
|
import av
|
|
14
18
|
import jsonschema
|
|
@@ -21,10 +25,9 @@ import sqlalchemy as sql
|
|
|
21
25
|
from typing_extensions import _AnnotatedAlias
|
|
22
26
|
|
|
23
27
|
import pixeltable.exceptions as excs
|
|
28
|
+
from pixeltable.env import Env
|
|
24
29
|
from pixeltable.utils import parse_local_file_path
|
|
25
30
|
|
|
26
|
-
from typing import _GenericAlias # type: ignore[attr-defined] # isort: skip
|
|
27
|
-
|
|
28
31
|
|
|
29
32
|
class ColumnType:
|
|
30
33
|
@enum.unique
|
|
@@ -48,11 +51,11 @@ class ColumnType:
|
|
|
48
51
|
@classmethod
|
|
49
52
|
def supertype(
|
|
50
53
|
cls,
|
|
51
|
-
type1:
|
|
52
|
-
type2:
|
|
54
|
+
type1: 'ColumnType.Type' | None,
|
|
55
|
+
type2: 'ColumnType.Type' | None,
|
|
53
56
|
# we need to pass this in because we can't easily append it as a class member
|
|
54
57
|
common_supertypes: dict[tuple['ColumnType.Type', 'ColumnType.Type'], 'ColumnType.Type'],
|
|
55
|
-
) ->
|
|
58
|
+
) -> 'ColumnType.Type' | None:
|
|
56
59
|
if type1 == type2:
|
|
57
60
|
return type1
|
|
58
61
|
t = common_supertypes.get((type1, type2))
|
|
@@ -185,7 +188,7 @@ class ColumnType:
|
|
|
185
188
|
if as_schema:
|
|
186
189
|
return base_str if self.nullable else f'Required[{base_str}]'
|
|
187
190
|
else:
|
|
188
|
-
return f'
|
|
191
|
+
return f'{base_str} | None' if self.nullable else base_str
|
|
189
192
|
|
|
190
193
|
def _to_base_str(self) -> str:
|
|
191
194
|
"""
|
|
@@ -214,7 +217,7 @@ class ColumnType:
|
|
|
214
217
|
# Default: just compare base types (this works for all types whose only parameter is nullable)
|
|
215
218
|
return self._type == other._type
|
|
216
219
|
|
|
217
|
-
def supertype(self, other: ColumnType) ->
|
|
220
|
+
def supertype(self, other: ColumnType) -> ColumnType | None:
|
|
218
221
|
if self == other:
|
|
219
222
|
return self
|
|
220
223
|
if self.matches(other):
|
|
@@ -234,7 +237,7 @@ class ColumnType:
|
|
|
234
237
|
return None
|
|
235
238
|
|
|
236
239
|
@classmethod
|
|
237
|
-
def infer_literal_type(cls, val: Any, nullable: bool = False) ->
|
|
240
|
+
def infer_literal_type(cls, val: Any, nullable: bool = False) -> ColumnType | None:
|
|
238
241
|
if val is None:
|
|
239
242
|
return InvalidType(nullable=True)
|
|
240
243
|
if isinstance(val, str):
|
|
@@ -268,7 +271,7 @@ class ColumnType:
|
|
|
268
271
|
return None
|
|
269
272
|
|
|
270
273
|
@classmethod
|
|
271
|
-
def infer_common_literal_type(cls, vals: Iterable[Any]) ->
|
|
274
|
+
def infer_common_literal_type(cls, vals: Iterable[Any]) -> ColumnType | None:
|
|
272
275
|
"""
|
|
273
276
|
Returns the most specific type that is a supertype of all literals in `vals`. If no such type
|
|
274
277
|
exists, returns None.
|
|
@@ -276,7 +279,7 @@ class ColumnType:
|
|
|
276
279
|
Args:
|
|
277
280
|
vals: A collection of literals.
|
|
278
281
|
"""
|
|
279
|
-
inferred_type:
|
|
282
|
+
inferred_type: ColumnType | None = None
|
|
280
283
|
for val in vals:
|
|
281
284
|
val_type = cls.infer_literal_type(val)
|
|
282
285
|
if inferred_type is None:
|
|
@@ -291,8 +294,12 @@ class ColumnType:
|
|
|
291
294
|
|
|
292
295
|
@classmethod
|
|
293
296
|
def from_python_type(
|
|
294
|
-
cls,
|
|
295
|
-
|
|
297
|
+
cls,
|
|
298
|
+
t: type | _GenericAlias,
|
|
299
|
+
nullable_default: bool = False,
|
|
300
|
+
allow_builtin_types: bool = True,
|
|
301
|
+
infer_pydantic_json: bool = False,
|
|
302
|
+
) -> ColumnType | None:
|
|
296
303
|
"""
|
|
297
304
|
Convert a Python type into a Pixeltable `ColumnType` instance.
|
|
298
305
|
|
|
@@ -304,16 +311,20 @@ class ColumnType:
|
|
|
304
311
|
allowed (as in UDF definitions). If False, then only Pixeltable types such as `pxt.String`,
|
|
305
312
|
`pxt.Int`, etc., will be allowed (as in schema definitions). `Optional` and `Required`
|
|
306
313
|
designations will be allowed regardless.
|
|
314
|
+
infer_pydantic_json: If True, accepts an extended set of built-ins (eg, Enum, Path) and returns the type to
|
|
315
|
+
which pydantic.BaseModel.model_dump(mode='json') serializes it.
|
|
307
316
|
"""
|
|
308
317
|
origin = typing.get_origin(t)
|
|
309
318
|
type_args = typing.get_args(t)
|
|
310
|
-
if origin
|
|
311
|
-
# Check if `t` has the form
|
|
319
|
+
if origin in (typing.Union, types.UnionType):
|
|
320
|
+
# Check if `t` has the form T | None.
|
|
312
321
|
if len(type_args) == 2 and type(None) in type_args:
|
|
313
|
-
# `t` is a type of the form
|
|
322
|
+
# `t` is a type of the form T | None (equivalently, T | None or None | T).
|
|
314
323
|
# We treat it as the underlying type but with nullable=True.
|
|
315
324
|
underlying_py_type = type_args[0] if type_args[1] is type(None) else type_args[1]
|
|
316
|
-
underlying = cls.from_python_type(
|
|
325
|
+
underlying = cls.from_python_type(
|
|
326
|
+
underlying_py_type, allow_builtin_types=allow_builtin_types, infer_pydantic_json=infer_pydantic_json
|
|
327
|
+
)
|
|
317
328
|
if underlying is not None:
|
|
318
329
|
return underlying.copy(nullable=True)
|
|
319
330
|
elif origin is Required:
|
|
@@ -327,7 +338,7 @@ class ColumnType:
|
|
|
327
338
|
if isinstance(parameters, ColumnType):
|
|
328
339
|
return parameters.copy(nullable=nullable_default)
|
|
329
340
|
else:
|
|
330
|
-
# It's something other than
|
|
341
|
+
# It's something other than T | None, Required[T], or an explicitly annotated type.
|
|
331
342
|
if origin is not None:
|
|
332
343
|
# Discard type parameters to ensure that parameterized types such as `list[T]`
|
|
333
344
|
# are correctly mapped to Pixeltable types.
|
|
@@ -340,6 +351,13 @@ class ColumnType:
|
|
|
340
351
|
if literal_type is None:
|
|
341
352
|
return None
|
|
342
353
|
return literal_type.copy(nullable=(literal_type.nullable or nullable_default))
|
|
354
|
+
if infer_pydantic_json and isinstance(t, type) and issubclass(t, enum.Enum):
|
|
355
|
+
literal_type = cls.infer_common_literal_type(member.value for member in t)
|
|
356
|
+
if literal_type is None:
|
|
357
|
+
return None
|
|
358
|
+
return literal_type.copy(nullable=(literal_type.nullable or nullable_default))
|
|
359
|
+
if infer_pydantic_json and t is Path:
|
|
360
|
+
return StringType(nullable=nullable_default)
|
|
343
361
|
if t is str:
|
|
344
362
|
return StringType(nullable=nullable_default)
|
|
345
363
|
if t is int:
|
|
@@ -360,10 +378,7 @@ class ColumnType:
|
|
|
360
378
|
|
|
361
379
|
@classmethod
|
|
362
380
|
def normalize_type(
|
|
363
|
-
cls,
|
|
364
|
-
t: Union[ColumnType, type, _AnnotatedAlias],
|
|
365
|
-
nullable_default: bool = False,
|
|
366
|
-
allow_builtin_types: bool = True,
|
|
381
|
+
cls, t: ColumnType | type | _AnnotatedAlias, nullable_default: bool = False, allow_builtin_types: bool = True
|
|
367
382
|
) -> ColumnType:
|
|
368
383
|
"""
|
|
369
384
|
Convert any type recognizable by Pixeltable to its corresponding ColumnType.
|
|
@@ -388,13 +403,43 @@ class ColumnType:
|
|
|
388
403
|
]
|
|
389
404
|
|
|
390
405
|
@classmethod
|
|
391
|
-
def __raise_exc_for_invalid_type(cls, t:
|
|
406
|
+
def __raise_exc_for_invalid_type(cls, t: type | _AnnotatedAlias) -> None:
|
|
392
407
|
for builtin_type, suggestion in cls.__TYPE_SUGGESTIONS:
|
|
393
408
|
if t is builtin_type or (isinstance(t, type) and issubclass(t, builtin_type)):
|
|
394
409
|
name = t.__name__ if t.__module__ == 'builtins' else f'{t.__module__}.{t.__name__}'
|
|
395
410
|
raise excs.Error(f'Standard Python type `{name}` cannot be used here; use `{suggestion}` instead')
|
|
396
411
|
raise excs.Error(f'Unknown type: {t}')
|
|
397
412
|
|
|
413
|
+
@classmethod
|
|
414
|
+
def from_json_schema(cls, schema: dict[str, Any]) -> ColumnType | None:
|
|
415
|
+
# We first express the JSON schema as a Python type, and then convert it to a Pixeltable type.
|
|
416
|
+
# TODO: Is there a meaningful fallback if one of these operations fails? (Maybe another use case for a pxt Any
|
|
417
|
+
# type?)
|
|
418
|
+
py_type = cls.__json_schema_to_py_type(schema)
|
|
419
|
+
return cls.from_python_type(py_type) if py_type is not None else None
|
|
420
|
+
|
|
421
|
+
@classmethod
|
|
422
|
+
def __json_schema_to_py_type(cls, schema: dict[str, Any]) -> type | _GenericAlias | None:
|
|
423
|
+
if 'type' in schema:
|
|
424
|
+
if schema['type'] == 'null':
|
|
425
|
+
return type(None)
|
|
426
|
+
if schema['type'] == 'string':
|
|
427
|
+
return str
|
|
428
|
+
if schema['type'] == 'integer':
|
|
429
|
+
return int
|
|
430
|
+
if schema['type'] == 'number':
|
|
431
|
+
return float
|
|
432
|
+
if schema['type'] == 'boolean':
|
|
433
|
+
return bool
|
|
434
|
+
if schema['type'] in ('array', 'object'):
|
|
435
|
+
return list
|
|
436
|
+
elif 'anyOf' in schema:
|
|
437
|
+
subscripts = tuple(cls.__json_schema_to_py_type(subschema) for subschema in schema['anyOf'])
|
|
438
|
+
if all(subscript is not None for subscript in subscripts):
|
|
439
|
+
return Union[subscripts]
|
|
440
|
+
|
|
441
|
+
return None
|
|
442
|
+
|
|
398
443
|
def validate_literal(self, val: Any) -> None:
|
|
399
444
|
"""Raise TypeError if val is not a valid literal for this type"""
|
|
400
445
|
if val is None:
|
|
@@ -629,8 +674,9 @@ class TimestampType(ColumnType):
|
|
|
629
674
|
def _create_literal(self, val: Any) -> Any:
|
|
630
675
|
if isinstance(val, str):
|
|
631
676
|
return datetime.datetime.fromisoformat(val)
|
|
632
|
-
|
|
633
|
-
|
|
677
|
+
# Place naive timestamps in the default time zone
|
|
678
|
+
if isinstance(val, datetime.datetime) and val.tzinfo is None:
|
|
679
|
+
return val.replace(tzinfo=Env.get().default_time_zone)
|
|
634
680
|
return val
|
|
635
681
|
|
|
636
682
|
|
|
@@ -658,10 +704,10 @@ class DateType(ColumnType):
|
|
|
658
704
|
|
|
659
705
|
|
|
660
706
|
class JsonType(ColumnType):
|
|
661
|
-
json_schema:
|
|
662
|
-
__validator:
|
|
707
|
+
json_schema: dict[str, Any] | None
|
|
708
|
+
__validator: jsonschema.protocols.Validator | None
|
|
663
709
|
|
|
664
|
-
def __init__(self, json_schema:
|
|
710
|
+
def __init__(self, json_schema: dict[str, Any] | None = None, nullable: bool = False):
|
|
665
711
|
super().__init__(self.Type.JSON, nullable=nullable)
|
|
666
712
|
self.json_schema = json_schema
|
|
667
713
|
if json_schema is None:
|
|
@@ -716,7 +762,7 @@ class JsonType(ColumnType):
|
|
|
716
762
|
|
|
717
763
|
@classmethod
|
|
718
764
|
def __is_valid_json(cls, val: Any) -> bool:
|
|
719
|
-
if val is None or isinstance(val, (str, int, float, bool)):
|
|
765
|
+
if val is None or isinstance(val, (str, int, float, bool, np.ndarray, PIL.Image.Image)):
|
|
720
766
|
return True
|
|
721
767
|
if isinstance(val, (list, tuple)):
|
|
722
768
|
return all(cls.__is_valid_json(v) for v in val)
|
|
@@ -731,7 +777,7 @@ class JsonType(ColumnType):
|
|
|
731
777
|
return val.model_dump()
|
|
732
778
|
return val
|
|
733
779
|
|
|
734
|
-
def supertype(self, other: ColumnType) ->
|
|
780
|
+
def supertype(self, other: ColumnType) -> JsonType | None:
|
|
735
781
|
# Try using the (much faster) supertype logic in ColumnType first. That will work if, for example, the types
|
|
736
782
|
# are identical except for nullability. If that doesn't work and both types are JsonType, then we will need to
|
|
737
783
|
# merge their schemas.
|
|
@@ -753,7 +799,7 @@ class JsonType(ColumnType):
|
|
|
753
799
|
)
|
|
754
800
|
|
|
755
801
|
@classmethod
|
|
756
|
-
def __superschema(cls, a: dict[str, Any], b: dict[str, Any]) ->
|
|
802
|
+
def __superschema(cls, a: dict[str, Any], b: dict[str, Any]) -> dict[str, Any] | None:
|
|
757
803
|
# Defining a general type hierarchy over all JSON schemas would be a challenging problem. In order to keep
|
|
758
804
|
# things manageable, we only define a hierarchy among "conforming" schemas, which provides enough generality
|
|
759
805
|
# for the most important use cases (unions for type inference, validation of inline exprs). A schema is
|
|
@@ -813,7 +859,7 @@ class JsonType(ColumnType):
|
|
|
813
859
|
return {} # Unresolvable type conflict; the supertype is an unrestricted JsonType.
|
|
814
860
|
|
|
815
861
|
@classmethod
|
|
816
|
-
def __superschema_with_nulls(cls, a: dict[str, Any], b: dict[str, Any]) ->
|
|
862
|
+
def __superschema_with_nulls(cls, a: dict[str, Any], b: dict[str, Any]) -> dict[str, Any] | None:
|
|
817
863
|
a, a_nullable = cls.__unpack_null_from_schema(a)
|
|
818
864
|
b, b_nullable = cls.__unpack_null_from_schema(b)
|
|
819
865
|
|
|
@@ -842,15 +888,12 @@ class JsonType(ColumnType):
|
|
|
842
888
|
|
|
843
889
|
|
|
844
890
|
class ArrayType(ColumnType):
|
|
845
|
-
shape:
|
|
846
|
-
pxt_dtype:
|
|
847
|
-
dtype:
|
|
891
|
+
shape: tuple[int | None, ...] | None
|
|
892
|
+
pxt_dtype: ColumnType | None
|
|
893
|
+
dtype: ColumnType.Type | None
|
|
848
894
|
|
|
849
895
|
def __init__(
|
|
850
|
-
self,
|
|
851
|
-
shape: Optional[tuple[Optional[int], ...]] = None,
|
|
852
|
-
dtype: Optional[ColumnType] = None,
|
|
853
|
-
nullable: bool = False,
|
|
896
|
+
self, shape: tuple[int | None, ...] | None = None, dtype: ColumnType | None = None, nullable: bool = False
|
|
854
897
|
):
|
|
855
898
|
super().__init__(self.Type.ARRAY, nullable=nullable)
|
|
856
899
|
assert shape is None or dtype is not None, (shape, dtype) # cannot specify a shape without a dtype
|
|
@@ -875,7 +918,7 @@ class ArrayType(ColumnType):
|
|
|
875
918
|
def __hash__(self) -> int:
|
|
876
919
|
return hash((self._type, self.nullable, self.shape, self.dtype))
|
|
877
920
|
|
|
878
|
-
def supertype(self, other: ColumnType) ->
|
|
921
|
+
def supertype(self, other: ColumnType) -> ArrayType | None:
|
|
879
922
|
basic_supertype = super().supertype(other)
|
|
880
923
|
if basic_supertype is not None:
|
|
881
924
|
assert isinstance(basic_supertype, ArrayType)
|
|
@@ -888,7 +931,7 @@ class ArrayType(ColumnType):
|
|
|
888
931
|
if super_dtype is None:
|
|
889
932
|
# if the dtypes are incompatible, then the supertype is a fully general array
|
|
890
933
|
return ArrayType(nullable=(self.nullable or other.nullable))
|
|
891
|
-
super_shape:
|
|
934
|
+
super_shape: tuple[int | None, ...] | None
|
|
892
935
|
if self.shape is None or other.shape is None or len(self.shape) != len(other.shape):
|
|
893
936
|
super_shape = None
|
|
894
937
|
else:
|
|
@@ -919,7 +962,7 @@ class ArrayType(ColumnType):
|
|
|
919
962
|
return cls(shape, dtype, nullable=d['nullable'])
|
|
920
963
|
|
|
921
964
|
@classmethod
|
|
922
|
-
def from_np_dtype(cls, dtype: np.dtype, nullable: bool) ->
|
|
965
|
+
def from_np_dtype(cls, dtype: np.dtype, nullable: bool) -> ColumnType | None:
|
|
923
966
|
"""
|
|
924
967
|
Return pixeltable type corresponding to a given simple numpy dtype
|
|
925
968
|
"""
|
|
@@ -948,10 +991,10 @@ class ArrayType(ColumnType):
|
|
|
948
991
|
return None
|
|
949
992
|
|
|
950
993
|
@classmethod
|
|
951
|
-
def from_literal(cls, val: np.ndarray, nullable: bool = False) ->
|
|
994
|
+
def from_literal(cls, val: np.ndarray, nullable: bool = False) -> ArrayType | None:
|
|
952
995
|
# determine our dtype
|
|
953
996
|
assert isinstance(val, np.ndarray)
|
|
954
|
-
pxttype:
|
|
997
|
+
pxttype: ColumnType | None = cls.from_np_dtype(val.dtype, nullable)
|
|
955
998
|
if pxttype is None:
|
|
956
999
|
return None
|
|
957
1000
|
return cls(val.shape, dtype=pxttype, nullable=nullable)
|
|
@@ -1014,7 +1057,7 @@ class ArrayType(ColumnType):
|
|
|
1014
1057
|
def to_sa_type(cls) -> sql.types.TypeEngine:
|
|
1015
1058
|
return sql.LargeBinary()
|
|
1016
1059
|
|
|
1017
|
-
def numpy_dtype(self) ->
|
|
1060
|
+
def numpy_dtype(self) -> np.dtype | None:
|
|
1018
1061
|
if self.dtype is None:
|
|
1019
1062
|
return None
|
|
1020
1063
|
if self.dtype == self.Type.INT:
|
|
@@ -1031,15 +1074,13 @@ class ArrayType(ColumnType):
|
|
|
1031
1074
|
class ImageType(ColumnType):
|
|
1032
1075
|
def __init__(
|
|
1033
1076
|
self,
|
|
1034
|
-
width:
|
|
1035
|
-
height:
|
|
1036
|
-
size:
|
|
1037
|
-
mode:
|
|
1077
|
+
width: int | None = None,
|
|
1078
|
+
height: int | None = None,
|
|
1079
|
+
size: tuple[int, int] | None = None,
|
|
1080
|
+
mode: str | None = None,
|
|
1038
1081
|
nullable: bool = False,
|
|
1039
1082
|
):
|
|
1040
|
-
|
|
1041
|
-
TODO: does it make sense to specify only width or height?
|
|
1042
|
-
"""
|
|
1083
|
+
# TODO: does it make sense to specify only width or height?
|
|
1043
1084
|
super().__init__(self.Type.IMAGE, nullable=nullable)
|
|
1044
1085
|
assert not (width is not None and size is not None)
|
|
1045
1086
|
assert not (height is not None and size is not None)
|
|
@@ -1077,7 +1118,7 @@ class ImageType(ColumnType):
|
|
|
1077
1118
|
def __hash__(self) -> int:
|
|
1078
1119
|
return hash((self._type, self.nullable, self.size, self.mode))
|
|
1079
1120
|
|
|
1080
|
-
def supertype(self, other: ColumnType) ->
|
|
1121
|
+
def supertype(self, other: ColumnType) -> ImageType | None:
|
|
1081
1122
|
basic_supertype = super().supertype(other)
|
|
1082
1123
|
if basic_supertype is not None:
|
|
1083
1124
|
assert isinstance(basic_supertype, ImageType)
|
|
@@ -1092,7 +1133,7 @@ class ImageType(ColumnType):
|
|
|
1092
1133
|
return ImageType(width=width, height=height, mode=mode, nullable=(self.nullable or other.nullable))
|
|
1093
1134
|
|
|
1094
1135
|
@property
|
|
1095
|
-
def size(self) ->
|
|
1136
|
+
def size(self) -> tuple[int, int] | None:
|
|
1096
1137
|
if self.width is None or self.height is None:
|
|
1097
1138
|
return None
|
|
1098
1139
|
return (self.width, self.height)
|
|
@@ -1123,8 +1164,8 @@ class ImageType(ColumnType):
|
|
|
1123
1164
|
img.load()
|
|
1124
1165
|
return img
|
|
1125
1166
|
except Exception as exc:
|
|
1126
|
-
|
|
1127
|
-
raise excs.Error(f'data URL could not be decoded into a valid image: {
|
|
1167
|
+
error_msg_val = val if len(val) < 50 else val[:50] + '...'
|
|
1168
|
+
raise excs.Error(f'data URL could not be decoded into a valid image: {error_msg_val}') from exc
|
|
1128
1169
|
return val
|
|
1129
1170
|
|
|
1130
1171
|
def _validate_literal(self, val: Any) -> None:
|
|
@@ -1211,7 +1252,7 @@ class DocumentType(ColumnType):
|
|
|
1211
1252
|
TXT = 4
|
|
1212
1253
|
|
|
1213
1254
|
@classmethod
|
|
1214
|
-
def from_extension(cls, ext: str) ->
|
|
1255
|
+
def from_extension(cls, ext: str) -> 'DocumentType.DocumentFormat' | None:
|
|
1215
1256
|
if ext in ('.htm', '.html'):
|
|
1216
1257
|
return cls.HTML
|
|
1217
1258
|
if ext == '.md':
|
|
@@ -1224,7 +1265,7 @@ class DocumentType(ColumnType):
|
|
|
1224
1265
|
return cls.TXT
|
|
1225
1266
|
return None
|
|
1226
1267
|
|
|
1227
|
-
def __init__(self, nullable: bool = False, doc_formats:
|
|
1268
|
+
def __init__(self, nullable: bool = False, doc_formats: str | None = None):
|
|
1228
1269
|
super().__init__(self.Type.DOCUMENT, nullable=nullable)
|
|
1229
1270
|
self.doc_formats = doc_formats
|
|
1230
1271
|
if doc_formats is not None:
|
|
@@ -1321,13 +1362,13 @@ class Array(np.ndarray, _PxtType):
|
|
|
1321
1362
|
def __class_getitem__(cls, item: Any) -> _AnnotatedAlias:
|
|
1322
1363
|
"""
|
|
1323
1364
|
`item` (the type subscript) must be a tuple with exactly two elements (in any order):
|
|
1324
|
-
- A tuple of `
|
|
1365
|
+
- A tuple of `int | None`s, specifying the shape of the array
|
|
1325
1366
|
- A type, specifying the dtype of the array
|
|
1326
1367
|
Example: Array[(3, None, 2), pxt.Float]
|
|
1327
1368
|
"""
|
|
1328
1369
|
params = item if isinstance(item, tuple) else (item,)
|
|
1329
|
-
shape:
|
|
1330
|
-
dtype:
|
|
1370
|
+
shape: tuple | None = None
|
|
1371
|
+
dtype: ColumnType | None = None
|
|
1331
1372
|
if not any(isinstance(param, (type, _AnnotatedAlias)) for param in params):
|
|
1332
1373
|
raise TypeError('Array type parameter must include a dtype.')
|
|
1333
1374
|
for param in params:
|
|
@@ -1367,8 +1408,8 @@ class Image(PIL.Image.Image, _PxtType):
|
|
|
1367
1408
|
else:
|
|
1368
1409
|
# Not a tuple (single arg)
|
|
1369
1410
|
params = (item,)
|
|
1370
|
-
size:
|
|
1371
|
-
mode:
|
|
1411
|
+
size: tuple | None = None
|
|
1412
|
+
mode: str | None = None
|
|
1372
1413
|
for param in params:
|
|
1373
1414
|
if isinstance(param, tuple):
|
|
1374
1415
|
if (
|
pixeltable/utils/__init__.py
CHANGED
|
@@ -2,7 +2,6 @@ import hashlib
|
|
|
2
2
|
import urllib.parse
|
|
3
3
|
import urllib.request
|
|
4
4
|
from pathlib import Path
|
|
5
|
-
from typing import Optional, Union
|
|
6
5
|
|
|
7
6
|
|
|
8
7
|
def print_perf_counter_delta(delta: float) -> str:
|
|
@@ -24,7 +23,7 @@ def print_perf_counter_delta(delta: float) -> str:
|
|
|
24
23
|
return f'{delta:.2f} s'
|
|
25
24
|
|
|
26
25
|
|
|
27
|
-
def sha256sum(path:
|
|
26
|
+
def sha256sum(path: Path | str) -> str:
|
|
28
27
|
"""
|
|
29
28
|
Compute the SHA256 hash of a file.
|
|
30
29
|
"""
|
|
@@ -39,7 +38,7 @@ def sha256sum(path: Union[Path, str]) -> str:
|
|
|
39
38
|
return h.hexdigest()
|
|
40
39
|
|
|
41
40
|
|
|
42
|
-
def parse_local_file_path(file_or_url: str) ->
|
|
41
|
+
def parse_local_file_path(file_or_url: str) -> Path | None:
|
|
43
42
|
"""
|
|
44
43
|
Parses a string that may be either a URL or a local file path.
|
|
45
44
|
|
pixeltable/utils/arrow.py
CHANGED
|
@@ -1,15 +1,22 @@
|
|
|
1
1
|
import datetime
|
|
2
|
-
|
|
2
|
+
import io
|
|
3
|
+
import json
|
|
4
|
+
from typing import TYPE_CHECKING, Any, Iterator, cast
|
|
3
5
|
|
|
4
6
|
import numpy as np
|
|
7
|
+
import PIL.Image
|
|
5
8
|
import pyarrow as pa
|
|
6
9
|
|
|
10
|
+
import pixeltable.exceptions as excs
|
|
7
11
|
import pixeltable.type_system as ts
|
|
8
12
|
|
|
13
|
+
if TYPE_CHECKING:
|
|
14
|
+
import pixeltable as pxt
|
|
15
|
+
|
|
9
16
|
PA_TO_PXT_TYPES: dict[pa.DataType, ts.ColumnType] = {
|
|
10
17
|
pa.string(): ts.StringType(nullable=True),
|
|
11
18
|
pa.large_string(): ts.StringType(nullable=True),
|
|
12
|
-
pa.timestamp('us', tz=
|
|
19
|
+
pa.timestamp('us', tz='UTC'): ts.TimestampType(nullable=True),
|
|
13
20
|
pa.bool_(): ts.BoolType(nullable=True),
|
|
14
21
|
pa.int8(): ts.IntType(nullable=True),
|
|
15
22
|
pa.int16(): ts.IntType(nullable=True),
|
|
@@ -28,7 +35,7 @@ PA_TO_PXT_TYPES: dict[pa.DataType, ts.ColumnType] = {
|
|
|
28
35
|
|
|
29
36
|
PXT_TO_PA_TYPES: dict[type[ts.ColumnType], pa.DataType] = {
|
|
30
37
|
ts.StringType: pa.string(),
|
|
31
|
-
ts.TimestampType: pa.timestamp('us', tz=
|
|
38
|
+
ts.TimestampType: pa.timestamp('us', tz='UTC'), # postgres timestamp is microseconds
|
|
32
39
|
ts.DateType: pa.date32(), # This could be date64
|
|
33
40
|
ts.BoolType: pa.bool_(),
|
|
34
41
|
ts.IntType: pa.int64(),
|
|
@@ -41,7 +48,7 @@ PXT_TO_PA_TYPES: dict[type[ts.ColumnType], pa.DataType] = {
|
|
|
41
48
|
}
|
|
42
49
|
|
|
43
50
|
|
|
44
|
-
def to_pixeltable_type(arrow_type: pa.DataType, nullable: bool) -> Optional[ts.ColumnType]:
|
|
51
|
+
def to_pixeltable_type(arrow_type: pa.DataType, nullable: bool) -> ts.ColumnType | None:
|
|
45
52
|
"""Convert a pyarrow DataType to a pixeltable ColumnType if one is defined.
|
|
46
53
|
Returns None if no conversion is currently implemented.
|
|
47
54
|
"""
|
|
@@ -54,12 +61,12 @@ def to_pixeltable_type(arrow_type: pa.DataType, nullable: bool) -> Optional[ts.C
|
|
|
54
61
|
dtype = to_pixeltable_type(arrow_type.value_type, nullable)
|
|
55
62
|
if dtype is None:
|
|
56
63
|
return None
|
|
57
|
-
return ts.ArrayType(shape=arrow_type.shape, dtype=dtype, nullable=nullable)
|
|
64
|
+
return ts.ArrayType(shape=tuple(arrow_type.shape), dtype=dtype, nullable=nullable)
|
|
58
65
|
else:
|
|
59
66
|
return None
|
|
60
67
|
|
|
61
68
|
|
|
62
|
-
def to_arrow_type(pixeltable_type: ts.ColumnType) -> Optional[pa.DataType]:
|
|
69
|
+
def to_arrow_type(pixeltable_type: ts.ColumnType) -> pa.DataType | None:
|
|
63
70
|
"""Convert a pixeltable DataType to a pyarrow datatype if one is defined.
|
|
64
71
|
Returns None if no conversion is currently implemented.
|
|
65
72
|
"""
|
|
@@ -71,7 +78,7 @@ def to_arrow_type(pixeltable_type: ts.ColumnType) -> Optional[pa.DataType]:
|
|
|
71
78
|
return None
|
|
72
79
|
|
|
73
80
|
|
|
74
|
-
def ar_infer_schema(
|
|
81
|
+
def to_pxt_schema(
|
|
75
82
|
arrow_schema: pa.Schema, schema_overrides: dict[str, Any], primary_key: list[str]
|
|
76
83
|
) -> dict[str, ts.ColumnType]:
|
|
77
84
|
"""Convert a pyarrow Schema to a schema using pyarrow names and pixeltable types."""
|
|
@@ -85,19 +92,107 @@ def ar_infer_schema(
|
|
|
85
92
|
|
|
86
93
|
|
|
87
94
|
def to_arrow_schema(pixeltable_schema: dict[str, Any]) -> pa.Schema:
|
|
88
|
-
return pa.schema((name, to_arrow_type(typ)) for name, typ in pixeltable_schema.items())
|
|
95
|
+
return pa.schema((name, to_arrow_type(typ)) for name, typ in pixeltable_schema.items())
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def _to_record_batch(column_vals: dict[str, list[Any]], schema: pa.Schema) -> pa.RecordBatch:
|
|
99
|
+
import pyarrow as pa
|
|
100
|
+
|
|
101
|
+
pa_arrays: list[pa.Array] = []
|
|
102
|
+
for field in schema:
|
|
103
|
+
if isinstance(field.type, pa.FixedShapeTensorType):
|
|
104
|
+
stacked_arr = np.stack(column_vals[field.name])
|
|
105
|
+
pa_arrays.append(pa.FixedShapeTensorArray.from_numpy_ndarray(stacked_arr))
|
|
106
|
+
else:
|
|
107
|
+
pa_array = cast(pa.Array, pa.array(column_vals[field.name]))
|
|
108
|
+
pa_arrays.append(pa_array)
|
|
109
|
+
return pa.RecordBatch.from_arrays(pa_arrays, schema=schema)
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def to_record_batches(df: 'pxt.DataFrame', batch_size_bytes: int) -> Iterator[pa.RecordBatch]:
|
|
113
|
+
arrow_schema = to_arrow_schema(df.schema)
|
|
114
|
+
batch_columns: dict[str, list[Any]] = {k: [] for k in df.schema}
|
|
115
|
+
current_byte_estimate = 0
|
|
116
|
+
num_batch_rows = 0
|
|
117
|
+
|
|
118
|
+
# TODO: in order to avoid having to deal with ExprEvalError here, DataFrameResultSet should be an iterator
|
|
119
|
+
# over _exec()
|
|
120
|
+
try:
|
|
121
|
+
for data_row in df._exec():
|
|
122
|
+
num_batch_rows += 1
|
|
123
|
+
for (col_name, col_type), e in zip(df.schema.items(), df._select_list_exprs):
|
|
124
|
+
val = data_row[e.slot_idx]
|
|
125
|
+
val_size_bytes: int
|
|
126
|
+
if val is None:
|
|
127
|
+
batch_columns[col_name].append(val)
|
|
128
|
+
continue
|
|
129
|
+
|
|
130
|
+
assert val is not None
|
|
131
|
+
if col_type.is_image_type():
|
|
132
|
+
# images get inlined into the parquet file
|
|
133
|
+
if data_row.file_paths[e.slot_idx] is not None:
|
|
134
|
+
# if there is a file, read directly to preserve information
|
|
135
|
+
with open(data_row.file_paths[e.slot_idx], 'rb') as f:
|
|
136
|
+
val = f.read()
|
|
137
|
+
elif isinstance(val, PIL.Image.Image):
|
|
138
|
+
# no file available: save as png
|
|
139
|
+
buf = io.BytesIO()
|
|
140
|
+
val.save(buf, format='png')
|
|
141
|
+
val = buf.getvalue()
|
|
142
|
+
else:
|
|
143
|
+
raise excs.Error(f'unknown image type {type(val)}')
|
|
144
|
+
val_size_bytes = len(val)
|
|
145
|
+
elif col_type.is_string_type():
|
|
146
|
+
val_size_bytes = len(val)
|
|
147
|
+
elif col_type.is_media_type():
|
|
148
|
+
assert data_row.file_paths[e.slot_idx] is not None
|
|
149
|
+
val = data_row.file_paths[e.slot_idx]
|
|
150
|
+
val_size_bytes = len(val)
|
|
151
|
+
elif col_type.is_json_type():
|
|
152
|
+
val = json.dumps(val)
|
|
153
|
+
val_size_bytes = len(val)
|
|
154
|
+
elif col_type.is_array_type():
|
|
155
|
+
val_size_bytes = val.nbytes
|
|
156
|
+
elif col_type.is_int_type() or col_type.is_float_type():
|
|
157
|
+
val_size_bytes = 8
|
|
158
|
+
elif col_type.is_bool_type():
|
|
159
|
+
val_size_bytes = 1
|
|
160
|
+
elif col_type.is_date_type():
|
|
161
|
+
val_size_bytes = 4
|
|
162
|
+
elif col_type.is_timestamp_type():
|
|
163
|
+
val = val.astimezone(datetime.timezone.utc)
|
|
164
|
+
val_size_bytes = 8
|
|
165
|
+
else:
|
|
166
|
+
raise excs.Error(f'unknown type {col_type} for {col_name}')
|
|
167
|
+
|
|
168
|
+
batch_columns[col_name].append(val)
|
|
169
|
+
current_byte_estimate += val_size_bytes
|
|
170
|
+
|
|
171
|
+
if current_byte_estimate > batch_size_bytes and num_batch_rows > 0:
|
|
172
|
+
record_batch = _to_record_batch(batch_columns, arrow_schema)
|
|
173
|
+
yield record_batch
|
|
174
|
+
batch_columns = {k: [] for k in df.schema}
|
|
175
|
+
current_byte_estimate = 0
|
|
176
|
+
num_batch_rows = 0
|
|
177
|
+
|
|
178
|
+
except excs.ExprEvalError as e:
|
|
179
|
+
df._raise_expr_eval_err(e)
|
|
180
|
+
|
|
181
|
+
if num_batch_rows > 0:
|
|
182
|
+
record_batch = _to_record_batch(batch_columns, arrow_schema)
|
|
183
|
+
yield record_batch
|
|
89
184
|
|
|
90
185
|
|
|
91
|
-
def to_pydict(batch: Union[pa.Table, pa.RecordBatch]) -> dict[str, Union[list, np.ndarray]]:
|
|
186
|
+
def to_pydict(batch: pa.Table | pa.RecordBatch) -> dict[str, list | np.ndarray]:
|
|
92
187
|
"""Convert a RecordBatch to a dictionary of lists, unlike pa.lib.RecordBatch.to_pydict,
|
|
93
188
|
this function will not convert numpy arrays to lists, and will preserve the original numpy dtype.
|
|
94
189
|
"""
|
|
95
|
-
out: dict[str, Union[list, np.ndarray]] = {}
|
|
190
|
+
out: dict[str, list | np.ndarray] = {}
|
|
96
191
|
for k, name in enumerate(batch.schema.names):
|
|
97
192
|
col = batch.column(k)
|
|
98
193
|
if isinstance(col.type, pa.FixedShapeTensorType):
|
|
99
194
|
# treat array columns as numpy arrays to easily preserve numpy type
|
|
100
|
-
out[name] = col.to_numpy(zero_copy_only=False)
|
|
195
|
+
out[name] = col.to_numpy(zero_copy_only=False)
|
|
101
196
|
else:
|
|
102
197
|
# for the rest, use pydict to preserve python types
|
|
103
198
|
out[name] = col.to_pylist()
|
|
@@ -105,7 +200,7 @@ def to_pydict(batch: Union[pa.Table, pa.RecordBatch]) -> dict[str, Union[list, n
|
|
|
105
200
|
return out
|
|
106
201
|
|
|
107
202
|
|
|
108
|
-
def iter_tuples(batch: Union[pa.Table, pa.RecordBatch]) -> Iterator[dict[str, Any]]:
|
|
203
|
+
def iter_tuples(batch: pa.Table | pa.RecordBatch) -> Iterator[dict[str, Any]]:
|
|
109
204
|
"""Convert a RecordBatch to an iterator of dictionaries. also works with pa.Table and pa.RowGroup"""
|
|
110
205
|
pydict = to_pydict(batch)
|
|
111
206
|
assert len(pydict) > 0, 'empty record batch'
|
|
@@ -145,7 +240,7 @@ def _ar_val_to_pxt_val(val: Any, pxt_type: ts.ColumnType) -> Any:
|
|
|
145
240
|
|
|
146
241
|
|
|
147
242
|
def iter_tuples2(
|
|
148
|
-
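batch: Union[pa.Table, pa.RecordBatch], col_mapping: Optional[dict[str, str]], schema: dict[str, ts.ColumnType]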
|
|
243
|
+
batch: pa.Table | pa.RecordBatch, col_mapping: dict[str, str] | None, schema: dict[str, ts.ColumnType]
|
|
149
244
|
) -> Iterator[dict[str, Any]]:
|
|
150
245
|
"""Convert a RecordBatch to an iterator of dictionaries. also works with pa.Table and pa.RowGroup"""
|
|
151
246
|
pydict = to_pydict(batch)
|