pixeltable 0.4.0rc3__py3-none-any.whl → 0.4.20__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (202)
  1. pixeltable/__init__.py +23 -5
  2. pixeltable/_version.py +1 -0
  3. pixeltable/catalog/__init__.py +5 -3
  4. pixeltable/catalog/catalog.py +1318 -404
  5. pixeltable/catalog/column.py +186 -115
  6. pixeltable/catalog/dir.py +1 -2
  7. pixeltable/catalog/globals.py +11 -43
  8. pixeltable/catalog/insertable_table.py +167 -79
  9. pixeltable/catalog/path.py +61 -23
  10. pixeltable/catalog/schema_object.py +9 -10
  11. pixeltable/catalog/table.py +626 -308
  12. pixeltable/catalog/table_metadata.py +101 -0
  13. pixeltable/catalog/table_version.py +713 -569
  14. pixeltable/catalog/table_version_handle.py +37 -6
  15. pixeltable/catalog/table_version_path.py +42 -29
  16. pixeltable/catalog/tbl_ops.py +50 -0
  17. pixeltable/catalog/update_status.py +191 -0
  18. pixeltable/catalog/view.py +108 -94
  19. pixeltable/config.py +128 -22
  20. pixeltable/dataframe.py +188 -100
  21. pixeltable/env.py +407 -136
  22. pixeltable/exceptions.py +6 -0
  23. pixeltable/exec/__init__.py +3 -0
  24. pixeltable/exec/aggregation_node.py +7 -8
  25. pixeltable/exec/cache_prefetch_node.py +83 -110
  26. pixeltable/exec/cell_materialization_node.py +231 -0
  27. pixeltable/exec/cell_reconstruction_node.py +135 -0
  28. pixeltable/exec/component_iteration_node.py +4 -3
  29. pixeltable/exec/data_row_batch.py +8 -65
  30. pixeltable/exec/exec_context.py +16 -4
  31. pixeltable/exec/exec_node.py +13 -36
  32. pixeltable/exec/expr_eval/evaluators.py +7 -6
  33. pixeltable/exec/expr_eval/expr_eval_node.py +27 -12
  34. pixeltable/exec/expr_eval/globals.py +8 -5
  35. pixeltable/exec/expr_eval/row_buffer.py +1 -2
  36. pixeltable/exec/expr_eval/schedulers.py +190 -30
  37. pixeltable/exec/globals.py +32 -0
  38. pixeltable/exec/in_memory_data_node.py +18 -18
  39. pixeltable/exec/object_store_save_node.py +293 -0
  40. pixeltable/exec/row_update_node.py +16 -9
  41. pixeltable/exec/sql_node.py +206 -101
  42. pixeltable/exprs/__init__.py +1 -1
  43. pixeltable/exprs/arithmetic_expr.py +27 -22
  44. pixeltable/exprs/array_slice.py +3 -3
  45. pixeltable/exprs/column_property_ref.py +34 -30
  46. pixeltable/exprs/column_ref.py +92 -96
  47. pixeltable/exprs/comparison.py +5 -5
  48. pixeltable/exprs/compound_predicate.py +5 -4
  49. pixeltable/exprs/data_row.py +152 -55
  50. pixeltable/exprs/expr.py +62 -43
  51. pixeltable/exprs/expr_dict.py +3 -3
  52. pixeltable/exprs/expr_set.py +17 -10
  53. pixeltable/exprs/function_call.py +75 -37
  54. pixeltable/exprs/globals.py +1 -2
  55. pixeltable/exprs/in_predicate.py +4 -4
  56. pixeltable/exprs/inline_expr.py +10 -27
  57. pixeltable/exprs/is_null.py +1 -3
  58. pixeltable/exprs/json_mapper.py +8 -8
  59. pixeltable/exprs/json_path.py +56 -22
  60. pixeltable/exprs/literal.py +5 -5
  61. pixeltable/exprs/method_ref.py +2 -2
  62. pixeltable/exprs/object_ref.py +2 -2
  63. pixeltable/exprs/row_builder.py +127 -53
  64. pixeltable/exprs/rowid_ref.py +8 -12
  65. pixeltable/exprs/similarity_expr.py +50 -25
  66. pixeltable/exprs/sql_element_cache.py +4 -4
  67. pixeltable/exprs/string_op.py +5 -5
  68. pixeltable/exprs/type_cast.py +3 -5
  69. pixeltable/func/__init__.py +1 -0
  70. pixeltable/func/aggregate_function.py +8 -8
  71. pixeltable/func/callable_function.py +9 -9
  72. pixeltable/func/expr_template_function.py +10 -10
  73. pixeltable/func/function.py +18 -20
  74. pixeltable/func/function_registry.py +6 -7
  75. pixeltable/func/globals.py +2 -3
  76. pixeltable/func/mcp.py +74 -0
  77. pixeltable/func/query_template_function.py +20 -18
  78. pixeltable/func/signature.py +43 -16
  79. pixeltable/func/tools.py +23 -13
  80. pixeltable/func/udf.py +18 -20
  81. pixeltable/functions/__init__.py +6 -0
  82. pixeltable/functions/anthropic.py +93 -33
  83. pixeltable/functions/audio.py +114 -10
  84. pixeltable/functions/bedrock.py +13 -6
  85. pixeltable/functions/date.py +1 -1
  86. pixeltable/functions/deepseek.py +20 -9
  87. pixeltable/functions/fireworks.py +2 -2
  88. pixeltable/functions/gemini.py +28 -11
  89. pixeltable/functions/globals.py +13 -13
  90. pixeltable/functions/groq.py +108 -0
  91. pixeltable/functions/huggingface.py +1046 -23
  92. pixeltable/functions/image.py +9 -18
  93. pixeltable/functions/llama_cpp.py +23 -8
  94. pixeltable/functions/math.py +3 -4
  95. pixeltable/functions/mistralai.py +4 -15
  96. pixeltable/functions/ollama.py +16 -9
  97. pixeltable/functions/openai.py +104 -82
  98. pixeltable/functions/openrouter.py +143 -0
  99. pixeltable/functions/replicate.py +2 -2
  100. pixeltable/functions/reve.py +250 -0
  101. pixeltable/functions/string.py +21 -28
  102. pixeltable/functions/timestamp.py +13 -14
  103. pixeltable/functions/together.py +4 -6
  104. pixeltable/functions/twelvelabs.py +92 -0
  105. pixeltable/functions/util.py +6 -1
  106. pixeltable/functions/video.py +1388 -106
  107. pixeltable/functions/vision.py +7 -7
  108. pixeltable/functions/whisper.py +15 -7
  109. pixeltable/functions/whisperx.py +179 -0
  110. pixeltable/{ext/functions → functions}/yolox.py +2 -4
  111. pixeltable/globals.py +332 -105
  112. pixeltable/index/base.py +13 -22
  113. pixeltable/index/btree.py +23 -22
  114. pixeltable/index/embedding_index.py +32 -44
  115. pixeltable/io/__init__.py +4 -2
  116. pixeltable/io/datarows.py +7 -6
  117. pixeltable/io/external_store.py +49 -77
  118. pixeltable/io/fiftyone.py +11 -11
  119. pixeltable/io/globals.py +29 -28
  120. pixeltable/io/hf_datasets.py +17 -9
  121. pixeltable/io/label_studio.py +70 -66
  122. pixeltable/io/lancedb.py +3 -0
  123. pixeltable/io/pandas.py +12 -11
  124. pixeltable/io/parquet.py +13 -93
  125. pixeltable/io/table_data_conduit.py +71 -47
  126. pixeltable/io/utils.py +3 -3
  127. pixeltable/iterators/__init__.py +2 -1
  128. pixeltable/iterators/audio.py +21 -11
  129. pixeltable/iterators/document.py +116 -55
  130. pixeltable/iterators/image.py +5 -2
  131. pixeltable/iterators/video.py +293 -13
  132. pixeltable/metadata/__init__.py +4 -2
  133. pixeltable/metadata/converters/convert_18.py +2 -2
  134. pixeltable/metadata/converters/convert_19.py +2 -2
  135. pixeltable/metadata/converters/convert_20.py +2 -2
  136. pixeltable/metadata/converters/convert_21.py +2 -2
  137. pixeltable/metadata/converters/convert_22.py +2 -2
  138. pixeltable/metadata/converters/convert_24.py +2 -2
  139. pixeltable/metadata/converters/convert_25.py +2 -2
  140. pixeltable/metadata/converters/convert_26.py +2 -2
  141. pixeltable/metadata/converters/convert_29.py +4 -4
  142. pixeltable/metadata/converters/convert_34.py +2 -2
  143. pixeltable/metadata/converters/convert_36.py +2 -2
  144. pixeltable/metadata/converters/convert_37.py +15 -0
  145. pixeltable/metadata/converters/convert_38.py +39 -0
  146. pixeltable/metadata/converters/convert_39.py +124 -0
  147. pixeltable/metadata/converters/convert_40.py +73 -0
  148. pixeltable/metadata/converters/util.py +13 -12
  149. pixeltable/metadata/notes.py +4 -0
  150. pixeltable/metadata/schema.py +79 -42
  151. pixeltable/metadata/utils.py +74 -0
  152. pixeltable/mypy/__init__.py +3 -0
  153. pixeltable/mypy/mypy_plugin.py +123 -0
  154. pixeltable/plan.py +274 -223
  155. pixeltable/share/__init__.py +1 -1
  156. pixeltable/share/packager.py +259 -129
  157. pixeltable/share/protocol/__init__.py +34 -0
  158. pixeltable/share/protocol/common.py +170 -0
  159. pixeltable/share/protocol/operation_types.py +33 -0
  160. pixeltable/share/protocol/replica.py +109 -0
  161. pixeltable/share/publish.py +213 -57
  162. pixeltable/store.py +238 -175
  163. pixeltable/type_system.py +104 -63
  164. pixeltable/utils/__init__.py +2 -3
  165. pixeltable/utils/arrow.py +108 -13
  166. pixeltable/utils/av.py +298 -0
  167. pixeltable/utils/azure_store.py +305 -0
  168. pixeltable/utils/code.py +3 -3
  169. pixeltable/utils/console_output.py +4 -1
  170. pixeltable/utils/coroutine.py +6 -23
  171. pixeltable/utils/dbms.py +31 -5
  172. pixeltable/utils/description_helper.py +4 -5
  173. pixeltable/utils/documents.py +5 -6
  174. pixeltable/utils/exception_handler.py +7 -30
  175. pixeltable/utils/filecache.py +6 -6
  176. pixeltable/utils/formatter.py +4 -6
  177. pixeltable/utils/gcs_store.py +283 -0
  178. pixeltable/utils/http_server.py +2 -3
  179. pixeltable/utils/iceberg.py +1 -2
  180. pixeltable/utils/image.py +17 -0
  181. pixeltable/utils/lancedb.py +88 -0
  182. pixeltable/utils/local_store.py +316 -0
  183. pixeltable/utils/misc.py +5 -0
  184. pixeltable/utils/object_stores.py +528 -0
  185. pixeltable/utils/pydantic.py +60 -0
  186. pixeltable/utils/pytorch.py +5 -6
  187. pixeltable/utils/s3_store.py +392 -0
  188. pixeltable-0.4.20.dist-info/METADATA +587 -0
  189. pixeltable-0.4.20.dist-info/RECORD +218 -0
  190. {pixeltable-0.4.0rc3.dist-info → pixeltable-0.4.20.dist-info}/WHEEL +1 -1
  191. pixeltable-0.4.20.dist-info/entry_points.txt +2 -0
  192. pixeltable/__version__.py +0 -3
  193. pixeltable/ext/__init__.py +0 -17
  194. pixeltable/ext/functions/__init__.py +0 -11
  195. pixeltable/ext/functions/whisperx.py +0 -77
  196. pixeltable/utils/media_store.py +0 -77
  197. pixeltable/utils/s3.py +0 -17
  198. pixeltable/utils/sample.py +0 -25
  199. pixeltable-0.4.0rc3.dist-info/METADATA +0 -435
  200. pixeltable-0.4.0rc3.dist-info/RECORD +0 -189
  201. pixeltable-0.4.0rc3.dist-info/entry_points.txt +0 -3
  202. {pixeltable-0.4.0rc3.dist-info → pixeltable-0.4.20.dist-info/licenses}/LICENSE +0 -0
pixeltable/type_system.py CHANGED
@@ -5,10 +5,14 @@ import datetime
 import enum
 import io
 import json
+import types
 import typing
 import urllib.parse
 import urllib.request
-from typing import Any, ClassVar, Iterable, Literal, Mapping, Optional, Sequence, Union
+from pathlib import Path
+from typing import Any, ClassVar, Iterable, Literal, Mapping, Sequence, Union
+
+from typing import _GenericAlias  # type: ignore[attr-defined] # isort: skip
 
 import av
 import jsonschema
@@ -21,10 +25,9 @@ import sqlalchemy as sql
 from typing_extensions import _AnnotatedAlias
 
 import pixeltable.exceptions as excs
+from pixeltable.env import Env
 from pixeltable.utils import parse_local_file_path
 
-from typing import _GenericAlias  # type: ignore[attr-defined] # isort: skip
-
 
 class ColumnType:
     @enum.unique
@@ -48,11 +51,11 @@ class ColumnType:
         @classmethod
         def supertype(
             cls,
-            type1: Optional['ColumnType.Type'],
-            type2: Optional['ColumnType.Type'],
+            type1: 'ColumnType.Type' | None,
+            type2: 'ColumnType.Type' | None,
             # we need to pass this in because we can't easily append it as a class member
             common_supertypes: dict[tuple['ColumnType.Type', 'ColumnType.Type'], 'ColumnType.Type'],
-        ) -> Optional['ColumnType.Type']:
+        ) -> 'ColumnType.Type' | None:
             if type1 == type2:
                 return type1
             t = common_supertypes.get((type1, type2))
@@ -185,7 +188,7 @@ class ColumnType:
         if as_schema:
             return base_str if self.nullable else f'Required[{base_str}]'
         else:
-            return f'Optional[{base_str}]' if self.nullable else base_str
+            return f'{base_str} | None' if self.nullable else base_str
 
     def _to_base_str(self) -> str:
         """
@@ -214,7 +217,7 @@ class ColumnType:
         # Default: just compare base types (this works for all types whose only parameter is nullable)
         return self._type == other._type
 
-    def supertype(self, other: ColumnType) -> Optional[ColumnType]:
+    def supertype(self, other: ColumnType) -> ColumnType | None:
         if self == other:
             return self
         if self.matches(other):
@@ -234,7 +237,7 @@ class ColumnType:
         return None
 
     @classmethod
-    def infer_literal_type(cls, val: Any, nullable: bool = False) -> Optional[ColumnType]:
+    def infer_literal_type(cls, val: Any, nullable: bool = False) -> ColumnType | None:
        if val is None:
            return InvalidType(nullable=True)
        if isinstance(val, str):
@@ -268,7 +271,7 @@ class ColumnType:
         return None
 
     @classmethod
-    def infer_common_literal_type(cls, vals: Iterable[Any]) -> Optional[ColumnType]:
+    def infer_common_literal_type(cls, vals: Iterable[Any]) -> ColumnType | None:
         """
         Returns the most specific type that is a supertype of all literals in `vals`. If no such type
         exists, returns None.
@@ -276,7 +279,7 @@ class ColumnType:
         Args:
             vals: A collection of literals.
         """
-        inferred_type: Optional[ColumnType] = None
+        inferred_type: ColumnType | None = None
         for val in vals:
             val_type = cls.infer_literal_type(val)
             if inferred_type is None:
@@ -291,8 +294,12 @@ class ColumnType:
 
     @classmethod
     def from_python_type(
-        cls, t: Union[type, _GenericAlias], nullable_default: bool = False, allow_builtin_types: bool = True
-    ) -> Optional[ColumnType]:
+        cls,
+        t: type | _GenericAlias,
+        nullable_default: bool = False,
+        allow_builtin_types: bool = True,
+        infer_pydantic_json: bool = False,
+    ) -> ColumnType | None:
         """
         Convert a Python type into a Pixeltable `ColumnType` instance.
 
@@ -304,16 +311,20 @@ class ColumnType:
                 allowed (as in UDF definitions). If False, then only Pixeltable types such as `pxt.String`,
                 `pxt.Int`, etc., will be allowed (as in schema definitions). `Optional` and `Required`
                 designations will be allowed regardless.
+            infer_pydantic_json: If True, accepts an extended set of built-ins (eg, Enum, Path) and returns the type to
+                which pydantic.BaseModel.model_dump(mode='json') serializes it.
         """
         origin = typing.get_origin(t)
         type_args = typing.get_args(t)
-        if origin is typing.Union:
-            # Check if `t` has the form Optional[T].
+        if origin in (typing.Union, types.UnionType):
+            # Check if `t` has the form T | None.
             if len(type_args) == 2 and type(None) in type_args:
-                # `t` is a type of the form Optional[T] (equivalently, Union[T, None] or Union[None, T]).
+                # `t` is a type of the form T | None (equivalently, T | None or None | T).
                 # We treat it as the underlying type but with nullable=True.
                 underlying_py_type = type_args[0] if type_args[1] is type(None) else type_args[1]
-                underlying = cls.from_python_type(underlying_py_type, allow_builtin_types=allow_builtin_types)
+                underlying = cls.from_python_type(
+                    underlying_py_type, allow_builtin_types=allow_builtin_types, infer_pydantic_json=infer_pydantic_json
+                )
                 if underlying is not None:
                     return underlying.copy(nullable=True)
         elif origin is Required:
@@ -327,7 +338,7 @@ class ColumnType:
             if isinstance(parameters, ColumnType):
                 return parameters.copy(nullable=nullable_default)
         else:
-            # It's something other than Optional[T], Required[T], or an explicitly annotated type.
+            # It's something other than T | None, Required[T], or an explicitly annotated type.
             if origin is not None:
                 # Discard type parameters to ensure that parameterized types such as `list[T]`
                 # are correctly mapped to Pixeltable types.
@@ -340,6 +351,13 @@ class ColumnType:
             if literal_type is None:
                 return None
             return literal_type.copy(nullable=(literal_type.nullable or nullable_default))
+        if infer_pydantic_json and isinstance(t, type) and issubclass(t, enum.Enum):
+            literal_type = cls.infer_common_literal_type(member.value for member in t)
+            if literal_type is None:
+                return None
+            return literal_type.copy(nullable=(literal_type.nullable or nullable_default))
+        if infer_pydantic_json and t is Path:
+            return StringType(nullable=nullable_default)
         if t is str:
             return StringType(nullable=nullable_default)
         if t is int:
@@ -360,10 +378,7 @@ class ColumnType:
 
     @classmethod
     def normalize_type(
-        cls,
-        t: Union[ColumnType, type, _AnnotatedAlias],
-        nullable_default: bool = False,
-        allow_builtin_types: bool = True,
+        cls, t: ColumnType | type | _AnnotatedAlias, nullable_default: bool = False, allow_builtin_types: bool = True
     ) -> ColumnType:
         """
         Convert any type recognizable by Pixeltable to its corresponding ColumnType.
@@ -388,13 +403,43 @@ class ColumnType:
     ]
 
     @classmethod
-    def __raise_exc_for_invalid_type(cls, t: Union[type, _AnnotatedAlias]) -> None:
+    def __raise_exc_for_invalid_type(cls, t: type | _AnnotatedAlias) -> None:
         for builtin_type, suggestion in cls.__TYPE_SUGGESTIONS:
             if t is builtin_type or (isinstance(t, type) and issubclass(t, builtin_type)):
                 name = t.__name__ if t.__module__ == 'builtins' else f'{t.__module__}.{t.__name__}'
                 raise excs.Error(f'Standard Python type `{name}` cannot be used here; use `{suggestion}` instead')
         raise excs.Error(f'Unknown type: {t}')
 
+    @classmethod
+    def from_json_schema(cls, schema: dict[str, Any]) -> ColumnType | None:
+        # We first express the JSON schema as a Python type, and then convert it to a Pixeltable type.
+        # TODO: Is there a meaningful fallback if one of these operations fails? (Maybe another use case for a pxt Any
+        # type?)
+        py_type = cls.__json_schema_to_py_type(schema)
+        return cls.from_python_type(py_type) if py_type is not None else None
+
+    @classmethod
+    def __json_schema_to_py_type(cls, schema: dict[str, Any]) -> type | _GenericAlias | None:
+        if 'type' in schema:
+            if schema['type'] == 'null':
+                return type(None)
+            if schema['type'] == 'string':
+                return str
+            if schema['type'] == 'integer':
+                return int
+            if schema['type'] == 'number':
+                return float
+            if schema['type'] == 'boolean':
+                return bool
+            if schema['type'] in ('array', 'object'):
+                return list
+        elif 'anyOf' in schema:
+            subscripts = tuple(cls.__json_schema_to_py_type(subschema) for subschema in schema['anyOf'])
+            if all(subscript is not None for subscript in subscripts):
+                return Union[subscripts]
+
+        return None
+
     def validate_literal(self, val: Any) -> None:
         """Raise TypeError if val is not a valid literal for this type"""
         if val is None:
@@ -629,8 +674,9 @@ class TimestampType(ColumnType):
     def _create_literal(self, val: Any) -> Any:
         if isinstance(val, str):
             return datetime.datetime.fromisoformat(val)
-        if isinstance(val, datetime.datetime):
-            return val
+        # Place naive timestamps in the default time zone
+        if isinstance(val, datetime.datetime) and val.tzinfo is None:
+            return val.replace(tzinfo=Env.get().default_time_zone)
         return val
 
 
@@ -658,10 +704,10 @@ class DateType(ColumnType):
 
 
 class JsonType(ColumnType):
-    json_schema: Optional[dict[str, Any]]
-    __validator: Optional[jsonschema.protocols.Validator]
+    json_schema: dict[str, Any] | None
+    __validator: jsonschema.protocols.Validator | None
 
-    def __init__(self, json_schema: Optional[dict[str, Any]] = None, nullable: bool = False):
+    def __init__(self, json_schema: dict[str, Any] | None = None, nullable: bool = False):
         super().__init__(self.Type.JSON, nullable=nullable)
         self.json_schema = json_schema
         if json_schema is None:
@@ -716,7 +762,7 @@ class JsonType(ColumnType):
 
     @classmethod
     def __is_valid_json(cls, val: Any) -> bool:
-        if val is None or isinstance(val, (str, int, float, bool)):
+        if val is None or isinstance(val, (str, int, float, bool, np.ndarray, PIL.Image.Image)):
            return True
        if isinstance(val, (list, tuple)):
            return all(cls.__is_valid_json(v) for v in val)
@@ -731,7 +777,7 @@ class JsonType(ColumnType):
             return val.model_dump()
         return val
 
-    def supertype(self, other: ColumnType) -> Optional[JsonType]:
+    def supertype(self, other: ColumnType) -> JsonType | None:
         # Try using the (much faster) supertype logic in ColumnType first. That will work if, for example, the types
         # are identical except for nullability. If that doesn't work and both types are JsonType, then we will need to
         # merge their schemas.
@@ -753,7 +799,7 @@ class JsonType(ColumnType):
         )
 
     @classmethod
-    def __superschema(cls, a: dict[str, Any], b: dict[str, Any]) -> Optional[dict[str, Any]]:
+    def __superschema(cls, a: dict[str, Any], b: dict[str, Any]) -> dict[str, Any] | None:
         # Defining a general type hierarchy over all JSON schemas would be a challenging problem. In order to keep
         # things manageable, we only define a hierarchy among "conforming" schemas, which provides enough generality
         # for the most important use cases (unions for type inference, validation of inline exprs). A schema is
@@ -813,7 +859,7 @@ class JsonType(ColumnType):
         return {}  # Unresolvable type conflict; the supertype is an unrestricted JsonType.
 
     @classmethod
-    def __superschema_with_nulls(cls, a: dict[str, Any], b: dict[str, Any]) -> Optional[dict[str, Any]]:
+    def __superschema_with_nulls(cls, a: dict[str, Any], b: dict[str, Any]) -> dict[str, Any] | None:
         a, a_nullable = cls.__unpack_null_from_schema(a)
         b, b_nullable = cls.__unpack_null_from_schema(b)
 
@@ -842,15 +888,12 @@ class JsonType(ColumnType):
 
 
 class ArrayType(ColumnType):
-    shape: Optional[tuple[Optional[int], ...]]
-    pxt_dtype: Optional[ColumnType]
-    dtype: Optional[ColumnType.Type]
+    shape: tuple[int | None, ...] | None
+    pxt_dtype: ColumnType | None
+    dtype: ColumnType.Type | None
 
     def __init__(
-        self,
-        shape: Optional[tuple[Optional[int], ...]] = None,
-        dtype: Optional[ColumnType] = None,
-        nullable: bool = False,
+        self, shape: tuple[int | None, ...] | None = None, dtype: ColumnType | None = None, nullable: bool = False
     ):
         super().__init__(self.Type.ARRAY, nullable=nullable)
         assert shape is None or dtype is not None, (shape, dtype)  # cannot specify a shape without a dtype
@@ -875,7 +918,7 @@ class ArrayType(ColumnType):
     def __hash__(self) -> int:
         return hash((self._type, self.nullable, self.shape, self.dtype))
 
-    def supertype(self, other: ColumnType) -> Optional[ArrayType]:
+    def supertype(self, other: ColumnType) -> ArrayType | None:
         basic_supertype = super().supertype(other)
         if basic_supertype is not None:
             assert isinstance(basic_supertype, ArrayType)
@@ -888,7 +931,7 @@ class ArrayType(ColumnType):
         if super_dtype is None:
             # if the dtypes are incompatible, then the supertype is a fully general array
             return ArrayType(nullable=(self.nullable or other.nullable))
-        super_shape: Optional[tuple[Optional[int], ...]]
+        super_shape: tuple[int | None, ...] | None
         if self.shape is None or other.shape is None or len(self.shape) != len(other.shape):
             super_shape = None
         else:
@@ -919,7 +962,7 @@ class ArrayType(ColumnType):
         return cls(shape, dtype, nullable=d['nullable'])
 
     @classmethod
-    def from_np_dtype(cls, dtype: np.dtype, nullable: bool) -> Optional[ColumnType]:
+    def from_np_dtype(cls, dtype: np.dtype, nullable: bool) -> ColumnType | None:
         """
         Return pixeltable type corresponding to a given simple numpy dtype
         """
@@ -948,10 +991,10 @@ class ArrayType(ColumnType):
         return None
 
     @classmethod
-    def from_literal(cls, val: np.ndarray, nullable: bool = False) -> Optional[ArrayType]:
+    def from_literal(cls, val: np.ndarray, nullable: bool = False) -> ArrayType | None:
         # determine our dtype
         assert isinstance(val, np.ndarray)
-        pxttype: Optional[ColumnType] = cls.from_np_dtype(val.dtype, nullable)
+        pxttype: ColumnType | None = cls.from_np_dtype(val.dtype, nullable)
         if pxttype is None:
             return None
         return cls(val.shape, dtype=pxttype, nullable=nullable)
@@ -1014,7 +1057,7 @@ class ArrayType(ColumnType):
     def to_sa_type(cls) -> sql.types.TypeEngine:
         return sql.LargeBinary()
 
-    def numpy_dtype(self) -> Optional[np.dtype]:
+    def numpy_dtype(self) -> np.dtype | None:
         if self.dtype is None:
             return None
         if self.dtype == self.Type.INT:
@@ -1031,15 +1074,13 @@ class ArrayType(ColumnType):
 class ImageType(ColumnType):
     def __init__(
         self,
-        width: Optional[int] = None,
-        height: Optional[int] = None,
-        size: Optional[tuple[int, int]] = None,
-        mode: Optional[str] = None,
+        width: int | None = None,
+        height: int | None = None,
+        size: tuple[int, int] | None = None,
+        mode: str | None = None,
         nullable: bool = False,
     ):
-        """
-        TODO: does it make sense to specify only width or height?
-        """
+        # TODO: does it make sense to specify only width or height?
         super().__init__(self.Type.IMAGE, nullable=nullable)
         assert not (width is not None and size is not None)
         assert not (height is not None and size is not None)
@@ -1077,7 +1118,7 @@ class ImageType(ColumnType):
     def __hash__(self) -> int:
         return hash((self._type, self.nullable, self.size, self.mode))
 
-    def supertype(self, other: ColumnType) -> Optional[ImageType]:
+    def supertype(self, other: ColumnType) -> ImageType | None:
         basic_supertype = super().supertype(other)
         if basic_supertype is not None:
             assert isinstance(basic_supertype, ImageType)
@@ -1092,7 +1133,7 @@ class ImageType(ColumnType):
         return ImageType(width=width, height=height, mode=mode, nullable=(self.nullable or other.nullable))
 
     @property
-    def size(self) -> Optional[tuple[int, int]]:
+    def size(self) -> tuple[int, int] | None:
         if self.width is None or self.height is None:
             return None
         return (self.width, self.height)
@@ -1123,8 +1164,8 @@ class ImageType(ColumnType):
                 img.load()
                 return img
             except Exception as exc:
-                errormsg_val = val if len(val) < 50 else val[:50] + '...'
-                raise excs.Error(f'data URL could not be decoded into a valid image: {errormsg_val}') from exc
+                error_msg_val = val if len(val) < 50 else val[:50] + '...'
+                raise excs.Error(f'data URL could not be decoded into a valid image: {error_msg_val}') from exc
         return val
 
     def _validate_literal(self, val: Any) -> None:
@@ -1211,7 +1252,7 @@ class DocumentType(ColumnType):
         TXT = 4
 
         @classmethod
-        def from_extension(cls, ext: str) -> Optional['DocumentType.DocumentFormat']:
+        def from_extension(cls, ext: str) -> 'DocumentType.DocumentFormat' | None:
             if ext in ('.htm', '.html'):
                 return cls.HTML
             if ext == '.md':
@@ -1224,7 +1265,7 @@ class DocumentType(ColumnType):
                 return cls.TXT
             return None
 
-    def __init__(self, nullable: bool = False, doc_formats: Optional[str] = None):
+    def __init__(self, nullable: bool = False, doc_formats: str | None = None):
         super().__init__(self.Type.DOCUMENT, nullable=nullable)
         self.doc_formats = doc_formats
         if doc_formats is not None:
@@ -1321,13 +1362,13 @@ class Array(np.ndarray, _PxtType):
     def __class_getitem__(cls, item: Any) -> _AnnotatedAlias:
         """
         `item` (the type subscript) must be a tuple with exactly two elements (in any order):
-        - A tuple of `Optional[int]`s, specifying the shape of the array
+        - A tuple of `int | None`s, specifying the shape of the array
        - A type, specifying the dtype of the array
        Example: Array[(3, None, 2), pxt.Float]
        """
        params = item if isinstance(item, tuple) else (item,)
-        shape: Optional[tuple] = None
-        dtype: Optional[ColumnType] = None
+        shape: tuple | None = None
+        dtype: ColumnType | None = None
        if not any(isinstance(param, (type, _AnnotatedAlias)) for param in params):
            raise TypeError('Array type parameter must include a dtype.')
        for param in params:
@@ -1367,8 +1408,8 @@ class Image(PIL.Image.Image, _PxtType):
        else:
            # Not a tuple (single arg)
            params = (item,)
-        size: Optional[tuple] = None
-        mode: Optional[str] = None
+        size: tuple | None = None
+        mode: str | None = None
        for param in params:
            if isinstance(param, tuple):
                if (
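
The type_system.py changes above do three things: switch annotations to PEP 604 `T | None` unions, add an `infer_pydantic_json` flag to `from_python_type()`, and introduce `ColumnType.from_json_schema()`. A minimal sketch of how those entry points behave, assuming only the signatures visible in this diff (the enum and schema values below are illustrative, not taken from the package):

import enum
from pathlib import Path

import pixeltable.type_system as ts

# PEP 604 unions (T | None) are now recognized alongside typing.Optional
t1 = ts.ColumnType.from_python_type(str | None)
assert t1 is not None and t1.nullable

class Color(enum.Enum):
    RED = 'red'
    BLUE = 'blue'

# with infer_pydantic_json=True, Path maps to a string column and an Enum maps to
# the common literal type of its member values (here: a string column)
t2 = ts.ColumnType.from_python_type(Path, infer_pydantic_json=True)
t3 = ts.ColumnType.from_python_type(Color, infer_pydantic_json=True)

# JSON schemas are translated through an intermediate Python type;
# {'anyOf': [integer, null]} becomes int | None, i.e. a nullable Int column
t4 = ts.ColumnType.from_json_schema({'anyOf': [{'type': 'integer'}, {'type': 'null'}]})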
pixeltable/utils/__init__.py CHANGED
@@ -2,7 +2,6 @@ import hashlib
 import urllib.parse
 import urllib.request
 from pathlib import Path
-from typing import Optional, Union
 
 
 def print_perf_counter_delta(delta: float) -> str:
@@ -24,7 +23,7 @@ def print_perf_counter_delta(delta: float) -> str:
     return f'{delta:.2f} s'
 
 
-def sha256sum(path: Union[Path, str]) -> str:
+def sha256sum(path: Path | str) -> str:
     """
     Compute the SHA256 hash of a file.
     """
@@ -39,7 +38,7 @@ def sha256sum(path: Union[Path, str]) -> str:
     return h.hexdigest()
 
 
-def parse_local_file_path(file_or_url: str) -> Optional[Path]:
+def parse_local_file_path(file_or_url: str) -> Path | None:
     """
     Parses a string that may be either a URL or a local file path.
 
pixeltable/utils/arrow.py CHANGED
@@ -1,15 +1,22 @@
 import datetime
-from typing import Any, Iterator, Optional, Union
+import io
+import json
+from typing import TYPE_CHECKING, Any, Iterator, cast
 
 import numpy as np
+import PIL.Image
 import pyarrow as pa
 
+import pixeltable.exceptions as excs
 import pixeltable.type_system as ts
 
+if TYPE_CHECKING:
+    import pixeltable as pxt
+
 PA_TO_PXT_TYPES: dict[pa.DataType, ts.ColumnType] = {
     pa.string(): ts.StringType(nullable=True),
     pa.large_string(): ts.StringType(nullable=True),
-    pa.timestamp('us', tz=datetime.timezone.utc): ts.TimestampType(nullable=True),
+    pa.timestamp('us', tz='UTC'): ts.TimestampType(nullable=True),
     pa.bool_(): ts.BoolType(nullable=True),
     pa.int8(): ts.IntType(nullable=True),
     pa.int16(): ts.IntType(nullable=True),
@@ -28,7 +35,7 @@ PA_TO_PXT_TYPES: dict[pa.DataType, ts.ColumnType] = {
 
 PXT_TO_PA_TYPES: dict[type[ts.ColumnType], pa.DataType] = {
     ts.StringType: pa.string(),
-    ts.TimestampType: pa.timestamp('us', tz=datetime.timezone.utc),  # postgres timestamp is microseconds
+    ts.TimestampType: pa.timestamp('us', tz='UTC'),  # postgres timestamp is microseconds
     ts.DateType: pa.date32(),  # This could be date64
     ts.BoolType: pa.bool_(),
     ts.IntType: pa.int64(),
@@ -41,7 +48,7 @@ PXT_TO_PA_TYPES: dict[type[ts.ColumnType], pa.DataType] = {
 }
 
 
-def to_pixeltable_type(arrow_type: pa.DataType, nullable: bool) -> Optional[ts.ColumnType]:
+def to_pixeltable_type(arrow_type: pa.DataType, nullable: bool) -> ts.ColumnType | None:
     """Convert a pyarrow DataType to a pixeltable ColumnType if one is defined.
     Returns None if no conversion is currently implemented.
     """
@@ -54,12 +61,12 @@ def to_pixeltable_type(arrow_type: pa.DataType, nullable: bool) -> Optional[ts.C
         dtype = to_pixeltable_type(arrow_type.value_type, nullable)
         if dtype is None:
             return None
-        return ts.ArrayType(shape=arrow_type.shape, dtype=dtype, nullable=nullable)
+        return ts.ArrayType(shape=tuple(arrow_type.shape), dtype=dtype, nullable=nullable)
     else:
         return None
 
 
-def to_arrow_type(pixeltable_type: ts.ColumnType) -> Optional[pa.DataType]:
+def to_arrow_type(pixeltable_type: ts.ColumnType) -> pa.DataType | None:
     """Convert a pixeltable DataType to a pyarrow datatype if one is defined.
     Returns None if no conversion is currently implemented.
     """
@@ -71,7 +78,7 @@ def to_arrow_type(pixeltable_type: ts.ColumnType) -> Optional[pa.DataType]:
     return None
 
 
-def ar_infer_schema(
+def to_pxt_schema(
     arrow_schema: pa.Schema, schema_overrides: dict[str, Any], primary_key: list[str]
 ) -> dict[str, ts.ColumnType]:
     """Convert a pyarrow Schema to a schema using pyarrow names and pixeltable types."""
@@ -85,19 +92,107 @@ def ar_infer_schema(
 
 
 def to_arrow_schema(pixeltable_schema: dict[str, Any]) -> pa.Schema:
-    return pa.schema((name, to_arrow_type(typ)) for name, typ in pixeltable_schema.items())  # type: ignore[misc]
+    return pa.schema((name, to_arrow_type(typ)) for name, typ in pixeltable_schema.items())
+
+
+def _to_record_batch(column_vals: dict[str, list[Any]], schema: pa.Schema) -> pa.RecordBatch:
+    import pyarrow as pa
+
+    pa_arrays: list[pa.Array] = []
+    for field in schema:
+        if isinstance(field.type, pa.FixedShapeTensorType):
+            stacked_arr = np.stack(column_vals[field.name])
+            pa_arrays.append(pa.FixedShapeTensorArray.from_numpy_ndarray(stacked_arr))
+        else:
+            pa_array = cast(pa.Array, pa.array(column_vals[field.name]))
+            pa_arrays.append(pa_array)
+    return pa.RecordBatch.from_arrays(pa_arrays, schema=schema)
+
+
+def to_record_batches(df: 'pxt.DataFrame', batch_size_bytes: int) -> Iterator[pa.RecordBatch]:
+    arrow_schema = to_arrow_schema(df.schema)
+    batch_columns: dict[str, list[Any]] = {k: [] for k in df.schema}
+    current_byte_estimate = 0
+    num_batch_rows = 0
+
+    # TODO: in order to avoid having to deal with ExprEvalError here, DataFrameResultSet should be an iterator
+    # over _exec()
+    try:
+        for data_row in df._exec():
+            num_batch_rows += 1
+            for (col_name, col_type), e in zip(df.schema.items(), df._select_list_exprs):
+                val = data_row[e.slot_idx]
+                val_size_bytes: int
+                if val is None:
+                    batch_columns[col_name].append(val)
+                    continue
+
+                assert val is not None
+                if col_type.is_image_type():
+                    # images get inlined into the parquet file
+                    if data_row.file_paths[e.slot_idx] is not None:
+                        # if there is a file, read directly to preserve information
+                        with open(data_row.file_paths[e.slot_idx], 'rb') as f:
+                            val = f.read()
+                    elif isinstance(val, PIL.Image.Image):
+                        # no file available: save as png
+                        buf = io.BytesIO()
+                        val.save(buf, format='png')
+                        val = buf.getvalue()
+                    else:
+                        raise excs.Error(f'unknown image type {type(val)}')
+                    val_size_bytes = len(val)
+                elif col_type.is_string_type():
+                    val_size_bytes = len(val)
+                elif col_type.is_media_type():
+                    assert data_row.file_paths[e.slot_idx] is not None
+                    val = data_row.file_paths[e.slot_idx]
+                    val_size_bytes = len(val)
+                elif col_type.is_json_type():
+                    val = json.dumps(val)
+                    val_size_bytes = len(val)
+                elif col_type.is_array_type():
+                    val_size_bytes = val.nbytes
+                elif col_type.is_int_type() or col_type.is_float_type():
+                    val_size_bytes = 8
+                elif col_type.is_bool_type():
+                    val_size_bytes = 1
+                elif col_type.is_date_type():
+                    val_size_bytes = 4
+                elif col_type.is_timestamp_type():
+                    val = val.astimezone(datetime.timezone.utc)
+                    val_size_bytes = 8
+                else:
+                    raise excs.Error(f'unknown type {col_type} for {col_name}')
+
+                batch_columns[col_name].append(val)
+                current_byte_estimate += val_size_bytes
+
+            if current_byte_estimate > batch_size_bytes and num_batch_rows > 0:
+                record_batch = _to_record_batch(batch_columns, arrow_schema)
+                yield record_batch
+                batch_columns = {k: [] for k in df.schema}
+                current_byte_estimate = 0
+                num_batch_rows = 0
+
+    except excs.ExprEvalError as e:
+        df._raise_expr_eval_err(e)
+
+    if num_batch_rows > 0:
+        record_batch = _to_record_batch(batch_columns, arrow_schema)
+        yield record_batch
 
 
-def to_pydict(batch: Union[pa.Table, pa.RecordBatch]) -> dict[str, Union[list, np.ndarray]]:
+def to_pydict(batch: pa.Table | pa.RecordBatch) -> dict[str, list | np.ndarray]:
     """Convert a RecordBatch to a dictionary of lists, unlike pa.lib.RecordBatch.to_pydict,
     this function will not convert numpy arrays to lists, and will preserve the original numpy dtype.
     """
-    out: dict[str, Union[list, np.ndarray]] = {}
+    out: dict[str, list | np.ndarray] = {}
     for k, name in enumerate(batch.schema.names):
         col = batch.column(k)
         if isinstance(col.type, pa.FixedShapeTensorType):
             # treat array columns as numpy arrays to easily preserve numpy type
-            out[name] = col.to_numpy(zero_copy_only=False)  # type: ignore[call-arg]
+            out[name] = col.to_numpy(zero_copy_only=False)
         else:
             # for the rest, use pydict to preserve python types
             out[name] = col.to_pylist()
@@ -105,7 +200,7 @@ def to_pydict(batch: Union[pa.Table, pa.RecordBatch]) -> dict[str, Union[list, n
     return out
 
 
-def iter_tuples(batch: Union[pa.Table, pa.RecordBatch]) -> Iterator[dict[str, Any]]:
+def iter_tuples(batch: pa.Table | pa.RecordBatch) -> Iterator[dict[str, Any]]:
     """Convert a RecordBatch to an iterator of dictionaries. also works with pa.Table and pa.RowGroup"""
     pydict = to_pydict(batch)
     assert len(pydict) > 0, 'empty record batch'
@@ -145,7 +240,7 @@ def _ar_val_to_pxt_val(val: Any, pxt_type: ts.ColumnType) -> Any:
 
 
 def iter_tuples2(
-    batch: Union[pa.Table, pa.RecordBatch], col_mapping: Optional[dict[str, str]], schema: dict[str, ts.ColumnType]
+    batch: pa.Table | pa.RecordBatch, col_mapping: dict[str, str] | None, schema: dict[str, ts.ColumnType]
 ) -> Iterator[dict[str, Any]]:
     """Convert a RecordBatch to an iterator of dictionaries. also works with pa.Table and pa.RowGroup"""
     pydict = to_pydict(batch)
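
The new `to_record_batches()` above streams a DataFrame as Arrow record batches, flushing a batch whenever the running byte estimate exceeds `batch_size_bytes`. A rough usage sketch, assuming the public pixeltable API (`pxt.create_table`, `Table.insert`, `Table.select`) and an arbitrary 1 MiB batch budget; this illustrates the internal helper, it is not documented usage:

import pyarrow as pa

import pixeltable as pxt
from pixeltable.utils.arrow import to_record_batches

t = pxt.create_table('arrow_demo', {'name': pxt.String, 'score': pxt.Float})
t.insert([{'name': 'a', 'score': 1.0}, {'name': 'b', 'score': 2.0}])

# stream the query result as Arrow record batches (~1 MiB each) and reassemble
batches = list(to_record_batches(t.select(t.name, t.score), batch_size_bytes=1 << 20))
arrow_table = pa.Table.from_batches(batches)
print(arrow_table.num_rows, arrow_table.schema)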