pixeltable 0.4.0rc3__py3-none-any.whl → 0.4.20__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pixeltable might be problematic. Click here for more details.

Files changed (202)
  1. pixeltable/__init__.py +23 -5
  2. pixeltable/_version.py +1 -0
  3. pixeltable/catalog/__init__.py +5 -3
  4. pixeltable/catalog/catalog.py +1318 -404
  5. pixeltable/catalog/column.py +186 -115
  6. pixeltable/catalog/dir.py +1 -2
  7. pixeltable/catalog/globals.py +11 -43
  8. pixeltable/catalog/insertable_table.py +167 -79
  9. pixeltable/catalog/path.py +61 -23
  10. pixeltable/catalog/schema_object.py +9 -10
  11. pixeltable/catalog/table.py +626 -308
  12. pixeltable/catalog/table_metadata.py +101 -0
  13. pixeltable/catalog/table_version.py +713 -569
  14. pixeltable/catalog/table_version_handle.py +37 -6
  15. pixeltable/catalog/table_version_path.py +42 -29
  16. pixeltable/catalog/tbl_ops.py +50 -0
  17. pixeltable/catalog/update_status.py +191 -0
  18. pixeltable/catalog/view.py +108 -94
  19. pixeltable/config.py +128 -22
  20. pixeltable/dataframe.py +188 -100
  21. pixeltable/env.py +407 -136
  22. pixeltable/exceptions.py +6 -0
  23. pixeltable/exec/__init__.py +3 -0
  24. pixeltable/exec/aggregation_node.py +7 -8
  25. pixeltable/exec/cache_prefetch_node.py +83 -110
  26. pixeltable/exec/cell_materialization_node.py +231 -0
  27. pixeltable/exec/cell_reconstruction_node.py +135 -0
  28. pixeltable/exec/component_iteration_node.py +4 -3
  29. pixeltable/exec/data_row_batch.py +8 -65
  30. pixeltable/exec/exec_context.py +16 -4
  31. pixeltable/exec/exec_node.py +13 -36
  32. pixeltable/exec/expr_eval/evaluators.py +7 -6
  33. pixeltable/exec/expr_eval/expr_eval_node.py +27 -12
  34. pixeltable/exec/expr_eval/globals.py +8 -5
  35. pixeltable/exec/expr_eval/row_buffer.py +1 -2
  36. pixeltable/exec/expr_eval/schedulers.py +190 -30
  37. pixeltable/exec/globals.py +32 -0
  38. pixeltable/exec/in_memory_data_node.py +18 -18
  39. pixeltable/exec/object_store_save_node.py +293 -0
  40. pixeltable/exec/row_update_node.py +16 -9
  41. pixeltable/exec/sql_node.py +206 -101
  42. pixeltable/exprs/__init__.py +1 -1
  43. pixeltable/exprs/arithmetic_expr.py +27 -22
  44. pixeltable/exprs/array_slice.py +3 -3
  45. pixeltable/exprs/column_property_ref.py +34 -30
  46. pixeltable/exprs/column_ref.py +92 -96
  47. pixeltable/exprs/comparison.py +5 -5
  48. pixeltable/exprs/compound_predicate.py +5 -4
  49. pixeltable/exprs/data_row.py +152 -55
  50. pixeltable/exprs/expr.py +62 -43
  51. pixeltable/exprs/expr_dict.py +3 -3
  52. pixeltable/exprs/expr_set.py +17 -10
  53. pixeltable/exprs/function_call.py +75 -37
  54. pixeltable/exprs/globals.py +1 -2
  55. pixeltable/exprs/in_predicate.py +4 -4
  56. pixeltable/exprs/inline_expr.py +10 -27
  57. pixeltable/exprs/is_null.py +1 -3
  58. pixeltable/exprs/json_mapper.py +8 -8
  59. pixeltable/exprs/json_path.py +56 -22
  60. pixeltable/exprs/literal.py +5 -5
  61. pixeltable/exprs/method_ref.py +2 -2
  62. pixeltable/exprs/object_ref.py +2 -2
  63. pixeltable/exprs/row_builder.py +127 -53
  64. pixeltable/exprs/rowid_ref.py +8 -12
  65. pixeltable/exprs/similarity_expr.py +50 -25
  66. pixeltable/exprs/sql_element_cache.py +4 -4
  67. pixeltable/exprs/string_op.py +5 -5
  68. pixeltable/exprs/type_cast.py +3 -5
  69. pixeltable/func/__init__.py +1 -0
  70. pixeltable/func/aggregate_function.py +8 -8
  71. pixeltable/func/callable_function.py +9 -9
  72. pixeltable/func/expr_template_function.py +10 -10
  73. pixeltable/func/function.py +18 -20
  74. pixeltable/func/function_registry.py +6 -7
  75. pixeltable/func/globals.py +2 -3
  76. pixeltable/func/mcp.py +74 -0
  77. pixeltable/func/query_template_function.py +20 -18
  78. pixeltable/func/signature.py +43 -16
  79. pixeltable/func/tools.py +23 -13
  80. pixeltable/func/udf.py +18 -20
  81. pixeltable/functions/__init__.py +6 -0
  82. pixeltable/functions/anthropic.py +93 -33
  83. pixeltable/functions/audio.py +114 -10
  84. pixeltable/functions/bedrock.py +13 -6
  85. pixeltable/functions/date.py +1 -1
  86. pixeltable/functions/deepseek.py +20 -9
  87. pixeltable/functions/fireworks.py +2 -2
  88. pixeltable/functions/gemini.py +28 -11
  89. pixeltable/functions/globals.py +13 -13
  90. pixeltable/functions/groq.py +108 -0
  91. pixeltable/functions/huggingface.py +1046 -23
  92. pixeltable/functions/image.py +9 -18
  93. pixeltable/functions/llama_cpp.py +23 -8
  94. pixeltable/functions/math.py +3 -4
  95. pixeltable/functions/mistralai.py +4 -15
  96. pixeltable/functions/ollama.py +16 -9
  97. pixeltable/functions/openai.py +104 -82
  98. pixeltable/functions/openrouter.py +143 -0
  99. pixeltable/functions/replicate.py +2 -2
  100. pixeltable/functions/reve.py +250 -0
  101. pixeltable/functions/string.py +21 -28
  102. pixeltable/functions/timestamp.py +13 -14
  103. pixeltable/functions/together.py +4 -6
  104. pixeltable/functions/twelvelabs.py +92 -0
  105. pixeltable/functions/util.py +6 -1
  106. pixeltable/functions/video.py +1388 -106
  107. pixeltable/functions/vision.py +7 -7
  108. pixeltable/functions/whisper.py +15 -7
  109. pixeltable/functions/whisperx.py +179 -0
  110. pixeltable/{ext/functions → functions}/yolox.py +2 -4
  111. pixeltable/globals.py +332 -105
  112. pixeltable/index/base.py +13 -22
  113. pixeltable/index/btree.py +23 -22
  114. pixeltable/index/embedding_index.py +32 -44
  115. pixeltable/io/__init__.py +4 -2
  116. pixeltable/io/datarows.py +7 -6
  117. pixeltable/io/external_store.py +49 -77
  118. pixeltable/io/fiftyone.py +11 -11
  119. pixeltable/io/globals.py +29 -28
  120. pixeltable/io/hf_datasets.py +17 -9
  121. pixeltable/io/label_studio.py +70 -66
  122. pixeltable/io/lancedb.py +3 -0
  123. pixeltable/io/pandas.py +12 -11
  124. pixeltable/io/parquet.py +13 -93
  125. pixeltable/io/table_data_conduit.py +71 -47
  126. pixeltable/io/utils.py +3 -3
  127. pixeltable/iterators/__init__.py +2 -1
  128. pixeltable/iterators/audio.py +21 -11
  129. pixeltable/iterators/document.py +116 -55
  130. pixeltable/iterators/image.py +5 -2
  131. pixeltable/iterators/video.py +293 -13
  132. pixeltable/metadata/__init__.py +4 -2
  133. pixeltable/metadata/converters/convert_18.py +2 -2
  134. pixeltable/metadata/converters/convert_19.py +2 -2
  135. pixeltable/metadata/converters/convert_20.py +2 -2
  136. pixeltable/metadata/converters/convert_21.py +2 -2
  137. pixeltable/metadata/converters/convert_22.py +2 -2
  138. pixeltable/metadata/converters/convert_24.py +2 -2
  139. pixeltable/metadata/converters/convert_25.py +2 -2
  140. pixeltable/metadata/converters/convert_26.py +2 -2
  141. pixeltable/metadata/converters/convert_29.py +4 -4
  142. pixeltable/metadata/converters/convert_34.py +2 -2
  143. pixeltable/metadata/converters/convert_36.py +2 -2
  144. pixeltable/metadata/converters/convert_37.py +15 -0
  145. pixeltable/metadata/converters/convert_38.py +39 -0
  146. pixeltable/metadata/converters/convert_39.py +124 -0
  147. pixeltable/metadata/converters/convert_40.py +73 -0
  148. pixeltable/metadata/converters/util.py +13 -12
  149. pixeltable/metadata/notes.py +4 -0
  150. pixeltable/metadata/schema.py +79 -42
  151. pixeltable/metadata/utils.py +74 -0
  152. pixeltable/mypy/__init__.py +3 -0
  153. pixeltable/mypy/mypy_plugin.py +123 -0
  154. pixeltable/plan.py +274 -223
  155. pixeltable/share/__init__.py +1 -1
  156. pixeltable/share/packager.py +259 -129
  157. pixeltable/share/protocol/__init__.py +34 -0
  158. pixeltable/share/protocol/common.py +170 -0
  159. pixeltable/share/protocol/operation_types.py +33 -0
  160. pixeltable/share/protocol/replica.py +109 -0
  161. pixeltable/share/publish.py +213 -57
  162. pixeltable/store.py +238 -175
  163. pixeltable/type_system.py +104 -63
  164. pixeltable/utils/__init__.py +2 -3
  165. pixeltable/utils/arrow.py +108 -13
  166. pixeltable/utils/av.py +298 -0
  167. pixeltable/utils/azure_store.py +305 -0
  168. pixeltable/utils/code.py +3 -3
  169. pixeltable/utils/console_output.py +4 -1
  170. pixeltable/utils/coroutine.py +6 -23
  171. pixeltable/utils/dbms.py +31 -5
  172. pixeltable/utils/description_helper.py +4 -5
  173. pixeltable/utils/documents.py +5 -6
  174. pixeltable/utils/exception_handler.py +7 -30
  175. pixeltable/utils/filecache.py +6 -6
  176. pixeltable/utils/formatter.py +4 -6
  177. pixeltable/utils/gcs_store.py +283 -0
  178. pixeltable/utils/http_server.py +2 -3
  179. pixeltable/utils/iceberg.py +1 -2
  180. pixeltable/utils/image.py +17 -0
  181. pixeltable/utils/lancedb.py +88 -0
  182. pixeltable/utils/local_store.py +316 -0
  183. pixeltable/utils/misc.py +5 -0
  184. pixeltable/utils/object_stores.py +528 -0
  185. pixeltable/utils/pydantic.py +60 -0
  186. pixeltable/utils/pytorch.py +5 -6
  187. pixeltable/utils/s3_store.py +392 -0
  188. pixeltable-0.4.20.dist-info/METADATA +587 -0
  189. pixeltable-0.4.20.dist-info/RECORD +218 -0
  190. {pixeltable-0.4.0rc3.dist-info → pixeltable-0.4.20.dist-info}/WHEEL +1 -1
  191. pixeltable-0.4.20.dist-info/entry_points.txt +2 -0
  192. pixeltable/__version__.py +0 -3
  193. pixeltable/ext/__init__.py +0 -17
  194. pixeltable/ext/functions/__init__.py +0 -11
  195. pixeltable/ext/functions/whisperx.py +0 -77
  196. pixeltable/utils/media_store.py +0 -77
  197. pixeltable/utils/s3.py +0 -17
  198. pixeltable/utils/sample.py +0 -25
  199. pixeltable-0.4.0rc3.dist-info/METADATA +0 -435
  200. pixeltable-0.4.0rc3.dist-info/RECORD +0 -189
  201. pixeltable-0.4.0rc3.dist-info/entry_points.txt +0 -3
  202. {pixeltable-0.4.0rc3.dist-info → pixeltable-0.4.20.dist-info/licenses}/LICENSE +0 -0
@@ -1,6 +1,6 @@
1
1
  from __future__ import annotations
2
2
 
3
- from typing import Generic, Iterable, Iterator, Optional, TypeVar
3
+ from typing import Generic, Iterable, Iterator, TypeVar
4
4
 
5
5
  from .expr import Expr
6
6
 
@@ -9,26 +9,33 @@ T = TypeVar('T', bound='Expr')
9
9
 
10
10
  class ExprSet(Generic[T]):
11
11
  """
12
- A set that also supports indexed lookup (by slot_idx and Expr.id). Exprs are uniquely identified by Expr.id.
12
+ An ordered set that also supports indexed lookup (by slot_idx and Expr.id). Exprs are uniquely identified by
13
+ Expr.id.
13
14
  """
14
15
 
15
16
  exprs: dict[int, T] # key: Expr.id
17
+ expr_offsets: dict[int, int] # key: Expr.id, value: offset into self.exprs.keys()
16
18
  exprs_by_idx: dict[int, T] # key: slot_idx
17
19
 
18
- def __init__(self, elements: Optional[Iterable[T]] = None):
20
+ def __init__(self, elements: Iterable[T] | None = None):
19
21
  self.exprs = {}
22
+ self.expr_offsets = {}
20
23
  self.exprs_by_idx = {}
21
24
  if elements is not None:
22
25
  for e in elements:
23
26
  self.add(e)
24
27
 
25
- def add(self, expr: T) -> None:
26
- if expr.id in self.exprs:
27
- return
28
+ def add(self, expr: T) -> int:
29
+ """Returns offset corresponding to iteration order"""
30
+ offset = self.expr_offsets.get(expr.id)
31
+ if offset is not None:
32
+ return offset
33
+ offset = len(self.exprs)
28
34
  self.exprs[expr.id] = expr
29
- if expr.slot_idx is None:
30
- return
31
- self.exprs_by_idx[expr.slot_idx] = expr
35
+ self.expr_offsets[expr.id] = offset
36
+ if expr.slot_idx is not None:
37
+ self.exprs_by_idx[expr.slot_idx] = expr
38
+ return offset
32
39
 
33
40
  def update(self, *others: Iterable[T]) -> None:
34
41
  for other in others:
@@ -44,7 +51,7 @@ class ExprSet(Generic[T]):
44
51
  def __iter__(self) -> Iterator[T]:
45
52
  return iter(self.exprs.values())
46
53
 
47
- def __getitem__(self, index: object) -> Optional[T]:
54
+ def __getitem__(self, index: object) -> T | None:
48
55
  """Indexed lookup by slot_idx or Expr.id."""
49
56
  assert isinstance(index, (int, Expr))
50
57
  if isinstance(index, int):
@@ -4,7 +4,7 @@ import inspect
4
4
  import logging
5
5
  import sys
6
6
  from textwrap import dedent
7
- from typing import Any, Optional, Sequence, Union
7
+ from typing import Any, Sequence
8
8
 
9
9
  import sqlalchemy as sql
10
10
 
@@ -24,7 +24,7 @@ class FunctionCall(Expr):
24
24
  fn: func.Function
25
25
  is_method_call: bool
26
26
  agg_init_args: dict[str, Any]
27
- resource_pool: Optional[str]
27
+ resource_pool: str | None
28
28
 
29
29
  # These collections hold the component indices corresponding to the args and kwargs
30
30
  # that were passed to the FunctionCall. They're 1:1 with the original call pattern.
@@ -36,17 +36,17 @@ class FunctionCall(Expr):
36
36
  # - a component index, if the parameter is a non-variadic parameter
37
37
  # - a list of component indices, if the parameter is a variadic positional parameter
38
38
  # - a dict mapping keyword names to component indices, if the parameter is a variadic keyword parameter
39
- bound_idxs: dict[str, Union[int, list[int], dict[str, int]]]
39
+ bound_idxs: dict[str, int | list[int] | dict[str, int]]
40
40
 
41
41
  return_type: ts.ColumnType
42
42
  group_by_start_idx: int
43
43
  group_by_stop_idx: int
44
44
  fn_expr_idx: int
45
45
  order_by_start_idx: int
46
- aggregator: Optional[Any]
47
- current_partition_vals: Optional[list[Any]]
46
+ aggregator: Any | None
47
+ current_partition_vals: list[Any] | None
48
48
 
49
- _validation_error: Optional[str]
49
+ _validation_error: str | None
50
50
 
51
51
  def __init__(
52
52
  self,
@@ -54,10 +54,10 @@ class FunctionCall(Expr):
54
54
  args: list[Expr],
55
55
  kwargs: dict[str, Expr],
56
56
  return_type: ts.ColumnType,
57
- order_by_clause: Optional[list[Any]] = None,
58
- group_by_clause: Optional[list[Any]] = None,
57
+ order_by_clause: list[Any] | None = None,
58
+ group_by_clause: list[Any] | None = None,
59
59
  is_method_call: bool = False,
60
- validation_error: Optional[str] = None,
60
+ validation_error: str | None = None,
61
61
  ):
62
62
  assert not fn.is_polymorphic
63
63
  assert all(isinstance(arg, Expr) for arg in args)
@@ -115,6 +115,7 @@ class FunctionCall(Expr):
115
115
  self._validation_error = validation_error
116
116
 
117
117
  if validation_error is not None:
118
+ self.bound_idxs = {}
118
119
  self.resource_pool = None
119
120
  return
120
121
 
@@ -148,7 +149,7 @@ class FunctionCall(Expr):
148
149
  target = tbl._tbl_version_path.tbl_version
149
150
  return [RowidRef(target, i) for i in range(target.get().num_rowid_columns())]
150
151
 
151
- def default_column_name(self) -> Optional[str]:
152
+ def default_column_name(self) -> str | None:
152
153
  return self.fn.name
153
154
 
154
155
  def _equals(self, other: FunctionCall) -> bool:
@@ -177,7 +178,7 @@ class FunctionCall(Expr):
177
178
  return self.display_str()
178
179
 
179
180
  @property
180
- def validation_error(self) -> Optional[str]:
181
+ def validation_error(self) -> str | None:
181
182
  return self._validation_error or super().validation_error
182
183
 
183
184
  def display_str(self, inline: bool = True) -> str:
@@ -244,7 +245,7 @@ class FunctionCall(Expr):
244
245
  assert self.is_agg_fn_call
245
246
  return self.order_by
246
247
 
247
- def sql_expr(self, sql_elements: SqlElementCache) -> Optional[sql.ColumnElement]:
248
+ def sql_expr(self, sql_elements: SqlElementCache) -> sql.ColumnElement | None:
248
249
  assert self.is_valid
249
250
 
250
251
  # we currently can't translate aggregate functions with grouping and/or ordering to SQL
@@ -300,8 +301,16 @@ class FunctionCall(Expr):
300
301
  """
301
302
  res = super().substitute(spec)
302
303
  assert res is self
303
- self.return_type = self.fn.call_return_type(self.bound_args)
304
- self.col_type = self.return_type
304
+ if self.is_valid:
305
+ # If this FunctionCall is valid, re-evaluate the call_return_type of the substituted expression. If the
306
+ # FunctionCall is not valid, it isn't safe to do this. (Really we should be asserting that it *is* valid,
307
+ # but we still need to be able to do substitutions on invalid FunctionCalls, because loading an
308
+ # EmbeddingIndex from the db involves reconstructing the requisite (substituted) FunctionCalls. We could
309
+ # fix this by separately persisting the FunctionCall instances held by EmbeddingIndex to the db. That's
310
+ # probably a good idea, but it's also probably not urgent, since it only affects Functions that have a
311
+ # conditional_return_type implemented.)
312
+ self.return_type = self.fn.call_return_type(self.bound_args)
313
+ self.col_type = self.return_type
305
314
  return self
306
315
 
307
316
  def update(self, data_row: DataRow) -> None:
@@ -312,7 +321,7 @@ class FunctionCall(Expr):
312
321
  args, kwargs = self.make_args(data_row)
313
322
  self.aggregator.update(*args, **kwargs)
314
323
 
315
- def make_args(self, data_row: DataRow) -> Optional[tuple[list[Any], dict[str, Any]]]:
324
+ def make_args(self, data_row: DataRow) -> tuple[list[Any], dict[str, Any]] | None:
316
325
  """Return args and kwargs, constructed for data_row; returns None if any non-nullable arg is None."""
317
326
  args: list[Any] = []
318
327
  parameters_by_pos = self.fn.signature.parameters_by_pos
@@ -439,18 +448,18 @@ class FunctionCall(Expr):
439
448
  group_by_exprs = components[group_by_start_idx:group_by_stop_idx]
440
449
  order_by_exprs = components[order_by_start_idx:]
441
450
 
442
- validation_error: Optional[str] = None
451
+ validation_error: str | None = None
443
452
 
444
453
  if isinstance(fn, func.InvalidFunction):
445
454
  validation_error = (
446
455
  dedent(
447
456
  f"""
448
457
  The UDF '{fn.self_path}' cannot be located, because
449
- {{errormsg}}
458
+ {{error_msg}}
450
459
  """
451
460
  )
452
461
  .strip()
453
- .format(errormsg=fn.errormsg)
462
+ .format(error_msg=fn.error_msg)
454
463
  )
455
464
  return cls(fn, args, kwargs, return_type, is_method_call=is_method_call, validation_error=validation_error)
456
465
 
@@ -480,25 +489,54 @@ class FunctionCall(Expr):
480
489
  ).strip()
481
490
  else:
482
491
  # Evaluate the call_return_type as defined in the current codebase.
483
- call_return_type = resolved_fn.call_return_type(bound_args)
484
- if return_type is None:
485
- # Schema versions prior to 25 did not store the return_type in metadata, and there is no obvious way to
486
- # infer it during DB migration, so we might encounter a stored return_type of None. In that case, we use
487
- # the call_return_type that we just inferred (which matches the deserialization behavior prior to
488
- # version 25).
489
- return_type = call_return_type
490
- elif not return_type.is_supertype_of(call_return_type, ignore_nullable=True):
491
- # There is a return_type stored in metadata (schema version >= 25),
492
- # and the stored return_type of the UDF call doesn't match the column type of the FunctionCall.
493
- validation_error = dedent(
494
- f"""
495
- The return type stored in the database for a UDF call to {fn.self_path!r} no longer
496
- matches its return type as currently defined in the code. This probably means that the
497
- code for {fn.self_path!r} has changed in a backward-incompatible way.
498
- Return type of UDF call in the database: {return_type}
499
- Return type of UDF as currently defined in code: {call_return_type}
500
- """
501
- ).strip()
492
+ call_return_type: ts.ColumnType | None = None
493
+
494
+ if isinstance(resolved_fn, func.ExprTemplateFunction) and not resolved_fn.template.expr.is_valid:
495
+ # The FunctionCall is based on an ExprTemplateFunction, but the template expression is not valid
496
+ # (because it in turn contains an invalid FunctionCall). In this case, inherit the validation error
497
+ # from the template expression.
498
+ validation_error = resolved_fn.template.expr.validation_error
499
+ else:
500
+ try:
501
+ call_return_type = resolved_fn.call_return_type(bound_args)
502
+ except ImportError as exc:
503
+ validation_error = dedent(
504
+ f"""
505
+ A UDF call to {fn.self_path!r} could not be fully resolved, because a module required
506
+ by the UDF could not be imported:
507
+ {exc}
508
+ """
509
+ )
510
+
511
+ assert (call_return_type is None) != (validation_error is None)
512
+
513
+ if call_return_type is None and return_type is None:
514
+ # Schema versions prior to 25 did not store the return_type in metadata, and there is no obvious
515
+ # way to infer it during DB migration, so we might encounter a stored return_type of None. If the
516
+ # resolution of call_return_type also fails, then we're out of luck; we have no choice but to
517
+ # fail-fast.
518
+ raise excs.Error(validation_error)
519
+
520
+ if call_return_type is not None:
521
+ # call_return_type resolution succeeded.
522
+ if return_type is None:
523
+ # Schema versions prior to 25 did not store the return_type in metadata (as mentioned above), so
524
+ # fall back on the call_return_type.
525
+ return_type = call_return_type
526
+ elif not return_type.is_supertype_of(call_return_type, ignore_nullable=True):
527
+ # There is a return_type stored in metadata (schema version >= 25),
528
+ # and the stored return_type of the UDF call doesn't match the column type of the FunctionCall.
529
+ validation_error = dedent(
530
+ f"""
531
+ The return type stored in the database for a UDF call to {fn.self_path!r} no longer
532
+ matches its return type as currently defined in the code. This probably means that the
533
+ code for {fn.self_path!r} has changed in a backward-incompatible way.
534
+ Return type of UDF call in the database: {return_type}
535
+ Return type of UDF as currently defined in code: {call_return_type}
536
+ """
537
+ ).strip()
538
+
539
+ assert return_type is not None # Guaranteed by the above logic.
502
540
 
503
541
  fn_call = cls(
504
542
  resolved_fn,
@@ -2,10 +2,9 @@ from __future__ import annotations
2
2
 
3
3
  import datetime
4
4
  import enum
5
- from typing import Union
6
5
 
7
6
  # Python types corresponding to our literal types
8
- LiteralPythonTypes = Union[str, int, float, bool, datetime.datetime, datetime.date]
7
+ LiteralPythonTypes = str | int | float | bool | datetime.datetime | datetime.date
9
8
 
10
9
 
11
10
  def print_slice(s: slice) -> str:
@@ -1,6 +1,6 @@
1
1
  from __future__ import annotations
2
2
 
3
- from typing import Any, Iterable, Optional
3
+ from typing import Any, Iterable
4
4
 
5
5
  import sqlalchemy as sql
6
6
 
@@ -16,13 +16,13 @@ from .sql_element_cache import SqlElementCache
16
16
  class InPredicate(Expr):
17
17
  """Predicate corresponding to the SQL IN operator."""
18
18
 
19
- def __init__(self, lhs: Expr, value_set_literal: Optional[Iterable] = None, value_set_expr: Optional[Expr] = None):
19
+ def __init__(self, lhs: Expr, value_set_literal: Iterable | None = None, value_set_expr: Expr | None = None):
20
20
  assert (value_set_literal is None) != (value_set_expr is None)
21
21
  if not lhs.col_type.is_scalar_type():
22
22
  raise excs.Error(f'isin(): only supported for scalar types, not {lhs.col_type}')
23
23
  super().__init__(ts.BoolType())
24
24
 
25
- self.value_list: Optional[list] = None # only contains values of the correct type
25
+ self.value_list: list | None = None # only contains values of the correct type
26
26
  if value_set_expr is not None:
27
27
  if not value_set_expr.col_type.is_json_type():
28
28
  raise excs.Error(
@@ -73,7 +73,7 @@ class InPredicate(Expr):
73
73
  def _id_attrs(self) -> list[tuple[str, Any]]:
74
74
  return [*super()._id_attrs(), ('value_list', self.value_list)]
75
75
 
76
- def sql_expr(self, sql_elements: SqlElementCache) -> Optional[sql.ColumnElement]:
76
+ def sql_expr(self, sql_elements: SqlElementCache) -> sql.ColumnElement | None:
77
77
  lhs_sql_exprs = sql_elements.get(self.components[0])
78
78
  if lhs_sql_exprs is None or self.value_list is None:
79
79
  return None
@@ -1,6 +1,6 @@
1
1
  from __future__ import annotations
2
2
 
3
- from typing import Any, Iterable, Optional
3
+ from typing import Any, Iterable
4
4
 
5
5
  import numpy as np
6
6
  import sqlalchemy as sql
@@ -30,7 +30,7 @@ class InlineArray(Expr):
30
30
  else:
31
31
  exprs.append(Literal(el))
32
32
 
33
- inferred_element_type: Optional[ts.ColumnType] = ts.InvalidType()
33
+ inferred_element_type: ts.ColumnType | None = ts.InvalidType()
34
34
  for i, expr in enumerate(exprs):
35
35
  supertype = inferred_element_type.supertype(expr.col_type)
36
36
  if supertype is None:
@@ -61,7 +61,7 @@ class InlineArray(Expr):
61
61
  def _equals(self, _: InlineArray) -> bool:
62
62
  return True # Always true if components match
63
63
 
64
- def sql_expr(self, _: SqlElementCache) -> Optional[sql.ColumnElement]:
64
+ def sql_expr(self, _: SqlElementCache) -> sql.ColumnElement | None:
65
65
  return None
66
66
 
67
67
  def eval(self, data_row: DataRow, row_builder: RowBuilder) -> None:
@@ -81,7 +81,7 @@ class InlineArray(Expr):
81
81
  # loaded and their types are known.
82
82
  return InlineList(components) # type: ignore[return-value]
83
83
 
84
- def as_literal(self) -> Optional[Literal]:
84
+ def as_literal(self) -> Literal | None:
85
85
  assert isinstance(self.col_type, ts.ArrayType)
86
86
  if not all(isinstance(comp, Literal) for comp in self.components):
87
87
  return None
@@ -98,13 +98,7 @@ class InlineList(Expr):
98
98
  def __init__(self, elements: Iterable):
99
99
  exprs = [Expr.from_object(el) for el in elements]
100
100
 
101
- json_schema = {
102
- 'type': 'array',
103
- 'prefixItems': [expr.col_type.to_json_schema() for expr in exprs],
104
- 'items': False, # No additional items (fixed length)
105
- }
106
-
107
- super().__init__(ts.JsonType(json_schema))
101
+ super().__init__(ts.JsonType())
108
102
  self.components.extend(exprs)
109
103
  self.id = self._create_id()
110
104
 
@@ -115,7 +109,7 @@ class InlineList(Expr):
115
109
  def _equals(self, _: InlineList) -> bool:
116
110
  return True # Always true if components match
117
111
 
118
- def sql_expr(self, _: SqlElementCache) -> Optional[sql.ColumnElement]:
112
+ def sql_expr(self, _: SqlElementCache) -> sql.ColumnElement | None:
119
113
  return None
120
114
 
121
115
  def eval(self, data_row: DataRow, _: RowBuilder) -> None:
@@ -128,7 +122,7 @@ class InlineList(Expr):
128
122
  def _from_dict(cls, _: dict, components: list[Expr]) -> InlineList:
129
123
  return cls(components)
130
124
 
131
- def as_literal(self) -> Optional[Literal]:
125
+ def as_literal(self) -> Literal | None:
132
126
  if not all(isinstance(comp, Literal) for comp in self.components):
133
127
  return None
134
128
  return Literal([c.as_literal().val for c in self.components], self.col_type)
@@ -150,18 +144,7 @@ class InlineDict(Expr):
150
144
  self.keys.append(key)
151
145
  exprs.append(Expr.from_object(val))
152
146
 
153
- json_schema: Optional[dict[str, Any]]
154
- try:
155
- json_schema = {
156
- 'type': 'object',
157
- 'properties': {key: expr.col_type.to_json_schema() for key, expr in zip(self.keys, exprs)},
158
- }
159
- except excs.Error:
160
- # InlineDicts are used to store iterator arguments, which are not required to be valid JSON types,
161
- # so we can't always construct a valid schema.
162
- json_schema = None
163
-
164
- super().__init__(ts.JsonType(json_schema))
147
+ super().__init__(ts.JsonType())
165
148
  self.components.extend(exprs)
166
149
  self.id = self._create_id()
167
150
 
@@ -176,7 +159,7 @@ class InlineDict(Expr):
176
159
  def _id_attrs(self) -> list[tuple[str, Any]]:
177
160
  return [*super()._id_attrs(), ('keys', self.keys)]
178
161
 
179
- def sql_expr(self, _: SqlElementCache) -> Optional[sql.ColumnElement]:
162
+ def sql_expr(self, _: SqlElementCache) -> sql.ColumnElement | None:
180
163
  return None
181
164
 
182
165
  def eval(self, data_row: DataRow, _: RowBuilder) -> None:
@@ -208,7 +191,7 @@ class InlineDict(Expr):
208
191
  arg = dict(zip(d['keys'], components))
209
192
  return InlineDict(arg)
210
193
 
211
- def as_literal(self) -> Optional[Literal]:
194
+ def as_literal(self) -> Literal | None:
212
195
  if not all(isinstance(comp, Literal) for comp in self.components):
213
196
  return None
214
197
  return Literal(dict(zip(self.keys, (c.as_literal().val for c in self.components))), self.col_type)
@@ -1,7 +1,5 @@
1
1
  from __future__ import annotations
2
2
 
3
- from typing import Optional
4
-
5
3
  import sqlalchemy as sql
6
4
 
7
5
  import pixeltable.type_system as ts
@@ -24,7 +22,7 @@ class IsNull(Expr):
24
22
  def _equals(self, other: IsNull) -> bool:
25
23
  return True
26
24
 
27
- def sql_expr(self, sql_elements: SqlElementCache) -> Optional[sql.ColumnElement]:
25
+ def sql_expr(self, sql_elements: SqlElementCache) -> sql.ColumnElement | None:
28
26
  e = sql_elements.get(self.components[0])
29
27
  if e is None:
30
28
  return None
@@ -1,6 +1,6 @@
1
1
  from __future__ import annotations
2
2
 
3
- from typing import TYPE_CHECKING, Optional
3
+ from typing import TYPE_CHECKING
4
4
 
5
5
  import sqlalchemy as sql
6
6
 
@@ -29,10 +29,10 @@ class JsonMapper(Expr):
29
29
  """
30
30
 
31
31
  target_expr_scope: ExprScope
32
- parent_mapper: Optional[JsonMapper]
33
- target_expr_eval_ctx: Optional[RowBuilder.EvalCtx]
32
+ parent_mapper: JsonMapper | None
33
+ target_expr_eval_ctx: RowBuilder.EvalCtx | None
34
34
 
35
- def __init__(self, src_expr: Optional[Expr], target_expr: Optional[Expr]):
35
+ def __init__(self, src_expr: Expr | None, target_expr: Expr | None):
36
36
  # TODO: type spec should be list[target_expr.col_type]
37
37
  super().__init__(ts.JsonType())
38
38
 
@@ -54,7 +54,7 @@ class JsonMapper(Expr):
54
54
  def _equals(self, _: JsonMapper) -> bool:
55
55
  return True
56
56
 
57
- def sql_expr(self, _: SqlElementCache) -> Optional[sql.ColumnElement]:
57
+ def sql_expr(self, _: SqlElementCache) -> sql.ColumnElement | None:
58
58
  return None
59
59
 
60
60
  def eval(self, data_row: DataRow, row_builder: RowBuilder) -> None:
@@ -92,8 +92,8 @@ class JsonMapperDispatch(Expr):
92
92
  """
93
93
 
94
94
  target_expr_scope: ExprScope
95
- parent_mapper: Optional[JsonMapperDispatch]
96
- target_expr_eval_ctx: Optional[RowBuilder.EvalCtx]
95
+ parent_mapper: JsonMapperDispatch | None
96
+ target_expr_eval_ctx: RowBuilder.EvalCtx | None
97
97
 
98
98
  def __init__(self, src_expr: Expr, target_expr: Expr):
99
99
  super().__init__(ts.InvalidType())
@@ -116,7 +116,7 @@ class JsonMapperDispatch(Expr):
116
116
  scope_anchor = ObjectRef(self.target_expr_scope, self)
117
117
  self.components.append(scope_anchor)
118
118
 
119
- def _bind_rel_paths(self, mapper: Optional[JsonMapperDispatch] = None) -> None:
119
+ def _bind_rel_paths(self, mapper: JsonMapperDispatch | None = None) -> None:
120
120
  self.src_expr._bind_rel_paths(mapper)
121
121
  self.target_expr._bind_rel_paths(self)
122
122
  self.parent_mapper = mapper
@@ -1,12 +1,15 @@
1
1
  from __future__ import annotations
2
2
 
3
- from typing import Any, Optional, Union
3
+ import io
4
+ from pathlib import Path
5
+ from typing import Any
4
6
 
5
7
  import jmespath
6
8
  import sqlalchemy as sql
7
9
 
8
10
  from pixeltable import catalog, exceptions as excs, type_system as ts
9
11
 
12
+ from .column_ref import ColumnRef
10
13
  from .data_row import DataRow
11
14
  from .expr import Expr
12
15
  from .globals import print_slice
@@ -17,29 +20,41 @@ from .sql_element_cache import SqlElementCache
17
20
 
18
21
 
19
22
  class JsonPath(Expr):
23
+ """
24
+ anchor can be None, in which case this is a relative JsonPath and the anchor is set later via set_anchor().
25
+ scope_idx: for relative paths, index of referenced JsonMapper
26
+ (0: indicates the immediately preceding JsonMapper, -1: the parent of the immediately preceding mapper, ...)
27
+ """
28
+
29
+ path_elements: list[str | int | slice]
30
+ compiled_path: jmespath.parser.ParsedResult | None
31
+ scope_idx: int
32
+ file_handles: dict[Path, io.BufferedReader] # key: file path
33
+
20
34
  def __init__(
21
- self, anchor: Optional[Expr], path_elements: Optional[list[Union[str, int, slice]]] = None, scope_idx: int = 0
35
+ self, anchor: Expr | None, path_elements: list[str | int | slice] | None = None, scope_idx: int = 0
22
36
  ) -> None:
23
- """
24
- anchor can be None, in which case this is a relative JsonPath and the anchor is set later via set_anchor().
25
- scope_idx: for relative paths, index of referenced JsonMapper
26
- (0: indicates the immediately preceding JsonMapper, -1: the parent of the immediately preceding mapper, ...)
27
- """
28
37
  if path_elements is None:
29
38
  path_elements = []
30
39
  super().__init__(ts.JsonType(nullable=True)) # JsonPath expressions are always nullable
31
40
  if anchor is not None:
32
41
  self.components = [anchor]
33
- self.path_elements: list[Union[str, int, slice]] = path_elements
42
+ self.path_elements = path_elements
34
43
  self.compiled_path = jmespath.compile(self._json_path()) if len(path_elements) > 0 else None
35
44
  self.scope_idx = scope_idx
36
45
  # NOTE: the _create_id() result will change if set_anchor() gets called;
37
46
  # this is not a problem, because _create_id() shouldn't be called after init()
38
47
  self.id = self._create_id()
48
+ self.file_handles = {}
49
+
50
+ def release(self) -> None:
51
+ for fh in self.file_handles.values():
52
+ fh.close()
53
+ self.file_handles.clear()
39
54
 
40
55
  def __repr__(self) -> str:
41
56
  # else 'R': the anchor is RELATIVE_PATH_ROOT
42
- anchor_str = str(self._anchor) if self._anchor is not None else 'R'
57
+ anchor_str = str(self.anchor) if self.anchor is not None else 'R'
43
58
  if len(self.path_elements) == 0:
44
59
  return anchor_str
45
60
  return f'{anchor_str}{"." if isinstance(self.path_elements[0], str) else ""}{self._json_path()}'
@@ -66,7 +81,7 @@ class JsonPath(Expr):
66
81
  return cls(anchor, path_elements, d['scope_idx'])
67
82
 
68
83
  @property
69
- def _anchor(self) -> Optional[Expr]:
84
+ def anchor(self) -> Expr | None:
70
85
  return None if len(self.components) == 0 else self.components[0]
71
86
 
72
87
  def set_anchor(self, anchor: Expr) -> None:
@@ -74,17 +89,17 @@ class JsonPath(Expr):
74
89
  self.components = [anchor]
75
90
 
76
91
  def is_relative_path(self) -> bool:
77
- return self._anchor is None
92
+ return self.anchor is None
78
93
 
79
94
  def _has_relative_path(self) -> bool:
80
95
  return self.is_relative_path() or super()._has_relative_path()
81
96
 
82
- def _bind_rel_paths(self, mapper: Optional['JsonMapperDispatch'] = None) -> None:
97
+ def _bind_rel_paths(self, mapper: 'JsonMapperDispatch' | None = None) -> None:
83
98
  if self.is_relative_path():
84
99
  # TODO: take scope_idx into account
85
100
  self.set_anchor(mapper.scope_anchor)
86
101
  else:
87
- self._anchor._bind_rel_paths(mapper)
102
+ self.anchor._bind_rel_paths(mapper)
88
103
 
89
104
  def __call__(self, *args: object, **kwargs: object) -> 'JsonPath':
90
105
  """
@@ -98,15 +113,15 @@ class JsonPath(Expr):
98
113
 
99
114
  def __getattr__(self, name: str) -> 'JsonPath':
100
115
  assert isinstance(name, str)
101
- return JsonPath(self._anchor, [*self.path_elements, name])
116
+ return JsonPath(self.anchor, [*self.path_elements, name])
102
117
 
103
118
  def __getitem__(self, index: object) -> 'JsonPath':
104
119
  if isinstance(index, (int, slice, str)):
105
- return JsonPath(self._anchor, [*self.path_elements, index])
120
+ return JsonPath(self.anchor, [*self.path_elements, index])
106
121
  raise excs.Error(f'Invalid json list index: {index}')
107
122
 
108
- def default_column_name(self) -> Optional[str]:
109
- anchor_name = self._anchor.default_column_name() if self._anchor is not None else ''
123
+ def default_column_name(self) -> str | None:
124
+ anchor_name = self.anchor.default_column_name() if self.anchor is not None else ''
110
125
  ret_name = f'{anchor_name}.{self._json_path()}'
111
126
 
112
127
  def cleanup_char(s: str) -> str:
@@ -133,7 +148,7 @@ class JsonPath(Expr):
133
148
  def _id_attrs(self) -> list[tuple[str, Any]]:
134
149
  return [*super()._id_attrs(), ('path_elements', self.path_elements)]
135
150
 
136
- def sql_expr(self, _: SqlElementCache) -> Optional[sql.ColumnElement]:
151
+ def sql_expr(self, _: SqlElementCache) -> sql.ColumnElement | None:
137
152
  """
138
153
  Postgres appears to have a bug: jsonb_path_query('{a: [{b: 0}, {b: 1}]}', '$.a.b') returns
139
154
  *two* rows (each containing col val 0), not a single row with [0, 0].
@@ -158,12 +173,31 @@ class JsonPath(Expr):
158
173
  result.append(f'[{print_slice(element)}]')
159
174
  return ''.join(result)
160
175
 
161
- def eval(self, data_row: DataRow, row_builder: RowBuilder) -> None:
162
- assert self._anchor is not None, self
163
- val = data_row[self._anchor.slot_idx]
176
+ def eval(self, row: DataRow, row_builder: RowBuilder) -> None:
177
+ assert self.anchor is not None, self
178
+ val = row[self.anchor.slot_idx]
164
179
  if self.compiled_path is not None:
165
180
  val = self.compiled_path.search(val)
166
- data_row[self.slot_idx] = val
181
+ row[self.slot_idx] = val
182
+ if val is None or self.anchor is None or not isinstance(self.anchor, ColumnRef):
183
+ return
184
+
185
  # the origin of val is a json-typed column, which might store inlined objects
186
+ if self.anchor.slot_idx not in row.slot_md:
187
+ # we can infer that there aren't any inlined objects because our execution plan doesn't include
188
+ # materializing the cellmd (eg, insert plans)
189
+ # TODO: have the planner pass that fact into ExprEvalNode explicitly to streamline this path a bit more
190
+ return
191
+
192
+ # defer import until it's needed
193
+ from pixeltable.exec.cell_reconstruction_node import json_has_inlined_objs, reconstruct_json
194
+
195
+ cell_md = row.slot_md[self.anchor.slot_idx]
196
+ if cell_md is None or cell_md.file_urls is None or not json_has_inlined_objs(val):
197
+ # val doesn't contain inlined objects
198
+ return
199
+
200
+ row.vals[self.slot_idx] = reconstruct_json(val, cell_md.file_urls, self.file_handles)
167
201
 
168
202
 
169
203
  RELATIVE_PATH_ROOT = JsonPath(None)