pixeltable 0.3.14__py3-none-any.whl → 0.5.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (220) hide show
  1. pixeltable/__init__.py +42 -8
  2. pixeltable/{dataframe.py → _query.py} +470 -206
  3. pixeltable/_version.py +1 -0
  4. pixeltable/catalog/__init__.py +5 -4
  5. pixeltable/catalog/catalog.py +1785 -432
  6. pixeltable/catalog/column.py +190 -113
  7. pixeltable/catalog/dir.py +2 -4
  8. pixeltable/catalog/globals.py +19 -46
  9. pixeltable/catalog/insertable_table.py +191 -98
  10. pixeltable/catalog/path.py +63 -23
  11. pixeltable/catalog/schema_object.py +11 -15
  12. pixeltable/catalog/table.py +843 -436
  13. pixeltable/catalog/table_metadata.py +103 -0
  14. pixeltable/catalog/table_version.py +978 -657
  15. pixeltable/catalog/table_version_handle.py +72 -16
  16. pixeltable/catalog/table_version_path.py +112 -43
  17. pixeltable/catalog/tbl_ops.py +53 -0
  18. pixeltable/catalog/update_status.py +191 -0
  19. pixeltable/catalog/view.py +134 -90
  20. pixeltable/config.py +134 -22
  21. pixeltable/env.py +471 -157
  22. pixeltable/exceptions.py +6 -0
  23. pixeltable/exec/__init__.py +4 -1
  24. pixeltable/exec/aggregation_node.py +7 -8
  25. pixeltable/exec/cache_prefetch_node.py +83 -110
  26. pixeltable/exec/cell_materialization_node.py +268 -0
  27. pixeltable/exec/cell_reconstruction_node.py +168 -0
  28. pixeltable/exec/component_iteration_node.py +4 -3
  29. pixeltable/exec/data_row_batch.py +8 -65
  30. pixeltable/exec/exec_context.py +16 -4
  31. pixeltable/exec/exec_node.py +13 -36
  32. pixeltable/exec/expr_eval/evaluators.py +11 -7
  33. pixeltable/exec/expr_eval/expr_eval_node.py +27 -12
  34. pixeltable/exec/expr_eval/globals.py +8 -5
  35. pixeltable/exec/expr_eval/row_buffer.py +1 -2
  36. pixeltable/exec/expr_eval/schedulers.py +106 -56
  37. pixeltable/exec/globals.py +35 -0
  38. pixeltable/exec/in_memory_data_node.py +19 -19
  39. pixeltable/exec/object_store_save_node.py +293 -0
  40. pixeltable/exec/row_update_node.py +16 -9
  41. pixeltable/exec/sql_node.py +351 -84
  42. pixeltable/exprs/__init__.py +1 -1
  43. pixeltable/exprs/arithmetic_expr.py +27 -22
  44. pixeltable/exprs/array_slice.py +3 -3
  45. pixeltable/exprs/column_property_ref.py +36 -23
  46. pixeltable/exprs/column_ref.py +213 -89
  47. pixeltable/exprs/comparison.py +5 -5
  48. pixeltable/exprs/compound_predicate.py +5 -4
  49. pixeltable/exprs/data_row.py +164 -54
  50. pixeltable/exprs/expr.py +70 -44
  51. pixeltable/exprs/expr_dict.py +3 -3
  52. pixeltable/exprs/expr_set.py +17 -10
  53. pixeltable/exprs/function_call.py +100 -40
  54. pixeltable/exprs/globals.py +2 -2
  55. pixeltable/exprs/in_predicate.py +4 -4
  56. pixeltable/exprs/inline_expr.py +18 -32
  57. pixeltable/exprs/is_null.py +7 -3
  58. pixeltable/exprs/json_mapper.py +8 -8
  59. pixeltable/exprs/json_path.py +56 -22
  60. pixeltable/exprs/literal.py +27 -5
  61. pixeltable/exprs/method_ref.py +2 -2
  62. pixeltable/exprs/object_ref.py +2 -2
  63. pixeltable/exprs/row_builder.py +167 -67
  64. pixeltable/exprs/rowid_ref.py +25 -10
  65. pixeltable/exprs/similarity_expr.py +58 -40
  66. pixeltable/exprs/sql_element_cache.py +4 -4
  67. pixeltable/exprs/string_op.py +5 -5
  68. pixeltable/exprs/type_cast.py +3 -5
  69. pixeltable/func/__init__.py +1 -0
  70. pixeltable/func/aggregate_function.py +8 -8
  71. pixeltable/func/callable_function.py +9 -9
  72. pixeltable/func/expr_template_function.py +17 -11
  73. pixeltable/func/function.py +18 -20
  74. pixeltable/func/function_registry.py +6 -7
  75. pixeltable/func/globals.py +2 -3
  76. pixeltable/func/mcp.py +74 -0
  77. pixeltable/func/query_template_function.py +29 -27
  78. pixeltable/func/signature.py +46 -19
  79. pixeltable/func/tools.py +31 -13
  80. pixeltable/func/udf.py +18 -20
  81. pixeltable/functions/__init__.py +16 -0
  82. pixeltable/functions/anthropic.py +123 -77
  83. pixeltable/functions/audio.py +147 -10
  84. pixeltable/functions/bedrock.py +13 -6
  85. pixeltable/functions/date.py +7 -4
  86. pixeltable/functions/deepseek.py +35 -43
  87. pixeltable/functions/document.py +81 -0
  88. pixeltable/functions/fal.py +76 -0
  89. pixeltable/functions/fireworks.py +11 -20
  90. pixeltable/functions/gemini.py +195 -39
  91. pixeltable/functions/globals.py +142 -14
  92. pixeltable/functions/groq.py +108 -0
  93. pixeltable/functions/huggingface.py +1056 -24
  94. pixeltable/functions/image.py +115 -57
  95. pixeltable/functions/json.py +1 -1
  96. pixeltable/functions/llama_cpp.py +28 -13
  97. pixeltable/functions/math.py +67 -5
  98. pixeltable/functions/mistralai.py +18 -55
  99. pixeltable/functions/net.py +70 -0
  100. pixeltable/functions/ollama.py +20 -13
  101. pixeltable/functions/openai.py +240 -226
  102. pixeltable/functions/openrouter.py +143 -0
  103. pixeltable/functions/replicate.py +4 -4
  104. pixeltable/functions/reve.py +250 -0
  105. pixeltable/functions/string.py +239 -69
  106. pixeltable/functions/timestamp.py +16 -16
  107. pixeltable/functions/together.py +24 -84
  108. pixeltable/functions/twelvelabs.py +188 -0
  109. pixeltable/functions/util.py +6 -1
  110. pixeltable/functions/uuid.py +30 -0
  111. pixeltable/functions/video.py +1515 -107
  112. pixeltable/functions/vision.py +8 -8
  113. pixeltable/functions/voyageai.py +289 -0
  114. pixeltable/functions/whisper.py +16 -8
  115. pixeltable/functions/whisperx.py +179 -0
  116. pixeltable/{ext/functions → functions}/yolox.py +2 -4
  117. pixeltable/globals.py +362 -115
  118. pixeltable/index/base.py +17 -21
  119. pixeltable/index/btree.py +28 -22
  120. pixeltable/index/embedding_index.py +100 -118
  121. pixeltable/io/__init__.py +4 -2
  122. pixeltable/io/datarows.py +8 -7
  123. pixeltable/io/external_store.py +56 -105
  124. pixeltable/io/fiftyone.py +13 -13
  125. pixeltable/io/globals.py +31 -30
  126. pixeltable/io/hf_datasets.py +61 -16
  127. pixeltable/io/label_studio.py +74 -70
  128. pixeltable/io/lancedb.py +3 -0
  129. pixeltable/io/pandas.py +21 -12
  130. pixeltable/io/parquet.py +25 -105
  131. pixeltable/io/table_data_conduit.py +250 -123
  132. pixeltable/io/utils.py +4 -4
  133. pixeltable/iterators/__init__.py +2 -1
  134. pixeltable/iterators/audio.py +26 -25
  135. pixeltable/iterators/base.py +9 -3
  136. pixeltable/iterators/document.py +112 -78
  137. pixeltable/iterators/image.py +12 -15
  138. pixeltable/iterators/string.py +11 -4
  139. pixeltable/iterators/video.py +523 -120
  140. pixeltable/metadata/__init__.py +14 -3
  141. pixeltable/metadata/converters/convert_13.py +2 -2
  142. pixeltable/metadata/converters/convert_18.py +2 -2
  143. pixeltable/metadata/converters/convert_19.py +2 -2
  144. pixeltable/metadata/converters/convert_20.py +2 -2
  145. pixeltable/metadata/converters/convert_21.py +2 -2
  146. pixeltable/metadata/converters/convert_22.py +2 -2
  147. pixeltable/metadata/converters/convert_24.py +2 -2
  148. pixeltable/metadata/converters/convert_25.py +2 -2
  149. pixeltable/metadata/converters/convert_26.py +2 -2
  150. pixeltable/metadata/converters/convert_29.py +4 -4
  151. pixeltable/metadata/converters/convert_30.py +34 -21
  152. pixeltable/metadata/converters/convert_34.py +2 -2
  153. pixeltable/metadata/converters/convert_35.py +9 -0
  154. pixeltable/metadata/converters/convert_36.py +38 -0
  155. pixeltable/metadata/converters/convert_37.py +15 -0
  156. pixeltable/metadata/converters/convert_38.py +39 -0
  157. pixeltable/metadata/converters/convert_39.py +124 -0
  158. pixeltable/metadata/converters/convert_40.py +73 -0
  159. pixeltable/metadata/converters/convert_41.py +12 -0
  160. pixeltable/metadata/converters/convert_42.py +9 -0
  161. pixeltable/metadata/converters/convert_43.py +44 -0
  162. pixeltable/metadata/converters/util.py +20 -31
  163. pixeltable/metadata/notes.py +9 -0
  164. pixeltable/metadata/schema.py +140 -53
  165. pixeltable/metadata/utils.py +74 -0
  166. pixeltable/mypy/__init__.py +3 -0
  167. pixeltable/mypy/mypy_plugin.py +123 -0
  168. pixeltable/plan.py +382 -115
  169. pixeltable/share/__init__.py +1 -1
  170. pixeltable/share/packager.py +547 -83
  171. pixeltable/share/protocol/__init__.py +33 -0
  172. pixeltable/share/protocol/common.py +165 -0
  173. pixeltable/share/protocol/operation_types.py +33 -0
  174. pixeltable/share/protocol/replica.py +119 -0
  175. pixeltable/share/publish.py +257 -59
  176. pixeltable/store.py +311 -194
  177. pixeltable/type_system.py +373 -211
  178. pixeltable/utils/__init__.py +2 -3
  179. pixeltable/utils/arrow.py +131 -17
  180. pixeltable/utils/av.py +298 -0
  181. pixeltable/utils/azure_store.py +346 -0
  182. pixeltable/utils/coco.py +6 -6
  183. pixeltable/utils/code.py +3 -3
  184. pixeltable/utils/console_output.py +4 -1
  185. pixeltable/utils/coroutine.py +6 -23
  186. pixeltable/utils/dbms.py +32 -6
  187. pixeltable/utils/description_helper.py +4 -5
  188. pixeltable/utils/documents.py +7 -18
  189. pixeltable/utils/exception_handler.py +7 -30
  190. pixeltable/utils/filecache.py +6 -6
  191. pixeltable/utils/formatter.py +86 -48
  192. pixeltable/utils/gcs_store.py +295 -0
  193. pixeltable/utils/http.py +133 -0
  194. pixeltable/utils/http_server.py +2 -3
  195. pixeltable/utils/iceberg.py +1 -2
  196. pixeltable/utils/image.py +17 -0
  197. pixeltable/utils/lancedb.py +90 -0
  198. pixeltable/utils/local_store.py +322 -0
  199. pixeltable/utils/misc.py +5 -0
  200. pixeltable/utils/object_stores.py +573 -0
  201. pixeltable/utils/pydantic.py +60 -0
  202. pixeltable/utils/pytorch.py +5 -6
  203. pixeltable/utils/s3_store.py +527 -0
  204. pixeltable/utils/sql.py +26 -0
  205. pixeltable/utils/system.py +30 -0
  206. pixeltable-0.5.7.dist-info/METADATA +579 -0
  207. pixeltable-0.5.7.dist-info/RECORD +227 -0
  208. {pixeltable-0.3.14.dist-info → pixeltable-0.5.7.dist-info}/WHEEL +1 -1
  209. pixeltable-0.5.7.dist-info/entry_points.txt +2 -0
  210. pixeltable/__version__.py +0 -3
  211. pixeltable/catalog/named_function.py +0 -40
  212. pixeltable/ext/__init__.py +0 -17
  213. pixeltable/ext/functions/__init__.py +0 -11
  214. pixeltable/ext/functions/whisperx.py +0 -77
  215. pixeltable/utils/media_store.py +0 -77
  216. pixeltable/utils/s3.py +0 -17
  217. pixeltable-0.3.14.dist-info/METADATA +0 -434
  218. pixeltable-0.3.14.dist-info/RECORD +0 -186
  219. pixeltable-0.3.14.dist-info/entry_points.txt +0 -3
  220. {pixeltable-0.3.14.dist-info → pixeltable-0.5.7.dist-info/licenses}/LICENSE +0 -0
@@ -4,7 +4,7 @@ import inspect
4
4
  import logging
5
5
  import sys
6
6
  from textwrap import dedent
7
- from typing import Any, Optional, Sequence, Union
7
+ from typing import Any, Sequence
8
8
 
9
9
  import sqlalchemy as sql
10
10
 
@@ -24,7 +24,7 @@ class FunctionCall(Expr):
24
24
  fn: func.Function
25
25
  is_method_call: bool
26
26
  agg_init_args: dict[str, Any]
27
- resource_pool: Optional[str]
27
+ resource_pool: str | None
28
28
 
29
29
  # These collections hold the component indices corresponding to the args and kwargs
30
30
  # that were passed to the FunctionCall. They're 1:1 with the original call pattern.
@@ -36,17 +36,17 @@ class FunctionCall(Expr):
36
36
  # - a component index, if the parameter is a non-variadic parameter
37
37
  # - a list of component indices, if the parameter is a variadic positional parameter
38
38
  # - a dict mapping keyword names to component indices, if the parameter is a variadic keyword parameter
39
- bound_idxs: dict[str, Union[int, list[int], dict[str, int]]]
39
+ bound_idxs: dict[str, int | list[int] | dict[str, int]]
40
40
 
41
41
  return_type: ts.ColumnType
42
42
  group_by_start_idx: int
43
43
  group_by_stop_idx: int
44
44
  fn_expr_idx: int
45
45
  order_by_start_idx: int
46
- aggregator: Optional[Any]
47
- current_partition_vals: Optional[list[Any]]
46
+ aggregator: Any | None
47
+ current_partition_vals: list[Any] | None
48
48
 
49
- _validation_error: Optional[str]
49
+ _validation_error: str | None
50
50
 
51
51
  def __init__(
52
52
  self,
@@ -54,10 +54,10 @@ class FunctionCall(Expr):
54
54
  args: list[Expr],
55
55
  kwargs: dict[str, Expr],
56
56
  return_type: ts.ColumnType,
57
- order_by_clause: Optional[list[Any]] = None,
58
- group_by_clause: Optional[list[Any]] = None,
57
+ order_by_clause: list[Any] | None = None,
58
+ group_by_clause: list[Any] | None = None,
59
59
  is_method_call: bool = False,
60
- validation_error: Optional[str] = None,
60
+ validation_error: str | None = None,
61
61
  ):
62
62
  assert not fn.is_polymorphic
63
63
  assert all(isinstance(arg, Expr) for arg in args)
@@ -115,6 +115,7 @@ class FunctionCall(Expr):
115
115
  self._validation_error = validation_error
116
116
 
117
117
  if validation_error is not None:
118
+ self.bound_idxs = {}
118
119
  self.resource_pool = None
119
120
  return
120
121
 
@@ -148,7 +149,7 @@ class FunctionCall(Expr):
148
149
  target = tbl._tbl_version_path.tbl_version
149
150
  return [RowidRef(target, i) for i in range(target.get().num_rowid_columns())]
150
151
 
151
- def default_column_name(self) -> Optional[str]:
152
+ def default_column_name(self) -> str | None:
152
153
  return self.fn.name
153
154
 
154
155
  def _equals(self, other: FunctionCall) -> bool:
@@ -176,11 +177,19 @@ class FunctionCall(Expr):
176
177
  def __repr__(self) -> str:
177
178
  return self.display_str()
178
179
 
180
+ # def __repr__(self) -> str:
181
+ # return f'FunctionCall(fn={self.fn!r}, args={self.args!r}, kwargs={self.kwargs!r})'
182
+
179
183
  @property
180
- def validation_error(self) -> Optional[str]:
184
+ def validation_error(self) -> str | None:
181
185
  return self._validation_error or super().validation_error
182
186
 
183
187
  def display_str(self, inline: bool = True) -> str:
188
+ if isinstance(self.fn, func.ExprTemplateFunction) and isinstance(self.fn.template.expr, FunctionCall):
189
+ # If this FunctionCall uses an ExprTemplateFunction with a nested FunctionCall, then resolve the
190
+ # indirection by substitution into the ExprTemplateFunction.
191
+ subst = self.fn.instantiate(self.args, self.kwargs)
192
+ return subst.display_str(inline)
184
193
  if self.is_method_call:
185
194
  return f'{self.components[0]}.{self.fn.name}({self._print_args(1, inline)})'
186
195
  else:
@@ -244,7 +253,7 @@ class FunctionCall(Expr):
244
253
  assert self.is_agg_fn_call
245
254
  return self.order_by
246
255
 
247
- def sql_expr(self, sql_elements: SqlElementCache) -> Optional[sql.ColumnElement]:
256
+ def sql_expr(self, sql_elements: SqlElementCache) -> sql.ColumnElement | None:
248
257
  assert self.is_valid
249
258
 
250
259
  # we currently can't translate aggregate functions with grouping and/or ordering to SQL
@@ -300,10 +309,32 @@ class FunctionCall(Expr):
300
309
  """
301
310
  res = super().substitute(spec)
302
311
  assert res is self
303
- self.return_type = self.fn.call_return_type(self.bound_args)
304
- self.col_type = self.return_type
312
+ if self.is_valid:
313
+ # If this FunctionCall is valid, re-evaluate the call_return_type of the substituted expression. If the
314
+ # FunctionCall is not valid, it isn't safe to do this. (Really we should be asserting that it *is* valid,
315
+ # but we still need to be able to do substitutions on invalid FunctionCalls, because loading an
316
+ # EmbeddingIndex from the db involves reconstructing the requisite (substituted) FunctionCalls. We could
317
+ # fix this by separately persisting the FunctionCall instances held by EmbeddingIndex to the db. That's
318
+ # probably a good idea, but it's also probably not urgent, since it only affects Functions that have a
319
+ # conditional_return_type implemented.)
320
+ self.return_type = self.fn.call_return_type(self.bound_args)
321
+ self.col_type = self.return_type
305
322
  return self
306
323
 
324
+ @property
325
+ def args(self) -> list[Expr]:
326
+ return [self.components[idx] for idx in self.arg_idxs]
327
+
328
+ @property
329
+ def kwargs(self) -> dict[str, Expr]:
330
+ return {name: self.components[idx] for name, idx in self.kwarg_idxs.items()}
331
+
332
+ @property
333
+ def fn_expr(self) -> Expr | None:
334
+ if self.fn_expr_idx != sys.maxsize:
335
+ return self.components[self.fn_expr_idx]
336
+ return None
337
+
307
338
  def update(self, data_row: DataRow) -> None:
308
339
  """
309
340
  Update agg state
@@ -312,7 +343,7 @@ class FunctionCall(Expr):
312
343
  args, kwargs = self.make_args(data_row)
313
344
  self.aggregator.update(*args, **kwargs)
314
345
 
315
- def make_args(self, data_row: DataRow) -> Optional[tuple[list[Any], dict[str, Any]]]:
346
+ def make_args(self, data_row: DataRow) -> tuple[list[Any], dict[str, Any]] | None:
316
347
  """Return args and kwargs, constructed for data_row; returns None if any non-nullable arg is None."""
317
348
  args: list[Any] = []
318
349
  parameters_by_pos = self.fn.signature.parameters_by_pos
@@ -439,18 +470,18 @@ class FunctionCall(Expr):
439
470
  group_by_exprs = components[group_by_start_idx:group_by_stop_idx]
440
471
  order_by_exprs = components[order_by_start_idx:]
441
472
 
442
- validation_error: Optional[str] = None
473
+ validation_error: str | None = None
443
474
 
444
475
  if isinstance(fn, func.InvalidFunction):
445
476
  validation_error = (
446
477
  dedent(
447
478
  f"""
448
479
  The UDF '{fn.self_path}' cannot be located, because
449
- {{errormsg}}
480
+ {{error_msg}}
450
481
  """
451
482
  )
452
483
  .strip()
453
- .format(errormsg=fn.errormsg)
484
+ .format(error_msg=fn.error_msg)
454
485
  )
455
486
  return cls(fn, args, kwargs, return_type, is_method_call=is_method_call, validation_error=validation_error)
456
487
 
@@ -465,9 +496,9 @@ class FunctionCall(Expr):
465
496
  resolved_fn, bound_args = fn._bind_to_matching_signature(args, kwargs)
466
497
  except (TypeError, excs.Error):
467
498
  signature_note_str = 'any of its signatures' if fn.is_polymorphic else 'its signature'
468
- args_str = [str(arg.col_type) for arg in args]
469
- args_str.extend(f'{name}: {arg.col_type}' for name, arg in kwargs.items())
470
- call_signature_str = f'({", ".join(args_str)}) -> {return_type}'
499
+ args_str = [f'pxt.{arg.col_type}' for arg in args]
500
+ args_str.extend(f'{name}: pxt.{arg.col_type}' for name, arg in kwargs.items())
501
+ call_signature_str = f'({", ".join(args_str)}) -> pxt.{return_type}'
471
502
  fn_signature_str = f'{len(fn.signatures)} signatures' if fn.is_polymorphic else str(fn.signature)
472
503
  validation_error = dedent(
473
504
  f"""
@@ -480,25 +511,54 @@ class FunctionCall(Expr):
480
511
  ).strip()
481
512
  else:
482
513
  # Evaluate the call_return_type as defined in the current codebase.
483
- call_return_type = resolved_fn.call_return_type(bound_args)
484
- if return_type is None:
485
- # Schema versions prior to 25 did not store the return_type in metadata, and there is no obvious way to
486
- # infer it during DB migration, so we might encounter a stored return_type of None. In that case, we use
487
- # the call_return_type that we just inferred (which matches the deserialization behavior prior to
488
- # version 25).
489
- return_type = call_return_type
490
- elif not return_type.is_supertype_of(call_return_type, ignore_nullable=True):
491
- # There is a return_type stored in metadata (schema version >= 25),
492
- # and the stored return_type of the UDF call doesn't match the column type of the FunctionCall.
493
- validation_error = dedent(
494
- f"""
495
- The return type stored in the database for a UDF call to {fn.self_path!r} no longer
496
- matches its return type as currently defined in the code. This probably means that the
497
- code for {fn.self_path!r} has changed in a backward-incompatible way.
498
- Return type of UDF call in the database: {return_type}
499
- Return type of UDF as currently defined in code: {call_return_type}
500
- """
501
- ).strip()
514
+ call_return_type: ts.ColumnType | None = None
515
+
516
+ if isinstance(resolved_fn, func.ExprTemplateFunction) and not resolved_fn.template.expr.is_valid:
517
+ # The FunctionCall is based on an ExprTemplateFunction, but the template expression is not valid
518
+ # (because it in turn contains an invalid FunctionCall). In this case, inherit the validation error
519
+ # from the template expression.
520
+ validation_error = resolved_fn.template.expr.validation_error
521
+ else:
522
+ try:
523
+ call_return_type = resolved_fn.call_return_type(bound_args)
524
+ except ImportError as exc:
525
+ validation_error = dedent(
526
+ f"""
527
+ A UDF call to {fn.self_path!r} could not be fully resolved, because a module required
528
+ by the UDF could not be imported:
529
+ {exc}
530
+ """
531
+ )
532
+
533
+ assert (call_return_type is None) != (validation_error is None)
534
+
535
+ if call_return_type is None and return_type is None:
536
+ # Schema versions prior to 25 did not store the return_type in metadata, and there is no obvious
537
+ # way to infer it during DB migration, so we might encounter a stored return_type of None. If the
538
+ # resolution of call_return_type also fails, then we're out of luck; we have no choice but to
539
+ # fail-fast.
540
+ raise excs.Error(validation_error)
541
+
542
+ if call_return_type is not None:
543
+ # call_return_type resolution succeeded.
544
+ if return_type is None:
545
+ # Schema versions prior to 25 did not store the return_type in metadata (as mentioned above), so
546
+ # fall back on the call_return_type.
547
+ return_type = call_return_type
548
+ elif not return_type.is_supertype_of(call_return_type, ignore_nullable=True):
549
+ # There is a return_type stored in metadata (schema version >= 25),
550
+ # and the stored return_type of the UDF call doesn't match the column type of the FunctionCall.
551
+ validation_error = dedent(
552
+ f"""
553
+ The return type stored in the database for a UDF call to {fn.self_path!r} no longer
554
+ matches its return type as currently defined in the code. This probably means that the
555
+ code for {fn.self_path!r} has changed in a backward-incompatible way.
556
+ Return type of UDF call in the database: {return_type}
557
+ Return type of UDF as currently defined in code: {call_return_type}
558
+ """
559
+ ).strip()
560
+
561
+ assert return_type is not None # Guaranteed by the above logic.
502
562
 
503
563
  fn_call = cls(
504
564
  resolved_fn,
@@ -2,10 +2,10 @@ from __future__ import annotations
2
2
 
3
3
  import datetime
4
4
  import enum
5
- from typing import Union
5
+ import uuid
6
6
 
7
7
  # Python types corresponding to our literal types
8
- LiteralPythonTypes = Union[str, int, float, bool, datetime.datetime, datetime.date]
8
+ LiteralPythonTypes = str | int | float | bool | datetime.datetime | datetime.date | uuid.UUID
9
9
 
10
10
 
11
11
  def print_slice(s: slice) -> str:
@@ -1,6 +1,6 @@
1
1
  from __future__ import annotations
2
2
 
3
- from typing import Any, Iterable, Optional
3
+ from typing import Any, Iterable
4
4
 
5
5
  import sqlalchemy as sql
6
6
 
@@ -16,13 +16,13 @@ from .sql_element_cache import SqlElementCache
16
16
  class InPredicate(Expr):
17
17
  """Predicate corresponding to the SQL IN operator."""
18
18
 
19
- def __init__(self, lhs: Expr, value_set_literal: Optional[Iterable] = None, value_set_expr: Optional[Expr] = None):
19
+ def __init__(self, lhs: Expr, value_set_literal: Iterable | None = None, value_set_expr: Expr | None = None):
20
20
  assert (value_set_literal is None) != (value_set_expr is None)
21
21
  if not lhs.col_type.is_scalar_type():
22
22
  raise excs.Error(f'isin(): only supported for scalar types, not {lhs.col_type}')
23
23
  super().__init__(ts.BoolType())
24
24
 
25
- self.value_list: Optional[list] = None # only contains values of the correct type
25
+ self.value_list: list | None = None # only contains values of the correct type
26
26
  if value_set_expr is not None:
27
27
  if not value_set_expr.col_type.is_json_type():
28
28
  raise excs.Error(
@@ -73,7 +73,7 @@ class InPredicate(Expr):
73
73
  def _id_attrs(self) -> list[tuple[str, Any]]:
74
74
  return [*super()._id_attrs(), ('value_list', self.value_list)]
75
75
 
76
- def sql_expr(self, sql_elements: SqlElementCache) -> Optional[sql.ColumnElement]:
76
+ def sql_expr(self, sql_elements: SqlElementCache) -> sql.ColumnElement | None:
77
77
  lhs_sql_exprs = sql_elements.get(self.components[0])
78
78
  if lhs_sql_exprs is None or self.value_list is None:
79
79
  return None
@@ -1,6 +1,6 @@
1
1
  from __future__ import annotations
2
2
 
3
- from typing import Any, Iterable, Optional
3
+ from typing import Any, Iterable
4
4
 
5
5
  import numpy as np
6
6
  import sqlalchemy as sql
@@ -30,9 +30,9 @@ class InlineArray(Expr):
30
30
  else:
31
31
  exprs.append(Literal(el))
32
32
 
33
- inferred_element_type: Optional[ts.ColumnType] = ts.InvalidType()
33
+ inferred_element_type: ts.ColumnType | None = ts.InvalidType()
34
34
  for i, expr in enumerate(exprs):
35
- supertype = inferred_element_type.supertype(expr.col_type)
35
+ supertype = inferred_element_type.supertype(expr.col_type, for_inference=True)
36
36
  if supertype is None:
37
37
  raise excs.Error(
38
38
  f'Could not infer element type of array: element of type `{expr.col_type}` at index {i} '
@@ -44,9 +44,12 @@ class InlineArray(Expr):
44
44
  col_type = ts.ArrayType((len(exprs),), inferred_element_type)
45
45
  elif inferred_element_type.is_array_type():
46
46
  assert isinstance(inferred_element_type, ts.ArrayType)
47
- col_type = ts.ArrayType(
48
- (len(exprs), *inferred_element_type.shape), ts.ColumnType.make_type(inferred_element_type.dtype)
49
- )
47
+ dtype = inferred_element_type.dtype
48
+ shape = inferred_element_type.shape
49
+ if shape is not None and dtype is not None:
50
+ col_type = ts.ArrayType(shape=(len(exprs), *shape), dtype=dtype)
51
+ else:
52
+ col_type = ts.ArrayType(shape=None, dtype=dtype)
50
53
  else:
51
54
  raise excs.Error(f'Element type is not a valid dtype for an array: {inferred_element_type}')
52
55
 
@@ -61,7 +64,7 @@ class InlineArray(Expr):
61
64
  def _equals(self, _: InlineArray) -> bool:
62
65
  return True # Always true if components match
63
66
 
64
- def sql_expr(self, _: SqlElementCache) -> Optional[sql.ColumnElement]:
67
+ def sql_expr(self, _: SqlElementCache) -> sql.ColumnElement | None:
65
68
  return None
66
69
 
67
70
  def eval(self, data_row: DataRow, row_builder: RowBuilder) -> None:
@@ -81,12 +84,12 @@ class InlineArray(Expr):
81
84
  # loaded and their types are known.
82
85
  return InlineList(components) # type: ignore[return-value]
83
86
 
84
- def as_literal(self) -> Optional[Literal]:
87
+ def as_literal(self) -> Literal | None:
85
88
  assert isinstance(self.col_type, ts.ArrayType)
86
89
  if not all(isinstance(comp, Literal) for comp in self.components):
87
90
  return None
88
91
  return Literal(
89
- np.array([c.as_literal().val for c in self.components], dtype=self.col_type.numpy_dtype()), self.col_type
92
+ np.array([c.as_literal().val for c in self.components], dtype=self.col_type.dtype), self.col_type
90
93
  )
91
94
 
92
95
 
@@ -98,13 +101,7 @@ class InlineList(Expr):
98
101
  def __init__(self, elements: Iterable):
99
102
  exprs = [Expr.from_object(el) for el in elements]
100
103
 
101
- json_schema = {
102
- 'type': 'array',
103
- 'prefixItems': [expr.col_type.to_json_schema() for expr in exprs],
104
- 'items': False, # No additional items (fixed length)
105
- }
106
-
107
- super().__init__(ts.JsonType(json_schema))
104
+ super().__init__(ts.JsonType())
108
105
  self.components.extend(exprs)
109
106
  self.id = self._create_id()
110
107
 
@@ -115,7 +112,7 @@ class InlineList(Expr):
115
112
  def _equals(self, _: InlineList) -> bool:
116
113
  return True # Always true if components match
117
114
 
118
- def sql_expr(self, _: SqlElementCache) -> Optional[sql.ColumnElement]:
115
+ def sql_expr(self, _: SqlElementCache) -> sql.ColumnElement | None:
119
116
  return None
120
117
 
121
118
  def eval(self, data_row: DataRow, _: RowBuilder) -> None:
@@ -128,7 +125,7 @@ class InlineList(Expr):
128
125
  def _from_dict(cls, _: dict, components: list[Expr]) -> InlineList:
129
126
  return cls(components)
130
127
 
131
- def as_literal(self) -> Optional[Literal]:
128
+ def as_literal(self) -> Literal | None:
132
129
  if not all(isinstance(comp, Literal) for comp in self.components):
133
130
  return None
134
131
  return Literal([c.as_literal().val for c in self.components], self.col_type)
@@ -150,18 +147,7 @@ class InlineDict(Expr):
150
147
  self.keys.append(key)
151
148
  exprs.append(Expr.from_object(val))
152
149
 
153
- json_schema: Optional[dict[str, Any]]
154
- try:
155
- json_schema = {
156
- 'type': 'object',
157
- 'properties': {key: expr.col_type.to_json_schema() for key, expr in zip(self.keys, exprs)},
158
- }
159
- except excs.Error:
160
- # InlineDicts are used to store iterator arguments, which are not required to be valid JSON types,
161
- # so we can't always construct a valid schema.
162
- json_schema = None
163
-
164
- super().__init__(ts.JsonType(json_schema))
150
+ super().__init__(ts.JsonType())
165
151
  self.components.extend(exprs)
166
152
  self.id = self._create_id()
167
153
 
@@ -176,7 +162,7 @@ class InlineDict(Expr):
176
162
  def _id_attrs(self) -> list[tuple[str, Any]]:
177
163
  return [*super()._id_attrs(), ('keys', self.keys)]
178
164
 
179
- def sql_expr(self, _: SqlElementCache) -> Optional[sql.ColumnElement]:
165
+ def sql_expr(self, _: SqlElementCache) -> sql.ColumnElement | None:
180
166
  return None
181
167
 
182
168
  def eval(self, data_row: DataRow, _: RowBuilder) -> None:
@@ -208,7 +194,7 @@ class InlineDict(Expr):
208
194
  arg = dict(zip(d['keys'], components))
209
195
  return InlineDict(arg)
210
196
 
211
- def as_literal(self) -> Optional[Literal]:
197
+ def as_literal(self) -> Literal | None:
212
198
  if not all(isinstance(comp, Literal) for comp in self.components):
213
199
  return None
214
200
  return Literal(dict(zip(self.keys, (c.as_literal().val for c in self.components))), self.col_type)
@@ -1,11 +1,10 @@
1
1
  from __future__ import annotations
2
2
 
3
- from typing import Optional
4
-
5
3
  import sqlalchemy as sql
6
4
 
7
5
  import pixeltable.type_system as ts
8
6
 
7
+ from .column_ref import ColumnRef
9
8
  from .data_row import DataRow
10
9
  from .expr import Expr
11
10
  from .row_builder import RowBuilder
@@ -24,7 +23,12 @@ class IsNull(Expr):
24
23
  def _equals(self, other: IsNull) -> bool:
25
24
  return True
26
25
 
27
- def sql_expr(self, sql_elements: SqlElementCache) -> Optional[sql.ColumnElement]:
26
+ def sql_expr(self, sql_elements: SqlElementCache) -> sql.ColumnElement | None:
27
+ c = self.components[0]
28
+ if isinstance(c, ColumnRef) and c.col.stores_external_array():
29
+ # we also need to check CellMd.file_urls for null
30
+ e = sql.and_(c.col.sa_cellmd_col['file_urls'] == None, c.col.sa_col == None)
31
+ return e
28
32
  e = sql_elements.get(self.components[0])
29
33
  if e is None:
30
34
  return None
@@ -1,6 +1,6 @@
1
1
  from __future__ import annotations
2
2
 
3
- from typing import TYPE_CHECKING, Optional
3
+ from typing import TYPE_CHECKING
4
4
 
5
5
  import sqlalchemy as sql
6
6
 
@@ -29,10 +29,10 @@ class JsonMapper(Expr):
29
29
  """
30
30
 
31
31
  target_expr_scope: ExprScope
32
- parent_mapper: Optional[JsonMapper]
33
- target_expr_eval_ctx: Optional[RowBuilder.EvalCtx]
32
+ parent_mapper: JsonMapper | None
33
+ target_expr_eval_ctx: RowBuilder.EvalCtx | None
34
34
 
35
- def __init__(self, src_expr: Optional[Expr], target_expr: Optional[Expr]):
35
+ def __init__(self, src_expr: Expr | None, target_expr: Expr | None):
36
36
  # TODO: type spec should be list[target_expr.col_type]
37
37
  super().__init__(ts.JsonType())
38
38
 
@@ -54,7 +54,7 @@ class JsonMapper(Expr):
54
54
  def _equals(self, _: JsonMapper) -> bool:
55
55
  return True
56
56
 
57
- def sql_expr(self, _: SqlElementCache) -> Optional[sql.ColumnElement]:
57
+ def sql_expr(self, _: SqlElementCache) -> sql.ColumnElement | None:
58
58
  return None
59
59
 
60
60
  def eval(self, data_row: DataRow, row_builder: RowBuilder) -> None:
@@ -92,8 +92,8 @@ class JsonMapperDispatch(Expr):
92
92
  """
93
93
 
94
94
  target_expr_scope: ExprScope
95
- parent_mapper: Optional[JsonMapperDispatch]
96
- target_expr_eval_ctx: Optional[RowBuilder.EvalCtx]
95
+ parent_mapper: JsonMapperDispatch | None
96
+ target_expr_eval_ctx: RowBuilder.EvalCtx | None
97
97
 
98
98
  def __init__(self, src_expr: Expr, target_expr: Expr):
99
99
  super().__init__(ts.InvalidType())
@@ -116,7 +116,7 @@ class JsonMapperDispatch(Expr):
116
116
  scope_anchor = ObjectRef(self.target_expr_scope, self)
117
117
  self.components.append(scope_anchor)
118
118
 
119
- def _bind_rel_paths(self, mapper: Optional[JsonMapperDispatch] = None) -> None:
119
+ def _bind_rel_paths(self, mapper: JsonMapperDispatch | None = None) -> None:
120
120
  self.src_expr._bind_rel_paths(mapper)
121
121
  self.target_expr._bind_rel_paths(self)
122
122
  self.parent_mapper = mapper
@@ -1,12 +1,15 @@
1
1
  from __future__ import annotations
2
2
 
3
- from typing import Any, Optional, Union
3
+ import io
4
+ from pathlib import Path
5
+ from typing import Any
4
6
 
5
7
  import jmespath
6
8
  import sqlalchemy as sql
7
9
 
8
10
  from pixeltable import catalog, exceptions as excs, type_system as ts
9
11
 
12
+ from .column_ref import ColumnRef
10
13
  from .data_row import DataRow
11
14
  from .expr import Expr
12
15
  from .globals import print_slice
@@ -17,29 +20,41 @@ from .sql_element_cache import SqlElementCache
17
20
 
18
21
 
19
22
  class JsonPath(Expr):
23
+ """
24
+ anchor can be None, in which case this is a relative JsonPath and the anchor is set later via set_anchor().
25
+ scope_idx: for relative paths, index of referenced JsonMapper
26
+ (0: indicates the immediately preceding JsonMapper, -1: the parent of the immediately preceding mapper, ...)
27
+ """
28
+
29
+ path_elements: list[str | int | slice]
30
+ compiled_path: jmespath.parser.ParsedResult | None
31
+ scope_idx: int
32
+ file_handles: dict[Path, io.BufferedReader] # key: file path
33
+
20
34
  def __init__(
21
- self, anchor: Optional[Expr], path_elements: Optional[list[Union[str, int, slice]]] = None, scope_idx: int = 0
35
+ self, anchor: Expr | None, path_elements: list[str | int | slice] | None = None, scope_idx: int = 0
22
36
  ) -> None:
23
- """
24
- anchor can be None, in which case this is a relative JsonPath and the anchor is set later via set_anchor().
25
- scope_idx: for relative paths, index of referenced JsonMapper
26
- (0: indicates the immediately preceding JsonMapper, -1: the parent of the immediately preceding mapper, ...)
27
- """
28
37
  if path_elements is None:
29
38
  path_elements = []
30
39
  super().__init__(ts.JsonType(nullable=True)) # JsonPath expressions are always nullable
31
40
  if anchor is not None:
32
41
  self.components = [anchor]
33
- self.path_elements: list[Union[str, int, slice]] = path_elements
42
+ self.path_elements = path_elements
34
43
  self.compiled_path = jmespath.compile(self._json_path()) if len(path_elements) > 0 else None
35
44
  self.scope_idx = scope_idx
36
45
  # NOTE: the _create_id() result will change if set_anchor() gets called;
37
46
  # this is not a problem, because _create_id() shouldn't be called after init()
38
47
  self.id = self._create_id()
48
+ self.file_handles = {}
49
+
50
+ def release(self) -> None:
51
+ for fh in self.file_handles.values():
52
+ fh.close()
53
+ self.file_handles.clear()
39
54
 
40
55
  def __repr__(self) -> str:
41
56
  # else 'R': the anchor is RELATIVE_PATH_ROOT
42
- anchor_str = str(self._anchor) if self._anchor is not None else 'R'
57
+ anchor_str = str(self.anchor) if self.anchor is not None else 'R'
43
58
  if len(self.path_elements) == 0:
44
59
  return anchor_str
45
60
  return f'{anchor_str}{"." if isinstance(self.path_elements[0], str) else ""}{self._json_path()}'
@@ -66,7 +81,7 @@ class JsonPath(Expr):
66
81
  return cls(anchor, path_elements, d['scope_idx'])
67
82
 
68
83
  @property
69
- def _anchor(self) -> Optional[Expr]:
84
+ def anchor(self) -> Expr | None:
70
85
  return None if len(self.components) == 0 else self.components[0]
71
86
 
72
87
  def set_anchor(self, anchor: Expr) -> None:
@@ -74,17 +89,17 @@ class JsonPath(Expr):
74
89
  self.components = [anchor]
75
90
 
76
91
  def is_relative_path(self) -> bool:
77
- return self._anchor is None
92
+ return self.anchor is None
78
93
 
79
94
  def _has_relative_path(self) -> bool:
80
95
  return self.is_relative_path() or super()._has_relative_path()
81
96
 
82
- def _bind_rel_paths(self, mapper: Optional['JsonMapperDispatch'] = None) -> None:
97
+ def _bind_rel_paths(self, mapper: 'JsonMapperDispatch' | None = None) -> None:
83
98
  if self.is_relative_path():
84
99
  # TODO: take scope_idx into account
85
100
  self.set_anchor(mapper.scope_anchor)
86
101
  else:
87
- self._anchor._bind_rel_paths(mapper)
102
+ self.anchor._bind_rel_paths(mapper)
88
103
 
89
104
  def __call__(self, *args: object, **kwargs: object) -> 'JsonPath':
90
105
  """
@@ -98,15 +113,15 @@ class JsonPath(Expr):
98
113
 
99
114
  def __getattr__(self, name: str) -> 'JsonPath':
100
115
  assert isinstance(name, str)
101
- return JsonPath(self._anchor, [*self.path_elements, name])
116
+ return JsonPath(self.anchor, [*self.path_elements, name])
102
117
 
103
118
  def __getitem__(self, index: object) -> 'JsonPath':
104
119
  if isinstance(index, (int, slice, str)):
105
- return JsonPath(self._anchor, [*self.path_elements, index])
120
+ return JsonPath(self.anchor, [*self.path_elements, index])
106
121
  raise excs.Error(f'Invalid json list index: {index}')
107
122
 
108
- def default_column_name(self) -> Optional[str]:
109
- anchor_name = self._anchor.default_column_name() if self._anchor is not None else ''
123
+ def default_column_name(self) -> str | None:
124
+ anchor_name = self.anchor.default_column_name() if self.anchor is not None else ''
110
125
  ret_name = f'{anchor_name}.{self._json_path()}'
111
126
 
112
127
  def cleanup_char(s: str) -> str:
@@ -133,7 +148,7 @@ class JsonPath(Expr):
133
148
  def _id_attrs(self) -> list[tuple[str, Any]]:
134
149
  return [*super()._id_attrs(), ('path_elements', self.path_elements)]
135
150
 
136
- def sql_expr(self, _: SqlElementCache) -> Optional[sql.ColumnElement]:
151
+ def sql_expr(self, _: SqlElementCache) -> sql.ColumnElement | None:
137
152
  """
138
153
  Postgres appears to have a bug: jsonb_path_query('{a: [{b: 0}, {b: 1}]}', '$.a.b') returns
139
154
  *two* rows (each containing col val 0), not a single row with [0, 0].
@@ -158,12 +173,31 @@ class JsonPath(Expr):
158
173
  result.append(f'[{print_slice(element)}]')
159
174
  return ''.join(result)
160
175
 
161
- def eval(self, data_row: DataRow, row_builder: RowBuilder) -> None:
162
- assert self._anchor is not None, self
163
- val = data_row[self._anchor.slot_idx]
176
+ def eval(self, row: DataRow, row_builder: RowBuilder) -> None:
177
+ assert self.anchor is not None, self
178
+ val = row[self.anchor.slot_idx]
164
179
  if self.compiled_path is not None:
165
180
  val = self.compiled_path.search(val)
166
- data_row[self.slot_idx] = val
181
+ row[self.slot_idx] = val
182
+ if val is None or self.anchor is None or not isinstance(self.anchor, ColumnRef):
183
+ return
184
+
185
+ # the origin of val is a json-typed column, which might store inlined objects
186
+ if self.anchor.slot_idx not in row.slot_md:
187
+ # we can infer that there aren't any inlined objects because our execution plan doesn't include
188
+ # materializing the cellmd (eg, insert plans)
189
+ # TODO: have the planner pass that fact into ExprEvalNode explicitly to streamline this path a bit more
190
+ return
191
+
192
+ # defer import until it's needed
193
+ from pixeltable.exec.cell_reconstruction_node import json_has_inlined_objs, reconstruct_json
194
+
195
+ cell_md = row.slot_md[self.anchor.slot_idx]
196
+ if cell_md is None or cell_md.file_urls is None or not json_has_inlined_objs(val):
197
+ # val doesn't contain inlined objects
198
+ return
199
+
200
+ row.vals[self.slot_idx] = reconstruct_json(val, cell_md.file_urls, self.file_handles)
167
201
 
168
202
 
169
203
  RELATIVE_PATH_ROOT = JsonPath(None)