pixeltable 0.3.9__py3-none-any.whl → 0.3.11__py3-none-any.whl

This diff shows the contents of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.

Potentially problematic release: this version of pixeltable might be problematic.

Files changed (122)
  1. pixeltable/__init__.py +2 -3
  2. pixeltable/__version__.py +2 -2
  3. pixeltable/catalog/__init__.py +2 -1
  4. pixeltable/catalog/catalog.py +63 -36
  5. pixeltable/catalog/column.py +11 -4
  6. pixeltable/catalog/dir.py +5 -5
  7. pixeltable/catalog/globals.py +28 -14
  8. pixeltable/catalog/insertable_table.py +81 -43
  9. pixeltable/catalog/path.py +2 -2
  10. pixeltable/catalog/table.py +140 -109
  11. pixeltable/catalog/table_version.py +60 -43
  12. pixeltable/catalog/table_version_handle.py +3 -0
  13. pixeltable/catalog/table_version_path.py +1 -1
  14. pixeltable/catalog/view.py +17 -9
  15. pixeltable/dataframe.py +5 -3
  16. pixeltable/env.py +109 -43
  17. pixeltable/exec/__init__.py +2 -0
  18. pixeltable/exec/aggregation_node.py +6 -8
  19. pixeltable/exec/cache_prefetch_node.py +4 -7
  20. pixeltable/exec/component_iteration_node.py +1 -3
  21. pixeltable/exec/data_row_batch.py +1 -2
  22. pixeltable/exec/exec_context.py +1 -1
  23. pixeltable/exec/exec_node.py +2 -3
  24. pixeltable/exec/expr_eval/__init__.py +2 -0
  25. pixeltable/exec/expr_eval/evaluators.py +137 -20
  26. pixeltable/exec/expr_eval/expr_eval_node.py +43 -64
  27. pixeltable/exec/expr_eval/globals.py +68 -7
  28. pixeltable/exec/expr_eval/schedulers.py +25 -23
  29. pixeltable/exec/in_memory_data_node.py +8 -6
  30. pixeltable/exec/row_update_node.py +3 -4
  31. pixeltable/exec/sql_node.py +16 -17
  32. pixeltable/exprs/__init__.py +3 -2
  33. pixeltable/exprs/arithmetic_expr.py +2 -0
  34. pixeltable/exprs/column_property_ref.py +1 -1
  35. pixeltable/exprs/column_ref.py +39 -3
  36. pixeltable/exprs/compound_predicate.py +1 -1
  37. pixeltable/exprs/data_row.py +17 -1
  38. pixeltable/exprs/expr.py +51 -21
  39. pixeltable/exprs/function_call.py +34 -2
  40. pixeltable/exprs/globals.py +12 -0
  41. pixeltable/exprs/json_mapper.py +95 -48
  42. pixeltable/exprs/json_path.py +3 -10
  43. pixeltable/exprs/method_ref.py +2 -2
  44. pixeltable/exprs/object_ref.py +2 -2
  45. pixeltable/exprs/row_builder.py +33 -6
  46. pixeltable/exprs/similarity_expr.py +6 -21
  47. pixeltable/exprs/sql_element_cache.py +1 -1
  48. pixeltable/exprs/string_op.py +107 -0
  49. pixeltable/ext/__init__.py +1 -1
  50. pixeltable/ext/functions/__init__.py +1 -1
  51. pixeltable/ext/functions/whisperx.py +1 -1
  52. pixeltable/ext/functions/yolox.py +22 -65
  53. pixeltable/func/aggregate_function.py +1 -1
  54. pixeltable/func/callable_function.py +2 -5
  55. pixeltable/func/expr_template_function.py +22 -2
  56. pixeltable/func/function.py +4 -5
  57. pixeltable/func/function_registry.py +1 -1
  58. pixeltable/func/signature.py +1 -1
  59. pixeltable/func/tools.py +2 -2
  60. pixeltable/func/udf.py +2 -2
  61. pixeltable/functions/__init__.py +2 -2
  62. pixeltable/functions/anthropic.py +2 -2
  63. pixeltable/functions/audio.py +1 -1
  64. pixeltable/functions/deepseek.py +1 -1
  65. pixeltable/functions/fireworks.py +1 -1
  66. pixeltable/functions/globals.py +22 -11
  67. pixeltable/functions/huggingface.py +1 -1
  68. pixeltable/functions/image.py +1 -1
  69. pixeltable/functions/json.py +1 -1
  70. pixeltable/functions/llama_cpp.py +1 -1
  71. pixeltable/functions/math.py +1 -1
  72. pixeltable/functions/mistralai.py +1 -1
  73. pixeltable/functions/ollama.py +1 -1
  74. pixeltable/functions/openai.py +2 -2
  75. pixeltable/functions/replicate.py +1 -1
  76. pixeltable/functions/string.py +1 -1
  77. pixeltable/functions/timestamp.py +1 -1
  78. pixeltable/functions/together.py +1 -1
  79. pixeltable/functions/util.py +1 -1
  80. pixeltable/functions/video.py +2 -2
  81. pixeltable/functions/vision.py +2 -2
  82. pixeltable/globals.py +85 -33
  83. pixeltable/index/embedding_index.py +12 -1
  84. pixeltable/io/__init__.py +8 -5
  85. pixeltable/io/datarows.py +138 -0
  86. pixeltable/io/external_store.py +8 -5
  87. pixeltable/io/fiftyone.py +6 -7
  88. pixeltable/io/globals.py +7 -160
  89. pixeltable/io/hf_datasets.py +21 -98
  90. pixeltable/io/label_studio.py +21 -20
  91. pixeltable/io/pandas.py +35 -48
  92. pixeltable/io/parquet.py +17 -42
  93. pixeltable/io/table_data_conduit.py +569 -0
  94. pixeltable/io/utils.py +6 -21
  95. pixeltable/iterators/__init__.py +1 -1
  96. pixeltable/metadata/__init__.py +6 -4
  97. pixeltable/metadata/converters/convert_24.py +3 -3
  98. pixeltable/metadata/converters/convert_25.py +1 -1
  99. pixeltable/metadata/converters/convert_29.py +1 -1
  100. pixeltable/metadata/converters/convert_30.py +50 -0
  101. pixeltable/metadata/converters/util.py +26 -1
  102. pixeltable/metadata/notes.py +1 -0
  103. pixeltable/metadata/schema.py +3 -0
  104. pixeltable/store.py +2 -2
  105. pixeltable/type_system.py +19 -7
  106. pixeltable/utils/arrow.py +32 -7
  107. pixeltable/utils/console_output.py +3 -2
  108. pixeltable/utils/coroutine.py +3 -3
  109. pixeltable/utils/dbms.py +66 -0
  110. pixeltable/utils/documents.py +61 -67
  111. pixeltable/utils/filecache.py +1 -1
  112. pixeltable/utils/http_server.py +3 -2
  113. pixeltable/utils/pytorch.py +1 -1
  114. pixeltable/utils/sql.py +1 -1
  115. pixeltable-0.3.11.dist-info/METADATA +436 -0
  116. pixeltable-0.3.11.dist-info/RECORD +179 -0
  117. {pixeltable-0.3.9.dist-info → pixeltable-0.3.11.dist-info}/WHEEL +1 -1
  118. pixeltable/catalog/path_dict.py +0 -169
  119. pixeltable-0.3.9.dist-info/METADATA +0 -382
  120. pixeltable-0.3.9.dist-info/RECORD +0 -175
  121. {pixeltable-0.3.9.dist-info → pixeltable-0.3.11.dist-info}/LICENSE +0 -0
  122. {pixeltable-0.3.9.dist-info → pixeltable-0.3.11.dist-info}/entry_points.txt +0 -0
pixeltable/exec/expr_eval/schedulers.py CHANGED
@@ -11,7 +11,7 @@ from typing import Awaitable, Collection, Optional
  from pixeltable import env, func
  from pixeltable.config import Config

- from .globals import Dispatcher, FnCallArgs, Scheduler
+ from .globals import Dispatcher, ExecCtx, FnCallArgs, Scheduler

  _logger = logging.getLogger('pixeltable')

@@ -62,9 +62,6 @@ class RateLimitsScheduler(Scheduler):
  def matches(cls, resource_pool: str) -> bool:
  return resource_pool.startswith('rate-limits:')

- def submit(self, item: FnCallArgs) -> None:
- self.queue.put_nowait(self.QueueItem(item, 0))
-
  def _set_pool_info(self) -> None:
  """Initialize pool_info with the RateLimitsInfo for the resource pool, if available"""
  if self.pool_info is not None:
@@ -76,7 +73,7 @@ class RateLimitsScheduler(Scheduler):
  assert hasattr(self.pool_info, 'get_request_resources')
  sig = inspect.signature(self.pool_info.get_request_resources)
  self.get_request_resources_param_names = [p.name for p in sig.parameters.values()]
- self.est_usage = {r: 0 for r in self._resources}
+ self.est_usage = dict.fromkeys(self._resources, 0)

  async def _main_loop(self) -> None:
  item: Optional[RateLimitsScheduler.QueueItem] = None
@@ -90,7 +87,7 @@ class RateLimitsScheduler(Scheduler):
  if self.pool_info is None or not self.pool_info.is_initialized():
  # wait for a single request to get rate limits
  _logger.debug(f'initializing rate limits for {self.resource_pool}')
- await self._exec(item.request, item.num_retries, is_task=False)
+ await self._exec(item.request, item.exec_ctx, item.num_retries, is_task=False)
  _logger.debug(f'initialized rate limits for {self.resource_pool}')
  item = None
  # if this was the first request, it created the pool_info
@@ -141,7 +138,7 @@ class RateLimitsScheduler(Scheduler):
  self.est_usage[resource] += val
  _logger.debug(f'creating task for {self.resource_pool}')
  self.num_in_flight += 1
- task = asyncio.create_task(self._exec(item.request, item.num_retries, is_task=True))
+ task = asyncio.create_task(self._exec(item.request, item.exec_ctx, item.num_retries, is_task=True))
  self.dispatcher.register_task(task)
  item = None

@@ -171,7 +168,7 @@ class RateLimitsScheduler(Scheduler):
  return None
  return min(candidates, key=lambda x: x[1])[0]

- async def _exec(self, request: FnCallArgs, num_retries: int, is_task: bool) -> None:
+ async def _exec(self, request: FnCallArgs, exec_ctx: ExecCtx, num_retries: int, is_task: bool) -> None:
  assert all(not row.has_val[request.fn_call.slot_idx] for row in request.rows)
  assert all(not row.has_exc(request.fn_call.slot_idx) for row in request.rows)

@@ -180,7 +177,8 @@ class RateLimitsScheduler(Scheduler):
  pxt_fn = request.fn_call.fn
  assert isinstance(pxt_fn, func.CallableFunction)
  _logger.debug(
- f'scheduler {self.resource_pool}: start evaluating slot {request.fn_call.slot_idx}, batch_size={len(request.rows)}'
+ f'scheduler {self.resource_pool}: '
+ f'start evaluating slot {request.fn_call.slot_idx}, batch_size={len(request.rows)}'
  )
  self.total_requests += 1
  if request.is_batched:
@@ -193,13 +191,14 @@ class RateLimitsScheduler(Scheduler):
  request.row[request.fn_call.slot_idx] = result
  end_ts = datetime.datetime.now(tz=datetime.timezone.utc)
  _logger.debug(
- f'scheduler {self.resource_pool}: evaluated slot {request.fn_call.slot_idx} in {end_ts - start_ts}, batch_size={len(request.rows)}'
+ f'scheduler {self.resource_pool}: evaluated slot {request.fn_call.slot_idx} '
+ f'in {end_ts - start_ts}, batch_size={len(request.rows)}'
  )

  # purge accumulated usage estimate, now that we have a new report
- self.est_usage = {r: 0 for r in self._resources}
+ self.est_usage = dict.fromkeys(self._resources, 0)

- self.dispatcher.dispatch(request.rows)
+ self.dispatcher.dispatch(request.rows, exec_ctx)
  except Exception as exc:
  _logger.debug(f'scheduler {self.resource_pool}: exception in slot {request.fn_call.slot_idx}: {exc}')
  if self.pool_info is None:
@@ -212,7 +211,7 @@ class RateLimitsScheduler(Scheduler):
  self.total_retried += 1
  _logger.debug(f'scheduler {self.resource_pool}: retrying in {retry_delay} seconds')
  await asyncio.sleep(retry_delay)
- self.queue.put_nowait(self.QueueItem(request, num_retries + 1))
+ self.queue.put_nowait(self.QueueItem(request, num_retries + 1, exec_ctx))
  return
  # TODO: update resource limits reported in exc.response.headers, if present

@@ -220,7 +219,7 @@ class RateLimitsScheduler(Scheduler):
  _, _, exc_tb = sys.exc_info()
  for row in request.rows:
  row.set_exc(request.fn_call.slot_idx, exc)
- self.dispatcher.dispatch_exc(request.rows, request.fn_call.slot_idx, exc_tb)
+ self.dispatcher.dispatch_exc(request.rows, request.fn_call.slot_idx, exc_tb, exec_ctx)
  finally:
  _logger.debug(f'Scheduler stats: #requests={self.total_requests}, #retried={self.total_retried}')
  if is_task:
@@ -301,15 +300,15 @@ class RequestRateScheduler(Scheduler):
  if item.num_retries > 0:
  # the last request encountered some problem: retry it synchronously, to wait for the problem to pass
  _logger.debug(f'retrying request for {self.resource_pool}: #retries={item.num_retries}')
- await self._exec(item.request, item.num_retries, is_task=False)
+ await self._exec(item.request, item.exec_ctx, item.num_retries, is_task=False)
  _logger.debug(f'retried request for {self.resource_pool}: #retries={item.num_retries}')
  else:
  _logger.debug(f'creating task for {self.resource_pool}')
  self.num_in_flight += 1
- task = asyncio.create_task(self._exec(item.request, item.num_retries, is_task=True))
+ task = asyncio.create_task(self._exec(item.request, item.exec_ctx, item.num_retries, is_task=True))
  self.dispatcher.register_task(task)

- async def _exec(self, request: FnCallArgs, num_retries: int, is_task: bool) -> None:
+ async def _exec(self, request: FnCallArgs, exec_ctx: ExecCtx, num_retries: int, is_task: bool) -> None:
  assert all(not row.has_val[request.fn_call.slot_idx] for row in request.rows)
  assert all(not row.has_exc(request.fn_call.slot_idx) for row in request.rows)

@@ -318,7 +317,8 @@ class RequestRateScheduler(Scheduler):
  pxt_fn = request.fn_call.fn
  assert isinstance(pxt_fn, func.CallableFunction)
  _logger.debug(
- f'scheduler {self.resource_pool}: start evaluating slot {request.fn_call.slot_idx}, batch_size={len(request.rows)}'
+ f'scheduler {self.resource_pool}: '
+ f'start evaluating slot {request.fn_call.slot_idx}, batch_size={len(request.rows)}'
  )
  self.total_requests += 1
  if request.is_batched:
@@ -331,9 +331,10 @@ class RequestRateScheduler(Scheduler):
  request.row[request.fn_call.slot_idx] = result
  end_ts = datetime.datetime.now(tz=datetime.timezone.utc)
  _logger.debug(
- f'scheduler {self.resource_pool}: evaluated slot {request.fn_call.slot_idx} in {end_ts - start_ts}, batch_size={len(request.rows)}'
+ f'scheduler {self.resource_pool}: evaluated slot {request.fn_call.slot_idx} '
+ f'in {end_ts - start_ts}, batch_size={len(request.rows)}'
  )
- self.dispatcher.dispatch(request.rows)
+ self.dispatcher.dispatch(request.rows, exec_ctx)

  except Exception as exc:
  # TODO: which exception can be retried?
@@ -341,17 +342,18 @@ class RequestRateScheduler(Scheduler):
  status = getattr(exc, 'status', None)
  _logger.debug(f'type={type(exc)} has_status={hasattr(exc, "status")} status={status}')
  if num_retries < self.MAX_RETRIES:
- self.queue.put_nowait(self.QueueItem(request, num_retries + 1))
+ self.queue.put_nowait(self.QueueItem(request, num_retries + 1, exec_ctx))
  return

  # record the exception
  _, _, exc_tb = sys.exc_info()
  for row in request.rows:
  row.set_exc(request.fn_call.slot_idx, exc)
- self.dispatcher.dispatch_exc(request.rows, request.fn_call.slot_idx, exc_tb)
+ self.dispatcher.dispatch_exc(request.rows, request.fn_call.slot_idx, exc_tb, exec_ctx)
  finally:
  _logger.debug(
- f'Scheduler stats: #in-flight={self.num_in_flight} #requests={self.total_requests}, #retried={self.total_retried}'
+ f'Scheduler stats: #in-flight={self.num_in_flight} #requests={self.total_requests}, '
+ f'#retried={self.total_retried}'
  )
  if is_task:
  self.num_in_flight -= 1
pixeltable/exec/in_memory_data_node.py CHANGED
@@ -1,8 +1,7 @@
  import logging
- from typing import Any, AsyncIterator, Iterator, Optional
+ from typing import Any, AsyncIterator, Optional

- import pixeltable.catalog as catalog
- import pixeltable.exprs as exprs
+ from pixeltable import catalog, exprs
  from pixeltable.utils.media_store import MediaStore

  from .data_row_batch import DataRowBatch
@@ -68,9 +67,12 @@ class InMemoryDataNode(ExecNode):
  if col_info.col.col_type.is_image_type() and isinstance(val, bytes):
  # this is a literal image, ie, a sequence of bytes; we save this as a media file and store the path
  path = str(MediaStore.prepare_media_path(self.tbl.id, col_info.col.id, self.tbl.get().version))
- open(path, 'wb').write(val)
- val = path
- self.output_rows[row_idx][col_info.slot_idx] = val
+ with open(path, 'wb') as fp:
+ fp.write(val)
+ self.output_rows[row_idx][col_info.slot_idx] = path
+ else:
+ self.output_rows[row_idx][col_info.slot_idx] = val
+
  input_slot_idxs.add(col_info.slot_idx)

  # set the remaining output slots to their default values (presently None)
pixeltable/exec/row_update_node.py CHANGED
@@ -1,8 +1,7 @@
  import logging
  from typing import Any, AsyncIterator

- import pixeltable.catalog as catalog
- import pixeltable.exprs as exprs
+ from pixeltable import catalog, exprs

  from .data_row_batch import DataRowBatch
  from .exec_node import ExecNode
@@ -29,7 +28,7 @@ class RowUpdateNode(ExecNode):
  input: ExecNode,
  ):
  super().__init__(row_builder, [], [], input)
- self.updates = {key_vals: col_vals for key_vals, col_vals in zip(key_vals_batch, col_vals_batch)}
+ self.updates = dict(zip(key_vals_batch, col_vals_batch))
  self.is_rowid_key = is_rowid_key
  # determine slot idxs of all columns we need to read or write
  # retrieve ColumnRefs from the RowBuilder (has slot_idx set)
@@ -38,7 +37,7 @@ class RowUpdateNode(ExecNode):
  for col_ref in row_builder.unique_exprs
  if isinstance(col_ref, exprs.ColumnRef)
  }
- self.col_slot_idxs = {col: all_col_slot_idxs[col] for col in col_vals_batch[0].keys()}
+ self.col_slot_idxs = {col: all_col_slot_idxs[col] for col in col_vals_batch[0]}
  self.key_slot_idxs = {col: all_col_slot_idxs[col] for col in tbl.tbl_version.get().primary_key_columns()}
  self.matched_key_vals: set[tuple] = set()

pixeltable/exec/sql_node.py CHANGED
@@ -6,8 +6,7 @@ from uuid import UUID

  import sqlalchemy as sql

- import pixeltable.catalog as catalog
- import pixeltable.exprs as exprs
+ from pixeltable import catalog, exprs
  from pixeltable.env import Env

  from .data_row_batch import DataRowBatch
@@ -217,31 +216,31 @@ class SqlNode(ExecNode):
  candidates = tbl.get_tbl_versions()
  assert len(candidates) > 0
  joined_tbls: list[catalog.TableVersionHandle] = [candidates[0]]
- for tbl in candidates[1:]:
- if tbl.id in refd_tbl_ids:
- joined_tbls.append(tbl)
+ for t in candidates[1:]:
+ if t.id in refd_tbl_ids:
+ joined_tbls.append(t)

  first = True
- prev_tbl: catalog.TableVersionHandle
- for tbl in joined_tbls[::-1]:
+ prev_tbl: Optional[catalog.TableVersionHandle] = None
+ for t in joined_tbls[::-1]:
  if first:
- stmt = stmt.select_from(tbl.get().store_tbl.sa_tbl)
+ stmt = stmt.select_from(t.get().store_tbl.sa_tbl)
  first = False
  else:
  # join tbl to prev_tbl on prev_tbl's rowid cols
  prev_tbl_rowid_cols = prev_tbl.get().store_tbl.rowid_columns()
- tbl_rowid_cols = tbl.get().store_tbl.rowid_columns()
+ tbl_rowid_cols = t.get().store_tbl.rowid_columns()
  rowid_clauses = [
  c1 == c2 for c1, c2 in zip(prev_tbl_rowid_cols, tbl_rowid_cols[: len(prev_tbl_rowid_cols)])
  ]
- stmt = stmt.join(tbl.get().store_tbl.sa_tbl, sql.and_(*rowid_clauses))
- if tbl.id in exact_version_only:
- stmt = stmt.where(tbl.get().store_tbl.v_min_col == tbl.get().version)
+ stmt = stmt.join(t.get().store_tbl.sa_tbl, sql.and_(*rowid_clauses))
+ if t.id in exact_version_only:
+ stmt = stmt.where(t.get().store_tbl.v_min_col == t.get().version)
  else:
- stmt = stmt.where(tbl.get().store_tbl.v_min_col <= tbl.get().version).where(
- tbl.get().store_tbl.v_max_col > tbl.get().version
+ stmt = stmt.where(t.get().store_tbl.v_min_col <= t.get().version).where(
+ t.get().store_tbl.v_max_col > t.get().version
  )
- prev_tbl = tbl
+ prev_tbl = t
  return stmt

  def set_where(self, where_clause: exprs.Expr) -> None:
@@ -291,7 +290,7 @@ class SqlNode(ExecNode):

  conn = Env.get().conn
  result_cursor = conn.execute(stmt)
- for warning in w:
+ for _ in w:
  pass

  tbl_version = self.tbl.tbl_version if self.tbl is not None else None
@@ -494,7 +493,7 @@ class SqlJoinNode(SqlNode):
  if join_clause.join_type != plan.JoinType.CROSS
  else sql.sql.expression.literal(True)
  )
- is_outer = join_clause.join_type == plan.JoinType.LEFT or join_clause.join_type == plan.JoinType.FULL_OUTER
+ is_outer = join_clause.join_type in (plan.JoinType.LEFT, plan.JoinType.FULL_OUTER)
  stmt = stmt.join(
  self.input_ctes[i + 1],
  onclause=on_clause,
pixeltable/exprs/__init__.py CHANGED
@@ -15,8 +15,8 @@ from .globals import ArithmeticOperator, ComparisonOperator, LogicalOperator
  from .in_predicate import InPredicate
  from .inline_expr import InlineArray, InlineDict, InlineList
  from .is_null import IsNull
- from .json_mapper import JsonMapper
- from .json_path import RELATIVE_PATH_ROOT, JsonPath
+ from .json_mapper import JsonMapper, JsonMapperDispatch
+ from .json_path import JsonPath
  from .literal import Literal
  from .method_ref import MethodRef
  from .object_ref import ObjectRef
@@ -24,5 +24,6 @@ from .row_builder import ColumnSlotIdx, ExecProfile, RowBuilder
  from .rowid_ref import RowidRef
  from .similarity_expr import SimilarityExpr
  from .sql_element_cache import SqlElementCache
+ from .string_op import StringOp
  from .type_cast import TypeCast
  from .variable import Variable
pixeltable/exprs/arithmetic_expr.py CHANGED
@@ -19,6 +19,8 @@ class ArithmeticExpr(Expr):
  Allows arithmetic exprs on json paths
  """

+ operator: ArithmeticOperator
+
  def __init__(self, operator: ArithmeticOperator, op1: Expr, op2: Expr):
  if op1.col_type.is_json_type() or op2.col_type.is_json_type() or operator == ArithmeticOperator.DIV:
  # we assume it's a float
pixeltable/exprs/column_property_ref.py CHANGED
@@ -52,7 +52,7 @@ class ColumnPropertyRef(Expr):
  return f'{self._col_ref}.{self.prop.name.lower()}'

  def is_error_prop(self) -> bool:
- return self.prop in {self.Property.ERRORTYPE, self.Property.ERRORMSG}
+ return self.prop in (self.Property.ERRORTYPE, self.Property.ERRORMSG)

  def sql_expr(self, sql_elements: SqlElementCache) -> Optional[sql.ColumnElement]:
  if not self._col_ref.col.is_stored:
pixeltable/exprs/column_ref.py CHANGED
@@ -1,5 +1,6 @@
  from __future__ import annotations

+ import copy
  from typing import Any, Optional, Sequence
  from uuid import UUID

@@ -125,11 +126,46 @@ class ColumnRef(Expr):

  return super().__getattr__(name)

+ @classmethod
+ def find_embedding_index(
+ cls, col: catalog.Column, idx_name: Optional[str], method_name: str
+ ) -> dict[str, catalog.TableVersion.IndexInfo]:
+ """Return IndexInfo for a column, with an optional given name"""
+ # determine index to use
+ idx_info_dict = col.get_idx_info()
+ from pixeltable import index
+
+ embedding_idx_info = {
+ info: value for info, value in idx_info_dict.items() if isinstance(value.idx, index.EmbeddingIndex)
+ }
+ if len(embedding_idx_info) == 0:
+ raise excs.Error(f'No indices found for {method_name!r} on column {col.name!r}')
+ if idx_name is not None and idx_name not in embedding_idx_info:
+ raise excs.Error(f'Index {idx_name!r} not found for {method_name!r} on column {col.name!r}')
+ if len(embedding_idx_info) > 1:
+ if idx_name is None:
+ raise excs.Error(
+ f'Column {col.name!r} has multiple indices; use the index name to disambiguate: '
+ f'`{method_name}(..., idx=<index_name>)`'
+ )
+ idx_info = {idx_name: embedding_idx_info[idx_name]}
+ else:
+ idx_info = embedding_idx_info
+ return idx_info
+
  def similarity(self, item: Any, *, idx: Optional[str] = None) -> Expr:
  from .similarity_expr import SimilarityExpr

  return SimilarityExpr(self, item, idx_name=idx)

+ def embedding(self, *, idx: Optional[str] = None) -> ColumnRef:
+ idx_info = ColumnRef.find_embedding_index(self.col, idx, 'embedding')
+ assert len(idx_info) == 1
+ col = copy.copy(next(iter(idx_info.values())).val_col)
+ col.name = f'{self.col.name}_embedding_{idx if idx is not None else ""}'
+ col.create_sa_cols()
+ return ColumnRef(col)
+
  def default_column_name(self) -> Optional[str]:
  return str(self)

@@ -140,13 +176,13 @@ class ColumnRef(Expr):
  tbl = catalog.Catalog.get().get_table_by_id(self.col.tbl.id)
  return tbl.select(self)

- def show(self, *args, **kwargs) -> 'pxt.dataframe.DataFrameResultSet':
+ def show(self, *args: Any, **kwargs: Any) -> 'pxt.dataframe.DataFrameResultSet':
  return self._df().show(*args, **kwargs)

- def head(self, *args, **kwargs) -> 'pxt.dataframe.DataFrameResultSet':
+ def head(self, *args: Any, **kwargs: Any) -> 'pxt.dataframe.DataFrameResultSet':
  return self._df().head(*args, **kwargs)

- def tail(self, *args, **kwargs) -> 'pxt.dataframe.DataFrameResultSet':
+ def tail(self, *args: Any, **kwargs: Any) -> 'pxt.dataframe.DataFrameResultSet':
  return self._df().tail(*args, **kwargs)

  def count(self) -> int:
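
Note: the ColumnRef changes above add a find_embedding_index() helper and a new embedding() accessor that exposes the stored embedding column of an embedding index as a regular column reference. A minimal usage sketch; the table name 'docs', column 'text', and index name 'idx0' are hypothetical and not part of this diff:

    import pixeltable as pxt

    t = pxt.get_table('docs')                # assumed table with a string column 'text' and an embedding index 'idx0'
    emb = t.text.embedding(idx='idx0')       # ColumnRef to the index's stored embedding column
    t.select(t.text, emb).head()
    # the existing similarity() accessor resolves against the same kind of index
    t.order_by(t.text.similarity('query text', idx='idx0'), asc=False).limit(5).collect()
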
pixeltable/exprs/compound_predicate.py CHANGED
@@ -61,7 +61,7 @@ class CompoundPredicate(Expr):
  return [*super()._id_attrs(), ('operator', self.operator.value)]

  def split_conjuncts(self, condition: Callable[[Expr], bool]) -> tuple[list[Expr], Optional[Expr]]:
- if self.operator in {LogicalOperator.OR, LogicalOperator.NOT}:
+ if self.operator in (LogicalOperator.OR, LogicalOperator.NOT):
  return super().split_conjuncts(condition)
  matches = [op for op in self.components if condition(op)]
  non_matches = [op for op in self.components if not condition(op)]
pixeltable/exprs/data_row.py CHANGED
@@ -63,11 +63,25 @@ class DataRow:
  # - None if vals[i] is not a media type or if there is no local file yet for file_urls[i]
  file_paths: np.ndarray # of str

- def __init__(self, size: int, img_slot_idxs: list[int], media_slot_idxs: list[int], array_slot_idxs: list[int]):
+ # for nested rows (ie, those produced by JsonMapperDispatcher)
+ parent_row: Optional[DataRow]
+ parent_slot_idx: Optional[int]
+
+ def __init__(
+ self,
+ size: int,
+ img_slot_idxs: list[int],
+ media_slot_idxs: list[int],
+ array_slot_idxs: list[int],
+ parent_row: Optional[DataRow] = None,
+ parent_slot_idx: Optional[int] = None,
+ ):
  self.img_slot_idxs = img_slot_idxs
  self.media_slot_idxs = media_slot_idxs
  self.array_slot_idxs = array_slot_idxs
  self.init(size)
+ self.parent_row = parent_row
+ self.parent_slot_idx = parent_slot_idx

  def init(self, num_slots: int) -> None:
  self.vals = np.full(num_slots, None, dtype=object)
@@ -79,6 +93,8 @@ class DataRow:
  self.pk = None
  self.file_urls = np.full(num_slots, None, dtype=object)
  self.file_paths = np.full(num_slots, None, dtype=object)
+ self.parent_row = None
+ self.parent_slot_idx = None

  def clear(self, idxs: Optional[np.ndarray] = None) -> None:
  if idxs is not None:
pixeltable/exprs/expr.py CHANGED
@@ -17,7 +17,7 @@ from typing_extensions import Self, _AnnotatedAlias
  from pixeltable import catalog, exceptions as excs, func, type_system as ts

  from .data_row import DataRow
- from .globals import ArithmeticOperator, ComparisonOperator, LiteralPythonTypes, LogicalOperator
+ from .globals import ArithmeticOperator, ComparisonOperator, LiteralPythonTypes, LogicalOperator, StringOperator

  if TYPE_CHECKING:
  from pixeltable import exprs
@@ -69,6 +69,8 @@ class Expr(abc.ABC):
  # - not set for subexprs that don't need to be materialized because the parent can be materialized via SQL
  slot_idx: Optional[int]

+ T = TypeVar('T', bound='Expr')
+
  def __init__(self, col_type: ts.ColumnType):
  self.col_type = col_type
  self.components = []
@@ -97,9 +99,11 @@ class Expr(abc.ABC):
  by the immediately containing JsonMapper during initialization.
  """
  self._bind_rel_paths()
- assert not self._has_relative_path, self._expr_tree()
+ has_rel_path = self._has_relative_path()
+ assert not has_rel_path, self._expr_tree()
+ assert not self._has_relative_path(), self._expr_tree()

- def _bind_rel_paths(self, mapper: Optional['exprs.JsonMapper'] = None) -> None:
+ def _bind_rel_paths(self, mapper: Optional['exprs.JsonMapperDispatch'] = None) -> None:
  for c in self.components:
  c._bind_rel_paths(mapper)

@@ -188,7 +192,7 @@ class Expr(abc.ABC):
  return False
  return all(a[i].equals(b[i]) for i in range(len(a)))

- def copy(self) -> Expr:
+ def copy(self: T) -> T:
  """
  Creates a copy that can be evaluated separately: it doesn't share any eval context (slot_idx)
  but shares everything else (catalog objects, etc.)
@@ -206,7 +210,7 @@ class Expr(abc.ABC):
  return None
  return [e.copy() for e in expr_list]

- def __deepcopy__(self, memo=None) -> Expr:
+ def __deepcopy__(self, memo: Optional[dict[int, Any]] = None) -> Expr:
  # we don't need to create an actual deep copy because all state other than execution state is read-only
  if memo is None:
  memo = {}
@@ -296,8 +300,6 @@ class Expr(abc.ABC):
  # instances of that subclass; and another that returns all subexpressions that match the given filter.
  # In order for type checking to behave correctly on both forms, we provide two overloaded signatures.

- T = TypeVar('T', bound='Expr')
-
  @overload
  def subexprs(
  self, *, filter: Optional[Callable[[Expr], bool]] = None, traverse_matches: bool = True
@@ -370,9 +372,8 @@ class Expr(abc.ABC):
  except StopIteration:
  return False

- @property
  def _has_relative_path(self) -> bool:
- return any(c._has_relative_path for c in self.components)
+ return any(c._has_relative_path() for c in self.components)

  def tbl_ids(self) -> set[UUID]:
  """Returns table ids referenced by this expr."""
@@ -459,7 +460,7 @@ class Expr(abc.ABC):
  return Literal(o, col_type=obj_type)
  return None

- @abc.abstractmethod
  def sql_expr(self, sql_elements: 'exprs.SqlElementCache') -> Optional[sql.ColumnElement]:
  """
  If this expr can be materialized directly in SQL:
@@ -469,7 +469,7 @@ class Expr(abc.ABC):
  - returns None
  - eval() will be called
  """
- pass
+ return None

  @abc.abstractmethod
  def eval(self, data_row: DataRow, row_builder: 'exprs.RowBuilder') -> None:
@@ -605,10 +605,6 @@ class Expr(abc.ABC):
  # Return the `MethodRef` object itself; it requires arguments to become a `FunctionCall`
  return method_ref

- def __rshift__(self, other: object) -> 'exprs.Expr':
- # Implemented here for type-checking purposes
- raise excs.Error('The `>>` operator can only be applied to Json expressions')
-
  def __bool__(self) -> bool:
  raise TypeError(
  f'Pixeltable expressions cannot be used in conjunction with Python boolean operators (and/or/not)\n{self!r}'
@@ -658,13 +654,17 @@ class Expr(abc.ABC):
  def __neg__(self) -> 'exprs.ArithmeticExpr':
  return self._make_arithmetic_expr(ArithmeticOperator.MUL, -1)

- def __add__(self, other: object) -> 'exprs.ArithmeticExpr':
+ def __add__(self, other: object) -> Union[exprs.ArithmeticExpr, exprs.StringOp]:
+ if isinstance(self, str) or (isinstance(self, Expr) and self.col_type.is_string_type()):
+ return self._make_string_expr(StringOperator.CONCAT, other)
  return self._make_arithmetic_expr(ArithmeticOperator.ADD, other)

  def __sub__(self, other: object) -> 'exprs.ArithmeticExpr':
  return self._make_arithmetic_expr(ArithmeticOperator.SUB, other)

- def __mul__(self, other: object) -> 'exprs.ArithmeticExpr':
+ def __mul__(self, other: object) -> Union['exprs.ArithmeticExpr', 'exprs.StringOp']:
+ if isinstance(self, str) or (isinstance(self, Expr) and self.col_type.is_string_type()):
+ return self._make_string_expr(StringOperator.REPEAT, other)
  return self._make_arithmetic_expr(ArithmeticOperator.MUL, other)

  def __truediv__(self, other: object) -> 'exprs.ArithmeticExpr':
@@ -676,13 +676,17 @@ class Expr(abc.ABC):
  def __floordiv__(self, other: object) -> 'exprs.ArithmeticExpr':
  return self._make_arithmetic_expr(ArithmeticOperator.FLOORDIV, other)

- def __radd__(self, other: object) -> 'exprs.ArithmeticExpr':
+ def __radd__(self, other: object) -> Union['exprs.ArithmeticExpr', 'exprs.StringOp']:
+ if isinstance(other, str) or (isinstance(other, Expr) and other.col_type.is_string_type()):
+ return self._rmake_string_expr(StringOperator.CONCAT, other)
  return self._rmake_arithmetic_expr(ArithmeticOperator.ADD, other)

  def __rsub__(self, other: object) -> 'exprs.ArithmeticExpr':
  return self._rmake_arithmetic_expr(ArithmeticOperator.SUB, other)

- def __rmul__(self, other: object) -> 'exprs.ArithmeticExpr':
+ def __rmul__(self, other: object) -> Union['exprs.ArithmeticExpr', 'exprs.StringOp']:
+ if isinstance(other, str) or (isinstance(other, Expr) and other.col_type.is_string_type()):
+ return self._rmake_string_expr(StringOperator.REPEAT, other)
  return self._rmake_arithmetic_expr(ArithmeticOperator.MUL, other)

  def __rtruediv__(self, other: object) -> 'exprs.ArithmeticExpr':
@@ -694,6 +698,32 @@ class Expr(abc.ABC):
  def __rfloordiv__(self, other: object) -> 'exprs.ArithmeticExpr':
  return self._rmake_arithmetic_expr(ArithmeticOperator.FLOORDIV, other)

+ def _make_string_expr(self, op: StringOperator, other: object) -> 'exprs.StringOp':
+ """
+ Make left-handed version of string expression.
+ """
+ from .literal import Literal
+ from .string_op import StringOp
+
+ if isinstance(other, Expr):
+ return StringOp(op, self, other)
+ if isinstance(other, typing.get_args(LiteralPythonTypes)):
+ return StringOp(op, self, Literal(other))
+ raise TypeError(f'Other must be Expr or literal: {type(other)}')
+
+ def _rmake_string_expr(self, op: StringOperator, other: object) -> 'exprs.StringOp':
+ """
+ Right-handed version of _make_string_expr. other must be a literal; if it were an Expr,
+ the operation would have already been evaluated in its left-handed form.
+ """
+ from .literal import Literal
+ from .string_op import StringOp
+
+ assert not isinstance(other, Expr) # Else the left-handed form would have evaluated first
+ if isinstance(other, typing.get_args(LiteralPythonTypes)):
+ return StringOp(op, Literal(other), self)
+ raise TypeError(f'Other must be Expr or literal: {type(other)}')
+
  def _make_arithmetic_expr(self, op: ArithmeticOperator, other: object) -> 'exprs.ArithmeticExpr':
  """
  other: Union[Expr, LiteralPythonTypes]
@@ -805,13 +835,13 @@ class Expr(abc.ABC):
  first_param = next(params_iter) if len(params) >= 1 else None
  second_param = next(params_iter) if len(params) >= 2 else None
  # Check that fn has at least one positional parameter
- if len(params) == 0 or first_param.kind in {inspect.Parameter.KEYWORD_ONLY, inspect.Parameter.VAR_KEYWORD}:
+ if len(params) == 0 or first_param.kind in (inspect.Parameter.KEYWORD_ONLY, inspect.Parameter.VAR_KEYWORD):
  raise excs.Error(f'Function `{fn.__name__}` has no positional parameters.')
  # Check that fn has at most one required parameter, i.e., its second parameter
  # has no default and is not a varargs
  if (
  len(params) >= 2
- and second_param.kind not in {inspect.Parameter.VAR_POSITIONAL, inspect.Parameter.VAR_KEYWORD}
+ and second_param.kind not in (inspect.Parameter.VAR_POSITIONAL, inspect.Parameter.VAR_KEYWORD)
  and second_param.default is inspect.Parameter.empty
  ):
  raise excs.Error(f'Function `{fn.__name__}` has multiple required parameters.')
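
Note: with the __add__/__mul__/__radd__/__rmul__ changes above, + and * on string-typed expressions now build the new StringOp expression (CONCAT and REPEAT) instead of an ArithmeticExpr. A small sketch of the resulting user-facing behavior; the table 'people' and column 'name' are hypothetical, and the REPEAT case assumes an int counts as a supported literal operand:

    import pixeltable as pxt

    t = pxt.get_table('people')            # assumed table with a string column 'name'
    greeting = 'Hello, ' + t.name + '!'    # str literal + ColumnRef -> StringOp(CONCAT, ...)
    banner = t.name * 3                    # ColumnRef * int -> StringOp(REPEAT, ...)
    t.select(greeting=greeting, banner=banner).head()
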