pixeltable 0.4.0rc3__py3-none-any.whl → 0.4.20__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of pixeltable might be problematic.
Files changed (202)
  1. pixeltable/__init__.py +23 -5
  2. pixeltable/_version.py +1 -0
  3. pixeltable/catalog/__init__.py +5 -3
  4. pixeltable/catalog/catalog.py +1318 -404
  5. pixeltable/catalog/column.py +186 -115
  6. pixeltable/catalog/dir.py +1 -2
  7. pixeltable/catalog/globals.py +11 -43
  8. pixeltable/catalog/insertable_table.py +167 -79
  9. pixeltable/catalog/path.py +61 -23
  10. pixeltable/catalog/schema_object.py +9 -10
  11. pixeltable/catalog/table.py +626 -308
  12. pixeltable/catalog/table_metadata.py +101 -0
  13. pixeltable/catalog/table_version.py +713 -569
  14. pixeltable/catalog/table_version_handle.py +37 -6
  15. pixeltable/catalog/table_version_path.py +42 -29
  16. pixeltable/catalog/tbl_ops.py +50 -0
  17. pixeltable/catalog/update_status.py +191 -0
  18. pixeltable/catalog/view.py +108 -94
  19. pixeltable/config.py +128 -22
  20. pixeltable/dataframe.py +188 -100
  21. pixeltable/env.py +407 -136
  22. pixeltable/exceptions.py +6 -0
  23. pixeltable/exec/__init__.py +3 -0
  24. pixeltable/exec/aggregation_node.py +7 -8
  25. pixeltable/exec/cache_prefetch_node.py +83 -110
  26. pixeltable/exec/cell_materialization_node.py +231 -0
  27. pixeltable/exec/cell_reconstruction_node.py +135 -0
  28. pixeltable/exec/component_iteration_node.py +4 -3
  29. pixeltable/exec/data_row_batch.py +8 -65
  30. pixeltable/exec/exec_context.py +16 -4
  31. pixeltable/exec/exec_node.py +13 -36
  32. pixeltable/exec/expr_eval/evaluators.py +7 -6
  33. pixeltable/exec/expr_eval/expr_eval_node.py +27 -12
  34. pixeltable/exec/expr_eval/globals.py +8 -5
  35. pixeltable/exec/expr_eval/row_buffer.py +1 -2
  36. pixeltable/exec/expr_eval/schedulers.py +190 -30
  37. pixeltable/exec/globals.py +32 -0
  38. pixeltable/exec/in_memory_data_node.py +18 -18
  39. pixeltable/exec/object_store_save_node.py +293 -0
  40. pixeltable/exec/row_update_node.py +16 -9
  41. pixeltable/exec/sql_node.py +206 -101
  42. pixeltable/exprs/__init__.py +1 -1
  43. pixeltable/exprs/arithmetic_expr.py +27 -22
  44. pixeltable/exprs/array_slice.py +3 -3
  45. pixeltable/exprs/column_property_ref.py +34 -30
  46. pixeltable/exprs/column_ref.py +92 -96
  47. pixeltable/exprs/comparison.py +5 -5
  48. pixeltable/exprs/compound_predicate.py +5 -4
  49. pixeltable/exprs/data_row.py +152 -55
  50. pixeltable/exprs/expr.py +62 -43
  51. pixeltable/exprs/expr_dict.py +3 -3
  52. pixeltable/exprs/expr_set.py +17 -10
  53. pixeltable/exprs/function_call.py +75 -37
  54. pixeltable/exprs/globals.py +1 -2
  55. pixeltable/exprs/in_predicate.py +4 -4
  56. pixeltable/exprs/inline_expr.py +10 -27
  57. pixeltable/exprs/is_null.py +1 -3
  58. pixeltable/exprs/json_mapper.py +8 -8
  59. pixeltable/exprs/json_path.py +56 -22
  60. pixeltable/exprs/literal.py +5 -5
  61. pixeltable/exprs/method_ref.py +2 -2
  62. pixeltable/exprs/object_ref.py +2 -2
  63. pixeltable/exprs/row_builder.py +127 -53
  64. pixeltable/exprs/rowid_ref.py +8 -12
  65. pixeltable/exprs/similarity_expr.py +50 -25
  66. pixeltable/exprs/sql_element_cache.py +4 -4
  67. pixeltable/exprs/string_op.py +5 -5
  68. pixeltable/exprs/type_cast.py +3 -5
  69. pixeltable/func/__init__.py +1 -0
  70. pixeltable/func/aggregate_function.py +8 -8
  71. pixeltable/func/callable_function.py +9 -9
  72. pixeltable/func/expr_template_function.py +10 -10
  73. pixeltable/func/function.py +18 -20
  74. pixeltable/func/function_registry.py +6 -7
  75. pixeltable/func/globals.py +2 -3
  76. pixeltable/func/mcp.py +74 -0
  77. pixeltable/func/query_template_function.py +20 -18
  78. pixeltable/func/signature.py +43 -16
  79. pixeltable/func/tools.py +23 -13
  80. pixeltable/func/udf.py +18 -20
  81. pixeltable/functions/__init__.py +6 -0
  82. pixeltable/functions/anthropic.py +93 -33
  83. pixeltable/functions/audio.py +114 -10
  84. pixeltable/functions/bedrock.py +13 -6
  85. pixeltable/functions/date.py +1 -1
  86. pixeltable/functions/deepseek.py +20 -9
  87. pixeltable/functions/fireworks.py +2 -2
  88. pixeltable/functions/gemini.py +28 -11
  89. pixeltable/functions/globals.py +13 -13
  90. pixeltable/functions/groq.py +108 -0
  91. pixeltable/functions/huggingface.py +1046 -23
  92. pixeltable/functions/image.py +9 -18
  93. pixeltable/functions/llama_cpp.py +23 -8
  94. pixeltable/functions/math.py +3 -4
  95. pixeltable/functions/mistralai.py +4 -15
  96. pixeltable/functions/ollama.py +16 -9
  97. pixeltable/functions/openai.py +104 -82
  98. pixeltable/functions/openrouter.py +143 -0
  99. pixeltable/functions/replicate.py +2 -2
  100. pixeltable/functions/reve.py +250 -0
  101. pixeltable/functions/string.py +21 -28
  102. pixeltable/functions/timestamp.py +13 -14
  103. pixeltable/functions/together.py +4 -6
  104. pixeltable/functions/twelvelabs.py +92 -0
  105. pixeltable/functions/util.py +6 -1
  106. pixeltable/functions/video.py +1388 -106
  107. pixeltable/functions/vision.py +7 -7
  108. pixeltable/functions/whisper.py +15 -7
  109. pixeltable/functions/whisperx.py +179 -0
  110. pixeltable/{ext/functions → functions}/yolox.py +2 -4
  111. pixeltable/globals.py +332 -105
  112. pixeltable/index/base.py +13 -22
  113. pixeltable/index/btree.py +23 -22
  114. pixeltable/index/embedding_index.py +32 -44
  115. pixeltable/io/__init__.py +4 -2
  116. pixeltable/io/datarows.py +7 -6
  117. pixeltable/io/external_store.py +49 -77
  118. pixeltable/io/fiftyone.py +11 -11
  119. pixeltable/io/globals.py +29 -28
  120. pixeltable/io/hf_datasets.py +17 -9
  121. pixeltable/io/label_studio.py +70 -66
  122. pixeltable/io/lancedb.py +3 -0
  123. pixeltable/io/pandas.py +12 -11
  124. pixeltable/io/parquet.py +13 -93
  125. pixeltable/io/table_data_conduit.py +71 -47
  126. pixeltable/io/utils.py +3 -3
  127. pixeltable/iterators/__init__.py +2 -1
  128. pixeltable/iterators/audio.py +21 -11
  129. pixeltable/iterators/document.py +116 -55
  130. pixeltable/iterators/image.py +5 -2
  131. pixeltable/iterators/video.py +293 -13
  132. pixeltable/metadata/__init__.py +4 -2
  133. pixeltable/metadata/converters/convert_18.py +2 -2
  134. pixeltable/metadata/converters/convert_19.py +2 -2
  135. pixeltable/metadata/converters/convert_20.py +2 -2
  136. pixeltable/metadata/converters/convert_21.py +2 -2
  137. pixeltable/metadata/converters/convert_22.py +2 -2
  138. pixeltable/metadata/converters/convert_24.py +2 -2
  139. pixeltable/metadata/converters/convert_25.py +2 -2
  140. pixeltable/metadata/converters/convert_26.py +2 -2
  141. pixeltable/metadata/converters/convert_29.py +4 -4
  142. pixeltable/metadata/converters/convert_34.py +2 -2
  143. pixeltable/metadata/converters/convert_36.py +2 -2
  144. pixeltable/metadata/converters/convert_37.py +15 -0
  145. pixeltable/metadata/converters/convert_38.py +39 -0
  146. pixeltable/metadata/converters/convert_39.py +124 -0
  147. pixeltable/metadata/converters/convert_40.py +73 -0
  148. pixeltable/metadata/converters/util.py +13 -12
  149. pixeltable/metadata/notes.py +4 -0
  150. pixeltable/metadata/schema.py +79 -42
  151. pixeltable/metadata/utils.py +74 -0
  152. pixeltable/mypy/__init__.py +3 -0
  153. pixeltable/mypy/mypy_plugin.py +123 -0
  154. pixeltable/plan.py +274 -223
  155. pixeltable/share/__init__.py +1 -1
  156. pixeltable/share/packager.py +259 -129
  157. pixeltable/share/protocol/__init__.py +34 -0
  158. pixeltable/share/protocol/common.py +170 -0
  159. pixeltable/share/protocol/operation_types.py +33 -0
  160. pixeltable/share/protocol/replica.py +109 -0
  161. pixeltable/share/publish.py +213 -57
  162. pixeltable/store.py +238 -175
  163. pixeltable/type_system.py +104 -63
  164. pixeltable/utils/__init__.py +2 -3
  165. pixeltable/utils/arrow.py +108 -13
  166. pixeltable/utils/av.py +298 -0
  167. pixeltable/utils/azure_store.py +305 -0
  168. pixeltable/utils/code.py +3 -3
  169. pixeltable/utils/console_output.py +4 -1
  170. pixeltable/utils/coroutine.py +6 -23
  171. pixeltable/utils/dbms.py +31 -5
  172. pixeltable/utils/description_helper.py +4 -5
  173. pixeltable/utils/documents.py +5 -6
  174. pixeltable/utils/exception_handler.py +7 -30
  175. pixeltable/utils/filecache.py +6 -6
  176. pixeltable/utils/formatter.py +4 -6
  177. pixeltable/utils/gcs_store.py +283 -0
  178. pixeltable/utils/http_server.py +2 -3
  179. pixeltable/utils/iceberg.py +1 -2
  180. pixeltable/utils/image.py +17 -0
  181. pixeltable/utils/lancedb.py +88 -0
  182. pixeltable/utils/local_store.py +316 -0
  183. pixeltable/utils/misc.py +5 -0
  184. pixeltable/utils/object_stores.py +528 -0
  185. pixeltable/utils/pydantic.py +60 -0
  186. pixeltable/utils/pytorch.py +5 -6
  187. pixeltable/utils/s3_store.py +392 -0
  188. pixeltable-0.4.20.dist-info/METADATA +587 -0
  189. pixeltable-0.4.20.dist-info/RECORD +218 -0
  190. {pixeltable-0.4.0rc3.dist-info → pixeltable-0.4.20.dist-info}/WHEEL +1 -1
  191. pixeltable-0.4.20.dist-info/entry_points.txt +2 -0
  192. pixeltable/__version__.py +0 -3
  193. pixeltable/ext/__init__.py +0 -17
  194. pixeltable/ext/functions/__init__.py +0 -11
  195. pixeltable/ext/functions/whisperx.py +0 -77
  196. pixeltable/utils/media_store.py +0 -77
  197. pixeltable/utils/s3.py +0 -17
  198. pixeltable/utils/sample.py +0 -25
  199. pixeltable-0.4.0rc3.dist-info/METADATA +0 -435
  200. pixeltable-0.4.0rc3.dist-info/RECORD +0 -189
  201. pixeltable-0.4.0rc3.dist-info/entry_points.txt +0 -3
  202. {pixeltable-0.4.0rc3.dist-info → pixeltable-0.4.20.dist-info/licenses}/LICENSE +0 -0
pixeltable/exceptions.py CHANGED
@@ -10,6 +10,12 @@ class Error(Exception):
 
 
 class ExprEvalError(Exception):
+    """
+    Used during query execution to signal expr evaluation failures.
+
+    NOT A USER-FACING EXCEPTION. All ExprEvalError instances need to be converted into Error instances.
+    """
+
     expr: 'exprs.Expr'
     expr_msg: str
     exc: Exception
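The docstring added above makes ExprEvalError an internal signal only. A hypothetical sketch of the conversion a caller is expected to perform (the fields expr_msg and exc come from the declaration above; the wrapping message and the helper name are illustrative, not pixeltable code):

    from pixeltable import exceptions as excs

    def eval_row(row) -> None:
        try:
            ...  # evaluate the row's expressions
        except excs.ExprEvalError as e:
            # surface a user-facing Error and chain the original cause
            raise excs.Error(f'error evaluating {e.expr_msg}: {e.exc}') from e.exc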
pixeltable/exec/__init__.py CHANGED
@@ -2,11 +2,14 @@
 
 from .aggregation_node import AggregationNode
 from .cache_prefetch_node import CachePrefetchNode
+from .cell_materialization_node import CellMaterializationNode
+from .cell_reconstruction_node import CellReconstructionNode
 from .component_iteration_node import ComponentIterationNode
 from .data_row_batch import DataRowBatch
 from .exec_context import ExecContext
 from .exec_node import ExecNode
 from .expr_eval import ExprEvalNode
 from .in_memory_data_node import InMemoryDataNode
+from .object_store_save_node import ObjectStoreSaveNode
 from .row_update_node import RowUpdateNode
 from .sql_node import SqlAggregationNode, SqlJoinNode, SqlLookupNode, SqlNode, SqlSampleNode, SqlScanNode
pixeltable/exec/aggregation_node.py CHANGED
@@ -2,7 +2,7 @@ from __future__ import annotations
 
 import logging
 import sys
-from typing import Any, AsyncIterator, Iterable, Optional, cast
+from typing import Any, AsyncIterator, Iterable, cast
 
 from pixeltable import catalog, exceptions as excs, exprs
 
@@ -19,18 +19,18 @@ class AggregationNode(ExecNode):
     At the moment, this returns all results in a single DataRowBatch.
     """
 
-    group_by: Optional[list[exprs.Expr]]
+    group_by: list[exprs.Expr] | None
     input_exprs: list[exprs.Expr]
     agg_fn_eval_ctx: exprs.RowBuilder.EvalCtx
     agg_fn_calls: list[exprs.FunctionCall]
     output_batch: DataRowBatch
-    limit: Optional[int]
+    limit: int | None
 
     def __init__(
        self,
        tbl: catalog.TableVersionHandle,
        row_builder: exprs.RowBuilder,
-       group_by: Optional[list[exprs.Expr]],
+       group_by: list[exprs.Expr] | None,
        agg_fn_calls: list[exprs.FunctionCall],
        input_exprs: Iterable[exprs.Expr],
        input: ExecNode,
@@ -45,7 +45,7 @@ class AggregationNode(ExecNode):
         # we need to make sure to refer to the same exprs that RowBuilder.eval() will use
         self.agg_fn_calls = [cast(exprs.FunctionCall, e) for e in self.agg_fn_eval_ctx.target_exprs]
         # create output_batch here, rather than in __iter__(), so we don't need to remember tbl and row_builder
-        self.output_batch = DataRowBatch(tbl, row_builder, 0)
+        self.output_batch = DataRowBatch(row_builder)
         self.limit = None
 
     def set_limit(self, limit: int) -> None:
@@ -72,8 +72,8 @@ class AggregationNode(ExecNode):
             raise excs.ExprEvalError(fn_call, expr_msg, exc, exc_tb, input_vals, row_num) from exc
 
     async def __aiter__(self) -> AsyncIterator[DataRowBatch]:
-        prev_row: Optional[exprs.DataRow] = None
-        current_group: Optional[list[Any]] = None  # the values of the group-by exprs
+        prev_row: exprs.DataRow | None = None
+        current_group: list[Any] | None = None  # the values of the group-by exprs
         num_input_rows = 0
         num_output_rows = 0
         async for row_batch in self.input:
@@ -103,6 +103,5 @@ class AggregationNode(ExecNode):
             self.row_builder.eval(prev_row, self.agg_fn_eval_ctx, profile=self.ctx.profile)
             self.output_batch.add_row(prev_row)
 
-        self.output_batch.flush_imgs(None, self.stored_img_cols, self.flushed_img_slots)
         _logger.debug(f'AggregateNode: consumed {num_input_rows} rows, returning {len(self.output_batch.rows)} rows')
         yield self.output_batch
pixeltable/exec/cache_prefetch_node.py CHANGED
@@ -9,11 +9,12 @@ import urllib.request
 from collections import deque
 from concurrent import futures
 from pathlib import Path
-from typing import Any, AsyncIterator, Iterator, Optional
+from typing import AsyncIterator, Iterator
 from uuid import UUID
 
-from pixeltable import catalog, env, exceptions as excs, exprs
+from pixeltable import exceptions as excs, exprs
 from pixeltable.utils.filecache import FileCache
+from pixeltable.utils.object_stores import ObjectOps
 
 from .data_row_batch import DataRowBatch
 from .exec_node import ExecNode
@@ -25,35 +26,35 @@ class CachePrefetchNode(ExecNode):
     """Brings files with external URLs into the cache
 
     TODO:
-    adapting the number of download threads at runtime to maximize throughput
+    Process a row at a time and limit the number of in-flight rows to control memory usage
+    Create asyncio.Tasks to consume our input in order to increase concurrency.
     """
 
+    QUEUE_DEPTH_HIGH_WATER = 50  # target number of in-flight requests
+    QUEUE_DEPTH_LOW_WATER = 20  # target number of in-flight requests
     BATCH_SIZE = 16
-    NUM_EXECUTOR_THREADS = 16
+    MAX_WORKERS = 15
 
     retain_input_order: bool  # if True, return rows in the exact order they were received
     file_col_info: list[exprs.ColumnSlotIdx]
-    boto_client: Optional[Any]
-    boto_client_lock: threading.Lock
 
     # execution state
-    batch_tbl_version: Optional[catalog.TableVersionHandle]  # needed to construct output batches
     num_returned_rows: int
 
     # ready_rows: rows that are ready to be returned, ordered by row idx;
     # the implied row idx of ready_rows[0] is num_returned_rows
-    ready_rows: deque[Optional[exprs.DataRow]]
+    ready_rows: deque[exprs.DataRow | None]
 
     in_flight_rows: dict[int, CachePrefetchNode.RowState]  # rows with in-flight urls; id(row) -> RowState
     in_flight_requests: dict[futures.Future, str]  # in-flight requests for urls; future -> URL
     in_flight_urls: dict[str, list[tuple[exprs.DataRow, exprs.ColumnSlotIdx]]]  # URL -> [(row, info)]
     input_finished: bool
-    row_idx: Iterator[Optional[int]]
+    row_idx: Iterator[int | None]
 
     @dataclasses.dataclass
     class RowState:
         row: exprs.DataRow
-        idx: Optional[int]  # position in input stream; None if we don't retain input order
+        idx: int | None  # position in input stream; None if we don't retain input order
         num_missing: int  # number of missing URLs in this row
 
     def __init__(
@@ -64,11 +65,6 @@ class CachePrefetchNode(ExecNode):
         self.retain_input_order = retain_input_order
         self.file_col_info = file_col_info
 
-        # clients for specific services are constructed as needed, because it's time-consuming
-        self.boto_client = None
-        self.boto_client_lock = threading.Lock()
-
-        self.batch_tbl_version = None
         self.num_returned_rows = 0
         self.ready_rows = deque()
         self.in_flight_rows = {}
@@ -76,26 +72,44 @@ class CachePrefetchNode(ExecNode):
         self.in_flight_urls = {}
         self.input_finished = False
         self.row_idx = itertools.count() if retain_input_order else itertools.repeat(None)
+        assert self.QUEUE_DEPTH_HIGH_WATER > self.QUEUE_DEPTH_LOW_WATER
 
-    async def __aiter__(self) -> AsyncIterator[DataRowBatch]:
-        input_iter = self.input.__aiter__()
-        with futures.ThreadPoolExecutor(max_workers=self.NUM_EXECUTOR_THREADS) as executor:
-            # we create enough in-flight requests to fill the first batch
-            while not self.input_finished and self.__num_pending_rows() < self.BATCH_SIZE:
-                await self.__submit_input_batch(input_iter, executor)
+    @property
+    def queued_work(self) -> int:
+        return len(self.in_flight_requests)
 
-            while True:
-                # try to assemble a full batch of output rows
-                if not self.__has_ready_batch() and len(self.in_flight_requests) > 0:
-                    self.__wait_for_requests()
-
-                # try to create enough in-flight requests to fill the next batch
-                while not self.input_finished and self.__num_pending_rows() < self.BATCH_SIZE:
-                    await self.__submit_input_batch(input_iter, executor)
+    async def get_input_batch(self, input_iter: AsyncIterator[DataRowBatch]) -> DataRowBatch | None:
+        """Get the next batch of input rows, or None if there are no more rows"""
+        try:
+            input_batch = await anext(input_iter)
+            if input_batch is None:
+                self.input_finished = True
+            return input_batch
+        except StopAsyncIteration:
+            self.input_finished = True
+            return None
 
-                if len(self.ready_rows) > 0:
+    async def __aiter__(self) -> AsyncIterator[DataRowBatch]:
+        input_iter = aiter(self.input)
+        with futures.ThreadPoolExecutor(max_workers=self.MAX_WORKERS) as executor:
+            while True:
+                # Create work to fill the queue to the high water mark ... ?without overrunning the in-flight row limit.
+                while not self.input_finished and self.queued_work < self.QUEUE_DEPTH_HIGH_WATER:
+                    input_batch = await self.get_input_batch(input_iter)
+                    if input_batch is not None:
+                        self.__process_input_batch(input_batch, executor)
+
+                # Wait for enough completions to enable more queueing or if we're done
+                while self.queued_work > self.QUEUE_DEPTH_LOW_WATER or (self.input_finished and self.queued_work > 0):
+                    done, _ = futures.wait(self.in_flight_requests, return_when=futures.FIRST_COMPLETED)
+                    self.__process_completions(done, ignore_errors=self.ctx.ignore_errors)
+
+                # Emit results to meet batch size requirements or empty the in-flight row queue
+                if self.__has_ready_batch() or (
+                    len(self.ready_rows) > 0 and self.input_finished and self.queued_work == 0
+                ):
                     # create DataRowBatch from the first BATCH_SIZE ready rows
-                    batch = DataRowBatch(self.batch_tbl_version, self.row_builder)
+                    batch = DataRowBatch(self.row_builder)
                     rows = [self.ready_rows.popleft() for _ in range(min(self.BATCH_SIZE, len(self.ready_rows)))]
                     for row in rows:
                         assert row is not None
@@ -104,23 +118,16 @@ class CachePrefetchNode(ExecNode):
                     _logger.debug(f'returning {len(rows)} rows')
                     yield batch
 
-                if self.input_finished and self.__num_pending_rows() == 0:
+                if self.input_finished and self.queued_work == 0 and len(self.ready_rows) == 0:
                     return
 
-    def __num_pending_rows(self) -> int:
-        return len(self.in_flight_rows) + len(self.ready_rows)
-
     def __has_ready_batch(self) -> bool:
        """True if there are >= BATCH_SIZES entries in ready_rows and the first BATCH_SIZE ones are all non-None"""
        return (
            sum(int(row is not None) for row in itertools.islice(self.ready_rows, self.BATCH_SIZE)) == self.BATCH_SIZE
        )
 
-    def __ready_prefix_len(self) -> int:
-        """Length of the non-None prefix of ready_rows (= what we can return right now)"""
-        return sum(1 for _ in itertools.takewhile(lambda x: x is not None, self.ready_rows))
-
-    def __add_ready_row(self, row: exprs.DataRow, row_idx: Optional[int]) -> None:
+    def __add_ready_row(self, row: exprs.DataRow, row_idx: int | None) -> None:
         if row_idx is None:
             self.ready_rows.append(row)
         else:
@@ -130,52 +137,36 @@ class CachePrefetchNode(ExecNode):
                 self.ready_rows.extend([None] * (idx - len(self.ready_rows) + 1))
             self.ready_rows[idx] = row
 
-    def __wait_for_requests(self) -> None:
-        """Wait for in-flight requests to complete until we have a full batch of rows"""
+    def __process_completions(self, done: set[futures.Future], ignore_errors: bool) -> None:
         file_cache = FileCache.get()
-        _logger.debug(f'waiting for requests; ready_batch_size={self.__ready_prefix_len()}')
-        while not self.__has_ready_batch() and len(self.in_flight_requests) > 0:
-            done, _ = futures.wait(self.in_flight_requests, return_when=futures.FIRST_COMPLETED)
-            for f in done:
-                url = self.in_flight_requests.pop(f)
-                tmp_path, exc = f.result()
-                local_path: Optional[Path] = None
-                if tmp_path is not None:
-                    # register the file with the cache for the first column in which it's missing
-                    assert url in self.in_flight_urls
-                    _, info = self.in_flight_urls[url][0]
-                    local_path = file_cache.add(info.col.tbl.id, info.col.id, url, tmp_path)
-                    _logger.debug(f'cached {url} as {local_path}')
-
-                # add the local path/exception to the slots that reference the url
-                for row, info in self.in_flight_urls.pop(url):
-                    if exc is not None:
-                        self.row_builder.set_exc(row, info.slot_idx, exc)
-                    else:
-                        assert local_path is not None
-                        row.set_file_path(info.slot_idx, str(local_path))
-                    state = self.in_flight_rows[id(row)]
-                    state.num_missing -= 1
-                    if state.num_missing == 0:
-                        del self.in_flight_rows[id(row)]
-                        self.__add_ready_row(row, state.idx)
-                        _logger.debug(f'row {state.idx} is ready (ready_batch_size={self.__ready_prefix_len()})')
-
-    async def __submit_input_batch(
-        self, input: AsyncIterator[DataRowBatch], executor: futures.ThreadPoolExecutor
-    ) -> None:
-        assert not self.input_finished
-        input_batch: Optional[DataRowBatch]
-        try:
-            input_batch = await anext(input)
-        except StopAsyncIteration:
-            input_batch = None
-        if input_batch is None:
-            self.input_finished = True
-            return
-        if self.batch_tbl_version is None:
-            self.batch_tbl_version = input_batch.tbl
-
+        for f in done:
+            url = self.in_flight_requests.pop(f)
+            tmp_path, exc = f.result()
+            if exc is not None and not ignore_errors:
+                raise exc
+            local_path: Path | None = None
+            if tmp_path is not None:
+                # register the file with the cache for the first column in which it's missing
+                assert url in self.in_flight_urls
+                _, info = self.in_flight_urls[url][0]
+                local_path = file_cache.add(info.col.get_tbl().id, info.col.id, url, tmp_path)
+                _logger.debug(f'cached {url} as {local_path}')
+
+            # add the local path/exception to the slots that reference the url
+            for row, info in self.in_flight_urls.pop(url):
+                if exc is not None:
+                    self.row_builder.set_exc(row, info.slot_idx, exc)
+                else:
+                    assert local_path is not None
+                    row.set_file_path(info.slot_idx, str(local_path))
+                state = self.in_flight_rows[id(row)]
+                state.num_missing -= 1
+                if state.num_missing == 0:
+                    del self.in_flight_rows[id(row)]
+                    self.__add_ready_row(row, state.idx)
+
+    def __process_input_batch(self, input_batch: DataRowBatch, executor: futures.ThreadPoolExecutor) -> None:
+        """Process a batch of input rows, submitting URLs for download and adding ready rows to ready_rows"""
         file_cache = FileCache.get()
 
         # URLs from this input batch that aren't already in the file cache;
@@ -183,7 +174,7 @@
         # the time it takes to get the next batch together
         cache_misses: list[str] = []
 
-        url_pos: dict[str, int] = {}  # url -> row_idx; used for logging
+        url_pos: dict[str, int | None] = {}  # url -> row_idx; used for logging
         for row in input_batch:
             # identify missing local files in input batch, or fill in their paths if they're already cached
             num_missing = 0
@@ -222,8 +213,10 @@
             _logger.debug(f'submitted {url} for idx {url_pos[url]}')
             self.in_flight_requests[f] = url
 
-    def __fetch_url(self, url: str) -> tuple[Optional[Path], Optional[Exception]]:
-        """Fetches a remote URL into Env.tmp_dir and returns its path"""
+    def __fetch_url(self, url: str) -> tuple[Path | None, Exception | None]:
+        """Fetches a remote URL into the TempStore and returns its path"""
+        from pixeltable.utils.local_store import TempStore
+
         _logger.debug(f'fetching url={url} thread_name={threading.current_thread().name}')
         parsed = urllib.parse.urlparse(url)
         # Use len(parsed.scheme) > 1 here to ensure we're not being passed
@@ -234,34 +227,14 @@
         if parsed.path:
             p = Path(urllib.parse.unquote(urllib.request.url2pathname(parsed.path)))
             extension = p.suffix
-        tmp_path = env.Env.get().create_tmp_path(extension=extension)
+        tmp_path = TempStore.create_path(extension=extension)
         try:
             _logger.debug(f'Downloading {url} to {tmp_path}')
-            if parsed.scheme == 's3':
-                from pixeltable.utils.s3 import get_client
-
-                with self.boto_client_lock:
-                    if self.boto_client is None:
-                        config = {
-                            'max_pool_connections': self.NUM_EXECUTOR_THREADS + 4,  # +4: leave some headroom
-                            'connect_timeout': 5,
-                            'read_timeout': 30,
-                            'retries': {'max_attempts': 3, 'mode': 'adaptive'},
-                        }
-                        self.boto_client = get_client(**config)
-                self.boto_client.download_file(parsed.netloc, parsed.path.lstrip('/'), str(tmp_path))
-            elif parsed.scheme in ('http', 'https'):
-                with urllib.request.urlopen(url) as resp, open(tmp_path, 'wb') as f:
-                    data = resp.read()
-                    f.write(data)
-            else:
-                raise AssertionError(f'Unsupported URL scheme: {parsed.scheme}')
+            ObjectOps.copy_object_to_local_file(url, tmp_path)
             _logger.debug(f'Downloaded {url} to {tmp_path}')
             return tmp_path, None
         except Exception as e:
             # we want to add the file url to the exception message
             exc = excs.Error(f'Failed to download {url}: {e}')
             _logger.debug(f'Failed to download {url}: {e}', exc_info=e)
-            if not self.ctx.ignore_errors:
-                raise exc from None  # suppress original exception
             return None, exc
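The rewritten __aiter__ above keeps a bounded queue of download futures between a high and a low water mark. A self-contained sketch of that scheduling pattern, using illustrative names and a dummy fetch function rather than the pixeltable API:

    from concurrent import futures
    import time

    HIGH_WATER, LOW_WATER = 50, 20

    def fetch(url: str) -> str:
        time.sleep(0.01)  # stand-in for a download
        return url

    urls = iter(f'https://example.com/{i}' for i in range(200))
    in_flight: dict[futures.Future, str] = {}
    done_urls: list[str] = []

    with futures.ThreadPoolExecutor(max_workers=15) as executor:
        exhausted = False
        while not exhausted or in_flight:
            # fill the queue up to the high-water mark
            while not exhausted and len(in_flight) < HIGH_WATER:
                url = next(urls, None)
                if url is None:
                    exhausted = True
                    break
                in_flight[executor.submit(fetch, url)] = url
            # drain until we drop below the low-water mark (or finish the remaining work)
            while len(in_flight) > LOW_WATER or (exhausted and in_flight):
                done, _ = futures.wait(in_flight, return_when=futures.FIRST_COMPLETED)
                for f in done:
                    done_urls.append(f.result())
                    del in_flight[f]

    assert len(done_urls) == 200

Keeping the two thresholds apart means the executor is refilled in bursts instead of one future at a time, which is the same trade-off the node makes between throughput and the number of in-flight rows.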
pixeltable/exec/cell_materialization_node.py ADDED
@@ -0,0 +1,231 @@
+from __future__ import annotations
+
+import io
+import logging
+import os
+from pathlib import Path
+from typing import Any, AsyncIterator
+
+import numpy as np
+import pgvector.sqlalchemy  # type: ignore[import-untyped]
+import PIL.Image
+import sqlalchemy as sql
+
+import pixeltable.type_system as ts
+import pixeltable.utils.image as image_utils
+from pixeltable import catalog, exprs
+from pixeltable.env import Env
+from pixeltable.utils.local_store import LocalStore
+
+from .data_row_batch import DataRowBatch
+from .exec_node import ExecNode
+from .globals import INLINED_OBJECT_MD_KEY, InlinedObjectMd
+
+_logger = logging.getLogger('pixeltable')
+
+
+class CellMaterializationNode(ExecNode):
+    """
+    Node to populate DataRow.cell_vals/cell_md.
+
+    For now, the scope is limited to populating DataRow.cells_vals for json and array columns.
+
+    Array values:
+    - Arrays < MAX_DB_ARRAY_SIZE are stored inline in the db column
+    - Larger arrays are written to inlined_obj_files
+    - Bool arrays are stored as packed bits (uint8)
+    - cell_md: holds the url of the file, plus start and end offsets, plus bool flag and shape for bool arrays
+      (this allows us to query cell_md to get the total external storage size of an array column)
+
+    Json values:
+    - Inlined images and arrays are written to inlined_obj_files and replaced with a dict containing the object
+      location
+    - Bool arrays are also stored as packed bits; the dict also contains the shape and bool flag
+    - cell_md contains the list of urls for the inlined objects.
+
+    TODO:
+    - execute file IO via asyncio Tasks in a thread pool?
+      (we already seem to be getting 90% of hardware IO throughput)
+    - subsume all cell materialization
+    """
+
+    output_col_info: dict[catalog.Column, int]  # value: slot idx
+
+    # execution state
+    inlined_obj_files: list[Path]  # only [-1] is open for writing
+    buffered_writer: io.BufferedWriter | None  # BufferedWriter for inlined_obj_files[-1]
+
+    MIN_FILE_SIZE = 8 * 2**20  # 8MB
+    MAX_DB_ARRAY_SIZE = 512  # max size of array stored in table column; in bytes
+
+    def __init__(self, input: ExecNode):
+        super().__init__(input.row_builder, [], [], input)
+        self.output_col_info = {
+            col: slot_idx
+            for col, slot_idx in input.row_builder.table_columns.items()
+            if slot_idx is not None and (col.col_type.is_json_type() or col.col_type.is_array_type())
+        }
+        self.inlined_obj_files = []
+        self.buffered_writer = None
+
+    async def __aiter__(self) -> AsyncIterator[DataRowBatch]:
+        async for batch in self.input:
+            for row in batch:
+                for col, slot_idx in self.output_col_info.items():
+                    if row.has_exc(slot_idx):
+                        # Nulls in JSONB columns need to be stored as sql.sql.null(), otherwise it stores a json 'null'
+                        row.cell_vals[col.id] = sql.sql.null() if col.col_type.is_json_type() else None
+                        exc = row.get_exc(slot_idx)
+                        row.cell_md[col.id] = exprs.CellMd(errortype=type(exc).__name__, errormsg=str(exc))
+                        continue
+
+                    val = row[slot_idx]
+                    if val is None:
+                        row.cell_vals[col.id] = sql.sql.null() if col.col_type.is_json_type() else None
+                        row.cell_md[col.id] = None
+                        continue
+
+                    if col.col_type.is_json_type():
+                        self._materialize_json_cell(row, col, val)
+                    else:
+                        assert col.col_type.is_array_type()
+                        assert isinstance(val, np.ndarray)
+                        self._materialize_array_cell(row, col, val)
+
+            # continue with only the currently open file
+            self.inlined_obj_files = self.inlined_obj_files[-1:]
+
+            yield batch
+
+        self._flush_buffer(finalize=True)
+
+    def init_writer(self) -> None:
+        if self.buffered_writer is None:
+            self._reset_buffer()
+        assert self.buffered_writer is not None
+
+    def close(self) -> None:
+        if self.buffered_writer is not None:
+            # there must have been an error, otherwise _flush_full_buffer(finalize=True) would have set this to None
+            self.buffered_writer.close()
+            self.buffered_writer = None
+
+    def _materialize_json_cell(self, row: exprs.DataRow, col: catalog.Column, val: Any) -> None:
+        if self._json_has_inlined_objs(val):
+            row.cell_vals[col.id] = self._rewrite_json(val)
+            row.cell_md[col.id] = exprs.CellMd(file_urls=[local_path.as_uri() for local_path in self.inlined_obj_files])
+        else:
+            row.cell_vals[col.id] = val
+            row.cell_md[col.id] = None
+
+    def _materialize_array_cell(self, row: exprs.DataRow, col: catalog.Column, val: np.ndarray) -> None:
+        if isinstance(col.sa_col_type, pgvector.sqlalchemy.Vector):
+            # this is a vector column (ie, used for a vector index): store the array itself
+            row.cell_vals[col.id] = val
+            row.cell_md[col.id] = None
+        elif val.nbytes <= self.MAX_DB_ARRAY_SIZE:
+            # this array is small enough to store in the db column (type: binary) directly
+            buffer = io.BytesIO()
+            np.save(buffer, val, allow_pickle=False)
+            row.cell_vals[col.id] = buffer.getvalue()
+            row.cell_md[col.id] = None
+        else:
+            # append this array to the buffer and store its location in the cell md
+            ar: np.ndarray
+            if np.issubdtype(val.dtype, np.bool_):
+                # for bool arrays, store as packed bits, otherwise it's 1 byte per element
+                ar = np.packbits(val)
+            else:
+                ar = val
+            self.init_writer()
+            start = self.buffered_writer.tell()
+            np.save(self.buffered_writer, ar, allow_pickle=False)
+            end = self.buffered_writer.tell()
+            row.cell_vals[col.id] = None
+            cell_md = exprs.CellMd(
+                file_urls=[self.inlined_obj_files[-1].as_uri()], array_md=exprs.ArrayMd(start=start, end=end)
+            )
+            if np.issubdtype(val.dtype, np.bool_):
+                cell_md.array_md.is_bool = True
+                cell_md.array_md.shape = val.shape
+            row.cell_md[col.id] = cell_md
+            self._flush_buffer()
+
+        assert row.cell_vals[col.id] is not None or row.cell_md[col.id] is not None
+
+    def _json_has_inlined_objs(self, element: Any) -> bool:
+        if isinstance(element, list):
+            return any(self._json_has_inlined_objs(v) for v in element)
+        if isinstance(element, dict):
+            return any(self._json_has_inlined_objs(v) for v in element.values())
+        return isinstance(element, (np.ndarray, PIL.Image.Image))
+
+    def _rewrite_json(self, element: Any) -> Any:
+        """Recursively rewrites a JSON structure by writing any inlined arrays or images to self.buffered_writer."""
+        if isinstance(element, list):
+            return [self._rewrite_json(v) for v in element]
+        if isinstance(element, dict):
+            return {k: self._rewrite_json(v) for k, v in element.items()}
+        if isinstance(element, np.ndarray):
+            obj_md = self._write_inlined_array(element)
+            return {INLINED_OBJECT_MD_KEY: obj_md.as_dict()}
+        if isinstance(element, PIL.Image.Image):
+            obj_md = self._write_inlined_image(element)
+            return {INLINED_OBJECT_MD_KEY: obj_md.as_dict()}
+        return element
+
+    def _write_inlined_array(self, ar: np.ndarray) -> InlinedObjectMd:
+        """Write an ndarray to buffered_writer and return its metadata."""
+        self.init_writer()
+        url_idx = len(self.inlined_obj_files) - 1
+        start = self.buffered_writer.tell()
+        shape: tuple[int, ...] | None
+        is_bool_array: bool
+        if np.issubdtype(ar.dtype, np.bool_):
+            shape = ar.shape
+            ar = np.packbits(ar)
+            is_bool_array = True
+        else:
+            shape = None
+            is_bool_array = False
+        np.save(self.buffered_writer, ar, allow_pickle=False)
+        end = self.buffered_writer.tell()
+        self._flush_buffer()
+        return InlinedObjectMd(
+            type=ts.ColumnType.Type.ARRAY.name,
+            url_idx=url_idx,
+            array_md=exprs.ArrayMd(start=start, end=end, is_bool=is_bool_array, shape=shape),
+        )
+
+    def _write_inlined_image(self, img: PIL.Image.Image) -> InlinedObjectMd:
+        """Write a PIL image to buffered_writer and return: index into inlined_obj_files, start offset, end offset"""
+        self.init_writer()
+        url_idx = len(self.inlined_obj_files) - 1
+        start = self.buffered_writer.tell()
+        img.save(self.buffered_writer, format=image_utils.default_format(img))
+        end = self.buffered_writer.tell()
+        self._flush_buffer()
+        return InlinedObjectMd(type=ts.ColumnType.Type.IMAGE.name, url_idx=url_idx, img_start=start, img_end=end)
+
+    def _reset_buffer(self) -> None:
+        local_path = LocalStore(Env.get().media_dir)._prepare_path_raw(
+            self.row_builder.tbl.id, 0, self.row_builder.tbl.version
+        )
+        self.inlined_obj_files.append(local_path)
+        fh = open(local_path, 'wb', buffering=self.MIN_FILE_SIZE * 2)  # noqa: SIM115
+        assert isinstance(fh, io.BufferedWriter)
+        self.buffered_writer = fh
+
+    def _flush_buffer(self, finalize: bool = False) -> None:
+        """Flush buffered_writer to storage if it exceeds its minimum size or finalize is True."""
+        if self.buffered_writer is None:
+            return
+        if self.buffered_writer.tell() < self.MIN_FILE_SIZE and not finalize:
+            return
+        self.buffered_writer.flush()
+        os.fsync(self.buffered_writer.fileno())  # needed to force bytes cached by OS to storage
+        self.buffered_writer.close()
+        if finalize:
+            self.buffered_writer = None
+        else:
+            self._reset_buffer()
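The bool-array handling above (packed bits plus recorded byte offsets) can be seen end to end in a small round trip. This is a minimal illustration using an in-memory buffer in place of an inlined_obj_file; reading the data back is the job of the new CellReconstructionNode, which is not shown here:

    import io

    import numpy as np

    buf = io.BytesIO()                     # stand-in for one of the inlined_obj_files
    mask = np.random.rand(4, 1000) > 0.5   # bool array: 4000 elements, 4000 bytes raw

    start = buf.tell()
    np.save(buf, np.packbits(mask), allow_pickle=False)  # ~500 bytes of payload
    end = buf.tell()                       # (start, end) is what the cell metadata records

    buf.seek(start)
    packed = np.load(buf, allow_pickle=False)
    restored = np.unpackbits(packed, count=mask.size).reshape(mask.shape).astype(bool)
    assert np.array_equal(mask, restored)

Because packbits drops the original shape and dtype, the shape and is_bool flag recorded in the cell metadata are what make the reconstruction possible.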