pixeltable 0.3.14__py3-none-any.whl → 0.5.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (220)
  1. pixeltable/__init__.py +42 -8
  2. pixeltable/{dataframe.py → _query.py} +470 -206
  3. pixeltable/_version.py +1 -0
  4. pixeltable/catalog/__init__.py +5 -4
  5. pixeltable/catalog/catalog.py +1785 -432
  6. pixeltable/catalog/column.py +190 -113
  7. pixeltable/catalog/dir.py +2 -4
  8. pixeltable/catalog/globals.py +19 -46
  9. pixeltable/catalog/insertable_table.py +191 -98
  10. pixeltable/catalog/path.py +63 -23
  11. pixeltable/catalog/schema_object.py +11 -15
  12. pixeltable/catalog/table.py +843 -436
  13. pixeltable/catalog/table_metadata.py +103 -0
  14. pixeltable/catalog/table_version.py +978 -657
  15. pixeltable/catalog/table_version_handle.py +72 -16
  16. pixeltable/catalog/table_version_path.py +112 -43
  17. pixeltable/catalog/tbl_ops.py +53 -0
  18. pixeltable/catalog/update_status.py +191 -0
  19. pixeltable/catalog/view.py +134 -90
  20. pixeltable/config.py +134 -22
  21. pixeltable/env.py +471 -157
  22. pixeltable/exceptions.py +6 -0
  23. pixeltable/exec/__init__.py +4 -1
  24. pixeltable/exec/aggregation_node.py +7 -8
  25. pixeltable/exec/cache_prefetch_node.py +83 -110
  26. pixeltable/exec/cell_materialization_node.py +268 -0
  27. pixeltable/exec/cell_reconstruction_node.py +168 -0
  28. pixeltable/exec/component_iteration_node.py +4 -3
  29. pixeltable/exec/data_row_batch.py +8 -65
  30. pixeltable/exec/exec_context.py +16 -4
  31. pixeltable/exec/exec_node.py +13 -36
  32. pixeltable/exec/expr_eval/evaluators.py +11 -7
  33. pixeltable/exec/expr_eval/expr_eval_node.py +27 -12
  34. pixeltable/exec/expr_eval/globals.py +8 -5
  35. pixeltable/exec/expr_eval/row_buffer.py +1 -2
  36. pixeltable/exec/expr_eval/schedulers.py +106 -56
  37. pixeltable/exec/globals.py +35 -0
  38. pixeltable/exec/in_memory_data_node.py +19 -19
  39. pixeltable/exec/object_store_save_node.py +293 -0
  40. pixeltable/exec/row_update_node.py +16 -9
  41. pixeltable/exec/sql_node.py +351 -84
  42. pixeltable/exprs/__init__.py +1 -1
  43. pixeltable/exprs/arithmetic_expr.py +27 -22
  44. pixeltable/exprs/array_slice.py +3 -3
  45. pixeltable/exprs/column_property_ref.py +36 -23
  46. pixeltable/exprs/column_ref.py +213 -89
  47. pixeltable/exprs/comparison.py +5 -5
  48. pixeltable/exprs/compound_predicate.py +5 -4
  49. pixeltable/exprs/data_row.py +164 -54
  50. pixeltable/exprs/expr.py +70 -44
  51. pixeltable/exprs/expr_dict.py +3 -3
  52. pixeltable/exprs/expr_set.py +17 -10
  53. pixeltable/exprs/function_call.py +100 -40
  54. pixeltable/exprs/globals.py +2 -2
  55. pixeltable/exprs/in_predicate.py +4 -4
  56. pixeltable/exprs/inline_expr.py +18 -32
  57. pixeltable/exprs/is_null.py +7 -3
  58. pixeltable/exprs/json_mapper.py +8 -8
  59. pixeltable/exprs/json_path.py +56 -22
  60. pixeltable/exprs/literal.py +27 -5
  61. pixeltable/exprs/method_ref.py +2 -2
  62. pixeltable/exprs/object_ref.py +2 -2
  63. pixeltable/exprs/row_builder.py +167 -67
  64. pixeltable/exprs/rowid_ref.py +25 -10
  65. pixeltable/exprs/similarity_expr.py +58 -40
  66. pixeltable/exprs/sql_element_cache.py +4 -4
  67. pixeltable/exprs/string_op.py +5 -5
  68. pixeltable/exprs/type_cast.py +3 -5
  69. pixeltable/func/__init__.py +1 -0
  70. pixeltable/func/aggregate_function.py +8 -8
  71. pixeltable/func/callable_function.py +9 -9
  72. pixeltable/func/expr_template_function.py +17 -11
  73. pixeltable/func/function.py +18 -20
  74. pixeltable/func/function_registry.py +6 -7
  75. pixeltable/func/globals.py +2 -3
  76. pixeltable/func/mcp.py +74 -0
  77. pixeltable/func/query_template_function.py +29 -27
  78. pixeltable/func/signature.py +46 -19
  79. pixeltable/func/tools.py +31 -13
  80. pixeltable/func/udf.py +18 -20
  81. pixeltable/functions/__init__.py +16 -0
  82. pixeltable/functions/anthropic.py +123 -77
  83. pixeltable/functions/audio.py +147 -10
  84. pixeltable/functions/bedrock.py +13 -6
  85. pixeltable/functions/date.py +7 -4
  86. pixeltable/functions/deepseek.py +35 -43
  87. pixeltable/functions/document.py +81 -0
  88. pixeltable/functions/fal.py +76 -0
  89. pixeltable/functions/fireworks.py +11 -20
  90. pixeltable/functions/gemini.py +195 -39
  91. pixeltable/functions/globals.py +142 -14
  92. pixeltable/functions/groq.py +108 -0
  93. pixeltable/functions/huggingface.py +1056 -24
  94. pixeltable/functions/image.py +115 -57
  95. pixeltable/functions/json.py +1 -1
  96. pixeltable/functions/llama_cpp.py +28 -13
  97. pixeltable/functions/math.py +67 -5
  98. pixeltable/functions/mistralai.py +18 -55
  99. pixeltable/functions/net.py +70 -0
  100. pixeltable/functions/ollama.py +20 -13
  101. pixeltable/functions/openai.py +240 -226
  102. pixeltable/functions/openrouter.py +143 -0
  103. pixeltable/functions/replicate.py +4 -4
  104. pixeltable/functions/reve.py +250 -0
  105. pixeltable/functions/string.py +239 -69
  106. pixeltable/functions/timestamp.py +16 -16
  107. pixeltable/functions/together.py +24 -84
  108. pixeltable/functions/twelvelabs.py +188 -0
  109. pixeltable/functions/util.py +6 -1
  110. pixeltable/functions/uuid.py +30 -0
  111. pixeltable/functions/video.py +1515 -107
  112. pixeltable/functions/vision.py +8 -8
  113. pixeltable/functions/voyageai.py +289 -0
  114. pixeltable/functions/whisper.py +16 -8
  115. pixeltable/functions/whisperx.py +179 -0
  116. pixeltable/{ext/functions → functions}/yolox.py +2 -4
  117. pixeltable/globals.py +362 -115
  118. pixeltable/index/base.py +17 -21
  119. pixeltable/index/btree.py +28 -22
  120. pixeltable/index/embedding_index.py +100 -118
  121. pixeltable/io/__init__.py +4 -2
  122. pixeltable/io/datarows.py +8 -7
  123. pixeltable/io/external_store.py +56 -105
  124. pixeltable/io/fiftyone.py +13 -13
  125. pixeltable/io/globals.py +31 -30
  126. pixeltable/io/hf_datasets.py +61 -16
  127. pixeltable/io/label_studio.py +74 -70
  128. pixeltable/io/lancedb.py +3 -0
  129. pixeltable/io/pandas.py +21 -12
  130. pixeltable/io/parquet.py +25 -105
  131. pixeltable/io/table_data_conduit.py +250 -123
  132. pixeltable/io/utils.py +4 -4
  133. pixeltable/iterators/__init__.py +2 -1
  134. pixeltable/iterators/audio.py +26 -25
  135. pixeltable/iterators/base.py +9 -3
  136. pixeltable/iterators/document.py +112 -78
  137. pixeltable/iterators/image.py +12 -15
  138. pixeltable/iterators/string.py +11 -4
  139. pixeltable/iterators/video.py +523 -120
  140. pixeltable/metadata/__init__.py +14 -3
  141. pixeltable/metadata/converters/convert_13.py +2 -2
  142. pixeltable/metadata/converters/convert_18.py +2 -2
  143. pixeltable/metadata/converters/convert_19.py +2 -2
  144. pixeltable/metadata/converters/convert_20.py +2 -2
  145. pixeltable/metadata/converters/convert_21.py +2 -2
  146. pixeltable/metadata/converters/convert_22.py +2 -2
  147. pixeltable/metadata/converters/convert_24.py +2 -2
  148. pixeltable/metadata/converters/convert_25.py +2 -2
  149. pixeltable/metadata/converters/convert_26.py +2 -2
  150. pixeltable/metadata/converters/convert_29.py +4 -4
  151. pixeltable/metadata/converters/convert_30.py +34 -21
  152. pixeltable/metadata/converters/convert_34.py +2 -2
  153. pixeltable/metadata/converters/convert_35.py +9 -0
  154. pixeltable/metadata/converters/convert_36.py +38 -0
  155. pixeltable/metadata/converters/convert_37.py +15 -0
  156. pixeltable/metadata/converters/convert_38.py +39 -0
  157. pixeltable/metadata/converters/convert_39.py +124 -0
  158. pixeltable/metadata/converters/convert_40.py +73 -0
  159. pixeltable/metadata/converters/convert_41.py +12 -0
  160. pixeltable/metadata/converters/convert_42.py +9 -0
  161. pixeltable/metadata/converters/convert_43.py +44 -0
  162. pixeltable/metadata/converters/util.py +20 -31
  163. pixeltable/metadata/notes.py +9 -0
  164. pixeltable/metadata/schema.py +140 -53
  165. pixeltable/metadata/utils.py +74 -0
  166. pixeltable/mypy/__init__.py +3 -0
  167. pixeltable/mypy/mypy_plugin.py +123 -0
  168. pixeltable/plan.py +382 -115
  169. pixeltable/share/__init__.py +1 -1
  170. pixeltable/share/packager.py +547 -83
  171. pixeltable/share/protocol/__init__.py +33 -0
  172. pixeltable/share/protocol/common.py +165 -0
  173. pixeltable/share/protocol/operation_types.py +33 -0
  174. pixeltable/share/protocol/replica.py +119 -0
  175. pixeltable/share/publish.py +257 -59
  176. pixeltable/store.py +311 -194
  177. pixeltable/type_system.py +373 -211
  178. pixeltable/utils/__init__.py +2 -3
  179. pixeltable/utils/arrow.py +131 -17
  180. pixeltable/utils/av.py +298 -0
  181. pixeltable/utils/azure_store.py +346 -0
  182. pixeltable/utils/coco.py +6 -6
  183. pixeltable/utils/code.py +3 -3
  184. pixeltable/utils/console_output.py +4 -1
  185. pixeltable/utils/coroutine.py +6 -23
  186. pixeltable/utils/dbms.py +32 -6
  187. pixeltable/utils/description_helper.py +4 -5
  188. pixeltable/utils/documents.py +7 -18
  189. pixeltable/utils/exception_handler.py +7 -30
  190. pixeltable/utils/filecache.py +6 -6
  191. pixeltable/utils/formatter.py +86 -48
  192. pixeltable/utils/gcs_store.py +295 -0
  193. pixeltable/utils/http.py +133 -0
  194. pixeltable/utils/http_server.py +2 -3
  195. pixeltable/utils/iceberg.py +1 -2
  196. pixeltable/utils/image.py +17 -0
  197. pixeltable/utils/lancedb.py +90 -0
  198. pixeltable/utils/local_store.py +322 -0
  199. pixeltable/utils/misc.py +5 -0
  200. pixeltable/utils/object_stores.py +573 -0
  201. pixeltable/utils/pydantic.py +60 -0
  202. pixeltable/utils/pytorch.py +5 -6
  203. pixeltable/utils/s3_store.py +527 -0
  204. pixeltable/utils/sql.py +26 -0
  205. pixeltable/utils/system.py +30 -0
  206. pixeltable-0.5.7.dist-info/METADATA +579 -0
  207. pixeltable-0.5.7.dist-info/RECORD +227 -0
  208. {pixeltable-0.3.14.dist-info → pixeltable-0.5.7.dist-info}/WHEEL +1 -1
  209. pixeltable-0.5.7.dist-info/entry_points.txt +2 -0
  210. pixeltable/__version__.py +0 -3
  211. pixeltable/catalog/named_function.py +0 -40
  212. pixeltable/ext/__init__.py +0 -17
  213. pixeltable/ext/functions/__init__.py +0 -11
  214. pixeltable/ext/functions/whisperx.py +0 -77
  215. pixeltable/utils/media_store.py +0 -77
  216. pixeltable/utils/s3.py +0 -17
  217. pixeltable-0.3.14.dist-info/METADATA +0 -434
  218. pixeltable-0.3.14.dist-info/RECORD +0 -186
  219. pixeltable-0.3.14.dist-info/entry_points.txt +0 -3
  220. {pixeltable-0.3.14.dist-info → pixeltable-0.5.7.dist-info/licenses}/LICENSE +0 -0
pixeltable/exceptions.py CHANGED
@@ -10,6 +10,12 @@ class Error(Exception):
 
 
 class ExprEvalError(Exception):
+    """
+    Used during query execution to signal expr evaluation failures.
+
+    NOT A USER-FACING EXCEPTION. All ExprEvalError instances need to be converted into Error instances.
+    """
+
     expr: 'exprs.Expr'
     expr_msg: str
     exc: Exception
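The docstring spells out the contract: ExprEvalError is internal and must be converted into a user-facing Error before it reaches callers. A minimal sketch of that conversion pattern, using only the fields declared above (expr_msg, exc); Error/ExprEvalError here mirror pixeltable.exceptions, and run_plan()/execute() are hypothetical stand-ins, not pixeltable entry points:

# illustrative sketch only, not pixeltable code
class Error(Exception):
    """User-facing error."""

class ExprEvalError(Exception):
    def __init__(self, expr_msg: str, exc: Exception) -> None:
        self.expr_msg = expr_msg
        self.exc = exc

def run_plan(plan: object) -> None:
    """Stand-in for query execution that may raise ExprEvalError."""

def execute(plan: object) -> None:
    try:
        run_plan(plan)
    except ExprEvalError as e:
        # convert before the exception crosses the user-facing API boundary
        raise Error(f'Error evaluating {e.expr_msg}: {e.exc}') from e.exc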
pixeltable/exec/__init__.py CHANGED
@@ -2,11 +2,14 @@
 
 from .aggregation_node import AggregationNode
 from .cache_prefetch_node import CachePrefetchNode
+from .cell_materialization_node import CellMaterializationNode
+from .cell_reconstruction_node import CellReconstructionNode
 from .component_iteration_node import ComponentIterationNode
 from .data_row_batch import DataRowBatch
 from .exec_context import ExecContext
 from .exec_node import ExecNode
 from .expr_eval import ExprEvalNode
 from .in_memory_data_node import InMemoryDataNode
+from .object_store_save_node import ObjectStoreSaveNode
 from .row_update_node import RowUpdateNode
-from .sql_node import SqlAggregationNode, SqlJoinNode, SqlLookupNode, SqlNode, SqlScanNode
+from .sql_node import SqlAggregationNode, SqlJoinNode, SqlLookupNode, SqlNode, SqlSampleNode, SqlScanNode
pixeltable/exec/aggregation_node.py CHANGED
@@ -2,7 +2,7 @@ from __future__ import annotations
 
 import logging
 import sys
-from typing import Any, AsyncIterator, Iterable, Optional, cast
+from typing import Any, AsyncIterator, Iterable, cast
 
 from pixeltable import catalog, exceptions as excs, exprs
 
@@ -19,18 +19,18 @@ class AggregationNode(ExecNode):
     At the moment, this returns all results in a single DataRowBatch.
     """
 
-    group_by: Optional[list[exprs.Expr]]
+    group_by: list[exprs.Expr] | None
     input_exprs: list[exprs.Expr]
     agg_fn_eval_ctx: exprs.RowBuilder.EvalCtx
    agg_fn_calls: list[exprs.FunctionCall]
     output_batch: DataRowBatch
-    limit: Optional[int]
+    limit: int | None
 
     def __init__(
         self,
         tbl: catalog.TableVersionHandle,
         row_builder: exprs.RowBuilder,
-        group_by: Optional[list[exprs.Expr]],
+        group_by: list[exprs.Expr] | None,
         agg_fn_calls: list[exprs.FunctionCall],
         input_exprs: Iterable[exprs.Expr],
         input: ExecNode,
@@ -45,7 +45,7 @@ class AggregationNode(ExecNode):
         # we need to make sure to refer to the same exprs that RowBuilder.eval() will use
         self.agg_fn_calls = [cast(exprs.FunctionCall, e) for e in self.agg_fn_eval_ctx.target_exprs]
         # create output_batch here, rather than in __iter__(), so we don't need to remember tbl and row_builder
-        self.output_batch = DataRowBatch(tbl, row_builder, 0)
+        self.output_batch = DataRowBatch(row_builder)
         self.limit = None
 
     def set_limit(self, limit: int) -> None:
@@ -72,8 +72,8 @@ class AggregationNode(ExecNode):
             raise excs.ExprEvalError(fn_call, expr_msg, exc, exc_tb, input_vals, row_num) from exc
 
     async def __aiter__(self) -> AsyncIterator[DataRowBatch]:
-        prev_row: Optional[exprs.DataRow] = None
-        current_group: Optional[list[Any]] = None  # the values of the group-by exprs
+        prev_row: exprs.DataRow | None = None
+        current_group: list[Any] | None = None  # the values of the group-by exprs
         num_input_rows = 0
         num_output_rows = 0
         async for row_batch in self.input:
@@ -103,6 +103,5 @@ class AggregationNode(ExecNode):
             self.row_builder.eval(prev_row, self.agg_fn_eval_ctx, profile=self.ctx.profile)
             self.output_batch.add_row(prev_row)
 
-        self.output_batch.flush_imgs(None, self.stored_img_cols, self.flushed_img_slots)
         _logger.debug(f'AggregateNode: consumed {num_input_rows} rows, returning {len(self.output_batch.rows)} rows')
         yield self.output_batch
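For context, the __aiter__ changes above keep the existing streaming group-by scheme: input rows arrive ordered by the group-by values, a row whose group-by values differ from prev_row's closes out the current group, and the final group is flushed after the input is exhausted. A self-contained sketch of that pattern (illustrative names, not pixeltable APIs):

# streaming group-by over input sorted by the grouping key
from typing import Any, Iterable, Iterator

def streaming_group_sums(rows: Iterable[tuple[Any, float]]) -> Iterator[tuple[Any, float]]:
    current_key: Any = None
    acc = 0.0
    have_group = False
    for key, val in rows:
        if have_group and key != current_key:
            yield current_key, acc  # previous group is complete
            acc = 0.0
        current_key, have_group = key, True
        acc += val
    if have_group:
        yield current_key, acc  # flush the final group

# e.g. list(streaming_group_sums([('a', 1), ('a', 2), ('b', 5)])) == [('a', 3.0), ('b', 5.0)]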
pixeltable/exec/cache_prefetch_node.py CHANGED
@@ -9,11 +9,12 @@ import urllib.request
 from collections import deque
 from concurrent import futures
 from pathlib import Path
-from typing import Any, AsyncIterator, Iterator, Optional
+from typing import AsyncIterator, Iterator
 from uuid import UUID
 
-from pixeltable import catalog, env, exceptions as excs, exprs
+from pixeltable import exceptions as excs, exprs
 from pixeltable.utils.filecache import FileCache
+from pixeltable.utils.object_stores import ObjectOps
 
 from .data_row_batch import DataRowBatch
 from .exec_node import ExecNode
@@ -25,35 +26,35 @@ class CachePrefetchNode(ExecNode):
     """Brings files with external URLs into the cache
 
     TODO:
-    - adapting the number of download threads at runtime to maximize throughput
+    - Process a row at a time and limit the number of in-flight rows to control memory usage
+    - Create asyncio.Tasks to consume our input in order to increase concurrency.
     """
 
+    QUEUE_DEPTH_HIGH_WATER = 50  # target number of in-flight requests
+    QUEUE_DEPTH_LOW_WATER = 20  # target number of in-flight requests
     BATCH_SIZE = 16
-    NUM_EXECUTOR_THREADS = 16
+    MAX_WORKERS = 15
 
     retain_input_order: bool  # if True, return rows in the exact order they were received
     file_col_info: list[exprs.ColumnSlotIdx]
-    boto_client: Optional[Any]
-    boto_client_lock: threading.Lock
 
     # execution state
-    batch_tbl_version: Optional[catalog.TableVersionHandle]  # needed to construct output batches
     num_returned_rows: int
 
     # ready_rows: rows that are ready to be returned, ordered by row idx;
     # the implied row idx of ready_rows[0] is num_returned_rows
-    ready_rows: deque[Optional[exprs.DataRow]]
+    ready_rows: deque[exprs.DataRow | None]
 
     in_flight_rows: dict[int, CachePrefetchNode.RowState]  # rows with in-flight urls; id(row) -> RowState
     in_flight_requests: dict[futures.Future, str]  # in-flight requests for urls; future -> URL
     in_flight_urls: dict[str, list[tuple[exprs.DataRow, exprs.ColumnSlotIdx]]]  # URL -> [(row, info)]
     input_finished: bool
-    row_idx: Iterator[Optional[int]]
+    row_idx: Iterator[int | None]
 
     @dataclasses.dataclass
     class RowState:
         row: exprs.DataRow
-        idx: Optional[int]  # position in input stream; None if we don't retain input order
+        idx: int | None  # position in input stream; None if we don't retain input order
         num_missing: int  # number of missing URLs in this row
 
     def __init__(
@@ -64,11 +65,6 @@ class CachePrefetchNode(ExecNode):
         self.retain_input_order = retain_input_order
         self.file_col_info = file_col_info
 
-        # clients for specific services are constructed as needed, because it's time-consuming
-        self.boto_client = None
-        self.boto_client_lock = threading.Lock()
-
-        self.batch_tbl_version = None
         self.num_returned_rows = 0
         self.ready_rows = deque()
         self.in_flight_rows = {}
@@ -76,26 +72,44 @@ class CachePrefetchNode(ExecNode):
         self.in_flight_urls = {}
         self.input_finished = False
         self.row_idx = itertools.count() if retain_input_order else itertools.repeat(None)
+        assert self.QUEUE_DEPTH_HIGH_WATER > self.QUEUE_DEPTH_LOW_WATER
 
-    async def __aiter__(self) -> AsyncIterator[DataRowBatch]:
-        input_iter = self.input.__aiter__()
-        with futures.ThreadPoolExecutor(max_workers=self.NUM_EXECUTOR_THREADS) as executor:
-            # we create enough in-flight requests to fill the first batch
-            while not self.input_finished and self.__num_pending_rows() < self.BATCH_SIZE:
-                await self.__submit_input_batch(input_iter, executor)
+    @property
+    def queued_work(self) -> int:
+        return len(self.in_flight_requests)
 
-            while True:
-                # try to assemble a full batch of output rows
-                if not self.__has_ready_batch() and len(self.in_flight_requests) > 0:
-                    self.__wait_for_requests()
-
-                # try to create enough in-flight requests to fill the next batch
-                while not self.input_finished and self.__num_pending_rows() < self.BATCH_SIZE:
-                    await self.__submit_input_batch(input_iter, executor)
+    async def get_input_batch(self, input_iter: AsyncIterator[DataRowBatch]) -> DataRowBatch | None:
+        """Get the next batch of input rows, or None if there are no more rows"""
+        try:
+            input_batch = await anext(input_iter)
+            if input_batch is None:
+                self.input_finished = True
+            return input_batch
+        except StopAsyncIteration:
+            self.input_finished = True
+            return None
 
-                if len(self.ready_rows) > 0:
+    async def __aiter__(self) -> AsyncIterator[DataRowBatch]:
+        input_iter = aiter(self.input)
+        with futures.ThreadPoolExecutor(max_workers=self.MAX_WORKERS) as executor:
+            while True:
+                # Create work to fill the queue to the high water mark ... ?without overrunning the in-flight row limit.
+                while not self.input_finished and self.queued_work < self.QUEUE_DEPTH_HIGH_WATER:
+                    input_batch = await self.get_input_batch(input_iter)
+                    if input_batch is not None:
+                        self.__process_input_batch(input_batch, executor)
+
+                # Wait for enough completions to enable more queueing or if we're done
+                while self.queued_work > self.QUEUE_DEPTH_LOW_WATER or (self.input_finished and self.queued_work > 0):
+                    done, _ = futures.wait(self.in_flight_requests, return_when=futures.FIRST_COMPLETED)
+                    self.__process_completions(done, ignore_errors=self.ctx.ignore_errors)
+
+                # Emit results to meet batch size requirements or empty the in-flight row queue
+                if self.__has_ready_batch() or (
+                    len(self.ready_rows) > 0 and self.input_finished and self.queued_work == 0
+                ):
                     # create DataRowBatch from the first BATCH_SIZE ready rows
-                    batch = DataRowBatch(self.batch_tbl_version, self.row_builder)
+                    batch = DataRowBatch(self.row_builder)
                     rows = [self.ready_rows.popleft() for _ in range(min(self.BATCH_SIZE, len(self.ready_rows)))]
                     for row in rows:
                         assert row is not None
@@ -104,23 +118,16 @@ class CachePrefetchNode(ExecNode):
                     _logger.debug(f'returning {len(rows)} rows')
                     yield batch
 
-                if self.input_finished and self.__num_pending_rows() == 0:
+                if self.input_finished and self.queued_work == 0 and len(self.ready_rows) == 0:
                     return
 
-    def __num_pending_rows(self) -> int:
-        return len(self.in_flight_rows) + len(self.ready_rows)
-
     def __has_ready_batch(self) -> bool:
         """True if there are >= BATCH_SIZES entries in ready_rows and the first BATCH_SIZE ones are all non-None"""
         return (
             sum(int(row is not None) for row in itertools.islice(self.ready_rows, self.BATCH_SIZE)) == self.BATCH_SIZE
         )
 
-    def __ready_prefix_len(self) -> int:
-        """Length of the non-None prefix of ready_rows (= what we can return right now)"""
-        return sum(1 for _ in itertools.takewhile(lambda x: x is not None, self.ready_rows))
-
-    def __add_ready_row(self, row: exprs.DataRow, row_idx: Optional[int]) -> None:
+    def __add_ready_row(self, row: exprs.DataRow, row_idx: int | None) -> None:
         if row_idx is None:
             self.ready_rows.append(row)
         else:
@@ -130,52 +137,36 @@ class CachePrefetchNode(ExecNode):
                 self.ready_rows.extend([None] * (idx - len(self.ready_rows) + 1))
             self.ready_rows[idx] = row
 
-    def __wait_for_requests(self) -> None:
-        """Wait for in-flight requests to complete until we have a full batch of rows"""
+    def __process_completions(self, done: set[futures.Future], ignore_errors: bool) -> None:
         file_cache = FileCache.get()
-        _logger.debug(f'waiting for requests; ready_batch_size={self.__ready_prefix_len()}')
-        while not self.__has_ready_batch() and len(self.in_flight_requests) > 0:
-            done, _ = futures.wait(self.in_flight_requests, return_when=futures.FIRST_COMPLETED)
-            for f in done:
-                url = self.in_flight_requests.pop(f)
-                tmp_path, exc = f.result()
-                local_path: Optional[Path] = None
-                if tmp_path is not None:
-                    # register the file with the cache for the first column in which it's missing
-                    assert url in self.in_flight_urls
-                    _, info = self.in_flight_urls[url][0]
-                    local_path = file_cache.add(info.col.tbl.id, info.col.id, url, tmp_path)
-                    _logger.debug(f'cached {url} as {local_path}')
-
-                # add the local path/exception to the slots that reference the url
-                for row, info in self.in_flight_urls.pop(url):
-                    if exc is not None:
-                        self.row_builder.set_exc(row, info.slot_idx, exc)
-                    else:
-                        assert local_path is not None
-                        row.set_file_path(info.slot_idx, str(local_path))
-                    state = self.in_flight_rows[id(row)]
-                    state.num_missing -= 1
-                    if state.num_missing == 0:
-                        del self.in_flight_rows[id(row)]
-                        self.__add_ready_row(row, state.idx)
-                        _logger.debug(f'row {state.idx} is ready (ready_batch_size={self.__ready_prefix_len()})')
-
-    async def __submit_input_batch(
-        self, input: AsyncIterator[DataRowBatch], executor: futures.ThreadPoolExecutor
-    ) -> None:
-        assert not self.input_finished
-        input_batch: Optional[DataRowBatch]
-        try:
-            input_batch = await anext(input)
-        except StopAsyncIteration:
-            input_batch = None
-        if input_batch is None:
-            self.input_finished = True
-            return
-        if self.batch_tbl_version is None:
-            self.batch_tbl_version = input_batch.tbl
-
+        for f in done:
+            url = self.in_flight_requests.pop(f)
+            tmp_path, exc = f.result()
+            if exc is not None and not ignore_errors:
+                raise exc
+            local_path: Path | None = None
+            if tmp_path is not None:
+                # register the file with the cache for the first column in which it's missing
+                assert url in self.in_flight_urls
+                _, info = self.in_flight_urls[url][0]
+                local_path = file_cache.add(info.col.get_tbl().id, info.col.id, url, tmp_path)
+                _logger.debug(f'cached {url} as {local_path}')
+
+            # add the local path/exception to the slots that reference the url
+            for row, info in self.in_flight_urls.pop(url):
+                if exc is not None:
+                    self.row_builder.set_exc(row, info.slot_idx, exc)
+                else:
+                    assert local_path is not None
+                    row.set_file_path(info.slot_idx, str(local_path))
+                state = self.in_flight_rows[id(row)]
+                state.num_missing -= 1
+                if state.num_missing == 0:
+                    del self.in_flight_rows[id(row)]
+                    self.__add_ready_row(row, state.idx)
+
+    def __process_input_batch(self, input_batch: DataRowBatch, executor: futures.ThreadPoolExecutor) -> None:
+        """Process a batch of input rows, submitting URLs for download and adding ready rows to ready_rows"""
         file_cache = FileCache.get()
 
         # URLs from this input batch that aren't already in the file cache;
@@ -183,7 +174,7 @@ class CachePrefetchNode(ExecNode):
         # the time it takes to get the next batch together
         cache_misses: list[str] = []
 
-        url_pos: dict[str, int] = {}  # url -> row_idx; used for logging
+        url_pos: dict[str, int | None] = {}  # url -> row_idx; used for logging
         for row in input_batch:
             # identify missing local files in input batch, or fill in their paths if they're already cached
             num_missing = 0
@@ -222,8 +213,10 @@ class CachePrefetchNode(ExecNode):
             _logger.debug(f'submitted {url} for idx {url_pos[url]}')
             self.in_flight_requests[f] = url
 
-    def __fetch_url(self, url: str) -> tuple[Optional[Path], Optional[Exception]]:
-        """Fetches a remote URL into Env.tmp_dir and returns its path"""
+    def __fetch_url(self, url: str) -> tuple[Path | None, Exception | None]:
+        """Fetches a remote URL into the TempStore and returns its path"""
+        from pixeltable.utils.local_store import TempStore
+
         _logger.debug(f'fetching url={url} thread_name={threading.current_thread().name}')
         parsed = urllib.parse.urlparse(url)
         # Use len(parsed.scheme) > 1 here to ensure we're not being passed
@@ -234,34 +227,14 @@ class CachePrefetchNode(ExecNode):
         if parsed.path:
             p = Path(urllib.parse.unquote(urllib.request.url2pathname(parsed.path)))
             extension = p.suffix
-        tmp_path = env.Env.get().create_tmp_path(extension=extension)
+        tmp_path = TempStore.create_path(extension=extension)
         try:
             _logger.debug(f'Downloading {url} to {tmp_path}')
-            if parsed.scheme == 's3':
-                from pixeltable.utils.s3 import get_client
-
-                with self.boto_client_lock:
-                    if self.boto_client is None:
-                        config = {
-                            'max_pool_connections': self.NUM_EXECUTOR_THREADS + 4,  # +4: leave some headroom
-                            'connect_timeout': 5,
-                            'read_timeout': 30,
-                            'retries': {'max_attempts': 3, 'mode': 'adaptive'},
-                        }
-                        self.boto_client = get_client(**config)
-                self.boto_client.download_file(parsed.netloc, parsed.path.lstrip('/'), str(tmp_path))
-            elif parsed.scheme in ('http', 'https'):
-                with urllib.request.urlopen(url) as resp, open(tmp_path, 'wb') as f:
-                    data = resp.read()
-                    f.write(data)
-            else:
-                raise AssertionError(f'Unsupported URL scheme: {parsed.scheme}')
+            ObjectOps.copy_object_to_local_file(url, tmp_path)
             _logger.debug(f'Downloaded {url} to {tmp_path}')
             return tmp_path, None
        except Exception as e:
            # we want to add the file url to the exception message
            exc = excs.Error(f'Failed to download {url}: {e}')
            _logger.debug(f'Failed to download {url}: {e}', exc_info=e)
-            if not self.ctx.ignore_errors:
-                raise exc from None  # suppress original exception
            return None, exc
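The rewritten __aiter__ above replaces the per-batch submit/wait logic with a high/low-watermark loop: keep submitting downloads until QUEUE_DEPTH_HIGH_WATER requests are in flight, then drain completions with futures.wait() until the backlog drops below QUEUE_DEPTH_LOW_WATER (or the input is exhausted). A stripped-down sketch of that loop, with an illustrative fetch() and none of the per-row bookkeeping pixeltable's node does:

# high/low-watermark download loop (illustrative, not pixeltable code)
from concurrent import futures
import urllib.request

HIGH_WATER, LOW_WATER = 50, 20

def fetch(url: str) -> bytes:
    with urllib.request.urlopen(url) as resp:
        return resp.read()

def prefetch(urls: list[str]) -> dict[str, bytes]:
    results: dict[str, bytes] = {}
    pending: dict[futures.Future, str] = {}  # future -> URL
    it = iter(urls)
    exhausted = False
    with futures.ThreadPoolExecutor(max_workers=15) as executor:
        while not exhausted or pending:
            # fill the queue up to the high-water mark
            while not exhausted and len(pending) < HIGH_WATER:
                url = next(it, None)
                if url is None:
                    exhausted = True
                else:
                    pending[executor.submit(fetch, url)] = url
            # drain completions until we drop below the low-water mark (or finish)
            while len(pending) > LOW_WATER or (exhausted and pending):
                done, _ = futures.wait(pending, return_when=futures.FIRST_COMPLETED)
                for f in done:
                    results[pending.pop(f)] = f.result()
    return results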
pixeltable/exec/cell_materialization_node.py ADDED
@@ -0,0 +1,268 @@
+from __future__ import annotations
+
+import io
+import logging
+import os
+from pathlib import Path
+from typing import Any, AsyncIterator
+
+import numpy as np
+import pgvector.sqlalchemy  # type: ignore[import-untyped]
+import PIL.Image
+import sqlalchemy as sql
+
+import pixeltable.type_system as ts
+import pixeltable.utils.image as image_utils
+from pixeltable import catalog, exprs
+from pixeltable.env import Env
+from pixeltable.utils.local_store import LocalStore
+
+from .data_row_batch import DataRowBatch
+from .exec_node import ExecNode
+from .globals import INLINED_OBJECT_MD_KEY, InlinedObjectMd
+
+_logger = logging.getLogger('pixeltable')
+
+
+class CellMaterializationNode(ExecNode):
+    """
+    Node to populate DataRow.cell_vals/cell_md.
+
+    For now, the scope is limited to populating DataRow.cells_vals for json and array columns.
+
+    Array values:
+    - Arrays < MAX_DB_ARRAY_SIZE are stored inline in the db column
+    - Larger arrays are written to inlined_obj_files
+    - Bool arrays are stored as packed bits (uint8)
+    - cell_md: holds the url of the file, plus start and end offsets, plus bool flag and shape for bool arrays
+      (this allows us to query cell_md to get the total external storage size of an array column)
+
+    Json values:
+    - Inlined images and arrays are written to inlined_obj_files and replaced with a dict containing the object
+      location
+    - Bool arrays are also stored as packed bits; the dict also contains the shape and bool flag
+    - cell_md contains the list of urls for the inlined objects.
+
+    TODO:
+    - execute file IO via asyncio Tasks in a thread pool?
+      (we already seem to be getting 90% of hardware IO throughput)
+    - subsume all cell materialization
+    """
+
+    output_col_info: dict[catalog.Column, int]  # value: slot idx
+
+    # execution state
+    inlined_obj_files: list[Path]  # only [-1] is open for writing
+    buffered_writer: io.BufferedWriter | None  # BufferedWriter for inlined_obj_files[-1]
+
+    MIN_FILE_SIZE = 8 * 2**20  # 8MB
+    MAX_DB_BINARY_SIZE = 512  # max size of binary data stored in table column; in bytes
+
+    def __init__(self, input: ExecNode):
+        super().__init__(input.row_builder, [], [], input)
+        self.output_col_info = {
+            col: slot_idx
+            for col, slot_idx in input.row_builder.table_columns.items()
+            if slot_idx is not None and col.col_type.supports_file_offloading()
+        }
+        self.inlined_obj_files = []
+        self.buffered_writer = None
+
+    async def __aiter__(self) -> AsyncIterator[DataRowBatch]:
+        async for batch in self.input:
+            for row in batch:
+                for col, slot_idx in self.output_col_info.items():
+                    if row.has_exc(slot_idx):
+                        # Nulls in JSONB columns need to be stored as sql.sql.null(), otherwise it stores a json 'null'
+                        row.cell_vals[col.id] = sql.sql.null() if col.col_type.is_json_type() else None
+                        exc = row.get_exc(slot_idx)
+                        row.cell_md[col.id] = exprs.CellMd(errortype=type(exc).__name__, errormsg=str(exc))
+                        continue
+
+                    val = row[slot_idx]
+                    if val is None:
+                        row.cell_vals[col.id] = sql.sql.null() if col.col_type.is_json_type() else None
+                        row.cell_md[col.id] = None
+                        continue
+
+                    if col.col_type.is_json_type():
+                        self._materialize_json_cell(row, col, val)
+                    elif col.col_type.is_array_type():
+                        assert isinstance(val, np.ndarray)
+                        self._materialize_array_cell(row, col, val)
+                    else:
+                        assert col.col_type.is_binary_type()
+                        assert isinstance(val, bytes)
+                        self._materialize_binary_cell(row, col, val)
+
+            # continue with only the currently open file
+            self.inlined_obj_files = self.inlined_obj_files[-1:]
+
+            yield batch
+
+        self._flush_buffer(finalize=True)
+
+    def init_writer(self) -> None:
+        if self.buffered_writer is None:
+            self._reset_buffer()
+        assert self.buffered_writer is not None
+
+    def close(self) -> None:
+        if self.buffered_writer is not None:
+            # there must have been an error, otherwise _flush_full_buffer(finalize=True) would have set this to None
+            self.buffered_writer.close()
+            self.buffered_writer = None
+
+    def _materialize_json_cell(self, row: exprs.DataRow, col: catalog.Column, val: Any) -> None:
+        if self._json_has_inlined_objs(val):
+            row.cell_vals[col.id] = self._rewrite_json(val)
+            row.cell_md[col.id] = exprs.CellMd(file_urls=[local_path.as_uri() for local_path in self.inlined_obj_files])
+        else:
+            row.cell_vals[col.id] = val
+            row.cell_md[col.id] = None
+
+    def _materialize_array_cell(self, row: exprs.DataRow, col: catalog.Column, val: np.ndarray) -> None:
+        if isinstance(col.sa_col_type, pgvector.sqlalchemy.Vector):
+            # this is a vector column (ie, used for a vector index): store the array itself
+            row.cell_vals[col.id] = val
+            row.cell_md[col.id] = None
+        elif val.nbytes <= self.MAX_DB_BINARY_SIZE:
+            # this array is small enough to store in the db column (type: binary) directly
+            buffer = io.BytesIO()
+            np.save(buffer, val, allow_pickle=False)
+            row.cell_vals[col.id] = buffer.getvalue()
+            row.cell_md[col.id] = None
+        else:
+            # append this array to the buffer and store its location in the cell md
+            ar: np.ndarray
+            if np.issubdtype(val.dtype, np.bool_):
+                # for bool arrays, store as packed bits, otherwise it's 1 byte per element
+                ar = np.packbits(val)
+            else:
+                ar = val
+            self.init_writer()
+            start = self.buffered_writer.tell()
+            np.save(self.buffered_writer, ar, allow_pickle=False)
+            end = self.buffered_writer.tell()
+            row.cell_vals[col.id] = None
+            cell_md = exprs.CellMd(
+                file_urls=[self.inlined_obj_files[-1].as_uri()], array_md=exprs.ArrayMd(start=start, end=end)
+            )
+            if np.issubdtype(val.dtype, np.bool_):
+                cell_md.array_md.is_bool = True
+                cell_md.array_md.shape = val.shape
+            row.cell_md[col.id] = cell_md
+            self._flush_buffer()
+
+        assert row.cell_vals[col.id] is not None or row.cell_md[col.id] is not None
+
+    def _materialize_binary_cell(self, row: exprs.DataRow, col: catalog.Column, val: bytes) -> None:
+        if len(val) <= self.MAX_DB_BINARY_SIZE:
+            # this `bytes` object is small enough to store in the db column (type: binary) directly
+            row.cell_vals[col.id] = val
+            row.cell_md[col.id] = None
+        else:
+            self.init_writer()
+            start = self.buffered_writer.tell()
+            self.buffered_writer.write(val)
+            end = self.buffered_writer.tell()
+            row.cell_vals[col.id] = None
+            cell_md = exprs.CellMd(
+                file_urls=[self.inlined_obj_files[-1].as_uri()], binary_md=exprs.BinaryMd(start=start, end=end)
+            )
+            row.cell_md[col.id] = cell_md
+            self._flush_buffer()
+
+        assert row.cell_vals[col.id] is not None or row.cell_md[col.id] is not None
+
+    def _json_has_inlined_objs(self, element: Any) -> bool:
+        if isinstance(element, list):
+            return any(self._json_has_inlined_objs(v) for v in element)
+        if isinstance(element, dict):
+            return any(self._json_has_inlined_objs(v) for v in element.values())
+        return isinstance(element, (np.ndarray, PIL.Image.Image, bytes))
+
+    def _rewrite_json(self, element: Any) -> Any:
+        """Recursively rewrites a JSON structure by writing any inlined arrays or images to self.buffered_writer."""
+        if isinstance(element, list):
+            return [self._rewrite_json(v) for v in element]
+        if isinstance(element, dict):
+            return {k: self._rewrite_json(v) for k, v in element.items()}
+        if isinstance(element, np.ndarray):
+            obj_md = self._write_inlined_array(element)
+            return {INLINED_OBJECT_MD_KEY: obj_md.as_dict()}
+        if isinstance(element, PIL.Image.Image):
+            obj_md = self._write_inlined_image(element)
+            return {INLINED_OBJECT_MD_KEY: obj_md.as_dict()}
+        if isinstance(element, bytes):
+            obj_md = self._write_inlined_bytes(element)
+            return {INLINED_OBJECT_MD_KEY: obj_md.as_dict()}
+        return element
+
+    def _write_inlined_array(self, ar: np.ndarray) -> InlinedObjectMd:
+        """Write an ndarray to buffered_writer and return its metadata."""
+        self.init_writer()
+        url_idx = len(self.inlined_obj_files) - 1
+        start = self.buffered_writer.tell()
+        shape: tuple[int, ...] | None
+        is_bool_array: bool
+        if np.issubdtype(ar.dtype, np.bool_):
+            shape = ar.shape
+            ar = np.packbits(ar)
+            is_bool_array = True
+        else:
+            shape = None
+            is_bool_array = False
+        np.save(self.buffered_writer, ar, allow_pickle=False)
+        end = self.buffered_writer.tell()
+        self._flush_buffer()
+        return InlinedObjectMd(
+            type=ts.ColumnType.Type.ARRAY.name,
+            url_idx=url_idx,
+            array_md=exprs.ArrayMd(start=start, end=end, is_bool=is_bool_array, shape=shape),
+        )
+
+    def _write_inlined_image(self, img: PIL.Image.Image) -> InlinedObjectMd:
+        """Write a PIL image to buffered_writer and return: index into inlined_obj_files, start offset, end offset"""
+        self.init_writer()
+        url_idx = len(self.inlined_obj_files) - 1
+        start = self.buffered_writer.tell()
+        img.save(self.buffered_writer, format=image_utils.default_format(img))
+        end = self.buffered_writer.tell()
+        self._flush_buffer()
+        return InlinedObjectMd(type=ts.ColumnType.Type.IMAGE.name, url_idx=url_idx, img_start=start, img_end=end)
+
+    def _write_inlined_bytes(self, data: bytes) -> InlinedObjectMd:
+        """Write raw bytes to buffered_writer and return: index into inlined_obj_files, start offset, end offset"""
+        self.init_writer()
+        url_idx = len(self.inlined_obj_files) - 1
+        start = self.buffered_writer.tell()
+        self.buffered_writer.write(data)
+        end = self.buffered_writer.tell()
+        self._flush_buffer()
+        return InlinedObjectMd(
+            type=ts.ColumnType.Type.BINARY.name, url_idx=url_idx, binary_md=exprs.BinaryMd(start, end)
+        )
+
+    def _reset_buffer(self) -> None:
+        local_path = LocalStore(Env.get().media_dir)._prepare_path_raw(
+            self.row_builder.tbl.id, 0, self.row_builder.tbl.version
+        )
+        self.inlined_obj_files.append(local_path)
+        fh = open(local_path, 'wb', buffering=self.MIN_FILE_SIZE * 2)  # noqa: SIM115
+        assert isinstance(fh, io.BufferedWriter)
+        self.buffered_writer = fh
+
+    def _flush_buffer(self, finalize: bool = False) -> None:
+        """Flush buffered_writer to storage if it exceeds its minimum size or finalize is True."""
+        if self.buffered_writer is None:
+            return
+        if self.buffered_writer.tell() < self.MIN_FILE_SIZE and not finalize:
+            return
+        self.buffered_writer.flush()
+        os.fsync(self.buffered_writer.fileno())  # needed to force bytes cached by OS to storage
+        self.buffered_writer.close()
+        if finalize:
+            self.buffered_writer = None
+        else:
+            self._reset_buffer()
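The docstring above describes how bool arrays are bit-packed before being appended to an inlined-object file, with the original shape and start/end offsets recorded in cell_md so the array can be reconstructed on read. A small sketch of that round trip using numpy's packbits/unpackbits (illustrative helpers, not pixeltable APIs):

# bool-array bit-packing round trip, analogous to the offsets kept in cell_md
import io
import numpy as np

def write_bool_array(buf: io.BufferedIOBase, arr: np.ndarray) -> tuple[int, int, tuple[int, ...]]:
    """Append a bit-packed bool array; return (start offset, end offset, original shape)."""
    start = buf.tell()
    np.save(buf, np.packbits(arr), allow_pickle=False)
    return start, buf.tell(), arr.shape

def read_bool_array(buf: io.BufferedIOBase, start: int, shape: tuple[int, ...]) -> np.ndarray:
    """Reconstruct the bool array from its start offset and recorded shape."""
    buf.seek(start)
    packed = np.load(buf, allow_pickle=False)
    n = int(np.prod(shape))  # trim the padding bits packbits added to fill the last byte
    return np.unpackbits(packed, count=n).astype(bool).reshape(shape)

buf = io.BytesIO()
a = np.random.rand(5, 7) > 0.5
start, end, shape = write_bool_array(buf, a)
assert np.array_equal(read_bool_array(buf, start, shape), a)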