pixeltable 0.2.26__py3-none-any.whl → 0.5.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (245) hide show
  1. pixeltable/__init__.py +83 -19
  2. pixeltable/_query.py +1444 -0
  3. pixeltable/_version.py +1 -0
  4. pixeltable/catalog/__init__.py +7 -4
  5. pixeltable/catalog/catalog.py +2394 -119
  6. pixeltable/catalog/column.py +225 -104
  7. pixeltable/catalog/dir.py +38 -9
  8. pixeltable/catalog/globals.py +53 -34
  9. pixeltable/catalog/insertable_table.py +265 -115
  10. pixeltable/catalog/path.py +80 -17
  11. pixeltable/catalog/schema_object.py +28 -43
  12. pixeltable/catalog/table.py +1270 -677
  13. pixeltable/catalog/table_metadata.py +103 -0
  14. pixeltable/catalog/table_version.py +1270 -751
  15. pixeltable/catalog/table_version_handle.py +109 -0
  16. pixeltable/catalog/table_version_path.py +137 -42
  17. pixeltable/catalog/tbl_ops.py +53 -0
  18. pixeltable/catalog/update_status.py +191 -0
  19. pixeltable/catalog/view.py +251 -134
  20. pixeltable/config.py +215 -0
  21. pixeltable/env.py +736 -285
  22. pixeltable/exceptions.py +26 -2
  23. pixeltable/exec/__init__.py +7 -2
  24. pixeltable/exec/aggregation_node.py +39 -21
  25. pixeltable/exec/cache_prefetch_node.py +87 -109
  26. pixeltable/exec/cell_materialization_node.py +268 -0
  27. pixeltable/exec/cell_reconstruction_node.py +168 -0
  28. pixeltable/exec/component_iteration_node.py +25 -28
  29. pixeltable/exec/data_row_batch.py +11 -46
  30. pixeltable/exec/exec_context.py +26 -11
  31. pixeltable/exec/exec_node.py +35 -27
  32. pixeltable/exec/expr_eval/__init__.py +3 -0
  33. pixeltable/exec/expr_eval/evaluators.py +365 -0
  34. pixeltable/exec/expr_eval/expr_eval_node.py +413 -0
  35. pixeltable/exec/expr_eval/globals.py +200 -0
  36. pixeltable/exec/expr_eval/row_buffer.py +74 -0
  37. pixeltable/exec/expr_eval/schedulers.py +413 -0
  38. pixeltable/exec/globals.py +35 -0
  39. pixeltable/exec/in_memory_data_node.py +35 -27
  40. pixeltable/exec/object_store_save_node.py +293 -0
  41. pixeltable/exec/row_update_node.py +44 -29
  42. pixeltable/exec/sql_node.py +414 -115
  43. pixeltable/exprs/__init__.py +8 -5
  44. pixeltable/exprs/arithmetic_expr.py +79 -45
  45. pixeltable/exprs/array_slice.py +5 -5
  46. pixeltable/exprs/column_property_ref.py +40 -26
  47. pixeltable/exprs/column_ref.py +254 -61
  48. pixeltable/exprs/comparison.py +14 -9
  49. pixeltable/exprs/compound_predicate.py +9 -10
  50. pixeltable/exprs/data_row.py +213 -72
  51. pixeltable/exprs/expr.py +270 -104
  52. pixeltable/exprs/expr_dict.py +6 -5
  53. pixeltable/exprs/expr_set.py +20 -11
  54. pixeltable/exprs/function_call.py +383 -284
  55. pixeltable/exprs/globals.py +18 -5
  56. pixeltable/exprs/in_predicate.py +7 -7
  57. pixeltable/exprs/inline_expr.py +37 -37
  58. pixeltable/exprs/is_null.py +8 -4
  59. pixeltable/exprs/json_mapper.py +120 -54
  60. pixeltable/exprs/json_path.py +90 -60
  61. pixeltable/exprs/literal.py +61 -16
  62. pixeltable/exprs/method_ref.py +7 -6
  63. pixeltable/exprs/object_ref.py +19 -8
  64. pixeltable/exprs/row_builder.py +238 -75
  65. pixeltable/exprs/rowid_ref.py +53 -15
  66. pixeltable/exprs/similarity_expr.py +65 -50
  67. pixeltable/exprs/sql_element_cache.py +5 -5
  68. pixeltable/exprs/string_op.py +107 -0
  69. pixeltable/exprs/type_cast.py +25 -13
  70. pixeltable/exprs/variable.py +2 -2
  71. pixeltable/func/__init__.py +9 -5
  72. pixeltable/func/aggregate_function.py +197 -92
  73. pixeltable/func/callable_function.py +119 -35
  74. pixeltable/func/expr_template_function.py +101 -48
  75. pixeltable/func/function.py +375 -62
  76. pixeltable/func/function_registry.py +20 -19
  77. pixeltable/func/globals.py +6 -5
  78. pixeltable/func/mcp.py +74 -0
  79. pixeltable/func/query_template_function.py +151 -35
  80. pixeltable/func/signature.py +178 -49
  81. pixeltable/func/tools.py +164 -0
  82. pixeltable/func/udf.py +176 -53
  83. pixeltable/functions/__init__.py +44 -4
  84. pixeltable/functions/anthropic.py +226 -47
  85. pixeltable/functions/audio.py +148 -11
  86. pixeltable/functions/bedrock.py +137 -0
  87. pixeltable/functions/date.py +188 -0
  88. pixeltable/functions/deepseek.py +113 -0
  89. pixeltable/functions/document.py +81 -0
  90. pixeltable/functions/fal.py +76 -0
  91. pixeltable/functions/fireworks.py +72 -20
  92. pixeltable/functions/gemini.py +249 -0
  93. pixeltable/functions/globals.py +208 -53
  94. pixeltable/functions/groq.py +108 -0
  95. pixeltable/functions/huggingface.py +1088 -95
  96. pixeltable/functions/image.py +155 -84
  97. pixeltable/functions/json.py +8 -11
  98. pixeltable/functions/llama_cpp.py +31 -19
  99. pixeltable/functions/math.py +169 -0
  100. pixeltable/functions/mistralai.py +50 -75
  101. pixeltable/functions/net.py +70 -0
  102. pixeltable/functions/ollama.py +29 -36
  103. pixeltable/functions/openai.py +548 -160
  104. pixeltable/functions/openrouter.py +143 -0
  105. pixeltable/functions/replicate.py +15 -14
  106. pixeltable/functions/reve.py +250 -0
  107. pixeltable/functions/string.py +310 -85
  108. pixeltable/functions/timestamp.py +37 -19
  109. pixeltable/functions/together.py +77 -120
  110. pixeltable/functions/twelvelabs.py +188 -0
  111. pixeltable/functions/util.py +7 -2
  112. pixeltable/functions/uuid.py +30 -0
  113. pixeltable/functions/video.py +1528 -117
  114. pixeltable/functions/vision.py +26 -26
  115. pixeltable/functions/voyageai.py +289 -0
  116. pixeltable/functions/whisper.py +19 -10
  117. pixeltable/functions/whisperx.py +179 -0
  118. pixeltable/functions/yolox.py +112 -0
  119. pixeltable/globals.py +716 -236
  120. pixeltable/index/__init__.py +3 -1
  121. pixeltable/index/base.py +17 -21
  122. pixeltable/index/btree.py +32 -22
  123. pixeltable/index/embedding_index.py +155 -92
  124. pixeltable/io/__init__.py +12 -7
  125. pixeltable/io/datarows.py +140 -0
  126. pixeltable/io/external_store.py +83 -125
  127. pixeltable/io/fiftyone.py +24 -33
  128. pixeltable/io/globals.py +47 -182
  129. pixeltable/io/hf_datasets.py +96 -127
  130. pixeltable/io/label_studio.py +171 -156
  131. pixeltable/io/lancedb.py +3 -0
  132. pixeltable/io/pandas.py +136 -115
  133. pixeltable/io/parquet.py +40 -153
  134. pixeltable/io/table_data_conduit.py +702 -0
  135. pixeltable/io/utils.py +100 -0
  136. pixeltable/iterators/__init__.py +8 -4
  137. pixeltable/iterators/audio.py +207 -0
  138. pixeltable/iterators/base.py +9 -3
  139. pixeltable/iterators/document.py +144 -87
  140. pixeltable/iterators/image.py +17 -38
  141. pixeltable/iterators/string.py +15 -12
  142. pixeltable/iterators/video.py +523 -127
  143. pixeltable/metadata/__init__.py +33 -8
  144. pixeltable/metadata/converters/convert_10.py +2 -3
  145. pixeltable/metadata/converters/convert_13.py +2 -2
  146. pixeltable/metadata/converters/convert_15.py +15 -11
  147. pixeltable/metadata/converters/convert_16.py +4 -5
  148. pixeltable/metadata/converters/convert_17.py +4 -5
  149. pixeltable/metadata/converters/convert_18.py +4 -6
  150. pixeltable/metadata/converters/convert_19.py +6 -9
  151. pixeltable/metadata/converters/convert_20.py +3 -6
  152. pixeltable/metadata/converters/convert_21.py +6 -8
  153. pixeltable/metadata/converters/convert_22.py +3 -2
  154. pixeltable/metadata/converters/convert_23.py +33 -0
  155. pixeltable/metadata/converters/convert_24.py +55 -0
  156. pixeltable/metadata/converters/convert_25.py +19 -0
  157. pixeltable/metadata/converters/convert_26.py +23 -0
  158. pixeltable/metadata/converters/convert_27.py +29 -0
  159. pixeltable/metadata/converters/convert_28.py +13 -0
  160. pixeltable/metadata/converters/convert_29.py +110 -0
  161. pixeltable/metadata/converters/convert_30.py +63 -0
  162. pixeltable/metadata/converters/convert_31.py +11 -0
  163. pixeltable/metadata/converters/convert_32.py +15 -0
  164. pixeltable/metadata/converters/convert_33.py +17 -0
  165. pixeltable/metadata/converters/convert_34.py +21 -0
  166. pixeltable/metadata/converters/convert_35.py +9 -0
  167. pixeltable/metadata/converters/convert_36.py +38 -0
  168. pixeltable/metadata/converters/convert_37.py +15 -0
  169. pixeltable/metadata/converters/convert_38.py +39 -0
  170. pixeltable/metadata/converters/convert_39.py +124 -0
  171. pixeltable/metadata/converters/convert_40.py +73 -0
  172. pixeltable/metadata/converters/convert_41.py +12 -0
  173. pixeltable/metadata/converters/convert_42.py +9 -0
  174. pixeltable/metadata/converters/convert_43.py +44 -0
  175. pixeltable/metadata/converters/util.py +44 -18
  176. pixeltable/metadata/notes.py +21 -0
  177. pixeltable/metadata/schema.py +185 -42
  178. pixeltable/metadata/utils.py +74 -0
  179. pixeltable/mypy/__init__.py +3 -0
  180. pixeltable/mypy/mypy_plugin.py +123 -0
  181. pixeltable/plan.py +616 -225
  182. pixeltable/share/__init__.py +3 -0
  183. pixeltable/share/packager.py +797 -0
  184. pixeltable/share/protocol/__init__.py +33 -0
  185. pixeltable/share/protocol/common.py +165 -0
  186. pixeltable/share/protocol/operation_types.py +33 -0
  187. pixeltable/share/protocol/replica.py +119 -0
  188. pixeltable/share/publish.py +349 -0
  189. pixeltable/store.py +398 -232
  190. pixeltable/type_system.py +730 -267
  191. pixeltable/utils/__init__.py +40 -0
  192. pixeltable/utils/arrow.py +201 -29
  193. pixeltable/utils/av.py +298 -0
  194. pixeltable/utils/azure_store.py +346 -0
  195. pixeltable/utils/coco.py +26 -27
  196. pixeltable/utils/code.py +4 -4
  197. pixeltable/utils/console_output.py +46 -0
  198. pixeltable/utils/coroutine.py +24 -0
  199. pixeltable/utils/dbms.py +92 -0
  200. pixeltable/utils/description_helper.py +11 -12
  201. pixeltable/utils/documents.py +60 -61
  202. pixeltable/utils/exception_handler.py +36 -0
  203. pixeltable/utils/filecache.py +38 -22
  204. pixeltable/utils/formatter.py +88 -51
  205. pixeltable/utils/gcs_store.py +295 -0
  206. pixeltable/utils/http.py +133 -0
  207. pixeltable/utils/http_server.py +14 -13
  208. pixeltable/utils/iceberg.py +13 -0
  209. pixeltable/utils/image.py +17 -0
  210. pixeltable/utils/lancedb.py +90 -0
  211. pixeltable/utils/local_store.py +322 -0
  212. pixeltable/utils/misc.py +5 -0
  213. pixeltable/utils/object_stores.py +573 -0
  214. pixeltable/utils/pydantic.py +60 -0
  215. pixeltable/utils/pytorch.py +20 -20
  216. pixeltable/utils/s3_store.py +527 -0
  217. pixeltable/utils/sql.py +32 -5
  218. pixeltable/utils/system.py +30 -0
  219. pixeltable/utils/transactional_directory.py +4 -3
  220. pixeltable-0.5.7.dist-info/METADATA +579 -0
  221. pixeltable-0.5.7.dist-info/RECORD +227 -0
  222. {pixeltable-0.2.26.dist-info → pixeltable-0.5.7.dist-info}/WHEEL +1 -1
  223. pixeltable-0.5.7.dist-info/entry_points.txt +2 -0
  224. pixeltable/__version__.py +0 -3
  225. pixeltable/catalog/named_function.py +0 -36
  226. pixeltable/catalog/path_dict.py +0 -141
  227. pixeltable/dataframe.py +0 -894
  228. pixeltable/exec/expr_eval_node.py +0 -232
  229. pixeltable/ext/__init__.py +0 -14
  230. pixeltable/ext/functions/__init__.py +0 -8
  231. pixeltable/ext/functions/whisperx.py +0 -77
  232. pixeltable/ext/functions/yolox.py +0 -157
  233. pixeltable/tool/create_test_db_dump.py +0 -311
  234. pixeltable/tool/create_test_video.py +0 -81
  235. pixeltable/tool/doc_plugins/griffe.py +0 -50
  236. pixeltable/tool/doc_plugins/mkdocstrings.py +0 -6
  237. pixeltable/tool/doc_plugins/templates/material/udf.html.jinja +0 -135
  238. pixeltable/tool/embed_udf.py +0 -9
  239. pixeltable/tool/mypy_plugin.py +0 -55
  240. pixeltable/utils/media_store.py +0 -76
  241. pixeltable/utils/s3.py +0 -16
  242. pixeltable-0.2.26.dist-info/METADATA +0 -400
  243. pixeltable-0.2.26.dist-info/RECORD +0 -156
  244. pixeltable-0.2.26.dist-info/entry_points.txt +0 -3
  245. {pixeltable-0.2.26.dist-info → pixeltable-0.5.7.dist-info/licenses}/LICENSE +0 -0
pixeltable/exceptions.py CHANGED
@@ -1,4 +1,3 @@
1
- from dataclasses import dataclass
2
1
  from types import TracebackType
3
2
  from typing import TYPE_CHECKING, Any
4
3
 
@@ -10,8 +9,13 @@ class Error(Exception):
10
9
  pass
11
10
 
12
11
 
13
- @dataclass
14
12
  class ExprEvalError(Exception):
13
+ """
14
+ Used during query execution to signal expr evaluation failures.
15
+
16
+ NOT A USER-FACING EXCEPTION. All ExprEvalError instances need to be converted into Error instances.
17
+ """
18
+
15
19
  expr: 'exprs.Expr'
16
20
  expr_msg: str
17
21
  exc: Exception
@@ -19,6 +23,26 @@ class ExprEvalError(Exception):
19
23
  input_vals: list[Any]
20
24
  row_num: int
21
25
 
26
+ def __init__(
27
+ self,
28
+ expr: 'exprs.Expr',
29
+ expr_msg: str,
30
+ exc: Exception,
31
+ exc_tb: TracebackType,
32
+ input_vals: list[Any],
33
+ row_num: int,
34
+ ) -> None:
35
+ exct = type(exc)
36
+ super().__init__(
37
+ f'Expression evaluation failed with an error of type `{exct.__module__}.{exct.__qualname__}`:\n{expr}'
38
+ )
39
+ self.expr = expr
40
+ self.expr_msg = expr_msg
41
+ self.exc = exc
42
+ self.exc_tb = exc_tb
43
+ self.input_vals = input_vals
44
+ self.row_num = row_num
45
+
22
46
 
23
47
  class PixeltableWarning(Warning):
24
48
  pass
@@ -1,10 +1,15 @@
1
+ # ruff: noqa: F401
2
+
1
3
  from .aggregation_node import AggregationNode
2
4
  from .cache_prefetch_node import CachePrefetchNode
5
+ from .cell_materialization_node import CellMaterializationNode
6
+ from .cell_reconstruction_node import CellReconstructionNode
3
7
  from .component_iteration_node import ComponentIterationNode
4
8
  from .data_row_batch import DataRowBatch
5
9
  from .exec_context import ExecContext
6
10
  from .exec_node import ExecNode
7
- from .expr_eval_node import ExprEvalNode
11
+ from .expr_eval import ExprEvalNode
8
12
  from .in_memory_data_node import InMemoryDataNode
13
+ from .object_store_save_node import ObjectStoreSaveNode
9
14
  from .row_update_node import RowUpdateNode
10
- from .sql_node import SqlLookupNode, SqlScanNode, SqlAggregationNode, SqlNode, SqlJoinNode
15
+ from .sql_node import SqlAggregationNode, SqlJoinNode, SqlLookupNode, SqlNode, SqlSampleNode, SqlScanNode
@@ -2,32 +2,38 @@ from __future__ import annotations
2
2
 
3
3
  import logging
4
4
  import sys
5
- from typing import Any, Iterable, Iterator, Optional, cast
5
+ from typing import Any, AsyncIterator, Iterable, cast
6
6
 
7
- import pixeltable.catalog as catalog
8
- import pixeltable.exceptions as excs
9
- import pixeltable.exprs as exprs
7
+ from pixeltable import catalog, exceptions as excs, exprs
10
8
 
11
9
  from .data_row_batch import DataRowBatch
12
10
  from .exec_node import ExecNode
13
11
 
14
12
  _logger = logging.getLogger('pixeltable')
15
13
 
14
+
16
15
  class AggregationNode(ExecNode):
17
16
  """
18
17
  In-memory aggregation for UDAs.
19
18
 
20
19
  At the moment, this returns all results in a single DataRowBatch.
21
20
  """
22
- group_by: Optional[list[exprs.Expr]]
21
+
22
+ group_by: list[exprs.Expr] | None
23
23
  input_exprs: list[exprs.Expr]
24
24
  agg_fn_eval_ctx: exprs.RowBuilder.EvalCtx
25
25
  agg_fn_calls: list[exprs.FunctionCall]
26
26
  output_batch: DataRowBatch
27
+ limit: int | None
27
28
 
28
29
  def __init__(
29
- self, tbl: catalog.TableVersion, row_builder: exprs.RowBuilder, group_by: Optional[list[exprs.Expr]],
30
- agg_fn_calls: list[exprs.FunctionCall], input_exprs: Iterable[exprs.Expr], input: ExecNode
30
+ self,
31
+ tbl: catalog.TableVersionHandle,
32
+ row_builder: exprs.RowBuilder,
33
+ group_by: list[exprs.Expr] | None,
34
+ agg_fn_calls: list[exprs.FunctionCall],
35
+ input_exprs: Iterable[exprs.Expr],
36
+ input: ExecNode,
31
37
  ):
32
38
  output_exprs: list[exprs.Expr] = [] if group_by is None else list(group_by)
33
39
  output_exprs.extend(agg_fn_calls)
@@ -39,51 +45,63 @@ class AggregationNode(ExecNode):
39
45
  # we need to make sure to refer to the same exprs that RowBuilder.eval() will use
40
46
  self.agg_fn_calls = [cast(exprs.FunctionCall, e) for e in self.agg_fn_eval_ctx.target_exprs]
41
47
  # create output_batch here, rather than in __iter__(), so we don't need to remember tbl and row_builder
42
- self.output_batch = DataRowBatch(tbl, row_builder, 0)
48
+ self.output_batch = DataRowBatch(row_builder)
49
+ self.limit = None
50
+
51
+ def set_limit(self, limit: int) -> None:
52
+ # we can't propagate the limit to our input
53
+ self.limit = limit
43
54
 
44
55
  def _reset_agg_state(self, row_num: int) -> None:
45
56
  for fn_call in self.agg_fn_calls:
46
57
  try:
47
58
  fn_call.reset_agg()
48
- except Exception as e:
59
+ except Exception as exc:
49
60
  _, _, exc_tb = sys.exc_info()
50
61
  expr_msg = f'init() function of the aggregate {fn_call}'
51
- raise excs.ExprEvalError(fn_call, expr_msg, e, exc_tb, [], row_num)
62
+ raise excs.ExprEvalError(fn_call, expr_msg, exc, exc_tb, [], row_num) from exc
52
63
 
53
64
  def _update_agg_state(self, row: exprs.DataRow, row_num: int) -> None:
54
65
  for fn_call in self.agg_fn_calls:
55
66
  try:
56
67
  fn_call.update(row)
57
- except Exception as e:
68
+ except Exception as exc:
58
69
  _, _, exc_tb = sys.exc_info()
59
70
  expr_msg = f'update() function of the aggregate {fn_call}'
60
71
  input_vals = [row[d.slot_idx] for d in fn_call.dependencies()]
61
- raise excs.ExprEvalError(fn_call, expr_msg, e, exc_tb, input_vals, row_num)
72
+ raise excs.ExprEvalError(fn_call, expr_msg, exc, exc_tb, input_vals, row_num) from exc
62
73
 
63
- def __iter__(self) -> Iterator[DataRowBatch]:
64
- prev_row: Optional[exprs.DataRow] = None
65
- current_group: Optional[list[Any]] = None # the values of the group-by exprs
74
+ async def __aiter__(self) -> AsyncIterator[DataRowBatch]:
75
+ prev_row: exprs.DataRow | None = None
76
+ current_group: list[Any] | None = None # the values of the group-by exprs
66
77
  num_input_rows = 0
67
- for row_batch in self.input:
78
+ num_output_rows = 0
79
+ async for row_batch in self.input:
68
80
  num_input_rows += len(row_batch)
69
81
  for row in row_batch:
70
82
  group = [row[e.slot_idx] for e in self.group_by] if self.group_by is not None else None
83
+
71
84
  if current_group is None:
72
85
  current_group = group
73
86
  self._reset_agg_state(0)
87
+
74
88
  if group != current_group:
75
89
  # we're entering a new group, emit a row for the previous one
76
90
  self.row_builder.eval(prev_row, self.agg_fn_eval_ctx, profile=self.ctx.profile)
77
91
  self.output_batch.add_row(prev_row)
92
+ num_output_rows += 1
93
+ if self.limit is not None and num_output_rows == self.limit:
94
+ yield self.output_batch
95
+ return
78
96
  current_group = group
79
97
  self._reset_agg_state(0)
80
98
  self._update_agg_state(row, 0)
81
99
  prev_row = row
82
- # emit the last group
83
- self.row_builder.eval(prev_row, self.agg_fn_eval_ctx, profile=self.ctx.profile)
84
- self.output_batch.add_row(prev_row)
85
100
 
86
- self.output_batch.flush_imgs(None, self.stored_img_cols, self.flushed_img_slots)
101
+ if prev_row is not None:
102
+ # emit the last group
103
+ self.row_builder.eval(prev_row, self.agg_fn_eval_ctx, profile=self.ctx.profile)
104
+ self.output_batch.add_row(prev_row)
105
+
87
106
  _logger.debug(f'AggregateNode: consumed {num_input_rows} rows, returning {len(self.output_batch.rows)} rows')
88
107
  yield self.output_batch
89
-
@@ -9,14 +9,12 @@ import urllib.request
9
9
  from collections import deque
10
10
  from concurrent import futures
11
11
  from pathlib import Path
12
- from typing import Optional, Any, Iterator
12
+ from typing import AsyncIterator, Iterator
13
13
  from uuid import UUID
14
14
 
15
- import pixeltable.env as env
16
- import pixeltable.exceptions as excs
17
- import pixeltable.exprs as exprs
18
- from pixeltable import catalog
15
+ from pixeltable import exceptions as excs, exprs
19
16
  from pixeltable.utils.filecache import FileCache
17
+ from pixeltable.utils.object_stores import ObjectOps
20
18
 
21
19
  from .data_row_batch import DataRowBatch
22
20
  from .exec_node import ExecNode
@@ -28,49 +26,45 @@ class CachePrefetchNode(ExecNode):
28
26
  """Brings files with external URLs into the cache
29
27
 
30
28
  TODO:
31
- - adapting the number of download threads at runtime to maximize throughput
29
+ - Process a row at a time and limit the number of in-flight rows to control memory usage
30
+ - Create asyncio.Tasks to consume our input in order to increase concurrency.
32
31
  """
32
+
33
+ QUEUE_DEPTH_HIGH_WATER = 50 # target number of in-flight requests
34
+ QUEUE_DEPTH_LOW_WATER = 20 # target number of in-flight requests
33
35
  BATCH_SIZE = 16
34
- NUM_EXECUTOR_THREADS = 16
36
+ MAX_WORKERS = 15
35
37
 
36
38
  retain_input_order: bool # if True, return rows in the exact order they were received
37
39
  file_col_info: list[exprs.ColumnSlotIdx]
38
- boto_client: Optional[Any]
39
- boto_client_lock: threading.Lock
40
40
 
41
41
  # execution state
42
- batch_tbl_version: Optional[catalog.TableVersion] # needed to construct output batches
43
42
  num_returned_rows: int
44
43
 
45
44
  # ready_rows: rows that are ready to be returned, ordered by row idx;
46
45
  # the implied row idx of ready_rows[0] is num_returned_rows
47
- ready_rows: deque[Optional[exprs.DataRow]]
46
+ ready_rows: deque[exprs.DataRow | None]
48
47
 
49
48
  in_flight_rows: dict[int, CachePrefetchNode.RowState] # rows with in-flight urls; id(row) -> RowState
50
49
  in_flight_requests: dict[futures.Future, str] # in-flight requests for urls; future -> URL
51
50
  in_flight_urls: dict[str, list[tuple[exprs.DataRow, exprs.ColumnSlotIdx]]] # URL -> [(row, info)]
52
51
  input_finished: bool
53
- row_idx: Iterator[Optional[int]]
52
+ row_idx: Iterator[int | None]
54
53
 
55
54
  @dataclasses.dataclass
56
55
  class RowState:
57
56
  row: exprs.DataRow
58
- idx: Optional[int] # position in input stream; None if we don't retain input order
57
+ idx: int | None # position in input stream; None if we don't retain input order
59
58
  num_missing: int # number of missing URLs in this row
60
59
 
61
60
  def __init__(
62
- self, tbl_id: UUID, file_col_info: list[exprs.ColumnSlotIdx], input: ExecNode,
63
- retain_input_order: bool = True):
61
+ self, tbl_id: UUID, file_col_info: list[exprs.ColumnSlotIdx], input: ExecNode, retain_input_order: bool = True
62
+ ):
64
63
  # input_/output_exprs=[]: we don't have anything to evaluate
65
64
  super().__init__(input.row_builder, [], [], input)
66
65
  self.retain_input_order = retain_input_order
67
66
  self.file_col_info = file_col_info
68
67
 
69
- # clients for specific services are constructed as needed, because it's time-consuming
70
- self.boto_client = None
71
- self.boto_client_lock = threading.Lock()
72
-
73
- self.batch_tbl_version = None
74
68
  self.num_returned_rows = 0
75
69
  self.ready_rows = deque()
76
70
  self.in_flight_rows = {}
@@ -78,26 +72,44 @@ class CachePrefetchNode(ExecNode):
78
72
  self.in_flight_urls = {}
79
73
  self.input_finished = False
80
74
  self.row_idx = itertools.count() if retain_input_order else itertools.repeat(None)
75
+ assert self.QUEUE_DEPTH_HIGH_WATER > self.QUEUE_DEPTH_LOW_WATER
81
76
 
82
- def __iter__(self) -> Iterator[DataRowBatch]:
83
- input_iter = iter(self.input)
84
- with futures.ThreadPoolExecutor(max_workers=self.NUM_EXECUTOR_THREADS) as executor:
85
- # we create enough in-flight requests to fill the first batch
86
- while not self.input_finished and self.__num_pending_rows() < self.BATCH_SIZE:
87
- self.__submit_input_batch(input_iter, executor)
88
-
89
- while True:
90
- # try to assemble a full batch of output rows
91
- if not self.__has_ready_batch() and len(self.in_flight_requests) > 0:
92
- self.__wait_for_requests()
77
+ @property
78
+ def queued_work(self) -> int:
79
+ return len(self.in_flight_requests)
93
80
 
94
- # try to create enough in-flight requests to fill the next batch
95
- while not self.input_finished and self.__num_pending_rows() < self.BATCH_SIZE:
96
- self.__submit_input_batch(input_iter, executor)
81
+ async def get_input_batch(self, input_iter: AsyncIterator[DataRowBatch]) -> DataRowBatch | None:
82
+ """Get the next batch of input rows, or None if there are no more rows"""
83
+ try:
84
+ input_batch = await anext(input_iter)
85
+ if input_batch is None:
86
+ self.input_finished = True
87
+ return input_batch
88
+ except StopAsyncIteration:
89
+ self.input_finished = True
90
+ return None
97
91
 
98
- if len(self.ready_rows) > 0:
92
+ async def __aiter__(self) -> AsyncIterator[DataRowBatch]:
93
+ input_iter = aiter(self.input)
94
+ with futures.ThreadPoolExecutor(max_workers=self.MAX_WORKERS) as executor:
95
+ while True:
96
+ # Create work to fill the queue to the high water mark ... ?without overrunning the in-flight row limit.
97
+ while not self.input_finished and self.queued_work < self.QUEUE_DEPTH_HIGH_WATER:
98
+ input_batch = await self.get_input_batch(input_iter)
99
+ if input_batch is not None:
100
+ self.__process_input_batch(input_batch, executor)
101
+
102
+ # Wait for enough completions to enable more queueing or if we're done
103
+ while self.queued_work > self.QUEUE_DEPTH_LOW_WATER or (self.input_finished and self.queued_work > 0):
104
+ done, _ = futures.wait(self.in_flight_requests, return_when=futures.FIRST_COMPLETED)
105
+ self.__process_completions(done, ignore_errors=self.ctx.ignore_errors)
106
+
107
+ # Emit results to meet batch size requirements or empty the in-flight row queue
108
+ if self.__has_ready_batch() or (
109
+ len(self.ready_rows) > 0 and self.input_finished and self.queued_work == 0
110
+ ):
99
111
  # create DataRowBatch from the first BATCH_SIZE ready rows
100
- batch = DataRowBatch(self.batch_tbl_version, self.row_builder)
112
+ batch = DataRowBatch(self.row_builder)
101
113
  rows = [self.ready_rows.popleft() for _ in range(min(self.BATCH_SIZE, len(self.ready_rows)))]
102
114
  for row in rows:
103
115
  assert row is not None
@@ -106,23 +118,16 @@ class CachePrefetchNode(ExecNode):
106
118
  _logger.debug(f'returning {len(rows)} rows')
107
119
  yield batch
108
120
 
109
- if self.input_finished and self.__num_pending_rows() == 0:
121
+ if self.input_finished and self.queued_work == 0 and len(self.ready_rows) == 0:
110
122
  return
111
123
 
112
- def __num_pending_rows(self) -> int:
113
- return len(self.in_flight_rows) + len(self.ready_rows)
114
-
115
124
  def __has_ready_batch(self) -> bool:
116
125
  """True if there are >= BATCH_SIZES entries in ready_rows and the first BATCH_SIZE ones are all non-None"""
117
126
  return (
118
127
  sum(int(row is not None) for row in itertools.islice(self.ready_rows, self.BATCH_SIZE)) == self.BATCH_SIZE
119
128
  )
120
129
 
121
- def __ready_prefix_len(self) -> int:
122
- """Length of the non-None prefix of ready_rows (= what we can return right now)"""
123
- return sum(1 for _ in itertools.takewhile(lambda x: x is not None, self.ready_rows))
124
-
125
- def __add_ready_row(self, row: exprs.DataRow, row_idx: Optional[int]) -> None:
130
+ def __add_ready_row(self, row: exprs.DataRow, row_idx: int | None) -> None:
126
131
  if row_idx is None:
127
132
  self.ready_rows.append(row)
128
133
  else:
@@ -132,46 +137,36 @@ class CachePrefetchNode(ExecNode):
132
137
  self.ready_rows.extend([None] * (idx - len(self.ready_rows) + 1))
133
138
  self.ready_rows[idx] = row
134
139
 
135
- def __wait_for_requests(self) -> None:
136
- """Wait for in-flight requests to complete until we have a full batch of rows"""
140
+ def __process_completions(self, done: set[futures.Future], ignore_errors: bool) -> None:
137
141
  file_cache = FileCache.get()
138
- _logger.debug(f'waiting for requests; ready_batch_size={self.__ready_prefix_len()}')
139
- while not self.__has_ready_batch() and len(self.in_flight_requests) > 0:
140
- done, _ = futures.wait(self.in_flight_requests, return_when=futures.FIRST_COMPLETED)
141
- for f in done:
142
- url = self.in_flight_requests.pop(f)
143
- tmp_path, exc = f.result()
144
- local_path: Optional[Path] = None
145
- if tmp_path is not None:
146
- # register the file with the cache for the first column in which it's missing
147
- assert url in self.in_flight_urls
148
- _, info = self.in_flight_urls[url][0]
149
- local_path = file_cache.add(info.col.tbl.id, info.col.id, url, tmp_path)
150
- _logger.debug(f'cached {url} as {local_path}')
151
-
152
- # add the local path/exception to the slots that reference the url
153
- for row, info in self.in_flight_urls.pop(url):
154
- if exc is not None:
155
- self.row_builder.set_exc(row, info.slot_idx, exc)
156
- else:
157
- assert local_path is not None
158
- row.set_file_path(info.slot_idx, str(local_path))
159
- state = self.in_flight_rows[id(row)]
160
- state.num_missing -= 1
161
- if state.num_missing == 0:
162
- del self.in_flight_rows[id(row)]
163
- self.__add_ready_row(row, state.idx)
164
- _logger.debug(f'row {state.idx} is ready (ready_batch_size={self.__ready_prefix_len()})')
165
-
166
- def __submit_input_batch(self, input: Iterator[DataRowBatch], executor: futures.ThreadPoolExecutor) -> None:
167
- assert not self.input_finished
168
- input_batch = next(input, None)
169
- if input_batch is None:
170
- self.input_finished = True
171
- return
172
- if self.batch_tbl_version is None:
173
- self.batch_tbl_version = input_batch.tbl
174
-
142
+ for f in done:
143
+ url = self.in_flight_requests.pop(f)
144
+ tmp_path, exc = f.result()
145
+ if exc is not None and not ignore_errors:
146
+ raise exc
147
+ local_path: Path | None = None
148
+ if tmp_path is not None:
149
+ # register the file with the cache for the first column in which it's missing
150
+ assert url in self.in_flight_urls
151
+ _, info = self.in_flight_urls[url][0]
152
+ local_path = file_cache.add(info.col.get_tbl().id, info.col.id, url, tmp_path)
153
+ _logger.debug(f'cached {url} as {local_path}')
154
+
155
+ # add the local path/exception to the slots that reference the url
156
+ for row, info in self.in_flight_urls.pop(url):
157
+ if exc is not None:
158
+ self.row_builder.set_exc(row, info.slot_idx, exc)
159
+ else:
160
+ assert local_path is not None
161
+ row.set_file_path(info.slot_idx, str(local_path))
162
+ state = self.in_flight_rows[id(row)]
163
+ state.num_missing -= 1
164
+ if state.num_missing == 0:
165
+ del self.in_flight_rows[id(row)]
166
+ self.__add_ready_row(row, state.idx)
167
+
168
+ def __process_input_batch(self, input_batch: DataRowBatch, executor: futures.ThreadPoolExecutor) -> None:
169
+ """Process a batch of input rows, submitting URLs for download and adding ready rows to ready_rows"""
175
170
  file_cache = FileCache.get()
176
171
 
177
172
  # URLs from this input batch that aren't already in the file cache;
@@ -179,7 +174,7 @@ class CachePrefetchNode(ExecNode):
179
174
  # the time it takes to get the next batch together
180
175
  cache_misses: list[str] = []
181
176
 
182
- url_pos: dict[str, int] = {} # url -> row_idx; used for logging
177
+ url_pos: dict[str, int | None] = {} # url -> row_idx; used for logging
183
178
  for row in input_batch:
184
179
  # identify missing local files in input batch, or fill in their paths if they're already cached
185
180
  num_missing = 0
@@ -218,8 +213,10 @@ class CachePrefetchNode(ExecNode):
218
213
  _logger.debug(f'submitted {url} for idx {url_pos[url]}')
219
214
  self.in_flight_requests[f] = url
220
215
 
221
- def __fetch_url(self, url: str) -> tuple[Optional[Path], Optional[Exception]]:
222
- """Fetches a remote URL into Env.tmp_dir and returns its path"""
216
+ def __fetch_url(self, url: str) -> tuple[Path | None, Exception | None]:
217
+ """Fetches a remote URL into the TempStore and returns its path"""
218
+ from pixeltable.utils.local_store import TempStore
219
+
223
220
  _logger.debug(f'fetching url={url} thread_name={threading.current_thread().name}')
224
221
  parsed = urllib.parse.urlparse(url)
225
222
  # Use len(parsed.scheme) > 1 here to ensure we're not being passed
@@ -227,36 +224,17 @@ class CachePrefetchNode(ExecNode):
227
224
  assert len(parsed.scheme) > 1 and parsed.scheme != 'file'
228
225
  # preserve the file extension, if there is one
229
226
  extension = ''
230
- if parsed.path != '':
227
+ if parsed.path:
231
228
  p = Path(urllib.parse.unquote(urllib.request.url2pathname(parsed.path)))
232
229
  extension = p.suffix
233
- tmp_path = env.Env.get().create_tmp_path(extension=extension)
230
+ tmp_path = TempStore.create_path(extension=extension)
234
231
  try:
235
232
  _logger.debug(f'Downloading {url} to {tmp_path}')
236
- if parsed.scheme == 's3':
237
- from pixeltable.utils.s3 import get_client
238
- with self.boto_client_lock:
239
- if self.boto_client is None:
240
- config = {
241
- 'max_pool_connections': self.NUM_EXECUTOR_THREADS + 4, # +4: leave some headroom
242
- 'connect_timeout': 5,
243
- 'read_timeout': 30,
244
- 'retries': {'max_attempts': 3, 'mode': 'adaptive'},
245
- }
246
- self.boto_client = get_client(**config)
247
- self.boto_client.download_file(parsed.netloc, parsed.path.lstrip('/'), str(tmp_path))
248
- elif parsed.scheme == 'http' or parsed.scheme == 'https':
249
- with urllib.request.urlopen(url) as resp, open(tmp_path, 'wb') as f:
250
- data = resp.read()
251
- f.write(data)
252
- else:
253
- assert False, f'Unsupported URL scheme: {parsed.scheme}'
233
+ ObjectOps.copy_object_to_local_file(url, tmp_path)
254
234
  _logger.debug(f'Downloaded {url} to {tmp_path}')
255
235
  return tmp_path, None
256
236
  except Exception as e:
257
237
  # we want to add the file url to the exception message
258
238
  exc = excs.Error(f'Failed to download {url}: {e}')
259
239
  _logger.debug(f'Failed to download {url}: {e}', exc_info=e)
260
- if not self.ctx.ignore_errors:
261
- raise exc from None # suppress original exception
262
240
  return None, exc