pixeltable 0.2.26__py3-none-any.whl → 0.5.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pixeltable/__init__.py +83 -19
- pixeltable/_query.py +1444 -0
- pixeltable/_version.py +1 -0
- pixeltable/catalog/__init__.py +7 -4
- pixeltable/catalog/catalog.py +2394 -119
- pixeltable/catalog/column.py +225 -104
- pixeltable/catalog/dir.py +38 -9
- pixeltable/catalog/globals.py +53 -34
- pixeltable/catalog/insertable_table.py +265 -115
- pixeltable/catalog/path.py +80 -17
- pixeltable/catalog/schema_object.py +28 -43
- pixeltable/catalog/table.py +1270 -677
- pixeltable/catalog/table_metadata.py +103 -0
- pixeltable/catalog/table_version.py +1270 -751
- pixeltable/catalog/table_version_handle.py +109 -0
- pixeltable/catalog/table_version_path.py +137 -42
- pixeltable/catalog/tbl_ops.py +53 -0
- pixeltable/catalog/update_status.py +191 -0
- pixeltable/catalog/view.py +251 -134
- pixeltable/config.py +215 -0
- pixeltable/env.py +736 -285
- pixeltable/exceptions.py +26 -2
- pixeltable/exec/__init__.py +7 -2
- pixeltable/exec/aggregation_node.py +39 -21
- pixeltable/exec/cache_prefetch_node.py +87 -109
- pixeltable/exec/cell_materialization_node.py +268 -0
- pixeltable/exec/cell_reconstruction_node.py +168 -0
- pixeltable/exec/component_iteration_node.py +25 -28
- pixeltable/exec/data_row_batch.py +11 -46
- pixeltable/exec/exec_context.py +26 -11
- pixeltable/exec/exec_node.py +35 -27
- pixeltable/exec/expr_eval/__init__.py +3 -0
- pixeltable/exec/expr_eval/evaluators.py +365 -0
- pixeltable/exec/expr_eval/expr_eval_node.py +413 -0
- pixeltable/exec/expr_eval/globals.py +200 -0
- pixeltable/exec/expr_eval/row_buffer.py +74 -0
- pixeltable/exec/expr_eval/schedulers.py +413 -0
- pixeltable/exec/globals.py +35 -0
- pixeltable/exec/in_memory_data_node.py +35 -27
- pixeltable/exec/object_store_save_node.py +293 -0
- pixeltable/exec/row_update_node.py +44 -29
- pixeltable/exec/sql_node.py +414 -115
- pixeltable/exprs/__init__.py +8 -5
- pixeltable/exprs/arithmetic_expr.py +79 -45
- pixeltable/exprs/array_slice.py +5 -5
- pixeltable/exprs/column_property_ref.py +40 -26
- pixeltable/exprs/column_ref.py +254 -61
- pixeltable/exprs/comparison.py +14 -9
- pixeltable/exprs/compound_predicate.py +9 -10
- pixeltable/exprs/data_row.py +213 -72
- pixeltable/exprs/expr.py +270 -104
- pixeltable/exprs/expr_dict.py +6 -5
- pixeltable/exprs/expr_set.py +20 -11
- pixeltable/exprs/function_call.py +383 -284
- pixeltable/exprs/globals.py +18 -5
- pixeltable/exprs/in_predicate.py +7 -7
- pixeltable/exprs/inline_expr.py +37 -37
- pixeltable/exprs/is_null.py +8 -4
- pixeltable/exprs/json_mapper.py +120 -54
- pixeltable/exprs/json_path.py +90 -60
- pixeltable/exprs/literal.py +61 -16
- pixeltable/exprs/method_ref.py +7 -6
- pixeltable/exprs/object_ref.py +19 -8
- pixeltable/exprs/row_builder.py +238 -75
- pixeltable/exprs/rowid_ref.py +53 -15
- pixeltable/exprs/similarity_expr.py +65 -50
- pixeltable/exprs/sql_element_cache.py +5 -5
- pixeltable/exprs/string_op.py +107 -0
- pixeltable/exprs/type_cast.py +25 -13
- pixeltable/exprs/variable.py +2 -2
- pixeltable/func/__init__.py +9 -5
- pixeltable/func/aggregate_function.py +197 -92
- pixeltable/func/callable_function.py +119 -35
- pixeltable/func/expr_template_function.py +101 -48
- pixeltable/func/function.py +375 -62
- pixeltable/func/function_registry.py +20 -19
- pixeltable/func/globals.py +6 -5
- pixeltable/func/mcp.py +74 -0
- pixeltable/func/query_template_function.py +151 -35
- pixeltable/func/signature.py +178 -49
- pixeltable/func/tools.py +164 -0
- pixeltable/func/udf.py +176 -53
- pixeltable/functions/__init__.py +44 -4
- pixeltable/functions/anthropic.py +226 -47
- pixeltable/functions/audio.py +148 -11
- pixeltable/functions/bedrock.py +137 -0
- pixeltable/functions/date.py +188 -0
- pixeltable/functions/deepseek.py +113 -0
- pixeltable/functions/document.py +81 -0
- pixeltable/functions/fal.py +76 -0
- pixeltable/functions/fireworks.py +72 -20
- pixeltable/functions/gemini.py +249 -0
- pixeltable/functions/globals.py +208 -53
- pixeltable/functions/groq.py +108 -0
- pixeltable/functions/huggingface.py +1088 -95
- pixeltable/functions/image.py +155 -84
- pixeltable/functions/json.py +8 -11
- pixeltable/functions/llama_cpp.py +31 -19
- pixeltable/functions/math.py +169 -0
- pixeltable/functions/mistralai.py +50 -75
- pixeltable/functions/net.py +70 -0
- pixeltable/functions/ollama.py +29 -36
- pixeltable/functions/openai.py +548 -160
- pixeltable/functions/openrouter.py +143 -0
- pixeltable/functions/replicate.py +15 -14
- pixeltable/functions/reve.py +250 -0
- pixeltable/functions/string.py +310 -85
- pixeltable/functions/timestamp.py +37 -19
- pixeltable/functions/together.py +77 -120
- pixeltable/functions/twelvelabs.py +188 -0
- pixeltable/functions/util.py +7 -2
- pixeltable/functions/uuid.py +30 -0
- pixeltable/functions/video.py +1528 -117
- pixeltable/functions/vision.py +26 -26
- pixeltable/functions/voyageai.py +289 -0
- pixeltable/functions/whisper.py +19 -10
- pixeltable/functions/whisperx.py +179 -0
- pixeltable/functions/yolox.py +112 -0
- pixeltable/globals.py +716 -236
- pixeltable/index/__init__.py +3 -1
- pixeltable/index/base.py +17 -21
- pixeltable/index/btree.py +32 -22
- pixeltable/index/embedding_index.py +155 -92
- pixeltable/io/__init__.py +12 -7
- pixeltable/io/datarows.py +140 -0
- pixeltable/io/external_store.py +83 -125
- pixeltable/io/fiftyone.py +24 -33
- pixeltable/io/globals.py +47 -182
- pixeltable/io/hf_datasets.py +96 -127
- pixeltable/io/label_studio.py +171 -156
- pixeltable/io/lancedb.py +3 -0
- pixeltable/io/pandas.py +136 -115
- pixeltable/io/parquet.py +40 -153
- pixeltable/io/table_data_conduit.py +702 -0
- pixeltable/io/utils.py +100 -0
- pixeltable/iterators/__init__.py +8 -4
- pixeltable/iterators/audio.py +207 -0
- pixeltable/iterators/base.py +9 -3
- pixeltable/iterators/document.py +144 -87
- pixeltable/iterators/image.py +17 -38
- pixeltable/iterators/string.py +15 -12
- pixeltable/iterators/video.py +523 -127
- pixeltable/metadata/__init__.py +33 -8
- pixeltable/metadata/converters/convert_10.py +2 -3
- pixeltable/metadata/converters/convert_13.py +2 -2
- pixeltable/metadata/converters/convert_15.py +15 -11
- pixeltable/metadata/converters/convert_16.py +4 -5
- pixeltable/metadata/converters/convert_17.py +4 -5
- pixeltable/metadata/converters/convert_18.py +4 -6
- pixeltable/metadata/converters/convert_19.py +6 -9
- pixeltable/metadata/converters/convert_20.py +3 -6
- pixeltable/metadata/converters/convert_21.py +6 -8
- pixeltable/metadata/converters/convert_22.py +3 -2
- pixeltable/metadata/converters/convert_23.py +33 -0
- pixeltable/metadata/converters/convert_24.py +55 -0
- pixeltable/metadata/converters/convert_25.py +19 -0
- pixeltable/metadata/converters/convert_26.py +23 -0
- pixeltable/metadata/converters/convert_27.py +29 -0
- pixeltable/metadata/converters/convert_28.py +13 -0
- pixeltable/metadata/converters/convert_29.py +110 -0
- pixeltable/metadata/converters/convert_30.py +63 -0
- pixeltable/metadata/converters/convert_31.py +11 -0
- pixeltable/metadata/converters/convert_32.py +15 -0
- pixeltable/metadata/converters/convert_33.py +17 -0
- pixeltable/metadata/converters/convert_34.py +21 -0
- pixeltable/metadata/converters/convert_35.py +9 -0
- pixeltable/metadata/converters/convert_36.py +38 -0
- pixeltable/metadata/converters/convert_37.py +15 -0
- pixeltable/metadata/converters/convert_38.py +39 -0
- pixeltable/metadata/converters/convert_39.py +124 -0
- pixeltable/metadata/converters/convert_40.py +73 -0
- pixeltable/metadata/converters/convert_41.py +12 -0
- pixeltable/metadata/converters/convert_42.py +9 -0
- pixeltable/metadata/converters/convert_43.py +44 -0
- pixeltable/metadata/converters/util.py +44 -18
- pixeltable/metadata/notes.py +21 -0
- pixeltable/metadata/schema.py +185 -42
- pixeltable/metadata/utils.py +74 -0
- pixeltable/mypy/__init__.py +3 -0
- pixeltable/mypy/mypy_plugin.py +123 -0
- pixeltable/plan.py +616 -225
- pixeltable/share/__init__.py +3 -0
- pixeltable/share/packager.py +797 -0
- pixeltable/share/protocol/__init__.py +33 -0
- pixeltable/share/protocol/common.py +165 -0
- pixeltable/share/protocol/operation_types.py +33 -0
- pixeltable/share/protocol/replica.py +119 -0
- pixeltable/share/publish.py +349 -0
- pixeltable/store.py +398 -232
- pixeltable/type_system.py +730 -267
- pixeltable/utils/__init__.py +40 -0
- pixeltable/utils/arrow.py +201 -29
- pixeltable/utils/av.py +298 -0
- pixeltable/utils/azure_store.py +346 -0
- pixeltable/utils/coco.py +26 -27
- pixeltable/utils/code.py +4 -4
- pixeltable/utils/console_output.py +46 -0
- pixeltable/utils/coroutine.py +24 -0
- pixeltable/utils/dbms.py +92 -0
- pixeltable/utils/description_helper.py +11 -12
- pixeltable/utils/documents.py +60 -61
- pixeltable/utils/exception_handler.py +36 -0
- pixeltable/utils/filecache.py +38 -22
- pixeltable/utils/formatter.py +88 -51
- pixeltable/utils/gcs_store.py +295 -0
- pixeltable/utils/http.py +133 -0
- pixeltable/utils/http_server.py +14 -13
- pixeltable/utils/iceberg.py +13 -0
- pixeltable/utils/image.py +17 -0
- pixeltable/utils/lancedb.py +90 -0
- pixeltable/utils/local_store.py +322 -0
- pixeltable/utils/misc.py +5 -0
- pixeltable/utils/object_stores.py +573 -0
- pixeltable/utils/pydantic.py +60 -0
- pixeltable/utils/pytorch.py +20 -20
- pixeltable/utils/s3_store.py +527 -0
- pixeltable/utils/sql.py +32 -5
- pixeltable/utils/system.py +30 -0
- pixeltable/utils/transactional_directory.py +4 -3
- pixeltable-0.5.7.dist-info/METADATA +579 -0
- pixeltable-0.5.7.dist-info/RECORD +227 -0
- {pixeltable-0.2.26.dist-info → pixeltable-0.5.7.dist-info}/WHEEL +1 -1
- pixeltable-0.5.7.dist-info/entry_points.txt +2 -0
- pixeltable/__version__.py +0 -3
- pixeltable/catalog/named_function.py +0 -36
- pixeltable/catalog/path_dict.py +0 -141
- pixeltable/dataframe.py +0 -894
- pixeltable/exec/expr_eval_node.py +0 -232
- pixeltable/ext/__init__.py +0 -14
- pixeltable/ext/functions/__init__.py +0 -8
- pixeltable/ext/functions/whisperx.py +0 -77
- pixeltable/ext/functions/yolox.py +0 -157
- pixeltable/tool/create_test_db_dump.py +0 -311
- pixeltable/tool/create_test_video.py +0 -81
- pixeltable/tool/doc_plugins/griffe.py +0 -50
- pixeltable/tool/doc_plugins/mkdocstrings.py +0 -6
- pixeltable/tool/doc_plugins/templates/material/udf.html.jinja +0 -135
- pixeltable/tool/embed_udf.py +0 -9
- pixeltable/tool/mypy_plugin.py +0 -55
- pixeltable/utils/media_store.py +0 -76
- pixeltable/utils/s3.py +0 -16
- pixeltable-0.2.26.dist-info/METADATA +0 -400
- pixeltable-0.2.26.dist-info/RECORD +0 -156
- pixeltable-0.2.26.dist-info/entry_points.txt +0 -3
- {pixeltable-0.2.26.dist-info → pixeltable-0.5.7.dist-info/licenses}/LICENSE +0 -0
pixeltable/exceptions.py
CHANGED

@@ -1,4 +1,3 @@
-from dataclasses import dataclass
 from types import TracebackType
 from typing import TYPE_CHECKING, Any

@@ -10,8 +9,13 @@ class Error(Exception):
     pass


-@dataclass
 class ExprEvalError(Exception):
+    """
+    Used during query execution to signal expr evaluation failures.
+
+    NOT A USER-FACING EXCEPTION. All ExprEvalError instances need to be converted into Error instances.
+    """
+
     expr: 'exprs.Expr'
     expr_msg: str
     exc: Exception
@@ -19,6 +23,26 @@ class ExprEvalError(Exception):
     input_vals: list[Any]
     row_num: int

+    def __init__(
+        self,
+        expr: 'exprs.Expr',
+        expr_msg: str,
+        exc: Exception,
+        exc_tb: TracebackType,
+        input_vals: list[Any],
+        row_num: int,
+    ) -> None:
+        exct = type(exc)
+        super().__init__(
+            f'Expression evaluation failed with an error of type `{exct.__module__}.{exct.__qualname__}`:\n{expr}'
+        )
+        self.expr = expr
+        self.expr_msg = expr_msg
+        self.exc = exc
+        self.exc_tb = exc_tb
+        self.input_vals = input_vals
+        self.row_num = row_num
+

 class PixeltableWarning(Warning):
     pass
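This change drops the `@dataclass` decorator in favor of an explicit `__init__` that bakes the failing expression and the original exception's type into the message, and the new docstring pins down the contract: `ExprEvalError` is engine-internal and must be converted to `Error` before surfacing. A minimal sketch of that raise-and-convert pattern (the `eval_expr` helper and the lambda are illustrative, not part of the package):

```python
import sys

import pixeltable.exceptions as excs


def eval_expr(expr, input_vals: list, row_num: int):
    """Hypothetical evaluator wrapper showing the new six-argument constructor."""
    try:
        return expr(*input_vals)  # stand-in for the real expression evaluator
    except Exception as exc:
        _, _, exc_tb = sys.exc_info()
        raise excs.ExprEvalError(expr, f'expression {expr}', exc, exc_tb, input_vals, row_num) from exc


try:
    eval_expr(lambda x: 1 / x, [0], row_num=0)
except excs.ExprEvalError as e:
    # per the new docstring, ExprEvalError must not escape to users
    print(f'would be re-raised as excs.Error: {e.expr_msg}: {e.exc}')
```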
pixeltable/exec/__init__.py
CHANGED

@@ -1,10 +1,15 @@
+# ruff: noqa: F401
+
 from .aggregation_node import AggregationNode
 from .cache_prefetch_node import CachePrefetchNode
+from .cell_materialization_node import CellMaterializationNode
+from .cell_reconstruction_node import CellReconstructionNode
 from .component_iteration_node import ComponentIterationNode
 from .data_row_batch import DataRowBatch
 from .exec_context import ExecContext
 from .exec_node import ExecNode
-from .
+from .expr_eval import ExprEvalNode
 from .in_memory_data_node import InMemoryDataNode
+from .object_store_save_node import ObjectStoreSaveNode
 from .row_update_node import RowUpdateNode
-from .sql_node import
+from .sql_node import SqlAggregationNode, SqlJoinNode, SqlLookupNode, SqlNode, SqlSampleNode, SqlScanNode
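`exec/__init__.py` stays a pure re-export module: `ExprEvalNode` now comes from the new `expr_eval` subpackage (the single-file `expr_eval_node.py` was deleted), and the file-level `# ruff: noqa: F401` pragma keeps ruff's unused-import rule from flagging the re-exports. Downstream imports are unchanged, e.g.:

```python
# execution nodes are imported from the package, not the individual submodules;
# without `# ruff: noqa: F401`, each re-export in __init__.py would trip rule F401
from pixeltable.exec import AggregationNode, DataRowBatch, ExprEvalNode, SqlScanNode
```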
pixeltable/exec/aggregation_node.py
CHANGED

@@ -2,32 +2,38 @@ from __future__ import annotations

 import logging
 import sys
-from typing import Any,
+from typing import Any, AsyncIterator, Iterable, cast

-import
-import pixeltable.exceptions as excs
-import pixeltable.exprs as exprs
+from pixeltable import catalog, exceptions as excs, exprs

 from .data_row_batch import DataRowBatch
 from .exec_node import ExecNode

 _logger = logging.getLogger('pixeltable')

+
 class AggregationNode(ExecNode):
     """
     In-memory aggregation for UDAs.

     At the moment, this returns all results in a single DataRowBatch.
     """
-
+
+    group_by: list[exprs.Expr] | None
     input_exprs: list[exprs.Expr]
     agg_fn_eval_ctx: exprs.RowBuilder.EvalCtx
     agg_fn_calls: list[exprs.FunctionCall]
     output_batch: DataRowBatch
+    limit: int | None

     def __init__(
-
-
+        self,
+        tbl: catalog.TableVersionHandle,
+        row_builder: exprs.RowBuilder,
+        group_by: list[exprs.Expr] | None,
+        agg_fn_calls: list[exprs.FunctionCall],
+        input_exprs: Iterable[exprs.Expr],
+        input: ExecNode,
     ):
         output_exprs: list[exprs.Expr] = [] if group_by is None else list(group_by)
         output_exprs.extend(agg_fn_calls)
@@ -39,51 +45,63 @@ class AggregationNode(ExecNode):
         # we need to make sure to refer to the same exprs that RowBuilder.eval() will use
         self.agg_fn_calls = [cast(exprs.FunctionCall, e) for e in self.agg_fn_eval_ctx.target_exprs]
         # create output_batch here, rather than in __iter__(), so we don't need to remember tbl and row_builder
-        self.output_batch = DataRowBatch(
+        self.output_batch = DataRowBatch(row_builder)
+        self.limit = None
+
+    def set_limit(self, limit: int) -> None:
+        # we can't propagate the limit to our input
+        self.limit = limit

     def _reset_agg_state(self, row_num: int) -> None:
         for fn_call in self.agg_fn_calls:
             try:
                 fn_call.reset_agg()
-            except Exception as
+            except Exception as exc:
                 _, _, exc_tb = sys.exc_info()
                 expr_msg = f'init() function of the aggregate {fn_call}'
-                raise excs.ExprEvalError(fn_call, expr_msg,
+                raise excs.ExprEvalError(fn_call, expr_msg, exc, exc_tb, [], row_num) from exc

     def _update_agg_state(self, row: exprs.DataRow, row_num: int) -> None:
         for fn_call in self.agg_fn_calls:
             try:
                 fn_call.update(row)
-            except Exception as
+            except Exception as exc:
                 _, _, exc_tb = sys.exc_info()
                 expr_msg = f'update() function of the aggregate {fn_call}'
                 input_vals = [row[d.slot_idx] for d in fn_call.dependencies()]
-                raise excs.ExprEvalError(fn_call, expr_msg,
+                raise excs.ExprEvalError(fn_call, expr_msg, exc, exc_tb, input_vals, row_num) from exc

-    def
-        prev_row:
-        current_group:
+    async def __aiter__(self) -> AsyncIterator[DataRowBatch]:
+        prev_row: exprs.DataRow | None = None
+        current_group: list[Any] | None = None  # the values of the group-by exprs
         num_input_rows = 0
-
+        num_output_rows = 0
+        async for row_batch in self.input:
             num_input_rows += len(row_batch)
             for row in row_batch:
                 group = [row[e.slot_idx] for e in self.group_by] if self.group_by is not None else None
+
                 if current_group is None:
                     current_group = group
                     self._reset_agg_state(0)
+
                 if group != current_group:
                     # we're entering a new group, emit a row for the previous one
                     self.row_builder.eval(prev_row, self.agg_fn_eval_ctx, profile=self.ctx.profile)
                     self.output_batch.add_row(prev_row)
+                    num_output_rows += 1
+                    if self.limit is not None and num_output_rows == self.limit:
+                        yield self.output_batch
+                        return
                     current_group = group
                     self._reset_agg_state(0)
                 self._update_agg_state(row, 0)
                 prev_row = row
-        # emit the last group
-        self.row_builder.eval(prev_row, self.agg_fn_eval_ctx, profile=self.ctx.profile)
-        self.output_batch.add_row(prev_row)

-
+        if prev_row is not None:
+            # emit the last group
+            self.row_builder.eval(prev_row, self.agg_fn_eval_ctx, profile=self.ctx.profile)
+            self.output_batch.add_row(prev_row)
+
         _logger.debug(f'AggregateNode: consumed {num_input_rows} rows, returning {len(self.output_batch.rows)} rows')
         yield self.output_batch
-
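Beyond the `Optional[X]` → `X | None` annotation cleanup, the substantive changes are the switch from a synchronous iterator to `async def __aiter__` and the new `set_limit()` hook, which lets the node stop as soon as the requested number of groups has been emitted. A self-contained toy of that control flow, assuming key-sorted input (the names and tuple-based rows are illustrative; the real node works on `DataRowBatch`es via a `RowBuilder` eval context):

```python
import asyncio
from typing import Any, AsyncIterator


async def aggregate_sorted(
    rows: AsyncIterator[tuple[Any, int]], limit: int | None = None
) -> AsyncIterator[tuple[Any, int]]:
    """Emit (key, sum) per group from key-sorted input, honoring an optional group limit."""
    current_key: Any = None
    acc = 0
    emitted = 0
    first = True
    async for key, val in rows:
        if first:
            current_key, first = key, False
        if key != current_key:
            yield current_key, acc  # group boundary: emit the finished group
            emitted += 1
            if limit is not None and emitted == limit:
                return  # mirrors the early-out enabled by set_limit()
            current_key, acc = key, 0
        acc += val
    if not first:
        yield current_key, acc  # emit the last group


async def main() -> None:
    async def source() -> AsyncIterator[tuple[str, int]]:
        for item in [('a', 1), ('a', 2), ('b', 5), ('c', 7)]:
            yield item

    async for key, total in aggregate_sorted(source(), limit=2):
        print(key, total)  # prints: a 3, then b 5


asyncio.run(main())
```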
pixeltable/exec/cache_prefetch_node.py
CHANGED

@@ -9,14 +9,12 @@ import urllib.request
 from collections import deque
 from concurrent import futures
 from pathlib import Path
-from typing import
+from typing import AsyncIterator, Iterator
 from uuid import UUID

-
-import pixeltable.exceptions as excs
-import pixeltable.exprs as exprs
-from pixeltable import catalog
+from pixeltable import exceptions as excs, exprs
 from pixeltable.utils.filecache import FileCache
+from pixeltable.utils.object_stores import ObjectOps

 from .data_row_batch import DataRowBatch
 from .exec_node import ExecNode
@@ -28,49 +26,45 @@ class CachePrefetchNode(ExecNode):
     """Brings files with external URLs into the cache

     TODO:
-    -
+    - Process a row at a time and limit the number of in-flight rows to control memory usage
+    - Create asyncio.Tasks to consume our input in order to increase concurrency.
     """
+
+    QUEUE_DEPTH_HIGH_WATER = 50  # target number of in-flight requests
+    QUEUE_DEPTH_LOW_WATER = 20  # target number of in-flight requests
     BATCH_SIZE = 16
-
+    MAX_WORKERS = 15

     retain_input_order: bool  # if True, return rows in the exact order they were received
     file_col_info: list[exprs.ColumnSlotIdx]
-    boto_client: Optional[Any]
-    boto_client_lock: threading.Lock

     # execution state
-    batch_tbl_version: Optional[catalog.TableVersion]  # needed to construct output batches
     num_returned_rows: int

     # ready_rows: rows that are ready to be returned, ordered by row idx;
     # the implied row idx of ready_rows[0] is num_returned_rows
-    ready_rows: deque[
+    ready_rows: deque[exprs.DataRow | None]

     in_flight_rows: dict[int, CachePrefetchNode.RowState]  # rows with in-flight urls; id(row) -> RowState
     in_flight_requests: dict[futures.Future, str]  # in-flight requests for urls; future -> URL
     in_flight_urls: dict[str, list[tuple[exprs.DataRow, exprs.ColumnSlotIdx]]]  # URL -> [(row, info)]
     input_finished: bool
-    row_idx: Iterator[
+    row_idx: Iterator[int | None]

     @dataclasses.dataclass
     class RowState:
         row: exprs.DataRow
-        idx:
+        idx: int | None  # position in input stream; None if we don't retain input order
         num_missing: int  # number of missing URLs in this row

     def __init__(
-
-
+        self, tbl_id: UUID, file_col_info: list[exprs.ColumnSlotIdx], input: ExecNode, retain_input_order: bool = True
+    ):
         # input_/output_exprs=[]: we don't have anything to evaluate
         super().__init__(input.row_builder, [], [], input)
         self.retain_input_order = retain_input_order
         self.file_col_info = file_col_info

-        # clients for specific services are constructed as needed, because it's time-consuming
-        self.boto_client = None
-        self.boto_client_lock = threading.Lock()
-
-        self.batch_tbl_version = None
         self.num_returned_rows = 0
         self.ready_rows = deque()
         self.in_flight_rows = {}
@@ -78,26 +72,44 @@ class CachePrefetchNode(ExecNode):
         self.in_flight_urls = {}
         self.input_finished = False
         self.row_idx = itertools.count() if retain_input_order else itertools.repeat(None)
+        assert self.QUEUE_DEPTH_HIGH_WATER > self.QUEUE_DEPTH_LOW_WATER

-
-
-
-        # we create enough in-flight requests to fill the first batch
-        while not self.input_finished and self.__num_pending_rows() < self.BATCH_SIZE:
-            self.__submit_input_batch(input_iter, executor)
-
-        while True:
-            # try to assemble a full batch of output rows
-            if not self.__has_ready_batch() and len(self.in_flight_requests) > 0:
-                self.__wait_for_requests()
+    @property
+    def queued_work(self) -> int:
+        return len(self.in_flight_requests)

-
-
-
+    async def get_input_batch(self, input_iter: AsyncIterator[DataRowBatch]) -> DataRowBatch | None:
+        """Get the next batch of input rows, or None if there are no more rows"""
+        try:
+            input_batch = await anext(input_iter)
+            if input_batch is None:
+                self.input_finished = True
+            return input_batch
+        except StopAsyncIteration:
+            self.input_finished = True
+            return None

-
+    async def __aiter__(self) -> AsyncIterator[DataRowBatch]:
+        input_iter = aiter(self.input)
+        with futures.ThreadPoolExecutor(max_workers=self.MAX_WORKERS) as executor:
+            while True:
+                # Create work to fill the queue to the high water mark without overrunning the in-flight row limit
+                while not self.input_finished and self.queued_work < self.QUEUE_DEPTH_HIGH_WATER:
+                    input_batch = await self.get_input_batch(input_iter)
+                    if input_batch is not None:
+                        self.__process_input_batch(input_batch, executor)
+
+                # Wait for enough completions to enable more queueing or if we're done
+                while self.queued_work > self.QUEUE_DEPTH_LOW_WATER or (self.input_finished and self.queued_work > 0):
+                    done, _ = futures.wait(self.in_flight_requests, return_when=futures.FIRST_COMPLETED)
+                    self.__process_completions(done, ignore_errors=self.ctx.ignore_errors)
+
+                # Emit results to meet batch size requirements or empty the in-flight row queue
+                if self.__has_ready_batch() or (
+                    len(self.ready_rows) > 0 and self.input_finished and self.queued_work == 0
+                ):
                     # create DataRowBatch from the first BATCH_SIZE ready rows
-                batch = DataRowBatch(self.
+                    batch = DataRowBatch(self.row_builder)
                     rows = [self.ready_rows.popleft() for _ in range(min(self.BATCH_SIZE, len(self.ready_rows)))]
                     for row in rows:
                         assert row is not None
@@ -106,23 +118,16 @@ class CachePrefetchNode(ExecNode):
                     _logger.debug(f'returning {len(rows)} rows')
                     yield batch

-            if self.input_finished and self.
+                if self.input_finished and self.queued_work == 0 and len(self.ready_rows) == 0:
                     return

-    def __num_pending_rows(self) -> int:
-        return len(self.in_flight_rows) + len(self.ready_rows)
-
     def __has_ready_batch(self) -> bool:
         """True if there are >= BATCH_SIZES entries in ready_rows and the first BATCH_SIZE ones are all non-None"""
         return (
             sum(int(row is not None) for row in itertools.islice(self.ready_rows, self.BATCH_SIZE)) == self.BATCH_SIZE
         )

-    def
-        """Length of the non-None prefix of ready_rows (= what we can return right now)"""
-        return sum(1 for _ in itertools.takewhile(lambda x: x is not None, self.ready_rows))
-
-    def __add_ready_row(self, row: exprs.DataRow, row_idx: Optional[int]) -> None:
+    def __add_ready_row(self, row: exprs.DataRow, row_idx: int | None) -> None:
         if row_idx is None:
             self.ready_rows.append(row)
         else:
@@ -132,46 +137,36 @@ class CachePrefetchNode(ExecNode):
             self.ready_rows.extend([None] * (idx - len(self.ready_rows) + 1))
             self.ready_rows[idx] = row

-    def
-        """Wait for in-flight requests to complete until we have a full batch of rows"""
+    def __process_completions(self, done: set[futures.Future], ignore_errors: bool) -> None:
         file_cache = FileCache.get()
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    def __submit_input_batch(self, input: Iterator[DataRowBatch], executor: futures.ThreadPoolExecutor) -> None:
-        assert not self.input_finished
-        input_batch = next(input, None)
-        if input_batch is None:
-            self.input_finished = True
-            return
-        if self.batch_tbl_version is None:
-            self.batch_tbl_version = input_batch.tbl
-
+        for f in done:
+            url = self.in_flight_requests.pop(f)
+            tmp_path, exc = f.result()
+            if exc is not None and not ignore_errors:
+                raise exc
+            local_path: Path | None = None
+            if tmp_path is not None:
+                # register the file with the cache for the first column in which it's missing
+                assert url in self.in_flight_urls
+                _, info = self.in_flight_urls[url][0]
+                local_path = file_cache.add(info.col.get_tbl().id, info.col.id, url, tmp_path)
+                _logger.debug(f'cached {url} as {local_path}')
+
+            # add the local path/exception to the slots that reference the url
+            for row, info in self.in_flight_urls.pop(url):
+                if exc is not None:
+                    self.row_builder.set_exc(row, info.slot_idx, exc)
+                else:
+                    assert local_path is not None
+                    row.set_file_path(info.slot_idx, str(local_path))
+                state = self.in_flight_rows[id(row)]
+                state.num_missing -= 1
+                if state.num_missing == 0:
+                    del self.in_flight_rows[id(row)]
+                    self.__add_ready_row(row, state.idx)
+
+    def __process_input_batch(self, input_batch: DataRowBatch, executor: futures.ThreadPoolExecutor) -> None:
+        """Process a batch of input rows, submitting URLs for download and adding ready rows to ready_rows"""
         file_cache = FileCache.get()

         # URLs from this input batch that aren't already in the file cache;
@@ -179,7 +174,7 @@ class CachePrefetchNode(ExecNode):
         # the time it takes to get the next batch together
         cache_misses: list[str] = []

-        url_pos: dict[str, int] = {}  # url -> row_idx; used for logging
+        url_pos: dict[str, int | None] = {}  # url -> row_idx; used for logging
         for row in input_batch:
             # identify missing local files in input batch, or fill in their paths if they're already cached
             num_missing = 0
@@ -218,8 +213,10 @@ class CachePrefetchNode(ExecNode):
             _logger.debug(f'submitted {url} for idx {url_pos[url]}')
             self.in_flight_requests[f] = url

-    def __fetch_url(self, url: str) -> tuple[
-        """Fetches a remote URL into
+    def __fetch_url(self, url: str) -> tuple[Path | None, Exception | None]:
+        """Fetches a remote URL into the TempStore and returns its path"""
+        from pixeltable.utils.local_store import TempStore
+
         _logger.debug(f'fetching url={url} thread_name={threading.current_thread().name}')
         parsed = urllib.parse.urlparse(url)
         # Use len(parsed.scheme) > 1 here to ensure we're not being passed
@@ -227,36 +224,17 @@ class CachePrefetchNode(ExecNode):
         assert len(parsed.scheme) > 1 and parsed.scheme != 'file'
         # preserve the file extension, if there is one
         extension = ''
-        if parsed.path
+        if parsed.path:
             p = Path(urllib.parse.unquote(urllib.request.url2pathname(parsed.path)))
             extension = p.suffix
-        tmp_path =
+        tmp_path = TempStore.create_path(extension=extension)
         try:
             _logger.debug(f'Downloading {url} to {tmp_path}')
-
-            from pixeltable.utils.s3 import get_client
-            with self.boto_client_lock:
-                if self.boto_client is None:
-                    config = {
-                        'max_pool_connections': self.NUM_EXECUTOR_THREADS + 4,  # +4: leave some headroom
-                        'connect_timeout': 5,
-                        'read_timeout': 30,
-                        'retries': {'max_attempts': 3, 'mode': 'adaptive'},
-                    }
-                    self.boto_client = get_client(**config)
-                self.boto_client.download_file(parsed.netloc, parsed.path.lstrip('/'), str(tmp_path))
-            elif parsed.scheme == 'http' or parsed.scheme == 'https':
-                with urllib.request.urlopen(url) as resp, open(tmp_path, 'wb') as f:
-                    data = resp.read()
-                    f.write(data)
-            else:
-                assert False, f'Unsupported URL scheme: {parsed.scheme}'
+            ObjectOps.copy_object_to_local_file(url, tmp_path)
             _logger.debug(f'Downloaded {url} to {tmp_path}')
             return tmp_path, None
         except Exception as e:
             # we want to add the file url to the exception message
             exc = excs.Error(f'Failed to download {url}: {e}')
             _logger.debug(f'Failed to download {url}: {e}', exc_info=e)
-            if not self.ctx.ignore_errors:
-                raise exc from None  # suppress original exception
             return None, exc
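The rewrite deletes the hand-rolled boto3 and urllib download paths in favor of `ObjectOps.copy_object_to_local_file`, and reorganizes the driver loop around a high/low water mark: submit downloads until `QUEUE_DEPTH_HIGH_WATER` (50) requests are in flight, then drain completions until only `QUEUE_DEPTH_LOW_WATER` (20) remain, so the pool always has work without unbounded queueing. A self-contained sketch of that scheduling pattern (the constants mirror the diff; `fetch` and the URLs are dummies):

```python
import time
from concurrent import futures

HIGH_WATER, LOW_WATER = 50, 20  # same values as the node's QUEUE_DEPTH_* attributes


def fetch(url: str) -> str:
    time.sleep(0.01)  # stand-in for the real object-store/HTTP download
    return f'local copy of {url}'


def prefetch(urls: list[str]) -> list[str]:
    results: list[str] = []
    pending: dict[futures.Future, str] = {}  # future -> URL, like in_flight_requests
    it = iter(urls)
    exhausted = False
    with futures.ThreadPoolExecutor(max_workers=15) as executor:
        while not exhausted or pending:
            # fill the queue up to the high water mark
            while not exhausted and len(pending) < HIGH_WATER:
                url = next(it, None)
                if url is None:
                    exhausted = True
                    break
                pending[executor.submit(fetch, url)] = url
            # drain completions down to the low water mark (all the way down once input ends)
            while len(pending) > LOW_WATER or (exhausted and pending):
                done, _ = futures.wait(pending, return_when=futures.FIRST_COMPLETED)
                for f in done:
                    pending.pop(f)
                    results.append(f.result())
    return results


print(len(prefetch([f'https://example.com/{i}' for i in range(120)])))  # 120
```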