pixeltable 0.3.14__py3-none-any.whl → 0.5.7__py3-none-any.whl
This diff compares two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
- pixeltable/__init__.py +42 -8
- pixeltable/{dataframe.py → _query.py} +470 -206
- pixeltable/_version.py +1 -0
- pixeltable/catalog/__init__.py +5 -4
- pixeltable/catalog/catalog.py +1785 -432
- pixeltable/catalog/column.py +190 -113
- pixeltable/catalog/dir.py +2 -4
- pixeltable/catalog/globals.py +19 -46
- pixeltable/catalog/insertable_table.py +191 -98
- pixeltable/catalog/path.py +63 -23
- pixeltable/catalog/schema_object.py +11 -15
- pixeltable/catalog/table.py +843 -436
- pixeltable/catalog/table_metadata.py +103 -0
- pixeltable/catalog/table_version.py +978 -657
- pixeltable/catalog/table_version_handle.py +72 -16
- pixeltable/catalog/table_version_path.py +112 -43
- pixeltable/catalog/tbl_ops.py +53 -0
- pixeltable/catalog/update_status.py +191 -0
- pixeltable/catalog/view.py +134 -90
- pixeltable/config.py +134 -22
- pixeltable/env.py +471 -157
- pixeltable/exceptions.py +6 -0
- pixeltable/exec/__init__.py +4 -1
- pixeltable/exec/aggregation_node.py +7 -8
- pixeltable/exec/cache_prefetch_node.py +83 -110
- pixeltable/exec/cell_materialization_node.py +268 -0
- pixeltable/exec/cell_reconstruction_node.py +168 -0
- pixeltable/exec/component_iteration_node.py +4 -3
- pixeltable/exec/data_row_batch.py +8 -65
- pixeltable/exec/exec_context.py +16 -4
- pixeltable/exec/exec_node.py +13 -36
- pixeltable/exec/expr_eval/evaluators.py +11 -7
- pixeltable/exec/expr_eval/expr_eval_node.py +27 -12
- pixeltable/exec/expr_eval/globals.py +8 -5
- pixeltable/exec/expr_eval/row_buffer.py +1 -2
- pixeltable/exec/expr_eval/schedulers.py +106 -56
- pixeltable/exec/globals.py +35 -0
- pixeltable/exec/in_memory_data_node.py +19 -19
- pixeltable/exec/object_store_save_node.py +293 -0
- pixeltable/exec/row_update_node.py +16 -9
- pixeltable/exec/sql_node.py +351 -84
- pixeltable/exprs/__init__.py +1 -1
- pixeltable/exprs/arithmetic_expr.py +27 -22
- pixeltable/exprs/array_slice.py +3 -3
- pixeltable/exprs/column_property_ref.py +36 -23
- pixeltable/exprs/column_ref.py +213 -89
- pixeltable/exprs/comparison.py +5 -5
- pixeltable/exprs/compound_predicate.py +5 -4
- pixeltable/exprs/data_row.py +164 -54
- pixeltable/exprs/expr.py +70 -44
- pixeltable/exprs/expr_dict.py +3 -3
- pixeltable/exprs/expr_set.py +17 -10
- pixeltable/exprs/function_call.py +100 -40
- pixeltable/exprs/globals.py +2 -2
- pixeltable/exprs/in_predicate.py +4 -4
- pixeltable/exprs/inline_expr.py +18 -32
- pixeltable/exprs/is_null.py +7 -3
- pixeltable/exprs/json_mapper.py +8 -8
- pixeltable/exprs/json_path.py +56 -22
- pixeltable/exprs/literal.py +27 -5
- pixeltable/exprs/method_ref.py +2 -2
- pixeltable/exprs/object_ref.py +2 -2
- pixeltable/exprs/row_builder.py +167 -67
- pixeltable/exprs/rowid_ref.py +25 -10
- pixeltable/exprs/similarity_expr.py +58 -40
- pixeltable/exprs/sql_element_cache.py +4 -4
- pixeltable/exprs/string_op.py +5 -5
- pixeltable/exprs/type_cast.py +3 -5
- pixeltable/func/__init__.py +1 -0
- pixeltable/func/aggregate_function.py +8 -8
- pixeltable/func/callable_function.py +9 -9
- pixeltable/func/expr_template_function.py +17 -11
- pixeltable/func/function.py +18 -20
- pixeltable/func/function_registry.py +6 -7
- pixeltable/func/globals.py +2 -3
- pixeltable/func/mcp.py +74 -0
- pixeltable/func/query_template_function.py +29 -27
- pixeltable/func/signature.py +46 -19
- pixeltable/func/tools.py +31 -13
- pixeltable/func/udf.py +18 -20
- pixeltable/functions/__init__.py +16 -0
- pixeltable/functions/anthropic.py +123 -77
- pixeltable/functions/audio.py +147 -10
- pixeltable/functions/bedrock.py +13 -6
- pixeltable/functions/date.py +7 -4
- pixeltable/functions/deepseek.py +35 -43
- pixeltable/functions/document.py +81 -0
- pixeltable/functions/fal.py +76 -0
- pixeltable/functions/fireworks.py +11 -20
- pixeltable/functions/gemini.py +195 -39
- pixeltable/functions/globals.py +142 -14
- pixeltable/functions/groq.py +108 -0
- pixeltable/functions/huggingface.py +1056 -24
- pixeltable/functions/image.py +115 -57
- pixeltable/functions/json.py +1 -1
- pixeltable/functions/llama_cpp.py +28 -13
- pixeltable/functions/math.py +67 -5
- pixeltable/functions/mistralai.py +18 -55
- pixeltable/functions/net.py +70 -0
- pixeltable/functions/ollama.py +20 -13
- pixeltable/functions/openai.py +240 -226
- pixeltable/functions/openrouter.py +143 -0
- pixeltable/functions/replicate.py +4 -4
- pixeltable/functions/reve.py +250 -0
- pixeltable/functions/string.py +239 -69
- pixeltable/functions/timestamp.py +16 -16
- pixeltable/functions/together.py +24 -84
- pixeltable/functions/twelvelabs.py +188 -0
- pixeltable/functions/util.py +6 -1
- pixeltable/functions/uuid.py +30 -0
- pixeltable/functions/video.py +1515 -107
- pixeltable/functions/vision.py +8 -8
- pixeltable/functions/voyageai.py +289 -0
- pixeltable/functions/whisper.py +16 -8
- pixeltable/functions/whisperx.py +179 -0
- pixeltable/{ext/functions → functions}/yolox.py +2 -4
- pixeltable/globals.py +362 -115
- pixeltable/index/base.py +17 -21
- pixeltable/index/btree.py +28 -22
- pixeltable/index/embedding_index.py +100 -118
- pixeltable/io/__init__.py +4 -2
- pixeltable/io/datarows.py +8 -7
- pixeltable/io/external_store.py +56 -105
- pixeltable/io/fiftyone.py +13 -13
- pixeltable/io/globals.py +31 -30
- pixeltable/io/hf_datasets.py +61 -16
- pixeltable/io/label_studio.py +74 -70
- pixeltable/io/lancedb.py +3 -0
- pixeltable/io/pandas.py +21 -12
- pixeltable/io/parquet.py +25 -105
- pixeltable/io/table_data_conduit.py +250 -123
- pixeltable/io/utils.py +4 -4
- pixeltable/iterators/__init__.py +2 -1
- pixeltable/iterators/audio.py +26 -25
- pixeltable/iterators/base.py +9 -3
- pixeltable/iterators/document.py +112 -78
- pixeltable/iterators/image.py +12 -15
- pixeltable/iterators/string.py +11 -4
- pixeltable/iterators/video.py +523 -120
- pixeltable/metadata/__init__.py +14 -3
- pixeltable/metadata/converters/convert_13.py +2 -2
- pixeltable/metadata/converters/convert_18.py +2 -2
- pixeltable/metadata/converters/convert_19.py +2 -2
- pixeltable/metadata/converters/convert_20.py +2 -2
- pixeltable/metadata/converters/convert_21.py +2 -2
- pixeltable/metadata/converters/convert_22.py +2 -2
- pixeltable/metadata/converters/convert_24.py +2 -2
- pixeltable/metadata/converters/convert_25.py +2 -2
- pixeltable/metadata/converters/convert_26.py +2 -2
- pixeltable/metadata/converters/convert_29.py +4 -4
- pixeltable/metadata/converters/convert_30.py +34 -21
- pixeltable/metadata/converters/convert_34.py +2 -2
- pixeltable/metadata/converters/convert_35.py +9 -0
- pixeltable/metadata/converters/convert_36.py +38 -0
- pixeltable/metadata/converters/convert_37.py +15 -0
- pixeltable/metadata/converters/convert_38.py +39 -0
- pixeltable/metadata/converters/convert_39.py +124 -0
- pixeltable/metadata/converters/convert_40.py +73 -0
- pixeltable/metadata/converters/convert_41.py +12 -0
- pixeltable/metadata/converters/convert_42.py +9 -0
- pixeltable/metadata/converters/convert_43.py +44 -0
- pixeltable/metadata/converters/util.py +20 -31
- pixeltable/metadata/notes.py +9 -0
- pixeltable/metadata/schema.py +140 -53
- pixeltable/metadata/utils.py +74 -0
- pixeltable/mypy/__init__.py +3 -0
- pixeltable/mypy/mypy_plugin.py +123 -0
- pixeltable/plan.py +382 -115
- pixeltable/share/__init__.py +1 -1
- pixeltable/share/packager.py +547 -83
- pixeltable/share/protocol/__init__.py +33 -0
- pixeltable/share/protocol/common.py +165 -0
- pixeltable/share/protocol/operation_types.py +33 -0
- pixeltable/share/protocol/replica.py +119 -0
- pixeltable/share/publish.py +257 -59
- pixeltable/store.py +311 -194
- pixeltable/type_system.py +373 -211
- pixeltable/utils/__init__.py +2 -3
- pixeltable/utils/arrow.py +131 -17
- pixeltable/utils/av.py +298 -0
- pixeltable/utils/azure_store.py +346 -0
- pixeltable/utils/coco.py +6 -6
- pixeltable/utils/code.py +3 -3
- pixeltable/utils/console_output.py +4 -1
- pixeltable/utils/coroutine.py +6 -23
- pixeltable/utils/dbms.py +32 -6
- pixeltable/utils/description_helper.py +4 -5
- pixeltable/utils/documents.py +7 -18
- pixeltable/utils/exception_handler.py +7 -30
- pixeltable/utils/filecache.py +6 -6
- pixeltable/utils/formatter.py +86 -48
- pixeltable/utils/gcs_store.py +295 -0
- pixeltable/utils/http.py +133 -0
- pixeltable/utils/http_server.py +2 -3
- pixeltable/utils/iceberg.py +1 -2
- pixeltable/utils/image.py +17 -0
- pixeltable/utils/lancedb.py +90 -0
- pixeltable/utils/local_store.py +322 -0
- pixeltable/utils/misc.py +5 -0
- pixeltable/utils/object_stores.py +573 -0
- pixeltable/utils/pydantic.py +60 -0
- pixeltable/utils/pytorch.py +5 -6
- pixeltable/utils/s3_store.py +527 -0
- pixeltable/utils/sql.py +26 -0
- pixeltable/utils/system.py +30 -0
- pixeltable-0.5.7.dist-info/METADATA +579 -0
- pixeltable-0.5.7.dist-info/RECORD +227 -0
- {pixeltable-0.3.14.dist-info → pixeltable-0.5.7.dist-info}/WHEEL +1 -1
- pixeltable-0.5.7.dist-info/entry_points.txt +2 -0
- pixeltable/__version__.py +0 -3
- pixeltable/catalog/named_function.py +0 -40
- pixeltable/ext/__init__.py +0 -17
- pixeltable/ext/functions/__init__.py +0 -11
- pixeltable/ext/functions/whisperx.py +0 -77
- pixeltable/utils/media_store.py +0 -77
- pixeltable/utils/s3.py +0 -17
- pixeltable-0.3.14.dist-info/METADATA +0 -434
- pixeltable-0.3.14.dist-info/RECORD +0 -186
- pixeltable-0.3.14.dist-info/entry_points.txt +0 -3
- {pixeltable-0.3.14.dist-info → pixeltable-0.5.7.dist-info/licenses}/LICENSE +0 -0
pixeltable/exec/cell_reconstruction_node.py ADDED

```diff
@@ -0,0 +1,168 @@
+from __future__ import annotations
+
+import io
+import logging
+from pathlib import Path
+from types import NoneType
+from typing import Any, AsyncIterator
+
+import numpy as np
+import PIL.Image
+
+import pixeltable.type_system as ts
+from pixeltable import exprs
+from pixeltable.utils import parse_local_file_path
+
+from .data_row_batch import DataRowBatch
+from .exec_node import ExecNode
+from .globals import INLINED_OBJECT_MD_KEY, InlinedObjectMd
+
+_logger = logging.getLogger('pixeltable')
+
+
+def json_has_inlined_objs(element: Any) -> bool:
+    """Returns True if element contains inlined objects produced by CellMaterializationNode."""
+    if isinstance(element, list):
+        return any(json_has_inlined_objs(v) for v in element)
+    if isinstance(element, dict):
+        if INLINED_OBJECT_MD_KEY in element:
+            return True
+        return any(json_has_inlined_objs(v) for v in element.values())
+    return False
+
+
+def reconstruct_json(element: Any, urls: list[str], file_handles: dict[Path, io.BufferedReader]) -> Any:
+    """Recursively reconstructs inlined objects in a json structure."""
+    if isinstance(element, list):
+        return [reconstruct_json(v, urls, file_handles) for v in element]
+    if isinstance(element, dict):
+        if INLINED_OBJECT_MD_KEY in element:
+            obj_md = InlinedObjectMd.from_dict(element[INLINED_OBJECT_MD_KEY])
+            url = urls[obj_md.url_idx]
+            local_path = parse_local_file_path(url)
+            if local_path not in file_handles:
+                file_handles[local_path] = open(local_path, 'rb')  # noqa: SIM115
+            fp = file_handles[local_path]
+
+            if obj_md.type == ts.ColumnType.Type.ARRAY.name:
+                fp.seek(obj_md.array_md.start)
+                ar = load_array(
+                    fp, obj_md.array_md.start, obj_md.array_md.end, obj_md.array_md.is_bool, obj_md.array_md.shape
+                )
+                return ar
+            elif obj_md.type == ts.ColumnType.Type.IMAGE.name:
+                fp.seek(obj_md.img_start)
+                bytesio = io.BytesIO(fp.read(obj_md.img_end - obj_md.img_start))
+                img = PIL.Image.open(bytesio)
+                img.load()
+                assert fp.tell() == obj_md.img_end, f'{fp.tell()} != {obj_md.img_end} ({obj_md.img_start})'
+                return img
+            else:
+                assert obj_md.type == ts.ColumnType.Type.BINARY.name
+                assert obj_md.binary_md is not None
+                fp.seek(obj_md.binary_md.start)
+                data = fp.read(obj_md.binary_md.end - obj_md.binary_md.start)
+                assert fp.tell() == obj_md.binary_md.end, (
+                    f'{fp.tell()} != {obj_md.binary_md.end} ({obj_md.binary_md.start})'
+                )
+                return data
+        else:
+            return {k: reconstruct_json(v, urls, file_handles) for k, v in element.items()}
+    return element
+
+
+def load_array(
+    fh: io.BufferedReader, start: int, end: int, is_bool_array: bool, shape: tuple[int, ...] | None
+) -> np.ndarray:
+    """Loads an array from a section of a file."""
+    fh.seek(start)
+    ar = np.load(fh, allow_pickle=False)
+    assert fh.tell() == end
+    if is_bool_array:
+        assert shape is not None
+        ar = np.unpackbits(ar, count=np.prod(shape)).reshape(shape).astype(bool)
+    return ar
+
+
+class CellReconstructionNode(ExecNode):
+    """
+    Reconstruction of stored json and array cells that were produced by CellMaterializationNode.
+    """
+
+    json_refs: list[exprs.ColumnRef]
+    array_refs: list[exprs.ColumnRef]
+    binary_refs: list[exprs.ColumnRef]
+    file_handles: dict[Path, io.BufferedReader]  # key: file path
+
+    def __init__(
+        self,
+        json_refs: list[exprs.ColumnRef],
+        array_refs: list[exprs.ColumnRef],
+        binary_refs: list[exprs.ColumnRef],
+        row_builder: exprs.RowBuilder,
+        input: ExecNode | None = None,
+    ):
+        super().__init__(row_builder, [], [], input)
+        self.json_refs = json_refs
+        self.array_refs = array_refs
+        self.binary_refs = binary_refs
+        self.file_handles = {}
+
+    async def __aiter__(self) -> AsyncIterator[DataRowBatch]:
+        async for batch in self.input:
+            for row in batch:
+                for col_ref in self.json_refs:
+                    val = row[col_ref.slot_idx]
+                    if val is None:
+                        continue
+                    cell_md = row.slot_md.get(col_ref.slot_idx)
+                    if cell_md is None or cell_md.file_urls is None or not json_has_inlined_objs(row[col_ref.slot_idx]):
+                        continue
+                    row[col_ref.slot_idx] = reconstruct_json(val, cell_md.file_urls, self.file_handles)
+
+                for col_ref in self.array_refs:
+                    cell_md = row.slot_md.get(col_ref.slot_idx)
+                    if cell_md is not None and cell_md.array_md is not None:
+                        assert row[col_ref.slot_idx] is None
+                        row[col_ref.slot_idx] = self._reconstruct_array(cell_md)
+                    else:
+                        assert isinstance(row[col_ref.slot_idx], (NoneType, np.ndarray))
+
+                for col_ref in self.binary_refs:
+                    cell_md = row.slot_md.get(col_ref.slot_idx)
+                    if cell_md is not None and cell_md.binary_md is not None:
+                        assert row[col_ref.slot_idx] is None
+                        row[col_ref.slot_idx] = self._reconstruct_binary(cell_md)
+                    else:
+                        assert isinstance(row[col_ref.slot_idx], (NoneType, bytes))
+
+            yield batch
+
+    def close(self) -> None:
+        for fp in self.file_handles.values():
+            fp.close()
+
+    def _reconstruct_array(self, cell_md: exprs.CellMd) -> np.ndarray:
+        assert cell_md.array_md is not None
+        assert cell_md.file_urls is not None and len(cell_md.file_urls) == 1
+        fp = self.__get_file_pointer(cell_md.file_urls[0])
+        ar = load_array(
+            fp, cell_md.array_md.start, cell_md.array_md.end, bool(cell_md.array_md.is_bool), cell_md.array_md.shape
+        )
+        return ar
+
+    def _reconstruct_binary(self, cell_md: exprs.CellMd) -> bytes:
+        assert cell_md.binary_md is not None
+        assert cell_md.file_urls is not None and len(cell_md.file_urls) == 1
+        fp = self.__get_file_pointer(cell_md.file_urls[0])
+        fp.seek(cell_md.binary_md.start)
+        data = fp.read(cell_md.binary_md.end - cell_md.binary_md.start)
+        assert fp.tell() == cell_md.binary_md.end
+        return data
+
+    def __get_file_pointer(self, file_url: str) -> io.BufferedReader:
+        local_path = parse_local_file_path(file_url)
+        assert local_path is not None
+        if local_path not in self.file_handles:
+            self.file_handles[local_path] = open(str(local_path), 'rb')  # noqa: SIM115
+        return self.file_handles[local_path]
```
pixeltable/exec/component_iteration_node.py CHANGED

```diff
@@ -40,7 +40,7 @@ class ComponentIterationNode(ExecNode):
     }
 
     async def __aiter__(self) -> AsyncIterator[DataRowBatch]:
-        output_batch = DataRowBatch(self.
+        output_batch = DataRowBatch(self.row_builder)
         async for input_batch in self.input:
             for input_row in input_batch:
                 self.row_builder.eval(input_row, self.iterator_args_ctx)
@@ -52,13 +52,14 @@ class ComponentIterationNode(ExecNode):
         if self.__non_nullable_args_specified(iterator_args):
             iterator = self.view.get().iterator_cls(**iterator_args)
             for pos, component_dict in enumerate(iterator):
-                output_row =
+                output_row = self.row_builder.make_row()
                 input_row.copy(output_row)
                 # we're expanding the input and need to add the iterator position to the pk
                 self.__populate_output_row(output_row, pos, component_dict)
+                output_batch.add_row(output_row)
                 if len(output_batch) == self.__OUTPUT_BATCH_SIZE:
                     yield output_batch
-                    output_batch = DataRowBatch(self.
+                    output_batch = DataRowBatch(self.row_builder)
 
         if len(output_batch) > 0:
             yield output_batch
```
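The substantive change in the second hunk is the new `output_batch.add_row(output_row)` call: each expanded row is now explicitly appended before the size check. A generic sketch of this accumulate-and-flush batching pattern (names and types are illustrative, not pixeltable's):

```python
from typing import Iterable, Iterator, TypeVar

T = TypeVar('T')

def batched(items: Iterable[T], batch_size: int) -> Iterator[list[T]]:
    """Yield full batches as they fill up, plus a final partial batch."""
    batch: list[T] = []
    for item in items:
        batch.append(item)          # the add_row() step
        if len(batch) == batch_size:
            yield batch
            batch = []              # start a fresh batch after flushing
    if batch:
        yield batch

assert list(batched(range(7), 3)) == [[0, 1, 2], [3, 4, 5], [6]]
```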
pixeltable/exec/data_row_batch.py CHANGED

```diff
@@ -1,10 +1,9 @@
 from __future__ import annotations
 
 import logging
-from typing import Iterator
+from typing import Iterator
 
-from pixeltable import
-from pixeltable.utils.media_store import MediaStore
+from pixeltable import exprs
 
 _logger = logging.getLogger('pixeltable')
 
@@ -13,53 +12,20 @@ class DataRowBatch:
     """Set of DataRows, indexed by rowid.
 
     Contains the metadata needed to initialize DataRows.
+
+    Requires either num_rows or rows to be specified, but not both.
     """
 
-    tbl: Optional[catalog.TableVersionHandle]
     row_builder: exprs.RowBuilder
-    img_slot_idxs: list[int]
-    media_slot_idxs: list[int]  # non-image media slots
-    array_slot_idxs: list[int]
     rows: list[exprs.DataRow]
 
-    def __init__(
-        self,
-        tbl: Optional[catalog.TableVersionHandle],
-        row_builder: exprs.RowBuilder,
-        num_rows: Optional[int] = None,
-        rows: Optional[list[exprs.DataRow]] = None,
-    ):
-        """
-        Requires either num_rows or rows to be specified, but not both.
-        """
-        assert num_rows is None or rows is None
-        self.tbl = tbl
+    def __init__(self, row_builder: exprs.RowBuilder, rows: list[exprs.DataRow] | None = None):
         self.row_builder = row_builder
-        self.
-        # non-image media slots
-        self.media_slot_idxs = [
-            e.slot_idx
-            for e in row_builder.unique_exprs
-            if e.col_type.is_media_type() and not e.col_type.is_image_type()
-        ]
-        self.array_slot_idxs = [e.slot_idx for e in row_builder.unique_exprs if e.col_type.is_array_type()]
-        if rows is not None:
-            self.rows = rows
-        else:
-            if num_rows is None:
-                num_rows = 0
-            self.rows = [
-                exprs.DataRow(
-                    row_builder.num_materialized, self.img_slot_idxs, self.media_slot_idxs, self.array_slot_idxs
-                )
-                for _ in range(num_rows)
-            ]
+        self.rows = [] if rows is None else rows
 
-    def add_row(self, row:
+    def add_row(self, row: exprs.DataRow | None) -> exprs.DataRow:
         if row is None:
-            row =
-                self.row_builder.num_materialized, self.img_slot_idxs, self.media_slot_idxs, self.array_slot_idxs
-            )
+            row = self.row_builder.make_row()
         self.rows.append(row)
         return row
 
@@ -72,28 +38,5 @@ class DataRowBatch:
     def __getitem__(self, index: int) -> exprs.DataRow:
         return self.rows[index]
 
-    def flush_imgs(
-        self,
-        idx_range: Optional[slice] = None,
-        stored_img_info: Optional[list[exprs.ColumnSlotIdx]] = None,
-        flushed_slot_idxs: Optional[list[int]] = None,
-    ) -> None:
-        """Flushes images in the given range of rows."""
-        assert self.tbl is not None
-        if stored_img_info is None:
-            stored_img_info = []
-        if flushed_slot_idxs is None:
-            flushed_slot_idxs = []
-        if len(stored_img_info) == 0 and len(flushed_slot_idxs) == 0:
-            return
-        if idx_range is None:
-            idx_range = slice(0, len(self.rows))
-        for row in self.rows[idx_range]:
-            for info in stored_img_info:
-                filepath = str(MediaStore.prepare_media_path(self.tbl.id, info.col.id, self.tbl.get().version))
-                row.flush_img(info.slot_idx, filepath)
-            for slot_idx in flushed_slot_idxs:
-                row.flush_img(slot_idx)
-
     def __iter__(self) -> Iterator[exprs.DataRow]:
         return iter(self.rows)
```
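The net effect of this refactor: `DataRowBatch` no longer knows about tables or slot layouts. Row construction is centralized in `RowBuilder.make_row()`, and image flushing disappears from the batch (presumably absorbed by the new `object_store_save_node.py` in the file list above). A structural sketch of the resulting factory pattern, with illustrative stand-in bodies rather than pixeltable's real classes:

```python
class RowBuilder:
    """Stand-in: owns everything needed to initialize a row."""
    def __init__(self, num_slots: int) -> None:
        self.num_slots = num_slots

    def make_row(self) -> list[object]:
        return [None] * self.num_slots

class DataRowBatch:
    """After the refactor: a thin, table-agnostic list of rows."""
    def __init__(self, row_builder: RowBuilder, rows: list | None = None) -> None:
        self.row_builder = row_builder
        self.rows = [] if rows is None else rows

    def add_row(self, row: list | None = None) -> list:
        if row is None:
            row = self.row_builder.make_row()   # construction delegated to the builder
        self.rows.append(row)
        return row
```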
pixeltable/exec/exec_context.py CHANGED

```diff
@@ -1,4 +1,4 @@
-
+import random
 
 import sqlalchemy as sql
 
@@ -8,13 +8,24 @@ from pixeltable import exprs
 class ExecContext:
     """Class for execution runtime constants"""
 
+    row_builder: exprs.RowBuilder
+    profile: exprs.ExecProfile
+    show_pbar: bool
+    batch_size: int
+    num_rows: int | None
+    conn: sql.engine.Connection | None
+    pk_clause: list[sql.ClauseElement] | None
+    num_computed_exprs: int
+    ignore_errors: bool
+    random_seed: int  # general-purpose source of randomness with execution scope
+
     def __init__(
         self,
         row_builder: exprs.RowBuilder,
         *,
         show_pbar: bool = False,
         batch_size: int = 0,
-        pk_clause:
+        pk_clause: list[sql.ClauseElement] | None = None,
        num_computed_exprs: int = 0,
         ignore_errors: bool = False,
     ):
@@ -23,8 +34,9 @@ class ExecContext:
         self.row_builder = row_builder
         self.profile = exprs.ExecProfile(row_builder)
         # num_rows is used to compute the total number of computed cells used for the progress bar
-        self.num_rows
-        self.conn
+        self.num_rows = None
+        self.conn = None  # if present, use this to execute SQL queries
         self.pk_clause = pk_clause
         self.num_computed_exprs = num_computed_exprs
         self.ignore_errors = ignore_errors
+        self.random_seed = random.randint(0, 1 << 63)
```
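The new `random_seed` field gives every execution a single shared source of randomness. A sketch of how an execution-scoped seed yields reproducible streams for any consumer; the derivation below is an assumption, since the diff only shows the seed being drawn:

```python
import random

seed = random.randint(0, 1 << 63)    # drawn once per execution, as in ExecContext

rng_a = random.Random(seed)           # any two consumers seeded the same way...
rng_b = random.Random(seed)
assert [rng_a.random() for _ in range(3)] == [rng_b.random() for _ in range(3)]  # ...see identical streams
```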
pixeltable/exec/exec_node.py CHANGED

```diff
@@ -1,11 +1,11 @@
 from __future__ import annotations
 
 import abc
-import asyncio
 import logging
-from typing import AsyncIterator, Iterable, Iterator,
+from typing import AsyncIterator, Iterable, Iterator, TypeVar
 
 from pixeltable import exprs
+from pixeltable.env import Env
 
 from .data_row_batch import DataRowBatch
 from .exec_context import ExecContext
@@ -18,17 +18,16 @@ class ExecNode(abc.ABC):
 
     output_exprs: Iterable[exprs.Expr]
     row_builder: exprs.RowBuilder
-    input:
+    input: ExecNode | None
     flushed_img_slots: list[int]  # idxs of image slots of our output_exprs dependencies
-
-    ctx: Optional[ExecContext]
+    ctx: ExecContext | None
 
     def __init__(
         self,
         row_builder: exprs.RowBuilder,
         output_exprs: Iterable[exprs.Expr],
         input_exprs: Iterable[exprs.Expr],
-        input:
+        input: ExecNode | None = None,
     ):
         assert all(expr.is_valid for expr in output_exprs)
         self.output_exprs = output_exprs
@@ -40,43 +39,19 @@ class ExecNode(abc.ABC):
         self.flushed_img_slots = [
             e.slot_idx for e in output_dependencies if e.col_type.is_image_type() and e.slot_idx not in output_slot_idxs
         ]
-        self.
-        self.ctx = None  # all nodes of a tree share the same context
+        self.ctx = input.ctx if input is not None else None
 
     def set_ctx(self, ctx: ExecContext) -> None:
         self.ctx = ctx
         if self.input is not None:
             self.input.set_ctx(ctx)
 
-    def set_stored_img_cols(self, stored_img_cols: list[exprs.ColumnSlotIdx]) -> None:
-        self.stored_img_cols = stored_img_cols
-        # propagate batch size to the source
-        if self.input is not None:
-            self.input.set_stored_img_cols(stored_img_cols)
-
     @abc.abstractmethod
     def __aiter__(self) -> AsyncIterator[DataRowBatch]:
         pass
 
     def __iter__(self) -> Iterator[DataRowBatch]:
-
-        loop: asyncio.AbstractEventLoop
-        try:
-            # check if we are already in an event loop (eg, Jupyter's); if so, patch it to allow
-            # multiple run_until_complete()
-            running_loop = asyncio.get_running_loop()
-            import nest_asyncio  # type: ignore[import-untyped]
-
-            nest_asyncio.apply()
-            loop = running_loop
-            _logger.debug('Patched running loop')
-        except RuntimeError:
-            loop = asyncio.new_event_loop()
-            asyncio.set_event_loop(loop)
-
-        if _logger.isEnabledFor(logging.DEBUG):
-            loop.set_debug(True)
-
+        loop = Env.get().event_loop
         aiter = self.__aiter__()
         try:
             while True:
@@ -84,9 +59,11 @@ class ExecNode(abc.ABC):
                 yield batch
         except StopAsyncIteration:
             pass
-
-
-
+        # TODO:
+        # - we seem to have some tasks that aren't accounted for by ExprEvalNode and don't get cancelled by the time
+        #   we end up here
+        # - however, blindly cancelling all pending tasks doesn't work when running in a jupyter environment, which
+        #   creates tasks on its own
 
     def open(self) -> None:
         """Bottom-up initialization of nodes for execution. Must be called before __next__."""
@@ -108,7 +85,7 @@ class ExecNode(abc.ABC):
 
     T = TypeVar('T', bound='ExecNode')
 
-    def get_node(self, node_class: type[T]) ->
+    def get_node(self, node_class: type[T]) -> T | None:
         if isinstance(self, node_class):
             return self
         if self.input is not None:
```
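`__iter__` previously created or monkey-patched an event loop per call (via `nest_asyncio`); it now reuses a single long-lived loop owned by `Env` (`Env.get().event_loop`). A sketch of the underlying sync-over-async pattern; the exact loop-stepping call is an assumption, since the diff elides the body of the `while` loop:

```python
import asyncio
from typing import AsyncIterator, Iterator, TypeVar

T = TypeVar('T')
_loop = asyncio.new_event_loop()   # stand-in for the shared Env event loop

def sync_iter(ait: AsyncIterator[T]) -> Iterator[T]:
    """Drive an async iterator from synchronous code, one item at a time."""
    while True:
        try:
            yield _loop.run_until_complete(ait.__anext__())
        except StopAsyncIteration:
            return
```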
pixeltable/exec/expr_eval/evaluators.py CHANGED

```diff
@@ -5,7 +5,7 @@ import datetime
 import itertools
 import logging
 import sys
-from typing import Any, Callable, Iterator,
+from typing import Any, Callable, Iterator, cast
 
 from pixeltable import exprs, func
 
@@ -64,11 +64,11 @@ class FnCallEvaluator(Evaluator):
 
     fn_call: exprs.FunctionCall
     fn: func.CallableFunction
-    scalar_py_fn:
+    scalar_py_fn: Callable | None  # only set for non-batching CallableFunctions
 
     # only set if fn.is_batched
-    call_args_queue:
-    batch_size:
+    call_args_queue: asyncio.Queue[FnCallArgs] | None  # FnCallArgs waiting for execution
+    batch_size: int | None
 
     def __init__(self, fn_call: exprs.FunctionCall, dispatcher: Dispatcher, exec_ctx: ExecCtx):
         super().__init__(dispatcher, exec_ctx)
@@ -160,8 +160,8 @@ class FnCallEvaluator(Evaluator):
 
     def _create_batch_call_args(self, call_args: list[FnCallArgs]) -> FnCallArgs:
         """Roll call_args into a single batched FnCallArgs"""
-        batch_args: list[list[
-        batch_kwargs: dict[str, list[
+        batch_args: list[list[Any | None]] = [[None] * len(call_args) for _ in range(len(self.fn_call.arg_idxs))]
+        batch_kwargs: dict[str, list[Any | None]] = {k: [None] * len(call_args) for k in self.fn_call.kwarg_idxs}
         assert isinstance(self.fn, func.CallableFunction)
         for i, item in enumerate(call_args):
             for j in range(len(item.args)):
@@ -311,13 +311,17 @@ class JsonMapperDispatcher(Evaluator):
                     img_slot_idxs=[],
                     media_slot_idxs=[],
                     array_slot_idxs=[],
+                    json_slot_idxs=[],
                     parent_row=row,
                     parent_slot_idx=self.e.slot_idx,
                 )
                 for _ in src
             ]
             for nested_row, anchor_val in zip(nested_rows, src):
-
+                # It's possible that self.scope_anchor.slot_idx is None; this corresponds to the case where the
+                # mapper expression doesn't actually contain references to RELATIVE_PATH_ROOT.
+                if self.scope_anchor.slot_idx is not None:
+                    nested_row[self.scope_anchor.slot_idx] = anchor_val
                 for slot_idx_, nested_slot_idx in self.external_slot_map.items():
                     nested_row[nested_slot_idx] = row[slot_idx_]
             self.nested_exec_ctx.init_rows(nested_rows)
```
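`_create_batch_call_args` pre-allocates columnar argument lists and transposes row-wise calls into them. The pattern, reduced to plain Python (the data and arity are made up for illustration):

```python
from typing import Any

# Roll N single calls f(a_i, b_i) into one batched call f([a_0..a_N], [b_0..b_N]).
calls: list[tuple[Any, ...]] = [(1, 'x'), (2, 'y'), (3, 'z')]
num_args = 2

batch_args: list[list[Any | None]] = [[None] * len(calls) for _ in range(num_args)]
for i, args in enumerate(calls):
    for j, val in enumerate(args):
        batch_args[j][i] = val   # column j holds argument j across all calls

assert batch_args == [[1, 2, 3], ['x', 'y', 'z']]
```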
pixeltable/exec/expr_eval/expr_eval_node.py CHANGED

```diff
@@ -4,7 +4,7 @@ import asyncio
 import logging
 import traceback
 from types import TracebackType
-from typing import AsyncIterator, Iterable
+from typing import AsyncIterator, Iterable
 
 import numpy as np
 
@@ -49,17 +49,17 @@ class ExprEvalNode(ExecNode):
     # execution state
     tasks: set[asyncio.Task]  # collects all running tasks to prevent them from getting gc'd
     exc_event: asyncio.Event  # set if an exception needs to be propagated
-    error:
+    error: Exception | None  # exception that needs to be propagated
     completed_rows: asyncio.Queue[exprs.DataRow]  # rows that have completed evaluation
     completed_event: asyncio.Event  # set when completed_rows is non-empty
     input_iter: AsyncIterator[DataRowBatch]
-    current_input_batch:
+    current_input_batch: DataRowBatch | None  # batch from which we're currently consuming rows
     input_row_idx: int  # next row to consume from current_input_batch
-    next_input_batch:
+    next_input_batch: DataRowBatch | None  # read-ahead input batch
     avail_input_rows: int  # total number across both current_/next_input_batch
     input_complete: bool  # True if we've received all input batches
     num_in_flight: int  # number of dispatched rows that haven't completed
-    row_pos_map:
+    row_pos_map: dict[int, int] | None  # id(row) -> position of row in input; only set if maintain_input_order
     output_buffer: RowBuffer  # holds rows that are ready to be returned, in order
 
     # debugging
@@ -133,10 +133,10 @@ class ExprEvalNode(ExecNode):
         except StopAsyncIteration:
             self.input_complete = True
             _logger.debug(f'finished input: #input_rows={self.num_input_rows}, #avail={self.avail_input_rows}')
-
-
+        # make sure to pass DBAPIError through, so the transaction handling logic sees it
+        except Exception as exc:
+            self.error = exc
             self.exc_event.set()
-        # TODO: should we also handle Exception here and create an excs.Error from it?
 
     @property
     def total_buffered(self) -> int:
@@ -217,9 +217,10 @@ class ExprEvalNode(ExecNode):
 
         row: exprs.DataRow
         exc_event_aw = asyncio.create_task(self.exc_event.wait(), name='exc_event.wait()')
-        input_batch_aw:
-        completed_aw:
+        input_batch_aw: asyncio.Task | None = None
+        completed_aw: asyncio.Task | None = None
         closed_evaluators = False  # True after calling Evaluator.close()
+        exprs.Expr.prepare_list(self.exec_ctx.all_exprs)
 
         try:
             while True:
@@ -240,7 +241,7 @@ class ExprEvalNode(ExecNode):
                 # make sure we top up our in-flight rows before yielding
                 self._dispatch_input_rows()
                 self._log_state(f'yielding {len(batch_rows)} rows')
-                yield DataRowBatch(
+                yield DataRowBatch(row_builder=self.row_builder, rows=batch_rows)
                 # at this point, we may have more completed rows
 
         assert self.completed_rows.empty()  # all completed rows should be sitting in output_buffer
@@ -254,7 +255,7 @@ class ExprEvalNode(ExecNode):
             batch_rows = self.output_buffer.get_rows(self.output_buffer.num_ready)
             self.num_output_rows += len(batch_rows)
             self._log_state(f'yielding {len(batch_rows)} rows')
-            yield DataRowBatch(
+            yield DataRowBatch(row_builder=self.row_builder, rows=batch_rows)
 
         assert self.output_buffer.num_rows == 0
         return
@@ -306,6 +307,9 @@ class ExprEvalNode(ExecNode):
             task.cancel()
         _ = await asyncio.gather(*active_tasks, return_exceptions=True)
 
+        # expr cleanup
+        exprs.Expr.release_list(self.exec_ctx.all_exprs)
+
     def dispatch_exc(
         self, rows: list[exprs.DataRow], slot_with_exc: int, exc_tb: TracebackType, exec_ctx: ExecCtx
     ) -> None:
@@ -390,6 +394,17 @@ class ExprEvalNode(ExecNode):
         # end the main loop if we had an unhandled exception
         try:
             t.result()
+        except KeyboardInterrupt:
+            # ExprEvalNode instances are long-running and reused across multiple operations.
+            # When a user interrupts an operation (Ctrl+C), the main evaluation loop properly
+            # handles the KeyboardInterrupt and terminates the current operation. However,
+            # background tasks spawned by evaluators may complete asynchronously after the
+            # operation has ended, and their done callbacks will fire during subsequent
+            # operations. These "phantom" KeyboardInterrupt exceptions from previous
+            # operations' background tasks should not interfere with new operations, so we
+            # absorb them here rather than propagating them via self.error/self.exc_event.
+            _logger.debug('Task completed with KeyboardInterrupt (user cancellation)')
+            pass
         except asyncio.CancelledError:
             pass
         except Exception as exc:
```
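The long comment in the last hunk explains the subtlety: task done-callbacks can fire after the operation that spawned them was interrupted, so a `KeyboardInterrupt` surfacing there belongs to a previous operation. A reduced sketch of such a done callback; the logger name and error handling are illustrative:

```python
import asyncio
import logging

_logger = logging.getLogger('example')

def on_task_done(t: asyncio.Task) -> None:
    """Inspect a finished task; absorb stale interrupts, surface real errors."""
    try:
        t.result()
    except KeyboardInterrupt:
        # a "phantom" interrupt left over from an earlier, user-cancelled operation
        _logger.debug('task completed with KeyboardInterrupt (user cancellation)')
    except asyncio.CancelledError:
        pass
    except Exception as exc:
        # the real node records exc and sets its exc_event here
        _logger.error(f'task failed: {exc}')
```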
pixeltable/exec/expr_eval/globals.py CHANGED

```diff
@@ -4,7 +4,7 @@ import abc
 import asyncio
 from dataclasses import dataclass
 from types import TracebackType
-from typing import Any, Iterable,
+from typing import Any, Iterable, Protocol
 
 import numpy as np
 
@@ -18,11 +18,11 @@ class FnCallArgs:
     fn_call: exprs.FunctionCall
     rows: list[exprs.DataRow]
     # single call
-    args:
-    kwargs:
+    args: list[Any] | None = None
+    kwargs: dict[str, Any] | None = None
     # batch call
-    batch_args:
-    batch_kwargs:
+    batch_args: list[list[Any | None]] | None = None
+    batch_kwargs: dict[str, list[Any | None]] | None = None
 
     @property
     def pxt_fn(self) -> func.CallableFunction:
@@ -56,6 +56,7 @@ class Scheduler(abc.ABC):
         request: FnCallArgs
         num_retries: int
         exec_ctx: ExecCtx
+        retry_after: float | None = None  # time.monotonic()
 
         def __lt__(self, other: Scheduler.QueueItem) -> bool:
             # prioritize by number of retries (more retries = higher priority)
@@ -148,6 +149,7 @@ class ExecCtx:
     gc_targets: np.ndarray  # bool per slot; True if this is an intermediate expr (ie, not part of our output)
     eval_ctx: np.ndarray  # bool per slot; EvalCtx.slot_idxs as a mask
     literals: dict[int, Any]  # key: slot idx; value: literal value for this slot; used to pre-populate rows
+    all_exprs: list[exprs.Expr]  # all evaluated exprs; needed for cleanup
 
     def __init__(
         self,
@@ -164,6 +166,7 @@ class ExecCtx:
         self.gc_targets[[e.slot_idx for e in self.row_builder.output_exprs]] = False
 
         output_ctx = self.row_builder.create_eval_ctx(output_exprs, exclude=input_exprs)
+        self.all_exprs = output_ctx.exprs
         self.literals = {e.slot_idx: e.val for e in output_ctx.exprs if isinstance(e, exprs.Literal)}
         self.eval_ctx = np.zeros(self.row_builder.num_materialized, dtype=bool)
         non_literal_slot_idxs = [e.slot_idx for e in output_ctx.exprs if not isinstance(e, exprs.Literal)]
```
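`Scheduler.QueueItem` gains `retry_after`, documented as a `time.monotonic()` value, i.e. a deadline before which a failed request should not be re-dispatched. A sketch of that gating logic; the backoff constant and readiness check are assumptions for illustration:

```python
import time

RETRY_DELAY_SECS = 2.0   # hypothetical backoff

def next_retry_deadline() -> float:
    """Set when a request fails and is re-queued."""
    return time.monotonic() + RETRY_DELAY_SECS

def is_ready(retry_after: float | None) -> bool:
    """A queue item may be dispatched once its deadline has passed."""
    return retry_after is None or time.monotonic() >= retry_after
```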
pixeltable/exec/expr_eval/row_buffer.py CHANGED

```diff
@@ -1,7 +1,6 @@
 from __future__ import annotations
 
 import logging
-from typing import Optional
 
 import numpy as np
 
@@ -14,7 +13,7 @@ class RowBuffer:
     """Fixed-length circular buffer of DataRows; knows how to maintain input order"""
 
     size: int
-    row_pos_map:
+    row_pos_map: dict[int, int] | None  # id(row) -> position of row in output; None if not maintaining order
     num_rows: int  # number of rows in the buffer
     num_ready: int  # number of consecutive non-None rows at head
     buffer: np.ndarray  # of object
```