pixeltable 0.2.26__py3-none-any.whl → 0.5.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pixeltable/__init__.py +83 -19
- pixeltable/_query.py +1444 -0
- pixeltable/_version.py +1 -0
- pixeltable/catalog/__init__.py +7 -4
- pixeltable/catalog/catalog.py +2394 -119
- pixeltable/catalog/column.py +225 -104
- pixeltable/catalog/dir.py +38 -9
- pixeltable/catalog/globals.py +53 -34
- pixeltable/catalog/insertable_table.py +265 -115
- pixeltable/catalog/path.py +80 -17
- pixeltable/catalog/schema_object.py +28 -43
- pixeltable/catalog/table.py +1270 -677
- pixeltable/catalog/table_metadata.py +103 -0
- pixeltable/catalog/table_version.py +1270 -751
- pixeltable/catalog/table_version_handle.py +109 -0
- pixeltable/catalog/table_version_path.py +137 -42
- pixeltable/catalog/tbl_ops.py +53 -0
- pixeltable/catalog/update_status.py +191 -0
- pixeltable/catalog/view.py +251 -134
- pixeltable/config.py +215 -0
- pixeltable/env.py +736 -285
- pixeltable/exceptions.py +26 -2
- pixeltable/exec/__init__.py +7 -2
- pixeltable/exec/aggregation_node.py +39 -21
- pixeltable/exec/cache_prefetch_node.py +87 -109
- pixeltable/exec/cell_materialization_node.py +268 -0
- pixeltable/exec/cell_reconstruction_node.py +168 -0
- pixeltable/exec/component_iteration_node.py +25 -28
- pixeltable/exec/data_row_batch.py +11 -46
- pixeltable/exec/exec_context.py +26 -11
- pixeltable/exec/exec_node.py +35 -27
- pixeltable/exec/expr_eval/__init__.py +3 -0
- pixeltable/exec/expr_eval/evaluators.py +365 -0
- pixeltable/exec/expr_eval/expr_eval_node.py +413 -0
- pixeltable/exec/expr_eval/globals.py +200 -0
- pixeltable/exec/expr_eval/row_buffer.py +74 -0
- pixeltable/exec/expr_eval/schedulers.py +413 -0
- pixeltable/exec/globals.py +35 -0
- pixeltable/exec/in_memory_data_node.py +35 -27
- pixeltable/exec/object_store_save_node.py +293 -0
- pixeltable/exec/row_update_node.py +44 -29
- pixeltable/exec/sql_node.py +414 -115
- pixeltable/exprs/__init__.py +8 -5
- pixeltable/exprs/arithmetic_expr.py +79 -45
- pixeltable/exprs/array_slice.py +5 -5
- pixeltable/exprs/column_property_ref.py +40 -26
- pixeltable/exprs/column_ref.py +254 -61
- pixeltable/exprs/comparison.py +14 -9
- pixeltable/exprs/compound_predicate.py +9 -10
- pixeltable/exprs/data_row.py +213 -72
- pixeltable/exprs/expr.py +270 -104
- pixeltable/exprs/expr_dict.py +6 -5
- pixeltable/exprs/expr_set.py +20 -11
- pixeltable/exprs/function_call.py +383 -284
- pixeltable/exprs/globals.py +18 -5
- pixeltable/exprs/in_predicate.py +7 -7
- pixeltable/exprs/inline_expr.py +37 -37
- pixeltable/exprs/is_null.py +8 -4
- pixeltable/exprs/json_mapper.py +120 -54
- pixeltable/exprs/json_path.py +90 -60
- pixeltable/exprs/literal.py +61 -16
- pixeltable/exprs/method_ref.py +7 -6
- pixeltable/exprs/object_ref.py +19 -8
- pixeltable/exprs/row_builder.py +238 -75
- pixeltable/exprs/rowid_ref.py +53 -15
- pixeltable/exprs/similarity_expr.py +65 -50
- pixeltable/exprs/sql_element_cache.py +5 -5
- pixeltable/exprs/string_op.py +107 -0
- pixeltable/exprs/type_cast.py +25 -13
- pixeltable/exprs/variable.py +2 -2
- pixeltable/func/__init__.py +9 -5
- pixeltable/func/aggregate_function.py +197 -92
- pixeltable/func/callable_function.py +119 -35
- pixeltable/func/expr_template_function.py +101 -48
- pixeltable/func/function.py +375 -62
- pixeltable/func/function_registry.py +20 -19
- pixeltable/func/globals.py +6 -5
- pixeltable/func/mcp.py +74 -0
- pixeltable/func/query_template_function.py +151 -35
- pixeltable/func/signature.py +178 -49
- pixeltable/func/tools.py +164 -0
- pixeltable/func/udf.py +176 -53
- pixeltable/functions/__init__.py +44 -4
- pixeltable/functions/anthropic.py +226 -47
- pixeltable/functions/audio.py +148 -11
- pixeltable/functions/bedrock.py +137 -0
- pixeltable/functions/date.py +188 -0
- pixeltable/functions/deepseek.py +113 -0
- pixeltable/functions/document.py +81 -0
- pixeltable/functions/fal.py +76 -0
- pixeltable/functions/fireworks.py +72 -20
- pixeltable/functions/gemini.py +249 -0
- pixeltable/functions/globals.py +208 -53
- pixeltable/functions/groq.py +108 -0
- pixeltable/functions/huggingface.py +1088 -95
- pixeltable/functions/image.py +155 -84
- pixeltable/functions/json.py +8 -11
- pixeltable/functions/llama_cpp.py +31 -19
- pixeltable/functions/math.py +169 -0
- pixeltable/functions/mistralai.py +50 -75
- pixeltable/functions/net.py +70 -0
- pixeltable/functions/ollama.py +29 -36
- pixeltable/functions/openai.py +548 -160
- pixeltable/functions/openrouter.py +143 -0
- pixeltable/functions/replicate.py +15 -14
- pixeltable/functions/reve.py +250 -0
- pixeltable/functions/string.py +310 -85
- pixeltable/functions/timestamp.py +37 -19
- pixeltable/functions/together.py +77 -120
- pixeltable/functions/twelvelabs.py +188 -0
- pixeltable/functions/util.py +7 -2
- pixeltable/functions/uuid.py +30 -0
- pixeltable/functions/video.py +1528 -117
- pixeltable/functions/vision.py +26 -26
- pixeltable/functions/voyageai.py +289 -0
- pixeltable/functions/whisper.py +19 -10
- pixeltable/functions/whisperx.py +179 -0
- pixeltable/functions/yolox.py +112 -0
- pixeltable/globals.py +716 -236
- pixeltable/index/__init__.py +3 -1
- pixeltable/index/base.py +17 -21
- pixeltable/index/btree.py +32 -22
- pixeltable/index/embedding_index.py +155 -92
- pixeltable/io/__init__.py +12 -7
- pixeltable/io/datarows.py +140 -0
- pixeltable/io/external_store.py +83 -125
- pixeltable/io/fiftyone.py +24 -33
- pixeltable/io/globals.py +47 -182
- pixeltable/io/hf_datasets.py +96 -127
- pixeltable/io/label_studio.py +171 -156
- pixeltable/io/lancedb.py +3 -0
- pixeltable/io/pandas.py +136 -115
- pixeltable/io/parquet.py +40 -153
- pixeltable/io/table_data_conduit.py +702 -0
- pixeltable/io/utils.py +100 -0
- pixeltable/iterators/__init__.py +8 -4
- pixeltable/iterators/audio.py +207 -0
- pixeltable/iterators/base.py +9 -3
- pixeltable/iterators/document.py +144 -87
- pixeltable/iterators/image.py +17 -38
- pixeltable/iterators/string.py +15 -12
- pixeltable/iterators/video.py +523 -127
- pixeltable/metadata/__init__.py +33 -8
- pixeltable/metadata/converters/convert_10.py +2 -3
- pixeltable/metadata/converters/convert_13.py +2 -2
- pixeltable/metadata/converters/convert_15.py +15 -11
- pixeltable/metadata/converters/convert_16.py +4 -5
- pixeltable/metadata/converters/convert_17.py +4 -5
- pixeltable/metadata/converters/convert_18.py +4 -6
- pixeltable/metadata/converters/convert_19.py +6 -9
- pixeltable/metadata/converters/convert_20.py +3 -6
- pixeltable/metadata/converters/convert_21.py +6 -8
- pixeltable/metadata/converters/convert_22.py +3 -2
- pixeltable/metadata/converters/convert_23.py +33 -0
- pixeltable/metadata/converters/convert_24.py +55 -0
- pixeltable/metadata/converters/convert_25.py +19 -0
- pixeltable/metadata/converters/convert_26.py +23 -0
- pixeltable/metadata/converters/convert_27.py +29 -0
- pixeltable/metadata/converters/convert_28.py +13 -0
- pixeltable/metadata/converters/convert_29.py +110 -0
- pixeltable/metadata/converters/convert_30.py +63 -0
- pixeltable/metadata/converters/convert_31.py +11 -0
- pixeltable/metadata/converters/convert_32.py +15 -0
- pixeltable/metadata/converters/convert_33.py +17 -0
- pixeltable/metadata/converters/convert_34.py +21 -0
- pixeltable/metadata/converters/convert_35.py +9 -0
- pixeltable/metadata/converters/convert_36.py +38 -0
- pixeltable/metadata/converters/convert_37.py +15 -0
- pixeltable/metadata/converters/convert_38.py +39 -0
- pixeltable/metadata/converters/convert_39.py +124 -0
- pixeltable/metadata/converters/convert_40.py +73 -0
- pixeltable/metadata/converters/convert_41.py +12 -0
- pixeltable/metadata/converters/convert_42.py +9 -0
- pixeltable/metadata/converters/convert_43.py +44 -0
- pixeltable/metadata/converters/util.py +44 -18
- pixeltable/metadata/notes.py +21 -0
- pixeltable/metadata/schema.py +185 -42
- pixeltable/metadata/utils.py +74 -0
- pixeltable/mypy/__init__.py +3 -0
- pixeltable/mypy/mypy_plugin.py +123 -0
- pixeltable/plan.py +616 -225
- pixeltable/share/__init__.py +3 -0
- pixeltable/share/packager.py +797 -0
- pixeltable/share/protocol/__init__.py +33 -0
- pixeltable/share/protocol/common.py +165 -0
- pixeltable/share/protocol/operation_types.py +33 -0
- pixeltable/share/protocol/replica.py +119 -0
- pixeltable/share/publish.py +349 -0
- pixeltable/store.py +398 -232
- pixeltable/type_system.py +730 -267
- pixeltable/utils/__init__.py +40 -0
- pixeltable/utils/arrow.py +201 -29
- pixeltable/utils/av.py +298 -0
- pixeltable/utils/azure_store.py +346 -0
- pixeltable/utils/coco.py +26 -27
- pixeltable/utils/code.py +4 -4
- pixeltable/utils/console_output.py +46 -0
- pixeltable/utils/coroutine.py +24 -0
- pixeltable/utils/dbms.py +92 -0
- pixeltable/utils/description_helper.py +11 -12
- pixeltable/utils/documents.py +60 -61
- pixeltable/utils/exception_handler.py +36 -0
- pixeltable/utils/filecache.py +38 -22
- pixeltable/utils/formatter.py +88 -51
- pixeltable/utils/gcs_store.py +295 -0
- pixeltable/utils/http.py +133 -0
- pixeltable/utils/http_server.py +14 -13
- pixeltable/utils/iceberg.py +13 -0
- pixeltable/utils/image.py +17 -0
- pixeltable/utils/lancedb.py +90 -0
- pixeltable/utils/local_store.py +322 -0
- pixeltable/utils/misc.py +5 -0
- pixeltable/utils/object_stores.py +573 -0
- pixeltable/utils/pydantic.py +60 -0
- pixeltable/utils/pytorch.py +20 -20
- pixeltable/utils/s3_store.py +527 -0
- pixeltable/utils/sql.py +32 -5
- pixeltable/utils/system.py +30 -0
- pixeltable/utils/transactional_directory.py +4 -3
- pixeltable-0.5.7.dist-info/METADATA +579 -0
- pixeltable-0.5.7.dist-info/RECORD +227 -0
- {pixeltable-0.2.26.dist-info → pixeltable-0.5.7.dist-info}/WHEEL +1 -1
- pixeltable-0.5.7.dist-info/entry_points.txt +2 -0
- pixeltable/__version__.py +0 -3
- pixeltable/catalog/named_function.py +0 -36
- pixeltable/catalog/path_dict.py +0 -141
- pixeltable/dataframe.py +0 -894
- pixeltable/exec/expr_eval_node.py +0 -232
- pixeltable/ext/__init__.py +0 -14
- pixeltable/ext/functions/__init__.py +0 -8
- pixeltable/ext/functions/whisperx.py +0 -77
- pixeltable/ext/functions/yolox.py +0 -157
- pixeltable/tool/create_test_db_dump.py +0 -311
- pixeltable/tool/create_test_video.py +0 -81
- pixeltable/tool/doc_plugins/griffe.py +0 -50
- pixeltable/tool/doc_plugins/mkdocstrings.py +0 -6
- pixeltable/tool/doc_plugins/templates/material/udf.html.jinja +0 -135
- pixeltable/tool/embed_udf.py +0 -9
- pixeltable/tool/mypy_plugin.py +0 -55
- pixeltable/utils/media_store.py +0 -76
- pixeltable/utils/s3.py +0 -16
- pixeltable-0.2.26.dist-info/METADATA +0 -400
- pixeltable-0.2.26.dist-info/RECORD +0 -156
- pixeltable-0.2.26.dist-info/entry_points.txt +0 -3
- {pixeltable-0.2.26.dist-info → pixeltable-0.5.7.dist-info/licenses}/LICENSE +0 -0
pixeltable/exec/exec_context.py
CHANGED
|
@@ -1,27 +1,42 @@
|
|
|
1
|
-
|
|
1
|
+
import random
|
|
2
2
|
|
|
3
3
|
import sqlalchemy as sql
|
|
4
4
|
|
|
5
|
-
|
|
5
|
+
from pixeltable import exprs
|
|
6
|
+
|
|
6
7
|
|
|
7
8
|
class ExecContext:
|
|
8
9
|
"""Class for execution runtime constants"""
|
|
10
|
+
|
|
11
|
+
row_builder: exprs.RowBuilder
|
|
12
|
+
profile: exprs.ExecProfile
|
|
13
|
+
show_pbar: bool
|
|
14
|
+
batch_size: int
|
|
15
|
+
num_rows: int | None
|
|
16
|
+
conn: sql.engine.Connection | None
|
|
17
|
+
pk_clause: list[sql.ClauseElement] | None
|
|
18
|
+
num_computed_exprs: int
|
|
19
|
+
ignore_errors: bool
|
|
20
|
+
random_seed: int # general-purpose source of randomness with execution scope
|
|
21
|
+
|
|
9
22
|
def __init__(
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
23
|
+
self,
|
|
24
|
+
row_builder: exprs.RowBuilder,
|
|
25
|
+
*,
|
|
26
|
+
show_pbar: bool = False,
|
|
27
|
+
batch_size: int = 0,
|
|
28
|
+
pk_clause: list[sql.ClauseElement] | None = None,
|
|
29
|
+
num_computed_exprs: int = 0,
|
|
30
|
+
ignore_errors: bool = False,
|
|
13
31
|
):
|
|
14
32
|
self.show_pbar = show_pbar
|
|
15
33
|
self.batch_size = batch_size
|
|
16
34
|
self.row_builder = row_builder
|
|
17
35
|
self.profile = exprs.ExecProfile(row_builder)
|
|
18
36
|
# num_rows is used to compute the total number of computed cells used for the progress bar
|
|
19
|
-
self.num_rows
|
|
20
|
-
self.conn
|
|
37
|
+
self.num_rows = None
|
|
38
|
+
self.conn = None # if present, use this to execute SQL queries
|
|
21
39
|
self.pk_clause = pk_clause
|
|
22
40
|
self.num_computed_exprs = num_computed_exprs
|
|
23
41
|
self.ignore_errors = ignore_errors
|
|
24
|
-
|
|
25
|
-
def set_conn(self, conn: sql.engine.Connection) -> None:
|
|
26
|
-
self.conn = conn
|
|
27
|
-
self.row_builder.set_conn(conn)
|
|
42
|
+
self.random_seed = random.randint(0, 1 << 63)
|
pixeltable/exec/exec_node.py
CHANGED
|
@@ -1,27 +1,35 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
3
|
import abc
|
|
4
|
-
|
|
4
|
+
import logging
|
|
5
|
+
from typing import AsyncIterator, Iterable, Iterator, TypeVar
|
|
5
6
|
|
|
6
|
-
|
|
7
|
+
from pixeltable import exprs
|
|
8
|
+
from pixeltable.env import Env
|
|
7
9
|
|
|
8
10
|
from .data_row_batch import DataRowBatch
|
|
9
11
|
from .exec_context import ExecContext
|
|
10
12
|
|
|
13
|
+
_logger = logging.getLogger('pixeltable')
|
|
14
|
+
|
|
11
15
|
|
|
12
16
|
class ExecNode(abc.ABC):
|
|
13
17
|
"""Base class of all execution nodes"""
|
|
18
|
+
|
|
14
19
|
output_exprs: Iterable[exprs.Expr]
|
|
15
20
|
row_builder: exprs.RowBuilder
|
|
16
|
-
input:
|
|
21
|
+
input: ExecNode | None
|
|
17
22
|
flushed_img_slots: list[int] # idxs of image slots of our output_exprs dependencies
|
|
18
|
-
|
|
19
|
-
ctx: Optional[ExecContext]
|
|
20
|
-
__iter: Optional[Iterator[DataRowBatch]]
|
|
23
|
+
ctx: ExecContext | None
|
|
21
24
|
|
|
22
25
|
def __init__(
|
|
23
|
-
|
|
24
|
-
|
|
26
|
+
self,
|
|
27
|
+
row_builder: exprs.RowBuilder,
|
|
28
|
+
output_exprs: Iterable[exprs.Expr],
|
|
29
|
+
input_exprs: Iterable[exprs.Expr],
|
|
30
|
+
input: ExecNode | None = None,
|
|
31
|
+
):
|
|
32
|
+
assert all(expr.is_valid for expr in output_exprs)
|
|
25
33
|
self.output_exprs = output_exprs
|
|
26
34
|
self.row_builder = row_builder
|
|
27
35
|
self.input = input
|
|
@@ -29,33 +37,33 @@ class ExecNode(abc.ABC):
|
|
|
29
37
|
output_slot_idxs = {e.slot_idx for e in output_exprs}
|
|
30
38
|
output_dependencies = row_builder.get_dependencies(output_exprs, exclude=input_exprs)
|
|
31
39
|
self.flushed_img_slots = [
|
|
32
|
-
e.slot_idx for e in output_dependencies
|
|
33
|
-
if e.col_type.is_image_type() and e.slot_idx not in output_slot_idxs
|
|
40
|
+
e.slot_idx for e in output_dependencies if e.col_type.is_image_type() and e.slot_idx not in output_slot_idxs
|
|
34
41
|
]
|
|
35
|
-
self.
|
|
36
|
-
self.ctx = None # all nodes of a tree share the same context
|
|
37
|
-
self.__iter = None
|
|
42
|
+
self.ctx = input.ctx if input is not None else None
|
|
38
43
|
|
|
39
44
|
def set_ctx(self, ctx: ExecContext) -> None:
|
|
40
45
|
self.ctx = ctx
|
|
41
46
|
if self.input is not None:
|
|
42
47
|
self.input.set_ctx(ctx)
|
|
43
48
|
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
if self.input is not None:
|
|
48
|
-
self.input.set_stored_img_cols(stored_img_cols)
|
|
49
|
+
@abc.abstractmethod
|
|
50
|
+
def __aiter__(self) -> AsyncIterator[DataRowBatch]:
|
|
51
|
+
pass
|
|
49
52
|
|
|
50
|
-
# TODO: make this an abstractmethod when __next__() is removed
|
|
51
53
|
def __iter__(self) -> Iterator[DataRowBatch]:
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
54
|
+
loop = Env.get().event_loop
|
|
55
|
+
aiter = self.__aiter__()
|
|
56
|
+
try:
|
|
57
|
+
while True:
|
|
58
|
+
batch: DataRowBatch = loop.run_until_complete(aiter.__anext__())
|
|
59
|
+
yield batch
|
|
60
|
+
except StopAsyncIteration:
|
|
61
|
+
pass
|
|
62
|
+
# TODO:
|
|
63
|
+
# - we seem to have some tasks that aren't accounted for by ExprEvalNode and don't get cancelled by the time
|
|
64
|
+
# we end up here
|
|
65
|
+
# - however, blindly cancelling all pending tasks doesn't work when running in a jupyter environment, which
|
|
66
|
+
# creates tasks on its own
|
|
59
67
|
|
|
60
68
|
def open(self) -> None:
|
|
61
69
|
"""Bottom-up initialization of nodes for execution. Must be called before __next__."""
|
|
@@ -77,7 +85,7 @@ class ExecNode(abc.ABC):
|
|
|
77
85
|
|
|
78
86
|
T = TypeVar('T', bound='ExecNode')
|
|
79
87
|
|
|
80
|
-
def get_node(self, node_class: type[T]) ->
|
|
88
|
+
def get_node(self, node_class: type[T]) -> T | None:
|
|
81
89
|
if isinstance(self, node_class):
|
|
82
90
|
return self
|
|
83
91
|
if self.input is not None:
|
|
@@ -0,0 +1,365 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
import datetime
|
|
5
|
+
import itertools
|
|
6
|
+
import logging
|
|
7
|
+
import sys
|
|
8
|
+
from typing import Any, Callable, Iterator, cast
|
|
9
|
+
|
|
10
|
+
from pixeltable import exprs, func
|
|
11
|
+
|
|
12
|
+
from .globals import Dispatcher, Evaluator, ExecCtx, FnCallArgs
|
|
13
|
+
|
|
14
|
+
_logger = logging.getLogger('pixeltable')
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class DefaultExprEvaluator(Evaluator):
|
|
18
|
+
"""
|
|
19
|
+
Standard expression evaluation using Expr.eval().
|
|
20
|
+
|
|
21
|
+
Creates one task per set of rows handed to schedule().
|
|
22
|
+
|
|
23
|
+
TODO:
|
|
24
|
+
- parallelize via Ray
|
|
25
|
+
"""
|
|
26
|
+
|
|
27
|
+
e: exprs.Expr
|
|
28
|
+
|
|
29
|
+
def __init__(self, e: exprs.Expr, dispatcher: Dispatcher, exec_ctx: ExecCtx):
|
|
30
|
+
super().__init__(dispatcher, exec_ctx)
|
|
31
|
+
self.e = e
|
|
32
|
+
|
|
33
|
+
def schedule(self, rows: list[exprs.DataRow], slot_idx: int) -> None:
|
|
34
|
+
assert self.e.slot_idx >= 0
|
|
35
|
+
task = asyncio.create_task(self.eval(rows))
|
|
36
|
+
self.dispatcher.register_task(task)
|
|
37
|
+
|
|
38
|
+
async def eval(self, rows: list[exprs.DataRow]) -> None:
|
|
39
|
+
rows_with_excs: set[int] = set() # records idxs into rows
|
|
40
|
+
for idx, row in enumerate(rows):
|
|
41
|
+
assert not row.has_val[self.e.slot_idx] and not row.has_exc(self.e.slot_idx)
|
|
42
|
+
if asyncio.current_task().cancelled() or self.dispatcher.exc_event.is_set():
|
|
43
|
+
return
|
|
44
|
+
try:
|
|
45
|
+
self.e.eval(row, self.dispatcher.row_builder)
|
|
46
|
+
except Exception as exc:
|
|
47
|
+
_, _, exc_tb = sys.exc_info()
|
|
48
|
+
row.set_exc(self.e.slot_idx, exc)
|
|
49
|
+
rows_with_excs.add(idx)
|
|
50
|
+
self.dispatcher.dispatch_exc([row], self.e.slot_idx, exc_tb, self.exec_ctx)
|
|
51
|
+
self.dispatcher.dispatch([rows[i] for i in range(len(rows)) if i not in rows_with_excs], self.exec_ctx)
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
class FnCallEvaluator(Evaluator):
|
|
55
|
+
"""
|
|
56
|
+
Evaluates function calls:
|
|
57
|
+
- batched functions (sync and async): one task per batch
|
|
58
|
+
- async functions: one task per row
|
|
59
|
+
- the rest: one task per set of rows handed to schedule()
|
|
60
|
+
|
|
61
|
+
TODO:
|
|
62
|
+
- adaptive batching: finding the optimal batch size based on observed execution times
|
|
63
|
+
"""
|
|
64
|
+
|
|
65
|
+
fn_call: exprs.FunctionCall
|
|
66
|
+
fn: func.CallableFunction
|
|
67
|
+
scalar_py_fn: Callable | None # only set for non-batching CallableFunctions
|
|
68
|
+
|
|
69
|
+
# only set if fn.is_batched
|
|
70
|
+
call_args_queue: asyncio.Queue[FnCallArgs] | None # FnCallArgs waiting for execution
|
|
71
|
+
batch_size: int | None
|
|
72
|
+
|
|
73
|
+
def __init__(self, fn_call: exprs.FunctionCall, dispatcher: Dispatcher, exec_ctx: ExecCtx):
|
|
74
|
+
super().__init__(dispatcher, exec_ctx)
|
|
75
|
+
self.fn_call = fn_call
|
|
76
|
+
self.fn = cast(func.CallableFunction, fn_call.fn)
|
|
77
|
+
if isinstance(self.fn, func.CallableFunction) and self.fn.is_batched:
|
|
78
|
+
self.call_args_queue = asyncio.Queue[FnCallArgs]()
|
|
79
|
+
# we're not supplying sample arguments there, they're ignored anyway
|
|
80
|
+
self.batch_size = self.fn.get_batch_size()
|
|
81
|
+
self.scalar_py_fn = None
|
|
82
|
+
else:
|
|
83
|
+
self.call_args_queue = None
|
|
84
|
+
self.batch_size = None
|
|
85
|
+
if isinstance(self.fn, func.CallableFunction):
|
|
86
|
+
self.scalar_py_fn = self.fn.py_fn
|
|
87
|
+
else:
|
|
88
|
+
self.scalar_py_fn = None
|
|
89
|
+
|
|
90
|
+
def schedule(self, rows: list[exprs.DataRow], slot_idx: int) -> None:
|
|
91
|
+
assert self.fn_call.slot_idx >= 0
|
|
92
|
+
|
|
93
|
+
# create FnCallArgs for incoming rows
|
|
94
|
+
skip_rows: list[exprs.DataRow] = [] # skip rows with Nones in non-nullable parameters
|
|
95
|
+
rows_call_args: list[FnCallArgs] = []
|
|
96
|
+
for row in rows:
|
|
97
|
+
args_kwargs = self.fn_call.make_args(row)
|
|
98
|
+
if args_kwargs is None:
|
|
99
|
+
# nothing to do here
|
|
100
|
+
row[self.fn_call.slot_idx] = None
|
|
101
|
+
skip_rows.append(row)
|
|
102
|
+
else:
|
|
103
|
+
args, kwargs = args_kwargs
|
|
104
|
+
rows_call_args.append(FnCallArgs(self.fn_call, [row], args=args, kwargs=kwargs))
|
|
105
|
+
|
|
106
|
+
if len(skip_rows) > 0:
|
|
107
|
+
self.dispatcher.dispatch(skip_rows, self.exec_ctx)
|
|
108
|
+
|
|
109
|
+
if self.batch_size is not None:
|
|
110
|
+
if not self.is_closed and (len(rows_call_args) + self.call_args_queue.qsize() < self.batch_size):
|
|
111
|
+
# we don't have enough FnCallArgs for a batch, so add them to the queue
|
|
112
|
+
for item in rows_call_args:
|
|
113
|
+
self.call_args_queue.put_nowait(item)
|
|
114
|
+
return
|
|
115
|
+
|
|
116
|
+
# create one task per batch
|
|
117
|
+
combined_call_args = itertools.chain(self._queued_call_args_iter(), rows_call_args)
|
|
118
|
+
while True:
|
|
119
|
+
call_args_batch = list(itertools.islice(combined_call_args, self.batch_size))
|
|
120
|
+
if len(call_args_batch) == 0:
|
|
121
|
+
break
|
|
122
|
+
if len(call_args_batch) < self.batch_size and not self.is_closed:
|
|
123
|
+
# we don't have a full batch left: return the rest to the queue
|
|
124
|
+
assert self.call_args_queue.empty() # we saw all queued items
|
|
125
|
+
for item in call_args_batch:
|
|
126
|
+
self.call_args_queue.put_nowait(item)
|
|
127
|
+
return
|
|
128
|
+
|
|
129
|
+
# turn call_args_batch into a single batched FnCallArgs
|
|
130
|
+
_logger.debug(f'Creating batch of size {len(call_args_batch)} for slot {slot_idx}')
|
|
131
|
+
batched_call_args = self._create_batch_call_args(call_args_batch)
|
|
132
|
+
if self.fn_call.resource_pool is not None:
|
|
133
|
+
# hand the call off to the resource pool's scheduler
|
|
134
|
+
scheduler = self.dispatcher.schedulers[self.fn_call.resource_pool]
|
|
135
|
+
scheduler.submit(batched_call_args, self.exec_ctx)
|
|
136
|
+
else:
|
|
137
|
+
task = asyncio.create_task(self.eval_batch(batched_call_args))
|
|
138
|
+
self.dispatcher.register_task(task)
|
|
139
|
+
|
|
140
|
+
elif self.fn.is_async:
|
|
141
|
+
if self.fn_call.resource_pool is not None:
|
|
142
|
+
# hand the call off to the resource pool's scheduler
|
|
143
|
+
scheduler = self.dispatcher.schedulers[self.fn_call.resource_pool]
|
|
144
|
+
for item in rows_call_args:
|
|
145
|
+
scheduler.submit(item, self.exec_ctx)
|
|
146
|
+
else:
|
|
147
|
+
# create one task per call
|
|
148
|
+
for item in rows_call_args:
|
|
149
|
+
task = asyncio.create_task(self.eval_async(item))
|
|
150
|
+
self.dispatcher.register_task(task)
|
|
151
|
+
|
|
152
|
+
else:
|
|
153
|
+
# create a single task for all rows
|
|
154
|
+
task = asyncio.create_task(self.eval(rows_call_args))
|
|
155
|
+
self.dispatcher.register_task(task)
|
|
156
|
+
|
|
157
|
+
def _queued_call_args_iter(self) -> Iterator[FnCallArgs]:
|
|
158
|
+
while not self.call_args_queue.empty():
|
|
159
|
+
yield self.call_args_queue.get_nowait()
|
|
160
|
+
|
|
161
|
+
def _create_batch_call_args(self, call_args: list[FnCallArgs]) -> FnCallArgs:
|
|
162
|
+
"""Roll call_args into a single batched FnCallArgs"""
|
|
163
|
+
batch_args: list[list[Any | None]] = [[None] * len(call_args) for _ in range(len(self.fn_call.arg_idxs))]
|
|
164
|
+
batch_kwargs: dict[str, list[Any | None]] = {k: [None] * len(call_args) for k in self.fn_call.kwarg_idxs}
|
|
165
|
+
assert isinstance(self.fn, func.CallableFunction)
|
|
166
|
+
for i, item in enumerate(call_args):
|
|
167
|
+
for j in range(len(item.args)):
|
|
168
|
+
batch_args[j][i] = item.args[j]
|
|
169
|
+
for k in item.kwargs:
|
|
170
|
+
batch_kwargs[k][i] = item.kwargs[k]
|
|
171
|
+
return FnCallArgs(
|
|
172
|
+
self.fn_call, [item.row for item in call_args], batch_args=batch_args, batch_kwargs=batch_kwargs
|
|
173
|
+
)
|
|
174
|
+
|
|
175
|
+
async def eval_batch(self, batched_call_args: FnCallArgs) -> None:
|
|
176
|
+
result_batch: list[Any]
|
|
177
|
+
try:
|
|
178
|
+
if self.fn.is_async:
|
|
179
|
+
result_batch = await self.fn.aexec_batch(
|
|
180
|
+
*batched_call_args.batch_args, **batched_call_args.batch_kwargs
|
|
181
|
+
)
|
|
182
|
+
else:
|
|
183
|
+
# check for cancellation before starting something potentially long-running
|
|
184
|
+
if asyncio.current_task().cancelled() or self.dispatcher.exc_event.is_set():
|
|
185
|
+
return
|
|
186
|
+
result_batch = self.fn.exec_batch(batched_call_args.batch_args, batched_call_args.batch_kwargs)
|
|
187
|
+
except Exception as exc:
|
|
188
|
+
_, _, exc_tb = sys.exc_info()
|
|
189
|
+
for row in batched_call_args.rows:
|
|
190
|
+
row.set_exc(self.fn_call.slot_idx, exc)
|
|
191
|
+
self.dispatcher.dispatch_exc(batched_call_args.rows, self.fn_call.slot_idx, exc_tb, self.exec_ctx)
|
|
192
|
+
return
|
|
193
|
+
|
|
194
|
+
for i, row in enumerate(batched_call_args.rows):
|
|
195
|
+
row[self.fn_call.slot_idx] = result_batch[i]
|
|
196
|
+
self.dispatcher.dispatch(batched_call_args.rows, self.exec_ctx)
|
|
197
|
+
|
|
198
|
+
async def eval_async(self, call_args: FnCallArgs) -> None:
|
|
199
|
+
assert len(call_args.rows) == 1
|
|
200
|
+
assert not call_args.row.has_val[self.fn_call.slot_idx]
|
|
201
|
+
assert not call_args.row.has_exc(self.fn_call.slot_idx)
|
|
202
|
+
|
|
203
|
+
try:
|
|
204
|
+
start_ts = datetime.datetime.now()
|
|
205
|
+
_logger.debug(f'Start evaluating slot {self.fn_call.slot_idx}')
|
|
206
|
+
call_args.row[self.fn_call.slot_idx] = await self.fn.aexec(*call_args.args, **call_args.kwargs)
|
|
207
|
+
end_ts = datetime.datetime.now()
|
|
208
|
+
_logger.debug(f'Evaluated slot {self.fn_call.slot_idx} in {end_ts - start_ts}')
|
|
209
|
+
self.dispatcher.dispatch([call_args.row], self.exec_ctx)
|
|
210
|
+
except Exception as exc:
|
|
211
|
+
_, _, exc_tb = sys.exc_info()
|
|
212
|
+
call_args.row.set_exc(self.fn_call.slot_idx, exc)
|
|
213
|
+
self.dispatcher.dispatch_exc(call_args.rows, self.fn_call.slot_idx, exc_tb, self.exec_ctx)
|
|
214
|
+
|
|
215
|
+
async def eval(self, call_args_batch: list[FnCallArgs]) -> None:
|
|
216
|
+
rows_with_excs: set[int] = set() # records idxs into 'rows'
|
|
217
|
+
for idx, item in enumerate(call_args_batch):
|
|
218
|
+
assert len(item.rows) == 1
|
|
219
|
+
assert not item.row.has_val[self.fn_call.slot_idx]
|
|
220
|
+
assert not item.row.has_exc(self.fn_call.slot_idx)
|
|
221
|
+
# check for cancellation before starting something potentially long-running
|
|
222
|
+
if asyncio.current_task().cancelled() or self.dispatcher.exc_event.is_set():
|
|
223
|
+
return
|
|
224
|
+
try:
|
|
225
|
+
item.row[self.fn_call.slot_idx] = self.scalar_py_fn(*item.args, **item.kwargs)
|
|
226
|
+
except Exception as exc:
|
|
227
|
+
_, _, exc_tb = sys.exc_info()
|
|
228
|
+
item.row.set_exc(self.fn_call.slot_idx, exc)
|
|
229
|
+
rows_with_excs.add(idx)
|
|
230
|
+
self.dispatcher.dispatch_exc(item.rows, self.fn_call.slot_idx, exc_tb, self.exec_ctx)
|
|
231
|
+
self.dispatcher.dispatch(
|
|
232
|
+
[call_args_batch[i].row for i in range(len(call_args_batch)) if i not in rows_with_excs], self.exec_ctx
|
|
233
|
+
)
|
|
234
|
+
|
|
235
|
+
def _close(self) -> None:
|
|
236
|
+
"""Create a task for the incomplete batch of queued FnCallArgs, if any"""
|
|
237
|
+
_logger.debug(f'FnCallEvaluator.close(): slot_idx={self.fn_call.slot_idx}')
|
|
238
|
+
if self.call_args_queue is None or self.call_args_queue.empty():
|
|
239
|
+
return
|
|
240
|
+
batched_call_args = self._create_batch_call_args(list(self._queued_call_args_iter()))
|
|
241
|
+
task = asyncio.create_task(self.eval_batch(batched_call_args))
|
|
242
|
+
self.dispatcher.register_task(task)
|
|
243
|
+
|
|
244
|
+
|
|
245
|
+
class NestedRowList:
|
|
246
|
+
"""
|
|
247
|
+
A list of nested rows, used by JsonMapperDispatcher to store the rows corresponding to the elements of the
|
|
248
|
+
JsonMapper source list and make completion awaitable.
|
|
249
|
+
"""
|
|
250
|
+
|
|
251
|
+
rows: list[exprs.DataRow]
|
|
252
|
+
num_completed: int
|
|
253
|
+
completion: asyncio.Event
|
|
254
|
+
|
|
255
|
+
def __init__(self, rows: list[exprs.DataRow]):
|
|
256
|
+
self.num_completed = 0
|
|
257
|
+
self.rows = rows
|
|
258
|
+
self.completion = asyncio.Event()
|
|
259
|
+
|
|
260
|
+
def complete_row(self) -> None:
|
|
261
|
+
self.num_completed += 1
|
|
262
|
+
if self.num_completed == len(self.rows):
|
|
263
|
+
self.completion.set()
|
|
264
|
+
|
|
265
|
+
|
|
266
|
+
class JsonMapperDispatcher(Evaluator):
|
|
267
|
+
"""
|
|
268
|
+
The execution logic for materializing the nested DataRows of a JsonMapper/JsonMapperDispatch.
|
|
269
|
+
|
|
270
|
+
The rows are stored in a NestedRowList, which itself is stored in the JsonMapperDispatch instance's slot.
|
|
271
|
+
"""
|
|
272
|
+
|
|
273
|
+
e: exprs.JsonMapperDispatch
|
|
274
|
+
target_expr: exprs.Expr
|
|
275
|
+
scope_anchor: exprs.ObjectRef
|
|
276
|
+
nested_exec_ctx: ExecCtx # ExecCtx needed to evaluate the nested rows
|
|
277
|
+
external_slot_map: dict[int, int] # slot idx in parent row -> slot idx in nested row
|
|
278
|
+
has_async_calls: bool # True if target_expr contains any async FunctionCalls
|
|
279
|
+
|
|
280
|
+
def __init__(self, e: exprs.JsonMapperDispatch, dispatcher: Dispatcher, exec_ctx: ExecCtx):
|
|
281
|
+
super().__init__(dispatcher, exec_ctx)
|
|
282
|
+
self.e = e
|
|
283
|
+
self.target_expr = e.target_expr.copy() # we need new slot idxs
|
|
284
|
+
self.scope_anchor = e.scope_anchor.copy()
|
|
285
|
+
nested_row_builder = exprs.RowBuilder(output_exprs=[self.target_expr], columns=[], input_exprs=[])
|
|
286
|
+
nested_row_builder.set_slot_idxs([self.target_expr, self.scope_anchor])
|
|
287
|
+
target_expr_ctx = nested_row_builder.create_eval_ctx([self.target_expr], limit_scope=True)
|
|
288
|
+
self.has_async_calls = any(isinstance(e, exprs.FunctionCall) and e.is_async for e in target_expr_ctx.exprs)
|
|
289
|
+
target_scope = self.target_expr.scope()
|
|
290
|
+
# we need to pre-populate nested rows with slot values that are produced in an outer scope (literals excluded)
|
|
291
|
+
parent_exprs = [
|
|
292
|
+
e for e in target_expr_ctx.exprs if e.scope() != target_scope and not isinstance(e, exprs.Literal)
|
|
293
|
+
]
|
|
294
|
+
self.external_slot_map = {exec_ctx.row_builder.unique_exprs[e].slot_idx: e.slot_idx for e in parent_exprs}
|
|
295
|
+
self.nested_exec_ctx = ExecCtx(dispatcher, nested_row_builder, [self.target_expr], parent_exprs)
|
|
296
|
+
|
|
297
|
+
def schedule(self, rows: list[exprs.DataRow], slot_idx: int) -> None:
|
|
298
|
+
"""Create nested rows for all source list elements and dispatch them"""
|
|
299
|
+
assert self.e.slot_idx >= 0
|
|
300
|
+
all_nested_rows: list[exprs.DataRow] = []
|
|
301
|
+
for row in rows:
|
|
302
|
+
src = row[self.e.src_expr.slot_idx]
|
|
303
|
+
if not isinstance(src, list):
|
|
304
|
+
# invalid/non-list src path
|
|
305
|
+
row[self.e.slot_idx] = None
|
|
306
|
+
continue
|
|
307
|
+
|
|
308
|
+
nested_rows = [
|
|
309
|
+
exprs.DataRow(
|
|
310
|
+
size=self.nested_exec_ctx.row_builder.num_materialized,
|
|
311
|
+
img_slot_idxs=[],
|
|
312
|
+
media_slot_idxs=[],
|
|
313
|
+
array_slot_idxs=[],
|
|
314
|
+
json_slot_idxs=[],
|
|
315
|
+
parent_row=row,
|
|
316
|
+
parent_slot_idx=self.e.slot_idx,
|
|
317
|
+
)
|
|
318
|
+
for _ in src
|
|
319
|
+
]
|
|
320
|
+
for nested_row, anchor_val in zip(nested_rows, src):
|
|
321
|
+
# It's possible that self.scope_anchor.slot_idx is None; this corresponds to the case where the
|
|
322
|
+
# mapper expression doesn't actually contain references to RELATIVE_PATH_ROOT.
|
|
323
|
+
if self.scope_anchor.slot_idx is not None:
|
|
324
|
+
nested_row[self.scope_anchor.slot_idx] = anchor_val
|
|
325
|
+
for slot_idx_, nested_slot_idx in self.external_slot_map.items():
|
|
326
|
+
nested_row[nested_slot_idx] = row[slot_idx_]
|
|
327
|
+
self.nested_exec_ctx.init_rows(nested_rows)
|
|
328
|
+
|
|
329
|
+
# we modify DataRow.vals here directly, rather than going through __getitem__(), because we don't have
|
|
330
|
+
# an official "value" yet (the nested rows are not yet materialized)
|
|
331
|
+
row.vals[self.e.slot_idx] = NestedRowList(nested_rows)
|
|
332
|
+
all_nested_rows.extend(nested_rows)
|
|
333
|
+
|
|
334
|
+
self.dispatcher.dispatch(all_nested_rows, self.nested_exec_ctx)
|
|
335
|
+
task = asyncio.create_task(self.gather(rows))
|
|
336
|
+
self.dispatcher.register_task(task)
|
|
337
|
+
|
|
338
|
+
async def gather(self, rows: list[exprs.DataRow]) -> None:
|
|
339
|
+
"""Wait for nested rows to complete, then signal completion to the parent rows"""
|
|
340
|
+
if self.has_async_calls:
|
|
341
|
+
# if our target expr contains async FunctionCalls, they typically get completed out-of-order, and it's
|
|
342
|
+
# more effective to dispatch them as they complete
|
|
343
|
+
remaining = {
|
|
344
|
+
asyncio.create_task(row.vals[self.e.slot_idx].completion.wait()): row
|
|
345
|
+
for row in rows
|
|
346
|
+
if not row.has_val[self.e.slot_idx]
|
|
347
|
+
}
|
|
348
|
+
while len(remaining) > 0:
|
|
349
|
+
done, _ = await asyncio.wait(remaining.keys(), return_when=asyncio.FIRST_COMPLETED)
|
|
350
|
+
done_rows = [remaining.pop(task) for task in done]
|
|
351
|
+
for row in done_rows:
|
|
352
|
+
row.has_val[self.e.slot_idx] = True
|
|
353
|
+
self.dispatcher.dispatch(done_rows, self.exec_ctx)
|
|
354
|
+
|
|
355
|
+
else:
|
|
356
|
+
# our target expr doesn't contain async FunctionCalls, which means they will get completed in-order
|
|
357
|
+
for row in rows:
|
|
358
|
+
if row.has_val[self.e.slot_idx]:
|
|
359
|
+
# the source_expr's value is not a list
|
|
360
|
+
assert row.vals[self.e.slot_idx] is None
|
|
361
|
+
continue
|
|
362
|
+
assert row.vals[self.e.slot_idx] is not None and isinstance(row.vals[self.e.slot_idx], NestedRowList)
|
|
363
|
+
await row.vals[self.e.slot_idx].completion.wait()
|
|
364
|
+
row.has_val[self.e.slot_idx] = True
|
|
365
|
+
self.dispatcher.dispatch(rows, self.exec_ctx)
|