pixeltable-0.4.17-py3-none-any.whl → pixeltable-0.4.19-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pixeltable might be problematic. Click here for more details.
- pixeltable/__init__.py +1 -1
- pixeltable/_version.py +1 -0
- pixeltable/catalog/catalog.py +144 -118
- pixeltable/catalog/column.py +104 -115
- pixeltable/catalog/globals.py +1 -2
- pixeltable/catalog/insertable_table.py +44 -49
- pixeltable/catalog/path.py +3 -4
- pixeltable/catalog/schema_object.py +4 -4
- pixeltable/catalog/table.py +139 -124
- pixeltable/catalog/table_metadata.py +6 -6
- pixeltable/catalog/table_version.py +315 -246
- pixeltable/catalog/table_version_handle.py +4 -4
- pixeltable/catalog/table_version_path.py +9 -10
- pixeltable/catalog/tbl_ops.py +9 -3
- pixeltable/catalog/view.py +34 -28
- pixeltable/config.py +14 -10
- pixeltable/dataframe.py +69 -78
- pixeltable/env.py +78 -64
- pixeltable/exec/aggregation_node.py +6 -6
- pixeltable/exec/cache_prefetch_node.py +10 -10
- pixeltable/exec/data_row_batch.py +3 -3
- pixeltable/exec/exec_context.py +16 -4
- pixeltable/exec/exec_node.py +5 -5
- pixeltable/exec/expr_eval/evaluators.py +6 -6
- pixeltable/exec/expr_eval/expr_eval_node.py +8 -7
- pixeltable/exec/expr_eval/globals.py +6 -6
- pixeltable/exec/expr_eval/row_buffer.py +1 -2
- pixeltable/exec/expr_eval/schedulers.py +11 -11
- pixeltable/exec/in_memory_data_node.py +2 -2
- pixeltable/exec/object_store_save_node.py +14 -17
- pixeltable/exec/sql_node.py +28 -27
- pixeltable/exprs/arithmetic_expr.py +4 -4
- pixeltable/exprs/array_slice.py +2 -2
- pixeltable/exprs/column_property_ref.py +3 -3
- pixeltable/exprs/column_ref.py +61 -74
- pixeltable/exprs/comparison.py +5 -5
- pixeltable/exprs/compound_predicate.py +3 -3
- pixeltable/exprs/data_row.py +12 -12
- pixeltable/exprs/expr.py +41 -31
- pixeltable/exprs/expr_dict.py +3 -3
- pixeltable/exprs/expr_set.py +3 -3
- pixeltable/exprs/function_call.py +14 -14
- pixeltable/exprs/in_predicate.py +4 -4
- pixeltable/exprs/inline_expr.py +8 -8
- pixeltable/exprs/is_null.py +1 -3
- pixeltable/exprs/json_mapper.py +8 -8
- pixeltable/exprs/json_path.py +6 -6
- pixeltable/exprs/literal.py +5 -5
- pixeltable/exprs/method_ref.py +2 -2
- pixeltable/exprs/object_ref.py +2 -2
- pixeltable/exprs/row_builder.py +14 -14
- pixeltable/exprs/rowid_ref.py +8 -8
- pixeltable/exprs/similarity_expr.py +50 -25
- pixeltable/exprs/sql_element_cache.py +4 -4
- pixeltable/exprs/string_op.py +2 -2
- pixeltable/exprs/type_cast.py +3 -5
- pixeltable/func/aggregate_function.py +8 -8
- pixeltable/func/callable_function.py +9 -9
- pixeltable/func/expr_template_function.py +3 -3
- pixeltable/func/function.py +15 -17
- pixeltable/func/function_registry.py +6 -7
- pixeltable/func/globals.py +2 -3
- pixeltable/func/mcp.py +2 -2
- pixeltable/func/query_template_function.py +16 -16
- pixeltable/func/signature.py +14 -14
- pixeltable/func/tools.py +11 -11
- pixeltable/func/udf.py +16 -18
- pixeltable/functions/__init__.py +1 -0
- pixeltable/functions/anthropic.py +7 -7
- pixeltable/functions/audio.py +76 -0
- pixeltable/functions/bedrock.py +6 -6
- pixeltable/functions/deepseek.py +4 -4
- pixeltable/functions/fireworks.py +2 -2
- pixeltable/functions/gemini.py +6 -6
- pixeltable/functions/globals.py +12 -12
- pixeltable/functions/groq.py +4 -4
- pixeltable/functions/huggingface.py +1033 -6
- pixeltable/functions/image.py +7 -10
- pixeltable/functions/llama_cpp.py +7 -7
- pixeltable/functions/math.py +2 -3
- pixeltable/functions/mistralai.py +3 -3
- pixeltable/functions/ollama.py +9 -9
- pixeltable/functions/openai.py +21 -21
- pixeltable/functions/openrouter.py +7 -7
- pixeltable/functions/string.py +21 -28
- pixeltable/functions/timestamp.py +7 -8
- pixeltable/functions/together.py +4 -6
- pixeltable/functions/twelvelabs.py +92 -0
- pixeltable/functions/video.py +36 -31
- pixeltable/functions/vision.py +6 -6
- pixeltable/functions/whisper.py +7 -7
- pixeltable/functions/whisperx.py +16 -16
- pixeltable/globals.py +75 -40
- pixeltable/index/base.py +12 -8
- pixeltable/index/btree.py +19 -22
- pixeltable/index/embedding_index.py +30 -39
- pixeltable/io/datarows.py +3 -3
- pixeltable/io/external_store.py +13 -16
- pixeltable/io/fiftyone.py +5 -5
- pixeltable/io/globals.py +5 -5
- pixeltable/io/hf_datasets.py +4 -4
- pixeltable/io/label_studio.py +12 -12
- pixeltable/io/pandas.py +6 -6
- pixeltable/io/parquet.py +2 -2
- pixeltable/io/table_data_conduit.py +12 -12
- pixeltable/io/utils.py +2 -2
- pixeltable/iterators/audio.py +2 -2
- pixeltable/iterators/document.py +88 -57
- pixeltable/iterators/video.py +66 -37
- pixeltable/metadata/converters/convert_18.py +2 -2
- pixeltable/metadata/converters/convert_19.py +2 -2
- pixeltable/metadata/converters/convert_20.py +2 -2
- pixeltable/metadata/converters/convert_21.py +2 -2
- pixeltable/metadata/converters/convert_22.py +2 -2
- pixeltable/metadata/converters/convert_24.py +2 -2
- pixeltable/metadata/converters/convert_25.py +2 -2
- pixeltable/metadata/converters/convert_26.py +2 -2
- pixeltable/metadata/converters/convert_29.py +4 -4
- pixeltable/metadata/converters/convert_34.py +2 -2
- pixeltable/metadata/converters/convert_36.py +2 -2
- pixeltable/metadata/converters/convert_38.py +2 -2
- pixeltable/metadata/converters/convert_39.py +1 -2
- pixeltable/metadata/converters/util.py +11 -13
- pixeltable/metadata/schema.py +22 -21
- pixeltable/metadata/utils.py +2 -6
- pixeltable/mypy/mypy_plugin.py +5 -5
- pixeltable/plan.py +32 -34
- pixeltable/share/packager.py +7 -7
- pixeltable/share/publish.py +3 -3
- pixeltable/store.py +126 -41
- pixeltable/type_system.py +43 -46
- pixeltable/utils/__init__.py +1 -2
- pixeltable/utils/arrow.py +4 -4
- pixeltable/utils/av.py +74 -38
- pixeltable/utils/azure_store.py +305 -0
- pixeltable/utils/code.py +1 -2
- pixeltable/utils/dbms.py +15 -19
- pixeltable/utils/description_helper.py +2 -3
- pixeltable/utils/documents.py +5 -6
- pixeltable/utils/exception_handler.py +2 -2
- pixeltable/utils/filecache.py +5 -5
- pixeltable/utils/formatter.py +4 -6
- pixeltable/utils/gcs_store.py +9 -9
- pixeltable/utils/local_store.py +17 -17
- pixeltable/utils/object_stores.py +59 -43
- pixeltable/utils/s3_store.py +35 -30
- {pixeltable-0.4.17.dist-info → pixeltable-0.4.19.dist-info}/METADATA +4 -4
- pixeltable-0.4.19.dist-info/RECORD +213 -0
- pixeltable/__version__.py +0 -3
- pixeltable-0.4.17.dist-info/RECORD +0 -211
- {pixeltable-0.4.17.dist-info → pixeltable-0.4.19.dist-info}/WHEEL +0 -0
- {pixeltable-0.4.17.dist-info → pixeltable-0.4.19.dist-info}/entry_points.txt +0 -0
- {pixeltable-0.4.17.dist-info → pixeltable-0.4.19.dist-info}/licenses/LICENSE +0 -0
|
@@ -4,7 +4,7 @@ import abc
|
|
|
4
4
|
import asyncio
|
|
5
5
|
from dataclasses import dataclass
|
|
6
6
|
from types import TracebackType
|
|
7
|
-
from typing import Any, Iterable,
|
|
7
|
+
from typing import Any, Iterable, Protocol
|
|
8
8
|
|
|
9
9
|
import numpy as np
|
|
10
10
|
|
|
@@ -18,11 +18,11 @@ class FnCallArgs:
|
|
|
18
18
|
fn_call: exprs.FunctionCall
|
|
19
19
|
rows: list[exprs.DataRow]
|
|
20
20
|
# single call
|
|
21
|
-
args:
|
|
22
|
-
kwargs:
|
|
21
|
+
args: list[Any] | None = None
|
|
22
|
+
kwargs: dict[str, Any] | None = None
|
|
23
23
|
# batch call
|
|
24
|
-
batch_args:
|
|
25
|
-
batch_kwargs:
|
|
24
|
+
batch_args: list[list[Any | None]] | None = None
|
|
25
|
+
batch_kwargs: dict[str, list[Any | None]] | None = None
|
|
26
26
|
|
|
27
27
|
@property
|
|
28
28
|
def pxt_fn(self) -> func.CallableFunction:
|
|
@@ -56,7 +56,7 @@ class Scheduler(abc.ABC):
|
|
|
56
56
|
request: FnCallArgs
|
|
57
57
|
num_retries: int
|
|
58
58
|
exec_ctx: ExecCtx
|
|
59
|
-
retry_after:
|
|
59
|
+
retry_after: float | None = None # time.monotonic()
|
|
60
60
|
|
|
61
61
|
def __lt__(self, other: Scheduler.QueueItem) -> bool:
|
|
62
62
|
# prioritize by number of retries (more retries = higher priority)
|
|
@@ -1,7 +1,6 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
3
|
import logging
|
|
4
|
-
from typing import Optional
|
|
5
4
|
|
|
6
5
|
import numpy as np
|
|
7
6
|
|
|
@@ -14,7 +13,7 @@ class RowBuffer:
|
|
|
14
13
|
"""Fixed-length circular buffer of DataRows; knows how to maintain input order"""
|
|
15
14
|
|
|
16
15
|
size: int
|
|
17
|
-
row_pos_map:
|
|
16
|
+
row_pos_map: dict[int, int] | None # id(row) -> position of row in output; None if not maintaining order
|
|
18
17
|
num_rows: int # number of rows in the buffer
|
|
19
18
|
num_ready: int # number of consecutive non-None rows at head
|
|
20
19
|
buffer: np.ndarray # of object
|
|
@@ -7,7 +7,7 @@ import logging
|
|
|
7
7
|
import re
|
|
8
8
|
import sys
|
|
9
9
|
import time
|
|
10
|
-
from typing import Any, Awaitable, Collection
|
|
10
|
+
from typing import Any, Awaitable, Collection
|
|
11
11
|
|
|
12
12
|
from pixeltable import env, func
|
|
13
13
|
from pixeltable.config import Config
|
|
@@ -35,7 +35,7 @@ class RateLimitsScheduler(Scheduler):
|
|
|
35
35
|
get_request_resources_param_names: list[str] # names of parameters of RateLimitsInfo.get_request_resources()
|
|
36
36
|
|
|
37
37
|
# scheduling-related state
|
|
38
|
-
pool_info:
|
|
38
|
+
pool_info: env.RateLimitsInfo | None
|
|
39
39
|
est_usage: dict[str, int] # value per resource; accumulated estimates since the last util. report
|
|
40
40
|
|
|
41
41
|
num_in_flight: int # unfinished tasks
|
|
@@ -77,7 +77,7 @@ class RateLimitsScheduler(Scheduler):
|
|
|
77
77
|
self.est_usage = dict.fromkeys(self._resources, 0)
|
|
78
78
|
|
|
79
79
|
async def _main_loop(self) -> None:
|
|
80
|
-
item:
|
|
80
|
+
item: RateLimitsScheduler.QueueItem | None = None
|
|
81
81
|
while True:
|
|
82
82
|
if item is None:
|
|
83
83
|
item = await self.queue.get()
|
|
@@ -102,8 +102,8 @@ class RateLimitsScheduler(Scheduler):
|
|
|
102
102
|
request_resources = self._get_request_resources(item.request)
|
|
103
103
|
limits_info = self._check_resource_limits(request_resources)
|
|
104
104
|
aws: list[Awaitable[None]] = []
|
|
105
|
-
completed_aw:
|
|
106
|
-
wait_for_reset:
|
|
105
|
+
completed_aw: asyncio.Task | None = None
|
|
106
|
+
wait_for_reset: asyncio.Task | None = None
|
|
107
107
|
if limits_info is not None:
|
|
108
108
|
# limits_info's resource is depleted, wait for capacity to free up
|
|
109
109
|
|
|
@@ -167,7 +167,7 @@ class RateLimitsScheduler(Scheduler):
|
|
|
167
167
|
constant_kwargs, batch_kwargs = request.pxt_fn.create_batch_kwargs(batch_kwargs)
|
|
168
168
|
return self.pool_info.get_request_resources(**constant_kwargs, **batch_kwargs)
|
|
169
169
|
|
|
170
|
-
def _check_resource_limits(self, request_resources: dict[str, int]) ->
|
|
170
|
+
def _check_resource_limits(self, request_resources: dict[str, int]) -> env.RateLimitInfo | None:
|
|
171
171
|
"""Returns the most depleted resource, relative to its limit, or None if all resources are within limits"""
|
|
172
172
|
candidates: list[tuple[env.RateLimitInfo, float]] = [] # (info, relative remaining)
|
|
173
173
|
for resource, usage in request_resources.items():
|
|
@@ -405,7 +405,7 @@ class RequestRateScheduler(Scheduler):
|
|
|
405
405
|
if is_task:
|
|
406
406
|
self.num_in_flight -= 1
|
|
407
407
|
|
|
408
|
-
def _is_rate_limit_error(self, exc: Exception) -> tuple[bool,
|
|
408
|
+
def _is_rate_limit_error(self, exc: Exception) -> tuple[bool, float | None]:
|
|
409
409
|
"""Returns True if the exception indicates a rate limit error, and the retry delay in seconds."""
|
|
410
410
|
from http import HTTPStatus
|
|
411
411
|
|
|
@@ -413,7 +413,7 @@ class RequestRateScheduler(Scheduler):
|
|
|
413
413
|
# We look for attributes that contain status codes, instead of checking the type of the exception,
|
|
414
414
|
# in order to handle a wider variety of exception classes.
|
|
415
415
|
is_rate_limit_error = False
|
|
416
|
-
retry_delay:
|
|
416
|
+
retry_delay: float | None = None
|
|
417
417
|
|
|
418
418
|
# requests.HTTPError/httpx.HTTPStatusError
|
|
419
419
|
if (
|
|
@@ -443,7 +443,7 @@ class RequestRateScheduler(Scheduler):
|
|
|
443
443
|
|
|
444
444
|
return False, None
|
|
445
445
|
|
|
446
|
-
def _extract_retry_delay_from_headers(self, headers:
|
|
446
|
+
def _extract_retry_delay_from_headers(self, headers: Any | None) -> float | None:
|
|
447
447
|
"""Extract retry delay from HTTP headers."""
|
|
448
448
|
if headers is None:
|
|
449
449
|
return None
|
|
@@ -489,7 +489,7 @@ class RequestRateScheduler(Scheduler):
|
|
|
489
489
|
|
|
490
490
|
return None
|
|
491
491
|
|
|
492
|
-
def _extract_retry_delay_from_message(self, msg: str) ->
|
|
492
|
+
def _extract_retry_delay_from_message(self, msg: str) -> float | None:
|
|
493
493
|
msg_lower = msg.lower()
|
|
494
494
|
for pattern in self.RETRY_AFTER_PATTERNS:
|
|
495
495
|
match = re.search(pattern, msg_lower)
|
|
@@ -500,7 +500,7 @@ class RequestRateScheduler(Scheduler):
|
|
|
500
500
|
continue
|
|
501
501
|
return None
|
|
502
502
|
|
|
503
|
-
def _compute_retry_delay(self, num_retries: int, retry_after:
|
|
503
|
+
def _compute_retry_delay(self, num_retries: int, retry_after: float | None = None) -> float:
|
|
504
504
|
"""
|
|
505
505
|
Calculate exponential backoff delay for rate limit errors.
|
|
506
506
|
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import logging
|
|
2
|
-
from typing import Any, AsyncIterator
|
|
2
|
+
from typing import Any, AsyncIterator
|
|
3
3
|
|
|
4
4
|
from pixeltable import catalog, exprs
|
|
5
5
|
from pixeltable.utils.local_store import TempStore
|
|
@@ -23,7 +23,7 @@ class InMemoryDataNode(ExecNode):
|
|
|
23
23
|
|
|
24
24
|
input_rows: list[dict[str, Any]]
|
|
25
25
|
start_row_id: int
|
|
26
|
-
output_batch:
|
|
26
|
+
output_batch: DataRowBatch | None
|
|
27
27
|
|
|
28
28
|
# output_exprs is declared in the superclass, but we redeclare it here with a more specific type
|
|
29
29
|
output_exprs: list[exprs.ColumnRef]
|
|
@@ -6,7 +6,7 @@ import logging
|
|
|
6
6
|
from collections import defaultdict, deque
|
|
7
7
|
from concurrent import futures
|
|
8
8
|
from pathlib import Path
|
|
9
|
-
from typing import AsyncIterator, Iterator, NamedTuple
|
|
9
|
+
from typing import AsyncIterator, Iterator, NamedTuple
|
|
10
10
|
|
|
11
11
|
from pixeltable import exprs
|
|
12
12
|
from pixeltable.utils.object_stores import ObjectOps, ObjectPath, StorageTarget
|
|
@@ -44,11 +44,11 @@ class ObjectStoreSaveNode(ExecNode):
|
|
|
44
44
|
"""Specify the source and destination for a WorkItem"""
|
|
45
45
|
|
|
46
46
|
src_path: str # source of the file to be processed
|
|
47
|
-
destination:
|
|
47
|
+
destination: str # destination URI for the file to be processed
|
|
48
48
|
|
|
49
49
|
class WorkItem(NamedTuple):
|
|
50
50
|
src_path: Path
|
|
51
|
-
destination:
|
|
51
|
+
destination: str | None
|
|
52
52
|
info: exprs.ColumnSlotIdx # column info for the file being processed
|
|
53
53
|
destination_count: int = 1 # number of unique destinations for this file
|
|
54
54
|
|
|
@@ -60,7 +60,7 @@ class ObjectStoreSaveNode(ExecNode):
|
|
|
60
60
|
|
|
61
61
|
# ready_rows: rows that are ready to be returned, ordered by row idx;
|
|
62
62
|
# the implied row idx of ready_rows[0] is num_returned_rows
|
|
63
|
-
ready_rows: deque[
|
|
63
|
+
ready_rows: deque[exprs.DataRow | None]
|
|
64
64
|
|
|
65
65
|
in_flight_rows: dict[int, ObjectStoreSaveNode.RowState] # rows with in-flight work; id(row) -> RowState
|
|
66
66
|
in_flight_requests: dict[
|
|
@@ -71,12 +71,12 @@ class ObjectStoreSaveNode(ExecNode):
|
|
|
71
71
|
] # WorkDesignator -> [(row, info)]
|
|
72
72
|
|
|
73
73
|
input_finished: bool
|
|
74
|
-
row_idx: Iterator[
|
|
74
|
+
row_idx: Iterator[int | None]
|
|
75
75
|
|
|
76
76
|
@dataclasses.dataclass
|
|
77
77
|
class RowState:
|
|
78
78
|
row: exprs.DataRow
|
|
79
|
-
idx:
|
|
79
|
+
idx: int | None # position in input stream; None if we don't retain input order
|
|
80
80
|
num_missing: int # number of references to media files in this row
|
|
81
81
|
delete_destinations: list[Path] # paths to delete after all copies are complete
|
|
82
82
|
|
|
@@ -99,7 +99,7 @@ class ObjectStoreSaveNode(ExecNode):
|
|
|
99
99
|
def queued_work(self) -> int:
|
|
100
100
|
return len(self.in_flight_requests)
|
|
101
101
|
|
|
102
|
-
async def get_input_batch(self, input_iter: AsyncIterator[DataRowBatch]) ->
|
|
102
|
+
async def get_input_batch(self, input_iter: AsyncIterator[DataRowBatch]) -> DataRowBatch | None:
|
|
103
103
|
"""Get the next batch of input rows, or None if there are no more rows"""
|
|
104
104
|
try:
|
|
105
105
|
input_batch = await anext(input_iter)
|
|
@@ -148,7 +148,7 @@ class ObjectStoreSaveNode(ExecNode):
|
|
|
148
148
|
sum(int(row is not None) for row in itertools.islice(self.ready_rows, self.BATCH_SIZE)) == self.BATCH_SIZE
|
|
149
149
|
)
|
|
150
150
|
|
|
151
|
-
def __add_ready_row(self, row: exprs.DataRow, row_idx:
|
|
151
|
+
def __add_ready_row(self, row: exprs.DataRow, row_idx: int | None) -> None:
|
|
152
152
|
if row_idx is None:
|
|
153
153
|
self.ready_rows.append(row)
|
|
154
154
|
else:
|
|
@@ -209,14 +209,11 @@ class ObjectStoreSaveNode(ExecNode):
|
|
|
209
209
|
assert col.col_type.is_media_type()
|
|
210
210
|
|
|
211
211
|
destination = info.col.destination
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
soa is not None
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
):
|
|
218
|
-
# A local non-default destination was specified, and the url already points there
|
|
219
|
-
continue
|
|
212
|
+
if destination is not None:
|
|
213
|
+
soa = ObjectPath.parse_object_storage_addr(destination, False)
|
|
214
|
+
if soa.storage_target == StorageTarget.LOCAL_STORE and LocalStore(soa).resolve_url(url) is not None:
|
|
215
|
+
# A local non-default destination was specified, and the url already points there
|
|
216
|
+
continue
|
|
220
217
|
|
|
221
218
|
src_path = LocalStore.file_url_to_path(url)
|
|
222
219
|
if src_path is None:
|
|
@@ -283,7 +280,7 @@ class ObjectStoreSaveNode(ExecNode):
|
|
|
283
280
|
)
|
|
284
281
|
_logger.debug(f'submitted {work_item}')
|
|
285
282
|
|
|
286
|
-
def __persist_media_file(self, work_item: WorkItem) -> tuple[
|
|
283
|
+
def __persist_media_file(self, work_item: WorkItem) -> tuple[str | None, Exception | None]:
|
|
287
284
|
"""Move data from the TempStore to another location"""
|
|
288
285
|
src_path = work_item.src_path
|
|
289
286
|
col = work_item.info.col
|
pixeltable/exec/sql_node.py
CHANGED
|
@@ -2,7 +2,7 @@ import datetime
|
|
|
2
2
|
import logging
|
|
3
3
|
import warnings
|
|
4
4
|
from decimal import Decimal
|
|
5
|
-
from typing import TYPE_CHECKING, AsyncIterator, Iterable, NamedTuple,
|
|
5
|
+
from typing import TYPE_CHECKING, AsyncIterator, Iterable, NamedTuple, Sequence
|
|
6
6
|
from uuid import UUID
|
|
7
7
|
|
|
8
8
|
import sqlalchemy as sql
|
|
@@ -22,13 +22,13 @@ _logger = logging.getLogger('pixeltable')
|
|
|
22
22
|
|
|
23
23
|
class OrderByItem(NamedTuple):
|
|
24
24
|
expr: exprs.Expr
|
|
25
|
-
asc:
|
|
25
|
+
asc: bool | None
|
|
26
26
|
|
|
27
27
|
|
|
28
28
|
OrderByClause = list[OrderByItem]
|
|
29
29
|
|
|
30
30
|
|
|
31
|
-
def combine_order_by_clauses(clauses: Iterable[OrderByClause]) ->
|
|
31
|
+
def combine_order_by_clauses(clauses: Iterable[OrderByClause]) -> OrderByClause | None:
|
|
32
32
|
"""Returns a clause that's compatible with 'clauses', or None if that doesn't exist.
|
|
33
33
|
Two clauses are compatible if for each of their respective items c1[i] and c2[i]
|
|
34
34
|
a) the exprs are identical and
|
|
@@ -81,15 +81,15 @@ class SqlNode(ExecNode):
|
|
|
81
81
|
set_pk: if True, sets the primary key for each DataRow
|
|
82
82
|
"""
|
|
83
83
|
|
|
84
|
-
tbl:
|
|
84
|
+
tbl: catalog.TableVersionPath | None
|
|
85
85
|
select_list: exprs.ExprSet
|
|
86
86
|
columns: list[catalog.Column] # for which columns to populate DataRow.cell_vals/cell_md
|
|
87
87
|
cell_md_refs: list[exprs.ColumnPropertyRef] # of ColumnRefs which also need DataRow.slot_cellmd for evaluation
|
|
88
88
|
set_pk: bool
|
|
89
89
|
num_pk_cols: int
|
|
90
|
-
py_filter:
|
|
91
|
-
py_filter_eval_ctx:
|
|
92
|
-
cte:
|
|
90
|
+
py_filter: exprs.Expr | None # a predicate that can only be run in Python
|
|
91
|
+
py_filter_eval_ctx: exprs.RowBuilder.EvalCtx | None
|
|
92
|
+
cte: sql.CTE | None
|
|
93
93
|
sql_elements: exprs.SqlElementCache
|
|
94
94
|
|
|
95
95
|
# execution state
|
|
@@ -99,15 +99,15 @@ class SqlNode(ExecNode):
|
|
|
99
99
|
result_cursor: sql.engine.CursorResult | None
|
|
100
100
|
|
|
101
101
|
# where_clause/-_element: allow subclass to set one or the other (but not both)
|
|
102
|
-
where_clause:
|
|
103
|
-
where_clause_element:
|
|
102
|
+
where_clause: exprs.Expr | None
|
|
103
|
+
where_clause_element: sql.ColumnElement | None
|
|
104
104
|
|
|
105
105
|
order_by_clause: OrderByClause
|
|
106
|
-
limit:
|
|
106
|
+
limit: int | None
|
|
107
107
|
|
|
108
108
|
def __init__(
|
|
109
109
|
self,
|
|
110
|
-
tbl:
|
|
110
|
+
tbl: catalog.TableVersionPath | None,
|
|
111
111
|
row_builder: exprs.RowBuilder,
|
|
112
112
|
select_list: Iterable[exprs.Expr],
|
|
113
113
|
columns: list[catalog.Column],
|
|
@@ -216,7 +216,7 @@ class SqlNode(ExecNode):
|
|
|
216
216
|
def _ordering_tbl_ids(self) -> set[UUID]:
|
|
217
217
|
return exprs.Expr.all_tbl_ids(e for e, _ in self.order_by_clause)
|
|
218
218
|
|
|
219
|
-
def to_cte(self, keep_pk: bool = False) ->
|
|
219
|
+
def to_cte(self, keep_pk: bool = False) -> tuple[sql.CTE, exprs.ExprDict[sql.ColumnElement]] | None:
|
|
220
220
|
"""
|
|
221
221
|
Creates a CTE that materializes the output of this node plus a mapping from select list expr to output column.
|
|
222
222
|
keep_pk: if True, the PK columns are included in the CTE Select statement
|
|
@@ -245,8 +245,8 @@ class SqlNode(ExecNode):
|
|
|
245
245
|
cls,
|
|
246
246
|
tbl: catalog.TableVersionPath,
|
|
247
247
|
stmt: sql.Select,
|
|
248
|
-
refd_tbl_ids:
|
|
249
|
-
exact_version_only:
|
|
248
|
+
refd_tbl_ids: set[UUID] | None = None,
|
|
249
|
+
exact_version_only: set[UUID] | None = None,
|
|
250
250
|
) -> sql.Select:
|
|
251
251
|
"""Add From clause to stmt for tables/views referenced by materialized_exprs
|
|
252
252
|
Args:
|
|
@@ -270,7 +270,7 @@ class SqlNode(ExecNode):
|
|
|
270
270
|
joined_tbls.append(t)
|
|
271
271
|
|
|
272
272
|
first = True
|
|
273
|
-
prev_tv:
|
|
273
|
+
prev_tv: catalog.TableVersion | None = None
|
|
274
274
|
for t in joined_tbls[::-1]:
|
|
275
275
|
tv = t.get()
|
|
276
276
|
# _logger.debug(f'create_from_clause: tbl_id={tv.id} {id(tv.store_tbl.sa_tbl)}')
|
|
@@ -347,7 +347,7 @@ class SqlNode(ExecNode):
|
|
|
347
347
|
pass
|
|
348
348
|
|
|
349
349
|
output_batch = DataRowBatch(self.row_builder)
|
|
350
|
-
output_row:
|
|
350
|
+
output_row: exprs.DataRow | None = None
|
|
351
351
|
num_rows_returned = 0
|
|
352
352
|
is_using_cockroachdb = Env.get().is_using_cockroachdb
|
|
353
353
|
tzinfo = Env.get().default_time_zone
|
|
@@ -450,7 +450,7 @@ class SqlScanNode(SqlNode):
|
|
|
450
450
|
columns: list[catalog.Column],
|
|
451
451
|
cell_md_col_refs: list[exprs.ColumnRef] | None = None,
|
|
452
452
|
set_pk: bool = False,
|
|
453
|
-
exact_version_only:
|
|
453
|
+
exact_version_only: list[catalog.TableVersionHandle] | None = None,
|
|
454
454
|
):
|
|
455
455
|
sql_elements = exprs.SqlElementCache()
|
|
456
456
|
super().__init__(
|
|
@@ -528,17 +528,17 @@ class SqlAggregationNode(SqlNode):
|
|
|
528
528
|
limit: max number of rows to return: None = no limit
|
|
529
529
|
"""
|
|
530
530
|
|
|
531
|
-
group_by_items:
|
|
532
|
-
input_cte:
|
|
531
|
+
group_by_items: list[exprs.Expr] | None
|
|
532
|
+
input_cte: sql.CTE | None
|
|
533
533
|
|
|
534
534
|
def __init__(
|
|
535
535
|
self,
|
|
536
536
|
row_builder: exprs.RowBuilder,
|
|
537
537
|
input: SqlNode,
|
|
538
538
|
select_list: Iterable[exprs.Expr],
|
|
539
|
-
group_by_items:
|
|
540
|
-
limit:
|
|
541
|
-
exact_version_only:
|
|
539
|
+
group_by_items: list[exprs.Expr] | None = None,
|
|
540
|
+
limit: int | None = None,
|
|
541
|
+
exact_version_only: list[catalog.TableVersion] | None = None,
|
|
542
542
|
):
|
|
543
543
|
assert len(input.cell_md_refs) == 0 # there's no aggregation over json or arrays in SQL
|
|
544
544
|
self.input_cte, input_col_map = input.to_cte()
|
|
@@ -617,9 +617,9 @@ class SqlSampleNode(SqlNode):
|
|
|
617
617
|
stratify_exprs: Analyzer processed list of expressions to stratify by.
|
|
618
618
|
"""
|
|
619
619
|
|
|
620
|
-
input_cte:
|
|
620
|
+
input_cte: sql.CTE | None
|
|
621
621
|
pk_count: int
|
|
622
|
-
stratify_exprs:
|
|
622
|
+
stratify_exprs: list[exprs.Expr] | None
|
|
623
623
|
sample_clause: 'SampleClause'
|
|
624
624
|
|
|
625
625
|
def __init__(
|
|
@@ -648,7 +648,6 @@ class SqlSampleNode(SqlNode):
|
|
|
648
648
|
)
|
|
649
649
|
self.stratify_exprs = stratify_exprs
|
|
650
650
|
self.sample_clause = sample_clause
|
|
651
|
-
assert isinstance(self.sample_clause.seed, int)
|
|
652
651
|
|
|
653
652
|
@classmethod
|
|
654
653
|
def key_sql_expr(cls, seed: sql.ColumnElement, sql_cols: Iterable[sql.ColumnElement]) -> sql.ColumnElement:
|
|
@@ -667,7 +666,9 @@ class SqlSampleNode(SqlNode):
|
|
|
667
666
|
"""Create an expression for randomly ordering rows with a given seed"""
|
|
668
667
|
rowid_cols = [*cte.c[-self.pk_count : -1]] # exclude the version column
|
|
669
668
|
assert len(rowid_cols) > 0
|
|
670
|
-
|
|
669
|
+
# If seed is not set in the sample clause, use the random seed given by the execution context
|
|
670
|
+
seed = self.sample_clause.seed if self.sample_clause.seed is not None else self.ctx.random_seed
|
|
671
|
+
return self.key_sql_expr(sql.literal_column(str(seed)), rowid_cols)
|
|
671
672
|
|
|
672
673
|
def _create_stmt(self) -> sql.Select:
|
|
673
674
|
from pixeltable.plan import SampleClause
|
|
@@ -691,7 +692,7 @@ class SqlSampleNode(SqlNode):
|
|
|
691
692
|
|
|
692
693
|
return self._create_stmt_stratified_n(self.sample_clause.n, self.sample_clause.n_per_stratum)
|
|
693
694
|
|
|
694
|
-
def _create_stmt_stratified_n(self, n:
|
|
695
|
+
def _create_stmt_stratified_n(self, n: int | None, n_per_stratum: int | None) -> sql.Select:
|
|
695
696
|
"""Create a Select stmt that returns n samples across all strata or n_per_stratum samples per stratum"""
|
|
696
697
|
|
|
697
698
|
sql_strata_exprs = [self.sql_elements.get(e) for e in self.stratify_exprs]
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
-
from typing import Any
|
|
3
|
+
from typing import Any
|
|
4
4
|
|
|
5
5
|
import sqlalchemy as sql
|
|
6
6
|
|
|
@@ -58,7 +58,7 @@ class ArithmeticExpr(Expr):
|
|
|
58
58
|
def _id_attrs(self) -> list[tuple[str, Any]]:
|
|
59
59
|
return [*super()._id_attrs(), ('operator', self.operator.value)]
|
|
60
60
|
|
|
61
|
-
def sql_expr(self, sql_elements: SqlElementCache) ->
|
|
61
|
+
def sql_expr(self, sql_elements: SqlElementCache) -> sql.ColumnElement | None:
|
|
62
62
|
assert self.col_type.is_int_type() or self.col_type.is_float_type() or self.col_type.is_json_type()
|
|
63
63
|
left = sql_elements.get(self._op1)
|
|
64
64
|
right = sql_elements.get(self._op2)
|
|
@@ -118,7 +118,7 @@ class ArithmeticExpr(Expr):
|
|
|
118
118
|
|
|
119
119
|
data_row[self.slot_idx] = self.eval_nullable(op1_val, op2_val)
|
|
120
120
|
|
|
121
|
-
def eval_nullable(self, op1_val:
|
|
121
|
+
def eval_nullable(self, op1_val: float | None, op2_val: float | None) -> float | None:
|
|
122
122
|
"""
|
|
123
123
|
Return the result of evaluating the expression on two nullable int/float operands,
|
|
124
124
|
None is interpreted as SQL NULL
|
|
@@ -144,7 +144,7 @@ class ArithmeticExpr(Expr):
|
|
|
144
144
|
elif self.operator == ArithmeticOperator.FLOORDIV:
|
|
145
145
|
return op1_val // op2_val
|
|
146
146
|
|
|
147
|
-
def as_literal(self) ->
|
|
147
|
+
def as_literal(self) -> Literal | None:
|
|
148
148
|
op1_lit = self._op1.as_literal()
|
|
149
149
|
if op1_lit is None:
|
|
150
150
|
return None
|
pixeltable/exprs/array_slice.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
-
from typing import Any
|
|
3
|
+
from typing import Any
|
|
4
4
|
|
|
5
5
|
import sqlalchemy as sql
|
|
6
6
|
|
|
@@ -43,7 +43,7 @@ class ArraySlice(Expr):
|
|
|
43
43
|
def _id_attrs(self) -> list[tuple[str, Any]]:
|
|
44
44
|
return [*super()._id_attrs(), ('index', self.index)]
|
|
45
45
|
|
|
46
|
-
def sql_expr(self, _: SqlElementCache) ->
|
|
46
|
+
def sql_expr(self, _: SqlElementCache) -> sql.ColumnElement | None:
|
|
47
47
|
return None
|
|
48
48
|
|
|
49
49
|
def eval(self, data_row: DataRow, row_builder: RowBuilder) -> None:
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
3
|
import enum
|
|
4
|
-
from typing import Any
|
|
4
|
+
from typing import Any
|
|
5
5
|
|
|
6
6
|
import sqlalchemy as sql
|
|
7
7
|
|
|
@@ -34,7 +34,7 @@ class ColumnPropertyRef(Expr):
|
|
|
34
34
|
self.prop = prop
|
|
35
35
|
self.id = self._create_id()
|
|
36
36
|
|
|
37
|
-
def default_column_name(self) ->
|
|
37
|
+
def default_column_name(self) -> str | None:
|
|
38
38
|
return str(self).replace('.', '_')
|
|
39
39
|
|
|
40
40
|
def _equals(self, other: ColumnPropertyRef) -> bool:
|
|
@@ -55,7 +55,7 @@ class ColumnPropertyRef(Expr):
|
|
|
55
55
|
def is_cellmd_prop(self) -> bool:
|
|
56
56
|
return self.prop in (self.Property.ERRORTYPE, self.Property.ERRORMSG, self.Property.CELLMD)
|
|
57
57
|
|
|
58
|
-
def sql_expr(self, sql_elements: SqlElementCache) ->
|
|
58
|
+
def sql_expr(self, sql_elements: SqlElementCache) -> sql.ColumnElement | None:
|
|
59
59
|
if not self.col_ref.col_handle.get().is_stored:
|
|
60
60
|
return None
|
|
61
61
|
col = self.col_ref.col_handle.get()
|