pixeltable 0.3.2__py3-none-any.whl → 0.3.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pixeltable might be problematic. Click here for more details.
- pixeltable/__init__.py +64 -11
- pixeltable/__version__.py +2 -2
- pixeltable/catalog/__init__.py +1 -1
- pixeltable/catalog/catalog.py +50 -27
- pixeltable/catalog/column.py +27 -11
- pixeltable/catalog/dir.py +6 -4
- pixeltable/catalog/globals.py +8 -1
- pixeltable/catalog/insertable_table.py +22 -12
- pixeltable/catalog/named_function.py +10 -6
- pixeltable/catalog/path.py +3 -2
- pixeltable/catalog/path_dict.py +8 -6
- pixeltable/catalog/schema_object.py +2 -1
- pixeltable/catalog/table.py +121 -101
- pixeltable/catalog/table_version.py +291 -142
- pixeltable/catalog/table_version_path.py +8 -5
- pixeltable/catalog/view.py +67 -26
- pixeltable/dataframe.py +102 -72
- pixeltable/env.py +20 -21
- pixeltable/exec/__init__.py +2 -2
- pixeltable/exec/aggregation_node.py +10 -4
- pixeltable/exec/cache_prefetch_node.py +5 -3
- pixeltable/exec/component_iteration_node.py +9 -8
- pixeltable/exec/data_row_batch.py +21 -10
- pixeltable/exec/exec_context.py +10 -3
- pixeltable/exec/exec_node.py +23 -12
- pixeltable/exec/expr_eval/evaluators.py +13 -7
- pixeltable/exec/expr_eval/expr_eval_node.py +24 -15
- pixeltable/exec/expr_eval/globals.py +30 -7
- pixeltable/exec/expr_eval/row_buffer.py +5 -6
- pixeltable/exec/expr_eval/schedulers.py +151 -31
- pixeltable/exec/in_memory_data_node.py +8 -7
- pixeltable/exec/row_update_node.py +15 -5
- pixeltable/exec/sql_node.py +56 -27
- pixeltable/exprs/__init__.py +2 -2
- pixeltable/exprs/arithmetic_expr.py +57 -26
- pixeltable/exprs/array_slice.py +1 -1
- pixeltable/exprs/column_property_ref.py +2 -1
- pixeltable/exprs/column_ref.py +20 -15
- pixeltable/exprs/comparison.py +6 -2
- pixeltable/exprs/compound_predicate.py +1 -3
- pixeltable/exprs/data_row.py +2 -2
- pixeltable/exprs/expr.py +101 -72
- pixeltable/exprs/expr_dict.py +2 -1
- pixeltable/exprs/expr_set.py +3 -1
- pixeltable/exprs/function_call.py +39 -41
- pixeltable/exprs/globals.py +1 -0
- pixeltable/exprs/in_predicate.py +2 -2
- pixeltable/exprs/inline_expr.py +20 -17
- pixeltable/exprs/json_mapper.py +4 -2
- pixeltable/exprs/json_path.py +12 -18
- pixeltable/exprs/literal.py +5 -9
- pixeltable/exprs/method_ref.py +1 -0
- pixeltable/exprs/object_ref.py +1 -1
- pixeltable/exprs/row_builder.py +32 -17
- pixeltable/exprs/rowid_ref.py +14 -5
- pixeltable/exprs/similarity_expr.py +11 -6
- pixeltable/exprs/sql_element_cache.py +1 -1
- pixeltable/exprs/type_cast.py +24 -9
- pixeltable/ext/__init__.py +1 -0
- pixeltable/ext/functions/__init__.py +1 -0
- pixeltable/ext/functions/whisperx.py +2 -2
- pixeltable/ext/functions/yolox.py +11 -11
- pixeltable/func/aggregate_function.py +17 -13
- pixeltable/func/callable_function.py +6 -6
- pixeltable/func/expr_template_function.py +15 -14
- pixeltable/func/function.py +16 -16
- pixeltable/func/function_registry.py +11 -8
- pixeltable/func/globals.py +4 -2
- pixeltable/func/query_template_function.py +12 -13
- pixeltable/func/signature.py +18 -9
- pixeltable/func/tools.py +10 -17
- pixeltable/func/udf.py +106 -11
- pixeltable/functions/__init__.py +21 -2
- pixeltable/functions/anthropic.py +16 -12
- pixeltable/functions/fireworks.py +63 -5
- pixeltable/functions/gemini.py +13 -3
- pixeltable/functions/globals.py +18 -6
- pixeltable/functions/huggingface.py +20 -38
- pixeltable/functions/image.py +7 -3
- pixeltable/functions/json.py +1 -0
- pixeltable/functions/llama_cpp.py +1 -4
- pixeltable/functions/mistralai.py +31 -20
- pixeltable/functions/ollama.py +4 -18
- pixeltable/functions/openai.py +201 -108
- pixeltable/functions/replicate.py +11 -10
- pixeltable/functions/string.py +70 -7
- pixeltable/functions/timestamp.py +21 -8
- pixeltable/functions/together.py +66 -52
- pixeltable/functions/video.py +1 -0
- pixeltable/functions/vision.py +14 -11
- pixeltable/functions/whisper.py +2 -1
- pixeltable/globals.py +60 -26
- pixeltable/index/__init__.py +1 -1
- pixeltable/index/btree.py +5 -3
- pixeltable/index/embedding_index.py +15 -14
- pixeltable/io/__init__.py +1 -1
- pixeltable/io/external_store.py +30 -25
- pixeltable/io/fiftyone.py +6 -14
- pixeltable/io/globals.py +33 -27
- pixeltable/io/hf_datasets.py +2 -1
- pixeltable/io/label_studio.py +77 -68
- pixeltable/io/pandas.py +33 -9
- pixeltable/io/parquet.py +9 -12
- pixeltable/iterators/__init__.py +1 -0
- pixeltable/iterators/audio.py +205 -0
- pixeltable/iterators/document.py +19 -8
- pixeltable/iterators/image.py +6 -24
- pixeltable/iterators/string.py +3 -6
- pixeltable/iterators/video.py +1 -7
- pixeltable/metadata/__init__.py +7 -1
- pixeltable/metadata/converters/convert_10.py +2 -2
- pixeltable/metadata/converters/convert_15.py +1 -5
- pixeltable/metadata/converters/convert_16.py +2 -4
- pixeltable/metadata/converters/convert_17.py +2 -4
- pixeltable/metadata/converters/convert_18.py +2 -4
- pixeltable/metadata/converters/convert_19.py +2 -5
- pixeltable/metadata/converters/convert_20.py +1 -4
- pixeltable/metadata/converters/convert_21.py +4 -6
- pixeltable/metadata/converters/convert_22.py +1 -0
- pixeltable/metadata/converters/convert_23.py +5 -5
- pixeltable/metadata/converters/convert_24.py +12 -13
- pixeltable/metadata/converters/convert_26.py +23 -0
- pixeltable/metadata/converters/util.py +3 -4
- pixeltable/metadata/notes.py +1 -0
- pixeltable/metadata/schema.py +13 -2
- pixeltable/plan.py +173 -98
- pixeltable/store.py +42 -26
- pixeltable/type_system.py +62 -54
- pixeltable/utils/arrow.py +1 -2
- pixeltable/utils/coco.py +16 -17
- pixeltable/utils/code.py +1 -1
- pixeltable/utils/console_output.py +6 -3
- pixeltable/utils/description_helper.py +7 -7
- pixeltable/utils/documents.py +3 -1
- pixeltable/utils/filecache.py +12 -7
- pixeltable/utils/http_server.py +9 -8
- pixeltable/utils/media_store.py +2 -1
- pixeltable/utils/pytorch.py +11 -14
- pixeltable/utils/s3.py +1 -0
- pixeltable/utils/sql.py +1 -0
- pixeltable/utils/transactional_directory.py +2 -2
- {pixeltable-0.3.2.dist-info → pixeltable-0.3.3.dist-info}/METADATA +6 -8
- pixeltable-0.3.3.dist-info/RECORD +163 -0
- pixeltable-0.3.2.dist-info/RECORD +0 -161
- {pixeltable-0.3.2.dist-info → pixeltable-0.3.3.dist-info}/LICENSE +0 -0
- {pixeltable-0.3.2.dist-info → pixeltable-0.3.3.dist-info}/WHEEL +0 -0
- {pixeltable-0.3.2.dist-info → pixeltable-0.3.3.dist-info}/entry_points.txt +0 -0
pixeltable/env.py
CHANGED
|
@@ -1,6 +1,5 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
-
from abc import abstractmethod
|
|
4
3
|
import datetime
|
|
5
4
|
import glob
|
|
6
5
|
import http.server
|
|
@@ -16,6 +15,7 @@ import sys
|
|
|
16
15
|
import threading
|
|
17
16
|
import uuid
|
|
18
17
|
import warnings
|
|
18
|
+
from abc import abstractmethod
|
|
19
19
|
from dataclasses import dataclass, field
|
|
20
20
|
from pathlib import Path
|
|
21
21
|
from sys import stdout
|
|
@@ -375,6 +375,7 @@ class Env:
|
|
|
375
375
|
|
|
376
376
|
if create_db:
|
|
377
377
|
from pixeltable.metadata import schema
|
|
378
|
+
|
|
378
379
|
schema.base_metadata.create_all(self._sa_engine)
|
|
379
380
|
metadata.create_system_info(self._sa_engine)
|
|
380
381
|
|
|
@@ -387,11 +388,7 @@ class Env:
|
|
|
387
388
|
def _create_engine(self, time_zone_name: Optional[str], echo: bool = False) -> None:
|
|
388
389
|
connect_args = {} if time_zone_name is None else {'options': f'-c timezone={time_zone_name}'}
|
|
389
390
|
self._sa_engine = sql.create_engine(
|
|
390
|
-
self.db_url,
|
|
391
|
-
echo=echo,
|
|
392
|
-
future=True,
|
|
393
|
-
isolation_level='REPEATABLE READ',
|
|
394
|
-
connect_args=connect_args,
|
|
391
|
+
self.db_url, echo=echo, future=True, isolation_level='REPEATABLE READ', connect_args=connect_args
|
|
395
392
|
)
|
|
396
393
|
self._logger.info(f'Created SQLAlchemy engine at: {self.db_url}')
|
|
397
394
|
with self.engine.begin() as conn:
|
|
@@ -424,7 +421,7 @@ class Env:
|
|
|
424
421
|
with engine.begin() as conn:
|
|
425
422
|
# use C collation to get standard C/Python-style sorting
|
|
426
423
|
stmt = (
|
|
427
|
-
f
|
|
424
|
+
f'CREATE DATABASE {preparer.quote(self._db_name)} '
|
|
428
425
|
"ENCODING 'utf-8' LC_COLLATE 'C' LC_CTYPE 'C' TEMPLATE template0"
|
|
429
426
|
)
|
|
430
427
|
conn.execute(sql.text(stmt))
|
|
@@ -448,12 +445,12 @@ class Env:
|
|
|
448
445
|
try:
|
|
449
446
|
with engine.begin() as conn:
|
|
450
447
|
# terminate active connections
|
|
451
|
-
stmt =
|
|
448
|
+
stmt = f"""
|
|
452
449
|
SELECT pg_terminate_backend(pg_stat_activity.pid)
|
|
453
450
|
FROM pg_stat_activity
|
|
454
451
|
WHERE pg_stat_activity.datname = '{self._db_name}'
|
|
455
452
|
AND pid <> pg_backend_pid()
|
|
456
|
-
"""
|
|
453
|
+
"""
|
|
457
454
|
conn.execute(sql.text(stmt))
|
|
458
455
|
# drop db
|
|
459
456
|
stmt = f'DROP DATABASE {preparer.quote(self._db_name)}'
|
|
@@ -563,7 +560,7 @@ class Env:
|
|
|
563
560
|
is_installed = False
|
|
564
561
|
self.__optional_packages[package_name] = PackageInfo(
|
|
565
562
|
is_installed=is_installed,
|
|
566
|
-
library_name=library_name or package_name # defaults to package_name unless specified otherwise
|
|
563
|
+
library_name=library_name or package_name, # defaults to package_name unless specified otherwise
|
|
567
564
|
)
|
|
568
565
|
|
|
569
566
|
def require_package(self, package_name: str, min_version: Optional[list[int]] = None) -> None:
|
|
@@ -609,6 +606,7 @@ class Env:
|
|
|
609
606
|
"""
|
|
610
607
|
import spacy
|
|
611
608
|
from spacy.cli.download import get_model_filename
|
|
609
|
+
|
|
612
610
|
spacy_model = 'en_core_web_sm'
|
|
613
611
|
spacy_model_version = '3.7.1'
|
|
614
612
|
filename = get_model_filename(spacy_model, spacy_model_version, sdist=False)
|
|
@@ -626,7 +624,7 @@ class Env:
|
|
|
626
624
|
self._logger.warn(f'Failed to load spaCy model: {spacy_model}', exc_info=exc)
|
|
627
625
|
warnings.warn(
|
|
628
626
|
f"Failed to load spaCy model '{spacy_model}'. spaCy features will not be available.",
|
|
629
|
-
excs.PixeltableWarning
|
|
627
|
+
excs.PixeltableWarning,
|
|
630
628
|
)
|
|
631
629
|
self.__optional_packages['spacy'].is_installed = False
|
|
632
630
|
|
|
@@ -636,8 +634,7 @@ class Env:
|
|
|
636
634
|
def create_tmp_path(self, extension: str = '') -> Path:
|
|
637
635
|
return self._tmp_dir / f'{uuid.uuid4()}{extension}'
|
|
638
636
|
|
|
639
|
-
|
|
640
|
-
#def get_resource_pool_info(self, pool_id: str, pool_info_cls: Optional[Type[T]]) -> T:
|
|
637
|
+
# def get_resource_pool_info(self, pool_id: str, pool_info_cls: Optional[Type[T]]) -> T:
|
|
641
638
|
def get_resource_pool_info(self, pool_id: str, make_pool_info: Optional[Callable[[], T]] = None) -> T:
|
|
642
639
|
"""Returns the info object for the given id, creating it if necessary."""
|
|
643
640
|
info = self._resource_pool_info.get(pool_id)
|
|
@@ -707,6 +704,7 @@ def register_client(name: str) -> Callable:
|
|
|
707
704
|
Args:
|
|
708
705
|
- name (str): The name of the API client (e.g., 'openai' or 'label-studio').
|
|
709
706
|
"""
|
|
707
|
+
|
|
710
708
|
def decorator(fn: Callable) -> None:
|
|
711
709
|
global _registered_clients
|
|
712
710
|
sig = inspect.signature(fn)
|
|
@@ -721,6 +719,7 @@ class Config:
|
|
|
721
719
|
The (global) Pixeltable configuration, as loaded from `config.toml`. Provides methods for retrieving
|
|
722
720
|
configuration values, which can be set in the config file or as environment variables.
|
|
723
721
|
"""
|
|
722
|
+
|
|
724
723
|
__config: dict[str, Any]
|
|
725
724
|
|
|
726
725
|
@classmethod
|
|
@@ -750,12 +749,7 @@ class Config:
|
|
|
750
749
|
free_disk_space_bytes = shutil.disk_usage(config_path.parent).free
|
|
751
750
|
# Default cache size is 1/5 of free disk space
|
|
752
751
|
file_cache_size_g = free_disk_space_bytes / 5 / (1 << 30)
|
|
753
|
-
return {
|
|
754
|
-
'pixeltable': {
|
|
755
|
-
'file_cache_size_g': round(file_cache_size_g, 1),
|
|
756
|
-
'hide_warnings': False,
|
|
757
|
-
}
|
|
758
|
-
}
|
|
752
|
+
return {'pixeltable': {'file_cache_size_g': round(file_cache_size_g, 1), 'hide_warnings': False}}
|
|
759
753
|
|
|
760
754
|
def __init__(self, config: dict[str, Any]) -> None:
|
|
761
755
|
self.__config = config
|
|
@@ -840,7 +834,9 @@ class RateLimitsInfo:
|
|
|
840
834
|
self.resource_limits = {k: RateLimitInfo(k, now, *v) for k, v in kwargs.items() if v is not None}
|
|
841
835
|
# TODO: remove
|
|
842
836
|
for info in self.resource_limits.values():
|
|
843
|
-
_logger.debug(
|
|
837
|
+
_logger.debug(
|
|
838
|
+
f'Init {info.resource} rate limit: rem={info.remaining} reset={info.reset_at.strftime(TIME_FORMAT)} delta={(info.reset_at - now).total_seconds()}'
|
|
839
|
+
)
|
|
844
840
|
else:
|
|
845
841
|
for k, v in kwargs.items():
|
|
846
842
|
if v is not None:
|
|
@@ -855,6 +851,7 @@ class RateLimitsInfo:
|
|
|
855
851
|
@dataclass
|
|
856
852
|
class RateLimitInfo:
|
|
857
853
|
"""Container for rate limit-related information for a single resource."""
|
|
854
|
+
|
|
858
855
|
resource: str
|
|
859
856
|
recorded_at: datetime.datetime
|
|
860
857
|
limit: int
|
|
@@ -871,4 +868,6 @@ class RateLimitInfo:
|
|
|
871
868
|
reset_delta = reset_at - self.reset_at
|
|
872
869
|
self.reset_at = reset_at
|
|
873
870
|
# TODO: remove
|
|
874
|
-
_logger.debug(
|
|
871
|
+
_logger.debug(
|
|
872
|
+
f'Update {self.resource} rate limit: rem={self.remaining} reset={self.reset_at.strftime(TIME_FORMAT)} reset_delta={reset_delta.total_seconds()} recorded_delta={(self.reset_at - recorded_at).total_seconds()}'
|
|
873
|
+
)
|
pixeltable/exec/__init__.py
CHANGED
|
@@ -4,7 +4,7 @@ from .component_iteration_node import ComponentIterationNode
|
|
|
4
4
|
from .data_row_batch import DataRowBatch
|
|
5
5
|
from .exec_context import ExecContext
|
|
6
6
|
from .exec_node import ExecNode
|
|
7
|
+
from .expr_eval import ExprEvalNode
|
|
7
8
|
from .in_memory_data_node import InMemoryDataNode
|
|
8
9
|
from .row_update_node import RowUpdateNode
|
|
9
|
-
from .sql_node import
|
|
10
|
-
from .expr_eval import ExprEvalNode
|
|
10
|
+
from .sql_node import SqlAggregationNode, SqlJoinNode, SqlLookupNode, SqlNode, SqlScanNode
|
|
@@ -2,7 +2,7 @@ from __future__ import annotations
|
|
|
2
2
|
|
|
3
3
|
import logging
|
|
4
4
|
import sys
|
|
5
|
-
from typing import Any, Iterable, Iterator, Optional, cast
|
|
5
|
+
from typing import Any, AsyncIterator, Iterable, Iterator, Optional, cast
|
|
6
6
|
|
|
7
7
|
import pixeltable.catalog as catalog
|
|
8
8
|
import pixeltable.exceptions as excs
|
|
@@ -13,12 +13,14 @@ from .exec_node import ExecNode
|
|
|
13
13
|
|
|
14
14
|
_logger = logging.getLogger('pixeltable')
|
|
15
15
|
|
|
16
|
+
|
|
16
17
|
class AggregationNode(ExecNode):
|
|
17
18
|
"""
|
|
18
19
|
In-memory aggregation for UDAs.
|
|
19
20
|
|
|
20
21
|
At the moment, this returns all results in a single DataRowBatch.
|
|
21
22
|
"""
|
|
23
|
+
|
|
22
24
|
group_by: Optional[list[exprs.Expr]]
|
|
23
25
|
input_exprs: list[exprs.Expr]
|
|
24
26
|
agg_fn_eval_ctx: exprs.RowBuilder.EvalCtx
|
|
@@ -26,8 +28,13 @@ class AggregationNode(ExecNode):
|
|
|
26
28
|
output_batch: DataRowBatch
|
|
27
29
|
|
|
28
30
|
def __init__(
|
|
29
|
-
|
|
30
|
-
|
|
31
|
+
self,
|
|
32
|
+
tbl: catalog.TableVersion,
|
|
33
|
+
row_builder: exprs.RowBuilder,
|
|
34
|
+
group_by: Optional[list[exprs.Expr]],
|
|
35
|
+
agg_fn_calls: list[exprs.FunctionCall],
|
|
36
|
+
input_exprs: Iterable[exprs.Expr],
|
|
37
|
+
input: ExecNode,
|
|
31
38
|
):
|
|
32
39
|
output_exprs: list[exprs.Expr] = [] if group_by is None else list(group_by)
|
|
33
40
|
output_exprs.extend(agg_fn_calls)
|
|
@@ -86,4 +93,3 @@ class AggregationNode(ExecNode):
|
|
|
86
93
|
self.output_batch.flush_imgs(None, self.stored_img_cols, self.flushed_img_slots)
|
|
87
94
|
_logger.debug(f'AggregateNode: consumed {num_input_rows} rows, returning {len(self.output_batch.rows)} rows')
|
|
88
95
|
yield self.output_batch
|
|
89
|
-
|
|
@@ -9,7 +9,7 @@ import urllib.request
|
|
|
9
9
|
from collections import deque
|
|
10
10
|
from concurrent import futures
|
|
11
11
|
from pathlib import Path
|
|
12
|
-
from typing import
|
|
12
|
+
from typing import Any, AsyncIterator, Iterator, Optional
|
|
13
13
|
from uuid import UUID
|
|
14
14
|
|
|
15
15
|
import pixeltable.env as env
|
|
@@ -30,6 +30,7 @@ class CachePrefetchNode(ExecNode):
|
|
|
30
30
|
TODO:
|
|
31
31
|
- adapting the number of download threads at runtime to maximize throughput
|
|
32
32
|
"""
|
|
33
|
+
|
|
33
34
|
BATCH_SIZE = 16
|
|
34
35
|
NUM_EXECUTOR_THREADS = 16
|
|
35
36
|
|
|
@@ -59,8 +60,8 @@ class CachePrefetchNode(ExecNode):
|
|
|
59
60
|
num_missing: int # number of missing URLs in this row
|
|
60
61
|
|
|
61
62
|
def __init__(
|
|
62
|
-
|
|
63
|
-
|
|
63
|
+
self, tbl_id: UUID, file_col_info: list[exprs.ColumnSlotIdx], input: ExecNode, retain_input_order: bool = True
|
|
64
|
+
):
|
|
64
65
|
# input_/output_exprs=[]: we don't have anything to evaluate
|
|
65
66
|
super().__init__(input.row_builder, [], [], input)
|
|
66
67
|
self.retain_input_order = retain_input_order
|
|
@@ -241,6 +242,7 @@ class CachePrefetchNode(ExecNode):
|
|
|
241
242
|
_logger.debug(f'Downloading {url} to {tmp_path}')
|
|
242
243
|
if parsed.scheme == 's3':
|
|
243
244
|
from pixeltable.utils.s3 import get_client
|
|
245
|
+
|
|
244
246
|
with self.boto_client_lock:
|
|
245
247
|
if self.boto_client is None:
|
|
246
248
|
config = {
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import inspect
|
|
2
|
-
from typing import Iterator, Optional
|
|
2
|
+
from typing import AsyncIterator, Iterator, Optional
|
|
3
3
|
|
|
4
4
|
import pixeltable.catalog as catalog
|
|
5
5
|
import pixeltable.exceptions as excs
|
|
@@ -14,6 +14,7 @@ class ComponentIterationNode(ExecNode):
|
|
|
14
14
|
|
|
15
15
|
Returns row batches of OUTPUT_BATCH_SIZE size.
|
|
16
16
|
"""
|
|
17
|
+
|
|
17
18
|
__OUTPUT_BATCH_SIZE = 1024
|
|
18
19
|
|
|
19
20
|
def __init__(self, view: catalog.TableVersion, input: ExecNode):
|
|
@@ -25,8 +26,8 @@ class ComponentIterationNode(ExecNode):
|
|
|
25
26
|
self.iterator_args = iterator_args[0]
|
|
26
27
|
assert isinstance(self.iterator_args, exprs.InlineDict)
|
|
27
28
|
self.iterator_args_ctx = self.row_builder.create_eval_ctx([self.iterator_args])
|
|
28
|
-
self.iterator_output_schema, self.unstored_column_names = (
|
|
29
|
-
|
|
29
|
+
self.iterator_output_schema, self.unstored_column_names = self.view.iterator_cls.output_schema(
|
|
30
|
+
**self.iterator_args.to_kwargs()
|
|
30
31
|
)
|
|
31
32
|
self.iterator_output_fields = list(self.iterator_output_schema.keys())
|
|
32
33
|
self.iterator_output_cols = {
|
|
@@ -34,7 +35,8 @@ class ComponentIterationNode(ExecNode):
|
|
|
34
35
|
}
|
|
35
36
|
# referenced iterator output fields
|
|
36
37
|
self.refd_output_slot_idxs = {
|
|
37
|
-
e.col.name: e.slot_idx
|
|
38
|
+
e.col.name: e.slot_idx
|
|
39
|
+
for e in self.row_builder.unique_exprs
|
|
38
40
|
if isinstance(e, exprs.ColumnRef) and e.col.name in self.iterator_output_fields
|
|
39
41
|
}
|
|
40
42
|
|
|
@@ -79,8 +81,7 @@ class ComponentIterationNode(ExecNode):
|
|
|
79
81
|
# verify and copy component_dict fields to their respective slots in output_row
|
|
80
82
|
for field_name, field_val in component_dict.items():
|
|
81
83
|
if field_name not in self.iterator_output_fields:
|
|
82
|
-
raise excs.Error(
|
|
83
|
-
f'Invalid field name {field_name} in output of {self.view.iterator_cls.__name__}')
|
|
84
|
+
raise excs.Error(f'Invalid field name {field_name} in output of {self.view.iterator_cls.__name__}')
|
|
84
85
|
if field_name not in self.refd_output_slot_idxs:
|
|
85
86
|
# we can ignore this
|
|
86
87
|
continue
|
|
@@ -90,5 +91,5 @@ class ComponentIterationNode(ExecNode):
|
|
|
90
91
|
if len(component_dict) != len(self.iterator_output_fields):
|
|
91
92
|
missing_fields = set(self.refd_output_slot_idxs.keys()) - set(component_dict.keys())
|
|
92
93
|
raise excs.Error(
|
|
93
|
-
f'Invalid output of {self.view.iterator_cls.__name__}: '
|
|
94
|
-
|
|
94
|
+
f'Invalid output of {self.view.iterator_cls.__name__}: missing fields {", ".join(missing_fields)}'
|
|
95
|
+
)
|
|
@@ -1,19 +1,21 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
|
-
|
|
2
|
+
|
|
3
3
|
import logging
|
|
4
|
+
from typing import Iterator, Optional
|
|
4
5
|
|
|
5
|
-
import pixeltable.exprs as exprs
|
|
6
6
|
import pixeltable.catalog as catalog
|
|
7
|
+
import pixeltable.exprs as exprs
|
|
7
8
|
from pixeltable.utils.media_store import MediaStore
|
|
8
9
|
|
|
9
|
-
|
|
10
10
|
_logger = logging.getLogger('pixeltable')
|
|
11
11
|
|
|
12
|
+
|
|
12
13
|
class DataRowBatch:
|
|
13
14
|
"""Set of DataRows, indexed by rowid.
|
|
14
15
|
|
|
15
16
|
Contains the metadata needed to initialize DataRows.
|
|
16
17
|
"""
|
|
18
|
+
|
|
17
19
|
tbl: Optional[catalog.TableVersion]
|
|
18
20
|
row_builder: exprs.RowBuilder
|
|
19
21
|
img_slot_idxs: list[int]
|
|
@@ -22,8 +24,11 @@ class DataRowBatch:
|
|
|
22
24
|
rows: list[exprs.DataRow]
|
|
23
25
|
|
|
24
26
|
def __init__(
|
|
25
|
-
self,
|
|
26
|
-
|
|
27
|
+
self,
|
|
28
|
+
tbl: Optional[catalog.TableVersion],
|
|
29
|
+
row_builder: exprs.RowBuilder,
|
|
30
|
+
num_rows: Optional[int] = None,
|
|
31
|
+
rows: Optional[list[exprs.DataRow]] = None,
|
|
27
32
|
):
|
|
28
33
|
"""
|
|
29
34
|
Requires either num_rows or rows to be specified, but not both.
|
|
@@ -34,7 +39,8 @@ class DataRowBatch:
|
|
|
34
39
|
self.img_slot_idxs = [e.slot_idx for e in row_builder.unique_exprs if e.col_type.is_image_type()]
|
|
35
40
|
# non-image media slots
|
|
36
41
|
self.media_slot_idxs = [
|
|
37
|
-
e.slot_idx
|
|
42
|
+
e.slot_idx
|
|
43
|
+
for e in row_builder.unique_exprs
|
|
38
44
|
if e.col_type.is_media_type() and not e.col_type.is_image_type()
|
|
39
45
|
]
|
|
40
46
|
self.array_slot_idxs = [e.slot_idx for e in row_builder.unique_exprs if e.col_type.is_array_type()]
|
|
@@ -44,14 +50,17 @@ class DataRowBatch:
|
|
|
44
50
|
if num_rows is None:
|
|
45
51
|
num_rows = 0
|
|
46
52
|
self.rows = [
|
|
47
|
-
exprs.DataRow(
|
|
53
|
+
exprs.DataRow(
|
|
54
|
+
row_builder.num_materialized, self.img_slot_idxs, self.media_slot_idxs, self.array_slot_idxs
|
|
55
|
+
)
|
|
48
56
|
for _ in range(num_rows)
|
|
49
57
|
]
|
|
50
58
|
|
|
51
59
|
def add_row(self, row: Optional[exprs.DataRow] = None) -> exprs.DataRow:
|
|
52
60
|
if row is None:
|
|
53
61
|
row = exprs.DataRow(
|
|
54
|
-
self.row_builder.num_materialized, self.img_slot_idxs, self.media_slot_idxs, self.array_slot_idxs
|
|
62
|
+
self.row_builder.num_materialized, self.img_slot_idxs, self.media_slot_idxs, self.array_slot_idxs
|
|
63
|
+
)
|
|
55
64
|
self.rows.append(row)
|
|
56
65
|
return row
|
|
57
66
|
|
|
@@ -65,8 +74,10 @@ class DataRowBatch:
|
|
|
65
74
|
return self.rows[index]
|
|
66
75
|
|
|
67
76
|
def flush_imgs(
|
|
68
|
-
|
|
69
|
-
|
|
77
|
+
self,
|
|
78
|
+
idx_range: Optional[slice] = None,
|
|
79
|
+
stored_img_info: Optional[list[exprs.ColumnSlotIdx]] = None,
|
|
80
|
+
flushed_slot_idxs: Optional[list[int]] = None,
|
|
70
81
|
) -> None:
|
|
71
82
|
"""Flushes images in the given range of rows."""
|
|
72
83
|
assert self.tbl is not None
|
pixeltable/exec/exec_context.py
CHANGED
|
@@ -4,12 +4,19 @@ import sqlalchemy as sql
|
|
|
4
4
|
|
|
5
5
|
import pixeltable.exprs as exprs
|
|
6
6
|
|
|
7
|
+
|
|
7
8
|
class ExecContext:
|
|
8
9
|
"""Class for execution runtime constants"""
|
|
10
|
+
|
|
9
11
|
def __init__(
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
12
|
+
self,
|
|
13
|
+
row_builder: exprs.RowBuilder,
|
|
14
|
+
*,
|
|
15
|
+
show_pbar: bool = False,
|
|
16
|
+
batch_size: int = 0,
|
|
17
|
+
pk_clause: Optional[list[sql.ClauseElement]] = None,
|
|
18
|
+
num_computed_exprs: int = 0,
|
|
19
|
+
ignore_errors: bool = False,
|
|
13
20
|
):
|
|
14
21
|
self.show_pbar = show_pbar
|
|
15
22
|
self.batch_size = batch_size
|
pixeltable/exec/exec_node.py
CHANGED
|
@@ -4,16 +4,19 @@ import abc
|
|
|
4
4
|
import asyncio
|
|
5
5
|
import logging
|
|
6
6
|
import sys
|
|
7
|
-
from typing import Iterable, Iterator, Optional, TypeVar
|
|
7
|
+
from typing import AsyncIterator, Iterable, Iterator, Optional, TypeVar
|
|
8
8
|
|
|
9
9
|
import pixeltable.exprs as exprs
|
|
10
|
+
|
|
10
11
|
from .data_row_batch import DataRowBatch
|
|
11
12
|
from .exec_context import ExecContext
|
|
12
13
|
|
|
13
14
|
_logger = logging.getLogger('pixeltable')
|
|
14
15
|
|
|
16
|
+
|
|
15
17
|
class ExecNode(abc.ABC):
|
|
16
18
|
"""Base class of all execution nodes"""
|
|
19
|
+
|
|
17
20
|
output_exprs: Iterable[exprs.Expr]
|
|
18
21
|
row_builder: exprs.RowBuilder
|
|
19
22
|
input: Optional[ExecNode]
|
|
@@ -22,8 +25,12 @@ class ExecNode(abc.ABC):
|
|
|
22
25
|
ctx: Optional[ExecContext]
|
|
23
26
|
|
|
24
27
|
def __init__(
|
|
25
|
-
|
|
26
|
-
|
|
28
|
+
self,
|
|
29
|
+
row_builder: exprs.RowBuilder,
|
|
30
|
+
output_exprs: Iterable[exprs.Expr],
|
|
31
|
+
input_exprs: Iterable[exprs.Expr],
|
|
32
|
+
input: Optional[ExecNode] = None,
|
|
33
|
+
):
|
|
27
34
|
self.output_exprs = output_exprs
|
|
28
35
|
self.row_builder = row_builder
|
|
29
36
|
self.input = input
|
|
@@ -31,8 +38,7 @@ class ExecNode(abc.ABC):
|
|
|
31
38
|
output_slot_idxs = {e.slot_idx for e in output_exprs}
|
|
32
39
|
output_dependencies = row_builder.get_dependencies(output_exprs, exclude=input_exprs)
|
|
33
40
|
self.flushed_img_slots = [
|
|
34
|
-
e.slot_idx for e in output_dependencies
|
|
35
|
-
if e.col_type.is_image_type() and e.slot_idx not in output_slot_idxs
|
|
41
|
+
e.slot_idx for e in output_dependencies if e.col_type.is_image_type() and e.slot_idx not in output_slot_idxs
|
|
36
42
|
]
|
|
37
43
|
self.stored_img_cols = []
|
|
38
44
|
self.ctx = None # all nodes of a tree share the same context
|
|
@@ -53,16 +59,20 @@ class ExecNode(abc.ABC):
|
|
|
53
59
|
pass
|
|
54
60
|
|
|
55
61
|
def __iter__(self) -> Iterator[DataRowBatch]:
|
|
62
|
+
running_loop: Optional[asyncio.AbstractEventLoop] = None
|
|
63
|
+
loop: asyncio.AbstractEventLoop
|
|
56
64
|
try:
|
|
57
|
-
# check if we are already in an event loop (eg, Jupyter's); if so, patch it to allow
|
|
58
|
-
|
|
65
|
+
# check if we are already in an event loop (eg, Jupyter's); if so, patch it to allow
|
|
66
|
+
# multiple run_until_complete()
|
|
67
|
+
running_loop = asyncio.get_running_loop()
|
|
59
68
|
import nest_asyncio # type: ignore
|
|
69
|
+
|
|
60
70
|
nest_asyncio.apply()
|
|
71
|
+
loop = running_loop
|
|
72
|
+
_logger.debug(f'Patched running loop')
|
|
61
73
|
except RuntimeError:
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
loop = asyncio.new_event_loop()
|
|
65
|
-
asyncio.set_event_loop(loop)
|
|
74
|
+
loop = asyncio.new_event_loop()
|
|
75
|
+
asyncio.set_event_loop(loop)
|
|
66
76
|
|
|
67
77
|
if 'pytest' in sys.modules:
|
|
68
78
|
loop.set_debug(True)
|
|
@@ -75,7 +85,8 @@ class ExecNode(abc.ABC):
|
|
|
75
85
|
except StopAsyncIteration:
|
|
76
86
|
pass
|
|
77
87
|
finally:
|
|
78
|
-
loop
|
|
88
|
+
if loop != running_loop:
|
|
89
|
+
loop.close()
|
|
79
90
|
|
|
80
91
|
def open(self) -> None:
|
|
81
92
|
"""Bottom-up initialization of nodes for execution. Must be called before __next__."""
|
|
@@ -5,10 +5,10 @@ import datetime
|
|
|
5
5
|
import itertools
|
|
6
6
|
import logging
|
|
7
7
|
import sys
|
|
8
|
-
from typing import
|
|
8
|
+
from typing import Any, Callable, Iterator, Optional, cast
|
|
9
|
+
|
|
10
|
+
from pixeltable import exprs, func
|
|
9
11
|
|
|
10
|
-
from pixeltable import exprs
|
|
11
|
-
from pixeltable import func
|
|
12
12
|
from .globals import Dispatcher, Evaluator, FnCallArgs
|
|
13
13
|
|
|
14
14
|
_logger = logging.getLogger('pixeltable')
|
|
@@ -23,6 +23,7 @@ class DefaultExprEvaluator(Evaluator):
|
|
|
23
23
|
TODO:
|
|
24
24
|
- parallelize via Ray
|
|
25
25
|
"""
|
|
26
|
+
|
|
26
27
|
e: exprs.Expr
|
|
27
28
|
|
|
28
29
|
def __init__(self, e: exprs.Expr, dispatcher: Dispatcher):
|
|
@@ -60,6 +61,7 @@ class FnCallEvaluator(Evaluator):
|
|
|
60
61
|
TODO:
|
|
61
62
|
- adaptive batching: finding the optimal batch size based on observed execution times
|
|
62
63
|
"""
|
|
64
|
+
|
|
63
65
|
fn_call: exprs.FunctionCall
|
|
64
66
|
fn: func.CallableFunction
|
|
65
67
|
scalar_py_fn: Optional[Callable] # only set for non-batching CallableFunctions
|
|
@@ -73,7 +75,7 @@ class FnCallEvaluator(Evaluator):
|
|
|
73
75
|
self.fn_call = fn_call
|
|
74
76
|
self.fn = cast(func.CallableFunction, fn_call.fn)
|
|
75
77
|
if isinstance(self.fn, func.CallableFunction) and self.fn.is_batched:
|
|
76
|
-
self.call_args_queue =
|
|
78
|
+
self.call_args_queue = asyncio.Queue[FnCallArgs]()
|
|
77
79
|
# we're not supplying sample arguments there, they're ignored anyway
|
|
78
80
|
self.batch_size = self.fn.get_batch_size()
|
|
79
81
|
self.scalar_py_fn = None
|
|
@@ -167,14 +169,16 @@ class FnCallEvaluator(Evaluator):
|
|
|
167
169
|
for k in item.kwargs.keys():
|
|
168
170
|
batch_kwargs[k][i] = item.kwargs[k]
|
|
169
171
|
return FnCallArgs(
|
|
170
|
-
self.fn_call, [item.row for item in call_args], batch_args=batch_args, batch_kwargs=batch_kwargs
|
|
172
|
+
self.fn_call, [item.row for item in call_args], batch_args=batch_args, batch_kwargs=batch_kwargs
|
|
173
|
+
)
|
|
171
174
|
|
|
172
175
|
async def eval_batch(self, batched_call_args: FnCallArgs) -> None:
|
|
173
176
|
result_batch: list[Any]
|
|
174
177
|
try:
|
|
175
178
|
if self.fn.is_async:
|
|
176
179
|
result_batch = await self.fn.aexec_batch(
|
|
177
|
-
*batched_call_args.batch_args, **batched_call_args.batch_kwargs
|
|
180
|
+
*batched_call_args.batch_args, **batched_call_args.batch_kwargs
|
|
181
|
+
)
|
|
178
182
|
else:
|
|
179
183
|
# check for cancellation before starting something potentially long-running
|
|
180
184
|
if asyncio.current_task().cancelled() or self.dispatcher.exc_event.is_set():
|
|
@@ -205,6 +209,7 @@ class FnCallEvaluator(Evaluator):
|
|
|
205
209
|
self.dispatcher.dispatch([call_args.row])
|
|
206
210
|
except Exception as exc:
|
|
207
211
|
import anthropic
|
|
212
|
+
|
|
208
213
|
if isinstance(exc, anthropic.RateLimitError):
|
|
209
214
|
_logger.debug(f'RateLimitError: {exc}')
|
|
210
215
|
_, _, exc_tb = sys.exc_info()
|
|
@@ -228,7 +233,8 @@ class FnCallEvaluator(Evaluator):
|
|
|
228
233
|
rows_with_excs.add(idx)
|
|
229
234
|
self.dispatcher.dispatch_exc(item.rows, self.fn_call.slot_idx, exc_tb)
|
|
230
235
|
self.dispatcher.dispatch(
|
|
231
|
-
[call_args_batch[i].row for i in range(len(call_args_batch)) if i not in rows_with_excs]
|
|
236
|
+
[call_args_batch[i].row for i in range(len(call_args_batch)) if i not in rows_with_excs]
|
|
237
|
+
)
|
|
232
238
|
|
|
233
239
|
def _close(self) -> None:
|
|
234
240
|
"""Create a task for the incomplete batch of queued FnCallArgs, if any"""
|
|
@@ -4,24 +4,23 @@ import asyncio
|
|
|
4
4
|
import logging
|
|
5
5
|
import traceback
|
|
6
6
|
from types import TracebackType
|
|
7
|
-
from typing import
|
|
7
|
+
from typing import AsyncIterator, Iterable, Optional, Union
|
|
8
8
|
|
|
9
9
|
import numpy as np
|
|
10
10
|
|
|
11
11
|
import pixeltable.exceptions as excs
|
|
12
|
-
from pixeltable import exprs
|
|
13
|
-
|
|
12
|
+
from pixeltable import exprs, func
|
|
13
|
+
|
|
14
|
+
from ..data_row_batch import DataRowBatch
|
|
15
|
+
from ..exec_node import ExecNode
|
|
14
16
|
from .evaluators import DefaultExprEvaluator, FnCallEvaluator
|
|
15
17
|
from .globals import Evaluator, Scheduler
|
|
16
18
|
from .row_buffer import RowBuffer
|
|
17
19
|
from .schedulers import SCHEDULERS
|
|
18
|
-
from ..data_row_batch import DataRowBatch
|
|
19
|
-
from ..exec_node import ExecNode
|
|
20
20
|
|
|
21
21
|
_logger = logging.getLogger('pixeltable')
|
|
22
22
|
|
|
23
23
|
|
|
24
|
-
|
|
25
24
|
class ExprEvalNode(ExecNode):
|
|
26
25
|
"""
|
|
27
26
|
Expression evaluation
|
|
@@ -35,10 +34,13 @@ class ExprEvalNode(ExecNode):
|
|
|
35
34
|
TODO:
|
|
36
35
|
- Literal handling: currently, Literal values are copied into slots via the normal evaluation mechanism, which is
|
|
37
36
|
needless overhead; instead: pre-populate Literal slots in _init_row()
|
|
37
|
+
- dynamically determine MAX_BUFFERED_ROWS, based on the avg memory consumption of a row and our configured memory
|
|
38
|
+
limit
|
|
38
39
|
- local model inference on gpu: currently, no attempt is made to ensure that models can fit onto the gpu
|
|
39
40
|
simultaneously, which will cause errors; instead, the execution should be divided into sequential phases, each
|
|
40
41
|
of which only contains a subset of the models which is known to fit onto the gpu simultaneously
|
|
41
42
|
"""
|
|
43
|
+
|
|
42
44
|
maintain_input_order: bool # True if we're returning rows in the order we received them from our input
|
|
43
45
|
num_dependencies: np.ndarray # number of dependencies for our output slots; indexed by slot idx
|
|
44
46
|
outputs: np.ndarray # bool per slot; True if this slot is part of our output
|
|
@@ -68,11 +70,15 @@ class ExprEvalNode(ExecNode):
|
|
|
68
70
|
num_output_rows: int
|
|
69
71
|
|
|
70
72
|
BATCH_SIZE = 64
|
|
71
|
-
MAX_BUFFERED_ROWS =
|
|
73
|
+
MAX_BUFFERED_ROWS = 2048 # maximum number of rows that have been dispatched but not yet returned
|
|
72
74
|
|
|
73
75
|
def __init__(
|
|
74
|
-
self,
|
|
75
|
-
|
|
76
|
+
self,
|
|
77
|
+
row_builder: exprs.RowBuilder,
|
|
78
|
+
output_exprs: Iterable[exprs.Expr],
|
|
79
|
+
input_exprs: Iterable[exprs.Expr],
|
|
80
|
+
input: ExecNode,
|
|
81
|
+
maintain_input_order: bool = True,
|
|
76
82
|
):
|
|
77
83
|
super().__init__(row_builder, output_exprs, input_exprs, input)
|
|
78
84
|
self.maintain_input_order = maintain_input_order
|
|
@@ -148,7 +154,9 @@ class ExprEvalNode(ExecNode):
|
|
|
148
154
|
self.row_pos_map[id(row)] = self.num_input_rows + idx
|
|
149
155
|
self.num_input_rows += len(batch)
|
|
150
156
|
self.avail_input_rows += len(batch)
|
|
151
|
-
_logger.debug(
|
|
157
|
+
_logger.debug(
|
|
158
|
+
f'adding input: batch_size={len(batch)} #input_rows={self.num_input_rows} #avail={self.avail_input_rows}'
|
|
159
|
+
)
|
|
152
160
|
except StopAsyncIteration:
|
|
153
161
|
self.input_complete = True
|
|
154
162
|
_logger.debug(f'finished input: #input_rows={self.num_input_rows}, #avail={self.avail_input_rows}')
|
|
@@ -175,11 +183,11 @@ class ExprEvalNode(ExecNode):
|
|
|
175
183
|
rows: list[exprs.DataRow]
|
|
176
184
|
if avail_current_batch_rows > num_rows:
|
|
177
185
|
# we only need rows from current_input_batch
|
|
178
|
-
rows = self.current_input_batch.rows[self.input_row_idx:self.input_row_idx + num_rows]
|
|
186
|
+
rows = self.current_input_batch.rows[self.input_row_idx : self.input_row_idx + num_rows]
|
|
179
187
|
self.input_row_idx += num_rows
|
|
180
188
|
else:
|
|
181
189
|
# we need rows from both current_/next_input_batch
|
|
182
|
-
rows = self.current_input_batch.rows[self.input_row_idx:]
|
|
190
|
+
rows = self.current_input_batch.rows[self.input_row_idx :]
|
|
183
191
|
self.current_input_batch = self.next_input_batch
|
|
184
192
|
self.next_input_batch = None
|
|
185
193
|
self.input_row_idx = 0
|
|
@@ -236,6 +244,7 @@ class ExprEvalNode(ExecNode):
|
|
|
236
244
|
exc_event_aw = asyncio.create_task(self.exc_event.wait(), name='exc_event.wait()')
|
|
237
245
|
input_batch_aw: Optional[asyncio.Task] = None
|
|
238
246
|
completed_aw: Optional[asyncio.Task] = None
|
|
247
|
+
closed_evaluators = False # True after calling Evaluator.close()
|
|
239
248
|
|
|
240
249
|
try:
|
|
241
250
|
while True:
|
|
@@ -275,11 +284,12 @@ class ExprEvalNode(ExecNode):
|
|
|
275
284
|
assert self.output_buffer.num_rows == 0
|
|
276
285
|
return
|
|
277
286
|
|
|
278
|
-
if self.input_complete and self.avail_input_rows == 0:
|
|
287
|
+
if self.input_complete and self.avail_input_rows == 0 and not closed_evaluators:
|
|
279
288
|
# no more input rows to dispatch, but we're still waiting for rows to finish:
|
|
280
289
|
# close all slot evaluators to flush queued rows
|
|
281
290
|
for evaluator in self.slot_evaluators.values():
|
|
282
291
|
evaluator.close()
|
|
292
|
+
closed_evaluators = True
|
|
283
293
|
|
|
284
294
|
# we don't have a full batch of rows at this point and need to wait
|
|
285
295
|
aws = {exc_event_aw} # always wait for an exception
|
|
@@ -335,8 +345,7 @@ class ExprEvalNode(ExecNode):
|
|
|
335
345
|
first_row = rows[0]
|
|
336
346
|
input_vals = [first_row[idx] for idx in dependency_idxs]
|
|
337
347
|
e = self.row_builder.unique_exprs[slot_with_exc]
|
|
338
|
-
self.error = excs.ExprEvalError(
|
|
339
|
-
e, f'expression {e}', first_row.get_exc(e.slot_idx), exc_tb, input_vals, 0)
|
|
348
|
+
self.error = excs.ExprEvalError(e, f'expression {e}', first_row.get_exc(e.slot_idx), exc_tb, input_vals, 0)
|
|
340
349
|
self.exc_event.set()
|
|
341
350
|
return
|
|
342
351
|
|