pixeltable 0.3.2__py3-none-any.whl → 0.3.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pixeltable might be problematic. Click here for more details.
- pixeltable/__init__.py +64 -11
- pixeltable/__version__.py +2 -2
- pixeltable/catalog/__init__.py +1 -1
- pixeltable/catalog/catalog.py +50 -27
- pixeltable/catalog/column.py +27 -11
- pixeltable/catalog/dir.py +6 -4
- pixeltable/catalog/globals.py +8 -1
- pixeltable/catalog/insertable_table.py +22 -12
- pixeltable/catalog/named_function.py +10 -6
- pixeltable/catalog/path.py +3 -2
- pixeltable/catalog/path_dict.py +8 -6
- pixeltable/catalog/schema_object.py +2 -1
- pixeltable/catalog/table.py +121 -101
- pixeltable/catalog/table_version.py +291 -142
- pixeltable/catalog/table_version_path.py +8 -5
- pixeltable/catalog/view.py +67 -26
- pixeltable/dataframe.py +106 -81
- pixeltable/env.py +28 -24
- pixeltable/exec/__init__.py +2 -2
- pixeltable/exec/aggregation_node.py +10 -4
- pixeltable/exec/cache_prefetch_node.py +5 -3
- pixeltable/exec/component_iteration_node.py +9 -9
- pixeltable/exec/data_row_batch.py +21 -10
- pixeltable/exec/exec_context.py +10 -3
- pixeltable/exec/exec_node.py +23 -12
- pixeltable/exec/expr_eval/evaluators.py +13 -7
- pixeltable/exec/expr_eval/expr_eval_node.py +24 -15
- pixeltable/exec/expr_eval/globals.py +30 -7
- pixeltable/exec/expr_eval/row_buffer.py +5 -6
- pixeltable/exec/expr_eval/schedulers.py +151 -31
- pixeltable/exec/in_memory_data_node.py +8 -7
- pixeltable/exec/row_update_node.py +15 -5
- pixeltable/exec/sql_node.py +56 -27
- pixeltable/exprs/__init__.py +2 -2
- pixeltable/exprs/arithmetic_expr.py +57 -26
- pixeltable/exprs/array_slice.py +1 -1
- pixeltable/exprs/column_property_ref.py +2 -1
- pixeltable/exprs/column_ref.py +20 -15
- pixeltable/exprs/comparison.py +6 -2
- pixeltable/exprs/compound_predicate.py +1 -3
- pixeltable/exprs/data_row.py +2 -2
- pixeltable/exprs/expr.py +108 -72
- pixeltable/exprs/expr_dict.py +2 -1
- pixeltable/exprs/expr_set.py +3 -1
- pixeltable/exprs/function_call.py +39 -41
- pixeltable/exprs/globals.py +1 -0
- pixeltable/exprs/in_predicate.py +2 -2
- pixeltable/exprs/inline_expr.py +20 -17
- pixeltable/exprs/json_mapper.py +4 -2
- pixeltable/exprs/json_path.py +12 -18
- pixeltable/exprs/literal.py +5 -9
- pixeltable/exprs/method_ref.py +1 -0
- pixeltable/exprs/object_ref.py +1 -1
- pixeltable/exprs/row_builder.py +32 -17
- pixeltable/exprs/rowid_ref.py +14 -5
- pixeltable/exprs/similarity_expr.py +11 -6
- pixeltable/exprs/sql_element_cache.py +1 -1
- pixeltable/exprs/type_cast.py +24 -9
- pixeltable/ext/__init__.py +1 -0
- pixeltable/ext/functions/__init__.py +1 -0
- pixeltable/ext/functions/whisperx.py +2 -2
- pixeltable/ext/functions/yolox.py +11 -11
- pixeltable/func/aggregate_function.py +17 -13
- pixeltable/func/callable_function.py +6 -6
- pixeltable/func/expr_template_function.py +15 -14
- pixeltable/func/function.py +16 -16
- pixeltable/func/function_registry.py +11 -8
- pixeltable/func/globals.py +4 -2
- pixeltable/func/query_template_function.py +12 -13
- pixeltable/func/signature.py +18 -9
- pixeltable/func/tools.py +10 -17
- pixeltable/func/udf.py +106 -11
- pixeltable/functions/__init__.py +21 -2
- pixeltable/functions/anthropic.py +16 -12
- pixeltable/functions/fireworks.py +63 -5
- pixeltable/functions/gemini.py +13 -3
- pixeltable/functions/globals.py +18 -6
- pixeltable/functions/huggingface.py +20 -38
- pixeltable/functions/image.py +7 -3
- pixeltable/functions/json.py +1 -0
- pixeltable/functions/llama_cpp.py +1 -4
- pixeltable/functions/mistralai.py +31 -20
- pixeltable/functions/ollama.py +4 -18
- pixeltable/functions/openai.py +231 -113
- pixeltable/functions/replicate.py +11 -10
- pixeltable/functions/string.py +70 -7
- pixeltable/functions/timestamp.py +21 -8
- pixeltable/functions/together.py +66 -52
- pixeltable/functions/video.py +1 -0
- pixeltable/functions/vision.py +14 -11
- pixeltable/functions/whisper.py +2 -1
- pixeltable/globals.py +60 -26
- pixeltable/index/__init__.py +1 -1
- pixeltable/index/btree.py +5 -3
- pixeltable/index/embedding_index.py +15 -14
- pixeltable/io/__init__.py +1 -1
- pixeltable/io/external_store.py +30 -25
- pixeltable/io/fiftyone.py +6 -14
- pixeltable/io/globals.py +33 -27
- pixeltable/io/hf_datasets.py +2 -1
- pixeltable/io/label_studio.py +77 -68
- pixeltable/io/pandas.py +36 -23
- pixeltable/io/parquet.py +9 -12
- pixeltable/iterators/__init__.py +1 -0
- pixeltable/iterators/audio.py +205 -0
- pixeltable/iterators/document.py +19 -8
- pixeltable/iterators/image.py +6 -24
- pixeltable/iterators/string.py +3 -6
- pixeltable/iterators/video.py +1 -7
- pixeltable/metadata/__init__.py +7 -1
- pixeltable/metadata/converters/convert_10.py +2 -2
- pixeltable/metadata/converters/convert_15.py +1 -5
- pixeltable/metadata/converters/convert_16.py +2 -4
- pixeltable/metadata/converters/convert_17.py +2 -4
- pixeltable/metadata/converters/convert_18.py +2 -4
- pixeltable/metadata/converters/convert_19.py +2 -5
- pixeltable/metadata/converters/convert_20.py +1 -4
- pixeltable/metadata/converters/convert_21.py +4 -6
- pixeltable/metadata/converters/convert_22.py +1 -0
- pixeltable/metadata/converters/convert_23.py +5 -5
- pixeltable/metadata/converters/convert_24.py +12 -13
- pixeltable/metadata/converters/convert_26.py +23 -0
- pixeltable/metadata/converters/util.py +3 -4
- pixeltable/metadata/notes.py +1 -0
- pixeltable/metadata/schema.py +13 -2
- pixeltable/plan.py +173 -98
- pixeltable/share/__init__.py +0 -0
- pixeltable/share/packager.py +218 -0
- pixeltable/store.py +42 -26
- pixeltable/type_system.py +102 -75
- pixeltable/utils/arrow.py +7 -8
- pixeltable/utils/coco.py +16 -17
- pixeltable/utils/code.py +1 -1
- pixeltable/utils/console_output.py +6 -3
- pixeltable/utils/description_helper.py +7 -7
- pixeltable/utils/documents.py +3 -1
- pixeltable/utils/filecache.py +12 -7
- pixeltable/utils/http_server.py +9 -8
- pixeltable/utils/iceberg.py +14 -0
- pixeltable/utils/media_store.py +3 -2
- pixeltable/utils/pytorch.py +11 -14
- pixeltable/utils/s3.py +1 -0
- pixeltable/utils/sql.py +1 -0
- pixeltable/utils/transactional_directory.py +2 -2
- {pixeltable-0.3.2.dist-info → pixeltable-0.3.4.dist-info}/METADATA +9 -9
- pixeltable-0.3.4.dist-info/RECORD +166 -0
- pixeltable-0.3.2.dist-info/RECORD +0 -161
- {pixeltable-0.3.2.dist-info → pixeltable-0.3.4.dist-info}/LICENSE +0 -0
- {pixeltable-0.3.2.dist-info → pixeltable-0.3.4.dist-info}/WHEEL +0 -0
- {pixeltable-0.3.2.dist-info → pixeltable-0.3.4.dist-info}/entry_points.txt +0 -0
pixeltable/env.py
CHANGED
|
@@ -1,6 +1,5 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
-
from abc import abstractmethod
|
|
4
3
|
import datetime
|
|
5
4
|
import glob
|
|
6
5
|
import http.server
|
|
@@ -16,6 +15,7 @@ import sys
|
|
|
16
15
|
import threading
|
|
17
16
|
import uuid
|
|
18
17
|
import warnings
|
|
18
|
+
from abc import abstractmethod
|
|
19
19
|
from dataclasses import dataclass, field
|
|
20
20
|
from pathlib import Path
|
|
21
21
|
from sys import stdout
|
|
@@ -333,9 +333,7 @@ class Env:
|
|
|
333
333
|
http_logger.addHandler(http_fh)
|
|
334
334
|
http_logger.propagate = False
|
|
335
335
|
|
|
336
|
-
|
|
337
|
-
for path in glob.glob(f'{self._tmp_dir}/*'):
|
|
338
|
-
os.remove(path)
|
|
336
|
+
self.clear_tmp_dir()
|
|
339
337
|
|
|
340
338
|
self._db_name = os.environ.get('PIXELTABLE_DB', 'pixeltable')
|
|
341
339
|
self._pgdata_dir = Path(os.environ.get('PIXELTABLE_PGDATA', str(self._home / 'pgdata')))
|
|
@@ -375,6 +373,7 @@ class Env:
|
|
|
375
373
|
|
|
376
374
|
if create_db:
|
|
377
375
|
from pixeltable.metadata import schema
|
|
376
|
+
|
|
378
377
|
schema.base_metadata.create_all(self._sa_engine)
|
|
379
378
|
metadata.create_system_info(self._sa_engine)
|
|
380
379
|
|
|
@@ -387,11 +386,7 @@ class Env:
|
|
|
387
386
|
def _create_engine(self, time_zone_name: Optional[str], echo: bool = False) -> None:
|
|
388
387
|
connect_args = {} if time_zone_name is None else {'options': f'-c timezone={time_zone_name}'}
|
|
389
388
|
self._sa_engine = sql.create_engine(
|
|
390
|
-
self.db_url,
|
|
391
|
-
echo=echo,
|
|
392
|
-
future=True,
|
|
393
|
-
isolation_level='REPEATABLE READ',
|
|
394
|
-
connect_args=connect_args,
|
|
389
|
+
self.db_url, echo=echo, future=True, isolation_level='REPEATABLE READ', connect_args=connect_args
|
|
395
390
|
)
|
|
396
391
|
self._logger.info(f'Created SQLAlchemy engine at: {self.db_url}')
|
|
397
392
|
with self.engine.begin() as conn:
|
|
@@ -424,7 +419,7 @@ class Env:
|
|
|
424
419
|
with engine.begin() as conn:
|
|
425
420
|
# use C collation to get standard C/Python-style sorting
|
|
426
421
|
stmt = (
|
|
427
|
-
f
|
|
422
|
+
f'CREATE DATABASE {preparer.quote(self._db_name)} '
|
|
428
423
|
"ENCODING 'utf-8' LC_COLLATE 'C' LC_CTYPE 'C' TEMPLATE template0"
|
|
429
424
|
)
|
|
430
425
|
conn.execute(sql.text(stmt))
|
|
@@ -448,12 +443,12 @@ class Env:
|
|
|
448
443
|
try:
|
|
449
444
|
with engine.begin() as conn:
|
|
450
445
|
# terminate active connections
|
|
451
|
-
stmt =
|
|
446
|
+
stmt = f"""
|
|
452
447
|
SELECT pg_terminate_backend(pg_stat_activity.pid)
|
|
453
448
|
FROM pg_stat_activity
|
|
454
449
|
WHERE pg_stat_activity.datname = '{self._db_name}'
|
|
455
450
|
AND pid <> pg_backend_pid()
|
|
456
|
-
"""
|
|
451
|
+
"""
|
|
457
452
|
conn.execute(sql.text(stmt))
|
|
458
453
|
# drop db
|
|
459
454
|
stmt = f'DROP DATABASE {preparer.quote(self._db_name)}'
|
|
@@ -563,7 +558,7 @@ class Env:
|
|
|
563
558
|
is_installed = False
|
|
564
559
|
self.__optional_packages[package_name] = PackageInfo(
|
|
565
560
|
is_installed=is_installed,
|
|
566
|
-
library_name=library_name or package_name # defaults to package_name unless specified otherwise
|
|
561
|
+
library_name=library_name or package_name, # defaults to package_name unless specified otherwise
|
|
567
562
|
)
|
|
568
563
|
|
|
569
564
|
def require_package(self, package_name: str, min_version: Optional[list[int]] = None) -> None:
|
|
@@ -609,6 +604,7 @@ class Env:
|
|
|
609
604
|
"""
|
|
610
605
|
import spacy
|
|
611
606
|
from spacy.cli.download import get_model_filename
|
|
607
|
+
|
|
612
608
|
spacy_model = 'en_core_web_sm'
|
|
613
609
|
spacy_model_version = '3.7.1'
|
|
614
610
|
filename = get_model_filename(spacy_model, spacy_model_version, sdist=False)
|
|
@@ -626,18 +622,24 @@ class Env:
|
|
|
626
622
|
self._logger.warn(f'Failed to load spaCy model: {spacy_model}', exc_info=exc)
|
|
627
623
|
warnings.warn(
|
|
628
624
|
f"Failed to load spaCy model '{spacy_model}'. spaCy features will not be available.",
|
|
629
|
-
excs.PixeltableWarning
|
|
625
|
+
excs.PixeltableWarning,
|
|
630
626
|
)
|
|
631
627
|
self.__optional_packages['spacy'].is_installed = False
|
|
632
628
|
|
|
629
|
+
def clear_tmp_dir(self) -> None:
|
|
630
|
+
for path in glob.glob(f'{self._tmp_dir}/*'):
|
|
631
|
+
if os.path.isdir(path):
|
|
632
|
+
shutil.rmtree(path)
|
|
633
|
+
else:
|
|
634
|
+
os.remove(path)
|
|
635
|
+
|
|
633
636
|
def num_tmp_files(self) -> int:
|
|
634
637
|
return len(glob.glob(f'{self._tmp_dir}/*'))
|
|
635
638
|
|
|
636
639
|
def create_tmp_path(self, extension: str = '') -> Path:
|
|
637
640
|
return self._tmp_dir / f'{uuid.uuid4()}{extension}'
|
|
638
641
|
|
|
639
|
-
|
|
640
|
-
#def get_resource_pool_info(self, pool_id: str, pool_info_cls: Optional[Type[T]]) -> T:
|
|
642
|
+
# def get_resource_pool_info(self, pool_id: str, pool_info_cls: Optional[Type[T]]) -> T:
|
|
641
643
|
def get_resource_pool_info(self, pool_id: str, make_pool_info: Optional[Callable[[], T]] = None) -> T:
|
|
642
644
|
"""Returns the info object for the given id, creating it if necessary."""
|
|
643
645
|
info = self._resource_pool_info.get(pool_id)
|
|
@@ -707,6 +709,7 @@ def register_client(name: str) -> Callable:
|
|
|
707
709
|
Args:
|
|
708
710
|
- name (str): The name of the API client (e.g., 'openai' or 'label-studio').
|
|
709
711
|
"""
|
|
712
|
+
|
|
710
713
|
def decorator(fn: Callable) -> None:
|
|
711
714
|
global _registered_clients
|
|
712
715
|
sig = inspect.signature(fn)
|
|
@@ -721,6 +724,7 @@ class Config:
|
|
|
721
724
|
The (global) Pixeltable configuration, as loaded from `config.toml`. Provides methods for retrieving
|
|
722
725
|
configuration values, which can be set in the config file or as environment variables.
|
|
723
726
|
"""
|
|
727
|
+
|
|
724
728
|
__config: dict[str, Any]
|
|
725
729
|
|
|
726
730
|
@classmethod
|
|
@@ -750,12 +754,7 @@ class Config:
|
|
|
750
754
|
free_disk_space_bytes = shutil.disk_usage(config_path.parent).free
|
|
751
755
|
# Default cache size is 1/5 of free disk space
|
|
752
756
|
file_cache_size_g = free_disk_space_bytes / 5 / (1 << 30)
|
|
753
|
-
return {
|
|
754
|
-
'pixeltable': {
|
|
755
|
-
'file_cache_size_g': round(file_cache_size_g, 1),
|
|
756
|
-
'hide_warnings': False,
|
|
757
|
-
}
|
|
758
|
-
}
|
|
757
|
+
return {'pixeltable': {'file_cache_size_g': round(file_cache_size_g, 1), 'hide_warnings': False}}
|
|
759
758
|
|
|
760
759
|
def __init__(self, config: dict[str, Any]) -> None:
|
|
761
760
|
self.__config = config
|
|
@@ -840,7 +839,9 @@ class RateLimitsInfo:
|
|
|
840
839
|
self.resource_limits = {k: RateLimitInfo(k, now, *v) for k, v in kwargs.items() if v is not None}
|
|
841
840
|
# TODO: remove
|
|
842
841
|
for info in self.resource_limits.values():
|
|
843
|
-
_logger.debug(
|
|
842
|
+
_logger.debug(
|
|
843
|
+
f'Init {info.resource} rate limit: rem={info.remaining} reset={info.reset_at.strftime(TIME_FORMAT)} delta={(info.reset_at - now).total_seconds()}'
|
|
844
|
+
)
|
|
844
845
|
else:
|
|
845
846
|
for k, v in kwargs.items():
|
|
846
847
|
if v is not None:
|
|
@@ -855,6 +856,7 @@ class RateLimitsInfo:
|
|
|
855
856
|
@dataclass
|
|
856
857
|
class RateLimitInfo:
|
|
857
858
|
"""Container for rate limit-related information for a single resource."""
|
|
859
|
+
|
|
858
860
|
resource: str
|
|
859
861
|
recorded_at: datetime.datetime
|
|
860
862
|
limit: int
|
|
@@ -871,4 +873,6 @@ class RateLimitInfo:
|
|
|
871
873
|
reset_delta = reset_at - self.reset_at
|
|
872
874
|
self.reset_at = reset_at
|
|
873
875
|
# TODO: remove
|
|
874
|
-
_logger.debug(
|
|
876
|
+
_logger.debug(
|
|
877
|
+
f'Update {self.resource} rate limit: rem={self.remaining} reset={self.reset_at.strftime(TIME_FORMAT)} reset_delta={reset_delta.total_seconds()} recorded_delta={(self.reset_at - recorded_at).total_seconds()}'
|
|
878
|
+
)
|
pixeltable/exec/__init__.py
CHANGED
|
@@ -4,7 +4,7 @@ from .component_iteration_node import ComponentIterationNode
|
|
|
4
4
|
from .data_row_batch import DataRowBatch
|
|
5
5
|
from .exec_context import ExecContext
|
|
6
6
|
from .exec_node import ExecNode
|
|
7
|
+
from .expr_eval import ExprEvalNode
|
|
7
8
|
from .in_memory_data_node import InMemoryDataNode
|
|
8
9
|
from .row_update_node import RowUpdateNode
|
|
9
|
-
from .sql_node import
|
|
10
|
-
from .expr_eval import ExprEvalNode
|
|
10
|
+
from .sql_node import SqlAggregationNode, SqlJoinNode, SqlLookupNode, SqlNode, SqlScanNode
|
|
pixeltable/exec/aggregation_node.py
CHANGED
|
@@ -2,7 +2,7 @@ from __future__ import annotations
|
|
|
2
2
|
|
|
3
3
|
import logging
|
|
4
4
|
import sys
|
|
5
|
-
from typing import Any, Iterable, Iterator, Optional, cast
|
|
5
|
+
from typing import Any, AsyncIterator, Iterable, Iterator, Optional, cast
|
|
6
6
|
|
|
7
7
|
import pixeltable.catalog as catalog
|
|
8
8
|
import pixeltable.exceptions as excs
|
|
@@ -13,12 +13,14 @@ from .exec_node import ExecNode
|
|
|
13
13
|
|
|
14
14
|
_logger = logging.getLogger('pixeltable')
|
|
15
15
|
|
|
16
|
+
|
|
16
17
|
class AggregationNode(ExecNode):
|
|
17
18
|
"""
|
|
18
19
|
In-memory aggregation for UDAs.
|
|
19
20
|
|
|
20
21
|
At the moment, this returns all results in a single DataRowBatch.
|
|
21
22
|
"""
|
|
23
|
+
|
|
22
24
|
group_by: Optional[list[exprs.Expr]]
|
|
23
25
|
input_exprs: list[exprs.Expr]
|
|
24
26
|
agg_fn_eval_ctx: exprs.RowBuilder.EvalCtx
|
|
@@ -26,8 +28,13 @@ class AggregationNode(ExecNode):
|
|
|
26
28
|
output_batch: DataRowBatch
|
|
27
29
|
|
|
28
30
|
def __init__(
|
|
29
|
-
|
|
30
|
-
|
|
31
|
+
self,
|
|
32
|
+
tbl: catalog.TableVersion,
|
|
33
|
+
row_builder: exprs.RowBuilder,
|
|
34
|
+
group_by: Optional[list[exprs.Expr]],
|
|
35
|
+
agg_fn_calls: list[exprs.FunctionCall],
|
|
36
|
+
input_exprs: Iterable[exprs.Expr],
|
|
37
|
+
input: ExecNode,
|
|
31
38
|
):
|
|
32
39
|
output_exprs: list[exprs.Expr] = [] if group_by is None else list(group_by)
|
|
33
40
|
output_exprs.extend(agg_fn_calls)
|
|
@@ -86,4 +93,3 @@ class AggregationNode(ExecNode):
|
|
|
86
93
|
self.output_batch.flush_imgs(None, self.stored_img_cols, self.flushed_img_slots)
|
|
87
94
|
_logger.debug(f'AggregateNode: consumed {num_input_rows} rows, returning {len(self.output_batch.rows)} rows')
|
|
88
95
|
yield self.output_batch
|
|
89
|
-
|
|
pixeltable/exec/cache_prefetch_node.py
CHANGED
|
@@ -9,7 +9,7 @@ import urllib.request
|
|
|
9
9
|
from collections import deque
|
|
10
10
|
from concurrent import futures
|
|
11
11
|
from pathlib import Path
|
|
12
|
-
from typing import
|
|
12
|
+
from typing import Any, AsyncIterator, Iterator, Optional
|
|
13
13
|
from uuid import UUID
|
|
14
14
|
|
|
15
15
|
import pixeltable.env as env
|
|
@@ -30,6 +30,7 @@ class CachePrefetchNode(ExecNode):
|
|
|
30
30
|
TODO:
|
|
31
31
|
- adapting the number of download threads at runtime to maximize throughput
|
|
32
32
|
"""
|
|
33
|
+
|
|
33
34
|
BATCH_SIZE = 16
|
|
34
35
|
NUM_EXECUTOR_THREADS = 16
|
|
35
36
|
|
|
@@ -59,8 +60,8 @@ class CachePrefetchNode(ExecNode):
|
|
|
59
60
|
num_missing: int # number of missing URLs in this row
|
|
60
61
|
|
|
61
62
|
def __init__(
|
|
62
|
-
|
|
63
|
-
|
|
63
|
+
self, tbl_id: UUID, file_col_info: list[exprs.ColumnSlotIdx], input: ExecNode, retain_input_order: bool = True
|
|
64
|
+
):
|
|
64
65
|
# input_/output_exprs=[]: we don't have anything to evaluate
|
|
65
66
|
super().__init__(input.row_builder, [], [], input)
|
|
66
67
|
self.retain_input_order = retain_input_order
|
|
@@ -241,6 +242,7 @@ class CachePrefetchNode(ExecNode):
|
|
|
241
242
|
_logger.debug(f'Downloading {url} to {tmp_path}')
|
|
242
243
|
if parsed.scheme == 's3':
|
|
243
244
|
from pixeltable.utils.s3 import get_client
|
|
245
|
+
|
|
244
246
|
with self.boto_client_lock:
|
|
245
247
|
if self.boto_client is None:
|
|
246
248
|
config = {
|
|
pixeltable/exec/component_iteration_node.py
CHANGED
|
@@ -1,5 +1,4 @@
|
|
|
1
|
-
import
|
|
2
|
-
from typing import Iterator, Optional, AsyncIterator
|
|
1
|
+
from typing import AsyncIterator
|
|
3
2
|
|
|
4
3
|
import pixeltable.catalog as catalog
|
|
5
4
|
import pixeltable.exceptions as excs
|
|
@@ -14,6 +13,7 @@ class ComponentIterationNode(ExecNode):
|
|
|
14
13
|
|
|
15
14
|
Returns row batches of OUTPUT_BATCH_SIZE size.
|
|
16
15
|
"""
|
|
16
|
+
|
|
17
17
|
__OUTPUT_BATCH_SIZE = 1024
|
|
18
18
|
|
|
19
19
|
def __init__(self, view: catalog.TableVersion, input: ExecNode):
|
|
@@ -25,8 +25,8 @@ class ComponentIterationNode(ExecNode):
|
|
|
25
25
|
self.iterator_args = iterator_args[0]
|
|
26
26
|
assert isinstance(self.iterator_args, exprs.InlineDict)
|
|
27
27
|
self.iterator_args_ctx = self.row_builder.create_eval_ctx([self.iterator_args])
|
|
28
|
-
self.iterator_output_schema, self.unstored_column_names = (
|
|
29
|
-
|
|
28
|
+
self.iterator_output_schema, self.unstored_column_names = self.view.iterator_cls.output_schema(
|
|
29
|
+
**self.iterator_args.to_kwargs()
|
|
30
30
|
)
|
|
31
31
|
self.iterator_output_fields = list(self.iterator_output_schema.keys())
|
|
32
32
|
self.iterator_output_cols = {
|
|
@@ -34,7 +34,8 @@ class ComponentIterationNode(ExecNode):
|
|
|
34
34
|
}
|
|
35
35
|
# referenced iterator output fields
|
|
36
36
|
self.refd_output_slot_idxs = {
|
|
37
|
-
e.col.name: e.slot_idx
|
|
37
|
+
e.col.name: e.slot_idx
|
|
38
|
+
for e in self.row_builder.unique_exprs
|
|
38
39
|
if isinstance(e, exprs.ColumnRef) and e.col.name in self.iterator_output_fields
|
|
39
40
|
}
|
|
40
41
|
|
|
@@ -79,8 +80,7 @@ class ComponentIterationNode(ExecNode):
|
|
|
79
80
|
# verify and copy component_dict fields to their respective slots in output_row
|
|
80
81
|
for field_name, field_val in component_dict.items():
|
|
81
82
|
if field_name not in self.iterator_output_fields:
|
|
82
|
-
raise excs.Error(
|
|
83
|
-
f'Invalid field name {field_name} in output of {self.view.iterator_cls.__name__}')
|
|
83
|
+
raise excs.Error(f'Invalid field name {field_name} in output of {self.view.iterator_cls.__name__}')
|
|
84
84
|
if field_name not in self.refd_output_slot_idxs:
|
|
85
85
|
# we can ignore this
|
|
86
86
|
continue
|
|
@@ -90,5 +90,5 @@ class ComponentIterationNode(ExecNode):
|
|
|
90
90
|
if len(component_dict) != len(self.iterator_output_fields):
|
|
91
91
|
missing_fields = set(self.refd_output_slot_idxs.keys()) - set(component_dict.keys())
|
|
92
92
|
raise excs.Error(
|
|
93
|
-
f'Invalid output of {self.view.iterator_cls.__name__}: '
|
|
94
|
-
|
|
93
|
+
f'Invalid output of {self.view.iterator_cls.__name__}: missing fields {", ".join(missing_fields)}'
|
|
94
|
+
)
|
|
pixeltable/exec/data_row_batch.py
CHANGED
|
@@ -1,19 +1,21 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
|
-
|
|
2
|
+
|
|
3
3
|
import logging
|
|
4
|
+
from typing import Iterator, Optional
|
|
4
5
|
|
|
5
|
-
import pixeltable.exprs as exprs
|
|
6
6
|
import pixeltable.catalog as catalog
|
|
7
|
+
import pixeltable.exprs as exprs
|
|
7
8
|
from pixeltable.utils.media_store import MediaStore
|
|
8
9
|
|
|
9
|
-
|
|
10
10
|
_logger = logging.getLogger('pixeltable')
|
|
11
11
|
|
|
12
|
+
|
|
12
13
|
class DataRowBatch:
|
|
13
14
|
"""Set of DataRows, indexed by rowid.
|
|
14
15
|
|
|
15
16
|
Contains the metadata needed to initialize DataRows.
|
|
16
17
|
"""
|
|
18
|
+
|
|
17
19
|
tbl: Optional[catalog.TableVersion]
|
|
18
20
|
row_builder: exprs.RowBuilder
|
|
19
21
|
img_slot_idxs: list[int]
|
|
@@ -22,8 +24,11 @@ class DataRowBatch:
|
|
|
22
24
|
rows: list[exprs.DataRow]
|
|
23
25
|
|
|
24
26
|
def __init__(
|
|
25
|
-
self,
|
|
26
|
-
|
|
27
|
+
self,
|
|
28
|
+
tbl: Optional[catalog.TableVersion],
|
|
29
|
+
row_builder: exprs.RowBuilder,
|
|
30
|
+
num_rows: Optional[int] = None,
|
|
31
|
+
rows: Optional[list[exprs.DataRow]] = None,
|
|
27
32
|
):
|
|
28
33
|
"""
|
|
29
34
|
Requires either num_rows or rows to be specified, but not both.
|
|
@@ -34,7 +39,8 @@ class DataRowBatch:
|
|
|
34
39
|
self.img_slot_idxs = [e.slot_idx for e in row_builder.unique_exprs if e.col_type.is_image_type()]
|
|
35
40
|
# non-image media slots
|
|
36
41
|
self.media_slot_idxs = [
|
|
37
|
-
e.slot_idx
|
|
42
|
+
e.slot_idx
|
|
43
|
+
for e in row_builder.unique_exprs
|
|
38
44
|
if e.col_type.is_media_type() and not e.col_type.is_image_type()
|
|
39
45
|
]
|
|
40
46
|
self.array_slot_idxs = [e.slot_idx for e in row_builder.unique_exprs if e.col_type.is_array_type()]
|
|
@@ -44,14 +50,17 @@ class DataRowBatch:
|
|
|
44
50
|
if num_rows is None:
|
|
45
51
|
num_rows = 0
|
|
46
52
|
self.rows = [
|
|
47
|
-
exprs.DataRow(
|
|
53
|
+
exprs.DataRow(
|
|
54
|
+
row_builder.num_materialized, self.img_slot_idxs, self.media_slot_idxs, self.array_slot_idxs
|
|
55
|
+
)
|
|
48
56
|
for _ in range(num_rows)
|
|
49
57
|
]
|
|
50
58
|
|
|
51
59
|
def add_row(self, row: Optional[exprs.DataRow] = None) -> exprs.DataRow:
|
|
52
60
|
if row is None:
|
|
53
61
|
row = exprs.DataRow(
|
|
54
|
-
self.row_builder.num_materialized, self.img_slot_idxs, self.media_slot_idxs, self.array_slot_idxs
|
|
62
|
+
self.row_builder.num_materialized, self.img_slot_idxs, self.media_slot_idxs, self.array_slot_idxs
|
|
63
|
+
)
|
|
55
64
|
self.rows.append(row)
|
|
56
65
|
return row
|
|
57
66
|
|
|
@@ -65,8 +74,10 @@ class DataRowBatch:
|
|
|
65
74
|
return self.rows[index]
|
|
66
75
|
|
|
67
76
|
def flush_imgs(
|
|
68
|
-
|
|
69
|
-
|
|
77
|
+
self,
|
|
78
|
+
idx_range: Optional[slice] = None,
|
|
79
|
+
stored_img_info: Optional[list[exprs.ColumnSlotIdx]] = None,
|
|
80
|
+
flushed_slot_idxs: Optional[list[int]] = None,
|
|
70
81
|
) -> None:
|
|
71
82
|
"""Flushes images in the given range of rows."""
|
|
72
83
|
assert self.tbl is not None
|
pixeltable/exec/exec_context.py
CHANGED
|
@@ -4,12 +4,19 @@ import sqlalchemy as sql
|
|
|
4
4
|
|
|
5
5
|
import pixeltable.exprs as exprs
|
|
6
6
|
|
|
7
|
+
|
|
7
8
|
class ExecContext:
|
|
8
9
|
"""Class for execution runtime constants"""
|
|
10
|
+
|
|
9
11
|
def __init__(
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
12
|
+
self,
|
|
13
|
+
row_builder: exprs.RowBuilder,
|
|
14
|
+
*,
|
|
15
|
+
show_pbar: bool = False,
|
|
16
|
+
batch_size: int = 0,
|
|
17
|
+
pk_clause: Optional[list[sql.ClauseElement]] = None,
|
|
18
|
+
num_computed_exprs: int = 0,
|
|
19
|
+
ignore_errors: bool = False,
|
|
13
20
|
):
|
|
14
21
|
self.show_pbar = show_pbar
|
|
15
22
|
self.batch_size = batch_size
|
pixeltable/exec/exec_node.py
CHANGED
|
@@ -4,16 +4,19 @@ import abc
|
|
|
4
4
|
import asyncio
|
|
5
5
|
import logging
|
|
6
6
|
import sys
|
|
7
|
-
from typing import Iterable, Iterator, Optional, TypeVar
|
|
7
|
+
from typing import AsyncIterator, Iterable, Iterator, Optional, TypeVar
|
|
8
8
|
|
|
9
9
|
import pixeltable.exprs as exprs
|
|
10
|
+
|
|
10
11
|
from .data_row_batch import DataRowBatch
|
|
11
12
|
from .exec_context import ExecContext
|
|
12
13
|
|
|
13
14
|
_logger = logging.getLogger('pixeltable')
|
|
14
15
|
|
|
16
|
+
|
|
15
17
|
class ExecNode(abc.ABC):
|
|
16
18
|
"""Base class of all execution nodes"""
|
|
19
|
+
|
|
17
20
|
output_exprs: Iterable[exprs.Expr]
|
|
18
21
|
row_builder: exprs.RowBuilder
|
|
19
22
|
input: Optional[ExecNode]
|
|
@@ -22,8 +25,12 @@ class ExecNode(abc.ABC):
|
|
|
22
25
|
ctx: Optional[ExecContext]
|
|
23
26
|
|
|
24
27
|
def __init__(
|
|
25
|
-
|
|
26
|
-
|
|
28
|
+
self,
|
|
29
|
+
row_builder: exprs.RowBuilder,
|
|
30
|
+
output_exprs: Iterable[exprs.Expr],
|
|
31
|
+
input_exprs: Iterable[exprs.Expr],
|
|
32
|
+
input: Optional[ExecNode] = None,
|
|
33
|
+
):
|
|
27
34
|
self.output_exprs = output_exprs
|
|
28
35
|
self.row_builder = row_builder
|
|
29
36
|
self.input = input
|
|
@@ -31,8 +38,7 @@ class ExecNode(abc.ABC):
|
|
|
31
38
|
output_slot_idxs = {e.slot_idx for e in output_exprs}
|
|
32
39
|
output_dependencies = row_builder.get_dependencies(output_exprs, exclude=input_exprs)
|
|
33
40
|
self.flushed_img_slots = [
|
|
34
|
-
e.slot_idx for e in output_dependencies
|
|
35
|
-
if e.col_type.is_image_type() and e.slot_idx not in output_slot_idxs
|
|
41
|
+
e.slot_idx for e in output_dependencies if e.col_type.is_image_type() and e.slot_idx not in output_slot_idxs
|
|
36
42
|
]
|
|
37
43
|
self.stored_img_cols = []
|
|
38
44
|
self.ctx = None # all nodes of a tree share the same context
|
|
@@ -53,16 +59,20 @@ class ExecNode(abc.ABC):
|
|
|
53
59
|
pass
|
|
54
60
|
|
|
55
61
|
def __iter__(self) -> Iterator[DataRowBatch]:
|
|
62
|
+
running_loop: Optional[asyncio.AbstractEventLoop] = None
|
|
63
|
+
loop: asyncio.AbstractEventLoop
|
|
56
64
|
try:
|
|
57
|
-
# check if we are already in an event loop (eg, Jupyter's); if so, patch it to allow
|
|
58
|
-
|
|
65
|
+
# check if we are already in an event loop (eg, Jupyter's); if so, patch it to allow
|
|
66
|
+
# multiple run_until_complete()
|
|
67
|
+
running_loop = asyncio.get_running_loop()
|
|
59
68
|
import nest_asyncio # type: ignore
|
|
69
|
+
|
|
60
70
|
nest_asyncio.apply()
|
|
71
|
+
loop = running_loop
|
|
72
|
+
_logger.debug(f'Patched running loop')
|
|
61
73
|
except RuntimeError:
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
loop = asyncio.new_event_loop()
|
|
65
|
-
asyncio.set_event_loop(loop)
|
|
74
|
+
loop = asyncio.new_event_loop()
|
|
75
|
+
asyncio.set_event_loop(loop)
|
|
66
76
|
|
|
67
77
|
if 'pytest' in sys.modules:
|
|
68
78
|
loop.set_debug(True)
|
|
@@ -75,7 +85,8 @@ class ExecNode(abc.ABC):
|
|
|
75
85
|
except StopAsyncIteration:
|
|
76
86
|
pass
|
|
77
87
|
finally:
|
|
78
|
-
loop
|
|
88
|
+
if loop != running_loop:
|
|
89
|
+
loop.close()
|
|
79
90
|
|
|
80
91
|
def open(self) -> None:
|
|
81
92
|
"""Bottom-up initialization of nodes for execution. Must be called before __next__."""
|
|
pixeltable/exec/expr_eval/evaluators.py
CHANGED
|
@@ -5,10 +5,10 @@ import datetime
|
|
|
5
5
|
import itertools
|
|
6
6
|
import logging
|
|
7
7
|
import sys
|
|
8
|
-
from typing import
|
|
8
|
+
from typing import Any, Callable, Iterator, Optional, cast
|
|
9
|
+
|
|
10
|
+
from pixeltable import exprs, func
|
|
9
11
|
|
|
10
|
-
from pixeltable import exprs
|
|
11
|
-
from pixeltable import func
|
|
12
12
|
from .globals import Dispatcher, Evaluator, FnCallArgs
|
|
13
13
|
|
|
14
14
|
_logger = logging.getLogger('pixeltable')
|
|
@@ -23,6 +23,7 @@ class DefaultExprEvaluator(Evaluator):
|
|
|
23
23
|
TODO:
|
|
24
24
|
- parallelize via Ray
|
|
25
25
|
"""
|
|
26
|
+
|
|
26
27
|
e: exprs.Expr
|
|
27
28
|
|
|
28
29
|
def __init__(self, e: exprs.Expr, dispatcher: Dispatcher):
|
|
@@ -60,6 +61,7 @@ class FnCallEvaluator(Evaluator):
|
|
|
60
61
|
TODO:
|
|
61
62
|
- adaptive batching: finding the optimal batch size based on observed execution times
|
|
62
63
|
"""
|
|
64
|
+
|
|
63
65
|
fn_call: exprs.FunctionCall
|
|
64
66
|
fn: func.CallableFunction
|
|
65
67
|
scalar_py_fn: Optional[Callable] # only set for non-batching CallableFunctions
|
|
@@ -73,7 +75,7 @@ class FnCallEvaluator(Evaluator):
|
|
|
73
75
|
self.fn_call = fn_call
|
|
74
76
|
self.fn = cast(func.CallableFunction, fn_call.fn)
|
|
75
77
|
if isinstance(self.fn, func.CallableFunction) and self.fn.is_batched:
|
|
76
|
-
self.call_args_queue =
|
|
78
|
+
self.call_args_queue = asyncio.Queue[FnCallArgs]()
|
|
77
79
|
# we're not supplying sample arguments there, they're ignored anyway
|
|
78
80
|
self.batch_size = self.fn.get_batch_size()
|
|
79
81
|
self.scalar_py_fn = None
|
|
@@ -167,14 +169,16 @@ class FnCallEvaluator(Evaluator):
|
|
|
167
169
|
for k in item.kwargs.keys():
|
|
168
170
|
batch_kwargs[k][i] = item.kwargs[k]
|
|
169
171
|
return FnCallArgs(
|
|
170
|
-
self.fn_call, [item.row for item in call_args], batch_args=batch_args, batch_kwargs=batch_kwargs
|
|
172
|
+
self.fn_call, [item.row for item in call_args], batch_args=batch_args, batch_kwargs=batch_kwargs
|
|
173
|
+
)
|
|
171
174
|
|
|
172
175
|
async def eval_batch(self, batched_call_args: FnCallArgs) -> None:
|
|
173
176
|
result_batch: list[Any]
|
|
174
177
|
try:
|
|
175
178
|
if self.fn.is_async:
|
|
176
179
|
result_batch = await self.fn.aexec_batch(
|
|
177
|
-
*batched_call_args.batch_args, **batched_call_args.batch_kwargs
|
|
180
|
+
*batched_call_args.batch_args, **batched_call_args.batch_kwargs
|
|
181
|
+
)
|
|
178
182
|
else:
|
|
179
183
|
# check for cancellation before starting something potentially long-running
|
|
180
184
|
if asyncio.current_task().cancelled() or self.dispatcher.exc_event.is_set():
|
|
@@ -205,6 +209,7 @@ class FnCallEvaluator(Evaluator):
|
|
|
205
209
|
self.dispatcher.dispatch([call_args.row])
|
|
206
210
|
except Exception as exc:
|
|
207
211
|
import anthropic
|
|
212
|
+
|
|
208
213
|
if isinstance(exc, anthropic.RateLimitError):
|
|
209
214
|
_logger.debug(f'RateLimitError: {exc}')
|
|
210
215
|
_, _, exc_tb = sys.exc_info()
|
|
@@ -228,7 +233,8 @@ class FnCallEvaluator(Evaluator):
|
|
|
228
233
|
rows_with_excs.add(idx)
|
|
229
234
|
self.dispatcher.dispatch_exc(item.rows, self.fn_call.slot_idx, exc_tb)
|
|
230
235
|
self.dispatcher.dispatch(
|
|
231
|
-
[call_args_batch[i].row for i in range(len(call_args_batch)) if i not in rows_with_excs]
|
|
236
|
+
[call_args_batch[i].row for i in range(len(call_args_batch)) if i not in rows_with_excs]
|
|
237
|
+
)
|
|
232
238
|
|
|
233
239
|
def _close(self) -> None:
|
|
234
240
|
"""Create a task for the incomplete batch of queued FnCallArgs, if any"""
|