pixeltable 0.3.1__py3-none-any.whl → 0.3.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pixeltable might be problematic. Click here for more details.
- pixeltable/__init__.py +64 -11
- pixeltable/__version__.py +2 -2
- pixeltable/catalog/__init__.py +1 -1
- pixeltable/catalog/catalog.py +50 -27
- pixeltable/catalog/column.py +27 -11
- pixeltable/catalog/dir.py +6 -4
- pixeltable/catalog/globals.py +8 -1
- pixeltable/catalog/insertable_table.py +25 -15
- pixeltable/catalog/named_function.py +10 -6
- pixeltable/catalog/path.py +3 -2
- pixeltable/catalog/path_dict.py +8 -6
- pixeltable/catalog/schema_object.py +2 -1
- pixeltable/catalog/table.py +123 -103
- pixeltable/catalog/table_version.py +292 -143
- pixeltable/catalog/table_version_path.py +8 -5
- pixeltable/catalog/view.py +68 -27
- pixeltable/dataframe.py +102 -72
- pixeltable/env.py +39 -23
- pixeltable/exec/__init__.py +2 -2
- pixeltable/exec/aggregation_node.py +10 -4
- pixeltable/exec/cache_prefetch_node.py +5 -3
- pixeltable/exec/component_iteration_node.py +9 -8
- pixeltable/exec/data_row_batch.py +21 -10
- pixeltable/exec/exec_context.py +10 -3
- pixeltable/exec/exec_node.py +23 -12
- pixeltable/exec/expr_eval/evaluators.py +18 -17
- pixeltable/exec/expr_eval/expr_eval_node.py +29 -16
- pixeltable/exec/expr_eval/globals.py +33 -11
- pixeltable/exec/expr_eval/row_buffer.py +5 -6
- pixeltable/exec/expr_eval/schedulers.py +170 -42
- pixeltable/exec/in_memory_data_node.py +8 -7
- pixeltable/exec/row_update_node.py +15 -5
- pixeltable/exec/sql_node.py +56 -27
- pixeltable/exprs/__init__.py +2 -2
- pixeltable/exprs/arithmetic_expr.py +57 -26
- pixeltable/exprs/array_slice.py +1 -1
- pixeltable/exprs/column_property_ref.py +2 -1
- pixeltable/exprs/column_ref.py +20 -15
- pixeltable/exprs/comparison.py +6 -2
- pixeltable/exprs/compound_predicate.py +1 -3
- pixeltable/exprs/data_row.py +2 -2
- pixeltable/exprs/expr.py +101 -72
- pixeltable/exprs/expr_dict.py +2 -1
- pixeltable/exprs/expr_set.py +3 -1
- pixeltable/exprs/function_call.py +39 -41
- pixeltable/exprs/globals.py +1 -0
- pixeltable/exprs/in_predicate.py +2 -2
- pixeltable/exprs/inline_expr.py +20 -17
- pixeltable/exprs/json_mapper.py +4 -2
- pixeltable/exprs/json_path.py +12 -18
- pixeltable/exprs/literal.py +5 -9
- pixeltable/exprs/method_ref.py +1 -0
- pixeltable/exprs/object_ref.py +1 -1
- pixeltable/exprs/row_builder.py +31 -16
- pixeltable/exprs/rowid_ref.py +14 -5
- pixeltable/exprs/similarity_expr.py +11 -6
- pixeltable/exprs/sql_element_cache.py +1 -1
- pixeltable/exprs/type_cast.py +24 -9
- pixeltable/ext/__init__.py +1 -0
- pixeltable/ext/functions/__init__.py +1 -0
- pixeltable/ext/functions/whisperx.py +2 -2
- pixeltable/ext/functions/yolox.py +11 -11
- pixeltable/func/aggregate_function.py +17 -13
- pixeltable/func/callable_function.py +6 -6
- pixeltable/func/expr_template_function.py +15 -14
- pixeltable/func/function.py +16 -16
- pixeltable/func/function_registry.py +11 -8
- pixeltable/func/globals.py +4 -2
- pixeltable/func/query_template_function.py +12 -13
- pixeltable/func/signature.py +18 -9
- pixeltable/func/tools.py +10 -17
- pixeltable/func/udf.py +106 -11
- pixeltable/functions/__init__.py +21 -2
- pixeltable/functions/anthropic.py +21 -15
- pixeltable/functions/fireworks.py +63 -5
- pixeltable/functions/gemini.py +13 -3
- pixeltable/functions/globals.py +18 -6
- pixeltable/functions/huggingface.py +20 -38
- pixeltable/functions/image.py +7 -3
- pixeltable/functions/json.py +1 -0
- pixeltable/functions/llama_cpp.py +1 -4
- pixeltable/functions/mistralai.py +31 -20
- pixeltable/functions/ollama.py +4 -18
- pixeltable/functions/openai.py +214 -109
- pixeltable/functions/replicate.py +11 -10
- pixeltable/functions/string.py +70 -7
- pixeltable/functions/timestamp.py +21 -8
- pixeltable/functions/together.py +66 -52
- pixeltable/functions/video.py +1 -0
- pixeltable/functions/vision.py +14 -11
- pixeltable/functions/whisper.py +2 -1
- pixeltable/globals.py +61 -28
- pixeltable/index/__init__.py +1 -1
- pixeltable/index/btree.py +5 -3
- pixeltable/index/embedding_index.py +15 -14
- pixeltable/io/__init__.py +1 -1
- pixeltable/io/external_store.py +30 -25
- pixeltable/io/fiftyone.py +6 -14
- pixeltable/io/globals.py +33 -27
- pixeltable/io/hf_datasets.py +3 -2
- pixeltable/io/label_studio.py +80 -71
- pixeltable/io/pandas.py +33 -9
- pixeltable/io/parquet.py +10 -13
- pixeltable/iterators/__init__.py +1 -0
- pixeltable/iterators/audio.py +205 -0
- pixeltable/iterators/document.py +19 -8
- pixeltable/iterators/image.py +6 -24
- pixeltable/iterators/string.py +3 -6
- pixeltable/iterators/video.py +1 -7
- pixeltable/metadata/__init__.py +9 -2
- pixeltable/metadata/converters/convert_10.py +2 -2
- pixeltable/metadata/converters/convert_15.py +1 -5
- pixeltable/metadata/converters/convert_16.py +2 -4
- pixeltable/metadata/converters/convert_17.py +2 -4
- pixeltable/metadata/converters/convert_18.py +2 -4
- pixeltable/metadata/converters/convert_19.py +2 -5
- pixeltable/metadata/converters/convert_20.py +1 -4
- pixeltable/metadata/converters/convert_21.py +4 -6
- pixeltable/metadata/converters/convert_22.py +1 -0
- pixeltable/metadata/converters/convert_23.py +5 -5
- pixeltable/metadata/converters/convert_24.py +12 -13
- pixeltable/metadata/converters/convert_26.py +23 -0
- pixeltable/metadata/converters/util.py +3 -4
- pixeltable/metadata/notes.py +1 -0
- pixeltable/metadata/schema.py +13 -2
- pixeltable/plan.py +173 -98
- pixeltable/store.py +42 -26
- pixeltable/type_system.py +130 -85
- pixeltable/utils/arrow.py +1 -7
- pixeltable/utils/coco.py +16 -17
- pixeltable/utils/code.py +1 -1
- pixeltable/utils/console_output.py +44 -0
- pixeltable/utils/description_helper.py +7 -7
- pixeltable/utils/documents.py +3 -1
- pixeltable/utils/filecache.py +13 -8
- pixeltable/utils/http_server.py +9 -8
- pixeltable/utils/media_store.py +2 -1
- pixeltable/utils/pytorch.py +11 -14
- pixeltable/utils/s3.py +1 -0
- pixeltable/utils/sql.py +1 -0
- pixeltable/utils/transactional_directory.py +2 -2
- {pixeltable-0.3.1.dist-info → pixeltable-0.3.3.dist-info}/METADATA +7 -8
- pixeltable-0.3.3.dist-info/RECORD +163 -0
- pixeltable-0.3.1.dist-info/RECORD +0 -160
- {pixeltable-0.3.1.dist-info → pixeltable-0.3.3.dist-info}/LICENSE +0 -0
- {pixeltable-0.3.1.dist-info → pixeltable-0.3.3.dist-info}/WHEEL +0 -0
- {pixeltable-0.3.1.dist-info → pixeltable-0.3.3.dist-info}/entry_points.txt +0 -0
pixeltable/env.py
CHANGED
|
@@ -1,6 +1,5 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
-
from abc import abstractmethod
|
|
4
3
|
import datetime
|
|
5
4
|
import glob
|
|
6
5
|
import http.server
|
|
@@ -16,9 +15,11 @@ import sys
|
|
|
16
15
|
import threading
|
|
17
16
|
import uuid
|
|
18
17
|
import warnings
|
|
18
|
+
from abc import abstractmethod
|
|
19
19
|
from dataclasses import dataclass, field
|
|
20
20
|
from pathlib import Path
|
|
21
|
-
from
|
|
21
|
+
from sys import stdout
|
|
22
|
+
from typing import TYPE_CHECKING, Any, Callable, Optional, TypeVar
|
|
22
23
|
from zoneinfo import ZoneInfo, ZoneInfoNotFoundError
|
|
23
24
|
|
|
24
25
|
import pixeltable_pgserver
|
|
@@ -28,6 +29,7 @@ from tqdm import TqdmWarning
|
|
|
28
29
|
|
|
29
30
|
import pixeltable.exceptions as excs
|
|
30
31
|
from pixeltable import metadata
|
|
32
|
+
from pixeltable.utils.console_output import ConsoleLogger, ConsoleMessageFilter, ConsoleOutputHandler, map_level
|
|
31
33
|
from pixeltable.utils.http_server import make_server
|
|
32
34
|
|
|
33
35
|
if TYPE_CHECKING:
|
|
@@ -67,6 +69,7 @@ class Env:
|
|
|
67
69
|
_httpd: Optional[http.server.HTTPServer]
|
|
68
70
|
_http_address: Optional[str]
|
|
69
71
|
_logger: logging.Logger
|
|
72
|
+
_console_logger: ConsoleLogger
|
|
70
73
|
_default_log_level: int
|
|
71
74
|
_logfilename: Optional[str]
|
|
72
75
|
_log_to_stdout: bool
|
|
@@ -92,6 +95,8 @@ class Env:
|
|
|
92
95
|
cls._instance = env
|
|
93
96
|
|
|
94
97
|
def __init__(self):
|
|
98
|
+
assert self._instance is None, 'Env is a singleton; use Env.get() to access the instance'
|
|
99
|
+
|
|
95
100
|
self._home = None
|
|
96
101
|
self._media_dir = None # computed media files
|
|
97
102
|
self._file_cache_dir = None # cached media files with external URL
|
|
@@ -231,6 +236,10 @@ class Env:
|
|
|
231
236
|
else:
|
|
232
237
|
return False
|
|
233
238
|
|
|
239
|
+
@property
|
|
240
|
+
def console_logger(self) -> ConsoleLogger:
|
|
241
|
+
return self._console_logger
|
|
242
|
+
|
|
234
243
|
def _set_up(self, echo: bool = False, reinit_db: bool = False) -> None:
|
|
235
244
|
if self._initialized:
|
|
236
245
|
return
|
|
@@ -288,6 +297,14 @@ class Env:
|
|
|
288
297
|
warnings.simplefilter('ignore', category=UserWarning)
|
|
289
298
|
warnings.simplefilter('ignore', category=FutureWarning)
|
|
290
299
|
|
|
300
|
+
# Set verbose level for user visible console messages
|
|
301
|
+
verbosity = map_level(self._config.get_int_value('verbosity'))
|
|
302
|
+
stdout_handler = ConsoleOutputHandler(stream=stdout)
|
|
303
|
+
stdout_handler.setLevel(verbosity)
|
|
304
|
+
stdout_handler.addFilter(ConsoleMessageFilter())
|
|
305
|
+
self._logger.addHandler(stdout_handler)
|
|
306
|
+
self._console_logger = ConsoleLogger(self._logger)
|
|
307
|
+
|
|
291
308
|
# configure _logger to log to a file
|
|
292
309
|
self._logfilename = datetime.datetime.now().strftime('%Y%m%d_%H%M%S') + '.log'
|
|
293
310
|
fh = logging.FileHandler(self._log_dir / self._logfilename, mode='w')
|
|
@@ -358,10 +375,11 @@ class Env:
|
|
|
358
375
|
|
|
359
376
|
if create_db:
|
|
360
377
|
from pixeltable.metadata import schema
|
|
378
|
+
|
|
361
379
|
schema.base_metadata.create_all(self._sa_engine)
|
|
362
380
|
metadata.create_system_info(self._sa_engine)
|
|
363
381
|
|
|
364
|
-
|
|
382
|
+
self.console_logger.info(f'Connected to Pixeltable database at: {self.db_url}')
|
|
365
383
|
|
|
366
384
|
# we now have a home directory and db; start other services
|
|
367
385
|
self._set_up_runtime()
|
|
@@ -370,11 +388,7 @@ class Env:
|
|
|
370
388
|
def _create_engine(self, time_zone_name: Optional[str], echo: bool = False) -> None:
|
|
371
389
|
connect_args = {} if time_zone_name is None else {'options': f'-c timezone={time_zone_name}'}
|
|
372
390
|
self._sa_engine = sql.create_engine(
|
|
373
|
-
self.db_url,
|
|
374
|
-
echo=echo,
|
|
375
|
-
future=True,
|
|
376
|
-
isolation_level='REPEATABLE READ',
|
|
377
|
-
connect_args=connect_args,
|
|
391
|
+
self.db_url, echo=echo, future=True, isolation_level='REPEATABLE READ', connect_args=connect_args
|
|
378
392
|
)
|
|
379
393
|
self._logger.info(f'Created SQLAlchemy engine at: {self.db_url}')
|
|
380
394
|
with self.engine.begin() as conn:
|
|
@@ -407,7 +421,7 @@ class Env:
|
|
|
407
421
|
with engine.begin() as conn:
|
|
408
422
|
# use C collation to get standard C/Python-style sorting
|
|
409
423
|
stmt = (
|
|
410
|
-
f
|
|
424
|
+
f'CREATE DATABASE {preparer.quote(self._db_name)} '
|
|
411
425
|
"ENCODING 'utf-8' LC_COLLATE 'C' LC_CTYPE 'C' TEMPLATE template0"
|
|
412
426
|
)
|
|
413
427
|
conn.execute(sql.text(stmt))
|
|
@@ -431,12 +445,12 @@ class Env:
|
|
|
431
445
|
try:
|
|
432
446
|
with engine.begin() as conn:
|
|
433
447
|
# terminate active connections
|
|
434
|
-
stmt =
|
|
448
|
+
stmt = f"""
|
|
435
449
|
SELECT pg_terminate_backend(pg_stat_activity.pid)
|
|
436
450
|
FROM pg_stat_activity
|
|
437
451
|
WHERE pg_stat_activity.datname = '{self._db_name}'
|
|
438
452
|
AND pid <> pg_backend_pid()
|
|
439
|
-
"""
|
|
453
|
+
"""
|
|
440
454
|
conn.execute(sql.text(stmt))
|
|
441
455
|
# drop db
|
|
442
456
|
stmt = f'DROP DATABASE {preparer.quote(self._db_name)}'
|
|
@@ -546,7 +560,7 @@ class Env:
|
|
|
546
560
|
is_installed = False
|
|
547
561
|
self.__optional_packages[package_name] = PackageInfo(
|
|
548
562
|
is_installed=is_installed,
|
|
549
|
-
library_name=library_name or package_name # defaults to package_name unless specified otherwise
|
|
563
|
+
library_name=library_name or package_name, # defaults to package_name unless specified otherwise
|
|
550
564
|
)
|
|
551
565
|
|
|
552
566
|
def require_package(self, package_name: str, min_version: Optional[list[int]] = None) -> None:
|
|
@@ -592,6 +606,7 @@ class Env:
|
|
|
592
606
|
"""
|
|
593
607
|
import spacy
|
|
594
608
|
from spacy.cli.download import get_model_filename
|
|
609
|
+
|
|
595
610
|
spacy_model = 'en_core_web_sm'
|
|
596
611
|
spacy_model_version = '3.7.1'
|
|
597
612
|
filename = get_model_filename(spacy_model, spacy_model_version, sdist=False)
|
|
@@ -609,7 +624,7 @@ class Env:
|
|
|
609
624
|
self._logger.warn(f'Failed to load spaCy model: {spacy_model}', exc_info=exc)
|
|
610
625
|
warnings.warn(
|
|
611
626
|
f"Failed to load spaCy model '{spacy_model}'. spaCy features will not be available.",
|
|
612
|
-
excs.PixeltableWarning
|
|
627
|
+
excs.PixeltableWarning,
|
|
613
628
|
)
|
|
614
629
|
self.__optional_packages['spacy'].is_installed = False
|
|
615
630
|
|
|
@@ -619,8 +634,7 @@ class Env:
|
|
|
619
634
|
def create_tmp_path(self, extension: str = '') -> Path:
|
|
620
635
|
return self._tmp_dir / f'{uuid.uuid4()}{extension}'
|
|
621
636
|
|
|
622
|
-
|
|
623
|
-
#def get_resource_pool_info(self, pool_id: str, pool_info_cls: Optional[Type[T]]) -> T:
|
|
637
|
+
# def get_resource_pool_info(self, pool_id: str, pool_info_cls: Optional[Type[T]]) -> T:
|
|
624
638
|
def get_resource_pool_info(self, pool_id: str, make_pool_info: Optional[Callable[[], T]] = None) -> T:
|
|
625
639
|
"""Returns the info object for the given id, creating it if necessary."""
|
|
626
640
|
info = self._resource_pool_info.get(pool_id)
|
|
@@ -690,6 +704,7 @@ def register_client(name: str) -> Callable:
|
|
|
690
704
|
Args:
|
|
691
705
|
- name (str): The name of the API client (e.g., 'openai' or 'label-studio').
|
|
692
706
|
"""
|
|
707
|
+
|
|
693
708
|
def decorator(fn: Callable) -> None:
|
|
694
709
|
global _registered_clients
|
|
695
710
|
sig = inspect.signature(fn)
|
|
@@ -704,6 +719,7 @@ class Config:
|
|
|
704
719
|
The (global) Pixeltable configuration, as loaded from `config.toml`. Provides methods for retrieving
|
|
705
720
|
configuration values, which can be set in the config file or as environment variables.
|
|
706
721
|
"""
|
|
722
|
+
|
|
707
723
|
__config: dict[str, Any]
|
|
708
724
|
|
|
709
725
|
@classmethod
|
|
@@ -733,12 +749,7 @@ class Config:
|
|
|
733
749
|
free_disk_space_bytes = shutil.disk_usage(config_path.parent).free
|
|
734
750
|
# Default cache size is 1/5 of free disk space
|
|
735
751
|
file_cache_size_g = free_disk_space_bytes / 5 / (1 << 30)
|
|
736
|
-
return {
|
|
737
|
-
'pixeltable': {
|
|
738
|
-
'file_cache_size_g': round(file_cache_size_g, 1),
|
|
739
|
-
'hide_warnings': False,
|
|
740
|
-
}
|
|
741
|
-
}
|
|
752
|
+
return {'pixeltable': {'file_cache_size_g': round(file_cache_size_g, 1), 'hide_warnings': False}}
|
|
742
753
|
|
|
743
754
|
def __init__(self, config: dict[str, Any]) -> None:
|
|
744
755
|
self.__config = config
|
|
@@ -823,7 +834,9 @@ class RateLimitsInfo:
|
|
|
823
834
|
self.resource_limits = {k: RateLimitInfo(k, now, *v) for k, v in kwargs.items() if v is not None}
|
|
824
835
|
# TODO: remove
|
|
825
836
|
for info in self.resource_limits.values():
|
|
826
|
-
_logger.debug(
|
|
837
|
+
_logger.debug(
|
|
838
|
+
f'Init {info.resource} rate limit: rem={info.remaining} reset={info.reset_at.strftime(TIME_FORMAT)} delta={(info.reset_at - now).total_seconds()}'
|
|
839
|
+
)
|
|
827
840
|
else:
|
|
828
841
|
for k, v in kwargs.items():
|
|
829
842
|
if v is not None:
|
|
@@ -838,6 +851,7 @@ class RateLimitsInfo:
|
|
|
838
851
|
@dataclass
|
|
839
852
|
class RateLimitInfo:
|
|
840
853
|
"""Container for rate limit-related information for a single resource."""
|
|
854
|
+
|
|
841
855
|
resource: str
|
|
842
856
|
recorded_at: datetime.datetime
|
|
843
857
|
limit: int
|
|
@@ -854,4 +868,6 @@ class RateLimitInfo:
|
|
|
854
868
|
reset_delta = reset_at - self.reset_at
|
|
855
869
|
self.reset_at = reset_at
|
|
856
870
|
# TODO: remove
|
|
857
|
-
_logger.debug(
|
|
871
|
+
_logger.debug(
|
|
872
|
+
f'Update {self.resource} rate limit: rem={self.remaining} reset={self.reset_at.strftime(TIME_FORMAT)} reset_delta={reset_delta.total_seconds()} recorded_delta={(self.reset_at - recorded_at).total_seconds()}'
|
|
873
|
+
)
|
pixeltable/exec/__init__.py
CHANGED
|
@@ -4,7 +4,7 @@ from .component_iteration_node import ComponentIterationNode
|
|
|
4
4
|
from .data_row_batch import DataRowBatch
|
|
5
5
|
from .exec_context import ExecContext
|
|
6
6
|
from .exec_node import ExecNode
|
|
7
|
+
from .expr_eval import ExprEvalNode
|
|
7
8
|
from .in_memory_data_node import InMemoryDataNode
|
|
8
9
|
from .row_update_node import RowUpdateNode
|
|
9
|
-
from .sql_node import
|
|
10
|
-
from .expr_eval import ExprEvalNode
|
|
10
|
+
from .sql_node import SqlAggregationNode, SqlJoinNode, SqlLookupNode, SqlNode, SqlScanNode
|
|
@@ -2,7 +2,7 @@ from __future__ import annotations
|
|
|
2
2
|
|
|
3
3
|
import logging
|
|
4
4
|
import sys
|
|
5
|
-
from typing import Any, Iterable, Iterator, Optional, cast
|
|
5
|
+
from typing import Any, AsyncIterator, Iterable, Iterator, Optional, cast
|
|
6
6
|
|
|
7
7
|
import pixeltable.catalog as catalog
|
|
8
8
|
import pixeltable.exceptions as excs
|
|
@@ -13,12 +13,14 @@ from .exec_node import ExecNode
|
|
|
13
13
|
|
|
14
14
|
_logger = logging.getLogger('pixeltable')
|
|
15
15
|
|
|
16
|
+
|
|
16
17
|
class AggregationNode(ExecNode):
|
|
17
18
|
"""
|
|
18
19
|
In-memory aggregation for UDAs.
|
|
19
20
|
|
|
20
21
|
At the moment, this returns all results in a single DataRowBatch.
|
|
21
22
|
"""
|
|
23
|
+
|
|
22
24
|
group_by: Optional[list[exprs.Expr]]
|
|
23
25
|
input_exprs: list[exprs.Expr]
|
|
24
26
|
agg_fn_eval_ctx: exprs.RowBuilder.EvalCtx
|
|
@@ -26,8 +28,13 @@ class AggregationNode(ExecNode):
|
|
|
26
28
|
output_batch: DataRowBatch
|
|
27
29
|
|
|
28
30
|
def __init__(
|
|
29
|
-
|
|
30
|
-
|
|
31
|
+
self,
|
|
32
|
+
tbl: catalog.TableVersion,
|
|
33
|
+
row_builder: exprs.RowBuilder,
|
|
34
|
+
group_by: Optional[list[exprs.Expr]],
|
|
35
|
+
agg_fn_calls: list[exprs.FunctionCall],
|
|
36
|
+
input_exprs: Iterable[exprs.Expr],
|
|
37
|
+
input: ExecNode,
|
|
31
38
|
):
|
|
32
39
|
output_exprs: list[exprs.Expr] = [] if group_by is None else list(group_by)
|
|
33
40
|
output_exprs.extend(agg_fn_calls)
|
|
@@ -86,4 +93,3 @@ class AggregationNode(ExecNode):
|
|
|
86
93
|
self.output_batch.flush_imgs(None, self.stored_img_cols, self.flushed_img_slots)
|
|
87
94
|
_logger.debug(f'AggregateNode: consumed {num_input_rows} rows, returning {len(self.output_batch.rows)} rows')
|
|
88
95
|
yield self.output_batch
|
|
89
|
-
|
|
@@ -9,7 +9,7 @@ import urllib.request
|
|
|
9
9
|
from collections import deque
|
|
10
10
|
from concurrent import futures
|
|
11
11
|
from pathlib import Path
|
|
12
|
-
from typing import
|
|
12
|
+
from typing import Any, AsyncIterator, Iterator, Optional
|
|
13
13
|
from uuid import UUID
|
|
14
14
|
|
|
15
15
|
import pixeltable.env as env
|
|
@@ -30,6 +30,7 @@ class CachePrefetchNode(ExecNode):
|
|
|
30
30
|
TODO:
|
|
31
31
|
- adapting the number of download threads at runtime to maximize throughput
|
|
32
32
|
"""
|
|
33
|
+
|
|
33
34
|
BATCH_SIZE = 16
|
|
34
35
|
NUM_EXECUTOR_THREADS = 16
|
|
35
36
|
|
|
@@ -59,8 +60,8 @@ class CachePrefetchNode(ExecNode):
|
|
|
59
60
|
num_missing: int # number of missing URLs in this row
|
|
60
61
|
|
|
61
62
|
def __init__(
|
|
62
|
-
|
|
63
|
-
|
|
63
|
+
self, tbl_id: UUID, file_col_info: list[exprs.ColumnSlotIdx], input: ExecNode, retain_input_order: bool = True
|
|
64
|
+
):
|
|
64
65
|
# input_/output_exprs=[]: we don't have anything to evaluate
|
|
65
66
|
super().__init__(input.row_builder, [], [], input)
|
|
66
67
|
self.retain_input_order = retain_input_order
|
|
@@ -241,6 +242,7 @@ class CachePrefetchNode(ExecNode):
|
|
|
241
242
|
_logger.debug(f'Downloading {url} to {tmp_path}')
|
|
242
243
|
if parsed.scheme == 's3':
|
|
243
244
|
from pixeltable.utils.s3 import get_client
|
|
245
|
+
|
|
244
246
|
with self.boto_client_lock:
|
|
245
247
|
if self.boto_client is None:
|
|
246
248
|
config = {
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import inspect
|
|
2
|
-
from typing import Iterator, Optional
|
|
2
|
+
from typing import AsyncIterator, Iterator, Optional
|
|
3
3
|
|
|
4
4
|
import pixeltable.catalog as catalog
|
|
5
5
|
import pixeltable.exceptions as excs
|
|
@@ -14,6 +14,7 @@ class ComponentIterationNode(ExecNode):
|
|
|
14
14
|
|
|
15
15
|
Returns row batches of OUTPUT_BATCH_SIZE size.
|
|
16
16
|
"""
|
|
17
|
+
|
|
17
18
|
__OUTPUT_BATCH_SIZE = 1024
|
|
18
19
|
|
|
19
20
|
def __init__(self, view: catalog.TableVersion, input: ExecNode):
|
|
@@ -25,8 +26,8 @@ class ComponentIterationNode(ExecNode):
|
|
|
25
26
|
self.iterator_args = iterator_args[0]
|
|
26
27
|
assert isinstance(self.iterator_args, exprs.InlineDict)
|
|
27
28
|
self.iterator_args_ctx = self.row_builder.create_eval_ctx([self.iterator_args])
|
|
28
|
-
self.iterator_output_schema, self.unstored_column_names = (
|
|
29
|
-
|
|
29
|
+
self.iterator_output_schema, self.unstored_column_names = self.view.iterator_cls.output_schema(
|
|
30
|
+
**self.iterator_args.to_kwargs()
|
|
30
31
|
)
|
|
31
32
|
self.iterator_output_fields = list(self.iterator_output_schema.keys())
|
|
32
33
|
self.iterator_output_cols = {
|
|
@@ -34,7 +35,8 @@ class ComponentIterationNode(ExecNode):
|
|
|
34
35
|
}
|
|
35
36
|
# referenced iterator output fields
|
|
36
37
|
self.refd_output_slot_idxs = {
|
|
37
|
-
e.col.name: e.slot_idx
|
|
38
|
+
e.col.name: e.slot_idx
|
|
39
|
+
for e in self.row_builder.unique_exprs
|
|
38
40
|
if isinstance(e, exprs.ColumnRef) and e.col.name in self.iterator_output_fields
|
|
39
41
|
}
|
|
40
42
|
|
|
@@ -79,8 +81,7 @@ class ComponentIterationNode(ExecNode):
|
|
|
79
81
|
# verify and copy component_dict fields to their respective slots in output_row
|
|
80
82
|
for field_name, field_val in component_dict.items():
|
|
81
83
|
if field_name not in self.iterator_output_fields:
|
|
82
|
-
raise excs.Error(
|
|
83
|
-
f'Invalid field name {field_name} in output of {self.view.iterator_cls.__name__}')
|
|
84
|
+
raise excs.Error(f'Invalid field name {field_name} in output of {self.view.iterator_cls.__name__}')
|
|
84
85
|
if field_name not in self.refd_output_slot_idxs:
|
|
85
86
|
# we can ignore this
|
|
86
87
|
continue
|
|
@@ -90,5 +91,5 @@ class ComponentIterationNode(ExecNode):
|
|
|
90
91
|
if len(component_dict) != len(self.iterator_output_fields):
|
|
91
92
|
missing_fields = set(self.refd_output_slot_idxs.keys()) - set(component_dict.keys())
|
|
92
93
|
raise excs.Error(
|
|
93
|
-
f'Invalid output of {self.view.iterator_cls.__name__}: '
|
|
94
|
-
|
|
94
|
+
f'Invalid output of {self.view.iterator_cls.__name__}: missing fields {", ".join(missing_fields)}'
|
|
95
|
+
)
|
|
@@ -1,19 +1,21 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
|
-
|
|
2
|
+
|
|
3
3
|
import logging
|
|
4
|
+
from typing import Iterator, Optional
|
|
4
5
|
|
|
5
|
-
import pixeltable.exprs as exprs
|
|
6
6
|
import pixeltable.catalog as catalog
|
|
7
|
+
import pixeltable.exprs as exprs
|
|
7
8
|
from pixeltable.utils.media_store import MediaStore
|
|
8
9
|
|
|
9
|
-
|
|
10
10
|
_logger = logging.getLogger('pixeltable')
|
|
11
11
|
|
|
12
|
+
|
|
12
13
|
class DataRowBatch:
|
|
13
14
|
"""Set of DataRows, indexed by rowid.
|
|
14
15
|
|
|
15
16
|
Contains the metadata needed to initialize DataRows.
|
|
16
17
|
"""
|
|
18
|
+
|
|
17
19
|
tbl: Optional[catalog.TableVersion]
|
|
18
20
|
row_builder: exprs.RowBuilder
|
|
19
21
|
img_slot_idxs: list[int]
|
|
@@ -22,8 +24,11 @@ class DataRowBatch:
|
|
|
22
24
|
rows: list[exprs.DataRow]
|
|
23
25
|
|
|
24
26
|
def __init__(
|
|
25
|
-
self,
|
|
26
|
-
|
|
27
|
+
self,
|
|
28
|
+
tbl: Optional[catalog.TableVersion],
|
|
29
|
+
row_builder: exprs.RowBuilder,
|
|
30
|
+
num_rows: Optional[int] = None,
|
|
31
|
+
rows: Optional[list[exprs.DataRow]] = None,
|
|
27
32
|
):
|
|
28
33
|
"""
|
|
29
34
|
Requires either num_rows or rows to be specified, but not both.
|
|
@@ -34,7 +39,8 @@ class DataRowBatch:
|
|
|
34
39
|
self.img_slot_idxs = [e.slot_idx for e in row_builder.unique_exprs if e.col_type.is_image_type()]
|
|
35
40
|
# non-image media slots
|
|
36
41
|
self.media_slot_idxs = [
|
|
37
|
-
e.slot_idx
|
|
42
|
+
e.slot_idx
|
|
43
|
+
for e in row_builder.unique_exprs
|
|
38
44
|
if e.col_type.is_media_type() and not e.col_type.is_image_type()
|
|
39
45
|
]
|
|
40
46
|
self.array_slot_idxs = [e.slot_idx for e in row_builder.unique_exprs if e.col_type.is_array_type()]
|
|
@@ -44,14 +50,17 @@ class DataRowBatch:
|
|
|
44
50
|
if num_rows is None:
|
|
45
51
|
num_rows = 0
|
|
46
52
|
self.rows = [
|
|
47
|
-
exprs.DataRow(
|
|
53
|
+
exprs.DataRow(
|
|
54
|
+
row_builder.num_materialized, self.img_slot_idxs, self.media_slot_idxs, self.array_slot_idxs
|
|
55
|
+
)
|
|
48
56
|
for _ in range(num_rows)
|
|
49
57
|
]
|
|
50
58
|
|
|
51
59
|
def add_row(self, row: Optional[exprs.DataRow] = None) -> exprs.DataRow:
|
|
52
60
|
if row is None:
|
|
53
61
|
row = exprs.DataRow(
|
|
54
|
-
self.row_builder.num_materialized, self.img_slot_idxs, self.media_slot_idxs, self.array_slot_idxs
|
|
62
|
+
self.row_builder.num_materialized, self.img_slot_idxs, self.media_slot_idxs, self.array_slot_idxs
|
|
63
|
+
)
|
|
55
64
|
self.rows.append(row)
|
|
56
65
|
return row
|
|
57
66
|
|
|
@@ -65,8 +74,10 @@ class DataRowBatch:
|
|
|
65
74
|
return self.rows[index]
|
|
66
75
|
|
|
67
76
|
def flush_imgs(
|
|
68
|
-
|
|
69
|
-
|
|
77
|
+
self,
|
|
78
|
+
idx_range: Optional[slice] = None,
|
|
79
|
+
stored_img_info: Optional[list[exprs.ColumnSlotIdx]] = None,
|
|
80
|
+
flushed_slot_idxs: Optional[list[int]] = None,
|
|
70
81
|
) -> None:
|
|
71
82
|
"""Flushes images in the given range of rows."""
|
|
72
83
|
assert self.tbl is not None
|
pixeltable/exec/exec_context.py
CHANGED
|
@@ -4,12 +4,19 @@ import sqlalchemy as sql
|
|
|
4
4
|
|
|
5
5
|
import pixeltable.exprs as exprs
|
|
6
6
|
|
|
7
|
+
|
|
7
8
|
class ExecContext:
|
|
8
9
|
"""Class for execution runtime constants"""
|
|
10
|
+
|
|
9
11
|
def __init__(
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
12
|
+
self,
|
|
13
|
+
row_builder: exprs.RowBuilder,
|
|
14
|
+
*,
|
|
15
|
+
show_pbar: bool = False,
|
|
16
|
+
batch_size: int = 0,
|
|
17
|
+
pk_clause: Optional[list[sql.ClauseElement]] = None,
|
|
18
|
+
num_computed_exprs: int = 0,
|
|
19
|
+
ignore_errors: bool = False,
|
|
13
20
|
):
|
|
14
21
|
self.show_pbar = show_pbar
|
|
15
22
|
self.batch_size = batch_size
|
pixeltable/exec/exec_node.py
CHANGED
|
@@ -4,16 +4,19 @@ import abc
|
|
|
4
4
|
import asyncio
|
|
5
5
|
import logging
|
|
6
6
|
import sys
|
|
7
|
-
from typing import Iterable, Iterator, Optional, TypeVar
|
|
7
|
+
from typing import AsyncIterator, Iterable, Iterator, Optional, TypeVar
|
|
8
8
|
|
|
9
9
|
import pixeltable.exprs as exprs
|
|
10
|
+
|
|
10
11
|
from .data_row_batch import DataRowBatch
|
|
11
12
|
from .exec_context import ExecContext
|
|
12
13
|
|
|
13
14
|
_logger = logging.getLogger('pixeltable')
|
|
14
15
|
|
|
16
|
+
|
|
15
17
|
class ExecNode(abc.ABC):
|
|
16
18
|
"""Base class of all execution nodes"""
|
|
19
|
+
|
|
17
20
|
output_exprs: Iterable[exprs.Expr]
|
|
18
21
|
row_builder: exprs.RowBuilder
|
|
19
22
|
input: Optional[ExecNode]
|
|
@@ -22,8 +25,12 @@ class ExecNode(abc.ABC):
|
|
|
22
25
|
ctx: Optional[ExecContext]
|
|
23
26
|
|
|
24
27
|
def __init__(
|
|
25
|
-
|
|
26
|
-
|
|
28
|
+
self,
|
|
29
|
+
row_builder: exprs.RowBuilder,
|
|
30
|
+
output_exprs: Iterable[exprs.Expr],
|
|
31
|
+
input_exprs: Iterable[exprs.Expr],
|
|
32
|
+
input: Optional[ExecNode] = None,
|
|
33
|
+
):
|
|
27
34
|
self.output_exprs = output_exprs
|
|
28
35
|
self.row_builder = row_builder
|
|
29
36
|
self.input = input
|
|
@@ -31,8 +38,7 @@ class ExecNode(abc.ABC):
|
|
|
31
38
|
output_slot_idxs = {e.slot_idx for e in output_exprs}
|
|
32
39
|
output_dependencies = row_builder.get_dependencies(output_exprs, exclude=input_exprs)
|
|
33
40
|
self.flushed_img_slots = [
|
|
34
|
-
e.slot_idx for e in output_dependencies
|
|
35
|
-
if e.col_type.is_image_type() and e.slot_idx not in output_slot_idxs
|
|
41
|
+
e.slot_idx for e in output_dependencies if e.col_type.is_image_type() and e.slot_idx not in output_slot_idxs
|
|
36
42
|
]
|
|
37
43
|
self.stored_img_cols = []
|
|
38
44
|
self.ctx = None # all nodes of a tree share the same context
|
|
@@ -53,16 +59,20 @@ class ExecNode(abc.ABC):
|
|
|
53
59
|
pass
|
|
54
60
|
|
|
55
61
|
def __iter__(self) -> Iterator[DataRowBatch]:
|
|
62
|
+
running_loop: Optional[asyncio.AbstractEventLoop] = None
|
|
63
|
+
loop: asyncio.AbstractEventLoop
|
|
56
64
|
try:
|
|
57
|
-
# check if we are already in an event loop (eg, Jupyter's); if so, patch it to allow
|
|
58
|
-
|
|
65
|
+
# check if we are already in an event loop (eg, Jupyter's); if so, patch it to allow
|
|
66
|
+
# multiple run_until_complete()
|
|
67
|
+
running_loop = asyncio.get_running_loop()
|
|
59
68
|
import nest_asyncio # type: ignore
|
|
69
|
+
|
|
60
70
|
nest_asyncio.apply()
|
|
71
|
+
loop = running_loop
|
|
72
|
+
_logger.debug(f'Patched running loop')
|
|
61
73
|
except RuntimeError:
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
loop = asyncio.new_event_loop()
|
|
65
|
-
asyncio.set_event_loop(loop)
|
|
74
|
+
loop = asyncio.new_event_loop()
|
|
75
|
+
asyncio.set_event_loop(loop)
|
|
66
76
|
|
|
67
77
|
if 'pytest' in sys.modules:
|
|
68
78
|
loop.set_debug(True)
|
|
@@ -75,7 +85,8 @@ class ExecNode(abc.ABC):
|
|
|
75
85
|
except StopAsyncIteration:
|
|
76
86
|
pass
|
|
77
87
|
finally:
|
|
78
|
-
loop
|
|
88
|
+
if loop != running_loop:
|
|
89
|
+
loop.close()
|
|
79
90
|
|
|
80
91
|
def open(self) -> None:
|
|
81
92
|
"""Bottom-up initialization of nodes for execution. Must be called before __next__."""
|
|
@@ -5,10 +5,10 @@ import datetime
|
|
|
5
5
|
import itertools
|
|
6
6
|
import logging
|
|
7
7
|
import sys
|
|
8
|
-
from typing import
|
|
8
|
+
from typing import Any, Callable, Iterator, Optional, cast
|
|
9
|
+
|
|
10
|
+
from pixeltable import exprs, func
|
|
9
11
|
|
|
10
|
-
from pixeltable import exprs
|
|
11
|
-
from pixeltable import func
|
|
12
12
|
from .globals import Dispatcher, Evaluator, FnCallArgs
|
|
13
13
|
|
|
14
14
|
_logger = logging.getLogger('pixeltable')
|
|
@@ -23,6 +23,7 @@ class DefaultExprEvaluator(Evaluator):
|
|
|
23
23
|
TODO:
|
|
24
24
|
- parallelize via Ray
|
|
25
25
|
"""
|
|
26
|
+
|
|
26
27
|
e: exprs.Expr
|
|
27
28
|
|
|
28
29
|
def __init__(self, e: exprs.Expr, dispatcher: Dispatcher):
|
|
@@ -32,8 +33,7 @@ class DefaultExprEvaluator(Evaluator):
|
|
|
32
33
|
def schedule(self, rows: list[exprs.DataRow], slot_idx: int) -> None:
|
|
33
34
|
assert self.e.slot_idx >= 0
|
|
34
35
|
task = asyncio.create_task(self.eval(rows))
|
|
35
|
-
self.dispatcher.
|
|
36
|
-
task.add_done_callback(self.dispatcher.done_cb)
|
|
36
|
+
self.dispatcher.register_task(task)
|
|
37
37
|
|
|
38
38
|
async def eval(self, rows: list[exprs.DataRow]) -> None:
|
|
39
39
|
rows_with_excs: set[int] = set() # records idxs into rows
|
|
@@ -61,6 +61,7 @@ class FnCallEvaluator(Evaluator):
|
|
|
61
61
|
TODO:
|
|
62
62
|
- adaptive batching: finding the optimal batch size based on observed execution times
|
|
63
63
|
"""
|
|
64
|
+
|
|
64
65
|
fn_call: exprs.FunctionCall
|
|
65
66
|
fn: func.CallableFunction
|
|
66
67
|
scalar_py_fn: Optional[Callable] # only set for non-batching CallableFunctions
|
|
@@ -74,7 +75,7 @@ class FnCallEvaluator(Evaluator):
|
|
|
74
75
|
self.fn_call = fn_call
|
|
75
76
|
self.fn = cast(func.CallableFunction, fn_call.fn)
|
|
76
77
|
if isinstance(self.fn, func.CallableFunction) and self.fn.is_batched:
|
|
77
|
-
self.call_args_queue =
|
|
78
|
+
self.call_args_queue = asyncio.Queue[FnCallArgs]()
|
|
78
79
|
# we're not supplying sample arguments there, they're ignored anyway
|
|
79
80
|
self.batch_size = self.fn.get_batch_size()
|
|
80
81
|
self.scalar_py_fn = None
|
|
@@ -134,8 +135,7 @@ class FnCallEvaluator(Evaluator):
|
|
|
134
135
|
scheduler.submit(batched_call_args)
|
|
135
136
|
else:
|
|
136
137
|
task = asyncio.create_task(self.eval_batch(batched_call_args))
|
|
137
|
-
self.dispatcher.
|
|
138
|
-
task.add_done_callback(self.dispatcher.done_cb)
|
|
138
|
+
self.dispatcher.register_task(task)
|
|
139
139
|
|
|
140
140
|
elif self.fn.is_async:
|
|
141
141
|
if self.fn_call.resource_pool is not None:
|
|
@@ -147,14 +147,12 @@ class FnCallEvaluator(Evaluator):
|
|
|
147
147
|
# create one task per call
|
|
148
148
|
for item in rows_call_args:
|
|
149
149
|
task = asyncio.create_task(self.eval_async(item))
|
|
150
|
-
self.dispatcher.
|
|
151
|
-
task.add_done_callback(self.dispatcher.done_cb)
|
|
150
|
+
self.dispatcher.register_task(task)
|
|
152
151
|
|
|
153
152
|
else:
|
|
154
153
|
# create a single task for all rows
|
|
155
154
|
task = asyncio.create_task(self.eval(rows_call_args))
|
|
156
|
-
self.dispatcher.
|
|
157
|
-
task.add_done_callback(self.dispatcher.done_cb)
|
|
155
|
+
self.dispatcher.register_task(task)
|
|
158
156
|
|
|
159
157
|
def _queued_call_args_iter(self) -> Iterator[FnCallArgs]:
|
|
160
158
|
while not self.call_args_queue.empty():
|
|
@@ -171,14 +169,16 @@ class FnCallEvaluator(Evaluator):
|
|
|
171
169
|
for k in item.kwargs.keys():
|
|
172
170
|
batch_kwargs[k][i] = item.kwargs[k]
|
|
173
171
|
return FnCallArgs(
|
|
174
|
-
self.fn_call, [item.row for item in call_args], batch_args=batch_args, batch_kwargs=batch_kwargs
|
|
172
|
+
self.fn_call, [item.row for item in call_args], batch_args=batch_args, batch_kwargs=batch_kwargs
|
|
173
|
+
)
|
|
175
174
|
|
|
176
175
|
async def eval_batch(self, batched_call_args: FnCallArgs) -> None:
|
|
177
176
|
result_batch: list[Any]
|
|
178
177
|
try:
|
|
179
178
|
if self.fn.is_async:
|
|
180
179
|
result_batch = await self.fn.aexec_batch(
|
|
181
|
-
*batched_call_args.batch_args, **batched_call_args.batch_kwargs
|
|
180
|
+
*batched_call_args.batch_args, **batched_call_args.batch_kwargs
|
|
181
|
+
)
|
|
182
182
|
else:
|
|
183
183
|
# check for cancellation before starting something potentially long-running
|
|
184
184
|
if asyncio.current_task().cancelled() or self.dispatcher.exc_event.is_set():
|
|
@@ -209,6 +209,7 @@ class FnCallEvaluator(Evaluator):
|
|
|
209
209
|
self.dispatcher.dispatch([call_args.row])
|
|
210
210
|
except Exception as exc:
|
|
211
211
|
import anthropic
|
|
212
|
+
|
|
212
213
|
if isinstance(exc, anthropic.RateLimitError):
|
|
213
214
|
_logger.debug(f'RateLimitError: {exc}')
|
|
214
215
|
_, _, exc_tb = sys.exc_info()
|
|
@@ -232,7 +233,8 @@ class FnCallEvaluator(Evaluator):
|
|
|
232
233
|
rows_with_excs.add(idx)
|
|
233
234
|
self.dispatcher.dispatch_exc(item.rows, self.fn_call.slot_idx, exc_tb)
|
|
234
235
|
self.dispatcher.dispatch(
|
|
235
|
-
[call_args_batch[i].row for i in range(len(call_args_batch)) if i not in rows_with_excs]
|
|
236
|
+
[call_args_batch[i].row for i in range(len(call_args_batch)) if i not in rows_with_excs]
|
|
237
|
+
)
|
|
236
238
|
|
|
237
239
|
def _close(self) -> None:
|
|
238
240
|
"""Create a task for the incomplete batch of queued FnCallArgs, if any"""
|
|
@@ -241,5 +243,4 @@ class FnCallEvaluator(Evaluator):
|
|
|
241
243
|
return
|
|
242
244
|
batched_call_args = self._create_batch_call_args(list(self._queued_call_args_iter()))
|
|
243
245
|
task = asyncio.create_task(self.eval_batch(batched_call_args))
|
|
244
|
-
self.dispatcher.
|
|
245
|
-
task.add_done_callback(self.dispatcher.done_cb)
|
|
246
|
+
self.dispatcher.register_task(task)
|