pixeltable 0.3.1__py3-none-any.whl → 0.3.3__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.

This version of pixeltable might be problematic.

Files changed (147)
  1. pixeltable/__init__.py +64 -11
  2. pixeltable/__version__.py +2 -2
  3. pixeltable/catalog/__init__.py +1 -1
  4. pixeltable/catalog/catalog.py +50 -27
  5. pixeltable/catalog/column.py +27 -11
  6. pixeltable/catalog/dir.py +6 -4
  7. pixeltable/catalog/globals.py +8 -1
  8. pixeltable/catalog/insertable_table.py +25 -15
  9. pixeltable/catalog/named_function.py +10 -6
  10. pixeltable/catalog/path.py +3 -2
  11. pixeltable/catalog/path_dict.py +8 -6
  12. pixeltable/catalog/schema_object.py +2 -1
  13. pixeltable/catalog/table.py +123 -103
  14. pixeltable/catalog/table_version.py +292 -143
  15. pixeltable/catalog/table_version_path.py +8 -5
  16. pixeltable/catalog/view.py +68 -27
  17. pixeltable/dataframe.py +102 -72
  18. pixeltable/env.py +39 -23
  19. pixeltable/exec/__init__.py +2 -2
  20. pixeltable/exec/aggregation_node.py +10 -4
  21. pixeltable/exec/cache_prefetch_node.py +5 -3
  22. pixeltable/exec/component_iteration_node.py +9 -8
  23. pixeltable/exec/data_row_batch.py +21 -10
  24. pixeltable/exec/exec_context.py +10 -3
  25. pixeltable/exec/exec_node.py +23 -12
  26. pixeltable/exec/expr_eval/evaluators.py +18 -17
  27. pixeltable/exec/expr_eval/expr_eval_node.py +29 -16
  28. pixeltable/exec/expr_eval/globals.py +33 -11
  29. pixeltable/exec/expr_eval/row_buffer.py +5 -6
  30. pixeltable/exec/expr_eval/schedulers.py +170 -42
  31. pixeltable/exec/in_memory_data_node.py +8 -7
  32. pixeltable/exec/row_update_node.py +15 -5
  33. pixeltable/exec/sql_node.py +56 -27
  34. pixeltable/exprs/__init__.py +2 -2
  35. pixeltable/exprs/arithmetic_expr.py +57 -26
  36. pixeltable/exprs/array_slice.py +1 -1
  37. pixeltable/exprs/column_property_ref.py +2 -1
  38. pixeltable/exprs/column_ref.py +20 -15
  39. pixeltable/exprs/comparison.py +6 -2
  40. pixeltable/exprs/compound_predicate.py +1 -3
  41. pixeltable/exprs/data_row.py +2 -2
  42. pixeltable/exprs/expr.py +101 -72
  43. pixeltable/exprs/expr_dict.py +2 -1
  44. pixeltable/exprs/expr_set.py +3 -1
  45. pixeltable/exprs/function_call.py +39 -41
  46. pixeltable/exprs/globals.py +1 -0
  47. pixeltable/exprs/in_predicate.py +2 -2
  48. pixeltable/exprs/inline_expr.py +20 -17
  49. pixeltable/exprs/json_mapper.py +4 -2
  50. pixeltable/exprs/json_path.py +12 -18
  51. pixeltable/exprs/literal.py +5 -9
  52. pixeltable/exprs/method_ref.py +1 -0
  53. pixeltable/exprs/object_ref.py +1 -1
  54. pixeltable/exprs/row_builder.py +31 -16
  55. pixeltable/exprs/rowid_ref.py +14 -5
  56. pixeltable/exprs/similarity_expr.py +11 -6
  57. pixeltable/exprs/sql_element_cache.py +1 -1
  58. pixeltable/exprs/type_cast.py +24 -9
  59. pixeltable/ext/__init__.py +1 -0
  60. pixeltable/ext/functions/__init__.py +1 -0
  61. pixeltable/ext/functions/whisperx.py +2 -2
  62. pixeltable/ext/functions/yolox.py +11 -11
  63. pixeltable/func/aggregate_function.py +17 -13
  64. pixeltable/func/callable_function.py +6 -6
  65. pixeltable/func/expr_template_function.py +15 -14
  66. pixeltable/func/function.py +16 -16
  67. pixeltable/func/function_registry.py +11 -8
  68. pixeltable/func/globals.py +4 -2
  69. pixeltable/func/query_template_function.py +12 -13
  70. pixeltable/func/signature.py +18 -9
  71. pixeltable/func/tools.py +10 -17
  72. pixeltable/func/udf.py +106 -11
  73. pixeltable/functions/__init__.py +21 -2
  74. pixeltable/functions/anthropic.py +21 -15
  75. pixeltable/functions/fireworks.py +63 -5
  76. pixeltable/functions/gemini.py +13 -3
  77. pixeltable/functions/globals.py +18 -6
  78. pixeltable/functions/huggingface.py +20 -38
  79. pixeltable/functions/image.py +7 -3
  80. pixeltable/functions/json.py +1 -0
  81. pixeltable/functions/llama_cpp.py +1 -4
  82. pixeltable/functions/mistralai.py +31 -20
  83. pixeltable/functions/ollama.py +4 -18
  84. pixeltable/functions/openai.py +214 -109
  85. pixeltable/functions/replicate.py +11 -10
  86. pixeltable/functions/string.py +70 -7
  87. pixeltable/functions/timestamp.py +21 -8
  88. pixeltable/functions/together.py +66 -52
  89. pixeltable/functions/video.py +1 -0
  90. pixeltable/functions/vision.py +14 -11
  91. pixeltable/functions/whisper.py +2 -1
  92. pixeltable/globals.py +61 -28
  93. pixeltable/index/__init__.py +1 -1
  94. pixeltable/index/btree.py +5 -3
  95. pixeltable/index/embedding_index.py +15 -14
  96. pixeltable/io/__init__.py +1 -1
  97. pixeltable/io/external_store.py +30 -25
  98. pixeltable/io/fiftyone.py +6 -14
  99. pixeltable/io/globals.py +33 -27
  100. pixeltable/io/hf_datasets.py +3 -2
  101. pixeltable/io/label_studio.py +80 -71
  102. pixeltable/io/pandas.py +33 -9
  103. pixeltable/io/parquet.py +10 -13
  104. pixeltable/iterators/__init__.py +1 -0
  105. pixeltable/iterators/audio.py +205 -0
  106. pixeltable/iterators/document.py +19 -8
  107. pixeltable/iterators/image.py +6 -24
  108. pixeltable/iterators/string.py +3 -6
  109. pixeltable/iterators/video.py +1 -7
  110. pixeltable/metadata/__init__.py +9 -2
  111. pixeltable/metadata/converters/convert_10.py +2 -2
  112. pixeltable/metadata/converters/convert_15.py +1 -5
  113. pixeltable/metadata/converters/convert_16.py +2 -4
  114. pixeltable/metadata/converters/convert_17.py +2 -4
  115. pixeltable/metadata/converters/convert_18.py +2 -4
  116. pixeltable/metadata/converters/convert_19.py +2 -5
  117. pixeltable/metadata/converters/convert_20.py +1 -4
  118. pixeltable/metadata/converters/convert_21.py +4 -6
  119. pixeltable/metadata/converters/convert_22.py +1 -0
  120. pixeltable/metadata/converters/convert_23.py +5 -5
  121. pixeltable/metadata/converters/convert_24.py +12 -13
  122. pixeltable/metadata/converters/convert_26.py +23 -0
  123. pixeltable/metadata/converters/util.py +3 -4
  124. pixeltable/metadata/notes.py +1 -0
  125. pixeltable/metadata/schema.py +13 -2
  126. pixeltable/plan.py +173 -98
  127. pixeltable/store.py +42 -26
  128. pixeltable/type_system.py +130 -85
  129. pixeltable/utils/arrow.py +1 -7
  130. pixeltable/utils/coco.py +16 -17
  131. pixeltable/utils/code.py +1 -1
  132. pixeltable/utils/console_output.py +44 -0
  133. pixeltable/utils/description_helper.py +7 -7
  134. pixeltable/utils/documents.py +3 -1
  135. pixeltable/utils/filecache.py +13 -8
  136. pixeltable/utils/http_server.py +9 -8
  137. pixeltable/utils/media_store.py +2 -1
  138. pixeltable/utils/pytorch.py +11 -14
  139. pixeltable/utils/s3.py +1 -0
  140. pixeltable/utils/sql.py +1 -0
  141. pixeltable/utils/transactional_directory.py +2 -2
  142. {pixeltable-0.3.1.dist-info → pixeltable-0.3.3.dist-info}/METADATA +7 -8
  143. pixeltable-0.3.3.dist-info/RECORD +163 -0
  144. pixeltable-0.3.1.dist-info/RECORD +0 -160
  145. {pixeltable-0.3.1.dist-info → pixeltable-0.3.3.dist-info}/LICENSE +0 -0
  146. {pixeltable-0.3.1.dist-info → pixeltable-0.3.3.dist-info}/WHEEL +0 -0
  147. {pixeltable-0.3.1.dist-info → pixeltable-0.3.3.dist-info}/entry_points.txt +0 -0
pixeltable/env.py CHANGED
@@ -1,6 +1,5 @@
  from __future__ import annotations

- from abc import abstractmethod
  import datetime
  import glob
  import http.server
@@ -16,9 +15,11 @@ import sys
  import threading
  import uuid
  import warnings
+ from abc import abstractmethod
  from dataclasses import dataclass, field
  from pathlib import Path
- from typing import TYPE_CHECKING, Any, Callable, Optional, TypeVar, Type
+ from sys import stdout
+ from typing import TYPE_CHECKING, Any, Callable, Optional, TypeVar
  from zoneinfo import ZoneInfo, ZoneInfoNotFoundError

  import pixeltable_pgserver
@@ -28,6 +29,7 @@ from tqdm import TqdmWarning

  import pixeltable.exceptions as excs
  from pixeltable import metadata
+ from pixeltable.utils.console_output import ConsoleLogger, ConsoleMessageFilter, ConsoleOutputHandler, map_level
  from pixeltable.utils.http_server import make_server

  if TYPE_CHECKING:
@@ -67,6 +69,7 @@ class Env:
  _httpd: Optional[http.server.HTTPServer]
  _http_address: Optional[str]
  _logger: logging.Logger
+ _console_logger: ConsoleLogger
  _default_log_level: int
  _logfilename: Optional[str]
  _log_to_stdout: bool
@@ -92,6 +95,8 @@ class Env:
  cls._instance = env

  def __init__(self):
+ assert self._instance is None, 'Env is a singleton; use Env.get() to access the instance'
+
  self._home = None
  self._media_dir = None # computed media files
  self._file_cache_dir = None # cached media files with external URL
@@ -231,6 +236,10 @@ class Env:
  else:
  return False

+ @property
+ def console_logger(self) -> ConsoleLogger:
+ return self._console_logger
+
  def _set_up(self, echo: bool = False, reinit_db: bool = False) -> None:
  if self._initialized:
  return
@@ -288,6 +297,14 @@ class Env:
  warnings.simplefilter('ignore', category=UserWarning)
  warnings.simplefilter('ignore', category=FutureWarning)

+ # Set verbose level for user visible console messages
+ verbosity = map_level(self._config.get_int_value('verbosity'))
+ stdout_handler = ConsoleOutputHandler(stream=stdout)
+ stdout_handler.setLevel(verbosity)
+ stdout_handler.addFilter(ConsoleMessageFilter())
+ self._logger.addHandler(stdout_handler)
+ self._console_logger = ConsoleLogger(self._logger)
+
  # configure _logger to log to a file
  self._logfilename = datetime.datetime.now().strftime('%Y%m%d_%H%M%S') + '.log'
  fh = logging.FileHandler(self._log_dir / self._logfilename, mode='w')
@@ -358,10 +375,11 @@ class Env:

  if create_db:
  from pixeltable.metadata import schema
+
  schema.base_metadata.create_all(self._sa_engine)
  metadata.create_system_info(self._sa_engine)

- print(f'Connected to Pixeltable database at: {self.db_url}')
+ self.console_logger.info(f'Connected to Pixeltable database at: {self.db_url}')

  # we now have a home directory and db; start other services
  self._set_up_runtime()
@@ -370,11 +388,7 @@ class Env:
  def _create_engine(self, time_zone_name: Optional[str], echo: bool = False) -> None:
  connect_args = {} if time_zone_name is None else {'options': f'-c timezone={time_zone_name}'}
  self._sa_engine = sql.create_engine(
- self.db_url,
- echo=echo,
- future=True,
- isolation_level='REPEATABLE READ',
- connect_args=connect_args,
+ self.db_url, echo=echo, future=True, isolation_level='REPEATABLE READ', connect_args=connect_args
  )
  self._logger.info(f'Created SQLAlchemy engine at: {self.db_url}')
  with self.engine.begin() as conn:
@@ -407,7 +421,7 @@ class Env:
  with engine.begin() as conn:
  # use C collation to get standard C/Python-style sorting
  stmt = (
- f"CREATE DATABASE {preparer.quote(self._db_name)} "
+ f'CREATE DATABASE {preparer.quote(self._db_name)} '
  "ENCODING 'utf-8' LC_COLLATE 'C' LC_CTYPE 'C' TEMPLATE template0"
  )
  conn.execute(sql.text(stmt))
@@ -431,12 +445,12 @@ class Env:
  try:
  with engine.begin() as conn:
  # terminate active connections
- stmt = (f"""
+ stmt = f"""
  SELECT pg_terminate_backend(pg_stat_activity.pid)
  FROM pg_stat_activity
  WHERE pg_stat_activity.datname = '{self._db_name}'
  AND pid <> pg_backend_pid()
- """)
+ """
  conn.execute(sql.text(stmt))
  # drop db
  stmt = f'DROP DATABASE {preparer.quote(self._db_name)}'
@@ -546,7 +560,7 @@ class Env:
  is_installed = False
  self.__optional_packages[package_name] = PackageInfo(
  is_installed=is_installed,
- library_name=library_name or package_name # defaults to package_name unless specified otherwise
+ library_name=library_name or package_name, # defaults to package_name unless specified otherwise
  )

  def require_package(self, package_name: str, min_version: Optional[list[int]] = None) -> None:
@@ -592,6 +606,7 @@ class Env:
  """
  import spacy
  from spacy.cli.download import get_model_filename
+
  spacy_model = 'en_core_web_sm'
  spacy_model_version = '3.7.1'
  filename = get_model_filename(spacy_model, spacy_model_version, sdist=False)
@@ -609,7 +624,7 @@ class Env:
  self._logger.warn(f'Failed to load spaCy model: {spacy_model}', exc_info=exc)
  warnings.warn(
  f"Failed to load spaCy model '{spacy_model}'. spaCy features will not be available.",
- excs.PixeltableWarning
+ excs.PixeltableWarning,
  )
  self.__optional_packages['spacy'].is_installed = False
@@ -619,8 +634,7 @@ class Env:
  def create_tmp_path(self, extension: str = '') -> Path:
  return self._tmp_dir / f'{uuid.uuid4()}{extension}'

-
- #def get_resource_pool_info(self, pool_id: str, pool_info_cls: Optional[Type[T]]) -> T:
+ # def get_resource_pool_info(self, pool_id: str, pool_info_cls: Optional[Type[T]]) -> T:
  def get_resource_pool_info(self, pool_id: str, make_pool_info: Optional[Callable[[], T]] = None) -> T:
  """Returns the info object for the given id, creating it if necessary."""
  info = self._resource_pool_info.get(pool_id)
@@ -690,6 +704,7 @@ def register_client(name: str) -> Callable:
  Args:
  - name (str): The name of the API client (e.g., 'openai' or 'label-studio').
  """
+
  def decorator(fn: Callable) -> None:
  global _registered_clients
  sig = inspect.signature(fn)
@@ -704,6 +719,7 @@ class Config:
  The (global) Pixeltable configuration, as loaded from `config.toml`. Provides methods for retrieving
  configuration values, which can be set in the config file or as environment variables.
  """
+
  __config: dict[str, Any]

  @classmethod
@@ -733,12 +749,7 @@ class Config:
  free_disk_space_bytes = shutil.disk_usage(config_path.parent).free
  # Default cache size is 1/5 of free disk space
  file_cache_size_g = free_disk_space_bytes / 5 / (1 << 30)
- return {
- 'pixeltable': {
- 'file_cache_size_g': round(file_cache_size_g, 1),
- 'hide_warnings': False,
- }
- }
+ return {'pixeltable': {'file_cache_size_g': round(file_cache_size_g, 1), 'hide_warnings': False}}

  def __init__(self, config: dict[str, Any]) -> None:
  self.__config = config
@@ -823,7 +834,9 @@ class RateLimitsInfo:
  self.resource_limits = {k: RateLimitInfo(k, now, *v) for k, v in kwargs.items() if v is not None}
  # TODO: remove
  for info in self.resource_limits.values():
- _logger.debug(f'Init {info.resource} rate limit: rem={info.remaining} reset={info.reset_at.strftime(TIME_FORMAT)} delta={(info.reset_at - now).total_seconds()}')
+ _logger.debug(
+ f'Init {info.resource} rate limit: rem={info.remaining} reset={info.reset_at.strftime(TIME_FORMAT)} delta={(info.reset_at - now).total_seconds()}'
+ )
  else:
  for k, v in kwargs.items():
  if v is not None:
@@ -838,6 +851,7 @@ class RateLimitsInfo:
  @dataclass
  class RateLimitInfo:
  """Container for rate limit-related information for a single resource."""
+
  resource: str
  recorded_at: datetime.datetime
  limit: int
@@ -854,4 +868,6 @@ class RateLimitInfo:
  reset_delta = reset_at - self.reset_at
  self.reset_at = reset_at
  # TODO: remove
- _logger.debug(f'Update {self.resource} rate limit: rem={self.remaining} reset={self.reset_at.strftime(TIME_FORMAT)} reset_delta={reset_delta.total_seconds()} recorded_delta={(self.reset_at - recorded_at).total_seconds()}')
+ _logger.debug(
+ f'Update {self.resource} rate limit: rem={self.remaining} reset={self.reset_at.strftime(TIME_FORMAT)} reset_delta={reset_delta.total_seconds()} recorded_delta={(self.reset_at - recorded_at).total_seconds()}'
+ )
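
The console-output change above replaces the bare print() call with a filtered stdout handler plus a thin ConsoleLogger wrapper around the existing 'pixeltable' logger. Below is a minimal, standard-library-only sketch of that pattern; the names verbosity_to_level, UserMessageFilter, and the ConsoleLogger class shown here are illustrative stand-ins, not pixeltable's actual console_output implementation.

import logging
import sys
from typing import Optional

def verbosity_to_level(verbosity: Optional[int]) -> int:
    # illustrative mapping only; pixeltable's map_level() may differ
    levels = {0: logging.WARNING, 1: logging.INFO, 2: logging.DEBUG}
    return levels.get(1 if verbosity is None else verbosity, logging.INFO)

class UserMessageFilter(logging.Filter):
    # only let records explicitly marked as user-facing reach stdout
    def filter(self, record: logging.LogRecord) -> bool:
        return getattr(record, 'user_visible', False)

class ConsoleLogger:
    # thin wrapper that tags records so the stdout handler picks them up
    def __init__(self, logger: logging.Logger) -> None:
        self._logger = logger

    def info(self, msg: str) -> None:
        self._logger.info(msg, extra={'user_visible': True})

logger = logging.getLogger('pixeltable')
logger.setLevel(logging.DEBUG)
stdout_handler = logging.StreamHandler(stream=sys.stdout)
stdout_handler.setLevel(verbosity_to_level(1))
stdout_handler.addFilter(UserMessageFilter())
logger.addHandler(stdout_handler)

console_logger = ConsoleLogger(logger)
console_logger.info('Connected to Pixeltable database at: postgresql://...')  # shown on stdout
logger.debug('internal detail')  # dropped by the stdout handler's filter

In this sketch there is still a single 'pixeltable' logger: file or debug handlers continue to receive everything, while the stdout handler only passes messages routed through the console wrapper and at or above the configured verbosity.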

pixeltable/exec/__init__.py CHANGED
@@ -4,7 +4,7 @@ from .component_iteration_node import ComponentIterationNode
  from .data_row_batch import DataRowBatch
  from .exec_context import ExecContext
  from .exec_node import ExecNode
+ from .expr_eval import ExprEvalNode
  from .in_memory_data_node import InMemoryDataNode
  from .row_update_node import RowUpdateNode
- from .sql_node import SqlLookupNode, SqlScanNode, SqlAggregationNode, SqlNode, SqlJoinNode
- from .expr_eval import ExprEvalNode
+ from .sql_node import SqlAggregationNode, SqlJoinNode, SqlLookupNode, SqlNode, SqlScanNode

pixeltable/exec/aggregation_node.py CHANGED
@@ -2,7 +2,7 @@ from __future__ import annotations

  import logging
  import sys
- from typing import Any, Iterable, Iterator, Optional, cast, AsyncIterator
+ from typing import Any, AsyncIterator, Iterable, Iterator, Optional, cast

  import pixeltable.catalog as catalog
  import pixeltable.exceptions as excs
@@ -13,12 +13,14 @@ from .exec_node import ExecNode

  _logger = logging.getLogger('pixeltable')

+
  class AggregationNode(ExecNode):
  """
  In-memory aggregation for UDAs.

  At the moment, this returns all results in a single DataRowBatch.
  """
+
  group_by: Optional[list[exprs.Expr]]
  input_exprs: list[exprs.Expr]
  agg_fn_eval_ctx: exprs.RowBuilder.EvalCtx
@@ -26,8 +28,13 @@ class AggregationNode(ExecNode):
  output_batch: DataRowBatch

  def __init__(
- self, tbl: catalog.TableVersion, row_builder: exprs.RowBuilder, group_by: Optional[list[exprs.Expr]],
- agg_fn_calls: list[exprs.FunctionCall], input_exprs: Iterable[exprs.Expr], input: ExecNode
+ self,
+ tbl: catalog.TableVersion,
+ row_builder: exprs.RowBuilder,
+ group_by: Optional[list[exprs.Expr]],
+ agg_fn_calls: list[exprs.FunctionCall],
+ input_exprs: Iterable[exprs.Expr],
+ input: ExecNode,
  ):
  output_exprs: list[exprs.Expr] = [] if group_by is None else list(group_by)
  output_exprs.extend(agg_fn_calls)
@@ -86,4 +93,3 @@ class AggregationNode(ExecNode):
  self.output_batch.flush_imgs(None, self.stored_img_cols, self.flushed_img_slots)
  _logger.debug(f'AggregateNode: consumed {num_input_rows} rows, returning {len(self.output_batch.rows)} rows')
  yield self.output_batch
-

pixeltable/exec/cache_prefetch_node.py CHANGED
@@ -9,7 +9,7 @@ import urllib.request
  from collections import deque
  from concurrent import futures
  from pathlib import Path
- from typing import Optional, Any, Iterator, AsyncIterator
+ from typing import Any, AsyncIterator, Iterator, Optional
  from uuid import UUID

  import pixeltable.env as env
@@ -30,6 +30,7 @@ class CachePrefetchNode(ExecNode):
  TODO:
  - adapting the number of download threads at runtime to maximize throughput
  """
+
  BATCH_SIZE = 16
  NUM_EXECUTOR_THREADS = 16

@@ -59,8 +60,8 @@ class CachePrefetchNode(ExecNode):
  num_missing: int # number of missing URLs in this row

  def __init__(
- self, tbl_id: UUID, file_col_info: list[exprs.ColumnSlotIdx], input: ExecNode,
- retain_input_order: bool = True):
+ self, tbl_id: UUID, file_col_info: list[exprs.ColumnSlotIdx], input: ExecNode, retain_input_order: bool = True
+ ):
  # input_/output_exprs=[]: we don't have anything to evaluate
  super().__init__(input.row_builder, [], [], input)
  self.retain_input_order = retain_input_order
@@ -241,6 +242,7 @@ class CachePrefetchNode(ExecNode):
  _logger.debug(f'Downloading {url} to {tmp_path}')
  if parsed.scheme == 's3':
  from pixeltable.utils.s3 import get_client
+
  with self.boto_client_lock:
  if self.boto_client is None:
  config = {

pixeltable/exec/component_iteration_node.py CHANGED
@@ -1,5 +1,5 @@
  import inspect
- from typing import Iterator, Optional, AsyncIterator
+ from typing import AsyncIterator, Iterator, Optional

  import pixeltable.catalog as catalog
  import pixeltable.exceptions as excs
@@ -14,6 +14,7 @@ class ComponentIterationNode(ExecNode):

  Returns row batches of OUTPUT_BATCH_SIZE size.
  """
+
  __OUTPUT_BATCH_SIZE = 1024

  def __init__(self, view: catalog.TableVersion, input: ExecNode):
@@ -25,8 +26,8 @@ class ComponentIterationNode(ExecNode):
  self.iterator_args = iterator_args[0]
  assert isinstance(self.iterator_args, exprs.InlineDict)
  self.iterator_args_ctx = self.row_builder.create_eval_ctx([self.iterator_args])
- self.iterator_output_schema, self.unstored_column_names = (
- self.view.iterator_cls.output_schema(**self.iterator_args.to_kwargs())
+ self.iterator_output_schema, self.unstored_column_names = self.view.iterator_cls.output_schema(
+ **self.iterator_args.to_kwargs()
  )
  self.iterator_output_fields = list(self.iterator_output_schema.keys())
  self.iterator_output_cols = {
@@ -34,7 +35,8 @@ class ComponentIterationNode(ExecNode):
  }
  # referenced iterator output fields
  self.refd_output_slot_idxs = {
- e.col.name: e.slot_idx for e in self.row_builder.unique_exprs
+ e.col.name: e.slot_idx
+ for e in self.row_builder.unique_exprs
  if isinstance(e, exprs.ColumnRef) and e.col.name in self.iterator_output_fields
  }

@@ -79,8 +81,7 @@ class ComponentIterationNode(ExecNode):
  # verify and copy component_dict fields to their respective slots in output_row
  for field_name, field_val in component_dict.items():
  if field_name not in self.iterator_output_fields:
- raise excs.Error(
- f'Invalid field name {field_name} in output of {self.view.iterator_cls.__name__}')
+ raise excs.Error(f'Invalid field name {field_name} in output of {self.view.iterator_cls.__name__}')
  if field_name not in self.refd_output_slot_idxs:
  # we can ignore this
  continue
@@ -90,5 +91,5 @@ class ComponentIterationNode(ExecNode):
  if len(component_dict) != len(self.iterator_output_fields):
  missing_fields = set(self.refd_output_slot_idxs.keys()) - set(component_dict.keys())
  raise excs.Error(
- f'Invalid output of {self.view.iterator_cls.__name__}: '
- f'missing fields {", ".join(missing_fields)}')
+ f'Invalid output of {self.view.iterator_cls.__name__}: missing fields {", ".join(missing_fields)}'
+ )

pixeltable/exec/data_row_batch.py CHANGED
@@ -1,19 +1,21 @@
  from __future__ import annotations
- from typing import Iterator, Optional
+
  import logging
+ from typing import Iterator, Optional

- import pixeltable.exprs as exprs
  import pixeltable.catalog as catalog
+ import pixeltable.exprs as exprs
  from pixeltable.utils.media_store import MediaStore

-
  _logger = logging.getLogger('pixeltable')

+
  class DataRowBatch:
  """Set of DataRows, indexed by rowid.

  Contains the metadata needed to initialize DataRows.
  """
+
  tbl: Optional[catalog.TableVersion]
  row_builder: exprs.RowBuilder
  img_slot_idxs: list[int]
@@ -22,8 +24,11 @@ class DataRowBatch:
  rows: list[exprs.DataRow]

  def __init__(
- self, tbl: Optional[catalog.TableVersion], row_builder: exprs.RowBuilder, num_rows: Optional[int] = None,
- rows: Optional[list[exprs.DataRow]] = None
+ self,
+ tbl: Optional[catalog.TableVersion],
+ row_builder: exprs.RowBuilder,
+ num_rows: Optional[int] = None,
+ rows: Optional[list[exprs.DataRow]] = None,
  ):
  """
  Requires either num_rows or rows to be specified, but not both.
@@ -34,7 +39,8 @@ class DataRowBatch:
  self.img_slot_idxs = [e.slot_idx for e in row_builder.unique_exprs if e.col_type.is_image_type()]
  # non-image media slots
  self.media_slot_idxs = [
- e.slot_idx for e in row_builder.unique_exprs
+ e.slot_idx
+ for e in row_builder.unique_exprs
  if e.col_type.is_media_type() and not e.col_type.is_image_type()
  ]
  self.array_slot_idxs = [e.slot_idx for e in row_builder.unique_exprs if e.col_type.is_array_type()]
@@ -44,14 +50,17 @@ class DataRowBatch:
  if num_rows is None:
  num_rows = 0
  self.rows = [
- exprs.DataRow(row_builder.num_materialized, self.img_slot_idxs, self.media_slot_idxs, self.array_slot_idxs)
+ exprs.DataRow(
+ row_builder.num_materialized, self.img_slot_idxs, self.media_slot_idxs, self.array_slot_idxs
+ )
  for _ in range(num_rows)
  ]

  def add_row(self, row: Optional[exprs.DataRow] = None) -> exprs.DataRow:
  if row is None:
  row = exprs.DataRow(
- self.row_builder.num_materialized, self.img_slot_idxs, self.media_slot_idxs, self.array_slot_idxs)
+ self.row_builder.num_materialized, self.img_slot_idxs, self.media_slot_idxs, self.array_slot_idxs
+ )
  self.rows.append(row)
  return row

@@ -65,8 +74,10 @@ class DataRowBatch:
  return self.rows[index]

  def flush_imgs(
- self, idx_range: Optional[slice] = None, stored_img_info: Optional[list[exprs.ColumnSlotIdx]] = None,
- flushed_slot_idxs: Optional[list[int]] = None
+ self,
+ idx_range: Optional[slice] = None,
+ stored_img_info: Optional[list[exprs.ColumnSlotIdx]] = None,
+ flushed_slot_idxs: Optional[list[int]] = None,
  ) -> None:
  """Flushes images in the given range of rows."""
  assert self.tbl is not None

pixeltable/exec/exec_context.py CHANGED
@@ -4,12 +4,19 @@ import sqlalchemy as sql

  import pixeltable.exprs as exprs

+
  class ExecContext:
  """Class for execution runtime constants"""
+
  def __init__(
- self, row_builder: exprs.RowBuilder, *, show_pbar: bool = False, batch_size: int = 0,
- pk_clause: Optional[list[sql.ClauseElement]] = None, num_computed_exprs: int = 0,
- ignore_errors: bool = False
+ self,
+ row_builder: exprs.RowBuilder,
+ *,
+ show_pbar: bool = False,
+ batch_size: int = 0,
+ pk_clause: Optional[list[sql.ClauseElement]] = None,
+ num_computed_exprs: int = 0,
+ ignore_errors: bool = False,
  ):
  self.show_pbar = show_pbar
  self.batch_size = batch_size

pixeltable/exec/exec_node.py CHANGED
@@ -4,16 +4,19 @@ import abc
  import asyncio
  import logging
  import sys
- from typing import Iterable, Iterator, Optional, TypeVar, AsyncIterator
+ from typing import AsyncIterator, Iterable, Iterator, Optional, TypeVar

  import pixeltable.exprs as exprs
+
  from .data_row_batch import DataRowBatch
  from .exec_context import ExecContext

  _logger = logging.getLogger('pixeltable')

+
  class ExecNode(abc.ABC):
  """Base class of all execution nodes"""
+
  output_exprs: Iterable[exprs.Expr]
  row_builder: exprs.RowBuilder
  input: Optional[ExecNode]
@@ -22,8 +25,12 @@ class ExecNode(abc.ABC):
  ctx: Optional[ExecContext]

  def __init__(
- self, row_builder: exprs.RowBuilder, output_exprs: Iterable[exprs.Expr],
- input_exprs: Iterable[exprs.Expr], input: Optional[ExecNode] = None):
+ self,
+ row_builder: exprs.RowBuilder,
+ output_exprs: Iterable[exprs.Expr],
+ input_exprs: Iterable[exprs.Expr],
+ input: Optional[ExecNode] = None,
+ ):
  self.output_exprs = output_exprs
  self.row_builder = row_builder
  self.input = input
@@ -31,8 +38,7 @@ class ExecNode(abc.ABC):
  output_slot_idxs = {e.slot_idx for e in output_exprs}
  output_dependencies = row_builder.get_dependencies(output_exprs, exclude=input_exprs)
  self.flushed_img_slots = [
- e.slot_idx for e in output_dependencies
- if e.col_type.is_image_type() and e.slot_idx not in output_slot_idxs
+ e.slot_idx for e in output_dependencies if e.col_type.is_image_type() and e.slot_idx not in output_slot_idxs
  ]
  self.stored_img_cols = []
  self.ctx = None # all nodes of a tree share the same context
@@ -53,16 +59,20 @@ class ExecNode(abc.ABC):
  pass

  def __iter__(self) -> Iterator[DataRowBatch]:
+ running_loop: Optional[asyncio.AbstractEventLoop] = None
+ loop: asyncio.AbstractEventLoop
  try:
- # check if we are already in an event loop (eg, Jupyter's); if so, patch it to allow nested event loops
- _ = asyncio.get_event_loop()
+ # check if we are already in an event loop (eg, Jupyter's); if so, patch it to allow
+ # multiple run_until_complete()
+ running_loop = asyncio.get_running_loop()
  import nest_asyncio # type: ignore
+
  nest_asyncio.apply()
+ loop = running_loop
+ _logger.debug(f'Patched running loop')
  except RuntimeError:
- pass
-
- loop = asyncio.new_event_loop()
- asyncio.set_event_loop(loop)
+ loop = asyncio.new_event_loop()
+ asyncio.set_event_loop(loop)

  if 'pytest' in sys.modules:
  loop.set_debug(True)
@@ -75,7 +85,8 @@ class ExecNode(abc.ABC):
  except StopAsyncIteration:
  pass
  finally:
- loop.close()
+ if loop != running_loop:
+ loop.close()

  def open(self) -> None:
  """Bottom-up initialization of nodes for execution. Must be called before __next__."""

pixeltable/exec/expr_eval/evaluators.py CHANGED
@@ -5,10 +5,10 @@ import datetime
  import itertools
  import logging
  import sys
- from typing import Iterator, Any, Optional, Callable, cast
+ from typing import Any, Callable, Iterator, Optional, cast
+
+ from pixeltable import exprs, func

- from pixeltable import exprs
- from pixeltable import func
  from .globals import Dispatcher, Evaluator, FnCallArgs

  _logger = logging.getLogger('pixeltable')
@@ -23,6 +23,7 @@ class DefaultExprEvaluator(Evaluator):
  TODO:
  - parallelize via Ray
  """
+
  e: exprs.Expr

  def __init__(self, e: exprs.Expr, dispatcher: Dispatcher):
@@ -32,8 +33,7 @@ class DefaultExprEvaluator(Evaluator):
  def schedule(self, rows: list[exprs.DataRow], slot_idx: int) -> None:
  assert self.e.slot_idx >= 0
  task = asyncio.create_task(self.eval(rows))
- self.dispatcher.tasks.add(task)
- task.add_done_callback(self.dispatcher.done_cb)
+ self.dispatcher.register_task(task)

  async def eval(self, rows: list[exprs.DataRow]) -> None:
  rows_with_excs: set[int] = set() # records idxs into rows
@@ -61,6 +61,7 @@ class FnCallEvaluator(Evaluator):
  TODO:
  - adaptive batching: finding the optimal batch size based on observed execution times
  """
+
  fn_call: exprs.FunctionCall
  fn: func.CallableFunction
  scalar_py_fn: Optional[Callable] # only set for non-batching CallableFunctions
@@ -74,7 +75,7 @@ class FnCallEvaluator(Evaluator):
  self.fn_call = fn_call
  self.fn = cast(func.CallableFunction, fn_call.fn)
  if isinstance(self.fn, func.CallableFunction) and self.fn.is_batched:
- self.call_args_queue = asyncio.Queue[FnCallArgs]()
+ self.call_args_queue = asyncio.Queue[FnCallArgs]()
  # we're not supplying sample arguments there, they're ignored anyway
  self.batch_size = self.fn.get_batch_size()
  self.scalar_py_fn = None
@@ -134,8 +135,7 @@ class FnCallEvaluator(Evaluator):
  scheduler.submit(batched_call_args)
  else:
  task = asyncio.create_task(self.eval_batch(batched_call_args))
- self.dispatcher.tasks.add(task)
- task.add_done_callback(self.dispatcher.done_cb)
+ self.dispatcher.register_task(task)

  elif self.fn.is_async:
  if self.fn_call.resource_pool is not None:
@@ -147,14 +147,12 @@ class FnCallEvaluator(Evaluator):
  # create one task per call
  for item in rows_call_args:
  task = asyncio.create_task(self.eval_async(item))
- self.dispatcher.tasks.add(task)
- task.add_done_callback(self.dispatcher.done_cb)
+ self.dispatcher.register_task(task)

  else:
  # create a single task for all rows
  task = asyncio.create_task(self.eval(rows_call_args))
- self.dispatcher.tasks.add(task)
- task.add_done_callback(self.dispatcher.done_cb)
+ self.dispatcher.register_task(task)

  def _queued_call_args_iter(self) -> Iterator[FnCallArgs]:
  while not self.call_args_queue.empty():
@@ -171,14 +169,16 @@ class FnCallEvaluator(Evaluator):
  for k in item.kwargs.keys():
  batch_kwargs[k][i] = item.kwargs[k]
  return FnCallArgs(
- self.fn_call, [item.row for item in call_args], batch_args=batch_args, batch_kwargs=batch_kwargs)
+ self.fn_call, [item.row for item in call_args], batch_args=batch_args, batch_kwargs=batch_kwargs
+ )

  async def eval_batch(self, batched_call_args: FnCallArgs) -> None:
  result_batch: list[Any]
  try:
  if self.fn.is_async:
  result_batch = await self.fn.aexec_batch(
- *batched_call_args.batch_args, **batched_call_args.batch_kwargs)
+ *batched_call_args.batch_args, **batched_call_args.batch_kwargs
+ )
  else:
  # check for cancellation before starting something potentially long-running
  if asyncio.current_task().cancelled() or self.dispatcher.exc_event.is_set():
@@ -209,6 +209,7 @@ class FnCallEvaluator(Evaluator):
  self.dispatcher.dispatch([call_args.row])
  except Exception as exc:
  import anthropic
+
  if isinstance(exc, anthropic.RateLimitError):
  _logger.debug(f'RateLimitError: {exc}')
  _, _, exc_tb = sys.exc_info()
@@ -232,7 +233,8 @@ class FnCallEvaluator(Evaluator):
  rows_with_excs.add(idx)
  self.dispatcher.dispatch_exc(item.rows, self.fn_call.slot_idx, exc_tb)
  self.dispatcher.dispatch(
- [call_args_batch[i].row for i in range(len(call_args_batch)) if i not in rows_with_excs])
+ [call_args_batch[i].row for i in range(len(call_args_batch)) if i not in rows_with_excs]
+ )

  def _close(self) -> None:
  """Create a task for the incomplete batch of queued FnCallArgs, if any"""
@@ -241,5 +243,4 @@ class FnCallEvaluator(Evaluator):
  return
  batched_call_args = self._create_batch_call_args(list(self._queued_call_args_iter()))
  task = asyncio.create_task(self.eval_batch(batched_call_args))
- self.dispatcher.tasks.add(task)
- task.add_done_callback(self.dispatcher.done_cb)
+ self.dispatcher.register_task(task)
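
The recurring tasks.add(...) / add_done_callback(...) pairs above are folded into a single dispatcher.register_task() call. The sketch below shows one plausible shape for such a helper (illustrative only; pixeltable's actual Dispatcher lives in pixeltable/exec/expr_eval/globals.py and may differ): keeping a strong reference to each task prevents it from being garbage-collected mid-flight, and the done callback drops that reference and flags failures via an event.

import asyncio
import logging

_logger = logging.getLogger('pixeltable')

class DispatcherSketch:
    """Illustrative task registry: hold strong refs, route completions through one callback."""

    def __init__(self) -> None:
        self.tasks: set[asyncio.Task] = set()
        self.exc_event = asyncio.Event()  # set when any registered task fails

    def register_task(self, task: asyncio.Task) -> None:
        # keep a strong reference so the task isn't garbage-collected while running
        self.tasks.add(task)
        task.add_done_callback(self.done_cb)

    def done_cb(self, task: asyncio.Task) -> None:
        self.tasks.discard(task)
        if task.cancelled():
            return
        exc = task.exception()
        if exc is not None:
            _logger.debug(f'task failed: {exc!r}')
            self.exc_event.set()

async def main() -> None:
    dispatcher = DispatcherSketch()
    dispatcher.register_task(asyncio.create_task(asyncio.sleep(0.01)))
    await asyncio.gather(*dispatcher.tasks, return_exceptions=True)

asyncio.run(main())

Centralizing registration in one method keeps the bookkeeping (strong reference, completion callback, error propagation) in a single place instead of repeating it at every create_task() call site, which is what the 0.3.3 refactor accomplishes.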