pixeltable 0.3.2__py3-none-any.whl → 0.3.4__py3-none-any.whl

This diff compares the contents of two publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as they appear in their public registry.

Potentially problematic release: this version of pixeltable might be problematic (see the registry's advisory for details).

Files changed (150)
  1. pixeltable/__init__.py +64 -11
  2. pixeltable/__version__.py +2 -2
  3. pixeltable/catalog/__init__.py +1 -1
  4. pixeltable/catalog/catalog.py +50 -27
  5. pixeltable/catalog/column.py +27 -11
  6. pixeltable/catalog/dir.py +6 -4
  7. pixeltable/catalog/globals.py +8 -1
  8. pixeltable/catalog/insertable_table.py +22 -12
  9. pixeltable/catalog/named_function.py +10 -6
  10. pixeltable/catalog/path.py +3 -2
  11. pixeltable/catalog/path_dict.py +8 -6
  12. pixeltable/catalog/schema_object.py +2 -1
  13. pixeltable/catalog/table.py +121 -101
  14. pixeltable/catalog/table_version.py +291 -142
  15. pixeltable/catalog/table_version_path.py +8 -5
  16. pixeltable/catalog/view.py +67 -26
  17. pixeltable/dataframe.py +106 -81
  18. pixeltable/env.py +28 -24
  19. pixeltable/exec/__init__.py +2 -2
  20. pixeltable/exec/aggregation_node.py +10 -4
  21. pixeltable/exec/cache_prefetch_node.py +5 -3
  22. pixeltable/exec/component_iteration_node.py +9 -9
  23. pixeltable/exec/data_row_batch.py +21 -10
  24. pixeltable/exec/exec_context.py +10 -3
  25. pixeltable/exec/exec_node.py +23 -12
  26. pixeltable/exec/expr_eval/evaluators.py +13 -7
  27. pixeltable/exec/expr_eval/expr_eval_node.py +24 -15
  28. pixeltable/exec/expr_eval/globals.py +30 -7
  29. pixeltable/exec/expr_eval/row_buffer.py +5 -6
  30. pixeltable/exec/expr_eval/schedulers.py +151 -31
  31. pixeltable/exec/in_memory_data_node.py +8 -7
  32. pixeltable/exec/row_update_node.py +15 -5
  33. pixeltable/exec/sql_node.py +56 -27
  34. pixeltable/exprs/__init__.py +2 -2
  35. pixeltable/exprs/arithmetic_expr.py +57 -26
  36. pixeltable/exprs/array_slice.py +1 -1
  37. pixeltable/exprs/column_property_ref.py +2 -1
  38. pixeltable/exprs/column_ref.py +20 -15
  39. pixeltable/exprs/comparison.py +6 -2
  40. pixeltable/exprs/compound_predicate.py +1 -3
  41. pixeltable/exprs/data_row.py +2 -2
  42. pixeltable/exprs/expr.py +108 -72
  43. pixeltable/exprs/expr_dict.py +2 -1
  44. pixeltable/exprs/expr_set.py +3 -1
  45. pixeltable/exprs/function_call.py +39 -41
  46. pixeltable/exprs/globals.py +1 -0
  47. pixeltable/exprs/in_predicate.py +2 -2
  48. pixeltable/exprs/inline_expr.py +20 -17
  49. pixeltable/exprs/json_mapper.py +4 -2
  50. pixeltable/exprs/json_path.py +12 -18
  51. pixeltable/exprs/literal.py +5 -9
  52. pixeltable/exprs/method_ref.py +1 -0
  53. pixeltable/exprs/object_ref.py +1 -1
  54. pixeltable/exprs/row_builder.py +32 -17
  55. pixeltable/exprs/rowid_ref.py +14 -5
  56. pixeltable/exprs/similarity_expr.py +11 -6
  57. pixeltable/exprs/sql_element_cache.py +1 -1
  58. pixeltable/exprs/type_cast.py +24 -9
  59. pixeltable/ext/__init__.py +1 -0
  60. pixeltable/ext/functions/__init__.py +1 -0
  61. pixeltable/ext/functions/whisperx.py +2 -2
  62. pixeltable/ext/functions/yolox.py +11 -11
  63. pixeltable/func/aggregate_function.py +17 -13
  64. pixeltable/func/callable_function.py +6 -6
  65. pixeltable/func/expr_template_function.py +15 -14
  66. pixeltable/func/function.py +16 -16
  67. pixeltable/func/function_registry.py +11 -8
  68. pixeltable/func/globals.py +4 -2
  69. pixeltable/func/query_template_function.py +12 -13
  70. pixeltable/func/signature.py +18 -9
  71. pixeltable/func/tools.py +10 -17
  72. pixeltable/func/udf.py +106 -11
  73. pixeltable/functions/__init__.py +21 -2
  74. pixeltable/functions/anthropic.py +16 -12
  75. pixeltable/functions/fireworks.py +63 -5
  76. pixeltable/functions/gemini.py +13 -3
  77. pixeltable/functions/globals.py +18 -6
  78. pixeltable/functions/huggingface.py +20 -38
  79. pixeltable/functions/image.py +7 -3
  80. pixeltable/functions/json.py +1 -0
  81. pixeltable/functions/llama_cpp.py +1 -4
  82. pixeltable/functions/mistralai.py +31 -20
  83. pixeltable/functions/ollama.py +4 -18
  84. pixeltable/functions/openai.py +231 -113
  85. pixeltable/functions/replicate.py +11 -10
  86. pixeltable/functions/string.py +70 -7
  87. pixeltable/functions/timestamp.py +21 -8
  88. pixeltable/functions/together.py +66 -52
  89. pixeltable/functions/video.py +1 -0
  90. pixeltable/functions/vision.py +14 -11
  91. pixeltable/functions/whisper.py +2 -1
  92. pixeltable/globals.py +60 -26
  93. pixeltable/index/__init__.py +1 -1
  94. pixeltable/index/btree.py +5 -3
  95. pixeltable/index/embedding_index.py +15 -14
  96. pixeltable/io/__init__.py +1 -1
  97. pixeltable/io/external_store.py +30 -25
  98. pixeltable/io/fiftyone.py +6 -14
  99. pixeltable/io/globals.py +33 -27
  100. pixeltable/io/hf_datasets.py +2 -1
  101. pixeltable/io/label_studio.py +77 -68
  102. pixeltable/io/pandas.py +36 -23
  103. pixeltable/io/parquet.py +9 -12
  104. pixeltable/iterators/__init__.py +1 -0
  105. pixeltable/iterators/audio.py +205 -0
  106. pixeltable/iterators/document.py +19 -8
  107. pixeltable/iterators/image.py +6 -24
  108. pixeltable/iterators/string.py +3 -6
  109. pixeltable/iterators/video.py +1 -7
  110. pixeltable/metadata/__init__.py +7 -1
  111. pixeltable/metadata/converters/convert_10.py +2 -2
  112. pixeltable/metadata/converters/convert_15.py +1 -5
  113. pixeltable/metadata/converters/convert_16.py +2 -4
  114. pixeltable/metadata/converters/convert_17.py +2 -4
  115. pixeltable/metadata/converters/convert_18.py +2 -4
  116. pixeltable/metadata/converters/convert_19.py +2 -5
  117. pixeltable/metadata/converters/convert_20.py +1 -4
  118. pixeltable/metadata/converters/convert_21.py +4 -6
  119. pixeltable/metadata/converters/convert_22.py +1 -0
  120. pixeltable/metadata/converters/convert_23.py +5 -5
  121. pixeltable/metadata/converters/convert_24.py +12 -13
  122. pixeltable/metadata/converters/convert_26.py +23 -0
  123. pixeltable/metadata/converters/util.py +3 -4
  124. pixeltable/metadata/notes.py +1 -0
  125. pixeltable/metadata/schema.py +13 -2
  126. pixeltable/plan.py +173 -98
  127. pixeltable/share/__init__.py +0 -0
  128. pixeltable/share/packager.py +218 -0
  129. pixeltable/store.py +42 -26
  130. pixeltable/type_system.py +102 -75
  131. pixeltable/utils/arrow.py +7 -8
  132. pixeltable/utils/coco.py +16 -17
  133. pixeltable/utils/code.py +1 -1
  134. pixeltable/utils/console_output.py +6 -3
  135. pixeltable/utils/description_helper.py +7 -7
  136. pixeltable/utils/documents.py +3 -1
  137. pixeltable/utils/filecache.py +12 -7
  138. pixeltable/utils/http_server.py +9 -8
  139. pixeltable/utils/iceberg.py +14 -0
  140. pixeltable/utils/media_store.py +3 -2
  141. pixeltable/utils/pytorch.py +11 -14
  142. pixeltable/utils/s3.py +1 -0
  143. pixeltable/utils/sql.py +1 -0
  144. pixeltable/utils/transactional_directory.py +2 -2
  145. {pixeltable-0.3.2.dist-info → pixeltable-0.3.4.dist-info}/METADATA +9 -9
  146. pixeltable-0.3.4.dist-info/RECORD +166 -0
  147. pixeltable-0.3.2.dist-info/RECORD +0 -161
  148. {pixeltable-0.3.2.dist-info → pixeltable-0.3.4.dist-info}/LICENSE +0 -0
  149. {pixeltable-0.3.2.dist-info → pixeltable-0.3.4.dist-info}/WHEEL +0 -0
  150. {pixeltable-0.3.2.dist-info → pixeltable-0.3.4.dist-info}/entry_points.txt +0 -0
pixeltable/env.py CHANGED
@@ -1,6 +1,5 @@
  from __future__ import annotations

- from abc import abstractmethod
  import datetime
  import glob
  import http.server
@@ -16,6 +15,7 @@ import sys
  import threading
  import uuid
  import warnings
+ from abc import abstractmethod
  from dataclasses import dataclass, field
  from pathlib import Path
  from sys import stdout
@@ -333,9 +333,7 @@ class Env:
  http_logger.addHandler(http_fh)
  http_logger.propagate = False

- # empty tmp dir
- for path in glob.glob(f'{self._tmp_dir}/*'):
- os.remove(path)
+ self.clear_tmp_dir()

  self._db_name = os.environ.get('PIXELTABLE_DB', 'pixeltable')
  self._pgdata_dir = Path(os.environ.get('PIXELTABLE_PGDATA', str(self._home / 'pgdata')))
@@ -375,6 +373,7 @@

  if create_db:
  from pixeltable.metadata import schema
+
  schema.base_metadata.create_all(self._sa_engine)
  metadata.create_system_info(self._sa_engine)

@@ -387,11 +386,7 @@
  def _create_engine(self, time_zone_name: Optional[str], echo: bool = False) -> None:
  connect_args = {} if time_zone_name is None else {'options': f'-c timezone={time_zone_name}'}
  self._sa_engine = sql.create_engine(
- self.db_url,
- echo=echo,
- future=True,
- isolation_level='REPEATABLE READ',
- connect_args=connect_args,
+ self.db_url, echo=echo, future=True, isolation_level='REPEATABLE READ', connect_args=connect_args
  )
  self._logger.info(f'Created SQLAlchemy engine at: {self.db_url}')
  with self.engine.begin() as conn:
@@ -424,7 +419,7 @@
  with engine.begin() as conn:
  # use C collation to get standard C/Python-style sorting
  stmt = (
- f"CREATE DATABASE {preparer.quote(self._db_name)} "
+ f'CREATE DATABASE {preparer.quote(self._db_name)} '
  "ENCODING 'utf-8' LC_COLLATE 'C' LC_CTYPE 'C' TEMPLATE template0"
  )
  conn.execute(sql.text(stmt))
@@ -448,12 +443,12 @@
  try:
  with engine.begin() as conn:
  # terminate active connections
- stmt = (f"""
+ stmt = f"""
  SELECT pg_terminate_backend(pg_stat_activity.pid)
  FROM pg_stat_activity
  WHERE pg_stat_activity.datname = '{self._db_name}'
  AND pid <> pg_backend_pid()
- """)
+ """
  conn.execute(sql.text(stmt))
  # drop db
  stmt = f'DROP DATABASE {preparer.quote(self._db_name)}'
@@ -563,7 +558,7 @@
  is_installed = False
  self.__optional_packages[package_name] = PackageInfo(
  is_installed=is_installed,
- library_name=library_name or package_name # defaults to package_name unless specified otherwise
+ library_name=library_name or package_name, # defaults to package_name unless specified otherwise
  )

  def require_package(self, package_name: str, min_version: Optional[list[int]] = None) -> None:
@@ -609,6 +604,7 @@
  """
  import spacy
  from spacy.cli.download import get_model_filename
+
  spacy_model = 'en_core_web_sm'
  spacy_model_version = '3.7.1'
  filename = get_model_filename(spacy_model, spacy_model_version, sdist=False)
@@ -626,18 +622,24 @@
  self._logger.warn(f'Failed to load spaCy model: {spacy_model}', exc_info=exc)
  warnings.warn(
  f"Failed to load spaCy model '{spacy_model}'. spaCy features will not be available.",
- excs.PixeltableWarning
+ excs.PixeltableWarning,
  )
  self.__optional_packages['spacy'].is_installed = False

+ def clear_tmp_dir(self) -> None:
+ for path in glob.glob(f'{self._tmp_dir}/*'):
+ if os.path.isdir(path):
+ shutil.rmtree(path)
+ else:
+ os.remove(path)
+
  def num_tmp_files(self) -> int:
  return len(glob.glob(f'{self._tmp_dir}/*'))

  def create_tmp_path(self, extension: str = '') -> Path:
  return self._tmp_dir / f'{uuid.uuid4()}{extension}'

-
- #def get_resource_pool_info(self, pool_id: str, pool_info_cls: Optional[Type[T]]) -> T:
+ # def get_resource_pool_info(self, pool_id: str, pool_info_cls: Optional[Type[T]]) -> T:
  def get_resource_pool_info(self, pool_id: str, make_pool_info: Optional[Callable[[], T]] = None) -> T:
  """Returns the info object for the given id, creating it if necessary."""
  info = self._resource_pool_info.get(pool_id)
@@ -707,6 +709,7 @@ def register_client(name: str) -> Callable:
  Args:
  - name (str): The name of the API client (e.g., 'openai' or 'label-studio').
  """
+
  def decorator(fn: Callable) -> None:
  global _registered_clients
  sig = inspect.signature(fn)
@@ -721,6 +724,7 @@ class Config:
  The (global) Pixeltable configuration, as loaded from `config.toml`. Provides methods for retrieving
  configuration values, which can be set in the config file or as environment variables.
  """
+
  __config: dict[str, Any]

  @classmethod
@@ -750,12 +754,7 @@
  free_disk_space_bytes = shutil.disk_usage(config_path.parent).free
  # Default cache size is 1/5 of free disk space
  file_cache_size_g = free_disk_space_bytes / 5 / (1 << 30)
- return {
- 'pixeltable': {
- 'file_cache_size_g': round(file_cache_size_g, 1),
- 'hide_warnings': False,
- }
- }
+ return {'pixeltable': {'file_cache_size_g': round(file_cache_size_g, 1), 'hide_warnings': False}}

  def __init__(self, config: dict[str, Any]) -> None:
  self.__config = config
@@ -840,7 +839,9 @@ class RateLimitsInfo:
  self.resource_limits = {k: RateLimitInfo(k, now, *v) for k, v in kwargs.items() if v is not None}
  # TODO: remove
  for info in self.resource_limits.values():
- _logger.debug(f'Init {info.resource} rate limit: rem={info.remaining} reset={info.reset_at.strftime(TIME_FORMAT)} delta={(info.reset_at - now).total_seconds()}')
+ _logger.debug(
+ f'Init {info.resource} rate limit: rem={info.remaining} reset={info.reset_at.strftime(TIME_FORMAT)} delta={(info.reset_at - now).total_seconds()}'
+ )
  else:
  for k, v in kwargs.items():
  if v is not None:
@@ -855,6 +856,7 @@ class RateLimitsInfo:
  @dataclass
  class RateLimitInfo:
  """Container for rate limit-related information for a single resource."""
+
  resource: str
  recorded_at: datetime.datetime
  limit: int
@@ -871,4 +873,6 @@ class RateLimitInfo:
  reset_delta = reset_at - self.reset_at
  self.reset_at = reset_at
  # TODO: remove
- _logger.debug(f'Update {self.resource} rate limit: rem={self.remaining} reset={self.reset_at.strftime(TIME_FORMAT)} reset_delta={reset_delta.total_seconds()} recorded_delta={(self.reset_at - recorded_at).total_seconds()}')
+ _logger.debug(
+ f'Update {self.resource} rate limit: rem={self.remaining} reset={self.reset_at.strftime(TIME_FORMAT)} reset_delta={reset_delta.total_seconds()} recorded_delta={(self.reset_at - recorded_at).total_seconds()}'
+ )
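Note on the env.py changes above: the inline tmp-dir cleanup loop is replaced by a new Env.clear_tmp_dir() method that also handles subdirectories (the old loop only called os.remove, which fails on a directory). A minimal standalone sketch of the same pattern, operating on a plain directory path instead of the Env internals (clear_dir is a hypothetical name, not part of the pixeltable API):

    import glob
    import os
    import shutil

    def clear_dir(tmp_dir: str) -> None:
        # Remove every entry directly under tmp_dir: recurse into
        # subdirectories, unlink everything else.
        for path in glob.glob(f'{tmp_dir}/*'):
            if os.path.isdir(path):
                shutil.rmtree(path)
            else:
                os.remove(path)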
pixeltable/exec/__init__.py CHANGED
@@ -4,7 +4,7 @@ from .component_iteration_node import ComponentIterationNode
  from .data_row_batch import DataRowBatch
  from .exec_context import ExecContext
  from .exec_node import ExecNode
+ from .expr_eval import ExprEvalNode
  from .in_memory_data_node import InMemoryDataNode
  from .row_update_node import RowUpdateNode
- from .sql_node import SqlLookupNode, SqlScanNode, SqlAggregationNode, SqlNode, SqlJoinNode
- from .expr_eval import ExprEvalNode
+ from .sql_node import SqlAggregationNode, SqlJoinNode, SqlLookupNode, SqlNode, SqlScanNode
pixeltable/exec/aggregation_node.py CHANGED
@@ -2,7 +2,7 @@ from __future__ import annotations

  import logging
  import sys
- from typing import Any, Iterable, Iterator, Optional, cast, AsyncIterator
+ from typing import Any, AsyncIterator, Iterable, Iterator, Optional, cast

  import pixeltable.catalog as catalog
  import pixeltable.exceptions as excs
@@ -13,12 +13,14 @@ from .exec_node import ExecNode

  _logger = logging.getLogger('pixeltable')

+
  class AggregationNode(ExecNode):
  """
  In-memory aggregation for UDAs.

  At the moment, this returns all results in a single DataRowBatch.
  """
+
  group_by: Optional[list[exprs.Expr]]
  input_exprs: list[exprs.Expr]
  agg_fn_eval_ctx: exprs.RowBuilder.EvalCtx
@@ -26,8 +28,13 @@ class AggregationNode(ExecNode):
  output_batch: DataRowBatch

  def __init__(
- self, tbl: catalog.TableVersion, row_builder: exprs.RowBuilder, group_by: Optional[list[exprs.Expr]],
- agg_fn_calls: list[exprs.FunctionCall], input_exprs: Iterable[exprs.Expr], input: ExecNode
+ self,
+ tbl: catalog.TableVersion,
+ row_builder: exprs.RowBuilder,
+ group_by: Optional[list[exprs.Expr]],
+ agg_fn_calls: list[exprs.FunctionCall],
+ input_exprs: Iterable[exprs.Expr],
+ input: ExecNode,
  ):
  output_exprs: list[exprs.Expr] = [] if group_by is None else list(group_by)
  output_exprs.extend(agg_fn_calls)
@@ -86,4 +93,3 @@
  self.output_batch.flush_imgs(None, self.stored_img_cols, self.flushed_img_slots)
  _logger.debug(f'AggregateNode: consumed {num_input_rows} rows, returning {len(self.output_batch.rows)} rows')
  yield self.output_batch
-
pixeltable/exec/cache_prefetch_node.py CHANGED
@@ -9,7 +9,7 @@ import urllib.request
  from collections import deque
  from concurrent import futures
  from pathlib import Path
- from typing import Optional, Any, Iterator, AsyncIterator
+ from typing import Any, AsyncIterator, Iterator, Optional
  from uuid import UUID

  import pixeltable.env as env
@@ -30,6 +30,7 @@ class CachePrefetchNode(ExecNode):
  TODO:
  - adapting the number of download threads at runtime to maximize throughput
  """
+
  BATCH_SIZE = 16
  NUM_EXECUTOR_THREADS = 16

@@ -59,8 +60,8 @@
  num_missing: int # number of missing URLs in this row

  def __init__(
- self, tbl_id: UUID, file_col_info: list[exprs.ColumnSlotIdx], input: ExecNode,
- retain_input_order: bool = True):
+ self, tbl_id: UUID, file_col_info: list[exprs.ColumnSlotIdx], input: ExecNode, retain_input_order: bool = True
+ ):
  # input_/output_exprs=[]: we don't have anything to evaluate
  super().__init__(input.row_builder, [], [], input)
  self.retain_input_order = retain_input_order
@@ -241,6 +242,7 @@
  _logger.debug(f'Downloading {url} to {tmp_path}')
  if parsed.scheme == 's3':
  from pixeltable.utils.s3 import get_client
+
  with self.boto_client_lock:
  if self.boto_client is None:
  config = {
pixeltable/exec/component_iteration_node.py CHANGED
@@ -1,5 +1,4 @@
- import inspect
- from typing import Iterator, Optional, AsyncIterator
+ from typing import AsyncIterator

  import pixeltable.catalog as catalog
  import pixeltable.exceptions as excs
@@ -14,6 +13,7 @@ class ComponentIterationNode(ExecNode):

  Returns row batches of OUTPUT_BATCH_SIZE size.
  """
+
  __OUTPUT_BATCH_SIZE = 1024

  def __init__(self, view: catalog.TableVersion, input: ExecNode):
@@ -25,8 +25,8 @@
  self.iterator_args = iterator_args[0]
  assert isinstance(self.iterator_args, exprs.InlineDict)
  self.iterator_args_ctx = self.row_builder.create_eval_ctx([self.iterator_args])
- self.iterator_output_schema, self.unstored_column_names = (
- self.view.iterator_cls.output_schema(**self.iterator_args.to_kwargs())
+ self.iterator_output_schema, self.unstored_column_names = self.view.iterator_cls.output_schema(
+ **self.iterator_args.to_kwargs()
  )
  self.iterator_output_fields = list(self.iterator_output_schema.keys())
  self.iterator_output_cols = {
@@ -34,7 +34,8 @@
  }
  # referenced iterator output fields
  self.refd_output_slot_idxs = {
- e.col.name: e.slot_idx for e in self.row_builder.unique_exprs
+ e.col.name: e.slot_idx
+ for e in self.row_builder.unique_exprs
  if isinstance(e, exprs.ColumnRef) and e.col.name in self.iterator_output_fields
  }

@@ -79,8 +80,7 @@
  # verify and copy component_dict fields to their respective slots in output_row
  for field_name, field_val in component_dict.items():
  if field_name not in self.iterator_output_fields:
- raise excs.Error(
- f'Invalid field name {field_name} in output of {self.view.iterator_cls.__name__}')
+ raise excs.Error(f'Invalid field name {field_name} in output of {self.view.iterator_cls.__name__}')
  if field_name not in self.refd_output_slot_idxs:
  # we can ignore this
  continue
@@ -90,5 +90,5 @@
  if len(component_dict) != len(self.iterator_output_fields):
  missing_fields = set(self.refd_output_slot_idxs.keys()) - set(component_dict.keys())
  raise excs.Error(
- f'Invalid output of {self.view.iterator_cls.__name__}: '
- f'missing fields {", ".join(missing_fields)}')
+ f'Invalid output of {self.view.iterator_cls.__name__}: missing fields {", ".join(missing_fields)}'
+ )
pixeltable/exec/data_row_batch.py CHANGED
@@ -1,19 +1,21 @@
  from __future__ import annotations
- from typing import Iterator, Optional
+
  import logging
+ from typing import Iterator, Optional

- import pixeltable.exprs as exprs
  import pixeltable.catalog as catalog
+ import pixeltable.exprs as exprs
  from pixeltable.utils.media_store import MediaStore

-
  _logger = logging.getLogger('pixeltable')

+
  class DataRowBatch:
  """Set of DataRows, indexed by rowid.

  Contains the metadata needed to initialize DataRows.
  """
+
  tbl: Optional[catalog.TableVersion]
  row_builder: exprs.RowBuilder
  img_slot_idxs: list[int]
@@ -22,8 +24,11 @@ class DataRowBatch:
  rows: list[exprs.DataRow]

  def __init__(
- self, tbl: Optional[catalog.TableVersion], row_builder: exprs.RowBuilder, num_rows: Optional[int] = None,
- rows: Optional[list[exprs.DataRow]] = None
+ self,
+ tbl: Optional[catalog.TableVersion],
+ row_builder: exprs.RowBuilder,
+ num_rows: Optional[int] = None,
+ rows: Optional[list[exprs.DataRow]] = None,
  ):
  """
  Requires either num_rows or rows to be specified, but not both.
@@ -34,7 +39,8 @@
  self.img_slot_idxs = [e.slot_idx for e in row_builder.unique_exprs if e.col_type.is_image_type()]
  # non-image media slots
  self.media_slot_idxs = [
- e.slot_idx for e in row_builder.unique_exprs
+ e.slot_idx
+ for e in row_builder.unique_exprs
  if e.col_type.is_media_type() and not e.col_type.is_image_type()
  ]
  self.array_slot_idxs = [e.slot_idx for e in row_builder.unique_exprs if e.col_type.is_array_type()]
@@ -44,14 +50,17 @@
  if num_rows is None:
  num_rows = 0
  self.rows = [
- exprs.DataRow(row_builder.num_materialized, self.img_slot_idxs, self.media_slot_idxs, self.array_slot_idxs)
+ exprs.DataRow(
+ row_builder.num_materialized, self.img_slot_idxs, self.media_slot_idxs, self.array_slot_idxs
+ )
  for _ in range(num_rows)
  ]

  def add_row(self, row: Optional[exprs.DataRow] = None) -> exprs.DataRow:
  if row is None:
  row = exprs.DataRow(
- self.row_builder.num_materialized, self.img_slot_idxs, self.media_slot_idxs, self.array_slot_idxs)
+ self.row_builder.num_materialized, self.img_slot_idxs, self.media_slot_idxs, self.array_slot_idxs
+ )
  self.rows.append(row)
  return row

@@ -65,8 +74,10 @@
  return self.rows[index]

  def flush_imgs(
- self, idx_range: Optional[slice] = None, stored_img_info: Optional[list[exprs.ColumnSlotIdx]] = None,
- flushed_slot_idxs: Optional[list[int]] = None
+ self,
+ idx_range: Optional[slice] = None,
+ stored_img_info: Optional[list[exprs.ColumnSlotIdx]] = None,
+ flushed_slot_idxs: Optional[list[int]] = None,
  ) -> None:
  """Flushes images in the given range of rows."""
  assert self.tbl is not None
pixeltable/exec/exec_context.py CHANGED
@@ -4,12 +4,19 @@ import sqlalchemy as sql

  import pixeltable.exprs as exprs

+
  class ExecContext:
  """Class for execution runtime constants"""
+
  def __init__(
- self, row_builder: exprs.RowBuilder, *, show_pbar: bool = False, batch_size: int = 0,
- pk_clause: Optional[list[sql.ClauseElement]] = None, num_computed_exprs: int = 0,
- ignore_errors: bool = False
+ self,
+ row_builder: exprs.RowBuilder,
+ *,
+ show_pbar: bool = False,
+ batch_size: int = 0,
+ pk_clause: Optional[list[sql.ClauseElement]] = None,
+ num_computed_exprs: int = 0,
+ ignore_errors: bool = False,
  ):
  self.show_pbar = show_pbar
  self.batch_size = batch_size
pixeltable/exec/exec_node.py CHANGED
@@ -4,16 +4,19 @@ import abc
  import asyncio
  import logging
  import sys
- from typing import Iterable, Iterator, Optional, TypeVar, AsyncIterator
+ from typing import AsyncIterator, Iterable, Iterator, Optional, TypeVar

  import pixeltable.exprs as exprs
+
  from .data_row_batch import DataRowBatch
  from .exec_context import ExecContext

  _logger = logging.getLogger('pixeltable')

+
  class ExecNode(abc.ABC):
  """Base class of all execution nodes"""
+
  output_exprs: Iterable[exprs.Expr]
  row_builder: exprs.RowBuilder
  input: Optional[ExecNode]
@@ -22,8 +25,12 @@
  ctx: Optional[ExecContext]

  def __init__(
- self, row_builder: exprs.RowBuilder, output_exprs: Iterable[exprs.Expr],
- input_exprs: Iterable[exprs.Expr], input: Optional[ExecNode] = None):
+ self,
+ row_builder: exprs.RowBuilder,
+ output_exprs: Iterable[exprs.Expr],
+ input_exprs: Iterable[exprs.Expr],
+ input: Optional[ExecNode] = None,
+ ):
  self.output_exprs = output_exprs
  self.row_builder = row_builder
  self.input = input
@@ -31,8 +38,7 @@
  output_slot_idxs = {e.slot_idx for e in output_exprs}
  output_dependencies = row_builder.get_dependencies(output_exprs, exclude=input_exprs)
  self.flushed_img_slots = [
- e.slot_idx for e in output_dependencies
- if e.col_type.is_image_type() and e.slot_idx not in output_slot_idxs
+ e.slot_idx for e in output_dependencies if e.col_type.is_image_type() and e.slot_idx not in output_slot_idxs
  ]
  self.stored_img_cols = []
  self.ctx = None # all nodes of a tree share the same context
@@ -53,16 +59,20 @@
  pass

  def __iter__(self) -> Iterator[DataRowBatch]:
+ running_loop: Optional[asyncio.AbstractEventLoop] = None
+ loop: asyncio.AbstractEventLoop
  try:
- # check if we are already in an event loop (eg, Jupyter's); if so, patch it to allow nested event loops
- _ = asyncio.get_event_loop()
+ # check if we are already in an event loop (eg, Jupyter's); if so, patch it to allow
+ # multiple run_until_complete()
+ running_loop = asyncio.get_running_loop()
  import nest_asyncio # type: ignore
+
  nest_asyncio.apply()
+ loop = running_loop
+ _logger.debug(f'Patched running loop')
  except RuntimeError:
- pass
-
- loop = asyncio.new_event_loop()
- asyncio.set_event_loop(loop)
+ loop = asyncio.new_event_loop()
+ asyncio.set_event_loop(loop)

  if 'pytest' in sys.modules:
  loop.set_debug(True)
@@ -75,7 +85,8 @@
  except StopAsyncIteration:
  pass
  finally:
- loop.close()
+ if loop != running_loop:
+ loop.close()

  def open(self) -> None:
  """Bottom-up initialization of nodes for execution. Must be called before __next__."""
pixeltable/exec/expr_eval/evaluators.py CHANGED
@@ -5,10 +5,10 @@ import datetime
  import itertools
  import logging
  import sys
- from typing import Iterator, Any, Optional, Callable, cast
+ from typing import Any, Callable, Iterator, Optional, cast
+
+ from pixeltable import exprs, func

- from pixeltable import exprs
- from pixeltable import func
  from .globals import Dispatcher, Evaluator, FnCallArgs

  _logger = logging.getLogger('pixeltable')
@@ -23,6 +23,7 @@ class DefaultExprEvaluator(Evaluator):
  TODO:
  - parallelize via Ray
  """
+
  e: exprs.Expr

  def __init__(self, e: exprs.Expr, dispatcher: Dispatcher):
@@ -60,6 +61,7 @@ class FnCallEvaluator(Evaluator):
  TODO:
  - adaptive batching: finding the optimal batch size based on observed execution times
  """
+
  fn_call: exprs.FunctionCall
  fn: func.CallableFunction
  scalar_py_fn: Optional[Callable] # only set for non-batching CallableFunctions
@@ -73,7 +75,7 @@
  self.fn_call = fn_call
  self.fn = cast(func.CallableFunction, fn_call.fn)
  if isinstance(self.fn, func.CallableFunction) and self.fn.is_batched:
- self.call_args_queue = asyncio.Queue[FnCallArgs]()
+ self.call_args_queue = asyncio.Queue[FnCallArgs]()
  # we're not supplying sample arguments there, they're ignored anyway
  self.batch_size = self.fn.get_batch_size()
  self.scalar_py_fn = None
@@ -167,14 +169,16 @@
  for k in item.kwargs.keys():
  batch_kwargs[k][i] = item.kwargs[k]
  return FnCallArgs(
- self.fn_call, [item.row for item in call_args], batch_args=batch_args, batch_kwargs=batch_kwargs)
+ self.fn_call, [item.row for item in call_args], batch_args=batch_args, batch_kwargs=batch_kwargs
+ )

  async def eval_batch(self, batched_call_args: FnCallArgs) -> None:
  result_batch: list[Any]
  try:
  if self.fn.is_async:
  result_batch = await self.fn.aexec_batch(
- *batched_call_args.batch_args, **batched_call_args.batch_kwargs)
+ *batched_call_args.batch_args, **batched_call_args.batch_kwargs
+ )
  else:
  # check for cancellation before starting something potentially long-running
  if asyncio.current_task().cancelled() or self.dispatcher.exc_event.is_set():
@@ -205,6 +209,7 @@
  self.dispatcher.dispatch([call_args.row])
  except Exception as exc:
  import anthropic
+
  if isinstance(exc, anthropic.RateLimitError):
  _logger.debug(f'RateLimitError: {exc}')
  _, _, exc_tb = sys.exc_info()
@@ -228,7 +233,8 @@
  rows_with_excs.add(idx)
  self.dispatcher.dispatch_exc(item.rows, self.fn_call.slot_idx, exc_tb)
  self.dispatcher.dispatch(
- [call_args_batch[i].row for i in range(len(call_args_batch)) if i not in rows_with_excs])
+ [call_args_batch[i].row for i in range(len(call_args_batch)) if i not in rows_with_excs]
+ )

  def _close(self) -> None:
  """Create a task for the incomplete batch of queued FnCallArgs, if any"""