pixeltable 0.3.2__py3-none-any.whl → 0.3.3__py3-none-any.whl

This diff shows the changes between publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release: this version of pixeltable might be problematic.

Files changed (147)
  1. pixeltable/__init__.py +64 -11
  2. pixeltable/__version__.py +2 -2
  3. pixeltable/catalog/__init__.py +1 -1
  4. pixeltable/catalog/catalog.py +50 -27
  5. pixeltable/catalog/column.py +27 -11
  6. pixeltable/catalog/dir.py +6 -4
  7. pixeltable/catalog/globals.py +8 -1
  8. pixeltable/catalog/insertable_table.py +22 -12
  9. pixeltable/catalog/named_function.py +10 -6
  10. pixeltable/catalog/path.py +3 -2
  11. pixeltable/catalog/path_dict.py +8 -6
  12. pixeltable/catalog/schema_object.py +2 -1
  13. pixeltable/catalog/table.py +121 -101
  14. pixeltable/catalog/table_version.py +291 -142
  15. pixeltable/catalog/table_version_path.py +8 -5
  16. pixeltable/catalog/view.py +67 -26
  17. pixeltable/dataframe.py +102 -72
  18. pixeltable/env.py +20 -21
  19. pixeltable/exec/__init__.py +2 -2
  20. pixeltable/exec/aggregation_node.py +10 -4
  21. pixeltable/exec/cache_prefetch_node.py +5 -3
  22. pixeltable/exec/component_iteration_node.py +9 -8
  23. pixeltable/exec/data_row_batch.py +21 -10
  24. pixeltable/exec/exec_context.py +10 -3
  25. pixeltable/exec/exec_node.py +23 -12
  26. pixeltable/exec/expr_eval/evaluators.py +13 -7
  27. pixeltable/exec/expr_eval/expr_eval_node.py +24 -15
  28. pixeltable/exec/expr_eval/globals.py +30 -7
  29. pixeltable/exec/expr_eval/row_buffer.py +5 -6
  30. pixeltable/exec/expr_eval/schedulers.py +151 -31
  31. pixeltable/exec/in_memory_data_node.py +8 -7
  32. pixeltable/exec/row_update_node.py +15 -5
  33. pixeltable/exec/sql_node.py +56 -27
  34. pixeltable/exprs/__init__.py +2 -2
  35. pixeltable/exprs/arithmetic_expr.py +57 -26
  36. pixeltable/exprs/array_slice.py +1 -1
  37. pixeltable/exprs/column_property_ref.py +2 -1
  38. pixeltable/exprs/column_ref.py +20 -15
  39. pixeltable/exprs/comparison.py +6 -2
  40. pixeltable/exprs/compound_predicate.py +1 -3
  41. pixeltable/exprs/data_row.py +2 -2
  42. pixeltable/exprs/expr.py +101 -72
  43. pixeltable/exprs/expr_dict.py +2 -1
  44. pixeltable/exprs/expr_set.py +3 -1
  45. pixeltable/exprs/function_call.py +39 -41
  46. pixeltable/exprs/globals.py +1 -0
  47. pixeltable/exprs/in_predicate.py +2 -2
  48. pixeltable/exprs/inline_expr.py +20 -17
  49. pixeltable/exprs/json_mapper.py +4 -2
  50. pixeltable/exprs/json_path.py +12 -18
  51. pixeltable/exprs/literal.py +5 -9
  52. pixeltable/exprs/method_ref.py +1 -0
  53. pixeltable/exprs/object_ref.py +1 -1
  54. pixeltable/exprs/row_builder.py +32 -17
  55. pixeltable/exprs/rowid_ref.py +14 -5
  56. pixeltable/exprs/similarity_expr.py +11 -6
  57. pixeltable/exprs/sql_element_cache.py +1 -1
  58. pixeltable/exprs/type_cast.py +24 -9
  59. pixeltable/ext/__init__.py +1 -0
  60. pixeltable/ext/functions/__init__.py +1 -0
  61. pixeltable/ext/functions/whisperx.py +2 -2
  62. pixeltable/ext/functions/yolox.py +11 -11
  63. pixeltable/func/aggregate_function.py +17 -13
  64. pixeltable/func/callable_function.py +6 -6
  65. pixeltable/func/expr_template_function.py +15 -14
  66. pixeltable/func/function.py +16 -16
  67. pixeltable/func/function_registry.py +11 -8
  68. pixeltable/func/globals.py +4 -2
  69. pixeltable/func/query_template_function.py +12 -13
  70. pixeltable/func/signature.py +18 -9
  71. pixeltable/func/tools.py +10 -17
  72. pixeltable/func/udf.py +106 -11
  73. pixeltable/functions/__init__.py +21 -2
  74. pixeltable/functions/anthropic.py +16 -12
  75. pixeltable/functions/fireworks.py +63 -5
  76. pixeltable/functions/gemini.py +13 -3
  77. pixeltable/functions/globals.py +18 -6
  78. pixeltable/functions/huggingface.py +20 -38
  79. pixeltable/functions/image.py +7 -3
  80. pixeltable/functions/json.py +1 -0
  81. pixeltable/functions/llama_cpp.py +1 -4
  82. pixeltable/functions/mistralai.py +31 -20
  83. pixeltable/functions/ollama.py +4 -18
  84. pixeltable/functions/openai.py +201 -108
  85. pixeltable/functions/replicate.py +11 -10
  86. pixeltable/functions/string.py +70 -7
  87. pixeltable/functions/timestamp.py +21 -8
  88. pixeltable/functions/together.py +66 -52
  89. pixeltable/functions/video.py +1 -0
  90. pixeltable/functions/vision.py +14 -11
  91. pixeltable/functions/whisper.py +2 -1
  92. pixeltable/globals.py +60 -26
  93. pixeltable/index/__init__.py +1 -1
  94. pixeltable/index/btree.py +5 -3
  95. pixeltable/index/embedding_index.py +15 -14
  96. pixeltable/io/__init__.py +1 -1
  97. pixeltable/io/external_store.py +30 -25
  98. pixeltable/io/fiftyone.py +6 -14
  99. pixeltable/io/globals.py +33 -27
  100. pixeltable/io/hf_datasets.py +2 -1
  101. pixeltable/io/label_studio.py +77 -68
  102. pixeltable/io/pandas.py +33 -9
  103. pixeltable/io/parquet.py +9 -12
  104. pixeltable/iterators/__init__.py +1 -0
  105. pixeltable/iterators/audio.py +205 -0
  106. pixeltable/iterators/document.py +19 -8
  107. pixeltable/iterators/image.py +6 -24
  108. pixeltable/iterators/string.py +3 -6
  109. pixeltable/iterators/video.py +1 -7
  110. pixeltable/metadata/__init__.py +7 -1
  111. pixeltable/metadata/converters/convert_10.py +2 -2
  112. pixeltable/metadata/converters/convert_15.py +1 -5
  113. pixeltable/metadata/converters/convert_16.py +2 -4
  114. pixeltable/metadata/converters/convert_17.py +2 -4
  115. pixeltable/metadata/converters/convert_18.py +2 -4
  116. pixeltable/metadata/converters/convert_19.py +2 -5
  117. pixeltable/metadata/converters/convert_20.py +1 -4
  118. pixeltable/metadata/converters/convert_21.py +4 -6
  119. pixeltable/metadata/converters/convert_22.py +1 -0
  120. pixeltable/metadata/converters/convert_23.py +5 -5
  121. pixeltable/metadata/converters/convert_24.py +12 -13
  122. pixeltable/metadata/converters/convert_26.py +23 -0
  123. pixeltable/metadata/converters/util.py +3 -4
  124. pixeltable/metadata/notes.py +1 -0
  125. pixeltable/metadata/schema.py +13 -2
  126. pixeltable/plan.py +173 -98
  127. pixeltable/store.py +42 -26
  128. pixeltable/type_system.py +62 -54
  129. pixeltable/utils/arrow.py +1 -2
  130. pixeltable/utils/coco.py +16 -17
  131. pixeltable/utils/code.py +1 -1
  132. pixeltable/utils/console_output.py +6 -3
  133. pixeltable/utils/description_helper.py +7 -7
  134. pixeltable/utils/documents.py +3 -1
  135. pixeltable/utils/filecache.py +12 -7
  136. pixeltable/utils/http_server.py +9 -8
  137. pixeltable/utils/media_store.py +2 -1
  138. pixeltable/utils/pytorch.py +11 -14
  139. pixeltable/utils/s3.py +1 -0
  140. pixeltable/utils/sql.py +1 -0
  141. pixeltable/utils/transactional_directory.py +2 -2
  142. {pixeltable-0.3.2.dist-info → pixeltable-0.3.3.dist-info}/METADATA +6 -8
  143. pixeltable-0.3.3.dist-info/RECORD +163 -0
  144. pixeltable-0.3.2.dist-info/RECORD +0 -161
  145. {pixeltable-0.3.2.dist-info → pixeltable-0.3.3.dist-info}/LICENSE +0 -0
  146. {pixeltable-0.3.2.dist-info → pixeltable-0.3.3.dist-info}/WHEEL +0 -0
  147. {pixeltable-0.3.2.dist-info → pixeltable-0.3.3.dist-info}/entry_points.txt +0 -0
pixeltable/env.py CHANGED
@@ -1,6 +1,5 @@
  from __future__ import annotations
 
- from abc import abstractmethod
  import datetime
  import glob
  import http.server
@@ -16,6 +15,7 @@ import sys
  import threading
  import uuid
  import warnings
+ from abc import abstractmethod
  from dataclasses import dataclass, field
  from pathlib import Path
  from sys import stdout
@@ -375,6 +375,7 @@ class Env:
 
  if create_db:
  from pixeltable.metadata import schema
+
  schema.base_metadata.create_all(self._sa_engine)
  metadata.create_system_info(self._sa_engine)
 
@@ -387,11 +388,7 @@ class Env:
  def _create_engine(self, time_zone_name: Optional[str], echo: bool = False) -> None:
  connect_args = {} if time_zone_name is None else {'options': f'-c timezone={time_zone_name}'}
  self._sa_engine = sql.create_engine(
- self.db_url,
- echo=echo,
- future=True,
- isolation_level='REPEATABLE READ',
- connect_args=connect_args,
+ self.db_url, echo=echo, future=True, isolation_level='REPEATABLE READ', connect_args=connect_args
  )
  self._logger.info(f'Created SQLAlchemy engine at: {self.db_url}')
  with self.engine.begin() as conn:
@@ -424,7 +421,7 @@ class Env:
  with engine.begin() as conn:
  # use C collation to get standard C/Python-style sorting
  stmt = (
- f"CREATE DATABASE {preparer.quote(self._db_name)} "
+ f'CREATE DATABASE {preparer.quote(self._db_name)} '
  "ENCODING 'utf-8' LC_COLLATE 'C' LC_CTYPE 'C' TEMPLATE template0"
  )
  conn.execute(sql.text(stmt))
@@ -448,12 +445,12 @@ class Env:
  try:
  with engine.begin() as conn:
  # terminate active connections
- stmt = (f"""
+ stmt = f"""
  SELECT pg_terminate_backend(pg_stat_activity.pid)
  FROM pg_stat_activity
  WHERE pg_stat_activity.datname = '{self._db_name}'
  AND pid <> pg_backend_pid()
- """)
+ """
  conn.execute(sql.text(stmt))
  # drop db
  stmt = f'DROP DATABASE {preparer.quote(self._db_name)}'
@@ -563,7 +560,7 @@ class Env:
  is_installed = False
  self.__optional_packages[package_name] = PackageInfo(
  is_installed=is_installed,
- library_name=library_name or package_name # defaults to package_name unless specified otherwise
+ library_name=library_name or package_name, # defaults to package_name unless specified otherwise
  )
 
  def require_package(self, package_name: str, min_version: Optional[list[int]] = None) -> None:
@@ -609,6 +606,7 @@ class Env:
  """
  import spacy
  from spacy.cli.download import get_model_filename
+
  spacy_model = 'en_core_web_sm'
  spacy_model_version = '3.7.1'
  filename = get_model_filename(spacy_model, spacy_model_version, sdist=False)
@@ -626,7 +624,7 @@ class Env:
  self._logger.warn(f'Failed to load spaCy model: {spacy_model}', exc_info=exc)
  warnings.warn(
  f"Failed to load spaCy model '{spacy_model}'. spaCy features will not be available.",
- excs.PixeltableWarning
+ excs.PixeltableWarning,
  )
  self.__optional_packages['spacy'].is_installed = False
 
@@ -636,8 +634,7 @@ class Env:
  def create_tmp_path(self, extension: str = '') -> Path:
  return self._tmp_dir / f'{uuid.uuid4()}{extension}'
 
-
- #def get_resource_pool_info(self, pool_id: str, pool_info_cls: Optional[Type[T]]) -> T:
+ # def get_resource_pool_info(self, pool_id: str, pool_info_cls: Optional[Type[T]]) -> T:
  def get_resource_pool_info(self, pool_id: str, make_pool_info: Optional[Callable[[], T]] = None) -> T:
  """Returns the info object for the given id, creating it if necessary."""
  info = self._resource_pool_info.get(pool_id)
@@ -707,6 +704,7 @@ def register_client(name: str) -> Callable:
  Args:
  - name (str): The name of the API client (e.g., 'openai' or 'label-studio').
  """
+
  def decorator(fn: Callable) -> None:
  global _registered_clients
  sig = inspect.signature(fn)
@@ -721,6 +719,7 @@ class Config:
  The (global) Pixeltable configuration, as loaded from `config.toml`. Provides methods for retrieving
  configuration values, which can be set in the config file or as environment variables.
  """
+
  __config: dict[str, Any]
 
  @classmethod
@@ -750,12 +749,7 @@ class Config:
  free_disk_space_bytes = shutil.disk_usage(config_path.parent).free
  # Default cache size is 1/5 of free disk space
  file_cache_size_g = free_disk_space_bytes / 5 / (1 << 30)
- return {
- 'pixeltable': {
- 'file_cache_size_g': round(file_cache_size_g, 1),
- 'hide_warnings': False,
- }
- }
+ return {'pixeltable': {'file_cache_size_g': round(file_cache_size_g, 1), 'hide_warnings': False}}
 
  def __init__(self, config: dict[str, Any]) -> None:
  self.__config = config
@@ -840,7 +834,9 @@ class RateLimitsInfo:
  self.resource_limits = {k: RateLimitInfo(k, now, *v) for k, v in kwargs.items() if v is not None}
  # TODO: remove
  for info in self.resource_limits.values():
- _logger.debug(f'Init {info.resource} rate limit: rem={info.remaining} reset={info.reset_at.strftime(TIME_FORMAT)} delta={(info.reset_at - now).total_seconds()}')
+ _logger.debug(
+ f'Init {info.resource} rate limit: rem={info.remaining} reset={info.reset_at.strftime(TIME_FORMAT)} delta={(info.reset_at - now).total_seconds()}'
+ )
  else:
  for k, v in kwargs.items():
  if v is not None:
@@ -855,6 +851,7 @@ class RateLimitsInfo:
  @dataclass
  class RateLimitInfo:
  """Container for rate limit-related information for a single resource."""
+
  resource: str
  recorded_at: datetime.datetime
  limit: int
@@ -871,4 +868,6 @@ class RateLimitInfo:
  reset_delta = reset_at - self.reset_at
  self.reset_at = reset_at
  # TODO: remove
- _logger.debug(f'Update {self.resource} rate limit: rem={self.remaining} reset={self.reset_at.strftime(TIME_FORMAT)} reset_delta={reset_delta.total_seconds()} recorded_delta={(self.reset_at - recorded_at).total_seconds()}')
+ _logger.debug(
+ f'Update {self.resource} rate limit: rem={self.remaining} reset={self.reset_at.strftime(TIME_FORMAT)} reset_delta={reset_delta.total_seconds()} recorded_delta={(self.reset_at - recorded_at).total_seconds()}'
+ )
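
Note: aside from the formatting cleanup, one detail of this file worth calling out is that the default file cache size is derived from free disk space (one fifth of it, expressed in GiB) rather than being a fixed constant. A minimal standalone sketch of that calculation, using only the stdlib calls visible in the diff (the config path in the usage line is an assumption, not taken from the diff):

    import shutil
    from pathlib import Path

    def default_file_cache_size_g(config_path: Path) -> float:
        # mirrors the Config default above: 1/5 of free disk space, converted to GiB
        free_disk_space_bytes = shutil.disk_usage(config_path.parent).free
        file_cache_size_g = free_disk_space_bytes / 5 / (1 << 30)
        return round(file_cache_size_g, 1)

    # hypothetical usage; the actual config location may differ
    print(default_file_cache_size_g(Path.home() / '.pixeltable' / 'config.toml'))
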
pixeltable/exec/__init__.py CHANGED
@@ -4,7 +4,7 @@ from .component_iteration_node import ComponentIterationNode
  from .data_row_batch import DataRowBatch
  from .exec_context import ExecContext
  from .exec_node import ExecNode
+ from .expr_eval import ExprEvalNode
  from .in_memory_data_node import InMemoryDataNode
  from .row_update_node import RowUpdateNode
- from .sql_node import SqlLookupNode, SqlScanNode, SqlAggregationNode, SqlNode, SqlJoinNode
- from .expr_eval import ExprEvalNode
+ from .sql_node import SqlAggregationNode, SqlJoinNode, SqlLookupNode, SqlNode, SqlScanNode
pixeltable/exec/aggregation_node.py CHANGED
@@ -2,7 +2,7 @@ from __future__ import annotations
 
  import logging
  import sys
- from typing import Any, Iterable, Iterator, Optional, cast, AsyncIterator
+ from typing import Any, AsyncIterator, Iterable, Iterator, Optional, cast
 
  import pixeltable.catalog as catalog
  import pixeltable.exceptions as excs
@@ -13,12 +13,14 @@ from .exec_node import ExecNode
 
  _logger = logging.getLogger('pixeltable')
 
+
  class AggregationNode(ExecNode):
  """
  In-memory aggregation for UDAs.
 
  At the moment, this returns all results in a single DataRowBatch.
  """
+
  group_by: Optional[list[exprs.Expr]]
  input_exprs: list[exprs.Expr]
  agg_fn_eval_ctx: exprs.RowBuilder.EvalCtx
@@ -26,8 +28,13 @@ class AggregationNode(ExecNode):
  output_batch: DataRowBatch
 
  def __init__(
- self, tbl: catalog.TableVersion, row_builder: exprs.RowBuilder, group_by: Optional[list[exprs.Expr]],
- agg_fn_calls: list[exprs.FunctionCall], input_exprs: Iterable[exprs.Expr], input: ExecNode
+ self,
+ tbl: catalog.TableVersion,
+ row_builder: exprs.RowBuilder,
+ group_by: Optional[list[exprs.Expr]],
+ agg_fn_calls: list[exprs.FunctionCall],
+ input_exprs: Iterable[exprs.Expr],
+ input: ExecNode,
  ):
  output_exprs: list[exprs.Expr] = [] if group_by is None else list(group_by)
  output_exprs.extend(agg_fn_calls)
@@ -86,4 +93,3 @@ class AggregationNode(ExecNode):
  self.output_batch.flush_imgs(None, self.stored_img_cols, self.flushed_img_slots)
  _logger.debug(f'AggregateNode: consumed {num_input_rows} rows, returning {len(self.output_batch.rows)} rows')
  yield self.output_batch
-
pixeltable/exec/cache_prefetch_node.py CHANGED
@@ -9,7 +9,7 @@ import urllib.request
  from collections import deque
  from concurrent import futures
  from pathlib import Path
- from typing import Optional, Any, Iterator, AsyncIterator
+ from typing import Any, AsyncIterator, Iterator, Optional
  from uuid import UUID
 
  import pixeltable.env as env
@@ -30,6 +30,7 @@ class CachePrefetchNode(ExecNode):
  TODO:
  - adapting the number of download threads at runtime to maximize throughput
  """
+
  BATCH_SIZE = 16
  NUM_EXECUTOR_THREADS = 16
 
@@ -59,8 +60,8 @@ class CachePrefetchNode(ExecNode):
  num_missing: int # number of missing URLs in this row
 
  def __init__(
- self, tbl_id: UUID, file_col_info: list[exprs.ColumnSlotIdx], input: ExecNode,
- retain_input_order: bool = True):
+ self, tbl_id: UUID, file_col_info: list[exprs.ColumnSlotIdx], input: ExecNode, retain_input_order: bool = True
+ ):
  # input_/output_exprs=[]: we don't have anything to evaluate
  super().__init__(input.row_builder, [], [], input)
  self.retain_input_order = retain_input_order
@@ -241,6 +242,7 @@ class CachePrefetchNode(ExecNode):
  _logger.debug(f'Downloading {url} to {tmp_path}')
  if parsed.scheme == 's3':
  from pixeltable.utils.s3 import get_client
+
  with self.boto_client_lock:
  if self.boto_client is None:
  config = {
pixeltable/exec/component_iteration_node.py CHANGED
@@ -1,5 +1,5 @@
  import inspect
- from typing import Iterator, Optional, AsyncIterator
+ from typing import AsyncIterator, Iterator, Optional
 
  import pixeltable.catalog as catalog
  import pixeltable.exceptions as excs
@@ -14,6 +14,7 @@ class ComponentIterationNode(ExecNode):
 
  Returns row batches of OUTPUT_BATCH_SIZE size.
  """
+
  __OUTPUT_BATCH_SIZE = 1024
 
  def __init__(self, view: catalog.TableVersion, input: ExecNode):
@@ -25,8 +26,8 @@ class ComponentIterationNode(ExecNode):
  self.iterator_args = iterator_args[0]
  assert isinstance(self.iterator_args, exprs.InlineDict)
  self.iterator_args_ctx = self.row_builder.create_eval_ctx([self.iterator_args])
- self.iterator_output_schema, self.unstored_column_names = (
- self.view.iterator_cls.output_schema(**self.iterator_args.to_kwargs())
+ self.iterator_output_schema, self.unstored_column_names = self.view.iterator_cls.output_schema(
+ **self.iterator_args.to_kwargs()
  )
  self.iterator_output_fields = list(self.iterator_output_schema.keys())
  self.iterator_output_cols = {
@@ -34,7 +35,8 @@ class ComponentIterationNode(ExecNode):
  }
  # referenced iterator output fields
  self.refd_output_slot_idxs = {
- e.col.name: e.slot_idx for e in self.row_builder.unique_exprs
+ e.col.name: e.slot_idx
+ for e in self.row_builder.unique_exprs
  if isinstance(e, exprs.ColumnRef) and e.col.name in self.iterator_output_fields
  }
 
@@ -79,8 +81,7 @@ class ComponentIterationNode(ExecNode):
  # verify and copy component_dict fields to their respective slots in output_row
  for field_name, field_val in component_dict.items():
  if field_name not in self.iterator_output_fields:
- raise excs.Error(
- f'Invalid field name {field_name} in output of {self.view.iterator_cls.__name__}')
+ raise excs.Error(f'Invalid field name {field_name} in output of {self.view.iterator_cls.__name__}')
  if field_name not in self.refd_output_slot_idxs:
  # we can ignore this
  continue
@@ -90,5 +91,5 @@ class ComponentIterationNode(ExecNode):
  if len(component_dict) != len(self.iterator_output_fields):
  missing_fields = set(self.refd_output_slot_idxs.keys()) - set(component_dict.keys())
  raise excs.Error(
- f'Invalid output of {self.view.iterator_cls.__name__}: '
- f'missing fields {", ".join(missing_fields)}')
+ f'Invalid output of {self.view.iterator_cls.__name__}: missing fields {", ".join(missing_fields)}'
+ )
pixeltable/exec/data_row_batch.py CHANGED
@@ -1,19 +1,21 @@
  from __future__ import annotations
- from typing import Iterator, Optional
+
  import logging
+ from typing import Iterator, Optional
 
- import pixeltable.exprs as exprs
  import pixeltable.catalog as catalog
+ import pixeltable.exprs as exprs
  from pixeltable.utils.media_store import MediaStore
 
-
  _logger = logging.getLogger('pixeltable')
 
+
  class DataRowBatch:
  """Set of DataRows, indexed by rowid.
 
  Contains the metadata needed to initialize DataRows.
  """
+
  tbl: Optional[catalog.TableVersion]
  row_builder: exprs.RowBuilder
  img_slot_idxs: list[int]
@@ -22,8 +24,11 @@ class DataRowBatch:
  rows: list[exprs.DataRow]
 
  def __init__(
- self, tbl: Optional[catalog.TableVersion], row_builder: exprs.RowBuilder, num_rows: Optional[int] = None,
- rows: Optional[list[exprs.DataRow]] = None
+ self,
+ tbl: Optional[catalog.TableVersion],
+ row_builder: exprs.RowBuilder,
+ num_rows: Optional[int] = None,
+ rows: Optional[list[exprs.DataRow]] = None,
  ):
  """
  Requires either num_rows or rows to be specified, but not both.
@@ -34,7 +39,8 @@ class DataRowBatch:
  self.img_slot_idxs = [e.slot_idx for e in row_builder.unique_exprs if e.col_type.is_image_type()]
  # non-image media slots
  self.media_slot_idxs = [
- e.slot_idx for e in row_builder.unique_exprs
+ e.slot_idx
+ for e in row_builder.unique_exprs
  if e.col_type.is_media_type() and not e.col_type.is_image_type()
  ]
  self.array_slot_idxs = [e.slot_idx for e in row_builder.unique_exprs if e.col_type.is_array_type()]
@@ -44,14 +50,17 @@ class DataRowBatch:
  if num_rows is None:
  num_rows = 0
  self.rows = [
- exprs.DataRow(row_builder.num_materialized, self.img_slot_idxs, self.media_slot_idxs, self.array_slot_idxs)
+ exprs.DataRow(
+ row_builder.num_materialized, self.img_slot_idxs, self.media_slot_idxs, self.array_slot_idxs
+ )
  for _ in range(num_rows)
  ]
 
  def add_row(self, row: Optional[exprs.DataRow] = None) -> exprs.DataRow:
  if row is None:
  row = exprs.DataRow(
- self.row_builder.num_materialized, self.img_slot_idxs, self.media_slot_idxs, self.array_slot_idxs)
+ self.row_builder.num_materialized, self.img_slot_idxs, self.media_slot_idxs, self.array_slot_idxs
+ )
  self.rows.append(row)
  return row
 
@@ -65,8 +74,10 @@ class DataRowBatch:
  return self.rows[index]
 
  def flush_imgs(
- self, idx_range: Optional[slice] = None, stored_img_info: Optional[list[exprs.ColumnSlotIdx]] = None,
- flushed_slot_idxs: Optional[list[int]] = None
+ self,
+ idx_range: Optional[slice] = None,
+ stored_img_info: Optional[list[exprs.ColumnSlotIdx]] = None,
+ flushed_slot_idxs: Optional[list[int]] = None,
  ) -> None:
  """Flushes images in the given range of rows."""
  assert self.tbl is not None
pixeltable/exec/exec_context.py CHANGED
@@ -4,12 +4,19 @@ import sqlalchemy as sql
 
  import pixeltable.exprs as exprs
 
+
  class ExecContext:
  """Class for execution runtime constants"""
+
  def __init__(
- self, row_builder: exprs.RowBuilder, *, show_pbar: bool = False, batch_size: int = 0,
- pk_clause: Optional[list[sql.ClauseElement]] = None, num_computed_exprs: int = 0,
- ignore_errors: bool = False
+ self,
+ row_builder: exprs.RowBuilder,
+ *,
+ show_pbar: bool = False,
+ batch_size: int = 0,
+ pk_clause: Optional[list[sql.ClauseElement]] = None,
+ num_computed_exprs: int = 0,
+ ignore_errors: bool = False,
  ):
  self.show_pbar = show_pbar
  self.batch_size = batch_size
pixeltable/exec/exec_node.py CHANGED
@@ -4,16 +4,19 @@ import abc
  import asyncio
  import logging
  import sys
- from typing import Iterable, Iterator, Optional, TypeVar, AsyncIterator
+ from typing import AsyncIterator, Iterable, Iterator, Optional, TypeVar
 
  import pixeltable.exprs as exprs
+
  from .data_row_batch import DataRowBatch
  from .exec_context import ExecContext
 
  _logger = logging.getLogger('pixeltable')
 
+
  class ExecNode(abc.ABC):
  """Base class of all execution nodes"""
+
  output_exprs: Iterable[exprs.Expr]
  row_builder: exprs.RowBuilder
  input: Optional[ExecNode]
@@ -22,8 +25,12 @@ class ExecNode(abc.ABC):
  ctx: Optional[ExecContext]
 
  def __init__(
- self, row_builder: exprs.RowBuilder, output_exprs: Iterable[exprs.Expr],
- input_exprs: Iterable[exprs.Expr], input: Optional[ExecNode] = None):
+ self,
+ row_builder: exprs.RowBuilder,
+ output_exprs: Iterable[exprs.Expr],
+ input_exprs: Iterable[exprs.Expr],
+ input: Optional[ExecNode] = None,
+ ):
  self.output_exprs = output_exprs
  self.row_builder = row_builder
  self.input = input
@@ -31,8 +38,7 @@ class ExecNode(abc.ABC):
  output_slot_idxs = {e.slot_idx for e in output_exprs}
  output_dependencies = row_builder.get_dependencies(output_exprs, exclude=input_exprs)
  self.flushed_img_slots = [
- e.slot_idx for e in output_dependencies
- if e.col_type.is_image_type() and e.slot_idx not in output_slot_idxs
+ e.slot_idx for e in output_dependencies if e.col_type.is_image_type() and e.slot_idx not in output_slot_idxs
  ]
  self.stored_img_cols = []
  self.ctx = None # all nodes of a tree share the same context
@@ -53,16 +59,20 @@ class ExecNode(abc.ABC):
  pass
 
  def __iter__(self) -> Iterator[DataRowBatch]:
+ running_loop: Optional[asyncio.AbstractEventLoop] = None
+ loop: asyncio.AbstractEventLoop
  try:
- # check if we are already in an event loop (eg, Jupyter's); if so, patch it to allow nested event loops
- _ = asyncio.get_event_loop()
+ # check if we are already in an event loop (eg, Jupyter's); if so, patch it to allow
+ # multiple run_until_complete()
+ running_loop = asyncio.get_running_loop()
  import nest_asyncio # type: ignore
+
  nest_asyncio.apply()
+ loop = running_loop
+ _logger.debug(f'Patched running loop')
  except RuntimeError:
- pass
-
- loop = asyncio.new_event_loop()
- asyncio.set_event_loop(loop)
+ loop = asyncio.new_event_loop()
+ asyncio.set_event_loop(loop)
 
  if 'pytest' in sys.modules:
  loop.set_debug(True)
@@ -75,7 +85,8 @@ class ExecNode(abc.ABC):
  except StopAsyncIteration:
  pass
  finally:
- loop.close()
+ if loop != running_loop:
+ loop.close()
 
  def open(self) -> None:
  """Bottom-up initialization of nodes for execution. Must be called before __next__."""
pixeltable/exec/expr_eval/evaluators.py CHANGED
@@ -5,10 +5,10 @@ import datetime
  import itertools
  import logging
  import sys
- from typing import Iterator, Any, Optional, Callable, cast
+ from typing import Any, Callable, Iterator, Optional, cast
+
+ from pixeltable import exprs, func
 
- from pixeltable import exprs
- from pixeltable import func
  from .globals import Dispatcher, Evaluator, FnCallArgs
 
  _logger = logging.getLogger('pixeltable')
@@ -23,6 +23,7 @@ class DefaultExprEvaluator(Evaluator):
  TODO:
  - parallelize via Ray
  """
+
  e: exprs.Expr
 
  def __init__(self, e: exprs.Expr, dispatcher: Dispatcher):
@@ -60,6 +61,7 @@ class FnCallEvaluator(Evaluator):
  TODO:
  - adaptive batching: finding the optimal batch size based on observed execution times
  """
+
  fn_call: exprs.FunctionCall
  fn: func.CallableFunction
  scalar_py_fn: Optional[Callable] # only set for non-batching CallableFunctions
@@ -73,7 +75,7 @@ class FnCallEvaluator(Evaluator):
  self.fn_call = fn_call
  self.fn = cast(func.CallableFunction, fn_call.fn)
  if isinstance(self.fn, func.CallableFunction) and self.fn.is_batched:
- self.call_args_queue = asyncio.Queue[FnCallArgs]()
+ self.call_args_queue = asyncio.Queue[FnCallArgs]()
  # we're not supplying sample arguments there, they're ignored anyway
  self.batch_size = self.fn.get_batch_size()
  self.scalar_py_fn = None
@@ -167,14 +169,16 @@ class FnCallEvaluator(Evaluator):
  for k in item.kwargs.keys():
  batch_kwargs[k][i] = item.kwargs[k]
  return FnCallArgs(
- self.fn_call, [item.row for item in call_args], batch_args=batch_args, batch_kwargs=batch_kwargs)
+ self.fn_call, [item.row for item in call_args], batch_args=batch_args, batch_kwargs=batch_kwargs
+ )
 
  async def eval_batch(self, batched_call_args: FnCallArgs) -> None:
  result_batch: list[Any]
  try:
  if self.fn.is_async:
  result_batch = await self.fn.aexec_batch(
- *batched_call_args.batch_args, **batched_call_args.batch_kwargs)
+ *batched_call_args.batch_args, **batched_call_args.batch_kwargs
+ )
  else:
  # check for cancellation before starting something potentially long-running
  if asyncio.current_task().cancelled() or self.dispatcher.exc_event.is_set():
@@ -205,6 +209,7 @@ class FnCallEvaluator(Evaluator):
  self.dispatcher.dispatch([call_args.row])
  except Exception as exc:
  import anthropic
+
  if isinstance(exc, anthropic.RateLimitError):
  _logger.debug(f'RateLimitError: {exc}')
  _, _, exc_tb = sys.exc_info()
@@ -228,7 +233,8 @@ class FnCallEvaluator(Evaluator):
  rows_with_excs.add(idx)
  self.dispatcher.dispatch_exc(item.rows, self.fn_call.slot_idx, exc_tb)
  self.dispatcher.dispatch(
- [call_args_batch[i].row for i in range(len(call_args_batch)) if i not in rows_with_excs])
+ [call_args_batch[i].row for i in range(len(call_args_batch)) if i not in rows_with_excs]
+ )
 
  def _close(self) -> None:
  """Create a task for the incomplete batch of queued FnCallArgs, if any"""
pixeltable/exec/expr_eval/expr_eval_node.py CHANGED
@@ -4,24 +4,23 @@ import asyncio
  import logging
  import traceback
  from types import TracebackType
- from typing import Iterable, AsyncIterator, Optional, Union
+ from typing import AsyncIterator, Iterable, Optional, Union
 
  import numpy as np
 
  import pixeltable.exceptions as excs
- from pixeltable import exprs
- from pixeltable import func
+ from pixeltable import exprs, func
+
+ from ..data_row_batch import DataRowBatch
+ from ..exec_node import ExecNode
  from .evaluators import DefaultExprEvaluator, FnCallEvaluator
  from .globals import Evaluator, Scheduler
  from .row_buffer import RowBuffer
  from .schedulers import SCHEDULERS
- from ..data_row_batch import DataRowBatch
- from ..exec_node import ExecNode
 
  _logger = logging.getLogger('pixeltable')
 
 
-
  class ExprEvalNode(ExecNode):
  """
  Expression evaluation
@@ -35,10 +34,13 @@ class ExprEvalNode(ExecNode):
  TODO:
  - Literal handling: currently, Literal values are copied into slots via the normal evaluation mechanism, which is
  needless overhead; instead: pre-populate Literal slots in _init_row()
+ - dynamically determine MAX_BUFFERED_ROWS, based on the avg memory consumption of a row and our configured memory
+ limit
  - local model inference on gpu: currently, no attempt is made to ensure that models can fit onto the gpu
  simultaneously, which will cause errors; instead, the execution should be divided into sequential phases, each
  of which only contains a subset of the models which is known to fit onto the gpu simultaneously
  """
+
  maintain_input_order: bool # True if we're returning rows in the order we received them from our input
  num_dependencies: np.ndarray # number of dependencies for our output slots; indexed by slot idx
  outputs: np.ndarray # bool per slot; True if this slot is part of our output
@@ -68,11 +70,15 @@ class ExprEvalNode(ExecNode):
  num_output_rows: int
 
  BATCH_SIZE = 64
- MAX_BUFFERED_ROWS = 512 # maximum number of rows that have been dispatched but not yet returned
+ MAX_BUFFERED_ROWS = 2048 # maximum number of rows that have been dispatched but not yet returned
 
  def __init__(
- self, row_builder: exprs.RowBuilder, output_exprs: Iterable[exprs.Expr], input_exprs: Iterable[exprs.Expr],
- input: ExecNode, maintain_input_order: bool = True
+ self,
+ row_builder: exprs.RowBuilder,
+ output_exprs: Iterable[exprs.Expr],
+ input_exprs: Iterable[exprs.Expr],
+ input: ExecNode,
+ maintain_input_order: bool = True,
  ):
  super().__init__(row_builder, output_exprs, input_exprs, input)
  self.maintain_input_order = maintain_input_order
@@ -148,7 +154,9 @@ class ExprEvalNode(ExecNode):
  self.row_pos_map[id(row)] = self.num_input_rows + idx
  self.num_input_rows += len(batch)
  self.avail_input_rows += len(batch)
- _logger.debug(f'adding input: batch_size={len(batch)} #input_rows={self.num_input_rows} #avail={self.avail_input_rows}')
+ _logger.debug(
+ f'adding input: batch_size={len(batch)} #input_rows={self.num_input_rows} #avail={self.avail_input_rows}'
+ )
  except StopAsyncIteration:
  self.input_complete = True
  _logger.debug(f'finished input: #input_rows={self.num_input_rows}, #avail={self.avail_input_rows}')
@@ -175,11 +183,11 @@ class ExprEvalNode(ExecNode):
  rows: list[exprs.DataRow]
  if avail_current_batch_rows > num_rows:
  # we only need rows from current_input_batch
- rows = self.current_input_batch.rows[self.input_row_idx:self.input_row_idx + num_rows]
+ rows = self.current_input_batch.rows[self.input_row_idx : self.input_row_idx + num_rows]
  self.input_row_idx += num_rows
  else:
  # we need rows from both current_/next_input_batch
- rows = self.current_input_batch.rows[self.input_row_idx:]
+ rows = self.current_input_batch.rows[self.input_row_idx :]
  self.current_input_batch = self.next_input_batch
  self.next_input_batch = None
  self.input_row_idx = 0
@@ -236,6 +244,7 @@ class ExprEvalNode(ExecNode):
  exc_event_aw = asyncio.create_task(self.exc_event.wait(), name='exc_event.wait()')
  input_batch_aw: Optional[asyncio.Task] = None
  completed_aw: Optional[asyncio.Task] = None
+ closed_evaluators = False # True after calling Evaluator.close()
 
  try:
  while True:
@@ -275,11 +284,12 @@ class ExprEvalNode(ExecNode):
  assert self.output_buffer.num_rows == 0
  return
 
- if self.input_complete and self.avail_input_rows == 0:
+ if self.input_complete and self.avail_input_rows == 0 and not closed_evaluators:
  # no more input rows to dispatch, but we're still waiting for rows to finish:
  # close all slot evaluators to flush queued rows
  for evaluator in self.slot_evaluators.values():
  evaluator.close()
+ closed_evaluators = True
 
  # we don't have a full batch of rows at this point and need to wait
  aws = {exc_event_aw} # always wait for an exception
@@ -335,8 +345,7 @@ class ExprEvalNode(ExecNode):
  first_row = rows[0]
  input_vals = [first_row[idx] for idx in dependency_idxs]
  e = self.row_builder.unique_exprs[slot_with_exc]
- self.error = excs.ExprEvalError(
- e, f'expression {e}', first_row.get_exc(e.slot_idx), exc_tb, input_vals, 0)
+ self.error = excs.ExprEvalError(e, f'expression {e}', first_row.get_exc(e.slot_idx), exc_tb, input_vals, 0)
  self.exc_event.set()
  return
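
Beyond the reformatting, two behavioral changes stand out in this file: MAX_BUFFERED_ROWS grows from 512 to 2048, and the new closed_evaluators flag ensures Evaluator.close() runs at most once while the main loop keeps draining in-flight rows. A toy reduction of what the flag prevents (all names here are hypothetical stand-ins, not the real classes):

    class SlotEvaluator:
        """Stand-in for the real evaluators; close() flushes partially filled batches."""
        def __init__(self) -> None:
            self.close_count = 0

        def close(self) -> None:
            self.close_count += 1

    def drain(evaluators: list[SlotEvaluator], in_flight_rows: int) -> None:
        input_complete = True  # no more input left to dispatch
        closed_evaluators = False
        while in_flight_rows > 0:
            # without the flag, every loop iteration would re-close the evaluators
            if input_complete and not closed_evaluators:
                for ev in evaluators:
                    ev.close()
                closed_evaluators = True
            in_flight_rows -= 1  # a dispatched row completes

    evs = [SlotEvaluator(), SlotEvaluator()]
    drain(evs, in_flight_rows=5)
    assert all(ev.close_count == 1 for ev in evs)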