pixeltable 0.2.20__py3-none-any.whl → 0.2.22__py3-none-any.whl

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pixeltable might be problematic. Click here for more details.

Files changed (120):
  1. pixeltable/__init__.py +7 -19
  2. pixeltable/__version__.py +2 -2
  3. pixeltable/catalog/__init__.py +7 -7
  4. pixeltable/catalog/column.py +37 -11
  5. pixeltable/catalog/globals.py +21 -0
  6. pixeltable/catalog/insertable_table.py +6 -4
  7. pixeltable/catalog/table.py +227 -148
  8. pixeltable/catalog/table_version.py +66 -28
  9. pixeltable/catalog/table_version_path.py +0 -8
  10. pixeltable/catalog/view.py +18 -19
  11. pixeltable/dataframe.py +16 -32
  12. pixeltable/env.py +6 -1
  13. pixeltable/exec/__init__.py +1 -2
  14. pixeltable/exec/aggregation_node.py +27 -17
  15. pixeltable/exec/cache_prefetch_node.py +1 -1
  16. pixeltable/exec/data_row_batch.py +9 -26
  17. pixeltable/exec/exec_node.py +36 -7
  18. pixeltable/exec/expr_eval_node.py +19 -11
  19. pixeltable/exec/in_memory_data_node.py +14 -11
  20. pixeltable/exec/sql_node.py +266 -138
  21. pixeltable/exprs/__init__.py +1 -0
  22. pixeltable/exprs/arithmetic_expr.py +3 -1
  23. pixeltable/exprs/array_slice.py +7 -7
  24. pixeltable/exprs/column_property_ref.py +37 -10
  25. pixeltable/exprs/column_ref.py +93 -14
  26. pixeltable/exprs/comparison.py +5 -5
  27. pixeltable/exprs/compound_predicate.py +8 -7
  28. pixeltable/exprs/data_row.py +56 -36
  29. pixeltable/exprs/expr.py +65 -63
  30. pixeltable/exprs/expr_dict.py +55 -0
  31. pixeltable/exprs/expr_set.py +26 -15
  32. pixeltable/exprs/function_call.py +53 -24
  33. pixeltable/exprs/globals.py +4 -1
  34. pixeltable/exprs/in_predicate.py +8 -7
  35. pixeltable/exprs/inline_expr.py +4 -4
  36. pixeltable/exprs/is_null.py +4 -4
  37. pixeltable/exprs/json_mapper.py +11 -12
  38. pixeltable/exprs/json_path.py +5 -10
  39. pixeltable/exprs/literal.py +5 -5
  40. pixeltable/exprs/method_ref.py +5 -4
  41. pixeltable/exprs/object_ref.py +2 -1
  42. pixeltable/exprs/row_builder.py +88 -36
  43. pixeltable/exprs/rowid_ref.py +14 -13
  44. pixeltable/exprs/similarity_expr.py +12 -7
  45. pixeltable/exprs/sql_element_cache.py +12 -6
  46. pixeltable/exprs/type_cast.py +8 -6
  47. pixeltable/exprs/variable.py +5 -4
  48. pixeltable/ext/functions/whisperx.py +7 -2
  49. pixeltable/func/aggregate_function.py +1 -1
  50. pixeltable/func/callable_function.py +2 -2
  51. pixeltable/func/function.py +11 -10
  52. pixeltable/func/function_registry.py +6 -7
  53. pixeltable/func/query_template_function.py +11 -12
  54. pixeltable/func/signature.py +17 -15
  55. pixeltable/func/udf.py +0 -4
  56. pixeltable/functions/__init__.py +2 -2
  57. pixeltable/functions/audio.py +4 -6
  58. pixeltable/functions/globals.py +84 -42
  59. pixeltable/functions/huggingface.py +31 -34
  60. pixeltable/functions/image.py +59 -45
  61. pixeltable/functions/json.py +0 -1
  62. pixeltable/functions/llama_cpp.py +106 -0
  63. pixeltable/functions/mistralai.py +2 -2
  64. pixeltable/functions/ollama.py +147 -0
  65. pixeltable/functions/openai.py +22 -25
  66. pixeltable/functions/replicate.py +72 -0
  67. pixeltable/functions/string.py +59 -50
  68. pixeltable/functions/timestamp.py +20 -20
  69. pixeltable/functions/together.py +2 -2
  70. pixeltable/functions/video.py +11 -20
  71. pixeltable/functions/whisper.py +2 -20
  72. pixeltable/globals.py +65 -74
  73. pixeltable/index/base.py +2 -2
  74. pixeltable/index/btree.py +20 -7
  75. pixeltable/index/embedding_index.py +12 -14
  76. pixeltable/io/__init__.py +1 -2
  77. pixeltable/io/external_store.py +11 -5
  78. pixeltable/io/fiftyone.py +178 -0
  79. pixeltable/io/globals.py +98 -2
  80. pixeltable/io/hf_datasets.py +1 -1
  81. pixeltable/io/label_studio.py +6 -6
  82. pixeltable/io/parquet.py +14 -13
  83. pixeltable/iterators/base.py +3 -2
  84. pixeltable/iterators/document.py +10 -8
  85. pixeltable/iterators/video.py +126 -60
  86. pixeltable/metadata/__init__.py +4 -3
  87. pixeltable/metadata/converters/convert_14.py +4 -2
  88. pixeltable/metadata/converters/convert_15.py +1 -1
  89. pixeltable/metadata/converters/convert_19.py +1 -0
  90. pixeltable/metadata/converters/convert_20.py +1 -1
  91. pixeltable/metadata/converters/convert_21.py +34 -0
  92. pixeltable/metadata/converters/util.py +54 -12
  93. pixeltable/metadata/notes.py +1 -0
  94. pixeltable/metadata/schema.py +40 -21
  95. pixeltable/plan.py +149 -165
  96. pixeltable/py.typed +0 -0
  97. pixeltable/store.py +57 -37
  98. pixeltable/tool/create_test_db_dump.py +6 -6
  99. pixeltable/tool/create_test_video.py +1 -1
  100. pixeltable/tool/doc_plugins/griffe.py +3 -34
  101. pixeltable/tool/embed_udf.py +1 -1
  102. pixeltable/tool/mypy_plugin.py +55 -0
  103. pixeltable/type_system.py +260 -61
  104. pixeltable/utils/arrow.py +10 -9
  105. pixeltable/utils/coco.py +4 -4
  106. pixeltable/utils/documents.py +16 -2
  107. pixeltable/utils/filecache.py +9 -9
  108. pixeltable/utils/formatter.py +10 -11
  109. pixeltable/utils/http_server.py +2 -5
  110. pixeltable/utils/media_store.py +6 -6
  111. pixeltable/utils/pytorch.py +10 -11
  112. pixeltable/utils/sql.py +2 -1
  113. {pixeltable-0.2.20.dist-info → pixeltable-0.2.22.dist-info}/METADATA +50 -13
  114. pixeltable-0.2.22.dist-info/RECORD +153 -0
  115. pixeltable/exec/media_validation_node.py +0 -43
  116. pixeltable/utils/help.py +0 -11
  117. pixeltable-0.2.20.dist-info/RECORD +0 -147
  118. {pixeltable-0.2.20.dist-info → pixeltable-0.2.22.dist-info}/LICENSE +0 -0
  119. {pixeltable-0.2.20.dist-info → pixeltable-0.2.22.dist-info}/WHEEL +0 -0
  120. {pixeltable-0.2.20.dist-info → pixeltable-0.2.22.dist-info}/entry_points.txt +0 -0
@@ -1,13 +1,26 @@
1
1
  from __future__ import annotations
2
- from typing import Iterable, Optional, List
2
+
3
3
  import abc
4
+ from typing import TYPE_CHECKING, Iterable, Iterator, List, Optional
5
+
6
+ import pixeltable.exprs as exprs
4
7
 
5
8
  from .data_row_batch import DataRowBatch
6
9
  from .exec_context import ExecContext
7
- import pixeltable.exprs as exprs
10
+
11
+ if TYPE_CHECKING:
12
+ from pixeltable import exec
8
13
 
9
14
  class ExecNode(abc.ABC):
10
15
  """Base class of all execution nodes"""
16
+ output_exprs: Iterable[exprs.Expr]
17
+ row_builder: exprs.RowBuilder
18
+ input: Optional[ExecNode]
19
+ flushed_img_slots: list[int] # idxs of image slots of our output_exprs dependencies
20
+ stored_img_cols: list[exprs.ColumnSlotIdx]
21
+ ctx: Optional[ExecContext]
22
+ __iter: Optional[Iterator[DataRowBatch]]
23
+
11
24
  def __init__(
12
25
  self, row_builder: exprs.RowBuilder, output_exprs: Iterable[exprs.Expr],
13
26
  input_exprs: Iterable[exprs.Expr], input: Optional[ExecNode] = None):
@@ -21,8 +34,9 @@ class ExecNode(abc.ABC):
21
34
  e.slot_idx for e in output_dependencies
22
35
  if e.col_type.is_image_type() and e.slot_idx not in output_slot_idxs
23
36
  ]
24
- self.stored_img_cols: List[exprs.ColumnSlotIdx] = []
25
- self.ctx: Optional[ExecContext] = None # all nodes of a tree share the same context
37
+ self.stored_img_cols = []
38
+ self.ctx = None # all nodes of a tree share the same context
39
+ self.__iter = None
26
40
 
27
41
  def set_ctx(self, ctx: ExecContext) -> None:
28
42
  self.ctx = ctx
@@ -35,12 +49,15 @@ class ExecNode(abc.ABC):
35
49
  if self.input is not None:
36
50
  self.input.set_stored_img_cols(stored_img_cols)
37
51
 
38
- def __iter__(self):
52
+ # TODO: make this an abstractmethod when __next__() is removed
53
+ def __iter__(self) -> Iterator[DataRowBatch]:
39
54
  return self
40
55
 
41
- @abc.abstractmethod
56
+ # TODO: remove this and switch every subclass over to implementing __iter__
42
57
  def __next__(self) -> DataRowBatch:
43
- pass
58
+ if self.__iter is None:
59
+ self.__iter = iter(self)
60
+ return next(self.__iter)
44
61
 
45
62
  def open(self) -> None:
46
63
  """Bottom-up initialization of nodes for execution. Must be called before __next__."""
@@ -60,3 +77,15 @@ class ExecNode(abc.ABC):
60
77
  def _close(self) -> None:
61
78
  pass
62
79
 
80
+ def get_sql_node(self) -> Optional['exec.SqlNode']:
81
+ from .sql_node import SqlNode
82
+ if isinstance(self, SqlNode):
83
+ return self
84
+ if self.input is not None:
85
+ return self.input.get_sql_node()
86
+ return None
87
+
88
+ def set_limit(self, limit: int) -> None:
89
+ """Default implementation propagates to input"""
90
+ if self.input is not None:
91
+ self.input.set_limit(limit)
@@ -5,10 +5,11 @@ import warnings
5
5
  from dataclasses import dataclass
6
6
  from typing import Iterable, List, Optional
7
7
 
8
- from tqdm import tqdm, TqdmWarning
8
+ from tqdm import TqdmWarning, tqdm
9
9
 
10
- import pixeltable.exprs as exprs
10
+ from pixeltable import exprs
11
11
  from pixeltable.func import CallableFunction
12
+
12
13
  from .data_row_batch import DataRowBatch
13
14
  from .exec_node import ExecNode
14
15
 
@@ -21,7 +22,7 @@ class ExprEvalNode(ExecNode):
21
22
  @dataclass
22
23
  class Cohort:
23
24
  """List of exprs that form an evaluation context and contain calls to at most one external function"""
24
- exprs: List[exprs.Expr]
25
+ exprs_: List[exprs.Expr]
25
26
  batched_fn: Optional[CallableFunction]
26
27
  segment_ctxs: List['exprs.RowBuilder.EvalCtx']
27
28
  target_slot_idxs: List[int]
@@ -37,7 +38,7 @@ class ExprEvalNode(ExecNode):
37
38
  # we're only materializing exprs that are not already in the input
38
39
  self.target_exprs = [e for e in output_exprs if e.slot_idx not in input_slot_idxs]
39
40
  self.pbar: Optional[tqdm] = None
40
- self.cohorts: List[List[ExprEvalNode.Cohort]] = []
41
+ self.cohorts: List[ExprEvalNode.Cohort] = []
41
42
  self._create_cohorts()
42
43
 
43
44
  def __next__(self) -> DataRowBatch:
@@ -87,6 +88,8 @@ class ExprEvalNode(ExecNode):
87
88
  for e in all_exprs:
88
89
  if not self._is_batched_fn_call(e):
89
90
  continue
91
+ assert isinstance(e, exprs.FunctionCall)
92
+ assert isinstance(e.fn, CallableFunction)
90
93
  if current_batched_fn is None or current_batched_fn != e.fn:
91
94
  # create a new cohort
92
95
  cohorts.append([])
@@ -95,8 +98,8 @@ class ExprEvalNode(ExecNode):
95
98
 
96
99
  # expand the cohorts to include all exprs that are in the same evaluation context as the external calls;
97
100
  # cohorts are evaluated in order, so we can exclude the target slots from preceding cohorts and input slots
98
- exclude = set([e.slot_idx for e in self.input_exprs])
99
- all_target_slot_idxs = set([e.slot_idx for e in self.target_exprs])
101
+ exclude = set(e.slot_idx for e in self.input_exprs)
102
+ all_target_slot_idxs = set(e.slot_idx for e in self.target_exprs)
100
103
  target_slot_idxs: List[List[int]] = [] # the ones materialized by each cohort
101
104
  for i in range(len(cohorts)):
102
105
  cohorts[i] = self.row_builder.get_dependencies(
@@ -105,7 +108,7 @@ class ExprEvalNode(ExecNode):
105
108
  [e.slot_idx for e in cohorts[i] if e.slot_idx in all_target_slot_idxs])
106
109
  exclude.update(target_slot_idxs[-1])
107
110
 
108
- all_cohort_slot_idxs = set([e.slot_idx for cohort in cohorts for e in cohort])
111
+ all_cohort_slot_idxs = set(e.slot_idx for cohort in cohorts for e in cohort)
109
112
  remaining_slot_idxs = set(all_target_slot_idxs) - all_cohort_slot_idxs
110
113
  if len(remaining_slot_idxs) > 0:
111
114
  cohorts.append(self.row_builder.get_dependencies(
@@ -163,9 +166,10 @@ class ExprEvalNode(ExecNode):
163
166
  rows[row_idx], segment_ctx, self.ctx.profile, ignore_errors=self.ctx.ignore_errors)
164
167
  else:
165
168
  fn_call = segment_ctx.exprs[0]
169
+ assert isinstance(fn_call, exprs.FunctionCall)
166
170
  # make a batched external function call
167
- arg_batches = [[] for _ in range(len(fn_call.args))]
168
- kwarg_batches = {k: [] for k in fn_call.kwargs.keys()}
171
+ arg_batches: list[list[exprs.Expr]] = [[] for _ in range(len(fn_call.args))]
172
+ kwarg_batches: dict[str, list[exprs.Expr]] = {k: [] for k in fn_call.kwargs.keys()}
169
173
 
170
174
  valid_batch_idxs: List[int] = [] # rows with exceptions are not valid
171
175
  for row_idx in range(batch_start_idx, batch_start_idx + num_batch_rows):
@@ -175,12 +179,15 @@ class ExprEvalNode(ExecNode):
175
179
  continue
176
180
  valid_batch_idxs.append(row_idx)
177
181
  args, kwargs = fn_call._make_args(row)
178
- [arg_batches[i].append(args[i]) for i in range(len(args))]
179
- [kwarg_batches[k].append(kwargs[k]) for k in kwargs.keys()]
182
+ for i in range(len(args)):
183
+ arg_batches[i].append(args[i])
184
+ for k in kwargs.keys():
185
+ kwarg_batches[k].append(kwargs[k])
180
186
  num_valid_batch_rows = len(valid_batch_idxs)
181
187
 
182
188
  if ext_batch_size is None:
183
189
  # we need to choose a batch size based on the args
190
+ assert isinstance(fn_call.fn, CallableFunction)
184
191
  sample_args = [arg_batches[i][0] for i in range(len(arg_batches))]
185
192
  ext_batch_size = fn_call.fn.get_batch_size(*sample_args)
186
193
 
@@ -200,6 +207,7 @@ class ExprEvalNode(ExecNode):
200
207
  for k in kwarg_batches.keys()
201
208
  }
202
209
  start_ts = time.perf_counter()
210
+ assert isinstance(fn_call.fn, CallableFunction)
203
211
  result_batch = fn_call.fn.exec_batch(*call_args, **call_kwargs)
204
212
  self.ctx.profile.eval_time[fn_call.slot_idx] += time.perf_counter() - start_ts
205
213
  self.ctx.profile.eval_count[fn_call.slot_idx] += num_ext_batch_rows
@@ -1,5 +1,5 @@
1
1
  import logging
2
- from typing import Any, Optional
2
+ from typing import Any, Iterator, Optional
3
3
 
4
4
  import pixeltable.catalog as catalog
5
5
  import pixeltable.exprs as exprs
@@ -18,19 +18,26 @@ class InMemoryDataNode(ExecNode):
18
18
  - with the values provided in the input rows
19
19
  - if an input row doesn't provide a value, sets the slot to the column default
20
20
  """
21
+ tbl: catalog.TableVersion
22
+ input_rows: list[dict[str, Any]]
23
+ start_row_id: int
24
+ output_rows: Optional[DataRowBatch]
25
+
26
+ # output_exprs is declared in the superclass, but we redeclare it here with a more specific type
27
+ output_exprs: list[exprs.ColumnRef]
28
+
21
29
  def __init__(
22
30
  self, tbl: catalog.TableVersion, rows: list[dict[str, Any]],
23
31
  row_builder: exprs.RowBuilder, start_row_id: int,
24
32
  ):
25
- # we materialize all output slots
26
- output_exprs = [e for e in row_builder.get_output_exprs() if isinstance(e, exprs.ColumnRef)]
33
+ # we materialize the input slots
34
+ output_exprs = list(row_builder.input_exprs)
27
35
  super().__init__(row_builder, output_exprs, [], None)
28
36
  assert tbl.is_insertable()
29
37
  self.tbl = tbl
30
38
  self.input_rows = rows
31
39
  self.start_row_id = start_row_id
32
- self.has_returned_data = False
33
- self.output_rows: Optional[DataRowBatch] = None
40
+ self.output_rows = None
34
41
 
35
42
  def _open(self) -> None:
36
43
  """Create row batch and populate with self.input_rows"""
@@ -67,12 +74,8 @@ class InMemoryDataNode(ExecNode):
67
74
  assert col_info is not None
68
75
  self.output_rows[row_idx][col_info.slot_idx] = None
69
76
 
70
- self.output_rows.set_row_ids([self.start_row_id + i for i in range(len(self.output_rows))])
71
77
  self.ctx.num_rows = len(self.output_rows)
72
78
 
73
- def __next__(self) -> DataRowBatch:
74
- if self.has_returned_data:
75
- raise StopIteration
76
- self.has_returned_data = True
79
+ def __iter__(self) -> Iterator[DataRowBatch]:
77
80
  _logger.debug(f'InMemoryDataNode: created row batch with {len(self.output_rows)} output_rows')
78
- return self.output_rows
81
+ yield self.output_rows