pixeltable 0.3.9__py3-none-any.whl → 0.3.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pixeltable might be problematic. Click here for more details.

Files changed (122) hide show
  1. pixeltable/__init__.py +2 -3
  2. pixeltable/__version__.py +2 -2
  3. pixeltable/catalog/__init__.py +2 -1
  4. pixeltable/catalog/catalog.py +63 -36
  5. pixeltable/catalog/column.py +11 -4
  6. pixeltable/catalog/dir.py +5 -5
  7. pixeltable/catalog/globals.py +28 -14
  8. pixeltable/catalog/insertable_table.py +81 -43
  9. pixeltable/catalog/path.py +2 -2
  10. pixeltable/catalog/table.py +140 -109
  11. pixeltable/catalog/table_version.py +60 -43
  12. pixeltable/catalog/table_version_handle.py +3 -0
  13. pixeltable/catalog/table_version_path.py +1 -1
  14. pixeltable/catalog/view.py +17 -9
  15. pixeltable/dataframe.py +5 -3
  16. pixeltable/env.py +109 -43
  17. pixeltable/exec/__init__.py +2 -0
  18. pixeltable/exec/aggregation_node.py +6 -8
  19. pixeltable/exec/cache_prefetch_node.py +4 -7
  20. pixeltable/exec/component_iteration_node.py +1 -3
  21. pixeltable/exec/data_row_batch.py +1 -2
  22. pixeltable/exec/exec_context.py +1 -1
  23. pixeltable/exec/exec_node.py +2 -3
  24. pixeltable/exec/expr_eval/__init__.py +2 -0
  25. pixeltable/exec/expr_eval/evaluators.py +137 -20
  26. pixeltable/exec/expr_eval/expr_eval_node.py +43 -64
  27. pixeltable/exec/expr_eval/globals.py +68 -7
  28. pixeltable/exec/expr_eval/schedulers.py +25 -23
  29. pixeltable/exec/in_memory_data_node.py +8 -6
  30. pixeltable/exec/row_update_node.py +3 -4
  31. pixeltable/exec/sql_node.py +16 -17
  32. pixeltable/exprs/__init__.py +3 -2
  33. pixeltable/exprs/arithmetic_expr.py +2 -0
  34. pixeltable/exprs/column_property_ref.py +1 -1
  35. pixeltable/exprs/column_ref.py +39 -3
  36. pixeltable/exprs/compound_predicate.py +1 -1
  37. pixeltable/exprs/data_row.py +17 -1
  38. pixeltable/exprs/expr.py +51 -21
  39. pixeltable/exprs/function_call.py +34 -2
  40. pixeltable/exprs/globals.py +12 -0
  41. pixeltable/exprs/json_mapper.py +95 -48
  42. pixeltable/exprs/json_path.py +3 -10
  43. pixeltable/exprs/method_ref.py +2 -2
  44. pixeltable/exprs/object_ref.py +2 -2
  45. pixeltable/exprs/row_builder.py +33 -6
  46. pixeltable/exprs/similarity_expr.py +6 -21
  47. pixeltable/exprs/sql_element_cache.py +1 -1
  48. pixeltable/exprs/string_op.py +107 -0
  49. pixeltable/ext/__init__.py +1 -1
  50. pixeltable/ext/functions/__init__.py +1 -1
  51. pixeltable/ext/functions/whisperx.py +1 -1
  52. pixeltable/ext/functions/yolox.py +22 -65
  53. pixeltable/func/aggregate_function.py +1 -1
  54. pixeltable/func/callable_function.py +2 -5
  55. pixeltable/func/expr_template_function.py +22 -2
  56. pixeltable/func/function.py +4 -5
  57. pixeltable/func/function_registry.py +1 -1
  58. pixeltable/func/signature.py +1 -1
  59. pixeltable/func/tools.py +2 -2
  60. pixeltable/func/udf.py +2 -2
  61. pixeltable/functions/__init__.py +2 -2
  62. pixeltable/functions/anthropic.py +2 -2
  63. pixeltable/functions/audio.py +1 -1
  64. pixeltable/functions/deepseek.py +1 -1
  65. pixeltable/functions/fireworks.py +1 -1
  66. pixeltable/functions/globals.py +22 -11
  67. pixeltable/functions/huggingface.py +1 -1
  68. pixeltable/functions/image.py +1 -1
  69. pixeltable/functions/json.py +1 -1
  70. pixeltable/functions/llama_cpp.py +1 -1
  71. pixeltable/functions/math.py +1 -1
  72. pixeltable/functions/mistralai.py +1 -1
  73. pixeltable/functions/ollama.py +1 -1
  74. pixeltable/functions/openai.py +2 -2
  75. pixeltable/functions/replicate.py +1 -1
  76. pixeltable/functions/string.py +1 -1
  77. pixeltable/functions/timestamp.py +1 -1
  78. pixeltable/functions/together.py +1 -1
  79. pixeltable/functions/util.py +1 -1
  80. pixeltable/functions/video.py +2 -2
  81. pixeltable/functions/vision.py +2 -2
  82. pixeltable/globals.py +85 -33
  83. pixeltable/index/embedding_index.py +12 -1
  84. pixeltable/io/__init__.py +8 -5
  85. pixeltable/io/datarows.py +138 -0
  86. pixeltable/io/external_store.py +8 -5
  87. pixeltable/io/fiftyone.py +6 -7
  88. pixeltable/io/globals.py +7 -160
  89. pixeltable/io/hf_datasets.py +21 -98
  90. pixeltable/io/label_studio.py +21 -20
  91. pixeltable/io/pandas.py +35 -48
  92. pixeltable/io/parquet.py +17 -42
  93. pixeltable/io/table_data_conduit.py +569 -0
  94. pixeltable/io/utils.py +6 -21
  95. pixeltable/iterators/__init__.py +1 -1
  96. pixeltable/metadata/__init__.py +6 -4
  97. pixeltable/metadata/converters/convert_24.py +3 -3
  98. pixeltable/metadata/converters/convert_25.py +1 -1
  99. pixeltable/metadata/converters/convert_29.py +1 -1
  100. pixeltable/metadata/converters/convert_30.py +50 -0
  101. pixeltable/metadata/converters/util.py +26 -1
  102. pixeltable/metadata/notes.py +1 -0
  103. pixeltable/metadata/schema.py +3 -0
  104. pixeltable/store.py +2 -2
  105. pixeltable/type_system.py +19 -7
  106. pixeltable/utils/arrow.py +32 -7
  107. pixeltable/utils/console_output.py +3 -2
  108. pixeltable/utils/coroutine.py +3 -3
  109. pixeltable/utils/dbms.py +66 -0
  110. pixeltable/utils/documents.py +61 -67
  111. pixeltable/utils/filecache.py +1 -1
  112. pixeltable/utils/http_server.py +3 -2
  113. pixeltable/utils/pytorch.py +1 -1
  114. pixeltable/utils/sql.py +1 -1
  115. pixeltable-0.3.11.dist-info/METADATA +436 -0
  116. pixeltable-0.3.11.dist-info/RECORD +179 -0
  117. {pixeltable-0.3.9.dist-info → pixeltable-0.3.11.dist-info}/WHEEL +1 -1
  118. pixeltable/catalog/path_dict.py +0 -169
  119. pixeltable-0.3.9.dist-info/METADATA +0 -382
  120. pixeltable-0.3.9.dist-info/RECORD +0 -175
  121. {pixeltable-0.3.9.dist-info → pixeltable-0.3.11.dist-info}/LICENSE +0 -0
  122. {pixeltable-0.3.9.dist-info → pixeltable-0.3.11.dist-info}/entry_points.txt +0 -0
@@ -205,6 +205,10 @@ class FunctionCall(Expr):
205
205
  def has_group_by(self) -> bool:
206
206
  return self.group_by_stop_idx != 0
207
207
 
208
+ @property
209
+ def is_async(self) -> bool:
210
+ return self.fn.is_async
211
+
208
212
  @property
209
213
  def group_by(self) -> list[Expr]:
210
214
  return self.components[self.group_by_start_idx : self.group_by_stop_idx]
@@ -272,6 +276,34 @@ class FunctionCall(Expr):
272
276
  assert isinstance(self.fn, func.AggregateFunction)
273
277
  self.aggregator = self.fn.agg_class(**self.agg_init_args)
274
278
 
279
+ @property
280
+ def bound_args(self) -> dict[str, Expr]:
281
+ """
282
+ Reconstructs bound arguments from the components of this FunctionCall.
283
+ """
284
+ bound_args: dict[str, Expr] = {}
285
+ for name, idx in self.bound_idxs.items():
286
+ if isinstance(idx, int):
287
+ bound_args[name] = self.components[idx]
288
+ elif isinstance(idx, Sequence):
289
+ bound_args[name] = Expr.from_object([self.components[i] for i in idx])
290
+ elif isinstance(idx, dict):
291
+ bound_args[name] = Expr.from_object({k: self.components[i] for k, i in idx.items()})
292
+ else:
293
+ raise AssertionError(f'{name}: {idx} (of type `{type(idx)}`)')
294
+ return bound_args
295
+
296
+ def substitute(self, spec: dict[Expr, Expr]) -> Expr:
297
+ """
298
+ Substitution of FunctionCall arguments could cause the return value to become more specific, in the case
299
+ where a variable is replaced with a specific value.
300
+ """
301
+ res = super().substitute(spec)
302
+ assert res is self
303
+ self.return_type = self.fn.call_return_type(self.bound_args)
304
+ self.col_type = self.return_type
305
+ return self
306
+
275
307
  def update(self, data_row: DataRow) -> None:
276
308
  """
277
309
  Update agg state
@@ -289,7 +321,7 @@ class FunctionCall(Expr):
289
321
  if (
290
322
  val is None
291
323
  and parameters_by_pos[idx].kind
292
- in {inspect.Parameter.POSITIONAL_ONLY, inspect.Parameter.POSITIONAL_OR_KEYWORD}
324
+ in (inspect.Parameter.POSITIONAL_ONLY, inspect.Parameter.POSITIONAL_OR_KEYWORD)
293
325
  and not parameters_by_pos[idx].col_type.nullable
294
326
  ):
295
327
  return None
@@ -302,7 +334,7 @@ class FunctionCall(Expr):
302
334
  if (
303
335
  val is None
304
336
  and parameters[param_name].kind
305
- in {inspect.Parameter.KEYWORD_ONLY, inspect.Parameter.POSITIONAL_OR_KEYWORD}
337
+ in (inspect.Parameter.KEYWORD_ONLY, inspect.Parameter.POSITIONAL_OR_KEYWORD)
306
338
  and not parameters[param_name].col_type.nullable
307
339
  ):
308
340
  return None
@@ -87,3 +87,15 @@ class ArithmeticOperator(enum.Enum):
87
87
  if self == self.FLOORDIV:
88
88
  return '//'
89
89
  raise AssertionError()
90
+
91
+
92
+ class StringOperator(enum.Enum):
93
+ CONCAT = 0
94
+ REPEAT = 1
95
+
96
+ def __str__(self) -> str:
97
+ if self == self.CONCAT:
98
+ return '+'
99
+ if self == self.REPEAT:
100
+ return '*'
101
+ raise AssertionError()
@@ -20,16 +20,84 @@ class JsonMapper(Expr):
20
20
  JsonMapper transforms the list output of a JsonPath by applying a target expr to every element of the list.
21
21
  The target expr would typically contain relative JsonPaths, which are bound to an ObjectRef, which in turn
22
22
  is populated by JsonMapper.eval(). The JsonMapper effectively creates a new scope for its target expr.
23
+
24
+ JsonMapper is executed in two phases:
25
+ - the first phase is handled by Expr subclass JsonMapperDispatch, which constructs one nested DataRow per source
26
+ list element and evaluates the target expr within that (the nested DataRows are stored as a NestedRowList in the
27
+ slot of JsonMapperDispatch)
28
+ - JsonMapper.eval() collects the slot values of the target expr into its result list
23
29
  """
24
30
 
25
31
  target_expr_scope: ExprScope
26
32
  parent_mapper: Optional[JsonMapper]
27
33
  target_expr_eval_ctx: Optional[RowBuilder.EvalCtx]
28
34
 
29
- def __init__(self, src_expr: Expr, target_expr: Expr):
35
+ def __init__(self, src_expr: Optional[Expr], target_expr: Optional[Expr]):
30
36
  # TODO: type spec should be list[target_expr.col_type]
31
37
  super().__init__(ts.JsonType())
32
38
 
39
+ dispatch = JsonMapperDispatch(src_expr, target_expr)
40
+ self.components.append(dispatch)
41
+ self.id = self._create_id()
42
+
43
+ def __repr__(self) -> str:
44
+ return f'map({self._src_expr}, lambda R: {self._target_expr})'
45
+
46
+ @property
47
+ def _src_expr(self) -> Expr:
48
+ return self.components[0].src_expr
49
+
50
+ @property
51
+ def _target_expr(self) -> Expr:
52
+ return self.components[0].target_expr
53
+
54
+ def _equals(self, _: JsonMapper) -> bool:
55
+ return True
56
+
57
+ def sql_expr(self, _: SqlElementCache) -> Optional[sql.ColumnElement]:
58
+ return None
59
+
60
+ def eval(self, data_row: DataRow, row_builder: RowBuilder) -> None:
61
+ from ..exec.expr_eval.evaluators import NestedRowList
62
+
63
+ dispatch_slot_idx = self.components[0].slot_idx
64
+ nested_rows = data_row.vals[dispatch_slot_idx]
65
+ if nested_rows is None:
66
+ data_row[self.slot_idx] = None
67
+ return
68
+ assert isinstance(nested_rows, NestedRowList)
69
+ # TODO: get the materialized slot idx, instead of relying on the fact that the target_expr is always at the end
70
+ data_row[self.slot_idx] = [row.vals[-1] for row in nested_rows.rows]
71
+
72
+ def _as_dict(self) -> dict:
73
+ """
74
+ We only serialize src and target exprs, everything else is re-created at runtime.
75
+ """
76
+ return {'components': [self._src_expr.as_dict(), self._target_expr.as_dict()]}
77
+
78
+ @classmethod
79
+ def _from_dict(cls, d: dict, components: list[Expr]) -> JsonMapper:
80
+ assert len(components) == 2
81
+ src_expr, target_expr = components[0], components[1]
82
+ return cls(src_expr, target_expr)
83
+
84
+
85
+ class JsonMapperDispatch(Expr):
86
+ """
87
+ An operational Expr (ie, it doesn't represent any syntactic element) that is used by JsonMapper to materialize
88
+ its input DataRows. It has the same dependencies as the originating JsonMapper.
89
+
90
+ - The execution (= row dispatch) is handled by an expr_eval.Evaluator (JsonMapperDispatcher).
91
+ - It stores a NestedRowList instance in its slot.
92
+ """
93
+
94
+ target_expr_scope: ExprScope
95
+ parent_mapper: Optional[JsonMapperDispatch]
96
+ target_expr_eval_ctx: Optional[RowBuilder.EvalCtx]
97
+
98
+ def __init__(self, src_expr: Expr, target_expr: Expr):
99
+ super().__init__(ts.InvalidType())
100
+
33
101
  # we're creating a new scope, but we don't know yet whether this is nested within another JsonMapper;
34
102
  # this gets resolved in bind_rel_paths(); for now we assume we're in the global scope
35
103
  self.target_expr_scope = ExprScope(_GLOBAL_SCOPE)
@@ -40,28 +108,36 @@ class JsonMapper(Expr):
40
108
  self.parent_mapper = None
41
109
  self.target_expr_eval_ctx = None
42
110
 
43
- # Intentionally create the id now, before adding the scope anchor; this ensures that JsonMappers will
44
- # be recognized as equal so long as they have the same src_expr and target_expr.
111
+ # Intentionally create the id now, before adding the scope anchor; this ensures that JsonMapperDispatch
112
+ # instances will be recognized as equal so long as they have the same src_expr and target_expr.
45
113
  # TODO: Might this cause problems after certain substitutions?
46
114
  self.id = self._create_id()
47
115
 
48
116
  scope_anchor = ObjectRef(self.target_expr_scope, self)
49
117
  self.components.append(scope_anchor)
50
118
 
51
- def _bind_rel_paths(self, mapper: Optional[JsonMapper] = None) -> None:
52
- self._src_expr._bind_rel_paths(mapper)
53
- self._target_expr._bind_rel_paths(self)
119
+ def _bind_rel_paths(self, mapper: Optional[JsonMapperDispatch] = None) -> None:
120
+ self.src_expr._bind_rel_paths(mapper)
121
+ self.target_expr._bind_rel_paths(self)
54
122
  self.parent_mapper = mapper
55
123
  parent_scope = _GLOBAL_SCOPE if mapper is None else mapper.target_expr_scope
56
124
  self.target_expr_scope.parent = parent_scope
57
125
 
126
+ def equals(self, other: Expr) -> bool:
127
+ """
128
+ We override equals() because we need to avoid comparing our scope anchor.
129
+ """
130
+ if type(self) is not type(other):
131
+ return False
132
+ return self.src_expr.equals(other.src_expr) and self.target_expr.equals(other.target_expr)
133
+
58
134
  def scope(self) -> ExprScope:
59
135
  # need to ignore target_expr
60
- return self._src_expr.scope()
136
+ return self.src_expr.scope()
61
137
 
62
138
  def dependencies(self) -> list[Expr]:
63
- result = [self._src_expr]
64
- result.extend(self._target_dependencies(self._target_expr))
139
+ result = [self.src_expr]
140
+ result.extend(self._target_dependencies(self.target_expr))
65
141
  return result
66
142
 
67
143
  def _target_dependencies(self, e: Expr) -> list[Expr]:
@@ -77,23 +153,12 @@ class JsonMapper(Expr):
77
153
  result.extend(self._target_dependencies(c))
78
154
  return result
79
155
 
80
- def equals(self, other: Expr) -> bool:
81
- """
82
- We override equals() because we need to avoid comparing our scope anchor.
83
- """
84
- if type(self) is not type(other):
85
- return False
86
- return self._src_expr.equals(other._src_expr) and self._target_expr.equals(other._target_expr)
87
-
88
- def __repr__(self) -> str:
89
- return f'{self._src_expr} >> {self._target_expr}'
90
-
91
156
  @property
92
- def _src_expr(self) -> Expr:
157
+ def src_expr(self) -> Expr:
93
158
  return self.components[0]
94
159
 
95
160
  @property
96
- def _target_expr(self) -> Expr:
161
+ def target_expr(self) -> Expr:
97
162
  return self.components[1]
98
163
 
99
164
  @property
@@ -104,37 +169,19 @@ class JsonMapper(Expr):
104
169
  assert isinstance(result, ObjectRef)
105
170
  return result
106
171
 
107
- def _equals(self, _: JsonMapper) -> bool:
108
- return True
109
-
110
- def sql_expr(self, _: SqlElementCache) -> Optional[sql.ColumnElement]:
111
- return None
172
+ def __repr__(self) -> str:
173
+ return 'JsonMapperDispatch()'
112
174
 
113
175
  def eval(self, data_row: DataRow, row_builder: RowBuilder) -> None:
114
- # this will be called, but the value has already been materialized elsewhere
115
- src = data_row[self._src_expr.slot_idx]
116
- if not isinstance(src, list):
117
- # invalid/non-list src path
118
- data_row[self.slot_idx] = None
119
- return
120
-
121
- result = [None] * len(src)
122
- if self.target_expr_eval_ctx is None:
123
- self.target_expr_eval_ctx = row_builder.create_eval_ctx([self._target_expr])
124
- for i, val in enumerate(src):
125
- data_row[self.scope_anchor.slot_idx] = val
126
- # stored target_expr
127
- row_builder.eval(data_row, self.target_expr_eval_ctx, force_eval=self._target_expr.scope())
128
- result[i] = data_row[self._target_expr.slot_idx]
129
- data_row[self.slot_idx] = result
176
+ # eval is handled by JsonMapperDispatcher
177
+ raise AssertionError('this should never be called')
130
178
 
131
179
  def _as_dict(self) -> dict:
132
180
  """
133
- We need to avoid serializing component[2], which is an ObjectRef.
181
+ JsonMapperDispatch instances are only created by the JsonMapper c'tor and never need to be serialized.
134
182
  """
135
- return {'components': [c.as_dict() for c in self.components[0:2]]}
183
+ raise AssertionError('this should never be called')
136
184
 
137
185
  @classmethod
138
- def _from_dict(cls, d: dict, components: list[Expr]) -> JsonMapper:
139
- assert len(components) == 2
140
- return cls(components[0], components[1])
186
+ def _from_dict(cls, d: dict, components: list[Expr]) -> JsonMapperDispatch:
187
+ raise AssertionError('this should never be called')
@@ -11,7 +11,7 @@ from pixeltable import catalog, exceptions as excs, type_system as ts
11
11
  from .data_row import DataRow
12
12
  from .expr import Expr
13
13
  from .globals import print_slice
14
- from .json_mapper import JsonMapper
14
+ from .json_mapper import JsonMapperDispatch
15
15
  from .object_ref import ObjectRef
16
16
  from .row_builder import RowBuilder
17
17
  from .sql_element_cache import SqlElementCache
@@ -80,11 +80,10 @@ class JsonPath(Expr):
80
80
  def is_relative_path(self) -> bool:
81
81
  return self._anchor is None
82
82
 
83
- @property
84
83
  def _has_relative_path(self) -> bool:
85
- return self.is_relative_path() or super()._has_relative_path
84
+ return self.is_relative_path() or super()._has_relative_path()
86
85
 
87
- def _bind_rel_paths(self, mapper: Optional['JsonMapper'] = None) -> None:
86
+ def _bind_rel_paths(self, mapper: Optional['JsonMapperDispatch'] = None) -> None:
88
87
  if self.is_relative_path():
89
88
  # TODO: take scope_idx into account
90
89
  self.set_anchor(mapper.scope_anchor)
@@ -110,12 +109,6 @@ class JsonPath(Expr):
110
109
  return JsonPath(self._anchor, [*self.path_elements, index])
111
110
  raise excs.Error(f'Invalid json list index: {index}')
112
111
 
113
- def __rshift__(self, other: object) -> 'JsonMapper':
114
- rhs_expr = Expr.from_object(other)
115
- if rhs_expr is None:
116
- raise excs.Error(f'>> requires an expression on the right-hand side, found {type(other)}')
117
- return JsonMapper(self, rhs_expr)
118
-
119
112
  def default_column_name(self) -> Optional[str]:
120
113
  anchor_name = self._anchor.default_column_name() if self._anchor is not None else ''
121
114
  ret_name = f'{anchor_name}.{self._json_path()}'
@@ -23,7 +23,7 @@ class MethodRef(Expr):
23
23
  # TODO: Should this even be an `Expr`? It can't actually be evaluated directly (it has to be first
24
24
  # converted to a `FunctionCall` by binding any remaining parameters).
25
25
 
26
- def __init__(self, base_expr: Expr, method_name: str):
26
+ def __init__(self, base_expr: Expr, method_name: str) -> None:
27
27
  super().__init__(ts.InvalidType()) # The `MethodRef` is untyped until it is called.
28
28
  self.base_expr = base_expr
29
29
  self.method_name = method_name
@@ -43,7 +43,7 @@ class MethodRef(Expr):
43
43
  assert len(components) == 1
44
44
  return cls(components[0], d['method_name'])
45
45
 
46
- def __call__(self, *args, **kwargs) -> FunctionCall:
46
+ def __call__(self, *args: Any, **kwargs: Any) -> FunctionCall:
47
47
  result = self.fn(*[self.base_expr, *args], **kwargs)
48
48
  assert isinstance(result, FunctionCall)
49
49
  result.is_method_call = True
@@ -8,7 +8,7 @@ import pixeltable.type_system as ts
8
8
 
9
9
  from .data_row import DataRow
10
10
  from .expr import Expr, ExprScope
11
- from .json_mapper import JsonMapper
11
+ from .json_mapper import JsonMapperDispatch
12
12
  from .row_builder import RowBuilder
13
13
  from .sql_element_cache import SqlElementCache
14
14
 
@@ -19,7 +19,7 @@ class ObjectRef(Expr):
19
19
  The object is generated/materialized elsewhere and establishes a new scope.
20
20
  """
21
21
 
22
- def __init__(self, scope: ExprScope, owner: JsonMapper):
22
+ def __init__(self, scope: ExprScope, owner: JsonMapperDispatch):
23
23
  # TODO: do we need an Unknown type after all?
24
24
  super().__init__(ts.JsonType()) # JsonType: this could be anything
25
25
  self._scope = scope
@@ -77,6 +77,8 @@ class RowBuilder:
77
77
  transitive_dependents: np.ndarray # of bool
78
78
  # dependencies[i] = direct dependencies of expr with slot idx i; transpose of dependents
79
79
  dependencies: np.ndarray # of bool
80
+ # num_dependencies[i] = number of direct dependencies of expr with slot idx i
81
+ num_dependencies: np.ndarray # of int
80
82
 
81
83
  # records the output_expr that a subexpr belongs to
82
84
  # (a subexpr can be shared across multiple output exprs)
@@ -209,6 +211,7 @@ class RowBuilder:
209
211
  exc_dependencies[expr.slot_idx].add(d.slot_idx)
210
212
  exc_dependencies[expr.slot_idx].update(exc_dependencies[d.slot_idx])
211
213
 
214
+ self.num_dependencies = np.sum(self.dependencies, axis=1)
212
215
  self.dependents = self.dependencies.T
213
216
  self.transitive_dependents = np.zeros((self.num_materialized, self.num_materialized), dtype=bool)
214
217
  for i in reversed(range(self.num_materialized)):
@@ -275,8 +278,14 @@ class RowBuilder:
275
278
  for d in e.dependencies():
276
279
  self._record_output_expr_id(d, output_expr_id)
277
280
 
278
- def _compute_dependencies(self, target_slot_idxs: list[int], excluded_slot_idxs: list[int]) -> list[int]:
279
- """Compute exprs needed to materialize the given target slots, excluding 'excluded_slot_idxs'"""
281
+ def _compute_dependencies(
282
+ self, target_slot_idxs: list[int], excluded_slot_idxs: list[int], target_scope: Optional[ExprScope] = None
283
+ ) -> list[int]:
284
+ """Compute exprs needed to materialize the given target slots, excluding 'excluded_slot_idxs'
285
+
286
+ If target_scope != None, stops transitive dependency resolution when leaving target_scope (ie, includes
287
+ immediate dependents that aren't in target_scope, but doesn't resolve those).
288
+ """
280
289
  dependencies: list[set[int]] = [set() for _ in range(self.num_materialized)] # indexed by slot_idx
281
290
  # doing this front-to-back ensures that we capture transitive dependencies
282
291
  max_target_slot_idx = max(target_slot_idxs)
@@ -289,6 +298,9 @@ class RowBuilder:
289
298
  if expr.slot_idx in self.input_expr_slot_idxs:
290
299
  # this is input and therefore doesn't depend on other exprs
291
300
  continue
301
+ if target_scope is not None and expr.scope() != target_scope:
302
+ # don't resolve dependencies outside of target_scope
303
+ continue
292
304
  for d in expr.dependencies():
293
305
  assert d.slot_idx is not None, f'{expr}, {d}'
294
306
  if d.slot_idx in excluded_slot_idxs:
@@ -320,10 +332,15 @@ class RowBuilder:
320
332
  for c in e.components:
321
333
  self.__set_slot_idxs_aux(c)
322
334
 
323
- def get_dependencies(self, targets: Iterable[Expr], exclude: Optional[Iterable[Expr]] = None) -> list[Expr]:
335
+ def get_dependencies(
336
+ self, targets: Iterable[Expr], exclude: Optional[Iterable[Expr]] = None, limit_scope: bool = True
337
+ ) -> list[Expr]:
324
338
  """
325
339
  Return list of dependencies needed to evaluate the given target exprs (expressed as slot idxs).
326
340
  The exprs given in 'exclude' are excluded.
341
+ If limit_scope == True, only returns dependencies in the same scope and immediate (ie, not transitive)
342
+ dependencies from enclosing scopes.
343
+
327
344
  Returns:
328
345
  list of Exprs from unique_exprs (= with slot_idx set)
329
346
  """
@@ -334,23 +351,33 @@ class RowBuilder:
334
351
  return []
335
352
  # make sure we only refer to recorded exprs
336
353
  targets = [self.unique_exprs[e] for e in targets]
354
+ target_scope: Optional[ExprScope] = None
355
+ if limit_scope:
356
+ # make sure all targets are from the same scope
357
+ target_scopes = {e.scope() for e in targets}
358
+ assert len(target_scopes) == 1
359
+ target_scope = target_scopes.pop()
337
360
  exclude = [self.unique_exprs[e] for e in exclude]
338
361
  target_slot_idxs = [e.slot_idx for e in targets]
339
362
  excluded_slot_idxs = [e.slot_idx for e in exclude]
340
- all_dependencies = set(self._compute_dependencies(target_slot_idxs, excluded_slot_idxs))
363
+ all_dependencies = set(
364
+ self._compute_dependencies(target_slot_idxs, excluded_slot_idxs, target_scope=target_scope)
365
+ )
341
366
  all_dependencies.update(target_slot_idxs)
342
367
  result_ids = list(all_dependencies)
343
368
  result_ids.sort()
344
369
  return [self.unique_exprs[id] for id in result_ids]
345
370
 
346
- def create_eval_ctx(self, targets: Iterable[Expr], exclude: Optional[Iterable[Expr]] = None) -> EvalCtx:
371
+ def create_eval_ctx(
372
+ self, targets: Iterable[Expr], exclude: Optional[Iterable[Expr]] = None, limit_scope: bool = True
373
+ ) -> EvalCtx:
347
374
  """Return EvalCtx for targets"""
348
375
  targets = list(targets)
349
376
  if exclude is None:
350
377
  exclude = []
351
378
  if len(targets) == 0:
352
379
  return self.EvalCtx([], [], [], [])
353
- dependencies = self.get_dependencies(targets, exclude)
380
+ dependencies = self.get_dependencies(targets, exclude, limit_scope=limit_scope)
354
381
  targets = [self.unique_exprs[e] for e in targets]
355
382
  target_slot_idxs = [e.slot_idx for e in targets]
356
383
  ctx_slot_idxs = [e.slot_idx for e in dependencies]
@@ -23,26 +23,12 @@ class SimilarityExpr(Expr):
23
23
 
24
24
  self.components = [col_ref, item_expr]
25
25
 
26
- # determine index to use
27
- idx_info = col_ref.col.get_idx_info()
28
26
  from pixeltable import index
29
27
 
30
- embedding_idx_info = {
31
- info.name: info for info in idx_info.values() if isinstance(info.idx, index.EmbeddingIndex)
32
- }
33
- if len(embedding_idx_info) == 0:
34
- raise excs.Error(f'No index found for column {col_ref.col!r}')
35
- if idx_name is not None and idx_name not in embedding_idx_info:
36
- raise excs.Error(f'Index {idx_name!r} not found for column {col_ref.col.name!r}')
37
- if len(embedding_idx_info) > 1:
38
- if idx_name is None:
39
- raise excs.Error(
40
- f'Column {col_ref.col.name!r} has multiple indices; use the index name to disambiguate: '
41
- f'`{col_ref.col.name}.similarity(..., idx=<name>)`'
42
- )
43
- self.idx_info = embedding_idx_info[idx_name]
44
- else:
45
- self.idx_info = next(iter(embedding_idx_info.values()))
28
+ # determine index to use
29
+ idx_dict = ColumnRef.find_embedding_index(col_ref.col, idx_name, 'similarity')
30
+ assert len(idx_dict) == 1
31
+ self.idx_info = next(iter(idx_dict.values()))
46
32
  idx = self.idx_info.idx
47
33
  assert isinstance(idx, index.EmbeddingIndex)
48
34
 
@@ -61,7 +47,7 @@ class SimilarityExpr(Expr):
61
47
  def __repr__(self) -> str:
62
48
  return f'{self.components[0]}.similarity({self.components[1]})'
63
49
 
64
- def _id_attrs(self):
50
+ def _id_attrs(self) -> list[tuple[str, Any]]:
65
51
  return [*super()._id_attrs(), ('idx_name', self.idx_info.name)]
66
52
 
67
53
  def default_column_name(self) -> str:
@@ -86,8 +72,7 @@ class SimilarityExpr(Expr):
86
72
  return self.idx_info.idx.order_by_clause(self.idx_info.val_col, item, is_asc)
87
73
 
88
74
  def eval(self, data_row: DataRow, row_builder: RowBuilder) -> None:
89
- # this should never get called
90
- raise AssertionError()
75
+ raise excs.Error('similarity(): cannot be used in a computed column')
91
76
 
92
77
  def _as_dict(self) -> dict:
93
78
  return {'idx_name': self.idx_info.name, **super()._as_dict()}
@@ -17,7 +17,7 @@ class SqlElementCache:
17
17
  for e, el in elements.items():
18
18
  self.cache[e.id] = el
19
19
 
20
- def extend(self, elements: ExprDict[sql.ColumnElement]):
20
+ def extend(self, elements: ExprDict[sql.ColumnElement]) -> None:
21
21
  for e, el in elements.items():
22
22
  self.cache[e.id] = el
23
23
 
@@ -0,0 +1,107 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Any, Optional, Union
4
+
5
+ import sqlalchemy as sql
6
+
7
+ import pixeltable.exceptions as excs
8
+ import pixeltable.type_system as ts
9
+
10
+ from .data_row import DataRow
11
+ from .expr import Expr
12
+ from .globals import StringOperator
13
+ from .row_builder import RowBuilder
14
+ from .sql_element_cache import SqlElementCache
15
+
16
+
17
+ class StringOp(Expr):
18
+ """
19
+ Allows operations on strings
20
+ """
21
+
22
+ operator: StringOperator
23
+
24
+ def __init__(self, operator: StringOperator, op1: Expr, op2: Expr):
25
+ super().__init__(ts.StringType(nullable=op1.col_type.nullable))
26
+ self.operator = operator
27
+ self.components = [op1, op2]
28
+ assert op1.col_type.is_string_type()
29
+ if operator in (StringOperator.CONCAT, StringOperator.REPEAT):
30
+ if operator == StringOperator.CONCAT and not op2.col_type.is_string_type():
31
+ raise excs.Error(
32
+ f'{self}: {operator} on strings requires string type, but {op2} has type {op2.col_type}'
33
+ )
34
+ if operator == StringOperator.REPEAT and not op2.col_type.is_int_type():
35
+ raise excs.Error(f'{self}: {operator} on strings requires int type, but {op2} has type {op2.col_type}')
36
+ else:
37
+ raise excs.Error(
38
+ f'{self}: invalid operation {operator} on strings; '
39
+ f'only operators {StringOperator.CONCAT} and {StringOperator.REPEAT} are supported'
40
+ )
41
+ self.id = self._create_id()
42
+
43
+ @property
44
+ def _op1(self) -> Expr:
45
+ return self.components[0]
46
+
47
+ @property
48
+ def _op2(self) -> Expr:
49
+ return self.components[1]
50
+
51
+ def __repr__(self) -> str:
52
+ # add parentheses around operands that are StringOpExpr to express precedence
53
+ op1_str = f'({self._op1})' if isinstance(self._op1, StringOp) else str(self._op1)
54
+ op2_str = f'({self._op2})' if isinstance(self._op2, StringOp) else str(self._op2)
55
+ return f'{op1_str} {self.operator} {op2_str}'
56
+
57
+ def _equals(self, other: StringOp) -> bool:
58
+ return self.operator == other.operator
59
+
60
+ def _id_attrs(self) -> list[tuple[str, Any]]:
61
+ return [*super()._id_attrs(), ('operator', self.operator.value)]
62
+
63
+ def sql_expr(self, sql_elements: SqlElementCache) -> Optional[sql.ColumnElement]:
64
+ left = sql_elements.get(self._op1)
65
+ right = sql_elements.get(self._op2)
66
+ if left is None or right is None:
67
+ return None
68
+ if self.operator == StringOperator.CONCAT:
69
+ return left.concat(right)
70
+ if self.operator == StringOperator.REPEAT:
71
+ return sql.func.repeat(sql.cast(left, sql.String), sql.cast(right, sql.Integer))
72
+ return None
73
+
74
+ def eval(self, data_row: DataRow, row_builder: RowBuilder) -> None:
75
+ op1_val = data_row[self._op1.slot_idx]
76
+ op2_val = data_row[self._op2.slot_idx]
77
+ data_row[self.slot_idx] = self.eval_nullable(op1_val, op2_val)
78
+
79
+ def eval_nullable(self, op1_val: Union[str, None], op2_val: Union[int, str, None]) -> Union[str, None]:
80
+ """
81
+ Return the result of evaluating the expression on two nullable int/float operands,
82
+ None is interpreted as SQL NULL
83
+ """
84
+ if op1_val is None or op2_val is None:
85
+ return None
86
+ return self.eval_non_null(op1_val, op2_val)
87
+
88
+ def eval_non_null(self, op1_val: str, op2_val: Union[int, str]) -> str:
89
+ """
90
+ Return the result of evaluating the expression on two int/float operands
91
+ """
92
+ assert self.operator in (StringOperator.CONCAT, StringOperator.REPEAT)
93
+ if self.operator == StringOperator.CONCAT:
94
+ assert isinstance(op2_val, str)
95
+ return op1_val + op2_val
96
+ else:
97
+ assert isinstance(op2_val, int)
98
+ return op1_val * op2_val
99
+
100
+ def _as_dict(self) -> dict:
101
+ return {'operator': self.operator.value, **super()._as_dict()}
102
+
103
+ @classmethod
104
+ def _from_dict(cls, d: dict, components: list[Expr]) -> StringOp:
105
+ assert 'operator' in d
106
+ assert len(components) == 2
107
+ return cls(StringOperator(d['operator']), components[0], components[1])
@@ -13,5 +13,5 @@ from . import functions
13
13
  __all__ = local_public_names(__name__)
14
14
 
15
15
 
16
- def __dir__():
16
+ def __dir__() -> list[str]:
17
17
  return __all__
@@ -7,5 +7,5 @@ from . import whisperx, yolox
7
7
  __all__ = local_public_names(__name__)
8
8
 
9
9
 
10
- def __dir__():
10
+ def __dir__() -> list[str]:
11
11
  return __all__
@@ -73,5 +73,5 @@ _model_cache: dict[tuple[str, str, str], 'FasterWhisperPipeline'] = {}
73
73
  __all__ = local_public_names(__name__)
74
74
 
75
75
 
76
- def __dir__():
76
+ def __dir__() -> list[str]:
77
77
  return __all__