pixeltable 0.2.20__py3-none-any.whl → 0.2.22__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pixeltable might be problematic. Click here for more details.

Files changed (120)
  1. pixeltable/__init__.py +7 -19
  2. pixeltable/__version__.py +2 -2
  3. pixeltable/catalog/__init__.py +7 -7
  4. pixeltable/catalog/column.py +37 -11
  5. pixeltable/catalog/globals.py +21 -0
  6. pixeltable/catalog/insertable_table.py +6 -4
  7. pixeltable/catalog/table.py +227 -148
  8. pixeltable/catalog/table_version.py +66 -28
  9. pixeltable/catalog/table_version_path.py +0 -8
  10. pixeltable/catalog/view.py +18 -19
  11. pixeltable/dataframe.py +16 -32
  12. pixeltable/env.py +6 -1
  13. pixeltable/exec/__init__.py +1 -2
  14. pixeltable/exec/aggregation_node.py +27 -17
  15. pixeltable/exec/cache_prefetch_node.py +1 -1
  16. pixeltable/exec/data_row_batch.py +9 -26
  17. pixeltable/exec/exec_node.py +36 -7
  18. pixeltable/exec/expr_eval_node.py +19 -11
  19. pixeltable/exec/in_memory_data_node.py +14 -11
  20. pixeltable/exec/sql_node.py +266 -138
  21. pixeltable/exprs/__init__.py +1 -0
  22. pixeltable/exprs/arithmetic_expr.py +3 -1
  23. pixeltable/exprs/array_slice.py +7 -7
  24. pixeltable/exprs/column_property_ref.py +37 -10
  25. pixeltable/exprs/column_ref.py +93 -14
  26. pixeltable/exprs/comparison.py +5 -5
  27. pixeltable/exprs/compound_predicate.py +8 -7
  28. pixeltable/exprs/data_row.py +56 -36
  29. pixeltable/exprs/expr.py +65 -63
  30. pixeltable/exprs/expr_dict.py +55 -0
  31. pixeltable/exprs/expr_set.py +26 -15
  32. pixeltable/exprs/function_call.py +53 -24
  33. pixeltable/exprs/globals.py +4 -1
  34. pixeltable/exprs/in_predicate.py +8 -7
  35. pixeltable/exprs/inline_expr.py +4 -4
  36. pixeltable/exprs/is_null.py +4 -4
  37. pixeltable/exprs/json_mapper.py +11 -12
  38. pixeltable/exprs/json_path.py +5 -10
  39. pixeltable/exprs/literal.py +5 -5
  40. pixeltable/exprs/method_ref.py +5 -4
  41. pixeltable/exprs/object_ref.py +2 -1
  42. pixeltable/exprs/row_builder.py +88 -36
  43. pixeltable/exprs/rowid_ref.py +14 -13
  44. pixeltable/exprs/similarity_expr.py +12 -7
  45. pixeltable/exprs/sql_element_cache.py +12 -6
  46. pixeltable/exprs/type_cast.py +8 -6
  47. pixeltable/exprs/variable.py +5 -4
  48. pixeltable/ext/functions/whisperx.py +7 -2
  49. pixeltable/func/aggregate_function.py +1 -1
  50. pixeltable/func/callable_function.py +2 -2
  51. pixeltable/func/function.py +11 -10
  52. pixeltable/func/function_registry.py +6 -7
  53. pixeltable/func/query_template_function.py +11 -12
  54. pixeltable/func/signature.py +17 -15
  55. pixeltable/func/udf.py +0 -4
  56. pixeltable/functions/__init__.py +2 -2
  57. pixeltable/functions/audio.py +4 -6
  58. pixeltable/functions/globals.py +84 -42
  59. pixeltable/functions/huggingface.py +31 -34
  60. pixeltable/functions/image.py +59 -45
  61. pixeltable/functions/json.py +0 -1
  62. pixeltable/functions/llama_cpp.py +106 -0
  63. pixeltable/functions/mistralai.py +2 -2
  64. pixeltable/functions/ollama.py +147 -0
  65. pixeltable/functions/openai.py +22 -25
  66. pixeltable/functions/replicate.py +72 -0
  67. pixeltable/functions/string.py +59 -50
  68. pixeltable/functions/timestamp.py +20 -20
  69. pixeltable/functions/together.py +2 -2
  70. pixeltable/functions/video.py +11 -20
  71. pixeltable/functions/whisper.py +2 -20
  72. pixeltable/globals.py +65 -74
  73. pixeltable/index/base.py +2 -2
  74. pixeltable/index/btree.py +20 -7
  75. pixeltable/index/embedding_index.py +12 -14
  76. pixeltable/io/__init__.py +1 -2
  77. pixeltable/io/external_store.py +11 -5
  78. pixeltable/io/fiftyone.py +178 -0
  79. pixeltable/io/globals.py +98 -2
  80. pixeltable/io/hf_datasets.py +1 -1
  81. pixeltable/io/label_studio.py +6 -6
  82. pixeltable/io/parquet.py +14 -13
  83. pixeltable/iterators/base.py +3 -2
  84. pixeltable/iterators/document.py +10 -8
  85. pixeltable/iterators/video.py +126 -60
  86. pixeltable/metadata/__init__.py +4 -3
  87. pixeltable/metadata/converters/convert_14.py +4 -2
  88. pixeltable/metadata/converters/convert_15.py +1 -1
  89. pixeltable/metadata/converters/convert_19.py +1 -0
  90. pixeltable/metadata/converters/convert_20.py +1 -1
  91. pixeltable/metadata/converters/convert_21.py +34 -0
  92. pixeltable/metadata/converters/util.py +54 -12
  93. pixeltable/metadata/notes.py +1 -0
  94. pixeltable/metadata/schema.py +40 -21
  95. pixeltable/plan.py +149 -165
  96. pixeltable/py.typed +0 -0
  97. pixeltable/store.py +57 -37
  98. pixeltable/tool/create_test_db_dump.py +6 -6
  99. pixeltable/tool/create_test_video.py +1 -1
  100. pixeltable/tool/doc_plugins/griffe.py +3 -34
  101. pixeltable/tool/embed_udf.py +1 -1
  102. pixeltable/tool/mypy_plugin.py +55 -0
  103. pixeltable/type_system.py +260 -61
  104. pixeltable/utils/arrow.py +10 -9
  105. pixeltable/utils/coco.py +4 -4
  106. pixeltable/utils/documents.py +16 -2
  107. pixeltable/utils/filecache.py +9 -9
  108. pixeltable/utils/formatter.py +10 -11
  109. pixeltable/utils/http_server.py +2 -5
  110. pixeltable/utils/media_store.py +6 -6
  111. pixeltable/utils/pytorch.py +10 -11
  112. pixeltable/utils/sql.py +2 -1
  113. {pixeltable-0.2.20.dist-info → pixeltable-0.2.22.dist-info}/METADATA +50 -13
  114. pixeltable-0.2.22.dist-info/RECORD +153 -0
  115. pixeltable/exec/media_validation_node.py +0 -43
  116. pixeltable/utils/help.py +0 -11
  117. pixeltable-0.2.20.dist-info/RECORD +0 -147
  118. {pixeltable-0.2.20.dist-info → pixeltable-0.2.22.dist-info}/LICENSE +0 -0
  119. {pixeltable-0.2.20.dist-info → pixeltable-0.2.22.dist-info}/WHEEL +0 -0
  120. {pixeltable-0.2.20.dist-info → pixeltable-0.2.22.dist-info}/entry_points.txt +0 -0
pixeltable/plan.py CHANGED
@@ -1,5 +1,4 @@
1
- import itertools
2
- from typing import Any, Iterable, Optional, Sequence
1
+ from typing import Any, Iterable, Optional, Sequence, cast
3
2
  from uuid import UUID
4
3
 
5
4
  import sqlalchemy as sql
@@ -9,6 +8,7 @@ import pixeltable.exec as exec
9
8
  from pixeltable import catalog
10
9
  from pixeltable import exceptions as excs
11
10
  from pixeltable import exprs
11
+ from pixeltable.exec.sql_node import OrderByItem, OrderByClause, combine_order_by_clauses, print_order_by_clause
12
12
 
13
13
 
14
14
  def _is_agg_fn_call(e: exprs.Expr) -> bool:
@@ -46,11 +46,9 @@ class Analyzer:
46
46
  tbl: catalog.TableVersionPath
47
47
  all_exprs: list[exprs.Expr]
48
48
  select_list: list[exprs.Expr]
49
- group_by_clause: list[exprs.Expr]
50
- order_by_clause: list[tuple[exprs.Expr, bool]]
51
-
52
- # exprs that can be expressed in SQL and are retrieved directly from the store
53
- #sql_exprs: list[exprs.Expr]
49
+ group_by_clause: Optional[list[exprs.Expr]] # None for non-aggregate queries; [] for agg query w/o grouping
50
+ grouping_exprs: list[exprs.Expr] # [] for non-aggregate queries or agg query w/o grouping
51
+ order_by_clause: OrderByClause
54
52
 
55
53
  sql_elements: exprs.SqlElementCache
56
54
 
@@ -60,15 +58,14 @@ class Analyzer:
60
58
  # filter predicate applied to output rows of the SQL scan
61
59
  filter: Optional[exprs.Expr]
62
60
 
63
- agg_fn_calls: list[exprs.FunctionCall]
61
+ agg_fn_calls: list[exprs.FunctionCall] # grouping aggregation (ie, not window functions)
62
+ window_fn_calls: list[exprs.FunctionCall]
64
63
  agg_order_by: list[exprs.Expr]
65
64
 
66
65
  def __init__(
67
66
  self, tbl: catalog.TableVersionPath, select_list: Sequence[exprs.Expr],
68
67
  where_clause: Optional[exprs.Expr] = None, group_by_clause: Optional[list[exprs.Expr]] = None,
69
68
  order_by_clause: Optional[list[tuple[exprs.Expr, bool]]] = None):
70
- if group_by_clause is None:
71
- group_by_clause = []
72
69
  if order_by_clause is None:
73
70
  order_by_clause = []
74
71
  self.tbl = tbl
@@ -78,8 +75,10 @@ class Analyzer:
78
75
  self.select_list = [e.resolve_computed_cols() for e in select_list]
79
76
  if where_clause is not None:
80
77
  where_clause = where_clause.resolve_computed_cols()
81
- self.group_by_clause = [e.resolve_computed_cols() for e in group_by_clause]
82
- self.order_by_clause = [(e.resolve_computed_cols(), asc) for e, asc in order_by_clause]
78
+ self.group_by_clause = (
79
+ [e.resolve_computed_cols() for e in group_by_clause] if group_by_clause is not None else None
80
+ )
81
+ self.order_by_clause = [OrderByItem(e.resolve_computed_cols(), asc) for e, asc in order_by_clause]
83
82
 
84
83
  self.sql_where_clause = None
85
84
  self.filter = None
@@ -89,20 +88,36 @@ class Analyzer:
89
88
 
90
89
  # all exprs that are evaluated in Python; not executable
91
90
  self.all_exprs = self.select_list.copy()
92
- self.all_exprs.extend(self.group_by_clause)
91
+ if self.group_by_clause is not None:
92
+ self.all_exprs.extend(self.group_by_clause)
93
93
  self.all_exprs.extend(e for e, _ in self.order_by_clause)
94
94
  if self.filter is not None:
95
95
  self.all_exprs.append(self.filter)
96
96
 
97
97
  self.agg_order_by = []
98
+ self.agg_fn_calls = []
99
+ self.window_fn_calls = []
98
100
  self._analyze_agg()
101
+ self.grouping_exprs = self.group_by_clause if self.group_by_clause is not None else []
99
102
 
100
103
  def _analyze_agg(self) -> None:
101
104
  """Check semantic correctness of aggregation and fill in agg-specific fields of Analyzer"""
102
- self.agg_fn_calls = [e for e in self.all_exprs if isinstance(e, exprs.FunctionCall) and _is_agg_fn_call(e)]
105
+ candidates = self.select_list
106
+ agg_fn_calls = exprs.ExprSet(
107
+ exprs.Expr.list_subexprs(
108
+ candidates, expr_class=exprs.FunctionCall,
109
+ filter=lambda e: bool(e.is_agg_fn_call and not e.is_window_fn_call)))
110
+ self.agg_fn_calls = list(agg_fn_calls)
111
+ window_fn_calls = exprs.ExprSet(
112
+ exprs.Expr.list_subexprs(
113
+ candidates, expr_class=exprs.FunctionCall, filter=lambda e: bool(e.is_window_fn_call)))
114
+ self.window_fn_calls = list(window_fn_calls)
103
115
  if len(self.agg_fn_calls) == 0:
104
116
  # nothing to do
105
117
  return
118
+ # if we're doing grouping aggregation and don't have an explicit Group By clause, we're creating a single group
119
+ if self.group_by_clause is None:
120
+ self.group_by_clause = []
106
121
 
107
122
  # check that select list only contains aggregate output
108
123
  grouping_expr_ids = {e.id for e in self.group_by_clause}
@@ -113,8 +128,7 @@ class Analyzer:
113
128
 
114
129
  # check that filter doesn't contain aggregates
115
130
  if self.filter is not None:
116
- agg_fn_calls = [e for e in self.filter.subexprs(expr_class=exprs.FunctionCall, filter=lambda e: _is_agg_fn_call(e))]
117
- if len(agg_fn_calls) > 0:
131
+ if any(_is_agg_fn_call(e) for e in self.filter.subexprs(expr_class=exprs.FunctionCall)):
118
132
  raise excs.Error(f'Filter cannot contain aggregate functions: {self.filter}')
119
133
 
120
134
  # check that grouping exprs don't contain aggregates and can be expressed as SQL (we perform sort-based
@@ -125,27 +139,6 @@ class Analyzer:
125
139
  if e._contains(filter=lambda e: _is_agg_fn_call(e)):
126
140
  raise excs.Error(f'Grouping expression contains aggregate function: {e}')
127
141
 
128
- # check that agg fn calls don't have contradicting ordering requirements
129
- order_by: list[exprs.Expr] = []
130
- order_by_origin: Optional[exprs.Expr] = None # the expr that determines the ordering
131
- for agg_fn_call in self.agg_fn_calls:
132
- fn_call_order_by = agg_fn_call.get_agg_order_by()
133
- if len(fn_call_order_by) == 0:
134
- continue
135
- if len(order_by) == 0:
136
- order_by = fn_call_order_by
137
- order_by_origin = agg_fn_call
138
- else:
139
- combined = _get_combined_ordering(
140
- [(e, True) for e in order_by], [(e, True) for e in fn_call_order_by])
141
- if len(combined) == 0:
142
- raise excs.Error((
143
- f"Incompatible ordering requirements between expressions '{order_by_origin}' and "
144
- f"'{agg_fn_call}':\n"
145
- f"{exprs.Expr.print_list(order_by)} vs {exprs.Expr.print_list(fn_call_order_by)}"
146
- ))
147
- self.agg_order_by = order_by
148
-
149
142
  def _determine_agg_status(self, e: exprs.Expr, grouping_expr_ids: set[int]) -> tuple[bool, bool]:
150
143
  """Determine whether expr is the input to or output of an aggregate function.
151
144
  Returns:
@@ -175,14 +168,14 @@ class Analyzer:
175
168
  raise excs.Error(f'Invalid expression, mixes aggregate with non-aggregate: {e}')
176
169
  return is_output, is_input
177
170
 
178
-
179
171
  def finalize(self, row_builder: exprs.RowBuilder) -> None:
180
172
  """Make all exprs executable
181
173
  TODO: add EvalCtx for each expr list?
182
174
  """
183
175
  # maintain original composition of select list
184
176
  row_builder.set_slot_idxs(self.select_list, remove_duplicates=False)
185
- row_builder.set_slot_idxs(self.group_by_clause)
177
+ if self.group_by_clause is not None:
178
+ row_builder.set_slot_idxs(self.group_by_clause)
186
179
  order_by_exprs = [e for e, _ in self.order_by_clause]
187
180
  row_builder.set_slot_idxs(order_by_exprs)
188
181
  row_builder.set_slot_idxs(self.all_exprs)
@@ -191,6 +184,19 @@ class Analyzer:
191
184
  row_builder.set_slot_idxs(self.agg_fn_calls)
192
185
  row_builder.set_slot_idxs(self.agg_order_by)
193
186
 
187
+ def get_window_fn_ob_clause(self) -> Optional[OrderByClause]:
188
+ clause: list[OrderByClause] = []
189
+ for fn_call in self.window_fn_calls:
190
+ # window functions require ordering by the group_by/order_by clauses
191
+ group_by_exprs, order_by_exprs = fn_call.get_window_sort_exprs()
192
+ clause.append(
193
+ [OrderByItem(e, None) for e in group_by_exprs] + [OrderByItem(e, True) for e in order_by_exprs])
194
+ return combine_order_by_clauses(clause)
195
+
196
+ def has_agg(self) -> bool:
197
+ """True if there is any kind of aggregation in the query"""
198
+ return self.group_by_clause is not None or len(self.agg_fn_calls) > 0 or len(self.window_fn_calls) > 0
199
+
194
200
 
195
201
  class Planner:
196
202
  # TODO: create an exec.CountNode and change this to create_count_plan()
@@ -219,27 +225,28 @@ class Planner:
219
225
  assert not tbl.is_view()
220
226
  # stored_cols: all cols we need to store, incl computed cols (and indices)
221
227
  stored_cols = [c for c in tbl.cols if c.is_stored]
222
- assert len(stored_cols) > 0
223
-
228
+ assert len(stored_cols) > 0 # there needs to be something to store
224
229
  row_builder = exprs.RowBuilder([], stored_cols, [])
225
230
 
226
231
  # create InMemoryDataNode for 'rows'
227
- stored_col_info = row_builder.output_slot_idxs()
228
- stored_img_col_info = [info for info in stored_col_info if info.col.col_type.is_image_type()]
229
- input_col_info = [info for info in stored_col_info if not info.col.is_computed]
230
232
  plan: exec.ExecNode = exec.InMemoryDataNode(tbl, rows, row_builder, tbl.next_rowid)
231
233
 
232
- media_input_cols = [info for info in input_col_info if info.col.col_type.is_media_type()]
233
- if len(media_input_cols) > 0:
234
- # prefetch external files for all input column refs for validation
235
- plan = exec.CachePrefetchNode(tbl.id, media_input_cols, input=plan)
236
- plan = exec.MediaValidationNode(row_builder, media_input_cols, input=plan)
234
+ media_input_col_info = [
235
+ exprs.ColumnSlotIdx(col_ref.col, col_ref.slot_idx)
236
+ for col_ref in row_builder.input_exprs
237
+ if isinstance(col_ref, exprs.ColumnRef) and col_ref.col_type.is_media_type()
238
+ ]
239
+ if len(media_input_col_info) > 0:
240
+ # prefetch external files for all input column refs
241
+ plan = exec.CachePrefetchNode(tbl.id, media_input_col_info, input=plan)
237
242
 
238
- computed_exprs = [e for e in row_builder.default_eval_ctx.target_exprs if not isinstance(e, exprs.ColumnRef)]
243
+ computed_exprs = row_builder.output_exprs - row_builder.input_exprs
239
244
  if len(computed_exprs) > 0:
240
245
  # add an ExprEvalNode when there are exprs to compute
241
246
  plan = exec.ExprEvalNode(row_builder, computed_exprs, plan.output_exprs, input=plan)
242
247
 
248
+ stored_col_info = row_builder.output_slot_idxs()
249
+ stored_img_col_info = [info for info in stored_col_info if info.col.col_type.is_image_type()]
243
250
  plan.set_stored_img_cols(stored_img_col_info)
244
251
  plan.set_ctx(
245
252
  exec.ExecContext(
@@ -507,93 +514,35 @@ class Planner:
507
514
  return plan, len(row_builder.default_eval_ctx.target_exprs)
508
515
 
509
516
  @classmethod
510
- def _determine_ordering(cls, analyzer: Analyzer) -> list[tuple[exprs.Expr, bool]]:
511
- """Returns the exprs for the ORDER BY clause of the SqlScanNode"""
512
- order_by_items: list[tuple[exprs.Expr, Optional[bool]]] = []
513
- order_by_origin: Optional[exprs.Expr] = None # the expr that determines the ordering
514
-
515
-
516
- # window functions require ordering by the group_by/order_by clauses
517
- window_fn_calls = [
518
- e for e in analyzer.all_exprs if isinstance(e, exprs.FunctionCall) and e.is_window_fn_call
519
- ]
520
- if len(window_fn_calls) > 0:
521
- for fn_call in window_fn_calls:
517
+ def _verify_ordering(cls, analyzer: Analyzer, verify_agg: bool) -> None:
518
+ """Verify that the various ordering requirements don't conflict"""
519
+ ob_clauses: list[OrderByClause] = [analyzer.order_by_clause.copy()]
520
+
521
+ if verify_agg:
522
+ ordering: OrderByClause
523
+ for fn_call in analyzer.window_fn_calls:
524
+ # window functions require ordering by the group_by/order_by clauses
522
525
  gb, ob = fn_call.get_window_sort_exprs()
523
- # for now, the ordering is implicitly ascending
524
- fn_call_ordering = [(e, None) for e in gb] + [(e, True) for e in ob]
525
- if len(order_by_items) == 0:
526
- order_by_items = fn_call_ordering
527
- order_by_origin = fn_call
528
- else:
529
- # check for compatibility
530
- other_order_by_clauses = fn_call_ordering
531
- combined = _get_combined_ordering(order_by_items, other_order_by_clauses)
532
- if len(combined) == 0:
533
- raise excs.Error((
534
- f"Incompatible ordering requirements between expressions '{order_by_origin}' and "
535
- f"'{fn_call}':\n"
536
- f"{exprs.Expr.print_list(order_by_items)} vs {exprs.Expr.print_list(other_order_by_clauses)}"
537
- ))
538
- order_by_items = combined
539
-
540
- if len(analyzer.group_by_clause) > 0:
541
- agg_ordering = [(e, None) for e in analyzer.group_by_clause] + [(e, True) for e in analyzer.agg_order_by]
542
- if len(order_by_items) > 0:
543
- # check for compatibility
544
- combined = _get_combined_ordering(order_by_items, agg_ordering)
545
- if len(combined) == 0:
546
- raise excs.Error((
547
- f"Incompatible ordering requirements between expressions '{order_by_origin}' and "
548
- f"grouping expressions:\n"
549
- f"{exprs.Expr.print_list([e for e, _ in order_by_items])} vs "
550
- f"{exprs.Expr.print_list([e for e, _ in agg_ordering])}"
551
- ))
552
- order_by_items = combined
553
- else:
554
- order_by_items = agg_ordering
526
+ ordering = [OrderByItem(e, None) for e in gb] + [OrderByItem(e, True) for e in ob]
527
+ ob_clauses.append(ordering)
528
+ for fn_call in analyzer.agg_fn_calls:
529
+ # agg functions with an ordering requirement are implicitly ascending
530
+ ordering = (
531
+ [OrderByItem(e, None) for e in analyzer.group_by_clause]
532
+ + [OrderByItem(e, True) for e in fn_call.get_agg_order_by()]
533
+ )
534
+ ob_clauses.append(ordering)
535
+ if len(ob_clauses) <= 1:
536
+ return
555
537
 
556
- if len(analyzer.order_by_clause) > 0:
557
- if len(order_by_items) > 0:
558
- # check for compatibility
559
- combined = _get_combined_ordering(order_by_items, analyzer.order_by_clause)
560
- if len(combined) == 0:
561
- raise excs.Error((
562
- f"Incompatible ordering requirements between expressions '{order_by_origin}' and "
563
- f"order-by expressions:\n"
564
- f"{exprs.Expr.print_list([e for e, _ in order_by_items])} vs "
565
- f"{exprs.Expr.print_list([e for e, _ in analyzer.order_by_clause])}"
566
- ))
567
- order_by_items = combined
568
- else:
569
- order_by_items = analyzer.order_by_clause
570
-
571
- # TODO: can this be unified with the same logic in RowBuilder
572
- def refs_unstored_iter_col(e: exprs.Expr) -> bool:
573
- if not isinstance(e, exprs.ColumnRef):
574
- return False
575
- tbl = e.col.tbl
576
- return tbl.is_component_view() and tbl.is_iterator_column(e.col) and not e.col.is_stored
577
- unstored_iter_col_refs = list(exprs.Expr.list_subexprs(analyzer.all_exprs, expr_class=exprs.ColumnRef, filter=refs_unstored_iter_col))
578
- if len(unstored_iter_col_refs) > 0 and len(order_by_items) == 0:
579
- # we don't already have a user-requested ordering and we access unstored iterator columns:
580
- # order by the primary key of the component view, which minimizes the number of iterator instantiations
581
- component_views = {e.col.tbl for e in unstored_iter_col_refs}
582
- # TODO: generalize this to multi-level iteration
583
- assert len(component_views) == 1
584
- component_view = list(component_views)[0]
585
- order_by_items = [
586
- (exprs.RowidRef(component_view, idx), None)
587
- for idx in range(len(component_view.store_tbl.rowid_columns()))
588
- ]
589
- order_by_origin = unstored_iter_col_refs[0]
590
-
591
- for e in [e for e, _ in order_by_items]:
592
- if not analyzer.sql_elements.contains(e):
593
- raise excs.Error(f'order_by element cannot be expressed in SQL: {e}')
594
- # we do ascending ordering by default, if not specified otherwise
595
- order_by_items = [(e, True) if asc is None else (e, asc) for e, asc in order_by_items]
596
- return order_by_items
538
+ combined_ordering = ob_clauses[0]
539
+ for ordering in ob_clauses[1:]:
540
+ combined = combine_order_by_clauses([combined_ordering, ordering])
541
+ if combined is None:
542
+ raise excs.Error(
543
+ f'Incompatible ordering requirements: '
544
+ f'{print_order_by_clause(combined_ordering)} vs {print_order_by_clause(ordering)}')
545
+ combined_ordering = combined
597
546
 
598
547
  @classmethod
599
548
  def _is_contained_in(cls, l1: Iterable[exprs.Expr], l2: Iterable[exprs.Expr]) -> bool:
@@ -632,8 +581,6 @@ class Planner:
632
581
  """
633
582
  if select_list is None:
634
583
  select_list = []
635
- if group_by_clause is None:
636
- group_by_clause = []
637
584
  if order_by_clause is None:
638
585
  order_by_clause = []
639
586
  if exact_version_only is None:
@@ -641,16 +588,12 @@ class Planner:
641
588
  analyzer = Analyzer(
642
589
  tbl, select_list, where_clause=where_clause, group_by_clause=group_by_clause,
643
590
  order_by_clause=order_by_clause)
644
- input_exprs = exprs.ExprSet(exprs.Expr.list_subexprs(
645
- analyzer.all_exprs, filter=analyzer.sql_elements.contains, traverse_matches=False))
646
- # remove Literals from sql_exprs, we don't want to materialize them via a Select
647
- input_exprs = exprs.ExprSet(e for e in input_exprs if not isinstance(e, exprs.Literal))
648
- row_builder = exprs.RowBuilder(analyzer.all_exprs, [], input_exprs)
591
+ row_builder = exprs.RowBuilder(analyzer.all_exprs, [], [])
649
592
 
650
593
  analyzer.finalize(row_builder)
651
594
  # select_list: we need to materialize everything that's been collected
652
595
  # with_pk: for now, we always retrieve the PK, because we need it for the file cache
653
- eval_ctx = row_builder.create_eval_ctx(analyzer.all_exprs)
596
+ eval_ctx = row_builder.create_eval_ctx(analyzer.select_list)
654
597
  plan = cls._create_query_plan(
655
598
  tbl, row_builder, analyzer=analyzer, eval_ctx=eval_ctx, limit=limit, with_pk=True,
656
599
  exact_version_only=exact_version_only)
@@ -677,48 +620,89 @@ class Planner:
677
620
  if exact_version_only is None:
678
621
  exact_version_only = []
679
622
  assert isinstance(tbl, catalog.TableVersionPath)
680
- is_agg_query = len(analyzer.group_by_clause) > 0 or len(analyzer.agg_fn_calls) > 0
623
+ sql_elements = analyzer.sql_elements
624
+ is_python_agg = (
625
+ not sql_elements.contains_all(analyzer.agg_fn_calls)
626
+ or not sql_elements.contains_all(analyzer.window_fn_calls)
627
+ )
681
628
  ctx = exec.ExecContext(row_builder)
629
+ cls._verify_ordering(analyzer, verify_agg=is_python_agg)
630
+
631
+ # materialized with SQL scan:
632
+ # - select list subexprs that aren't aggregates
633
+ # - Where clause conjuncts that can't be run in SQL
634
+ # - all grouping exprs, if any aggregate function call can't be run in SQL (in that case, they all have to be
635
+ # run in Python)
636
+ candidates = list(exprs.Expr.list_subexprs(
637
+ analyzer.select_list,
638
+ filter=lambda e: (
639
+ sql_elements.contains(e)
640
+ and not e._contains(cls=exprs.FunctionCall, filter=lambda e: bool(e.is_agg_fn_call))
641
+ ),
642
+ traverse_matches=False))
643
+ if analyzer.filter is not None:
644
+ candidates.extend(exprs.Expr.subexprs(
645
+ analyzer.filter, filter=lambda e: sql_elements.contains(e), traverse_matches=False))
646
+ if is_python_agg and analyzer.group_by_clause is not None:
647
+ candidates.extend(exprs.Expr.list_subexprs(
648
+ analyzer.group_by_clause, filter=lambda e: sql_elements.contains(e), traverse_matches=False))
649
+ # not isinstance(...): we don't want to materialize Literals via a Select
650
+ sql_scan_exprs = exprs.ExprSet(e for e in candidates if not isinstance(e, exprs.Literal))
682
651
 
683
- order_by_items = cls._determine_ordering(analyzer)
684
- sql_limit = 0 if is_agg_query else limit # if we're aggregating, the limit applies to the agg output
685
- sql_exprs = [
686
- e for e in eval_ctx.exprs if analyzer.sql_elements.contains(e) and not isinstance(e, exprs.Literal)
687
- ]
688
652
  plan = exec.SqlScanNode(
689
- tbl, row_builder, select_list=sql_exprs, where_clause=analyzer.sql_where_clause,
690
- filter=analyzer.filter, order_by_items=order_by_items,
691
- limit=sql_limit, set_pk=with_pk, exact_version_only=exact_version_only)
653
+ tbl, row_builder, select_list=sql_scan_exprs, where_clause=analyzer.sql_where_clause,
654
+ filter=analyzer.filter, set_pk=with_pk, exact_version_only=exact_version_only)
655
+ if len(analyzer.window_fn_calls) > 0:
656
+ # we need to order the input for window functions
657
+ plan.add_order_by(analyzer.get_window_fn_ob_clause())
692
658
  plan = cls._insert_prefetch_node(tbl.tbl_version.id, analyzer.select_list, row_builder, plan)
693
659
 
694
- if len(analyzer.group_by_clause) > 0 or len(analyzer.agg_fn_calls) > 0:
695
- # we're doing aggregation; the input of the AggregateNode are the grouping exprs plus the
660
+ if analyzer.group_by_clause is not None:
661
+ # we're doing grouping aggregation; the input of the AggregateNode are the grouping exprs plus the
696
662
  # args of the agg fn calls
697
- agg_input = exprs.ExprSet(analyzer.group_by_clause.copy())
663
+ agg_input = exprs.ExprSet(analyzer.grouping_exprs.copy())
698
664
  for fn_call in analyzer.agg_fn_calls:
699
665
  agg_input.update(fn_call.components)
700
- if not exprs.ExprSet(sql_exprs).issuperset(agg_input):
666
+ if not sql_scan_exprs.issuperset(agg_input):
701
667
  # we need an ExprEvalNode
702
- plan = exec.ExprEvalNode(row_builder, agg_input, sql_exprs, input=plan)
668
+ plan = exec.ExprEvalNode(row_builder, agg_input, sql_scan_exprs, input=plan)
703
669
 
704
670
  # batch size for aggregation input: this could be the entire table, so we need to divide it into
705
671
  # smaller batches; at the same time, we need to make the batches large enough to amortize the
706
672
  # function call overhead
707
673
  ctx.batch_size = 16
708
674
 
709
- plan = exec.AggregationNode(
710
- tbl.tbl_version, row_builder, analyzer.group_by_clause, analyzer.agg_fn_calls, agg_input, input=plan)
711
- agg_output = exprs.ExprSet(itertools.chain(analyzer.group_by_clause, analyzer.agg_fn_calls))
712
- if not agg_output.issuperset(exprs.ExprSet(eval_ctx.target_exprs)):
713
- # we need an ExprEvalNode to evaluate the remaining output exprs
714
- plan = exec.ExprEvalNode(row_builder, eval_ctx.target_exprs, agg_output, input=plan)
675
+ # do aggregation in SQL if all agg exprs can be translated
676
+ if (sql_elements.contains_all(analyzer.select_list)
677
+ and sql_elements.contains_all(analyzer.grouping_exprs)
678
+ and isinstance(plan, exec.SqlNode)
679
+ and plan.to_cte() is not None):
680
+ plan = exec.SqlAggregationNode(
681
+ row_builder, input=plan, select_list=analyzer.select_list, group_by_items=analyzer.group_by_clause)
682
+ else:
683
+ plan = exec.AggregationNode(
684
+ tbl.tbl_version, row_builder, analyzer.group_by_clause,
685
+ analyzer.agg_fn_calls + analyzer.window_fn_calls, agg_input, input=plan)
686
+ typecheck_dummy = analyzer.grouping_exprs + analyzer.agg_fn_calls + analyzer.window_fn_calls
687
+ agg_output = exprs.ExprSet(typecheck_dummy)
688
+ if not agg_output.issuperset(exprs.ExprSet(eval_ctx.target_exprs)):
689
+ # we need an ExprEvalNode to evaluate the remaining output exprs
690
+ plan = exec.ExprEvalNode(row_builder, eval_ctx.target_exprs, agg_output, input=plan)
715
691
  else:
716
- if not exprs.ExprSet(sql_exprs).issuperset(exprs.ExprSet(eval_ctx.target_exprs)):
692
+ if not exprs.ExprSet(sql_scan_exprs).issuperset(exprs.ExprSet(eval_ctx.target_exprs)):
717
693
  # we need an ExprEvalNode to evaluate the remaining output exprs
718
- plan = exec.ExprEvalNode(row_builder, eval_ctx.target_exprs, sql_exprs, input=plan)
694
+ plan = exec.ExprEvalNode(row_builder, eval_ctx.target_exprs, sql_scan_exprs, input=plan)
719
695
  # we're returning everything to the user, so we might as well do it in a single batch
720
696
  ctx.batch_size = 0
721
697
 
698
+ sql_node = plan.get_sql_node()
699
+ assert sql_node is not None
700
+ if len(analyzer.order_by_clause) > 0:
701
+ sql_node.add_order_by(analyzer.order_by_clause)
702
+
703
+ if limit is not None:
704
+ plan.set_limit(limit)
705
+
722
706
  plan.set_ctx(ctx)
723
707
  return plan
724
708
 
pixeltable/py.typed ADDED
File without changes