pixeltable 0.3.14__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pixeltable might be problematic. Click here for more details.

Files changed (79)
  1. pixeltable/__init__.py +1 -1
  2. pixeltable/__version__.py +2 -2
  3. pixeltable/catalog/__init__.py +9 -1
  4. pixeltable/catalog/catalog.py +559 -134
  5. pixeltable/catalog/column.py +36 -32
  6. pixeltable/catalog/dir.py +1 -2
  7. pixeltable/catalog/globals.py +12 -0
  8. pixeltable/catalog/insertable_table.py +30 -25
  9. pixeltable/catalog/schema_object.py +9 -6
  10. pixeltable/catalog/table.py +334 -267
  11. pixeltable/catalog/table_version.py +360 -241
  12. pixeltable/catalog/table_version_handle.py +18 -2
  13. pixeltable/catalog/table_version_path.py +86 -23
  14. pixeltable/catalog/view.py +47 -23
  15. pixeltable/dataframe.py +198 -19
  16. pixeltable/env.py +6 -4
  17. pixeltable/exceptions.py +6 -0
  18. pixeltable/exec/__init__.py +1 -1
  19. pixeltable/exec/exec_node.py +2 -0
  20. pixeltable/exec/expr_eval/evaluators.py +4 -1
  21. pixeltable/exec/expr_eval/expr_eval_node.py +4 -4
  22. pixeltable/exec/in_memory_data_node.py +1 -1
  23. pixeltable/exec/sql_node.py +188 -22
  24. pixeltable/exprs/column_property_ref.py +16 -6
  25. pixeltable/exprs/column_ref.py +33 -11
  26. pixeltable/exprs/comparison.py +1 -1
  27. pixeltable/exprs/data_row.py +5 -3
  28. pixeltable/exprs/expr.py +11 -4
  29. pixeltable/exprs/literal.py +2 -0
  30. pixeltable/exprs/row_builder.py +4 -6
  31. pixeltable/exprs/rowid_ref.py +8 -0
  32. pixeltable/exprs/similarity_expr.py +1 -0
  33. pixeltable/func/__init__.py +1 -0
  34. pixeltable/func/mcp.py +74 -0
  35. pixeltable/func/query_template_function.py +5 -3
  36. pixeltable/func/tools.py +12 -2
  37. pixeltable/func/udf.py +2 -2
  38. pixeltable/functions/__init__.py +1 -0
  39. pixeltable/functions/anthropic.py +19 -45
  40. pixeltable/functions/deepseek.py +19 -38
  41. pixeltable/functions/fireworks.py +9 -18
  42. pixeltable/functions/gemini.py +165 -33
  43. pixeltable/functions/groq.py +108 -0
  44. pixeltable/functions/llama_cpp.py +6 -6
  45. pixeltable/functions/math.py +63 -0
  46. pixeltable/functions/mistralai.py +16 -53
  47. pixeltable/functions/ollama.py +1 -1
  48. pixeltable/functions/openai.py +82 -165
  49. pixeltable/functions/string.py +212 -58
  50. pixeltable/functions/together.py +22 -80
  51. pixeltable/globals.py +10 -4
  52. pixeltable/index/base.py +5 -0
  53. pixeltable/index/btree.py +5 -0
  54. pixeltable/index/embedding_index.py +5 -0
  55. pixeltable/io/external_store.py +10 -31
  56. pixeltable/io/label_studio.py +5 -5
  57. pixeltable/io/parquet.py +4 -4
  58. pixeltable/io/table_data_conduit.py +1 -32
  59. pixeltable/metadata/__init__.py +11 -2
  60. pixeltable/metadata/converters/convert_13.py +2 -2
  61. pixeltable/metadata/converters/convert_30.py +6 -11
  62. pixeltable/metadata/converters/convert_35.py +9 -0
  63. pixeltable/metadata/converters/convert_36.py +38 -0
  64. pixeltable/metadata/converters/convert_37.py +15 -0
  65. pixeltable/metadata/converters/util.py +3 -9
  66. pixeltable/metadata/notes.py +3 -0
  67. pixeltable/metadata/schema.py +13 -1
  68. pixeltable/plan.py +135 -12
  69. pixeltable/share/packager.py +321 -20
  70. pixeltable/share/publish.py +2 -2
  71. pixeltable/store.py +31 -13
  72. pixeltable/type_system.py +30 -0
  73. pixeltable/utils/dbms.py +1 -1
  74. pixeltable/utils/formatter.py +64 -42
  75. {pixeltable-0.3.14.dist-info → pixeltable-0.4.0.dist-info}/METADATA +2 -1
  76. {pixeltable-0.3.14.dist-info → pixeltable-0.4.0.dist-info}/RECORD +79 -74
  77. {pixeltable-0.3.14.dist-info → pixeltable-0.4.0.dist-info}/LICENSE +0 -0
  78. {pixeltable-0.3.14.dist-info → pixeltable-0.4.0.dist-info}/WHEEL +0 -0
  79. {pixeltable-0.3.14.dist-info → pixeltable-0.4.0.dist-info}/entry_points.txt +0 -0
pixeltable/plan.py CHANGED
@@ -75,6 +75,88 @@ class FromClause:
75
75
  tbls: list[catalog.TableVersionPath]
76
76
  join_clauses: list[JoinClause] = dataclasses.field(default_factory=list)
77
77
 
78
+ @property
79
+ def _first_tbl(self) -> catalog.TableVersionPath:
80
+ assert len(self.tbls) == 1
81
+ return self.tbls[0]
82
+
83
+
84
+ @dataclasses.dataclass
85
+ class SampleClause:
86
+ """Defines a sampling clause for a table."""
87
+
88
+ version: Optional[int]
89
+ n: Optional[int]
90
+ n_per_stratum: Optional[int]
91
+ fraction: Optional[float]
92
+ seed: Optional[int]
93
+ stratify_exprs: Optional[list[exprs.Expr]]
94
+
95
+ # This seed value is used if one is not supplied
96
+ DEFAULT_SEED = 0
97
+
98
+ # The version of the hashing algorithm used for ordering and fractional sampling.
99
+ CURRENT_VERSION = 1
100
+
101
+ def __post_init__(self) -> None:
102
+ """If no version was provided, provide the default version"""
103
+ if self.version is None:
104
+ self.version = self.CURRENT_VERSION
105
+ if self.seed is None:
106
+ self.seed = self.DEFAULT_SEED
107
+
108
+ @property
109
+ def is_stratified(self) -> bool:
110
+ """Check if the sampling is stratified"""
111
+ return self.stratify_exprs is not None and len(self.stratify_exprs) > 0
112
+
113
+ @property
114
+ def is_repeatable(self) -> bool:
115
+ """Return true if the same rows will continue to be sampled if source rows are added or deleted."""
116
+ return not self.is_stratified and self.fraction is not None
117
+
118
+ def display_str(self, inline: bool = False) -> str:
119
+ return str(self)
120
+
121
+ def as_dict(self) -> dict:
122
+ """Return a dictionary representation of the object"""
123
+ d = dataclasses.asdict(self)
124
+ d['_classname'] = self.__class__.__name__
125
+ if self.is_stratified:
126
+ d['stratify_exprs'] = [e.as_dict() for e in self.stratify_exprs]
127
+ return d
128
+
129
+ @classmethod
130
+ def from_dict(cls, d: dict) -> SampleClause:
131
+ """Create a SampleClause from a dictionary representation"""
132
+ d_cleaned = {key: value for key, value in d.items() if key != '_classname'}
133
+ s = cls(**d_cleaned)
134
+ if s.is_stratified:
135
+ s.stratify_exprs = [exprs.Expr.from_dict(e) for e in d_cleaned.get('stratify_exprs', [])]
136
+ return s
137
+
138
+ def __repr__(self) -> str:
139
+ s = ','.join(e.display_str(inline=True) for e in self.stratify_exprs)
140
+ return (
141
+ f'sample_{self.version}(n={self.n}, n_per_stratum={self.n_per_stratum}, '
142
+ f'fraction={self.fraction}, seed={self.seed}, [{s}])'
143
+ )
144
+
145
+ @classmethod
146
+ def fraction_to_md5_hex(cls, fraction: float) -> str:
147
+ """Return the string representation of an approximation (to ~1e-9) of a fraction of the total space
148
+ of md5 hash values.
149
+ This is used for fractional sampling.
150
+ """
151
+ # Maximum count for the upper 32 bits of MD5: 2^32
152
+ max_md5_value = (2**32) - 1
153
+
154
+ # Calculate the fraction of this value
155
+ threshold_int = max_md5_value * int(1_000_000_000 * fraction) // 1_000_000_000
156
+
157
+ # Convert to hexadecimal string with padding
158
+ return format(threshold_int, '08x') + 'ffffffffffffffffffffffff'
159
+
78
160
 
79
161
  class Analyzer:
80
162
  """
@@ -87,6 +169,8 @@ class Analyzer:
87
169
  group_by_clause: Optional[list[exprs.Expr]] # None for non-aggregate queries; [] for agg query w/o grouping
88
170
  grouping_exprs: list[exprs.Expr] # [] for non-aggregate queries or agg query w/o grouping
89
171
  order_by_clause: OrderByClause
172
+ stratify_exprs: list[exprs.Expr] # [] if no stratiifcation is required
173
+ sample_clause: Optional[SampleClause] # None if no sampling clause is present
90
174
 
91
175
  sql_elements: exprs.SqlElementCache
92
176
 
@@ -107,6 +191,7 @@ class Analyzer:
107
191
  where_clause: Optional[exprs.Expr] = None,
108
192
  group_by_clause: Optional[list[exprs.Expr]] = None,
109
193
  order_by_clause: Optional[list[tuple[exprs.Expr, bool]]] = None,
194
+ sample_clause: Optional[SampleClause] = None,
110
195
  ):
111
196
  if order_by_clause is None:
112
197
  order_by_clause = []
@@ -120,6 +205,11 @@ class Analyzer:
120
205
  self.group_by_clause = (
121
206
  [e.resolve_computed_cols() for e in group_by_clause] if group_by_clause is not None else None
122
207
  )
208
+ self.sample_clause = sample_clause
209
+ if self.sample_clause is not None and self.sample_clause.is_stratified:
210
+ self.stratify_exprs = [e.resolve_computed_cols() for e in sample_clause.stratify_exprs]
211
+ else:
212
+ self.stratify_exprs = []
123
213
  self.order_by_clause = [OrderByItem(e.resolve_computed_cols(), asc) for e, asc in order_by_clause]
124
214
 
125
215
  self.sql_where_clause = None
@@ -135,8 +225,11 @@ class Analyzer:
135
225
  self.all_exprs.append(join_clause.join_predicate)
136
226
  if self.group_by_clause is not None:
137
227
  self.all_exprs.extend(self.group_by_clause)
228
+ self.all_exprs.extend(self.stratify_exprs)
138
229
  self.all_exprs.extend(e for e, _ in self.order_by_clause)
139
230
  if self.filter is not None:
231
+ if sample_clause is not None:
232
+ raise excs.Error(f'Filter {self.filter} not expressible in SQL')
140
233
  self.all_exprs.append(self.filter)
141
234
 
142
235
  self.agg_order_by = []
@@ -260,7 +353,7 @@ class Planner:
260
353
  # TODO: create an exec.CountNode and change this to create_count_plan()
261
354
  @classmethod
262
355
  def create_count_stmt(cls, tbl: catalog.TableVersionPath, where_clause: Optional[exprs.Expr] = None) -> sql.Select:
263
- stmt = sql.select(sql.func.count())
356
+ stmt = sql.select(sql.func.count().label('all_count'))
264
357
  refd_tbl_ids: set[UUID] = set()
265
358
  if where_clause is not None:
266
359
  analyzer = cls.analyze(tbl, where_clause)
@@ -289,7 +382,7 @@ class Planner:
289
382
 
290
383
  # create InMemoryDataNode for 'rows'
291
384
  plan: exec.ExecNode = exec.InMemoryDataNode(
292
- TableVersionHandle(tbl.id, tbl.effective_version), rows, row_builder, tbl.next_rowid
385
+ TableVersionHandle(tbl.id, tbl.effective_version), rows, row_builder, tbl.next_row_id
293
386
  )
294
387
 
295
388
  media_input_col_info = [
@@ -322,6 +415,13 @@ class Planner:
322
415
  )
323
416
  return plan
324
417
 
418
+ @classmethod
419
+ def rowid_columns(cls, target: TableVersionHandle, num_rowid_cols: Optional[int] = None) -> list[exprs.Expr]:
420
+ """Return list of RowidRef for the given number of associated rowids"""
421
+ if num_rowid_cols is None:
422
+ num_rowid_cols = target.get().num_rowid_columns()
423
+ return [exprs.RowidRef(target, i) for i in range(num_rowid_cols)]
424
+
325
425
  @classmethod
326
426
  def create_df_insert_plan(
327
427
  cls, tbl: catalog.TableVersion, df: 'pxt.DataFrame', ignore_errors: bool
@@ -385,7 +485,7 @@ class Planner:
385
485
 
386
486
  cls.__check_valid_columns(tbl.tbl_version.get(), recomputed_cols, 'updated in')
387
487
 
388
- recomputed_base_cols = {col for col in recomputed_cols if col.tbl == tbl.tbl_version}
488
+ recomputed_base_cols = {col for col in recomputed_cols if col.tbl.id == tbl.tbl_version.id}
389
489
  copied_cols = [
390
490
  col
391
491
  for col in target.cols_by_id.values()
@@ -409,7 +509,7 @@ class Planner:
409
509
  for i, col in enumerate(all_base_cols):
410
510
  plan.row_builder.add_table_column(col, select_list[i].slot_idx)
411
511
  recomputed_user_cols = [c for c in recomputed_cols if c.name is not None]
412
- return plan, [f'{c.tbl.get().name}.{c.name}' for c in updated_cols + recomputed_user_cols], recomputed_user_cols
512
+ return plan, [f'{c.tbl.name}.{c.name}' for c in updated_cols + recomputed_user_cols], recomputed_user_cols
413
513
 
414
514
  @classmethod
415
515
  def __check_valid_columns(
@@ -465,7 +565,7 @@ class Planner:
465
565
  recomputed_cols.update(idx_val_cols)
466
566
  # we only need to recompute stored columns (unstored ones are substituted away)
467
567
  recomputed_cols = {c for c in recomputed_cols if c.is_stored}
468
- recomputed_base_cols = {col for col in recomputed_cols if col.tbl == target}
568
+ recomputed_base_cols = {col for col in recomputed_cols if col.tbl.id == target.id}
469
569
  copied_cols = [
470
570
  col
471
571
  for col in target.cols_by_id.values()
@@ -591,8 +691,13 @@ class Planner:
591
691
  # 2. for component views: iterator args
592
692
  iterator_args = [target.iterator_args] if target.iterator_args is not None else []
593
693
 
594
- row_builder = exprs.RowBuilder(iterator_args, stored_cols, [])
694
+ from_clause = FromClause(tbls=[view.base])
695
+ base_analyzer = Analyzer(
696
+ from_clause, iterator_args, where_clause=target.predicate, sample_clause=target.sample_clause
697
+ )
698
+ row_builder = exprs.RowBuilder(base_analyzer.all_exprs, stored_cols, [])
595
699
 
700
+ # if we're propagating an insert, we only want to see those base rows that were created for the current version
596
701
  # execution plan:
597
702
  # 1. materialize exprs computed from the base that are needed for stored view columns
598
703
  # 2. if it's an iterator view, expand the base rows into component rows
@@ -603,8 +708,11 @@ class Planner:
603
708
  for e in row_builder.default_eval_ctx.target_exprs
604
709
  if e.is_bound_by([view]) and not e.is_bound_by([view.base])
605
710
  ]
606
- # if we're propagating an insert, we only want to see those base rows that were created for the current version
607
- base_analyzer = Analyzer(FromClause(tbls=[view.base]), base_output_exprs, where_clause=target.predicate)
711
+
712
+ # Create a new analyzer reflecting exactly what is required from the base table
713
+ base_analyzer = Analyzer(
714
+ from_clause, base_output_exprs, where_clause=target.predicate, sample_clause=target.sample_clause
715
+ )
608
716
  base_eval_ctx = row_builder.create_eval_ctx(base_analyzer.all_exprs)
609
717
  plan = cls._create_query_plan(
610
718
  row_builder=row_builder,
@@ -701,6 +809,7 @@ class Planner:
701
809
  group_by_clause: Optional[list[exprs.Expr]] = None,
702
810
  order_by_clause: Optional[list[tuple[exprs.Expr, bool]]] = None,
703
811
  limit: Optional[exprs.Expr] = None,
812
+ sample_clause: Optional[SampleClause] = None,
704
813
  ignore_errors: bool = False,
705
814
  exact_version_only: Optional[list[catalog.TableVersionHandle]] = None,
706
815
  ) -> exec.ExecNode:
@@ -714,12 +823,14 @@ class Planner:
714
823
  order_by_clause = []
715
824
  if exact_version_only is None:
716
825
  exact_version_only = []
826
+
717
827
  analyzer = Analyzer(
718
828
  from_clause,
719
829
  select_list,
720
830
  where_clause=where_clause,
721
831
  group_by_clause=group_by_clause,
722
832
  order_by_clause=order_by_clause,
833
+ sample_clause=sample_clause,
723
834
  )
724
835
  row_builder = exprs.RowBuilder(analyzer.all_exprs, [], [])
725
836
 
@@ -773,6 +884,7 @@ class Planner:
773
884
  # - join clause subexprs
774
885
  # - subexprs of Where clause conjuncts that can't be run in SQL
775
886
  # - all grouping exprs
887
+ # - all stratify exprs
776
888
  candidates = list(
777
889
  exprs.Expr.list_subexprs(
778
890
  analyzer.select_list,
@@ -787,10 +899,12 @@ class Planner:
787
899
  candidates.extend(
788
900
  exprs.Expr.subexprs(analyzer.filter, filter=sql_elements.contains, traverse_matches=False)
789
901
  )
790
- if analyzer.group_by_clause is not None:
791
- candidates.extend(
792
- exprs.Expr.list_subexprs(analyzer.group_by_clause, filter=sql_elements.contains, traverse_matches=False)
793
- )
902
+ candidates.extend(
903
+ exprs.Expr.list_subexprs(analyzer.grouping_exprs, filter=sql_elements.contains, traverse_matches=False)
904
+ )
905
+ candidates.extend(
906
+ exprs.Expr.list_subexprs(analyzer.stratify_exprs, filter=sql_elements.contains, traverse_matches=False)
907
+ )
794
908
  # not isinstance(...): we don't want to materialize Literals via a Select
795
909
  sql_exprs = exprs.ExprSet(e for e in candidates if not isinstance(e, exprs.Literal))
796
910
 
@@ -835,6 +949,15 @@ class Planner:
835
949
  # we need to order the input for window functions
836
950
  plan.set_order_by(analyzer.get_window_fn_ob_clause())
837
951
 
952
+ if analyzer.sample_clause is not None:
953
+ plan = exec.SqlSampleNode(
954
+ row_builder,
955
+ input=plan,
956
+ select_list=tbl_scan_exprs,
957
+ sample_clause=analyzer.sample_clause,
958
+ stratify_exprs=analyzer.stratify_exprs,
959
+ )
960
+
838
961
  plan = cls._insert_prefetch_node(tbl.tbl_version.id, row_builder, plan)
839
962
 
840
963
  if analyzer.group_by_clause is not None: