pixeltable 0.3.14__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pixeltable might be problematic. Click here for more details.
- pixeltable/__init__.py +1 -1
- pixeltable/__version__.py +2 -2
- pixeltable/catalog/__init__.py +9 -1
- pixeltable/catalog/catalog.py +559 -134
- pixeltable/catalog/column.py +36 -32
- pixeltable/catalog/dir.py +1 -2
- pixeltable/catalog/globals.py +12 -0
- pixeltable/catalog/insertable_table.py +30 -25
- pixeltable/catalog/schema_object.py +9 -6
- pixeltable/catalog/table.py +334 -267
- pixeltable/catalog/table_version.py +360 -241
- pixeltable/catalog/table_version_handle.py +18 -2
- pixeltable/catalog/table_version_path.py +86 -23
- pixeltable/catalog/view.py +47 -23
- pixeltable/dataframe.py +198 -19
- pixeltable/env.py +6 -4
- pixeltable/exceptions.py +6 -0
- pixeltable/exec/__init__.py +1 -1
- pixeltable/exec/exec_node.py +2 -0
- pixeltable/exec/expr_eval/evaluators.py +4 -1
- pixeltable/exec/expr_eval/expr_eval_node.py +4 -4
- pixeltable/exec/in_memory_data_node.py +1 -1
- pixeltable/exec/sql_node.py +188 -22
- pixeltable/exprs/column_property_ref.py +16 -6
- pixeltable/exprs/column_ref.py +33 -11
- pixeltable/exprs/comparison.py +1 -1
- pixeltable/exprs/data_row.py +5 -3
- pixeltable/exprs/expr.py +11 -4
- pixeltable/exprs/literal.py +2 -0
- pixeltable/exprs/row_builder.py +4 -6
- pixeltable/exprs/rowid_ref.py +8 -0
- pixeltable/exprs/similarity_expr.py +1 -0
- pixeltable/func/__init__.py +1 -0
- pixeltable/func/mcp.py +74 -0
- pixeltable/func/query_template_function.py +5 -3
- pixeltable/func/tools.py +12 -2
- pixeltable/func/udf.py +2 -2
- pixeltable/functions/__init__.py +1 -0
- pixeltable/functions/anthropic.py +19 -45
- pixeltable/functions/deepseek.py +19 -38
- pixeltable/functions/fireworks.py +9 -18
- pixeltable/functions/gemini.py +165 -33
- pixeltable/functions/groq.py +108 -0
- pixeltable/functions/llama_cpp.py +6 -6
- pixeltable/functions/math.py +63 -0
- pixeltable/functions/mistralai.py +16 -53
- pixeltable/functions/ollama.py +1 -1
- pixeltable/functions/openai.py +82 -165
- pixeltable/functions/string.py +212 -58
- pixeltable/functions/together.py +22 -80
- pixeltable/globals.py +10 -4
- pixeltable/index/base.py +5 -0
- pixeltable/index/btree.py +5 -0
- pixeltable/index/embedding_index.py +5 -0
- pixeltable/io/external_store.py +10 -31
- pixeltable/io/label_studio.py +5 -5
- pixeltable/io/parquet.py +4 -4
- pixeltable/io/table_data_conduit.py +1 -32
- pixeltable/metadata/__init__.py +11 -2
- pixeltable/metadata/converters/convert_13.py +2 -2
- pixeltable/metadata/converters/convert_30.py +6 -11
- pixeltable/metadata/converters/convert_35.py +9 -0
- pixeltable/metadata/converters/convert_36.py +38 -0
- pixeltable/metadata/converters/convert_37.py +15 -0
- pixeltable/metadata/converters/util.py +3 -9
- pixeltable/metadata/notes.py +3 -0
- pixeltable/metadata/schema.py +13 -1
- pixeltable/plan.py +135 -12
- pixeltable/share/packager.py +321 -20
- pixeltable/share/publish.py +2 -2
- pixeltable/store.py +31 -13
- pixeltable/type_system.py +30 -0
- pixeltable/utils/dbms.py +1 -1
- pixeltable/utils/formatter.py +64 -42
- {pixeltable-0.3.14.dist-info → pixeltable-0.4.0.dist-info}/METADATA +2 -1
- {pixeltable-0.3.14.dist-info → pixeltable-0.4.0.dist-info}/RECORD +79 -74
- {pixeltable-0.3.14.dist-info → pixeltable-0.4.0.dist-info}/LICENSE +0 -0
- {pixeltable-0.3.14.dist-info → pixeltable-0.4.0.dist-info}/WHEEL +0 -0
- {pixeltable-0.3.14.dist-info → pixeltable-0.4.0.dist-info}/entry_points.txt +0 -0
pixeltable/plan.py
CHANGED
|
@@ -75,6 +75,88 @@ class FromClause:
|
|
|
75
75
|
tbls: list[catalog.TableVersionPath]
|
|
76
76
|
join_clauses: list[JoinClause] = dataclasses.field(default_factory=list)
|
|
77
77
|
|
|
78
|
+
@property
def _first_tbl(self) -> catalog.TableVersionPath:
    """Return the sole table of this FROM clause; asserts that there is exactly one."""
    tbls = self.tbls
    assert len(tbls) == 1
    return tbls[0]
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
@dataclasses.dataclass
class SampleClause:
    """Defines a sampling clause for a table.

    ``version`` and ``seed`` may be passed as None, in which case they are replaced by
    ``CURRENT_VERSION`` and ``DEFAULT_SEED`` respectively. The clause is stratified when
    ``stratify_exprs`` is a non-empty list.
    """

    version: Optional[int]
    n: Optional[int]
    n_per_stratum: Optional[int]
    fraction: Optional[float]
    seed: Optional[int]
    stratify_exprs: Optional[list[exprs.Expr]]

    # This seed value is used if one is not supplied
    DEFAULT_SEED = 0

    # The version of the hashing algorithm used for ordering and fractional sampling.
    CURRENT_VERSION = 1

    def __post_init__(self) -> None:
        """Fill in the default version and the default seed if none were provided."""
        if self.version is None:
            self.version = self.CURRENT_VERSION
        if self.seed is None:
            self.seed = self.DEFAULT_SEED

    @property
    def is_stratified(self) -> bool:
        """Check if the sampling is stratified"""
        return self.stratify_exprs is not None and len(self.stratify_exprs) > 0

    @property
    def is_repeatable(self) -> bool:
        """Return true if the same rows will continue to be sampled if source rows are added or deleted."""
        return not self.is_stratified and self.fraction is not None

    def display_str(self, inline: bool = False) -> str:
        """Return the textual form of this clause (same as ``repr``); ``inline`` is not used."""
        return str(self)

    def as_dict(self) -> dict:
        """Return a dictionary representation of the object.

        The result holds one entry per dataclass field plus a ``'_classname'`` entry. For a stratified
        clause, ``'stratify_exprs'`` is the list of ``as_dict()`` results of the expressions.
        """
        # Build the dict field by field instead of dataclasses.asdict(): asdict() would deep-copy every
        # expression object, only for that copy to be replaced by the serialized form below.
        d: dict = {
            f.name: getattr(self, f.name) for f in dataclasses.fields(self) if f.name != 'stratify_exprs'
        }
        if self.stratify_exprs is None:
            d['stratify_exprs'] = None
        else:
            d['stratify_exprs'] = [e.as_dict() for e in self.stratify_exprs]
        d['_classname'] = self.__class__.__name__
        return d

    @classmethod
    def from_dict(cls, d: dict) -> SampleClause:
        """Create a SampleClause from a dictionary representation.

        The ``'_classname'`` entry is ignored; every other entry is passed to the constructor, so a
        missing or unknown key raises ``TypeError``.
        """
        kwargs = {key: value for key, value in d.items() if key != '_classname'}
        serialized_exprs = kwargs.get('stratify_exprs')
        if serialized_exprs:
            # deserialize before construction, so the instance never holds raw dicts
            kwargs['stratify_exprs'] = [exprs.Expr.from_dict(e) for e in serialized_exprs]
        return cls(**kwargs)

    def __repr__(self) -> str:
        # stratify_exprs is Optional: treat None like an empty list instead of failing on iteration
        s = ','.join(e.display_str(inline=True) for e in (self.stratify_exprs or []))
        return (
            f'sample_{self.version}(n={self.n}, n_per_stratum={self.n_per_stratum}, '
            f'fraction={self.fraction}, seed={self.seed}, [{s}])'
        )

    @classmethod
    def fraction_to_md5_hex(cls, fraction: float) -> str:
        """Return the string representation of an approximation (to ~1e-9) of a fraction of the total space
        of md5 hash values.

        This is used for fractional sampling.

        Args:
            fraction: the portion of the hash space, between 0 and 1 (inclusive).

        Returns:
            A 32-character hex string: 8 digits for the scaled threshold followed by 24 'f' characters.

        Raises:
            ValueError: if ``fraction`` is not within [0, 1]; such a value cannot be encoded in 8 hex digits.
        """
        if not 0 <= fraction <= 1:
            raise ValueError(f'fraction must be between 0 and 1, got {fraction}')

        # Largest value representable in the upper 32 bits of an MD5 hash: 2^32 - 1
        max_md5_value = (2**32) - 1

        # Scale with integer arithmetic; the fraction is truncated to 9 decimal digits first
        threshold_int = max_md5_value * int(1_000_000_000 * fraction) // 1_000_000_000

        # Convert to hexadecimal string with padding
        return format(threshold_int, '08x') + 'ffffffffffffffffffffffff'
|
|
159
|
+
|
|
78
160
|
|
|
79
161
|
class Analyzer:
|
|
80
162
|
"""
|
|
@@ -87,6 +169,8 @@ class Analyzer:
|
|
|
87
169
|
group_by_clause: Optional[list[exprs.Expr]] # None for non-aggregate queries; [] for agg query w/o grouping
|
|
88
170
|
grouping_exprs: list[exprs.Expr] # [] for non-aggregate queries or agg query w/o grouping
|
|
89
171
|
order_by_clause: OrderByClause
|
|
172
|
+
stratify_exprs: list[exprs.Expr] # [] if no stratification is required
|
|
173
|
+
sample_clause: Optional[SampleClause] # None if no sampling clause is present
|
|
90
174
|
|
|
91
175
|
sql_elements: exprs.SqlElementCache
|
|
92
176
|
|
|
@@ -107,6 +191,7 @@ class Analyzer:
|
|
|
107
191
|
where_clause: Optional[exprs.Expr] = None,
|
|
108
192
|
group_by_clause: Optional[list[exprs.Expr]] = None,
|
|
109
193
|
order_by_clause: Optional[list[tuple[exprs.Expr, bool]]] = None,
|
|
194
|
+
sample_clause: Optional[SampleClause] = None,
|
|
110
195
|
):
|
|
111
196
|
if order_by_clause is None:
|
|
112
197
|
order_by_clause = []
|
|
@@ -120,6 +205,11 @@ class Analyzer:
|
|
|
120
205
|
self.group_by_clause = (
|
|
121
206
|
[e.resolve_computed_cols() for e in group_by_clause] if group_by_clause is not None else None
|
|
122
207
|
)
|
|
208
|
+
self.sample_clause = sample_clause
|
|
209
|
+
if self.sample_clause is not None and self.sample_clause.is_stratified:
|
|
210
|
+
self.stratify_exprs = [e.resolve_computed_cols() for e in sample_clause.stratify_exprs]
|
|
211
|
+
else:
|
|
212
|
+
self.stratify_exprs = []
|
|
123
213
|
self.order_by_clause = [OrderByItem(e.resolve_computed_cols(), asc) for e, asc in order_by_clause]
|
|
124
214
|
|
|
125
215
|
self.sql_where_clause = None
|
|
@@ -135,8 +225,11 @@ class Analyzer:
|
|
|
135
225
|
self.all_exprs.append(join_clause.join_predicate)
|
|
136
226
|
if self.group_by_clause is not None:
|
|
137
227
|
self.all_exprs.extend(self.group_by_clause)
|
|
228
|
+
self.all_exprs.extend(self.stratify_exprs)
|
|
138
229
|
self.all_exprs.extend(e for e, _ in self.order_by_clause)
|
|
139
230
|
if self.filter is not None:
|
|
231
|
+
if sample_clause is not None:
|
|
232
|
+
raise excs.Error(f'Filter {self.filter} not expressible in SQL')
|
|
140
233
|
self.all_exprs.append(self.filter)
|
|
141
234
|
|
|
142
235
|
self.agg_order_by = []
|
|
@@ -260,7 +353,7 @@ class Planner:
|
|
|
260
353
|
# TODO: create an exec.CountNode and change this to create_count_plan()
|
|
261
354
|
@classmethod
|
|
262
355
|
def create_count_stmt(cls, tbl: catalog.TableVersionPath, where_clause: Optional[exprs.Expr] = None) -> sql.Select:
|
|
263
|
-
stmt = sql.select(sql.func.count())
|
|
356
|
+
stmt = sql.select(sql.func.count().label('all_count'))
|
|
264
357
|
refd_tbl_ids: set[UUID] = set()
|
|
265
358
|
if where_clause is not None:
|
|
266
359
|
analyzer = cls.analyze(tbl, where_clause)
|
|
@@ -289,7 +382,7 @@ class Planner:
|
|
|
289
382
|
|
|
290
383
|
# create InMemoryDataNode for 'rows'
|
|
291
384
|
plan: exec.ExecNode = exec.InMemoryDataNode(
|
|
292
|
-
TableVersionHandle(tbl.id, tbl.effective_version), rows, row_builder, tbl.
|
|
385
|
+
TableVersionHandle(tbl.id, tbl.effective_version), rows, row_builder, tbl.next_row_id
|
|
293
386
|
)
|
|
294
387
|
|
|
295
388
|
media_input_col_info = [
|
|
@@ -322,6 +415,13 @@ class Planner:
|
|
|
322
415
|
)
|
|
323
416
|
return plan
|
|
324
417
|
|
|
418
|
+
@classmethod
def rowid_columns(cls, target: TableVersionHandle, num_rowid_cols: Optional[int] = None) -> list[exprs.Expr]:
    """Build one RowidRef per rowid column of ``target``.

    When ``num_rowid_cols`` is None, the count is taken from ``target.get().num_rowid_columns()``.
    """
    count = num_rowid_cols if num_rowid_cols is not None else target.get().num_rowid_columns()
    refs: list[exprs.Expr] = []
    for idx in range(count):
        refs.append(exprs.RowidRef(target, idx))
    return refs
|
|
424
|
+
|
|
325
425
|
@classmethod
|
|
326
426
|
def create_df_insert_plan(
|
|
327
427
|
cls, tbl: catalog.TableVersion, df: 'pxt.DataFrame', ignore_errors: bool
|
|
@@ -385,7 +485,7 @@ class Planner:
|
|
|
385
485
|
|
|
386
486
|
cls.__check_valid_columns(tbl.tbl_version.get(), recomputed_cols, 'updated in')
|
|
387
487
|
|
|
388
|
-
recomputed_base_cols = {col for col in recomputed_cols if col.tbl == tbl.tbl_version}
|
|
488
|
+
recomputed_base_cols = {col for col in recomputed_cols if col.tbl.id == tbl.tbl_version.id}
|
|
389
489
|
copied_cols = [
|
|
390
490
|
col
|
|
391
491
|
for col in target.cols_by_id.values()
|
|
@@ -409,7 +509,7 @@ class Planner:
|
|
|
409
509
|
for i, col in enumerate(all_base_cols):
|
|
410
510
|
plan.row_builder.add_table_column(col, select_list[i].slot_idx)
|
|
411
511
|
recomputed_user_cols = [c for c in recomputed_cols if c.name is not None]
|
|
412
|
-
return plan, [f'{c.tbl.
|
|
512
|
+
return plan, [f'{c.tbl.name}.{c.name}' for c in updated_cols + recomputed_user_cols], recomputed_user_cols
|
|
413
513
|
|
|
414
514
|
@classmethod
|
|
415
515
|
def __check_valid_columns(
|
|
@@ -465,7 +565,7 @@ class Planner:
|
|
|
465
565
|
recomputed_cols.update(idx_val_cols)
|
|
466
566
|
# we only need to recompute stored columns (unstored ones are substituted away)
|
|
467
567
|
recomputed_cols = {c for c in recomputed_cols if c.is_stored}
|
|
468
|
-
recomputed_base_cols = {col for col in recomputed_cols if col.tbl == target}
|
|
568
|
+
recomputed_base_cols = {col for col in recomputed_cols if col.tbl.id == target.id}
|
|
469
569
|
copied_cols = [
|
|
470
570
|
col
|
|
471
571
|
for col in target.cols_by_id.values()
|
|
@@ -591,8 +691,13 @@ class Planner:
|
|
|
591
691
|
# 2. for component views: iterator args
|
|
592
692
|
iterator_args = [target.iterator_args] if target.iterator_args is not None else []
|
|
593
693
|
|
|
594
|
-
|
|
694
|
+
from_clause = FromClause(tbls=[view.base])
|
|
695
|
+
base_analyzer = Analyzer(
|
|
696
|
+
from_clause, iterator_args, where_clause=target.predicate, sample_clause=target.sample_clause
|
|
697
|
+
)
|
|
698
|
+
row_builder = exprs.RowBuilder(base_analyzer.all_exprs, stored_cols, [])
|
|
595
699
|
|
|
700
|
+
# if we're propagating an insert, we only want to see those base rows that were created for the current version
|
|
596
701
|
# execution plan:
|
|
597
702
|
# 1. materialize exprs computed from the base that are needed for stored view columns
|
|
598
703
|
# 2. if it's an iterator view, expand the base rows into component rows
|
|
@@ -603,8 +708,11 @@ class Planner:
|
|
|
603
708
|
for e in row_builder.default_eval_ctx.target_exprs
|
|
604
709
|
if e.is_bound_by([view]) and not e.is_bound_by([view.base])
|
|
605
710
|
]
|
|
606
|
-
|
|
607
|
-
|
|
711
|
+
|
|
712
|
+
# Create a new analyzer reflecting exactly what is required from the base table
|
|
713
|
+
base_analyzer = Analyzer(
|
|
714
|
+
from_clause, base_output_exprs, where_clause=target.predicate, sample_clause=target.sample_clause
|
|
715
|
+
)
|
|
608
716
|
base_eval_ctx = row_builder.create_eval_ctx(base_analyzer.all_exprs)
|
|
609
717
|
plan = cls._create_query_plan(
|
|
610
718
|
row_builder=row_builder,
|
|
@@ -701,6 +809,7 @@ class Planner:
|
|
|
701
809
|
group_by_clause: Optional[list[exprs.Expr]] = None,
|
|
702
810
|
order_by_clause: Optional[list[tuple[exprs.Expr, bool]]] = None,
|
|
703
811
|
limit: Optional[exprs.Expr] = None,
|
|
812
|
+
sample_clause: Optional[SampleClause] = None,
|
|
704
813
|
ignore_errors: bool = False,
|
|
705
814
|
exact_version_only: Optional[list[catalog.TableVersionHandle]] = None,
|
|
706
815
|
) -> exec.ExecNode:
|
|
@@ -714,12 +823,14 @@ class Planner:
|
|
|
714
823
|
order_by_clause = []
|
|
715
824
|
if exact_version_only is None:
|
|
716
825
|
exact_version_only = []
|
|
826
|
+
|
|
717
827
|
analyzer = Analyzer(
|
|
718
828
|
from_clause,
|
|
719
829
|
select_list,
|
|
720
830
|
where_clause=where_clause,
|
|
721
831
|
group_by_clause=group_by_clause,
|
|
722
832
|
order_by_clause=order_by_clause,
|
|
833
|
+
sample_clause=sample_clause,
|
|
723
834
|
)
|
|
724
835
|
row_builder = exprs.RowBuilder(analyzer.all_exprs, [], [])
|
|
725
836
|
|
|
@@ -773,6 +884,7 @@ class Planner:
|
|
|
773
884
|
# - join clause subexprs
|
|
774
885
|
# - subexprs of Where clause conjuncts that can't be run in SQL
|
|
775
886
|
# - all grouping exprs
|
|
887
|
+
# - all stratify exprs
|
|
776
888
|
candidates = list(
|
|
777
889
|
exprs.Expr.list_subexprs(
|
|
778
890
|
analyzer.select_list,
|
|
@@ -787,10 +899,12 @@ class Planner:
|
|
|
787
899
|
candidates.extend(
|
|
788
900
|
exprs.Expr.subexprs(analyzer.filter, filter=sql_elements.contains, traverse_matches=False)
|
|
789
901
|
)
|
|
790
|
-
|
|
791
|
-
|
|
792
|
-
|
|
793
|
-
|
|
902
|
+
candidates.extend(
|
|
903
|
+
exprs.Expr.list_subexprs(analyzer.grouping_exprs, filter=sql_elements.contains, traverse_matches=False)
|
|
904
|
+
)
|
|
905
|
+
candidates.extend(
|
|
906
|
+
exprs.Expr.list_subexprs(analyzer.stratify_exprs, filter=sql_elements.contains, traverse_matches=False)
|
|
907
|
+
)
|
|
794
908
|
# not isinstance(...): we don't want to materialize Literals via a Select
|
|
795
909
|
sql_exprs = exprs.ExprSet(e for e in candidates if not isinstance(e, exprs.Literal))
|
|
796
910
|
|
|
@@ -835,6 +949,15 @@ class Planner:
|
|
|
835
949
|
# we need to order the input for window functions
|
|
836
950
|
plan.set_order_by(analyzer.get_window_fn_ob_clause())
|
|
837
951
|
|
|
952
|
+
if analyzer.sample_clause is not None:
|
|
953
|
+
plan = exec.SqlSampleNode(
|
|
954
|
+
row_builder,
|
|
955
|
+
input=plan,
|
|
956
|
+
select_list=tbl_scan_exprs,
|
|
957
|
+
sample_clause=analyzer.sample_clause,
|
|
958
|
+
stratify_exprs=analyzer.stratify_exprs,
|
|
959
|
+
)
|
|
960
|
+
|
|
838
961
|
plan = cls._insert_prefetch_node(tbl.tbl_version.id, row_builder, plan)
|
|
839
962
|
|
|
840
963
|
if analyzer.group_by_clause is not None:
|