pixeltable 0.4.0rc2__py3-none-any.whl → 0.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pixeltable might be problematic. Click here for more details.
- pixeltable/__init__.py +1 -1
- pixeltable/__version__.py +2 -2
- pixeltable/catalog/__init__.py +9 -1
- pixeltable/catalog/catalog.py +333 -99
- pixeltable/catalog/column.py +28 -26
- pixeltable/catalog/globals.py +12 -0
- pixeltable/catalog/insertable_table.py +8 -8
- pixeltable/catalog/schema_object.py +6 -0
- pixeltable/catalog/table.py +111 -116
- pixeltable/catalog/table_version.py +36 -50
- pixeltable/catalog/table_version_handle.py +4 -1
- pixeltable/catalog/table_version_path.py +28 -4
- pixeltable/catalog/view.py +10 -18
- pixeltable/config.py +4 -0
- pixeltable/dataframe.py +10 -9
- pixeltable/env.py +5 -11
- pixeltable/exceptions.py +6 -0
- pixeltable/exec/exec_node.py +2 -0
- pixeltable/exec/expr_eval/expr_eval_node.py +4 -4
- pixeltable/exec/sql_node.py +47 -30
- pixeltable/exprs/column_property_ref.py +2 -1
- pixeltable/exprs/column_ref.py +7 -6
- pixeltable/exprs/expr.py +4 -4
- pixeltable/func/__init__.py +1 -0
- pixeltable/func/mcp.py +74 -0
- pixeltable/func/query_template_function.py +4 -2
- pixeltable/func/tools.py +12 -2
- pixeltable/func/udf.py +2 -2
- pixeltable/functions/__init__.py +1 -0
- pixeltable/functions/anthropic.py +19 -45
- pixeltable/functions/deepseek.py +19 -38
- pixeltable/functions/fireworks.py +9 -18
- pixeltable/functions/gemini.py +2 -2
- pixeltable/functions/groq.py +108 -0
- pixeltable/functions/huggingface.py +8 -6
- pixeltable/functions/llama_cpp.py +6 -6
- pixeltable/functions/mistralai.py +16 -53
- pixeltable/functions/ollama.py +1 -1
- pixeltable/functions/openai.py +82 -170
- pixeltable/functions/replicate.py +2 -2
- pixeltable/functions/together.py +22 -80
- pixeltable/functions/util.py +6 -1
- pixeltable/globals.py +0 -2
- pixeltable/io/external_store.py +2 -2
- pixeltable/io/label_studio.py +4 -4
- pixeltable/io/table_data_conduit.py +1 -1
- pixeltable/metadata/__init__.py +1 -1
- pixeltable/metadata/converters/convert_37.py +15 -0
- pixeltable/metadata/notes.py +1 -0
- pixeltable/metadata/schema.py +5 -0
- pixeltable/plan.py +37 -121
- pixeltable/share/packager.py +2 -2
- pixeltable/type_system.py +30 -0
- {pixeltable-0.4.0rc2.dist-info → pixeltable-0.4.1.dist-info}/METADATA +1 -1
- {pixeltable-0.4.0rc2.dist-info → pixeltable-0.4.1.dist-info}/RECORD +58 -56
- pixeltable/utils/sample.py +0 -25
- {pixeltable-0.4.0rc2.dist-info → pixeltable-0.4.1.dist-info}/LICENSE +0 -0
- {pixeltable-0.4.0rc2.dist-info → pixeltable-0.4.1.dist-info}/WHEEL +0 -0
- {pixeltable-0.4.0rc2.dist-info → pixeltable-0.4.1.dist-info}/entry_points.txt +0 -0
pixeltable/globals.py
CHANGED
|
@@ -428,8 +428,6 @@ def get_table(path: str) -> catalog.Table:
|
|
|
428
428
|
"""
|
|
429
429
|
path_obj = catalog.Path(path)
|
|
430
430
|
tbl = Catalog.get().get_table(path_obj)
|
|
431
|
-
tv = tbl._tbl_version.get()
|
|
432
|
-
_logger.debug(f'get_table(): tbl={tv.id}:{tv.effective_version} sa_tbl={id(tv.store_tbl.sa_tbl):x} tv={id(tv):x}')
|
|
433
431
|
return tbl
|
|
434
432
|
|
|
435
433
|
|
pixeltable/io/external_store.py
CHANGED
|
@@ -202,7 +202,7 @@ class Project(ExternalStore, abc.ABC):
|
|
|
202
202
|
resolved_col_mapping: dict[Column, str] = {}
|
|
203
203
|
|
|
204
204
|
# Validate names
|
|
205
|
-
t_cols = set(table.
|
|
205
|
+
t_cols = set(table._get_schema().keys())
|
|
206
206
|
for t_col, ext_col in col_mapping.items():
|
|
207
207
|
if t_col not in t_cols:
|
|
208
208
|
if is_user_specified_col_mapping:
|
|
@@ -225,7 +225,7 @@ class Project(ExternalStore, abc.ABC):
|
|
|
225
225
|
assert isinstance(col_ref, exprs.ColumnRef)
|
|
226
226
|
resolved_col_mapping[col_ref.col] = ext_col
|
|
227
227
|
# Validate column specs
|
|
228
|
-
t_col_types = table.
|
|
228
|
+
t_col_types = table._get_schema()
|
|
229
229
|
for t_col, ext_col in col_mapping.items():
|
|
230
230
|
t_col_type = t_col_types[t_col]
|
|
231
231
|
if ext_col in export_cols:
|
pixeltable/io/label_studio.py
CHANGED
|
@@ -412,8 +412,8 @@ class LabelStudioProject(Project):
|
|
|
412
412
|
# TODO(aaron-siegel): Simplify this once propagation is properly implemented in batch_update
|
|
413
413
|
ancestor = t
|
|
414
414
|
while local_annotations_col not in ancestor._tbl_version.get().cols:
|
|
415
|
-
assert ancestor.
|
|
416
|
-
ancestor = ancestor.
|
|
415
|
+
assert ancestor._get_base_table is not None
|
|
416
|
+
ancestor = ancestor._get_base_table()
|
|
417
417
|
update_status = ancestor.batch_update(updates)
|
|
418
418
|
env.Env.get().console_logger.info(f'Updated annotation(s) from {len(updates)} task(s) in {self}.')
|
|
419
419
|
return SyncStatus(pxt_rows_updated=update_status.num_rows, num_excs=update_status.num_excs)
|
|
@@ -560,7 +560,7 @@ class LabelStudioProject(Project):
|
|
|
560
560
|
|
|
561
561
|
if name is None:
|
|
562
562
|
# Create a default name that's unique to the table
|
|
563
|
-
all_stores = t.external_stores
|
|
563
|
+
all_stores = t.external_stores()
|
|
564
564
|
n = 0
|
|
565
565
|
while f'ls_project_{n}' in all_stores:
|
|
566
566
|
n += 1
|
|
@@ -576,7 +576,7 @@ class LabelStudioProject(Project):
|
|
|
576
576
|
local_annotations_column = ANNOTATIONS_COLUMN
|
|
577
577
|
else:
|
|
578
578
|
local_annotations_column = next(k for k, v in col_mapping.items() if v == ANNOTATIONS_COLUMN)
|
|
579
|
-
if local_annotations_column not in t.
|
|
579
|
+
if local_annotations_column not in t._get_schema():
|
|
580
580
|
t.add_columns({local_annotations_column: ts.Json})
|
|
581
581
|
|
|
582
582
|
resolved_col_mapping = cls.validate_columns(
|
|
pixeltable/io/table_data_conduit.py
CHANGED
|
@@ -101,7 +101,7 @@ class TableDataConduit:
|
|
|
101
101
|
def add_table_info(self, table: pxt.Table) -> None:
|
|
102
102
|
"""Add information about the table into which we are inserting data"""
|
|
103
103
|
assert isinstance(table, pxt.Table)
|
|
104
|
-
self.pxt_schema = table.
|
|
104
|
+
self.pxt_schema = table._get_schema()
|
|
105
105
|
self.pxt_pk = table._tbl_version.get().primary_key
|
|
106
106
|
for col in table._tbl_version_path.columns():
|
|
107
107
|
if col.is_required_for_insert:
|
pixeltable/metadata/__init__.py
CHANGED
|
@@ -18,7 +18,7 @@ _console_logger = ConsoleLogger(logging.getLogger('pixeltable'))
|
|
|
18
18
|
_logger = logging.getLogger('pixeltable')
|
|
19
19
|
|
|
20
20
|
# current version of the metadata; this is incremented whenever the metadata schema changes
|
|
21
|
-
VERSION =
|
|
21
|
+
VERSION = 38
|
|
22
22
|
|
|
23
23
|
|
|
24
24
|
def create_system_info(engine: sql.engine.Engine) -> None:
|
|
pixeltable/metadata/converters/convert_37.py
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
from uuid import UUID
|
|
2
|
+
|
|
3
|
+
import sqlalchemy as sql
|
|
4
|
+
|
|
5
|
+
from pixeltable.metadata import register_converter
|
|
6
|
+
from pixeltable.metadata.converters.util import convert_table_md
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
@register_converter(version=37)
|
|
10
|
+
def _(engine: sql.engine.Engine) -> None:
|
|
11
|
+
convert_table_md(engine, table_md_updater=__update_table_md)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def __update_table_md(table_md: dict, _: UUID) -> None:
|
|
15
|
+
table_md['view_sn'] = 0
|
pixeltable/metadata/notes.py
CHANGED
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
# rather than as a comment, so that the existence of a description can be enforced by
|
|
3
3
|
# the unit tests when new versions are added.
|
|
4
4
|
VERSION_NOTES = {
|
|
5
|
+
38: 'Added TableMd.view_sn',
|
|
5
6
|
37: 'Add support for the sample() method on DataFrames',
|
|
6
7
|
36: 'Added Table.lock_dummy',
|
|
7
8
|
35: 'Track reference_tbl in ColumnRef',
|
pixeltable/metadata/schema.py
CHANGED
|
@@ -177,6 +177,11 @@ class TableMd:
|
|
|
177
177
|
# - every row is assigned a unique and immutable rowid on insertion
|
|
178
178
|
next_row_id: int
|
|
179
179
|
|
|
180
|
+
# sequence number to track changes in the set of mutable views of this table (ie, this table = the view base)
|
|
181
|
+
# - incremented for each add/drop of a mutable view
|
|
182
|
+
# - only maintained for mutable tables
|
|
183
|
+
view_sn: int
|
|
184
|
+
|
|
180
185
|
# Metadata format for external stores:
|
|
181
186
|
# {'class': 'pixeltable.io.label_studio.LabelStudioProject', 'md': {'project_id': 3}}
|
|
182
187
|
external_stores: list[dict[str, Any]]
|
pixeltable/plan.py
CHANGED
|
@@ -3,7 +3,7 @@ from __future__ import annotations
|
|
|
3
3
|
import dataclasses
|
|
4
4
|
import enum
|
|
5
5
|
from textwrap import dedent
|
|
6
|
-
from typing import Any, Iterable, Literal,
|
|
6
|
+
from typing import Any, Iterable, Literal, Optional, Sequence
|
|
7
7
|
from uuid import UUID
|
|
8
8
|
|
|
9
9
|
import sqlalchemy as sql
|
|
@@ -12,7 +12,6 @@ import pixeltable as pxt
|
|
|
12
12
|
from pixeltable import catalog, exceptions as excs, exec, exprs
|
|
13
13
|
from pixeltable.catalog import Column, TableVersionHandle
|
|
14
14
|
from pixeltable.exec.sql_node import OrderByClause, OrderByItem, combine_order_by_clauses, print_order_by_clause
|
|
15
|
-
from pixeltable.utils.sample import sample_key
|
|
16
15
|
|
|
17
16
|
|
|
18
17
|
def _is_agg_fn_call(e: exprs.Expr) -> bool:
|
|
@@ -159,16 +158,6 @@ class SampleClause:
|
|
|
159
158
|
return format(threshold_int, '08x') + 'ffffffffffffffffffffffff'
|
|
160
159
|
|
|
161
160
|
|
|
162
|
-
class SamplingClauses(NamedTuple):
|
|
163
|
-
"""Clauses provided when rewriting a SampleClause"""
|
|
164
|
-
|
|
165
|
-
where: exprs.Expr
|
|
166
|
-
group_by_clause: Optional[list[exprs.Expr]]
|
|
167
|
-
order_by_clause: Optional[list[tuple[exprs.Expr, bool]]]
|
|
168
|
-
limit: Optional[exprs.Expr]
|
|
169
|
-
sample_clause: Optional[SampleClause]
|
|
170
|
-
|
|
171
|
-
|
|
172
161
|
class Analyzer:
|
|
173
162
|
"""
|
|
174
163
|
Performs semantic analysis of a query and stores the analysis state.
|
|
@@ -180,6 +169,8 @@ class Analyzer:
|
|
|
180
169
|
group_by_clause: Optional[list[exprs.Expr]] # None for non-aggregate queries; [] for agg query w/o grouping
|
|
181
170
|
grouping_exprs: list[exprs.Expr] # [] for non-aggregate queries or agg query w/o grouping
|
|
182
171
|
order_by_clause: OrderByClause
|
|
172
|
+
stratify_exprs: list[exprs.Expr] # [] if no stratification is required
|
|
173
|
+
sample_clause: Optional[SampleClause] # None if no sampling clause is present
|
|
183
174
|
|
|
184
175
|
sql_elements: exprs.SqlElementCache
|
|
185
176
|
|
|
@@ -200,6 +191,7 @@ class Analyzer:
|
|
|
200
191
|
where_clause: Optional[exprs.Expr] = None,
|
|
201
192
|
group_by_clause: Optional[list[exprs.Expr]] = None,
|
|
202
193
|
order_by_clause: Optional[list[tuple[exprs.Expr, bool]]] = None,
|
|
194
|
+
sample_clause: Optional[SampleClause] = None,
|
|
203
195
|
):
|
|
204
196
|
if order_by_clause is None:
|
|
205
197
|
order_by_clause = []
|
|
@@ -213,6 +205,11 @@ class Analyzer:
|
|
|
213
205
|
self.group_by_clause = (
|
|
214
206
|
[e.resolve_computed_cols() for e in group_by_clause] if group_by_clause is not None else None
|
|
215
207
|
)
|
|
208
|
+
self.sample_clause = sample_clause
|
|
209
|
+
if self.sample_clause is not None and self.sample_clause.is_stratified:
|
|
210
|
+
self.stratify_exprs = [e.resolve_computed_cols() for e in sample_clause.stratify_exprs]
|
|
211
|
+
else:
|
|
212
|
+
self.stratify_exprs = []
|
|
216
213
|
self.order_by_clause = [OrderByItem(e.resolve_computed_cols(), asc) for e, asc in order_by_clause]
|
|
217
214
|
|
|
218
215
|
self.sql_where_clause = None
|
|
@@ -228,8 +225,11 @@ class Analyzer:
|
|
|
228
225
|
self.all_exprs.append(join_clause.join_predicate)
|
|
229
226
|
if self.group_by_clause is not None:
|
|
230
227
|
self.all_exprs.extend(self.group_by_clause)
|
|
228
|
+
self.all_exprs.extend(self.stratify_exprs)
|
|
231
229
|
self.all_exprs.extend(e for e, _ in self.order_by_clause)
|
|
232
230
|
if self.filter is not None:
|
|
231
|
+
if sample_clause is not None:
|
|
232
|
+
raise excs.Error(f'Filter {self.filter} not expressible in SQL')
|
|
233
233
|
self.all_exprs.append(self.filter)
|
|
234
234
|
|
|
235
235
|
self.agg_order_by = []
|
|
@@ -691,25 +691,13 @@ class Planner:
|
|
|
691
691
|
# 2. for component views: iterator args
|
|
692
692
|
iterator_args = [target.iterator_args] if target.iterator_args is not None else []
|
|
693
693
|
|
|
694
|
-
# If this contains a sample specification, modify / create where, group_by, order_by, and limit clauses
|
|
695
694
|
from_clause = FromClause(tbls=[view.base])
|
|
696
|
-
where, group_by_clause, order_by_clause, limit, sample_clause = cls.create_sample_clauses(
|
|
697
|
-
from_clause, target.sample_clause, target.predicate, None, [], None
|
|
698
|
-
)
|
|
699
|
-
|
|
700
|
-
# if we're propagating an insert, we only want to see those base rows that were created for the current version
|
|
701
695
|
base_analyzer = Analyzer(
|
|
702
|
-
from_clause,
|
|
703
|
-
iterator_args,
|
|
704
|
-
where_clause=where,
|
|
705
|
-
group_by_clause=group_by_clause,
|
|
706
|
-
order_by_clause=order_by_clause,
|
|
696
|
+
from_clause, iterator_args, where_clause=target.predicate, sample_clause=target.sample_clause
|
|
707
697
|
)
|
|
708
698
|
row_builder = exprs.RowBuilder(base_analyzer.all_exprs, stored_cols, [])
|
|
709
699
|
|
|
710
|
-
if
|
|
711
|
-
raise excs.Error(f'Filter {base_analyzer.filter} not expressible in SQL')
|
|
712
|
-
|
|
700
|
+
# if we're propagating an insert, we only want to see those base rows that were created for the current version
|
|
713
701
|
# execution plan:
|
|
714
702
|
# 1. materialize exprs computed from the base that are needed for stored view columns
|
|
715
703
|
# 2. if it's an iterator view, expand the base rows into component rows
|
|
@@ -723,19 +711,13 @@ class Planner:
|
|
|
723
711
|
|
|
724
712
|
# Create a new analyzer reflecting exactly what is required from the base table
|
|
725
713
|
base_analyzer = Analyzer(
|
|
726
|
-
from_clause,
|
|
727
|
-
base_output_exprs,
|
|
728
|
-
where_clause=where,
|
|
729
|
-
group_by_clause=group_by_clause,
|
|
730
|
-
order_by_clause=order_by_clause,
|
|
714
|
+
from_clause, base_output_exprs, where_clause=target.predicate, sample_clause=target.sample_clause
|
|
731
715
|
)
|
|
732
716
|
base_eval_ctx = row_builder.create_eval_ctx(base_analyzer.all_exprs)
|
|
733
717
|
plan = cls._create_query_plan(
|
|
734
718
|
row_builder=row_builder,
|
|
735
719
|
analyzer=base_analyzer,
|
|
736
720
|
eval_ctx=base_eval_ctx,
|
|
737
|
-
limit=limit,
|
|
738
|
-
sample_clause=sample_clause,
|
|
739
721
|
with_pk=True,
|
|
740
722
|
exact_version_only=view.get_bases() if propagates_insert else [],
|
|
741
723
|
)
|
|
@@ -818,62 +800,6 @@ class Planner:
|
|
|
818
800
|
prefetch_node = exec.CachePrefetchNode(tbl_id, file_col_info, input_node)
|
|
819
801
|
return prefetch_node
|
|
820
802
|
|
|
821
|
-
@classmethod
|
|
822
|
-
def create_sample_clauses(
|
|
823
|
-
cls,
|
|
824
|
-
from_clause: FromClause,
|
|
825
|
-
sample_clause: SampleClause,
|
|
826
|
-
where_clause: Optional[exprs.Expr],
|
|
827
|
-
group_by_clause: Optional[list[exprs.Expr]],
|
|
828
|
-
order_by_clause: Optional[list[tuple[exprs.Expr, bool]]],
|
|
829
|
-
limit: Optional[exprs.Expr],
|
|
830
|
-
) -> SamplingClauses:
|
|
831
|
-
"""tuple[
|
|
832
|
-
exprs.Expr,
|
|
833
|
-
Optional[list[exprs.Expr]],
|
|
834
|
-
Optional[list[tuple[exprs.Expr, bool]]],
|
|
835
|
-
Optional[exprs.Expr],
|
|
836
|
-
Optional[SampleClause],
|
|
837
|
-
]:"""
|
|
838
|
-
"""Construct clauses required for sampling under various conditions.
|
|
839
|
-
If there is no sampling, then return the original clauses.
|
|
840
|
-
If the sample is stratified, then return only the group by clause. The rest of the
|
|
841
|
-
mechanism for stratified sampling is provided by the SampleSqlNode.
|
|
842
|
-
If the sample is non-stratified, then rewrite the query to accommodate the supplied where clause,
|
|
843
|
-
and provide the other clauses required for sampling
|
|
844
|
-
"""
|
|
845
|
-
|
|
846
|
-
# If no sample clause, return the original clauses
|
|
847
|
-
if sample_clause is None:
|
|
848
|
-
return SamplingClauses(where_clause, group_by_clause, order_by_clause, limit, None)
|
|
849
|
-
|
|
850
|
-
# If the sample clause is stratified, create a group by clause
|
|
851
|
-
if sample_clause.is_stratified:
|
|
852
|
-
group_by = sample_clause.stratify_exprs
|
|
853
|
-
# Note that limit is not possible here
|
|
854
|
-
return SamplingClauses(where_clause, group_by, order_by_clause, None, sample_clause)
|
|
855
|
-
|
|
856
|
-
else:
|
|
857
|
-
# If non-stratified sampling, construct a where clause, order_by, and limit clauses
|
|
858
|
-
# Construct an expression for sorting rows and limiting row counts
|
|
859
|
-
s_key = sample_key(
|
|
860
|
-
exprs.Literal(sample_clause.seed), *cls.rowid_columns(from_clause._first_tbl.tbl_version)
|
|
861
|
-
)
|
|
862
|
-
|
|
863
|
-
# Construct a suitable where clause
|
|
864
|
-
where = where_clause
|
|
865
|
-
if sample_clause.fraction is not None:
|
|
866
|
-
fraction_md5_hex = exprs.Expr.from_object(
|
|
867
|
-
sample_clause.fraction_to_md5_hex(float(sample_clause.fraction))
|
|
868
|
-
)
|
|
869
|
-
f_where = s_key < fraction_md5_hex
|
|
870
|
-
where = where & f_where if where is not None else f_where
|
|
871
|
-
|
|
872
|
-
order_by: list[tuple[exprs.Expr, bool]] = [(s_key, True)]
|
|
873
|
-
limit = exprs.Literal(sample_clause.n)
|
|
874
|
-
# Note that group_by is not possible here
|
|
875
|
-
return SamplingClauses(where, None, order_by, limit, None)
|
|
876
|
-
|
|
877
803
|
@classmethod
|
|
878
804
|
def create_query_plan(
|
|
879
805
|
cls,
|
|
@@ -898,21 +824,15 @@ class Planner:
|
|
|
898
824
|
if exact_version_only is None:
|
|
899
825
|
exact_version_only = []
|
|
900
826
|
|
|
901
|
-
# Modify clauses to include sample clause
|
|
902
|
-
where, group_by_clause, order_by_clause, limit, sample = cls.create_sample_clauses(
|
|
903
|
-
from_clause, sample_clause, where_clause, group_by_clause, order_by_clause, limit
|
|
904
|
-
)
|
|
905
|
-
|
|
906
827
|
analyzer = Analyzer(
|
|
907
828
|
from_clause,
|
|
908
829
|
select_list,
|
|
909
|
-
where_clause=
|
|
830
|
+
where_clause=where_clause,
|
|
910
831
|
group_by_clause=group_by_clause,
|
|
911
832
|
order_by_clause=order_by_clause,
|
|
833
|
+
sample_clause=sample_clause,
|
|
912
834
|
)
|
|
913
835
|
row_builder = exprs.RowBuilder(analyzer.all_exprs, [], [])
|
|
914
|
-
if sample_clause is not None and analyzer.filter is not None:
|
|
915
|
-
raise excs.Error(f'Filter {analyzer.filter} not expressible in SQL')
|
|
916
836
|
|
|
917
837
|
analyzer.finalize(row_builder)
|
|
918
838
|
# select_list: we need to materialize everything that's been collected
|
|
@@ -923,7 +843,6 @@ class Planner:
|
|
|
923
843
|
analyzer=analyzer,
|
|
924
844
|
eval_ctx=eval_ctx,
|
|
925
845
|
limit=limit,
|
|
926
|
-
sample_clause=sample,
|
|
927
846
|
with_pk=True,
|
|
928
847
|
exact_version_only=exact_version_only,
|
|
929
848
|
)
|
|
@@ -939,7 +858,6 @@ class Planner:
|
|
|
939
858
|
analyzer: Analyzer,
|
|
940
859
|
eval_ctx: exprs.RowBuilder.EvalCtx,
|
|
941
860
|
limit: Optional[exprs.Expr] = None,
|
|
942
|
-
sample_clause: Optional[SampleClause] = None,
|
|
943
861
|
with_pk: bool = False,
|
|
944
862
|
exact_version_only: Optional[list[catalog.TableVersionHandle]] = None,
|
|
945
863
|
) -> exec.ExecNode:
|
|
@@ -966,6 +884,7 @@ class Planner:
|
|
|
966
884
|
# - join clause subexprs
|
|
967
885
|
# - subexprs of Where clause conjuncts that can't be run in SQL
|
|
968
886
|
# - all grouping exprs
|
|
887
|
+
# - all stratify exprs
|
|
969
888
|
candidates = list(
|
|
970
889
|
exprs.Expr.list_subexprs(
|
|
971
890
|
analyzer.select_list,
|
|
@@ -980,10 +899,12 @@ class Planner:
|
|
|
980
899
|
candidates.extend(
|
|
981
900
|
exprs.Expr.subexprs(analyzer.filter, filter=sql_elements.contains, traverse_matches=False)
|
|
982
901
|
)
|
|
983
|
-
|
|
984
|
-
|
|
985
|
-
|
|
986
|
-
|
|
902
|
+
candidates.extend(
|
|
903
|
+
exprs.Expr.list_subexprs(analyzer.grouping_exprs, filter=sql_elements.contains, traverse_matches=False)
|
|
904
|
+
)
|
|
905
|
+
candidates.extend(
|
|
906
|
+
exprs.Expr.list_subexprs(analyzer.stratify_exprs, filter=sql_elements.contains, traverse_matches=False)
|
|
907
|
+
)
|
|
987
908
|
# not isinstance(...): we don't want to materialize Literals via a Select
|
|
988
909
|
sql_exprs = exprs.ExprSet(e for e in candidates if not isinstance(e, exprs.Literal))
|
|
989
910
|
|
|
@@ -1028,6 +949,15 @@ class Planner:
|
|
|
1028
949
|
# we need to order the input for window functions
|
|
1029
950
|
plan.set_order_by(analyzer.get_window_fn_ob_clause())
|
|
1030
951
|
|
|
952
|
+
if analyzer.sample_clause is not None:
|
|
953
|
+
plan = exec.SqlSampleNode(
|
|
954
|
+
row_builder,
|
|
955
|
+
input=plan,
|
|
956
|
+
select_list=tbl_scan_exprs,
|
|
957
|
+
sample_clause=analyzer.sample_clause,
|
|
958
|
+
stratify_exprs=analyzer.stratify_exprs,
|
|
959
|
+
)
|
|
960
|
+
|
|
1031
961
|
plan = cls._insert_prefetch_node(tbl.tbl_version.id, row_builder, plan)
|
|
1032
962
|
|
|
1033
963
|
if analyzer.group_by_clause is not None:
|
|
@@ -1050,26 +980,12 @@ class Planner:
|
|
|
1050
980
|
sql_elements.contains_all(analyzer.select_list)
|
|
1051
981
|
and sql_elements.contains_all(analyzer.grouping_exprs)
|
|
1052
982
|
and isinstance(plan, exec.SqlNode)
|
|
1053
|
-
and plan.to_cte(
|
|
983
|
+
and plan.to_cte() is not None
|
|
1054
984
|
):
|
|
1055
|
-
|
|
1056
|
-
plan =
|
|
1057
|
-
|
|
1058
|
-
input=plan,
|
|
1059
|
-
select_list=analyzer.select_list,
|
|
1060
|
-
stratify_exprs=analyzer.group_by_clause,
|
|
1061
|
-
sample_clause=sample_clause,
|
|
1062
|
-
)
|
|
1063
|
-
else:
|
|
1064
|
-
plan = exec.SqlAggregationNode(
|
|
1065
|
-
row_builder,
|
|
1066
|
-
input=plan,
|
|
1067
|
-
select_list=analyzer.select_list,
|
|
1068
|
-
group_by_items=analyzer.group_by_clause,
|
|
1069
|
-
)
|
|
985
|
+
plan = exec.SqlAggregationNode(
|
|
986
|
+
row_builder, input=plan, select_list=analyzer.select_list, group_by_items=analyzer.group_by_clause
|
|
987
|
+
)
|
|
1070
988
|
else:
|
|
1071
|
-
if sample_clause is not None:
|
|
1072
|
-
raise excs.Error('Sample clause not supported with Python aggregation')
|
|
1073
989
|
input_sql_node = plan.get_node(exec.SqlNode)
|
|
1074
990
|
assert combined_ordering is not None
|
|
1075
991
|
input_sql_node.set_order_by(combined_ordering)
|
pixeltable/share/packager.py
CHANGED
|
@@ -127,7 +127,7 @@ class TablePackager:
|
|
|
127
127
|
# We use snappy compression for the Parquet tables; the entire bundle will be bzip2-compressed later, so
|
|
128
128
|
# faster compression should provide good performance while still reducing temporary storage utilization.
|
|
129
129
|
parquet_writer = pq.ParquetWriter(parquet_file, parquet_schema, compression='SNAPPY')
|
|
130
|
-
filter_tv = self.table.
|
|
130
|
+
filter_tv = self.table._tbl_version_path.tbl_version.get()
|
|
131
131
|
row_iter = tv.store_tbl.dump_rows(tv.version, filter_tv.store_tbl, filter_tv.version)
|
|
132
132
|
for pa_table in self.__to_pa_tables(row_iter, sql_types, media_cols, parquet_schema):
|
|
133
133
|
parquet_writer.write_table(pa_table)
|
|
@@ -238,7 +238,7 @@ class TablePackager:
|
|
|
238
238
|
- Documents are replaced by a thumbnail as a base64-encoded webp
|
|
239
239
|
"""
|
|
240
240
|
# First 8 columns
|
|
241
|
-
preview_cols = dict(itertools.islice(self.table.
|
|
241
|
+
preview_cols = dict(itertools.islice(self.table._get_schema().items(), 0, 8))
|
|
242
242
|
select_list = [self.table[col_name] for col_name in preview_cols]
|
|
243
243
|
# First 5 rows
|
|
244
244
|
rows = list(self.table.select(*select_list).head(n=5))
|
pixeltable/type_system.py
CHANGED
|
@@ -395,6 +395,36 @@ class ColumnType:
|
|
|
395
395
|
raise excs.Error(f'Standard Python type `{name}` cannot be used here; use `{suggestion}` instead')
|
|
396
396
|
raise excs.Error(f'Unknown type: {t}')
|
|
397
397
|
|
|
398
|
+
@classmethod
|
|
399
|
+
def from_json_schema(cls, schema: dict[str, Any]) -> Optional[ColumnType]:
|
|
400
|
+
# We first express the JSON schema as a Python type, and then convert it to a Pixeltable type.
|
|
401
|
+
# TODO: Is there a meaningful fallback if one of these operations fails? (Maybe another use case for a pxt Any
|
|
402
|
+
# type?)
|
|
403
|
+
py_type = cls.__json_schema_to_py_type(schema)
|
|
404
|
+
return cls.from_python_type(py_type) if py_type is not None else None
|
|
405
|
+
|
|
406
|
+
@classmethod
|
|
407
|
+
def __json_schema_to_py_type(cls, schema: dict[str, Any]) -> Union[type, _GenericAlias, None]:
|
|
408
|
+
if 'type' in schema:
|
|
409
|
+
if schema['type'] == 'null':
|
|
410
|
+
return type(None)
|
|
411
|
+
if schema['type'] == 'string':
|
|
412
|
+
return str
|
|
413
|
+
if schema['type'] == 'integer':
|
|
414
|
+
return int
|
|
415
|
+
if schema['type'] == 'number':
|
|
416
|
+
return float
|
|
417
|
+
if schema['type'] == 'boolean':
|
|
418
|
+
return bool
|
|
419
|
+
if schema['type'] in ('array', 'object'):
|
|
420
|
+
return list
|
|
421
|
+
elif 'anyOf' in schema:
|
|
422
|
+
subscripts = tuple(cls.__json_schema_to_py_type(subschema) for subschema in schema['anyOf'])
|
|
423
|
+
if all(subscript is not None for subscript in subscripts):
|
|
424
|
+
return Union[subscripts]
|
|
425
|
+
|
|
426
|
+
return None
|
|
427
|
+
|
|
398
428
|
def validate_literal(self, val: Any) -> None:
|
|
399
429
|
"""Raise TypeError if val is not a valid literal for this type"""
|
|
400
430
|
if val is None:
|
|
{pixeltable-0.4.0rc2.dist-info → pixeltable-0.4.1.dist-info}/METADATA
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: pixeltable
|
|
3
|
-
Version: 0.4.
|
|
3
|
+
Version: 0.4.1
|
|
4
4
|
Summary: AI Data Infrastructure: Declarative, Multimodal, and Incremental
|
|
5
5
|
License: Apache-2.0
|
|
6
6
|
Keywords: data-science,machine-learning,database,ai,computer-vision,chatbot,ml,artificial-intelligence,feature-engineering,multimodal,mlops,feature-store,vector-database,llm,genai
|