pixeltable 0.3.15__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pixeltable/__init__.py +1 -1
- pixeltable/__version__.py +2 -2
- pixeltable/catalog/__init__.py +9 -1
- pixeltable/catalog/catalog.py +559 -134
- pixeltable/catalog/column.py +36 -32
- pixeltable/catalog/dir.py +1 -2
- pixeltable/catalog/globals.py +12 -0
- pixeltable/catalog/insertable_table.py +30 -25
- pixeltable/catalog/schema_object.py +9 -6
- pixeltable/catalog/table.py +334 -267
- pixeltable/catalog/table_version.py +358 -241
- pixeltable/catalog/table_version_handle.py +18 -2
- pixeltable/catalog/table_version_path.py +86 -16
- pixeltable/catalog/view.py +47 -23
- pixeltable/dataframe.py +198 -19
- pixeltable/env.py +6 -4
- pixeltable/exceptions.py +6 -0
- pixeltable/exec/__init__.py +1 -1
- pixeltable/exec/exec_node.py +2 -0
- pixeltable/exec/expr_eval/evaluators.py +4 -1
- pixeltable/exec/expr_eval/expr_eval_node.py +4 -4
- pixeltable/exec/in_memory_data_node.py +1 -1
- pixeltable/exec/sql_node.py +188 -22
- pixeltable/exprs/column_property_ref.py +16 -6
- pixeltable/exprs/column_ref.py +33 -11
- pixeltable/exprs/comparison.py +1 -1
- pixeltable/exprs/data_row.py +5 -3
- pixeltable/exprs/expr.py +11 -4
- pixeltable/exprs/literal.py +2 -0
- pixeltable/exprs/row_builder.py +4 -6
- pixeltable/exprs/rowid_ref.py +8 -0
- pixeltable/exprs/similarity_expr.py +1 -0
- pixeltable/func/__init__.py +1 -0
- pixeltable/func/mcp.py +74 -0
- pixeltable/func/query_template_function.py +5 -3
- pixeltable/func/tools.py +12 -2
- pixeltable/func/udf.py +2 -2
- pixeltable/functions/__init__.py +1 -0
- pixeltable/functions/anthropic.py +19 -45
- pixeltable/functions/deepseek.py +19 -38
- pixeltable/functions/fireworks.py +9 -18
- pixeltable/functions/gemini.py +2 -3
- pixeltable/functions/groq.py +108 -0
- pixeltable/functions/llama_cpp.py +6 -6
- pixeltable/functions/mistralai.py +16 -53
- pixeltable/functions/ollama.py +1 -1
- pixeltable/functions/openai.py +82 -165
- pixeltable/functions/string.py +212 -58
- pixeltable/functions/together.py +22 -80
- pixeltable/globals.py +10 -4
- pixeltable/index/base.py +5 -0
- pixeltable/index/btree.py +5 -0
- pixeltable/index/embedding_index.py +5 -0
- pixeltable/io/external_store.py +10 -31
- pixeltable/io/label_studio.py +5 -5
- pixeltable/io/parquet.py +2 -2
- pixeltable/io/table_data_conduit.py +1 -32
- pixeltable/metadata/__init__.py +11 -2
- pixeltable/metadata/converters/convert_13.py +2 -2
- pixeltable/metadata/converters/convert_30.py +6 -11
- pixeltable/metadata/converters/convert_35.py +9 -0
- pixeltable/metadata/converters/convert_36.py +38 -0
- pixeltable/metadata/converters/convert_37.py +15 -0
- pixeltable/metadata/converters/util.py +3 -9
- pixeltable/metadata/notes.py +3 -0
- pixeltable/metadata/schema.py +13 -1
- pixeltable/plan.py +135 -12
- pixeltable/share/packager.py +138 -14
- pixeltable/share/publish.py +2 -2
- pixeltable/store.py +19 -13
- pixeltable/type_system.py +30 -0
- pixeltable/utils/dbms.py +1 -1
- pixeltable/utils/formatter.py +64 -42
- {pixeltable-0.3.15.dist-info → pixeltable-0.4.0.dist-info}/METADATA +2 -1
- {pixeltable-0.3.15.dist-info → pixeltable-0.4.0.dist-info}/RECORD +78 -73
- {pixeltable-0.3.15.dist-info → pixeltable-0.4.0.dist-info}/LICENSE +0 -0
- {pixeltable-0.3.15.dist-info → pixeltable-0.4.0.dist-info}/WHEEL +0 -0
- {pixeltable-0.3.15.dist-info → pixeltable-0.4.0.dist-info}/entry_points.txt +0 -0
pixeltable/plan.py
CHANGED

@@ -75,6 +75,88 @@ class FromClause:
     tbls: list[catalog.TableVersionPath]
     join_clauses: list[JoinClause] = dataclasses.field(default_factory=list)
 
+    @property
+    def _first_tbl(self) -> catalog.TableVersionPath:
+        assert len(self.tbls) == 1
+        return self.tbls[0]
+
+
+@dataclasses.dataclass
+class SampleClause:
+    """Defines a sampling clause for a table."""
+
+    version: Optional[int]
+    n: Optional[int]
+    n_per_stratum: Optional[int]
+    fraction: Optional[float]
+    seed: Optional[int]
+    stratify_exprs: Optional[list[exprs.Expr]]
+
+    # This seed value is used if one is not supplied
+    DEFAULT_SEED = 0
+
+    # The version of the hashing algorithm used for ordering and fractional sampling.
+    CURRENT_VERSION = 1
+
+    def __post_init__(self) -> None:
+        """If no version was provided, provide the default version"""
+        if self.version is None:
+            self.version = self.CURRENT_VERSION
+        if self.seed is None:
+            self.seed = self.DEFAULT_SEED
+
+    @property
+    def is_stratified(self) -> bool:
+        """Check if the sampling is stratified"""
+        return self.stratify_exprs is not None and len(self.stratify_exprs) > 0
+
+    @property
+    def is_repeatable(self) -> bool:
+        """Return true if the same rows will continue to be sampled if source rows are added or deleted."""
+        return not self.is_stratified and self.fraction is not None
+
+    def display_str(self, inline: bool = False) -> str:
+        return str(self)
+
+    def as_dict(self) -> dict:
+        """Return a dictionary representation of the object"""
+        d = dataclasses.asdict(self)
+        d['_classname'] = self.__class__.__name__
+        if self.is_stratified:
+            d['stratify_exprs'] = [e.as_dict() for e in self.stratify_exprs]
+        return d
+
+    @classmethod
+    def from_dict(cls, d: dict) -> SampleClause:
+        """Create a SampleClause from a dictionary representation"""
+        d_cleaned = {key: value for key, value in d.items() if key != '_classname'}
+        s = cls(**d_cleaned)
+        if s.is_stratified:
+            s.stratify_exprs = [exprs.Expr.from_dict(e) for e in d_cleaned.get('stratify_exprs', [])]
+        return s
+
+    def __repr__(self) -> str:
+        s = ','.join(e.display_str(inline=True) for e in self.stratify_exprs)
+        return (
+            f'sample_{self.version}(n={self.n}, n_per_stratum={self.n_per_stratum}, '
+            f'fraction={self.fraction}, seed={self.seed}, [{s}])'
+        )
+
+    @classmethod
+    def fraction_to_md5_hex(cls, fraction: float) -> str:
+        """Return the string representation of an approximation (to ~1e-9) of a fraction of the total space
+        of md5 hash values.
+        This is used for fractional sampling.
+        """
+        # Maximum count for the upper 32 bits of MD5: 2^32
+        max_md5_value = (2**32) - 1
+
+        # Calculate the fraction of this value
+        threshold_int = max_md5_value * int(1_000_000_000 * fraction) // 1_000_000_000
+
+        # Convert to hexadecimal string with padding
+        return format(threshold_int, '08x') + 'ffffffffffffffffffffffff'
+
 
 class Analyzer:
     """
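SampleClause is the backbone of the new sampling support in 0.4.0. A standalone sketch (not part of the diff) of the non-stratified fractional case, assuming only that SampleClause is importable from pixeltable.plan, where this hunk defines it:

# Standalone sketch, not from the diff: SampleClause as added to pixeltable/plan.py in 0.4.0.
from pixeltable.plan import SampleClause

# non-stratified fractional sample: keep ~10% of rows
clause = SampleClause(version=None, n=None, n_per_stratum=None, fraction=0.1, seed=None, stratify_exprs=None)
assert clause.version == SampleClause.CURRENT_VERSION  # filled in by __post_init__
assert clause.seed == SampleClause.DEFAULT_SEED
assert not clause.is_stratified
assert clause.is_repeatable  # fractional + non-stratified: stable under inserts/deletes
assert SampleClause.from_dict(clause.as_dict()) == clause  # serialization round-trip

# threshold covering ~10% of the md5 hash space (0.1 of the 32-bit prefix, padded to 32 hex digits)
print(SampleClause.fraction_to_md5_hex(0.1))  # -> 19999999ffffffffffffffffffffffff

Fractional sampling thus reduces to a hex-string comparison: under a uniform hash, roughly 10% of digests sort below the printed threshold, and inserting or deleting other rows cannot change a given row's digest, which is why is_repeatable holds for this case.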
@@ -87,6 +169,8 @@
     group_by_clause: Optional[list[exprs.Expr]]  # None for non-aggregate queries; [] for agg query w/o grouping
     grouping_exprs: list[exprs.Expr]  # [] for non-aggregate queries or agg query w/o grouping
     order_by_clause: OrderByClause
+    stratify_exprs: list[exprs.Expr]  # [] if no stratiifcation is required
+    sample_clause: Optional[SampleClause]  # None if no sampling clause is present
 
     sql_elements: exprs.SqlElementCache
 
@@ -107,6 +191,7 @@
         where_clause: Optional[exprs.Expr] = None,
         group_by_clause: Optional[list[exprs.Expr]] = None,
         order_by_clause: Optional[list[tuple[exprs.Expr, bool]]] = None,
+        sample_clause: Optional[SampleClause] = None,
     ):
         if order_by_clause is None:
             order_by_clause = []
@@ -120,6 +205,11 @@
         self.group_by_clause = (
             [e.resolve_computed_cols() for e in group_by_clause] if group_by_clause is not None else None
         )
+        self.sample_clause = sample_clause
+        if self.sample_clause is not None and self.sample_clause.is_stratified:
+            self.stratify_exprs = [e.resolve_computed_cols() for e in sample_clause.stratify_exprs]
+        else:
+            self.stratify_exprs = []
         self.order_by_clause = [OrderByItem(e.resolve_computed_cols(), asc) for e, asc in order_by_clause]
 
         self.sql_where_clause = None
@@ -135,8 +225,11 @@
             self.all_exprs.append(join_clause.join_predicate)
         if self.group_by_clause is not None:
             self.all_exprs.extend(self.group_by_clause)
+        self.all_exprs.extend(self.stratify_exprs)
         self.all_exprs.extend(e for e, _ in self.order_by_clause)
         if self.filter is not None:
+            if sample_clause is not None:
+                raise excs.Error(f'Filter {self.filter} not expressible in SQL')
             self.all_exprs.append(self.filter)
 
         self.agg_order_by = []
@@ -260,7 +353,7 @@ class Planner:
     # TODO: create an exec.CountNode and change this to create_count_plan()
     @classmethod
     def create_count_stmt(cls, tbl: catalog.TableVersionPath, where_clause: Optional[exprs.Expr] = None) -> sql.Select:
-        stmt = sql.select(sql.func.count())
+        stmt = sql.select(sql.func.count().label('all_count'))
        refd_tbl_ids: set[UUID] = set()
        if where_clause is not None:
            analyzer = cls.analyze(tbl, where_clause)
@@ -289,7 +382,7 @@
 
         # create InMemoryDataNode for 'rows'
         plan: exec.ExecNode = exec.InMemoryDataNode(
-            TableVersionHandle(tbl.id, tbl.effective_version), rows, row_builder, tbl.
+            TableVersionHandle(tbl.id, tbl.effective_version), rows, row_builder, tbl.next_row_id
         )
 
         media_input_col_info = [
@@ -322,6 +415,13 @@
         )
         return plan
 
+    @classmethod
+    def rowid_columns(cls, target: TableVersionHandle, num_rowid_cols: Optional[int] = None) -> list[exprs.Expr]:
+        """Return list of RowidRef for the given number of associated rowids"""
+        if num_rowid_cols is None:
+            num_rowid_cols = target.get().num_rowid_columns()
+        return [exprs.RowidRef(target, i) for i in range(num_rowid_cols)]
+
     @classmethod
     def create_df_insert_plan(
         cls, tbl: catalog.TableVersion, df: 'pxt.DataFrame', ignore_errors: bool
@@ -385,7 +485,7 @@
 
         cls.__check_valid_columns(tbl.tbl_version.get(), recomputed_cols, 'updated in')
 
-        recomputed_base_cols = {col for col in recomputed_cols if col.tbl == tbl.tbl_version}
+        recomputed_base_cols = {col for col in recomputed_cols if col.tbl.id == tbl.tbl_version.id}
         copied_cols = [
             col
             for col in target.cols_by_id.values()
@@ -409,7 +509,7 @@
         for i, col in enumerate(all_base_cols):
             plan.row_builder.add_table_column(col, select_list[i].slot_idx)
         recomputed_user_cols = [c for c in recomputed_cols if c.name is not None]
-        return plan, [f'{c.tbl.
+        return plan, [f'{c.tbl.name}.{c.name}' for c in updated_cols + recomputed_user_cols], recomputed_user_cols
 
     @classmethod
     def __check_valid_columns(
@@ -465,7 +565,7 @@
         recomputed_cols.update(idx_val_cols)
         # we only need to recompute stored columns (unstored ones are substituted away)
         recomputed_cols = {c for c in recomputed_cols if c.is_stored}
-        recomputed_base_cols = {col for col in recomputed_cols if col.tbl == target}
+        recomputed_base_cols = {col for col in recomputed_cols if col.tbl.id == target.id}
         copied_cols = [
             col
             for col in target.cols_by_id.values()
@@ -591,8 +691,13 @@
         # 2. for component views: iterator args
         iterator_args = [target.iterator_args] if target.iterator_args is not None else []
 
-
+        from_clause = FromClause(tbls=[view.base])
+        base_analyzer = Analyzer(
+            from_clause, iterator_args, where_clause=target.predicate, sample_clause=target.sample_clause
+        )
+        row_builder = exprs.RowBuilder(base_analyzer.all_exprs, stored_cols, [])
 
+        # if we're propagating an insert, we only want to see those base rows that were created for the current version
         # execution plan:
         # 1. materialize exprs computed from the base that are needed for stored view columns
         # 2. if it's an iterator view, expand the base rows into component rows
@@ -603,8 +708,11 @@
             for e in row_builder.default_eval_ctx.target_exprs
             if e.is_bound_by([view]) and not e.is_bound_by([view.base])
         ]
-
-
+
+        # Create a new analyzer reflecting exactly what is required from the base table
+        base_analyzer = Analyzer(
+            from_clause, base_output_exprs, where_clause=target.predicate, sample_clause=target.sample_clause
+        )
         base_eval_ctx = row_builder.create_eval_ctx(base_analyzer.all_exprs)
         plan = cls._create_query_plan(
             row_builder=row_builder,
@@ -701,6 +809,7 @@
         group_by_clause: Optional[list[exprs.Expr]] = None,
         order_by_clause: Optional[list[tuple[exprs.Expr, bool]]] = None,
         limit: Optional[exprs.Expr] = None,
+        sample_clause: Optional[SampleClause] = None,
         ignore_errors: bool = False,
         exact_version_only: Optional[list[catalog.TableVersionHandle]] = None,
     ) -> exec.ExecNode:
@@ -714,12 +823,14 @@
             order_by_clause = []
         if exact_version_only is None:
             exact_version_only = []
+
         analyzer = Analyzer(
             from_clause,
             select_list,
             where_clause=where_clause,
             group_by_clause=group_by_clause,
             order_by_clause=order_by_clause,
+            sample_clause=sample_clause,
         )
         row_builder = exprs.RowBuilder(analyzer.all_exprs, [], [])
 
@@ -773,6 +884,7 @@
         # - join clause subexprs
         # - subexprs of Where clause conjuncts that can't be run in SQL
         # - all grouping exprs
+        # - all stratify exprs
         candidates = list(
             exprs.Expr.list_subexprs(
                 analyzer.select_list,
@@ -787,10 +899,12 @@
             candidates.extend(
                 exprs.Expr.subexprs(analyzer.filter, filter=sql_elements.contains, traverse_matches=False)
             )
-
-
-
-
+        candidates.extend(
+            exprs.Expr.list_subexprs(analyzer.grouping_exprs, filter=sql_elements.contains, traverse_matches=False)
+        )
+        candidates.extend(
+            exprs.Expr.list_subexprs(analyzer.stratify_exprs, filter=sql_elements.contains, traverse_matches=False)
+        )
         # not isinstance(...): we don't want to materialize Literals via a Select
         sql_exprs = exprs.ExprSet(e for e in candidates if not isinstance(e, exprs.Literal))
 
@@ -835,6 +949,15 @@
         # we need to order the input for window functions
         plan.set_order_by(analyzer.get_window_fn_ob_clause())
 
+        if analyzer.sample_clause is not None:
+            plan = exec.SqlSampleNode(
+                row_builder,
+                input=plan,
+                select_list=tbl_scan_exprs,
+                sample_clause=analyzer.sample_clause,
+                stratify_exprs=analyzer.stratify_exprs,
+            )
+
         plan = cls._insert_prefetch_node(tbl.tbl_version.id, row_builder, plan)
 
         if analyzer.group_by_clause is not None:
pixeltable/share/packager.py
CHANGED

@@ -1,4 +1,7 @@
+import base64
 import datetime
+import io
+import itertools
 import json
 import logging
 import tarfile
@@ -7,17 +10,21 @@ import urllib.request
 import uuid
 from pathlib import Path
 from typing import Any, Iterator, Optional
+from uuid import UUID
 
 import more_itertools
+import numpy as np
+import PIL.Image
 import pyarrow as pa
 import pyarrow.parquet as pq
 import sqlalchemy as sql
 
 import pixeltable as pxt
-from pixeltable import catalog, exceptions as excs, metadata
+from pixeltable import catalog, exceptions as excs, metadata, type_system as ts
 from pixeltable.env import Env
 from pixeltable.metadata import schema
 from pixeltable.utils import sha256sum
+from pixeltable.utils.formatter import Formatter
 from pixeltable.utils.media_store import MediaStore
 
 _logger = logging.getLogger('pixeltable')
@@ -45,13 +52,17 @@ class TablePackager:
     media_files: dict[Path, str]  # Mapping from local media file paths to their tarball names
     md: dict[str, Any]
 
+    bundle_path: Path
+    preview_header: dict[str, str]
+    preview: list[list[Any]]
+
     def __init__(self, table: catalog.Table, additional_md: Optional[dict[str, Any]] = None) -> None:
         self.table = table
         self.tmp_dir = Path(Env.get().create_tmp_path())
         self.media_files = {}
 
         # Load metadata
-        with
+        with catalog.Catalog.get().begin_xact(for_write=False):
             tbl_md = catalog.Catalog.get().load_replica_md(table)
             self.md = {
                 'pxt_version': pxt.__version__,
@@ -66,20 +77,29 @@ class TablePackager:
         Export the table to a tarball containing Parquet tables and media files.
         """
         assert not self.tmp_dir.exists()  # Packaging can only be done once per TablePackager instance
-
+
+        _logger.info(f'Packaging table {self.table._path()!r} and its ancestors in: {self.tmp_dir}')
         self.tmp_dir.mkdir()
         with open(self.tmp_dir / 'metadata.json', 'w', encoding='utf8') as fp:
             json.dump(self.md, fp)
         self.tables_dir = self.tmp_dir / 'tables'
         self.tables_dir.mkdir()
-        with
+        with catalog.Catalog.get().begin_xact(for_write=False):
             for tv in self.table._tbl_version_path.get_tbl_versions():
-                _logger.info(f
+                _logger.info(f'Exporting table {tv.get().versioned_name!r}.')
                 self.__export_table(tv.get())
+
         _logger.info('Building archive.')
-        bundle_path = self.__build_tarball()
-
-
+        self.bundle_path = self.__build_tarball()
+
+        _logger.info('Extracting preview data.')
+        self.md['count'] = self.table.count()
+        preview_header, preview = self.__extract_preview_data()
+        self.md['preview_header'] = preview_header
+        self.md['preview'] = preview
+
+        _logger.info(f'Packaging complete: {self.bundle_path}')
+        return self.bundle_path
 
     def __export_table(self, tv: catalog.TableVersion) -> None:
         """
@@ -107,7 +127,7 @@
         # We use snappy compression for the Parquet tables; the entire bundle will be bzip2-compressed later, so
         # faster compression should provide good performance while still reducing temporary storage utilization.
         parquet_writer = pq.ParquetWriter(parquet_file, parquet_schema, compression='SNAPPY')
-        filter_tv = self.table.
+        filter_tv = self.table._tbl_version_path.tbl_version.get()
         row_iter = tv.store_tbl.dump_rows(tv.version, filter_tv.store_tbl, filter_tv.version)
         for pa_table in self.__to_pa_tables(row_iter, sql_types, media_cols, parquet_schema):
             parquet_writer.write_table(pa_table)
@@ -206,6 +226,96 @@
             tf.add(src_file, arcname=f'media/{dest_name}')
         return bundle_path
 
+    def __extract_preview_data(self) -> tuple[dict[str, str], list[list[Any]]]:
+        """
+        Extract a preview of the table data for display in the UI.
+
+        In order to bound the size of the output data, all "unbounded" data types are resized:
+        - Strings are abbreviated as per Formatter.abbreviate()
+        - Arrays and JSON are shortened and formatted as strings
+        - Images are resized to thumbnail size as a base64-encoded webp
+        - Videos are replaced by their first frame and resized as above
+        - Documents are replaced by a thumbnail as a base64-encoded webp
+        """
+        # First 8 columns
+        preview_cols = dict(itertools.islice(self.table._get_schema().items(), 0, 8))
+        select_list = [self.table[col_name] for col_name in preview_cols]
+        # First 5 rows
+        rows = list(self.table.select(*select_list).head(n=5))
+
+        preview_header = {col_name: str(col_type._type) for col_name, col_type in preview_cols.items()}
+        preview = [
+            [self.__encode_preview_data(val, col_type)]
+            for row in rows
+            for val, col_type in zip(row.values(), preview_cols.values())
+        ]
+
+        return preview_header, preview
+
+    def __encode_preview_data(self, val: Any, col_type: ts.ColumnType) -> Any:
+        if val is None:
+            return None
+
+        match col_type._type:
+            case ts.ColumnType.Type.STRING:
+                assert isinstance(val, str)
+                return Formatter.abbreviate(val)
+
+            case ts.ColumnType.Type.INT | ts.ColumnType.Type.FLOAT | ts.ColumnType.Type.BOOL:
+                return val
+
+            case ts.ColumnType.Type.TIMESTAMP | ts.ColumnType.Type.DATE:
+                return str(val)
+
+            case ts.ColumnType.Type.ARRAY:
+                assert isinstance(val, np.ndarray)
+                return Formatter.format_array(val)
+
+            case ts.ColumnType.Type.JSON:
+                # We need to escape the JSON string server-side for security reasons.
+                # Therefore we don't escape it here, in order to avoid double-escaping.
+                return Formatter.format_json(val, escape_strings=False)
+
+            case ts.ColumnType.Type.IMAGE:
+                # Rescale the image to minimize data transfer size
+                assert isinstance(val, PIL.Image.Image)
+                return self.__encode_image(val)
+
+            case ts.ColumnType.Type.VIDEO:
+                assert isinstance(val, str)
+                return self.__encode_video(val)
+
+            case ts.ColumnType.Type.AUDIO:
+                return None
+
+            case ts.ColumnType.Type.DOCUMENT:
+                assert isinstance(val, str)
+                return self.__encode_document(val)
+
+            case _:
+                raise AssertionError(f'Unrecognized column type: {col_type._type}')
+
+    def __encode_image(self, img: PIL.Image.Image) -> str:
+        # Heuristic for thumbnail sizing:
+        # Standardize on a width of 240 pixels (to most efficiently utilize the columnar display).
+        # But, if the aspect ratio is below 2:3, bound the height at 360 pixels (to avoid unboundedly tall thumbnails
+        # in the case of highly oblong images).
+        if img.height > img.width * 1.5:
+            scaled_img = img.resize((img.width * 360 // img.height, 360))
+        else:
+            scaled_img = img.resize((240, img.height * 240 // img.width))
+        with io.BytesIO() as buffer:
+            scaled_img.save(buffer, 'webp')
+            return base64.b64encode(buffer.getvalue()).decode()
+
+    def __encode_video(self, video_path: str) -> Optional[str]:
+        thumb = Formatter.extract_first_video_frame(video_path)
+        return self.__encode_image(thumb) if thumb is not None else None
+
+    def __encode_document(self, doc_path: str) -> Optional[str]:
+        thumb = Formatter.make_document_thumbnail(doc_path)
+        return self.__encode_image(thumb) if thumb is not None else None
+
 
 class TableRestorer:
     """
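Note the sizing heuristic in __encode_image: width is pinned to 240px, with height capped at 360px for images taller than 2:3. A standalone sketch (not from the diff; encode_thumbnail is a hypothetical stand-in for the private method):

# Standalone sketch, not from the diff; encode_thumbnail mirrors TablePackager.__encode_image.
import base64
import io

import PIL.Image

def encode_thumbnail(img: PIL.Image.Image) -> str:
    if img.height > img.width * 1.5:  # taller than 2:3: cap height at 360px
        scaled = img.resize((img.width * 360 // img.height, 360))
    else:  # otherwise: pin width to 240px
        scaled = img.resize((240, img.height * 240 // img.width))
    with io.BytesIO() as buffer:
        scaled.save(buffer, 'webp')
        return base64.b64encode(buffer.getvalue()).decode()

# a 4000x1000 panorama scales to 240x60; a 1000x4000 scan is capped at 90x360
thumb_b64 = encode_thumbnail(PIL.Image.new('RGB', (4000, 1000)))
print(len(thumb_b64))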
@@ -253,13 +363,26 @@
         tbl_md = [schema.FullTableMd.from_dict(t) for t in self.md['md']['tables']]
 
         # Create the replica table
-        #
-
-
+        # The logic here needs to be completely restructured in order to make it concurrency-safe.
+        # - Catalog.create_replica() needs to write the metadata and also create the physical store tables
+        #   and populate them, otherwise concurrent readers will see an inconsistent state (table metadata w/o
+        #   an actual table)
+        # - this could be done one replica at a time (instead of the entire hierarchy)
+        cat = catalog.Catalog.get()
+        cat.create_replica(catalog.Path(self.tbl_path), tbl_md)
+        # don't call get_table() until after the calls to create_replica() and __import_table() below;
+        # the TV instances created by get_table() would be replaced by create_replica(), which creates duplicate
+        # TV instances for the same replica version, which then leads to failures when constructing queries
 
         # Now we need to instantiate and load data for replica_tbl and its ancestors, except that we skip
         # replica_tbl itself if it's a pure snapshot.
-
+        target_md = tbl_md[0]
+        is_pure_snapshot = (
+            target_md.tbl_md.view_md is not None
+            and target_md.tbl_md.view_md.predicate is None
+            and len(target_md.schema_version_md.columns) == 0
+        )
+        if is_pure_snapshot:
             ancestor_md = tbl_md[1:]  # Pure snapshot; skip replica_tbl
         else:
             ancestor_md = tbl_md  # Not a pure snapshot; include replica_tbl
@@ -273,7 +396,8 @@
             _logger.info(f'Importing table {tv.name!r}.')
             self.__import_table(self.tmp_dir, tv, md)
 
-
+        with cat.begin_xact(for_write=False):
+            return cat.get_table_by_id(UUID(tbl_md[0].tbl_md.tbl_id))
 
     def __import_table(self, bundle_path: Path, tv: catalog.TableVersion, tbl_md: schema.FullTableMd) -> None:
         """
pixeltable/share/publish.py
CHANGED

@@ -35,7 +35,7 @@ def push_replica(dest_tbl_uri: str, src_tbl: pxt.Table) -> str:
     upload_id = response_json['upload_id']
     destination_uri = response_json['destination_uri']
 
-    Env.get().console_logger.info(f"Creating a snapshot of '{src_tbl._path}' at: {dest_tbl_uri}")
+    Env.get().console_logger.info(f"Creating a snapshot of '{src_tbl._path()}' at: {dest_tbl_uri}")
 
     bundle = packager.package()
 
@@ -117,7 +117,7 @@ def pull_replica(dest_path: str, src_tbl_uri: str) -> pxt.Table:
 
     restorer = TableRestorer(dest_path, response_json)
     tbl = restorer.restore(bundle_path)
-    Env.get().console_logger.info(f'Created local replica {tbl._path!r} from URI: {src_tbl_uri}')
+    Env.get().console_logger.info(f'Created local replica {tbl._path()!r} from URI: {src_tbl_uri}')
     return tbl
 
 
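Both hunks fix the same bug class: the old f-strings interpolated the bound method (src_tbl._path, tbl._path) instead of calling it, so the log lines rendered a '<bound method ...>' repr. A self-contained illustration with a hypothetical Table class (not pixeltable's):

# Illustration of the bug class; Table here is hypothetical.
class Table:
    def _path(self) -> str:
        return 'dir1.my_table'

t = Table()
print(f"snapshot of '{t._path}'")    # snapshot of '<bound method Table._path of ...>'
print(f"snapshot of '{t._path()}'")  # snapshot of 'dir1.my_table'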
pixeltable/store.py
CHANGED

@@ -52,7 +52,8 @@ class StoreBase:
         # We need to declare a `base` variable here, even though it's only defined for instances of `StoreView`,
         # since it's referenced by various methods of `StoreBase`
         self.base = tbl_version.base.get().store_tbl if tbl_version.base is not None else None
-
+        # we're passing in tbl_version to avoid a circular call to TableVersionHandle.get()
+        self.create_sa_tbl(tbl_version)
 
     def system_columns(self) -> list[sql.Column]:
         return [*self._pk_cols, self.v_max_col]
@@ -77,11 +78,13 @@ class StoreBase:
         self._pk_cols = [*rowid_cols, self.v_min_col]
         return [*rowid_cols, self.v_min_col, self.v_max_col]
 
-    def create_sa_tbl(self) -> None:
+    def create_sa_tbl(self, tbl_version: Optional[catalog.TableVersion] = None) -> None:
         """Create self.sa_tbl from self.tbl_version."""
+        if tbl_version is None:
+            tbl_version = self.tbl_version.get()
         system_cols = self._create_system_columns()
         all_cols = system_cols.copy()
-        for col in [c for c in
+        for col in [c for c in tbl_version.cols if c.is_stored]:
             # re-create sql.Column for each column, regardless of whether it already has sa_col set: it was bound
             # to the last sql.Table version we created and cannot be reused
             col.create_sa_cols()
@@ -99,16 +102,17 @@
         # - base x view joins can be executed as merge joins
         # - speeds up ORDER BY rowid DESC
         # - allows filtering for a particular table version in index scan
-        idx_name = f'sys_cols_idx_{
+        idx_name = f'sys_cols_idx_{tbl_version.id.hex}'
         idxs.append(sql.Index(idx_name, *system_cols))
 
         # v_min/v_max indices: speeds up base table scans needed to propagate a base table insert or delete
-        idx_name = f'vmin_idx_{
+        idx_name = f'vmin_idx_{tbl_version.id.hex}'
         idxs.append(sql.Index(idx_name, self.v_min_col, postgresql_using=Env.get().dbms.version_index_type))
-        idx_name = f'vmax_idx_{
+        idx_name = f'vmax_idx_{tbl_version.id.hex}'
         idxs.append(sql.Index(idx_name, self.v_max_col, postgresql_using=Env.get().dbms.version_index_type))
 
         self.sa_tbl = sql.Table(self._storage_name(), self.sa_md, *all_cols, *idxs)
+        # _logger.debug(f'created sa tbl for {tbl_version.id!s} (sa_tbl={id(self.sa_tbl):x}, tv={id(tbl_version):x})')
 
     @abc.abstractmethod
     def _rowid_join_predicate(self) -> sql.ColumnElement[bool]:
@@ -285,7 +289,7 @@
             else:
                 if col.col_type.is_image_type() and result_row.file_urls[value_expr_slot_idx] is None:
                     # we have yet to store this image
-                    filepath = str(MediaStore.prepare_media_path(col.tbl.id, col.id, col.tbl.
+                    filepath = str(MediaStore.prepare_media_path(col.tbl.id, col.id, col.tbl.version))
                     result_row.flush_img(value_expr_slot_idx, filepath)
                 val = result_row.get_stored_val(value_expr_slot_idx, col.sa_col.type)
                 if col.col_type.is_media_type():
@@ -415,9 +419,7 @@
             number of deleted rows
         """
         where_clause = sql.true() if where_clause is None else where_clause
-
-            self.v_min_col < current_version, self.v_max_col == schema.Table.MAX_VERSION, where_clause
-        )
+        version_clause = sql.and_(self.v_min_col < current_version, self.v_max_col == schema.Table.MAX_VERSION)
         rowid_join_clause = self._rowid_join_predicate()
         base_versions_clause = (
             sql.true() if len(base_versions) == 0 else self.base._versions_clause(base_versions, match_on_vmin)
@@ -428,10 +430,12 @@
             set_clause[index_info.undo_col.sa_col] = index_info.val_col.sa_col
             # set value column to NULL
             set_clause[index_info.val_col.sa_col] = None
+
         stmt = (
             sql.update(self.sa_tbl)
             .values(set_clause)
             .where(where_clause)
+            .where(version_clause)
            .where(rowid_join_clause)
            .where(base_versions_clause)
        )
@@ -528,10 +532,12 @@ class StoreComponentView(StoreView):
         self.rowid_cols.append(self.pos_col)
         return self.rowid_cols
 
-    def create_sa_tbl(self) -> None:
-
+    def create_sa_tbl(self, tbl_version: Optional[catalog.TableVersion] = None) -> None:
+        if tbl_version is None:
+            tbl_version = self.tbl_version.get()
+        super().create_sa_tbl(tbl_version)
         # we need to fix up the 'pos' column in TableVersion
-
+        tbl_version.cols_by_name['pos'].sa_col = self.pos_col
 
     def _rowid_join_predicate(self) -> sql.ColumnElement[bool]:
         return sql.and_(
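The update fix above splits the version predicate out into its own version_clause and reattaches it with an extra .where(); chained .where() calls on a SQLAlchemy statement are AND-ed together. A minimal sketch (hypothetical demo table; MAX_VERSION stands in for schema.Table.MAX_VERSION):

# Minimal sketch, not from the diff: chained .where() calls are AND-ed together.
import sqlalchemy as sql

md = sql.MetaData()
t = sql.Table(
    'demo', md,
    sql.Column('v_min', sql.BigInteger),
    sql.Column('v_max', sql.BigInteger),
    sql.Column('val', sql.Integer),
)

MAX_VERSION = 9223372036854775807  # stand-in for schema.Table.MAX_VERSION
current_version = 5
version_clause = sql.and_(t.c.v_min < current_version, t.c.v_max == MAX_VERSION)

stmt = sql.update(t).values(val=None).where(sql.true()).where(version_clause)
print(stmt)  # ... WHERE true AND demo.v_min < :v_min_1 AND demo.v_max = :v_max_1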
pixeltable/type_system.py
CHANGED

@@ -395,6 +395,36 @@ class ColumnType:
             raise excs.Error(f'Standard Python type `{name}` cannot be used here; use `{suggestion}` instead')
         raise excs.Error(f'Unknown type: {t}')
 
+    @classmethod
+    def from_json_schema(cls, schema: dict[str, Any]) -> Optional[ColumnType]:
+        # We first express the JSON schema as a Python type, and then convert it to a Pixeltable type.
+        # TODO: Is there a meaningful fallback if one of these operations fails? (Maybe another use case for a pxt Any
+        # type?)
+        py_type = cls.__json_schema_to_py_type(schema)
+        return cls.from_python_type(py_type) if py_type is not None else None
+
+    @classmethod
+    def __json_schema_to_py_type(cls, schema: dict[str, Any]) -> Union[type, _GenericAlias, None]:
+        if 'type' in schema:
+            if schema['type'] == 'null':
+                return type(None)
+            if schema['type'] == 'string':
+                return str
+            if schema['type'] == 'integer':
+                return int
+            if schema['type'] == 'number':
+                return float
+            if schema['type'] == 'boolean':
+                return bool
+            if schema['type'] in ('array', 'object'):
+                return list
+        elif 'anyOf' in schema:
+            subscripts = tuple(cls.__json_schema_to_py_type(subschema) for subschema in schema['anyOf'])
+            if all(subscript is not None for subscript in subscripts):
+                return Union[subscripts]
+
+        return None
+
     def validate_literal(self, val: Any) -> None:
         """Raise TypeError if val is not a valid literal for this type"""
         if val is None:
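from_json_schema maps a JSON schema onto a Pixeltable type by way of a Python type: scalar types map directly, array and object both land on Python list, anyOf becomes a Union, and anything unmapped yields None. A quick sketch of the observable behavior (the exact reprs come from from_python_type and aren't shown; requires pixeltable 0.4.0):

# Sketch of the mapping defined in the hunk above.
from pixeltable import type_system as ts

print(ts.ColumnType.from_json_schema({'type': 'string'}))   # via Python str
print(ts.ColumnType.from_json_schema({'type': 'number'}))   # via Python float
print(ts.ColumnType.from_json_schema({'type': 'object'}))   # via Python list, per the hunk
print(ts.ColumnType.from_json_schema({'anyOf': [{'type': 'integer'}, {'type': 'null'}]}))  # via Union[int, None]
print(ts.ColumnType.from_json_schema({'type': 'tuple'}))    # unmapped schema -> None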
pixeltable/utils/dbms.py
CHANGED

@@ -35,7 +35,7 @@ class PostgresqlDbms(Dbms):
     """
 
     def __init__(self, db_url: URL):
-        super().__init__('postgresql', '
+        super().__init__('postgresql', 'SERIALIZABLE', 'brin', db_url)
 
     def drop_db_stmt(self, database: str) -> str:
         return f'DROP DATABASE {database}'
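The truncated removed line leaves the old second argument unknown, but judging by the new call and the index code in pixeltable/store.py above (postgresql_using=Env.get().dbms.version_index_type), the two string arguments plausibly set the transaction isolation level and the access method for the v_min/v_max indices; BRIN is a natural fit for monotonically increasing version columns. A sketch of how such an index renders (hypothetical table name; standard SQLAlchemy API):

# Sketch, not from the diff: rendering a BRIN index with SQLAlchemy's postgresql dialect.
import sqlalchemy as sql
from sqlalchemy.dialects import postgresql

md = sql.MetaData()
t = sql.Table('demo', md, sql.Column('v_min', sql.BigInteger))
idx = sql.Index('vmin_idx_demo', t.c.v_min, postgresql_using='brin')

ddl = sql.schema.CreateIndex(idx).compile(dialect=postgresql.dialect())
print(ddl)  # CREATE INDEX vmin_idx_demo ON demo USING brin (v_min)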
|