pixeltable 0.2.4__py3-none-any.whl → 0.2.6__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry; it is provided for informational purposes only.
Potentially problematic release: this version of pixeltable might be problematic.
- pixeltable/__init__.py +18 -9
- pixeltable/__version__.py +3 -0
- pixeltable/catalog/column.py +31 -50
- pixeltable/catalog/insertable_table.py +7 -6
- pixeltable/catalog/table.py +171 -57
- pixeltable/catalog/table_version.py +417 -140
- pixeltable/catalog/table_version_path.py +2 -2
- pixeltable/dataframe.py +239 -121
- pixeltable/env.py +82 -16
- pixeltable/exec/__init__.py +2 -1
- pixeltable/exec/cache_prefetch_node.py +1 -1
- pixeltable/exec/data_row_batch.py +6 -7
- pixeltable/exec/expr_eval_node.py +28 -28
- pixeltable/exec/in_memory_data_node.py +11 -7
- pixeltable/exec/sql_scan_node.py +7 -6
- pixeltable/exprs/__init__.py +4 -3
- pixeltable/exprs/column_ref.py +9 -0
- pixeltable/exprs/comparison.py +3 -3
- pixeltable/exprs/data_row.py +5 -1
- pixeltable/exprs/expr.py +15 -7
- pixeltable/exprs/function_call.py +17 -15
- pixeltable/exprs/image_member_access.py +9 -28
- pixeltable/exprs/in_predicate.py +96 -0
- pixeltable/exprs/inline_array.py +13 -11
- pixeltable/exprs/inline_dict.py +15 -13
- pixeltable/exprs/literal.py +16 -4
- pixeltable/exprs/row_builder.py +15 -41
- pixeltable/exprs/similarity_expr.py +65 -0
- pixeltable/ext/__init__.py +5 -0
- pixeltable/ext/functions/yolox.py +92 -0
- pixeltable/func/__init__.py +0 -2
- pixeltable/func/aggregate_function.py +18 -15
- pixeltable/func/callable_function.py +57 -13
- pixeltable/func/expr_template_function.py +20 -3
- pixeltable/func/function.py +35 -4
- pixeltable/func/globals.py +24 -14
- pixeltable/func/signature.py +23 -27
- pixeltable/func/udf.py +13 -12
- pixeltable/functions/__init__.py +8 -8
- pixeltable/functions/eval.py +7 -8
- pixeltable/functions/huggingface.py +64 -17
- pixeltable/functions/openai.py +36 -3
- pixeltable/functions/pil/image.py +61 -64
- pixeltable/functions/together.py +21 -0
- pixeltable/functions/util.py +11 -0
- pixeltable/globals.py +425 -0
- pixeltable/index/__init__.py +2 -0
- pixeltable/index/base.py +51 -0
- pixeltable/index/embedding_index.py +168 -0
- pixeltable/io/__init__.py +3 -0
- pixeltable/{utils → io}/hf_datasets.py +48 -17
- pixeltable/io/pandas.py +148 -0
- pixeltable/{utils → io}/parquet.py +58 -33
- pixeltable/iterators/__init__.py +1 -1
- pixeltable/iterators/base.py +4 -0
- pixeltable/iterators/document.py +218 -97
- pixeltable/iterators/video.py +8 -9
- pixeltable/metadata/__init__.py +7 -3
- pixeltable/metadata/converters/convert_12.py +3 -0
- pixeltable/metadata/converters/convert_13.py +41 -0
- pixeltable/metadata/schema.py +45 -22
- pixeltable/plan.py +15 -51
- pixeltable/store.py +38 -41
- pixeltable/tool/create_test_db_dump.py +39 -4
- pixeltable/type_system.py +47 -96
- pixeltable/utils/documents.py +42 -12
- pixeltable/utils/http_server.py +70 -0
- {pixeltable-0.2.4.dist-info → pixeltable-0.2.6.dist-info}/METADATA +14 -10
- pixeltable-0.2.6.dist-info/RECORD +119 -0
- {pixeltable-0.2.4.dist-info → pixeltable-0.2.6.dist-info}/WHEEL +1 -1
- pixeltable/client.py +0 -604
- pixeltable/exprs/image_similarity_predicate.py +0 -58
- pixeltable/func/batched_function.py +0 -53
- pixeltable/tests/conftest.py +0 -177
- pixeltable/tests/functions/test_fireworks.py +0 -42
- pixeltable/tests/functions/test_functions.py +0 -60
- pixeltable/tests/functions/test_huggingface.py +0 -158
- pixeltable/tests/functions/test_openai.py +0 -152
- pixeltable/tests/functions/test_together.py +0 -111
- pixeltable/tests/test_audio.py +0 -65
- pixeltable/tests/test_catalog.py +0 -27
- pixeltable/tests/test_client.py +0 -21
- pixeltable/tests/test_component_view.py +0 -370
- pixeltable/tests/test_dataframe.py +0 -439
- pixeltable/tests/test_dirs.py +0 -107
- pixeltable/tests/test_document.py +0 -120
- pixeltable/tests/test_exprs.py +0 -805
- pixeltable/tests/test_function.py +0 -324
- pixeltable/tests/test_migration.py +0 -43
- pixeltable/tests/test_nos.py +0 -54
- pixeltable/tests/test_snapshot.py +0 -208
- pixeltable/tests/test_table.py +0 -1267
- pixeltable/tests/test_transactional_directory.py +0 -42
- pixeltable/tests/test_types.py +0 -22
- pixeltable/tests/test_video.py +0 -159
- pixeltable/tests/test_view.py +0 -530
- pixeltable/tests/utils.py +0 -408
- pixeltable-0.2.4.dist-info/RECORD +0 -132
- {pixeltable-0.2.4.dist-info → pixeltable-0.2.6.dist-info}/LICENSE +0 -0
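The headline change in this file list is the removal of pixeltable/client.py (-604 lines) in favor of the new pixeltable/globals.py (+425 lines): catalog operations that previously went through a Client instance appear to be exposed as module-level functions, which the create_test_db_dump.py hunks at the bottom of this diff confirm (pxt.Client() disappears; direct pxt.create_table()/pxt.create_view() calls take its place). A minimal before/after sketch, assuming only the function names visible in those hunks; the schema here is invented for illustration:

import pixeltable as pxt

# 0.2.4 style: catalog operations went through a Client instance
#   cl = pxt.Client()
#   t = cl.create_table('demo', {'c2': pxt.IntType()})

# 0.2.6 style: the same operations are module-level functions (pixeltable/globals.py)
t = pxt.create_table('demo', {'c2': pxt.IntType()}, primary_key='c2')
v = pxt.create_view('demo_view', t, filter=(t.c2 < 50))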
pixeltable/metadata/schema.py
CHANGED
@@ -1,4 +1,4 @@
-from typing import Optional, List,
+from typing import Optional, List, get_type_hints, Type, Any, TypeVar, Tuple, Union
 import platform
 import uuid
 import dataclasses
@@ -71,16 +71,43 @@ class Dir(Base):


 @dataclasses.dataclass
-class
+class ColumnMd:
     """
-    Records
-
-
-
+    Records the non-versioned metadata of a column.
+    - immutable attributes: type, primary key, etc.
+    - when a column was added/dropped, which is needed to GC unreachable storage columns
+      (a column that was added after table snapshot n and dropped before table snapshot n+1 can be removed
+      from the stored table).
     """
-
+    id: int
     schema_version_add: int
     schema_version_drop: Optional[int]
+    col_type: dict
+
+    # if True, is part of the primary key
+    is_pk: bool
+
+    # if set, this is a computed column
+    value_expr: Optional[dict]
+
+    # if True, the column is present in the stored table
+    stored: Optional[bool]
+
+
+@dataclasses.dataclass
+class IndexMd:
+    """
+    Metadata needed to instantiate an EmbeddingIndex
+    """
+    id: int
+    name: str
+    indexed_col_id: int  # column being indexed
+    index_val_col_id: int  # column holding the values to be indexed
+    index_val_undo_col_id: int  # column holding index values for deleted rows
+    schema_version_add: int
+    schema_version_drop: Optional[int]
+    class_fqn: str
+    init_args: dict[str, Any]


 @dataclasses.dataclass
@@ -91,13 +118,13 @@ class ViewMd:
     base_versions: List[Tuple[str, Optional[int]]]

     # filter predicate applied to the base table; view-only
-    predicate: Optional[
+    predicate: Optional[dict[str, Any]]

     # ComponentIterator subclass; only for component views
     iterator_class_fqn: Optional[str]

     # args to pass to the iterator class constructor; only for component views
-    iterator_args: Optional[
+    iterator_args: Optional[dict[str, Any]]


 @dataclasses.dataclass
@@ -109,15 +136,15 @@ class TableMd:
     # each version has a corresponding schema version (current_version >= current_schema_version)
     current_schema_version: int

-    # used to assign Column.id
-
+    next_col_id: int  # used to assign Column.id
+    next_idx_id: int  # used to assign IndexMd.id

     # - used to assign the rowid column in the storage table
     # - every row is assigned a unique and immutable rowid on insertion
     next_row_id: int

-
-
+    column_md: dict[int, ColumnMd]  # col_id -> ColumnMd
+    index_md: dict[int, IndexMd]  # index_id -> IndexMd
     view_md: Optional[ViewMd]


@@ -155,24 +182,20 @@ class TableVersion(Base):
 @dataclasses.dataclass
 class SchemaColumn:
     """
-    Records the
-    Contains the full set of columns for each new schema version: one record per (column x schema version).
+    Records the versioned metadata of a column.
     """
     pos: int
     name: str
-    col_type: dict
-    is_pk: bool
-    value_expr: Optional[dict]
-    stored: Optional[bool]
-    # if True, creates vector index for this column
-    is_indexed: bool


 @dataclasses.dataclass
 class TableSchemaVersionMd:
+    """
+    Records all versioned table metadata.
+    """
     schema_version: int
     preceding_schema_version: Optional[int]
-    columns:
+    columns: dict[int, SchemaColumn]  # col_id -> SchemaColumn
     num_retained_versions: int
     comment: str
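Taken together, the schema.py changes split column metadata into a non-versioned part (the new ColumnMd, keyed by col_id in TableMd.column_md) and a versioned part (SchemaColumn, keyed by col_id in TableSchemaVersionMd.columns), and they replace the old per-column is_indexed flag with standalone IndexMd records that name an index implementation (class_fqn) plus its constructor args (init_args). A short sketch of what one such record might look like; the field layout is copied from the diff above, while the example values (including the shape of the serialized col_type dict) are invented:

import dataclasses
from typing import Optional

@dataclasses.dataclass
class ColumnMd:
    id: int
    schema_version_add: int
    schema_version_drop: Optional[int]
    col_type: dict
    is_pk: bool
    value_expr: Optional[dict]
    stored: Optional[bool]

# a column added in schema version 0 and never dropped
col_md = ColumnMd(
    id=0, schema_version_add=0, schema_version_drop=None,
    col_type={'nullable': True},  # stand-in for the serialized type dict
    is_pk=False, value_expr=None, stored=True)

# dataclasses round-trip cleanly to the dict form persisted in the metadata store
assert dataclasses.asdict(col_md)['schema_version_drop'] is None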
pixeltable/plan.py
CHANGED
@@ -60,24 +60,10 @@ class Analyzer:
         # filter predicate applied to output rows of the SQL scan
         self.filter: Optional[exprs.Predicate] = None
         # not executable
-        self.similarity_clause: Optional[exprs.ImageSimilarityPredicate] = None
+        #self.similarity_clause: Optional[exprs.ImageSimilarityPredicate] = None
         if where_clause is not None:
             where_clause_conjuncts, self.filter = where_clause.split_conjuncts(lambda e: e.sql_expr() is not None)
             self.sql_where_clause = exprs.CompoundPredicate.make_conjunction(where_clause_conjuncts)
-            if self.filter is not None:
-                similarity_clauses, self.filter = self.filter.split_conjuncts(
-                    lambda e: isinstance(e, exprs.ImageSimilarityPredicate))
-                if len(similarity_clauses) > 1:
-                    raise excs.Error(f'More than one nearest() not supported')
-                if len(similarity_clauses) == 1:
-                    if len(self.order_by_clause) > 0:
-                        raise excs.Error((
-                            f'nearest() returns results in order of proximity and cannot be used in conjunction with '
-                            f'order_by()'))
-                    self.similarity_clause = similarity_clauses[0]
-                    img_col = self.similarity_clause.img_col_ref.col
-                    if not img_col.is_indexed:
-                        raise excs.Error(f'nearest() not available for unindexed column {img_col.name}')

         # all exprs that are evaluated in Python; not executable
         self.all_exprs = self.select_list.copy()
@@ -203,8 +189,6 @@ class Planner:
         refd_tbl_ids: Set[UUID] = set()
         if where_clause is not None:
             analyzer = cls.analyze(tbl, where_clause)
-            if analyzer.similarity_clause is not None:
-                raise excs.Error('nearest() cannot be used with count()')
             if analyzer.filter is not None:
                 raise excs.Error(f'Filter {analyzer.filter} not expressible in SQL')
             clause_element = analyzer.sql_where_clause.sql_expr()
@@ -220,18 +204,11 @@
     ) -> exec.ExecNode:
         """Creates a plan for TableVersion.insert()"""
         assert not tbl.is_view()
-        #
-        # 1. stored_cols: all cols we need to store, incl computed cols (and indices)
+        # stored_cols: all cols we need to store, incl computed cols (and indices)
         stored_cols = [c for c in tbl.cols if c.is_stored]
         assert len(stored_cols) > 0
-        # 2. values to insert into indices
-        indexed_cols = [c for c in tbl.cols if c.is_indexed]
-        index_info: List[Tuple[catalog.Column, func.Function]] = []
-        if len(indexed_cols) > 0:
-            from pixeltable.functions.nos.image_embedding import openai_clip
-            index_info = [(c, openai_clip) for c in tbl.cols if c.is_indexed]

-        row_builder = exprs.RowBuilder([], stored_cols,
+        row_builder = exprs.RowBuilder([], stored_cols, [])

         # create InMemoryDataNode for 'rows'
         stored_col_info = row_builder.output_slot_idxs()
@@ -260,7 +237,7 @@
     @classmethod
     def create_update_plan(
             cls, tbl: catalog.TableVersionPath,
-            update_targets:
+            update_targets: dict[catalog.Column, exprs.Expr],
             recompute_targets: List[catalog.Column],
             where_clause: Optional[exprs.Predicate], cascade: bool
     ) -> Tuple[exec.ExecNode, List[str], List[catalog.Column]]:
@@ -279,7 +256,7 @@
         # retrieve all stored cols and all target exprs
         assert isinstance(tbl, catalog.TableVersionPath)
         target = tbl.tbl_version  # the one we need to update
-        updated_cols =
+        updated_cols = list(update_targets.keys())
         if len(recompute_targets) > 0:
             recomputed_cols = recompute_targets.copy()
         else:
@@ -291,12 +268,12 @@
             col for col in target.cols if col.is_stored and not col in updated_cols and not col in recomputed_base_cols
         ]
         select_list = [exprs.ColumnRef(col) for col in copied_cols]
-        select_list.extend(
+        select_list.extend(update_targets.values())

         recomputed_exprs = \
             [c.value_expr.copy().resolve_computed_cols(resolve_cols=recomputed_base_cols) for c in recomputed_base_cols]
         # recomputed cols reference the new values of the updated cols
-        for col, e in update_targets:
+        for col, e in update_targets.items():
             exprs.Expr.list_substitute(recomputed_exprs, exprs.ColumnRef(col), e)
         select_list.extend(recomputed_exprs)
@@ -375,16 +352,10 @@
         # the store
         target = view.tbl_version  # the one we need to populate
         stored_cols = [c for c in target.cols if c.is_stored and (c.is_computed or target.is_iterator_column(c))]
-        # 2.
-        indexed_cols = [c for c in target.cols if c.is_indexed]
-        index_info: List[Tuple[catalog.Column, func.Function]] = []
-        if len(indexed_cols) > 0:
-            from pixeltable.functions.nos.image_embedding import openai_clip
-            index_info = [(c, openai_clip) for c in target.cols if c.is_indexed]
-        # 3. for component views: iterator args
+        # 2. for component views: iterator args
         iterator_args = [target.iterator_args] if target.iterator_args is not None else []

-        row_builder = exprs.RowBuilder(iterator_args, stored_cols,
+        row_builder = exprs.RowBuilder(iterator_args, stored_cols, [])

         # execution plan:
         # 1. materialize exprs computed from the base that are needed for stored view columns
@@ -548,7 +519,7 @@
         analyzer = Analyzer(
             tbl, select_list, where_clause=where_clause, group_by_clause=group_by_clause,
             order_by_clause=order_by_clause)
-        row_builder = exprs.RowBuilder(analyzer.all_exprs, [],
+        row_builder = exprs.RowBuilder(analyzer.all_exprs, [], analyzer.sql_exprs)

         analyzer.finalize(row_builder)
         # select_list: we need to materialize everything that's been collected
@@ -582,7 +553,7 @@
         sql_select_list = analyzer.sql_exprs.copy()
         plan = exec.SqlScanNode(
             tbl, row_builder, select_list=sql_select_list, where_clause=analyzer.sql_where_clause,
-            filter=analyzer.filter,
+            filter=analyzer.filter, order_by_items=order_by_items,
             limit=sql_limit, set_pk=with_pk, exact_version_only=exact_version_only)
         plan = cls._insert_prefetch_node(tbl.tbl_version.id, analyzer.select_list, row_builder, plan)

@@ -627,21 +598,15 @@
     @classmethod
     def create_add_column_plan(
             cls, tbl: catalog.TableVersionPath, col: catalog.Column
-    ) -> Tuple[exec.ExecNode, Optional[int]
+    ) -> Tuple[exec.ExecNode, Optional[int]]:
         """Creates a plan for InsertableTable.add_column()
         Returns:
             plan: the plan to execute
-            ctx: the context to use for the plan
             value_expr slot idx for the plan output (for computed cols)
-            embedding slot idx for the plan output (for indexed image cols)
         """
         assert isinstance(tbl, catalog.TableVersionPath)
         index_info: List[Tuple[catalog.Column, func.Function]] = []
-
-        from pixeltable.functions.nos.image_embedding import openai_clip
-        index_info = [(col, openai_clip)]
-        row_builder = exprs.RowBuilder(
-            output_exprs=[], columns=[col], indices=index_info, input_exprs=[])
+        row_builder = exprs.RowBuilder(output_exprs=[], columns=[col], input_exprs=[])
         analyzer = Analyzer(tbl, row_builder.default_eval_ctx.target_exprs)
         plan = cls._create_query_plan(tbl, row_builder=row_builder, analyzer=analyzer, with_pk=True)
         plan.ctx.batch_size = 16
@@ -651,6 +616,5 @@
         # we want to flush images
         if col.is_computed and col.is_stored and col.col_type.is_image_type():
             plan.set_stored_img_cols(row_builder.output_slot_idxs())
-        value_expr_slot_idx
-
-        return plan, value_expr_slot_idx, embedding_slot_idx
+        value_expr_slot_idx = row_builder.output_slot_idxs()[0].slot_idx if col.is_computed else None
+        return plan, value_expr_slot_idx
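Two themes run through the plan.py hunks: the hard-wired CLIP image index is gone (is_indexed, openai_clip, and all nearest() handling), matching the new pixeltable/index/ package and exprs/similarity_expr.py in the file list, and create_update_plan() now receives update_targets as dict[catalog.Column, exprs.Expr] instead of a sequence of pairs (hence the .keys(), .values(), and .items() calls above). A toy sketch of that shape change with stand-in types; only the dict-vs-pairs handling mirrors the diff:

from typing import Any

Column = str  # stand-in for catalog.Column
Expr = Any    # stand-in for exprs.Expr

def updated_cols_old(update_targets: list[tuple[Column, Expr]]) -> list[Column]:
    # 0.2.4 shape: iterate (column, expr) pairs directly
    return [col for col, _ in update_targets]

def updated_cols_new(update_targets: dict[Column, Expr]) -> list[Column]:
    # 0.2.6 shape: a dict keyed by column; iteration needs .items()
    return list(update_targets.keys())

assert updated_cols_new({'c1': 42}) == updated_cols_old([('c1', 42)])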
pixeltable/store.py
CHANGED
@@ -38,7 +38,7 @@ class StoreBase:
         self.tbl_version = tbl_version
         self.sa_md = sql.MetaData()
         self.sa_tbl: Optional[sql.Table] = None
-        self.
+        self.create_sa_tbl()

     def pk_columns(self) -> List[sql.Column]:
         return self._pk_columns
@@ -62,7 +62,7 @@ class StoreBase:
         return [*rowid_cols, self.v_min_col, self.v_max_col]


-    def
+    def create_sa_tbl(self) -> None:
         """Create self.sa_tbl from self.tbl_version."""
         system_cols = self._create_system_columns()
         all_cols = system_cols.copy()
@@ -76,9 +76,6 @@ class StoreBase:
             all_cols.append(col.sa_errormsg_col)
             all_cols.append(col.sa_errortype_col)

-            if col.is_indexed:
-                all_cols.append(col.sa_idx_col)
-
         # we create an index for:
         # - scalar columns (except for strings, because long strings can't be used for B-tree indices)
         # - non-computed video and image columns (they will contain external paths/urls that users might want to
@@ -145,8 +142,8 @@ class StoreBase:
         """Move tmp media files that we generated to a permanent location"""
         for c in media_cols:
             for table_row in table_rows:
-                file_url = table_row[c.
-                table_row[c.
+                file_url = table_row[c.store_name()]
+                table_row[c.store_name()] = self._move_tmp_media_file(file_url, c, v_min)

     def _create_table_row(
             self, input_row: exprs.DataRow, row_builder: exprs.RowBuilder, media_cols: List[catalog.Column],
@@ -168,16 +165,19 @@ class StoreBase:

         return table_row, num_excs

-    def count(self) ->
+    def count(self, conn: Optional[sql.engine.Connection] = None) -> int:
         """Return the number of rows visible in self.tbl_version"""
         stmt = sql.select(sql.func.count('*'))\
             .select_from(self.sa_tbl)\
             .where(self.v_min_col <= self.tbl_version.version)\
             .where(self.v_max_col > self.tbl_version.version)
-
+        if conn is None:
+            with env.Env.get().engine.connect() as conn:
+                result = conn.execute(stmt).scalar_one()
+        else:
             result = conn.execute(stmt).scalar_one()
-
-
+        assert isinstance(result, int)
+        return result

     def create(self, conn: sql.engine.Connection) -> None:
         self.sa_md.create_all(bind=conn)
@@ -193,38 +193,35 @@ class StoreBase:
         message).
         """
         assert col.is_stored
-
+        col_type_str = col.get_sa_col_type().compile(dialect=conn.dialect)
+        stmt = sql.text(f'ALTER TABLE {self._storage_name()} ADD COLUMN {col.store_name()} {col_type_str} NULL')
         log_stmt(_logger, stmt)
         conn.execute(stmt)
-        added_storage_cols = [col.
+        added_storage_cols = [col.store_name()]
         if col.records_errors:
             # we also need to create the errormsg and errortype storage cols
             stmt = (f'ALTER TABLE {self._storage_name()} '
-                f'ADD COLUMN {col.
+                f'ADD COLUMN {col.errormsg_store_name()} VARCHAR DEFAULT NULL')
             conn.execute(sql.text(stmt))
             stmt = (f'ALTER TABLE {self._storage_name()} '
-                f'ADD COLUMN {col.
+                f'ADD COLUMN {col.errortype_store_name()} VARCHAR DEFAULT NULL')
             conn.execute(sql.text(stmt))
-
-            self.
+            added_storage_cols.extend([col.errormsg_store_name(), col.errortype_store_name()])
+        self.create_sa_tbl()
         _logger.info(f'Added columns {added_storage_cols} to storage table {self._storage_name()}')

-    def drop_column(self, col:
-        """
-
-
-
+    def drop_column(self, col: catalog.Column, conn: sql.engine.Connection) -> None:
+        """Execute Alter Table Drop Column statement"""
+        stmt = f'ALTER TABLE {self._storage_name()} DROP COLUMN {col.store_name()}'
+        conn.execute(sql.text(stmt))
+        if col.records_errors:
+            stmt = f'ALTER TABLE {self._storage_name()} DROP COLUMN {col.errormsg_store_name()}'
+            conn.execute(sql.text(stmt))
+            stmt = f'ALTER TABLE {self._storage_name()} DROP COLUMN {col.errortype_store_name()}'
             conn.execute(sql.text(stmt))
-        if col.records_errors:
-            stmt = f'ALTER TABLE {self._storage_name()} DROP COLUMN {col.errormsg_storage_name()}'
-            conn.execute(sql.text(stmt))
-            stmt = f'ALTER TABLE {self._storage_name()} DROP COLUMN {col.errortype_storage_name()}'
-            conn.execute(sql.text(stmt))
-        self._create_sa_tbl()

     def load_column(
-            self, col: catalog.Column, exec_plan: ExecNode, value_expr_slot_idx: int,
-            conn: sql.engine.Connection
+            self, col: catalog.Column, exec_plan: ExecNode, value_expr_slot_idx: int, conn: sql.engine.Connection
     ) -> int:
         """Update store column of a computed column with values produced by an execution plan

@@ -253,18 +250,11 @@ class StoreBase:
                     col.sa_errormsg_col: error_msg
                 }
             else:
-                val = result_row.get_stored_val(value_expr_slot_idx)
+                val = result_row.get_stored_val(value_expr_slot_idx, col.sa_col.type)
                 if col.col_type.is_media_type():
                     val = self._move_tmp_media_file(val, col, result_row.pk[-1])
                 values_dict = {col.sa_col: val}

-                if col.is_indexed:
-                    # TODO: deal with exceptions
-                    assert not result_row.has_exc(embedding_slot_idx)
-                    # don't use get_stored_val() here, we need to pass the ndarray
-                    embedding = result_row[embedding_slot_idx]
-                    values_dict[col.sa_index_col] = embedding
-
             update_stmt = sql.update(self.sa_tbl).values(values_dict)
             for pk_col, pk_val in zip(self.pk_columns(), result_row.pk):
                 update_stmt = update_stmt.where(pk_col == pk_val)
@@ -337,6 +327,7 @@
             self, current_version: int, base_versions: List[Optional[int]], match_on_vmin: bool,
             where_clause: Optional[sql.ClauseElement], conn: sql.engine.Connection) -> int:
         """Mark rows as deleted that are live and were created prior to current_version.
+        Also: populate the undo columns
         Args:
             base_versions: if non-None, join only to base rows that were created at that version,
                 otherwise join to rows that are live in the base's current version (which is distinct from the
@@ -354,8 +345,14 @@
         rowid_join_clause = self._rowid_join_predicate()
         base_versions_clause = sql.true() if len(base_versions) == 0 \
             else self.base._versions_clause(base_versions, match_on_vmin)
+        set_clause = {self.v_max_col: current_version}
+        for index_info in self.tbl_version.idxs_by_name.values():
+            # copy value column to undo column
+            set_clause[index_info.undo_col.sa_col] = index_info.val_col.sa_col
+            # set value column to NULL
+            set_clause[index_info.val_col.sa_col] = None
         stmt = sql.update(self.sa_tbl) \
-            .values(
+            .values(set_clause) \
            .where(where_clause) \
            .where(rowid_join_clause) \
            .where(base_versions_clause)
@@ -416,8 +413,8 @@ class StoreComponentView(StoreView):
             self.rowid_cols.append(self.pos_col)
         return self.rowid_cols

-    def
-        super().
+    def create_sa_tbl(self) -> None:
+        super().create_sa_tbl()
         # we need to fix up the 'pos' column in TableVersion
         self.tbl_version.cols_by_name['pos'].sa_col = self.pos_col
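The most interesting store.py addition is the set_clause built for versioned deletes: one UPDATE statement both closes out the row (v_max = current_version) and preserves each index's value by assigning the value column to its undo column before NULLing it, relying on SQL's rule that SET expressions are evaluated against the pre-update row. A self-contained SQLAlchemy sketch of the column-to-column assignment; the table and column names are invented, only the pattern mirrors the diff:

import sqlalchemy as sql

md = sql.MetaData()
tbl = sql.Table(
    'rows', md,
    sql.Column('v_max', sql.BigInteger),
    sql.Column('idx_val', sql.Float),
    sql.Column('idx_val_undo', sql.Float))

current_version = 7
set_clause = {tbl.c.v_max: current_version}
set_clause[tbl.c.idx_val_undo] = tbl.c.idx_val  # copy value column to undo column
set_clause[tbl.c.idx_val] = None                # set value column to NULL
stmt = sql.update(tbl).values(set_clause)

# renders roughly as:
#   UPDATE rows SET v_max=:v_max, idx_val=:idx_val, idx_val_undo=idx_val
print(stmt)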
pixeltable/tool/create_test_db_dump.py
CHANGED
@@ -11,6 +11,7 @@ import toml
 import pixeltable as pxt
 import pixeltable.metadata as metadata
 from pixeltable.env import Env
+from pixeltable.func import Batch
 from pixeltable.type_system import \
     StringType, IntType, FloatType, BoolType, TimestampType, JsonType

@@ -29,9 +30,7 @@ class Dumper:
         os.environ['PIXELTABLE_DB'] = db_name
         os.environ['PIXELTABLE_PGDATA'] = str(shared_home / 'pgdata')

-        Env.get().
-        self.cl = pxt.Client()
-        self.cl.logging(level=logging.DEBUG, to_stdout=True)
+        Env.get().configure_logging(level=logging.DEBUG, to_stdout=True)

     def dump_db(self) -> None:
         md_version = metadata.VERSION
@@ -76,8 +75,18 @@ class Dumper:
             'c6': JsonType(nullable=False),
             'c7': JsonType(nullable=False),
         }
-        t =
+        t = pxt.create_table('sample_table', schema, primary_key='c2')
+
+        # Add columns for InlineArray and InlineDict
         t.add_column(c8=[[1, 2, 3], [4, 5, 6]])
+        t.add_column(c9=[['a', 'b', 'c'], ['d', 'e', 'f']])
+        t.add_column(c10=[t.c1, [t.c1n, t.c2]])
+        t.add_column(c11={'int': 22, 'dict': {'key': 'val'}, 'expr': t.c1})
+
+        # InPredicate
+        t.add_column(isin_1=t.c1.isin(['test string 1', 'test string 2', 'test string 3']))
+        t.add_column(isin_2=t.c2.isin([1, 2, 3, 4, 5]))
+        t.add_column(isin_3=t.c2.isin(t.c6.f5))

         # Add columns for .astype converters to ensure they're persisted properly
         t.add_column(c2_as_float=t.c2.astype(FloatType()))
@@ -136,6 +145,32 @@ class Dumper:
             for i in range(num_rows)
         ]
         t.insert(rows)
+        pxt.create_dir('views')
+        v = pxt.create_view('views.sample_view', t, filter=(t.c2 < 50))
+        _ = pxt.create_view('views.sample_snapshot', t, filter=(t.c2 >= 75), is_snapshot=True)
+        e = pxt.create_view('views.empty_view', t, filter=t.c2 == 4171780)
+        assert e.count() == 0
+        # Computed column using a library function
+        v['str_format'] = pxt.functions.string.str_format('{0} {key}', t.c1, key=t.c1)
+        # Computed column using a bespoke stored udf
+        v['test_udf'] = test_udf_stored(t.c2)
+        # Computed column using a batched function
+        # (apply this to the empty view, since it's a "heavyweight" function)
+        e['batched'] = pxt.functions.huggingface.clip_text(t.c1, model_id='openai/clip-vit-base-patch32')
+        # computed column using a stored batched function
+        v['test_udf_batched'] = test_udf_stored_batched(t.c1, upper=False)
+        # astype
+        v['astype'] = t.c1.astype(pxt.FloatType())
+
+
+@pxt.udf(_force_stored=True)
+def test_udf_stored(n: int) -> int:
+    return n + 1
+
+
+@pxt.udf(batch_size=4, _force_stored=True)
+def test_udf_stored_batched(strings: Batch[str], *, upper: bool = True) -> Batch[str]:
+    return [string.upper() if upper else string.lower() for string in strings]


 def main() -> None:
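The rewritten dump also exercises the 0.2.6 UDF machinery end to end: stored scalar UDFs (_force_stored=True) and stored batched UDFs. A usage sketch of the batched form, inferring from the decorator and signature above that batch_size caps how many row values are gathered into each Batch argument while keyword-only parameters such as upper stay scalar; the table and column here are hypothetical:

import pixeltable as pxt
from pixeltable.func import Batch

@pxt.udf(batch_size=4)
def shout(strings: Batch[str]) -> Batch[str]:
    # invoked once per batch of up to 4 row values
    return [s.upper() + '!' for s in strings]

t = pxt.create_table('udf_demo', {'c1': pxt.StringType()})
t['shouted'] = shout(t.c1)  # computed column, evaluated batch-wise
t.insert([{'c1': 'hello'}, {'c1': 'world'}])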