pixeltable 0.2.5__py3-none-any.whl → 0.2.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pixeltable/__init__.py +20 -9
- pixeltable/__version__.py +3 -0
- pixeltable/catalog/column.py +23 -7
- pixeltable/catalog/insertable_table.py +32 -19
- pixeltable/catalog/table.py +210 -20
- pixeltable/catalog/table_version.py +272 -111
- pixeltable/catalog/table_version_path.py +6 -1
- pixeltable/dataframe.py +184 -110
- pixeltable/datatransfer/__init__.py +1 -0
- pixeltable/datatransfer/label_studio.py +526 -0
- pixeltable/datatransfer/remote.py +113 -0
- pixeltable/env.py +213 -79
- pixeltable/exec/__init__.py +2 -1
- pixeltable/exec/data_row_batch.py +6 -7
- pixeltable/exec/expr_eval_node.py +28 -28
- pixeltable/exec/sql_scan_node.py +7 -6
- pixeltable/exprs/__init__.py +4 -3
- pixeltable/exprs/column_ref.py +11 -2
- pixeltable/exprs/comparison.py +39 -1
- pixeltable/exprs/data_row.py +7 -0
- pixeltable/exprs/expr.py +26 -19
- pixeltable/exprs/function_call.py +17 -18
- pixeltable/exprs/globals.py +14 -2
- pixeltable/exprs/image_member_access.py +9 -28
- pixeltable/exprs/in_predicate.py +96 -0
- pixeltable/exprs/inline_array.py +13 -11
- pixeltable/exprs/inline_dict.py +15 -13
- pixeltable/exprs/row_builder.py +7 -1
- pixeltable/exprs/similarity_expr.py +67 -0
- pixeltable/ext/functions/whisperx.py +30 -0
- pixeltable/ext/functions/yolox.py +16 -0
- pixeltable/func/__init__.py +0 -2
- pixeltable/func/aggregate_function.py +5 -2
- pixeltable/func/callable_function.py +57 -13
- pixeltable/func/expr_template_function.py +14 -3
- pixeltable/func/function.py +35 -4
- pixeltable/func/signature.py +5 -15
- pixeltable/func/udf.py +8 -12
- pixeltable/functions/fireworks.py +9 -4
- pixeltable/functions/huggingface.py +48 -5
- pixeltable/functions/openai.py +49 -11
- pixeltable/functions/pil/image.py +61 -64
- pixeltable/functions/together.py +32 -6
- pixeltable/functions/util.py +0 -43
- pixeltable/functions/video.py +46 -8
- pixeltable/globals.py +443 -0
- pixeltable/index/__init__.py +1 -0
- pixeltable/index/base.py +9 -2
- pixeltable/index/btree.py +54 -0
- pixeltable/index/embedding_index.py +91 -15
- pixeltable/io/__init__.py +4 -0
- pixeltable/io/globals.py +59 -0
- pixeltable/{utils → io}/hf_datasets.py +48 -17
- pixeltable/io/pandas.py +148 -0
- pixeltable/{utils → io}/parquet.py +58 -33
- pixeltable/iterators/__init__.py +1 -1
- pixeltable/iterators/base.py +8 -4
- pixeltable/iterators/document.py +225 -93
- pixeltable/iterators/video.py +16 -9
- pixeltable/metadata/__init__.py +8 -4
- pixeltable/metadata/converters/convert_12.py +3 -0
- pixeltable/metadata/converters/convert_13.py +41 -0
- pixeltable/metadata/converters/convert_14.py +13 -0
- pixeltable/metadata/converters/convert_15.py +29 -0
- pixeltable/metadata/converters/util.py +63 -0
- pixeltable/metadata/schema.py +12 -6
- pixeltable/plan.py +11 -24
- pixeltable/store.py +16 -23
- pixeltable/tool/create_test_db_dump.py +49 -14
- pixeltable/type_system.py +27 -58
- pixeltable/utils/coco.py +94 -0
- pixeltable/utils/documents.py +42 -12
- pixeltable/utils/http_server.py +70 -0
- pixeltable-0.2.7.dist-info/METADATA +137 -0
- pixeltable-0.2.7.dist-info/RECORD +126 -0
- {pixeltable-0.2.5.dist-info → pixeltable-0.2.7.dist-info}/WHEEL +1 -1
- pixeltable/client.py +0 -600
- pixeltable/exprs/image_similarity_predicate.py +0 -58
- pixeltable/func/batched_function.py +0 -53
- pixeltable/func/nos_function.py +0 -202
- pixeltable/tests/conftest.py +0 -171
- pixeltable/tests/ext/test_yolox.py +0 -21
- pixeltable/tests/functions/test_fireworks.py +0 -43
- pixeltable/tests/functions/test_functions.py +0 -60
- pixeltable/tests/functions/test_huggingface.py +0 -158
- pixeltable/tests/functions/test_openai.py +0 -162
- pixeltable/tests/functions/test_together.py +0 -112
- pixeltable/tests/test_audio.py +0 -65
- pixeltable/tests/test_catalog.py +0 -27
- pixeltable/tests/test_client.py +0 -21
- pixeltable/tests/test_component_view.py +0 -379
- pixeltable/tests/test_dataframe.py +0 -440
- pixeltable/tests/test_dirs.py +0 -107
- pixeltable/tests/test_document.py +0 -120
- pixeltable/tests/test_exprs.py +0 -802
- pixeltable/tests/test_function.py +0 -332
- pixeltable/tests/test_index.py +0 -138
- pixeltable/tests/test_migration.py +0 -44
- pixeltable/tests/test_nos.py +0 -54
- pixeltable/tests/test_snapshot.py +0 -231
- pixeltable/tests/test_table.py +0 -1343
- pixeltable/tests/test_transactional_directory.py +0 -42
- pixeltable/tests/test_types.py +0 -52
- pixeltable/tests/test_video.py +0 -159
- pixeltable/tests/test_view.py +0 -535
- pixeltable/tests/utils.py +0 -442
- pixeltable/utils/clip.py +0 -18
- pixeltable-0.2.5.dist-info/METADATA +0 -128
- pixeltable-0.2.5.dist-info/RECORD +0 -139
- {pixeltable-0.2.5.dist-info → pixeltable-0.2.7.dist-info}/LICENSE +0 -0
pixeltable/metadata/converters/util.py ADDED
@@ -0,0 +1,63 @@
+import copy
+import logging
+from typing import Any, Callable, Optional
+
+import sqlalchemy as sql
+
+from pixeltable.metadata.schema import Table
+
+__logger = logging.getLogger('pixeltable')
+
+
+def convert_table_md(
+        engine: sql.engine.Engine,
+        column_md_updater: Optional[Callable[[dict], None]] = None,
+        remote_md_updater: Optional[Callable[[dict], None]] = None,
+        substitution_fn: Optional[Callable[[Any, Any], Optional[tuple[Any, Any]]]] = None
+) -> None:
+    with engine.begin() as conn:
+        for row in conn.execute(sql.select(Table)):
+            id = row[0]
+            table_md = row[2]
+            assert isinstance(table_md, dict)
+            updated_table_md = copy.deepcopy(table_md)
+            if column_md_updater is not None:
+                __update_column_md(updated_table_md, column_md_updater)
+            if remote_md_updater is not None:
+                __update_remote_md(updated_table_md, remote_md_updater)
+            if substitution_fn is not None:
+                updated_table_md = __substitute_md_rec(updated_table_md, substitution_fn)
+            if updated_table_md != table_md:
+                __logger.info(f'Updating schema for table: {id}')
+                conn.execute(sql.update(Table).where(Table.id == id).values(md=updated_table_md))
+
+
+def __update_column_md(table_md: dict, column_md_updater: Callable[[dict], None]) -> None:
+    columns_md = table_md['column_md']
+    assert isinstance(columns_md, dict)
+    for column_md in columns_md.values():
+        column_md_updater(column_md)
+
+
+def __update_remote_md(table_md: dict, remote_md_updater: Callable[[dict], None]) -> None:
+    remotes_md = table_md['remotes']
+    assert isinstance(remotes_md, list)
+    for remote_md in remotes_md:
+        remote_md_updater(remote_md)
+
+
+def __substitute_md_rec(md: Any, substitution_fn: Callable[[Any, Any], Optional[tuple[Any, Any]]]) -> Any:
+    if isinstance(md, dict):
+        updated_md = {}
+        for k, v in md.items():
+            substitute = substitution_fn(k, v)
+            if substitute is not None:
+                updated_k, updated_v = substitute
+                updated_md[updated_k] = updated_v
+            else:
+                updated_md[k] = __substitute_md_rec(v, substitution_fn)
+        return updated_md
+    elif isinstance(md, list):
+        return [__substitute_md_rec(v, substitution_fn) for v in md]
+    else:
+        return md
pixeltable/metadata/schema.py CHANGED
@@ -1,12 +1,11 @@
-from typing import Optional, List, get_type_hints, Type, Any, TypeVar, Tuple, Union
-import platform
-import uuid
 import dataclasses
+import uuid
+from typing import Optional, List, get_type_hints, Type, Any, TypeVar, Tuple, Union
 
 import sqlalchemy as sql
-from sqlalchemy import
+from sqlalchemy import ForeignKey
+from sqlalchemy import Integer, BigInteger, LargeBinary
 from sqlalchemy.dialects.postgresql import UUID, JSONB
-from sqlalchemy import ForeignKey, UniqueConstraint, ForeignKeyConstraint
 from sqlalchemy.orm import declarative_base
 
 Base = declarative_base()
@@ -93,6 +92,9 @@ class ColumnMd:
     # if True, the column is present in the stored table
     stored: Optional[bool]
 
+    # if specified, the column is a stored proxy of another column
+    proxy_base: Optional[int]
+
 
 @dataclasses.dataclass
 class IndexMd:
@@ -143,6 +145,10 @@ class TableMd:
     # - every row is assigned a unique and immutable rowid on insertion
    next_row_id: int
 
+    # Metadata format for remotes:
+    # {'class': 'pixeltable.datatransfer.LabelStudioProject', 'md': {'project_id': 3}}
+    remotes: list[dict[str, Any]]
+
     column_md: dict[int, ColumnMd]  # col_id -> ColumnMd
     index_md: dict[int, IndexMd]  # index_id -> IndexMd
     view_md: Optional[ViewMd]
@@ -160,7 +166,7 @@ class Table(Base):
 
     MAX_VERSION = 9223372036854775807  # 2^63 - 1
 
-    id = sql.Column(UUID(as_uuid=True), primary_key=True,
+    id = sql.Column(UUID(as_uuid=True), primary_key=True, nullable=False)
     dir_id = sql.Column(UUID(as_uuid=True), ForeignKey('dirs.id'), nullable=False)
     md = sql.Column(JSONB, nullable=False)  # TableMd
 
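
Per the TableMd comment above, each remotes entry pairs a fully qualified class name with a class-specific md dict. A sketch of how such an entry can be resolved back to a class; the from_dict() constructor is an assumed deserialization hook for illustration, not a documented pixeltable API:

    import importlib

    remote_md = {'class': 'pixeltable.datatransfer.LabelStudioProject', 'md': {'project_id': 3}}

    # split 'pixeltable.datatransfer.LabelStudioProject' into module and class name
    module_name, class_name = remote_md['class'].rsplit('.', 1)
    remote_cls = getattr(importlib.import_module(module_name), class_name)
    remote = remote_cls.from_dict(remote_md['md'])  # assumed constructor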
pixeltable/plan.py CHANGED
@@ -60,25 +60,10 @@ class Analyzer:
         # filter predicate applied to output rows of the SQL scan
         self.filter: Optional[exprs.Predicate] = None
         # not executable
-        self.similarity_clause: Optional[exprs.ImageSimilarityPredicate] = None
+        #self.similarity_clause: Optional[exprs.ImageSimilarityPredicate] = None
         if where_clause is not None:
             where_clause_conjuncts, self.filter = where_clause.split_conjuncts(lambda e: e.sql_expr() is not None)
             self.sql_where_clause = exprs.CompoundPredicate.make_conjunction(where_clause_conjuncts)
-            if self.filter is not None:
-                similarity_clauses, self.filter = self.filter.split_conjuncts(
-                    lambda e: isinstance(e, exprs.ImageSimilarityPredicate))
-                if len(similarity_clauses) > 1:
-                    raise excs.Error(f'More than one nearest() not supported')
-                if len(similarity_clauses) == 1:
-                    if len(self.order_by_clause) > 0:
-                        raise excs.Error((
-                            f'nearest() returns results in order of proximity and cannot be used in conjunction with '
-                            f'order_by()'))
-                    self.similarity_clause = similarity_clauses[0]
-                    img_col = self.similarity_clause.img_col_ref.col
-                    indexed_col_ids = {info.col.id for info in tbl.tbl_version.idxs_by_name.values()}
-                    if img_col.id not in indexed_col_ids:
-                        raise excs.Error(f'nearest() not available for unindexed column {img_col.name}')
 
         # all exprs that are evaluated in Python; not executable
         self.all_exprs = self.select_list.copy()
@@ -204,8 +189,6 @@ class Planner:
         refd_tbl_ids: Set[UUID] = set()
         if where_clause is not None:
             analyzer = cls.analyze(tbl, where_clause)
-            if analyzer.similarity_clause is not None:
-                raise excs.Error('nearest() cannot be used with count()')
             if analyzer.filter is not None:
                 raise excs.Error(f'Filter {analyzer.filter} not expressible in SQL')
             clause_element = analyzer.sql_where_clause.sql_expr()
@@ -268,7 +251,7 @@
         Returns:
             - root node of the plan
             - list of qualified column names that are getting updated
-            - list of columns that are being recomputed
+            - list of user-visible columns that are being recomputed
         """
         # retrieve all stored cols and all target exprs
         assert isinstance(tbl, catalog.TableVersionPath)
@@ -277,7 +260,10 @@
         if len(recompute_targets) > 0:
             recomputed_cols = recompute_targets.copy()
         else:
-            recomputed_cols = target.get_dependent_columns(updated_cols) if cascade else
+            recomputed_cols = target.get_dependent_columns(updated_cols) if cascade else set()
+            # regardless of cascade, we need to update all indices on any updated column
+            idx_val_cols = target.get_idx_val_columns(updated_cols)
+            recomputed_cols.update(idx_val_cols)
         # we only need to recompute stored columns (unstored ones are substituted away)
         recomputed_cols = {c for c in recomputed_cols if c.is_stored}
         recomputed_base_cols = {col for col in recomputed_cols if col.tbl == target}
@@ -290,8 +276,8 @@
             recomputed_exprs = \
                 [c.value_expr.copy().resolve_computed_cols(resolve_cols=recomputed_base_cols) for c in recomputed_base_cols]
             # recomputed cols reference the new values of the updated cols
-            for col, e in update_targets.items()
-
+            spec = {exprs.ColumnRef(col): e for col, e in update_targets.items()}
+            exprs.Expr.list_substitute(recomputed_exprs, spec)
             select_list.extend(recomputed_exprs)
 
         # we need to retrieve the PK columns of the existing rows
@@ -299,7 +285,8 @@
         all_base_cols = copied_cols + updated_cols + list(recomputed_base_cols)  # same order as select_list
         # update row builder with column information
         [plan.row_builder.add_table_column(col, select_list[i].slot_idx) for i, col in enumerate(all_base_cols)]
-
+        recomputed_user_cols = [c for c in recomputed_cols if c.name is not None]
+        return plan, [f'{c.tbl.name}.{c.name}' for c in updated_cols + recomputed_user_cols], recomputed_user_cols
 
     @classmethod
     def create_view_update_plan(
@@ -570,7 +557,7 @@
         sql_select_list = analyzer.sql_exprs.copy()
         plan = exec.SqlScanNode(
             tbl, row_builder, select_list=sql_select_list, where_clause=analyzer.sql_where_clause,
-            filter=analyzer.filter,
+            filter=analyzer.filter, order_by_items=order_by_items,
             limit=sql_limit, set_pk=with_pk, exact_version_only=exact_version_only)
         plan = cls._insert_prefetch_node(tbl.tbl_version.id, analyzer.select_list, row_builder, plan)
 
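
Two user-visible consequences of these hunks: the nearest() predicate machinery is removed (superseded by pixeltable/exprs/similarity_expr.py, added in this release), and update plans now recompute index value columns for updated columns even without cascading. A rough sketch of the latter through the table API, with a hypothetical table and column; exact method availability depends on the 0.2.7 Table class:

    import pixeltable as pxt

    t = pxt.get_table('my_table')                    # hypothetical table
    # cascade=False skips dependent computed columns, but any index value
    # columns over `price` are still recomputed so its indices stay consistent
    t.update({'price': t.price + 1}, cascade=False)  # hypothetical int column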
pixeltable/store.py CHANGED
@@ -66,7 +66,6 @@ class StoreBase:
         """Create self.sa_tbl from self.tbl_version."""
         system_cols = self._create_system_columns()
         all_cols = system_cols.copy()
-        idxs: List[sql.Index] = []
         for col in [c for c in self.tbl_version.cols if c.is_stored]:
             # re-create sql.Column for each column, regardless of whether it already has sa_col set: it was bound
             # to the last sql.Table version we created and cannot be reused
@@ -76,26 +75,18 @@
                 all_cols.append(col.sa_errormsg_col)
                 all_cols.append(col.sa_errortype_col)
 
-            # we create an index for:
-            # - scalar columns (except for strings, because long strings can't be used for B-tree indices)
-            # - non-computed video and image columns (they will contain external paths/urls that users might want to
-            #   filter on)
-            if (col.col_type.is_scalar_type() and not col.col_type.is_string_type()) \
-                    or (col.col_type.is_media_type() and not col.is_computed):
-                # index names need to be unique within the Postgres instance
-                idx_name = f'idx_{col.id}_{self.tbl_version.id.hex}'
-                idxs.append(sql.Index(idx_name, col.sa_col))
-
         if self.sa_tbl is not None:
             # if we're called in response to a schema change, we need to remove the old table first
             self.sa_md.remove(self.sa_tbl)
 
+        idxs: List[sql.Index] = []
         # index for all system columns:
         # - base x view joins can be executed as merge joins
         # - speeds up ORDER BY rowid DESC
         # - allows filtering for a particular table version in index scan
         idx_name = f'sys_cols_idx_{self.tbl_version.id.hex}'
         idxs.append(sql.Index(idx_name, *system_cols))
+
         # v_min/v_max indices: speeds up base table scans needed to propagate a base table insert or delete
         idx_name = f'vmin_idx_{self.tbl_version.id.hex}'
         idxs.append(sql.Index(idx_name, self.v_min_col, postgresql_using='brin'))
@@ -201,10 +192,10 @@
             if col.records_errors:
                 # we also need to create the errormsg and errortype storage cols
                 stmt = (f'ALTER TABLE {self._storage_name()} '
-                        f'ADD COLUMN {col.errormsg_store_name()}
+                        f'ADD COLUMN {col.errormsg_store_name()} VARCHAR DEFAULT NULL')
                 conn.execute(sql.text(stmt))
                 stmt = (f'ALTER TABLE {self._storage_name()} '
-                        f'ADD COLUMN {col.errortype_store_name()}
+                        f'ADD COLUMN {col.errortype_store_name()} VARCHAR DEFAULT NULL')
                 conn.execute(sql.text(stmt))
                 added_storage_cols.extend([col.errormsg_store_name(), col.errortype_store_name()])
         self.create_sa_tbl()
@@ -264,7 +255,8 @@
         return num_excs
 
     def insert_rows(
-            self, exec_plan: ExecNode, conn: sql.engine.Connection, v_min: Optional[int] = None
+            self, exec_plan: ExecNode, conn: sql.engine.Connection, v_min: Optional[int] = None,
+            show_progress: bool = True
     ) -> Tuple[int, int, Set[int]]:
         """Insert rows into the store table and update the catalog table's md
         Returns:
@@ -293,15 +285,16 @@
                 self._create_table_row(row, row_builder, media_cols, cols_with_excs, v_min=v_min)
             num_excs += num_row_exc
             table_rows.append(table_row)
-            if
-
-
-
-
-
-
-
-
+            if show_progress:
+                if progress_bar is None:
+                    warnings.simplefilter("ignore", category=TqdmWarning)
+                    progress_bar = tqdm(
+                        desc=f'Inserting rows into `{self.tbl_version.name}`',
+                        unit=' rows',
+                        ncols=100,
+                        file=sys.stdout
+                    )
+                progress_bar.update(1)
         self._move_tmp_media_files(table_rows, media_cols, v_min)
         conn.execute(sql.insert(self.sa_tbl), table_rows)
         if progress_bar is not None:
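
The progress reporting added to insert_rows is plain tqdm with a lazily created, unbounded bar. In isolation the pattern behaves like this self-contained sketch (the table name and row count are stand-ins):

    import sys
    import warnings

    from tqdm import TqdmWarning, tqdm

    progress_bar = None
    for _ in range(1000):  # stand-in for the row loop in insert_rows
        if progress_bar is None:
            warnings.simplefilter('ignore', category=TqdmWarning)
            # no total is given, so tqdm simply counts rows as they arrive
            progress_bar = tqdm(desc='Inserting rows into `my_table`', unit=' rows', ncols=100, file=sys.stdout)
        progress_bar.update(1)
    progress_bar.close()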
pixeltable/tool/create_test_db_dump.py CHANGED
@@ -11,6 +11,7 @@ import toml
 import pixeltable as pxt
 import pixeltable.metadata as metadata
 from pixeltable.env import Env
+from pixeltable.func import Batch
 from pixeltable.type_system import \
     StringType, IntType, FloatType, BoolType, TimestampType, JsonType
 
@@ -29,9 +30,9 @@ class Dumper:
         os.environ['PIXELTABLE_DB'] = db_name
         os.environ['PIXELTABLE_PGDATA'] = str(shared_home / 'pgdata')
 
-        Env.
-
-
+        Env._init_env(reinit_db=True)
+
+        Env.get().configure_logging(level=logging.DEBUG, to_stdout=True)
 
     def dump_db(self) -> None:
         md_version = metadata.VERSION
@@ -76,8 +77,18 @@
             'c6': JsonType(nullable=False),
             'c7': JsonType(nullable=False),
         }
-        t =
+        t = pxt.create_table('sample_table', schema, primary_key='c2')
+
+        # Add columns for InlineArray and InlineDict
         t.add_column(c8=[[1, 2, 3], [4, 5, 6]])
+        t.add_column(c9=[['a', 'b', 'c'], ['d', 'e', 'f']])
+        t.add_column(c10=[t.c1, [t.c1n, t.c2]])
+        t.add_column(c11={'int': 22, 'dict': {'key': 'val'}, 'expr': t.c1})
+
+        # InPredicate
+        t.add_column(isin_1=t.c1.isin(['test string 1', 'test string 2', 'test string 3']))
+        t.add_column(isin_2=t.c2.isin([1, 2, 3, 4, 5]))
+        t.add_column(isin_3=t.c2.isin(t.c6.f5))
 
         # Add columns for .astype converters to ensure they're persisted properly
         t.add_column(c2_as_float=t.c2.astype(FloatType()))
@@ -136,24 +147,48 @@
             for i in range(num_rows)
         ]
         t.insert(rows)
-
-        v =
-        _ =
+        pxt.create_dir('views')
+        v = pxt.create_view('views.sample_view', t, filter=(t.c2 < 50))
+        _ = pxt.create_view('views.sample_snapshot', t, filter=(t.c2 >= 75), is_snapshot=True)
+        e = pxt.create_view('views.empty_view', t, filter=t.c2 == 4171780)
+        assert e.count() == 0
         # Computed column using a library function
         v['str_format'] = pxt.functions.string.str_format('{0} {key}', t.c1, key=t.c1)
-        # Computed column using a bespoke udf
-        v['test_udf'] =
+        # Computed column using a bespoke stored udf
+        v['test_udf'] = test_udf_stored(t.c2)
+        # Computed column using a batched function
+        # (apply this to the empty view, since it's a "heavyweight" function)
+        e['batched'] = pxt.functions.huggingface.clip_text(t.c1, model_id='openai/clip-vit-base-patch32')
+        # computed column using a stored batched function
+        v['test_udf_batched'] = test_udf_stored_batched(t.c1, upper=False)
         # astype
         v['astype'] = t.c1.astype(pxt.FloatType())
-        # computed column using a stored function
-        v['stored'] = t.c1.apply(lambda x: f'Hello, {x}', col_type=pxt.StringType())
 
-
-
-
+        # Add remotes
+        from pixeltable.datatransfer.remote import MockRemote
+        v.link(
+            MockRemote('remote', {'int_field': pxt.IntType()}, {'str_field': pxt.StringType()}),
+            col_mapping={'test_udf': 'int_field', 'c1': 'str_field'}
+        )
+        # We're just trying to test metadata here, so reach "under the covers" and link a fake
+        # Label Studio project without validation (so we don't need a real Label Studio server)
+        from pixeltable.datatransfer.label_studio import LabelStudioProject
+        v.tbl_version_path.tbl_version.link(
+            LabelStudioProject(4171780, media_import_method='file'),
+            col_mapping={'str_format': 'str_format'}
+        )
+
+
+@pxt.udf(_force_stored=True)
+def test_udf_stored(n: int) -> int:
     return n + 1
 
 
+@pxt.udf(batch_size=4, _force_stored=True)
+def test_udf_stored_batched(strings: Batch[str], *, upper: bool = True) -> Batch[str]:
+    return [string.upper() if upper else string.lower() for string in strings]
+
+
 def main() -> None:
     _logger.info("Creating pixeltable test artifact.")
     dumper = Dumper()
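
The stored batched UDF defined at the bottom of the dump script is called element-wise in expressions but executed in groups of batch_size. A minimal standalone version of the same pattern, using a hypothetical table and column (the _force_stored flag is specific to the test dump and omitted here):

    import pixeltable as pxt
    from pixeltable.func import Batch

    @pxt.udf(batch_size=4)
    def lowercase(strings: Batch[str]) -> Batch[str]:
        # receives up to 4 values per call; must return one output per input
        return [s.lower() for s in strings]

    t = pxt.create_table('demo', {'c1': pxt.StringType()})  # hypothetical table
    t.add_column(lower=lowercase(t.c1))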
pixeltable/type_system.py CHANGED
@@ -7,7 +7,7 @@ import json
 import typing
 import urllib.parse
 import urllib.request
-from copy import
+from copy import deepcopy
 from pathlib import Path
 from typing import Any, Optional, Tuple, Dict, Callable, List, Union, Sequence, Mapping
 
@@ -82,7 +82,11 @@ class ColumnType:
 
     def __init__(self, t: Type, nullable: bool = False):
         self._type = t
-        self.
+        self._nullable = nullable
+
+    @property
+    def nullable(self) -> bool:
+        return self._nullable
 
     @property
     def type_enum(self) -> Type:
@@ -91,6 +95,12 @@ class ColumnType:
     def serialize(self) -> str:
         return json.dumps(self.as_dict())
 
+    def copy(self, nullable: Optional[bool] = None) -> ColumnType:
+        result = deepcopy(self)
+        if nullable is not None:
+            result._nullable = nullable
+        return result
+
     @classmethod
     def serialize_list(cls, type_list: List[ColumnType]) -> str:
         return json.dumps([t.as_dict() for t in type_list])
@@ -177,7 +187,7 @@
         if type(self) != type(other):
             return False
         for member_var in vars(self).keys():
-            if member_var == '
+            if member_var == '_nullable':
                 continue
             if getattr(self, member_var) != getattr(other, member_var):
                 return False
@@ -225,6 +235,8 @@
             return BoolType()
         if isinstance(val, datetime.datetime) or isinstance(val, datetime.date):
             return TimestampType()
+        if isinstance(val, PIL.Image.Image):
+            return ImageType(width=val.width, height=val.height)
         if isinstance(val, np.ndarray):
             col_type = ArrayType.from_literal(val)
             if col_type is not None:
@@ -248,7 +260,7 @@
             # We treat it as the underlying type but with nullable=True.
             underlying = cls.from_python_type(union_args[0])
             if underlying is not None:
-                underlying.
+                underlying._nullable = True
                 return underlying
         else:
             # Discard type parameters to ensure that parameterized types such as `list[T]`
@@ -370,13 +382,6 @@
         # types that refer to external media files
         return self.is_image_type() or self.is_video_type() or self.is_audio_type() or self.is_document_type()
 
-    @abc.abstractmethod
-    def to_sql(self) -> str:
-        """
-        Return corresponding Postgres type.
-        """
-        pass
-
     @abc.abstractmethod
     def to_sa_type(self) -> sql.types.TypeEngine:
         """
@@ -404,9 +409,6 @@ class InvalidType(ColumnType):
     def __init__(self, nullable: bool = False):
         super().__init__(self.Type.INVALID, nullable=nullable)
 
-    def to_sql(self) -> str:
-        assert False
-
     def to_sa_type(self) -> sql.types.TypeEngine:
         assert False
 
@@ -432,9 +434,6 @@ class StringType(ColumnType):
             return None
         return convert
 
-    def to_sql(self) -> str:
-        return 'VARCHAR'
-
     def to_sa_type(self) -> sql.types.TypeEngine:
         return sql.String()
 
@@ -458,9 +457,6 @@ class IntType(ColumnType):
     def __init__(self, nullable: bool = False):
         super().__init__(self.Type.INT, nullable=nullable)
 
-    def to_sql(self) -> str:
-        return 'BIGINT'
-
     def to_sa_type(self) -> sql.types.TypeEngine:
         return sql.BigInteger()
 
@@ -473,9 +469,6 @@ class FloatType(ColumnType):
     def __init__(self, nullable: bool = False):
         super().__init__(self.Type.FLOAT, nullable=nullable)
 
-    def to_sql(self) -> str:
-        return 'FLOAT'
-
     def to_sa_type(self) -> sql.types.TypeEngine:
         return sql.Float()
 
@@ -493,9 +486,6 @@ class BoolType(ColumnType):
     def __init__(self, nullable: bool = False):
         super().__init__(self.Type.BOOL, nullable=nullable)
 
-    def to_sql(self) -> str:
-        return 'BOOLEAN'
-
     def to_sa_type(self) -> sql.types.TypeEngine:
         return sql.Boolean()
 
@@ -513,9 +503,6 @@ class TimestampType(ColumnType):
     def __init__(self, nullable: bool = False):
         super().__init__(self.Type.TIMESTAMP, nullable=nullable)
 
-    def to_sql(self) -> str:
-        return 'INTEGER'
-
     def to_sa_type(self) -> sql.types.TypeEngine:
         return sql.TIMESTAMP()
 
@@ -551,14 +538,13 @@ class JsonType(ColumnType):
         }
         return cls(type_spec, nullable=d['nullable'])
 
-    def to_sql(self) -> str:
-        return 'JSONB'
-
     def to_sa_type(self) -> sql.types.TypeEngine:
         return sql.dialects.postgresql.JSONB()
 
     def print_value(self, val: Any) -> str:
         val_type = self.infer_literal_type(val)
+        if val_type is None:
+            return super().print_value(val)
         if val_type == self:
             return str(val)
         return val_type.print_value(val)
@@ -657,9 +643,6 @@ class ArrayType(ColumnType):
             return np.array(val, dtype=self.numpy_dtype())
         return val
 
-    def to_sql(self) -> str:
-        return 'BYTEA'
-
     def to_sa_type(self) -> sql.types.TypeEngine:
         return sql.LargeBinary()
 
@@ -762,9 +745,6 @@ class ImageType(ColumnType):
             return img
         return convert
 
-    def to_sql(self) -> str:
-        return 'VARCHAR'
-
     def to_sa_type(self) -> sql.types.TypeEngine:
         return sql.String()
 
@@ -785,11 +765,8 @@ class VideoType(ColumnType):
     def __init__(self, nullable: bool = False):
         super().__init__(self.Type.VIDEO, nullable=nullable)
 
-    def to_sql(self) -> str:
-        # stored as a file path
-        return 'VARCHAR'
-
     def to_sa_type(self) -> sql.types.TypeEngine:
+        # stored as a file path
         return sql.String()
 
     def _validate_literal(self, val: Any) -> None:
@@ -820,11 +797,8 @@ class AudioType(ColumnType):
     def __init__(self, nullable: bool = False):
         super().__init__(self.Type.AUDIO, nullable=nullable)
 
-    def to_sql(self) -> str:
-        # stored as a file path
-        return 'VARCHAR'
-
     def to_sa_type(self) -> sql.types.TypeEngine:
+        # stored as a file path
        return sql.String()
 
     def _validate_literal(self, val: Any) -> None:
@@ -864,11 +838,8 @@ class DocumentType(ColumnType):
         else:
             self._doc_formats = [t for t in self.DocumentFormat]
 
-    def to_sql(self) -> str:
-        # stored as a file path
-        return 'VARCHAR'
-
     def to_sa_type(self) -> sql.types.TypeEngine:
+        # stored as a file path
        return sql.String()
 
     def _validate_literal(self, val: Any) -> None:
@@ -877,11 +848,9 @@ class DocumentType(ColumnType):
     def validate_media(self, val: Any) -> None:
         assert isinstance(val, str)
         from pixeltable.utils.documents import get_document_handle
-
-
-
-
-
-
-        except Exception as e:
-            raise excs.Error(f'Not a recognized document format: {val}') from None
+        try:
+            dh = get_document_handle(val)
+            if dh is None:
+                raise excs.Error(f'Not a recognized document format: {val}')
+        except Exception as e:
+            raise excs.Error(f'Not a recognized document format: {val}') from None