pixeltable 0.3.15__py3-none-any.whl → 0.4.0rc2__py3-none-any.whl
This diff shows the content of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
Potentially problematic release: this version of pixeltable has been flagged as potentially problematic.
- pixeltable/__version__.py +2 -2
- pixeltable/catalog/catalog.py +296 -105
- pixeltable/catalog/column.py +10 -8
- pixeltable/catalog/dir.py +1 -2
- pixeltable/catalog/insertable_table.py +25 -20
- pixeltable/catalog/schema_object.py +3 -6
- pixeltable/catalog/table.py +261 -189
- pixeltable/catalog/table_version.py +333 -202
- pixeltable/catalog/table_version_handle.py +15 -2
- pixeltable/catalog/table_version_path.py +60 -14
- pixeltable/catalog/view.py +38 -6
- pixeltable/dataframe.py +196 -18
- pixeltable/env.py +4 -4
- pixeltable/exec/__init__.py +1 -1
- pixeltable/exec/expr_eval/evaluators.py +4 -1
- pixeltable/exec/in_memory_data_node.py +1 -1
- pixeltable/exec/sql_node.py +171 -22
- pixeltable/exprs/column_property_ref.py +15 -6
- pixeltable/exprs/column_ref.py +32 -11
- pixeltable/exprs/comparison.py +1 -1
- pixeltable/exprs/data_row.py +5 -3
- pixeltable/exprs/expr.py +7 -0
- pixeltable/exprs/literal.py +2 -0
- pixeltable/exprs/row_builder.py +4 -6
- pixeltable/exprs/rowid_ref.py +8 -0
- pixeltable/exprs/similarity_expr.py +1 -0
- pixeltable/func/query_template_function.py +1 -1
- pixeltable/func/tools.py +1 -1
- pixeltable/functions/gemini.py +0 -1
- pixeltable/functions/string.py +212 -58
- pixeltable/globals.py +12 -4
- pixeltable/index/base.py +5 -0
- pixeltable/index/btree.py +5 -0
- pixeltable/index/embedding_index.py +5 -0
- pixeltable/io/external_store.py +8 -29
- pixeltable/io/label_studio.py +1 -1
- pixeltable/io/parquet.py +2 -2
- pixeltable/io/table_data_conduit.py +0 -31
- pixeltable/metadata/__init__.py +11 -2
- pixeltable/metadata/converters/convert_13.py +2 -2
- pixeltable/metadata/converters/convert_30.py +6 -11
- pixeltable/metadata/converters/convert_35.py +9 -0
- pixeltable/metadata/converters/convert_36.py +38 -0
- pixeltable/metadata/converters/util.py +3 -9
- pixeltable/metadata/notes.py +2 -0
- pixeltable/metadata/schema.py +8 -1
- pixeltable/plan.py +221 -14
- pixeltable/share/packager.py +137 -13
- pixeltable/share/publish.py +2 -2
- pixeltable/store.py +19 -13
- pixeltable/utils/dbms.py +1 -1
- pixeltable/utils/formatter.py +64 -42
- pixeltable/utils/sample.py +25 -0
- {pixeltable-0.3.15.dist-info → pixeltable-0.4.0rc2.dist-info}/METADATA +2 -1
- {pixeltable-0.3.15.dist-info → pixeltable-0.4.0rc2.dist-info}/RECORD +58 -55
- {pixeltable-0.3.15.dist-info → pixeltable-0.4.0rc2.dist-info}/LICENSE +0 -0
- {pixeltable-0.3.15.dist-info → pixeltable-0.4.0rc2.dist-info}/WHEEL +0 -0
- {pixeltable-0.3.15.dist-info → pixeltable-0.4.0rc2.dist-info}/entry_points.txt +0 -0
pixeltable/share/packager.py
CHANGED
@@ -1,4 +1,7 @@
+import base64
 import datetime
+import io
+import itertools
 import json
 import logging
 import tarfile
@@ -7,17 +10,21 @@ import urllib.request
 import uuid
 from pathlib import Path
 from typing import Any, Iterator, Optional
+from uuid import UUID

 import more_itertools
+import numpy as np
+import PIL.Image
 import pyarrow as pa
 import pyarrow.parquet as pq
 import sqlalchemy as sql

 import pixeltable as pxt
-from pixeltable import catalog, exceptions as excs, metadata
+from pixeltable import catalog, exceptions as excs, metadata, type_system as ts
 from pixeltable.env import Env
 from pixeltable.metadata import schema
 from pixeltable.utils import sha256sum
+from pixeltable.utils.formatter import Formatter
 from pixeltable.utils.media_store import MediaStore

 _logger = logging.getLogger('pixeltable')
@@ -45,13 +52,17 @@ class TablePackager:
     media_files: dict[Path, str]  # Mapping from local media file paths to their tarball names
     md: dict[str, Any]

+    bundle_path: Path
+    preview_header: dict[str, str]
+    preview: list[list[Any]]
+
     def __init__(self, table: catalog.Table, additional_md: Optional[dict[str, Any]] = None) -> None:
         self.table = table
         self.tmp_dir = Path(Env.get().create_tmp_path())
         self.media_files = {}

         # Load metadata
-        with …
+        with catalog.Catalog.get().begin_xact(for_write=False):
             tbl_md = catalog.Catalog.get().load_replica_md(table)
         self.md = {
             'pxt_version': pxt.__version__,
@@ -66,20 +77,29 @@ class TablePackager:
         Export the table to a tarball containing Parquet tables and media files.
         """
         assert not self.tmp_dir.exists()  # Packaging can only be done once per TablePackager instance
-
+
+        _logger.info(f'Packaging table {self.table._path()!r} and its ancestors in: {self.tmp_dir}')
         self.tmp_dir.mkdir()
         with open(self.tmp_dir / 'metadata.json', 'w', encoding='utf8') as fp:
             json.dump(self.md, fp)
         self.tables_dir = self.tmp_dir / 'tables'
         self.tables_dir.mkdir()
-        with …
+        with catalog.Catalog.get().begin_xact(for_write=False):
             for tv in self.table._tbl_version_path.get_tbl_versions():
-                _logger.info(f…
+                _logger.info(f'Exporting table {tv.get().versioned_name!r}.')
                 self.__export_table(tv.get())
+
         _logger.info('Building archive.')
-        bundle_path = self.__build_tarball()
-        …
+        self.bundle_path = self.__build_tarball()
+
+        _logger.info('Extracting preview data.')
+        self.md['count'] = self.table.count()
+        preview_header, preview = self.__extract_preview_data()
+        self.md['preview_header'] = preview_header
+        self.md['preview'] = preview
+
+        _logger.info(f'Packaging complete: {self.bundle_path}')
+        return self.bundle_path

     def __export_table(self, tv: catalog.TableVersion) -> None:
         """
@@ -206,6 +226,96 @@ class TablePackager:
             tf.add(src_file, arcname=f'media/{dest_name}')
         return bundle_path

+    def __extract_preview_data(self) -> tuple[dict[str, str], list[list[Any]]]:
+        """
+        Extract a preview of the table data for display in the UI.
+
+        In order to bound the size of the output data, all "unbounded" data types are resized:
+        - Strings are abbreviated as per Formatter.abbreviate()
+        - Arrays and JSON are shortened and formatted as strings
+        - Images are resized to thumbnail size as a base64-encoded webp
+        - Videos are replaced by their first frame and resized as above
+        - Documents are replaced by a thumbnail as a base64-encoded webp
+        """
+        # First 8 columns
+        preview_cols = dict(itertools.islice(self.table._schema.items(), 0, 8))
+        select_list = [self.table[col_name] for col_name in preview_cols]
+        # First 5 rows
+        rows = list(self.table.select(*select_list).head(n=5))
+
+        preview_header = {col_name: str(col_type._type) for col_name, col_type in preview_cols.items()}
+        preview = [
+            [self.__encode_preview_data(val, col_type)]
+            for row in rows
+            for val, col_type in zip(row.values(), preview_cols.values())
+        ]
+
+        return preview_header, preview
+
+    def __encode_preview_data(self, val: Any, col_type: ts.ColumnType) -> Any:
+        if val is None:
+            return None
+
+        match col_type._type:
+            case ts.ColumnType.Type.STRING:
+                assert isinstance(val, str)
+                return Formatter.abbreviate(val)
+
+            case ts.ColumnType.Type.INT | ts.ColumnType.Type.FLOAT | ts.ColumnType.Type.BOOL:
+                return val
+
+            case ts.ColumnType.Type.TIMESTAMP | ts.ColumnType.Type.DATE:
+                return str(val)
+
+            case ts.ColumnType.Type.ARRAY:
+                assert isinstance(val, np.ndarray)
+                return Formatter.format_array(val)
+
+            case ts.ColumnType.Type.JSON:
+                # We need to escape the JSON string server-side for security reasons.
+                # Therefore we don't escape it here, in order to avoid double-escaping.
+                return Formatter.format_json(val, escape_strings=False)
+
+            case ts.ColumnType.Type.IMAGE:
+                # Rescale the image to minimize data transfer size
+                assert isinstance(val, PIL.Image.Image)
+                return self.__encode_image(val)
+
+            case ts.ColumnType.Type.VIDEO:
+                assert isinstance(val, str)
+                return self.__encode_video(val)
+
+            case ts.ColumnType.Type.AUDIO:
+                return None
+
+            case ts.ColumnType.Type.DOCUMENT:
+                assert isinstance(val, str)
+                return self.__encode_document(val)
+
+            case _:
+                raise AssertionError(f'Unrecognized column type: {col_type._type}')
+
+    def __encode_image(self, img: PIL.Image.Image) -> str:
+        # Heuristic for thumbnail sizing:
+        # Standardize on a width of 240 pixels (to most efficiently utilize the columnar display).
+        # But, if the aspect ratio is below 2:3, bound the height at 360 pixels (to avoid unboundedly tall
+        # thumbnails in the case of highly oblong images).
+        if img.height > img.width * 1.5:
+            scaled_img = img.resize((img.width * 360 // img.height, 360))
+        else:
+            scaled_img = img.resize((240, img.height * 240 // img.width))
+        with io.BytesIO() as buffer:
+            scaled_img.save(buffer, 'webp')
+            return base64.b64encode(buffer.getvalue()).decode()
+
+    def __encode_video(self, video_path: str) -> Optional[str]:
+        thumb = Formatter.extract_first_video_frame(video_path)
+        return self.__encode_image(thumb) if thumb is not None else None
+
+    def __encode_document(self, doc_path: str) -> Optional[str]:
+        thumb = Formatter.make_document_thumbnail(doc_path)
+        return self.__encode_image(thumb) if thumb is not None else None
+

 class TableRestorer:
     """
@@ -253,13 +363,26 @@ class TableRestorer:
         tbl_md = [schema.FullTableMd.from_dict(t) for t in self.md['md']['tables']]

         # Create the replica table
-        # …
-        …
+        # The logic here needs to be completely restructured in order to make it concurrency-safe.
+        # - Catalog.create_replica() needs to write the metadata and also create the physical store tables
+        #   and populate them, otherwise concurrent readers will see an inconsistent state (table metadata w/o
+        #   an actual table)
+        # - this could be done one replica at a time (instead of the entire hierarchy)
+        cat = catalog.Catalog.get()
+        cat.create_replica(catalog.Path(self.tbl_path), tbl_md)
+        # don't call get_table() until after the calls to create_replica() and __import_table() below;
+        # the TV instances created by get_table() would be replaced by create_replica(), which creates duplicate
+        # TV instances for the same replica version, which then leads to failures when constructing queries

         # Now we need to instantiate and load data for replica_tbl and its ancestors, except that we skip
         # replica_tbl itself if it's a pure snapshot.
-        …
+        target_md = tbl_md[0]
+        is_pure_snapshot = (
+            target_md.tbl_md.view_md is not None
+            and target_md.tbl_md.view_md.predicate is None
+            and len(target_md.schema_version_md.columns) == 0
+        )
+        if is_pure_snapshot:
             ancestor_md = tbl_md[1:]  # Pure snapshot; skip replica_tbl
         else:
             ancestor_md = tbl_md  # Not a pure snapshot; include replica_tbl
@@ -273,7 +396,8 @@
             _logger.info(f'Importing table {tv.name!r}.')
             self.__import_table(self.tmp_dir, tv, md)

-        …
+        with cat.begin_xact(for_write=False):
+            return cat.get_table_by_id(UUID(tbl_md[0].tbl_md.tbl_id))

     def __import_table(self, bundle_path: Path, tv: catalog.TableVersion, tbl_md: schema.FullTableMd) -> None:
         """
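Note: the new preview pipeline bounds payload size with a fixed thumbnail heuristic: width is standardized at 240 px, except that images taller than a 2:3 aspect ratio are capped at 360 px of height. A minimal standalone sketch of that arithmetic (the helper name is ours, not part of the package):

def preview_size(width: int, height: int) -> tuple[int, int]:
    # mirrors TablePackager.__encode_image() above: cap height at 360 for
    # oblong images (height > 1.5 * width), otherwise standardize width at 240
    if height > width * 1.5:
        return (width * 360 // height, 360)
    return (240, height * 240 // width)

assert preview_size(1000, 4000) == (90, 360)   # tall portrait: height-capped
assert preview_size(4000, 1000) == (240, 60)   # panorama: width-standardized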
pixeltable/share/publish.py
CHANGED
@@ -35,7 +35,7 @@ def push_replica(dest_tbl_uri: str, src_tbl: pxt.Table) -> str:
     upload_id = response_json['upload_id']
     destination_uri = response_json['destination_uri']

-    Env.get().console_logger.info(f"Creating a snapshot of '{src_tbl._path}' at: {dest_tbl_uri}")
+    Env.get().console_logger.info(f"Creating a snapshot of '{src_tbl._path()}' at: {dest_tbl_uri}")

     bundle = packager.package()

@@ -117,7 +117,7 @@ def pull_replica(dest_path: str, src_tbl_uri: str) -> pxt.Table:

     restorer = TableRestorer(dest_path, response_json)
     tbl = restorer.restore(bundle_path)
-    Env.get().console_logger.info(f'Created local replica {tbl._path!r} from URI: {src_tbl_uri}')
+    Env.get().console_logger.info(f'Created local replica {tbl._path()!r} from URI: {src_tbl_uri}')
     return tbl

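Note: both hunks replace the attribute reference `_path` with the call `_path()`. Interpolating the method object itself into an f-string prints its repr rather than the table path, e.g.:

class Tbl:
    def _path(self) -> str:
        return 'dir1.my_table'

t = Tbl()
print(f'{t._path}')    # <bound method Tbl._path of <...>>
print(f'{t._path()}')  # dir1.my_table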
pixeltable/store.py
CHANGED
@@ -52,7 +52,8 @@ class StoreBase:
         # We need to declare a `base` variable here, even though it's only defined for instances of `StoreView`,
         # since it's referenced by various methods of `StoreBase`
         self.base = tbl_version.base.get().store_tbl if tbl_version.base is not None else None
-        …
+        # we're passing in tbl_version to avoid a circular call to TableVersionHandle.get()
+        self.create_sa_tbl(tbl_version)

     def system_columns(self) -> list[sql.Column]:
         return [*self._pk_cols, self.v_max_col]

@@ -77,11 +78,13 @@ class StoreBase:
         self._pk_cols = [*rowid_cols, self.v_min_col]
         return [*rowid_cols, self.v_min_col, self.v_max_col]

-    def create_sa_tbl(self) -> None:
+    def create_sa_tbl(self, tbl_version: Optional[catalog.TableVersion] = None) -> None:
         """Create self.sa_tbl from self.tbl_version."""
+        if tbl_version is None:
+            tbl_version = self.tbl_version.get()
         system_cols = self._create_system_columns()
         all_cols = system_cols.copy()
-        for col in [c for c in …
+        for col in [c for c in tbl_version.cols if c.is_stored]:
             # re-create sql.Column for each column, regardless of whether it already has sa_col set: it was bound
             # to the last sql.Table version we created and cannot be reused
             col.create_sa_cols()

@@ -99,16 +102,17 @@ class StoreBase:
         # - base x view joins can be executed as merge joins
         # - speeds up ORDER BY rowid DESC
         # - allows filtering for a particular table version in index scan
-        idx_name = f'sys_cols_idx_{…
+        idx_name = f'sys_cols_idx_{tbl_version.id.hex}'
         idxs.append(sql.Index(idx_name, *system_cols))

         # v_min/v_max indices: speeds up base table scans needed to propagate a base table insert or delete
-        idx_name = f'vmin_idx_{…
+        idx_name = f'vmin_idx_{tbl_version.id.hex}'
         idxs.append(sql.Index(idx_name, self.v_min_col, postgresql_using=Env.get().dbms.version_index_type))
-        idx_name = f'vmax_idx_{…
+        idx_name = f'vmax_idx_{tbl_version.id.hex}'
         idxs.append(sql.Index(idx_name, self.v_max_col, postgresql_using=Env.get().dbms.version_index_type))

         self.sa_tbl = sql.Table(self._storage_name(), self.sa_md, *all_cols, *idxs)
+        # _logger.debug(f'created sa tbl for {tbl_version.id!s} (sa_tbl={id(self.sa_tbl):x}, tv={id(tbl_version):x})')

     @abc.abstractmethod
     def _rowid_join_predicate(self) -> sql.ColumnElement[bool]:

@@ -285,7 +289,7 @@ class StoreBase:
         else:
             if col.col_type.is_image_type() and result_row.file_urls[value_expr_slot_idx] is None:
                 # we have yet to store this image
-                filepath = str(MediaStore.prepare_media_path(col.tbl.id, col.id, col.tbl.…
+                filepath = str(MediaStore.prepare_media_path(col.tbl.id, col.id, col.tbl.version))
                 result_row.flush_img(value_expr_slot_idx, filepath)
             val = result_row.get_stored_val(value_expr_slot_idx, col.sa_col.type)
             if col.col_type.is_media_type():

@@ -415,9 +419,7 @@ class StoreBase:
             number of deleted rows
         """
         where_clause = sql.true() if where_clause is None else where_clause
-        …
-            self.v_min_col < current_version, self.v_max_col == schema.Table.MAX_VERSION, where_clause
-        )
+        version_clause = sql.and_(self.v_min_col < current_version, self.v_max_col == schema.Table.MAX_VERSION)
         rowid_join_clause = self._rowid_join_predicate()
         base_versions_clause = (
             sql.true() if len(base_versions) == 0 else self.base._versions_clause(base_versions, match_on_vmin)

@@ -428,10 +430,12 @@
             set_clause[index_info.undo_col.sa_col] = index_info.val_col.sa_col
             # set value column to NULL
             set_clause[index_info.val_col.sa_col] = None
+
         stmt = (
             sql.update(self.sa_tbl)
             .values(set_clause)
             .where(where_clause)
+            .where(version_clause)
             .where(rowid_join_clause)
             .where(base_versions_clause)
         )

@@ -528,10 +532,12 @@ class StoreComponentView(StoreView):
         self.rowid_cols.append(self.pos_col)
         return self.rowid_cols

-    def create_sa_tbl(self) -> None:
-        …
+    def create_sa_tbl(self, tbl_version: Optional[catalog.TableVersion] = None) -> None:
+        if tbl_version is None:
+            tbl_version = self.tbl_version.get()
+        super().create_sa_tbl(tbl_version)
         # we need to fix up the 'pos' column in TableVersion
-        …
+        tbl_version.cols_by_name['pos'].sa_col = self.pos_col

     def _rowid_join_predicate(self) -> sql.ColumnElement[bool]:
         return sql.and_(
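Note: the delete/update rewrite splits the version predicate out of the old combined clause and attaches it with an extra .where(). In SQLAlchemy, stacked .where() calls are ANDed together, so the statement is equivalent to a single sql.and_() condition. A self-contained sketch (table and values invented for illustration):

import sqlalchemy as sql

md = sql.MetaData()
t = sql.Table('tbl', md, sql.Column('v_min', sql.BigInteger), sql.Column('v_max', sql.BigInteger))
MAX_VERSION = 2**63 - 1  # stand-in for schema.Table.MAX_VERSION

version_clause = sql.and_(t.c.v_min < 5, t.c.v_max == MAX_VERSION)
# chained .where() calls AND their predicates, matching the rewritten statement above:
# UPDATE tbl SET v_max=:v_max WHERE true AND tbl.v_min < :v_min_1 AND tbl.v_max = :v_max_1
stmt = sql.update(t).values(v_max=5).where(sql.true()).where(version_clause)
print(stmt)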
pixeltable/utils/dbms.py
CHANGED
@@ -35,7 +35,7 @@ class PostgresqlDbms(Dbms):
     """

     def __init__(self, db_url: URL):
-        super().__init__('postgresql', '…
+        super().__init__('postgresql', 'SERIALIZABLE', 'brin', db_url)

     def drop_db_stmt(self, database: str) -> str:
         return f'DROP DATABASE {database}'
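Note: PostgresqlDbms now pins the isolation level to SERIALIZABLE and supplies 'brin' as the version_index_type consumed by the v_min/v_max indices in store.py above; BRIN indexes stay very small for values that correlate with physical row order, which suits monotonically increasing version columns. What that index comes out to in SQLAlchemy (demo names only):

import sqlalchemy as sql

md = sql.MetaData()
t = sql.Table('tbl_demo', md, sql.Column('v_min', sql.BigInteger))
# emits: CREATE INDEX vmin_idx_demo ON tbl_demo USING brin (v_min)
idx = sql.Index('vmin_idx_demo', t.c.v_min, postgresql_using='brin')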
pixeltable/utils/formatter.py
CHANGED
@@ -63,10 +63,10 @@ class Formatter:
         """
         Escapes special characters in `val`, and abbreviates `val` if its length exceeds `_STRING_MAX_LEN`.
         """
-        return cls.__escape(cls.…
+        return cls.__escape(cls.abbreviate(val))

     @classmethod
-    def …
+    def abbreviate(cls, val: str, max_len: int = __STRING_MAX_LEN) -> str:
         if len(val) > max_len:
             edgeitems = (max_len - len(cls.__STRING_SEP)) // 2
             return f'{val[:edgeitems]}{cls.__STRING_SEP}{val[-edgeitems:]}'

@@ -94,41 +94,45 @@ class Formatter:
         )

     @classmethod
-    def format_json(cls, val: Any) -> str:
+    def format_json(cls, val: Any, escape_strings: bool = True) -> str:
         if isinstance(val, str):
             # JSON-like formatting will be applied to strings that appear nested within a list or dict
             # (quote the string; escape any quotes inside the string; shorter abbreviations).
             # However, if the string appears in top-level position (i.e., the entire JSON value is a
             # string), then we format it like an ordinary string.
-            return cls.format_string(val)
+            return cls.format_string(val) if escape_strings else cls.abbreviate(val)
         # In all other cases, dump the JSON struct recursively.
-        return cls.__format_json_rec(val)
+        return cls.__format_json_rec(val, escape_strings)

     @classmethod
-    def __format_json_rec(cls, val: Any) -> str:
+    def __format_json_rec(cls, val: Any, escape_strings: bool) -> str:
         if isinstance(val, str):
-            …
+            formatted = json.dumps(cls.abbreviate(val, cls.__NESTED_STRING_MAX_LEN))
+            return cls.__escape(formatted) if escape_strings else formatted
         if isinstance(val, float):
             return cls.format_float(val)
         if isinstance(val, np.ndarray):
             return cls.format_array(val)
         if isinstance(val, list):
             if len(val) < cls.__LIST_THRESHOLD:
-                components = [cls.__format_json_rec(x) for x in val]
+                components = [cls.__format_json_rec(x, escape_strings) for x in val]
             else:
-                components = [cls.__format_json_rec(x) for x in val[: cls.__LIST_EDGEITEMS]]
+                components = [cls.__format_json_rec(x, escape_strings) for x in val[: cls.__LIST_EDGEITEMS]]
                 components.append('...')
-                components.extend(cls.__format_json_rec(x) for x in val[-cls.__LIST_EDGEITEMS :])
+                components.extend(cls.__format_json_rec(x, escape_strings) for x in val[-cls.__LIST_EDGEITEMS :])
             return '[' + ', '.join(components) + ']'
         if isinstance(val, dict):
-            kv_pairs = (…
+            kv_pairs = (
+                f'{cls.__format_json_rec(k, escape_strings)}: {cls.__format_json_rec(v, escape_strings)}'
+                for k, v in val.items()
+            )
             return '{' + ', '.join(kv_pairs) + '}'

         # Everything else
         try:
             return json.dumps(val)
         except TypeError:  # Not JSON serializable
-            return str(val)
+            return cls.__escape(str(val))

     def format_img(self, img: Image.Image) -> str:
         """

@@ -152,22 +156,19 @@ class Formatter:
         """

     def format_video(self, file_path: str) -> str:
-        thumb_tag = ''
         # Attempt to extract the first frame of the video to use as a thumbnail,
         # so that the notebook can be exported as HTML and viewed in contexts where
         # the video itself is not accessible.
         # TODO(aaron-siegel): If the video is backed by a concrete external URL,
         # should we link to that instead?
-        …
-        except Exception:
-            pass
+        thumb = self.extract_first_video_frame(file_path)
+        if thumb is None:
+            thumb_tag = ''
+        else:
+            with io.BytesIO() as buffer:
+                thumb.save(buffer, 'jpeg')
+                thumb_base64 = base64.b64encode(buffer.getvalue()).decode()
+                thumb_tag = f'poster="data:image/jpeg;base64,{thumb_base64}"'
         if self.__num_rows > 1:
             width = 320
         elif self.__num_cols > 1:

@@ -182,6 +183,16 @@ class Formatter:
         </div>
         """

+    @classmethod
+    def extract_first_video_frame(cls, file_path: str) -> Optional[Image.Image]:
+        with av.open(file_path) as container:
+            try:
+                img = next(container.decode(video=0)).to_image()
+                assert isinstance(img, Image.Image)
+                return img
+            except Exception:
+                return None
+
     def format_audio(self, file_path: str) -> str:
         return f"""
         <div class="pxt_audio">

@@ -191,29 +202,18 @@ class Formatter:
         </div>
         """

-    def format_document(self, file_path: str) -> str:
-        max_width = max_height = 320
+    def format_document(self, file_path: str, max_width: int = 320, max_height: int = 320) -> str:
         # by default, file path will be shown as a link
         inner_element = file_path
         inner_element = html.escape(inner_element)
-        # try generating a thumbnail for different types and use that if successful
-        if file_path.lower().endswith('.pdf'):
-            try:
-                import fitz  # type: ignore[import-untyped]

-                …
-                img_src = f'data:image/jpeg;base64,{thumb_base64}'
-                inner_element = f"""
-                    <img style="object-fit: contain; border: 1px solid black;" src="{img_src}" />
-                """
-            except Exception:
-                logging.warning(f'Failed to produce PDF thumbnail {file_path}. Make sure you have PyMuPDF installed.')
+        thumb = self.make_document_thumbnail(file_path, max_width, max_height)
+        if thumb is not None:
+            with io.BytesIO() as buffer:
+                thumb.save(buffer, 'webp')
+                thumb_base64 = base64.b64encode(buffer.getvalue()).decode()
+                thumb_tag = f'data:image/webp;base64,{thumb_base64}'
+                inner_element = f'<img style="object-fit: contain; border: 1px solid black;" src="{thumb_tag}" />'

         return f"""
         <div class="pxt_document" style="width:{max_width}px;">

@@ -223,6 +223,28 @@ class Formatter:
         </div>
         """

+    @classmethod
+    def make_document_thumbnail(
+        cls, file_path: str, max_width: int = 320, max_height: int = 320
+    ) -> Optional[Image.Image]:
+        """
+        Returns a thumbnail image of a document.
+        """
+        if file_path.lower().endswith('.pdf'):
+            try:
+                import fitz  # type: ignore[import-untyped]
+
+                doc = fitz.open(file_path)
+                pixmap = doc.get_page_pixmap(0)
+                while pixmap.width > max_width or pixmap.height > max_height:
+                    # shrink(1) will halve each dimension
+                    pixmap.shrink(1)
+                return pixmap.pil_image()
+            except Exception:
+                logging.warning(f'Failed to produce PDF thumbnail {file_path}. Make sure you have PyMuPDF installed.')
+
+        return None
+
     @classmethod
     def __create_source_tag(cls, http_address: str, file_path: str) -> str:
         src_url = get_file_uri(http_address, file_path)
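Note: the renamed abbreviate() keeps at most max_len characters by splitting the budget, minus the separator, evenly between the head and tail of the string. A worked example with an assumed 8-character separator (the actual __STRING_SEP and __STRING_MAX_LEN values aren't shown in this hunk):

def abbreviate(val: str, max_len: int, sep: str = ' ...... ') -> str:
    # same arithmetic as Formatter.abbreviate() above
    if len(val) > max_len:
        edgeitems = (max_len - len(sep)) // 2
        return f'{val[:edgeitems]}{sep}{val[-edgeitems:]}'
    return val

# max_len=32, len(sep)=8 -> edgeitems = (32 - 8) // 2 = 12; 12 + 8 + 12 = 32
assert len(abbreviate('x' * 100, 32)) == 32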
pixeltable/utils/sample.py
ADDED
@@ -0,0 +1,25 @@
+import sqlalchemy as sql
+
+from pixeltable.func.udf import udf
+
+
+@udf
+def sample_key(seed: int, *key_fields: int) -> str:
+    """
+    Create a sample key from the given seed and key fields.
+
+    Args:
+        seed: The seed value.
+        key_fields: The rowids to include in the sample key.
+
+    Returns:
+        A string key for each row
+    """
+    raise NotImplementedError('SampleKey creation is not implemented in python.')
+
+
+@sample_key.to_sql
+def _(seed: sql.ColumnElement, *key_fields: sql.ColumnElement) -> sql.ColumnElement:
+    from pixeltable.exec.sql_node import SqlSampleNode
+
+    return SqlSampleNode.key_sql_expr(seed, key_fields)
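Note: the new module follows Pixeltable's split UDF pattern: the Python body is a stub that raises, and the @sample_key.to_sql hook registers a SQL translation, so the expression only ever executes pushed down into the store. A toy UDF in the same shape (hypothetical function, mirroring the decorator contract above):

import sqlalchemy as sql
from pixeltable.func.udf import udf

@udf
def bucket(seed: int, x: int) -> int:
    raise NotImplementedError('bucket() is only evaluated in SQL.')

@bucket.to_sql
def _(seed: sql.ColumnElement, x: sql.ColumnElement) -> sql.ColumnElement:
    # the planner substitutes this SQL expression for the UDF call
    return (seed + x) % 10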
{pixeltable-0.3.15.dist-info → pixeltable-0.4.0rc2.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: pixeltable
-Version: 0.3.15
+Version: 0.4.0rc2
 Summary: AI Data Infrastructure: Declarative, Multimodal, and Incremental
 License: Apache-2.0
 Keywords: data-science,machine-learning,database,ai,computer-vision,chatbot,ml,artificial-intelligence,feature-engineering,multimodal,mlops,feature-store,vector-database,llm,genai
@@ -36,6 +36,7 @@ Requires-Dist: numpy (>=1.25)
 Requires-Dist: pandas (>=2.0,<3.0)
 Requires-Dist: pgvector (>=0.2.1)
 Requires-Dist: pillow (>=9.3.0)
+Requires-Dist: pillow-heif (>=0.15.0)
 Requires-Dist: pixeltable-pgserver (==0.3.1)
 Requires-Dist: psutil (>=5.9.5)
 Requires-Dist: psycopg[binary] (>=3.1.18)