pixeltable 0.3.15__py3-none-any.whl → 0.4.0rc2__py3-none-any.whl

This diff shows the changes between two publicly available package versions as they appear in their respective public registries. It is provided for informational purposes only.


Files changed (58)
  1. pixeltable/__version__.py +2 -2
  2. pixeltable/catalog/catalog.py +296 -105
  3. pixeltable/catalog/column.py +10 -8
  4. pixeltable/catalog/dir.py +1 -2
  5. pixeltable/catalog/insertable_table.py +25 -20
  6. pixeltable/catalog/schema_object.py +3 -6
  7. pixeltable/catalog/table.py +261 -189
  8. pixeltable/catalog/table_version.py +333 -202
  9. pixeltable/catalog/table_version_handle.py +15 -2
  10. pixeltable/catalog/table_version_path.py +60 -14
  11. pixeltable/catalog/view.py +38 -6
  12. pixeltable/dataframe.py +196 -18
  13. pixeltable/env.py +4 -4
  14. pixeltable/exec/__init__.py +1 -1
  15. pixeltable/exec/expr_eval/evaluators.py +4 -1
  16. pixeltable/exec/in_memory_data_node.py +1 -1
  17. pixeltable/exec/sql_node.py +171 -22
  18. pixeltable/exprs/column_property_ref.py +15 -6
  19. pixeltable/exprs/column_ref.py +32 -11
  20. pixeltable/exprs/comparison.py +1 -1
  21. pixeltable/exprs/data_row.py +5 -3
  22. pixeltable/exprs/expr.py +7 -0
  23. pixeltable/exprs/literal.py +2 -0
  24. pixeltable/exprs/row_builder.py +4 -6
  25. pixeltable/exprs/rowid_ref.py +8 -0
  26. pixeltable/exprs/similarity_expr.py +1 -0
  27. pixeltable/func/query_template_function.py +1 -1
  28. pixeltable/func/tools.py +1 -1
  29. pixeltable/functions/gemini.py +0 -1
  30. pixeltable/functions/string.py +212 -58
  31. pixeltable/globals.py +12 -4
  32. pixeltable/index/base.py +5 -0
  33. pixeltable/index/btree.py +5 -0
  34. pixeltable/index/embedding_index.py +5 -0
  35. pixeltable/io/external_store.py +8 -29
  36. pixeltable/io/label_studio.py +1 -1
  37. pixeltable/io/parquet.py +2 -2
  38. pixeltable/io/table_data_conduit.py +0 -31
  39. pixeltable/metadata/__init__.py +11 -2
  40. pixeltable/metadata/converters/convert_13.py +2 -2
  41. pixeltable/metadata/converters/convert_30.py +6 -11
  42. pixeltable/metadata/converters/convert_35.py +9 -0
  43. pixeltable/metadata/converters/convert_36.py +38 -0
  44. pixeltable/metadata/converters/util.py +3 -9
  45. pixeltable/metadata/notes.py +2 -0
  46. pixeltable/metadata/schema.py +8 -1
  47. pixeltable/plan.py +221 -14
  48. pixeltable/share/packager.py +137 -13
  49. pixeltable/share/publish.py +2 -2
  50. pixeltable/store.py +19 -13
  51. pixeltable/utils/dbms.py +1 -1
  52. pixeltable/utils/formatter.py +64 -42
  53. pixeltable/utils/sample.py +25 -0
  54. {pixeltable-0.3.15.dist-info → pixeltable-0.4.0rc2.dist-info}/METADATA +2 -1
  55. {pixeltable-0.3.15.dist-info → pixeltable-0.4.0rc2.dist-info}/RECORD +58 -55
  56. {pixeltable-0.3.15.dist-info → pixeltable-0.4.0rc2.dist-info}/LICENSE +0 -0
  57. {pixeltable-0.3.15.dist-info → pixeltable-0.4.0rc2.dist-info}/WHEEL +0 -0
  58. {pixeltable-0.3.15.dist-info → pixeltable-0.4.0rc2.dist-info}/entry_points.txt +0 -0
pixeltable/share/packager.py CHANGED
@@ -1,4 +1,7 @@
+ import base64
  import datetime
+ import io
+ import itertools
  import json
  import logging
  import tarfile
@@ -7,17 +10,21 @@ import urllib.request
  import uuid
  from pathlib import Path
  from typing import Any, Iterator, Optional
+ from uuid import UUID

  import more_itertools
+ import numpy as np
+ import PIL.Image
  import pyarrow as pa
  import pyarrow.parquet as pq
  import sqlalchemy as sql

  import pixeltable as pxt
- from pixeltable import catalog, exceptions as excs, metadata
+ from pixeltable import catalog, exceptions as excs, metadata, type_system as ts
  from pixeltable.env import Env
  from pixeltable.metadata import schema
  from pixeltable.utils import sha256sum
+ from pixeltable.utils.formatter import Formatter
  from pixeltable.utils.media_store import MediaStore

  _logger = logging.getLogger('pixeltable')
@@ -45,13 +52,17 @@ class TablePackager:
      media_files: dict[Path, str]  # Mapping from local media file paths to their tarball names
      md: dict[str, Any]

+     bundle_path: Path
+     preview_header: dict[str, str]
+     preview: list[list[Any]]
+
      def __init__(self, table: catalog.Table, additional_md: Optional[dict[str, Any]] = None) -> None:
          self.table = table
          self.tmp_dir = Path(Env.get().create_tmp_path())
          self.media_files = {}

          # Load metadata
-         with Env.get().begin_xact():
+         with catalog.Catalog.get().begin_xact(for_write=False):
              tbl_md = catalog.Catalog.get().load_replica_md(table)
              self.md = {
                  'pxt_version': pxt.__version__,
@@ -66,20 +77,29 @@ class TablePackager:
          Export the table to a tarball containing Parquet tables and media files.
          """
          assert not self.tmp_dir.exists()  # Packaging can only be done once per TablePackager instance
-         _logger.info(f"Packaging table '{self.table._path}' and its ancestors in: {self.tmp_dir}")
+
+         _logger.info(f'Packaging table {self.table._path()!r} and its ancestors in: {self.tmp_dir}')
          self.tmp_dir.mkdir()
          with open(self.tmp_dir / 'metadata.json', 'w', encoding='utf8') as fp:
              json.dump(self.md, fp)
          self.tables_dir = self.tmp_dir / 'tables'
          self.tables_dir.mkdir()
-         with Env.get().begin_xact():
+         with catalog.Catalog.get().begin_xact(for_write=False):
              for tv in self.table._tbl_version_path.get_tbl_versions():
-                 _logger.info(f"Exporting table '{tv.get().name}:{tv.get().version}'.")
+                 _logger.info(f'Exporting table {tv.get().versioned_name!r}.')
                  self.__export_table(tv.get())
+
          _logger.info('Building archive.')
-         bundle_path = self.__build_tarball()
-         _logger.info(f'Packaging complete: {bundle_path}')
-         return bundle_path
+         self.bundle_path = self.__build_tarball()
+
+         _logger.info('Extracting preview data.')
+         self.md['count'] = self.table.count()
+         preview_header, preview = self.__extract_preview_data()
+         self.md['preview_header'] = preview_header
+         self.md['preview'] = preview
+
+         _logger.info(f'Packaging complete: {self.bundle_path}')
+         return self.bundle_path

      def __export_table(self, tv: catalog.TableVersion) -> None:
          """
@@ -206,6 +226,96 @@ class TablePackager:
              tf.add(src_file, arcname=f'media/{dest_name}')
          return bundle_path

+     def __extract_preview_data(self) -> tuple[dict[str, str], list[list[Any]]]:
+         """
+         Extract a preview of the table data for display in the UI.
+
+         In order to bound the size of the output data, all "unbounded" data types are resized:
+         - Strings are abbreviated as per Formatter.abbreviate()
+         - Arrays and JSON are shortened and formatted as strings
+         - Images are resized to thumbnail size as a base64-encoded webp
+         - Videos are replaced by their first frame and resized as above
+         - Documents are replaced by a thumbnail as a base64-encoded webp
+         """
+         # First 8 columns
+         preview_cols = dict(itertools.islice(self.table._schema.items(), 0, 8))
+         select_list = [self.table[col_name] for col_name in preview_cols]
+         # First 5 rows
+         rows = list(self.table.select(*select_list).head(n=5))
+
+         preview_header = {col_name: str(col_type._type) for col_name, col_type in preview_cols.items()}
+         preview = [
+             [self.__encode_preview_data(val, col_type)]
+             for row in rows
+             for val, col_type in zip(row.values(), preview_cols.values())
+         ]
+
+         return preview_header, preview
+
+     def __encode_preview_data(self, val: Any, col_type: ts.ColumnType) -> Any:
+         if val is None:
+             return None
+
+         match col_type._type:
+             case ts.ColumnType.Type.STRING:
+                 assert isinstance(val, str)
+                 return Formatter.abbreviate(val)
+
+             case ts.ColumnType.Type.INT | ts.ColumnType.Type.FLOAT | ts.ColumnType.Type.BOOL:
+                 return val
+
+             case ts.ColumnType.Type.TIMESTAMP | ts.ColumnType.Type.DATE:
+                 return str(val)
+
+             case ts.ColumnType.Type.ARRAY:
+                 assert isinstance(val, np.ndarray)
+                 return Formatter.format_array(val)
+
+             case ts.ColumnType.Type.JSON:
+                 # We need to escape the JSON string server-side for security reasons.
+                 # Therefore we don't escape it here, in order to avoid double-escaping.
+                 return Formatter.format_json(val, escape_strings=False)
+
+             case ts.ColumnType.Type.IMAGE:
+                 # Rescale the image to minimize data transfer size
+                 assert isinstance(val, PIL.Image.Image)
+                 return self.__encode_image(val)
+
+             case ts.ColumnType.Type.VIDEO:
+                 assert isinstance(val, str)
+                 return self.__encode_video(val)
+
+             case ts.ColumnType.Type.AUDIO:
+                 return None
+
+             case ts.ColumnType.Type.DOCUMENT:
+                 assert isinstance(val, str)
+                 return self.__encode_document(val)
+
+             case _:
+                 raise AssertionError(f'Unrecognized column type: {col_type._type}')
+
+     def __encode_image(self, img: PIL.Image.Image) -> str:
+         # Heuristic for thumbnail sizing:
+         # Standardize on a width of 240 pixels (to most efficiently utilize the columnar display).
+         # But, if the aspect ratio is below 2:3, bound the height at 360 pixels (to avoid unboundedly tall thumbnails
+         # in the case of highly oblong images).
+         if img.height > img.width * 1.5:
+             scaled_img = img.resize((img.width * 360 // img.height, 360))
+         else:
+             scaled_img = img.resize((240, img.height * 240 // img.width))
+         with io.BytesIO() as buffer:
+             scaled_img.save(buffer, 'webp')
+             return base64.b64encode(buffer.getvalue()).decode()
+
+     def __encode_video(self, video_path: str) -> Optional[str]:
+         thumb = Formatter.extract_first_video_frame(video_path)
+         return self.__encode_image(thumb) if thumb is not None else None
+
+     def __encode_document(self, doc_path: str) -> Optional[str]:
+         thumb = Formatter.make_document_thumbnail(doc_path)
+         return self.__encode_image(thumb) if thumb is not None else None
+

  class TableRestorer:
      """
@@ -253,13 +363,26 @@
          tbl_md = [schema.FullTableMd.from_dict(t) for t in self.md['md']['tables']]

          # Create the replica table
-         # TODO: This needs to be made concurrency-safe.
-         replica_tbl = catalog.Catalog.get().create_replica(catalog.Path(self.tbl_path), tbl_md)
-         assert replica_tbl._tbl_version.get().is_snapshot
+         # The logic here needs to be completely restructured in order to make it concurrency-safe.
+         # - Catalog.create_replica() needs to write the metadata and also create the physical store tables
+         #   and populate them, otherwise concurrent readers will see an inconsistent state (table metadata w/o
+         #   an actual table)
+         # - this could be done one replica at a time (instead of the entire hierarchy)
+         cat = catalog.Catalog.get()
+         cat.create_replica(catalog.Path(self.tbl_path), tbl_md)
+         # don't call get_table() until after the calls to create_replica() and __import_table() below;
+         # the TV instances created by get_table() would be replaced by create_replica(), which creates duplicate
+         # TV instances for the same replica version, which then leads to failures when constructing queries

          # Now we need to instantiate and load data for replica_tbl and its ancestors, except that we skip
          # replica_tbl itself if it's a pure snapshot.
-         if replica_tbl._id != replica_tbl._tbl_version.id:
+         target_md = tbl_md[0]
+         is_pure_snapshot = (
+             target_md.tbl_md.view_md is not None
+             and target_md.tbl_md.view_md.predicate is None
+             and len(target_md.schema_version_md.columns) == 0
+         )
+         if is_pure_snapshot:
              ancestor_md = tbl_md[1:]  # Pure snapshot; skip replica_tbl
          else:
              ancestor_md = tbl_md  # Not a pure snapshot; include replica_tbl
@@ -273,7 +396,8 @@
              _logger.info(f'Importing table {tv.name!r}.')
              self.__import_table(self.tmp_dir, tv, md)

-         return replica_tbl
+         with cat.begin_xact(for_write=False):
+             return cat.get_table_by_id(UUID(tbl_md[0].tbl_md.tbl_id))

      def __import_table(self, bundle_path: Path, tv: catalog.TableVersion, tbl_md: schema.FullTableMd) -> None:
          """
pixeltable/share/publish.py CHANGED
@@ -35,7 +35,7 @@ def push_replica(dest_tbl_uri: str, src_tbl: pxt.Table) -> str:
      upload_id = response_json['upload_id']
      destination_uri = response_json['destination_uri']

-     Env.get().console_logger.info(f"Creating a snapshot of '{src_tbl._path}' at: {dest_tbl_uri}")
+     Env.get().console_logger.info(f"Creating a snapshot of '{src_tbl._path()}' at: {dest_tbl_uri}")

      bundle = packager.package()

@@ -117,7 +117,7 @@ def pull_replica(dest_path: str, src_tbl_uri: str) -> pxt.Table:

      restorer = TableRestorer(dest_path, response_json)
      tbl = restorer.restore(bundle_path)
-     Env.get().console_logger.info(f'Created local replica {tbl._path!r} from URI: {src_tbl_uri}')
+     Env.get().console_logger.info(f'Created local replica {tbl._path()!r} from URI: {src_tbl_uri}')
      return tbl

pixeltable/store.py CHANGED
@@ -52,7 +52,8 @@ class StoreBase:
          # We need to declare a `base` variable here, even though it's only defined for instances of `StoreView`,
          # since it's referenced by various methods of `StoreBase`
          self.base = tbl_version.base.get().store_tbl if tbl_version.base is not None else None
-         self.create_sa_tbl()
+         # we're passing in tbl_version to avoid a circular call to TableVersionHandle.get()
+         self.create_sa_tbl(tbl_version)

      def system_columns(self) -> list[sql.Column]:
          return [*self._pk_cols, self.v_max_col]
@@ -77,11 +78,13 @@
          self._pk_cols = [*rowid_cols, self.v_min_col]
          return [*rowid_cols, self.v_min_col, self.v_max_col]

-     def create_sa_tbl(self) -> None:
+     def create_sa_tbl(self, tbl_version: Optional[catalog.TableVersion] = None) -> None:
          """Create self.sa_tbl from self.tbl_version."""
+         if tbl_version is None:
+             tbl_version = self.tbl_version.get()
          system_cols = self._create_system_columns()
          all_cols = system_cols.copy()
-         for col in [c for c in self.tbl_version.get().cols if c.is_stored]:
+         for col in [c for c in tbl_version.cols if c.is_stored]:
              # re-create sql.Column for each column, regardless of whether it already has sa_col set: it was bound
              # to the last sql.Table version we created and cannot be reused
              col.create_sa_cols()
@@ -99,16 +102,17 @@
          # - base x view joins can be executed as merge joins
          # - speeds up ORDER BY rowid DESC
          # - allows filtering for a particular table version in index scan
-         idx_name = f'sys_cols_idx_{self.tbl_version.id.hex}'
+         idx_name = f'sys_cols_idx_{tbl_version.id.hex}'
          idxs.append(sql.Index(idx_name, *system_cols))

          # v_min/v_max indices: speeds up base table scans needed to propagate a base table insert or delete
-         idx_name = f'vmin_idx_{self.tbl_version.id.hex}'
+         idx_name = f'vmin_idx_{tbl_version.id.hex}'
          idxs.append(sql.Index(idx_name, self.v_min_col, postgresql_using=Env.get().dbms.version_index_type))
-         idx_name = f'vmax_idx_{self.tbl_version.id.hex}'
+         idx_name = f'vmax_idx_{tbl_version.id.hex}'
          idxs.append(sql.Index(idx_name, self.v_max_col, postgresql_using=Env.get().dbms.version_index_type))

          self.sa_tbl = sql.Table(self._storage_name(), self.sa_md, *all_cols, *idxs)
+         # _logger.debug(f'created sa tbl for {tbl_version.id!s} (sa_tbl={id(self.sa_tbl):x}, tv={id(tbl_version):x})')

      @abc.abstractmethod
      def _rowid_join_predicate(self) -> sql.ColumnElement[bool]:
@@ -285,7 +289,7 @@
          else:
              if col.col_type.is_image_type() and result_row.file_urls[value_expr_slot_idx] is None:
                  # we have yet to store this image
-                 filepath = str(MediaStore.prepare_media_path(col.tbl.id, col.id, col.tbl.get().version))
+                 filepath = str(MediaStore.prepare_media_path(col.tbl.id, col.id, col.tbl.version))
                  result_row.flush_img(value_expr_slot_idx, filepath)
              val = result_row.get_stored_val(value_expr_slot_idx, col.sa_col.type)
          if col.col_type.is_media_type():
@@ -415,9 +419,7 @@
              number of deleted rows
          """
          where_clause = sql.true() if where_clause is None else where_clause
-         where_clause = sql.and_(
-             self.v_min_col < current_version, self.v_max_col == schema.Table.MAX_VERSION, where_clause
-         )
+         version_clause = sql.and_(self.v_min_col < current_version, self.v_max_col == schema.Table.MAX_VERSION)
          rowid_join_clause = self._rowid_join_predicate()
          base_versions_clause = (
              sql.true() if len(base_versions) == 0 else self.base._versions_clause(base_versions, match_on_vmin)
@@ -428,10 +430,12 @@
              set_clause[index_info.undo_col.sa_col] = index_info.val_col.sa_col
              # set value column to NULL
              set_clause[index_info.val_col.sa_col] = None
+
          stmt = (
              sql.update(self.sa_tbl)
              .values(set_clause)
              .where(where_clause)
+             .where(version_clause)
              .where(rowid_join_clause)
              .where(base_versions_clause)
          )
@@ -528,10 +532,12 @@ class StoreComponentView(StoreView):
              self.rowid_cols.append(self.pos_col)
          return self.rowid_cols

-     def create_sa_tbl(self) -> None:
-         super().create_sa_tbl()
+     def create_sa_tbl(self, tbl_version: Optional[catalog.TableVersion] = None) -> None:
+         if tbl_version is None:
+             tbl_version = self.tbl_version.get()
+         super().create_sa_tbl(tbl_version)
          # we need to fix up the 'pos' column in TableVersion
-         self.tbl_version.get().cols_by_name['pos'].sa_col = self.pos_col
+         tbl_version.cols_by_name['pos'].sa_col = self.pos_col

      def _rowid_join_predicate(self) -> sql.ColumnElement[bool]:
          return sql.and_(
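Note on the update-statement change above: the old code folded the version predicate into where_clause; the new code keeps where_clause and version_clause separate and relies on the fact that chained .where() calls on a SQLAlchemy statement are ANDed together. A minimal sketch of that equivalence (table and column names are illustrative, not pixeltable's):

import sqlalchemy as sql

t = sql.table('t', sql.column('v_min'), sql.column('v_max'), sql.column('x'))
combined = sql.update(t).values(x=None).where(sql.and_(t.c.v_min < 5, t.c.v_max == 99, t.c.x != 0))
chained = sql.update(t).values(x=None).where(t.c.x != 0).where(sql.and_(t.c.v_min < 5, t.c.v_max == 99))
# Both statements compile to the same conjunction of predicates (modulo ordering),
# so splitting the clauses preserves the original semantics.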
pixeltable/utils/dbms.py CHANGED
@@ -35,7 +35,7 @@ class PostgresqlDbms(Dbms):
      """

      def __init__(self, db_url: URL):
-         super().__init__('postgresql', 'REPEATABLE READ', 'brin', db_url)
+         super().__init__('postgresql', 'SERIALIZABLE', 'brin', db_url)

      def drop_db_stmt(self, database: str) -> str:
          return f'DROP DATABASE {database}'
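Note: this raises the PostgreSQL transaction isolation level from REPEATABLE READ to SERIALIZABLE. As a rough sketch of what the setting amounts to in SQLAlchemy terms (the URL is illustrative; pixeltable actually routes the value through its Dbms abstraction):

import sqlalchemy as sql

engine = sql.create_engine(
    'postgresql+psycopg://localhost/pixeltable',  # illustrative URL
    isolation_level='SERIALIZABLE',  # 0.3.15 used 'REPEATABLE READ'
)
# Under SERIALIZABLE, concurrent transactions whose interleaving cannot be
# serialized fail with a serialization error and must be retried by the caller.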
pixeltable/utils/formatter.py CHANGED
@@ -63,10 +63,10 @@ class Formatter:
          """
          Escapes special characters in `val`, and abbreviates `val` if its length exceeds `_STRING_MAX_LEN`.
          """
-         return cls.__escape(cls.__abbreviate(val, cls.__STRING_MAX_LEN))
+         return cls.__escape(cls.abbreviate(val))

      @classmethod
-     def __abbreviate(cls, val: str, max_len: int) -> str:
+     def abbreviate(cls, val: str, max_len: int = __STRING_MAX_LEN) -> str:
          if len(val) > max_len:
              edgeitems = (max_len - len(cls.__STRING_SEP)) // 2
              return f'{val[:edgeitems]}{cls.__STRING_SEP}{val[-edgeitems:]}'
@@ -94,41 +94,45 @@
          )

      @classmethod
-     def format_json(cls, val: Any) -> str:
+     def format_json(cls, val: Any, escape_strings: bool = True) -> str:
          if isinstance(val, str):
              # JSON-like formatting will be applied to strings that appear nested within a list or dict
              # (quote the string; escape any quotes inside the string; shorter abbreviations).
              # However, if the string appears in top-level position (i.e., the entire JSON value is a
              # string), then we format it like an ordinary string.
-             return cls.format_string(val)
+             return cls.format_string(val) if escape_strings else cls.abbreviate(val)
          # In all other cases, dump the JSON struct recursively.
-         return cls.__format_json_rec(val)
+         return cls.__format_json_rec(val, escape_strings)

      @classmethod
-     def __format_json_rec(cls, val: Any) -> str:
+     def __format_json_rec(cls, val: Any, escape_strings: bool) -> str:
          if isinstance(val, str):
-             return cls.__escape(json.dumps(cls.__abbreviate(val, cls.__NESTED_STRING_MAX_LEN)))
+             formatted = json.dumps(cls.abbreviate(val, cls.__NESTED_STRING_MAX_LEN))
+             return cls.__escape(formatted) if escape_strings else formatted
          if isinstance(val, float):
              return cls.format_float(val)
          if isinstance(val, np.ndarray):
              return cls.format_array(val)
          if isinstance(val, list):
              if len(val) < cls.__LIST_THRESHOLD:
-                 components = [cls.__format_json_rec(x) for x in val]
+                 components = [cls.__format_json_rec(x, escape_strings) for x in val]
              else:
-                 components = [cls.__format_json_rec(x) for x in val[: cls.__LIST_EDGEITEMS]]
+                 components = [cls.__format_json_rec(x, escape_strings) for x in val[: cls.__LIST_EDGEITEMS]]
                  components.append('...')
-                 components.extend(cls.__format_json_rec(x) for x in val[-cls.__LIST_EDGEITEMS :])
+                 components.extend(cls.__format_json_rec(x, escape_strings) for x in val[-cls.__LIST_EDGEITEMS :])
              return '[' + ', '.join(components) + ']'
          if isinstance(val, dict):
-             kv_pairs = (f'{cls.__format_json_rec(k)}: {cls.__format_json_rec(v)}' for k, v in val.items())
+             kv_pairs = (
+                 f'{cls.__format_json_rec(k, escape_strings)}: {cls.__format_json_rec(v, escape_strings)}'
+                 for k, v in val.items()
+             )
              return '{' + ', '.join(kv_pairs) + '}'

          # Everything else
          try:
              return json.dumps(val)
          except TypeError:  # Not JSON serializable
-             return str(val)
+             return cls.__escape(str(val))

      def format_img(self, img: Image.Image) -> str:
          """
@@ -152,22 +156,19 @@
          """

      def format_video(self, file_path: str) -> str:
-         thumb_tag = ''
          # Attempt to extract the first frame of the video to use as a thumbnail,
          # so that the notebook can be exported as HTML and viewed in contexts where
          # the video itself is not accessible.
          # TODO(aaron-siegel): If the video is backed by a concrete external URL,
          # should we link to that instead?
-         with av.open(file_path) as container:
-             try:
-                 thumb = next(container.decode(video=0)).to_image()
-                 assert isinstance(thumb, Image.Image)
-                 with io.BytesIO() as buffer:
-                     thumb.save(buffer, 'jpeg')
-                     thumb_base64 = base64.b64encode(buffer.getvalue()).decode()
-                     thumb_tag = f'poster="data:image/jpeg;base64,{thumb_base64}"'
-             except Exception:
-                 pass
+         thumb = self.extract_first_video_frame(file_path)
+         if thumb is None:
+             thumb_tag = ''
+         else:
+             with io.BytesIO() as buffer:
+                 thumb.save(buffer, 'jpeg')
+                 thumb_base64 = base64.b64encode(buffer.getvalue()).decode()
+                 thumb_tag = f'poster="data:image/jpeg;base64,{thumb_base64}"'
          if self.__num_rows > 1:
              width = 320
          elif self.__num_cols > 1:
@@ -182,6 +183,16 @@
          </div>
          """

+     @classmethod
+     def extract_first_video_frame(cls, file_path: str) -> Optional[Image.Image]:
+         with av.open(file_path) as container:
+             try:
+                 img = next(container.decode(video=0)).to_image()
+                 assert isinstance(img, Image.Image)
+                 return img
+             except Exception:
+                 return None
+
      def format_audio(self, file_path: str) -> str:
          return f"""
          <div class="pxt_audio">
@@ -191,29 +202,18 @@
          </div>
          """

-     def format_document(self, file_path: str) -> str:
-         max_width = max_height = 320
+     def format_document(self, file_path: str, max_width: int = 320, max_height: int = 320) -> str:
          # by default, file path will be shown as a link
          inner_element = file_path
          inner_element = html.escape(inner_element)
-         # try generating a thumbnail for different types and use that if successful
-         if file_path.lower().endswith('.pdf'):
-             try:
-                 import fitz  # type: ignore[import-untyped]

-                 doc = fitz.open(file_path)
-                 p = doc.get_page_pixmap(0)
-                 while p.width > max_width or p.height > max_height:
-                     # shrink(1) will halve each dimension
-                     p.shrink(1)
-                 data = p.tobytes(output='jpeg')
-                 thumb_base64 = base64.b64encode(data).decode()
-                 img_src = f'data:image/jpeg;base64,{thumb_base64}'
-                 inner_element = f"""
-                     <img style="object-fit: contain; border: 1px solid black;" src="{img_src}" />
-                 """
-             except Exception:
-                 logging.warning(f'Failed to produce PDF thumbnail {file_path}. Make sure you have PyMuPDF installed.')
+         thumb = self.make_document_thumbnail(file_path, max_width, max_height)
+         if thumb is not None:
+             with io.BytesIO() as buffer:
+                 thumb.save(buffer, 'webp')
+                 thumb_base64 = base64.b64encode(buffer.getvalue()).decode()
+             thumb_tag = f'data:image/webp;base64,{thumb_base64}'
+             inner_element = f'<img style="object-fit: contain; border: 1px solid black;" src="{thumb_tag}" />'

          return f"""
          <div class="pxt_document" style="width:{max_width}px;">
@@ -223,6 +223,28 @@
          </div>
          """

+     @classmethod
+     def make_document_thumbnail(
+         cls, file_path: str, max_width: int = 320, max_height: int = 320
+     ) -> Optional[Image.Image]:
+         """
+         Returns a thumbnail image of a document.
+         """
+         if file_path.lower().endswith('.pdf'):
+             try:
+                 import fitz  # type: ignore[import-untyped]
+
+                 doc = fitz.open(file_path)
+                 pixmap = doc.get_page_pixmap(0)
+                 while pixmap.width > max_width or pixmap.height > max_height:
+                     # shrink(1) will halve each dimension
+                     pixmap.shrink(1)
+                 return pixmap.pil_image()
+             except Exception:
+                 logging.warning(f'Failed to produce PDF thumbnail {file_path}. Make sure you have PyMuPDF installed.')
+
+         return None
+
      @classmethod
      def __create_source_tag(cls, http_address: str, file_path: str) -> str:
          src_url = get_file_uri(http_address, file_path)
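Note: __abbreviate is now public as Formatter.abbreviate(), with max_len defaulting to the private __STRING_MAX_LEN, so that TablePackager can reuse it for preview strings. Standalone, the rule behaves like this sketch (the default length and separator here are illustrative placeholders for the real class attributes):

def abbreviate(val: str, max_len: int = 32, sep: str = ' ...... ') -> str:
    if len(val) <= max_len:
        return val
    edgeitems = (max_len - len(sep)) // 2  # characters kept at each end
    return f'{val[:edgeitems]}{sep}{val[-edgeitems:]}'

assert abbreviate('short') == 'short'
assert len(abbreviate('x' * 1000, max_len=32)) == 32  # 12 + 8 + 12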
pixeltable/utils/sample.py ADDED
@@ -0,0 +1,25 @@
+ import sqlalchemy as sql
+
+ from pixeltable.func.udf import udf
+
+
+ @udf
+ def sample_key(seed: int, *key_fields: int) -> str:
+     """
+     Create a sample key from the given seed and key fields.
+
+     Args:
+         seed: The seed value.
+         key_fields: The rowids to include in the sample key.
+
+     Returns:
+         A string key for each row.
+     """
+     raise NotImplementedError('SampleKey creation is not implemented in python.')
+
+
+ @sample_key.to_sql
+ def _(seed: sql.ColumnElement, *key_fields: sql.ColumnElement) -> sql.ColumnElement:
+     from pixeltable.exec.sql_node import SqlSampleNode
+
+     return SqlSampleNode.key_sql_expr(seed, key_fields)
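Note: sample_key uses a SQL-only UDF pattern: the Python body raises NotImplementedError, and the @sample_key.to_sql registration supplies the translation applied at query-compile time, so the function can only run where it is pushed down to SQL. The same pattern applied to a hypothetical UDF (names and SQL expression invented for illustration):

import sqlalchemy as sql
from pixeltable.func.udf import udf

@udf
def row_digest(seed: int, rowid: int) -> str:
    raise NotImplementedError('row_digest is evaluated in SQL only.')

@row_digest.to_sql
def _(seed: sql.ColumnElement, rowid: sql.ColumnElement) -> sql.ColumnElement:
    # md5 over the concatenated operands, computed entirely in the database
    return sql.func.md5(sql.cast(seed, sql.Text) + sql.cast(rowid, sql.Text))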
{pixeltable-0.3.15.dist-info → pixeltable-0.4.0rc2.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.3
  Name: pixeltable
- Version: 0.3.15
+ Version: 0.4.0rc2
  Summary: AI Data Infrastructure: Declarative, Multimodal, and Incremental
  License: Apache-2.0
  Keywords: data-science,machine-learning,database,ai,computer-vision,chatbot,ml,artificial-intelligence,feature-engineering,multimodal,mlops,feature-store,vector-database,llm,genai
@@ -36,6 +36,7 @@ Requires-Dist: numpy (>=1.25)
  Requires-Dist: pandas (>=2.0,<3.0)
  Requires-Dist: pgvector (>=0.2.1)
  Requires-Dist: pillow (>=9.3.0)
+ Requires-Dist: pillow-heif (>=0.15.0)
  Requires-Dist: pixeltable-pgserver (==0.3.1)
  Requires-Dist: psutil (>=5.9.5)
  Requires-Dist: psycopg[binary] (>=3.1.18)