pixeltable 0.3.15__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of pixeltable might be problematic.

Files changed (78)
  1. pixeltable/__init__.py +1 -1
  2. pixeltable/__version__.py +2 -2
  3. pixeltable/catalog/__init__.py +9 -1
  4. pixeltable/catalog/catalog.py +559 -134
  5. pixeltable/catalog/column.py +36 -32
  6. pixeltable/catalog/dir.py +1 -2
  7. pixeltable/catalog/globals.py +12 -0
  8. pixeltable/catalog/insertable_table.py +30 -25
  9. pixeltable/catalog/schema_object.py +9 -6
  10. pixeltable/catalog/table.py +334 -267
  11. pixeltable/catalog/table_version.py +358 -241
  12. pixeltable/catalog/table_version_handle.py +18 -2
  13. pixeltable/catalog/table_version_path.py +86 -16
  14. pixeltable/catalog/view.py +47 -23
  15. pixeltable/dataframe.py +198 -19
  16. pixeltable/env.py +6 -4
  17. pixeltable/exceptions.py +6 -0
  18. pixeltable/exec/__init__.py +1 -1
  19. pixeltable/exec/exec_node.py +2 -0
  20. pixeltable/exec/expr_eval/evaluators.py +4 -1
  21. pixeltable/exec/expr_eval/expr_eval_node.py +4 -4
  22. pixeltable/exec/in_memory_data_node.py +1 -1
  23. pixeltable/exec/sql_node.py +188 -22
  24. pixeltable/exprs/column_property_ref.py +16 -6
  25. pixeltable/exprs/column_ref.py +33 -11
  26. pixeltable/exprs/comparison.py +1 -1
  27. pixeltable/exprs/data_row.py +5 -3
  28. pixeltable/exprs/expr.py +11 -4
  29. pixeltable/exprs/literal.py +2 -0
  30. pixeltable/exprs/row_builder.py +4 -6
  31. pixeltable/exprs/rowid_ref.py +8 -0
  32. pixeltable/exprs/similarity_expr.py +1 -0
  33. pixeltable/func/__init__.py +1 -0
  34. pixeltable/func/mcp.py +74 -0
  35. pixeltable/func/query_template_function.py +5 -3
  36. pixeltable/func/tools.py +12 -2
  37. pixeltable/func/udf.py +2 -2
  38. pixeltable/functions/__init__.py +1 -0
  39. pixeltable/functions/anthropic.py +19 -45
  40. pixeltable/functions/deepseek.py +19 -38
  41. pixeltable/functions/fireworks.py +9 -18
  42. pixeltable/functions/gemini.py +2 -3
  43. pixeltable/functions/groq.py +108 -0
  44. pixeltable/functions/llama_cpp.py +6 -6
  45. pixeltable/functions/mistralai.py +16 -53
  46. pixeltable/functions/ollama.py +1 -1
  47. pixeltable/functions/openai.py +82 -165
  48. pixeltable/functions/string.py +212 -58
  49. pixeltable/functions/together.py +22 -80
  50. pixeltable/globals.py +10 -4
  51. pixeltable/index/base.py +5 -0
  52. pixeltable/index/btree.py +5 -0
  53. pixeltable/index/embedding_index.py +5 -0
  54. pixeltable/io/external_store.py +10 -31
  55. pixeltable/io/label_studio.py +5 -5
  56. pixeltable/io/parquet.py +2 -2
  57. pixeltable/io/table_data_conduit.py +1 -32
  58. pixeltable/metadata/__init__.py +11 -2
  59. pixeltable/metadata/converters/convert_13.py +2 -2
  60. pixeltable/metadata/converters/convert_30.py +6 -11
  61. pixeltable/metadata/converters/convert_35.py +9 -0
  62. pixeltable/metadata/converters/convert_36.py +38 -0
  63. pixeltable/metadata/converters/convert_37.py +15 -0
  64. pixeltable/metadata/converters/util.py +3 -9
  65. pixeltable/metadata/notes.py +3 -0
  66. pixeltable/metadata/schema.py +13 -1
  67. pixeltable/plan.py +135 -12
  68. pixeltable/share/packager.py +138 -14
  69. pixeltable/share/publish.py +2 -2
  70. pixeltable/store.py +19 -13
  71. pixeltable/type_system.py +30 -0
  72. pixeltable/utils/dbms.py +1 -1
  73. pixeltable/utils/formatter.py +64 -42
  74. {pixeltable-0.3.15.dist-info → pixeltable-0.4.0.dist-info}/METADATA +2 -1
  75. {pixeltable-0.3.15.dist-info → pixeltable-0.4.0.dist-info}/RECORD +78 -73
  76. {pixeltable-0.3.15.dist-info → pixeltable-0.4.0.dist-info}/LICENSE +0 -0
  77. {pixeltable-0.3.15.dist-info → pixeltable-0.4.0.dist-info}/WHEEL +0 -0
  78. {pixeltable-0.3.15.dist-info → pixeltable-0.4.0.dist-info}/entry_points.txt +0 -0
pixeltable/plan.py CHANGED
@@ -75,6 +75,88 @@ class FromClause:
     tbls: list[catalog.TableVersionPath]
     join_clauses: list[JoinClause] = dataclasses.field(default_factory=list)
 
+    @property
+    def _first_tbl(self) -> catalog.TableVersionPath:
+        assert len(self.tbls) == 1
+        return self.tbls[0]
+
+
+@dataclasses.dataclass
+class SampleClause:
+    """Defines a sampling clause for a table."""
+
+    version: Optional[int]
+    n: Optional[int]
+    n_per_stratum: Optional[int]
+    fraction: Optional[float]
+    seed: Optional[int]
+    stratify_exprs: Optional[list[exprs.Expr]]
+
+    # This seed value is used if one is not supplied
+    DEFAULT_SEED = 0
+
+    # The version of the hashing algorithm used for ordering and fractional sampling.
+    CURRENT_VERSION = 1
+
+    def __post_init__(self) -> None:
+        """If no version was provided, provide the default version"""
+        if self.version is None:
+            self.version = self.CURRENT_VERSION
+        if self.seed is None:
+            self.seed = self.DEFAULT_SEED
+
+    @property
+    def is_stratified(self) -> bool:
+        """Check if the sampling is stratified"""
+        return self.stratify_exprs is not None and len(self.stratify_exprs) > 0
+
+    @property
+    def is_repeatable(self) -> bool:
+        """Return true if the same rows will continue to be sampled if source rows are added or deleted."""
+        return not self.is_stratified and self.fraction is not None
+
+    def display_str(self, inline: bool = False) -> str:
+        return str(self)
+
+    def as_dict(self) -> dict:
+        """Return a dictionary representation of the object"""
+        d = dataclasses.asdict(self)
+        d['_classname'] = self.__class__.__name__
+        if self.is_stratified:
+            d['stratify_exprs'] = [e.as_dict() for e in self.stratify_exprs]
+        return d
+
+    @classmethod
+    def from_dict(cls, d: dict) -> SampleClause:
+        """Create a SampleClause from a dictionary representation"""
+        d_cleaned = {key: value for key, value in d.items() if key != '_classname'}
+        s = cls(**d_cleaned)
+        if s.is_stratified:
+            s.stratify_exprs = [exprs.Expr.from_dict(e) for e in d_cleaned.get('stratify_exprs', [])]
+        return s
+
+    def __repr__(self) -> str:
+        s = ','.join(e.display_str(inline=True) for e in self.stratify_exprs)
+        return (
+            f'sample_{self.version}(n={self.n}, n_per_stratum={self.n_per_stratum}, '
+            f'fraction={self.fraction}, seed={self.seed}, [{s}])'
+        )
+
+    @classmethod
+    def fraction_to_md5_hex(cls, fraction: float) -> str:
+        """Return the string representation of an approximation (to ~1e-9) of a fraction of the total space
+        of md5 hash values.
+        This is used for fractional sampling.
+        """
+        # Maximum count for the upper 32 bits of MD5: 2^32
+        max_md5_value = (2**32) - 1
+
+        # Calculate the fraction of this value
+        threshold_int = max_md5_value * int(1_000_000_000 * fraction) // 1_000_000_000
+
+        # Convert to hexadecimal string with padding
+        return format(threshold_int, '08x') + 'ffffffffffffffffffffffff'
+
 
 class Analyzer:
     """
@@ -87,6 +169,8 @@ class Analyzer:
     group_by_clause: Optional[list[exprs.Expr]]  # None for non-aggregate queries; [] for agg query w/o grouping
     grouping_exprs: list[exprs.Expr]  # [] for non-aggregate queries or agg query w/o grouping
     order_by_clause: OrderByClause
+    stratify_exprs: list[exprs.Expr]  # [] if no stratification is required
+    sample_clause: Optional[SampleClause]  # None if no sampling clause is present
 
     sql_elements: exprs.SqlElementCache
 
@@ -107,6 +191,7 @@
         where_clause: Optional[exprs.Expr] = None,
         group_by_clause: Optional[list[exprs.Expr]] = None,
         order_by_clause: Optional[list[tuple[exprs.Expr, bool]]] = None,
+        sample_clause: Optional[SampleClause] = None,
     ):
         if order_by_clause is None:
             order_by_clause = []
@@ -120,6 +205,11 @@
         self.group_by_clause = (
             [e.resolve_computed_cols() for e in group_by_clause] if group_by_clause is not None else None
         )
+        self.sample_clause = sample_clause
+        if self.sample_clause is not None and self.sample_clause.is_stratified:
+            self.stratify_exprs = [e.resolve_computed_cols() for e in sample_clause.stratify_exprs]
+        else:
+            self.stratify_exprs = []
         self.order_by_clause = [OrderByItem(e.resolve_computed_cols(), asc) for e, asc in order_by_clause]
 
         self.sql_where_clause = None
@@ -135,8 +225,11 @@
                 self.all_exprs.append(join_clause.join_predicate)
         if self.group_by_clause is not None:
             self.all_exprs.extend(self.group_by_clause)
+        self.all_exprs.extend(self.stratify_exprs)
         self.all_exprs.extend(e for e, _ in self.order_by_clause)
         if self.filter is not None:
+            if sample_clause is not None:
+                raise excs.Error(f'Filter {self.filter} not expressible in SQL')
             self.all_exprs.append(self.filter)
 
         self.agg_order_by = []
@@ -260,7 +353,7 @@ class Planner:
     # TODO: create an exec.CountNode and change this to create_count_plan()
     @classmethod
     def create_count_stmt(cls, tbl: catalog.TableVersionPath, where_clause: Optional[exprs.Expr] = None) -> sql.Select:
-        stmt = sql.select(sql.func.count())
+        stmt = sql.select(sql.func.count().label('all_count'))
         refd_tbl_ids: set[UUID] = set()
         if where_clause is not None:
             analyzer = cls.analyze(tbl, where_clause)
@@ -289,7 +382,7 @@
 
         # create InMemoryDataNode for 'rows'
         plan: exec.ExecNode = exec.InMemoryDataNode(
-            TableVersionHandle(tbl.id, tbl.effective_version), rows, row_builder, tbl.next_rowid
+            TableVersionHandle(tbl.id, tbl.effective_version), rows, row_builder, tbl.next_row_id
         )
 
         media_input_col_info = [
@@ -322,6 +415,13 @@
         )
         return plan
 
+    @classmethod
+    def rowid_columns(cls, target: TableVersionHandle, num_rowid_cols: Optional[int] = None) -> list[exprs.Expr]:
+        """Return list of RowidRef for the given number of associated rowids"""
+        if num_rowid_cols is None:
+            num_rowid_cols = target.get().num_rowid_columns()
+        return [exprs.RowidRef(target, i) for i in range(num_rowid_cols)]
+
     @classmethod
     def create_df_insert_plan(
         cls, tbl: catalog.TableVersion, df: 'pxt.DataFrame', ignore_errors: bool
@@ -385,7 +485,7 @@
 
         cls.__check_valid_columns(tbl.tbl_version.get(), recomputed_cols, 'updated in')
 
-        recomputed_base_cols = {col for col in recomputed_cols if col.tbl == tbl.tbl_version}
+        recomputed_base_cols = {col for col in recomputed_cols if col.tbl.id == tbl.tbl_version.id}
         copied_cols = [
             col
             for col in target.cols_by_id.values()
@@ -409,7 +509,7 @@
         for i, col in enumerate(all_base_cols):
             plan.row_builder.add_table_column(col, select_list[i].slot_idx)
         recomputed_user_cols = [c for c in recomputed_cols if c.name is not None]
-        return plan, [f'{c.tbl.get().name}.{c.name}' for c in updated_cols + recomputed_user_cols], recomputed_user_cols
+        return plan, [f'{c.tbl.name}.{c.name}' for c in updated_cols + recomputed_user_cols], recomputed_user_cols
 
     @classmethod
     def __check_valid_columns(
@@ -465,7 +565,7 @@
         recomputed_cols.update(idx_val_cols)
         # we only need to recompute stored columns (unstored ones are substituted away)
         recomputed_cols = {c for c in recomputed_cols if c.is_stored}
-        recomputed_base_cols = {col for col in recomputed_cols if col.tbl == target}
+        recomputed_base_cols = {col for col in recomputed_cols if col.tbl.id == target.id}
         copied_cols = [
             col
             for col in target.cols_by_id.values()
@@ -591,8 +691,13 @@
         # 2. for component views: iterator args
         iterator_args = [target.iterator_args] if target.iterator_args is not None else []
 
-        row_builder = exprs.RowBuilder(iterator_args, stored_cols, [])
+        from_clause = FromClause(tbls=[view.base])
+        base_analyzer = Analyzer(
+            from_clause, iterator_args, where_clause=target.predicate, sample_clause=target.sample_clause
+        )
+        row_builder = exprs.RowBuilder(base_analyzer.all_exprs, stored_cols, [])
 
+        # if we're propagating an insert, we only want to see those base rows that were created for the current version
         # execution plan:
         # 1. materialize exprs computed from the base that are needed for stored view columns
         # 2. if it's an iterator view, expand the base rows into component rows
@@ -603,8 +708,11 @@
             for e in row_builder.default_eval_ctx.target_exprs
             if e.is_bound_by([view]) and not e.is_bound_by([view.base])
         ]
-        # if we're propagating an insert, we only want to see those base rows that were created for the current version
-        base_analyzer = Analyzer(FromClause(tbls=[view.base]), base_output_exprs, where_clause=target.predicate)
+
+        # Create a new analyzer reflecting exactly what is required from the base table
+        base_analyzer = Analyzer(
+            from_clause, base_output_exprs, where_clause=target.predicate, sample_clause=target.sample_clause
+        )
         base_eval_ctx = row_builder.create_eval_ctx(base_analyzer.all_exprs)
         plan = cls._create_query_plan(
             row_builder=row_builder,
@@ -701,6 +809,7 @@
         group_by_clause: Optional[list[exprs.Expr]] = None,
         order_by_clause: Optional[list[tuple[exprs.Expr, bool]]] = None,
         limit: Optional[exprs.Expr] = None,
+        sample_clause: Optional[SampleClause] = None,
         ignore_errors: bool = False,
         exact_version_only: Optional[list[catalog.TableVersionHandle]] = None,
     ) -> exec.ExecNode:
@@ -714,12 +823,14 @@
             order_by_clause = []
         if exact_version_only is None:
             exact_version_only = []
+
         analyzer = Analyzer(
             from_clause,
             select_list,
             where_clause=where_clause,
             group_by_clause=group_by_clause,
             order_by_clause=order_by_clause,
+            sample_clause=sample_clause,
         )
         row_builder = exprs.RowBuilder(analyzer.all_exprs, [], [])
 
@@ -773,6 +884,7 @@
         # - join clause subexprs
         # - subexprs of Where clause conjuncts that can't be run in SQL
         # - all grouping exprs
+        # - all stratify exprs
         candidates = list(
             exprs.Expr.list_subexprs(
                 analyzer.select_list,
@@ -787,10 +899,12 @@
             candidates.extend(
                 exprs.Expr.subexprs(analyzer.filter, filter=sql_elements.contains, traverse_matches=False)
             )
-        if analyzer.group_by_clause is not None:
-            candidates.extend(
-                exprs.Expr.list_subexprs(analyzer.group_by_clause, filter=sql_elements.contains, traverse_matches=False)
-            )
+        candidates.extend(
+            exprs.Expr.list_subexprs(analyzer.grouping_exprs, filter=sql_elements.contains, traverse_matches=False)
+        )
+        candidates.extend(
+            exprs.Expr.list_subexprs(analyzer.stratify_exprs, filter=sql_elements.contains, traverse_matches=False)
+        )
         # not isinstance(...): we don't want to materialize Literals via a Select
         sql_exprs = exprs.ExprSet(e for e in candidates if not isinstance(e, exprs.Literal))
 
@@ -835,6 +949,15 @@
             # we need to order the input for window functions
             plan.set_order_by(analyzer.get_window_fn_ob_clause())
 
+        if analyzer.sample_clause is not None:
+            plan = exec.SqlSampleNode(
+                row_builder,
+                input=plan,
+                select_list=tbl_scan_exprs,
+                sample_clause=analyzer.sample_clause,
+                stratify_exprs=analyzer.stratify_exprs,
+            )
+
         plan = cls._insert_prefetch_node(tbl.tbl_version.id, row_builder, plan)
 
         if analyzer.group_by_clause is not None:
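Note: these Planner changes thread the sampling clause down to the new exec.SqlSampleNode, so the sample is computed as part of the SQL scan rather than in Python. The user-facing entry point is not part of this file's diff; the sketch below is hypothetical, and the sample() method and its keyword names are assumptions inferred from the SampleClause fields, not confirmed here:

    import pixeltable as pxt

    t = pxt.get_table('films')

    # Hypothetical: repeatable fractional sample, i.e. SampleClause(fraction=0.01, seed=42)
    small = t.sample(fraction=0.01, seed=42).collect()

    # Hypothetical: stratified sample with 5 rows per genre,
    # i.e. SampleClause(n_per_stratum=5, stratify_exprs=[t.genre])
    per_genre = t.sample(n_per_stratum=5, stratify_by=t.genre, seed=42).collect()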
pixeltable/share/packager.py CHANGED
@@ -1,4 +1,7 @@
+import base64
 import datetime
+import io
+import itertools
 import json
 import logging
 import tarfile
@@ -7,17 +10,21 @@ import urllib.request
 import uuid
 from pathlib import Path
 from typing import Any, Iterator, Optional
+from uuid import UUID
 
 import more_itertools
+import numpy as np
+import PIL.Image
 import pyarrow as pa
 import pyarrow.parquet as pq
 import sqlalchemy as sql
 
 import pixeltable as pxt
-from pixeltable import catalog, exceptions as excs, metadata
+from pixeltable import catalog, exceptions as excs, metadata, type_system as ts
 from pixeltable.env import Env
 from pixeltable.metadata import schema
 from pixeltable.utils import sha256sum
+from pixeltable.utils.formatter import Formatter
 from pixeltable.utils.media_store import MediaStore
 
 _logger = logging.getLogger('pixeltable')
@@ -45,13 +52,17 @@ class TablePackager:
     media_files: dict[Path, str]  # Mapping from local media file paths to their tarball names
     md: dict[str, Any]
 
+    bundle_path: Path
+    preview_header: dict[str, str]
+    preview: list[list[Any]]
+
     def __init__(self, table: catalog.Table, additional_md: Optional[dict[str, Any]] = None) -> None:
         self.table = table
         self.tmp_dir = Path(Env.get().create_tmp_path())
         self.media_files = {}
 
         # Load metadata
-        with Env.get().begin_xact():
+        with catalog.Catalog.get().begin_xact(for_write=False):
             tbl_md = catalog.Catalog.get().load_replica_md(table)
         self.md = {
             'pxt_version': pxt.__version__,
@@ -66,20 +77,29 @@
         Export the table to a tarball containing Parquet tables and media files.
         """
         assert not self.tmp_dir.exists()  # Packaging can only be done once per TablePackager instance
-        _logger.info(f"Packaging table '{self.table._path}' and its ancestors in: {self.tmp_dir}")
+
+        _logger.info(f'Packaging table {self.table._path()!r} and its ancestors in: {self.tmp_dir}')
         self.tmp_dir.mkdir()
         with open(self.tmp_dir / 'metadata.json', 'w', encoding='utf8') as fp:
             json.dump(self.md, fp)
         self.tables_dir = self.tmp_dir / 'tables'
         self.tables_dir.mkdir()
-        with Env.get().begin_xact():
+        with catalog.Catalog.get().begin_xact(for_write=False):
             for tv in self.table._tbl_version_path.get_tbl_versions():
-                _logger.info(f"Exporting table '{tv.get().name}:{tv.get().version}'.")
+                _logger.info(f'Exporting table {tv.get().versioned_name!r}.')
                 self.__export_table(tv.get())
+
         _logger.info('Building archive.')
-        bundle_path = self.__build_tarball()
-        _logger.info(f'Packaging complete: {bundle_path}')
-        return bundle_path
+        self.bundle_path = self.__build_tarball()
+
+        _logger.info('Extracting preview data.')
+        self.md['count'] = self.table.count()
+        preview_header, preview = self.__extract_preview_data()
+        self.md['preview_header'] = preview_header
+        self.md['preview'] = preview
+
+        _logger.info(f'Packaging complete: {self.bundle_path}')
+        return self.bundle_path
 
     def __export_table(self, tv: catalog.TableVersion) -> None:
         """
@@ -107,7 +127,7 @@
         # We use snappy compression for the Parquet tables; the entire bundle will be bzip2-compressed later, so
         # faster compression should provide good performance while still reducing temporary storage utilization.
         parquet_writer = pq.ParquetWriter(parquet_file, parquet_schema, compression='SNAPPY')
-        filter_tv = self.table._tbl_version.get()
+        filter_tv = self.table._tbl_version_path.tbl_version.get()
         row_iter = tv.store_tbl.dump_rows(tv.version, filter_tv.store_tbl, filter_tv.version)
         for pa_table in self.__to_pa_tables(row_iter, sql_types, media_cols, parquet_schema):
             parquet_writer.write_table(pa_table)
@@ -206,6 +226,96 @@
             tf.add(src_file, arcname=f'media/{dest_name}')
         return bundle_path
 
+    def __extract_preview_data(self) -> tuple[dict[str, str], list[list[Any]]]:
+        """
+        Extract a preview of the table data for display in the UI.
+
+        In order to bound the size of the output data, all "unbounded" data types are resized:
+        - Strings are abbreviated as per Formatter.abbreviate()
+        - Arrays and JSON are shortened and formatted as strings
+        - Images are resized to thumbnail size as a base64-encoded webp
+        - Videos are replaced by their first frame and resized as above
+        - Documents are replaced by a thumbnail as a base64-encoded webp
+        """
+        # First 8 columns
+        preview_cols = dict(itertools.islice(self.table._get_schema().items(), 0, 8))
+        select_list = [self.table[col_name] for col_name in preview_cols]
+        # First 5 rows
+        rows = list(self.table.select(*select_list).head(n=5))
+
+        preview_header = {col_name: str(col_type._type) for col_name, col_type in preview_cols.items()}
+        preview = [
+            [self.__encode_preview_data(val, col_type)]
+            for row in rows
+            for val, col_type in zip(row.values(), preview_cols.values())
+        ]
+
+        return preview_header, preview
+
+    def __encode_preview_data(self, val: Any, col_type: ts.ColumnType) -> Any:
+        if val is None:
+            return None
+
+        match col_type._type:
+            case ts.ColumnType.Type.STRING:
+                assert isinstance(val, str)
+                return Formatter.abbreviate(val)
+
+            case ts.ColumnType.Type.INT | ts.ColumnType.Type.FLOAT | ts.ColumnType.Type.BOOL:
+                return val
+
+            case ts.ColumnType.Type.TIMESTAMP | ts.ColumnType.Type.DATE:
+                return str(val)
+
+            case ts.ColumnType.Type.ARRAY:
+                assert isinstance(val, np.ndarray)
+                return Formatter.format_array(val)
+
+            case ts.ColumnType.Type.JSON:
+                # We need to escape the JSON string server-side for security reasons.
+                # Therefore we don't escape it here, in order to avoid double-escaping.
+                return Formatter.format_json(val, escape_strings=False)
+
+            case ts.ColumnType.Type.IMAGE:
+                # Rescale the image to minimize data transfer size
+                assert isinstance(val, PIL.Image.Image)
+                return self.__encode_image(val)
+
+            case ts.ColumnType.Type.VIDEO:
+                assert isinstance(val, str)
+                return self.__encode_video(val)
+
+            case ts.ColumnType.Type.AUDIO:
+                return None
+
+            case ts.ColumnType.Type.DOCUMENT:
+                assert isinstance(val, str)
+                return self.__encode_document(val)
+
+            case _:
+                raise AssertionError(f'Unrecognized column type: {col_type._type}')
+
+    def __encode_image(self, img: PIL.Image.Image) -> str:
+        # Heuristic for thumbnail sizing:
+        # Standardize on a width of 240 pixels (to most efficiently utilize the columnar display).
+        # But, if the aspect ratio is below 2:3, bound the height at 360 pixels (to avoid unboundedly tall thumbnails
+        # in the case of highly oblong images).
+        if img.height > img.width * 1.5:
+            scaled_img = img.resize((img.width * 360 // img.height, 360))
+        else:
+            scaled_img = img.resize((240, img.height * 240 // img.width))
+        with io.BytesIO() as buffer:
+            scaled_img.save(buffer, 'webp')
+            return base64.b64encode(buffer.getvalue()).decode()
+
+    def __encode_video(self, video_path: str) -> Optional[str]:
+        thumb = Formatter.extract_first_video_frame(video_path)
+        return self.__encode_image(thumb) if thumb is not None else None
+
+    def __encode_document(self, doc_path: str) -> Optional[str]:
+        thumb = Formatter.make_document_thumbnail(doc_path)
+        return self.__encode_image(thumb) if thumb is not None else None
+
 
 class TableRestorer:
     """
@@ -253,13 +363,26 @@
         tbl_md = [schema.FullTableMd.from_dict(t) for t in self.md['md']['tables']]
 
         # Create the replica table
-        # TODO: This needs to be made concurrency-safe.
-        replica_tbl = catalog.Catalog.get().create_replica(catalog.Path(self.tbl_path), tbl_md)
-        assert replica_tbl._tbl_version.get().is_snapshot
+        # The logic here needs to be completely restructured in order to make it concurrency-safe.
+        # - Catalog.create_replica() needs to write the metadata and also create the physical store tables
+        #   and populate them, otherwise concurrent readers will see an inconsistent state (table metadata w/o
+        #   an actual table)
+        # - this could be done one replica at a time (instead of the entire hierarchy)
+        cat = catalog.Catalog.get()
+        cat.create_replica(catalog.Path(self.tbl_path), tbl_md)
+        # don't call get_table() until after the calls to create_replica() and __import_table() below;
+        # the TV instances created by get_table() would be replaced by create_replica(), which creates duplicate
+        # TV instances for the same replica version, which then leads to failures when constructing queries
 
         # Now we need to instantiate and load data for replica_tbl and its ancestors, except that we skip
         # replica_tbl itself if it's a pure snapshot.
-        if replica_tbl._id != replica_tbl._tbl_version.id:
+        target_md = tbl_md[0]
+        is_pure_snapshot = (
+            target_md.tbl_md.view_md is not None
+            and target_md.tbl_md.view_md.predicate is None
+            and len(target_md.schema_version_md.columns) == 0
+        )
+        if is_pure_snapshot:
             ancestor_md = tbl_md[1:]  # Pure snapshot; skip replica_tbl
         else:
             ancestor_md = tbl_md  # Not a pure snapshot; include replica_tbl
@@ -273,7 +396,8 @@
             _logger.info(f'Importing table {tv.name!r}.')
             self.__import_table(self.tmp_dir, tv, md)
 
-        return replica_tbl
+        with cat.begin_xact(for_write=False):
+            return cat.get_table_by_id(UUID(tbl_md[0].tbl_md.tbl_id))
 
     def __import_table(self, bundle_path: Path, tv: catalog.TableVersion, tbl_md: schema.FullTableMd) -> None:
         """
pixeltable/share/publish.py CHANGED
@@ -35,7 +35,7 @@ def push_replica(dest_tbl_uri: str, src_tbl: pxt.Table) -> str:
     upload_id = response_json['upload_id']
     destination_uri = response_json['destination_uri']
 
-    Env.get().console_logger.info(f"Creating a snapshot of '{src_tbl._path}' at: {dest_tbl_uri}")
+    Env.get().console_logger.info(f"Creating a snapshot of '{src_tbl._path()}' at: {dest_tbl_uri}")
 
     bundle = packager.package()
 
@@ -117,7 +117,7 @@ def pull_replica(dest_path: str, src_tbl_uri: str) -> pxt.Table:
 
     restorer = TableRestorer(dest_path, response_json)
     tbl = restorer.restore(bundle_path)
-    Env.get().console_logger.info(f'Created local replica {tbl._path!r} from URI: {src_tbl_uri}')
+    Env.get().console_logger.info(f'Created local replica {tbl._path()!r} from URI: {src_tbl_uri}')
     return tbl
 
 
pixeltable/store.py CHANGED
@@ -52,7 +52,8 @@ class StoreBase:
         # We need to declare a `base` variable here, even though it's only defined for instances of `StoreView`,
         # since it's referenced by various methods of `StoreBase`
         self.base = tbl_version.base.get().store_tbl if tbl_version.base is not None else None
-        self.create_sa_tbl()
+        # we're passing in tbl_version to avoid a circular call to TableVersionHandle.get()
+        self.create_sa_tbl(tbl_version)
 
     def system_columns(self) -> list[sql.Column]:
         return [*self._pk_cols, self.v_max_col]
@@ -77,11 +78,13 @@
         self._pk_cols = [*rowid_cols, self.v_min_col]
         return [*rowid_cols, self.v_min_col, self.v_max_col]
 
-    def create_sa_tbl(self) -> None:
+    def create_sa_tbl(self, tbl_version: Optional[catalog.TableVersion] = None) -> None:
         """Create self.sa_tbl from self.tbl_version."""
+        if tbl_version is None:
+            tbl_version = self.tbl_version.get()
         system_cols = self._create_system_columns()
         all_cols = system_cols.copy()
-        for col in [c for c in self.tbl_version.get().cols if c.is_stored]:
+        for col in [c for c in tbl_version.cols if c.is_stored]:
             # re-create sql.Column for each column, regardless of whether it already has sa_col set: it was bound
             # to the last sql.Table version we created and cannot be reused
             col.create_sa_cols()
@@ -99,16 +102,17 @@
         # - base x view joins can be executed as merge joins
         # - speeds up ORDER BY rowid DESC
         # - allows filtering for a particular table version in index scan
-        idx_name = f'sys_cols_idx_{self.tbl_version.id.hex}'
+        idx_name = f'sys_cols_idx_{tbl_version.id.hex}'
         idxs.append(sql.Index(idx_name, *system_cols))
 
         # v_min/v_max indices: speeds up base table scans needed to propagate a base table insert or delete
-        idx_name = f'vmin_idx_{self.tbl_version.id.hex}'
+        idx_name = f'vmin_idx_{tbl_version.id.hex}'
         idxs.append(sql.Index(idx_name, self.v_min_col, postgresql_using=Env.get().dbms.version_index_type))
-        idx_name = f'vmax_idx_{self.tbl_version.id.hex}'
+        idx_name = f'vmax_idx_{tbl_version.id.hex}'
         idxs.append(sql.Index(idx_name, self.v_max_col, postgresql_using=Env.get().dbms.version_index_type))
 
         self.sa_tbl = sql.Table(self._storage_name(), self.sa_md, *all_cols, *idxs)
+        # _logger.debug(f'created sa tbl for {tbl_version.id!s} (sa_tbl={id(self.sa_tbl):x}, tv={id(tbl_version):x})')
 
     @abc.abstractmethod
     def _rowid_join_predicate(self) -> sql.ColumnElement[bool]:
@@ -285,7 +289,7 @@
             else:
                 if col.col_type.is_image_type() and result_row.file_urls[value_expr_slot_idx] is None:
                     # we have yet to store this image
-                    filepath = str(MediaStore.prepare_media_path(col.tbl.id, col.id, col.tbl.get().version))
+                    filepath = str(MediaStore.prepare_media_path(col.tbl.id, col.id, col.tbl.version))
                     result_row.flush_img(value_expr_slot_idx, filepath)
                 val = result_row.get_stored_val(value_expr_slot_idx, col.sa_col.type)
                 if col.col_type.is_media_type():
@@ -415,9 +419,7 @@
             number of deleted rows
         """
         where_clause = sql.true() if where_clause is None else where_clause
-        where_clause = sql.and_(
-            self.v_min_col < current_version, self.v_max_col == schema.Table.MAX_VERSION, where_clause
-        )
+        version_clause = sql.and_(self.v_min_col < current_version, self.v_max_col == schema.Table.MAX_VERSION)
         rowid_join_clause = self._rowid_join_predicate()
         base_versions_clause = (
             sql.true() if len(base_versions) == 0 else self.base._versions_clause(base_versions, match_on_vmin)
@@ -428,10 +430,12 @@
                 set_clause[index_info.undo_col.sa_col] = index_info.val_col.sa_col
                 # set value column to NULL
                 set_clause[index_info.val_col.sa_col] = None
+
         stmt = (
             sql.update(self.sa_tbl)
             .values(set_clause)
             .where(where_clause)
+            .where(version_clause)
             .where(rowid_join_clause)
             .where(base_versions_clause)
         )
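Note: splitting the version predicate out of where_clause preserves the original semantics because chained SQLAlchemy .where() calls are ANDed together. A minimal illustration:

    import sqlalchemy as sql

    t = sql.table('t', sql.column('x'), sql.column('v_min'))
    stmt = sql.update(t).values(x=None).where(t.c.x > 0).where(t.c.v_min < 5)
    print(stmt)  # UPDATE t SET x=:x WHERE t.x > :x_1 AND t.v_min < :v_min_1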
@@ -528,10 +532,12 @@ class StoreComponentView(StoreView):
         self.rowid_cols.append(self.pos_col)
         return self.rowid_cols
 
-    def create_sa_tbl(self) -> None:
-        super().create_sa_tbl()
+    def create_sa_tbl(self, tbl_version: Optional[catalog.TableVersion] = None) -> None:
+        if tbl_version is None:
+            tbl_version = self.tbl_version.get()
+        super().create_sa_tbl(tbl_version)
         # we need to fix up the 'pos' column in TableVersion
-        self.tbl_version.get().cols_by_name['pos'].sa_col = self.pos_col
+        tbl_version.cols_by_name['pos'].sa_col = self.pos_col
 
     def _rowid_join_predicate(self) -> sql.ColumnElement[bool]:
         return sql.and_(
pixeltable/type_system.py CHANGED
@@ -395,6 +395,36 @@
             raise excs.Error(f'Standard Python type `{name}` cannot be used here; use `{suggestion}` instead')
         raise excs.Error(f'Unknown type: {t}')
 
+    @classmethod
+    def from_json_schema(cls, schema: dict[str, Any]) -> Optional[ColumnType]:
+        # We first express the JSON schema as a Python type, and then convert it to a Pixeltable type.
+        # TODO: Is there a meaningful fallback if one of these operations fails? (Maybe another use case for a pxt Any
+        # type?)
+        py_type = cls.__json_schema_to_py_type(schema)
+        return cls.from_python_type(py_type) if py_type is not None else None
+
+    @classmethod
+    def __json_schema_to_py_type(cls, schema: dict[str, Any]) -> Union[type, _GenericAlias, None]:
+        if 'type' in schema:
+            if schema['type'] == 'null':
+                return type(None)
+            if schema['type'] == 'string':
+                return str
+            if schema['type'] == 'integer':
+                return int
+            if schema['type'] == 'number':
+                return float
+            if schema['type'] == 'boolean':
+                return bool
+            if schema['type'] in ('array', 'object'):
+                return list
+        elif 'anyOf' in schema:
+            subscripts = tuple(cls.__json_schema_to_py_type(subschema) for subschema in schema['anyOf'])
+            if all(subscript is not None for subscript in subscripts):
+                return Union[subscripts]
+
+        return None
+
     def validate_literal(self, val: Any) -> None:
         """Raise TypeError if val is not a valid literal for this type"""
         if val is None:
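Note: a short usage sketch of the new JSON-schema conversion, following the branches above (the exact ColumnType instances returned depend on from_python_type):

    import pixeltable.type_system as ts

    # 'string' maps to str, which from_python_type turns into the Pixeltable string type
    print(ts.ColumnType.from_json_schema({'type': 'string'}))

    # 'anyOf' including 'null' becomes Optional[...] on the Python side, i.e. a nullable column type
    print(ts.ColumnType.from_json_schema({'anyOf': [{'type': 'integer'}, {'type': 'null'}]}))

    # Schemas with neither 'type' nor 'anyOf' yield None rather than raising
    assert ts.ColumnType.from_json_schema({'not': {}}) is None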
pixeltable/utils/dbms.py CHANGED
@@ -35,7 +35,7 @@ class PostgresqlDbms(Dbms):
     """
 
     def __init__(self, db_url: URL):
-        super().__init__('postgresql', 'REPEATABLE READ', 'brin', db_url)
+        super().__init__('postgresql', 'SERIALIZABLE', 'brin', db_url)
 
     def drop_db_stmt(self, database: str) -> str:
        return f'DROP DATABASE {database}'
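Note: the second constructor argument is the transaction isolation level, so Postgres sessions now run under SERIALIZABLE instead of REPEATABLE READ. How the Dbms class applies it isn't shown in this diff; with SQLAlchemy it is typically set at engine creation, as in this hypothetical sketch:

    import sqlalchemy as sql

    # Hypothetical illustration only; the URL is a placeholder, not Pixeltable's actual configuration.
    engine = sql.create_engine(
        'postgresql+psycopg://user:pass@localhost/pixeltable',
        isolation_level='SERIALIZABLE',
    )
    # Under SERIALIZABLE, concurrent transactions may fail with serialization errors and need a retry,
    # which fits this release's move toward explicit read/write transactions
    # (e.g. catalog.Catalog.get().begin_xact(for_write=False) in packager.py above).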