pixeltable 0.4.0rc3__py3-none-any.whl → 0.4.20__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as they appear in their respective public registries, and is provided for informational purposes only.

Potentially problematic release.

Files changed (202)
  1. pixeltable/__init__.py +23 -5
  2. pixeltable/_version.py +1 -0
  3. pixeltable/catalog/__init__.py +5 -3
  4. pixeltable/catalog/catalog.py +1318 -404
  5. pixeltable/catalog/column.py +186 -115
  6. pixeltable/catalog/dir.py +1 -2
  7. pixeltable/catalog/globals.py +11 -43
  8. pixeltable/catalog/insertable_table.py +167 -79
  9. pixeltable/catalog/path.py +61 -23
  10. pixeltable/catalog/schema_object.py +9 -10
  11. pixeltable/catalog/table.py +626 -308
  12. pixeltable/catalog/table_metadata.py +101 -0
  13. pixeltable/catalog/table_version.py +713 -569
  14. pixeltable/catalog/table_version_handle.py +37 -6
  15. pixeltable/catalog/table_version_path.py +42 -29
  16. pixeltable/catalog/tbl_ops.py +50 -0
  17. pixeltable/catalog/update_status.py +191 -0
  18. pixeltable/catalog/view.py +108 -94
  19. pixeltable/config.py +128 -22
  20. pixeltable/dataframe.py +188 -100
  21. pixeltable/env.py +407 -136
  22. pixeltable/exceptions.py +6 -0
  23. pixeltable/exec/__init__.py +3 -0
  24. pixeltable/exec/aggregation_node.py +7 -8
  25. pixeltable/exec/cache_prefetch_node.py +83 -110
  26. pixeltable/exec/cell_materialization_node.py +231 -0
  27. pixeltable/exec/cell_reconstruction_node.py +135 -0
  28. pixeltable/exec/component_iteration_node.py +4 -3
  29. pixeltable/exec/data_row_batch.py +8 -65
  30. pixeltable/exec/exec_context.py +16 -4
  31. pixeltable/exec/exec_node.py +13 -36
  32. pixeltable/exec/expr_eval/evaluators.py +7 -6
  33. pixeltable/exec/expr_eval/expr_eval_node.py +27 -12
  34. pixeltable/exec/expr_eval/globals.py +8 -5
  35. pixeltable/exec/expr_eval/row_buffer.py +1 -2
  36. pixeltable/exec/expr_eval/schedulers.py +190 -30
  37. pixeltable/exec/globals.py +32 -0
  38. pixeltable/exec/in_memory_data_node.py +18 -18
  39. pixeltable/exec/object_store_save_node.py +293 -0
  40. pixeltable/exec/row_update_node.py +16 -9
  41. pixeltable/exec/sql_node.py +206 -101
  42. pixeltable/exprs/__init__.py +1 -1
  43. pixeltable/exprs/arithmetic_expr.py +27 -22
  44. pixeltable/exprs/array_slice.py +3 -3
  45. pixeltable/exprs/column_property_ref.py +34 -30
  46. pixeltable/exprs/column_ref.py +92 -96
  47. pixeltable/exprs/comparison.py +5 -5
  48. pixeltable/exprs/compound_predicate.py +5 -4
  49. pixeltable/exprs/data_row.py +152 -55
  50. pixeltable/exprs/expr.py +62 -43
  51. pixeltable/exprs/expr_dict.py +3 -3
  52. pixeltable/exprs/expr_set.py +17 -10
  53. pixeltable/exprs/function_call.py +75 -37
  54. pixeltable/exprs/globals.py +1 -2
  55. pixeltable/exprs/in_predicate.py +4 -4
  56. pixeltable/exprs/inline_expr.py +10 -27
  57. pixeltable/exprs/is_null.py +1 -3
  58. pixeltable/exprs/json_mapper.py +8 -8
  59. pixeltable/exprs/json_path.py +56 -22
  60. pixeltable/exprs/literal.py +5 -5
  61. pixeltable/exprs/method_ref.py +2 -2
  62. pixeltable/exprs/object_ref.py +2 -2
  63. pixeltable/exprs/row_builder.py +127 -53
  64. pixeltable/exprs/rowid_ref.py +8 -12
  65. pixeltable/exprs/similarity_expr.py +50 -25
  66. pixeltable/exprs/sql_element_cache.py +4 -4
  67. pixeltable/exprs/string_op.py +5 -5
  68. pixeltable/exprs/type_cast.py +3 -5
  69. pixeltable/func/__init__.py +1 -0
  70. pixeltable/func/aggregate_function.py +8 -8
  71. pixeltable/func/callable_function.py +9 -9
  72. pixeltable/func/expr_template_function.py +10 -10
  73. pixeltable/func/function.py +18 -20
  74. pixeltable/func/function_registry.py +6 -7
  75. pixeltable/func/globals.py +2 -3
  76. pixeltable/func/mcp.py +74 -0
  77. pixeltable/func/query_template_function.py +20 -18
  78. pixeltable/func/signature.py +43 -16
  79. pixeltable/func/tools.py +23 -13
  80. pixeltable/func/udf.py +18 -20
  81. pixeltable/functions/__init__.py +6 -0
  82. pixeltable/functions/anthropic.py +93 -33
  83. pixeltable/functions/audio.py +114 -10
  84. pixeltable/functions/bedrock.py +13 -6
  85. pixeltable/functions/date.py +1 -1
  86. pixeltable/functions/deepseek.py +20 -9
  87. pixeltable/functions/fireworks.py +2 -2
  88. pixeltable/functions/gemini.py +28 -11
  89. pixeltable/functions/globals.py +13 -13
  90. pixeltable/functions/groq.py +108 -0
  91. pixeltable/functions/huggingface.py +1046 -23
  92. pixeltable/functions/image.py +9 -18
  93. pixeltable/functions/llama_cpp.py +23 -8
  94. pixeltable/functions/math.py +3 -4
  95. pixeltable/functions/mistralai.py +4 -15
  96. pixeltable/functions/ollama.py +16 -9
  97. pixeltable/functions/openai.py +104 -82
  98. pixeltable/functions/openrouter.py +143 -0
  99. pixeltable/functions/replicate.py +2 -2
  100. pixeltable/functions/reve.py +250 -0
  101. pixeltable/functions/string.py +21 -28
  102. pixeltable/functions/timestamp.py +13 -14
  103. pixeltable/functions/together.py +4 -6
  104. pixeltable/functions/twelvelabs.py +92 -0
  105. pixeltable/functions/util.py +6 -1
  106. pixeltable/functions/video.py +1388 -106
  107. pixeltable/functions/vision.py +7 -7
  108. pixeltable/functions/whisper.py +15 -7
  109. pixeltable/functions/whisperx.py +179 -0
  110. pixeltable/{ext/functions → functions}/yolox.py +2 -4
  111. pixeltable/globals.py +332 -105
  112. pixeltable/index/base.py +13 -22
  113. pixeltable/index/btree.py +23 -22
  114. pixeltable/index/embedding_index.py +32 -44
  115. pixeltable/io/__init__.py +4 -2
  116. pixeltable/io/datarows.py +7 -6
  117. pixeltable/io/external_store.py +49 -77
  118. pixeltable/io/fiftyone.py +11 -11
  119. pixeltable/io/globals.py +29 -28
  120. pixeltable/io/hf_datasets.py +17 -9
  121. pixeltable/io/label_studio.py +70 -66
  122. pixeltable/io/lancedb.py +3 -0
  123. pixeltable/io/pandas.py +12 -11
  124. pixeltable/io/parquet.py +13 -93
  125. pixeltable/io/table_data_conduit.py +71 -47
  126. pixeltable/io/utils.py +3 -3
  127. pixeltable/iterators/__init__.py +2 -1
  128. pixeltable/iterators/audio.py +21 -11
  129. pixeltable/iterators/document.py +116 -55
  130. pixeltable/iterators/image.py +5 -2
  131. pixeltable/iterators/video.py +293 -13
  132. pixeltable/metadata/__init__.py +4 -2
  133. pixeltable/metadata/converters/convert_18.py +2 -2
  134. pixeltable/metadata/converters/convert_19.py +2 -2
  135. pixeltable/metadata/converters/convert_20.py +2 -2
  136. pixeltable/metadata/converters/convert_21.py +2 -2
  137. pixeltable/metadata/converters/convert_22.py +2 -2
  138. pixeltable/metadata/converters/convert_24.py +2 -2
  139. pixeltable/metadata/converters/convert_25.py +2 -2
  140. pixeltable/metadata/converters/convert_26.py +2 -2
  141. pixeltable/metadata/converters/convert_29.py +4 -4
  142. pixeltable/metadata/converters/convert_34.py +2 -2
  143. pixeltable/metadata/converters/convert_36.py +2 -2
  144. pixeltable/metadata/converters/convert_37.py +15 -0
  145. pixeltable/metadata/converters/convert_38.py +39 -0
  146. pixeltable/metadata/converters/convert_39.py +124 -0
  147. pixeltable/metadata/converters/convert_40.py +73 -0
  148. pixeltable/metadata/converters/util.py +13 -12
  149. pixeltable/metadata/notes.py +4 -0
  150. pixeltable/metadata/schema.py +79 -42
  151. pixeltable/metadata/utils.py +74 -0
  152. pixeltable/mypy/__init__.py +3 -0
  153. pixeltable/mypy/mypy_plugin.py +123 -0
  154. pixeltable/plan.py +274 -223
  155. pixeltable/share/__init__.py +1 -1
  156. pixeltable/share/packager.py +259 -129
  157. pixeltable/share/protocol/__init__.py +34 -0
  158. pixeltable/share/protocol/common.py +170 -0
  159. pixeltable/share/protocol/operation_types.py +33 -0
  160. pixeltable/share/protocol/replica.py +109 -0
  161. pixeltable/share/publish.py +213 -57
  162. pixeltable/store.py +238 -175
  163. pixeltable/type_system.py +104 -63
  164. pixeltable/utils/__init__.py +2 -3
  165. pixeltable/utils/arrow.py +108 -13
  166. pixeltable/utils/av.py +298 -0
  167. pixeltable/utils/azure_store.py +305 -0
  168. pixeltable/utils/code.py +3 -3
  169. pixeltable/utils/console_output.py +4 -1
  170. pixeltable/utils/coroutine.py +6 -23
  171. pixeltable/utils/dbms.py +31 -5
  172. pixeltable/utils/description_helper.py +4 -5
  173. pixeltable/utils/documents.py +5 -6
  174. pixeltable/utils/exception_handler.py +7 -30
  175. pixeltable/utils/filecache.py +6 -6
  176. pixeltable/utils/formatter.py +4 -6
  177. pixeltable/utils/gcs_store.py +283 -0
  178. pixeltable/utils/http_server.py +2 -3
  179. pixeltable/utils/iceberg.py +1 -2
  180. pixeltable/utils/image.py +17 -0
  181. pixeltable/utils/lancedb.py +88 -0
  182. pixeltable/utils/local_store.py +316 -0
  183. pixeltable/utils/misc.py +5 -0
  184. pixeltable/utils/object_stores.py +528 -0
  185. pixeltable/utils/pydantic.py +60 -0
  186. pixeltable/utils/pytorch.py +5 -6
  187. pixeltable/utils/s3_store.py +392 -0
  188. pixeltable-0.4.20.dist-info/METADATA +587 -0
  189. pixeltable-0.4.20.dist-info/RECORD +218 -0
  190. {pixeltable-0.4.0rc3.dist-info → pixeltable-0.4.20.dist-info}/WHEEL +1 -1
  191. pixeltable-0.4.20.dist-info/entry_points.txt +2 -0
  192. pixeltable/__version__.py +0 -3
  193. pixeltable/ext/__init__.py +0 -17
  194. pixeltable/ext/functions/__init__.py +0 -11
  195. pixeltable/ext/functions/whisperx.py +0 -77
  196. pixeltable/utils/media_store.py +0 -77
  197. pixeltable/utils/s3.py +0 -17
  198. pixeltable/utils/sample.py +0 -25
  199. pixeltable-0.4.0rc3.dist-info/METADATA +0 -435
  200. pixeltable-0.4.0rc3.dist-info/RECORD +0 -189
  201. pixeltable-0.4.0rc3.dist-info/entry_points.txt +0 -3
  202. {pixeltable-0.4.0rc3.dist-info → pixeltable-0.4.20.dist-info/licenses}/LICENSE +0 -0
pixeltable/share/__init__.py
@@ -1,3 +1,3 @@
 # ruff: noqa: F401
 
-from .publish import pull_replica, push_replica
+from .publish import delete_replica, list_table_versions, pull_replica, push_replica
pixeltable/share/packager.py
@@ -1,7 +1,6 @@
 import base64
-import datetime
+import dataclasses
 import io
-import itertools
 import json
 import logging
 import tarfile
@@ -9,11 +8,12 @@ import urllib.parse
 import urllib.request
 import uuid
 from pathlib import Path
-from typing import Any, Iterator, Optional
+from typing import Any, Iterator
 from uuid import UUID
 
 import more_itertools
 import numpy as np
+import pgvector.sqlalchemy as sql_vector  # type: ignore[import-untyped]
 import PIL.Image
 import pyarrow as pa
 import pyarrow.parquet as pq
@@ -21,11 +21,14 @@ import sqlalchemy as sql
 
 import pixeltable as pxt
 from pixeltable import catalog, exceptions as excs, metadata, type_system as ts
+from pixeltable.catalog.table_version import TableVersionCompleteMd
 from pixeltable.env import Env
+from pixeltable.exprs.data_row import CellMd
 from pixeltable.metadata import schema
 from pixeltable.utils import sha256sum
 from pixeltable.utils.formatter import Formatter
-from pixeltable.utils.media_store import MediaStore
+from pixeltable.utils.local_store import TempStore
+from pixeltable.utils.object_stores import ObjectOps
 
 _logger = logging.getLogger('pixeltable')
 
@@ -50,27 +53,27 @@ class TablePackager:
     tmp_dir: Path  # Temporary directory where the package will reside
     tables_dir: Path  # Directory where the Parquet tables will be written
     media_files: dict[Path, str]  # Mapping from local media file paths to their tarball names
-    md: dict[str, Any]
+    bundle_md: dict[str, Any]
 
     bundle_path: Path
     preview_header: dict[str, str]
     preview: list[list[Any]]
 
-    def __init__(self, table: catalog.Table, additional_md: Optional[dict[str, Any]] = None) -> None:
+    def __init__(self, table: catalog.Table, additional_md: dict[str, Any] | None = None) -> None:
         self.table = table
-        self.tmp_dir = Path(Env.get().create_tmp_path())
+        self.tmp_dir = TempStore.create_path()
         self.media_files = {}
 
-        # Load metadata
+        # Load metadata and convert to JSON immediately
         with catalog.Catalog.get().begin_xact(for_write=False):
             tbl_md = catalog.Catalog.get().load_replica_md(table)
-            self.md = {
+            self.bundle_md = {
                 'pxt_version': pxt.__version__,
                 'pxt_md_version': metadata.VERSION,
-                'md': {'tables': [md.as_dict() for md in tbl_md]},
+                'md': [dataclasses.asdict(md) for md in tbl_md],
             }
         if additional_md is not None:
-            self.md.update(additional_md)
+            self.bundle_md.update(additional_md)
 
     def package(self) -> Path:
         """
@@ -81,7 +84,7 @@ class TablePackager:
         _logger.info(f'Packaging table {self.table._path()!r} and its ancestors in: {self.tmp_dir}')
         self.tmp_dir.mkdir()
         with open(self.tmp_dir / 'metadata.json', 'w', encoding='utf8') as fp:
-            json.dump(self.md, fp)
+            json.dump(self.bundle_md, fp)
         self.tables_dir = self.tmp_dir / 'tables'
         self.tables_dir.mkdir()
         with catalog.Catalog.get().begin_xact(for_write=False):
@@ -93,10 +96,10 @@ class TablePackager:
         self.bundle_path = self.__build_tarball()
 
         _logger.info('Extracting preview data.')
-        self.md['count'] = self.table.count()
+        self.bundle_md['row_count'] = self.table.count()
         preview_header, preview = self.__extract_preview_data()
-        self.md['preview_header'] = preview_header
-        self.md['preview'] = preview
+        self.bundle_md['preview_header'] = preview_header
+        self.bundle_md['preview_data'] = preview
 
         _logger.info(f'Packaging complete: {self.bundle_path}')
         return self.bundle_path
@@ -109,9 +112,12 @@ class TablePackager:
         assert any(tv.id == base.id for base in self.table._tbl_version_path.get_tbl_versions())
         sql_types = {col.name: col.type for col in tv.store_tbl.sa_tbl.columns}
         media_cols: set[str] = set()
+        cellmd_cols: set[str] = set()
         for col in tv.cols:
             if col.is_stored and col.col_type.is_media_type():
                 media_cols.add(col.store_name())
+            if col.stores_cellmd:
+                cellmd_cols.add(col.cellmd_store_name())
 
         parquet_schema = self.__to_parquet_schema(tv.store_tbl.sa_tbl)
         # TODO: Partition larger tables into multiple parquet files. (The parquet file naming scheme anticipates
@@ -126,10 +132,10 @@ class TablePackager:
         # excessive memory usage. The pyarrow tables are then amalgamated into the (single) Parquet table on disk.
         # We use snappy compression for the Parquet tables; the entire bundle will be bzip2-compressed later, so
         # faster compression should provide good performance while still reducing temporary storage utilization.
-        parquet_writer = pq.ParquetWriter(parquet_file, parquet_schema, compression='SNAPPY')
-        filter_tv = self.table._tbl_version.get()
+        parquet_writer = pq.ParquetWriter(parquet_file, parquet_schema, compression='snappy')
+        filter_tv = self.table._tbl_version_path.tbl_version.get()
         row_iter = tv.store_tbl.dump_rows(tv.version, filter_tv.store_tbl, filter_tv.version)
-        for pa_table in self.__to_pa_tables(row_iter, sql_types, media_cols, parquet_schema):
+        for pa_table in self.__to_pa_tables(row_iter, sql_types, media_cols, cellmd_cols, parquet_schema):
            parquet_writer.write_table(pa_table)
         parquet_writer.close()
 
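
The batch-wise write pattern described in the comments above can be seen in isolation in the following standalone sketch (toy schema, row source, and file name are assumptions; only the ParquetWriter/batched pattern mirrors the code in this hunk):

    import more_itertools
    import pyarrow as pa
    import pyarrow.parquet as pq

    # Toy schema and row iterator standing in for the store table's dumped rows.
    schema = pa.schema([('x', pa.int64())])
    row_iter = ({'x': i} for i in range(10_000))

    # Write a single Parquet file batch-by-batch to bound memory usage, with
    # snappy compression (the whole bundle is bzip2-compressed afterwards).
    writer = pq.ParquetWriter('out.parquet', schema, compression='snappy')
    for batch in more_itertools.batched(row_iter, 1_000):
        writer.write_table(pa.Table.from_pydict({'x': [r['x'] for r in batch]}, schema=schema))
    writer.close()
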
@@ -138,7 +144,7 @@ class TablePackager:
     @classmethod
     def __to_parquet_schema(cls, store_tbl: sql.Table) -> pa.Schema:
         entries = [(col_name, cls.__to_parquet_type(col.type)) for col_name, col in store_tbl.columns.items()]
-        return pa.schema(entries)  # type: ignore[arg-type]
+        return pa.schema(entries)
 
     @classmethod
     def __to_parquet_type(cls, col_type: sql.types.TypeEngine[Any]) -> pa.DataType:
@@ -151,13 +157,17 @@ class TablePackager:
         if isinstance(col_type, sql.Float):
             return pa.float32()
         if isinstance(col_type, sql.TIMESTAMP):
-            return pa.timestamp('us', tz=datetime.timezone.utc)
+            return pa.timestamp('us', tz='UTC')
         if isinstance(col_type, sql.Date):
             return pa.date32()
         if isinstance(col_type, sql.JSON):
             return pa.string()  # JSON will be exported as strings
         if isinstance(col_type, sql.LargeBinary):
             return pa.binary()
+        if isinstance(col_type, sql_vector.Vector):
+            # Parquet/pyarrow do not handle null values properly for fixed_shape_tensor(), so we have to use list_()
+            # here instead.
+            return pa.list_(pa.float32())
         raise AssertionError(f'Unrecognized SQL type: {col_type} (type {type(col_type)})')
 
     def __to_pa_tables(
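
The property the new vector branch relies on can be checked directly: a null cell survives the Parquet round trip when the column is typed as a float32 list. A minimal, self-contained check (toy column and file names; independent of pgvector):

    import pyarrow as pa
    import pyarrow.parquet as pq

    # Embedding vectors exported as variable-length float32 lists; the null
    # cell is preserved through write and read.
    schema = pa.schema([('embedding', pa.list_(pa.float32()))])
    tbl = pa.Table.from_pydict({'embedding': [[0.1, 0.2], None, [0.3, 0.4]]}, schema=schema)
    pq.write_table(tbl, 'vectors.parquet')
    assert pq.read_table('vectors.parquet').column('embedding').null_count == 1
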
@@ -165,6 +175,7 @@ class TablePackager:
         row_iter: Iterator[dict[str, Any]],
         sql_types: dict[str, sql.types.TypeEngine[Any]],
         media_cols: set[str],
+        cellmd_cols: set[str],
         arrow_schema: pa.Schema,
         batch_size: int = 1_000,
     ) -> Iterator[pa.Table]:
@@ -176,14 +187,21 @@ class TablePackager:
         for rows in more_itertools.batched(row_iter, batch_size):
             cols = {}
             for name, sql_type in sql_types.items():
-                is_media_col = name in media_cols
-                values = [self.__to_pa_value(row.get(name), sql_type, is_media_col) for row in rows]
+                values = [
+                    self.__to_pa_value(row.get(name), sql_type, name in media_cols, name in cellmd_cols) for row in rows
+                ]
                 cols[name] = values
             yield pa.Table.from_pydict(cols, schema=arrow_schema)
 
-    def __to_pa_value(self, val: Any, sql_type: sql.types.TypeEngine[Any], is_media_col: bool) -> Any:
+    def __to_pa_value(
+        self, val: Any, sql_type: sql.types.TypeEngine[Any], is_media_col: bool, is_cellmd_col: bool
+    ) -> Any:
         if val is None:
             return None
+        if is_cellmd_col:
+            assert isinstance(val, dict)
+            # Export JSON as strings
+            return json.dumps(self.__process_cellmd(val))
         if isinstance(sql_type, sql.JSON):
             # Export JSON as strings
             return json.dumps(val)
@@ -194,6 +212,10 @@ class TablePackager:
             return val
 
     def __process_media_url(self, url: str) -> str:
+        """
+        Process a media URL for export. If it's a local file URL (file://), then replace it with a pxtmedia:// URI,
+        copying the file into the tarball if necessary. If it's any other type of URL, return it unchanged.
+        """
         parsed_url = urllib.parse.urlparse(url)
         if parsed_url.scheme == 'file':
             # It's the URL of a local file. Replace it with a pxtmedia:// URI.
@@ -214,6 +236,21 @@ class TablePackager:
         # For any type of URL other than a local file, just return the URL as-is.
         return url
 
+    def __process_cellmd(self, cellmd: dict[str, Any]) -> dict[str, Any]:
+        """
+        Process a cellmd dictionary for export. This involves replacing any local file references
+        with pxtmedia:// URIs, as described above.
+        """
+        cellmd_ = CellMd.from_dict(cellmd)
+        if cellmd_.file_urls is None:
+            return cellmd  # No changes
+
+        updated_urls: list[str] = []
+        for url in cellmd_.file_urls:
+            updated_urls.append(self.__process_media_url(url))
+        cellmd_.file_urls = updated_urls
+        return cellmd_.as_dict()
+
     def __build_tarball(self) -> Path:
         bundle_path = self.tmp_dir / 'bundle.tar.bz2'
         with tarfile.open(bundle_path, 'w:bz2') as tf:
@@ -237,8 +274,7 @@ class TablePackager:
         - Videos are replaced by their first frame and resized as above
         - Documents are replaced by a thumbnail as a base64-encoded webp
         """
-        # First 8 columns
-        preview_cols = dict(itertools.islice(self.table._schema.items(), 0, 8))
+        preview_cols = self.table._get_schema()
         select_list = [self.table[col_name] for col_name in preview_cols]
         # First 5 rows
         rows = list(self.table.select(*select_list).head(n=5))
@@ -308,11 +344,11 @@ class TablePackager:
             scaled_img.save(buffer, 'webp')
         return base64.b64encode(buffer.getvalue()).decode()
 
-    def __encode_video(self, video_path: str) -> Optional[str]:
+    def __encode_video(self, video_path: str) -> str | None:
         thumb = Formatter.extract_first_video_frame(video_path)
         return self.__encode_image(thumb) if thumb is not None else None
 
-    def __encode_document(self, doc_path: str) -> Optional[str]:
+    def __encode_document(self, doc_path: str) -> str | None:
         thumb = Formatter.make_document_thumbnail(doc_path)
         return self.__encode_image(thumb) if thumb is not None else None
 
@@ -324,20 +360,21 @@ class TableRestorer:
 
     Args:
         tbl_path: Pixeltable path (such as 'my_dir.my_table') where the materialized table will be made visible.
-        md: Optional metadata dictionary. If not provided, metadata will be read from the tarball's `metadata.json`.
+        bundle_md: Optional metadata dictionary.
+            If not provided, metadata will be read from the tarball's `metadata.json`.
             The metadata contains table_md, table_version_md, and table_schema_version_md entries for each ancestor
             of the table being restored, as written out by `TablePackager`.
     """
 
     tbl_path: str
-    md: Optional[dict[str, Any]]
+    bundle_md: dict[str, Any] | None
    tmp_dir: Path
     media_files: dict[str, str]  # Mapping from pxtmedia:// URLs to local file:// URLs
 
-    def __init__(self, tbl_path: str, md: Optional[dict[str, Any]] = None) -> None:
+    def __init__(self, tbl_path: str, bundle_md: dict[str, Any] | None = None) -> None:
         self.tbl_path = tbl_path
-        self.md = md
-        self.tmp_dir = Path(Env.get().create_tmp_path())
+        self.bundle_md = bundle_md
+        self.tmp_dir = TempStore.create_path()
         self.media_files = {}
 
     def restore(self, bundle_path: Path) -> pxt.Table:
@@ -346,12 +383,12 @@ class TableRestorer:
         with tarfile.open(bundle_path, 'r:bz2') as tf:
             tf.extractall(path=self.tmp_dir)
 
-        if self.md is None:
+        if self.bundle_md is None:
             # No metadata supplied; read it from the archive
             with open(self.tmp_dir / 'metadata.json', 'r', encoding='utf8') as fp:
-                self.md = json.load(fp)
+                self.bundle_md = json.load(fp)
 
-        pxt_md_version = self.md['pxt_md_version']
+        pxt_md_version = self.bundle_md['pxt_md_version']
         assert isinstance(pxt_md_version, int)
 
         if pxt_md_version != metadata.VERSION:
@@ -359,51 +396,40 @@ class TableRestorer:
                 f'Pixeltable metadata version mismatch: {pxt_md_version} != {metadata.VERSION}.\n'
                 'Please upgrade Pixeltable to use this dataset: pip install -U pixeltable'
             )
+        # Convert tables metadata from dict to list of TableVersionCompleteMd
+        tbl_md = [schema.md_from_dict(TableVersionCompleteMd, t) for t in self.bundle_md['md']]
+
+        for md in tbl_md:
+            md.tbl_md.is_replica = True
 
-        tbl_md = [schema.FullTableMd.from_dict(t) for t in self.md['md']['tables']]
+        assert not tbl_md[0].version_md.is_fragment  # Top-level table cannot be a version fragment
 
-        # Create the replica table
-        # The logic here needs to be completely restructured in order to make it concurrency-safe.
-        # - Catalog.create_replica() needs to write the metadata and also create the physical store tables
-        #   and populate them, otherwise concurrent readers will see an inconsistent state (table metadata w/o
-        #   an actual table)
-        # - this could be done one replica at a time (instead of the entire hierarchy)
         cat = catalog.Catalog.get()
-        cat.create_replica(catalog.Path(self.tbl_path), tbl_md)
-        # don't call get_table() until after the calls to create_replica() and __import_table() below;
-        # the TV instances created by get_table() would be replaced by create_replica(), which creates duplicate
-        # TV instances for the same replica version, which then leads to failures when constructing queries
-
-        # Now we need to instantiate and load data for replica_tbl and its ancestors, except that we skip
-        # replica_tbl itself if it's a pure snapshot.
-        target_md = tbl_md[0]
-        is_pure_snapshot = (
-            target_md.tbl_md.view_md is not None
-            and target_md.tbl_md.view_md.predicate is None
-            and len(target_md.schema_version_md.columns) == 0
-        )
-        if is_pure_snapshot:
-            ancestor_md = tbl_md[1:]  # Pure snapshot; skip replica_tbl
-        else:
-            ancestor_md = tbl_md  # Not a pure snapshot; include replica_tbl
-
-        # Instantiate data from the Parquet tables.
-        with Env.get().begin_xact():
-            for md in ancestor_md[::-1]:  # Base table first
-                # Create a TableVersion instance (and a store table) for this ancestor.
-                tv = catalog.TableVersion.create_replica(md)
-                # Now import data from Parquet.
-                _logger.info(f'Importing table {tv.name!r}.')
-                self.__import_table(self.tmp_dir, tv, md)
-
-        with cat.begin_xact(for_write=False):
+
+        with cat.begin_xact(for_write=True):
+            # Create (or update) the replica table and its ancestors, along with TableVersion instances for any
+            # versions that have not been seen before.
+            cat.create_replica(catalog.Path.parse(self.tbl_path), tbl_md)
+
+            _logger.debug(f'Now will import data for {len(tbl_md)} table(s):')
+            _logger.debug(repr([md.tbl_md.tbl_id for md in tbl_md[::-1]]))
+
+            # Now we need to load data for replica_tbl and its ancestors, except that we skip
+            # replica_tbl itself if it's a pure snapshot.
+            for md in tbl_md[::-1]:  # Base table first
+                if not md.is_pure_snapshot:
+                    tv = cat.get_tbl_version(UUID(md.tbl_md.tbl_id), md.version_md.version)
+                    # Import data from Parquet.
+                    _logger.info(f'Importing table {tv.name!r}.')
+                    self.__import_table(self.tmp_dir, tv, md)
+
             return cat.get_table_by_id(UUID(tbl_md[0].tbl_md.tbl_id))
 
-    def __import_table(self, bundle_path: Path, tv: catalog.TableVersion, tbl_md: schema.FullTableMd) -> None:
+    def __import_table(self, bundle_path: Path, tv: catalog.TableVersion, tbl_md: TableVersionCompleteMd) -> None:
         """
         Import the Parquet table into the Pixeltable catalog.
         """
-        tbl_id = uuid.UUID(tbl_md.tbl_md.tbl_id)
+        tbl_id = UUID(tbl_md.tbl_md.tbl_id)
         parquet_dir = bundle_path / 'tables' / f'tbl_{tbl_id.hex}'
         parquet_table = pq.read_table(str(parquet_dir))
         replica_version = tv.version
@@ -422,6 +448,9 @@ class TableRestorer:
         # 2. "rectify" the v_max values in both the temporary table and the existing table (more on this below);
         # 3. Delete any row instances from the temporary table that are already present in the existing table;
         # 4. Copy the remaining rows from the temporary table into the existing table.
+        # 5. Rectify any index columns.
+
+        # STEP 1: Import the parquet data into a temporary table.
 
         # Create a temporary table for the initial data load, containing columns for all columns present in the
         # parquet table. The parquet columns have identical names to those in the store table, so we can use the
@@ -429,7 +458,7 @@ class TableRestorer:
         # e.g., pa.string() may hold either VARCHAR or serialized JSONB).
         temp_cols: dict[str, sql.Column] = {}
         for field in parquet_table.schema:
-            assert field.name in store_sa_tbl.columns
+            assert field.name in store_sa_tbl.columns, f'{field.name} not in {list(store_sa_tbl.columns)}'
             col_type = store_sa_tbl.columns[field.name].type
             temp_cols[field.name] = sql.Column(field.name, col_type)
         temp_sa_tbl_name = f'temp_{uuid.uuid4().hex}'
@@ -445,6 +474,8 @@ class TableRestorer:
             rows = self.__from_pa_pydict(tv, pydict)
             conn.execute(sql.insert(temp_sa_tbl), rows)
 
+        # STEP 2: Rectify v_max values.
+
         # Each row version is identified uniquely by its pk, a tuple (row_id, pos_0, pos_1, ..., pos_k, v_min).
         # Conversely, v_max is not part of the primary key, but is simply a bookkeeping device.
         # In an original table, v_max is always equal to the v_min of the succeeding row instance with the same
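
A toy restatement of the invariant those comments describe, in pure Python (the LIVE sentinel and the helper are illustrative assumptions, not Pixeltable APIs):

    # Each row instance is keyed by (rowid, ..., v_min); v_max is bookkeeping only.
    # Rectification makes each instance's v_max equal the v_min of its successor;
    # the newest instance keeps a "no successor" sentinel (value assumed here).
    LIVE = 2**63 - 1  # hypothetical sentinel

    def rectify_v_max(instances: list[dict]) -> None:
        """instances: versions of one logical row, each {'v_min': int, 'v_max': int}."""
        instances.sort(key=lambda r: r['v_min'])
        for cur, nxt in zip(instances, instances[1:]):
            cur['v_max'] = nxt['v_min']
        instances[-1]['v_max'] = LIVE

    rows = [{'v_min': 7, 'v_max': LIVE}, {'v_min': 3, 'v_max': LIVE}]
    rectify_v_max(rows)
    assert [r['v_max'] for r in rows] == [7, LIVE]
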
@@ -478,42 +509,51 @@ class TableRestorer:
             for col_name, col in temp_cols.items()
             if col_name not in system_col_names and col_name not in media_col_names
         ]
-        mismatch_predicates = [store_col != temp_col for store_col, temp_col in zip(value_store_cols, value_temp_cols)]
-        mismatch_clause = sql.or_(*mismatch_predicates)
-
-        # This query looks for rows that have matching primary keys (rowid + pos_k + v_min), but differ in at least
-        # one value column. Pseudo-SQL:
-        #
-        # SELECT store_tbl.col_0, ..., store_tbl.col_n, temp_tbl.col_0, ..., temp_tbl.col_n
-        # FROM store_tbl, temp_tbl
-        # WHERE store_tbl.rowid = temp_tbl.rowid
-        #     AND store_tbl.pos_0 = temp_tbl.pos_0
-        #     AND ... AND store_tbl.pos_k = temp_tbl.pos_k
-        #     AND store_tbl.v_min = temp_tbl.v_min
-        #     AND (
-        #         store_tbl.col_0 != temp_tbl.col_0
-        #         OR store_tbl.col_1 != temp_tbl.col_1
-        #         OR ... OR store_tbl.col_n != temp_tbl.col_n
-        #     )
-        #
-        # The value column comparisons (store_tbl.col_0 != temp_tbl.col_0, etc.) will always be false for rows where
-        # either column is NULL; this is what we want, since it may indicate a column that is present in one version
-        # but not the other.
-        q = sql.select(*value_store_cols, *value_temp_cols).where(pk_clause).where(mismatch_clause)
-        _logger.debug(q.compile())
-        result = conn.execute(q)
-        if result.rowcount > 0:
-            _logger.debug(
-                f'Data corruption error between {temp_sa_tbl_name!r} and {store_sa_tbl_name!r}: '
-                f'{result.rowcount} inconsistent row(s).'
-            )
-            row = result.first()
-            _logger.debug('Example mismatch:')
-            _logger.debug(f'{store_sa_tbl_name}: {row[: len(value_store_cols)]}')
-            _logger.debug(f'{temp_sa_tbl_name}: {row[len(value_store_cols) :]}')
-            raise excs.Error(
-                'Data corruption error: the replica data are inconsistent with data retrieved from a previous replica.'
-            )
+
+        q: sql.Executable
+
+        assert len(value_store_cols) == len(value_temp_cols)
+        if len(value_store_cols) > 0:
+            mismatch_predicates = [
+                store_col != temp_col for store_col, temp_col in zip(value_store_cols, value_temp_cols)
+            ]
+            mismatch_clause = sql.or_(*mismatch_predicates)
+
+            # This query looks for rows that have matching primary keys (rowid + pos_k + v_min), but differ in at least
+            # one value column. Pseudo-SQL:
+            #
+            # SELECT store_tbl.col_0, ..., store_tbl.col_n, temp_tbl.col_0, ..., temp_tbl.col_n
+            # FROM store_tbl, temp_tbl
+            # WHERE store_tbl.rowid = temp_tbl.rowid
+            #     AND store_tbl.pos_0 = temp_tbl.pos_0
+            #     AND ... AND store_tbl.pos_k = temp_tbl.pos_k
+            #     AND store_tbl.v_min = temp_tbl.v_min
+            #     AND (
+            #         store_tbl.col_0 != temp_tbl.col_0
+            #         OR store_tbl.col_1 != temp_tbl.col_1
+            #         OR ... OR store_tbl.col_n != temp_tbl.col_n
+            #     )
+            #
+            # The value column comparisons (store_tbl.col_0 != temp_tbl.col_0, etc.) will always be false for rows where
+            # either column is NULL; this is what we want, since it may indicate a column that is present in one version
+            # but not the other.
+            q = sql.select(*value_store_cols, *value_temp_cols).where(pk_clause).where(mismatch_clause)
+            _logger.debug(q.compile())
+            result = conn.execute(q)
+            if result.rowcount > 0:
+                _logger.debug(
+                    f'Data corruption error between {temp_sa_tbl_name!r} and {store_sa_tbl_name!r}: '
+                    f'{result.rowcount} inconsistent row(s).'
+                )
+                row = result.first()
+                _logger.debug('Example mismatch:')
+                _logger.debug(f'{store_sa_tbl_name}: {row[: len(value_store_cols)]}')
+                _logger.debug(f'{temp_sa_tbl_name}: {row[len(value_store_cols) :]}')
+                raise excs.Error(
+                    'Data corruption error: '
+                    'the replica data are inconsistent with data retrieved from a previous replica.'
+                )
+
         _logger.debug(f'Verified data integrity between {store_sa_tbl_name!r} and {temp_sa_tbl_name!r}.')
 
         # Now rectify the v_max values in the temporary table.
@@ -544,6 +584,8 @@ class TableRestorer:
         result = conn.execute(q)
         _logger.debug(f'Rectified {result.rowcount} row(s) in {store_sa_tbl_name!r}.')
 
+        # STEP 3: Delete any row instances from the temporary table that are already present in the existing table.
+
         # Now we need to update rows in the existing table that are also present in the temporary table. This is to
         # account for the scenario where the temporary table has columns that are not present in the existing table.
         # (We can't simply replace the rows with their versions in the temporary table, because the converse scenario
@@ -574,7 +616,9 @@ class TableRestorer:
         result = conn.execute(q)
         _logger.debug(f'Deleted {result.rowcount} row(s) from {temp_sa_tbl_name!r}.')
 
-        # Finally, copy the remaining data (consisting entirely of new row instances) from the temporary table into
+        # STEP 4: Copy the remaining rows from the temporary table into the existing table.
+
+        # Now copy the remaining data (consisting entirely of new row instances) from the temporary table into
         # the actual table.
         q = store_sa_tbl.insert().from_select(
             [store_sa_tbl.c[col_name] for col_name in temp_cols], sql.select(*temp_cols.values())
@@ -583,42 +627,118 @@ class TableRestorer:
         result = conn.execute(q)
         _logger.debug(f'Inserted {result.rowcount} row(s) from {temp_sa_tbl_name!r} into {store_sa_tbl_name!r}.')
 
+        # STEP 5: Rectify any index columns.
+
+        # Finally, rectify any index columns in the table. This involves shuffling data between the index's val and
+        # undo columns to ensure they appropriately reflect the most recent replicated version of the table.
+
+        # Get the most recent replicated version of the table. This might be the version we're currently importing,
+        # but it might be a different version of the table that was previously imported.
+        head_version_md = catalog.Catalog.get()._collect_tbl_history(tv.id, n=1)[0]
+        head_version = head_version_md.version_md.version
+        _logger.debug(f'Head version for index rectification is {head_version}.')
+
+        # Get the index info from the table metadata. Here we use the tbl_md that we just collected from the DB.
+        # This is to ensure we pick up ALL indices, including dropped indices and indices that are present in
+        # a previously replicated version of the table, but not in the one currently being imported.
+        index_md = head_version_md.tbl_md.index_md
+
+        # Now update the table. We can do this for all indices together with just two SQL queries. For each index,
+        # at most one of the val or undo columns will be non-NULL in any given row.
+        # For rows where v_min <= head_version < v_max, we set, for all indices:
+        #     val_col = whichever of (val_col, undo_col) is non-NULL (or NULL if both are, e.g., for a dropped index)
+        #     undo_col = NULL
+        # For rows where head_version < v_min or v_max <= head_version, vice versa.
+        val_sql_clauses: dict[str, sql.ColumnElement] = {}
+        undo_sql_clauses: dict[str, sql.ColumnElement] = {}
+        for index in index_md.values():
+            if index.class_fqn.endswith('.EmbeddingIndex'):
+                val_col_name = f'col_{index.index_val_col_id}'
+                undo_col_name = f'col_{index.index_val_undo_col_id}'
+                # Check that the val column for the index is actually present in the store table. We need to do this
+                # to properly handle the case where the replica represents a table version that was *not* the most
+                # recent version at the time it was published. In that case, it is possible for tbl_md to contain
+                # metadata for indices not known to any version that has been replicated. (However, the converse
+                # *does* hold: all replicated indices must have metadata in tbl_md; and that's what's important.)
+                if val_col_name in store_sa_tbl.c:
+                    assert undo_col_name in store_sa_tbl.c
+                    coalesce = sql.func.coalesce(store_sa_tbl.c[val_col_name], store_sa_tbl.c[undo_col_name])
+                    val_sql_clauses[val_col_name] = coalesce
+                    val_sql_clauses[undo_col_name] = sql.null()
+                    undo_sql_clauses[undo_col_name] = coalesce
+                    undo_sql_clauses[val_col_name] = sql.null()
+
+        if len(val_sql_clauses) > 0:
+            q2 = (
+                store_sa_tbl.update()
+                .values(**val_sql_clauses)
+                .where(sql.and_(tv.store_tbl.v_min_col <= head_version, tv.store_tbl.v_max_col > head_version))
+            )
+            _logger.debug(q2.compile())
+            _ = conn.execute(q2)
+            q2 = (
+                store_sa_tbl.update()
+                .values(**undo_sql_clauses)
+                .where(sql.or_(tv.store_tbl.v_min_col > head_version, tv.store_tbl.v_max_col <= head_version))
+            )
+            _logger.debug(q2.compile())
+            _ = conn.execute(q2)
+            _logger.debug(f'Rectified index columns in {store_sa_tbl_name!r}.')
+        else:
+            _logger.debug(f'No index columns to rectify in {store_sa_tbl_name!r}.')
+
     def __from_pa_pydict(self, tv: catalog.TableVersion, pydict: dict[str, Any]) -> list[dict[str, Any]]:
         # Data conversions from pyarrow to Pixeltable
         sql_types: dict[str, sql.types.TypeEngine[Any]] = {}
         for col_name in pydict:
             assert col_name in tv.store_tbl.sa_tbl.columns
             sql_types[col_name] = tv.store_tbl.sa_tbl.columns[col_name].type
-        media_col_ids: dict[str, int] = {}
-        for col in tv.cols:
-            if col.is_stored and col.col_type.is_media_type():
-                media_col_ids[col.store_name()] = col.id
+        stored_cols: dict[str, catalog.Column] = {col.store_name(): col for col in tv.cols if col.is_stored}
+        stored_cols |= {col.cellmd_store_name(): col for col in tv.cols if col.stores_cellmd}
 
         row_count = len(next(iter(pydict.values())))
-        rows: list[dict[str, Any]] = []
-        for i in range(row_count):
-            row = {
-                col_name: self.__from_pa_value(tv, col_vals[i], sql_types[col_name], media_col_ids.get(col_name))
-                for col_name, col_vals in pydict.items()
-            }
-            rows.append(row)
+        rows: list[dict[str, Any]] = [{} for _ in range(row_count)]
+        for col_name, col_vals in pydict.items():
+            assert len(col_vals) == row_count
+            col = stored_cols.get(col_name)  # Will be None for system columns
+            is_media_col = col is not None and col.is_stored and col.col_type.is_media_type()
+            is_cellmd_col = col is not None and col.stores_cellmd and col_name == col.cellmd_store_name()
+            assert col is None or is_cellmd_col or col_name == col.store_name()
+
+            for i, val in enumerate(col_vals):
+                rows[i][col_name] = self.__from_pa_value(val, sql_types[col_name], col, is_media_col, is_cellmd_col)
 
         return rows
 
     def __from_pa_value(
-        self, tv: catalog.TableVersion, val: Any, sql_type: sql.types.TypeEngine[Any], media_col_id: Optional[int]
+        self,
+        val: Any,
+        sql_type: sql.types.TypeEngine[Any],
+        col: catalog.Column | None,
+        is_media_col: bool,
+        is_cellmd_col: bool,
     ) -> Any:
         if val is None:
             return None
+        if isinstance(sql_type, sql_vector.Vector):
+            if isinstance(val, list):
+                val = np.array(val, dtype=np.float32)
+            assert isinstance(val, np.ndarray) and val.dtype == np.float32 and val.ndim == 1
+            return val
+        if is_cellmd_col:
+            assert col is not None
+            assert isinstance(val, str)
+            return self.__restore_cellmd(col, json.loads(val))
         if isinstance(sql_type, sql.JSON):
             return json.loads(val)
-        if media_col_id is not None:
-            assert isinstance(val, str)
-            return self.__relocate_media_file(tv, media_col_id, val)
+        if is_media_col:
+            assert col is not None
+            return self.__relocate_media_file(col, val)
         return val
 
-    def __relocate_media_file(self, tv: catalog.TableVersion, media_col_id: int, url: str) -> str:
+    def __relocate_media_file(self, media_col: catalog.Column, url: str) -> str:
         # If this is a pxtmedia:// URL, relocate it
+        assert isinstance(url, str)
         parsed_url = urllib.parse.urlparse(url)
         assert parsed_url.scheme != 'file'  # These should all have been converted to pxtmedia:// URLs
         if parsed_url.scheme == 'pxtmedia':
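
The two UPDATE statements in STEP 5 apply a simple per-row rule; a toy pure-Python restatement (rectify_index_cols is hypothetical, for illustration only):

    # For each embedding index, at most one of (val, undo) is non-NULL per row.
    # Rows visible at head_version (v_min <= head_version < v_max) keep the
    # surviving value in val; all other rows keep it in undo.
    def rectify_index_cols(row: dict, head_version: int) -> dict:
        visible = row['v_min'] <= head_version < row['v_max']
        value = row['val'] if row['val'] is not None else row['undo']  # COALESCE(val, undo)
        if visible:
            return {**row, 'val': value, 'undo': None}
        return {**row, 'val': None, 'undo': value}

    assert rectify_index_cols(
        {'v_min': 1, 'v_max': 5, 'val': None, 'undo': 0.7}, head_version=3
    ) == {'v_min': 1, 'v_max': 5, 'val': 0.7, 'undo': None}
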
@@ -626,9 +746,19 @@ class TableRestorer:
             # First time seeing this pxtmedia:// URL. Relocate the file to the media store and record the mapping
             # in self.media_files.
             src_path = self.tmp_dir / 'media' / parsed_url.netloc
-            dest_path = MediaStore.prepare_media_path(tv.id, media_col_id, tv.version, ext=src_path.suffix)
-            src_path.rename(dest_path)
-            self.media_files[url] = urllib.parse.urljoin('file:', urllib.request.pathname2url(str(dest_path)))
+            # Move the file to the media store and update the URL.
+            self.media_files[url] = ObjectOps.put_file(media_col, src_path, relocate_or_delete=True)
         return self.media_files[url]
         # For any type of URL other than a local file, just return the URL as-is.
         return url
+
+    def __restore_cellmd(self, col: catalog.Column, cellmd: dict[str, Any]) -> dict[str, Any]:
+        cellmd_ = CellMd.from_dict(cellmd)
+        if cellmd_.file_urls is None:
+            return cellmd  # No changes
+
+        updated_urls: list[str] = []
+        for url in cellmd_.file_urls:
+            updated_urls.append(self.__relocate_media_file(col, url))
+        cellmd_.file_urls = updated_urls
+        return cellmd_.as_dict()
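
__process_cellmd (export) and __restore_cellmd (import) are the two directions of the same rewrite over a cellmd's file_urls; a self-contained sketch of that shared shape (rewrite_cellmd_urls is a hypothetical stand-in for both):

    from typing import Any, Callable

    def rewrite_cellmd_urls(cellmd: dict[str, Any], rewrite: Callable[[str], str]) -> dict[str, Any]:
        # No file references: return the cellmd dict unchanged, as both methods do.
        if cellmd.get('file_urls') is None:
            return cellmd
        return {**cellmd, 'file_urls': [rewrite(url) for url in cellmd['file_urls']]}

    # Export direction: file:// -> pxtmedia:// (import maps pxtmedia:// back out);
    # non-local URLs pass through unchanged in both directions.
    exported = rewrite_cellmd_urls(
        {'file_urls': ['file:///tmp/img0.jpg', 'https://example.com/img1.jpg']},
        lambda url: 'pxtmedia://img0.jpg' if url.startswith('file://') else url,
    )
    assert exported['file_urls'] == ['pxtmedia://img0.jpg', 'https://example.com/img1.jpg']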