pixeltable 0.3.14__py3-none-any.whl → 0.5.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (220)
  1. pixeltable/__init__.py +42 -8
  2. pixeltable/{dataframe.py → _query.py} +470 -206
  3. pixeltable/_version.py +1 -0
  4. pixeltable/catalog/__init__.py +5 -4
  5. pixeltable/catalog/catalog.py +1785 -432
  6. pixeltable/catalog/column.py +190 -113
  7. pixeltable/catalog/dir.py +2 -4
  8. pixeltable/catalog/globals.py +19 -46
  9. pixeltable/catalog/insertable_table.py +191 -98
  10. pixeltable/catalog/path.py +63 -23
  11. pixeltable/catalog/schema_object.py +11 -15
  12. pixeltable/catalog/table.py +843 -436
  13. pixeltable/catalog/table_metadata.py +103 -0
  14. pixeltable/catalog/table_version.py +978 -657
  15. pixeltable/catalog/table_version_handle.py +72 -16
  16. pixeltable/catalog/table_version_path.py +112 -43
  17. pixeltable/catalog/tbl_ops.py +53 -0
  18. pixeltable/catalog/update_status.py +191 -0
  19. pixeltable/catalog/view.py +134 -90
  20. pixeltable/config.py +134 -22
  21. pixeltable/env.py +471 -157
  22. pixeltable/exceptions.py +6 -0
  23. pixeltable/exec/__init__.py +4 -1
  24. pixeltable/exec/aggregation_node.py +7 -8
  25. pixeltable/exec/cache_prefetch_node.py +83 -110
  26. pixeltable/exec/cell_materialization_node.py +268 -0
  27. pixeltable/exec/cell_reconstruction_node.py +168 -0
  28. pixeltable/exec/component_iteration_node.py +4 -3
  29. pixeltable/exec/data_row_batch.py +8 -65
  30. pixeltable/exec/exec_context.py +16 -4
  31. pixeltable/exec/exec_node.py +13 -36
  32. pixeltable/exec/expr_eval/evaluators.py +11 -7
  33. pixeltable/exec/expr_eval/expr_eval_node.py +27 -12
  34. pixeltable/exec/expr_eval/globals.py +8 -5
  35. pixeltable/exec/expr_eval/row_buffer.py +1 -2
  36. pixeltable/exec/expr_eval/schedulers.py +106 -56
  37. pixeltable/exec/globals.py +35 -0
  38. pixeltable/exec/in_memory_data_node.py +19 -19
  39. pixeltable/exec/object_store_save_node.py +293 -0
  40. pixeltable/exec/row_update_node.py +16 -9
  41. pixeltable/exec/sql_node.py +351 -84
  42. pixeltable/exprs/__init__.py +1 -1
  43. pixeltable/exprs/arithmetic_expr.py +27 -22
  44. pixeltable/exprs/array_slice.py +3 -3
  45. pixeltable/exprs/column_property_ref.py +36 -23
  46. pixeltable/exprs/column_ref.py +213 -89
  47. pixeltable/exprs/comparison.py +5 -5
  48. pixeltable/exprs/compound_predicate.py +5 -4
  49. pixeltable/exprs/data_row.py +164 -54
  50. pixeltable/exprs/expr.py +70 -44
  51. pixeltable/exprs/expr_dict.py +3 -3
  52. pixeltable/exprs/expr_set.py +17 -10
  53. pixeltable/exprs/function_call.py +100 -40
  54. pixeltable/exprs/globals.py +2 -2
  55. pixeltable/exprs/in_predicate.py +4 -4
  56. pixeltable/exprs/inline_expr.py +18 -32
  57. pixeltable/exprs/is_null.py +7 -3
  58. pixeltable/exprs/json_mapper.py +8 -8
  59. pixeltable/exprs/json_path.py +56 -22
  60. pixeltable/exprs/literal.py +27 -5
  61. pixeltable/exprs/method_ref.py +2 -2
  62. pixeltable/exprs/object_ref.py +2 -2
  63. pixeltable/exprs/row_builder.py +167 -67
  64. pixeltable/exprs/rowid_ref.py +25 -10
  65. pixeltable/exprs/similarity_expr.py +58 -40
  66. pixeltable/exprs/sql_element_cache.py +4 -4
  67. pixeltable/exprs/string_op.py +5 -5
  68. pixeltable/exprs/type_cast.py +3 -5
  69. pixeltable/func/__init__.py +1 -0
  70. pixeltable/func/aggregate_function.py +8 -8
  71. pixeltable/func/callable_function.py +9 -9
  72. pixeltable/func/expr_template_function.py +17 -11
  73. pixeltable/func/function.py +18 -20
  74. pixeltable/func/function_registry.py +6 -7
  75. pixeltable/func/globals.py +2 -3
  76. pixeltable/func/mcp.py +74 -0
  77. pixeltable/func/query_template_function.py +29 -27
  78. pixeltable/func/signature.py +46 -19
  79. pixeltable/func/tools.py +31 -13
  80. pixeltable/func/udf.py +18 -20
  81. pixeltable/functions/__init__.py +16 -0
  82. pixeltable/functions/anthropic.py +123 -77
  83. pixeltable/functions/audio.py +147 -10
  84. pixeltable/functions/bedrock.py +13 -6
  85. pixeltable/functions/date.py +7 -4
  86. pixeltable/functions/deepseek.py +35 -43
  87. pixeltable/functions/document.py +81 -0
  88. pixeltable/functions/fal.py +76 -0
  89. pixeltable/functions/fireworks.py +11 -20
  90. pixeltable/functions/gemini.py +195 -39
  91. pixeltable/functions/globals.py +142 -14
  92. pixeltable/functions/groq.py +108 -0
  93. pixeltable/functions/huggingface.py +1056 -24
  94. pixeltable/functions/image.py +115 -57
  95. pixeltable/functions/json.py +1 -1
  96. pixeltable/functions/llama_cpp.py +28 -13
  97. pixeltable/functions/math.py +67 -5
  98. pixeltable/functions/mistralai.py +18 -55
  99. pixeltable/functions/net.py +70 -0
  100. pixeltable/functions/ollama.py +20 -13
  101. pixeltable/functions/openai.py +240 -226
  102. pixeltable/functions/openrouter.py +143 -0
  103. pixeltable/functions/replicate.py +4 -4
  104. pixeltable/functions/reve.py +250 -0
  105. pixeltable/functions/string.py +239 -69
  106. pixeltable/functions/timestamp.py +16 -16
  107. pixeltable/functions/together.py +24 -84
  108. pixeltable/functions/twelvelabs.py +188 -0
  109. pixeltable/functions/util.py +6 -1
  110. pixeltable/functions/uuid.py +30 -0
  111. pixeltable/functions/video.py +1515 -107
  112. pixeltable/functions/vision.py +8 -8
  113. pixeltable/functions/voyageai.py +289 -0
  114. pixeltable/functions/whisper.py +16 -8
  115. pixeltable/functions/whisperx.py +179 -0
  116. pixeltable/{ext/functions → functions}/yolox.py +2 -4
  117. pixeltable/globals.py +362 -115
  118. pixeltable/index/base.py +17 -21
  119. pixeltable/index/btree.py +28 -22
  120. pixeltable/index/embedding_index.py +100 -118
  121. pixeltable/io/__init__.py +4 -2
  122. pixeltable/io/datarows.py +8 -7
  123. pixeltable/io/external_store.py +56 -105
  124. pixeltable/io/fiftyone.py +13 -13
  125. pixeltable/io/globals.py +31 -30
  126. pixeltable/io/hf_datasets.py +61 -16
  127. pixeltable/io/label_studio.py +74 -70
  128. pixeltable/io/lancedb.py +3 -0
  129. pixeltable/io/pandas.py +21 -12
  130. pixeltable/io/parquet.py +25 -105
  131. pixeltable/io/table_data_conduit.py +250 -123
  132. pixeltable/io/utils.py +4 -4
  133. pixeltable/iterators/__init__.py +2 -1
  134. pixeltable/iterators/audio.py +26 -25
  135. pixeltable/iterators/base.py +9 -3
  136. pixeltable/iterators/document.py +112 -78
  137. pixeltable/iterators/image.py +12 -15
  138. pixeltable/iterators/string.py +11 -4
  139. pixeltable/iterators/video.py +523 -120
  140. pixeltable/metadata/__init__.py +14 -3
  141. pixeltable/metadata/converters/convert_13.py +2 -2
  142. pixeltable/metadata/converters/convert_18.py +2 -2
  143. pixeltable/metadata/converters/convert_19.py +2 -2
  144. pixeltable/metadata/converters/convert_20.py +2 -2
  145. pixeltable/metadata/converters/convert_21.py +2 -2
  146. pixeltable/metadata/converters/convert_22.py +2 -2
  147. pixeltable/metadata/converters/convert_24.py +2 -2
  148. pixeltable/metadata/converters/convert_25.py +2 -2
  149. pixeltable/metadata/converters/convert_26.py +2 -2
  150. pixeltable/metadata/converters/convert_29.py +4 -4
  151. pixeltable/metadata/converters/convert_30.py +34 -21
  152. pixeltable/metadata/converters/convert_34.py +2 -2
  153. pixeltable/metadata/converters/convert_35.py +9 -0
  154. pixeltable/metadata/converters/convert_36.py +38 -0
  155. pixeltable/metadata/converters/convert_37.py +15 -0
  156. pixeltable/metadata/converters/convert_38.py +39 -0
  157. pixeltable/metadata/converters/convert_39.py +124 -0
  158. pixeltable/metadata/converters/convert_40.py +73 -0
  159. pixeltable/metadata/converters/convert_41.py +12 -0
  160. pixeltable/metadata/converters/convert_42.py +9 -0
  161. pixeltable/metadata/converters/convert_43.py +44 -0
  162. pixeltable/metadata/converters/util.py +20 -31
  163. pixeltable/metadata/notes.py +9 -0
  164. pixeltable/metadata/schema.py +140 -53
  165. pixeltable/metadata/utils.py +74 -0
  166. pixeltable/mypy/__init__.py +3 -0
  167. pixeltable/mypy/mypy_plugin.py +123 -0
  168. pixeltable/plan.py +382 -115
  169. pixeltable/share/__init__.py +1 -1
  170. pixeltable/share/packager.py +547 -83
  171. pixeltable/share/protocol/__init__.py +33 -0
  172. pixeltable/share/protocol/common.py +165 -0
  173. pixeltable/share/protocol/operation_types.py +33 -0
  174. pixeltable/share/protocol/replica.py +119 -0
  175. pixeltable/share/publish.py +257 -59
  176. pixeltable/store.py +311 -194
  177. pixeltable/type_system.py +373 -211
  178. pixeltable/utils/__init__.py +2 -3
  179. pixeltable/utils/arrow.py +131 -17
  180. pixeltable/utils/av.py +298 -0
  181. pixeltable/utils/azure_store.py +346 -0
  182. pixeltable/utils/coco.py +6 -6
  183. pixeltable/utils/code.py +3 -3
  184. pixeltable/utils/console_output.py +4 -1
  185. pixeltable/utils/coroutine.py +6 -23
  186. pixeltable/utils/dbms.py +32 -6
  187. pixeltable/utils/description_helper.py +4 -5
  188. pixeltable/utils/documents.py +7 -18
  189. pixeltable/utils/exception_handler.py +7 -30
  190. pixeltable/utils/filecache.py +6 -6
  191. pixeltable/utils/formatter.py +86 -48
  192. pixeltable/utils/gcs_store.py +295 -0
  193. pixeltable/utils/http.py +133 -0
  194. pixeltable/utils/http_server.py +2 -3
  195. pixeltable/utils/iceberg.py +1 -2
  196. pixeltable/utils/image.py +17 -0
  197. pixeltable/utils/lancedb.py +90 -0
  198. pixeltable/utils/local_store.py +322 -0
  199. pixeltable/utils/misc.py +5 -0
  200. pixeltable/utils/object_stores.py +573 -0
  201. pixeltable/utils/pydantic.py +60 -0
  202. pixeltable/utils/pytorch.py +5 -6
  203. pixeltable/utils/s3_store.py +527 -0
  204. pixeltable/utils/sql.py +26 -0
  205. pixeltable/utils/system.py +30 -0
  206. pixeltable-0.5.7.dist-info/METADATA +579 -0
  207. pixeltable-0.5.7.dist-info/RECORD +227 -0
  208. {pixeltable-0.3.14.dist-info → pixeltable-0.5.7.dist-info}/WHEEL +1 -1
  209. pixeltable-0.5.7.dist-info/entry_points.txt +2 -0
  210. pixeltable/__version__.py +0 -3
  211. pixeltable/catalog/named_function.py +0 -40
  212. pixeltable/ext/__init__.py +0 -17
  213. pixeltable/ext/functions/__init__.py +0 -11
  214. pixeltable/ext/functions/whisperx.py +0 -77
  215. pixeltable/utils/media_store.py +0 -77
  216. pixeltable/utils/s3.py +0 -17
  217. pixeltable-0.3.14.dist-info/METADATA +0 -434
  218. pixeltable-0.3.14.dist-info/RECORD +0 -186
  219. pixeltable-0.3.14.dist-info/entry_points.txt +0 -3
  220. {pixeltable-0.3.14.dist-info → pixeltable-0.5.7.dist-info/licenses}/LICENSE +0 -0
pixeltable/share/packager.py
@@ -1,23 +1,35 @@
- import datetime
+ import base64
+ import io
  import json
  import logging
  import tarfile
  import urllib.parse
  import urllib.request
  import uuid
+ from datetime import timedelta
  from pathlib import Path
- from typing import Any, Iterator, Optional
+ from typing import Any, Iterator
+ from uuid import UUID
 
  import more_itertools
+ import numpy as np
+ import pgvector.sqlalchemy as sql_vector  # type: ignore[import-untyped]
+ import PIL.Image
  import pyarrow as pa
  import pyarrow.parquet as pq
  import sqlalchemy as sql
 
  import pixeltable as pxt
- from pixeltable import catalog, exceptions as excs, metadata
+ import pixeltable.utils.av as av_utils
+ from pixeltable import catalog, exceptions as excs, metadata, type_system as ts
+ from pixeltable.catalog.table_version import TableVersionKey, TableVersionMd
  from pixeltable.env import Env
+ from pixeltable.exprs.data_row import CellMd
  from pixeltable.metadata import schema
- from pixeltable.utils.media_store import MediaStore
+ from pixeltable.utils import sha256sum
+ from pixeltable.utils.formatter import Formatter
+ from pixeltable.utils.local_store import TempStore
+ from pixeltable.utils.object_stores import ObjectOps
 
  _logger = logging.getLogger('pixeltable')
 
@@ -42,43 +54,56 @@ class TablePackager:
      tmp_dir: Path  # Temporary directory where the package will reside
      tables_dir: Path  # Directory where the Parquet tables will be written
      media_files: dict[Path, str]  # Mapping from local media file paths to their tarball names
-     md: dict[str, Any]
+     bundle_md: dict[str, Any]
 
-     def __init__(self, table: catalog.Table, additional_md: Optional[dict[str, Any]] = None) -> None:
+     bundle_path: Path
+     preview_header: dict[str, str]
+     preview: list[list[Any]]
+
+     def __init__(self, table: catalog.Table, additional_md: dict[str, Any] | None = None) -> None:
          self.table = table
-         self.tmp_dir = Path(Env.get().create_tmp_path())
+         self.tmp_dir = TempStore.create_path()
          self.media_files = {}
 
-         # Load metadata
-         with Env.get().begin_xact():
+         # Load metadata and convert to JSON immediately
+         with catalog.Catalog.get().begin_xact(for_write=False):
              tbl_md = catalog.Catalog.get().load_replica_md(table)
-             self.md = {
+             self.bundle_md = {
                  'pxt_version': pxt.__version__,
                  'pxt_md_version': metadata.VERSION,
-                 'md': {'tables': [md.as_dict() for md in tbl_md]},
+                 'md': [md.as_dict() for md in tbl_md],
              }
              if additional_md is not None:
-                 self.md.update(additional_md)
+                 self.bundle_md.update(additional_md)
 
      def package(self) -> Path:
          """
          Export the table to a tarball containing Parquet tables and media files.
          """
          assert not self.tmp_dir.exists()  # Packaging can only be done once per TablePackager instance
-         _logger.info(f"Packaging table '{self.table._path}' and its ancestors in: {self.tmp_dir}")
+
+         _logger.info(f'Packaging table {self.table._path()!r} and its ancestors in: {self.tmp_dir}')
          self.tmp_dir.mkdir()
          with open(self.tmp_dir / 'metadata.json', 'w', encoding='utf8') as fp:
-             json.dump(self.md, fp)
+             json.dump(self.bundle_md, fp)
          self.tables_dir = self.tmp_dir / 'tables'
          self.tables_dir.mkdir()
-         with Env.get().begin_xact():
+         with catalog.Catalog.get().begin_xact(for_write=False):
              for tv in self.table._tbl_version_path.get_tbl_versions():
-                 _logger.info(f"Exporting table '{tv.get().name}:{tv.get().version}'.")
+                 _logger.info(f'Exporting table {tv.get().versioned_name!r}.')
                  self.__export_table(tv.get())
+
          _logger.info('Building archive.')
-         bundle_path = self.__build_tarball()
-         _logger.info(f'Packaging complete: {bundle_path}')
-         return bundle_path
+         self.bundle_path = self.__build_tarball()
+
+         _logger.info('Extracting preview data.')
+         self.bundle_md['row_count'] = self.table.count()
+         preview_header, preview = self.__extract_preview_data()
+         self.bundle_md['preview_header'] = preview_header
+         self.bundle_md['preview_data'] = preview
+
+         _logger.info(f'Packaging complete: {self.bundle_path}')
+         return self.bundle_path
 
      def __export_table(self, tv: catalog.TableVersion) -> None:
          """
@@ -88,9 +113,12 @@
          assert any(tv.id == base.id for base in self.table._tbl_version_path.get_tbl_versions())
          sql_types = {col.name: col.type for col in tv.store_tbl.sa_tbl.columns}
          media_cols: set[str] = set()
-         for col in tv.cols_by_name.values():
+         cellmd_cols: set[str] = set()
+         for col in tv.cols:
              if col.is_stored and col.col_type.is_media_type():
                  media_cols.add(col.store_name())
+             if col.stores_cellmd:
+                 cellmd_cols.add(col.cellmd_store_name())
 
          parquet_schema = self.__to_parquet_schema(tv.store_tbl.sa_tbl)
          # TODO: Partition larger tables into multiple parquet files. (The parquet file naming scheme anticipates
@@ -101,14 +129,14 @@
          _logger.info(f'Creating parquet table: {parquet_file}')
 
          # Populate the Parquet table with data.
-         # The data is first loaded from the DataFrame into a sequence of pyarrow tables, batched in order to avoid
+         # The data is first loaded from the Query into a sequence of pyarrow tables, batched in order to avoid
          # excessive memory usage. The pyarrow tables are then amalgamated into the (single) Parquet table on disk.
          # We use snappy compression for the Parquet tables; the entire bundle will be bzip2-compressed later, so
          # faster compression should provide good performance while still reducing temporary storage utilization.
-         parquet_writer = pq.ParquetWriter(parquet_file, parquet_schema, compression='SNAPPY')
-         filter_tv = self.table._tbl_version.get()
+         parquet_writer = pq.ParquetWriter(parquet_file, parquet_schema, compression='snappy')
+         filter_tv = self.table._tbl_version_path.tbl_version.get()
          row_iter = tv.store_tbl.dump_rows(tv.version, filter_tv.store_tbl, filter_tv.version)
-         for pa_table in self.__to_pa_tables(row_iter, sql_types, media_cols, parquet_schema):
+         for pa_table in self.__to_pa_tables(row_iter, sql_types, media_cols, cellmd_cols, parquet_schema):
              parquet_writer.write_table(pa_table)
          parquet_writer.close()
 
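The batched write described in the comments above can be illustrated independently of Pixeltable. A minimal
sketch, assuming a trivial two-column schema; the names (write_batched, the row generator, out.parquet) are
illustrative and not part of this package:

    import more_itertools
    import pyarrow as pa
    import pyarrow.parquet as pq

    schema = pa.schema([('id', pa.int64()), ('name', pa.string())])

    def write_batched(rows, path, batch_size=1_000):
        # One writer, one pyarrow table per batch: peak memory tracks
        # batch_size rather than the total row count.
        writer = pq.ParquetWriter(path, schema, compression='snappy')
        for batch in more_itertools.batched(rows, batch_size):
            cols = {name: [row[name] for row in batch] for name in schema.names}
            writer.write_table(pa.Table.from_pydict(cols, schema=schema))
        writer.close()

    write_batched(({'id': i, 'name': f'row_{i}'} for i in range(10_000)), 'out.parquet')
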
@@ -117,7 +145,7 @@
      @classmethod
      def __to_parquet_schema(cls, store_tbl: sql.Table) -> pa.Schema:
          entries = [(col_name, cls.__to_parquet_type(col.type)) for col_name, col in store_tbl.columns.items()]
-         return pa.schema(entries)  # type: ignore[arg-type]
+         return pa.schema(entries)
 
      @classmethod
      def __to_parquet_type(cls, col_type: sql.types.TypeEngine[Any]) -> pa.DataType:
@@ -130,13 +158,19 @@
          if isinstance(col_type, sql.Float):
              return pa.float32()
          if isinstance(col_type, sql.TIMESTAMP):
-             return pa.timestamp('us', tz=datetime.timezone.utc)
+             return pa.timestamp('us', tz='UTC')
          if isinstance(col_type, sql.Date):
              return pa.date32()
          if isinstance(col_type, sql.JSON):
              return pa.string()  # JSON will be exported as strings
          if isinstance(col_type, sql.LargeBinary):
              return pa.binary()
+         if isinstance(col_type, sql.UUID):
+             return pa.uuid()
+         if isinstance(col_type, sql_vector.Vector):
+             # Parquet/pyarrow do not handle null values properly for fixed_shape_tensor(), so we have to use list_()
+             # here instead.
+             return pa.list_(pa.float32())
          raise AssertionError(f'Unrecognized SQL type: {col_type} (type {type(col_type)})')
 
      def __to_pa_tables(
@@ -144,6 +178,7 @@
          row_iter: Iterator[dict[str, Any]],
          sql_types: dict[str, sql.types.TypeEngine[Any]],
          media_cols: set[str],
+         cellmd_cols: set[str],
          arrow_schema: pa.Schema,
          batch_size: int = 1_000,
      ) -> Iterator[pa.Table]:
@@ -155,17 +190,28 @@
          for rows in more_itertools.batched(row_iter, batch_size):
              cols = {}
              for name, sql_type in sql_types.items():
-                 is_media_col = name in media_cols
-                 values = [self.__to_pa_value(row.get(name), sql_type, is_media_col) for row in rows]
+                 values = [
+                     self.__to_pa_value(row.get(name), sql_type, name in media_cols, name in cellmd_cols) for row in rows
+                 ]
                  cols[name] = values
              yield pa.Table.from_pydict(cols, schema=arrow_schema)
 
-     def __to_pa_value(self, val: Any, sql_type: sql.types.TypeEngine[Any], is_media_col: bool) -> Any:
+     def __to_pa_value(
+         self, val: Any, sql_type: sql.types.TypeEngine[Any], is_media_col: bool, is_cellmd_col: bool
+     ) -> Any:
          if val is None:
              return None
+         if is_cellmd_col:
+             assert isinstance(val, dict)
+             # Export JSON as strings
+             return json.dumps(self.__process_cellmd(val))
          if isinstance(sql_type, sql.JSON):
              # Export JSON as strings
              return json.dumps(val)
+         if isinstance(sql_type, sql.UUID):
+             # PyArrow's pa.uuid() expects bytes
+             assert isinstance(val, uuid.UUID)
+             return val.bytes
          if is_media_col:
              # Handle media files as described above
              assert isinstance(val, str)
@@ -173,6 +219,10 @@
          return val
 
      def __process_media_url(self, url: str) -> str:
+         """
+         Process a media URL for export. If it's a local file URL (file://), then replace it with a pxtmedia:// URI,
+         copying the file into the tarball if necessary. If it's any other type of URL, return it unchanged.
+         """
          parsed_url = urllib.parse.urlparse(url)
          if parsed_url.scheme == 'file':
              # It's the URL of a local file. Replace it with a pxtmedia:// URI.
@@ -182,12 +232,32 @@
              path = Path(urllib.parse.unquote(urllib.request.url2pathname(parsed_url.path)))
              if path not in self.media_files:
                  # Create a new entry in the `media_files` dict so that we can copy the file into the tarball later.
-                 dest_name = f'{uuid.uuid4().hex}{path.suffix}'
+                 # We name the media files in the archive by their SHA256 hash. This ensures that we can properly
+                 # deduplicate and validate them later.
+                 # If we get a collision, it's not a problem; it just means we have two identical files (which will
+                 # be conveniently deduplicated in the bundle).
+                 sha = sha256sum(path)
+                 dest_name = f'{sha}{path.suffix}'
                  self.media_files[path] = dest_name
              return f'pxtmedia://{self.media_files[path]}'
          # For any type of URL other than a local file, just return the URL as-is.
          return url
 
+     def __process_cellmd(self, cellmd: dict[str, Any]) -> dict[str, Any]:
+         """
+         Process a cellmd dictionary for export. This involves replacing any local file references
+         with pxtmedia:// URIs, as described above.
+         """
+         cellmd_ = CellMd.from_dict(cellmd)
+         if cellmd_.file_urls is None:
+             return cellmd  # No changes
+
+         updated_urls: list[str] = []
+         for url in cellmd_.file_urls:
+             updated_urls.append(self.__process_media_url(url))
+         cellmd_.file_urls = updated_urls
+         return cellmd_.as_dict()
+
      def __build_tarball(self) -> Path:
          bundle_path = self.tmp_dir / 'bundle.tar.bz2'
          with tarfile.open(bundle_path, 'w:bz2') as tf:
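The naming scheme introduced in this hunk is content-addressed: archive names derive from the file's SHA256
digest, so identical media files collapse to a single tarball entry. A minimal sketch of the idea, with
sha256sum written out as a stand-in for the pixeltable.utils helper imported at the top of the file:

    import hashlib
    from pathlib import Path

    def sha256sum(path: Path) -> str:
        # Stream in 1 MiB chunks so large media files need not fit in memory.
        h = hashlib.sha256()
        with open(path, 'rb') as f:
            for chunk in iter(lambda: f.read(1 << 20), b''):
                h.update(chunk)
        return h.hexdigest()

    media_files: dict[Path, str] = {}

    def archive_name(path: Path) -> str:
        # Identical content -> identical digest -> deduplicated in the bundle.
        if path not in media_files:
            media_files[path] = f'{sha256sum(path)}{path.suffix}'
        return f'pxtmedia://{media_files[path]}'
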
@@ -200,6 +270,116 @@
                  tf.add(src_file, arcname=f'media/{dest_name}')
          return bundle_path
 
+     def __extract_preview_data(self) -> tuple[dict[str, str], list[list[Any]]]:
+         """
+         Extract a preview of the table data for display in the UI.
+
+         In order to bound the size of the output data, all "unbounded" data types are resized:
+         - Strings are abbreviated as per Formatter.abbreviate()
+         - Arrays and JSON are shortened and formatted as strings
+         - Images are resized to thumbnail size as a base64-encoded webp
+         - Videos are replaced by their first frame and resized as above
+         - Documents are replaced by a thumbnail as a base64-encoded webp
+         """
+         preview_cols = self.table._get_schema()
+         select_list = [self.table[col_name] for col_name in preview_cols]
+         # First 5 rows
+         rows = list(self.table.select(*select_list).head(n=5))
+
+         preview_header = {col_name: str(col_type._type) for col_name, col_type in preview_cols.items()}
+         preview = [
+             [self.__encode_preview_data(val, col_type)]
+             for row in rows
+             for val, col_type in zip(row.values(), preview_cols.values(), strict=True)
+         ]
+
+         return preview_header, preview
+
+     def __encode_preview_data(self, val: Any, col_type: ts.ColumnType) -> Any:
+         if val is None:
+             return None
+
+         match col_type._type:
+             case ts.ColumnType.Type.STRING:
+                 assert isinstance(val, str)
+                 return Formatter.abbreviate(val)
+
+             case ts.ColumnType.Type.INT | ts.ColumnType.Type.FLOAT | ts.ColumnType.Type.BOOL:
+                 return val
+
+             case ts.ColumnType.Type.TIMESTAMP | ts.ColumnType.Type.DATE:
+                 return str(val)
+
+             case ts.ColumnType.Type.UUID:
+                 assert isinstance(val, uuid.UUID)
+                 return str(val)
+
+             case ts.ColumnType.Type.BINARY:
+                 assert isinstance(val, bytes)
+                 return Formatter.format_binary(val)
+
+             case ts.ColumnType.Type.ARRAY:
+                 assert isinstance(val, np.ndarray)
+                 return Formatter.format_array(val)
+
+             case ts.ColumnType.Type.JSON:
+                 # We need to escape the JSON string server-side for security reasons.
+                 # Therefore we don't escape it here, in order to avoid double-escaping.
+                 return Formatter.format_json(val, escape_strings=False)
+
+             case ts.ColumnType.Type.IMAGE:
+                 # Rescale the image to minimize data transfer size
+                 assert isinstance(val, PIL.Image.Image)
+                 return self.__encode_image(val)
+
+             case ts.ColumnType.Type.AUDIO:
+                 assert isinstance(val, str)
+                 return self.__encode_audio(val)
+
+             case ts.ColumnType.Type.VIDEO:
+                 assert isinstance(val, str)
+                 return self.__encode_video(val)
+
+             case ts.ColumnType.Type.DOCUMENT:
+                 assert isinstance(val, str)
+                 return self.__encode_document(val)
+
+             case _:
+                 raise AssertionError(f'Unrecognized column type: {col_type._type}')
+
+     def __encode_image(self, img: PIL.Image.Image) -> str:
+         # Heuristic for thumbnail sizing:
+         # Standardize on a width of 240 pixels (to most efficiently utilize the columnar display).
+         # But, if the aspect ratio is below 2:3, bound the height at 360 pixels (to avoid unboundedly tall thumbnails
+         # in the case of highly oblong images).
+         if img.height > img.width * 1.5:
+             scaled_img = img.resize((img.width * 360 // img.height, 360))
+         else:
+             scaled_img = img.resize((240, img.height * 240 // img.width))
+         with io.BytesIO() as buffer:
+             scaled_img.save(buffer, 'webp')
+             return base64.b64encode(buffer.getvalue()).decode()
+
+     def __encode_audio(self, audio_path: str) -> str | None:
+         try:
+             audio_md = av_utils.get_metadata(audio_path)
+             if 'streams' in audio_md:
+                 duration = audio_md['streams'][0]['duration_seconds']
+                 assert isinstance(duration, float)
+                 return f'{timedelta(seconds=round(duration))} audio clip'
+             return None
+         except Exception:
+             _logger.info(f'Could not extract audio metadata from file for data preview: {audio_path}', exc_info=True)
+             return None
+
+     def __encode_video(self, video_path: str) -> str | None:
+         thumb = Formatter.extract_first_video_frame(video_path)
+         return self.__encode_image(thumb) if thumb is not None else None
+
+     def __encode_document(self, doc_path: str) -> str | None:
+         thumb = Formatter.make_document_thumbnail(doc_path)
+         return self.__encode_image(thumb) if thumb is not None else None
+
 
  class TableRestorer:
      """
@@ -208,34 +388,35 @@ class TableRestorer:
 
      Args:
          tbl_path: Pixeltable path (such as 'my_dir.my_table') where the materialized table will be made visible.
-         md: Optional metadata dictionary. If not provided, metadata will be read from the tarball's `metadata.json`.
+         bundle_md: Optional metadata dictionary.
+             If not provided, metadata will be read from the tarball's `metadata.json`.
              The metadata contains table_md, table_version_md, and table_schema_version_md entries for each ancestor
              of the table being restored, as written out by `TablePackager`.
      """
 
      tbl_path: str
-     md: Optional[dict[str, Any]]
+     bundle_md: dict[str, Any] | None
      tmp_dir: Path
      media_files: dict[str, str]  # Mapping from pxtmedia:// URLs to local file:// URLs
 
-     def __init__(self, tbl_path: str, md: Optional[dict[str, Any]] = None) -> None:
+     def __init__(self, tbl_path: str, bundle_md: dict[str, Any] | None = None) -> None:
          self.tbl_path = tbl_path
-         self.md = md
-         self.tmp_dir = Path(Env.get().create_tmp_path())
+         self.bundle_md = bundle_md
+         self.tmp_dir = TempStore.create_path()
          self.media_files = {}
 
-     def restore(self, bundle_path: Path) -> pxt.Table:
+     def restore(self, bundle_path: Path, pxt_uri: str | None = None, explicit_version: int | None = None) -> pxt.Table:
          # Extract tarball
          print(f'Extracting table data into: {self.tmp_dir}')
          with tarfile.open(bundle_path, 'r:bz2') as tf:
              tf.extractall(path=self.tmp_dir)
 
-         if self.md is None:
+         if self.bundle_md is None:
              # No metadata supplied; read it from the archive
              with open(self.tmp_dir / 'metadata.json', 'r', encoding='utf8') as fp:
-                 self.md = json.load(fp)
+                 self.bundle_md = json.load(fp)
 
-         pxt_md_version = self.md['pxt_md_version']
+         pxt_md_version = self.bundle_md['pxt_md_version']
          assert isinstance(pxt_md_version, int)
 
          if pxt_md_version != metadata.VERSION:
@@ -243,44 +424,301 @@
                  f'Pixeltable metadata version mismatch: {pxt_md_version} != {metadata.VERSION}.\n'
                  'Please upgrade Pixeltable to use this dataset: pip install -U pixeltable'
              )
+         # Convert tables metadata from dict to list of TableVersionMd
+         tbl_md = [schema.md_from_dict(TableVersionMd, t) for t in self.bundle_md['md']]
 
-         tbl_md = [schema.FullTableMd.from_dict(t) for t in self.md['md']['tables']]
+         for md in tbl_md:
+             md.tbl_md.is_replica = True
 
-         # Create the replica table
-         # TODO: This needs to be made concurrency-safe.
-         replica_tbl = catalog.Catalog.get().create_replica(catalog.Path(self.tbl_path), tbl_md)
-         assert replica_tbl._tbl_version.get().is_snapshot
+         assert not tbl_md[0].version_md.is_fragment  # Top-level table cannot be a version fragment
 
-         # Now we need to instantiate and load data for replica_tbl and its ancestors, except that we skip
-         # replica_tbl itself if it's a pure snapshot.
-         if replica_tbl._id != replica_tbl._tbl_version.id:
-             ancestor_md = tbl_md[1:]  # Pure snapshot; skip replica_tbl
-         else:
-             ancestor_md = tbl_md  # Not a pure snapshot; include replica_tbl
+         cat = catalog.Catalog.get()
 
-         # Instantiate data from the Parquet tables.
-         with Env.get().begin_xact():
-             for md in ancestor_md[::-1]:  # Base table first
-                 # Create a TableVersion instance (and a store table) for this ancestor.
-                 tv = catalog.TableVersion.create_replica(md)
-                 # Now import data from Parquet.
-                 _logger.info(f'Importing table {tv.name!r}.')
-                 self.__import_table(self.tmp_dir, tv, md)
+         with cat.begin_xact(for_write=True):
+             # Create (or update) the replica table and its ancestors, along with TableVersion instances for any
+             # versions that have not been seen before.
+             cat.create_replica(catalog.Path.parse(self.tbl_path), tbl_md)
 
-         return replica_tbl
+             _logger.debug(f'Now will import data for {len(tbl_md)} table(s):')
+             _logger.debug(repr([md.tbl_md.tbl_id for md in tbl_md[::-1]]))
 
-     def __import_table(self, bundle_path: Path, tv: catalog.TableVersion, tbl_md: schema.FullTableMd) -> None:
+             # Now we need to load data for replica_tbl and its ancestors, except that we skip
+             # replica_tbl itself if it's a pure snapshot.
+             for md in tbl_md[::-1]:  # Base table first
+                 if not md.is_pure_snapshot:
+                     tv = cat.get_tbl_version(TableVersionKey(UUID(md.tbl_md.tbl_id), md.version_md.version, None))
+                     # Import data from Parquet.
+                     _logger.info(f'Importing table {tv.name!r}.')
+                     self.__import_table(self.tmp_dir, tv, md)
+
+             tbl = cat.get_table_by_id(UUID(tbl_md[0].tbl_md.tbl_id), version=explicit_version)
+             if pxt_uri is not None:
+                 # Set pxt_uri for the newly created table
+                 cat.update_additional_md(tbl._id, {'pxt_uri': pxt_uri})
+                 tbl._tbl_version_path.clear_cached_md()  # TODO: Clear cached md for ancestors too?
+             return tbl
+
+     def __import_table(self, bundle_path: Path, tv: catalog.TableVersion, tbl_md: TableVersionMd) -> None:
          """
          Import the Parquet table into the Pixeltable catalog.
          """
-         tbl_id = uuid.UUID(tbl_md.tbl_md.tbl_id)
+         tbl_id = UUID(tbl_md.tbl_md.tbl_id)
          parquet_dir = bundle_path / 'tables' / f'tbl_{tbl_id.hex}'
          parquet_table = pq.read_table(str(parquet_dir))
-
-         for batch in parquet_table.to_batches():
+         replica_version = tv.version
+
+         conn = Env.get().conn
+         store_sa_tbl = tv.store_tbl.sa_tbl
+         store_sa_tbl_name = tv.store_tbl._storage_name()
+
+         # Sometimes we are importing a table that has never been seen before. Other times, however, we are importing
+         # an existing replica table, and the table version and/or row selection differs from what was imported
+         # previously. Care must be taken to ensure that the new data is merged with existing data in a way that
+         # yields an internally consistent version history for each row.
+
+         # The overall strategy is this:
+         # 1. Import the parquet data into a temporary table;
+         # 2. "rectify" the v_max values in both the temporary table and the existing table (more on this below);
+         # 3. Delete any row instances from the temporary table that are already present in the existing table;
+         # 4. Copy the remaining rows from the temporary table into the existing table.
+         # 5. Rectify any index columns.
+
+         # STEP 1: Import the parquet data into a temporary table.
+
+         # Create a temporary table for the initial data load, containing columns for all columns present in the
+         # parquet table. The parquet columns have identical names to those in the store table, so we can use the
+         # store table schema to get their SQL types (which are not necessarily derivable from their Parquet types,
+         # e.g., pa.string() may hold either VARCHAR or serialized JSONB).
+         temp_cols: dict[str, sql.Column] = {}
+         for field in parquet_table.schema:
+             assert field.name in store_sa_tbl.columns, f'{field.name} not in {list(store_sa_tbl.columns)}'
+             col_type = store_sa_tbl.columns[field.name].type
+             temp_cols[field.name] = sql.Column(field.name, col_type)
+         temp_sa_tbl_name = f'temp_{uuid.uuid4().hex}'
+         _logger.debug(f'Creating temporary table: {temp_sa_tbl_name}')
+         temp_md = sql.MetaData()
+         temp_sa_tbl = sql.Table(temp_sa_tbl_name, temp_md, *temp_cols.values(), prefixes=['TEMPORARY'])
+         temp_sa_tbl.create(conn)
+
+         # Populate the temporary table with data from the Parquet file.
+         _logger.debug(f'Loading {parquet_table.num_rows} row(s) into temporary table: {temp_sa_tbl_name}')
+         for batch in parquet_table.to_batches(max_chunksize=10_000):
              pydict = batch.to_pydict()
              rows = self.__from_pa_pydict(tv, pydict)
-             tv.store_tbl.load_rows(rows)
+             conn.execute(sql.insert(temp_sa_tbl), rows)
+
+         # STEP 2: Rectify v_max values.
+
+         # Each row version is identified uniquely by its pk, a tuple (row_id, pos_0, pos_1, ..., pos_k, v_min).
+         # Conversely, v_max is not part of the primary key, but is simply a bookkeeping device.
+         # In an original table, v_max is always equal to the v_min of the succeeding row instance with the same
+         # row id, or MAX_VERSION if no such row instance exists. But in the replica, we need to be careful, since
+         # we might see only a subset of the original table's versions, and we might see them out of order.
+
+         # We'll adjust the v_max values according to the principle of "latest provable v_max":
+         # they will always correspond to the latest version for which we can prove the row instance was alive. This
+         # will enable us to maintain consistency of the v_max values if additional table versions are later imported,
+         # regardless of the order in which they are seen. It also means that replica tables (unlike original tables)
+         # may have gaps in their row version histories, but this is fine; the gaps simply correspond to table versions
+         # that have never been observed.
+
+         pk_predicates = [col == temp_cols[col.name] for col in tv.store_tbl.pk_columns()]
+         pk_clause = sql.and_(*pk_predicates)
+
+         # If the same pk exists in both the temporary table and the existing table, then the corresponding row data
+         # must be identical; the rows can differ only in their v_max value. As a sanity check, we go through the
+         # motion of verifying this; a failure implies data corruption in either the replica being imported or in a
+         # previously imported replica.
+
+         system_col_names = {col.name for col in tv.store_tbl.system_columns()}
+         media_col_names = {col.store_name() for col in tv.cols if col.col_type.is_media_type() and col.is_stored}
+         value_store_cols = [
+             store_sa_tbl.c[col_name]
+             for col_name in temp_cols
+             if col_name not in system_col_names and col_name not in media_col_names
+         ]
+         value_temp_cols = [
+             col
+             for col_name, col in temp_cols.items()
+             if col_name not in system_col_names and col_name not in media_col_names
+         ]
+
+         q: sql.Executable
+
+         assert len(value_store_cols) == len(value_temp_cols)
+         if len(value_store_cols) > 0:
+             mismatch_predicates = [
+                 store_col != temp_col for store_col, temp_col in zip(value_store_cols, value_temp_cols)
+             ]
+             mismatch_clause = sql.or_(*mismatch_predicates)
+
+             # This query looks for rows that have matching primary keys (rowid + pos_k + v_min), but differ in at least
+             # one value column. Pseudo-SQL:
+             #
+             # SELECT store_tbl.col_0, ..., store_tbl.col_n, temp_tbl.col_0, ..., temp_tbl.col_n
+             # FROM store_tbl, temp_tbl
+             # WHERE store_tbl.rowid = temp_tbl.rowid
+             #   AND store_tbl.pos_0 = temp_tbl.pos_0
+             #   AND ... AND store_tbl.pos_k = temp_tbl.pos_k
+             #   AND store_tbl.v_min = temp_tbl.v_min
+             #   AND (
+             #     store_tbl.col_0 != temp_tbl.col_0
+             #     OR store_tbl.col_1 != temp_tbl.col_1
+             #     OR ... OR store_tbl.col_n != temp_tbl.col_n
+             #   )
+             #
+             # The value column comparisons (store_tbl.col_0 != temp_tbl.col_0, etc.) will always be false for rows where
+             # either column is NULL; this is what we want, since it may indicate a column that is present in one version
+             # but not the other.
+             q = sql.select(*value_store_cols, *value_temp_cols).where(pk_clause).where(mismatch_clause)
+             _logger.debug(q.compile())
+             result = conn.execute(q)
+             if result.rowcount > 0:
+                 _logger.debug(
+                     f'Data corruption error between {temp_sa_tbl_name!r} and {store_sa_tbl_name!r}: '
+                     f'{result.rowcount} inconsistent row(s).'
+                 )
+                 row = result.first()
+                 _logger.debug('Example mismatch:')
+                 _logger.debug(f'{store_sa_tbl_name}: {row[: len(value_store_cols)]}')
+                 _logger.debug(f'{temp_sa_tbl_name}: {row[len(value_store_cols) :]}')
+                 raise excs.Error(
+                     'Data corruption error: '
+                     'the replica data are inconsistent with data retrieved from a previous replica.'
+                 )
+
+             _logger.debug(f'Verified data integrity between {store_sa_tbl_name!r} and {temp_sa_tbl_name!r}.')
+
+         # Now rectify the v_max values in the temporary table.
+         # If a row instance has a concrete v_max value, then we know it's genuine: it's the unique and immutable
+         # version when the row was deleted. (This can only happen if later versions of the base table already
+         # existed at the time this replica was published.)
+         # But if a row instance has a v_max value of MAX_VERSION, then we don't know anything about its future.
+         # It might live indefinitely, or it might be deleted as early as version `n + 1`. Following the principle
+         # of "latest provable v_max", we simply set v_max equal to `n + 1`.
+         q = (
+             temp_sa_tbl.update()
+             .values(v_max=(replica_version + 1))
+             .where(temp_sa_tbl.c.v_max == schema.Table.MAX_VERSION)
+         )
+         _logger.debug(q.compile())
+         result = conn.execute(q)
+         _logger.debug(f'Rectified {result.rowcount} row(s) in {temp_sa_tbl_name!r}.')
+
+         # Now rectify the v_max values in the existing table. This is done by simply taking the later of the two v_max
+         # values (the existing one and the new one) for each row instance, following the "latest provable v_max"
+         # principle. Obviously we only need to do this for rows that exist in both tables (it's a simple join).
+         q = (
+             store_sa_tbl.update()
+             .values(v_max=sql.func.greatest(store_sa_tbl.c.v_max, temp_sa_tbl.c.v_max))
+             .where(pk_clause)
+         )
+         _logger.debug(q.compile())
+         result = conn.execute(q)
+         _logger.debug(f'Rectified {result.rowcount} row(s) in {store_sa_tbl_name!r}.')
+
+         # STEP 3: Delete any row instances from the temporary table that are already present in the existing table.
+
+         # Now we need to update rows in the existing table that are also present in the temporary table. This is to
+         # account for the scenario where the temporary table has columns that are not present in the existing table.
+         # (We can't simply replace the rows with their versions in the temporary table, because the converse scenario
+         # might also occur; there may be columns in the existing table that are not present in the temporary table.)
+         value_update_clauses: dict[str, sql.ColumnElement] = {}
+         for temp_col in temp_cols.values():
+             if temp_col.name not in system_col_names:
+                 store_col = store_sa_tbl.c[temp_col.name]
+                 # Prefer the value from the existing table, substituting the value from the temporary table if it's
+                 # NULL. This works in all cases (including media columns, where we prefer the existing media file).
+                 clause = sql.case((store_col == None, temp_col), else_=store_col)
+                 value_update_clauses[temp_col.name] = clause
+         if len(value_update_clauses) > 0:
+             q = store_sa_tbl.update().values(**value_update_clauses).where(pk_clause)
+             _logger.debug(q.compile())
+             result = conn.execute(q)
+             _logger.debug(
+                 f'Merged values from {temp_sa_tbl_name!r} into {store_sa_tbl_name!r} for {result.rowcount} row(s).'
+             )
+
+         # Now drop any rows from the temporary table that are also present in the existing table.
+         # The v_max values have been rectified, data has been merged into NULL cells, and all other row values have
+         # been verified identical.
+         # TODO: Delete any media files that were orphaned by this operation (they're necessarily duplicates of media
+         # files that are already present in the existing table).
+         q = temp_sa_tbl.delete().where(pk_clause)
+         _logger.debug(q.compile())
+         result = conn.execute(q)
+         _logger.debug(f'Deleted {result.rowcount} row(s) from {temp_sa_tbl_name!r}.')
+
+         # STEP 4: Copy the remaining rows from the temporary table into the existing table.
+
+         # Now copy the remaining data (consisting entirely of new row instances) from the temporary table into
+         # the actual table.
+         q = store_sa_tbl.insert().from_select(
+             [store_sa_tbl.c[col_name] for col_name in temp_cols], sql.select(*temp_cols.values())
+         )
+         _logger.debug(q.compile())
+         result = conn.execute(q)
+         _logger.debug(f'Inserted {result.rowcount} row(s) from {temp_sa_tbl_name!r} into {store_sa_tbl_name!r}.')
+
+         # STEP 5: Rectify any index columns.
+
+         # Finally, rectify any index columns in the table. This involves shuffling data between the index's val and
+         # undo columns to ensure they appropriately reflect the most recent replicated version of the table.
+
+         # Get the most recent replicated version of the table. This might be the version we're currently importing,
+         # but it might be a different version of the table that was previously imported.
+         head_version_md = catalog.Catalog.get()._collect_tbl_history(tv.id, n=1)[0]
+         head_version = head_version_md.version_md.version
+         _logger.debug(f'Head version for index rectification is {head_version}.')
+
+         # Get the index info from the table metadata. Here we use the tbl_md that we just collected from the DB.
+         # This is to ensure we pick up ALL indices, including dropped indices and indices that are present in
+         # a previously replicated version of the table, but not in the one currently being imported.
+         index_md = head_version_md.tbl_md.index_md
+
+         # Now update the table. We can do this for all indices together with just two SQL queries. For each index,
+         # at most one of the val or undo columns will be non-NULL in any given row.
+         # For rows where v_min <= head_version < v_max, we set, for all indices:
+         #   val_col = whichever of (val_col, undo_col) is non-NULL (or NULL if both are, e.g., for a dropped index)
+         #   undo_col = NULL
+         # For rows where head_version < v_min or v_max <= head_version, vice versa.
+         val_sql_clauses: dict[str, sql.ColumnElement] = {}
+         undo_sql_clauses: dict[str, sql.ColumnElement] = {}
+         for index in index_md.values():
+             if index.class_fqn.endswith('.EmbeddingIndex'):
+                 val_col_name = f'col_{index.index_val_col_id}'
+                 undo_col_name = f'col_{index.index_val_undo_col_id}'
+                 # Check that the val column for the index is actually present in the store table. We need to do this
+                 # to properly handle the case where the replica represents a table version that was *not* the most
+                 # recent version at the time it was published. In that case, it is possible for tbl_md to contain
+                 # metadata for indices not known to any version that has been replicated. (However, the converse
+                 # *does* hold: all replicated indices must have metadata in tbl_md; and that's what's important.)
+                 if val_col_name in store_sa_tbl.c:
+                     assert undo_col_name in store_sa_tbl.c
+                     coalesce = sql.func.coalesce(store_sa_tbl.c[val_col_name], store_sa_tbl.c[undo_col_name])
+                     val_sql_clauses[val_col_name] = coalesce
+                     val_sql_clauses[undo_col_name] = sql.null()
+                     undo_sql_clauses[undo_col_name] = coalesce
+                     undo_sql_clauses[val_col_name] = sql.null()
+
+         if len(val_sql_clauses) > 0:
+             q2 = (
+                 store_sa_tbl.update()
+                 .values(**val_sql_clauses)
+                 .where(sql.and_(tv.store_tbl.v_min_col <= head_version, tv.store_tbl.v_max_col > head_version))
+             )
+             _logger.debug(q2.compile())
+             _ = conn.execute(q2)
+             q2 = (
+                 store_sa_tbl.update()
+                 .values(**undo_sql_clauses)
+                 .where(sql.or_(tv.store_tbl.v_min_col > head_version, tv.store_tbl.v_max_col <= head_version))
+             )
+             _logger.debug(q2.compile())
+             _ = conn.execute(q2)
+             _logger.debug(f'Rectified index columns in {store_sa_tbl_name!r}.')
+         else:
+             _logger.debug(f'No index columns to rectify in {store_sa_tbl_name!r}.')
 
      def __from_pa_pydict(self, tv: catalog.TableVersion, pydict: dict[str, Any]) -> list[dict[str, Any]]:
          # Data conversions from pyarrow to Pixeltable
@@ -288,36 +726,52 @@
          for col_name in pydict:
              assert col_name in tv.store_tbl.sa_tbl.columns
              sql_types[col_name] = tv.store_tbl.sa_tbl.columns[col_name].type
-         media_col_ids: dict[str, int] = {}
-         for col in tv.cols_by_name.values():
-             if col.is_stored and col.col_type.is_media_type():
-                 media_col_ids[col.store_name()] = col.id
+         stored_cols: dict[str, catalog.Column] = {col.store_name(): col for col in tv.cols if col.is_stored}
+         stored_cols |= {col.cellmd_store_name(): col for col in tv.cols if col.stores_cellmd}
 
          row_count = len(next(iter(pydict.values())))
-         rows: list[dict[str, Any]] = []
-         for i in range(row_count):
-             row = {
-                 col_name: self.__from_pa_value(tv, col_vals[i], sql_types[col_name], media_col_ids.get(col_name))
-                 for col_name, col_vals in pydict.items()
-             }
-             rows.append(row)
+         rows: list[dict[str, Any]] = [{} for _ in range(row_count)]
+         for col_name, col_vals in pydict.items():
+             assert len(col_vals) == row_count
+             col = stored_cols.get(col_name)  # Will be None for system columns
+             is_media_col = col is not None and col.is_stored and col.col_type.is_media_type()
+             is_cellmd_col = col is not None and col.stores_cellmd and col_name == col.cellmd_store_name()
+             assert col is None or is_cellmd_col or col_name == col.store_name()
+
+             for i, val in enumerate(col_vals):
+                 rows[i][col_name] = self.__from_pa_value(val, sql_types[col_name], col, is_media_col, is_cellmd_col)
 
          return rows
 
      def __from_pa_value(
-         self, tv: catalog.TableVersion, val: Any, sql_type: sql.types.TypeEngine[Any], media_col_id: Optional[int]
+         self,
+         val: Any,
+         sql_type: sql.types.TypeEngine[Any],
+         col: catalog.Column | None,
+         is_media_col: bool,
+         is_cellmd_col: bool,
      ) -> Any:
          if val is None:
              return None
+         if isinstance(sql_type, sql_vector.Vector):
+             if isinstance(val, list):
+                 val = np.array(val, dtype=np.float32)
+             assert isinstance(val, np.ndarray) and val.dtype == np.float32 and val.ndim == 1
+             return val
+         if is_cellmd_col:
+             assert col is not None
+             assert isinstance(val, str)
+             return self.__restore_cellmd(col, json.loads(val))
          if isinstance(sql_type, sql.JSON):
              return json.loads(val)
-         if media_col_id is not None:
-             assert isinstance(val, str)
-             return self.__relocate_media_file(tv, media_col_id, val)
+         if is_media_col:
+             assert col is not None
+             return self.__relocate_media_file(col, val)
          return val
 
-     def __relocate_media_file(self, tv: catalog.TableVersion, media_col_id: int, url: str) -> str:
+     def __relocate_media_file(self, media_col: catalog.Column, url: str) -> str:
          # If this is a pxtmedia:// URL, relocate it
+         assert isinstance(url, str)
          parsed_url = urllib.parse.urlparse(url)
          assert parsed_url.scheme != 'file'  # These should all have been converted to pxtmedia:// URLs
          if parsed_url.scheme == 'pxtmedia':
@@ -325,9 +779,19 @@
                  # First time seeing this pxtmedia:// URL. Relocate the file to the media store and record the mapping
                  # in self.media_files.
                  src_path = self.tmp_dir / 'media' / parsed_url.netloc
-                 dest_path = MediaStore.prepare_media_path(tv.id, media_col_id, tv.version, ext=src_path.suffix)
-                 src_path.rename(dest_path)
-                 self.media_files[url] = urllib.parse.urljoin('file:', urllib.request.pathname2url(str(dest_path)))
+                 # Move the file to the media store and update the URL.
+                 self.media_files[url] = ObjectOps.put_file(media_col, src_path, relocate_or_delete=True)
              return self.media_files[url]
          # For any type of URL other than a local file, just return the URL as-is.
          return url
+
+     def __restore_cellmd(self, col: catalog.Column, cellmd: dict[str, Any]) -> dict[str, Any]:
+         cellmd_ = CellMd.from_dict(cellmd)
+         if cellmd_.file_urls is None:
+             return cellmd  # No changes
+
+         updated_urls: list[str] = []
+         for url in cellmd_.file_urls:
+             updated_urls.append(self.__relocate_media_file(col, url))
+         cellmd_.file_urls = updated_urls
+         return cellmd_.as_dict()
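
Taken together, the two classes in this file round-trip a table through a single bundle. A usage sketch
based only on the signatures visible in this diff; the table paths are placeholders:

    import pixeltable as pxt
    from pixeltable.share.packager import TablePackager, TableRestorer

    # Package: writes metadata.json, one Parquet dataset per table version, and the
    # referenced media files into bundle.tar.bz2; row_count and preview data are
    # recorded in bundle_md as a side effect of package().
    packager = TablePackager(pxt.get_table('my_dir.my_table'))
    bundle_path = packager.package()

    # Restore: re-creates the replica table and its ancestors, merges the Parquet
    # rows into the store (rectifying v_max and index columns), and relocates
    # pxtmedia:// files into the local media store.
    restored_tbl = TableRestorer('my_dir.my_table_replica').restore(bundle_path)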