pixeltable 0.4.6__py3-none-any.whl → 0.4.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of pixeltable might be problematic.

Files changed (69)
  1. pixeltable/__init__.py +4 -2
  2. pixeltable/catalog/__init__.py +1 -1
  3. pixeltable/catalog/catalog.py +7 -9
  4. pixeltable/catalog/column.py +49 -0
  5. pixeltable/catalog/insertable_table.py +0 -7
  6. pixeltable/catalog/schema_object.py +1 -14
  7. pixeltable/catalog/table.py +180 -67
  8. pixeltable/catalog/table_version.py +42 -146
  9. pixeltable/catalog/table_version_path.py +6 -5
  10. pixeltable/catalog/view.py +2 -1
  11. pixeltable/config.py +24 -9
  12. pixeltable/dataframe.py +5 -6
  13. pixeltable/env.py +113 -21
  14. pixeltable/exec/aggregation_node.py +1 -1
  15. pixeltable/exec/cache_prefetch_node.py +4 -3
  16. pixeltable/exec/exec_node.py +0 -8
  17. pixeltable/exec/expr_eval/expr_eval_node.py +2 -2
  18. pixeltable/exec/expr_eval/globals.py +1 -0
  19. pixeltable/exec/expr_eval/schedulers.py +52 -19
  20. pixeltable/exec/in_memory_data_node.py +2 -3
  21. pixeltable/exprs/array_slice.py +2 -2
  22. pixeltable/exprs/data_row.py +15 -2
  23. pixeltable/exprs/expr.py +9 -9
  24. pixeltable/exprs/function_call.py +61 -23
  25. pixeltable/exprs/globals.py +1 -2
  26. pixeltable/exprs/json_path.py +3 -3
  27. pixeltable/exprs/row_builder.py +25 -21
  28. pixeltable/exprs/string_op.py +3 -3
  29. pixeltable/func/expr_template_function.py +6 -3
  30. pixeltable/func/query_template_function.py +2 -2
  31. pixeltable/func/signature.py +30 -3
  32. pixeltable/func/tools.py +2 -2
  33. pixeltable/functions/anthropic.py +76 -27
  34. pixeltable/functions/deepseek.py +5 -1
  35. pixeltable/functions/gemini.py +11 -2
  36. pixeltable/functions/globals.py +2 -2
  37. pixeltable/functions/huggingface.py +6 -12
  38. pixeltable/functions/llama_cpp.py +9 -1
  39. pixeltable/functions/openai.py +76 -55
  40. pixeltable/functions/video.py +59 -6
  41. pixeltable/functions/vision.py +2 -2
  42. pixeltable/globals.py +86 -13
  43. pixeltable/io/datarows.py +3 -3
  44. pixeltable/io/fiftyone.py +7 -7
  45. pixeltable/io/globals.py +3 -3
  46. pixeltable/io/hf_datasets.py +4 -4
  47. pixeltable/io/label_studio.py +2 -1
  48. pixeltable/io/pandas.py +6 -6
  49. pixeltable/io/parquet.py +3 -3
  50. pixeltable/io/table_data_conduit.py +2 -2
  51. pixeltable/io/utils.py +2 -2
  52. pixeltable/iterators/audio.py +3 -2
  53. pixeltable/iterators/document.py +2 -8
  54. pixeltable/iterators/video.py +49 -9
  55. pixeltable/plan.py +0 -16
  56. pixeltable/share/packager.py +51 -42
  57. pixeltable/share/publish.py +134 -7
  58. pixeltable/store.py +5 -25
  59. pixeltable/type_system.py +5 -8
  60. pixeltable/utils/__init__.py +2 -2
  61. pixeltable/utils/arrow.py +5 -5
  62. pixeltable/utils/description_helper.py +3 -3
  63. pixeltable/utils/iceberg.py +1 -2
  64. pixeltable/utils/media_store.py +131 -66
  65. {pixeltable-0.4.6.dist-info → pixeltable-0.4.8.dist-info}/METADATA +238 -122
  66. {pixeltable-0.4.6.dist-info → pixeltable-0.4.8.dist-info}/RECORD +69 -69
  67. {pixeltable-0.4.6.dist-info → pixeltable-0.4.8.dist-info}/WHEEL +0 -0
  68. {pixeltable-0.4.6.dist-info → pixeltable-0.4.8.dist-info}/entry_points.txt +0 -0
  69. {pixeltable-0.4.6.dist-info → pixeltable-0.4.8.dist-info}/licenses/LICENSE +0 -0
pixeltable/io/hf_datasets.py CHANGED
@@ -1,7 +1,7 @@
  from __future__ import annotations

  import typing
- from typing import Any, Optional, Union
+ from typing import Any, Optional

  import pixeltable as pxt
  import pixeltable.type_system as ts
@@ -66,7 +66,7 @@ def _to_pixeltable_type(feature_type: Any, nullable: bool) -> Optional[ts.Column
  return None


- def _get_hf_schema(dataset: Union[datasets.Dataset, datasets.DatasetDict]) -> datasets.Features:
+ def _get_hf_schema(dataset: datasets.Dataset | datasets.DatasetDict) -> datasets.Features:
  """Get the schema of a huggingface dataset as a dictionary."""
  import datasets

@@ -91,10 +91,10 @@ def huggingface_schema_to_pxt_schema(

  def import_huggingface_dataset(
  table_path: str,
- dataset: Union[datasets.Dataset, datasets.DatasetDict],
+ dataset: datasets.Dataset | datasets.DatasetDict,
  *,
  schema_overrides: Optional[dict[str, Any]] = None,
- primary_key: Optional[Union[str, list[str]]] = None,
+ primary_key: str | list[str] | None = None,
  **kwargs: Any,
  ) -> pxt.Table:
  """Create a new base table from a Huggingface dataset, or dataset dict with multiple splits.
pixeltable/io/label_studio.py CHANGED
@@ -19,6 +19,7 @@ from pixeltable.config import Config
  from pixeltable.exprs import ColumnRef, DataRow, Expr
  from pixeltable.io.external_store import Project
  from pixeltable.utils import coco
+ from pixeltable.utils.media_store import TempStore

  # label_studio_sdk>=1 and label_studio_sdk<1 are not compatible, so we need to try
  # the import two different ways to insure intercompatibility
@@ -215,7 +216,7 @@ class LabelStudioProject(Project):
  else:
  # No localpath; create a temp file and upload it
  assert isinstance(row[media_col_idx], PIL.Image.Image)
- file = env.Env.get().create_tmp_path(extension='.png')
+ file = TempStore.create_path(extension='.png')
  row[media_col_idx].save(file, format='png')
  task_id = self.project.import_tasks(file)[0]
  os.remove(file)
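
Note: the change above is one instance of a refactor that recurs throughout this release (see also iterators/audio.py and share/packager.py below): temp-file allocation moves from `env.Env.get().create_tmp_path(...)` to the new `TempStore.create_path(...)`. A minimal sketch of the new call pattern, with the save/cleanup steps taken from the hunk above:

    import os
    import PIL.Image
    from pixeltable.utils.media_store import TempStore

    img = PIL.Image.new('RGB', (64, 64))
    file = TempStore.create_path(extension='.png')  # replaces env.Env.get().create_tmp_path()
    img.save(file, format='png')
    # ... hand the file off (e.g. upload it), then clean up
    os.remove(file)
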
pixeltable/io/pandas.py CHANGED
@@ -1,5 +1,5 @@
  import os
- from typing import Any, Optional, Union
+ from typing import Any, Optional

  import numpy as np
  import pandas as pd
@@ -17,7 +17,7 @@ def import_pandas(
  df: pd.DataFrame,
  *,
  schema_overrides: Optional[dict[str, Any]] = None,
- primary_key: Optional[Union[str, list[str]]] = None,
+ primary_key: str | list[str] | None = None,
  num_retained_versions: int = 10,
  comment: str = '',
  ) -> pxt.Table:
@@ -55,9 +55,9 @@ def import_pandas(

  def import_csv(
  tbl_name: str,
- filepath_or_buffer: Union[str, os.PathLike],
+ filepath_or_buffer: str | os.PathLike,
  schema_overrides: Optional[dict[str, Any]] = None,
- primary_key: Optional[Union[str, list[str]]] = None,
+ primary_key: str | list[str] | None = None,
  num_retained_versions: int = 10,
  comment: str = '',
  **kwargs: Any,
@@ -84,10 +84,10 @@ def import_csv(

  def import_excel(
  tbl_name: str,
- io: Union[str, os.PathLike],
+ io: str | os.PathLike,
  *,
  schema_overrides: Optional[dict[str, Any]] = None,
- primary_key: Optional[Union[str, list[str]]] = None,
+ primary_key: str | list[str] | None = None,
  num_retained_versions: int = 10,
  comment: str = '',
  **kwargs: Any,
pixeltable/io/parquet.py CHANGED
@@ -7,7 +7,7 @@ import logging
  import typing
  from collections import deque
  from pathlib import Path
- from typing import Any, Optional, Union
+ from typing import Any, Optional

  import numpy as np
  import PIL.Image
@@ -42,7 +42,7 @@ def _write_batch(value_batch: dict[str, deque], schema: pa.Schema, output_path:


  def export_parquet(
- table_or_df: Union[pxt.Table, pxt.DataFrame],
+ table_or_df: pxt.Table | pxt.DataFrame,
  parquet_path: Path,
  partition_size_bytes: int = 100_000_000,
  inline_images: bool = False,
@@ -152,7 +152,7 @@ def import_parquet(
  *,
  parquet_path: str,
  schema_overrides: Optional[dict[str, Any]] = None,
- primary_key: Optional[Union[str, list[str]]] = None,
+ primary_key: str | list[str] | None = None,
  **kwargs: Any,
  ) -> pxt.Table:
  """Creates a new base table from a Parquet file or set of files. Requires pyarrow to be installed.
pixeltable/io/table_data_conduit.py CHANGED
@@ -8,7 +8,7 @@ import urllib.parse
  import urllib.request
  from dataclasses import dataclass, field, fields
  from pathlib import Path
- from typing import TYPE_CHECKING, Any, Iterable, Iterator, Literal, Optional, Union, cast
+ from typing import TYPE_CHECKING, Any, Iterable, Iterator, Literal, Optional, cast

  import pandas as pd
  from pyarrow.parquet import ParquetDataset
@@ -325,7 +325,7 @@ class JsonTableDataConduit(TableDataConduit):


  class HFTableDataConduit(TableDataConduit):
- hf_ds: Optional[Union[datasets.Dataset, datasets.DatasetDict]] = None
+ hf_ds: datasets.Dataset | datasets.DatasetDict | None = None
  column_name_for_split: Optional[str] = None
  categorical_features: dict[str, dict[int, str]]
  dataset_dict: dict[str, datasets.Dataset] = None
pixeltable/io/utils.py CHANGED
@@ -1,5 +1,5 @@
  from keyword import iskeyword as is_python_keyword
- from typing import Any, Optional, Union
+ from typing import Any, Optional

  import pixeltable as pxt
  import pixeltable.exceptions as excs
@@ -21,7 +21,7 @@ def normalize_pxt_col_name(name: str) -> str:
  return id


- def normalize_primary_key_parameter(primary_key: Optional[Union[str, list[str]]] = None) -> list[str]:
+ def normalize_primary_key_parameter(primary_key: str | list[str] | None = None) -> list[str]:
  if primary_key is None:
  primary_key = []
  elif isinstance(primary_key, str):
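
For reference, every `primary_key` parameter in the io functions above funnels through this normalizer. A hedged sketch of its behavior (the `str` branch body is truncated in the hunk; wrapping the name in a one-element list is the presumed result):

    def normalize_primary_key(primary_key: str | list[str] | None = None) -> list[str]:
        if primary_key is None:
            return []                 # no primary key columns
        if isinstance(primary_key, str):
            return [primary_key]      # presumed: single column name -> one-element list
        return primary_key            # already a list of column names
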
pixeltable/iterators/audio.py CHANGED
@@ -5,7 +5,8 @@ from typing import Any, ClassVar, Optional

  import av

- from pixeltable import env, exceptions as excs, type_system as ts
+ from pixeltable import exceptions as excs, type_system as ts
+ from pixeltable.utils.media_store import TempStore

  from .base import ComponentIterator

@@ -149,7 +150,7 @@ class AudioSplitter(ComponentIterator):
  target_chunk_start, target_chunk_end = self.chunks_to_extract_in_pts[self.next_pos]
  chunk_start_pts = 0
  chunk_end_pts = 0
- chunk_file = str(env.Env.get().create_tmp_path(self.audio_path.suffix))
+ chunk_file = str(TempStore.create_path(extension=self.audio_path.suffix))
  output_container = av.open(chunk_file, mode='w')
  input_stream = self.container.streams.audio[0]
  codec_name = AudioSplitter.__codec_map.get(input_stream.codec_context.name, input_stream.codec_context.name)
pixeltable/iterators/document.py CHANGED
@@ -1,7 +1,7 @@
  import dataclasses
  import enum
  import logging
- from typing import Any, ClassVar, Iterable, Iterator, Optional, Union
+ from typing import Any, ClassVar, Iterable, Iterator, Optional

  import ftfy

@@ -213,12 +213,6 @@ class DocumentSplitter(ComponentIterator):
  if kwargs.get('limit') is None:
  raise Error('limit is required with "token_limit"/"char_limit" separators')

- # check dependencies at the end
- if Separator.SENTENCE in separators:
- _ = Env.get().spacy_nlp
- if Separator.TOKEN_LIMIT in separators:
- Env.get().require_package('tiktoken')
-
  return schema, []

  def __next__(self) -> dict[str, Any]:
@@ -273,7 +267,7 @@ class DocumentSplitter(ComponentIterator):
  yield DocumentSection(text=full_text, metadata=md)
  accumulated_text = []

- def process_element(el: Union[bs4.element.Tag, bs4.NavigableString]) -> Iterator[DocumentSection]:
+ def process_element(el: bs4.element.Tag | bs4.NavigableString) -> Iterator[DocumentSection]:
  # process the element and emit sections as necessary
  nonlocal accumulated_text, headings, sourceline, emit_on_heading, emit_on_paragraph

pixeltable/iterators/video.py CHANGED
@@ -29,12 +29,29 @@ class FrameIterator(ComponentIterator):
  extracted). If `fps` is greater than the frame rate of the video, an error will be raised.
  num_frames: Exact number of frames to extract. The frames will be spaced as evenly as possible. If
  `num_frames` is greater than the number of frames in the video, all frames will be extracted.
+ all_frame_attrs:
+ If True, outputs a `pxt.Json` column `frame_attrs` with the following `pyav`-provided attributes
+ (for more information, see `pyav`'s documentation on
+ [VideoFrame](https://pyav.org/docs/develop/api/video.html#module-av.video.frame) and
+ [Frame](https://pyav.org/docs/develop/api/frame.html)):
+
+ * `index` (`int`)
+ * `pts` (`Optional[int]`)
+ * `dts` (`Optional[int]`)
+ * `time` (`Optional[float]`)
+ * `is_corrupt` (`bool`)
+ * `key_frame` (`bool`)
+ * `pict_type` (`int`)
+ * `interlaced_frame` (`bool`)
+
+ If False, only outputs frame attributes `frame_idx`, `pos_msec`, and `pos_frame` as separate columns.
  """

  # Input parameters
  video_path: Path
  fps: Optional[float]
  num_frames: Optional[int]
+ all_frame_attrs: bool

  # Video info
  container: av.container.input.InputContainer
@@ -50,7 +67,14 @@ class FrameIterator(ComponentIterator):
  # frame index in the video. Otherwise, the corresponding video index is `frames_to_extract[next_pos]`.
  next_pos: int

- def __init__(self, video: str, *, fps: Optional[float] = None, num_frames: Optional[int] = None):
+ def __init__(
+ self,
+ video: str,
+ *,
+ fps: Optional[float] = None,
+ num_frames: Optional[int] = None,
+ all_frame_attrs: bool = False,
+ ):
  if fps is not None and num_frames is not None:
  raise excs.Error('At most one of `fps` or `num_frames` may be specified')

@@ -60,6 +84,7 @@ class FrameIterator(ComponentIterator):
  self.container = av.open(str(video_path))
  self.fps = fps
  self.num_frames = num_frames
+ self.all_frame_attrs = all_frame_attrs

  self.video_framerate = self.container.streams.video[0].average_rate
  self.video_time_base = self.container.streams.video[0].time_base
@@ -115,16 +140,17 @@ class FrameIterator(ComponentIterator):
  'video': ts.VideoType(nullable=False),
  'fps': ts.FloatType(nullable=True),
  'num_frames': ts.IntType(nullable=True),
+ 'all_frame_attrs': ts.BoolType(nullable=False),
  }

  @classmethod
  def output_schema(cls, *args: Any, **kwargs: Any) -> tuple[dict[str, ts.ColumnType], list[str]]:
- return {
- 'frame_idx': ts.IntType(),
- 'pos_msec': ts.FloatType(),
- 'pos_frame': ts.IntType(),
- 'frame': ts.ImageType(),
- }, ['frame']
+ attrs: dict[str, ts.ColumnType]
+ if kwargs.get('all_frame_attrs'):
+ attrs = {'frame_attrs': ts.JsonType()}
+ else:
+ attrs = {'frame_idx': ts.IntType(), 'pos_msec': ts.FloatType(), 'pos_frame': ts.IntType()}
+ return {**attrs, 'frame': ts.ImageType()}, ['frame']

  def __next__(self) -> dict[str, Any]:
  # Determine the frame index in the video corresponding to the iterator index `next_pos`;
@@ -164,8 +190,22 @@ class FrameIterator(ComponentIterator):
  raise excs.Error(f'Frame {next_video_idx} is missing from the video (video file is corrupt)')
  img = frame.to_image()
  assert isinstance(img, PIL.Image.Image)
- pos_msec = float(pts * self.video_time_base * 1000)
- result = {'frame_idx': self.next_pos, 'pos_msec': pos_msec, 'pos_frame': video_idx, 'frame': img}
+ pts_msec = float(pts * self.video_time_base * 1000)
+ result: dict[str, Any] = {'frame': img}
+ if self.all_frame_attrs:
+ attrs = {
+ 'index': video_idx,
+ 'pts': frame.pts,
+ 'dts': frame.dts,
+ 'time': frame.time,
+ 'is_corrupt': frame.is_corrupt,
+ 'key_frame': frame.key_frame,
+ 'pict_type': frame.pict_type,
+ 'interlaced_frame': frame.interlaced_frame,
+ }
+ result['frame_attrs'] = attrs
+ else:
+ result.update({'frame_idx': self.next_pos, 'pos_msec': pts_msec, 'pos_frame': video_idx})
  self.next_pos += 1
  return result

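The new `all_frame_attrs` flag changes the iterator's output schema, so it is easiest to see at the call site. A minimal sketch using pixeltable's documented `create_view` + `FrameIterator.create` pattern; the table and column names (`videos`, `video`) are assumptions:

    import pixeltable as pxt
    from pixeltable.iterators import FrameIterator

    videos = pxt.create_table('videos', {'video': pxt.Video})

    # default (all_frame_attrs=False): separate frame_idx, pos_msec, pos_frame columns
    frames = pxt.create_view(
        'frames', videos, iterator=FrameIterator.create(video=videos.video, fps=1)
    )

    # 0.4.8: all_frame_attrs=True replaces those with a single JSON column
    # `frame_attrs` carrying the pyav attributes listed in the docstring above
    frames_attrs = pxt.create_view(
        'frames_attrs', videos,
        iterator=FrameIterator.create(video=videos.video, fps=1, all_frame_attrs=True),
    )
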
pixeltable/plan.py CHANGED
@@ -394,9 +394,6 @@ class Planner:
  row_builder, computed_exprs, plan.output_exprs, input=plan, maintain_input_order=False
  )

- stored_col_info = row_builder.output_slot_idxs()
- stored_img_col_info = [info for info in stored_col_info if info.col.col_type.is_image_type()]
- plan.set_stored_img_cols(stored_img_col_info)
  plan.set_ctx(
  exec.ExecContext(
  row_builder,
@@ -428,10 +425,6 @@ class Planner:
  col = tbl.cols_by_name[col_name]
  plan.row_builder.add_table_column(col, expr.slot_idx)

- stored_col_info = plan.row_builder.output_slot_idxs()
- stored_img_col_info = [info for info in stored_col_info if info.col.col_type.is_image_type()]
- plan.set_stored_img_cols(stored_img_col_info)
-
  plan.set_ctx(
  exec.ExecContext(
  plan.row_builder, batch_size=0, show_pbar=True, num_computed_exprs=0, ignore_errors=ignore_errors
@@ -657,10 +650,6 @@ class Planner:
  for i, col in enumerate(copied_cols + list(recomputed_cols)): # same order as select_list
  plan.row_builder.add_table_column(col, select_list[i].slot_idx)
  # TODO: avoid duplication with view_load_plan() logic (where does this belong?)
- stored_img_col_info = [
- info for info in plan.row_builder.output_slot_idxs() if info.col.col_type.is_image_type()
- ]
- plan.set_stored_img_cols(stored_img_col_info)
  return plan

  @classmethod
@@ -727,8 +716,6 @@ class Planner:
  row_builder, output_exprs=view_output_exprs, input_exprs=base_output_exprs, input=plan
  )

- stored_img_col_info = [info for info in row_builder.output_slot_idxs() if info.col.col_type.is_image_type()]
- plan.set_stored_img_cols(stored_img_col_info)
  exec_ctx.ignore_errors = True
  plan.set_ctx(exec_ctx)
  return plan, len(row_builder.default_eval_ctx.target_exprs)
@@ -1053,7 +1040,4 @@ class Planner:
  computed_exprs = row_builder.output_exprs - row_builder.input_exprs
  plan.ctx.num_computed_exprs = len(computed_exprs) # we are adding a computed column, so we need to evaluate it

- # we want to flush images
- if col.is_computed and col.is_stored and col.col_type.is_image_type():
- plan.set_stored_img_cols(row_builder.output_slot_idxs())
  return plan
pixeltable/share/packager.py CHANGED
@@ -24,7 +24,7 @@ from pixeltable.env import Env
  from pixeltable.metadata import schema
  from pixeltable.utils import sha256sum
  from pixeltable.utils.formatter import Formatter
- from pixeltable.utils.media_store import MediaStore
+ from pixeltable.utils.media_store import MediaStore, TempStore

  _logger = logging.getLogger('pixeltable')

@@ -57,7 +57,7 @@ class TablePackager:

  def __init__(self, table: catalog.Table, additional_md: Optional[dict[str, Any]] = None) -> None:
  self.table = table
- self.tmp_dir = Path(Env.get().create_tmp_path())
+ self.tmp_dir = TempStore.create_path()
  self.media_files = {}

  # Load metadata
@@ -92,10 +92,10 @@ class TablePackager:
  self.bundle_path = self.__build_tarball()

  _logger.info('Extracting preview data.')
- self.md['count'] = self.table.count()
+ self.md['row_count'] = self.table.count()
  preview_header, preview = self.__extract_preview_data()
  self.md['preview_header'] = preview_header
- self.md['preview'] = preview
+ self.md['preview_data'] = preview

  _logger.info(f'Packaging complete: {self.bundle_path}')
  return self.bundle_path
@@ -335,7 +335,7 @@ class TableRestorer:
  def __init__(self, tbl_path: str, md: Optional[dict[str, Any]] = None) -> None:
  self.tbl_path = tbl_path
  self.md = md
- self.tmp_dir = Path(Env.get().create_tmp_path())
+ self.tmp_dir = TempStore.create_path()
  self.media_files = {}

  def restore(self, bundle_path: Path) -> pxt.Table:
@@ -459,42 +459,51 @@ class TableRestorer:
  for col_name, col in temp_cols.items()
  if col_name not in system_col_names and col_name not in media_col_names
  ]
- mismatch_predicates = [store_col != temp_col for store_col, temp_col in zip(value_store_cols, value_temp_cols)]
- mismatch_clause = sql.or_(*mismatch_predicates)
-
- # This query looks for rows that have matching primary keys (rowid + pos_k + v_min), but differ in at least
- # one value column. Pseudo-SQL:
- #
- # SELECT store_tbl.col_0, ..., store_tbl.col_n, temp_tbl.col_0, ..., temp_tbl.col_n
- # FROM store_tbl, temp_tbl
- # WHERE store_tbl.rowid = temp_tbl.rowid
- # AND store_tbl.pos_0 = temp_tbl.pos_0
- # AND ... AND store_tbl.pos_k = temp_tbl.pos_k
- # AND store_tbl.v_min = temp_tbl.v_min
- # AND (
- # store_tbl.col_0 != temp_tbl.col_0
- # OR store_tbl.col_1 != temp_tbl.col_1
- # OR ... OR store_tbl.col_n != temp_tbl.col_n
- # )
- #
- # The value column comparisons (store_tbl.col_0 != temp_tbl.col_0, etc.) will always be false for rows where
- # either column is NULL; this is what we want, since it may indicate a column that is present in one version
- # but not the other.
- q = sql.select(*value_store_cols, *value_temp_cols).where(pk_clause).where(mismatch_clause)
- _logger.debug(q.compile())
- result = conn.execute(q)
- if result.rowcount > 0:
- _logger.debug(
- f'Data corruption error between {temp_sa_tbl_name!r} and {store_sa_tbl_name!r}: '
- f'{result.rowcount} inconsistent row(s).'
- )
- row = result.first()
- _logger.debug('Example mismatch:')
- _logger.debug(f'{store_sa_tbl_name}: {row[: len(value_store_cols)]}')
- _logger.debug(f'{temp_sa_tbl_name}: {row[len(value_store_cols) :]}')
- raise excs.Error(
- 'Data corruption error: the replica data are inconsistent with data retrieved from a previous replica.'
- )
+
+ q: sql.Executable
+
+ assert len(value_store_cols) == len(value_temp_cols)
+ if len(value_store_cols) > 0:
+ mismatch_predicates = [
+ store_col != temp_col for store_col, temp_col in zip(value_store_cols, value_temp_cols)
+ ]
+ mismatch_clause = sql.or_(*mismatch_predicates)
+
+ # This query looks for rows that have matching primary keys (rowid + pos_k + v_min), but differ in at least
+ # one value column. Pseudo-SQL:
+ #
+ # SELECT store_tbl.col_0, ..., store_tbl.col_n, temp_tbl.col_0, ..., temp_tbl.col_n
+ # FROM store_tbl, temp_tbl
+ # WHERE store_tbl.rowid = temp_tbl.rowid
+ # AND store_tbl.pos_0 = temp_tbl.pos_0
+ # AND ... AND store_tbl.pos_k = temp_tbl.pos_k
+ # AND store_tbl.v_min = temp_tbl.v_min
+ # AND (
+ # store_tbl.col_0 != temp_tbl.col_0
+ # OR store_tbl.col_1 != temp_tbl.col_1
+ # OR ... OR store_tbl.col_n != temp_tbl.col_n
+ # )
+ #
+ # The value column comparisons (store_tbl.col_0 != temp_tbl.col_0, etc.) will always be false for rows where
+ # either column is NULL; this is what we want, since it may indicate a column that is present in one version
+ # but not the other.
+ q = sql.select(*value_store_cols, *value_temp_cols).where(pk_clause).where(mismatch_clause)
+ _logger.debug(q.compile())
+ result = conn.execute(q)
+ if result.rowcount > 0:
+ _logger.debug(
+ f'Data corruption error between {temp_sa_tbl_name!r} and {store_sa_tbl_name!r}: '
+ f'{result.rowcount} inconsistent row(s).'
+ )
+ row = result.first()
+ _logger.debug('Example mismatch:')
+ _logger.debug(f'{store_sa_tbl_name}: {row[: len(value_store_cols)]}')
+ _logger.debug(f'{temp_sa_tbl_name}: {row[len(value_store_cols) :]}')
+ raise excs.Error(
+ 'Data corruption error: '
+ 'the replica data are inconsistent with data retrieved from a previous replica.'
+ )
+

  _logger.debug(f'Verified data integrity between {store_sa_tbl_name!r} and {temp_sa_tbl_name!r}.')

  # Now rectify the v_max values in the temporary table.
@@ -610,7 +619,7 @@ class TableRestorer:
  # in self.media_files.
  src_path = self.tmp_dir / 'media' / parsed_url.netloc
  # Move the file to the media store and update the URL.
- self.media_files[url] = MediaStore.relocate_local_media_file(src_path, media_col)
+ self.media_files[url] = MediaStore.get().relocate_local_media_file(src_path, media_col)
  return self.media_files[url]
  # For any type of URL other than a local file, just return the URL as-is.
  return url
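
Note the last hunk: `relocate_local_media_file` is now reached via `MediaStore.get()` rather than as a classmethod, mirroring the existing `Env.get()` accessor. A hypothetical sketch of that accessor shape (the actual `MediaStore` internals are not shown in this diff):

    from typing import Optional

    class MediaStoreSketch:
        # hypothetical stand-in illustrating the MediaStore.get() accessor pattern
        _instance: Optional['MediaStoreSketch'] = None

        @classmethod
        def get(cls) -> 'MediaStoreSketch':
            if cls._instance is None:       # lazily create the shared instance
                cls._instance = cls()
            return cls._instance

        def relocate_local_media_file(self, src_path, media_col) -> str:
            ...  # move the file into the media store, return its new URL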