pixeltable 0.4.6__py3-none-any.whl → 0.4.8__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to a supported public registry. It is provided for informational purposes only.
- pixeltable/__init__.py +4 -2
- pixeltable/catalog/__init__.py +1 -1
- pixeltable/catalog/catalog.py +7 -9
- pixeltable/catalog/column.py +49 -0
- pixeltable/catalog/insertable_table.py +0 -7
- pixeltable/catalog/schema_object.py +1 -14
- pixeltable/catalog/table.py +180 -67
- pixeltable/catalog/table_version.py +42 -146
- pixeltable/catalog/table_version_path.py +6 -5
- pixeltable/catalog/view.py +2 -1
- pixeltable/config.py +24 -9
- pixeltable/dataframe.py +5 -6
- pixeltable/env.py +113 -21
- pixeltable/exec/aggregation_node.py +1 -1
- pixeltable/exec/cache_prefetch_node.py +4 -3
- pixeltable/exec/exec_node.py +0 -8
- pixeltable/exec/expr_eval/expr_eval_node.py +2 -2
- pixeltable/exec/expr_eval/globals.py +1 -0
- pixeltable/exec/expr_eval/schedulers.py +52 -19
- pixeltable/exec/in_memory_data_node.py +2 -3
- pixeltable/exprs/array_slice.py +2 -2
- pixeltable/exprs/data_row.py +15 -2
- pixeltable/exprs/expr.py +9 -9
- pixeltable/exprs/function_call.py +61 -23
- pixeltable/exprs/globals.py +1 -2
- pixeltable/exprs/json_path.py +3 -3
- pixeltable/exprs/row_builder.py +25 -21
- pixeltable/exprs/string_op.py +3 -3
- pixeltable/func/expr_template_function.py +6 -3
- pixeltable/func/query_template_function.py +2 -2
- pixeltable/func/signature.py +30 -3
- pixeltable/func/tools.py +2 -2
- pixeltable/functions/anthropic.py +76 -27
- pixeltable/functions/deepseek.py +5 -1
- pixeltable/functions/gemini.py +11 -2
- pixeltable/functions/globals.py +2 -2
- pixeltable/functions/huggingface.py +6 -12
- pixeltable/functions/llama_cpp.py +9 -1
- pixeltable/functions/openai.py +76 -55
- pixeltable/functions/video.py +59 -6
- pixeltable/functions/vision.py +2 -2
- pixeltable/globals.py +86 -13
- pixeltable/io/datarows.py +3 -3
- pixeltable/io/fiftyone.py +7 -7
- pixeltable/io/globals.py +3 -3
- pixeltable/io/hf_datasets.py +4 -4
- pixeltable/io/label_studio.py +2 -1
- pixeltable/io/pandas.py +6 -6
- pixeltable/io/parquet.py +3 -3
- pixeltable/io/table_data_conduit.py +2 -2
- pixeltable/io/utils.py +2 -2
- pixeltable/iterators/audio.py +3 -2
- pixeltable/iterators/document.py +2 -8
- pixeltable/iterators/video.py +49 -9
- pixeltable/plan.py +0 -16
- pixeltable/share/packager.py +51 -42
- pixeltable/share/publish.py +134 -7
- pixeltable/store.py +5 -25
- pixeltable/type_system.py +5 -8
- pixeltable/utils/__init__.py +2 -2
- pixeltable/utils/arrow.py +5 -5
- pixeltable/utils/description_helper.py +3 -3
- pixeltable/utils/iceberg.py +1 -2
- pixeltable/utils/media_store.py +131 -66
- {pixeltable-0.4.6.dist-info → pixeltable-0.4.8.dist-info}/METADATA +238 -122
- {pixeltable-0.4.6.dist-info → pixeltable-0.4.8.dist-info}/RECORD +69 -69
- {pixeltable-0.4.6.dist-info → pixeltable-0.4.8.dist-info}/WHEEL +0 -0
- {pixeltable-0.4.6.dist-info → pixeltable-0.4.8.dist-info}/entry_points.txt +0 -0
- {pixeltable-0.4.6.dist-info → pixeltable-0.4.8.dist-info}/licenses/LICENSE +0 -0
pixeltable/io/hf_datasets.py
CHANGED
@@ -1,7 +1,7 @@
 from __future__ import annotations
 
 import typing
-from typing import Any, Optional
+from typing import Any, Optional
 
 import pixeltable as pxt
 import pixeltable.type_system as ts
@@ -66,7 +66,7 @@ def _to_pixeltable_type(feature_type: Any, nullable: bool) -> Optional[ts.Column
     return None
 
 
-def _get_hf_schema(dataset:
+def _get_hf_schema(dataset: datasets.Dataset | datasets.DatasetDict) -> datasets.Features:
     """Get the schema of a huggingface dataset as a dictionary."""
     import datasets
 
@@ -91,10 +91,10 @@ def huggingface_schema_to_pxt_schema(
 
 def import_huggingface_dataset(
     table_path: str,
-    dataset:
+    dataset: datasets.Dataset | datasets.DatasetDict,
     *,
     schema_overrides: Optional[dict[str, Any]] = None,
-    primary_key:
+    primary_key: str | list[str] | None = None,
     **kwargs: Any,
 ) -> pxt.Table:
     """Create a new base table from a Huggingface dataset, or dataset dict with multiple splits.
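For reference, a minimal usage sketch of the updated signature (dataset and table names are illustrative; requires the `datasets` package, and imports the function from the module shown above):

    import datasets

    from pixeltable.io.hf_datasets import import_huggingface_dataset

    # `dataset` may now be a Dataset or a DatasetDict, and `primary_key`
    # accepts str | list[str] | None.
    ds = datasets.load_dataset('rotten_tomatoes')  # a DatasetDict with several splits
    tbl = import_huggingface_dataset('reviews', ds)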
pixeltable/io/label_studio.py
CHANGED
@@ -19,6 +19,7 @@ from pixeltable.config import Config
 from pixeltable.exprs import ColumnRef, DataRow, Expr
 from pixeltable.io.external_store import Project
 from pixeltable.utils import coco
+from pixeltable.utils.media_store import TempStore
 
 # label_studio_sdk>=1 and label_studio_sdk<1 are not compatible, so we need to try
 # the import two different ways to insure intercompatibility
@@ -215,7 +216,7 @@ class LabelStudioProject(Project):
         else:
             # No localpath; create a temp file and upload it
             assert isinstance(row[media_col_idx], PIL.Image.Image)
-            file =
+            file = TempStore.create_path(extension='.png')
             row[media_col_idx].save(file, format='png')
             task_id = self.project.import_tasks(file)[0]
             os.remove(file)
pixeltable/io/pandas.py
CHANGED
@@ -1,5 +1,5 @@
 import os
-from typing import Any, Optional
+from typing import Any, Optional
 
 import numpy as np
 import pandas as pd
@@ -17,7 +17,7 @@ def import_pandas(
     df: pd.DataFrame,
     *,
     schema_overrides: Optional[dict[str, Any]] = None,
-    primary_key:
+    primary_key: str | list[str] | None = None,
     num_retained_versions: int = 10,
     comment: str = '',
 ) -> pxt.Table:
@@ -55,9 +55,9 @@ def import_pandas(
 
 def import_csv(
     tbl_name: str,
-    filepath_or_buffer:
+    filepath_or_buffer: str | os.PathLike,
     schema_overrides: Optional[dict[str, Any]] = None,
-    primary_key:
+    primary_key: str | list[str] | None = None,
     num_retained_versions: int = 10,
     comment: str = '',
     **kwargs: Any,
@@ -84,10 +84,10 @@ def import_csv(
 
 def import_excel(
     tbl_name: str,
-    io:
+    io: str | os.PathLike,
     *,
     schema_overrides: Optional[dict[str, Any]] = None,
-    primary_key:
+    primary_key: str | list[str] | None = None,
     num_retained_versions: int = 10,
     comment: str = '',
     **kwargs: Any,
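A minimal sketch of the loosened `import_csv` signature (file, table, and column names are illustrative):

    from pixeltable.io.pandas import import_csv

    # `filepath_or_buffer` is now annotated str | os.PathLike, and
    # `primary_key` accepts str | list[str] | None.
    tbl = import_csv('my_table', 'data.csv', primary_key='id')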
pixeltable/io/parquet.py
CHANGED
@@ -7,7 +7,7 @@ import logging
 import typing
 from collections import deque
 from pathlib import Path
-from typing import Any, Optional
+from typing import Any, Optional
 
 import numpy as np
 import PIL.Image
@@ -42,7 +42,7 @@ def _write_batch(value_batch: dict[str, deque], schema: pa.Schema, output_path:
 
 
 def export_parquet(
-    table_or_df:
+    table_or_df: pxt.Table | pxt.DataFrame,
     parquet_path: Path,
     partition_size_bytes: int = 100_000_000,
     inline_images: bool = False,
@@ -152,7 +152,7 @@ def import_parquet(
     *,
     parquet_path: str,
     schema_overrides: Optional[dict[str, Any]] = None,
-    primary_key:
+    primary_key: str | list[str] | None = None,
     **kwargs: Any,
 ) -> pxt.Table:
     """Creates a new base table from a Parquet file or set of files. Requires pyarrow to be installed.
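A round-trip sketch under the new annotations (names and paths are illustrative; assumes an existing table and an installed pyarrow):

    from pathlib import Path

    import pixeltable as pxt
    from pixeltable.io.parquet import export_parquet, import_parquet

    tbl = pxt.get_table('reviews')  # `table_or_df` accepts pxt.Table | pxt.DataFrame
    export_parquet(tbl, Path('/tmp/reviews_pq'))
    restored = import_parquet('reviews_copy', parquet_path='/tmp/reviews_pq')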
pixeltable/io/table_data_conduit.py
CHANGED
@@ -8,7 +8,7 @@ import urllib.parse
 import urllib.request
 from dataclasses import dataclass, field, fields
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Iterable, Iterator, Literal, Optional,
+from typing import TYPE_CHECKING, Any, Iterable, Iterator, Literal, Optional, cast
 
 import pandas as pd
 from pyarrow.parquet import ParquetDataset
@@ -325,7 +325,7 @@ class JsonTableDataConduit(TableDataConduit):
 
 
 class HFTableDataConduit(TableDataConduit):
-    hf_ds:
+    hf_ds: datasets.Dataset | datasets.DatasetDict | None = None
     column_name_for_split: Optional[str] = None
     categorical_features: dict[str, dict[int, str]]
     dataset_dict: dict[str, datasets.Dataset] = None
pixeltable/io/utils.py
CHANGED
@@ -1,5 +1,5 @@
 from keyword import iskeyword as is_python_keyword
-from typing import Any, Optional
+from typing import Any, Optional
 
 import pixeltable as pxt
 import pixeltable.exceptions as excs
@@ -21,7 +21,7 @@ def normalize_pxt_col_name(name: str) -> str:
     return id
 
 
-def normalize_primary_key_parameter(primary_key:
+def normalize_primary_key_parameter(primary_key: str | list[str] | None = None) -> list[str]:
     if primary_key is None:
         primary_key = []
     elif isinstance(primary_key, str):
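Based on the visible branches, the normalization presumably behaves as follows (the list-handling tail is truncated above, so this is an inferred sketch):

    from pixeltable.io.utils import normalize_primary_key_parameter

    normalize_primary_key_parameter(None)          # -> []
    normalize_primary_key_parameter('id')          # -> ['id'] (str presumably wrapped in a list)
    normalize_primary_key_parameter(['id', 'ts'])  # -> ['id', 'ts']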
pixeltable/iterators/audio.py
CHANGED
@@ -5,7 +5,8 @@ from typing import Any, ClassVar, Optional
 
 import av
 
-from pixeltable import
+from pixeltable import exceptions as excs, type_system as ts
+from pixeltable.utils.media_store import TempStore
 
 from .base import ComponentIterator
 
@@ -149,7 +150,7 @@ class AudioSplitter(ComponentIterator):
         target_chunk_start, target_chunk_end = self.chunks_to_extract_in_pts[self.next_pos]
         chunk_start_pts = 0
         chunk_end_pts = 0
-        chunk_file = str(
+        chunk_file = str(TempStore.create_path(extension=self.audio_path.suffix))
         output_container = av.open(chunk_file, mode='w')
         input_stream = self.container.streams.audio[0]
         codec_name = AudioSplitter.__codec_map.get(input_stream.codec_context.name, input_stream.codec_context.name)
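The surrounding chunk-extraction logic (only partially shown here) writes each chunk into a fresh container at the temp path. A standalone pyav sketch of that pattern, with illustrative paths and no Pixeltable dependencies:

    import av

    # Copy roughly the first 5 seconds of an audio file into a new container.
    with av.open('input.mp3') as in_container, av.open('/tmp/chunk.mp3', mode='w') as out_container:
        in_stream = in_container.streams.audio[0]
        out_stream = out_container.add_stream('mp3', rate=in_stream.rate)
        for frame in in_container.decode(in_stream):
            if frame.time is not None and frame.time > 5.0:
                break
            for packet in out_stream.encode(frame):
                out_container.mux(packet)
        for packet in out_stream.encode(None):  # flush the encoder
            out_container.mux(packet)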
pixeltable/iterators/document.py
CHANGED
@@ -1,7 +1,7 @@
 import dataclasses
 import enum
 import logging
-from typing import Any, ClassVar, Iterable, Iterator, Optional
+from typing import Any, ClassVar, Iterable, Iterator, Optional
 
 import ftfy
 
@@ -213,12 +213,6 @@ class DocumentSplitter(ComponentIterator):
         if kwargs.get('limit') is None:
             raise Error('limit is required with "token_limit"/"char_limit" separators')
 
-        # check dependencies at the end
-        if Separator.SENTENCE in separators:
-            _ = Env.get().spacy_nlp
-        if Separator.TOKEN_LIMIT in separators:
-            Env.get().require_package('tiktoken')
-
         return schema, []
 
     def __next__(self) -> dict[str, Any]:
@@ -273,7 +267,7 @@ class DocumentSplitter(ComponentIterator):
                 yield DocumentSection(text=full_text, metadata=md)
                 accumulated_text = []
 
-        def process_element(el:
+        def process_element(el: bs4.element.Tag | bs4.NavigableString) -> Iterator[DocumentSection]:
             # process the element and emit sections as necessary
             nonlocal accumulated_text, headings, sourceline, emit_on_heading, emit_on_paragraph
 
pixeltable/iterators/video.py
CHANGED
@@ -29,12 +29,29 @@ class FrameIterator(ComponentIterator):
             extracted). If `fps` is greater than the frame rate of the video, an error will be raised.
         num_frames: Exact number of frames to extract. The frames will be spaced as evenly as possible. If
             `num_frames` is greater than the number of frames in the video, all frames will be extracted.
+        all_frame_attrs:
+            If True, outputs a `pxt.Json` column `frame_attrs` with the following `pyav`-provided attributes
+            (for more information, see `pyav`'s documentation on
+            [VideoFrame](https://pyav.org/docs/develop/api/video.html#module-av.video.frame) and
+            [Frame](https://pyav.org/docs/develop/api/frame.html)):
+
+            * `index` (`int`)
+            * `pts` (`Optional[int]`)
+            * `dts` (`Optional[int]`)
+            * `time` (`Optional[float]`)
+            * `is_corrupt` (`bool`)
+            * `key_frame` (`bool`)
+            * `pict_type` (`int`)
+            * `interlaced_frame` (`bool`)
+
+            If False, only outputs frame attributes `frame_idx`, `pos_msec`, and `pos_frame` as separate columns.
     """
 
     # Input parameters
     video_path: Path
     fps: Optional[float]
     num_frames: Optional[int]
+    all_frame_attrs: bool
 
     # Video info
     container: av.container.input.InputContainer
@@ -50,7 +67,14 @@ class FrameIterator(ComponentIterator):
     # frame index in the video. Otherwise, the corresponding video index is `frames_to_extract[next_pos]`.
     next_pos: int
 
-    def __init__(
+    def __init__(
+        self,
+        video: str,
+        *,
+        fps: Optional[float] = None,
+        num_frames: Optional[int] = None,
+        all_frame_attrs: bool = False,
+    ):
         if fps is not None and num_frames is not None:
             raise excs.Error('At most one of `fps` or `num_frames` may be specified')
 
@@ -60,6 +84,7 @@ class FrameIterator(ComponentIterator):
         self.container = av.open(str(video_path))
         self.fps = fps
         self.num_frames = num_frames
+        self.all_frame_attrs = all_frame_attrs
 
         self.video_framerate = self.container.streams.video[0].average_rate
         self.video_time_base = self.container.streams.video[0].time_base
@@ -115,16 +140,17 @@ class FrameIterator(ComponentIterator):
             'video': ts.VideoType(nullable=False),
             'fps': ts.FloatType(nullable=True),
             'num_frames': ts.IntType(nullable=True),
+            'all_frame_attrs': ts.BoolType(nullable=False),
         }
 
     @classmethod
     def output_schema(cls, *args: Any, **kwargs: Any) -> tuple[dict[str, ts.ColumnType], list[str]]:
-
-
-        '
-
-        '
-        }, ['frame']
+        attrs: dict[str, ts.ColumnType]
+        if kwargs.get('all_frame_attrs'):
+            attrs = {'frame_attrs': ts.JsonType()}
+        else:
+            attrs = {'frame_idx': ts.IntType(), 'pos_msec': ts.FloatType(), 'pos_frame': ts.IntType()}
+        return {**attrs, 'frame': ts.ImageType()}, ['frame']
 
     def __next__(self) -> dict[str, Any]:
         # Determine the frame index in the video corresponding to the iterator index `next_pos`;
@@ -164,8 +190,22 @@ class FrameIterator(ComponentIterator):
             raise excs.Error(f'Frame {next_video_idx} is missing from the video (video file is corrupt)')
         img = frame.to_image()
         assert isinstance(img, PIL.Image.Image)
-
-        result
+        pts_msec = float(pts * self.video_time_base * 1000)
+        result: dict[str, Any] = {'frame': img}
+        if self.all_frame_attrs:
+            attrs = {
+                'index': video_idx,
+                'pts': frame.pts,
+                'dts': frame.dts,
+                'time': frame.time,
+                'is_corrupt': frame.is_corrupt,
+                'key_frame': frame.key_frame,
+                'pict_type': frame.pict_type,
+                'interlaced_frame': frame.interlaced_frame,
+            }
+            result['frame_attrs'] = attrs
+        else:
+            result.update({'frame_idx': self.next_pos, 'pos_msec': pts_msec, 'pos_frame': video_idx})
         self.next_pos += 1
         return result
 
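A usage sketch for the new `all_frame_attrs` flag (table and view names are illustrative):

    import pixeltable as pxt
    from pixeltable.iterators import FrameIterator

    videos = pxt.create_table('videos', {'video': pxt.Video})
    frames = pxt.create_view(
        'frames',
        videos,
        iterator=FrameIterator.create(video=videos.video, fps=1.0, all_frame_attrs=True),
    )
    # Each frame row now carries a Json `frame_attrs` column instead of the
    # separate frame_idx / pos_msec / pos_frame columns.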
pixeltable/plan.py
CHANGED
@@ -394,9 +394,6 @@ class Planner:
             row_builder, computed_exprs, plan.output_exprs, input=plan, maintain_input_order=False
         )
 
-        stored_col_info = row_builder.output_slot_idxs()
-        stored_img_col_info = [info for info in stored_col_info if info.col.col_type.is_image_type()]
-        plan.set_stored_img_cols(stored_img_col_info)
         plan.set_ctx(
             exec.ExecContext(
                 row_builder,
@@ -428,10 +425,6 @@
         col = tbl.cols_by_name[col_name]
         plan.row_builder.add_table_column(col, expr.slot_idx)
 
-        stored_col_info = plan.row_builder.output_slot_idxs()
-        stored_img_col_info = [info for info in stored_col_info if info.col.col_type.is_image_type()]
-        plan.set_stored_img_cols(stored_img_col_info)
-
         plan.set_ctx(
             exec.ExecContext(
                 plan.row_builder, batch_size=0, show_pbar=True, num_computed_exprs=0, ignore_errors=ignore_errors
@@ -657,10 +650,6 @@
         for i, col in enumerate(copied_cols + list(recomputed_cols)):  # same order as select_list
             plan.row_builder.add_table_column(col, select_list[i].slot_idx)
         # TODO: avoid duplication with view_load_plan() logic (where does this belong?)
-        stored_img_col_info = [
-            info for info in plan.row_builder.output_slot_idxs() if info.col.col_type.is_image_type()
-        ]
-        plan.set_stored_img_cols(stored_img_col_info)
         return plan
 
     @classmethod
@@ -727,8 +716,6 @@
             row_builder, output_exprs=view_output_exprs, input_exprs=base_output_exprs, input=plan
         )
 
-        stored_img_col_info = [info for info in row_builder.output_slot_idxs() if info.col.col_type.is_image_type()]
-        plan.set_stored_img_cols(stored_img_col_info)
         exec_ctx.ignore_errors = True
         plan.set_ctx(exec_ctx)
         return plan, len(row_builder.default_eval_ctx.target_exprs)
@@ -1053,7 +1040,4 @@
         computed_exprs = row_builder.output_exprs - row_builder.input_exprs
         plan.ctx.num_computed_exprs = len(computed_exprs)  # we are adding a computed column, so we need to evaluate it
 
-        # we want to flush images
-        if col.is_computed and col.is_stored and col.col_type.is_image_type():
-            plan.set_stored_img_cols(row_builder.output_slot_idxs())
         return plan
pixeltable/share/packager.py
CHANGED
@@ -24,7 +24,7 @@ from pixeltable.env import Env
 from pixeltable.metadata import schema
 from pixeltable.utils import sha256sum
 from pixeltable.utils.formatter import Formatter
-from pixeltable.utils.media_store import MediaStore
+from pixeltable.utils.media_store import MediaStore, TempStore
 
 _logger = logging.getLogger('pixeltable')
 
@@ -57,7 +57,7 @@ class TablePackager:
 
     def __init__(self, table: catalog.Table, additional_md: Optional[dict[str, Any]] = None) -> None:
         self.table = table
-        self.tmp_dir =
+        self.tmp_dir = TempStore.create_path()
         self.media_files = {}
 
         # Load metadata
@@ -92,10 +92,10 @@ class TablePackager:
         self.bundle_path = self.__build_tarball()
 
         _logger.info('Extracting preview data.')
-        self.md['
+        self.md['row_count'] = self.table.count()
         preview_header, preview = self.__extract_preview_data()
         self.md['preview_header'] = preview_header
-        self.md['
+        self.md['preview_data'] = preview
 
         _logger.info(f'Packaging complete: {self.bundle_path}')
         return self.bundle_path
@@ -335,7 +335,7 @@ class TableRestorer:
     def __init__(self, tbl_path: str, md: Optional[dict[str, Any]] = None) -> None:
         self.tbl_path = tbl_path
         self.md = md
-        self.tmp_dir =
+        self.tmp_dir = TempStore.create_path()
         self.media_files = {}
 
     def restore(self, bundle_path: Path) -> pxt.Table:
@@ -459,42 +459,51 @@ class TableRestorer:
             for col_name, col in temp_cols.items()
             if col_name not in system_col_names and col_name not in media_col_names
         ]
-        [36 removed lines whose content was not captured in this diff view; only a closing ')' survives]
+
+        q: sql.Executable
+
+        assert len(value_store_cols) == len(value_temp_cols)
+        if len(value_store_cols) > 0:
+            mismatch_predicates = [
+                store_col != temp_col for store_col, temp_col in zip(value_store_cols, value_temp_cols)
+            ]
+            mismatch_clause = sql.or_(*mismatch_predicates)
+
+            # This query looks for rows that have matching primary keys (rowid + pos_k + v_min), but differ in at least
+            # one value column. Pseudo-SQL:
+            #
+            # SELECT store_tbl.col_0, ..., store_tbl.col_n, temp_tbl.col_0, ..., temp_tbl.col_n
+            # FROM store_tbl, temp_tbl
+            # WHERE store_tbl.rowid = temp_tbl.rowid
+            #   AND store_tbl.pos_0 = temp_tbl.pos_0
+            #   AND ... AND store_tbl.pos_k = temp_tbl.pos_k
+            #   AND store_tbl.v_min = temp_tbl.v_min
+            #   AND (
+            #     store_tbl.col_0 != temp_tbl.col_0
+            #     OR store_tbl.col_1 != temp_tbl.col_1
+            #     OR ... OR store_tbl.col_n != temp_tbl.col_n
+            #   )
+            #
+            # The value column comparisons (store_tbl.col_0 != temp_tbl.col_0, etc.) will always be false for rows where
+            # either column is NULL; this is what we want, since it may indicate a column that is present in one version
+            # but not the other.
+            q = sql.select(*value_store_cols, *value_temp_cols).where(pk_clause).where(mismatch_clause)
+            _logger.debug(q.compile())
+            result = conn.execute(q)
+            if result.rowcount > 0:
+                _logger.debug(
+                    f'Data corruption error between {temp_sa_tbl_name!r} and {store_sa_tbl_name!r}: '
+                    f'{result.rowcount} inconsistent row(s).'
+                )
+                row = result.first()
+                _logger.debug('Example mismatch:')
+                _logger.debug(f'{store_sa_tbl_name}: {row[: len(value_store_cols)]}')
+                _logger.debug(f'{temp_sa_tbl_name}: {row[len(value_store_cols) :]}')
+                raise excs.Error(
+                    'Data corruption error: '
+                    'the replica data are inconsistent with data retrieved from a previous replica.'
+                )
+
         _logger.debug(f'Verified data integrity between {store_sa_tbl_name!r} and {temp_sa_tbl_name!r}.')
 
         # Now rectify the v_max values in the temporary table.
@@ -610,7 +619,7 @@
             # in self.media_files.
             src_path = self.tmp_dir / 'media' / parsed_url.netloc
             # Move the file to the media store and update the URL.
-            self.media_files[url] = MediaStore.relocate_local_media_file(src_path, media_col)
+            self.media_files[url] = MediaStore.get().relocate_local_media_file(src_path, media_col)
             return self.media_files[url]
         # For any type of URL other than a local file, just return the URL as-is.
         return url