pixeltable 0.2.7__py3-none-any.whl → 0.2.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pixeltable might be problematic.
- pixeltable/__init__.py +15 -33
- pixeltable/__version__.py +2 -2
- pixeltable/catalog/catalog.py +1 -1
- pixeltable/catalog/column.py +28 -16
- pixeltable/catalog/dir.py +2 -2
- pixeltable/catalog/insertable_table.py +5 -55
- pixeltable/catalog/named_function.py +2 -2
- pixeltable/catalog/schema_object.py +2 -7
- pixeltable/catalog/table.py +298 -204
- pixeltable/catalog/table_version.py +104 -139
- pixeltable/catalog/table_version_path.py +22 -4
- pixeltable/catalog/view.py +20 -10
- pixeltable/dataframe.py +128 -25
- pixeltable/env.py +21 -14
- pixeltable/exec/exec_context.py +5 -0
- pixeltable/exec/exec_node.py +1 -0
- pixeltable/exec/in_memory_data_node.py +29 -24
- pixeltable/exec/sql_scan_node.py +1 -1
- pixeltable/exprs/column_ref.py +13 -8
- pixeltable/exprs/data_row.py +4 -0
- pixeltable/exprs/expr.py +16 -1
- pixeltable/exprs/function_call.py +4 -4
- pixeltable/exprs/row_builder.py +29 -20
- pixeltable/exprs/similarity_expr.py +4 -3
- pixeltable/ext/functions/yolox.py +2 -1
- pixeltable/func/__init__.py +1 -0
- pixeltable/func/aggregate_function.py +14 -12
- pixeltable/func/callable_function.py +8 -6
- pixeltable/func/expr_template_function.py +13 -19
- pixeltable/func/function.py +3 -6
- pixeltable/func/query_template_function.py +84 -0
- pixeltable/func/signature.py +68 -23
- pixeltable/func/udf.py +13 -10
- pixeltable/functions/__init__.py +6 -91
- pixeltable/functions/eval.py +26 -14
- pixeltable/functions/fireworks.py +25 -23
- pixeltable/functions/globals.py +62 -0
- pixeltable/functions/huggingface.py +20 -16
- pixeltable/functions/image.py +170 -1
- pixeltable/functions/openai.py +95 -128
- pixeltable/functions/string.py +10 -2
- pixeltable/functions/together.py +95 -84
- pixeltable/functions/util.py +16 -0
- pixeltable/functions/video.py +94 -16
- pixeltable/functions/whisper.py +78 -0
- pixeltable/globals.py +1 -1
- pixeltable/io/__init__.py +10 -0
- pixeltable/io/external_store.py +370 -0
- pixeltable/io/globals.py +50 -22
- pixeltable/{datatransfer → io}/label_studio.py +279 -166
- pixeltable/io/parquet.py +1 -1
- pixeltable/iterators/__init__.py +9 -0
- pixeltable/iterators/string.py +40 -0
- pixeltable/metadata/__init__.py +6 -8
- pixeltable/metadata/converters/convert_10.py +2 -4
- pixeltable/metadata/converters/convert_12.py +7 -2
- pixeltable/metadata/converters/convert_13.py +6 -8
- pixeltable/metadata/converters/convert_14.py +2 -4
- pixeltable/metadata/converters/convert_15.py +40 -25
- pixeltable/metadata/converters/convert_16.py +18 -0
- pixeltable/metadata/converters/util.py +11 -8
- pixeltable/metadata/schema.py +3 -6
- pixeltable/plan.py +8 -7
- pixeltable/store.py +1 -1
- pixeltable/tool/create_test_db_dump.py +145 -54
- pixeltable/tool/embed_udf.py +9 -0
- pixeltable/type_system.py +1 -2
- pixeltable/utils/code.py +34 -0
- {pixeltable-0.2.7.dist-info → pixeltable-0.2.9.dist-info}/METADATA +2 -2
- pixeltable-0.2.9.dist-info/RECORD +131 -0
- pixeltable/datatransfer/__init__.py +0 -1
- pixeltable/datatransfer/remote.py +0 -113
- pixeltable/functions/pil/image.py +0 -147
- pixeltable-0.2.7.dist-info/RECORD +0 -126
- {pixeltable-0.2.7.dist-info → pixeltable-0.2.9.dist-info}/LICENSE +0 -0
- {pixeltable-0.2.7.dist-info → pixeltable-0.2.9.dist-info}/WHEEL +0 -0
pixeltable/functions/video.py
CHANGED

@@ -1,11 +1,16 @@
+import tempfile
 import uuid
+from pathlib import Path
 from typing import Optional

+import PIL.Image
 import av
+import numpy as np

 import pixeltable.env as env
 import pixeltable.func as func
 import pixeltable.type_system as ts
+from pixeltable.utils.code import local_public_names

 _format_defaults = {  # format -> (codec, ext)
     'wav': ('pcm_s16le', 'wav'),
@@ -30,6 +35,43 @@ _format_defaults = {  # format -> (codec, ext)
 # output_container.mux(packet)


+@func.uda(
+    init_types=[ts.IntType()],
+    update_types=[ts.ImageType()],
+    value_type=ts.VideoType(),
+    requires_order_by=True,
+    allows_window=False,
+)
+class make_video(func.Aggregator):
+    def __init__(self, fps: int = 25):
+        """follows https://pyav.org/docs/develop/cookbook/numpy.html#generating-video"""
+        self.container: Optional[av.container.OutputContainer] = None
+        self.stream: Optional[av.stream.Stream] = None
+        self.fps = fps
+
+    def update(self, frame: PIL.Image.Image) -> None:
+        if frame is None:
+            return
+        if self.container is None:
+            (_, output_filename) = tempfile.mkstemp(suffix='.mp4', dir=str(env.Env.get().tmp_dir))
+            self.out_file = Path(output_filename)
+            self.container = av.open(str(self.out_file), mode='w')
+            self.stream = self.container.add_stream('h264', rate=self.fps)
+            self.stream.pix_fmt = 'yuv420p'
+            self.stream.width = frame.width
+            self.stream.height = frame.height
+
+        av_frame = av.VideoFrame.from_ndarray(np.array(frame.convert('RGB')), format='rgb24')
+        for packet in self.stream.encode(av_frame):
+            self.container.mux(packet)
+
+    def value(self) -> str:
+        for packet in self.stream.encode():
+            self.container.mux(packet)
+        self.container.close()
+        return str(self.out_file)
+
+
 _extract_audio_param_types = [
     ts.VideoType(nullable=False),
     ts.IntType(nullable=False),
@@ -75,26 +117,62 @@ def get_metadata(video: str) -> dict:
     """
     with av.open(video) as container:
         assert isinstance(container, av.container.InputContainer)
-
-            {
-                'duration': stream.duration,
-                'frames': stream.frames,
-                'language': stream.language,
-                'average_rate': float(stream.average_rate) if stream.average_rate is not None else None,
-                'base_rate': float(stream.base_rate) if stream.base_rate is not None else None,
-                'guessed_rate': float(stream.guessed_rate) if stream.guessed_rate is not None else None,
-                'pix_fmt': getattr(stream.codec_context, 'pix_fmt', None),
-                'width': stream.width,
-                'height': stream.height,
-            }
-            for stream in container.streams
-            if isinstance(stream, av.video.stream.VideoStream)
-        ]
+        streams_info = [__get_stream_metadata(stream) for stream in container.streams]
         result = {
             'bit_exact': container.bit_exact,
             'bit_rate': container.bit_rate,
             'size': container.size,
             'metadata': container.metadata,
-            'streams':
+            'streams': streams_info,
         }
         return result
+
+
+def __get_stream_metadata(stream: av.stream.Stream) -> dict:
+    if stream.type != 'audio' and stream.type != 'video':
+        return {'type': stream.type}  # Currently unsupported
+
+    codec_context = stream.codec_context
+    codec_context_md = {
+        'name': codec_context.name,
+        'codec_tag': codec_context.codec_tag.encode('unicode-escape').decode('utf-8'),
+        'profile': codec_context.profile,
+    }
+    metadata = {
+        'type': stream.type,
+        'duration': stream.duration,
+        'time_base': float(stream.time_base) if stream.time_base is not None else None,
+        'duration_seconds': float(stream.duration * stream.time_base)
+        if stream.duration is not None and stream.time_base is not None
+        else None,
+        'frames': stream.frames,
+        'metadata': stream.metadata,
+        'codec_context': codec_context_md,
+    }
+
+    if stream.type == 'audio':
+        # Additional metadata for audio
+        codec_context_md['channels'] = int(codec_context.channels) if codec_context.channels is not None else None
+    else:
+        assert stream.type == 'video'
+        # Additional metadata for video
+        codec_context_md['pix_fmt'] = getattr(stream.codec_context, 'pix_fmt', None)
+        metadata.update(
+            **{
+                'width': stream.width,
+                'height': stream.height,
+                'frames': stream.frames,
+                'average_rate': float(stream.average_rate) if stream.average_rate is not None else None,
+                'base_rate': float(stream.base_rate) if stream.base_rate is not None else None,
+                'guessed_rate': float(stream.guessed_rate) if stream.guessed_rate is not None else None,
+            }
+        )
+
+    return metadata
+
+
+__all__ = local_public_names(__name__)
+
+
+def __dir__():
+    return __all__
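The new make_video aggregator assembles ordered image frames into an H.264 video, and get_metadata now delegates per-stream details to __get_stream_metadata (covering audio as well as video streams). A hedged usage sketch; the table, view, and column names below are illustrative, not part of the diff:

import pixeltable as pxt
from pixeltable.functions.video import get_metadata, make_video
from pixeltable.iterators import FrameIterator

videos = pxt.create_table('videos_demo', {'video': pxt.VideoType()})
# One row per extracted frame; the view's `pos` column records frame order.
frames = pxt.create_view(
    'frames_demo', videos, iterator=FrameIterator.create(video=videos.video, fps=1))
# Re-encode the frames back into one video per source row, ordered by `pos`.
frames.select(make_video(frames.pos, frames.frame)).group_by(videos).show()
# The refactored get_metadata() reports one entry per stream, audio included.
videos.select(get_metadata(videos.video)).show()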
pixeltable/functions/whisper.py
ADDED

@@ -0,0 +1,78 @@
+from typing import TYPE_CHECKING, Optional
+
+import pixeltable as pxt
+
+if TYPE_CHECKING:
+    from whisper import Whisper
+
+
+@pxt.udf(
+    param_types=[
+        pxt.AudioType(),
+        pxt.StringType(),
+        pxt.JsonType(nullable=True),
+        pxt.FloatType(nullable=True),
+        pxt.FloatType(nullable=True),
+        pxt.FloatType(nullable=True),
+        pxt.BoolType(),
+        pxt.StringType(nullable=True),
+        pxt.BoolType(),
+        pxt.StringType(),
+        pxt.StringType(),
+        pxt.StringType(),
+        pxt.FloatType(nullable=True),
+        pxt.JsonType(nullable=True),
+    ]
+)
+def transcribe(
+    audio: str,
+    *,
+    model: str,
+    temperature: Optional[list[float]] = [0.0, 0.2, 0.4, 0.6, 0.8, 1.0],
+    compression_ratio_threshold: Optional[float] = 2.4,
+    logprob_threshold: Optional[float] = -1.0,
+    no_speech_threshold: Optional[float] = 0.6,
+    condition_on_previous_text: bool = True,
+    initial_prompt: Optional[str] = None,
+    word_timestamps: bool = False,
+    prepend_punctuations: str = '"\'“¿([{-',
+    append_punctuations: str = '"\'.。,,!!??::”)]}、',
+    clip_timestamps: str = '0',
+    hallucination_silence_threshold: Optional[float] = None,
+    decode_options: Optional[dict] = None,
+) -> dict:
+    import torch
+
+    if decode_options is None:
+        decode_options = {}
+    device = 'cuda' if torch.cuda.is_available() else 'cpu'
+    model = _lookup_model(model, device)
+    result = model.transcribe(
+        audio,
+        temperature=tuple(temperature),
+        compression_ratio_threshold=compression_ratio_threshold,
+        logprob_threshold=logprob_threshold,
+        no_speech_threshold=no_speech_threshold,
+        condition_on_previous_text=condition_on_previous_text,
+        initial_prompt=initial_prompt,
+        word_timestamps=word_timestamps,
+        prepend_punctuations=prepend_punctuations,
+        append_punctuations=append_punctuations,
+        clip_timestamps=clip_timestamps,
+        hallucination_silence_threshold=hallucination_silence_threshold,
+        **decode_options,
+    )
+    return result
+
+
+def _lookup_model(model_id: str, device: str) -> 'Whisper':
+    import whisper
+
+    key = (model_id, device)
+    if key not in _model_cache:
+        model = whisper.load_model(model_id, device)
+        _model_cache[key] = model
+    return _model_cache[key]
+
+
+_model_cache: dict[tuple[str, str], 'Whisper'] = {}
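The new whisper module wraps local (in-process) Whisper inference as a Pixeltable UDF; _lookup_model caches one loaded model per (model_id, device) pair, so repeated rows reuse it. A minimal usage sketch, assuming the openai-whisper package is installed; the table and column names are invented for illustration:

import pixeltable as pxt
from pixeltable.functions.whisper import transcribe

audio_tbl = pxt.create_table('audio_demo', {'audio': pxt.AudioType()})
# Computed column: every inserted row is transcribed with the `base.en` model.
audio_tbl.add_column(transcription=transcribe(audio_tbl.audio, model='base.en'))
# The stored value is whisper's result dict; the transcript text is under 'text'.
audio_tbl.select(audio_tbl.transcription['text']).collect()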
pixeltable/globals.py
CHANGED

@@ -213,7 +213,7 @@ def move(path: str, new_path: str) -> None:
     obj = Catalog.get().paths[p]
     Catalog.get().paths.move(p, new_p)
     new_dir = Catalog.get().paths[new_p.parent]
-    obj.
+    obj._move(new_p.name, new_dir._id)


 def drop_table(path: str, force: bool = False, ignore_errors: bool = False) -> None:
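The move() entry point now delegates to the catalog object's internal _move() (the old line is shown truncated by the diff viewer). For reference, a sketch of the user-facing call this affects, with illustrative paths:

import pixeltable as pxt

pxt.create_dir('experiments')
pxt.create_table('scratch_tbl', {'x': pxt.IntType()})
# Rename/relocate the table within the catalog; internally this calls obj._move(...).
pxt.move('scratch_tbl', 'experiments.scratch_tbl')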
pixeltable/io/__init__.py
CHANGED

@@ -1,4 +1,14 @@
+from .external_store import ExternalStore, SyncStatus
 from .globals import create_label_studio_project
 from .hf_datasets import import_huggingface_dataset
 from .pandas import import_csv, import_excel, import_pandas
 from .parquet import import_parquet
+
+
+__default_dir = set(symbol for symbol in dir() if not symbol.startswith('_'))
+__removed_symbols = {'globals', 'hf_datasets', 'pandas', 'parquet'}
+__all__ = sorted(list(__default_dir - __removed_symbols))
+
+
+def __dir__():
+    return __all__
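The added __all__/__dir__ boilerplate hides the submodule names (globals, hf_datasets, pandas, parquet) from dir(pixeltable.io) and tab completion while keeping the re-exported symbols visible. The same idiom in isolation; the package and symbol names here are invented for illustration:

# mypkg/__init__.py
from .impl import useful_fn  # importing also binds the submodule name `impl`

__default_dir = set(s for s in dir() if not s.startswith('_'))
__removed_symbols = {'impl'}  # submodule names we don't want to advertise
__all__ = sorted(__default_dir - __removed_symbols)


def __dir__():
    # dir(mypkg) now returns exactly the names in __all__
    return __all__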
pixeltable/io/external_store.py
ADDED

@@ -0,0 +1,370 @@
+from __future__ import annotations
+
+import abc
+import itertools
+import logging
+import time
+from dataclasses import dataclass
+from typing import Any, Optional
+from uuid import UUID
+
+import pixeltable.exceptions as excs
+import pixeltable.type_system as ts
+from pixeltable import Table, Column
+import sqlalchemy as sql
+
+from pixeltable.catalog import TableVersion
+
+_logger = logging.getLogger('pixeltable')
+
+
+class ExternalStore(abc.ABC):
+    """
+    Abstract base class that represents an external data store that is linked to a Pixeltable
+    table. Subclasses of `ExternalStore` provide functionality for synchronizing between Pixeltable
+    and stateful external stores.
+    """
+
+    def __init__(self, name: str) -> None:
+        self.__name = name
+
+    @property
+    def name(self) -> str:
+        return self.__name
+
+    @abc.abstractmethod
+    def link(self, tbl_version: TableVersion, conn: sql.Connection) -> None:
+        """
+        Called by `TableVersion.link()` to implement store-specific logic.
+        """
+
+    @abc.abstractmethod
+    def unlink(self, tbl_version: TableVersion, conn: sql.Connection) -> None:
+        """
+        Called by `TableVersion.unlink()` to implement store-specific logic.
+        """
+
+    @abc.abstractmethod
+    def get_local_columns(self) -> list[Column]:
+        """
+        Gets a list of all local (Pixeltable) columns that are associated with this external store.
+        """
+
+    @abc.abstractmethod
+    def sync(self, t: Table, export_data: bool, import_data: bool) -> SyncStatus:
+        """
+        Called by `Table.sync()` to implement store-specific synchronization logic.
+        """
+
+    @abc.abstractmethod
+    def as_dict(self) -> dict[str, Any]: ...
+
+    @classmethod
+    @abc.abstractmethod
+    def from_dict(cls, md: dict[str, Any]) -> ExternalStore: ...
+
+
+class Project(ExternalStore, abc.ABC):
+    """
+    An `ExternalStore` that represents a labeling project. Extends `ExternalStore` with a few
+    additional capabilities specific to such projects.
+    """
+    def __init__(self, name: str, col_mapping: dict[Column, str], stored_proxies: Optional[dict[Column, Column]]):
+        super().__init__(name)
+        self._col_mapping = col_mapping
+
+        # A mapping from original columns to proxy columns. A proxy column is an identical copy of a column that is
+        # guaranteed to be stored; the Project will dynamically create and tear down proxy columns as needed. There
+        # are two reasons this might happen:
+        # (i) to force computed media data to be persisted; or
+        # (ii) to force media data to be materialized in a particular location.
+        # For each entry (k, v) in the dict, `v` is the stored proxy column for `k`. The proxy column `v` will
+        # necessarily be a column of the table to which this project is linked, but `k` need not be; it might be a
+        # column of a base table.
+        # Note from aaron-siegel: This methodology is inefficient in the case where a table has many views with a high
+        # proportion of overlapping rows, all proxying the same base column.
+        if stored_proxies is None:
+            self.stored_proxies: dict[Column, Column] = {}
+        else:
+            self.stored_proxies = stored_proxies
+
+    def get_local_columns(self) -> list[Column]:
+        return list(self.col_mapping.keys())
+
+    def link(self, tbl_version: TableVersion, conn: sql.Connection) -> None:
+        # All of the media columns being linked need to either be stored computed columns, or else have stored proxies.
+        # This ensures that the media in those columns resides in the media store.
+        # First determine which columns (if any) need stored proxies, but don't have one yet.
+        stored_proxies_needed: list[Column] = []
+        for col in self.col_mapping.keys():
+            if col.col_type.is_media_type() and not (col.is_stored and col.is_computed):
+                # If this column is already proxied in some other Project, use the existing proxy to avoid
+                # duplication. Otherwise, we'll create a new one.
+                for store in tbl_version.external_stores.values():
+                    if isinstance(store, Project) and col in store.stored_proxies:
+                        self.stored_proxies[col] = store.stored_proxies[col]
+                        break
+                if col not in self.stored_proxies:
+                    # We didn't find it in an existing Project
+                    stored_proxies_needed.append(col)
+        if len(stored_proxies_needed) > 0:
+            _logger.info(f'Creating stored proxies for columns: {[col.name for col in stored_proxies_needed]}')
+            # Create stored proxies for columns that need one. Increment the schema version
+            # accordingly.
+            tbl_version.version += 1
+            preceding_schema_version = tbl_version.schema_version
+            tbl_version.schema_version = tbl_version.version
+            proxy_cols = [self.create_stored_proxy(tbl_version, col) for col in stored_proxies_needed]
+            # Add the columns; this will also update table metadata.
+            tbl_version._add_columns(proxy_cols, conn)
+            # We don't need to retain `UpdateStatus` since the stored proxies are intended to be
+            # invisible to the user.
+            tbl_version._update_md(time.time(), conn, preceding_schema_version=preceding_schema_version)
+
+    def unlink(self, tbl_version: TableVersion, conn: sql.Connection) -> None:
+        # Determine which stored proxies can be deleted. (A stored proxy can be deleted if it is not referenced by
+        # any *other* external store for this table.)
+        deletions_needed: set[Column] = set(self.stored_proxies.values())
+        for name, store in tbl_version.external_stores.items():
+            if name != self.name:
+                deletions_needed = deletions_needed.difference(set(store.stored_proxies.values()))
+        if len(deletions_needed) > 0:
+            _logger.info(f'Removing stored proxies for columns: {[col.name for col in deletions_needed]}')
+            # Delete stored proxies that are no longer needed.
+            tbl_version.version += 1
+            preceding_schema_version = tbl_version.schema_version
+            tbl_version.schema_version = tbl_version.version
+            tbl_version._drop_columns(deletions_needed)
+            self.stored_proxies.clear()
+            tbl_version._update_md(time.time(), conn, preceding_schema_version=preceding_schema_version)
+
+    def create_stored_proxy(self, tbl_version: TableVersion, col: Column) -> Column:
+        """
+        Creates a proxy column for the specified column. The proxy column will be created in the specified
+        `TableVersion`.
+        """
+        from pixeltable import exprs
+
+        assert col.col_type.is_media_type() and not (col.is_stored and col.is_computed) and col not in self.stored_proxies
+        proxy_col = Column(
+            name=None,
+            # Force images in the proxy column to be materialized inside the media store, in a normalized format.
+            # TODO(aaron-siegel): This is a temporary solution and it will be replaced by a proper `destination`
+            # parameter for computed columns. Among other things, this solution does not work for video or audio.
+            # Once `destination` is implemented, it can be replaced with a simple `ColumnRef`.
+            computed_with=exprs.ColumnRef(col).apply(lambda x: x, col_type=col.col_type),
+            stored=True,
+            col_id=tbl_version.next_col_id,
+            sa_col_type=col.col_type.to_sa_type(),
+            schema_version_add=tbl_version.schema_version
+        )
+        proxy_col.tbl = tbl_version
+        tbl_version.next_col_id += 1
+        self.stored_proxies[col] = proxy_col
+        return proxy_col
+
+    @property
+    def col_mapping(self) -> dict[Column, str]:
+        return self._col_mapping
+
+    @abc.abstractmethod
+    def get_export_columns(self) -> dict[str, ts.ColumnType]:
+        """
+        Returns the names and Pixeltable types that this `Project` expects to see in a data export. The keys
+        of the `dict` are the names of data fields in the external store, not Pixeltable columns.
+
+        Returns:
+            A `dict` mapping names of external data fields to their expected Pixeltable types.
+        """
+
+    @abc.abstractmethod
+    def get_import_columns(self) -> dict[str, ts.ColumnType]:
+        """
+        Returns the names and Pixeltable types that this `Project` provides in a data import.
+
+        Returns:
+            A `dict` mapping names of provided columns to their Pixeltable types.
+        """
+
+    @abc.abstractmethod
+    def delete(self) -> None:
+        """
+        Deletes this `Project` and all associated (externally stored) data.
+        """
+
+    @classmethod
+    def validate_columns(
+        cls,
+        table: Table,
+        export_cols: dict[str, ts.ColumnType],
+        import_cols: dict[str, ts.ColumnType],
+        col_mapping: Optional[dict[str, str]]
+    ) -> dict[Column, str]:
+        """
+        Verifies that the specified `col_mapping` is valid. In particular, checks that:
+        (i) the keys of `col_mapping` are valid columns of the specified `Table`;
+        (ii) the values of `col_mapping` are valid external columns (i.e., they appear in either `export_cols` or
+            `import_cols`); and
+        (iii) the Pixeltable types of the `col_mapping` keys are consistent with the expected types of the corresponding
+            external (import or export) columns.
+        If validation fails, an exception will be raised. If validation succeeds, a new mapping will be returned
+        in which the Pixeltable column names are resolved to the corresponding `Column` objects.
+        """
+        is_user_specified_col_mapping = col_mapping is not None
+        if col_mapping is None:
+            col_mapping = {col: col for col in itertools.chain(export_cols.keys(), import_cols.keys())}
+
+        resolved_col_mapping: dict[Column, str] = {}
+
+        # Validate names
+        t_cols = table.column_names()
+        for t_col, ext_col in col_mapping.items():
+            if t_col not in t_cols:
+                if is_user_specified_col_mapping:
+                    raise excs.Error(
+                        f'Column name `{t_col}` appears as a key in `col_mapping`, but Table `{table.get_name()}` '
+                        'contains no such column.'
+                    )
+                else:
+                    raise excs.Error(
+                        f'Column `{t_col}` does not exist in Table `{table.get_name()}`. Either add a column `{t_col}`, '
+                        f'or specify a `col_mapping` to associate a different column with the external field `{ext_col}`.'
+                    )
+            if ext_col not in export_cols and ext_col not in import_cols:
+                raise excs.Error(
+                    f'Column name `{ext_col}` appears as a value in `col_mapping`, but the external store '
+                    f'configuration has no column `{ext_col}`.'
+                )
+            col = table[t_col].col
+            resolved_col_mapping[col] = ext_col
+        # Validate column specs
+        t_col_types = table.column_types()
+        for t_col, ext_col in col_mapping.items():
+            t_col_type = t_col_types[t_col]
+            if ext_col in export_cols:
+                # Validate that the table column can be assigned to the external column
+                ext_col_type = export_cols[ext_col]
+                if not ext_col_type.is_supertype_of(t_col_type):
+                    raise excs.Error(
+                        f'Column `{t_col}` cannot be exported to external column `{ext_col}` (incompatible types; expecting `{ext_col_type}`)'
+                    )
+            if ext_col in import_cols:
+                # Validate that the external column can be assigned to the table column
+                if table._tbl_version_path.get_column(t_col).is_computed:
+                    raise excs.Error(
+                        f'Column `{t_col}` is a computed column, which cannot be populated from an external column'
+                    )
+                ext_col_type = import_cols[ext_col]
+                if not t_col_type.is_supertype_of(ext_col_type):
+                    raise excs.Error(
+                        f'Column `{t_col}` cannot be imported from external column `{ext_col}` (incompatible types; expecting `{ext_col_type}`)'
+                    )
+        return resolved_col_mapping
+
+    @classmethod
+    def _column_as_dict(cls, col: Column) -> dict[str, Any]:
+        return {'tbl_id': str(col.tbl.id), 'col_id': col.id}
+
+    @classmethod
+    def _column_from_dict(cls, d: dict[str, Any]) -> Column:
+        from pixeltable.catalog import Catalog
+
+        tbl_id = UUID(d['tbl_id'])
+        col_id = d['col_id']
+        return Catalog.get().tbl_versions[(tbl_id, None)].cols_by_id[col_id]
+
+
+@dataclass(frozen=True)
+class SyncStatus:
+    external_rows_created: int = 0
+    external_rows_deleted: int = 0
+    external_rows_updated: int = 0
+    pxt_rows_updated: int = 0
+    num_excs: int = 0
+
+    def combine(self, other: 'SyncStatus') -> 'SyncStatus':
+        return SyncStatus(
+            external_rows_created=self.external_rows_created + other.external_rows_created,
+            external_rows_deleted=self.external_rows_deleted + other.external_rows_deleted,
+            external_rows_updated=self.external_rows_updated + other.external_rows_updated,
+            pxt_rows_updated=self.pxt_rows_updated + other.pxt_rows_updated,
+            num_excs=self.num_excs + other.num_excs
+        )
+
+    @classmethod
+    def empty(cls) -> 'SyncStatus':
+        return SyncStatus(0, 0, 0, 0, 0)
+
+
+class MockProject(Project):
+    """A project that cannot be synced, used mainly for testing."""
+    def __init__(
+        self,
+        name: str,
+        export_cols: dict[str, ts.ColumnType],
+        import_cols: dict[str, ts.ColumnType],
+        col_mapping: dict[Column, str],
+        stored_proxies: Optional[dict[Column, Column]] = None
+    ):
+        super().__init__(name, col_mapping, stored_proxies)
+        self.export_cols = export_cols
+        self.import_cols = import_cols
+        self.__is_deleted = False
+
+    @classmethod
+    def create(
+        cls,
+        t: Table,
+        name: str,
+        export_cols: dict[str, ts.ColumnType],
+        import_cols: dict[str, ts.ColumnType],
+        col_mapping: Optional[dict[str, str]] = None
+    ) -> 'MockProject':
+        col_mapping = cls.validate_columns(t, export_cols, import_cols, col_mapping)
+        return cls(name, export_cols, import_cols, col_mapping)
+
+    def get_export_columns(self) -> dict[str, ts.ColumnType]:
+        return self.export_cols
+
+    def get_import_columns(self) -> dict[str, ts.ColumnType]:
+        return self.import_cols
+
+    def sync(self, t: Table, export_data: bool, import_data: bool) -> NotImplemented:
+        raise NotImplementedError()
+
+    def delete(self) -> None:
+        self.__is_deleted = True
+
+    @property
+    def is_deleted(self) -> bool:
+        return self.__is_deleted
+
+    def as_dict(self) -> dict[str, Any]:
+        return {
+            'name': self.name,
+            'export_cols': {k: v.as_dict() for k, v in self.export_cols.items()},
+            'import_cols': {k: v.as_dict() for k, v in self.import_cols.items()},
+            'col_mapping': [[self._column_as_dict(k), v] for k, v in self.col_mapping.items()],
+            'stored_proxies': [[self._column_as_dict(k), self._column_as_dict(v)] for k, v in self.stored_proxies.items()]
+        }
+
+    @classmethod
+    def from_dict(cls, md: dict[str, Any]) -> MockProject:
+        return cls(
+            md['name'],
+            {k: ts.ColumnType.from_dict(v) for k, v in md['export_cols'].items()},
+            {k: ts.ColumnType.from_dict(v) for k, v in md['import_cols'].items()},
+            {cls._column_from_dict(entry[0]): entry[1] for entry in md['col_mapping']},
+            {cls._column_from_dict(entry[0]): cls._column_from_dict(entry[1]) for entry in md['stored_proxies']}
+        )
+
+    def __eq__(self, other: Any) -> bool:
+        if not isinstance(other, MockProject):
+            return False
+        return self.name == other.name
+
+    def __hash__(self) -> int:
+        return hash(self.name)
+
+    def __repr__(self) -> str:
+        return f'MockProject `{self.name}`'
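SyncStatus is a frozen dataclass, so per-store sync results can be folded together with combine() and compared by value. A small sketch using only the definitions added in this file:

from pixeltable.io import SyncStatus

total = SyncStatus.empty()
for status in (SyncStatus(external_rows_created=3), SyncStatus(pxt_rows_updated=2, num_excs=1)):
    total = total.combine(status)
# Field-by-field dataclass equality:
assert total == SyncStatus(external_rows_created=3, pxt_rows_updated=2, num_excs=1)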