pixeltable 0.4.2__py3-none-any.whl → 0.4.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pixeltable/__init__.py +1 -0
- pixeltable/__version__.py +2 -2
- pixeltable/catalog/__init__.py +2 -10
- pixeltable/catalog/catalog.py +64 -38
- pixeltable/catalog/column.py +22 -23
- pixeltable/catalog/globals.py +2 -148
- pixeltable/catalog/insertable_table.py +6 -4
- pixeltable/catalog/path.py +6 -0
- pixeltable/catalog/table.py +51 -32
- pixeltable/catalog/table_version.py +69 -45
- pixeltable/catalog/update_status.py +179 -0
- pixeltable/catalog/view.py +9 -2
- pixeltable/config.py +76 -12
- pixeltable/dataframe.py +1 -1
- pixeltable/env.py +29 -0
- pixeltable/exec/exec_node.py +7 -24
- pixeltable/exec/expr_eval/schedulers.py +134 -7
- pixeltable/exprs/column_property_ref.py +21 -9
- pixeltable/exprs/column_ref.py +5 -1
- pixeltable/exprs/function_call.py +2 -2
- pixeltable/exprs/row_builder.py +10 -9
- pixeltable/exprs/rowid_ref.py +0 -4
- pixeltable/func/function.py +3 -3
- pixeltable/functions/audio.py +36 -9
- pixeltable/functions/video.py +57 -10
- pixeltable/globals.py +61 -1
- pixeltable/io/__init__.py +1 -1
- pixeltable/io/external_store.py +3 -55
- pixeltable/io/globals.py +4 -4
- pixeltable/io/hf_datasets.py +10 -2
- pixeltable/io/label_studio.py +16 -16
- pixeltable/metadata/__init__.py +1 -1
- pixeltable/metadata/converters/convert_39.py +125 -0
- pixeltable/metadata/converters/util.py +3 -0
- pixeltable/metadata/notes.py +1 -0
- pixeltable/metadata/schema.py +14 -2
- pixeltable/plan.py +4 -0
- pixeltable/share/packager.py +20 -38
- pixeltable/store.py +18 -50
- pixeltable/type_system.py +2 -2
- pixeltable/utils/coroutine.py +6 -23
- pixeltable/utils/media_store.py +39 -0
- {pixeltable-0.4.2.dist-info → pixeltable-0.4.3.dist-info}/METADATA +1 -1
- {pixeltable-0.4.2.dist-info → pixeltable-0.4.3.dist-info}/RECORD +47 -45
- {pixeltable-0.4.2.dist-info → pixeltable-0.4.3.dist-info}/LICENSE +0 -0
- {pixeltable-0.4.2.dist-info → pixeltable-0.4.3.dist-info}/WHEEL +0 -0
- {pixeltable-0.4.2.dist-info → pixeltable-0.4.3.dist-info}/entry_points.txt +0 -0
pixeltable/functions/audio.py
CHANGED
@@ -1,14 +1,5 @@
 """
 Pixeltable [UDFs](https://pixeltable.readme.io/docs/user-defined-functions-udfs) for `AudioType`.
-
-Example:
-```python
-import pixeltable as pxt
-import pixeltable.functions as pxtf
-
-t = pxt.get_table(...)
-t.select(pxtf.audio.get_metadata()).collect()
-```
 """

 import pixeltable as pxt
@@ -19,6 +10,42 @@ from pixeltable.utils.code import local_public_names
 def get_metadata(audio: pxt.Audio) -> dict:
     """
     Gets various metadata associated with an audio file and returns it as a dictionary.
+
+    Args:
+        audio: The audio to get metadata for.
+
+    Returns:
+        A `dict` such as the following:
+
+        ```json
+        {
+            'size': 2568827,
+            'streams': [
+                {
+                    'type': 'audio',
+                    'frames': 0,
+                    'duration': 2646000,
+                    'metadata': {},
+                    'time_base': 2.2675736961451248e-05,
+                    'codec_context': {
+                        'name': 'flac',
+                        'profile': None,
+                        'channels': 1,
+                        'codec_tag': '\\x00\\x00\\x00\\x00',
+                    },
+                    'duration_seconds': 60.0,
+                }
+            ],
+            'bit_rate': 342510,
+            'metadata': {'encoder': 'Lavf61.1.100'},
+            'bit_exact': False,
+        }
+        ```
+
+    Examples:
+        Extract metadata for files in the `audio_col` column of the table `tbl`:
+
+        >>> tbl.select(tbl.audio_col.get_metadata()).collect()
     """
     return pxt.functions.video._get_metadata(audio)
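With the module-level example removed, usage now lives in the `get_metadata()` docstring itself. A minimal sketch of the documented call pattern; the table and column names here are hypothetical:

```python
import pixeltable as pxt

t = pxt.get_table('demo.audio_files')  # hypothetical table with an audio column

# Method-style call from the new docstring; returns one metadata dict per row.
t.select(t.audio_col.get_metadata()).collect()
```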
pixeltable/functions/video.py
CHANGED
@@ -1,14 +1,5 @@
 """
 Pixeltable [UDFs](https://pixeltable.readme.io/docs/user-defined-functions-udfs) for `VideoType`.
-
-Example:
-```python
-import pixeltable as pxt
-import pixeltable.functions as pxtf
-
-t = pxt.get_table(...)
-t.select(pxtf.video.extract_audio(t.video_col)).collect()
-```
 """

 import tempfile
@@ -92,12 +83,22 @@ def extract_audio(
     video_path: pxt.Video, stream_idx: int = 0, format: str = 'wav', codec: Optional[str] = None
 ) -> pxt.Audio:
     """
-    Extract an audio stream from a video
+    Extract an audio stream from a video.

     Args:
         stream_idx: Index of the audio stream to extract.
         format: The target audio format. (`'wav'`, `'mp3'`, `'flac'`).
         codec: The codec to use for the audio stream. If not provided, a default codec will be used.
+
+    Returns:
+        The extracted audio.
+
+    Examples:
+        Add a computed column to a table `tbl` that extracts audio from an existing column `video_col`:
+
+        >>> tbl.add_computed_column(
+        ...     extracted_audio=tbl.video_col.extract_audio(format='flac')
+        ... )
     """
     if format not in _format_defaults:
         raise ValueError(f'extract_audio(): unsupported audio format: {format}')
@@ -124,6 +125,52 @@ def extract_audio(
 def get_metadata(video: pxt.Video) -> dict:
     """
     Gets various metadata associated with a video file and returns it as a dictionary.
+
+    Args:
+        video: The video to get metadata for.
+
+    Returns:
+        A `dict` such as the following:
+
+        ```json
+        {
+            'bit_exact': False,
+            'bit_rate': 967260,
+            'size': 2234371,
+            'metadata': {
+                'encoder': 'Lavf60.16.100',
+                'major_brand': 'isom',
+                'minor_version': '512',
+                'compatible_brands': 'isomiso2avc1mp41',
+            },
+            'streams': [
+                {
+                    'type': 'video',
+                    'width': 640,
+                    'height': 360,
+                    'frames': 462,
+                    'time_base': 1.0 / 12800,
+                    'duration': 236544,
+                    'duration_seconds': 236544.0 / 12800,
+                    'average_rate': 25.0,
+                    'base_rate': 25.0,
+                    'guessed_rate': 25.0,
+                    'metadata': {
+                        'language': 'und',
+                        'handler_name': 'L-SMASH Video Handler',
+                        'vendor_id': '[0][0][0][0]',
+                        'encoder': 'Lavc60.31.102 libx264',
+                    },
+                    'codec_context': {'name': 'h264', 'codec_tag': 'avc1', 'profile': 'High', 'pix_fmt': 'yuv420p'},
+                }
+            ],
+        }
+        ```
+
+    Examples:
+        Extract metadata for files in the `video_col` column of the table `tbl`:
+
+        >>> tbl.select(tbl.video_col.get_metadata()).collect()
     """
     return _get_metadata(video)
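The `extract_audio()` docstring now demonstrates the computed-column workflow. A short sketch combining both documented calls; the table and column names are hypothetical:

```python
import pixeltable as pxt

t = pxt.get_table('demo.videos')  # hypothetical table with a video column

# Persist the extracted audio as a computed column (from the new docstring).
t.add_computed_column(extracted_audio=t.video_col.extract_audio(format='flac'))

# Inspect per-file metadata for the video column.
t.select(t.video_col.get_metadata()).collect()
```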
pixeltable/globals.py
CHANGED
@@ -11,6 +11,7 @@ from pandas.io.formats.style import Styler
 from pixeltable import DataFrame, catalog, exceptions as excs, exprs, func, share
 from pixeltable.catalog import Catalog, TableVersionPath
 from pixeltable.catalog.insertable_table import OnErrorParameter
+from pixeltable.config import Config
 from pixeltable.env import Env
 from pixeltable.iterators import ComponentIterator

@@ -34,8 +35,11 @@ if TYPE_CHECKING:
 _logger = logging.getLogger('pixeltable')


-def init() -> None:
+def init(config_overrides: Optional[dict[str, Any]] = None) -> None:
     """Initializes the Pixeltable environment."""
+    if config_overrides is None:
+        config_overrides = {}
+    Config.init(config_overrides)
     _ = Catalog.get()


@@ -633,6 +637,62 @@ def drop_dir(path: str, force: bool = False, if_not_exists: Literal['error', 'ig
     Catalog.get().drop_dir(path_obj, if_not_exists=if_not_exists_, force=force)


+def ls(path: str = '') -> pd.DataFrame:
+    """
+    List the contents of a Pixeltable directory.
+
+    This function returns a Pandas DataFrame representing a human-readable listing of the specified directory,
+    including various attributes such as version and base table, as appropriate.
+
+    To get a programmatic list of tables and/or directories, use [list_tables()][pixeltable.list_tables] and/or
+    [list_dirs()][pixeltable.list_dirs] instead.
+    """
+    from pixeltable.metadata import schema
+
+    cat = Catalog.get()
+    path_obj = catalog.Path(path, empty_is_valid=True)
+    dir_entries = cat.get_dir_contents(path_obj)
+    rows: list[list[str]] = []
+    with Catalog.get().begin_xact():
+        for name, entry in dir_entries.items():
+            if name.startswith('_'):
+                continue
+            if entry.dir is not None:
+                kind = 'dir'
+                version = ''
+                base = ''
+            else:
+                assert entry.table is not None
+                assert isinstance(entry.table, schema.Table)
+                tbl = cat.get_table_by_id(entry.table.id)
+                md = tbl.get_metadata()
+                base = md['base'] or ''
+                if base.startswith('_'):
+                    base = '<anonymous base table>'
+                if md['is_snapshot']:
+                    kind = 'snapshot'
+                elif md['is_view']:
+                    kind = 'view'
+                else:
+                    kind = 'table'
+                version = '' if kind == 'snapshot' else md['version']
+                if md['is_replica']:
+                    kind = f'{kind}-replica'
+            rows.append([name, kind, version, base])
+
+    rows = sorted(rows, key=lambda x: x[0])
+    df = pd.DataFrame(
+        {
+            'Name': [row[0] for row in rows],
+            'Kind': [row[1] for row in rows],
+            'Version': [row[2] for row in rows],
+            'Base': [row[3] for row in rows],
+        },
+        index=([''] * len(rows)),
+    )
+    return df
+
+
 def _extract_paths(
     dir_entries: dict[str, Catalog.DirEntry],
     parent: catalog.Path,
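`init()` now accepts config overrides and forwards them to `Config.init()`, and `ls()` complements the existing programmatic listings. A minimal sketch; the override key shown is illustrative, on the assumption that any key accepted by `Config.init()` is passed through unchanged:

```python
import pixeltable as pxt

# Overrides are forwarded to Config.init(); the key below is illustrative only.
pxt.init({'verbosity': 1})

# Human-readable listing of the root directory as a pandas DataFrame.
print(pxt.ls(''))

# Programmatic alternatives, per the ls() docstring:
print(pxt.list_tables())
print(pxt.list_dirs())
```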
pixeltable/io/__init__.py
CHANGED
@@ -1,7 +1,7 @@
 # ruff: noqa: F401

 from .datarows import import_json, import_rows
-from .external_store import ExternalStore
+from .external_store import ExternalStore
 from .globals import create_label_studio_project, export_images_as_fo_dataset
 from .hf_datasets import import_huggingface_dataset
 from .pandas import import_csv, import_excel, import_pandas
pixeltable/io/external_store.py
CHANGED
@@ -3,14 +3,13 @@ from __future__ import annotations
 import abc
 import itertools
 import logging
-from dataclasses import dataclass, field
 from typing import Any, Optional

 import pixeltable.exceptions as excs
 import pixeltable.type_system as ts
 from pixeltable import Column, Table
 from pixeltable.catalog import ColumnHandle, TableVersion
-from pixeltable.catalog.
+from pixeltable.catalog.update_status import UpdateStatus

 _logger = logging.getLogger('pixeltable')

@@ -46,7 +45,7 @@ class ExternalStore(abc.ABC):
     """

     @abc.abstractmethod
-    def sync(self, t: Table, export_data: bool, import_data: bool) -> SyncStatus:
+    def sync(self, t: Table, export_data: bool, import_data: bool) -> UpdateStatus:
         """
         Called by `Table.sync()` to implement store-specific synchronization logic.
         """
@@ -263,57 +262,6 @@ class Project(ExternalStore, abc.ABC):
         return resolved_col_mapping


-@dataclass(frozen=True)
-class SyncStatus:
-    # stats for the rows affected by the operation in the external store
-    ext_row_count_stats: RowCountStats = field(default_factory=RowCountStats)
-
-    # stats for the rows affected by the operation
-    row_count_stats: RowCountStats = field(default_factory=RowCountStats)
-
-    @property
-    def num_excs(self) -> int:
-        """
-        Returns the total number of Pixeltable exceptions that occurred during the operation.
-        """
-        return self.row_count_stats.num_excs
-
-    @property
-    def pxt_rows_updated(self) -> int:
-        """
-        Returns the number of Pixeltable rows that were updated as a result of the operation.
-        """
-        return self.row_count_stats.upd_rows
-
-    @property
-    def external_rows_updated(self) -> int:
-        return self.ext_row_count_stats.upd_rows
-
-    @property
-    def external_rows_created(self) -> int:
-        return self.ext_row_count_stats.ins_rows
-
-    @property
-    def external_rows_deleted(self) -> int:
-        return self.ext_row_count_stats.del_rows
-
-    def __add__(self, other: 'SyncStatus') -> 'SyncStatus':
-        """
-        Add the sync status from two SyncStatus objects together.
-        """
-        return SyncStatus(
-            ext_row_count_stats=self.ext_row_count_stats + other.ext_row_count_stats,
-            row_count_stats=self.row_count_stats + other.row_count_stats,
-        )
-
-    @classmethod
-    def from_update_status(cls, us: UpdateStatus) -> 'SyncStatus':
-        """
-        Copy information from an UpdateStatus to a SyncStatus.
-        """
-        return SyncStatus(row_count_stats=us.row_count_stats + us.cascade_row_count_stats)
-
-
 class MockProject(Project):
     """A project that cannot be synced, used mainly for testing."""

@@ -348,7 +296,7 @@ class MockProject(Project):
     def get_import_columns(self) -> dict[str, ts.ColumnType]:
         return self.import_cols

-    def sync(self, t: Table, export_data: bool, import_data: bool) -> SyncStatus:
+    def sync(self, t: Table, export_data: bool, import_data: bool) -> UpdateStatus:
         raise NotImplementedError()

     def delete(self) -> None:
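Custom stores must now return the unified `UpdateStatus` from `sync()`. A minimal sketch of the new contract; only the changed method is shown (the subclass name is hypothetical, and a real subclass must also implement the remaining `ExternalStore` abstract methods):

```python
from pixeltable import Table
from pixeltable.catalog.update_status import UpdateStatus
from pixeltable.io.external_store import ExternalStore


class NoOpStore(ExternalStore):  # hypothetical subclass; other abstract methods omitted
    def sync(self, t: Table, export_data: bool, import_data: bool) -> UpdateStatus:
        # As at the call sites in this diff, a default-constructed UpdateStatus
        # represents "nothing changed".
        return UpdateStatus()
```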
pixeltable/io/globals.py
CHANGED
@@ -5,8 +5,8 @@ from typing import TYPE_CHECKING, Any, Literal, Optional, Union
 import pixeltable as pxt
 import pixeltable.exceptions as excs
 from pixeltable import Table, exprs
+from pixeltable.catalog.update_status import UpdateStatus
 from pixeltable.env import Env
-from pixeltable.io.external_store import SyncStatus

 if TYPE_CHECKING:
     import fiftyone as fo  # type: ignore[import-untyped]
@@ -22,7 +22,7 @@ def create_label_studio_project(
     sync_immediately: bool = True,
     s3_configuration: Optional[dict[str, Any]] = None,
     **kwargs: Any,
-) -> SyncStatus:
+) -> UpdateStatus:
     """
     Create a new Label Studio project and link it to the specified [`Table`][pixeltable.Table].

@@ -96,7 +96,7 @@ def create_label_studio_project(
     [Label Studio start_project docs](https://labelstud.io/sdk/project.html#label_studio_sdk.project.Project.start_project).

     Returns:
-
+        An `UpdateStatus` representing the status of any synchronization operations that occurred.

     Examples:
         Create a Label Studio project whose tasks correspond to videos stored in the `video_col`
@@ -136,7 +136,7 @@ def create_label_studio_project(
     if sync_immediately:
         return t.sync()
     else:
-        return SyncStatus()
+        return UpdateStatus()
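`create_label_studio_project()` now reports through `UpdateStatus` as well. A sketch of reading the result; the table name and label config are hypothetical:

```python
import pixeltable as pxt
from pixeltable.io import create_label_studio_project

t = pxt.get_table('demo.videos')  # hypothetical table

config = '<View><Video name="video" value="$video_col"/></View>'  # illustrative label config
status = create_label_studio_project(t, config, sync_immediately=False)
# With sync_immediately=False, the function returns an empty UpdateStatus().
print(status)
```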
pixeltable/io/hf_datasets.py
CHANGED
@@ -50,10 +50,18 @@ def _to_pixeltable_type(feature_type: Any, nullable: bool) -> Optional[ts.Column
     elif isinstance(feature_type, datasets.Sequence):
         # example: cohere wiki. Sequence(feature=Value(dtype='float32', id=None), length=-1, id=None)
         dtype = _to_pixeltable_type(feature_type.feature, nullable)
-
-
+        if dtype is None:
+            return None
+        if dtype.is_int_type() or dtype.is_float_type() or dtype.is_bool_type() or dtype.is_string_type():
+            length = feature_type.length if feature_type.length != -1 else None
+            return ts.ArrayType(shape=(length,), dtype=dtype, nullable=nullable)
+        else:
+            # Sequence of dicts must be cast as Json
+            return ts.JsonType(nullable=nullable)
     elif isinstance(feature_type, datasets.Image):
         return ts.ImageType(nullable=nullable)
+    elif isinstance(feature_type, dict):
+        return ts.JsonType(nullable=nullable)
     else:
         return None
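The widened mapping sends scalar `Sequence` features to `ArrayType` and plain-dict feature specs to `JsonType`. A sketch exercising the private helper purely for illustration:

```python
import datasets

import pixeltable.type_system as ts
from pixeltable.io.hf_datasets import _to_pixeltable_type  # private helper, shown only to illustrate

# Scalar sequence: length == -1 becomes a variable-length ArrayType.
seq = datasets.Sequence(feature=datasets.Value(dtype='float32'))
assert isinstance(_to_pixeltable_type(seq, nullable=True), ts.ArrayType)

# Plain-dict feature spec: mapped to JsonType by the new branch.
assert isinstance(_to_pixeltable_type({'a': datasets.Value(dtype='int64')}, nullable=True), ts.JsonType)
```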
pixeltable/io/label_studio.py
CHANGED
@@ -14,10 +14,10 @@ from requests.exceptions import HTTPError
 import pixeltable.type_system as ts
 from pixeltable import Column, Table, env, exceptions as excs
 from pixeltable.catalog import ColumnHandle
-from pixeltable.catalog.
+from pixeltable.catalog.update_status import RowCountStats, UpdateStatus
 from pixeltable.config import Config
 from pixeltable.exprs import ColumnRef, DataRow, Expr
-from pixeltable.io.external_store import Project
+from pixeltable.io.external_store import Project
 from pixeltable.utils import coco

 # label_studio_sdk>=1 and label_studio_sdk<1 are not compatible, so we need to try
@@ -111,14 +111,14 @@ class LabelStudioProject(Project):
         """
         return {ANNOTATIONS_COLUMN: ts.JsonType(nullable=True)}

-    def sync(self, t: Table, export_data: bool, import_data: bool) -> SyncStatus:
+    def sync(self, t: Table, export_data: bool, import_data: bool) -> UpdateStatus:
         _logger.info(
             f'Syncing Label Studio project "{self.project_title}" with table `{t._name}`'
             f' (export: {export_data}, import: {import_data}).'
         )
         # Collect all existing tasks into a dict with entries `rowid: task`
         tasks = {tuple(task['meta']['rowid']): task for task in self.__fetch_all_tasks()}
-        sync_status = SyncStatus()
+        sync_status = UpdateStatus()
         if export_data:
             export_sync_status = self.__update_tasks(t, tasks)
             sync_status += export_sync_status
@@ -148,7 +148,7 @@ class LabelStudioProject(Project):
             f'Label Studio project {self.project_title!r}.'
         )

-    def __update_tasks(self, t: Table, existing_tasks: dict[tuple, dict]) -> SyncStatus:
+    def __update_tasks(self, t: Table, existing_tasks: dict[tuple, dict]) -> UpdateStatus:
         """
         Updates all tasks in this Label Studio project based on the Pixeltable data:
         - Creates new tasks for rows that don't map to any existing task;
@@ -161,7 +161,7 @@ class LabelStudioProject(Project):
         t_data_cols = [t_col for t_col, ext_col_name in self.col_mapping.items() if ext_col_name in config.data_keys]

         if len(t_data_cols) == 0:
-            return SyncStatus()
+            return UpdateStatus()

         # Columns in `t` that map to `rectanglelabels` preannotations
         t_rl_cols = [
@@ -192,7 +192,7 @@ class LabelStudioProject(Project):
         media_col: ColumnHandle,
         t_rl_cols: list[ColumnHandle],
         rl_info: list['_RectangleLabel'],
-    ) -> SyncStatus:
+    ) -> UpdateStatus:
         is_stored = media_col.get().is_stored
         # If it's a stored column, we can use `localpath`
         localpath_col_opt = [t[media_col.get().name].localpath] if is_stored else []
@@ -238,7 +238,7 @@ class LabelStudioProject(Project):

         env.Env.get().console_logger.info(f'Created {tasks_created} new task(s) in {self}.')

-        sync_status = SyncStatus(ext_row_count_stats=RowCountStats(ins_rows=tasks_created))
+        sync_status = UpdateStatus(ext_row_count_stats=RowCountStats(ins_rows=tasks_created))

         deletion_sync_status = self.__delete_stale_tasks(existing_tasks, row_ids_in_pxt, tasks_created)
         sync_status += deletion_sync_status
@@ -251,7 +251,7 @@ class LabelStudioProject(Project):
         t_data_cols: list[ColumnHandle],
         t_rl_cols: list[ColumnHandle],
         rl_info: list['_RectangleLabel'],
-    ) -> SyncStatus:
+    ) -> UpdateStatus:
         ext_data_cols = [self.col_mapping[col] for col in t_data_cols]
         expr_refs: dict[str, Expr] = {}  # kwargs for the select statement
         for col in t_data_cols:
@@ -342,7 +342,7 @@ class LabelStudioProject(Project):
             f'Created {tasks_created} new task(s) and updated {tasks_updated} existing task(s) in {self}.'
         )

-        sync_status = SyncStatus(ext_row_count_stats=RowCountStats(ins_rows=tasks_created, upd_rows=tasks_updated))
+        sync_status = UpdateStatus(ext_row_count_stats=RowCountStats(ins_rows=tasks_created, upd_rows=tasks_updated))

         deletion_sync_status = self.__delete_stale_tasks(existing_tasks, row_ids_in_pxt, tasks_created)
         sync_status += deletion_sync_status
@@ -367,7 +367,7 @@ class LabelStudioProject(Project):

     def __delete_stale_tasks(
         self, existing_tasks: dict[tuple, dict], row_ids_in_pxt: set[tuple], tasks_created: int
-    ) -> SyncStatus:
+    ) -> UpdateStatus:
         deleted_rowids = set(existing_tasks.keys()) - row_ids_in_pxt
         # Sanity check the math
         assert len(deleted_rowids) == len(existing_tasks) + tasks_created - len(row_ids_in_pxt)
@@ -383,11 +383,11 @@ class LabelStudioProject(Project):
         for rowid in deleted_rowids:
             del existing_tasks[rowid]

-        return SyncStatus(ext_row_count_stats=RowCountStats(del_rows=len(deleted_rowids)))
+        return UpdateStatus(ext_row_count_stats=RowCountStats(del_rows=len(deleted_rowids)))

-    def __update_table_from_tasks(self, t: Table, tasks: dict[tuple, dict]) -> SyncStatus:
+    def __update_table_from_tasks(self, t: Table, tasks: dict[tuple, dict]) -> UpdateStatus:
         if ANNOTATIONS_COLUMN not in self.col_mapping.values():
-            return SyncStatus()
+            return UpdateStatus()

         annotations = {
             # Replace [] by None to indicate no annotations. We do want to sync rows with no annotations,
@@ -422,9 +422,9 @@ class LabelStudioProject(Project):
             ancestor = ancestor._get_base_table()
             update_status = ancestor.batch_update(updates)
             env.Env.get().console_logger.info(f'Updated annotation(s) from {len(updates)} task(s) in {self}.')
-            return SyncStatus.from_update_status(update_status)
+            return update_status
         else:
-            return SyncStatus()
+            return UpdateStatus()

     def as_dict(self) -> dict[str, Any]:
         return {
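With `SyncStatus` gone, every sync path in `LabelStudioProject` composes `UpdateStatus` values, so `Table.sync()` callers see a single status type end to end. A sketch; the table is hypothetical:

```python
import pixeltable as pxt

t = pxt.get_table('demo.videos')  # hypothetical table linked to a Label Studio project

# Returns one UpdateStatus aggregating Pixeltable-row stats and external-row
# stats (ext_row_count_stats) from the export/import/deletion steps.
status = t.sync()
```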
pixeltable/metadata/__init__.py
CHANGED
@@ -18,7 +18,7 @@ _console_logger = ConsoleLogger(logging.getLogger('pixeltable'))
 _logger = logging.getLogger('pixeltable')

 # current version of the metadata; this is incremented whenever the metadata schema changes
-VERSION = 39
+VERSION = 40


 def create_system_info(engine: sql.engine.Engine) -> None:

pixeltable/metadata/converters/convert_39.py
ADDED

@@ -0,0 +1,125 @@
+import logging
+from typing import Optional
+from uuid import UUID
+
+import sqlalchemy as sql
+
+from pixeltable.metadata import register_converter
+from pixeltable.metadata.converters.util import convert_table_md
+
+_logger = logging.getLogger('pixeltable')
+
+
+@register_converter(version=39)
+def _(engine: sql.engine.Engine) -> None:
+    convert_table_md(engine, table_modifier=__table_modifier)
+
+
+def __table_modifier(conn: sql.Connection, tbl_id: UUID, orig_table_md: dict, updated_table_md: dict) -> None:
+    store_prefix = 'view' if orig_table_md['view_md'] is not None else 'tbl'
+    store_name = f'{store_prefix}_{tbl_id.hex}'
+
+    # Get the list of column names that need to be migrated
+    col_names = find_error_columns(conn=conn, store_name=store_name)
+    if len(col_names) == 0:
+        _logger.info(f'No error columns found in table {store_name}. Skipping migration.')
+        return
+
+    # Check if the table exists, outside of the metadata we were given.
+    # There seem to be cases where the metadata is present in the catalog,
+    # but the table itself is not in the database.
+    check_table_sql = sql.text(f"""
+        SELECT EXISTS (
+            SELECT 1
+            FROM information_schema.tables
+            WHERE table_name = '{store_name}'
+        )
+    """)
+    table_exists = conn.execute(check_table_sql).scalar()
+    if not table_exists:
+        _logger.warning(f'Table {store_name} does not exist. Skipping migration.')
+        return
+
+    return migrate_error_to_cellmd_columns(conn, store_name, col_names)
+
+
+def find_error_columns(conn: sql.Connection, store_name: str) -> list[str]:
+    """
+    Return any errormsg or errortype columns in the given table.
+
+    Args:
+        conn: SQLAlchemy connection
+        store_name: Name of the table to check
+
+    Returns:
+        List of column name roots (root_errormsg, root_errortype)
+    """
+    check_columns_sql = sql.text(f"""
+        SELECT column_name
+        FROM information_schema.columns
+        WHERE table_name = '{store_name}'
+    """)
+    found_columns = [
+        row[0]
+        for row in conn.execute(check_columns_sql)
+        if row[0].endswith('_errormsg') or row[0].endswith('_errortype')
+    ]
+    column_roots = {s.removesuffix('_errormsg').removesuffix('_errortype') for s in found_columns}
+    return [*column_roots]
+
+
+def migrate_error_to_cellmd_columns(
+    conn: sql.Connection, store_name: str, col_names: list[str], backup_table: Optional[str] = None
+) -> None:
+    """
+    Safe version with error handling and optional backup.
+
+    Args:
+        conn: SQLAlchemy connection
+        store_name: Name of the table to modify
+        col_names: List of column name prefixes
+        backup_table: Optional name for backup table
+
+    Usage:
+        migrate_error_to_cellmd_columns(conn, 'my_table', ['columnname'], 'my_table_backup')
+    """
+    try:
+        # Optional: Create backup
+        if backup_table:
+            backup_sql = sql.text(f"""
+                CREATE TABLE {backup_table} AS SELECT * FROM {store_name}
+            """)
+            conn.execute(backup_sql)
+            _logger.info(f'Backup created: {backup_table}')
+
+        # Step 1: Add new columns
+        add_column_str = ', '.join(f'ADD COLUMN {col}_cellmd JSONB DEFAULT NULL' for col in col_names)
+        add_column_sql = sql.text(f'ALTER TABLE {store_name} {add_column_str}')
+        conn.execute(add_column_sql)
+        _logger.info(f'Added columns: {", ".join(f"{col}_cellmd" for col in col_names)}')
+
+        # Step 2: Populate new columns
+        set_column_str = ', '.join(
+            [
+                f'{col}_cellmd = CASE WHEN {col}_errormsg IS NULL OR {col}_errortype IS NULL '
+                f"THEN NULL ELSE jsonb_build_object('errormsg', {col}_errormsg, 'errortype', {col}_errortype) END"
+                for col in col_names
+            ]
+        )
+        populate_sql = sql.text(f'UPDATE {store_name} SET {set_column_str}')
+        result = conn.execute(populate_sql)
+        _logger.info(f'Updated {result.rowcount} rows')
+
+        # Step 3: Drop old columns
+        drop_columns_str = ', '.join(
+            [f'DROP COLUMN IF EXISTS {col}_errormsg, DROP COLUMN IF EXISTS {col}_errortype' for col in col_names]
+        )
+        drop_columns_sql = sql.text(f'ALTER TABLE {store_name} {drop_columns_str}')
+        conn.execute(drop_columns_sql)
+        _logger.info(f'Dropped columns: {", ".join(f"{col}_errormsg, {col}_errortype" for col in col_names)}')
+        _logger.info(f'Migration completed successfully for table: {store_name}')
+
+    except sql.exc.SQLAlchemyError as e:
+        _logger.error(f'Migration for table {store_name} failed: {e}')
+        raise
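The migration helper can also be driven directly, for example to re-run it against a single store table. A sketch, assuming a local Postgres URL and a hypothetical store table name:

```python
import sqlalchemy as sql

from pixeltable.metadata.converters.convert_39 import migrate_error_to_cellmd_columns

engine = sql.create_engine('postgresql://localhost/pixeltable')  # illustrative URL
with engine.begin() as conn:
    # Folds col1_errormsg/col1_errortype into a single col1_cellmd JSONB column,
    # keeping a backup copy of the table first.
    migrate_error_to_cellmd_columns(conn, 'tbl_deadbeef', ['col1'], backup_table='tbl_deadbeef_backup')
```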
pixeltable/metadata/converters/util.py
CHANGED

@@ -16,6 +16,7 @@ def convert_table_md(
     column_md_updater: Optional[Callable[[dict], None]] = None,
     external_store_md_updater: Optional[Callable[[dict], None]] = None,
     substitution_fn: Optional[Callable[[Optional[str], Any], Optional[tuple[Optional[str], Any]]]] = None,
+    table_modifier: Optional[Callable[[sql.Connection, UUID, dict, dict], None]] = None,
 ) -> None:
     """
     Converts schema.TableMd dicts based on the specified conversion functions.
@@ -50,6 +51,8 @@ def convert_table_md(
         if updated_table_md != table_md:
             __logger.info(f'Updating schema for table: {tbl_id}')
             conn.execute(sql.update(Table).where(Table.id == tbl_id).values(md=updated_table_md))
+        if table_modifier is not None:
+            table_modifier(conn, tbl_id, table_md, updated_table_md)

     for row in conn.execute(sql.select(Function)):
         fn_id = row[0]
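The new `table_modifier` hook gives converters per-table access to the live connection alongside the metadata update. A sketch of how a future converter might use it; the version number and function names are hypothetical:

```python
from uuid import UUID

import sqlalchemy as sql

from pixeltable.metadata import register_converter
from pixeltable.metadata.converters.util import convert_table_md


def _touch_store_table(conn: sql.Connection, tbl_id: UUID, old_md: dict, new_md: dict) -> None:
    # Invoked once per table, inside the migration's transaction.
    pass


@register_converter(version=99)  # hypothetical future version
def _(engine: sql.engine.Engine) -> None:
    convert_table_md(engine, table_modifier=_touch_store_table)
```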
pixeltable/metadata/notes.py
CHANGED
@@ -2,6 +2,7 @@
 # rather than as a comment, so that the existence of a description can be enforced by
 # the unit tests when new versions are added.
 VERSION_NOTES = {
+    40: 'Convert error property columns to cellmd columns',
     39: 'ColumnHandles in external stores',
     38: 'Added TableMd.view_sn',
     37: 'Add support for the sample() method on DataFrames',
|