pixeltable 0.4.1__py3-none-any.whl → 0.4.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pixeltable might be problematic. Click here for more details.
- pixeltable/__init__.py +1 -0
- pixeltable/__version__.py +2 -2
- pixeltable/catalog/__init__.py +3 -10
- pixeltable/catalog/catalog.py +139 -59
- pixeltable/catalog/column.py +32 -23
- pixeltable/catalog/globals.py +2 -45
- pixeltable/catalog/insertable_table.py +5 -2
- pixeltable/catalog/path.py +6 -0
- pixeltable/catalog/table.py +173 -23
- pixeltable/catalog/table_version.py +156 -92
- pixeltable/catalog/table_version_handle.py +26 -1
- pixeltable/catalog/update_status.py +179 -0
- pixeltable/catalog/view.py +12 -3
- pixeltable/config.py +76 -12
- pixeltable/dataframe.py +1 -1
- pixeltable/env.py +29 -0
- pixeltable/exec/exec_node.py +7 -24
- pixeltable/exec/expr_eval/schedulers.py +134 -7
- pixeltable/exprs/column_property_ref.py +23 -20
- pixeltable/exprs/column_ref.py +24 -18
- pixeltable/exprs/data_row.py +9 -0
- pixeltable/exprs/function_call.py +2 -2
- pixeltable/exprs/row_builder.py +46 -14
- pixeltable/exprs/rowid_ref.py +0 -4
- pixeltable/func/function.py +3 -3
- pixeltable/functions/audio.py +36 -9
- pixeltable/functions/video.py +57 -10
- pixeltable/globals.py +61 -1
- pixeltable/io/__init__.py +1 -1
- pixeltable/io/external_store.py +39 -64
- pixeltable/io/globals.py +4 -4
- pixeltable/io/hf_datasets.py +10 -2
- pixeltable/io/label_studio.py +52 -48
- pixeltable/metadata/__init__.py +1 -1
- pixeltable/metadata/converters/convert_38.py +39 -0
- pixeltable/metadata/converters/convert_39.py +125 -0
- pixeltable/metadata/converters/util.py +3 -0
- pixeltable/metadata/notes.py +2 -0
- pixeltable/metadata/schema.py +14 -2
- pixeltable/metadata/utils.py +78 -0
- pixeltable/plan.py +26 -18
- pixeltable/share/packager.py +20 -38
- pixeltable/store.py +121 -142
- pixeltable/type_system.py +2 -2
- pixeltable/utils/coroutine.py +6 -23
- pixeltable/utils/media_store.py +39 -0
- {pixeltable-0.4.1.dist-info → pixeltable-0.4.3.dist-info}/METADATA +1 -1
- {pixeltable-0.4.1.dist-info → pixeltable-0.4.3.dist-info}/RECORD +51 -47
- {pixeltable-0.4.1.dist-info → pixeltable-0.4.3.dist-info}/LICENSE +0 -0
- {pixeltable-0.4.1.dist-info → pixeltable-0.4.3.dist-info}/WHEEL +0 -0
- {pixeltable-0.4.1.dist-info → pixeltable-0.4.3.dist-info}/entry_points.txt +0 -0
pixeltable/io/external_store.py
CHANGED
|
@@ -3,14 +3,13 @@ from __future__ import annotations
|
|
|
3
3
|
import abc
|
|
4
4
|
import itertools
|
|
5
5
|
import logging
|
|
6
|
-
from dataclasses import dataclass
|
|
7
6
|
from typing import Any, Optional
|
|
8
|
-
from uuid import UUID
|
|
9
7
|
|
|
10
8
|
import pixeltable.exceptions as excs
|
|
11
9
|
import pixeltable.type_system as ts
|
|
12
10
|
from pixeltable import Column, Table
|
|
13
|
-
from pixeltable.catalog import TableVersion
|
|
11
|
+
from pixeltable.catalog import ColumnHandle, TableVersion
|
|
12
|
+
from pixeltable.catalog.update_status import UpdateStatus
|
|
14
13
|
|
|
15
14
|
_logger = logging.getLogger('pixeltable')
|
|
16
15
|
|
|
@@ -22,6 +21,8 @@ class ExternalStore(abc.ABC):
|
|
|
22
21
|
and stateful external stores.
|
|
23
22
|
"""
|
|
24
23
|
|
|
24
|
+
__name: str
|
|
25
|
+
|
|
25
26
|
def __init__(self, name: str) -> None:
|
|
26
27
|
self.__name = name
|
|
27
28
|
|
|
@@ -38,13 +39,13 @@ class ExternalStore(abc.ABC):
|
|
|
38
39
|
"""Removes store-specific metadata created in link()."""
|
|
39
40
|
|
|
40
41
|
@abc.abstractmethod
|
|
41
|
-
def get_local_columns(self) -> list[
|
|
42
|
+
def get_local_columns(self) -> list[ColumnHandle]:
|
|
42
43
|
"""
|
|
43
44
|
Gets a list of all local (Pixeltable) columns that are associated with this external store.
|
|
44
45
|
"""
|
|
45
46
|
|
|
46
47
|
@abc.abstractmethod
|
|
47
|
-
def sync(self, t: Table, export_data: bool, import_data: bool) ->
|
|
48
|
+
def sync(self, t: Table, export_data: bool, import_data: bool) -> UpdateStatus:
|
|
48
49
|
"""
|
|
49
50
|
Called by `Table.sync()` to implement store-specific synchronization logic.
|
|
50
51
|
"""
|
|
@@ -63,9 +64,15 @@ class Project(ExternalStore, abc.ABC):
|
|
|
63
64
|
additional capabilities specific to such projects.
|
|
64
65
|
"""
|
|
65
66
|
|
|
66
|
-
|
|
67
|
+
_col_mapping: dict[ColumnHandle, str] # col -> external col name
|
|
68
|
+
stored_proxies: dict[ColumnHandle, ColumnHandle] # original col -> proxy col
|
|
67
69
|
|
|
68
|
-
def __init__(
|
|
70
|
+
def __init__(
|
|
71
|
+
self,
|
|
72
|
+
name: str,
|
|
73
|
+
col_mapping: dict[ColumnHandle, str],
|
|
74
|
+
stored_proxies: Optional[dict[ColumnHandle, ColumnHandle]],
|
|
75
|
+
):
|
|
69
76
|
super().__init__(name)
|
|
70
77
|
self._col_mapping = col_mapping
|
|
71
78
|
|
|
@@ -80,11 +87,11 @@ class Project(ExternalStore, abc.ABC):
|
|
|
80
87
|
# Note from aaron-siegel: This methodology is inefficient in the case where a table has many views with a high
|
|
81
88
|
# proportion of overlapping rows, all proxying the same base column.
|
|
82
89
|
if stored_proxies is None:
|
|
83
|
-
self.stored_proxies: dict[
|
|
90
|
+
self.stored_proxies: dict[ColumnHandle, ColumnHandle] = {}
|
|
84
91
|
else:
|
|
85
92
|
self.stored_proxies = stored_proxies
|
|
86
93
|
|
|
87
|
-
def get_local_columns(self) -> list[
|
|
94
|
+
def get_local_columns(self) -> list[ColumnHandle]:
|
|
88
95
|
return list(self.col_mapping.keys())
|
|
89
96
|
|
|
90
97
|
def link(self, tbl_version: TableVersion) -> None:
|
|
@@ -92,15 +99,16 @@ class Project(ExternalStore, abc.ABC):
|
|
|
92
99
|
# This ensures that the media in those columns resides in the media store.
|
|
93
100
|
# First determine which columns (if any) need stored proxies, but don't have one yet.
|
|
94
101
|
stored_proxies_needed: list[Column] = []
|
|
95
|
-
for
|
|
102
|
+
for col_handle in self.col_mapping:
|
|
103
|
+
col = col_handle.get()
|
|
96
104
|
if col.col_type.is_media_type() and not (col.is_stored and col.is_computed):
|
|
97
105
|
# If this column is already proxied in some other Project, use the existing proxy to avoid
|
|
98
106
|
# duplication. Otherwise, we'll create a new one.
|
|
99
107
|
for store in tbl_version.external_stores.values():
|
|
100
|
-
if isinstance(store, Project) and
|
|
101
|
-
self.stored_proxies[
|
|
108
|
+
if isinstance(store, Project) and col_handle in store.stored_proxies:
|
|
109
|
+
self.stored_proxies[col_handle] = store.stored_proxies[col_handle]
|
|
102
110
|
break
|
|
103
|
-
if
|
|
111
|
+
if col_handle not in self.stored_proxies:
|
|
104
112
|
# We didn't find it in an existing Project
|
|
105
113
|
stored_proxies_needed.append(col)
|
|
106
114
|
|
|
@@ -110,17 +118,20 @@ class Project(ExternalStore, abc.ABC):
|
|
|
110
118
|
proxy_cols = [self.create_stored_proxy(col) for col in stored_proxies_needed]
|
|
111
119
|
# Add the columns; this will also update table metadata.
|
|
112
120
|
tbl_version.add_columns(proxy_cols, print_stats=False, on_error='ignore')
|
|
121
|
+
self.stored_proxies.update(
|
|
122
|
+
{col.handle: proxy_col.handle for col, proxy_col in zip(stored_proxies_needed, proxy_cols)}
|
|
123
|
+
)
|
|
113
124
|
|
|
114
125
|
def unlink(self, tbl_version: TableVersion) -> None:
|
|
115
126
|
# Determine which stored proxies can be deleted. (A stored proxy can be deleted if it is not referenced by
|
|
116
127
|
# any *other* external store for this table.)
|
|
117
|
-
deletions_needed: set[
|
|
128
|
+
deletions_needed: set[ColumnHandle] = set(self.stored_proxies.values())
|
|
118
129
|
for name, store in tbl_version.external_stores.items():
|
|
119
130
|
if isinstance(store, Project) and name != self.name:
|
|
120
131
|
deletions_needed = deletions_needed.difference(set(store.stored_proxies.values()))
|
|
121
132
|
if len(deletions_needed) > 0:
|
|
122
|
-
_logger.info(f'Removing stored proxies for columns: {[col.name for col in deletions_needed]}')
|
|
123
|
-
tbl_version._drop_columns(deletions_needed)
|
|
133
|
+
_logger.info(f'Removing stored proxies for columns: {[col.get().name for col in deletions_needed]}')
|
|
134
|
+
tbl_version._drop_columns(col.get() for col in deletions_needed)
|
|
124
135
|
self.stored_proxies.clear()
|
|
125
136
|
|
|
126
137
|
def create_stored_proxy(self, col: Column) -> Column:
|
|
@@ -142,11 +153,10 @@ class Project(ExternalStore, abc.ABC):
|
|
|
142
153
|
computed_with=exprs.ColumnRef(col).apply(lambda x: x, col_type=col.col_type),
|
|
143
154
|
stored=True,
|
|
144
155
|
)
|
|
145
|
-
self.stored_proxies[col] = proxy_col
|
|
146
156
|
return proxy_col
|
|
147
157
|
|
|
148
158
|
@property
|
|
149
|
-
def col_mapping(self) -> dict[
|
|
159
|
+
def col_mapping(self) -> dict[ColumnHandle, str]:
|
|
150
160
|
return self._col_mapping
|
|
151
161
|
|
|
152
162
|
@abc.abstractmethod
|
|
@@ -181,7 +191,7 @@ class Project(ExternalStore, abc.ABC):
|
|
|
181
191
|
export_cols: dict[str, ts.ColumnType],
|
|
182
192
|
import_cols: dict[str, ts.ColumnType],
|
|
183
193
|
col_mapping: Optional[dict[str, str]],
|
|
184
|
-
) -> dict[
|
|
194
|
+
) -> dict[ColumnHandle, str]:
|
|
185
195
|
"""
|
|
186
196
|
Verifies that the specified `col_mapping` is valid. In particular, checks that:
|
|
187
197
|
(i) the keys of `col_mapping` are valid columns of the specified `Table`;
|
|
@@ -199,7 +209,7 @@ class Project(ExternalStore, abc.ABC):
|
|
|
199
209
|
if col_mapping is None:
|
|
200
210
|
col_mapping = {col: col for col in itertools.chain(export_cols.keys(), import_cols.keys())}
|
|
201
211
|
|
|
202
|
-
resolved_col_mapping: dict[
|
|
212
|
+
resolved_col_mapping: dict[ColumnHandle, str] = {}
|
|
203
213
|
|
|
204
214
|
# Validate names
|
|
205
215
|
t_cols = set(table._get_schema().keys())
|
|
@@ -223,7 +233,8 @@ class Project(ExternalStore, abc.ABC):
|
|
|
223
233
|
)
|
|
224
234
|
col_ref = table[t_col]
|
|
225
235
|
assert isinstance(col_ref, exprs.ColumnRef)
|
|
226
|
-
resolved_col_mapping[col_ref.col] = ext_col
|
|
236
|
+
resolved_col_mapping[col_ref.col.handle] = ext_col
|
|
237
|
+
|
|
227
238
|
# Validate column specs
|
|
228
239
|
t_col_types = table._get_schema()
|
|
229
240
|
for t_col, ext_col in col_mapping.items():
|
|
@@ -250,40 +261,6 @@ class Project(ExternalStore, abc.ABC):
|
|
|
250
261
|
)
|
|
251
262
|
return resolved_col_mapping
|
|
252
263
|
|
|
253
|
-
@classmethod
|
|
254
|
-
def _column_as_dict(cls, col: Column) -> dict[str, Any]:
|
|
255
|
-
return {'tbl_id': str(col.tbl.id), 'col_id': col.id}
|
|
256
|
-
|
|
257
|
-
@classmethod
|
|
258
|
-
def _column_from_dict(cls, d: dict[str, Any]) -> Column:
|
|
259
|
-
from pixeltable.catalog import Catalog
|
|
260
|
-
|
|
261
|
-
tbl_id = UUID(d['tbl_id'])
|
|
262
|
-
col_id = d['col_id']
|
|
263
|
-
return Catalog.get().get_tbl_version(tbl_id, None).cols_by_id[col_id]
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
@dataclass(frozen=True)
|
|
267
|
-
class SyncStatus:
|
|
268
|
-
external_rows_created: int = 0
|
|
269
|
-
external_rows_deleted: int = 0
|
|
270
|
-
external_rows_updated: int = 0
|
|
271
|
-
pxt_rows_updated: int = 0
|
|
272
|
-
num_excs: int = 0
|
|
273
|
-
|
|
274
|
-
def combine(self, other: 'SyncStatus') -> 'SyncStatus':
|
|
275
|
-
return SyncStatus(
|
|
276
|
-
external_rows_created=self.external_rows_created + other.external_rows_created,
|
|
277
|
-
external_rows_deleted=self.external_rows_deleted + other.external_rows_deleted,
|
|
278
|
-
external_rows_updated=self.external_rows_updated + other.external_rows_updated,
|
|
279
|
-
pxt_rows_updated=self.pxt_rows_updated + other.pxt_rows_updated,
|
|
280
|
-
num_excs=self.num_excs + other.num_excs,
|
|
281
|
-
)
|
|
282
|
-
|
|
283
|
-
@classmethod
|
|
284
|
-
def empty(cls) -> 'SyncStatus':
|
|
285
|
-
return SyncStatus(0, 0, 0, 0, 0)
|
|
286
|
-
|
|
287
264
|
|
|
288
265
|
class MockProject(Project):
|
|
289
266
|
"""A project that cannot be synced, used mainly for testing."""
|
|
@@ -293,8 +270,8 @@ class MockProject(Project):
|
|
|
293
270
|
name: str,
|
|
294
271
|
export_cols: dict[str, ts.ColumnType],
|
|
295
272
|
import_cols: dict[str, ts.ColumnType],
|
|
296
|
-
col_mapping: dict[
|
|
297
|
-
stored_proxies: Optional[dict[
|
|
273
|
+
col_mapping: dict[ColumnHandle, str],
|
|
274
|
+
stored_proxies: Optional[dict[ColumnHandle, ColumnHandle]] = None,
|
|
298
275
|
):
|
|
299
276
|
super().__init__(name, col_mapping, stored_proxies)
|
|
300
277
|
self.export_cols = export_cols
|
|
@@ -319,7 +296,7 @@ class MockProject(Project):
|
|
|
319
296
|
def get_import_columns(self) -> dict[str, ts.ColumnType]:
|
|
320
297
|
return self.import_cols
|
|
321
298
|
|
|
322
|
-
def sync(self, t: Table, export_data: bool, import_data: bool) ->
|
|
299
|
+
def sync(self, t: Table, export_data: bool, import_data: bool) -> UpdateStatus:
|
|
323
300
|
raise NotImplementedError()
|
|
324
301
|
|
|
325
302
|
def delete(self) -> None:
|
|
@@ -334,10 +311,8 @@ class MockProject(Project):
|
|
|
334
311
|
'name': self.name,
|
|
335
312
|
'export_cols': {k: v.as_dict() for k, v in self.export_cols.items()},
|
|
336
313
|
'import_cols': {k: v.as_dict() for k, v in self.import_cols.items()},
|
|
337
|
-
'col_mapping': [[
|
|
338
|
-
'stored_proxies': [
|
|
339
|
-
[self._column_as_dict(k), self._column_as_dict(v)] for k, v in self.stored_proxies.items()
|
|
340
|
-
],
|
|
314
|
+
'col_mapping': [[k.as_dict(), v] for k, v in self.col_mapping.items()],
|
|
315
|
+
'stored_proxies': [[k.as_dict(), v.as_dict()] for k, v in self.stored_proxies.items()],
|
|
341
316
|
}
|
|
342
317
|
|
|
343
318
|
@classmethod
|
|
@@ -346,8 +321,8 @@ class MockProject(Project):
|
|
|
346
321
|
md['name'],
|
|
347
322
|
{k: ts.ColumnType.from_dict(v) for k, v in md['export_cols'].items()},
|
|
348
323
|
{k: ts.ColumnType.from_dict(v) for k, v in md['import_cols'].items()},
|
|
349
|
-
{
|
|
350
|
-
{
|
|
324
|
+
{ColumnHandle.from_dict(entry[0]): entry[1] for entry in md['col_mapping']},
|
|
325
|
+
{ColumnHandle.from_dict(entry[0]): ColumnHandle.from_dict(entry[1]) for entry in md['stored_proxies']},
|
|
351
326
|
)
|
|
352
327
|
|
|
353
328
|
def __eq__(self, other: object) -> bool:
|
pixeltable/io/globals.py
CHANGED
|
@@ -5,8 +5,8 @@ from typing import TYPE_CHECKING, Any, Literal, Optional, Union
|
|
|
5
5
|
import pixeltable as pxt
|
|
6
6
|
import pixeltable.exceptions as excs
|
|
7
7
|
from pixeltable import Table, exprs
|
|
8
|
+
from pixeltable.catalog.update_status import UpdateStatus
|
|
8
9
|
from pixeltable.env import Env
|
|
9
|
-
from pixeltable.io.external_store import SyncStatus
|
|
10
10
|
|
|
11
11
|
if TYPE_CHECKING:
|
|
12
12
|
import fiftyone as fo # type: ignore[import-untyped]
|
|
@@ -22,7 +22,7 @@ def create_label_studio_project(
|
|
|
22
22
|
sync_immediately: bool = True,
|
|
23
23
|
s3_configuration: Optional[dict[str, Any]] = None,
|
|
24
24
|
**kwargs: Any,
|
|
25
|
-
) ->
|
|
25
|
+
) -> UpdateStatus:
|
|
26
26
|
"""
|
|
27
27
|
Create a new Label Studio project and link it to the specified [`Table`][pixeltable.Table].
|
|
28
28
|
|
|
@@ -96,7 +96,7 @@ def create_label_studio_project(
|
|
|
96
96
|
[Label Studio start_project docs](https://labelstud.io/sdk/project.html#label_studio_sdk.project.Project.start_project).
|
|
97
97
|
|
|
98
98
|
Returns:
|
|
99
|
-
|
|
99
|
+
An `UpdateStatus` representing the status of any synchronization operations that occurred.
|
|
100
100
|
|
|
101
101
|
Examples:
|
|
102
102
|
Create a Label Studio project whose tasks correspond to videos stored in the `video_col`
|
|
@@ -136,7 +136,7 @@ def create_label_studio_project(
|
|
|
136
136
|
if sync_immediately:
|
|
137
137
|
return t.sync()
|
|
138
138
|
else:
|
|
139
|
-
return
|
|
139
|
+
return UpdateStatus()
|
|
140
140
|
|
|
141
141
|
|
|
142
142
|
def export_images_as_fo_dataset(
|
pixeltable/io/hf_datasets.py
CHANGED
|
@@ -50,10 +50,18 @@ def _to_pixeltable_type(feature_type: Any, nullable: bool) -> Optional[ts.Column
|
|
|
50
50
|
elif isinstance(feature_type, datasets.Sequence):
|
|
51
51
|
# example: cohere wiki. Sequence(feature=Value(dtype='float32', id=None), length=-1, id=None)
|
|
52
52
|
dtype = _to_pixeltable_type(feature_type.feature, nullable)
|
|
53
|
-
|
|
54
|
-
|
|
53
|
+
if dtype is None:
|
|
54
|
+
return None
|
|
55
|
+
if dtype.is_int_type() or dtype.is_float_type() or dtype.is_bool_type() or dtype.is_string_type():
|
|
56
|
+
length = feature_type.length if feature_type.length != -1 else None
|
|
57
|
+
return ts.ArrayType(shape=(length,), dtype=dtype, nullable=nullable)
|
|
58
|
+
else:
|
|
59
|
+
# Sequence of dicts must be cast as Json
|
|
60
|
+
return ts.JsonType(nullable=nullable)
|
|
55
61
|
elif isinstance(feature_type, datasets.Image):
|
|
56
62
|
return ts.ImageType(nullable=nullable)
|
|
63
|
+
elif isinstance(feature_type, dict):
|
|
64
|
+
return ts.JsonType(nullable=nullable)
|
|
57
65
|
else:
|
|
58
66
|
return None
|
|
59
67
|
|
pixeltable/io/label_studio.py
CHANGED
|
@@ -4,18 +4,20 @@ import logging
|
|
|
4
4
|
import os
|
|
5
5
|
from dataclasses import dataclass
|
|
6
6
|
from pathlib import Path
|
|
7
|
-
from typing import Any, Iterator, Literal, Optional
|
|
7
|
+
from typing import Any, Iterator, Literal, Optional
|
|
8
8
|
from xml.etree import ElementTree as ET
|
|
9
9
|
|
|
10
|
-
import label_studio_sdk
|
|
10
|
+
import label_studio_sdk
|
|
11
11
|
import PIL.Image
|
|
12
12
|
from requests.exceptions import HTTPError
|
|
13
13
|
|
|
14
14
|
import pixeltable.type_system as ts
|
|
15
15
|
from pixeltable import Column, Table, env, exceptions as excs
|
|
16
|
+
from pixeltable.catalog import ColumnHandle
|
|
17
|
+
from pixeltable.catalog.update_status import RowCountStats, UpdateStatus
|
|
16
18
|
from pixeltable.config import Config
|
|
17
19
|
from pixeltable.exprs import ColumnRef, DataRow, Expr
|
|
18
|
-
from pixeltable.io.external_store import Project
|
|
20
|
+
from pixeltable.io.external_store import Project
|
|
19
21
|
from pixeltable.utils import coco
|
|
20
22
|
|
|
21
23
|
# label_studio_sdk>=1 and label_studio_sdk<1 are not compatible, so we need to try
|
|
@@ -25,7 +27,7 @@ try:
|
|
|
25
27
|
import label_studio_sdk.project as ls_project # type: ignore
|
|
26
28
|
except ImportError:
|
|
27
29
|
# label_studio_sdk>=1 compatibility
|
|
28
|
-
import label_studio_sdk._legacy.project as ls_project
|
|
30
|
+
import label_studio_sdk._legacy.project as ls_project
|
|
29
31
|
|
|
30
32
|
_logger = logging.getLogger('pixeltable')
|
|
31
33
|
|
|
@@ -45,13 +47,17 @@ class LabelStudioProject(Project):
|
|
|
45
47
|
for synchronizing between a Pixeltable table and a Label Studio project.
|
|
46
48
|
"""
|
|
47
49
|
|
|
50
|
+
project_id: int # Label Studio project ID
|
|
51
|
+
media_import_method: Literal['post', 'file', 'url']
|
|
52
|
+
_project: Optional[ls_project.Project]
|
|
53
|
+
|
|
48
54
|
def __init__(
|
|
49
55
|
self,
|
|
50
56
|
name: str,
|
|
51
57
|
project_id: int,
|
|
52
58
|
media_import_method: Literal['post', 'file', 'url'],
|
|
53
|
-
col_mapping: dict[
|
|
54
|
-
stored_proxies: Optional[dict[
|
|
59
|
+
col_mapping: dict[ColumnHandle, str],
|
|
60
|
+
stored_proxies: Optional[dict[ColumnHandle, ColumnHandle]] = None,
|
|
55
61
|
):
|
|
56
62
|
"""
|
|
57
63
|
The constructor will NOT create a new Label Studio project; it is also used when loading
|
|
@@ -59,7 +65,7 @@ class LabelStudioProject(Project):
|
|
|
59
65
|
"""
|
|
60
66
|
self.project_id = project_id
|
|
61
67
|
self.media_import_method = media_import_method
|
|
62
|
-
self._project
|
|
68
|
+
self._project = None
|
|
63
69
|
super().__init__(name, col_mapping, stored_proxies)
|
|
64
70
|
|
|
65
71
|
@property
|
|
@@ -105,20 +111,20 @@ class LabelStudioProject(Project):
|
|
|
105
111
|
"""
|
|
106
112
|
return {ANNOTATIONS_COLUMN: ts.JsonType(nullable=True)}
|
|
107
113
|
|
|
108
|
-
def sync(self, t: Table, export_data: bool, import_data: bool) ->
|
|
114
|
+
def sync(self, t: Table, export_data: bool, import_data: bool) -> UpdateStatus:
|
|
109
115
|
_logger.info(
|
|
110
116
|
f'Syncing Label Studio project "{self.project_title}" with table `{t._name}`'
|
|
111
117
|
f' (export: {export_data}, import: {import_data}).'
|
|
112
118
|
)
|
|
113
119
|
# Collect all existing tasks into a dict with entries `rowid: task`
|
|
114
120
|
tasks = {tuple(task['meta']['rowid']): task for task in self.__fetch_all_tasks()}
|
|
115
|
-
sync_status =
|
|
121
|
+
sync_status = UpdateStatus()
|
|
116
122
|
if export_data:
|
|
117
123
|
export_sync_status = self.__update_tasks(t, tasks)
|
|
118
|
-
sync_status
|
|
124
|
+
sync_status += export_sync_status
|
|
119
125
|
if import_data:
|
|
120
126
|
import_sync_status = self.__update_table_from_tasks(t, tasks)
|
|
121
|
-
sync_status
|
|
127
|
+
sync_status += import_sync_status
|
|
122
128
|
return sync_status
|
|
123
129
|
|
|
124
130
|
def __fetch_all_tasks(self) -> Iterator[dict[str, Any]]:
|
|
@@ -142,7 +148,7 @@ class LabelStudioProject(Project):
|
|
|
142
148
|
f'Label Studio project {self.project_title!r}.'
|
|
143
149
|
)
|
|
144
150
|
|
|
145
|
-
def __update_tasks(self, t: Table, existing_tasks: dict[tuple, dict]) ->
|
|
151
|
+
def __update_tasks(self, t: Table, existing_tasks: dict[tuple, dict]) -> UpdateStatus:
|
|
146
152
|
"""
|
|
147
153
|
Updates all tasks in this Label Studio project based on the Pixeltable data:
|
|
148
154
|
- Creates new tasks for rows that don't map to any existing task;
|
|
@@ -155,7 +161,7 @@ class LabelStudioProject(Project):
|
|
|
155
161
|
t_data_cols = [t_col for t_col, ext_col_name in self.col_mapping.items() if ext_col_name in config.data_keys]
|
|
156
162
|
|
|
157
163
|
if len(t_data_cols) == 0:
|
|
158
|
-
return
|
|
164
|
+
return UpdateStatus()
|
|
159
165
|
|
|
160
166
|
# Columns in `t` that map to `rectanglelabels` preannotations
|
|
161
167
|
t_rl_cols = [
|
|
@@ -183,15 +189,15 @@ class LabelStudioProject(Project):
|
|
|
183
189
|
self,
|
|
184
190
|
t: Table,
|
|
185
191
|
existing_tasks: dict[tuple, dict],
|
|
186
|
-
media_col:
|
|
187
|
-
t_rl_cols: list[
|
|
192
|
+
media_col: ColumnHandle,
|
|
193
|
+
t_rl_cols: list[ColumnHandle],
|
|
188
194
|
rl_info: list['_RectangleLabel'],
|
|
189
|
-
) ->
|
|
190
|
-
is_stored = media_col.is_stored
|
|
195
|
+
) -> UpdateStatus:
|
|
196
|
+
is_stored = media_col.get().is_stored
|
|
191
197
|
# If it's a stored column, we can use `localpath`
|
|
192
|
-
localpath_col_opt = [t[media_col.name].localpath] if is_stored else []
|
|
198
|
+
localpath_col_opt = [t[media_col.get().name].localpath] if is_stored else []
|
|
193
199
|
# Select the media column, rectanglelabels columns, and localpath (if appropriate)
|
|
194
|
-
rows = t.select(t[media_col.name], *[t[col.name] for col in t_rl_cols], *localpath_col_opt)
|
|
200
|
+
rows = t.select(t[media_col.get().name], *[t[col.get().name] for col in t_rl_cols], *localpath_col_opt)
|
|
195
201
|
tasks_created = 0
|
|
196
202
|
row_ids_in_pxt: set[tuple] = set()
|
|
197
203
|
|
|
@@ -232,42 +238,42 @@ class LabelStudioProject(Project):
|
|
|
232
238
|
|
|
233
239
|
env.Env.get().console_logger.info(f'Created {tasks_created} new task(s) in {self}.')
|
|
234
240
|
|
|
235
|
-
sync_status =
|
|
241
|
+
sync_status = UpdateStatus(ext_row_count_stats=RowCountStats(ins_rows=tasks_created))
|
|
236
242
|
|
|
237
243
|
deletion_sync_status = self.__delete_stale_tasks(existing_tasks, row_ids_in_pxt, tasks_created)
|
|
238
|
-
|
|
239
|
-
return sync_status
|
|
244
|
+
sync_status += deletion_sync_status
|
|
245
|
+
return sync_status
|
|
240
246
|
|
|
241
247
|
def __update_tasks_by_files(
|
|
242
248
|
self,
|
|
243
249
|
t: Table,
|
|
244
250
|
existing_tasks: dict[tuple, dict],
|
|
245
|
-
t_data_cols: list[
|
|
246
|
-
t_rl_cols: list[
|
|
251
|
+
t_data_cols: list[ColumnHandle],
|
|
252
|
+
t_rl_cols: list[ColumnHandle],
|
|
247
253
|
rl_info: list['_RectangleLabel'],
|
|
248
|
-
) ->
|
|
254
|
+
) -> UpdateStatus:
|
|
249
255
|
ext_data_cols = [self.col_mapping[col] for col in t_data_cols]
|
|
250
256
|
expr_refs: dict[str, Expr] = {} # kwargs for the select statement
|
|
251
257
|
for col in t_data_cols:
|
|
252
|
-
col_name = col.name
|
|
258
|
+
col_name = col.get().name
|
|
253
259
|
if self.media_import_method == 'url':
|
|
254
260
|
expr_refs[col_name] = t[col_name].fileurl
|
|
255
261
|
else:
|
|
256
262
|
assert self.media_import_method == 'file'
|
|
257
|
-
if not col.col_type.is_media_type():
|
|
263
|
+
if not col.get().col_type.is_media_type():
|
|
258
264
|
# Not a media column; query the data directly
|
|
259
|
-
expr_refs[col_name] =
|
|
265
|
+
expr_refs[col_name] = t[col_name]
|
|
260
266
|
elif col in self.stored_proxies:
|
|
261
267
|
# Media column that has a stored proxy; use it. We have to give it a name,
|
|
262
268
|
# since it's an anonymous column
|
|
263
|
-
stored_proxy_col = self.stored_proxies[col]
|
|
269
|
+
stored_proxy_col = self.stored_proxies[col].get()
|
|
264
270
|
expr_refs[f'{col_name}_proxy'] = ColumnRef(stored_proxy_col).localpath
|
|
265
271
|
else:
|
|
266
272
|
# Media column without a stored proxy; this means it's a stored computed column,
|
|
267
273
|
# and we can just use the localpath
|
|
268
274
|
expr_refs[col_name] = t[col_name].localpath
|
|
269
275
|
|
|
270
|
-
df = t.select(*[t[col.name] for col in t_rl_cols], **expr_refs)
|
|
276
|
+
df = t.select(*[t[col.get().name] for col in t_rl_cols], **expr_refs)
|
|
271
277
|
# The following buffers will hold `DataRow` indices that correspond to each of the selected
|
|
272
278
|
# columns. `rl_col_idxs` holds the indices for the columns that map to RectangleLabels
|
|
273
279
|
# preannotations; `data_col_idxs` holds the indices for the columns that map to data fields.
|
|
@@ -286,11 +292,11 @@ class LabelStudioProject(Project):
|
|
|
286
292
|
data_vals = [row[idx] for idx in data_col_idxs]
|
|
287
293
|
coco_annotations = [row[idx] for idx in rl_col_idxs]
|
|
288
294
|
for i in range(len(t_data_cols)):
|
|
289
|
-
if t_data_cols[i].col_type.is_media_type():
|
|
295
|
+
if t_data_cols[i].get().col_type.is_media_type():
|
|
290
296
|
# Special handling for media columns
|
|
291
297
|
assert isinstance(data_vals[i], str)
|
|
292
298
|
if self.media_import_method == 'url':
|
|
293
|
-
data_vals[i] = self.__validate_fileurl(t_data_cols[i], data_vals[i])
|
|
299
|
+
data_vals[i] = self.__validate_fileurl(t_data_cols[i].get(), data_vals[i])
|
|
294
300
|
else:
|
|
295
301
|
assert self.media_import_method == 'file'
|
|
296
302
|
data_vals[i] = self.__localpath_to_lspath(data_vals[i])
|
|
@@ -336,11 +342,11 @@ class LabelStudioProject(Project):
|
|
|
336
342
|
f'Created {tasks_created} new task(s) and updated {tasks_updated} existing task(s) in {self}.'
|
|
337
343
|
)
|
|
338
344
|
|
|
339
|
-
sync_status =
|
|
345
|
+
sync_status = UpdateStatus(ext_row_count_stats=RowCountStats(ins_rows=tasks_created, upd_rows=tasks_updated))
|
|
340
346
|
|
|
341
347
|
deletion_sync_status = self.__delete_stale_tasks(existing_tasks, row_ids_in_pxt, tasks_created)
|
|
342
|
-
|
|
343
|
-
return sync_status
|
|
348
|
+
sync_status += deletion_sync_status
|
|
349
|
+
return sync_status
|
|
344
350
|
|
|
345
351
|
@classmethod
|
|
346
352
|
def __validate_fileurl(cls, col: Column, url: str) -> Optional[str]:
|
|
@@ -361,7 +367,7 @@ class LabelStudioProject(Project):
|
|
|
361
367
|
|
|
362
368
|
def __delete_stale_tasks(
|
|
363
369
|
self, existing_tasks: dict[tuple, dict], row_ids_in_pxt: set[tuple], tasks_created: int
|
|
364
|
-
) ->
|
|
370
|
+
) -> UpdateStatus:
|
|
365
371
|
deleted_rowids = set(existing_tasks.keys()) - row_ids_in_pxt
|
|
366
372
|
# Sanity check the math
|
|
367
373
|
assert len(deleted_rowids) == len(existing_tasks) + tasks_created - len(row_ids_in_pxt)
|
|
@@ -377,11 +383,11 @@ class LabelStudioProject(Project):
|
|
|
377
383
|
for rowid in deleted_rowids:
|
|
378
384
|
del existing_tasks[rowid]
|
|
379
385
|
|
|
380
|
-
return
|
|
386
|
+
return UpdateStatus(ext_row_count_stats=RowCountStats(del_rows=len(deleted_rowids)))
|
|
381
387
|
|
|
382
|
-
def __update_table_from_tasks(self, t: Table, tasks: dict[tuple, dict]) ->
|
|
388
|
+
def __update_table_from_tasks(self, t: Table, tasks: dict[tuple, dict]) -> UpdateStatus:
|
|
383
389
|
if ANNOTATIONS_COLUMN not in self.col_mapping.values():
|
|
384
|
-
return
|
|
390
|
+
return UpdateStatus()
|
|
385
391
|
|
|
386
392
|
annotations = {
|
|
387
393
|
# Replace [] by None to indicate no annotations. We do want to sync rows with no annotations,
|
|
@@ -391,7 +397,7 @@ class LabelStudioProject(Project):
|
|
|
391
397
|
for task in tasks.values()
|
|
392
398
|
}
|
|
393
399
|
|
|
394
|
-
local_annotations_col = next(k for k, v in self.col_mapping.items() if v == ANNOTATIONS_COLUMN)
|
|
400
|
+
local_annotations_col = next(k for k, v in self.col_mapping.items() if v == ANNOTATIONS_COLUMN).get()
|
|
395
401
|
|
|
396
402
|
# Prune the annotations down to just the ones that have actually changed.
|
|
397
403
|
rows = t.select(t[local_annotations_col.name])
|
|
@@ -416,19 +422,17 @@ class LabelStudioProject(Project):
|
|
|
416
422
|
ancestor = ancestor._get_base_table()
|
|
417
423
|
update_status = ancestor.batch_update(updates)
|
|
418
424
|
env.Env.get().console_logger.info(f'Updated annotation(s) from {len(updates)} task(s) in {self}.')
|
|
419
|
-
return
|
|
425
|
+
return update_status
|
|
420
426
|
else:
|
|
421
|
-
return
|
|
427
|
+
return UpdateStatus()
|
|
422
428
|
|
|
423
429
|
def as_dict(self) -> dict[str, Any]:
|
|
424
430
|
return {
|
|
425
431
|
'name': self.name,
|
|
426
432
|
'project_id': self.project_id,
|
|
427
433
|
'media_import_method': self.media_import_method,
|
|
428
|
-
'col_mapping': [[
|
|
429
|
-
'stored_proxies': [
|
|
430
|
-
[self._column_as_dict(k), self._column_as_dict(v)] for k, v in self.stored_proxies.items()
|
|
431
|
-
],
|
|
434
|
+
'col_mapping': [[k.as_dict(), v] for k, v in self.col_mapping.items()],
|
|
435
|
+
'stored_proxies': [[k.as_dict(), v.as_dict()] for k, v in self.stored_proxies.items()],
|
|
432
436
|
}
|
|
433
437
|
|
|
434
438
|
@classmethod
|
|
@@ -437,8 +441,8 @@ class LabelStudioProject(Project):
|
|
|
437
441
|
md['name'],
|
|
438
442
|
md['project_id'],
|
|
439
443
|
md['media_import_method'],
|
|
440
|
-
{
|
|
441
|
-
{
|
|
444
|
+
{ColumnHandle.from_dict(entry[0]): entry[1] for entry in md['col_mapping']},
|
|
445
|
+
{ColumnHandle.from_dict(entry[0]): ColumnHandle.from_dict(entry[1]) for entry in md['stored_proxies']},
|
|
442
446
|
)
|
|
443
447
|
|
|
444
448
|
def __repr__(self) -> str:
|
pixeltable/metadata/__init__.py
CHANGED
|
@@ -18,7 +18,7 @@ _console_logger = ConsoleLogger(logging.getLogger('pixeltable'))
|
|
|
18
18
|
_logger = logging.getLogger('pixeltable')
|
|
19
19
|
|
|
20
20
|
# current version of the metadata; this is incremented whenever the metadata schema changes
|
|
21
|
-
VERSION =
|
|
21
|
+
VERSION = 40
|
|
22
22
|
|
|
23
23
|
|
|
24
24
|
def create_system_info(engine: sql.engine.Engine) -> None:
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
from typing import Any, Optional
|
|
2
|
+
|
|
3
|
+
import sqlalchemy as sql
|
|
4
|
+
|
|
5
|
+
from pixeltable.metadata import register_converter
|
|
6
|
+
from pixeltable.metadata.converters.util import convert_table_md
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
@register_converter(version=38)
|
|
10
|
+
def _(engine: sql.engine.Engine) -> None:
|
|
11
|
+
convert_table_md(engine, substitution_fn=__substitute_md)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def __substitute_md(k: Optional[str], v: Any) -> Optional[tuple[Optional[str], Any]]:
|
|
15
|
+
if k == 'col_mapping':
|
|
16
|
+
assert isinstance(v, list)
|
|
17
|
+
return k, [__col_mapping_entry(e) for e in v]
|
|
18
|
+
if k == 'stored_proxies':
|
|
19
|
+
assert isinstance(v, list)
|
|
20
|
+
return k, [__stored_proxies_entry(e) for e in v]
|
|
21
|
+
return None
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def __col_mapping_entry(e: list) -> list:
|
|
25
|
+
assert isinstance(e, list)
|
|
26
|
+
assert isinstance(e[0], dict)
|
|
27
|
+
assert isinstance(e[1], str)
|
|
28
|
+
return [__col_handle(e[0]), e[1]]
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def __stored_proxies_entry(e: list) -> list:
|
|
32
|
+
assert isinstance(e, list)
|
|
33
|
+
assert isinstance(e[0], dict)
|
|
34
|
+
assert isinstance(e[1], dict)
|
|
35
|
+
return [__col_handle(e[0]), __col_handle(e[1])]
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def __col_handle(e: dict) -> dict:
|
|
39
|
+
return {'tbl_version': {'id': e['tbl_id'], 'effective_version': None}, 'col_id': e['col_id']}
|