pixeltable-0.4.1-py3-none-any.whl → pixeltable-0.4.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pixeltable might be problematic. Click here for more details.
- pixeltable/__version__.py +2 -2
- pixeltable/catalog/__init__.py +2 -1
- pixeltable/catalog/catalog.py +75 -21
- pixeltable/catalog/column.py +10 -0
- pixeltable/catalog/globals.py +121 -18
- pixeltable/catalog/insertable_table.py +2 -1
- pixeltable/catalog/table.py +135 -4
- pixeltable/catalog/table_version.py +106 -66
- pixeltable/catalog/table_version_handle.py +26 -1
- pixeltable/catalog/view.py +4 -2
- pixeltable/exprs/column_property_ref.py +2 -11
- pixeltable/exprs/column_ref.py +19 -17
- pixeltable/exprs/data_row.py +9 -0
- pixeltable/exprs/row_builder.py +44 -13
- pixeltable/io/external_store.py +79 -52
- pixeltable/io/globals.py +1 -1
- pixeltable/io/label_studio.py +45 -41
- pixeltable/metadata/__init__.py +1 -1
- pixeltable/metadata/converters/convert_38.py +39 -0
- pixeltable/metadata/notes.py +1 -0
- pixeltable/metadata/utils.py +78 -0
- pixeltable/plan.py +22 -18
- pixeltable/store.py +114 -103
- {pixeltable-0.4.1.dist-info → pixeltable-0.4.2.dist-info}/METADATA +1 -1
- {pixeltable-0.4.1.dist-info → pixeltable-0.4.2.dist-info}/RECORD +28 -26
- {pixeltable-0.4.1.dist-info → pixeltable-0.4.2.dist-info}/LICENSE +0 -0
- {pixeltable-0.4.1.dist-info → pixeltable-0.4.2.dist-info}/WHEEL +0 -0
- {pixeltable-0.4.1.dist-info → pixeltable-0.4.2.dist-info}/entry_points.txt +0 -0
pixeltable/exprs/row_builder.py
CHANGED
|
@@ -63,6 +63,7 @@ class RowBuilder:
|
|
|
63
63
|
|
|
64
64
|
input_exprs: ExprSet
|
|
65
65
|
|
|
66
|
+
tbl: Optional[catalog.TableVersion] # reference table of the RowBuilder; used to identify pk columns for writes
|
|
66
67
|
table_columns: list[ColumnSlotIdx]
|
|
67
68
|
default_eval_ctx: EvalCtx
|
|
68
69
|
unstored_iter_args: dict[UUID, Expr]
|
|
@@ -93,7 +94,13 @@ class RowBuilder:
|
|
|
93
94
|
target_slot_idxs: list[int] # slot idxs of target exprs; might contain duplicates
|
|
94
95
|
target_exprs: list[Expr] # exprs corresponding to target_slot_idxs
|
|
95
96
|
|
|
96
|
-
def __init__(
|
|
97
|
+
def __init__(
|
|
98
|
+
self,
|
|
99
|
+
output_exprs: Sequence[Expr],
|
|
100
|
+
columns: Sequence[catalog.Column],
|
|
101
|
+
input_exprs: Iterable[Expr],
|
|
102
|
+
tbl: Optional[catalog.TableVersion] = None,
|
|
103
|
+
):
|
|
97
104
|
"""
|
|
98
105
|
Args:
|
|
99
106
|
output_exprs: list of Exprs to be evaluated
|
|
@@ -125,6 +132,7 @@ class RowBuilder:
|
|
|
125
132
|
# * further references to that column (eg, computed cols) need to resolve to the validating ColumnRef
|
|
126
133
|
from .column_ref import ColumnRef
|
|
127
134
|
|
|
135
|
+
self.tbl = tbl
|
|
128
136
|
self.table_columns: list[ColumnSlotIdx] = []
|
|
129
137
|
self.input_exprs = ExprSet()
|
|
130
138
|
validating_colrefs: dict[Expr, Expr] = {} # key: non-validating colref, value: corresp. validating colref
|
|
@@ -229,6 +237,7 @@ class RowBuilder:
|
|
|
229
237
|
|
|
230
238
|
def add_table_column(self, col: catalog.Column, slot_idx: int) -> None:
|
|
231
239
|
"""Record a column that is part of the table row"""
|
|
240
|
+
assert self.tbl is not None
|
|
232
241
|
self.table_columns.append(ColumnSlotIdx(col, slot_idx))
|
|
233
242
|
|
|
234
243
|
def output_slot_idxs(self) -> list[ColumnSlotIdx]:
|
|
@@ -427,33 +436,55 @@ class RowBuilder:
|
|
|
427
436
|
expr, f'expression {expr}', data_row.get_exc(expr.slot_idx), exc_tb, input_vals, 0
|
|
428
437
|
) from exc
|
|
429
438
|
|
|
430
|
-
def create_table_row(
|
|
439
|
+
def create_table_row(
|
|
440
|
+
self, data_row: DataRow, cols_with_excs: Optional[set[int]], pk: tuple[int, ...]
|
|
441
|
+
) -> tuple[list[Any], int]:
|
|
431
442
|
"""Create a table row from the slots that have an output column assigned
|
|
432
443
|
|
|
433
|
-
Return tuple[
|
|
444
|
+
Return tuple[list of row values in `self.table_columns` order, # of exceptions]
|
|
434
445
|
This excludes system columns.
|
|
435
446
|
"""
|
|
436
447
|
num_excs = 0
|
|
437
|
-
table_row:
|
|
448
|
+
table_row: list[Any] = list(pk)
|
|
438
449
|
for info in self.table_columns:
|
|
439
450
|
col, slot_idx = info.col, info.slot_idx
|
|
440
451
|
if data_row.has_exc(slot_idx):
|
|
441
|
-
# exceptions get stored in the errortype/-msg columns
|
|
442
452
|
exc = data_row.get_exc(slot_idx)
|
|
443
453
|
num_excs += 1
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
table_row
|
|
447
|
-
|
|
454
|
+
if cols_with_excs is not None:
|
|
455
|
+
cols_with_excs.add(col.id)
|
|
456
|
+
table_row.append(None)
|
|
457
|
+
if col.records_errors:
|
|
458
|
+
# exceptions get stored in the errortype/-msg columns
|
|
459
|
+
table_row.extend((type(exc).__name__, str(exc)))
|
|
448
460
|
else:
|
|
449
461
|
if col.col_type.is_image_type() and data_row.file_urls[slot_idx] is None:
|
|
450
462
|
# we have yet to store this image
|
|
451
463
|
filepath = str(MediaStore.prepare_media_path(col.tbl.id, col.id, col.tbl.version))
|
|
452
464
|
data_row.flush_img(slot_idx, filepath)
|
|
453
465
|
val = data_row.get_stored_val(slot_idx, col.get_sa_col_type())
|
|
454
|
-
table_row
|
|
455
|
-
|
|
456
|
-
|
|
457
|
-
table_row[col.errormsg_store_name()] = None
|
|
466
|
+
table_row.append(val)
|
|
467
|
+
if col.records_errors:
|
|
468
|
+
table_row.extend((None, None))
|
|
458
469
|
|
|
459
470
|
return table_row, num_excs
|
|
471
|
+
|
|
472
|
+
def store_column_names(self) -> tuple[list[str], dict[int, catalog.Column]]:
|
|
473
|
+
"""
|
|
474
|
+
Returns the list of store column names corresponding to the table_columns of this RowBuilder.
|
|
475
|
+
The second tuple element of the return value is a dictionary containing all media columns in the
|
|
476
|
+
table; it's the mapping {list_index: column}.
|
|
477
|
+
"""
|
|
478
|
+
assert self.tbl is not None, self.table_columns
|
|
479
|
+
store_col_names: list[str] = [pk_col.name for pk_col in self.tbl.store_tbl.pk_columns()]
|
|
480
|
+
media_cols: dict[int, catalog.Column] = {}
|
|
481
|
+
|
|
482
|
+
for col in self.table_columns:
|
|
483
|
+
if col.col.col_type.is_media_type():
|
|
484
|
+
media_cols[len(store_col_names)] = col.col
|
|
485
|
+
store_col_names.append(col.col.store_name())
|
|
486
|
+
if col.col.records_errors:
|
|
487
|
+
store_col_names.append(col.col.errortype_store_name())
|
|
488
|
+
store_col_names.append(col.col.errormsg_store_name())
|
|
489
|
+
|
|
490
|
+
return store_col_names, media_cols
|
pixeltable/io/external_store.py
CHANGED
|
@@ -3,14 +3,14 @@ from __future__ import annotations
|
|
|
3
3
|
import abc
|
|
4
4
|
import itertools
|
|
5
5
|
import logging
|
|
6
|
-
from dataclasses import dataclass
|
|
6
|
+
from dataclasses import dataclass, field
|
|
7
7
|
from typing import Any, Optional
|
|
8
|
-
from uuid import UUID
|
|
9
8
|
|
|
10
9
|
import pixeltable.exceptions as excs
|
|
11
10
|
import pixeltable.type_system as ts
|
|
12
11
|
from pixeltable import Column, Table
|
|
13
|
-
from pixeltable.catalog import TableVersion
|
|
12
|
+
from pixeltable.catalog import ColumnHandle, TableVersion
|
|
13
|
+
from pixeltable.catalog.globals import RowCountStats, UpdateStatus
|
|
14
14
|
|
|
15
15
|
_logger = logging.getLogger('pixeltable')
|
|
16
16
|
|
|
@@ -22,6 +22,8 @@ class ExternalStore(abc.ABC):
|
|
|
22
22
|
and stateful external stores.
|
|
23
23
|
"""
|
|
24
24
|
|
|
25
|
+
__name: str
|
|
26
|
+
|
|
25
27
|
def __init__(self, name: str) -> None:
|
|
26
28
|
self.__name = name
|
|
27
29
|
|
|
@@ -38,7 +40,7 @@ class ExternalStore(abc.ABC):
|
|
|
38
40
|
"""Removes store-specific metadata created in link()."""
|
|
39
41
|
|
|
40
42
|
@abc.abstractmethod
|
|
41
|
-
def get_local_columns(self) -> list[
|
|
43
|
+
def get_local_columns(self) -> list[ColumnHandle]:
|
|
42
44
|
"""
|
|
43
45
|
Gets a list of all local (Pixeltable) columns that are associated with this external store.
|
|
44
46
|
"""
|
|
@@ -63,9 +65,15 @@ class Project(ExternalStore, abc.ABC):
|
|
|
63
65
|
additional capabilities specific to such projects.
|
|
64
66
|
"""
|
|
65
67
|
|
|
66
|
-
|
|
68
|
+
_col_mapping: dict[ColumnHandle, str] # col -> external col name
|
|
69
|
+
stored_proxies: dict[ColumnHandle, ColumnHandle] # original col -> proxy col
|
|
67
70
|
|
|
68
|
-
def __init__(
|
|
71
|
+
def __init__(
|
|
72
|
+
self,
|
|
73
|
+
name: str,
|
|
74
|
+
col_mapping: dict[ColumnHandle, str],
|
|
75
|
+
stored_proxies: Optional[dict[ColumnHandle, ColumnHandle]],
|
|
76
|
+
):
|
|
69
77
|
super().__init__(name)
|
|
70
78
|
self._col_mapping = col_mapping
|
|
71
79
|
|
|
@@ -80,11 +88,11 @@ class Project(ExternalStore, abc.ABC):
|
|
|
80
88
|
# Note from aaron-siegel: This methodology is inefficient in the case where a table has many views with a high
|
|
81
89
|
# proportion of overlapping rows, all proxying the same base column.
|
|
82
90
|
if stored_proxies is None:
|
|
83
|
-
self.stored_proxies: dict[
|
|
91
|
+
self.stored_proxies: dict[ColumnHandle, ColumnHandle] = {}
|
|
84
92
|
else:
|
|
85
93
|
self.stored_proxies = stored_proxies
|
|
86
94
|
|
|
87
|
-
def get_local_columns(self) -> list[
|
|
95
|
+
def get_local_columns(self) -> list[ColumnHandle]:
|
|
88
96
|
return list(self.col_mapping.keys())
|
|
89
97
|
|
|
90
98
|
def link(self, tbl_version: TableVersion) -> None:
|
|
@@ -92,15 +100,16 @@ class Project(ExternalStore, abc.ABC):
|
|
|
92
100
|
# This ensures that the media in those columns resides in the media store.
|
|
93
101
|
# First determine which columns (if any) need stored proxies, but don't have one yet.
|
|
94
102
|
stored_proxies_needed: list[Column] = []
|
|
95
|
-
for
|
|
103
|
+
for col_handle in self.col_mapping:
|
|
104
|
+
col = col_handle.get()
|
|
96
105
|
if col.col_type.is_media_type() and not (col.is_stored and col.is_computed):
|
|
97
106
|
# If this column is already proxied in some other Project, use the existing proxy to avoid
|
|
98
107
|
# duplication. Otherwise, we'll create a new one.
|
|
99
108
|
for store in tbl_version.external_stores.values():
|
|
100
|
-
if isinstance(store, Project) and
|
|
101
|
-
self.stored_proxies[
|
|
109
|
+
if isinstance(store, Project) and col_handle in store.stored_proxies:
|
|
110
|
+
self.stored_proxies[col_handle] = store.stored_proxies[col_handle]
|
|
102
111
|
break
|
|
103
|
-
if
|
|
112
|
+
if col_handle not in self.stored_proxies:
|
|
104
113
|
# We didn't find it in an existing Project
|
|
105
114
|
stored_proxies_needed.append(col)
|
|
106
115
|
|
|
@@ -110,17 +119,20 @@ class Project(ExternalStore, abc.ABC):
|
|
|
110
119
|
proxy_cols = [self.create_stored_proxy(col) for col in stored_proxies_needed]
|
|
111
120
|
# Add the columns; this will also update table metadata.
|
|
112
121
|
tbl_version.add_columns(proxy_cols, print_stats=False, on_error='ignore')
|
|
122
|
+
self.stored_proxies.update(
|
|
123
|
+
{col.handle: proxy_col.handle for col, proxy_col in zip(stored_proxies_needed, proxy_cols)}
|
|
124
|
+
)
|
|
113
125
|
|
|
114
126
|
def unlink(self, tbl_version: TableVersion) -> None:
|
|
115
127
|
# Determine which stored proxies can be deleted. (A stored proxy can be deleted if it is not referenced by
|
|
116
128
|
# any *other* external store for this table.)
|
|
117
|
-
deletions_needed: set[
|
|
129
|
+
deletions_needed: set[ColumnHandle] = set(self.stored_proxies.values())
|
|
118
130
|
for name, store in tbl_version.external_stores.items():
|
|
119
131
|
if isinstance(store, Project) and name != self.name:
|
|
120
132
|
deletions_needed = deletions_needed.difference(set(store.stored_proxies.values()))
|
|
121
133
|
if len(deletions_needed) > 0:
|
|
122
|
-
_logger.info(f'Removing stored proxies for columns: {[col.name for col in deletions_needed]}')
|
|
123
|
-
tbl_version._drop_columns(deletions_needed)
|
|
134
|
+
_logger.info(f'Removing stored proxies for columns: {[col.get().name for col in deletions_needed]}')
|
|
135
|
+
tbl_version._drop_columns(col.get() for col in deletions_needed)
|
|
124
136
|
self.stored_proxies.clear()
|
|
125
137
|
|
|
126
138
|
def create_stored_proxy(self, col: Column) -> Column:
|
|
@@ -142,11 +154,10 @@ class Project(ExternalStore, abc.ABC):
|
|
|
142
154
|
computed_with=exprs.ColumnRef(col).apply(lambda x: x, col_type=col.col_type),
|
|
143
155
|
stored=True,
|
|
144
156
|
)
|
|
145
|
-
self.stored_proxies[col] = proxy_col
|
|
146
157
|
return proxy_col
|
|
147
158
|
|
|
148
159
|
@property
|
|
149
|
-
def col_mapping(self) -> dict[
|
|
160
|
+
def col_mapping(self) -> dict[ColumnHandle, str]:
|
|
150
161
|
return self._col_mapping
|
|
151
162
|
|
|
152
163
|
@abc.abstractmethod
|
|
@@ -181,7 +192,7 @@ class Project(ExternalStore, abc.ABC):
|
|
|
181
192
|
export_cols: dict[str, ts.ColumnType],
|
|
182
193
|
import_cols: dict[str, ts.ColumnType],
|
|
183
194
|
col_mapping: Optional[dict[str, str]],
|
|
184
|
-
) -> dict[
|
|
195
|
+
) -> dict[ColumnHandle, str]:
|
|
185
196
|
"""
|
|
186
197
|
Verifies that the specified `col_mapping` is valid. In particular, checks that:
|
|
187
198
|
(i) the keys of `col_mapping` are valid columns of the specified `Table`;
|
|
@@ -199,7 +210,7 @@ class Project(ExternalStore, abc.ABC):
|
|
|
199
210
|
if col_mapping is None:
|
|
200
211
|
col_mapping = {col: col for col in itertools.chain(export_cols.keys(), import_cols.keys())}
|
|
201
212
|
|
|
202
|
-
resolved_col_mapping: dict[
|
|
213
|
+
resolved_col_mapping: dict[ColumnHandle, str] = {}
|
|
203
214
|
|
|
204
215
|
# Validate names
|
|
205
216
|
t_cols = set(table._get_schema().keys())
|
|
@@ -223,7 +234,8 @@ class Project(ExternalStore, abc.ABC):
|
|
|
223
234
|
)
|
|
224
235
|
col_ref = table[t_col]
|
|
225
236
|
assert isinstance(col_ref, exprs.ColumnRef)
|
|
226
|
-
resolved_col_mapping[col_ref.col] = ext_col
|
|
237
|
+
resolved_col_mapping[col_ref.col.handle] = ext_col
|
|
238
|
+
|
|
227
239
|
# Validate column specs
|
|
228
240
|
t_col_types = table._get_schema()
|
|
229
241
|
for t_col, ext_col in col_mapping.items():
|
|
@@ -250,39 +262,56 @@ class Project(ExternalStore, abc.ABC):
|
|
|
250
262
|
)
|
|
251
263
|
return resolved_col_mapping
|
|
252
264
|
|
|
253
|
-
@classmethod
|
|
254
|
-
def _column_as_dict(cls, col: Column) -> dict[str, Any]:
|
|
255
|
-
return {'tbl_id': str(col.tbl.id), 'col_id': col.id}
|
|
256
265
|
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
266
|
+
@dataclass(frozen=True)
|
|
267
|
+
class SyncStatus:
|
|
268
|
+
# stats for the rows affected by the operation in the external store
|
|
269
|
+
ext_row_count_stats: RowCountStats = field(default_factory=RowCountStats)
|
|
260
270
|
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
return Catalog.get().get_tbl_version(tbl_id, None).cols_by_id[col_id]
|
|
271
|
+
# stats for the rows affected by the operation
|
|
272
|
+
row_count_stats: RowCountStats = field(default_factory=RowCountStats)
|
|
264
273
|
|
|
274
|
+
@property
|
|
275
|
+
def num_excs(self) -> int:
|
|
276
|
+
"""
|
|
277
|
+
Returns the total number of Pixeltable exceptions that occurred during the operation.
|
|
278
|
+
"""
|
|
279
|
+
return self.row_count_stats.num_excs
|
|
265
280
|
|
|
266
|
-
@
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
281
|
+
@property
|
|
282
|
+
def pxt_rows_updated(self) -> int:
|
|
283
|
+
"""
|
|
284
|
+
Returns the number of Pixeltable rows that were updated as a result of the operation.
|
|
285
|
+
"""
|
|
286
|
+
return self.row_count_stats.upd_rows
|
|
287
|
+
|
|
288
|
+
@property
|
|
289
|
+
def external_rows_updated(self) -> int:
|
|
290
|
+
return self.ext_row_count_stats.upd_rows
|
|
291
|
+
|
|
292
|
+
@property
|
|
293
|
+
def external_rows_created(self) -> int:
|
|
294
|
+
return self.ext_row_count_stats.ins_rows
|
|
295
|
+
|
|
296
|
+
@property
|
|
297
|
+
def external_rows_deleted(self) -> int:
|
|
298
|
+
return self.ext_row_count_stats.del_rows
|
|
273
299
|
|
|
274
|
-
def
|
|
300
|
+
def __add__(self, other: 'SyncStatus') -> 'SyncStatus':
|
|
301
|
+
"""
|
|
302
|
+
Add the sync status from two SyncStatus objects together.
|
|
303
|
+
"""
|
|
275
304
|
return SyncStatus(
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
external_rows_updated=self.external_rows_updated + other.external_rows_updated,
|
|
279
|
-
pxt_rows_updated=self.pxt_rows_updated + other.pxt_rows_updated,
|
|
280
|
-
num_excs=self.num_excs + other.num_excs,
|
|
305
|
+
ext_row_count_stats=self.ext_row_count_stats + other.ext_row_count_stats,
|
|
306
|
+
row_count_stats=self.row_count_stats + other.row_count_stats,
|
|
281
307
|
)
|
|
282
308
|
|
|
283
309
|
@classmethod
|
|
284
|
-
def
|
|
285
|
-
|
|
310
|
+
def from_update_status(cls, us: UpdateStatus) -> 'SyncStatus':
|
|
311
|
+
"""
|
|
312
|
+
Copy information from an UpdateStatus to a SyncStatus.
|
|
313
|
+
"""
|
|
314
|
+
return SyncStatus(row_count_stats=us.row_count_stats + us.cascade_row_count_stats)
|
|
286
315
|
|
|
287
316
|
|
|
288
317
|
class MockProject(Project):
|
|
@@ -293,8 +322,8 @@ class MockProject(Project):
|
|
|
293
322
|
name: str,
|
|
294
323
|
export_cols: dict[str, ts.ColumnType],
|
|
295
324
|
import_cols: dict[str, ts.ColumnType],
|
|
296
|
-
col_mapping: dict[
|
|
297
|
-
stored_proxies: Optional[dict[
|
|
325
|
+
col_mapping: dict[ColumnHandle, str],
|
|
326
|
+
stored_proxies: Optional[dict[ColumnHandle, ColumnHandle]] = None,
|
|
298
327
|
):
|
|
299
328
|
super().__init__(name, col_mapping, stored_proxies)
|
|
300
329
|
self.export_cols = export_cols
|
|
@@ -334,10 +363,8 @@ class MockProject(Project):
|
|
|
334
363
|
'name': self.name,
|
|
335
364
|
'export_cols': {k: v.as_dict() for k, v in self.export_cols.items()},
|
|
336
365
|
'import_cols': {k: v.as_dict() for k, v in self.import_cols.items()},
|
|
337
|
-
'col_mapping': [[
|
|
338
|
-
'stored_proxies': [
|
|
339
|
-
[self._column_as_dict(k), self._column_as_dict(v)] for k, v in self.stored_proxies.items()
|
|
340
|
-
],
|
|
366
|
+
'col_mapping': [[k.as_dict(), v] for k, v in self.col_mapping.items()],
|
|
367
|
+
'stored_proxies': [[k.as_dict(), v.as_dict()] for k, v in self.stored_proxies.items()],
|
|
341
368
|
}
|
|
342
369
|
|
|
343
370
|
@classmethod
|
|
@@ -346,8 +373,8 @@ class MockProject(Project):
|
|
|
346
373
|
md['name'],
|
|
347
374
|
{k: ts.ColumnType.from_dict(v) for k, v in md['export_cols'].items()},
|
|
348
375
|
{k: ts.ColumnType.from_dict(v) for k, v in md['import_cols'].items()},
|
|
349
|
-
{
|
|
350
|
-
{
|
|
376
|
+
{ColumnHandle.from_dict(entry[0]): entry[1] for entry in md['col_mapping']},
|
|
377
|
+
{ColumnHandle.from_dict(entry[0]): ColumnHandle.from_dict(entry[1]) for entry in md['stored_proxies']},
|
|
351
378
|
)
|
|
352
379
|
|
|
353
380
|
def __eq__(self, other: object) -> bool:
|
pixeltable/io/globals.py
CHANGED
pixeltable/io/label_studio.py
CHANGED
|
@@ -4,15 +4,17 @@ import logging
|
|
|
4
4
|
import os
|
|
5
5
|
from dataclasses import dataclass
|
|
6
6
|
from pathlib import Path
|
|
7
|
-
from typing import Any, Iterator, Literal, Optional
|
|
7
|
+
from typing import Any, Iterator, Literal, Optional
|
|
8
8
|
from xml.etree import ElementTree as ET
|
|
9
9
|
|
|
10
|
-
import label_studio_sdk
|
|
10
|
+
import label_studio_sdk
|
|
11
11
|
import PIL.Image
|
|
12
12
|
from requests.exceptions import HTTPError
|
|
13
13
|
|
|
14
14
|
import pixeltable.type_system as ts
|
|
15
15
|
from pixeltable import Column, Table, env, exceptions as excs
|
|
16
|
+
from pixeltable.catalog import ColumnHandle
|
|
17
|
+
from pixeltable.catalog.globals import RowCountStats
|
|
16
18
|
from pixeltable.config import Config
|
|
17
19
|
from pixeltable.exprs import ColumnRef, DataRow, Expr
|
|
18
20
|
from pixeltable.io.external_store import Project, SyncStatus
|
|
@@ -25,7 +27,7 @@ try:
|
|
|
25
27
|
import label_studio_sdk.project as ls_project # type: ignore
|
|
26
28
|
except ImportError:
|
|
27
29
|
# label_studio_sdk>=1 compatibility
|
|
28
|
-
import label_studio_sdk._legacy.project as ls_project
|
|
30
|
+
import label_studio_sdk._legacy.project as ls_project
|
|
29
31
|
|
|
30
32
|
_logger = logging.getLogger('pixeltable')
|
|
31
33
|
|
|
@@ -45,13 +47,17 @@ class LabelStudioProject(Project):
|
|
|
45
47
|
for synchronizing between a Pixeltable table and a Label Studio project.
|
|
46
48
|
"""
|
|
47
49
|
|
|
50
|
+
project_id: int # Label Studio project ID
|
|
51
|
+
media_import_method: Literal['post', 'file', 'url']
|
|
52
|
+
_project: Optional[ls_project.Project]
|
|
53
|
+
|
|
48
54
|
def __init__(
|
|
49
55
|
self,
|
|
50
56
|
name: str,
|
|
51
57
|
project_id: int,
|
|
52
58
|
media_import_method: Literal['post', 'file', 'url'],
|
|
53
|
-
col_mapping: dict[
|
|
54
|
-
stored_proxies: Optional[dict[
|
|
59
|
+
col_mapping: dict[ColumnHandle, str],
|
|
60
|
+
stored_proxies: Optional[dict[ColumnHandle, ColumnHandle]] = None,
|
|
55
61
|
):
|
|
56
62
|
"""
|
|
57
63
|
The constructor will NOT create a new Label Studio project; it is also used when loading
|
|
@@ -59,7 +65,7 @@ class LabelStudioProject(Project):
|
|
|
59
65
|
"""
|
|
60
66
|
self.project_id = project_id
|
|
61
67
|
self.media_import_method = media_import_method
|
|
62
|
-
self._project
|
|
68
|
+
self._project = None
|
|
63
69
|
super().__init__(name, col_mapping, stored_proxies)
|
|
64
70
|
|
|
65
71
|
@property
|
|
@@ -112,13 +118,13 @@ class LabelStudioProject(Project):
|
|
|
112
118
|
)
|
|
113
119
|
# Collect all existing tasks into a dict with entries `rowid: task`
|
|
114
120
|
tasks = {tuple(task['meta']['rowid']): task for task in self.__fetch_all_tasks()}
|
|
115
|
-
sync_status = SyncStatus
|
|
121
|
+
sync_status = SyncStatus()
|
|
116
122
|
if export_data:
|
|
117
123
|
export_sync_status = self.__update_tasks(t, tasks)
|
|
118
|
-
sync_status
|
|
124
|
+
sync_status += export_sync_status
|
|
119
125
|
if import_data:
|
|
120
126
|
import_sync_status = self.__update_table_from_tasks(t, tasks)
|
|
121
|
-
sync_status
|
|
127
|
+
sync_status += import_sync_status
|
|
122
128
|
return sync_status
|
|
123
129
|
|
|
124
130
|
def __fetch_all_tasks(self) -> Iterator[dict[str, Any]]:
|
|
@@ -155,7 +161,7 @@ class LabelStudioProject(Project):
|
|
|
155
161
|
t_data_cols = [t_col for t_col, ext_col_name in self.col_mapping.items() if ext_col_name in config.data_keys]
|
|
156
162
|
|
|
157
163
|
if len(t_data_cols) == 0:
|
|
158
|
-
return SyncStatus
|
|
164
|
+
return SyncStatus()
|
|
159
165
|
|
|
160
166
|
# Columns in `t` that map to `rectanglelabels` preannotations
|
|
161
167
|
t_rl_cols = [
|
|
@@ -183,15 +189,15 @@ class LabelStudioProject(Project):
|
|
|
183
189
|
self,
|
|
184
190
|
t: Table,
|
|
185
191
|
existing_tasks: dict[tuple, dict],
|
|
186
|
-
media_col:
|
|
187
|
-
t_rl_cols: list[
|
|
192
|
+
media_col: ColumnHandle,
|
|
193
|
+
t_rl_cols: list[ColumnHandle],
|
|
188
194
|
rl_info: list['_RectangleLabel'],
|
|
189
195
|
) -> SyncStatus:
|
|
190
|
-
is_stored = media_col.is_stored
|
|
196
|
+
is_stored = media_col.get().is_stored
|
|
191
197
|
# If it's a stored column, we can use `localpath`
|
|
192
|
-
localpath_col_opt = [t[media_col.name].localpath] if is_stored else []
|
|
198
|
+
localpath_col_opt = [t[media_col.get().name].localpath] if is_stored else []
|
|
193
199
|
# Select the media column, rectanglelabels columns, and localpath (if appropriate)
|
|
194
|
-
rows = t.select(t[media_col.name], *[t[col.name] for col in t_rl_cols], *localpath_col_opt)
|
|
200
|
+
rows = t.select(t[media_col.get().name], *[t[col.get().name] for col in t_rl_cols], *localpath_col_opt)
|
|
195
201
|
tasks_created = 0
|
|
196
202
|
row_ids_in_pxt: set[tuple] = set()
|
|
197
203
|
|
|
@@ -232,42 +238,42 @@ class LabelStudioProject(Project):
|
|
|
232
238
|
|
|
233
239
|
env.Env.get().console_logger.info(f'Created {tasks_created} new task(s) in {self}.')
|
|
234
240
|
|
|
235
|
-
sync_status = SyncStatus(
|
|
241
|
+
sync_status = SyncStatus(ext_row_count_stats=RowCountStats(ins_rows=tasks_created))
|
|
236
242
|
|
|
237
243
|
deletion_sync_status = self.__delete_stale_tasks(existing_tasks, row_ids_in_pxt, tasks_created)
|
|
238
|
-
|
|
239
|
-
return sync_status
|
|
244
|
+
sync_status += deletion_sync_status
|
|
245
|
+
return sync_status
|
|
240
246
|
|
|
241
247
|
def __update_tasks_by_files(
|
|
242
248
|
self,
|
|
243
249
|
t: Table,
|
|
244
250
|
existing_tasks: dict[tuple, dict],
|
|
245
|
-
t_data_cols: list[
|
|
246
|
-
t_rl_cols: list[
|
|
251
|
+
t_data_cols: list[ColumnHandle],
|
|
252
|
+
t_rl_cols: list[ColumnHandle],
|
|
247
253
|
rl_info: list['_RectangleLabel'],
|
|
248
254
|
) -> SyncStatus:
|
|
249
255
|
ext_data_cols = [self.col_mapping[col] for col in t_data_cols]
|
|
250
256
|
expr_refs: dict[str, Expr] = {} # kwargs for the select statement
|
|
251
257
|
for col in t_data_cols:
|
|
252
|
-
col_name = col.name
|
|
258
|
+
col_name = col.get().name
|
|
253
259
|
if self.media_import_method == 'url':
|
|
254
260
|
expr_refs[col_name] = t[col_name].fileurl
|
|
255
261
|
else:
|
|
256
262
|
assert self.media_import_method == 'file'
|
|
257
|
-
if not col.col_type.is_media_type():
|
|
263
|
+
if not col.get().col_type.is_media_type():
|
|
258
264
|
# Not a media column; query the data directly
|
|
259
|
-
expr_refs[col_name] =
|
|
265
|
+
expr_refs[col_name] = t[col_name]
|
|
260
266
|
elif col in self.stored_proxies:
|
|
261
267
|
# Media column that has a stored proxy; use it. We have to give it a name,
|
|
262
268
|
# since it's an anonymous column
|
|
263
|
-
stored_proxy_col = self.stored_proxies[col]
|
|
269
|
+
stored_proxy_col = self.stored_proxies[col].get()
|
|
264
270
|
expr_refs[f'{col_name}_proxy'] = ColumnRef(stored_proxy_col).localpath
|
|
265
271
|
else:
|
|
266
272
|
# Media column without a stored proxy; this means it's a stored computed column,
|
|
267
273
|
# and we can just use the localpath
|
|
268
274
|
expr_refs[col_name] = t[col_name].localpath
|
|
269
275
|
|
|
270
|
-
df = t.select(*[t[col.name] for col in t_rl_cols], **expr_refs)
|
|
276
|
+
df = t.select(*[t[col.get().name] for col in t_rl_cols], **expr_refs)
|
|
271
277
|
# The following buffers will hold `DataRow` indices that correspond to each of the selected
|
|
272
278
|
# columns. `rl_col_idxs` holds the indices for the columns that map to RectangleLabels
|
|
273
279
|
# preannotations; `data_col_idxs` holds the indices for the columns that map to data fields.
|
|
@@ -286,11 +292,11 @@ class LabelStudioProject(Project):
|
|
|
286
292
|
data_vals = [row[idx] for idx in data_col_idxs]
|
|
287
293
|
coco_annotations = [row[idx] for idx in rl_col_idxs]
|
|
288
294
|
for i in range(len(t_data_cols)):
|
|
289
|
-
if t_data_cols[i].col_type.is_media_type():
|
|
295
|
+
if t_data_cols[i].get().col_type.is_media_type():
|
|
290
296
|
# Special handling for media columns
|
|
291
297
|
assert isinstance(data_vals[i], str)
|
|
292
298
|
if self.media_import_method == 'url':
|
|
293
|
-
data_vals[i] = self.__validate_fileurl(t_data_cols[i], data_vals[i])
|
|
299
|
+
data_vals[i] = self.__validate_fileurl(t_data_cols[i].get(), data_vals[i])
|
|
294
300
|
else:
|
|
295
301
|
assert self.media_import_method == 'file'
|
|
296
302
|
data_vals[i] = self.__localpath_to_lspath(data_vals[i])
|
|
@@ -336,11 +342,11 @@ class LabelStudioProject(Project):
|
|
|
336
342
|
f'Created {tasks_created} new task(s) and updated {tasks_updated} existing task(s) in {self}.'
|
|
337
343
|
)
|
|
338
344
|
|
|
339
|
-
sync_status = SyncStatus(
|
|
345
|
+
sync_status = SyncStatus(ext_row_count_stats=RowCountStats(ins_rows=tasks_created, upd_rows=tasks_updated))
|
|
340
346
|
|
|
341
347
|
deletion_sync_status = self.__delete_stale_tasks(existing_tasks, row_ids_in_pxt, tasks_created)
|
|
342
|
-
|
|
343
|
-
return sync_status
|
|
348
|
+
sync_status += deletion_sync_status
|
|
349
|
+
return sync_status
|
|
344
350
|
|
|
345
351
|
@classmethod
|
|
346
352
|
def __validate_fileurl(cls, col: Column, url: str) -> Optional[str]:
|
|
@@ -377,11 +383,11 @@ class LabelStudioProject(Project):
|
|
|
377
383
|
for rowid in deleted_rowids:
|
|
378
384
|
del existing_tasks[rowid]
|
|
379
385
|
|
|
380
|
-
return SyncStatus(
|
|
386
|
+
return SyncStatus(ext_row_count_stats=RowCountStats(del_rows=len(deleted_rowids)))
|
|
381
387
|
|
|
382
388
|
def __update_table_from_tasks(self, t: Table, tasks: dict[tuple, dict]) -> SyncStatus:
|
|
383
389
|
if ANNOTATIONS_COLUMN not in self.col_mapping.values():
|
|
384
|
-
return SyncStatus
|
|
390
|
+
return SyncStatus()
|
|
385
391
|
|
|
386
392
|
annotations = {
|
|
387
393
|
# Replace [] by None to indicate no annotations. We do want to sync rows with no annotations,
|
|
@@ -391,7 +397,7 @@ class LabelStudioProject(Project):
|
|
|
391
397
|
for task in tasks.values()
|
|
392
398
|
}
|
|
393
399
|
|
|
394
|
-
local_annotations_col = next(k for k, v in self.col_mapping.items() if v == ANNOTATIONS_COLUMN)
|
|
400
|
+
local_annotations_col = next(k for k, v in self.col_mapping.items() if v == ANNOTATIONS_COLUMN).get()
|
|
395
401
|
|
|
396
402
|
# Prune the annotations down to just the ones that have actually changed.
|
|
397
403
|
rows = t.select(t[local_annotations_col.name])
|
|
@@ -416,19 +422,17 @@ class LabelStudioProject(Project):
|
|
|
416
422
|
ancestor = ancestor._get_base_table()
|
|
417
423
|
update_status = ancestor.batch_update(updates)
|
|
418
424
|
env.Env.get().console_logger.info(f'Updated annotation(s) from {len(updates)} task(s) in {self}.')
|
|
419
|
-
return SyncStatus(
|
|
425
|
+
return SyncStatus.from_update_status(update_status)
|
|
420
426
|
else:
|
|
421
|
-
return SyncStatus
|
|
427
|
+
return SyncStatus()
|
|
422
428
|
|
|
423
429
|
def as_dict(self) -> dict[str, Any]:
|
|
424
430
|
return {
|
|
425
431
|
'name': self.name,
|
|
426
432
|
'project_id': self.project_id,
|
|
427
433
|
'media_import_method': self.media_import_method,
|
|
428
|
-
'col_mapping': [[
|
|
429
|
-
'stored_proxies': [
|
|
430
|
-
[self._column_as_dict(k), self._column_as_dict(v)] for k, v in self.stored_proxies.items()
|
|
431
|
-
],
|
|
434
|
+
'col_mapping': [[k.as_dict(), v] for k, v in self.col_mapping.items()],
|
|
435
|
+
'stored_proxies': [[k.as_dict(), v.as_dict()] for k, v in self.stored_proxies.items()],
|
|
432
436
|
}
|
|
433
437
|
|
|
434
438
|
@classmethod
|
|
@@ -437,8 +441,8 @@ class LabelStudioProject(Project):
|
|
|
437
441
|
md['name'],
|
|
438
442
|
md['project_id'],
|
|
439
443
|
md['media_import_method'],
|
|
440
|
-
{
|
|
441
|
-
{
|
|
444
|
+
{ColumnHandle.from_dict(entry[0]): entry[1] for entry in md['col_mapping']},
|
|
445
|
+
{ColumnHandle.from_dict(entry[0]): ColumnHandle.from_dict(entry[1]) for entry in md['stored_proxies']},
|
|
442
446
|
)
|
|
443
447
|
|
|
444
448
|
def __repr__(self) -> str:
|
pixeltable/metadata/__init__.py
CHANGED
|
@@ -18,7 +18,7 @@ _console_logger = ConsoleLogger(logging.getLogger('pixeltable'))
|
|
|
18
18
|
_logger = logging.getLogger('pixeltable')
|
|
19
19
|
|
|
20
20
|
# current version of the metadata; this is incremented whenever the metadata schema changes
|
|
21
|
-
VERSION =
|
|
21
|
+
VERSION = 39
|
|
22
22
|
|
|
23
23
|
|
|
24
24
|
def create_system_info(engine: sql.engine.Engine) -> None:
|