pixeltable 0.4.0rc3__py3-none-any.whl → 0.4.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pixeltable might be problematic. Click here for more details.
- pixeltable/__init__.py +1 -1
- pixeltable/__version__.py +2 -2
- pixeltable/catalog/__init__.py +11 -2
- pixeltable/catalog/catalog.py +407 -119
- pixeltable/catalog/column.py +38 -26
- pixeltable/catalog/globals.py +130 -15
- pixeltable/catalog/insertable_table.py +10 -9
- pixeltable/catalog/schema_object.py +6 -0
- pixeltable/catalog/table.py +245 -119
- pixeltable/catalog/table_version.py +142 -116
- pixeltable/catalog/table_version_handle.py +30 -2
- pixeltable/catalog/table_version_path.py +28 -4
- pixeltable/catalog/view.py +14 -20
- pixeltable/config.py +4 -0
- pixeltable/dataframe.py +10 -9
- pixeltable/env.py +5 -11
- pixeltable/exceptions.py +6 -0
- pixeltable/exec/exec_node.py +2 -0
- pixeltable/exec/expr_eval/expr_eval_node.py +4 -4
- pixeltable/exec/sql_node.py +47 -30
- pixeltable/exprs/column_property_ref.py +2 -10
- pixeltable/exprs/column_ref.py +24 -21
- pixeltable/exprs/data_row.py +9 -0
- pixeltable/exprs/expr.py +4 -4
- pixeltable/exprs/row_builder.py +44 -13
- pixeltable/func/__init__.py +1 -0
- pixeltable/func/mcp.py +74 -0
- pixeltable/func/query_template_function.py +4 -2
- pixeltable/func/tools.py +12 -2
- pixeltable/func/udf.py +2 -2
- pixeltable/functions/__init__.py +1 -0
- pixeltable/functions/groq.py +108 -0
- pixeltable/functions/huggingface.py +8 -6
- pixeltable/functions/mistralai.py +2 -13
- pixeltable/functions/openai.py +1 -6
- pixeltable/functions/replicate.py +2 -2
- pixeltable/functions/util.py +6 -1
- pixeltable/globals.py +0 -2
- pixeltable/io/external_store.py +81 -54
- pixeltable/io/globals.py +1 -1
- pixeltable/io/label_studio.py +49 -45
- pixeltable/io/table_data_conduit.py +1 -1
- pixeltable/metadata/__init__.py +1 -1
- pixeltable/metadata/converters/convert_37.py +15 -0
- pixeltable/metadata/converters/convert_38.py +39 -0
- pixeltable/metadata/notes.py +2 -0
- pixeltable/metadata/schema.py +5 -0
- pixeltable/metadata/utils.py +78 -0
- pixeltable/plan.py +59 -139
- pixeltable/share/packager.py +2 -2
- pixeltable/store.py +114 -103
- pixeltable/type_system.py +30 -0
- {pixeltable-0.4.0rc3.dist-info → pixeltable-0.4.2.dist-info}/METADATA +1 -1
- {pixeltable-0.4.0rc3.dist-info → pixeltable-0.4.2.dist-info}/RECORD +57 -53
- pixeltable/utils/sample.py +0 -25
- {pixeltable-0.4.0rc3.dist-info → pixeltable-0.4.2.dist-info}/LICENSE +0 -0
- {pixeltable-0.4.0rc3.dist-info → pixeltable-0.4.2.dist-info}/WHEEL +0 -0
- {pixeltable-0.4.0rc3.dist-info → pixeltable-0.4.2.dist-info}/entry_points.txt +0 -0
pixeltable/io/external_store.py
CHANGED
|
@@ -3,14 +3,14 @@ from __future__ import annotations
|
|
|
3
3
|
import abc
|
|
4
4
|
import itertools
|
|
5
5
|
import logging
|
|
6
|
-
from dataclasses import dataclass
|
|
6
|
+
from dataclasses import dataclass, field
|
|
7
7
|
from typing import Any, Optional
|
|
8
|
-
from uuid import UUID
|
|
9
8
|
|
|
10
9
|
import pixeltable.exceptions as excs
|
|
11
10
|
import pixeltable.type_system as ts
|
|
12
11
|
from pixeltable import Column, Table
|
|
13
|
-
from pixeltable.catalog import TableVersion
|
|
12
|
+
from pixeltable.catalog import ColumnHandle, TableVersion
|
|
13
|
+
from pixeltable.catalog.globals import RowCountStats, UpdateStatus
|
|
14
14
|
|
|
15
15
|
_logger = logging.getLogger('pixeltable')
|
|
16
16
|
|
|
@@ -22,6 +22,8 @@ class ExternalStore(abc.ABC):
|
|
|
22
22
|
and stateful external stores.
|
|
23
23
|
"""
|
|
24
24
|
|
|
25
|
+
__name: str
|
|
26
|
+
|
|
25
27
|
def __init__(self, name: str) -> None:
|
|
26
28
|
self.__name = name
|
|
27
29
|
|
|
@@ -38,7 +40,7 @@ class ExternalStore(abc.ABC):
|
|
|
38
40
|
"""Removes store-specific metadata created in link()."""
|
|
39
41
|
|
|
40
42
|
@abc.abstractmethod
|
|
41
|
-
def get_local_columns(self) -> list[
|
|
43
|
+
def get_local_columns(self) -> list[ColumnHandle]:
|
|
42
44
|
"""
|
|
43
45
|
Gets a list of all local (Pixeltable) columns that are associated with this external store.
|
|
44
46
|
"""
|
|
@@ -63,9 +65,15 @@ class Project(ExternalStore, abc.ABC):
|
|
|
63
65
|
additional capabilities specific to such projects.
|
|
64
66
|
"""
|
|
65
67
|
|
|
66
|
-
|
|
68
|
+
_col_mapping: dict[ColumnHandle, str] # col -> external col name
|
|
69
|
+
stored_proxies: dict[ColumnHandle, ColumnHandle] # original col -> proxy col
|
|
67
70
|
|
|
68
|
-
def __init__(
|
|
71
|
+
def __init__(
|
|
72
|
+
self,
|
|
73
|
+
name: str,
|
|
74
|
+
col_mapping: dict[ColumnHandle, str],
|
|
75
|
+
stored_proxies: Optional[dict[ColumnHandle, ColumnHandle]],
|
|
76
|
+
):
|
|
69
77
|
super().__init__(name)
|
|
70
78
|
self._col_mapping = col_mapping
|
|
71
79
|
|
|
@@ -80,11 +88,11 @@ class Project(ExternalStore, abc.ABC):
|
|
|
80
88
|
# Note from aaron-siegel: This methodology is inefficient in the case where a table has many views with a high
|
|
81
89
|
# proportion of overlapping rows, all proxying the same base column.
|
|
82
90
|
if stored_proxies is None:
|
|
83
|
-
self.stored_proxies: dict[
|
|
91
|
+
self.stored_proxies: dict[ColumnHandle, ColumnHandle] = {}
|
|
84
92
|
else:
|
|
85
93
|
self.stored_proxies = stored_proxies
|
|
86
94
|
|
|
87
|
-
def get_local_columns(self) -> list[
|
|
95
|
+
def get_local_columns(self) -> list[ColumnHandle]:
|
|
88
96
|
return list(self.col_mapping.keys())
|
|
89
97
|
|
|
90
98
|
def link(self, tbl_version: TableVersion) -> None:
|
|
@@ -92,15 +100,16 @@ class Project(ExternalStore, abc.ABC):
|
|
|
92
100
|
# This ensures that the media in those columns resides in the media store.
|
|
93
101
|
# First determine which columns (if any) need stored proxies, but don't have one yet.
|
|
94
102
|
stored_proxies_needed: list[Column] = []
|
|
95
|
-
for
|
|
103
|
+
for col_handle in self.col_mapping:
|
|
104
|
+
col = col_handle.get()
|
|
96
105
|
if col.col_type.is_media_type() and not (col.is_stored and col.is_computed):
|
|
97
106
|
# If this column is already proxied in some other Project, use the existing proxy to avoid
|
|
98
107
|
# duplication. Otherwise, we'll create a new one.
|
|
99
108
|
for store in tbl_version.external_stores.values():
|
|
100
|
-
if isinstance(store, Project) and
|
|
101
|
-
self.stored_proxies[
|
|
109
|
+
if isinstance(store, Project) and col_handle in store.stored_proxies:
|
|
110
|
+
self.stored_proxies[col_handle] = store.stored_proxies[col_handle]
|
|
102
111
|
break
|
|
103
|
-
if
|
|
112
|
+
if col_handle not in self.stored_proxies:
|
|
104
113
|
# We didn't find it in an existing Project
|
|
105
114
|
stored_proxies_needed.append(col)
|
|
106
115
|
|
|
@@ -110,17 +119,20 @@ class Project(ExternalStore, abc.ABC):
|
|
|
110
119
|
proxy_cols = [self.create_stored_proxy(col) for col in stored_proxies_needed]
|
|
111
120
|
# Add the columns; this will also update table metadata.
|
|
112
121
|
tbl_version.add_columns(proxy_cols, print_stats=False, on_error='ignore')
|
|
122
|
+
self.stored_proxies.update(
|
|
123
|
+
{col.handle: proxy_col.handle for col, proxy_col in zip(stored_proxies_needed, proxy_cols)}
|
|
124
|
+
)
|
|
113
125
|
|
|
114
126
|
def unlink(self, tbl_version: TableVersion) -> None:
|
|
115
127
|
# Determine which stored proxies can be deleted. (A stored proxy can be deleted if it is not referenced by
|
|
116
128
|
# any *other* external store for this table.)
|
|
117
|
-
deletions_needed: set[
|
|
129
|
+
deletions_needed: set[ColumnHandle] = set(self.stored_proxies.values())
|
|
118
130
|
for name, store in tbl_version.external_stores.items():
|
|
119
131
|
if isinstance(store, Project) and name != self.name:
|
|
120
132
|
deletions_needed = deletions_needed.difference(set(store.stored_proxies.values()))
|
|
121
133
|
if len(deletions_needed) > 0:
|
|
122
|
-
_logger.info(f'Removing stored proxies for columns: {[col.name for col in deletions_needed]}')
|
|
123
|
-
tbl_version._drop_columns(deletions_needed)
|
|
134
|
+
_logger.info(f'Removing stored proxies for columns: {[col.get().name for col in deletions_needed]}')
|
|
135
|
+
tbl_version._drop_columns(col.get() for col in deletions_needed)
|
|
124
136
|
self.stored_proxies.clear()
|
|
125
137
|
|
|
126
138
|
def create_stored_proxy(self, col: Column) -> Column:
|
|
@@ -142,11 +154,10 @@ class Project(ExternalStore, abc.ABC):
|
|
|
142
154
|
computed_with=exprs.ColumnRef(col).apply(lambda x: x, col_type=col.col_type),
|
|
143
155
|
stored=True,
|
|
144
156
|
)
|
|
145
|
-
self.stored_proxies[col] = proxy_col
|
|
146
157
|
return proxy_col
|
|
147
158
|
|
|
148
159
|
@property
|
|
149
|
-
def col_mapping(self) -> dict[
|
|
160
|
+
def col_mapping(self) -> dict[ColumnHandle, str]:
|
|
150
161
|
return self._col_mapping
|
|
151
162
|
|
|
152
163
|
@abc.abstractmethod
|
|
@@ -181,7 +192,7 @@ class Project(ExternalStore, abc.ABC):
|
|
|
181
192
|
export_cols: dict[str, ts.ColumnType],
|
|
182
193
|
import_cols: dict[str, ts.ColumnType],
|
|
183
194
|
col_mapping: Optional[dict[str, str]],
|
|
184
|
-
) -> dict[
|
|
195
|
+
) -> dict[ColumnHandle, str]:
|
|
185
196
|
"""
|
|
186
197
|
Verifies that the specified `col_mapping` is valid. In particular, checks that:
|
|
187
198
|
(i) the keys of `col_mapping` are valid columns of the specified `Table`;
|
|
@@ -199,10 +210,10 @@ class Project(ExternalStore, abc.ABC):
|
|
|
199
210
|
if col_mapping is None:
|
|
200
211
|
col_mapping = {col: col for col in itertools.chain(export_cols.keys(), import_cols.keys())}
|
|
201
212
|
|
|
202
|
-
resolved_col_mapping: dict[
|
|
213
|
+
resolved_col_mapping: dict[ColumnHandle, str] = {}
|
|
203
214
|
|
|
204
215
|
# Validate names
|
|
205
|
-
t_cols = set(table.
|
|
216
|
+
t_cols = set(table._get_schema().keys())
|
|
206
217
|
for t_col, ext_col in col_mapping.items():
|
|
207
218
|
if t_col not in t_cols:
|
|
208
219
|
if is_user_specified_col_mapping:
|
|
@@ -223,9 +234,10 @@ class Project(ExternalStore, abc.ABC):
|
|
|
223
234
|
)
|
|
224
235
|
col_ref = table[t_col]
|
|
225
236
|
assert isinstance(col_ref, exprs.ColumnRef)
|
|
226
|
-
resolved_col_mapping[col_ref.col] = ext_col
|
|
237
|
+
resolved_col_mapping[col_ref.col.handle] = ext_col
|
|
238
|
+
|
|
227
239
|
# Validate column specs
|
|
228
|
-
t_col_types = table.
|
|
240
|
+
t_col_types = table._get_schema()
|
|
229
241
|
for t_col, ext_col in col_mapping.items():
|
|
230
242
|
t_col_type = t_col_types[t_col]
|
|
231
243
|
if ext_col in export_cols:
|
|
@@ -250,39 +262,56 @@ class Project(ExternalStore, abc.ABC):
|
|
|
250
262
|
)
|
|
251
263
|
return resolved_col_mapping
|
|
252
264
|
|
|
253
|
-
@classmethod
|
|
254
|
-
def _column_as_dict(cls, col: Column) -> dict[str, Any]:
|
|
255
|
-
return {'tbl_id': str(col.tbl.id), 'col_id': col.id}
|
|
256
265
|
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
266
|
+
@dataclass(frozen=True)
|
|
267
|
+
class SyncStatus:
|
|
268
|
+
# stats for the rows affected by the operation in the external store
|
|
269
|
+
ext_row_count_stats: RowCountStats = field(default_factory=RowCountStats)
|
|
260
270
|
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
return Catalog.get().get_tbl_version(tbl_id, None).cols_by_id[col_id]
|
|
271
|
+
# stats for the rows affected by the operation
|
|
272
|
+
row_count_stats: RowCountStats = field(default_factory=RowCountStats)
|
|
264
273
|
|
|
274
|
+
@property
|
|
275
|
+
def num_excs(self) -> int:
|
|
276
|
+
"""
|
|
277
|
+
Returns the total number of Pixeltable exceptions that occurred during the operation.
|
|
278
|
+
"""
|
|
279
|
+
return self.row_count_stats.num_excs
|
|
265
280
|
|
|
266
|
-
@
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
281
|
+
@property
|
|
282
|
+
def pxt_rows_updated(self) -> int:
|
|
283
|
+
"""
|
|
284
|
+
Returns the number of Pixeltable rows that were updated as a result of the operation.
|
|
285
|
+
"""
|
|
286
|
+
return self.row_count_stats.upd_rows
|
|
287
|
+
|
|
288
|
+
@property
|
|
289
|
+
def external_rows_updated(self) -> int:
|
|
290
|
+
return self.ext_row_count_stats.upd_rows
|
|
291
|
+
|
|
292
|
+
@property
|
|
293
|
+
def external_rows_created(self) -> int:
|
|
294
|
+
return self.ext_row_count_stats.ins_rows
|
|
295
|
+
|
|
296
|
+
@property
|
|
297
|
+
def external_rows_deleted(self) -> int:
|
|
298
|
+
return self.ext_row_count_stats.del_rows
|
|
273
299
|
|
|
274
|
-
def
|
|
300
|
+
def __add__(self, other: 'SyncStatus') -> 'SyncStatus':
|
|
301
|
+
"""
|
|
302
|
+
Add the sync status from two SyncStatus objects together.
|
|
303
|
+
"""
|
|
275
304
|
return SyncStatus(
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
external_rows_updated=self.external_rows_updated + other.external_rows_updated,
|
|
279
|
-
pxt_rows_updated=self.pxt_rows_updated + other.pxt_rows_updated,
|
|
280
|
-
num_excs=self.num_excs + other.num_excs,
|
|
305
|
+
ext_row_count_stats=self.ext_row_count_stats + other.ext_row_count_stats,
|
|
306
|
+
row_count_stats=self.row_count_stats + other.row_count_stats,
|
|
281
307
|
)
|
|
282
308
|
|
|
283
309
|
@classmethod
|
|
284
|
-
def
|
|
285
|
-
|
|
310
|
+
def from_update_status(cls, us: UpdateStatus) -> 'SyncStatus':
|
|
311
|
+
"""
|
|
312
|
+
Copy information from an UpdateStatus to a SyncStatus.
|
|
313
|
+
"""
|
|
314
|
+
return SyncStatus(row_count_stats=us.row_count_stats + us.cascade_row_count_stats)
|
|
286
315
|
|
|
287
316
|
|
|
288
317
|
class MockProject(Project):
|
|
@@ -293,8 +322,8 @@ class MockProject(Project):
|
|
|
293
322
|
name: str,
|
|
294
323
|
export_cols: dict[str, ts.ColumnType],
|
|
295
324
|
import_cols: dict[str, ts.ColumnType],
|
|
296
|
-
col_mapping: dict[
|
|
297
|
-
stored_proxies: Optional[dict[
|
|
325
|
+
col_mapping: dict[ColumnHandle, str],
|
|
326
|
+
stored_proxies: Optional[dict[ColumnHandle, ColumnHandle]] = None,
|
|
298
327
|
):
|
|
299
328
|
super().__init__(name, col_mapping, stored_proxies)
|
|
300
329
|
self.export_cols = export_cols
|
|
@@ -334,10 +363,8 @@ class MockProject(Project):
|
|
|
334
363
|
'name': self.name,
|
|
335
364
|
'export_cols': {k: v.as_dict() for k, v in self.export_cols.items()},
|
|
336
365
|
'import_cols': {k: v.as_dict() for k, v in self.import_cols.items()},
|
|
337
|
-
'col_mapping': [[
|
|
338
|
-
'stored_proxies': [
|
|
339
|
-
[self._column_as_dict(k), self._column_as_dict(v)] for k, v in self.stored_proxies.items()
|
|
340
|
-
],
|
|
366
|
+
'col_mapping': [[k.as_dict(), v] for k, v in self.col_mapping.items()],
|
|
367
|
+
'stored_proxies': [[k.as_dict(), v.as_dict()] for k, v in self.stored_proxies.items()],
|
|
341
368
|
}
|
|
342
369
|
|
|
343
370
|
@classmethod
|
|
@@ -346,8 +373,8 @@ class MockProject(Project):
|
|
|
346
373
|
md['name'],
|
|
347
374
|
{k: ts.ColumnType.from_dict(v) for k, v in md['export_cols'].items()},
|
|
348
375
|
{k: ts.ColumnType.from_dict(v) for k, v in md['import_cols'].items()},
|
|
349
|
-
{
|
|
350
|
-
{
|
|
376
|
+
{ColumnHandle.from_dict(entry[0]): entry[1] for entry in md['col_mapping']},
|
|
377
|
+
{ColumnHandle.from_dict(entry[0]): ColumnHandle.from_dict(entry[1]) for entry in md['stored_proxies']},
|
|
351
378
|
)
|
|
352
379
|
|
|
353
380
|
def __eq__(self, other: object) -> bool:
|
pixeltable/io/globals.py
CHANGED
pixeltable/io/label_studio.py
CHANGED
|
@@ -4,15 +4,17 @@ import logging
|
|
|
4
4
|
import os
|
|
5
5
|
from dataclasses import dataclass
|
|
6
6
|
from pathlib import Path
|
|
7
|
-
from typing import Any, Iterator, Literal, Optional
|
|
7
|
+
from typing import Any, Iterator, Literal, Optional
|
|
8
8
|
from xml.etree import ElementTree as ET
|
|
9
9
|
|
|
10
|
-
import label_studio_sdk
|
|
10
|
+
import label_studio_sdk
|
|
11
11
|
import PIL.Image
|
|
12
12
|
from requests.exceptions import HTTPError
|
|
13
13
|
|
|
14
14
|
import pixeltable.type_system as ts
|
|
15
15
|
from pixeltable import Column, Table, env, exceptions as excs
|
|
16
|
+
from pixeltable.catalog import ColumnHandle
|
|
17
|
+
from pixeltable.catalog.globals import RowCountStats
|
|
16
18
|
from pixeltable.config import Config
|
|
17
19
|
from pixeltable.exprs import ColumnRef, DataRow, Expr
|
|
18
20
|
from pixeltable.io.external_store import Project, SyncStatus
|
|
@@ -25,7 +27,7 @@ try:
|
|
|
25
27
|
import label_studio_sdk.project as ls_project # type: ignore
|
|
26
28
|
except ImportError:
|
|
27
29
|
# label_studio_sdk>=1 compatibility
|
|
28
|
-
import label_studio_sdk._legacy.project as ls_project
|
|
30
|
+
import label_studio_sdk._legacy.project as ls_project
|
|
29
31
|
|
|
30
32
|
_logger = logging.getLogger('pixeltable')
|
|
31
33
|
|
|
@@ -45,13 +47,17 @@ class LabelStudioProject(Project):
|
|
|
45
47
|
for synchronizing between a Pixeltable table and a Label Studio project.
|
|
46
48
|
"""
|
|
47
49
|
|
|
50
|
+
project_id: int # Label Studio project ID
|
|
51
|
+
media_import_method: Literal['post', 'file', 'url']
|
|
52
|
+
_project: Optional[ls_project.Project]
|
|
53
|
+
|
|
48
54
|
def __init__(
|
|
49
55
|
self,
|
|
50
56
|
name: str,
|
|
51
57
|
project_id: int,
|
|
52
58
|
media_import_method: Literal['post', 'file', 'url'],
|
|
53
|
-
col_mapping: dict[
|
|
54
|
-
stored_proxies: Optional[dict[
|
|
59
|
+
col_mapping: dict[ColumnHandle, str],
|
|
60
|
+
stored_proxies: Optional[dict[ColumnHandle, ColumnHandle]] = None,
|
|
55
61
|
):
|
|
56
62
|
"""
|
|
57
63
|
The constructor will NOT create a new Label Studio project; it is also used when loading
|
|
@@ -59,7 +65,7 @@ class LabelStudioProject(Project):
|
|
|
59
65
|
"""
|
|
60
66
|
self.project_id = project_id
|
|
61
67
|
self.media_import_method = media_import_method
|
|
62
|
-
self._project
|
|
68
|
+
self._project = None
|
|
63
69
|
super().__init__(name, col_mapping, stored_proxies)
|
|
64
70
|
|
|
65
71
|
@property
|
|
@@ -112,13 +118,13 @@ class LabelStudioProject(Project):
|
|
|
112
118
|
)
|
|
113
119
|
# Collect all existing tasks into a dict with entries `rowid: task`
|
|
114
120
|
tasks = {tuple(task['meta']['rowid']): task for task in self.__fetch_all_tasks()}
|
|
115
|
-
sync_status = SyncStatus
|
|
121
|
+
sync_status = SyncStatus()
|
|
116
122
|
if export_data:
|
|
117
123
|
export_sync_status = self.__update_tasks(t, tasks)
|
|
118
|
-
sync_status
|
|
124
|
+
sync_status += export_sync_status
|
|
119
125
|
if import_data:
|
|
120
126
|
import_sync_status = self.__update_table_from_tasks(t, tasks)
|
|
121
|
-
sync_status
|
|
127
|
+
sync_status += import_sync_status
|
|
122
128
|
return sync_status
|
|
123
129
|
|
|
124
130
|
def __fetch_all_tasks(self) -> Iterator[dict[str, Any]]:
|
|
@@ -155,7 +161,7 @@ class LabelStudioProject(Project):
|
|
|
155
161
|
t_data_cols = [t_col for t_col, ext_col_name in self.col_mapping.items() if ext_col_name in config.data_keys]
|
|
156
162
|
|
|
157
163
|
if len(t_data_cols) == 0:
|
|
158
|
-
return SyncStatus
|
|
164
|
+
return SyncStatus()
|
|
159
165
|
|
|
160
166
|
# Columns in `t` that map to `rectanglelabels` preannotations
|
|
161
167
|
t_rl_cols = [
|
|
@@ -183,15 +189,15 @@ class LabelStudioProject(Project):
|
|
|
183
189
|
self,
|
|
184
190
|
t: Table,
|
|
185
191
|
existing_tasks: dict[tuple, dict],
|
|
186
|
-
media_col:
|
|
187
|
-
t_rl_cols: list[
|
|
192
|
+
media_col: ColumnHandle,
|
|
193
|
+
t_rl_cols: list[ColumnHandle],
|
|
188
194
|
rl_info: list['_RectangleLabel'],
|
|
189
195
|
) -> SyncStatus:
|
|
190
|
-
is_stored = media_col.is_stored
|
|
196
|
+
is_stored = media_col.get().is_stored
|
|
191
197
|
# If it's a stored column, we can use `localpath`
|
|
192
|
-
localpath_col_opt = [t[media_col.name].localpath] if is_stored else []
|
|
198
|
+
localpath_col_opt = [t[media_col.get().name].localpath] if is_stored else []
|
|
193
199
|
# Select the media column, rectanglelabels columns, and localpath (if appropriate)
|
|
194
|
-
rows = t.select(t[media_col.name], *[t[col.name] for col in t_rl_cols], *localpath_col_opt)
|
|
200
|
+
rows = t.select(t[media_col.get().name], *[t[col.get().name] for col in t_rl_cols], *localpath_col_opt)
|
|
195
201
|
tasks_created = 0
|
|
196
202
|
row_ids_in_pxt: set[tuple] = set()
|
|
197
203
|
|
|
@@ -232,42 +238,42 @@ class LabelStudioProject(Project):
|
|
|
232
238
|
|
|
233
239
|
env.Env.get().console_logger.info(f'Created {tasks_created} new task(s) in {self}.')
|
|
234
240
|
|
|
235
|
-
sync_status = SyncStatus(
|
|
241
|
+
sync_status = SyncStatus(ext_row_count_stats=RowCountStats(ins_rows=tasks_created))
|
|
236
242
|
|
|
237
243
|
deletion_sync_status = self.__delete_stale_tasks(existing_tasks, row_ids_in_pxt, tasks_created)
|
|
238
|
-
|
|
239
|
-
return sync_status
|
|
244
|
+
sync_status += deletion_sync_status
|
|
245
|
+
return sync_status
|
|
240
246
|
|
|
241
247
|
def __update_tasks_by_files(
|
|
242
248
|
self,
|
|
243
249
|
t: Table,
|
|
244
250
|
existing_tasks: dict[tuple, dict],
|
|
245
|
-
t_data_cols: list[
|
|
246
|
-
t_rl_cols: list[
|
|
251
|
+
t_data_cols: list[ColumnHandle],
|
|
252
|
+
t_rl_cols: list[ColumnHandle],
|
|
247
253
|
rl_info: list['_RectangleLabel'],
|
|
248
254
|
) -> SyncStatus:
|
|
249
255
|
ext_data_cols = [self.col_mapping[col] for col in t_data_cols]
|
|
250
256
|
expr_refs: dict[str, Expr] = {} # kwargs for the select statement
|
|
251
257
|
for col in t_data_cols:
|
|
252
|
-
col_name = col.name
|
|
258
|
+
col_name = col.get().name
|
|
253
259
|
if self.media_import_method == 'url':
|
|
254
260
|
expr_refs[col_name] = t[col_name].fileurl
|
|
255
261
|
else:
|
|
256
262
|
assert self.media_import_method == 'file'
|
|
257
|
-
if not col.col_type.is_media_type():
|
|
263
|
+
if not col.get().col_type.is_media_type():
|
|
258
264
|
# Not a media column; query the data directly
|
|
259
|
-
expr_refs[col_name] =
|
|
265
|
+
expr_refs[col_name] = t[col_name]
|
|
260
266
|
elif col in self.stored_proxies:
|
|
261
267
|
# Media column that has a stored proxy; use it. We have to give it a name,
|
|
262
268
|
# since it's an anonymous column
|
|
263
|
-
stored_proxy_col = self.stored_proxies[col]
|
|
269
|
+
stored_proxy_col = self.stored_proxies[col].get()
|
|
264
270
|
expr_refs[f'{col_name}_proxy'] = ColumnRef(stored_proxy_col).localpath
|
|
265
271
|
else:
|
|
266
272
|
# Media column without a stored proxy; this means it's a stored computed column,
|
|
267
273
|
# and we can just use the localpath
|
|
268
274
|
expr_refs[col_name] = t[col_name].localpath
|
|
269
275
|
|
|
270
|
-
df = t.select(*[t[col.name] for col in t_rl_cols], **expr_refs)
|
|
276
|
+
df = t.select(*[t[col.get().name] for col in t_rl_cols], **expr_refs)
|
|
271
277
|
# The following buffers will hold `DataRow` indices that correspond to each of the selected
|
|
272
278
|
# columns. `rl_col_idxs` holds the indices for the columns that map to RectangleLabels
|
|
273
279
|
# preannotations; `data_col_idxs` holds the indices for the columns that map to data fields.
|
|
@@ -286,11 +292,11 @@ class LabelStudioProject(Project):
|
|
|
286
292
|
data_vals = [row[idx] for idx in data_col_idxs]
|
|
287
293
|
coco_annotations = [row[idx] for idx in rl_col_idxs]
|
|
288
294
|
for i in range(len(t_data_cols)):
|
|
289
|
-
if t_data_cols[i].col_type.is_media_type():
|
|
295
|
+
if t_data_cols[i].get().col_type.is_media_type():
|
|
290
296
|
# Special handling for media columns
|
|
291
297
|
assert isinstance(data_vals[i], str)
|
|
292
298
|
if self.media_import_method == 'url':
|
|
293
|
-
data_vals[i] = self.__validate_fileurl(t_data_cols[i], data_vals[i])
|
|
299
|
+
data_vals[i] = self.__validate_fileurl(t_data_cols[i].get(), data_vals[i])
|
|
294
300
|
else:
|
|
295
301
|
assert self.media_import_method == 'file'
|
|
296
302
|
data_vals[i] = self.__localpath_to_lspath(data_vals[i])
|
|
@@ -336,11 +342,11 @@ class LabelStudioProject(Project):
|
|
|
336
342
|
f'Created {tasks_created} new task(s) and updated {tasks_updated} existing task(s) in {self}.'
|
|
337
343
|
)
|
|
338
344
|
|
|
339
|
-
sync_status = SyncStatus(
|
|
345
|
+
sync_status = SyncStatus(ext_row_count_stats=RowCountStats(ins_rows=tasks_created, upd_rows=tasks_updated))
|
|
340
346
|
|
|
341
347
|
deletion_sync_status = self.__delete_stale_tasks(existing_tasks, row_ids_in_pxt, tasks_created)
|
|
342
|
-
|
|
343
|
-
return sync_status
|
|
348
|
+
sync_status += deletion_sync_status
|
|
349
|
+
return sync_status
|
|
344
350
|
|
|
345
351
|
@classmethod
|
|
346
352
|
def __validate_fileurl(cls, col: Column, url: str) -> Optional[str]:
|
|
@@ -377,11 +383,11 @@ class LabelStudioProject(Project):
|
|
|
377
383
|
for rowid in deleted_rowids:
|
|
378
384
|
del existing_tasks[rowid]
|
|
379
385
|
|
|
380
|
-
return SyncStatus(
|
|
386
|
+
return SyncStatus(ext_row_count_stats=RowCountStats(del_rows=len(deleted_rowids)))
|
|
381
387
|
|
|
382
388
|
def __update_table_from_tasks(self, t: Table, tasks: dict[tuple, dict]) -> SyncStatus:
|
|
383
389
|
if ANNOTATIONS_COLUMN not in self.col_mapping.values():
|
|
384
|
-
return SyncStatus
|
|
390
|
+
return SyncStatus()
|
|
385
391
|
|
|
386
392
|
annotations = {
|
|
387
393
|
# Replace [] by None to indicate no annotations. We do want to sync rows with no annotations,
|
|
@@ -391,7 +397,7 @@ class LabelStudioProject(Project):
|
|
|
391
397
|
for task in tasks.values()
|
|
392
398
|
}
|
|
393
399
|
|
|
394
|
-
local_annotations_col = next(k for k, v in self.col_mapping.items() if v == ANNOTATIONS_COLUMN)
|
|
400
|
+
local_annotations_col = next(k for k, v in self.col_mapping.items() if v == ANNOTATIONS_COLUMN).get()
|
|
395
401
|
|
|
396
402
|
# Prune the annotations down to just the ones that have actually changed.
|
|
397
403
|
rows = t.select(t[local_annotations_col.name])
|
|
@@ -412,23 +418,21 @@ class LabelStudioProject(Project):
|
|
|
412
418
|
# TODO(aaron-siegel): Simplify this once propagation is properly implemented in batch_update
|
|
413
419
|
ancestor = t
|
|
414
420
|
while local_annotations_col not in ancestor._tbl_version.get().cols:
|
|
415
|
-
assert ancestor.
|
|
416
|
-
ancestor = ancestor.
|
|
421
|
+
assert ancestor._get_base_table is not None
|
|
422
|
+
ancestor = ancestor._get_base_table()
|
|
417
423
|
update_status = ancestor.batch_update(updates)
|
|
418
424
|
env.Env.get().console_logger.info(f'Updated annotation(s) from {len(updates)} task(s) in {self}.')
|
|
419
|
-
return SyncStatus(
|
|
425
|
+
return SyncStatus.from_update_status(update_status)
|
|
420
426
|
else:
|
|
421
|
-
return SyncStatus
|
|
427
|
+
return SyncStatus()
|
|
422
428
|
|
|
423
429
|
def as_dict(self) -> dict[str, Any]:
|
|
424
430
|
return {
|
|
425
431
|
'name': self.name,
|
|
426
432
|
'project_id': self.project_id,
|
|
427
433
|
'media_import_method': self.media_import_method,
|
|
428
|
-
'col_mapping': [[
|
|
429
|
-
'stored_proxies': [
|
|
430
|
-
[self._column_as_dict(k), self._column_as_dict(v)] for k, v in self.stored_proxies.items()
|
|
431
|
-
],
|
|
434
|
+
'col_mapping': [[k.as_dict(), v] for k, v in self.col_mapping.items()],
|
|
435
|
+
'stored_proxies': [[k.as_dict(), v.as_dict()] for k, v in self.stored_proxies.items()],
|
|
432
436
|
}
|
|
433
437
|
|
|
434
438
|
@classmethod
|
|
@@ -437,8 +441,8 @@ class LabelStudioProject(Project):
|
|
|
437
441
|
md['name'],
|
|
438
442
|
md['project_id'],
|
|
439
443
|
md['media_import_method'],
|
|
440
|
-
{
|
|
441
|
-
{
|
|
444
|
+
{ColumnHandle.from_dict(entry[0]): entry[1] for entry in md['col_mapping']},
|
|
445
|
+
{ColumnHandle.from_dict(entry[0]): ColumnHandle.from_dict(entry[1]) for entry in md['stored_proxies']},
|
|
442
446
|
)
|
|
443
447
|
|
|
444
448
|
def __repr__(self) -> str:
|
|
@@ -560,7 +564,7 @@ class LabelStudioProject(Project):
|
|
|
560
564
|
|
|
561
565
|
if name is None:
|
|
562
566
|
# Create a default name that's unique to the table
|
|
563
|
-
all_stores = t.external_stores
|
|
567
|
+
all_stores = t.external_stores()
|
|
564
568
|
n = 0
|
|
565
569
|
while f'ls_project_{n}' in all_stores:
|
|
566
570
|
n += 1
|
|
@@ -576,7 +580,7 @@ class LabelStudioProject(Project):
|
|
|
576
580
|
local_annotations_column = ANNOTATIONS_COLUMN
|
|
577
581
|
else:
|
|
578
582
|
local_annotations_column = next(k for k, v in col_mapping.items() if v == ANNOTATIONS_COLUMN)
|
|
579
|
-
if local_annotations_column not in t.
|
|
583
|
+
if local_annotations_column not in t._get_schema():
|
|
580
584
|
t.add_columns({local_annotations_column: ts.Json})
|
|
581
585
|
|
|
582
586
|
resolved_col_mapping = cls.validate_columns(
|
|
@@ -101,7 +101,7 @@ class TableDataConduit:
|
|
|
101
101
|
def add_table_info(self, table: pxt.Table) -> None:
|
|
102
102
|
"""Add information about the table into which we are inserting data"""
|
|
103
103
|
assert isinstance(table, pxt.Table)
|
|
104
|
-
self.pxt_schema = table.
|
|
104
|
+
self.pxt_schema = table._get_schema()
|
|
105
105
|
self.pxt_pk = table._tbl_version.get().primary_key
|
|
106
106
|
for col in table._tbl_version_path.columns():
|
|
107
107
|
if col.is_required_for_insert:
|
pixeltable/metadata/__init__.py
CHANGED
|
@@ -18,7 +18,7 @@ _console_logger = ConsoleLogger(logging.getLogger('pixeltable'))
|
|
|
18
18
|
_logger = logging.getLogger('pixeltable')
|
|
19
19
|
|
|
20
20
|
# current version of the metadata; this is incremented whenever the metadata schema changes
|
|
21
|
-
VERSION =
|
|
21
|
+
VERSION = 39
|
|
22
22
|
|
|
23
23
|
|
|
24
24
|
def create_system_info(engine: sql.engine.Engine) -> None:
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
from uuid import UUID
|
|
2
|
+
|
|
3
|
+
import sqlalchemy as sql
|
|
4
|
+
|
|
5
|
+
from pixeltable.metadata import register_converter
|
|
6
|
+
from pixeltable.metadata.converters.util import convert_table_md
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
@register_converter(version=37)
|
|
10
|
+
def _(engine: sql.engine.Engine) -> None:
|
|
11
|
+
convert_table_md(engine, table_md_updater=__update_table_md)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def __update_table_md(table_md: dict, _: UUID) -> None:
|
|
15
|
+
table_md['view_sn'] = 0
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
from typing import Any, Optional
|
|
2
|
+
|
|
3
|
+
import sqlalchemy as sql
|
|
4
|
+
|
|
5
|
+
from pixeltable.metadata import register_converter
|
|
6
|
+
from pixeltable.metadata.converters.util import convert_table_md
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
@register_converter(version=38)
|
|
10
|
+
def _(engine: sql.engine.Engine) -> None:
|
|
11
|
+
convert_table_md(engine, substitution_fn=__substitute_md)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def __substitute_md(k: Optional[str], v: Any) -> Optional[tuple[Optional[str], Any]]:
|
|
15
|
+
if k == 'col_mapping':
|
|
16
|
+
assert isinstance(v, list)
|
|
17
|
+
return k, [__col_mapping_entry(e) for e in v]
|
|
18
|
+
if k == 'stored_proxies':
|
|
19
|
+
assert isinstance(v, list)
|
|
20
|
+
return k, [__stored_proxies_entry(e) for e in v]
|
|
21
|
+
return None
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def __col_mapping_entry(e: list) -> list:
|
|
25
|
+
assert isinstance(e, list)
|
|
26
|
+
assert isinstance(e[0], dict)
|
|
27
|
+
assert isinstance(e[1], str)
|
|
28
|
+
return [__col_handle(e[0]), e[1]]
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def __stored_proxies_entry(e: list) -> list:
|
|
32
|
+
assert isinstance(e, list)
|
|
33
|
+
assert isinstance(e[0], dict)
|
|
34
|
+
assert isinstance(e[1], dict)
|
|
35
|
+
return [__col_handle(e[0]), __col_handle(e[1])]
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def __col_handle(e: dict) -> dict:
|
|
39
|
+
return {'tbl_version': {'id': e['tbl_id'], 'effective_version': None}, 'col_id': e['col_id']}
|
pixeltable/metadata/notes.py
CHANGED
|
@@ -2,6 +2,8 @@
|
|
|
2
2
|
# rather than as a comment, so that the existence of a description can be enforced by
|
|
3
3
|
# the unit tests when new versions are added.
|
|
4
4
|
VERSION_NOTES = {
|
|
5
|
+
39: 'ColumnHandles in external stores',
|
|
6
|
+
38: 'Added TableMd.view_sn',
|
|
5
7
|
37: 'Add support for the sample() method on DataFrames',
|
|
6
8
|
36: 'Added Table.lock_dummy',
|
|
7
9
|
35: 'Track reference_tbl in ColumnRef',
|
pixeltable/metadata/schema.py
CHANGED
|
@@ -177,6 +177,11 @@ class TableMd:
|
|
|
177
177
|
# - every row is assigned a unique and immutable rowid on insertion
|
|
178
178
|
next_row_id: int
|
|
179
179
|
|
|
180
|
+
# sequence number to track changes in the set of mutable views of this table (ie, this table = the view base)
|
|
181
|
+
# - incremented for each add/drop of a mutable view
|
|
182
|
+
# - only maintained for mutable tables
|
|
183
|
+
view_sn: int
|
|
184
|
+
|
|
180
185
|
# Metadata format for external stores:
|
|
181
186
|
# {'class': 'pixeltable.io.label_studio.LabelStudioProject', 'md': {'project_id': 3}}
|
|
182
187
|
external_stores: list[dict[str, Any]]
|