pixeltable 0.2.8__py3-none-any.whl → 0.2.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pixeltable might be problematic. Click here for more details.
- pixeltable/__init__.py +15 -33
- pixeltable/__version__.py +2 -2
- pixeltable/catalog/catalog.py +1 -1
- pixeltable/catalog/column.py +29 -11
- pixeltable/catalog/dir.py +2 -2
- pixeltable/catalog/insertable_table.py +5 -55
- pixeltable/catalog/named_function.py +2 -2
- pixeltable/catalog/schema_object.py +2 -7
- pixeltable/catalog/table.py +307 -186
- pixeltable/catalog/table_version.py +109 -63
- pixeltable/catalog/table_version_path.py +28 -5
- pixeltable/catalog/view.py +20 -10
- pixeltable/dataframe.py +129 -26
- pixeltable/env.py +29 -18
- pixeltable/exec/exec_context.py +5 -0
- pixeltable/exec/exec_node.py +1 -0
- pixeltable/exec/in_memory_data_node.py +29 -24
- pixeltable/exec/sql_scan_node.py +1 -1
- pixeltable/exprs/column_ref.py +13 -8
- pixeltable/exprs/data_row.py +4 -0
- pixeltable/exprs/expr.py +16 -1
- pixeltable/exprs/function_call.py +4 -4
- pixeltable/exprs/row_builder.py +29 -20
- pixeltable/exprs/similarity_expr.py +4 -3
- pixeltable/ext/functions/yolox.py +2 -1
- pixeltable/func/__init__.py +1 -0
- pixeltable/func/aggregate_function.py +14 -12
- pixeltable/func/callable_function.py +8 -6
- pixeltable/func/expr_template_function.py +13 -19
- pixeltable/func/function.py +3 -6
- pixeltable/func/query_template_function.py +84 -0
- pixeltable/func/signature.py +68 -23
- pixeltable/func/udf.py +13 -10
- pixeltable/functions/__init__.py +6 -91
- pixeltable/functions/eval.py +26 -14
- pixeltable/functions/fireworks.py +25 -23
- pixeltable/functions/globals.py +62 -0
- pixeltable/functions/huggingface.py +20 -16
- pixeltable/functions/image.py +170 -1
- pixeltable/functions/openai.py +95 -128
- pixeltable/functions/string.py +10 -2
- pixeltable/functions/together.py +95 -84
- pixeltable/functions/util.py +16 -0
- pixeltable/functions/video.py +94 -16
- pixeltable/functions/whisper.py +74 -0
- pixeltable/globals.py +1 -1
- pixeltable/io/__init__.py +10 -0
- pixeltable/io/external_store.py +370 -0
- pixeltable/io/globals.py +51 -22
- pixeltable/io/label_studio.py +639 -0
- pixeltable/io/parquet.py +1 -1
- pixeltable/iterators/__init__.py +9 -0
- pixeltable/iterators/string.py +40 -0
- pixeltable/metadata/__init__.py +6 -8
- pixeltable/metadata/converters/convert_10.py +2 -4
- pixeltable/metadata/converters/convert_12.py +7 -2
- pixeltable/metadata/converters/convert_13.py +6 -8
- pixeltable/metadata/converters/convert_14.py +2 -4
- pixeltable/metadata/converters/convert_15.py +44 -0
- pixeltable/metadata/converters/convert_16.py +18 -0
- pixeltable/metadata/converters/util.py +66 -0
- pixeltable/metadata/schema.py +3 -3
- pixeltable/plan.py +8 -7
- pixeltable/store.py +1 -1
- pixeltable/tool/create_test_db_dump.py +147 -54
- pixeltable/tool/embed_udf.py +9 -0
- pixeltable/type_system.py +1 -2
- pixeltable/utils/code.py +34 -0
- {pixeltable-0.2.8.dist-info → pixeltable-0.2.10.dist-info}/METADATA +1 -1
- pixeltable-0.2.10.dist-info/RECORD +131 -0
- pixeltable/datatransfer/__init__.py +0 -1
- pixeltable/datatransfer/label_studio.py +0 -452
- pixeltable/datatransfer/remote.py +0 -85
- pixeltable/functions/pil/image.py +0 -147
- pixeltable-0.2.8.dist-info/RECORD +0 -124
- {pixeltable-0.2.8.dist-info → pixeltable-0.2.10.dist-info}/LICENSE +0 -0
- {pixeltable-0.2.8.dist-info → pixeltable-0.2.10.dist-info}/WHEEL +0 -0
|
@@ -0,0 +1,639 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import logging
|
|
3
|
+
import os
|
|
4
|
+
from dataclasses import dataclass
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Any, Iterator, Optional, Literal
|
|
7
|
+
from xml.etree import ElementTree
|
|
8
|
+
|
|
9
|
+
import PIL.Image
|
|
10
|
+
import label_studio_sdk
|
|
11
|
+
from requests.exceptions import HTTPError
|
|
12
|
+
|
|
13
|
+
import pixeltable as pxt
|
|
14
|
+
import pixeltable.env as env
|
|
15
|
+
import pixeltable.exceptions as excs
|
|
16
|
+
from pixeltable import Table, Column
|
|
17
|
+
from pixeltable.exprs import ColumnRef, DataRow, Expr
|
|
18
|
+
from pixeltable.io.external_store import Project, SyncStatus
|
|
19
|
+
from pixeltable.utils import coco
|
|
20
|
+
|
|
21
|
+
_logger = logging.getLogger('pixeltable')
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
@env.register_client('label_studio')
def _(api_key: str, url: str) -> label_studio_sdk.Client:
    """Build the Label Studio SDK client from the given API key and server URL."""
    client = label_studio_sdk.Client(api_key=api_key, url=url)
    return client
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def _label_studio_client() -> label_studio_sdk.Client:
    """Return the client registered under the name 'label_studio' in the Pixeltable environment."""
    environment = env.Env.get()
    return environment.get_client('label_studio')
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class LabelStudioProject(Project):
|
|
34
|
+
"""
|
|
35
|
+
An [`ExternalStore`][pixeltable.io.ExternalStore] that represents a Label Studio project, providing functionality
|
|
36
|
+
for synchronizing between a Pixeltable table and a Label Studio project.
|
|
37
|
+
"""
|
|
38
|
+
|
|
39
|
+
def __init__(
    self,
    name: str,
    project_id: int,
    media_import_method: Literal['post', 'file', 'url'],
    col_mapping: dict[Column, str],
    stored_proxies: Optional[dict[Column, Column]] = None
):
    """
    Wraps an already-existing Label Studio project. No project is created on the server
    by this constructor; the same constructor is used when project metadata is loaded
    for a project that already exists.
    """
    # The SDK project handle is fetched lazily, on first access of the `project` property.
    self._project: Optional[label_studio_sdk.project.Project] = None
    self.media_import_method = media_import_method
    self.project_id = project_id
    super().__init__(name, col_mapping, stored_proxies)
|
|
55
|
+
|
|
56
|
+
@property
def project(self) -> label_studio_sdk.project.Project:
    """The SDK `Project` handle for this Label Studio project (fetched once, then cached)."""
    if self._project is not None:
        return self._project

    try:
        self._project = _label_studio_client().get_project(self.project_id)
    except HTTPError as exc:
        # Surface a Pixeltable error; the original HTTP error is kept as the cause.
        raise excs.Error(f'Could not locate Label Studio project: {self.project_id} '
                         '(cannot connect to server or project no longer exists)') from exc
    return self._project
|
|
66
|
+
|
|
67
|
+
@property
def project_params(self) -> dict[str, Any]:
    """The parameters of this Label Studio project, as reported by the SDK's `get_params()`."""
    ls_project = self.project
    return ls_project.get_params()
|
|
71
|
+
|
|
72
|
+
@property
def project_title(self) -> str:
    """The title of this Label Studio project (the `title` entry of the project parameters)."""
    params = self.project_params
    return params['title']
|
|
76
|
+
|
|
77
|
+
@property
def __project_config(self) -> '_LabelStudioConfig':
    """The parsed form of this project's `label_config` XML."""
    xml_config = self.project_params['label_config']
    return self.__parse_project_config(xml_config)
|
|
80
|
+
|
|
81
|
+
def get_export_columns(self) -> dict[str, pxt.ColumnType]:
    """
    Returns the data keys and preannotation fields declared by this Label Studio project's
    configuration, together with their Pixeltable column types.
    """
    config = self.__project_config
    return config.export_columns
|
|
86
|
+
|
|
87
|
+
def get_import_columns(self) -> dict[str, pxt.ColumnType]:
    """
    Returns the columns that can be imported from Label Studio. There is always exactly
    one entry, which holds the annotations:

    ```
    {"annotations": pxt.JsonType(nullable=True)}
    ```
    """
    import_columns: dict[str, pxt.ColumnType] = {}
    import_columns[ANNOTATIONS_COLUMN] = pxt.JsonType(nullable=True)
    return import_columns
|
|
96
|
+
|
|
97
|
+
def sync(self, t: Table, export_data: bool, import_data: bool) -> SyncStatus:
    """
    Synchronizes table `t` with this Label Studio project.

    When `export_data` is set, tasks in Label Studio are created/updated/deleted from the
    table's rows; when `import_data` is set, annotations are copied back into the table.
    Returns the combined `SyncStatus` of the steps that ran.
    """
    _logger.info(f'Syncing Label Studio project "{self.project_title}" with table `{t.get_name()}`'
                 f' (export: {export_data}, import: {import_data}).')

    # Index every existing task by the rowid recorded in its metadata.
    tasks: dict[tuple, dict] = {}
    for task in self.__fetch_all_tasks():
        tasks[tuple(task['meta']['rowid'])] = task

    status = SyncStatus.empty()
    if export_data:
        status = status.combine(self.__update_tasks(t, tasks))
    if import_data:
        status = status.combine(self.__update_table_from_tasks(t, tasks))
    return status
|
|
110
|
+
|
|
111
|
+
def __fetch_all_tasks(self) -> Iterator[dict[str, Any]]:
    """
    Yields every task in this Label Studio project whose metadata carries a `rowid`.

    Tasks are fetched page by page (`_PAGE_SIZE` per request). Tasks without a `rowid`
    are skipped and counted; a single warning is logged at the end if any were skipped.
    """
    unknown_task_count = 0
    page_number = 1
    while True:
        result = self.project.get_paginated_tasks(page=page_number, page_size=_PAGE_SIZE)
        if result.get('end_pagination'):
            break
        for task in result['tasks']:
            if task['meta'].get('rowid') is None:
                # Task was not created from a table row that we know of.
                unknown_task_count += 1
                continue
            yield task
        page_number += 1

    if unknown_task_count > 0:
        _logger.warning(
            f'Skipped {unknown_task_count} unrecognized task(s) when syncing Label Studio project "{self.project_title}".'
        )
|
|
130
|
+
|
|
131
|
+
def __update_tasks(self, t: Table, existing_tasks: dict[tuple, dict]) -> SyncStatus:
    """
    Updates all tasks in this Label Studio project based on the Pixeltable data:
    - Creates new tasks for rows that don't map to any existing task;
    - Updates existing tasks for rows whose data has changed;
    - Deletes any tasks whose rows no longer exist in the Pixeltable table.

    Args:
        t: The table whose rows are exported.
        existing_tasks: Existing Label Studio tasks, keyed by rowid.

    Returns:
        The `SyncStatus` of the export, or an empty status if no column of `t` is mapped
        to a Label Studio data key.
    """
    config = self.__project_config

    # Columns in `t` that map to Label Studio data keys
    t_data_cols = [
        t_col for t_col, ext_col_name in self.col_mapping.items()
        if ext_col_name in config.data_keys
    ]

    if len(t_data_cols) == 0:
        return SyncStatus.empty()

    # Columns in `t` that map to `rectanglelabels` preannotations
    t_rl_cols = [
        t_col for t_col, ext_col_name in self.col_mapping.items()
        if ext_col_name in config.rectangle_labels
    ]

    # Destinations for `rectanglelabels` preannotations. The helpers pair `rl_info[i]` with
    # `t_rl_cols[i]`, so the list is built from `t_rl_cols` (not from the config's own
    # ordering); otherwise an unmapped or differently-ordered RectangleLabels entry would
    # be matched with the wrong column.
    rl_info = [config.rectangle_labels[self.col_mapping[t_col]] for t_col in t_rl_cols]

    _logger.debug('`t_data_cols`: %s', t_data_cols)
    _logger.debug('`t_rl_cols`: %s', t_rl_cols)
    _logger.debug('`rl_info`: %s', rl_info)

    if self.media_import_method == 'post':
        # Send media to Label Studio by HTTP post.
        assert len(t_data_cols) == 1  # This was verified when the project was set up
        return self.__update_tasks_by_post(t, existing_tasks, t_data_cols[0], t_rl_cols, rl_info)

    if self.media_import_method in ('file', 'url'):
        # Send media to Label Studio by file reference (local file or URL).
        return self.__update_tasks_by_files(t, existing_tasks, t_data_cols, t_rl_cols, rl_info)

    # Unreachable for valid instances; raised explicitly so it is not stripped under `-O`.
    raise AssertionError(f'Unexpected media_import_method: {self.media_import_method!r}')
|
|
171
|
+
|
|
172
|
+
def __update_tasks_by_post(
        self,
        t: Table,
        existing_tasks: dict[tuple, dict],
        media_col: Column,
        t_rl_cols: list[Column],
        rl_info: list['_RectangleLabel']
) -> SyncStatus:
    """
    Exports rows to Label Studio by uploading each row's media file.

    For every row of `t` that has no entry in `existing_tasks`, the media file is uploaded
    (from its local path if `media_col` is stored, otherwise from a temporary PNG), the new
    task is tagged with the row's rowid, and predictions are created from the
    RectangleLabels columns. Rows that already have a task are left untouched. Finally,
    tasks whose rows are no longer present in the table are deleted.

    Args:
        t: The table being exported.
        existing_tasks: Existing tasks keyed by rowid; stale entries are removed from it.
        media_col: The single media column that maps to the project's data key.
        t_rl_cols: Columns that map to RectangleLabels preannotations.
        rl_info: RectangleLabels configuration; `rl_info[i]` is used with `t_rl_cols[i]`.

    Returns:
        The combined `SyncStatus` of task creation and stale-task deletion.
    """
    is_stored = media_col.is_stored
    # If it's a stored column, we can use `localpath`
    localpath_col_opt = [t[media_col.name].localpath] if is_stored else []
    # Select the media column, rectanglelabels columns, and localpath (if appropriate)
    rows = t.select(t[media_col.name], *[t[col.name] for col in t_rl_cols], *localpath_col_opt)
    tasks_created = 0
    row_ids_in_pxt: set[tuple] = set()

    for row in rows._exec():
        media_col_idx = rows._select_list_exprs[0].slot_idx
        rl_col_idxs = [expr.slot_idx for expr in rows._select_list_exprs[1: 1 + len(t_rl_cols)]]
        row_ids_in_pxt.add(row.rowid)
        if row.rowid in existing_tasks:
            continue  # A task already exists for this row

        # Upload the media file to Label Studio
        if is_stored:
            # There is an existing localpath; use it!
            localpath_col_idx = rows._select_list_exprs[-1].slot_idx
            file = Path(row[localpath_col_idx])
            task_id: int = self.project.import_tasks(file)[0]
        else:
            # No localpath; create a temp file and upload it
            assert isinstance(row[media_col_idx], PIL.Image.Image)
            file = env.Env.get().create_tmp_path(extension='.png')
            try:
                row[media_col_idx].save(file, format='png')
                task_id = self.project.import_tasks(file)[0]
            finally:
                # Always clean up the temp file, even if saving or uploading failed.
                Path(file).unlink(missing_ok=True)

        # Update the task with `rowid` metadata
        self.project.update_task(task_id, meta={'rowid': row.rowid})

        # Convert coco annotations to predictions
        coco_annotations = [row[i] for i in rl_col_idxs]
        _logger.debug('`coco_annotations`: %s', coco_annotations)
        predictions = [
            self.__coco_to_predictions(
                coco_annotations[i], self.col_mapping[t_rl_cols[i]], rl_info[i], task_id=task_id
            )
            for i in range(len(coco_annotations))
        ]
        _logger.debug('`predictions`: %s', predictions)
        self.project.create_predictions(predictions)
        tasks_created += 1

    print(f'Created {tasks_created} new task(s) in {self}.')

    sync_status = SyncStatus(external_rows_created=tasks_created)

    deletion_sync_status = self.__delete_stale_tasks(existing_tasks, row_ids_in_pxt, tasks_created)

    return sync_status.combine(deletion_sync_status)
|
|
230
|
+
|
|
231
|
+
def __update_tasks_by_files(
        self,
        t: Table,
        existing_tasks: dict[tuple, dict],
        t_data_cols: list[Column],
        t_rl_cols: list[Column],
        rl_info: list['_RectangleLabel']
) -> SyncStatus:
    """
    Exports rows to Label Studio by reference (file URL or local file path).

    For each row of `t`, builds the task payload (data fields, rowid metadata, predictions).
    Rows with an existing task are updated when their data or predictions differ from the
    existing task; rows without one are created in batches of `_PAGE_SIZE`. Finally, tasks
    whose rows are no longer present in the table are deleted.

    Args:
        t: The table being exported.
        existing_tasks: Existing tasks keyed by rowid; stale entries are removed from it.
        t_data_cols: Columns that map to Label Studio data keys.
        t_rl_cols: Columns that map to RectangleLabels preannotations.
        rl_info: RectangleLabels configuration; `rl_info[i]` is used with `t_rl_cols[i]`.

    Returns:
        The combined `SyncStatus` of task creation, updates, and stale-task deletion.
    """
    ext_data_cols = [self.col_mapping[col] for col in t_data_cols]
    expr_refs: dict[str, Expr] = {}  # kwargs for the select statement
    for col in t_data_cols:
        col_name = col.name
        if self.media_import_method == 'url':
            expr_refs[col_name] = t[col_name].fileurl
        else:
            assert self.media_import_method == 'file'
            if not col.col_type.is_media_type():
                # Not a media column; query the data directly
                expr_refs[col_name] = t[col_name]
            elif col in self.stored_proxies:
                # Media column that has a stored proxy; use it. We have to give it a name,
                # since it's an anonymous column
                stored_proxy_col = self.stored_proxies[col]
                expr_refs[f'{col_name}_proxy'] = ColumnRef(stored_proxy_col).localpath
            else:
                # Media column without a stored proxy; this means it's a stored computed column,
                # and we can just use the localpath
                expr_refs[col_name] = t[col_name].localpath

    # Index the table by column *name*, as everywhere else in this class (the original
    # passed the `Column` object itself).
    df = t.select(*[t[col.name] for col in t_rl_cols], **expr_refs)
    # The following buffers will hold `DataRow` indices that correspond to each of the selected
    # columns. `rl_col_idxs` holds the indices for the columns that map to RectangleLabels
    # preannotations; `data_col_idxs` holds the indices for the columns that map to data fields.
    # We have to wait until we begin iterating to populate them, so they're initially `None`.
    rl_col_idxs: Optional[list[int]] = None
    data_col_idxs: Optional[list[int]] = None

    row_ids_in_pxt: set[tuple] = set()
    tasks_created = 0
    tasks_updated = 0
    page: list[dict[str, Any]] = []  # buffer to hold tasks for paginated API calls

    # Function that turns a `DataRow` into a `dict` for creating or updating a task in the
    # Label Studio SDK.
    def create_task_info(row: DataRow) -> dict[str, Any]:
        data_vals = [row[idx] for idx in data_col_idxs]
        coco_annotations = [row[idx] for idx in rl_col_idxs]
        for i, data_col in enumerate(t_data_cols):
            if data_col.col_type.is_media_type():
                # Special handling for media columns
                assert isinstance(data_vals[i], str)
                if self.media_import_method == 'url':
                    data_vals[i] = self.__validate_fileurl(data_col, data_vals[i])
                else:
                    assert self.media_import_method == 'file'
                    data_vals[i] = self.__localpath_to_lspath(data_vals[i])
        predictions = [
            self.__coco_to_predictions(coco_annotations[i], self.col_mapping[t_rl_cols[i]], rl_info[i])
            for i in range(len(coco_annotations))
        ]
        return {
            'data': dict(zip(ext_data_cols, data_vals)),
            'meta': {'rowid': row.rowid},
            'predictions': predictions
        }

    for row in df._exec():
        if rl_col_idxs is None:
            rl_col_idxs = [expr.slot_idx for expr in df._select_list_exprs[:len(t_rl_cols)]]
            data_col_idxs = [expr.slot_idx for expr in df._select_list_exprs[len(t_rl_cols):]]
        row_ids_in_pxt.add(row.rowid)
        task_info = create_task_info(row)
        # TODO(aaron-siegel): Implement more efficient update logic (currently involves a full table scan)
        if row.rowid in existing_tasks:
            # A task for this row already exists; see if it needs an update.
            existing_task = existing_tasks[row.rowid]
            if task_info['data'] != existing_task['data'] or \
                    task_info['predictions'] != existing_task['predictions']:
                _logger.debug(f'Updating task for rowid {row.rowid}.')
                self.project.update_task(existing_task['id'], **task_info)
                tasks_updated += 1
        else:
            # No task exists for this row; we need to create one.
            page.append(task_info)
            tasks_created += 1
            if len(page) == _PAGE_SIZE:
                self.project.import_tasks(page)
                page.clear()

    # Flush the final, partially filled batch.
    if len(page) > 0:
        self.project.import_tasks(page)

    print(f'Created {tasks_created} new task(s) and updated {tasks_updated} existing task(s) in {self}.')

    sync_status = SyncStatus(external_rows_created=tasks_created, external_rows_updated=tasks_updated)

    deletion_sync_status = self.__delete_stale_tasks(existing_tasks, row_ids_in_pxt, tasks_created)

    return sync_status.combine(deletion_sync_status)
|
|
330
|
+
|
|
331
|
+
@classmethod
def __validate_fileurl(cls, col: Column, url: str) -> Optional[str]:
    """
    Returns `url` unchanged. If it is not an `http://` or `https://` URL, an info message is
    logged first, to help users debug media that does not show up in Label Studio.
    """
    is_http_url = url.startswith(('http://', 'https://'))
    if not is_http_url:
        _logger.info(
            f'URL found in media column `{col.name}` will not render correctly in Label Studio, since '
            f'it is not an HTTP URL: {url}'
        )
    return url
|
|
341
|
+
|
|
342
|
+
@classmethod
def __localpath_to_lspath(cls, localpath: str) -> str:
    """
    Converts a local file path into Label Studio's local-files path format
    (`/data/local-files/?d=<path relative to the Pixeltable home directory>`).
    """
    pxt_home = env.Env.get().home
    relpath = Path(localpath).relative_to(pxt_home)
    return f'/data/local-files/?d={relpath}'
|
|
347
|
+
|
|
348
|
+
def __delete_stale_tasks(self, existing_tasks: dict[tuple, dict], row_ids_in_pxt: set[tuple], tasks_created: int) -> SyncStatus:
    """
    Deletes the Label Studio tasks whose rows are no longer present in Pixeltable.

    Args:
        existing_tasks: Tasks that existed before the current export, keyed by rowid.
            Entries for deleted tasks are removed from this dict in place.
        row_ids_in_pxt: The rowids of all rows seen during the current export.
        tasks_created: Number of tasks created by the current export (used only for a
            consistency check).

    Returns:
        A `SyncStatus` recording the number of deleted tasks.
    """
    deleted_rowids = set(existing_tasks.keys()) - row_ids_in_pxt
    # Sanity check the math
    assert len(deleted_rowids) == len(existing_tasks) + tasks_created - len(row_ids_in_pxt)
    tasks_to_delete = [existing_tasks[rowid]['id'] for rowid in deleted_rowids]

    if len(tasks_to_delete) > 0:
        self.project.delete_tasks(tasks_to_delete)
        # Message wording fixed ("task(s)", not "tasks(s)") to match the other sync messages.
        print(f'Deleted {len(tasks_to_delete)} task(s) in {self} that are no longer present in Pixeltable.')

    # Remove them from the `existing_tasks` dict so that future updates are applied correctly
    for rowid in deleted_rowids:
        del existing_tasks[rowid]

    return SyncStatus(external_rows_deleted=len(deleted_rowids))
|
|
363
|
+
|
|
364
|
+
def __update_table_from_tasks(self, t: Table, tasks: dict[tuple, dict]) -> SyncStatus:
    """
    Copies annotations from the given Label Studio tasks into the table column that is
    mapped to `ANNOTATIONS_COLUMN`. Only rows whose stored value differs from the task's
    annotations are updated. Returns an empty status if no column is mapped or nothing changed.
    """
    if ANNOTATIONS_COLUMN not in self.col_mapping.values():
        return SyncStatus.empty()

    # Replace [] by None to indicate no annotations. We do want to sync rows with no annotations,
    # in order to properly handle the scenario where existing annotations have been deleted in
    # Label Studio.
    annotations = {}
    for task in tasks.values():
        rowid_key = tuple(task['meta']['rowid'])
        annotations[rowid_key] = task[ANNOTATIONS_COLUMN] if len(task[ANNOTATIONS_COLUMN]) > 0 else None

    local_annotations_col = next(k for k, v in self.col_mapping.items() if v == ANNOTATIONS_COLUMN)

    # Drop every annotation that already matches what is stored in the table.
    rows = t.select(t[local_annotations_col.name])
    for row in rows._exec():
        assert len(row.vals) == 1
        if row.rowid not in annotations:
            continue
        if annotations[row.rowid] == row[0]:
            del annotations[row.rowid]

    # Apply updates
    updates = [{'_rowid': rowid, local_annotations_col.name: ann} for rowid, ann in annotations.items()]
    if len(updates) == 0:
        return SyncStatus.empty()

    _logger.info(
        f'Updating table `{t.get_name()}`, column `{local_annotations_col.name}` with {len(updates)} total annotations.'
    )
    # batch_update currently doesn't propagate from views to base tables. As a workaround, we call
    # batch_update on the actual ancestor table that holds the annotations column.
    # TODO(aaron-siegel): Simplify this once propagation is properly implemented in batch_update
    ancestor = t
    while local_annotations_col not in ancestor._tbl_version.cols:
        assert ancestor.base is not None
        ancestor = ancestor.base
    update_status = ancestor.batch_update(updates)
    print(f'Updated annotation(s) from {len(updates)} task(s) in {self}.')
    return SyncStatus(pxt_rows_updated=update_status.num_rows, num_excs=update_status.num_excs)
|
|
403
|
+
|
|
404
|
+
def as_dict(self) -> dict[str, Any]:
    """Serializes this project's metadata into a dict; `from_dict` is the inverse."""
    col_mapping_md = [[self._column_as_dict(col), ext_name] for col, ext_name in self.col_mapping.items()]
    stored_proxies_md = [
        [self._column_as_dict(col), self._column_as_dict(proxy)]
        for col, proxy in self.stored_proxies.items()
    ]
    return {
        'name': self.name,
        'project_id': self.project_id,
        'media_import_method': self.media_import_method,
        'col_mapping': col_mapping_md,
        'stored_proxies': stored_proxies_md
    }
|
|
412
|
+
|
|
413
|
+
@classmethod
def from_dict(cls, md: dict[str, Any]) -> 'LabelStudioProject':
    """Rebuilds a `LabelStudioProject` from the metadata dict produced by `as_dict`."""
    col_mapping = {cls._column_from_dict(entry[0]): entry[1] for entry in md['col_mapping']}
    stored_proxies = {
        cls._column_from_dict(entry[0]): cls._column_from_dict(entry[1])
        for entry in md['stored_proxies']
    }
    return LabelStudioProject(
        md['name'],
        md['project_id'],
        md['media_import_method'],
        col_mapping,
        stored_proxies
    )
|
|
422
|
+
|
|
423
|
+
def __repr__(self) -> str:
    """Human-readable name that includes the project's title (looked up from the project parameters)."""
    name = self.project_title
    return f'LabelStudioProject `{name}`'
|
|
426
|
+
|
|
427
|
+
@classmethod
def __parse_project_config(cls, xml_config: str) -> '_LabelStudioConfig':
    """
    Parses a Label Studio XML config into a validated `_LabelStudioConfig`, containing the
    project's data keys and RectangleLabels definitions.

    Raises `excs.Error` if the root element is not a `View`.
    """
    root: ElementTree.Element = ElementTree.fromstring(xml_config)
    root_is_view = root.tag.lower() == 'view'
    if not root_is_view:
        raise excs.Error('Root of Label Studio config must be a `View`')

    data_keys = cls.__parse_data_keys_config(root)
    rectangle_labels = cls.__parse_rectangle_labels_config(root)
    config = _LabelStudioConfig(data_keys=data_keys, rectangle_labels=rectangle_labels)
    config.validate()
    return config
|
|
442
|
+
|
|
443
|
+
@classmethod
def __parse_data_keys_config(cls, root: ElementTree.Element) -> dict[str, '_DataKey']:
    """
    Parses the data keys from a Label Studio XML config.

    A data key is a direct child of `root` whose `value` attribute starts with `$`; the
    text after the `$` is the key's identifier.

    Args:
        root: The root element of the parsed Label Studio config.

    Returns:
        A dict mapping each data key identifier to its `_DataKey`.

    Raises:
        excs.Error: If a data key is declared on a tag that has no entry in `_LS_TAG_MAP`.
    """
    config: dict[str, '_DataKey'] = {}
    for element in root:
        # `startswith` (rather than indexing `[0]`) so that an empty `value=""` attribute
        # is simply skipped instead of raising an IndexError.
        if not element.attrib.get('value', '').startswith('$'):
            continue
        external_col_name = element.attrib['value'][1:]
        name = element.attrib.get('name')
        column_type = _LS_TAG_MAP.get(element.tag.lower())
        if column_type is None:
            raise excs.Error(
                f'Unsupported Label Studio data type: `{element.tag}` (in data key `{external_col_name}`)'
            )
        config[external_col_name] = _DataKey(name=name, column_type=column_type)
    return config
|
|
458
|
+
|
|
459
|
+
@classmethod
def __parse_rectangle_labels_config(cls, root: ElementTree.Element) -> dict[str, '_RectangleLabel']:
    """
    Collects the `RectangleLabels` elements that are direct children of `root`, keyed by
    their `name` attribute. Raises `excs.Error` if any label is not a COCO 2017 category name.
    """
    config: dict[str, '_RectangleLabel'] = {}
    for element in root:
        if element.tag.lower() != 'rectanglelabels':
            continue
        labels = []
        for child in element:
            if child.tag.lower() == 'label':
                labels.append(child.attrib['value'])
        name = element.attrib['name']
        to_name = element.attrib['toName']
        for label in labels:
            if label not in coco.COCO_2017_CATEGORIES.values():
                raise excs.Error(f'Label in `rectanglelabels` config is not a valid COCO object name: {label}')
        config[name] = _RectangleLabel(to_name=to_name, labels=labels)
    return config
|
|
476
|
+
|
|
477
|
+
@classmethod
def __coco_to_predictions(
        cls,
        coco_annotations: dict[str, Any],
        from_name: str,
        rl_info: '_RectangleLabel',
        task_id: Optional[int] = None
) -> dict[str, Any]:
    """
    Converts a COCO-style annotations dict into a Label Studio predictions dict.

    Only annotations whose COCO category name appears in `rl_info.labels` are kept. The
    returned dict has a `result` list and, when `task_id` is given, a `task` entry.
    """
    img_width = coco_annotations['image']['width']
    img_height = coco_annotations['image']['height']

    result = []
    for i, entry in enumerate(coco_annotations['annotations']):
        # include only the COCO labels that match a rectanglelabel name
        if coco.COCO_2017_CATEGORIES[entry['category']] not in rl_info.labels:
            continue
        result.append({
            'id': f'result_{i}',
            'type': 'rectanglelabels',
            'from_name': from_name,
            'to_name': rl_info.to_name,
            'image_rotation': 0,
            'original_width': img_width,
            'original_height': img_height,
            'value': {
                'rotation': 0,
                # Label Studio expects image coordinates as % of image dimensions
                'x': entry['bbox'][0] * 100.0 / img_width,
                'y': entry['bbox'][1] * 100.0 / img_height,
                'width': entry['bbox'][2] * 100.0 / img_width,
                'height': entry['bbox'][3] * 100.0 / img_height,
                'rectanglelabels': [coco.COCO_2017_CATEGORIES[entry['category']]]
            }
        })

    if task_id is None:
        return {'result': result}
    return {'task': task_id, 'result': result}
|
|
514
|
+
|
|
515
|
+
def delete(self) -> None:
    """
    Deletes this project from Label Studio, which removes all data and annotations that
    Label Studio holds for it.
    """
    # Look up the title first; it is no longer available once the project is gone.
    title = self.project_title
    client = _label_studio_client()
    client.delete_project(self.project_id)
    print(f'Deleted Label Studio project: {title}')
|
|
523
|
+
|
|
524
|
+
def __eq__(self, other) -> bool:
    """Two projects are equal when both are `LabelStudioProject`s with the same `project_id`."""
    if not isinstance(other, LabelStudioProject):
        return False
    return self.project_id == other.project_id
|
|
526
|
+
|
|
527
|
+
def __hash__(self) -> int:
    """Hash on `project_id`, consistent with `__eq__`."""
    project_id = self.project_id
    return hash(project_id)
|
|
529
|
+
|
|
530
|
+
@classmethod
def create(
        cls,
        t: Table,
        label_config: str,
        name: Optional[str],
        title: Optional[str],
        media_import_method: Literal['post', 'file', 'url'],
        col_mapping: Optional[dict[str, str]],
        **kwargs: Any
) -> 'LabelStudioProject':
    """
    Creates a new Label Studio project, using the Label Studio client configured in Pixeltable.

    Args:
        t: The table to link with the new project.
        label_config: The Label Studio XML configuration of the project.
        name: Name of the project within `t`; if `None`, the first unused name of the form
            `ls_project_<n>` is chosen.
        title: Title of the Label Studio project; defaults to the name of `t`.
        media_import_method: How media is sent to Label Studio (`post`, `file` or `url`).
        col_mapping: Mapping from table column names to Label Studio field names, or `None`.
        kwargs: Passed through to the Label Studio client's `start_project()`.

    Returns:
        A `LabelStudioProject` referring to the newly created project.

    Raises:
        excs.Error: If the configuration is invalid, if `post` is requested with more than
            one data key, or if `file` is requested but the Label Studio server is not
            configured for local file storage.
    """
    # Check that the config is valid before creating the project
    config = cls.__parse_project_config(label_config)

    if name is None:
        # Create a default name that's unique to the table
        all_stores = t.external_stores
        n = 0
        while f'ls_project_{n}' in all_stores:
            n += 1
        name = f'ls_project_{n}'

    if title is None:
        # `title` defaults to table name
        title = t.get_name()

    # Create a column to hold the annotations, if one does not yet exist
    if col_mapping is None or ANNOTATIONS_COLUMN in col_mapping.values():
        if col_mapping is None:
            local_annotations_column = ANNOTATIONS_COLUMN
        else:
            local_annotations_column = next(k for k, v in col_mapping.items() if v == ANNOTATIONS_COLUMN)
        if local_annotations_column not in t.column_names():
            t[local_annotations_column] = pxt.JsonType(nullable=True)

    resolved_col_mapping = cls.validate_columns(
        t, config.export_columns, {ANNOTATIONS_COLUMN: pxt.JsonType(nullable=True)}, col_mapping)

    # Perform some additional validation
    if media_import_method == 'post' and len(config.data_keys) > 1:
        raise excs.Error('`media_import_method` cannot be `post` if there is more than one data key')

    project = _label_studio_client().start_project(title=title, label_config=label_config, **kwargs)

    if media_import_method == 'file':
        # We need to set up a local storage connection to receive media files
        os.environ['LABEL_STUDIO_LOCAL_FILES_DOCUMENT_ROOT'] = str(env.Env.get().home)
        try:
            project.connect_local_import_storage(local_store_path=str(env.Env.get().media_dir))
        except HTTPError as exc:
            # The HTTP status lives on the attached response; `exc.errno` is not the HTTP
            # status code, so it cannot be used to detect a 400.
            status_code = exc.response.status_code if exc.response is not None else None
            if status_code == 400:
                response: dict = json.loads(exc.response.text)
                if 'validation_errors' in response and 'non_field_errors' in response['validation_errors'] \
                        and 'LOCAL_FILES_SERVING_ENABLED' in response['validation_errors']['non_field_errors'][0]:
                    raise excs.Error(
                        '`media_import_method` is set to `file`, but your Label Studio server is not configured '
                        'for local file storage.\nPlease set the `LABEL_STUDIO_LOCAL_FILES_SERVING_ENABLED` '
                        'environment variable to `true` in the environment where your Label Studio server is running.'
                    ) from exc
            raise  # Handle any other exception type normally

    project_id = project.get_params()['id']
    return LabelStudioProject(name, project_id, media_import_method, resolved_col_mapping)
|
|
596
|
+
|
|
597
|
+
|
|
598
|
+
@dataclass(frozen=True)
class _DataKey:
    """A data key declared in a Label Studio config."""

    # The `name` attribute of the element that declares the data key; may differ from the
    # field name, and is `None` when the element has no `name` attribute.
    name: Optional[str]
    # The Pixeltable type that corresponds to the declaring element's tag.
    column_type: pxt.ColumnType
|
|
602
|
+
|
|
603
|
+
|
|
604
|
+
@dataclass(frozen=True)
class _RectangleLabel:
    """A `RectangleLabels` element declared in a Label Studio config."""

    # The element's `toName` attribute.
    to_name: str
    # The `value` attributes of the element's `Label` children.
    labels: list[str]
|
|
608
|
+
|
|
609
|
+
|
|
610
|
+
@dataclass(frozen=True)
class _LabelStudioConfig:
    """The parts of a Label Studio config that Pixeltable uses: data keys and RectangleLabels."""

    # Data keys, indexed by data key identifier.
    data_keys: dict[str, _DataKey]
    # RectangleLabels definitions, indexed by their `name` attribute.
    rectangle_labels: dict[str, _RectangleLabel]

    def validate(self) -> None:
        """Raises `excs.Error` if a RectangleLabels `toName` does not match the `name` of any data key."""
        data_key_names = {key.name for key in self.data_keys.values() if key.name is not None}
        for name, rl in self.rectangle_labels.items():
            if rl.to_name in data_key_names:
                continue
            raise excs.Error(
                f'Invalid Label Studio configuration: `toName` attribute of RectangleLabels `{name}` '
                f'references an unknown data key: `{rl.to_name}`'
            )

    @property
    def export_columns(self) -> dict[str, pxt.ColumnType]:
        """Column types of all exportable fields: the data keys first, then one JSON column per RectangleLabels."""
        columns: dict[str, pxt.ColumnType] = {}
        for key_id, key_info in self.data_keys.items():
            columns[key_id] = key_info.column_type
        for name in self.rectangle_labels.keys():
            columns[name] = pxt.JsonType()
        return columns
|
|
629
|
+
|
|
630
|
+
|
|
631
|
+
# Name of the Label Studio field that holds a task's annotations.
ANNOTATIONS_COLUMN = 'annotations'

# Number of tasks per paginated API call.
_PAGE_SIZE = 100  # This is the default used in the LS SDK

# Maps (lower-cased) Label Studio tag names to the Pixeltable type of the corresponding data key.
_LS_TAG_MAP = {
    'header': pxt.StringType(),
    'text': pxt.StringType(),
    'image': pxt.ImageType(),
    'video': pxt.VideoType(),
    'audio': pxt.AudioType(),
}
|
pixeltable/io/parquet.py
CHANGED
|
@@ -63,7 +63,7 @@ def save_parquet(df: pxt.DataFrame, dest_path: Path, partition_size_bytes: int =
|
|
|
63
63
|
# store the changes atomically
|
|
64
64
|
with transactional_directory(dest_path) as temp_path:
|
|
65
65
|
# dump metadata json file so we can inspect what was the source of the parquet file later on.
|
|
66
|
-
json.dump(df.
|
|
66
|
+
json.dump(df.as_dict(), (temp_path / '.pixeltable.json').open('w')) # pylint: disable=protected-access
|
|
67
67
|
json.dump(type_dict, (temp_path / '.pixeltable.column_types.json').open('w')) # keep type metadata
|
|
68
68
|
|
|
69
69
|
batch_num = 0
|
pixeltable/iterators/__init__.py
CHANGED
|
@@ -1,3 +1,12 @@
|
|
|
1
1
|
from .base import ComponentIterator
|
|
2
2
|
from .document import DocumentSplitter
|
|
3
|
+
from .string import StringSplitter
|
|
3
4
|
from .video import FrameIterator
|
|
5
|
+
|
|
6
|
+
# Public API of the package: every public name bound above, minus the submodules that the
# relative imports bind as a side effect. `string` must be excluded as well, now that
# `from .string import StringSplitter` binds that submodule in this namespace.
__default_dir = {symbol for symbol in dir() if not symbol.startswith('_')}
__removed_symbols = {'base', 'document', 'string', 'video'}
__all__ = sorted(__default_dir - __removed_symbols)


def __dir__():
    """Restrict `dir()` on this package to its public API (`__all__`)."""
    return __all__
|