pixeltable-0.3.2-py3-none-any.whl → pixeltable-0.3.3-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pixeltable might be problematic.
- pixeltable/__init__.py +64 -11
- pixeltable/__version__.py +2 -2
- pixeltable/catalog/__init__.py +1 -1
- pixeltable/catalog/catalog.py +50 -27
- pixeltable/catalog/column.py +27 -11
- pixeltable/catalog/dir.py +6 -4
- pixeltable/catalog/globals.py +8 -1
- pixeltable/catalog/insertable_table.py +22 -12
- pixeltable/catalog/named_function.py +10 -6
- pixeltable/catalog/path.py +3 -2
- pixeltable/catalog/path_dict.py +8 -6
- pixeltable/catalog/schema_object.py +2 -1
- pixeltable/catalog/table.py +121 -101
- pixeltable/catalog/table_version.py +291 -142
- pixeltable/catalog/table_version_path.py +8 -5
- pixeltable/catalog/view.py +67 -26
- pixeltable/dataframe.py +102 -72
- pixeltable/env.py +20 -21
- pixeltable/exec/__init__.py +2 -2
- pixeltable/exec/aggregation_node.py +10 -4
- pixeltable/exec/cache_prefetch_node.py +5 -3
- pixeltable/exec/component_iteration_node.py +9 -8
- pixeltable/exec/data_row_batch.py +21 -10
- pixeltable/exec/exec_context.py +10 -3
- pixeltable/exec/exec_node.py +23 -12
- pixeltable/exec/expr_eval/evaluators.py +13 -7
- pixeltable/exec/expr_eval/expr_eval_node.py +24 -15
- pixeltable/exec/expr_eval/globals.py +30 -7
- pixeltable/exec/expr_eval/row_buffer.py +5 -6
- pixeltable/exec/expr_eval/schedulers.py +151 -31
- pixeltable/exec/in_memory_data_node.py +8 -7
- pixeltable/exec/row_update_node.py +15 -5
- pixeltable/exec/sql_node.py +56 -27
- pixeltable/exprs/__init__.py +2 -2
- pixeltable/exprs/arithmetic_expr.py +57 -26
- pixeltable/exprs/array_slice.py +1 -1
- pixeltable/exprs/column_property_ref.py +2 -1
- pixeltable/exprs/column_ref.py +20 -15
- pixeltable/exprs/comparison.py +6 -2
- pixeltable/exprs/compound_predicate.py +1 -3
- pixeltable/exprs/data_row.py +2 -2
- pixeltable/exprs/expr.py +101 -72
- pixeltable/exprs/expr_dict.py +2 -1
- pixeltable/exprs/expr_set.py +3 -1
- pixeltable/exprs/function_call.py +39 -41
- pixeltable/exprs/globals.py +1 -0
- pixeltable/exprs/in_predicate.py +2 -2
- pixeltable/exprs/inline_expr.py +20 -17
- pixeltable/exprs/json_mapper.py +4 -2
- pixeltable/exprs/json_path.py +12 -18
- pixeltable/exprs/literal.py +5 -9
- pixeltable/exprs/method_ref.py +1 -0
- pixeltable/exprs/object_ref.py +1 -1
- pixeltable/exprs/row_builder.py +32 -17
- pixeltable/exprs/rowid_ref.py +14 -5
- pixeltable/exprs/similarity_expr.py +11 -6
- pixeltable/exprs/sql_element_cache.py +1 -1
- pixeltable/exprs/type_cast.py +24 -9
- pixeltable/ext/__init__.py +1 -0
- pixeltable/ext/functions/__init__.py +1 -0
- pixeltable/ext/functions/whisperx.py +2 -2
- pixeltable/ext/functions/yolox.py +11 -11
- pixeltable/func/aggregate_function.py +17 -13
- pixeltable/func/callable_function.py +6 -6
- pixeltable/func/expr_template_function.py +15 -14
- pixeltable/func/function.py +16 -16
- pixeltable/func/function_registry.py +11 -8
- pixeltable/func/globals.py +4 -2
- pixeltable/func/query_template_function.py +12 -13
- pixeltable/func/signature.py +18 -9
- pixeltable/func/tools.py +10 -17
- pixeltable/func/udf.py +106 -11
- pixeltable/functions/__init__.py +21 -2
- pixeltable/functions/anthropic.py +16 -12
- pixeltable/functions/fireworks.py +63 -5
- pixeltable/functions/gemini.py +13 -3
- pixeltable/functions/globals.py +18 -6
- pixeltable/functions/huggingface.py +20 -38
- pixeltable/functions/image.py +7 -3
- pixeltable/functions/json.py +1 -0
- pixeltable/functions/llama_cpp.py +1 -4
- pixeltable/functions/mistralai.py +31 -20
- pixeltable/functions/ollama.py +4 -18
- pixeltable/functions/openai.py +201 -108
- pixeltable/functions/replicate.py +11 -10
- pixeltable/functions/string.py +70 -7
- pixeltable/functions/timestamp.py +21 -8
- pixeltable/functions/together.py +66 -52
- pixeltable/functions/video.py +1 -0
- pixeltable/functions/vision.py +14 -11
- pixeltable/functions/whisper.py +2 -1
- pixeltable/globals.py +60 -26
- pixeltable/index/__init__.py +1 -1
- pixeltable/index/btree.py +5 -3
- pixeltable/index/embedding_index.py +15 -14
- pixeltable/io/__init__.py +1 -1
- pixeltable/io/external_store.py +30 -25
- pixeltable/io/fiftyone.py +6 -14
- pixeltable/io/globals.py +33 -27
- pixeltable/io/hf_datasets.py +2 -1
- pixeltable/io/label_studio.py +77 -68
- pixeltable/io/pandas.py +33 -9
- pixeltable/io/parquet.py +9 -12
- pixeltable/iterators/__init__.py +1 -0
- pixeltable/iterators/audio.py +205 -0
- pixeltable/iterators/document.py +19 -8
- pixeltable/iterators/image.py +6 -24
- pixeltable/iterators/string.py +3 -6
- pixeltable/iterators/video.py +1 -7
- pixeltable/metadata/__init__.py +7 -1
- pixeltable/metadata/converters/convert_10.py +2 -2
- pixeltable/metadata/converters/convert_15.py +1 -5
- pixeltable/metadata/converters/convert_16.py +2 -4
- pixeltable/metadata/converters/convert_17.py +2 -4
- pixeltable/metadata/converters/convert_18.py +2 -4
- pixeltable/metadata/converters/convert_19.py +2 -5
- pixeltable/metadata/converters/convert_20.py +1 -4
- pixeltable/metadata/converters/convert_21.py +4 -6
- pixeltable/metadata/converters/convert_22.py +1 -0
- pixeltable/metadata/converters/convert_23.py +5 -5
- pixeltable/metadata/converters/convert_24.py +12 -13
- pixeltable/metadata/converters/convert_26.py +23 -0
- pixeltable/metadata/converters/util.py +3 -4
- pixeltable/metadata/notes.py +1 -0
- pixeltable/metadata/schema.py +13 -2
- pixeltable/plan.py +173 -98
- pixeltable/store.py +42 -26
- pixeltable/type_system.py +62 -54
- pixeltable/utils/arrow.py +1 -2
- pixeltable/utils/coco.py +16 -17
- pixeltable/utils/code.py +1 -1
- pixeltable/utils/console_output.py +6 -3
- pixeltable/utils/description_helper.py +7 -7
- pixeltable/utils/documents.py +3 -1
- pixeltable/utils/filecache.py +12 -7
- pixeltable/utils/http_server.py +9 -8
- pixeltable/utils/media_store.py +2 -1
- pixeltable/utils/pytorch.py +11 -14
- pixeltable/utils/s3.py +1 -0
- pixeltable/utils/sql.py +1 -0
- pixeltable/utils/transactional_directory.py +2 -2
- {pixeltable-0.3.2.dist-info → pixeltable-0.3.3.dist-info}/METADATA +6 -8
- pixeltable-0.3.3.dist-info/RECORD +163 -0
- pixeltable-0.3.2.dist-info/RECORD +0 -161
- {pixeltable-0.3.2.dist-info → pixeltable-0.3.3.dist-info}/LICENSE +0 -0
- {pixeltable-0.3.2.dist-info → pixeltable-0.3.3.dist-info}/WHEEL +0 -0
- {pixeltable-0.3.2.dist-info → pixeltable-0.3.3.dist-info}/entry_points.txt +0 -0
pixeltable/io/external_store.py
CHANGED
@@ -8,11 +8,11 @@ from dataclasses import dataclass
 from typing import Any, Optional
 from uuid import UUID

-import pixeltable.exceptions as excs
-import pixeltable.type_system as ts
-from pixeltable import Table, Column
 import sqlalchemy as sql

+import pixeltable.exceptions as excs
+import pixeltable.type_system as ts
+from pixeltable import Column, Table
 from pixeltable.catalog import TableVersion

 _logger = logging.getLogger('pixeltable')
@@ -148,7 +148,9 @@ class Project(ExternalStore, abc.ABC):
         """
         from pixeltable import exprs

-        assert col.col_type.is_media_type() and not (col.is_stored and col.is_computed) and col not in self.stored_proxies
+        assert (
+            col.col_type.is_media_type() and not (col.is_stored and col.is_computed) and col not in self.stored_proxies
+        )
         proxy_col = Column(
             name=None,
             # Force images in the proxy column to be materialized inside the media store, in a normalized format.
@@ -159,7 +161,7 @@ class Project(ExternalStore, abc.ABC):
             stored=True,
             col_id=tbl_version.next_col_id,
             sa_col_type=col.col_type.to_sa_type(),
-            schema_version_add=tbl_version.schema_version
+            schema_version_add=tbl_version.schema_version,
         )
         proxy_col.tbl = tbl_version
         tbl_version.next_col_id += 1
@@ -197,11 +199,11 @@ class Project(ExternalStore, abc.ABC):

     @classmethod
     def validate_columns(
-            cls,
-            table: Table,
-            export_cols: dict[str, ts.ColumnType],
-            import_cols: dict[str, ts.ColumnType],
-            col_mapping: Optional[dict[str, str]]
+        cls,
+        table: Table,
+        export_cols: dict[str, ts.ColumnType],
+        import_cols: dict[str, ts.ColumnType],
+        col_mapping: Optional[dict[str, str]],
     ) -> dict[Column, str]:
         """
         Verifies that the specified `col_mapping` is valid. In particular, checks that:
@@ -294,7 +296,7 @@ class SyncStatus:
             external_rows_deleted=self.external_rows_deleted + other.external_rows_deleted,
             external_rows_updated=self.external_rows_updated + other.external_rows_updated,
             pxt_rows_updated=self.pxt_rows_updated + other.pxt_rows_updated,
-            num_excs=self.num_excs + other.num_excs
+            num_excs=self.num_excs + other.num_excs,
         )

     @classmethod
@@ -304,13 +306,14 @@ class SyncStatus:

 class MockProject(Project):
     """A project that cannot be synced, used mainly for testing."""
+
     def __init__(
-            self,
-            name: str,
-            export_cols: dict[str, ts.ColumnType],
-            import_cols: dict[str, ts.ColumnType],
-            col_mapping: dict[Column, str],
-            stored_proxies: Optional[dict[Column, Column]] = None
+        self,
+        name: str,
+        export_cols: dict[str, ts.ColumnType],
+        import_cols: dict[str, ts.ColumnType],
+        col_mapping: dict[Column, str],
+        stored_proxies: Optional[dict[Column, Column]] = None,
     ):
         super().__init__(name, col_mapping, stored_proxies)
         self.export_cols = export_cols
@@ -319,12 +322,12 @@ class MockProject(Project):

     @classmethod
     def create(
-            cls,
-            t: Table,
-            name: str,
-            export_cols: dict[str, ts.ColumnType],
-            import_cols: dict[str, ts.ColumnType],
-            col_mapping: Optional[dict[str, str]] = None
+        cls,
+        t: Table,
+        name: str,
+        export_cols: dict[str, ts.ColumnType],
+        import_cols: dict[str, ts.ColumnType],
+        col_mapping: Optional[dict[str, str]] = None,
     ) -> 'MockProject':
         col_mapping = cls.validate_columns(t, export_cols, import_cols, col_mapping)
         return cls(name, export_cols, import_cols, col_mapping)
@@ -351,7 +354,9 @@ class MockProject(Project):
             'export_cols': {k: v.as_dict() for k, v in self.export_cols.items()},
             'import_cols': {k: v.as_dict() for k, v in self.import_cols.items()},
             'col_mapping': [[self._column_as_dict(k), v] for k, v in self.col_mapping.items()],
-            'stored_proxies': [[self._column_as_dict(k), self._column_as_dict(v)] for k, v in self.stored_proxies.items()]
+            'stored_proxies': [
+                [self._column_as_dict(k), self._column_as_dict(v)] for k, v in self.stored_proxies.items()
+            ],
         }

     @classmethod
@@ -361,7 +366,7 @@ class MockProject(Project):
             {k: ts.ColumnType.from_dict(v) for k, v in md['export_cols'].items()},
             {k: ts.ColumnType.from_dict(v) for k, v in md['import_cols'].items()},
             {cls._column_from_dict(entry[0]): entry[1] for entry in md['col_mapping']},
-            {cls._column_from_dict(entry[0]): cls._column_from_dict(entry[1]) for entry in md['stored_proxies']}
+            {cls._column_from_dict(entry[0]): cls._column_from_dict(entry[1]) for entry in md['stored_proxies']},
         )

     def __eq__(self, other: Any) -> bool:
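
Note on the SyncStatus.combine() hunk above: the change is formatting only; the method's behavior is plain field-wise addition of the sync counters, with SyncStatus.empty() acting as the identity. A minimal standalone sketch of that behavior (a hypothetical SyncCounts dataclass, not the pixeltable class itself):

from dataclasses import dataclass

@dataclass(frozen=True)
class SyncCounts:
    # Field names mirror the counters visible in the diff above.
    external_rows_created: int = 0
    external_rows_deleted: int = 0
    external_rows_updated: int = 0
    pxt_rows_updated: int = 0
    num_excs: int = 0

    def combine(self, other: 'SyncCounts') -> 'SyncCounts':
        # Each counter is summed independently, so per-batch statuses can be folded in any order.
        return SyncCounts(
            external_rows_created=self.external_rows_created + other.external_rows_created,
            external_rows_deleted=self.external_rows_deleted + other.external_rows_deleted,
            external_rows_updated=self.external_rows_updated + other.external_rows_updated,
            pxt_rows_updated=self.pxt_rows_updated + other.pxt_rows_updated,
            num_excs=self.num_excs + other.num_excs,
        )

print(SyncCounts(external_rows_created=2).combine(SyncCounts(num_excs=1)))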
pixeltable/io/fiftyone.py
CHANGED
@@ -16,6 +16,7 @@ class PxtImageDatasetImporter(foud.LabeledImageDatasetImporter):
     """
     Implementation of a FiftyOne `DatasetImporter` that reads image data from a Pixeltable table.
     """
+
     __image_format: str  # format to use for any exported images that are not already stored on disk
     __labels: dict[str, tuple[exprs.Expr, type[fo.Label]]]  # label_name -> (expr, label_cls)
     __image_idx: int  # index of the image expr in the select list
@@ -34,12 +35,7 @@ class PxtImageDatasetImporter(foud.LabeledImageDatasetImporter):
         seed: Union[int, float, str, bytes, bytearray, None] = None,
         max_samples: Optional[int] = None,
     ):
-        super().__init__(
-            dataset_dir=dataset_dir,
-            shuffle=shuffle,
-            seed=seed,
-            max_samples=max_samples
-        )
+        super().__init__(dataset_dir=dataset_dir, shuffle=shuffle, seed=seed, max_samples=max_samples)

         self.__image_format = image_format

@@ -54,9 +50,9 @@ class PxtImageDatasetImporter(foud.LabeledImageDatasetImporter):
         if isinstance(exprs_, dict):
             for label_name, expr in exprs_.items():
                 if not label_name.isidentifier():
-                    raise excs.Error(f"Invalid label name: {label_name}")
+                    raise excs.Error(f'Invalid label name: {label_name}')
                 if label_name in self.__labels:
-                    raise excs.Error(f"Duplicate label name: {label_name}")
+                    raise excs.Error(f'Duplicate label name: {label_name}')
                 self.__labels[label_name] = (expr, label_cls)

         # Now add the remaining labels, assigning unused default names.
@@ -137,13 +133,9 @@ class PxtImageDatasetImporter(foud.LabeledImageDatasetImporter):
     def __as_fo_classifications(self, data: list) -> list[fo.Classification]:
         if not isinstance(data, list) or any('label' not in entry for entry in data):
             raise excs.Error(
-                f'Invalid classifications data: {data}\n'
-                "(Expected a list of dicts, each containing a 'label' key)"
+                f"Invalid classifications data: {data}\n(Expected a list of dicts, each containing a 'label' key)"
             )
-        return [
-            fo.Classification(label=entry['label'], confidence=entry.get('confidence'))
-            for entry in data
-        ]
+        return [fo.Classification(label=entry['label'], confidence=entry.get('confidence')) for entry in data]

     def __as_fo_detections(self, data: list) -> list[fo.Detections]:
         if not isinstance(data, list) or any('label' not in entry or 'bounding_box' not in entry for entry in data):
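
Note on the related globals.py hunk below: export_images_as_fo_dataset() now passes a PxtImageDatasetImporter instance to fo.Dataset.from_importer() rather than the raw arguments. A hedged usage sketch, assuming pixeltable.io exposes the function as in the released package and assuming a pre-existing table with an image column img and a detections column whose dicts carry 'label' and 'bounding_box' keys (table and column names are made up):

import pixeltable as pxt
import pixeltable.io as pxt_io

# Hypothetical table created elsewhere; `img` is an Image column, `detections` a JSON column.
tbl = pxt.get_table('images_demo')
dataset = pxt_io.export_images_as_fo_dataset(tbl, tbl.img, detections=tbl.detections)
print(len(dataset))  # a fiftyone.Dataset built through PxtImageDatasetImporter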
pixeltable/io/globals.py
CHANGED
@@ -11,15 +11,15 @@ if TYPE_CHECKING:


 def create_label_studio_project(
-        t: Table,
-        label_config: str,
-        name: Optional[str] = None,
-        title: Optional[str] = None,
-        media_import_method: Literal['post', 'file', 'url'] = 'post',
-        col_mapping: Optional[dict[str, str]] = None,
-        sync_immediately: bool = True,
-        s3_configuration: Optional[dict[str, Any]] = None,
-        **kwargs: Any
+    t: Table,
+    label_config: str,
+    name: Optional[str] = None,
+    title: Optional[str] = None,
+    media_import_method: Literal['post', 'file', 'url'] = 'post',
+    col_mapping: Optional[dict[str, str]] = None,
+    sync_immediately: bool = True,
+    s3_configuration: Optional[dict[str, Any]] = None,
+    **kwargs: Any,
 ) -> SyncStatus:
     """
     Create a new Label Studio project and link it to the specified [`Table`][pixeltable.Table].
@@ -125,14 +125,7 @@ def create_label_studio_project(
     from pixeltable.io.label_studio import LabelStudioProject

     ls_project = LabelStudioProject.create(
-        t,
-        label_config,
-        name,
-        title,
-        media_import_method,
-        col_mapping,
-        s3_configuration,
-        **kwargs
+        t, label_config, name, title, media_import_method, col_mapping, s3_configuration, **kwargs
     )

     # Link the project to `t`, and sync if appropriate.
@@ -150,8 +143,8 @@ def import_rows(
     schema_overrides: Optional[dict[str, pxt.ColumnType]] = None,
     primary_key: Optional[Union[str, list[str]]] = None,
     num_retained_versions: int = 10,
-    comment: str = ''
-) -> Table:
+    comment: str = '',
+) -> Table:
     """
     Creates a new base table from a list of dictionaries. The dictionaries must be of the
     form `{column_name: value, ...}`. Pixeltable will attempt to infer the schema of the table from the
@@ -194,7 +187,9 @@ def import_rows(
             # The column type will always be nullable by default.
             col_type = pxt.ColumnType.infer_literal_type(value, nullable=True)
             if col_type is None:
-                raise excs.Error(f'Could not infer type for column `{col_name}`; the value in row {n} has an unsupported type: {type(value)}')
+                raise excs.Error(
+                    f'Could not infer type for column `{col_name}`; the value in row {n} has an unsupported type: {type(value)}'
+                )
             if col_name not in schema:
                 schema[col_name] = col_type
             else:
@@ -210,7 +205,9 @@ def import_rows(

     extraneous_keys = schema_overrides.keys() - schema.keys()
     if len(extraneous_keys) > 0:
-        raise excs.Error(f'The following columns specified in `schema_overrides` are not present in the data: {", ".join(extraneous_keys)}')
+        raise excs.Error(
+            f'The following columns specified in `schema_overrides` are not present in the data: {", ".join(extraneous_keys)}'
+        )

     entirely_none_cols = cols_with_nones - schema.keys()
     if len(entirely_none_cols) > 0:
@@ -221,7 +218,9 @@ def import_rows(
             'Consider specifying the type(s) explicitly in `schema_overrides`.'
         )

-    t = pxt.create_table(tbl_path, schema, primary_key=primary_key, num_retained_versions=num_retained_versions, comment=comment)
+    t = pxt.create_table(
+        tbl_path, schema, primary_key=primary_key, num_retained_versions=num_retained_versions, comment=comment
+    )
     t.insert(rows)
     return t

@@ -234,7 +233,7 @@ def import_json(
     primary_key: Optional[Union[str, list[str]]] = None,
     num_retained_versions: int = 10,
     comment: str = '',
-    **kwargs: Any
+    **kwargs: Any,
 ) -> Table:
     """
     Creates a new base table from a JSON file. This is a convenience method and is
@@ -272,7 +271,14 @@ def import_json(
         # URL
         contents = urllib.request.urlopen(filepath_or_url).read()
     data = json.loads(contents, **kwargs)
-    return import_rows(tbl_path, data, schema_overrides=schema_overrides, primary_key=primary_key, num_retained_versions=num_retained_versions, comment=comment)
+    return import_rows(
+        tbl_path,
+        data,
+        schema_overrides=schema_overrides,
+        primary_key=primary_key,
+        num_retained_versions=num_retained_versions,
+        comment=comment,
+    )


 def export_images_as_fo_dataset(
@@ -358,6 +364,6 @@ def export_images_as_fo_dataset(
     if not images.col_type.is_image_type():
         raise excs.Error(f'`images` must be an expression of type Image (got {images.col_type._to_base_str()})')

-    return fo.Dataset.from_importer(
-        tbl, images, image_format, classifications=classifications, detections=detections
-    )
+    return fo.Dataset.from_importer(
+        PxtImageDatasetImporter(tbl, images, image_format, classifications=classifications, detections=detections)
+    )
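
Note on the import_rows() hunks above: the signature and call-site changes are formatting plus the trailing-comma style; the documented behavior (schema inference from a list of {column_name: value, ...} dicts, with inferred types nullable by default) is unchanged. A hedged usage sketch with a made-up table path and data:

import pixeltable.io as pxt_io

rows = [
    {'name': 'moon.jpg', 'score': 0.91},
    {'name': 'sun.jpg', 'score': None},  # None is fine: inferred column types are nullable by default
]
# Types are inferred from the values; schema_overrides could pin them explicitly instead.
t = pxt_io.import_rows('scores_demo', rows, comment='imported from a list of dicts')
print(t.count())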
pixeltable/io/hf_datasets.py
CHANGED
@@ -4,7 +4,7 @@ import logging
 import math
 import random
 import typing
-from typing import
+from typing import Any, Optional, Union

 import pixeltable as pxt
 import pixeltable.type_system as ts
@@ -103,6 +103,7 @@ def import_huggingface_dataset(
         A handle to the newly created [`Table`][pixeltable.Table].
     """
     import datasets
+
     import pixeltable as pxt

     if table_path in pxt.list_tables():
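
Note on the hunk above: only the import grouping inside import_huggingface_dataset() changed. A hedged sketch of a typical call, assuming the datasets package is installed and that pixeltable.io exposes the function; the dataset choice, table path, and positional-argument order are illustrative:

import datasets
import pixeltable.io as pxt_io

ds = datasets.load_dataset('rotten_tomatoes', split='train')
# The function body above checks table_path against pxt.list_tables() before creating the table.
t = pxt_io.import_huggingface_dataset('rt_reviews', ds)
print(t.count())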
pixeltable/io/label_studio.py
CHANGED
@@ -47,12 +47,12 @@ class LabelStudioProject(Project):
     """

     def __init__(
-            self,
-            name: str,
-            project_id: int,
-            media_import_method: Literal['post', 'file', 'url'],
-            col_mapping: dict[Column, str],
-            stored_proxies: Optional[dict[Column, Column]] = None
+        self,
+        name: str,
+        project_id: int,
+        media_import_method: Literal['post', 'file', 'url'],
+        col_mapping: dict[Column, str],
+        stored_proxies: Optional[dict[Column, Column]] = None,
     ):
         """
         The constructor will NOT create a new Label Studio project; it is also used when loading
@@ -70,8 +70,10 @@ class LabelStudioProject(Project):
         try:
             self._project = _label_studio_client().get_project(self.project_id)
         except HTTPError as exc:
-            raise excs.Error(f'Could not locate Label Studio project: {self.project_id} '
-                             '(cannot connect to server or project no longer exists)') from exc
+            raise excs.Error(
+                f'Could not locate Label Studio project: {self.project_id} '
+                '(cannot connect to server or project no longer exists)'
+            ) from exc
         return self._project

     @property
@@ -105,8 +107,10 @@ class LabelStudioProject(Project):
         return {ANNOTATIONS_COLUMN: pxt.JsonType(nullable=True)}

     def sync(self, t: Table, export_data: bool, import_data: bool) -> SyncStatus:
-        _logger.info(f'Syncing Label Studio project "{self.project_title}" with table `{t._name}`'
-                     f' (export: {export_data}, import: {import_data}).')
+        _logger.info(
+            f'Syncing Label Studio project "{self.project_title}" with table `{t._name}`'
+            f' (export: {export_data}, import: {import_data}).'
+        )
         # Collect all existing tasks into a dict with entries `rowid: task`
         tasks = {tuple(task['meta']['rowid']): task for task in self.__fetch_all_tasks()}
         sync_status = SyncStatus.empty()
@@ -148,18 +152,14 @@ class LabelStudioProject(Project):
         config = self.__project_config

         # Columns in `t` that map to Label Studio data keys
-        t_data_cols = [
-            t_col for t_col, ext_col_name in self.col_mapping.items()
-            if ext_col_name in config.data_keys
-        ]
+        t_data_cols = [t_col for t_col, ext_col_name in self.col_mapping.items() if ext_col_name in config.data_keys]

         if len(t_data_cols) == 0:
             return SyncStatus.empty()

         # Columns in `t` that map to `rectanglelabels` preannotations
         t_rl_cols = [
-            t_col for t_col, ext_col_name in self.col_mapping.items()
-            if ext_col_name in config.rectangle_labels
+            t_col for t_col, ext_col_name in self.col_mapping.items() if ext_col_name in config.rectangle_labels
         ]

         # Destinations for `rectanglelabels` preannotations
@@ -180,12 +180,12 @@ class LabelStudioProject(Project):
         assert False

     def __update_tasks_by_post(
-            self,
-            t: Table,
-            existing_tasks: dict[tuple, dict],
-            media_col: Column,
-            t_rl_cols: list[Column],
-            rl_info: list['_RectangleLabel']
+        self,
+        t: Table,
+        existing_tasks: dict[tuple, dict],
+        media_col: Column,
+        t_rl_cols: list[Column],
+        rl_info: list['_RectangleLabel'],
     ) -> SyncStatus:
         is_stored = media_col.is_stored
         # If it's a stored column, we can use `localpath`
@@ -197,7 +197,7 @@ class LabelStudioProject(Project):

         for row in rows._exec():
             media_col_idx = rows._select_list_exprs[0].slot_idx
-            rl_col_idxs = [expr.slot_idx for expr in rows._select_list_exprs[1: 1 + len(t_rl_cols)]]
+            rl_col_idxs = [expr.slot_idx for expr in rows._select_list_exprs[1 : 1 + len(t_rl_cols)]]
             row_ids_in_pxt.add(row.rowid)
             if row.rowid not in existing_tasks:
                 # Upload the media file to Label Studio
@@ -239,12 +239,12 @@ class LabelStudioProject(Project):
         return sync_status.combine(deletion_sync_status)

     def __update_tasks_by_files(
-            self,
-            t: Table,
-            existing_tasks: dict[tuple, dict],
-            t_data_cols: list[Column],
-            t_rl_cols: list[Column],
-            rl_info: list['_RectangleLabel']
+        self,
+        t: Table,
+        existing_tasks: dict[tuple, dict],
+        t_data_cols: list[Column],
+        t_rl_cols: list[Column],
+        rl_info: list['_RectangleLabel'],
     ) -> SyncStatus:
         ext_data_cols = [self.col_mapping[col] for col in t_data_cols]
         expr_refs: dict[str, Expr] = {}  # kwargs for the select statement
@@ -301,21 +301,23 @@ class LabelStudioProject(Project):
             return {
                 'data': dict(zip(ext_data_cols, data_vals)),
                 'meta': {'rowid': row.rowid},
-                'predictions': predictions
+                'predictions': predictions,
             }

         for row in df._exec():
             if rl_col_idxs is None:
-                rl_col_idxs = [expr.slot_idx for expr in df._select_list_exprs[:len(t_rl_cols)]]
-                data_col_idxs = [expr.slot_idx for expr in df._select_list_exprs[len(t_rl_cols):]]
+                rl_col_idxs = [expr.slot_idx for expr in df._select_list_exprs[: len(t_rl_cols)]]
+                data_col_idxs = [expr.slot_idx for expr in df._select_list_exprs[len(t_rl_cols) :]]
             row_ids_in_pxt.add(row.rowid)
             task_info = create_task_info(row)
             # TODO(aaron-siegel): Implement more efficient update logic (currently involves a full table scan)
             if row.rowid in existing_tasks:
                 # A task for this row already exists; see if it needs an update.
                 existing_task = existing_tasks[row.rowid]
-                if (task_info['data'] != existing_task['data']
-                        or task_info['predictions'] != existing_task['predictions']):
+                if (
+                    task_info['data'] != existing_task['data']
+                    or task_info['predictions'] != existing_task['predictions']
+                ):
                     _logger.debug(f'Updating task for rowid {row.rowid}.')
                     self.project.update_task(existing_tasks[row.rowid]['id'], **task_info)
                     tasks_updated += 1
@@ -330,7 +332,9 @@ class LabelStudioProject(Project):
         if len(page) > 0:
             self.project.import_tasks(page)

-        env.Env.get().console_logger.info(f'Created {tasks_created} new task(s) and updated {tasks_updated} existing task(s) in {self}.')
+        env.Env.get().console_logger.info(
+            f'Created {tasks_created} new task(s) and updated {tasks_updated} existing task(s) in {self}.'
+        )

         sync_status = SyncStatus(external_rows_created=tasks_created, external_rows_updated=tasks_updated)

@@ -355,7 +359,9 @@ class LabelStudioProject(Project):
         relpath = Path(localpath).relative_to(env.Env.get().home)
         return f'/data/local-files/?d={str(relpath)}'

-    def __delete_stale_tasks(self, existing_tasks: dict[tuple, dict], row_ids_in_pxt: set[tuple], tasks_created: int) -> SyncStatus:
+    def __delete_stale_tasks(
+        self, existing_tasks: dict[tuple, dict], row_ids_in_pxt: set[tuple], tasks_created: int
+    ) -> SyncStatus:
         deleted_rowids = set(existing_tasks.keys()) - row_ids_in_pxt
         # Sanity check the math
         assert len(deleted_rowids) == len(existing_tasks) + tasks_created - len(row_ids_in_pxt)
@@ -363,7 +369,9 @@ class LabelStudioProject(Project):

         if len(tasks_to_delete) > 0:
             self.project.delete_tasks(tasks_to_delete)
-            env.Env.get().console_logger.info(f'Deleted {len(tasks_to_delete)} tasks(s) in {self} that are no longer present in Pixeltable.')
+            env.Env.get().console_logger.info(
+                f'Deleted {len(tasks_to_delete)} tasks(s) in {self} that are no longer present in Pixeltable.'
+            )

         # Remove them from the `existing_tasks` dict so that future updates are applied correctly
         for rowid in deleted_rowids:
@@ -417,7 +425,9 @@ class LabelStudioProject(Project):
             'project_id': self.project_id,
             'media_import_method': self.media_import_method,
             'col_mapping': [[self._column_as_dict(k), v] for k, v in self.col_mapping.items()],
-            'stored_proxies': [[self._column_as_dict(k), self._column_as_dict(v)] for k, v in self.stored_proxies.items()]
+            'stored_proxies': [
+                [self._column_as_dict(k), self._column_as_dict(v)] for k, v in self.stored_proxies.items()
+            ],
         }

     @classmethod
@@ -427,7 +437,7 @@ class LabelStudioProject(Project):
             md['project_id'],
             md['media_import_method'],
             {cls._column_from_dict(entry[0]): entry[1] for entry in md['col_mapping']},
-            {cls._column_from_dict(entry[0]): cls._column_from_dict(entry[1]) for entry in md['stored_proxies']}
+            {cls._column_from_dict(entry[0]): cls._column_from_dict(entry[1]) for entry in md['stored_proxies']},
         )

     def __repr__(self) -> str:
@@ -444,8 +454,7 @@ class LabelStudioProject(Project):
         if root.tag.lower() != 'view':
             raise excs.Error('Root of Label Studio config must be a `View`')
         config = _LabelStudioConfig(
-            data_keys=cls.__parse_data_keys_config(root),
-            rectangle_labels=cls.__parse_rectangle_labels_config(root)
+            data_keys=cls.__parse_data_keys_config(root), rectangle_labels=cls.__parse_rectangle_labels_config(root)
         )
         config.validate()
         return config
@@ -474,10 +483,7 @@ class LabelStudioProject(Project):
             if element.tag.lower() == 'rectanglelabels':
                 name = element.attrib['name']
                 to_name = element.attrib['toName']
-                labels = [
-                    child.attrib['value']
-                    for child in element if child.tag.lower() == 'label'
-                ]
+                labels = [child.attrib['value'] for child in element if child.tag.lower() == 'label']
                 for label in labels:
                     if label not in coco.COCO_2017_CATEGORIES.values():
                         raise excs.Error(f'Label in `rectanglelabels` config is not a valid COCO object name: {label}')
@@ -486,11 +492,7 @@ class LabelStudioProject(Project):

     @classmethod
     def __coco_to_predictions(
-            cls,
-            coco_annotations: dict[str, Any],
-            from_name: str,
-            rl_info: '_RectangleLabel',
-            task_id: Optional[int] = None
+        cls, coco_annotations: dict[str, Any], from_name: str, rl_info: '_RectangleLabel', task_id: Optional[int] = None
     ) -> dict[str, Any]:
         width = coco_annotations['image']['width']
         height = coco_annotations['image']['height']
@@ -510,8 +512,8 @@ class LabelStudioProject(Project):
                         'y': entry['bbox'][1] * 100.0 / height,
                         'width': entry['bbox'][2] * 100.0 / width,
                         'height': entry['bbox'][3] * 100.0 / height,
-                        'rectanglelabels': [coco.COCO_2017_CATEGORIES[entry['category']]]
-                    }
+                        'rectanglelabels': [coco.COCO_2017_CATEGORIES[entry['category']]],
+                    },
                 }
                 for i, entry in enumerate(coco_annotations['annotations'])
                 # include only the COCO labels that match a rectanglelabel name
@@ -539,15 +541,15 @@ class LabelStudioProject(Project):

     @classmethod
     def create(
-            cls,
-            t: Table,
-            label_config: str,
-            name: Optional[str],
-            title: Optional[str],
-            media_import_method: Literal['post', 'file', 'url'],
-            col_mapping: Optional[dict[str, str]],
-            s3_configuration: Optional[dict[str, Any]],
-            **kwargs: Any
+        cls,
+        t: Table,
+        label_config: str,
+        name: Optional[str],
+        title: Optional[str],
+        media_import_method: Literal['post', 'file', 'url'],
+        col_mapping: Optional[dict[str, str]],
+        s3_configuration: Optional[dict[str, Any]],
+        **kwargs: Any,
     ) -> 'LabelStudioProject':
         """
         Creates a new Label Studio project, using the Label Studio client configured in Pixeltable.
@@ -577,7 +579,8 @@ class LabelStudioProject(Project):
             t.add_columns({local_annotations_column: pxt.JsonType(nullable=True)})

         resolved_col_mapping = cls.validate_columns(
-            t, config.export_columns, {ANNOTATIONS_COLUMN: pxt.JsonType(nullable=True)}, col_mapping)
+            t, config.export_columns, {ANNOTATIONS_COLUMN: pxt.JsonType(nullable=True)}, col_mapping
+        )

         # Perform some additional validation
         if media_import_method == 'post' and len(config.data_keys) > 1:
@@ -591,12 +594,15 @@ class LabelStudioProject(Project):
                 raise excs.Error('`s3_configuration` must contain a `bucket` field')
             if not 'title' in s3_configuration:
                 s3_configuration['title'] = 'Pixeltable-S3-Import-Storage'
-            if ('aws_access_key_id' not in s3_configuration and
-                    'aws_secret_access_key' not in s3_configuration and
-                    'aws_session_token' not in s3_configuration):
+            if (
+                'aws_access_key_id' not in s3_configuration
+                and 'aws_secret_access_key' not in s3_configuration
+                and 'aws_session_token' not in s3_configuration
+            ):
                 # Attempt to fill any missing credentials from the environment
                 try:
                     import boto3
+
                     s3_credentials = boto3.Session().get_credentials().get_frozen_credentials()
                     _logger.info(f'Using AWS credentials from the environment for Label Studio project: {title}')
                     s3_configuration['aws_access_key_id'] = s3_credentials.access_key
@@ -618,8 +624,11 @@ class LabelStudioProject(Project):
         except HTTPError as exc:
             if exc.errno == 400:
                 response: dict = json.loads(exc.response.text)
-                if ('validation_errors' in response and 'non_field_errors' in response['validation_errors']
-                        and 'LOCAL_FILES_SERVING_ENABLED' in response['validation_errors']['non_field_errors'][0]):
+                if (
+                    'validation_errors' in response
+                    and 'non_field_errors' in response['validation_errors']
+                    and 'LOCAL_FILES_SERVING_ENABLED' in response['validation_errors']['non_field_errors'][0]
+                ):
                     raise excs.Error(
                         '`media_import_method` is set to `file`, but your Label Studio server is not configured '
                         'for local file storage.\nPlease set the `LABEL_STUDIO_LOCAL_FILES_SERVING_ENABLED` '
@@ -675,5 +684,5 @@ _LS_TAG_MAP = {
     'text': pxt.StringType(),
     'image': pxt.ImageType(),
     'video': pxt.VideoType(),
-    'audio': pxt.AudioType()
+    'audio': pxt.AudioType(),
 }