pixeltable 0.2.11__py3-none-any.whl → 0.2.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pixeltable might be problematic. Click here for more details.
- pixeltable/__init__.py +2 -2
- pixeltable/__version__.py +2 -2
- pixeltable/catalog/insertable_table.py +2 -2
- pixeltable/catalog/schema_object.py +28 -2
- pixeltable/catalog/table.py +68 -30
- pixeltable/catalog/table_version.py +14 -43
- pixeltable/catalog/view.py +2 -2
- pixeltable/dataframe.py +8 -7
- pixeltable/exec/expr_eval_node.py +8 -1
- pixeltable/exec/sql_scan_node.py +1 -1
- pixeltable/exprs/__init__.py +0 -1
- pixeltable/exprs/column_ref.py +2 -7
- pixeltable/exprs/comparison.py +5 -5
- pixeltable/exprs/compound_predicate.py +12 -12
- pixeltable/exprs/expr.py +32 -0
- pixeltable/exprs/in_predicate.py +3 -3
- pixeltable/exprs/is_null.py +5 -5
- pixeltable/exprs/similarity_expr.py +27 -16
- pixeltable/func/aggregate_function.py +10 -4
- pixeltable/func/callable_function.py +4 -0
- pixeltable/func/function_registry.py +2 -0
- pixeltable/functions/globals.py +36 -1
- pixeltable/functions/huggingface.py +62 -4
- pixeltable/functions/image.py +17 -0
- pixeltable/functions/openai.py +1 -1
- pixeltable/functions/string.py +622 -7
- pixeltable/functions/video.py +26 -8
- pixeltable/globals.py +54 -50
- pixeltable/index/embedding_index.py +28 -27
- pixeltable/io/external_store.py +2 -2
- pixeltable/io/globals.py +54 -5
- pixeltable/io/label_studio.py +45 -5
- pixeltable/io/pandas.py +18 -7
- pixeltable/metadata/__init__.py +1 -1
- pixeltable/metadata/converters/convert_17.py +26 -0
- pixeltable/plan.py +6 -6
- pixeltable/tool/create_test_db_dump.py +2 -2
- pixeltable/tool/doc_plugins/griffe.py +77 -0
- pixeltable/tool/doc_plugins/mkdocstrings.py +6 -0
- pixeltable/tool/doc_plugins/templates/material/udf.html.jinja +135 -0
- pixeltable/utils/s3.py +1 -1
- pixeltable-0.2.13.dist-info/METADATA +206 -0
- {pixeltable-0.2.11.dist-info → pixeltable-0.2.13.dist-info}/RECORD +46 -42
- pixeltable-0.2.13.dist-info/entry_points.txt +3 -0
- pixeltable/exprs/predicate.py +0 -44
- pixeltable-0.2.11.dist-info/METADATA +0 -137
- {pixeltable-0.2.11.dist-info → pixeltable-0.2.13.dist-info}/LICENSE +0 -0
- {pixeltable-0.2.11.dist-info → pixeltable-0.2.13.dist-info}/WHEEL +0 -0
pixeltable/globals.py
CHANGED
|
@@ -7,10 +7,10 @@ import sqlalchemy as sql
|
|
|
7
7
|
from sqlalchemy.util.preloaded import orm
|
|
8
8
|
|
|
9
9
|
import pixeltable.exceptions as excs
|
|
10
|
+
import pixeltable.exprs as exprs
|
|
10
11
|
from pixeltable import catalog, func, DataFrame
|
|
11
12
|
from pixeltable.catalog import Catalog
|
|
12
13
|
from pixeltable.env import Env
|
|
13
|
-
from pixeltable.exprs import Predicate
|
|
14
14
|
from pixeltable.iterators import ComponentIterator
|
|
15
15
|
from pixeltable.metadata import schema
|
|
16
16
|
|
|
@@ -81,7 +81,7 @@ def create_view(
|
|
|
81
81
|
base: Union[catalog.Table, DataFrame],
|
|
82
82
|
*,
|
|
83
83
|
schema: Optional[dict[str, Any]] = None,
|
|
84
|
-
filter: Optional[
|
|
84
|
+
filter: Optional[exprs.Expr] = None,
|
|
85
85
|
is_snapshot: bool = False,
|
|
86
86
|
iterator: Optional[tuple[type[ComponentIterator], dict[str, Any]]] = None,
|
|
87
87
|
num_retained_versions: int = 10,
|
|
@@ -94,7 +94,7 @@ def create_view(
|
|
|
94
94
|
path_str: Path to the view.
|
|
95
95
|
base: Table (i.e., table or view or snapshot) or DataFrame to base the view on.
|
|
96
96
|
schema: dictionary mapping column names to column types, value expressions, or to column specifications.
|
|
97
|
-
filter:
|
|
97
|
+
filter: predicate to filter rows of the base table.
|
|
98
98
|
is_snapshot: Whether the view is a snapshot.
|
|
99
99
|
iterator: The iterator to use for this view. If specified, then this view will be a one-to-many view of
|
|
100
100
|
the base table.
|
|
@@ -234,7 +234,7 @@ def drop_table(path: str, force: bool = False, ignore_errors: bool = False) -> N
|
|
|
234
234
|
|
|
235
235
|
Args:
|
|
236
236
|
path: Path to the table.
|
|
237
|
-
force:
|
|
237
|
+
force: If `True`, will also drop all views or sub-views of this table.
|
|
238
238
|
ignore_errors: Whether to ignore errors if the table does not exist.
|
|
239
239
|
|
|
240
240
|
Raises:
|
|
@@ -243,21 +243,27 @@ def drop_table(path: str, force: bool = False, ignore_errors: bool = False) -> N
|
|
|
243
243
|
Examples:
|
|
244
244
|
>>> cl.drop_table('my_table')
|
|
245
245
|
"""
|
|
246
|
+
cat = Catalog.get()
|
|
246
247
|
path_obj = catalog.Path(path)
|
|
247
248
|
try:
|
|
248
|
-
|
|
249
|
+
cat.paths.check_is_valid(path_obj, expected=catalog.Table)
|
|
249
250
|
except Exception as e:
|
|
250
|
-
if ignore_errors:
|
|
251
|
+
if ignore_errors or force:
|
|
251
252
|
_logger.info(f'Skipped table `{path}` (does not exist).')
|
|
252
253
|
return
|
|
253
254
|
else:
|
|
254
255
|
raise e
|
|
255
|
-
tbl =
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
256
|
+
tbl = cat.paths[path_obj]
|
|
257
|
+
assert isinstance(tbl, catalog.Table)
|
|
258
|
+
if len(cat.tbl_dependents[tbl._id]) > 0:
|
|
259
|
+
dependent_paths = [dep.path for dep in cat.tbl_dependents[tbl._id]]
|
|
260
|
+
if force:
|
|
261
|
+
for dependent_path in dependent_paths:
|
|
262
|
+
drop_table(dependent_path, force=True)
|
|
263
|
+
else:
|
|
264
|
+
raise excs.Error(f'Table {path} has dependents: {", ".join(dependent_paths)}')
|
|
259
265
|
tbl._drop()
|
|
260
|
-
del
|
|
266
|
+
del cat.paths[path_obj]
|
|
261
267
|
_logger.info(f'Dropped table `{path}`.')
|
|
262
268
|
|
|
263
269
|
|
|
@@ -291,7 +297,7 @@ def list_tables(dir_path: str = '', recursive: bool = True) -> list[str]:
|
|
|
291
297
|
return [str(p) for p in Catalog.get().paths.get_children(path, child_type=catalog.Table, recursive=recursive)]
|
|
292
298
|
|
|
293
299
|
|
|
294
|
-
def create_dir(path_str: str, ignore_errors: bool = False) ->
|
|
300
|
+
def create_dir(path_str: str, ignore_errors: bool = False) -> catalog.Dir:
|
|
295
301
|
"""Create a directory.
|
|
296
302
|
|
|
297
303
|
Args:
|
|
@@ -319,10 +325,12 @@ def create_dir(path_str: str, ignore_errors: bool = False) -> None:
|
|
|
319
325
|
session.add(dir_record)
|
|
320
326
|
session.flush()
|
|
321
327
|
assert dir_record.id is not None
|
|
322
|
-
|
|
328
|
+
dir = catalog.Dir(dir_record.id, parent._id, path.name)
|
|
329
|
+
Catalog.get().paths[path] = dir
|
|
323
330
|
session.commit()
|
|
324
331
|
_logger.info(f'Created directory `{path_str}`.')
|
|
325
332
|
print(f'Created directory `{path_str}`.')
|
|
333
|
+
return dir
|
|
326
334
|
except excs.Error as e:
|
|
327
335
|
if ignore_errors:
|
|
328
336
|
return
|
|
@@ -330,7 +338,7 @@ def create_dir(path_str: str, ignore_errors: bool = False) -> None:
|
|
|
330
338
|
raise e
|
|
331
339
|
|
|
332
340
|
|
|
333
|
-
def
|
|
341
|
+
def drop_dir(path_str: str, force: bool = False, ignore_errors: bool = False) -> None:
|
|
334
342
|
"""Remove a directory.
|
|
335
343
|
|
|
336
344
|
Args:
|
|
@@ -340,31 +348,49 @@ def rm_dir(path_str: str) -> None:
|
|
|
340
348
|
Error: If the path does not exist or does not designate a directory or if the directory is not empty.
|
|
341
349
|
|
|
342
350
|
Examples:
|
|
343
|
-
>>> cl.
|
|
351
|
+
>>> cl.drop_dir('my_dir')
|
|
344
352
|
|
|
345
353
|
Remove a subdirectory:
|
|
346
354
|
|
|
347
|
-
>>> cl.
|
|
355
|
+
>>> cl.drop_dir('my_dir.sub_dir')
|
|
348
356
|
"""
|
|
357
|
+
cat = Catalog.get()
|
|
349
358
|
path = catalog.Path(path_str)
|
|
350
|
-
Catalog.get().paths.check_is_valid(path, expected=catalog.Dir)
|
|
351
359
|
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
|
|
360
|
+
try:
|
|
361
|
+
cat.paths.check_is_valid(path, expected=catalog.Dir)
|
|
362
|
+
except Exception as e:
|
|
363
|
+
if ignore_errors or force:
|
|
364
|
+
_logger.info(f'Skipped directory `{path}` (does not exist).')
|
|
365
|
+
return
|
|
366
|
+
else:
|
|
367
|
+
raise e
|
|
368
|
+
|
|
369
|
+
children = cat.paths.get_children(path, child_type=None, recursive=True)
|
|
370
|
+
|
|
371
|
+
if len(children) > 0 and not force:
|
|
372
|
+
raise excs.Error(f'Directory `{path_str}` is not empty.')
|
|
373
|
+
|
|
374
|
+
for child in children:
|
|
375
|
+
assert isinstance(child, catalog.Path)
|
|
376
|
+
# We need to check that the child is still in `cat.paths`, since it is possible it was
|
|
377
|
+
# already deleted as a dependent of a preceding child in the iteration.
|
|
378
|
+
try:
|
|
379
|
+
obj = cat.paths[child]
|
|
380
|
+
except excs.Error:
|
|
381
|
+
continue
|
|
382
|
+
if isinstance(obj, catalog.Dir):
|
|
383
|
+
drop_dir(str(child), force=True)
|
|
384
|
+
else:
|
|
385
|
+
assert isinstance(obj, catalog.Table)
|
|
386
|
+
assert not obj._is_dropped # else it should have been removed from `cat.paths` already
|
|
387
|
+
drop_table(str(child), force=True)
|
|
362
388
|
|
|
363
389
|
with Env.get().engine.begin() as conn:
|
|
364
390
|
dir = Catalog.get().paths[path]
|
|
365
391
|
conn.execute(sql.delete(schema.Dir.__table__).where(schema.Dir.id == dir._id))
|
|
366
392
|
del Catalog.get().paths[path]
|
|
367
|
-
_logger.info(f'Removed directory {path_str}')
|
|
393
|
+
_logger.info(f'Removed directory `{path_str}`.')
|
|
368
394
|
|
|
369
395
|
|
|
370
396
|
def list_dirs(path_str: str = '', recursive: bool = True) -> list[str]:
|
|
@@ -416,28 +442,6 @@ def list_functions() -> pd.DataFrame:
|
|
|
416
442
|
return pd_df.hide(axis='index')
|
|
417
443
|
|
|
418
444
|
|
|
419
|
-
def get_path(schema_obj: catalog.SchemaObject) -> str:
|
|
420
|
-
"""Returns the path to a SchemaObject.
|
|
421
|
-
|
|
422
|
-
Args:
|
|
423
|
-
schema_obj: SchemaObject to get the path for.
|
|
424
|
-
|
|
425
|
-
Returns:
|
|
426
|
-
Path to the SchemaObject.
|
|
427
|
-
"""
|
|
428
|
-
path_elements: list[str] = []
|
|
429
|
-
dir_id = schema_obj._dir_id
|
|
430
|
-
while dir_id is not None:
|
|
431
|
-
dir = Catalog.get().paths.get_schema_obj(dir_id)
|
|
432
|
-
if dir._dir_id is None:
|
|
433
|
-
# this is the root dir with name '', which we don't want to include in the path
|
|
434
|
-
break
|
|
435
|
-
path_elements.insert(0, dir._name)
|
|
436
|
-
dir_id = dir._dir_id
|
|
437
|
-
path_elements.append(schema_obj._name)
|
|
438
|
-
return '.'.join(path_elements)
|
|
439
|
-
|
|
440
|
-
|
|
441
445
|
def configure_logging(
|
|
442
446
|
*,
|
|
443
447
|
to_stdout: Optional[bool] = None,
|
|
@@ -24,6 +24,7 @@ class EmbeddingIndex(IndexBase):
|
|
|
24
24
|
- similarity_clause() converts those metrics back to their original form; it is used in expressions outside
|
|
25
25
|
the Order By clause
|
|
26
26
|
- order_by_clause() is used exclusively in the ORDER BY clause
|
|
27
|
+
- embedding function parameters are named '<type-name>_embed', where type-name is ColumnType.Type.name
|
|
27
28
|
"""
|
|
28
29
|
|
|
29
30
|
class Metric(enum.Enum):
|
|
@@ -38,30 +39,30 @@ class EmbeddingIndex(IndexBase):
|
|
|
38
39
|
}
|
|
39
40
|
|
|
40
41
|
def __init__(
|
|
41
|
-
self, c: catalog.Column, metric: str,
|
|
42
|
-
|
|
42
|
+
self, c: catalog.Column, metric: str, string_embed: Optional[func.Function] = None,
|
|
43
|
+
image_embed: Optional[func.Function] = None):
|
|
43
44
|
metric_names = [m.name.lower() for m in self.Metric]
|
|
44
45
|
if metric.lower() not in metric_names:
|
|
45
46
|
raise excs.Error(f'Invalid metric {metric}, must be one of {metric_names}')
|
|
46
47
|
if not c.col_type.is_string_type() and not c.col_type.is_image_type():
|
|
47
48
|
raise excs.Error(f'Embedding index requires string or image column')
|
|
48
|
-
if c.col_type.is_string_type() and
|
|
49
|
-
raise excs.Error(f
|
|
50
|
-
if c.col_type.is_image_type() and
|
|
51
|
-
raise excs.Error(f
|
|
52
|
-
if
|
|
49
|
+
if c.col_type.is_string_type() and string_embed is None:
|
|
50
|
+
raise excs.Error(f"Text embedding function is required for column {c.name} (parameter 'string_embed')")
|
|
51
|
+
if c.col_type.is_image_type() and image_embed is None:
|
|
52
|
+
raise excs.Error(f"Image embedding function is required for column {c.name} (parameter 'image_embed')")
|
|
53
|
+
if string_embed is not None:
|
|
53
54
|
# verify signature
|
|
54
|
-
self._validate_embedding_fn(
|
|
55
|
-
if
|
|
55
|
+
self._validate_embedding_fn(string_embed, 'string_embed', ts.ColumnType.Type.STRING)
|
|
56
|
+
if image_embed is not None:
|
|
56
57
|
# verify signature
|
|
57
|
-
self._validate_embedding_fn(
|
|
58
|
+
self._validate_embedding_fn(image_embed, 'image_embed', ts.ColumnType.Type.IMAGE)
|
|
58
59
|
|
|
59
60
|
self.metric = self.Metric[metric.upper()]
|
|
60
61
|
from pixeltable.exprs import ColumnRef
|
|
61
|
-
self.value_expr =
|
|
62
|
+
self.value_expr = string_embed(ColumnRef(c)) if c.col_type.is_string_type() else image_embed(ColumnRef(c))
|
|
62
63
|
assert self.value_expr.col_type.is_array_type()
|
|
63
|
-
self.
|
|
64
|
-
self.
|
|
64
|
+
self.string_embed = string_embed
|
|
65
|
+
self.image_embed = image_embed
|
|
65
66
|
vector_size = self.value_expr.col_type.shape[0]
|
|
66
67
|
assert vector_size is not None
|
|
67
68
|
self.index_col_type = pgvector.sqlalchemy.Vector(vector_size)
|
|
@@ -88,14 +89,14 @@ class EmbeddingIndex(IndexBase):
|
|
|
88
89
|
idx.create(bind=conn)
|
|
89
90
|
|
|
90
91
|
def similarity_clause(self, val_column: catalog.Column, item: Any) -> sql.ClauseElement:
|
|
91
|
-
"""Create a ClauseElement
|
|
92
|
+
"""Create a ClauseElement that represents '<val_column> <op> <item>'"""
|
|
92
93
|
assert isinstance(item, (str, PIL.Image.Image))
|
|
93
94
|
if isinstance(item, str):
|
|
94
|
-
assert self.
|
|
95
|
-
embedding = self.
|
|
95
|
+
assert self.string_embed is not None
|
|
96
|
+
embedding = self.string_embed.exec(item)
|
|
96
97
|
if isinstance(item, PIL.Image.Image):
|
|
97
|
-
assert self.
|
|
98
|
-
embedding = self.
|
|
98
|
+
assert self.image_embed is not None
|
|
99
|
+
embedding = self.image_embed.exec(item)
|
|
99
100
|
|
|
100
101
|
if self.metric == self.Metric.COSINE:
|
|
101
102
|
return val_column.sa_col.cosine_distance(embedding) * -1 + 1
|
|
@@ -110,11 +111,11 @@ class EmbeddingIndex(IndexBase):
|
|
|
110
111
|
assert isinstance(item, (str, PIL.Image.Image))
|
|
111
112
|
embedding: Optional[np.ndarray] = None
|
|
112
113
|
if isinstance(item, str):
|
|
113
|
-
assert self.
|
|
114
|
-
embedding = self.
|
|
114
|
+
assert self.string_embed is not None
|
|
115
|
+
embedding = self.string_embed.exec(item)
|
|
115
116
|
if isinstance(item, PIL.Image.Image):
|
|
116
|
-
assert self.
|
|
117
|
-
embedding = self.
|
|
117
|
+
assert self.image_embed is not None
|
|
118
|
+
embedding = self.image_embed.exec(item)
|
|
118
119
|
assert embedding is not None
|
|
119
120
|
|
|
120
121
|
if self.metric == self.Metric.COSINE:
|
|
@@ -160,12 +161,12 @@ class EmbeddingIndex(IndexBase):
|
|
|
160
161
|
def as_dict(self) -> dict:
|
|
161
162
|
return {
|
|
162
163
|
'metric': self.metric.name.lower(),
|
|
163
|
-
'
|
|
164
|
-
'
|
|
164
|
+
'string_embed': None if self.string_embed is None else self.string_embed.as_dict(),
|
|
165
|
+
'image_embed': None if self.image_embed is None else self.image_embed.as_dict()
|
|
165
166
|
}
|
|
166
167
|
|
|
167
168
|
@classmethod
|
|
168
169
|
def from_dict(cls, c: catalog.Column, d: dict) -> EmbeddingIndex:
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
return cls(c, metric=d['metric'],
|
|
170
|
+
string_embed = func.Function.from_dict(d['string_embed']) if d['string_embed'] is not None else None
|
|
171
|
+
image_embed = func.Function.from_dict(d['image_embed']) if d['image_embed'] is not None else None
|
|
172
|
+
return cls(c, metric=d['metric'], string_embed=string_embed, image_embed=image_embed)
|
pixeltable/io/external_store.py
CHANGED
|
@@ -222,12 +222,12 @@ class Project(ExternalStore, abc.ABC):
|
|
|
222
222
|
if t_col not in t_cols:
|
|
223
223
|
if is_user_specified_col_mapping:
|
|
224
224
|
raise excs.Error(
|
|
225
|
-
f'Column name `{t_col}` appears as a key in `col_mapping`, but Table `{table.
|
|
225
|
+
f'Column name `{t_col}` appears as a key in `col_mapping`, but Table `{table.name}` '
|
|
226
226
|
'contains no such column.'
|
|
227
227
|
)
|
|
228
228
|
else:
|
|
229
229
|
raise excs.Error(
|
|
230
|
-
f'Column `{t_col}` does not exist in Table `{table.
|
|
230
|
+
f'Column `{t_col}` does not exist in Table `{table.name}`. Either add a column `{t_col}`, '
|
|
231
231
|
f'or specify a `col_mapping` to associate a different column with the external field `{ext_col}`.'
|
|
232
232
|
)
|
|
233
233
|
if ext_col not in export_cols and ext_col not in import_cols:
|
pixeltable/io/globals.py
CHANGED
|
@@ -13,11 +13,14 @@ def create_label_studio_project(
|
|
|
13
13
|
media_import_method: Literal['post', 'file', 'url'] = 'post',
|
|
14
14
|
col_mapping: Optional[dict[str, str]] = None,
|
|
15
15
|
sync_immediately: bool = True,
|
|
16
|
+
s3_configuration: Optional[dict[str, Any]] = None,
|
|
16
17
|
**kwargs: Any
|
|
17
18
|
) -> SyncStatus:
|
|
18
|
-
# TODO(aaron-siegel): Add link in docstring to a Label Studio howto
|
|
19
19
|
"""
|
|
20
|
-
|
|
20
|
+
Create a new Label Studio project and link it to the specified `Table`.
|
|
21
|
+
|
|
22
|
+
- A tutorial notebook with fully worked examples can be found here:
|
|
23
|
+
[Using Label Studio for Annotations with Pixeltable](https://pixeltable.readme.io/docs/label-studio)
|
|
21
24
|
|
|
22
25
|
The required parameter `label_config` specifies the Label Studio project configuration,
|
|
23
26
|
in XML format, as described in the Label Studio documentation. The linked project will
|
|
@@ -41,6 +44,11 @@ def create_label_studio_project(
|
|
|
41
44
|
* Set the `LABEL_STUDIO_API_KEY` and `LABEL_STUDIO_URL` environment variables; or
|
|
42
45
|
* Specify `api_key` and `url` fields in the `label-studio` section of `$PIXELTABLE_HOME/config.yaml`.
|
|
43
46
|
|
|
47
|
+
__Requirements:__
|
|
48
|
+
|
|
49
|
+
- `pip install label-studio-sdk`
|
|
50
|
+
- `pip install boto3` (if using S3 import storage)
|
|
51
|
+
|
|
44
52
|
Args:
|
|
45
53
|
t: The Table to link to.
|
|
46
54
|
label_config: The Label Studio project configuration, in XML format.
|
|
@@ -50,8 +58,9 @@ def create_label_studio_project(
|
|
|
50
58
|
`ls_project_0`, `ls_project_1`, etc.
|
|
51
59
|
title: An optional title for the Label Studio project. This is the title that annotators
|
|
52
60
|
will see inside Label Studio. Unlike `name`, it does not need to be an identifier and
|
|
53
|
-
does not need to be unique. If not specified, the table name `t.
|
|
61
|
+
does not need to be unique. If not specified, the table name `t.name` will be used.
|
|
54
62
|
media_import_method: The method to use when transferring media files to Label Studio:
|
|
63
|
+
|
|
55
64
|
- `post`: Media will be sent to Label Studio via HTTP post. This should generally only be used for
|
|
56
65
|
prototyping; due to restrictions in Label Studio, it can only be used with projects that have
|
|
57
66
|
just one data field, and does not scale well.
|
|
@@ -63,9 +72,48 @@ def create_label_studio_project(
|
|
|
63
72
|
col_mapping: An optional mapping of local column names to Label Studio fields.
|
|
64
73
|
sync_immediately: If `True`, immediately perform an initial synchronization by
|
|
65
74
|
exporting all rows of the `Table` as Label Studio tasks.
|
|
75
|
+
s3_configuration: If specified, S3 import storage will be configured for the new project. This can only
|
|
76
|
+
be used with `media_import_method='url'`, and if `media_import_method='url'` and any of the media data is
|
|
77
|
+
referenced by `s3://` URLs, then it must be specified in order for such media to display correctly
|
|
78
|
+
in the Label Studio interface.
|
|
79
|
+
|
|
80
|
+
The items in the `s3_configuration` dictionary correspond to kwarg
|
|
81
|
+
parameters of the Label Studio `connect_s3_import_storage` method, as described in the
|
|
82
|
+
[Label Studio connect_s3_import_storage docs](https://labelstud.io/sdk/project.html#label_studio_sdk.project.Project.connect_s3_import_storage).
|
|
83
|
+
`bucket` must be specified; all other parameters are optional. If credentials are not specified explicitly,
|
|
84
|
+
Pixeltable will attempt to retrieve them from the environment (such as from `~/.aws/credentials`). If a title is not
|
|
85
|
+
specified, Pixeltable will use the default `'Pixeltable-S3-Import-Storage'`. All other parameters use their Label
|
|
86
|
+
Studio defaults.
|
|
66
87
|
kwargs: Additional keyword arguments are passed to the `start_project` method in the Label
|
|
67
|
-
Studio SDK, as described
|
|
68
|
-
https://labelstud.io/sdk/project.html#label_studio_sdk.project.Project.start_project
|
|
88
|
+
Studio SDK, as described in the
|
|
89
|
+
[Label Studio start_project docs](https://labelstud.io/sdk/project.html#label_studio_sdk.project.Project.start_project).
|
|
90
|
+
|
|
91
|
+
Returns:
|
|
92
|
+
A `SyncStatus` representing the status of any synchronization operations that occurred.
|
|
93
|
+
|
|
94
|
+
Examples:
|
|
95
|
+
Create a Label Studio project whose tasks correspond to videos stored in the `video_col` column of the table `tbl`:
|
|
96
|
+
|
|
97
|
+
>>> config = \"\"\"
|
|
98
|
+
<View>
|
|
99
|
+
<Video name="video_obj" value="$video_col"/>
|
|
100
|
+
<Choices name="video-category" toName="video_obj" showInLine="true">
|
|
101
|
+
<Choice value="city"/>
|
|
102
|
+
<Choice value="food"/>
|
|
103
|
+
<Choice value="sports"/>
|
|
104
|
+
</Choices>
|
|
105
|
+
</View>\"\"\"
|
|
106
|
+
create_label_studio_project(tbl, config)
|
|
107
|
+
|
|
108
|
+
Create a Label Studio project with the same configuration, using `media_import_method='url'`,
|
|
109
|
+
whose media are stored in an S3 bucket:
|
|
110
|
+
|
|
111
|
+
>>> create_label_studio_project(
|
|
112
|
+
tbl,
|
|
113
|
+
config,
|
|
114
|
+
media_import_method='url',
|
|
115
|
+
s3_configuration={'bucket': 'my-bucket', 'region_name': 'us-east-2'}
|
|
116
|
+
)
|
|
69
117
|
"""
|
|
70
118
|
from pixeltable.io.label_studio import LabelStudioProject
|
|
71
119
|
|
|
@@ -76,6 +124,7 @@ def create_label_studio_project(
|
|
|
76
124
|
title,
|
|
77
125
|
media_import_method,
|
|
78
126
|
col_mapping,
|
|
127
|
+
s3_configuration,
|
|
79
128
|
**kwargs
|
|
80
129
|
)
|
|
81
130
|
|
pixeltable/io/label_studio.py
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import copy
|
|
1
2
|
import json
|
|
2
3
|
import logging
|
|
3
4
|
import os
|
|
@@ -18,6 +19,15 @@ from pixeltable.exprs import ColumnRef, DataRow, Expr
|
|
|
18
19
|
from pixeltable.io.external_store import Project, SyncStatus
|
|
19
20
|
from pixeltable.utils import coco
|
|
20
21
|
|
|
22
|
+
# label_studio_sdk>=1 and label_studio_sdk<1 are not compatible, so we need to try
|
|
23
|
+
# the import two different ways to ensure intercompatibility
|
|
24
|
+
try:
|
|
25
|
+
# label_studio_sdk<1 compatibility
|
|
26
|
+
import label_studio_sdk.project as ls_project # type: ignore
|
|
27
|
+
except ImportError:
|
|
28
|
+
# label_studio_sdk>=1 compatibility
|
|
29
|
+
import label_studio_sdk._legacy.project as ls_project # type: ignore
|
|
30
|
+
|
|
21
31
|
_logger = logging.getLogger('pixeltable')
|
|
22
32
|
|
|
23
33
|
|
|
@@ -50,11 +60,11 @@ class LabelStudioProject(Project):
|
|
|
50
60
|
"""
|
|
51
61
|
self.project_id = project_id
|
|
52
62
|
self.media_import_method = media_import_method
|
|
53
|
-
self._project: Optional[
|
|
63
|
+
self._project: Optional[ls_project.Project] = None
|
|
54
64
|
super().__init__(name, col_mapping, stored_proxies)
|
|
55
65
|
|
|
56
66
|
@property
|
|
57
|
-
def project(self) ->
|
|
67
|
+
def project(self) -> ls_project.Project:
|
|
58
68
|
"""The `Project` object corresponding to this Label Studio project."""
|
|
59
69
|
if self._project is None:
|
|
60
70
|
try:
|
|
@@ -95,7 +105,7 @@ class LabelStudioProject(Project):
|
|
|
95
105
|
return {ANNOTATIONS_COLUMN: pxt.JsonType(nullable=True)}
|
|
96
106
|
|
|
97
107
|
def sync(self, t: Table, export_data: bool, import_data: bool) -> SyncStatus:
|
|
98
|
-
_logger.info(f'Syncing Label Studio project "{self.project_title}" with table `{t.
|
|
108
|
+
_logger.info(f'Syncing Label Studio project "{self.project_title}" with table `{t.name}`'
|
|
99
109
|
f' (export: {export_data}, import: {import_data}).')
|
|
100
110
|
# Collect all existing tasks into a dict with entries `rowid: task`
|
|
101
111
|
tasks = {tuple(task['meta']['rowid']): task for task in self.__fetch_all_tasks()}
|
|
@@ -386,7 +396,7 @@ class LabelStudioProject(Project):
|
|
|
386
396
|
updates = [{'_rowid': rowid, local_annotations_col.name: ann} for rowid, ann in annotations.items()]
|
|
387
397
|
if len(updates) > 0:
|
|
388
398
|
_logger.info(
|
|
389
|
-
f'Updating table `{t.
|
|
399
|
+
f'Updating table `{t.name}`, column `{local_annotations_col.name}` with {len(updates)} total annotations.'
|
|
390
400
|
)
|
|
391
401
|
# batch_update currently doesn't propagate from views to base tables. As a workaround, we call
|
|
392
402
|
# batch_update on the actual ancestor table that holds the annotations column.
|
|
@@ -536,6 +546,7 @@ class LabelStudioProject(Project):
|
|
|
536
546
|
title: Optional[str],
|
|
537
547
|
media_import_method: Literal['post', 'file', 'url'],
|
|
538
548
|
col_mapping: Optional[dict[str, str]],
|
|
549
|
+
s3_configuration: Optional[dict[str, Any]],
|
|
539
550
|
**kwargs: Any
|
|
540
551
|
) -> 'LabelStudioProject':
|
|
541
552
|
"""
|
|
@@ -554,7 +565,7 @@ class LabelStudioProject(Project):
|
|
|
554
565
|
|
|
555
566
|
if title is None:
|
|
556
567
|
# `title` defaults to table name
|
|
557
|
-
title = t.
|
|
568
|
+
title = t.name
|
|
558
569
|
|
|
559
570
|
# Create a column to hold the annotations, if one does not yet exist
|
|
560
571
|
if col_mapping is None or ANNOTATIONS_COLUMN in col_mapping.values():
|
|
@@ -572,6 +583,31 @@ class LabelStudioProject(Project):
|
|
|
572
583
|
if media_import_method == 'post' and len(config.data_keys) > 1:
|
|
573
584
|
raise excs.Error('`media_import_method` cannot be `post` if there is more than one data key')
|
|
574
585
|
|
|
586
|
+
if s3_configuration is not None:
|
|
587
|
+
if media_import_method != 'url':
|
|
588
|
+
raise excs.Error("`s3_configuration` is only valid when `media_import_method == 'url'`")
|
|
589
|
+
s3_configuration = copy.copy(s3_configuration)
|
|
590
|
+
if not 'bucket' in s3_configuration:
|
|
591
|
+
raise excs.Error('`s3_configuration` must contain a `bucket` field')
|
|
592
|
+
if not 'title' in s3_configuration:
|
|
593
|
+
s3_configuration['title'] = 'Pixeltable-S3-Import-Storage'
|
|
594
|
+
if ('aws_access_key_id' not in s3_configuration and
|
|
595
|
+
'aws_secret_access_key' not in s3_configuration and
|
|
596
|
+
'aws_session_token' not in s3_configuration):
|
|
597
|
+
# Attempt to fill any missing credentials from the environment
|
|
598
|
+
try:
|
|
599
|
+
import boto3
|
|
600
|
+
s3_credentials = boto3.Session().get_credentials().get_frozen_credentials()
|
|
601
|
+
_logger.info(f'Using AWS credentials from the environment for Label Studio project: {title}')
|
|
602
|
+
s3_configuration['aws_access_key_id'] = s3_credentials.access_key
|
|
603
|
+
s3_configuration['aws_secret_access_key'] = s3_credentials.secret_key
|
|
604
|
+
s3_configuration['aws_session_token'] = s3_credentials.token
|
|
605
|
+
except Exception as exc:
|
|
606
|
+
# This is not necessarily a problem, but we should log that it happened
|
|
607
|
+
_logger.debug(f'Unable to retrieve AWS credentials from the environment: {exc}')
|
|
608
|
+
pass
|
|
609
|
+
|
|
610
|
+
_logger.info(f'Creating Label Studio project: {title}')
|
|
575
611
|
project = _label_studio_client().start_project(title=title, label_config=label_config, **kwargs)
|
|
576
612
|
|
|
577
613
|
if media_import_method == 'file':
|
|
@@ -591,6 +627,10 @@ class LabelStudioProject(Project):
|
|
|
591
627
|
) from exc
|
|
592
628
|
raise # Handle any other exception type normally
|
|
593
629
|
|
|
630
|
+
if s3_configuration is not None:
|
|
631
|
+
_logger.info(f'Setting up S3 import storage for Label Studio project: {title}')
|
|
632
|
+
project.connect_s3_import_storage(**s3_configuration)
|
|
633
|
+
|
|
594
634
|
project_id = project.get_params()['id']
|
|
595
635
|
return LabelStudioProject(name, project_id, media_import_method, resolved_col_mapping)
|
|
596
636
|
|
pixeltable/io/pandas.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
from typing import Optional, Any, Iterable
|
|
1
|
+
from typing import Optional, Any, Iterable, Union
|
|
2
2
|
|
|
3
3
|
import numpy as np
|
|
4
4
|
import pandas as pd
|
|
@@ -9,7 +9,10 @@ import pixeltable.type_system as ts
|
|
|
9
9
|
|
|
10
10
|
|
|
11
11
|
def import_pandas(
|
|
12
|
-
tbl_name: str, df: pd.DataFrame, *, schema_overrides: Optional[dict[str, pxt.ColumnType]] = None
|
|
12
|
+
tbl_name: str, df: pd.DataFrame, *, schema_overrides: Optional[dict[str, pxt.ColumnType]] = None,
|
|
13
|
+
primary_key: Optional[Union[str, list[str]]] = None,
|
|
14
|
+
num_retained_versions: int = 10,
|
|
15
|
+
comment: str = ''
|
|
13
16
|
) -> pxt.catalog.InsertableTable:
|
|
14
17
|
"""Creates a new `Table` from a Pandas `DataFrame`, with the specified name. The schema of the table
|
|
15
18
|
will be inferred from the `DataFrame`, unless `schema_overrides` is specified.
|
|
@@ -31,13 +34,17 @@ def import_pandas(
|
|
|
31
34
|
"""
|
|
32
35
|
schema = _df_to_pxt_schema(df, schema_overrides)
|
|
33
36
|
tbl_rows = (dict(_df_row_to_pxt_row(row, schema)) for row in df.itertuples())
|
|
34
|
-
table = pxt.create_table(tbl_name, schema)
|
|
37
|
+
table = pxt.create_table(tbl_name, schema, primary_key=primary_key, num_retained_versions=num_retained_versions, comment=comment)
|
|
35
38
|
table.insert(tbl_rows)
|
|
36
39
|
return table
|
|
37
40
|
|
|
38
41
|
|
|
39
42
|
def import_csv(
|
|
40
|
-
|
|
43
|
+
tbl_name: str, filepath_or_buffer, schema_overrides: Optional[dict[str, ts.ColumnType]] = None,
|
|
44
|
+
primary_key: Optional[Union[str, list[str]]] = None,
|
|
45
|
+
num_retained_versions: int = 10,
|
|
46
|
+
comment: str = '',
|
|
47
|
+
**kwargs
|
|
41
48
|
) -> pxt.catalog.InsertableTable:
|
|
42
49
|
"""
|
|
43
50
|
Creates a new `Table` from a csv file. This is a convenience method and is equivalent
|
|
@@ -45,11 +52,15 @@ def import_csv(
|
|
|
45
52
|
See the Pandas documentation for `read_csv` for more details.
|
|
46
53
|
"""
|
|
47
54
|
df = pd.read_csv(filepath_or_buffer, **kwargs)
|
|
48
|
-
return import_pandas(
|
|
55
|
+
return import_pandas(tbl_name, df, schema_overrides=schema_overrides, primary_key=primary_key, num_retained_versions=num_retained_versions, comment=comment)
|
|
49
56
|
|
|
50
57
|
|
|
51
58
|
def import_excel(
|
|
52
|
-
|
|
59
|
+
tbl_name: str, io, *args, schema_overrides: Optional[dict[str, ts.ColumnType]] = None,
|
|
60
|
+
primary_key: Optional[Union[str, list[str]]] = None,
|
|
61
|
+
num_retained_versions: int = 10,
|
|
62
|
+
comment: str = '',
|
|
63
|
+
**kwargs
|
|
53
64
|
) -> pxt.catalog.InsertableTable:
|
|
54
65
|
"""
|
|
55
66
|
Creates a new `Table` from an excel (.xlsx) file. This is a convenience method and is equivalent
|
|
@@ -57,7 +68,7 @@ def import_excel(
|
|
|
57
68
|
See the Pandas documentation for `read_excel` for more details.
|
|
58
69
|
"""
|
|
59
70
|
df = pd.read_excel(io, *args, **kwargs)
|
|
60
|
-
return import_pandas(
|
|
71
|
+
return import_pandas(tbl_name, df, schema_overrides=schema_overrides, primary_key=primary_key, num_retained_versions=num_retained_versions, comment=comment)
|
|
61
72
|
|
|
62
73
|
|
|
63
74
|
def _df_to_pxt_schema(
|
pixeltable/metadata/__init__.py
CHANGED
|
@@ -10,7 +10,7 @@ import sqlalchemy.orm as orm
|
|
|
10
10
|
from .schema import SystemInfo, SystemInfoMd
|
|
11
11
|
|
|
12
12
|
# current version of the metadata; this is incremented whenever the metadata schema changes
|
|
13
|
-
VERSION =
|
|
13
|
+
VERSION = 18
|
|
14
14
|
|
|
15
15
|
|
|
16
16
|
def create_system_info(engine: sql.engine.Engine) -> None:
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
import sqlalchemy as sql
|
|
2
|
+
|
|
3
|
+
from pixeltable.metadata import register_converter
|
|
4
|
+
from pixeltable.metadata.converters.util import convert_table_md
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
@register_converter(version=17)
|
|
8
|
+
def _(engine: sql.engine.Engine) -> None:
|
|
9
|
+
convert_table_md(
|
|
10
|
+
engine,
|
|
11
|
+
table_md_updater=__update_table_md
|
|
12
|
+
)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def __update_table_md(table_md: dict) -> None:
|
|
16
|
+
# key changes in IndexMd.init_args: img_embed -> image_embed, txt_embed -> string_embed
|
|
17
|
+
if len(table_md['index_md']) == 0:
|
|
18
|
+
return
|
|
19
|
+
for idx_md in table_md['index_md'].values():
|
|
20
|
+
if not idx_md['class_fqn'].endswith('.EmbeddingIndex'):
|
|
21
|
+
continue
|
|
22
|
+
init_dict = idx_md['init_args']
|
|
23
|
+
init_dict['image_embed'] = init_dict['img_embed']
|
|
24
|
+
del init_dict['img_embed']
|
|
25
|
+
init_dict['string_embed'] = init_dict['txt_embed']
|
|
26
|
+
del init_dict['txt_embed']
|