pixeltable 0.2.10__py3-none-any.whl → 0.2.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pixeltable might be problematic. Click here for more details.
- pixeltable/__init__.py +2 -2
- pixeltable/__version__.py +2 -2
- pixeltable/catalog/catalog.py +3 -3
- pixeltable/catalog/globals.py +2 -0
- pixeltable/catalog/insertable_table.py +1 -11
- pixeltable/catalog/schema_object.py +28 -2
- pixeltable/catalog/table.py +76 -97
- pixeltable/catalog/table_version.py +96 -58
- pixeltable/catalog/table_version_path.py +1 -1
- pixeltable/catalog/view.py +31 -27
- pixeltable/dataframe.py +32 -115
- pixeltable/exprs/column_ref.py +2 -7
- pixeltable/exprs/similarity_expr.py +27 -16
- pixeltable/functions/openai.py +1 -1
- pixeltable/globals.py +70 -53
- pixeltable/index/embedding_index.py +28 -27
- pixeltable/io/external_store.py +2 -2
- pixeltable/io/globals.py +1 -1
- pixeltable/io/label_studio.py +3 -3
- pixeltable/metadata/__init__.py +1 -1
- pixeltable/metadata/converters/convert_17.py +26 -0
- pixeltable/tool/create_test_db_dump.py +1 -1
- pixeltable/utils/formatter.py +234 -0
- {pixeltable-0.2.10.dist-info → pixeltable-0.2.12.dist-info}/METADATA +4 -4
- {pixeltable-0.2.10.dist-info → pixeltable-0.2.12.dist-info}/RECORD +27 -25
- {pixeltable-0.2.10.dist-info → pixeltable-0.2.12.dist-info}/LICENSE +0 -0
- {pixeltable-0.2.10.dist-info → pixeltable-0.2.12.dist-info}/WHEEL +0 -0
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
from typing import Optional, List
|
|
1
|
+
from typing import Optional, List, Any
|
|
2
2
|
|
|
3
3
|
import sqlalchemy as sql
|
|
4
4
|
import PIL.Image
|
|
@@ -14,33 +14,44 @@ from .row_builder import RowBuilder
|
|
|
14
14
|
|
|
15
15
|
class SimilarityExpr(Expr):
|
|
16
16
|
|
|
17
|
-
def __init__(self, col_ref: ColumnRef, item:
|
|
17
|
+
def __init__(self, col_ref: ColumnRef, item: Any, idx_name: Optional[str] = None):
|
|
18
18
|
super().__init__(ts.FloatType())
|
|
19
|
-
|
|
19
|
+
item_expr = Expr.from_object(item)
|
|
20
|
+
if item_expr is None or not(item_expr.col_type.is_string_type() or item_expr.col_type.is_image_type()):
|
|
21
|
+
raise excs.Error(f'similarity(): requires a string or a PIL.Image.Image object, not a {type(item)}')
|
|
22
|
+
assert item_expr.col_type.is_string_type() or item_expr.col_type.is_image_type()
|
|
23
|
+
|
|
24
|
+
self.components = [col_ref, item_expr]
|
|
20
25
|
self.id = self._create_id()
|
|
21
|
-
assert item.col_type.is_string_type() or item.col_type.is_image_type()
|
|
22
26
|
|
|
23
27
|
# determine index to use
|
|
24
28
|
idx_info = col_ref.col.get_idx_info()
|
|
25
29
|
import pixeltable.index as index
|
|
26
|
-
embedding_idx_info =
|
|
30
|
+
embedding_idx_info = {
|
|
31
|
+
info.name: info for info in idx_info.values() if isinstance(info.idx, index.EmbeddingIndex)
|
|
32
|
+
}
|
|
27
33
|
if len(embedding_idx_info) == 0:
|
|
28
|
-
raise excs.Error(f'No index found for column {col_ref.col}')
|
|
34
|
+
raise excs.Error(f'No index found for column {col_ref.col!r}')
|
|
35
|
+
if idx_name is not None and idx_name not in embedding_idx_info:
|
|
36
|
+
raise excs.Error(f'Index {idx_name!r} not found for column {col_ref.col.name!r}')
|
|
29
37
|
if len(embedding_idx_info) > 1:
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
38
|
+
if idx_name is None:
|
|
39
|
+
raise excs.Error(
|
|
40
|
+
f'Column {col_ref.col.name!r} has multiple indices; use the index name to disambiguate: '
|
|
41
|
+
f'`{col_ref.col.name}.similarity(..., idx=<name>)`')
|
|
42
|
+
self.idx_info = embedding_idx_info[idx_name]
|
|
43
|
+
else:
|
|
44
|
+
self.idx_info = next(iter(embedding_idx_info.values()))
|
|
34
45
|
idx = self.idx_info.idx
|
|
35
46
|
|
|
36
|
-
if
|
|
47
|
+
if item_expr.col_type.is_string_type() and idx.string_embed is None:
|
|
37
48
|
raise excs.Error(
|
|
38
|
-
f'Embedding index {self.idx_info.name} on column {self.idx_info.col.name} was created without the '
|
|
39
|
-
f'
|
|
40
|
-
if
|
|
49
|
+
f'Embedding index {self.idx_info.name!r} on column {self.idx_info.col.name!r} was created without the '
|
|
50
|
+
f"'string_embed' parameter and does not support string queries")
|
|
51
|
+
if item_expr.col_type.is_image_type() and idx.image_embed is None:
|
|
41
52
|
raise excs.Error(
|
|
42
|
-
f'Embedding index {self.idx_info.name} on column {self.idx_info.col.name} was created without the '
|
|
43
|
-
f'
|
|
53
|
+
f'Embedding index {self.idx_info.name!r} on column {self.idx_info.col.name!r} was created without the '
|
|
54
|
+
f"'image_embed' parameter and does not support image queries")
|
|
44
55
|
|
|
45
56
|
def __str__(self) -> str:
|
|
46
57
|
return f'{self.components[0]}.similarity({self.components[1]})'
|
pixeltable/functions/openai.py
CHANGED
|
@@ -141,7 +141,7 @@ def chat_completions(
|
|
|
141
141
|
|
|
142
142
|
|
|
143
143
|
@pxt.udf
|
|
144
|
-
def vision(prompt: str, image: PIL.Image.Image, *, model: str
|
|
144
|
+
def vision(prompt: str, image: PIL.Image.Image, *, model: str) -> str:
|
|
145
145
|
# TODO(aaron-siegel): Decompose CPU/GPU ops into separate functions
|
|
146
146
|
bytes_arr = io.BytesIO()
|
|
147
147
|
image.save(bytes_arr, format='png')
|
pixeltable/globals.py
CHANGED
|
@@ -1,13 +1,13 @@
|
|
|
1
1
|
import dataclasses
|
|
2
2
|
import logging
|
|
3
|
-
from typing import Any, Optional, Union
|
|
3
|
+
from typing import Any, Optional, Union
|
|
4
4
|
|
|
5
5
|
import pandas as pd
|
|
6
6
|
import sqlalchemy as sql
|
|
7
7
|
from sqlalchemy.util.preloaded import orm
|
|
8
8
|
|
|
9
9
|
import pixeltable.exceptions as excs
|
|
10
|
-
from pixeltable import catalog, func
|
|
10
|
+
from pixeltable import catalog, func, DataFrame
|
|
11
11
|
from pixeltable.catalog import Catalog
|
|
12
12
|
from pixeltable.env import Env
|
|
13
13
|
from pixeltable.exprs import Predicate
|
|
@@ -78,7 +78,7 @@ def create_table(
|
|
|
78
78
|
|
|
79
79
|
def create_view(
|
|
80
80
|
path_str: str,
|
|
81
|
-
base: catalog.Table,
|
|
81
|
+
base: Union[catalog.Table, DataFrame],
|
|
82
82
|
*,
|
|
83
83
|
schema: Optional[dict[str, Any]] = None,
|
|
84
84
|
filter: Optional[Predicate] = None,
|
|
@@ -92,7 +92,7 @@ def create_view(
|
|
|
92
92
|
|
|
93
93
|
Args:
|
|
94
94
|
path_str: Path to the view.
|
|
95
|
-
base: Table (
|
|
95
|
+
base: Table (i.e., table or view or snapshot) or DataFrame to base the view on.
|
|
96
96
|
schema: dictionary mapping column names to column types, value expressions, or to column specifications.
|
|
97
97
|
filter: Predicate to filter rows of the base table.
|
|
98
98
|
is_snapshot: Whether the view is a snapshot.
|
|
@@ -122,7 +122,19 @@ def create_view(
|
|
|
122
122
|
>>> snapshot_view = cl.create_view(
|
|
123
123
|
'my_snapshot', base, schema={'col3': base.col2 + 1}, filter=base.col1 > 10, is_snapshot=True)
|
|
124
124
|
"""
|
|
125
|
-
|
|
125
|
+
if isinstance(base, catalog.Table):
|
|
126
|
+
tbl_version_path = base._tbl_version_path
|
|
127
|
+
elif isinstance(base, DataFrame):
|
|
128
|
+
base._validate_mutable('create_view')
|
|
129
|
+
tbl_version_path = base.tbl
|
|
130
|
+
if base.where_clause is not None and filter is not None:
|
|
131
|
+
raise excs.Error(
|
|
132
|
+
'Cannot specify a `filter` directly if one is already declared in a `DataFrame.where` clause'
|
|
133
|
+
)
|
|
134
|
+
filter = base.where_clause
|
|
135
|
+
else:
|
|
136
|
+
raise excs.Error('`base` must be an instance of `Table` or `DataFrame`')
|
|
137
|
+
assert isinstance(base, catalog.Table) or isinstance(base, DataFrame)
|
|
126
138
|
path = catalog.Path(path_str)
|
|
127
139
|
try:
|
|
128
140
|
Catalog.get().paths.check_is_valid(path, expected=None)
|
|
@@ -139,10 +151,11 @@ def create_view(
|
|
|
139
151
|
iterator_class, iterator_args = None, None
|
|
140
152
|
else:
|
|
141
153
|
iterator_class, iterator_args = iterator
|
|
154
|
+
|
|
142
155
|
view = catalog.View.create(
|
|
143
156
|
dir._id,
|
|
144
157
|
path.name,
|
|
145
|
-
base=
|
|
158
|
+
base=tbl_version_path,
|
|
146
159
|
schema=schema,
|
|
147
160
|
predicate=filter,
|
|
148
161
|
is_snapshot=is_snapshot,
|
|
@@ -221,7 +234,7 @@ def drop_table(path: str, force: bool = False, ignore_errors: bool = False) -> N
|
|
|
221
234
|
|
|
222
235
|
Args:
|
|
223
236
|
path: Path to the table.
|
|
224
|
-
force:
|
|
237
|
+
force: If `True`, will also drop all views or sub-views of this table.
|
|
225
238
|
ignore_errors: Whether to ignore errors if the table does not exist.
|
|
226
239
|
|
|
227
240
|
Raises:
|
|
@@ -230,21 +243,27 @@ def drop_table(path: str, force: bool = False, ignore_errors: bool = False) -> N
|
|
|
230
243
|
Examples:
|
|
231
244
|
>>> cl.drop_table('my_table')
|
|
232
245
|
"""
|
|
246
|
+
cat = Catalog.get()
|
|
233
247
|
path_obj = catalog.Path(path)
|
|
234
248
|
try:
|
|
235
|
-
|
|
249
|
+
cat.paths.check_is_valid(path_obj, expected=catalog.Table)
|
|
236
250
|
except Exception as e:
|
|
237
|
-
if ignore_errors:
|
|
251
|
+
if ignore_errors or force:
|
|
238
252
|
_logger.info(f'Skipped table `{path}` (does not exist).')
|
|
239
253
|
return
|
|
240
254
|
else:
|
|
241
255
|
raise e
|
|
242
|
-
tbl =
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
256
|
+
tbl = cat.paths[path_obj]
|
|
257
|
+
assert isinstance(tbl, catalog.Table)
|
|
258
|
+
if len(cat.tbl_dependents[tbl._id]) > 0:
|
|
259
|
+
dependent_paths = [dep.path for dep in cat.tbl_dependents[tbl._id]]
|
|
260
|
+
if force:
|
|
261
|
+
for dependent_path in dependent_paths:
|
|
262
|
+
drop_table(dependent_path, force=True)
|
|
263
|
+
else:
|
|
264
|
+
raise excs.Error(f'Table {path} has dependents: {", ".join(dependent_paths)}')
|
|
246
265
|
tbl._drop()
|
|
247
|
-
del
|
|
266
|
+
del cat.paths[path_obj]
|
|
248
267
|
_logger.info(f'Dropped table `{path}`.')
|
|
249
268
|
|
|
250
269
|
|
|
@@ -278,7 +297,7 @@ def list_tables(dir_path: str = '', recursive: bool = True) -> list[str]:
|
|
|
278
297
|
return [str(p) for p in Catalog.get().paths.get_children(path, child_type=catalog.Table, recursive=recursive)]
|
|
279
298
|
|
|
280
299
|
|
|
281
|
-
def create_dir(path_str: str, ignore_errors: bool = False) ->
|
|
300
|
+
def create_dir(path_str: str, ignore_errors: bool = False) -> catalog.Dir:
|
|
282
301
|
"""Create a directory.
|
|
283
302
|
|
|
284
303
|
Args:
|
|
@@ -306,10 +325,12 @@ def create_dir(path_str: str, ignore_errors: bool = False) -> None:
|
|
|
306
325
|
session.add(dir_record)
|
|
307
326
|
session.flush()
|
|
308
327
|
assert dir_record.id is not None
|
|
309
|
-
|
|
328
|
+
dir = catalog.Dir(dir_record.id, parent._id, path.name)
|
|
329
|
+
Catalog.get().paths[path] = dir
|
|
310
330
|
session.commit()
|
|
311
331
|
_logger.info(f'Created directory `{path_str}`.')
|
|
312
332
|
print(f'Created directory `{path_str}`.')
|
|
333
|
+
return dir
|
|
313
334
|
except excs.Error as e:
|
|
314
335
|
if ignore_errors:
|
|
315
336
|
return
|
|
@@ -317,7 +338,7 @@ def create_dir(path_str: str, ignore_errors: bool = False) -> None:
|
|
|
317
338
|
raise e
|
|
318
339
|
|
|
319
340
|
|
|
320
|
-
def
|
|
341
|
+
def drop_dir(path_str: str, force: bool = False, ignore_errors: bool = False) -> None:
|
|
321
342
|
"""Remove a directory.
|
|
322
343
|
|
|
323
344
|
Args:
|
|
@@ -327,31 +348,49 @@ def rm_dir(path_str: str) -> None:
|
|
|
327
348
|
Error: If the path does not exist or does not designate a directory or if the directory is not empty.
|
|
328
349
|
|
|
329
350
|
Examples:
|
|
330
|
-
>>> cl.
|
|
351
|
+
>>> cl.drop_dir('my_dir')
|
|
331
352
|
|
|
332
353
|
Remove a subdirectory:
|
|
333
354
|
|
|
334
|
-
>>> cl.
|
|
355
|
+
>>> cl.drop_dir('my_dir.sub_dir')
|
|
335
356
|
"""
|
|
357
|
+
cat = Catalog.get()
|
|
336
358
|
path = catalog.Path(path_str)
|
|
337
|
-
Catalog.get().paths.check_is_valid(path, expected=catalog.Dir)
|
|
338
359
|
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
|
|
360
|
+
try:
|
|
361
|
+
cat.paths.check_is_valid(path, expected=catalog.Dir)
|
|
362
|
+
except Exception as e:
|
|
363
|
+
if ignore_errors or force:
|
|
364
|
+
_logger.info(f'Skipped directory `{path}` (does not exist).')
|
|
365
|
+
return
|
|
366
|
+
else:
|
|
367
|
+
raise e
|
|
368
|
+
|
|
369
|
+
children = cat.paths.get_children(path, child_type=None, recursive=True)
|
|
370
|
+
|
|
371
|
+
if len(children) > 0 and not force:
|
|
372
|
+
raise excs.Error(f'Directory `{path_str}` is not empty.')
|
|
373
|
+
|
|
374
|
+
for child in children:
|
|
375
|
+
assert isinstance(child, catalog.Path)
|
|
376
|
+
# We need to check that the child is still in `cat.paths`, since it is possible it was
|
|
377
|
+
# already deleted as a dependent of a preceding child in the iteration.
|
|
378
|
+
try:
|
|
379
|
+
obj = cat.paths[child]
|
|
380
|
+
except excs.Error:
|
|
381
|
+
continue
|
|
382
|
+
if isinstance(obj, catalog.Dir):
|
|
383
|
+
drop_dir(str(child), force=True)
|
|
384
|
+
else:
|
|
385
|
+
assert isinstance(obj, catalog.Table)
|
|
386
|
+
assert not obj._is_dropped # else it should have been removed from `cat.paths` already
|
|
387
|
+
drop_table(str(child), force=True)
|
|
349
388
|
|
|
350
389
|
with Env.get().engine.begin() as conn:
|
|
351
390
|
dir = Catalog.get().paths[path]
|
|
352
391
|
conn.execute(sql.delete(schema.Dir.__table__).where(schema.Dir.id == dir._id))
|
|
353
392
|
del Catalog.get().paths[path]
|
|
354
|
-
_logger.info(f'Removed directory {path_str}')
|
|
393
|
+
_logger.info(f'Removed directory `{path_str}`.')
|
|
355
394
|
|
|
356
395
|
|
|
357
396
|
def list_dirs(path_str: str = '', recursive: bool = True) -> list[str]:
|
|
@@ -403,28 +442,6 @@ def list_functions() -> pd.DataFrame:
|
|
|
403
442
|
return pd_df.hide(axis='index')
|
|
404
443
|
|
|
405
444
|
|
|
406
|
-
def get_path(schema_obj: catalog.SchemaObject) -> str:
|
|
407
|
-
"""Returns the path to a SchemaObject.
|
|
408
|
-
|
|
409
|
-
Args:
|
|
410
|
-
schema_obj: SchemaObject to get the path for.
|
|
411
|
-
|
|
412
|
-
Returns:
|
|
413
|
-
Path to the SchemaObject.
|
|
414
|
-
"""
|
|
415
|
-
path_elements: list[str] = []
|
|
416
|
-
dir_id = schema_obj._dir_id
|
|
417
|
-
while dir_id is not None:
|
|
418
|
-
dir = Catalog.get().paths.get_schema_obj(dir_id)
|
|
419
|
-
if dir._dir_id is None:
|
|
420
|
-
# this is the root dir with name '', which we don't want to include in the path
|
|
421
|
-
break
|
|
422
|
-
path_elements.insert(0, dir._name)
|
|
423
|
-
dir_id = dir._dir_id
|
|
424
|
-
path_elements.append(schema_obj._name)
|
|
425
|
-
return '.'.join(path_elements)
|
|
426
|
-
|
|
427
|
-
|
|
428
445
|
def configure_logging(
|
|
429
446
|
*,
|
|
430
447
|
to_stdout: Optional[bool] = None,
|
|
@@ -24,6 +24,7 @@ class EmbeddingIndex(IndexBase):
|
|
|
24
24
|
- similarity_clause() converts those metrics back to their original form; it is used in expressions outside
|
|
25
25
|
the Order By clause
|
|
26
26
|
- order_by_clause() is used exclusively in the ORDER BY clause
|
|
27
|
+
- embedding function parameters are named '<type-name>_embed', where type-name is ColumnType.Type.name
|
|
27
28
|
"""
|
|
28
29
|
|
|
29
30
|
class Metric(enum.Enum):
|
|
@@ -38,30 +39,30 @@ class EmbeddingIndex(IndexBase):
|
|
|
38
39
|
}
|
|
39
40
|
|
|
40
41
|
def __init__(
|
|
41
|
-
self, c: catalog.Column, metric: str,
|
|
42
|
-
|
|
42
|
+
self, c: catalog.Column, metric: str, string_embed: Optional[func.Function] = None,
|
|
43
|
+
image_embed: Optional[func.Function] = None):
|
|
43
44
|
metric_names = [m.name.lower() for m in self.Metric]
|
|
44
45
|
if metric.lower() not in metric_names:
|
|
45
46
|
raise excs.Error(f'Invalid metric {metric}, must be one of {metric_names}')
|
|
46
47
|
if not c.col_type.is_string_type() and not c.col_type.is_image_type():
|
|
47
48
|
raise excs.Error(f'Embedding index requires string or image column')
|
|
48
|
-
if c.col_type.is_string_type() and
|
|
49
|
-
raise excs.Error(f
|
|
50
|
-
if c.col_type.is_image_type() and
|
|
51
|
-
raise excs.Error(f
|
|
52
|
-
if
|
|
49
|
+
if c.col_type.is_string_type() and string_embed is None:
|
|
50
|
+
raise excs.Error(f"Text embedding function is required for column {c.name} (parameter 'string_embed')")
|
|
51
|
+
if c.col_type.is_image_type() and image_embed is None:
|
|
52
|
+
raise excs.Error(f"Image embedding function is required for column {c.name} (parameter 'image_embed')")
|
|
53
|
+
if string_embed is not None:
|
|
53
54
|
# verify signature
|
|
54
|
-
self._validate_embedding_fn(
|
|
55
|
-
if
|
|
55
|
+
self._validate_embedding_fn(string_embed, 'string_embed', ts.ColumnType.Type.STRING)
|
|
56
|
+
if image_embed is not None:
|
|
56
57
|
# verify signature
|
|
57
|
-
self._validate_embedding_fn(
|
|
58
|
+
self._validate_embedding_fn(image_embed, 'image_embed', ts.ColumnType.Type.IMAGE)
|
|
58
59
|
|
|
59
60
|
self.metric = self.Metric[metric.upper()]
|
|
60
61
|
from pixeltable.exprs import ColumnRef
|
|
61
|
-
self.value_expr =
|
|
62
|
+
self.value_expr = string_embed(ColumnRef(c)) if c.col_type.is_string_type() else image_embed(ColumnRef(c))
|
|
62
63
|
assert self.value_expr.col_type.is_array_type()
|
|
63
|
-
self.
|
|
64
|
-
self.
|
|
64
|
+
self.string_embed = string_embed
|
|
65
|
+
self.image_embed = image_embed
|
|
65
66
|
vector_size = self.value_expr.col_type.shape[0]
|
|
66
67
|
assert vector_size is not None
|
|
67
68
|
self.index_col_type = pgvector.sqlalchemy.Vector(vector_size)
|
|
@@ -88,14 +89,14 @@ class EmbeddingIndex(IndexBase):
|
|
|
88
89
|
idx.create(bind=conn)
|
|
89
90
|
|
|
90
91
|
def similarity_clause(self, val_column: catalog.Column, item: Any) -> sql.ClauseElement:
|
|
91
|
-
"""Create a ClauseElement
|
|
92
|
+
"""Create a ClauseElement that represents '<val_column> <op> <item>'"""
|
|
92
93
|
assert isinstance(item, (str, PIL.Image.Image))
|
|
93
94
|
if isinstance(item, str):
|
|
94
|
-
assert self.
|
|
95
|
-
embedding = self.
|
|
95
|
+
assert self.string_embed is not None
|
|
96
|
+
embedding = self.string_embed.exec(item)
|
|
96
97
|
if isinstance(item, PIL.Image.Image):
|
|
97
|
-
assert self.
|
|
98
|
-
embedding = self.
|
|
98
|
+
assert self.image_embed is not None
|
|
99
|
+
embedding = self.image_embed.exec(item)
|
|
99
100
|
|
|
100
101
|
if self.metric == self.Metric.COSINE:
|
|
101
102
|
return val_column.sa_col.cosine_distance(embedding) * -1 + 1
|
|
@@ -110,11 +111,11 @@ class EmbeddingIndex(IndexBase):
|
|
|
110
111
|
assert isinstance(item, (str, PIL.Image.Image))
|
|
111
112
|
embedding: Optional[np.ndarray] = None
|
|
112
113
|
if isinstance(item, str):
|
|
113
|
-
assert self.
|
|
114
|
-
embedding = self.
|
|
114
|
+
assert self.string_embed is not None
|
|
115
|
+
embedding = self.string_embed.exec(item)
|
|
115
116
|
if isinstance(item, PIL.Image.Image):
|
|
116
|
-
assert self.
|
|
117
|
-
embedding = self.
|
|
117
|
+
assert self.image_embed is not None
|
|
118
|
+
embedding = self.image_embed.exec(item)
|
|
118
119
|
assert embedding is not None
|
|
119
120
|
|
|
120
121
|
if self.metric == self.Metric.COSINE:
|
|
@@ -160,12 +161,12 @@ class EmbeddingIndex(IndexBase):
|
|
|
160
161
|
def as_dict(self) -> dict:
|
|
161
162
|
return {
|
|
162
163
|
'metric': self.metric.name.lower(),
|
|
163
|
-
'
|
|
164
|
-
'
|
|
164
|
+
'string_embed': None if self.string_embed is None else self.string_embed.as_dict(),
|
|
165
|
+
'image_embed': None if self.image_embed is None else self.image_embed.as_dict()
|
|
165
166
|
}
|
|
166
167
|
|
|
167
168
|
@classmethod
|
|
168
169
|
def from_dict(cls, c: catalog.Column, d: dict) -> EmbeddingIndex:
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
return cls(c, metric=d['metric'],
|
|
170
|
+
string_embed = func.Function.from_dict(d['string_embed']) if d['string_embed'] is not None else None
|
|
171
|
+
image_embed = func.Function.from_dict(d['image_embed']) if d['image_embed'] is not None else None
|
|
172
|
+
return cls(c, metric=d['metric'], string_embed=string_embed, image_embed=image_embed)
|
pixeltable/io/external_store.py
CHANGED
|
@@ -222,12 +222,12 @@ class Project(ExternalStore, abc.ABC):
|
|
|
222
222
|
if t_col not in t_cols:
|
|
223
223
|
if is_user_specified_col_mapping:
|
|
224
224
|
raise excs.Error(
|
|
225
|
-
f'Column name `{t_col}` appears as a key in `col_mapping`, but Table `{table.
|
|
225
|
+
f'Column name `{t_col}` appears as a key in `col_mapping`, but Table `{table.name}` '
|
|
226
226
|
'contains no such column.'
|
|
227
227
|
)
|
|
228
228
|
else:
|
|
229
229
|
raise excs.Error(
|
|
230
|
-
f'Column `{t_col}` does not exist in Table `{table.
|
|
230
|
+
f'Column `{t_col}` does not exist in Table `{table.name}`. Either add a column `{t_col}`, '
|
|
231
231
|
f'or specify a `col_mapping` to associate a different column with the external field `{ext_col}`.'
|
|
232
232
|
)
|
|
233
233
|
if ext_col not in export_cols and ext_col not in import_cols:
|
pixeltable/io/globals.py
CHANGED
|
@@ -50,7 +50,7 @@ def create_label_studio_project(
|
|
|
50
50
|
`ls_project_0`, `ls_project_1`, etc.
|
|
51
51
|
title: An optional title for the Label Studio project. This is the title that annotators
|
|
52
52
|
will see inside Label Studio. Unlike `name`, it does not need to be an identifier and
|
|
53
|
-
does not need to be unique. If not specified, the table name `t.
|
|
53
|
+
does not need to be unique. If not specified, the table name `t.name` will be used.
|
|
54
54
|
media_import_method: The method to use when transferring media files to Label Studio:
|
|
55
55
|
- `post`: Media will be sent to Label Studio via HTTP post. This should generally only be used for
|
|
56
56
|
prototyping; due to restrictions in Label Studio, it can only be used with projects that have
|
pixeltable/io/label_studio.py
CHANGED
|
@@ -95,7 +95,7 @@ class LabelStudioProject(Project):
|
|
|
95
95
|
return {ANNOTATIONS_COLUMN: pxt.JsonType(nullable=True)}
|
|
96
96
|
|
|
97
97
|
def sync(self, t: Table, export_data: bool, import_data: bool) -> SyncStatus:
|
|
98
|
-
_logger.info(f'Syncing Label Studio project "{self.project_title}" with table `{t.
|
|
98
|
+
_logger.info(f'Syncing Label Studio project "{self.project_title}" with table `{t.name}`'
|
|
99
99
|
f' (export: {export_data}, import: {import_data}).')
|
|
100
100
|
# Collect all existing tasks into a dict with entries `rowid: task`
|
|
101
101
|
tasks = {tuple(task['meta']['rowid']): task for task in self.__fetch_all_tasks()}
|
|
@@ -386,7 +386,7 @@ class LabelStudioProject(Project):
|
|
|
386
386
|
updates = [{'_rowid': rowid, local_annotations_col.name: ann} for rowid, ann in annotations.items()]
|
|
387
387
|
if len(updates) > 0:
|
|
388
388
|
_logger.info(
|
|
389
|
-
f'Updating table `{t.
|
|
389
|
+
f'Updating table `{t.name}`, column `{local_annotations_col.name}` with {len(updates)} total annotations.'
|
|
390
390
|
)
|
|
391
391
|
# batch_update currently doesn't propagate from views to base tables. As a workaround, we call
|
|
392
392
|
# batch_update on the actual ancestor table that holds the annotations column.
|
|
@@ -554,7 +554,7 @@ class LabelStudioProject(Project):
|
|
|
554
554
|
|
|
555
555
|
if title is None:
|
|
556
556
|
# `title` defaults to table name
|
|
557
|
-
title = t.
|
|
557
|
+
title = t.name
|
|
558
558
|
|
|
559
559
|
# Create a column to hold the annotations, if one does not yet exist
|
|
560
560
|
if col_mapping is None or ANNOTATIONS_COLUMN in col_mapping.values():
|
pixeltable/metadata/__init__.py
CHANGED
|
@@ -10,7 +10,7 @@ import sqlalchemy.orm as orm
|
|
|
10
10
|
from .schema import SystemInfo, SystemInfoMd
|
|
11
11
|
|
|
12
12
|
# current version of the metadata; this is incremented whenever the metadata schema changes
|
|
13
|
-
VERSION =
|
|
13
|
+
VERSION = 18
|
|
14
14
|
|
|
15
15
|
|
|
16
16
|
def create_system_info(engine: sql.engine.Engine) -> None:
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
import sqlalchemy as sql
|
|
2
|
+
|
|
3
|
+
from pixeltable.metadata import register_converter
|
|
4
|
+
from pixeltable.metadata.converters.util import convert_table_md
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
@register_converter(version=17)
|
|
8
|
+
def _(engine: sql.engine.Engine) -> None:
|
|
9
|
+
convert_table_md(
|
|
10
|
+
engine,
|
|
11
|
+
table_md_updater=__update_table_md
|
|
12
|
+
)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def __update_table_md(table_md: dict) -> None:
|
|
16
|
+
# key changes in IndexMd.init_args: img_embed -> image_embed, txt_embed -> string_embed
|
|
17
|
+
if len(table_md['index_md']) == 0:
|
|
18
|
+
return
|
|
19
|
+
for idx_md in table_md['index_md'].values():
|
|
20
|
+
if not idx_md['class_fqn'].endswith('.EmbeddingIndex'):
|
|
21
|
+
continue
|
|
22
|
+
init_dict = idx_md['init_args']
|
|
23
|
+
init_dict['image_embed'] = init_dict['img_embed']
|
|
24
|
+
del init_dict['img_embed']
|
|
25
|
+
init_dict['string_embed'] = init_dict['txt_embed']
|
|
26
|
+
del init_dict['txt_embed']
|
|
@@ -253,7 +253,7 @@ class Dumper:
|
|
|
253
253
|
add_column('c6_to_string', t.c6.apply(json.dumps))
|
|
254
254
|
add_column('c6_back_to_json', t[f'{col_prefix}_c6_to_string'].apply(json.loads))
|
|
255
255
|
|
|
256
|
-
t.add_embedding_index(f'{col_prefix}_function_call',
|
|
256
|
+
t.add_embedding_index(f'{col_prefix}_function_call', string_embed=embed_udf.clip_text_embed)
|
|
257
257
|
|
|
258
258
|
# query()
|
|
259
259
|
@t.query
|