pixeltable 0.2.21__py3-none-any.whl → 0.2.22__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pixeltable might be problematic. Click here for more details.
- pixeltable/__version__.py +2 -2
- pixeltable/catalog/__init__.py +1 -1
- pixeltable/catalog/column.py +37 -11
- pixeltable/catalog/globals.py +18 -0
- pixeltable/catalog/insertable_table.py +6 -4
- pixeltable/catalog/table.py +19 -3
- pixeltable/catalog/table_version.py +34 -14
- pixeltable/catalog/view.py +16 -17
- pixeltable/dataframe.py +7 -8
- pixeltable/env.py +5 -0
- pixeltable/exec/__init__.py +0 -1
- pixeltable/exec/aggregation_node.py +6 -3
- pixeltable/exec/cache_prefetch_node.py +1 -1
- pixeltable/exec/data_row_batch.py +2 -19
- pixeltable/exec/exec_node.py +2 -1
- pixeltable/exec/expr_eval_node.py +17 -10
- pixeltable/exec/in_memory_data_node.py +6 -3
- pixeltable/exec/sql_node.py +24 -25
- pixeltable/exprs/arithmetic_expr.py +3 -1
- pixeltable/exprs/array_slice.py +7 -7
- pixeltable/exprs/column_property_ref.py +37 -10
- pixeltable/exprs/column_ref.py +93 -14
- pixeltable/exprs/comparison.py +5 -5
- pixeltable/exprs/compound_predicate.py +8 -7
- pixeltable/exprs/data_row.py +27 -18
- pixeltable/exprs/expr.py +53 -52
- pixeltable/exprs/expr_set.py +5 -0
- pixeltable/exprs/function_call.py +32 -16
- pixeltable/exprs/globals.py +4 -1
- pixeltable/exprs/in_predicate.py +8 -7
- pixeltable/exprs/inline_expr.py +4 -4
- pixeltable/exprs/is_null.py +4 -4
- pixeltable/exprs/json_mapper.py +11 -12
- pixeltable/exprs/json_path.py +5 -10
- pixeltable/exprs/literal.py +5 -5
- pixeltable/exprs/method_ref.py +5 -4
- pixeltable/exprs/object_ref.py +2 -1
- pixeltable/exprs/row_builder.py +88 -36
- pixeltable/exprs/rowid_ref.py +12 -11
- pixeltable/exprs/similarity_expr.py +12 -7
- pixeltable/exprs/sql_element_cache.py +7 -5
- pixeltable/exprs/type_cast.py +8 -6
- pixeltable/exprs/variable.py +5 -4
- pixeltable/func/aggregate_function.py +1 -1
- pixeltable/func/function.py +11 -10
- pixeltable/functions/__init__.py +2 -2
- pixeltable/functions/globals.py +5 -7
- pixeltable/functions/huggingface.py +19 -20
- pixeltable/functions/llama_cpp.py +106 -0
- pixeltable/functions/ollama.py +147 -0
- pixeltable/functions/replicate.py +72 -0
- pixeltable/functions/string.py +9 -0
- pixeltable/globals.py +12 -20
- pixeltable/index/btree.py +16 -3
- pixeltable/index/embedding_index.py +4 -4
- pixeltable/io/__init__.py +1 -2
- pixeltable/io/fiftyone.py +178 -0
- pixeltable/io/globals.py +96 -2
- pixeltable/iterators/base.py +3 -2
- pixeltable/iterators/document.py +1 -1
- pixeltable/iterators/video.py +120 -63
- pixeltable/metadata/__init__.py +1 -1
- pixeltable/metadata/converters/convert_21.py +34 -0
- pixeltable/metadata/converters/util.py +45 -4
- pixeltable/metadata/notes.py +1 -0
- pixeltable/metadata/schema.py +8 -0
- pixeltable/plan.py +16 -14
- pixeltable/py.typed +0 -0
- pixeltable/store.py +7 -2
- pixeltable/tool/create_test_video.py +1 -1
- pixeltable/tool/embed_udf.py +1 -1
- pixeltable/tool/mypy_plugin.py +28 -5
- pixeltable/type_system.py +17 -1
- pixeltable/utils/documents.py +15 -1
- pixeltable/utils/formatter.py +9 -10
- {pixeltable-0.2.21.dist-info → pixeltable-0.2.22.dist-info}/METADATA +46 -10
- pixeltable-0.2.22.dist-info/RECORD +153 -0
- pixeltable/exec/media_validation_node.py +0 -43
- pixeltable-0.2.21.dist-info/RECORD +0 -148
- {pixeltable-0.2.21.dist-info → pixeltable-0.2.22.dist-info}/LICENSE +0 -0
- {pixeltable-0.2.21.dist-info → pixeltable-0.2.22.dist-info}/WHEEL +0 -0
- {pixeltable-0.2.21.dist-info → pixeltable-0.2.22.dist-info}/entry_points.txt +0 -0
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Pixeltable [UDFs](https://pixeltable.readme.io/docs/user-defined-functions-udfs)
|
|
3
|
+
that wrap various endpoints from the Replicate API. In order to use them, you must
|
|
4
|
+
first `pip install replicate` and configure your Replicate credentials, as described in
|
|
5
|
+
the [Working with Replicate](https://pixeltable.readme.io/docs/working-with-replicate) tutorial.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from typing import TYPE_CHECKING, Any
|
|
9
|
+
|
|
10
|
+
import pixeltable as pxt
|
|
11
|
+
from pixeltable.env import Env, register_client
|
|
12
|
+
from pixeltable.utils.code import local_public_names
|
|
13
|
+
|
|
14
|
+
if TYPE_CHECKING:
|
|
15
|
+
import replicate # type: ignore[import-untyped]
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
@register_client('replicate')
def _(api_token: str) -> 'replicate.Client':
    """Build a Replicate API client that authenticates with `api_token`."""
    # Imported inside the function so that `replicate` is only loaded when a client is created.
    from replicate import Client

    return Client(api_token=api_token)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def _replicate_client() -> 'replicate.Client':
    """Look up the client registered under the name `'replicate'` in the current `Env`."""
    env = Env.get()
    return env.get_client('replicate')
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
@pxt.udf
def run(
    input: dict[str, Any],
    *,
    ref: str,
) -> dict[str, Any]:
    """
    Run a model on Replicate.

    For additional details, see: <https://replicate.com/docs/topics/models/run-a-model>

    __Requirements:__

    - `pip install replicate`

    Args:
        input: The input parameters for the model.
        ref: The name of the model to run.

    Returns:
        The output of the model.

    Examples:
        Add a computed column that applies the model `meta/meta-llama-3-8b-instruct`
        to an existing Pixeltable column `tbl.prompt` of the table `tbl`:

        >>> input = {'system_prompt': 'You are a helpful assistant.', 'prompt': tbl.prompt}
        ... tbl['response'] = run(input, ref='meta/meta-llama-3-8b-instruct')

        Add a computed column that uses the model `black-forest-labs/flux-schnell`
        to generate images from an existing Pixeltable column `tbl.prompt`:

        >>> input = {'prompt': tbl.prompt, 'go_fast': True, 'megapixels': '1'}
        ... tbl['response'] = run(input, ref='black-forest-labs/flux-schnell')
        ... tbl['image'] = tbl.response.output[0].astype(pxt.Image)
    """
    # Check for the optional dependency before the client is requested.
    Env.get().require_package('replicate')
    client = _replicate_client()
    return client.run(ref, input, use_file_output=False)
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
__all__ = local_public_names(__name__)
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def __dir__():
    """Module-level `__dir__` hook: make `dir()` on this module report exactly the names in `__all__`."""
    return __all__
|
pixeltable/functions/string.py
CHANGED
|
@@ -283,6 +283,15 @@ def isspace(self: str) -> bool:
|
|
|
283
283
|
"""
|
|
284
284
|
return self.isspace()
|
|
285
285
|
|
|
286
|
+
@pxt.udf
def join(sep: str, elements: list) -> str:
    """
    Concatenate the strings in `elements`, placing `sep` between consecutive items.

    Equivalent to [`str.join()`](https://docs.python.org/3/library/stdtypes.html#str.join)

    Args:
        sep: The separator inserted between elements.
        elements: The strings to concatenate.
    """
    joined = sep.join(elements)
    return joined
|
|
294
|
+
|
|
286
295
|
@pxt.udf(is_method=True)
|
|
287
296
|
def len(self: str) -> int:
|
|
288
297
|
"""
|
pixeltable/globals.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import dataclasses
|
|
2
2
|
import logging
|
|
3
|
-
from typing import Any, Iterable, Optional, Union
|
|
3
|
+
from typing import Any, Iterable, Optional, Union, Literal
|
|
4
4
|
from uuid import UUID
|
|
5
5
|
|
|
6
6
|
import pandas as pd
|
|
@@ -33,6 +33,7 @@ def create_table(
|
|
|
33
33
|
primary_key: Optional[Union[str, list[str]]] = None,
|
|
34
34
|
num_retained_versions: int = 10,
|
|
35
35
|
comment: str = '',
|
|
36
|
+
media_validation: Literal['on_read', 'on_write'] = 'on_write'
|
|
36
37
|
) -> catalog.Table:
|
|
37
38
|
"""Create a new base table.
|
|
38
39
|
|
|
@@ -44,6 +45,9 @@ def create_table(
|
|
|
44
45
|
table.
|
|
45
46
|
num_retained_versions: Number of versions of the table to retain.
|
|
46
47
|
comment: An optional comment; its meaning is user-defined.
|
|
48
|
+
media_validation: Media validation policy for the table.
|
|
49
|
+
- `'on_read'`: validate media files at query time
|
|
50
|
+
- `'on_write'`: validate media files during insert/update operations
|
|
47
51
|
|
|
48
52
|
Returns:
|
|
49
53
|
A handle to the newly created [`Table`][pixeltable.Table].
|
|
@@ -89,14 +93,8 @@ def create_table(
|
|
|
89
93
|
raise excs.Error('primary_key must be a single column name or a list of column names')
|
|
90
94
|
|
|
91
95
|
tbl = catalog.InsertableTable._create(
|
|
92
|
-
dir._id,
|
|
93
|
-
|
|
94
|
-
schema,
|
|
95
|
-
df,
|
|
96
|
-
primary_key=primary_key,
|
|
97
|
-
num_retained_versions=num_retained_versions,
|
|
98
|
-
comment=comment,
|
|
99
|
-
)
|
|
96
|
+
dir._id, path.name, schema, df, primary_key=primary_key, num_retained_versions=num_retained_versions,
|
|
97
|
+
comment=comment, media_validation=catalog.MediaValidation.validated(media_validation, 'media_validation'))
|
|
100
98
|
Catalog.get().paths[path] = tbl
|
|
101
99
|
|
|
102
100
|
_logger.info(f'Created table `{path_str}`.')
|
|
@@ -112,6 +110,7 @@ def create_view(
|
|
|
112
110
|
iterator: Optional[tuple[type[ComponentIterator], dict[str, Any]]] = None,
|
|
113
111
|
num_retained_versions: int = 10,
|
|
114
112
|
comment: str = '',
|
|
113
|
+
media_validation: Literal['on_read', 'on_write'] = 'on_write',
|
|
115
114
|
ignore_errors: bool = False,
|
|
116
115
|
) -> Optional[catalog.Table]:
|
|
117
116
|
"""Create a view of an existing table object (which itself can be a view or a snapshot or a base table).
|
|
@@ -177,17 +176,10 @@ def create_view(
|
|
|
177
176
|
iterator_class, iterator_args = iterator
|
|
178
177
|
|
|
179
178
|
view = catalog.View._create(
|
|
180
|
-
dir._id,
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
predicate=where,
|
|
185
|
-
is_snapshot=is_snapshot,
|
|
186
|
-
iterator_cls=iterator_class,
|
|
187
|
-
iterator_args=iterator_args,
|
|
188
|
-
num_retained_versions=num_retained_versions,
|
|
189
|
-
comment=comment,
|
|
190
|
-
)
|
|
179
|
+
dir._id, path.name, base=tbl_version_path, additional_columns=additional_columns, predicate=where,
|
|
180
|
+
is_snapshot=is_snapshot, iterator_cls=iterator_class, iterator_args=iterator_args,
|
|
181
|
+
num_retained_versions=num_retained_versions, comment=comment,
|
|
182
|
+
media_validation=catalog.MediaValidation.validated(media_validation, 'media_validation'))
|
|
191
183
|
Catalog.get().paths[path] = view
|
|
192
184
|
_logger.info(f'Created view `{path_str}`.')
|
|
193
185
|
FileCache.get().emit_eviction_warnings()
|
pixeltable/index/btree.py
CHANGED
|
@@ -1,13 +1,16 @@
|
|
|
1
|
-
from typing import Optional
|
|
1
|
+
from typing import Optional, TYPE_CHECKING
|
|
2
2
|
|
|
3
3
|
import sqlalchemy as sql
|
|
4
4
|
|
|
5
|
+
# TODO: why does this import result in a circular import, but the one in embedding_index.py doesn't?
|
|
6
|
+
# import pixeltable.catalog as catalog
|
|
5
7
|
import pixeltable.exceptions as excs
|
|
6
8
|
from pixeltable import catalog, exprs
|
|
7
9
|
from pixeltable.func.udf import udf
|
|
8
|
-
|
|
9
10
|
from .base import IndexBase
|
|
10
11
|
|
|
12
|
+
if TYPE_CHECKING:
|
|
13
|
+
import pixeltable.exprs
|
|
11
14
|
|
|
12
15
|
class BtreeIndex(IndexBase):
|
|
13
16
|
"""
|
|
@@ -15,6 +18,8 @@ class BtreeIndex(IndexBase):
|
|
|
15
18
|
"""
|
|
16
19
|
MAX_STRING_LEN = 256
|
|
17
20
|
|
|
21
|
+
value_expr: 'pixeltable.exprs.Expr'
|
|
22
|
+
|
|
18
23
|
@staticmethod
|
|
19
24
|
@udf
|
|
20
25
|
def str_filter(s: Optional[str]) -> Optional[str]:
|
|
@@ -25,7 +30,14 @@ class BtreeIndex(IndexBase):
|
|
|
25
30
|
def __init__(self, c: 'catalog.Column'):
    """
    Set up the expression whose value gets indexed for column `c`.

    Args:
        c: The column to index; its type must be a scalar or media type.

    Raises:
        excs.Error: If the column type is neither a scalar type nor a media type.
    """
    col_type = c.col_type
    if not (col_type.is_scalar_type() or col_type.is_media_type()):
        raise excs.Error(f'Index on column {c.name}: B-tree index requires scalar or media type, got {c.col_type}')

    if col_type.is_media_type():
        # an index on a media column is an index on the file url
        # no validation for media columns: we're only interested in the string value
        self.value_expr = exprs.ColumnRef(c, perform_validation=False)
    elif col_type.is_string_type():
        # string values are passed through str_filter before being indexed
        self.value_expr = BtreeIndex.str_filter(exprs.ColumnRef(c))
    else:
        self.value_expr = exprs.ColumnRef(c)
|
|
29
41
|
|
|
30
42
|
def index_value_expr(self) -> 'exprs.Expr':
|
|
31
43
|
return self.value_expr
|
|
@@ -52,3 +64,4 @@ class BtreeIndex(IndexBase):
|
|
|
52
64
|
@classmethod
def from_dict(cls, c: 'catalog.Column', d: dict) -> 'BtreeIndex':
    """Create an index instance for column `c`; the contents of `d` are not used."""
    return cls(c)
|
|
67
|
+
|
|
@@ -86,8 +86,8 @@ class EmbeddingIndex(IndexBase):
|
|
|
86
86
|
)
|
|
87
87
|
idx.create(bind=conn)
|
|
88
88
|
|
|
89
|
-
def similarity_clause(self, val_column: catalog.Column, item: Any) -> sql.
|
|
90
|
-
"""Create a
|
|
89
|
+
def similarity_clause(self, val_column: catalog.Column, item: Any) -> sql.ColumnElement:
|
|
90
|
+
"""Create a ColumnElement that represents '<val_column> <op> <item>'"""
|
|
91
91
|
assert isinstance(item, (str, PIL.Image.Image))
|
|
92
92
|
if isinstance(item, str):
|
|
93
93
|
assert self.string_embed is not None
|
|
@@ -104,8 +104,8 @@ class EmbeddingIndex(IndexBase):
|
|
|
104
104
|
assert self.metric == self.Metric.L2
|
|
105
105
|
return val_column.sa_col.l2_distance(embedding)
|
|
106
106
|
|
|
107
|
-
def order_by_clause(self, val_column: catalog.Column, item: Any, is_asc: bool) -> sql.
|
|
108
|
-
"""Create a
|
|
107
|
+
def order_by_clause(self, val_column: catalog.Column, item: Any, is_asc: bool) -> sql.ColumnElement:
|
|
108
|
+
"""Create a ColumnElement that is used in an ORDER BY clause"""
|
|
109
109
|
assert isinstance(item, (str, PIL.Image.Image))
|
|
110
110
|
embedding: Optional[np.ndarray] = None
|
|
111
111
|
if isinstance(item, str):
|
pixeltable/io/__init__.py
CHANGED
|
@@ -1,10 +1,9 @@
|
|
|
1
1
|
from .external_store import ExternalStore, SyncStatus
|
|
2
|
-
from .globals import create_label_studio_project,
|
|
2
|
+
from .globals import create_label_studio_project, export_images_as_fo_dataset, import_json, import_rows
|
|
3
3
|
from .hf_datasets import import_huggingface_dataset
|
|
4
4
|
from .pandas import import_csv, import_excel, import_pandas
|
|
5
5
|
from .parquet import import_parquet
|
|
6
6
|
|
|
7
|
-
|
|
8
7
|
# Public API of this package: every non-underscore name present in the module namespace at this point,
# minus the names listed in `__removed_symbols` (presumably the submodule names that get bound as a
# side effect of the relative imports above -- verify if a new submodule import is added).
__default_dir = {symbol for symbol in dir() if not symbol.startswith('_')}
__removed_symbols = {'globals', 'hf_datasets', 'pandas', 'parquet'}
__all__ = sorted(__default_dir - __removed_symbols)
|
|
@@ -0,0 +1,178 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from typing import Iterator, Optional, Union
|
|
3
|
+
|
|
4
|
+
import fiftyone as fo # type: ignore[import-untyped]
|
|
5
|
+
import fiftyone.utils.data as foud # type: ignore[import-untyped]
|
|
6
|
+
import PIL.Image
|
|
7
|
+
import puremagic
|
|
8
|
+
|
|
9
|
+
import pixeltable as pxt
|
|
10
|
+
import pixeltable.exceptions as excs
|
|
11
|
+
from pixeltable import exprs
|
|
12
|
+
from pixeltable.env import Env
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class PxtImageDatasetImporter(foud.LabeledImageDatasetImporter):
    """
    Implementation of a FiftyOne `DatasetImporter` that reads image data from a Pixeltable table.

    Each row produced by the underlying query is returned from `__next__()` as a
    `(filepath, image_metadata, labels)` tuple.
    """
    __image_format: str  # format to use for any exported images that are not already stored on disk
    __labels: dict[str, tuple[exprs.Expr, type[fo.Label]]]  # label_name -> (expr, label_cls)
    __image_idx: int  # index of the image expr in the select list
    __localpath_idx: Optional[int]  # index of the image localpath in the select list, if present
    __row_iter: Iterator[list]  # iterator over the table rows, to be converted to FiftyOne samples

    def __init__(
        self,
        tbl: pxt.Table,
        image: exprs.Expr,
        image_format: str,
        classifications: Union[exprs.Expr, list[exprs.Expr], dict[str, exprs.Expr], None] = None,
        detections: Union[exprs.Expr, list[exprs.Expr], dict[str, exprs.Expr], None] = None,
        dataset_dir: Optional[os.PathLike] = None,
        shuffle: bool = False,
        seed: Union[int, float, str, bytes, bytearray, None] = None,
        max_samples: Optional[int] = None,
    ):
        """
        Args:
            tbl: The table to read rows from.
            image: The expression that yields the image of each sample.
            image_format: The format used when an image has to be written to a temporary file.
            classifications: Classification labels: a single `Expr`, a list of `Expr`s, or a dict
                that maps label names to `Expr`s.
            detections: Detection labels, in the same forms as `classifications`.
            dataset_dir: Passed through to the FiftyOne base class.
            shuffle: Passed through to the FiftyOne base class.
            seed: Passed through to the FiftyOne base class.
            max_samples: Passed through to the FiftyOne base class.

        Raises:
            excs.Error: If a label name given in a dict is not a valid identifier or is used twice.
        """
        super().__init__(
            dataset_dir=dataset_dir,
            shuffle=shuffle,
            seed=seed,
            max_samples=max_samples
        )

        self.__image_format = image_format

        label_categories = [
            (classifications, fo.Classifications, 'classifications'),
            (detections, fo.Detections, 'detections'),
        ]

        # Construct the labels. First add labels for all label types that have named dictionaries.
        self.__labels = {}
        for exprs_, label_cls, _ in label_categories:
            if isinstance(exprs_, dict):
                for label_name, expr in exprs_.items():
                    if not label_name.isidentifier():
                        raise excs.Error(f"Invalid label name: {label_name}")
                    if label_name in self.__labels:
                        raise excs.Error(f"Duplicate label name: {label_name}")
                    self.__labels[label_name] = (expr, label_cls)

        # Now add the remaining labels, assigning unused default names.
        for exprs_, label_cls, default_name in label_categories:
            if exprs_ is None or isinstance(exprs_, dict):
                continue
            if isinstance(exprs_, exprs.Expr):
                exprs_ = [exprs_]
            assert isinstance(exprs_, list)
            for expr in exprs_:
                if default_name not in self.__labels:
                    name = default_name
                else:
                    # The default name is taken: use the first free `<default_name>_<i>`.
                    i = 1
                    while f'{default_name}_{i}' in self.__labels:
                        i += 1
                    name = f'{default_name}_{i}'
                self.__labels[name] = (expr, label_cls)

        # Build the select list:
        # - Labels first, in the order they appear in self.__labels
        # - Then the `image` expr
        # - Then `image.localpath`, if `image` is a stored columnref

        selection = [expr for expr, _ in self.__labels.values()]
        self.__image_idx = len(selection)
        selection.append(image)

        if isinstance(image, exprs.ColumnRef) and image.col.is_stored:
            # A stored image column; we can use the existing localpaths
            self.__localpath_idx = len(selection)
            selection.append(image.localpath)
        else:
            self.__localpath_idx = None

        df = tbl.select(*selection)
        self.__row_iter = df._output_row_iterator()

    def __next__(self) -> tuple[str, Optional[fo.ImageMetadata], Optional[dict[str, fo.Label]]]:
        """
        Convert the next table row into a FiftyOne sample.

        Returns:
            A `(filepath, image_metadata, labels)` tuple; `labels` maps each label name to the
            label built from that row. Labels whose value is `None` in the row are omitted.

        Raises:
            StopIteration: When the row iterator is exhausted.
            excs.Error: If the label data of the row does not have the expected structure.
        """
        row = next(self.__row_iter)
        img = row[self.__image_idx]
        assert isinstance(img, PIL.Image.Image)
        if self.__localpath_idx is not None:
            # Use the existing localpath of the stored image
            file = row[self.__localpath_idx]
            assert isinstance(file, str)
        else:
            # Write the dynamically created image to a temp file
            file = str(Env.get().create_tmp_path(f'.{self.__image_format}'))
            img.save(file, format=self.__image_format)

        metadata = fo.ImageMetadata(
            size_bytes=os.path.getsize(file),
            mime_type=puremagic.from_file(file, mime=True),
            width=img.width,
            height=img.height,
            filepath=file,
            num_channels=len(img.getbands()),
        )

        labels: dict[str, fo.Label] = {}
        # The label exprs occupy the first len(self.__labels) positions of the select list, in dict order.
        for idx, (label_name, (_, label_cls)) in enumerate(self.__labels.items()):
            label_data = row[idx]
            if label_data is None:
                continue

            label: fo.Label
            if label_cls is fo.Classifications:
                label = fo.Classifications(classifications=self.__as_fo_classifications(label_data))
            elif label_cls is fo.Detections:
                label = fo.Detections(detections=self.__as_fo_detections(label_data))
            else:
                # Unreachable: __init__ only registers the two label classes above. Raised explicitly
                # (rather than `assert False`) so that the guard is still active under `python -O`.
                raise AssertionError(f'Unexpected label class: {label_cls}')
            labels[label_name] = label

        return file, metadata, labels

    def __as_fo_classifications(self, data: list) -> list[fo.Classification]:
        """
        Convert a list of `{'label': ..., 'confidence': ...}` dicts into `fo.Classification` objects.

        Raises:
            excs.Error: If `data` is not a list of dicts that each contain a 'label' key.
        """
        is_valid = isinstance(data, list) and all(
            isinstance(entry, dict) and 'label' in entry for entry in data
        )
        if not is_valid:
            raise excs.Error(
                f'Invalid classifications data: {data}\n'
                "(Expected a list of dicts, each containing a 'label' key)"
            )
        return [
            fo.Classification(label=entry['label'], confidence=entry.get('confidence'))
            for entry in data
        ]

    def __as_fo_detections(self, data: list) -> list[fo.Detection]:
        """
        Convert a list of `{'label': ..., 'bounding_box': ..., 'confidence': ...}` dicts into
        `fo.Detection` objects.

        Raises:
            excs.Error: If `data` is not a list of dicts that each contain 'label' and 'bounding_box' keys.
        """
        is_valid = isinstance(data, list) and all(
            isinstance(entry, dict) and 'label' in entry and 'bounding_box' in entry for entry in data
        )
        if not is_valid:
            raise excs.Error(
                f'Invalid detections data: {data}\n'
                "(Expected a list of dicts, each containing a 'label' and 'bounding_box' key)"
            )
        return [
            fo.Detection(label=entry['label'], bounding_box=entry['bounding_box'], confidence=entry.get('confidence'))
            for entry in data
        ]

    @property
    def has_dataset_info(self) -> bool:
        """This importer provides no dataset-level info."""
        return False

    @property
    def has_image_metadata(self) -> bool:
        """`__next__()` always returns an `fo.ImageMetadata` for each sample."""
        return True

    @property
    def label_cls(self) -> dict[str, type]:
        """Mapping from label name to the FiftyOne label class used for that label."""
        return {label_name: label_cls for label_name, (_, label_cls) in self.__labels.items()}

    def setup(self) -> None:
        """Nothing to set up: the row iterator is already created in `__init__()`."""
        pass

    def get_dataset_info(self) -> dict:
        """Return an empty dict, matching the annotation; `has_dataset_info` is False."""
        return {}

    def close(self, *args) -> None:
        """Nothing to release."""
        pass
|
pixeltable/io/globals.py
CHANGED
|
@@ -1,10 +1,14 @@
|
|
|
1
|
-
from typing import Any, Literal, Optional, Union
|
|
1
|
+
from typing import TYPE_CHECKING, Any, Literal, Optional, Union
|
|
2
2
|
|
|
3
3
|
import pixeltable as pxt
|
|
4
4
|
import pixeltable.exceptions as excs
|
|
5
|
-
from pixeltable import Table
|
|
5
|
+
from pixeltable import Table, exprs
|
|
6
|
+
from pixeltable.env import Env
|
|
6
7
|
from pixeltable.io.external_store import SyncStatus
|
|
7
8
|
|
|
9
|
+
if TYPE_CHECKING:
|
|
10
|
+
import fiftyone as fo # type: ignore[import-untyped]
|
|
11
|
+
|
|
8
12
|
|
|
9
13
|
def create_label_studio_project(
|
|
10
14
|
t: Table,
|
|
@@ -116,6 +120,8 @@ def create_label_studio_project(
|
|
|
116
120
|
s3_configuration={'bucket': 'my-bucket', 'region_name': 'us-east-2'}
|
|
117
121
|
)
|
|
118
122
|
"""
|
|
123
|
+
Env.get().require_package('label_studio_sdk')
|
|
124
|
+
|
|
119
125
|
from pixeltable.io.label_studio import LabelStudioProject
|
|
120
126
|
|
|
121
127
|
ls_project = LabelStudioProject.create(
|
|
@@ -267,3 +273,91 @@ def import_json(
|
|
|
267
273
|
contents = urllib.request.urlopen(filepath_or_url).read()
|
|
268
274
|
data = json.loads(contents, **kwargs)
|
|
269
275
|
return import_rows(tbl_path, data, schema_overrides=schema_overrides, primary_key=primary_key, num_retained_versions=num_retained_versions, comment=comment)
|
|
276
|
+
|
|
277
|
+
|
|
278
|
+
def export_images_as_fo_dataset(
    tbl: pxt.Table,
    images: exprs.Expr,
    image_format: str = 'webp',
    classifications: Union[exprs.Expr, list[exprs.Expr], dict[str, exprs.Expr], None] = None,
    detections: Union[exprs.Expr, list[exprs.Expr], dict[str, exprs.Expr], None] = None,
) -> 'fo.Dataset':
    """
    Export images from a Pixeltable table as a Voxel51 dataset. The data must consist of a single column
    (or expression) containing image data, along with optional additional columns containing labels. Currently, only
    classification and detection labels are supported.

    The [Working with Voxel51 in Pixeltable](https://docs.pixeltable.com/docs/working-with-voxel51) tutorial contains a
    fully worked example showing how to export data from a Pixeltable table and load it into Voxel51.

    Images in the dataset that already exist on disk will be exported directly, in whatever format they
    are stored in. Images that are not already on disk (such as frames extracted using a
    [`FrameIterator`][pixeltable.iterators.FrameIterator]) will first be written to disk in the specified
    `image_format`.

    The label parameters accept one or more sets of labels of each type. If a single `Expr` is provided, then it will
    be exported as a single set of labels with a default name such as `classifications`.
    (The single set of labels may still contain multiple individual labels; see below.)
    If a list of `Expr`s is provided, then each one will be exported as a separate set of labels with a default name
    such as `classifications`, `classifications_1`, etc. If a dictionary of `Expr`s is provided, then each entry will
    be exported as a set of labels with the specified name.

    __Requirements:__

    - `pip install fiftyone`

    Args:
        tbl: The table from which to export data.
        images: A column or expression that contains the images to export.
        image_format: The format to use when writing out images for export.
        classifications: Optional image classification labels. If a single `Expr` is provided, it must be a table
            column or an expression that evaluates to a list of dictionaries. Each dictionary in the list corresponds
            to an image class and must have the following structure:

            ```python
            {'label': 'zebra', 'confidence': 0.325}
            ```

            If multiple `Expr`s are provided, each one must evaluate to a list of such dictionaries.
        detections: Optional image detection labels. If a single `Expr` is provided, it must be a table column or an
            expression that evaluates to a list of dictionaries. Each dictionary in the list corresponds to an image
            detection, and must have the following structure:

            ```python
            {
                'label': 'giraffe',
                'confidence': 0.99,
                'bounding_box': [0.081, 0.836, 0.202, 0.136]  # [x, y, w, h], fractional coordinates
            }
            ```

            If multiple `Expr`s are provided, each one must evaluate to a list of such dictionaries.

    Returns:
        A Voxel51 dataset.

    Raises:
        excs.Error: If `images` is not an expression of type Image.

    Example:
        Export the images in the `image` column of the table `tbl` as a Voxel51 dataset, using classification
        labels from `tbl.classifications`:

        >>> export_images_as_fo_dataset(
        ...     tbl,
        ...     tbl.image,
        ...     classifications=tbl.classifications
        ... )

        See the [Working with Voxel51 in Pixeltable](https://docs.pixeltable.com/docs/working-with-voxel51) tutorial
        for a fully worked example.
    """
    Env.get().require_package('fiftyone')

    # Imported only after the package check above, so a missing `fiftyone` is reported by require_package().
    import fiftyone as fo

    from pixeltable.io.fiftyone import PxtImageDatasetImporter

    if not images.col_type.is_image_type():
        raise excs.Error(f'`images` must be an expression of type Image (got {images.col_type._to_base_str()})')

    return fo.Dataset.from_importer(PxtImageDatasetImporter(
        tbl, images, image_format, classifications=classifications, detections=detections
    ))
|
pixeltable/iterators/base.py
CHANGED
pixeltable/iterators/document.py
CHANGED
|
@@ -152,7 +152,7 @@ class DocumentSplitter(ComponentIterator):
|
|
|
152
152
|
assert self._doc_handle.pdf_doc is not None
|
|
153
153
|
self._sections = self._pdf_sections()
|
|
154
154
|
else:
|
|
155
|
-
assert False, f'
|
|
155
|
+
assert False, f'Unsupported document format: {self._doc_handle.format}'
|
|
156
156
|
|
|
157
157
|
if Separator.SENTENCE in self._separators:
|
|
158
158
|
self._sections = self._sentence_sections(self._sections)
|