pixeltable 0.2.20__py3-none-any.whl → 0.2.22__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pixeltable might be problematic. Click here for more details.
- pixeltable/__init__.py +7 -19
- pixeltable/__version__.py +2 -2
- pixeltable/catalog/__init__.py +7 -7
- pixeltable/catalog/column.py +37 -11
- pixeltable/catalog/globals.py +21 -0
- pixeltable/catalog/insertable_table.py +6 -4
- pixeltable/catalog/table.py +227 -148
- pixeltable/catalog/table_version.py +66 -28
- pixeltable/catalog/table_version_path.py +0 -8
- pixeltable/catalog/view.py +18 -19
- pixeltable/dataframe.py +16 -32
- pixeltable/env.py +6 -1
- pixeltable/exec/__init__.py +1 -2
- pixeltable/exec/aggregation_node.py +27 -17
- pixeltable/exec/cache_prefetch_node.py +1 -1
- pixeltable/exec/data_row_batch.py +9 -26
- pixeltable/exec/exec_node.py +36 -7
- pixeltable/exec/expr_eval_node.py +19 -11
- pixeltable/exec/in_memory_data_node.py +14 -11
- pixeltable/exec/sql_node.py +266 -138
- pixeltable/exprs/__init__.py +1 -0
- pixeltable/exprs/arithmetic_expr.py +3 -1
- pixeltable/exprs/array_slice.py +7 -7
- pixeltable/exprs/column_property_ref.py +37 -10
- pixeltable/exprs/column_ref.py +93 -14
- pixeltable/exprs/comparison.py +5 -5
- pixeltable/exprs/compound_predicate.py +8 -7
- pixeltable/exprs/data_row.py +56 -36
- pixeltable/exprs/expr.py +65 -63
- pixeltable/exprs/expr_dict.py +55 -0
- pixeltable/exprs/expr_set.py +26 -15
- pixeltable/exprs/function_call.py +53 -24
- pixeltable/exprs/globals.py +4 -1
- pixeltable/exprs/in_predicate.py +8 -7
- pixeltable/exprs/inline_expr.py +4 -4
- pixeltable/exprs/is_null.py +4 -4
- pixeltable/exprs/json_mapper.py +11 -12
- pixeltable/exprs/json_path.py +5 -10
- pixeltable/exprs/literal.py +5 -5
- pixeltable/exprs/method_ref.py +5 -4
- pixeltable/exprs/object_ref.py +2 -1
- pixeltable/exprs/row_builder.py +88 -36
- pixeltable/exprs/rowid_ref.py +14 -13
- pixeltable/exprs/similarity_expr.py +12 -7
- pixeltable/exprs/sql_element_cache.py +12 -6
- pixeltable/exprs/type_cast.py +8 -6
- pixeltable/exprs/variable.py +5 -4
- pixeltable/ext/functions/whisperx.py +7 -2
- pixeltable/func/aggregate_function.py +1 -1
- pixeltable/func/callable_function.py +2 -2
- pixeltable/func/function.py +11 -10
- pixeltable/func/function_registry.py +6 -7
- pixeltable/func/query_template_function.py +11 -12
- pixeltable/func/signature.py +17 -15
- pixeltable/func/udf.py +0 -4
- pixeltable/functions/__init__.py +2 -2
- pixeltable/functions/audio.py +4 -6
- pixeltable/functions/globals.py +84 -42
- pixeltable/functions/huggingface.py +31 -34
- pixeltable/functions/image.py +59 -45
- pixeltable/functions/json.py +0 -1
- pixeltable/functions/llama_cpp.py +106 -0
- pixeltable/functions/mistralai.py +2 -2
- pixeltable/functions/ollama.py +147 -0
- pixeltable/functions/openai.py +22 -25
- pixeltable/functions/replicate.py +72 -0
- pixeltable/functions/string.py +59 -50
- pixeltable/functions/timestamp.py +20 -20
- pixeltable/functions/together.py +2 -2
- pixeltable/functions/video.py +11 -20
- pixeltable/functions/whisper.py +2 -20
- pixeltable/globals.py +65 -74
- pixeltable/index/base.py +2 -2
- pixeltable/index/btree.py +20 -7
- pixeltable/index/embedding_index.py +12 -14
- pixeltable/io/__init__.py +1 -2
- pixeltable/io/external_store.py +11 -5
- pixeltable/io/fiftyone.py +178 -0
- pixeltable/io/globals.py +98 -2
- pixeltable/io/hf_datasets.py +1 -1
- pixeltable/io/label_studio.py +6 -6
- pixeltable/io/parquet.py +14 -13
- pixeltable/iterators/base.py +3 -2
- pixeltable/iterators/document.py +10 -8
- pixeltable/iterators/video.py +126 -60
- pixeltable/metadata/__init__.py +4 -3
- pixeltable/metadata/converters/convert_14.py +4 -2
- pixeltable/metadata/converters/convert_15.py +1 -1
- pixeltable/metadata/converters/convert_19.py +1 -0
- pixeltable/metadata/converters/convert_20.py +1 -1
- pixeltable/metadata/converters/convert_21.py +34 -0
- pixeltable/metadata/converters/util.py +54 -12
- pixeltable/metadata/notes.py +1 -0
- pixeltable/metadata/schema.py +40 -21
- pixeltable/plan.py +149 -165
- pixeltable/py.typed +0 -0
- pixeltable/store.py +57 -37
- pixeltable/tool/create_test_db_dump.py +6 -6
- pixeltable/tool/create_test_video.py +1 -1
- pixeltable/tool/doc_plugins/griffe.py +3 -34
- pixeltable/tool/embed_udf.py +1 -1
- pixeltable/tool/mypy_plugin.py +55 -0
- pixeltable/type_system.py +260 -61
- pixeltable/utils/arrow.py +10 -9
- pixeltable/utils/coco.py +4 -4
- pixeltable/utils/documents.py +16 -2
- pixeltable/utils/filecache.py +9 -9
- pixeltable/utils/formatter.py +10 -11
- pixeltable/utils/http_server.py +2 -5
- pixeltable/utils/media_store.py +6 -6
- pixeltable/utils/pytorch.py +10 -11
- pixeltable/utils/sql.py +2 -1
- {pixeltable-0.2.20.dist-info → pixeltable-0.2.22.dist-info}/METADATA +50 -13
- pixeltable-0.2.22.dist-info/RECORD +153 -0
- pixeltable/exec/media_validation_node.py +0 -43
- pixeltable/utils/help.py +0 -11
- pixeltable-0.2.20.dist-info/RECORD +0 -147
- {pixeltable-0.2.20.dist-info → pixeltable-0.2.22.dist-info}/LICENSE +0 -0
- {pixeltable-0.2.20.dist-info → pixeltable-0.2.22.dist-info}/WHEEL +0 -0
- {pixeltable-0.2.20.dist-info → pixeltable-0.2.22.dist-info}/entry_points.txt +0 -0
|
@@ -0,0 +1,178 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from typing import Iterator, Optional, Union
|
|
3
|
+
|
|
4
|
+
import fiftyone as fo # type: ignore[import-untyped]
|
|
5
|
+
import fiftyone.utils.data as foud # type: ignore[import-untyped]
|
|
6
|
+
import PIL.Image
|
|
7
|
+
import puremagic
|
|
8
|
+
|
|
9
|
+
import pixeltable as pxt
|
|
10
|
+
import pixeltable.exceptions as excs
|
|
11
|
+
from pixeltable import exprs
|
|
12
|
+
from pixeltable.env import Env
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class PxtImageDatasetImporter(foud.LabeledImageDatasetImporter):
    """
    Implementation of a FiftyOne `DatasetImporter` that reads image data from a Pixeltable table.

    Each call to `__next__` consumes one row of the underlying query and yields the image's file path,
    its `fo.ImageMetadata`, and a dict of labels keyed by label name.
    """
    __image_format: str  # format to use for any exported images that are not already stored on disk
    __labels: dict[str, tuple[exprs.Expr, type[fo.Label]]]  # label_name -> (expr, label_cls)
    __image_idx: int  # index of the image expr in the select list
    __localpath_idx: Optional[int]  # index of the image localpath in the select list, if present
    __row_iter: Iterator[list]  # iterator over the table rows, to be converted to FiftyOne samples

    def __init__(
        self,
        tbl: pxt.Table,
        image: exprs.Expr,
        image_format: str,
        classifications: Union[exprs.Expr, list[exprs.Expr], dict[str, exprs.Expr], None] = None,
        detections: Union[exprs.Expr, list[exprs.Expr], dict[str, exprs.Expr], None] = None,
        dataset_dir: Optional[os.PathLike] = None,
        shuffle: bool = False,
        seed: Union[int, float, str, bytes, bytearray, None] = None,
        max_samples: Optional[int] = None,
    ):
        """
        Args:
            tbl: The table to query.
            image: The expression that produces the images.
            image_format: Format used when an image has to be written to a temporary file.
            classifications: Classification label expression(s): a single expr, a list of exprs
                (default names are assigned), or a dict mapping label name to expr.
            detections: Detection label expression(s), in the same forms as `classifications`.
            dataset_dir: Forwarded to the FiftyOne base importer.
            shuffle: Forwarded to the FiftyOne base importer.
            seed: Forwarded to the FiftyOne base importer.
            max_samples: Forwarded to the FiftyOne base importer.

        Raises:
            excs.Error: If a dict-supplied label name is not a valid identifier or is duplicated.
        """
        super().__init__(
            dataset_dir=dataset_dir,
            shuffle=shuffle,
            seed=seed,
            max_samples=max_samples
        )

        self.__image_format = image_format

        label_categories = [
            (classifications, fo.Classifications, 'classifications'),
            (detections, fo.Detections, 'detections'),
        ]

        # Construct the labels. First add labels for all label types that have named dictionaries.
        self.__labels = {}
        for exprs_, label_cls, _ in label_categories:
            if isinstance(exprs_, dict):
                for label_name, expr in exprs_.items():
                    if not label_name.isidentifier():
                        raise excs.Error(f"Invalid label name: {label_name}")
                    if label_name in self.__labels:
                        raise excs.Error(f"Duplicate label name: {label_name}")
                    self.__labels[label_name] = (expr, label_cls)

        # Now add the remaining labels, assigning unused default names.
        for exprs_, label_cls, default_name in label_categories:
            if exprs_ is None or isinstance(exprs_, dict):
                continue
            if isinstance(exprs_, exprs.Expr):
                exprs_ = [exprs_]
            assert isinstance(exprs_, list)
            for expr in exprs_:
                if default_name not in self.__labels:
                    name = default_name
                else:
                    # Default name is taken; probe `<default>_1`, `<default>_2`, ... for a free one.
                    i = 1
                    while f'{default_name}_{i}' in self.__labels:
                        i += 1
                    name = f'{default_name}_{i}'
                self.__labels[name] = (expr, label_cls)

        # Build the select list:
        # - Labels first, in the order they appear in self.__labels
        # - Then the `image` expr
        # - Then `image.localpath`, if `image` is a stored columnref

        selection = [expr for expr, _ in self.__labels.values()]
        self.__image_idx = len(selection)
        selection.append(image)

        if isinstance(image, exprs.ColumnRef) and image.col.is_stored:
            # A stored image column; we can use the existing localpaths
            self.__localpath_idx = len(selection)
            selection.append(image.localpath)
        else:
            self.__localpath_idx = None

        df = tbl.select(*selection)
        self.__row_iter = df._output_row_iterator()

    def __next__(self) -> tuple[str, Optional[fo.ImageMetadata], Optional[dict[str, fo.Label]]]:
        """
        Return `(filepath, metadata, labels)` for the next row.

        Raises:
            StopIteration: When the underlying row iterator is exhausted.
            excs.Error: If a label value does not have the expected structure.
        """
        row = next(self.__row_iter)
        img = row[self.__image_idx]
        assert isinstance(img, PIL.Image.Image)
        if self.__localpath_idx is not None:
            # Use the existing localpath of the stored image
            file = row[self.__localpath_idx]
            assert isinstance(file, str)
        else:
            # Write the dynamically created image to a temp file
            file = str(Env.get().create_tmp_path(f'.{self.__image_format}'))
            img.save(file, format=self.__image_format)

        metadata = fo.ImageMetadata(
            size_bytes=os.path.getsize(file),
            mime_type=puremagic.from_file(file, mime=True),
            width=img.width,
            height=img.height,
            filepath=file,
            num_channels=len(img.getbands()),
        )

        labels: dict[str, fo.Label] = {}
        # The label exprs occupy the first len(self.__labels) slots of the select list, in dict order.
        for idx, (label_name, (_, label_cls)) in enumerate(self.__labels.items()):
            label_data = row[idx]
            if label_data is None:
                continue

            label: fo.Label
            if label_cls is fo.Classifications:
                label = fo.Classifications(classifications=self.__as_fo_classifications(label_data))
            elif label_cls is fo.Detections:
                label = fo.Detections(detections=self.__as_fo_detections(label_data))
            else:
                # Explicit raise (rather than `assert False`) so the guard survives `python -O`.
                raise AssertionError(f'Unsupported label class: {label_cls}')
            labels[label_name] = label

        return file, metadata, labels

    def __as_fo_classifications(self, data: list) -> list[fo.Classification]:
        """
        Convert a list of `{'label': ..., 'confidence': ...}` dicts into `fo.Classification` objects.

        Raises:
            excs.Error: If `data` is not a list of dicts that each contain a 'label' key.
        """
        # The dict check keeps non-dict entries from surfacing as a raw TypeError.
        if not isinstance(data, list) or any(
            not isinstance(entry, dict) or 'label' not in entry for entry in data
        ):
            raise excs.Error(
                f'Invalid classifications data: {data}\n'
                "(Expected a list of dicts, each containing a 'label' key)"
            )
        return [
            fo.Classification(label=entry['label'], confidence=entry.get('confidence'))
            for entry in data
        ]

    def __as_fo_detections(self, data: list) -> list[fo.Detection]:
        """
        Convert a list of `{'label': ..., 'bounding_box': ..., 'confidence': ...}` dicts into
        `fo.Detection` objects.

        Raises:
            excs.Error: If `data` is not a list of dicts that each contain 'label' and 'bounding_box' keys.
        """
        if not isinstance(data, list) or any(
            not isinstance(entry, dict) or 'label' not in entry or 'bounding_box' not in entry
            for entry in data
        ):
            raise excs.Error(
                f'Invalid detections data: {data}\n'
                "(Expected a list of dicts, each containing a 'label' and 'bounding_box' key)"
            )
        return [
            fo.Detection(label=entry['label'], bounding_box=entry['bounding_box'], confidence=entry.get('confidence'))
            for entry in data
        ]

    @property
    def has_dataset_info(self) -> bool:
        return False

    @property
    def has_image_metadata(self) -> bool:
        return True

    @property
    def label_cls(self) -> dict[str, type]:
        """Mapping of label name to the FiftyOne label class produced for it."""
        return {label_name: label_cls for label_name, (_, label_cls) in self.__labels.items()}

    def setup(self) -> None:
        pass

    def get_dataset_info(self) -> dict:
        # No dataset-level info is provided (`has_dataset_info` is False); return an empty dict
        # so the return value matches the annotation.
        return {}

    def close(self, *args) -> None:
        pass
|
pixeltable/io/globals.py
CHANGED
|
@@ -1,10 +1,14 @@
|
|
|
1
|
-
from typing import Any, Literal, Optional, Union
|
|
1
|
+
from typing import TYPE_CHECKING, Any, Literal, Optional, Union
|
|
2
2
|
|
|
3
3
|
import pixeltable as pxt
|
|
4
4
|
import pixeltable.exceptions as excs
|
|
5
|
-
from pixeltable import Table
|
|
5
|
+
from pixeltable import Table, exprs
|
|
6
|
+
from pixeltable.env import Env
|
|
6
7
|
from pixeltable.io.external_store import SyncStatus
|
|
7
8
|
|
|
9
|
+
if TYPE_CHECKING:
|
|
10
|
+
import fiftyone as fo # type: ignore[import-untyped]
|
|
11
|
+
|
|
8
12
|
|
|
9
13
|
def create_label_studio_project(
|
|
10
14
|
t: Table,
|
|
@@ -116,6 +120,8 @@ def create_label_studio_project(
|
|
|
116
120
|
s3_configuration={'bucket': 'my-bucket', 'region_name': 'us-east-2'}
|
|
117
121
|
)
|
|
118
122
|
"""
|
|
123
|
+
Env.get().require_package('label_studio_sdk')
|
|
124
|
+
|
|
119
125
|
from pixeltable.io.label_studio import LabelStudioProject
|
|
120
126
|
|
|
121
127
|
ls_project = LabelStudioProject.create(
|
|
@@ -187,6 +193,8 @@ def import_rows(
|
|
|
187
193
|
# If `key` is not in `schema_overrides`, then we infer its type from the data.
|
|
188
194
|
# The column type will always be nullable by default.
|
|
189
195
|
col_type = pxt.ColumnType.infer_literal_type(value, nullable=True)
|
|
196
|
+
if col_type is None:
|
|
197
|
+
raise excs.Error(f'Could not infer type for column `{col_name}`; the value in row {n} has an unsupported type: {type(value)}')
|
|
190
198
|
if col_name not in schema:
|
|
191
199
|
schema[col_name] = col_type
|
|
192
200
|
else:
|
|
@@ -265,3 +273,91 @@ def import_json(
|
|
|
265
273
|
contents = urllib.request.urlopen(filepath_or_url).read()
|
|
266
274
|
data = json.loads(contents, **kwargs)
|
|
267
275
|
return import_rows(tbl_path, data, schema_overrides=schema_overrides, primary_key=primary_key, num_retained_versions=num_retained_versions, comment=comment)
|
|
276
|
+
|
|
277
|
+
|
|
278
|
+
def export_images_as_fo_dataset(
    tbl: pxt.Table,
    images: exprs.Expr,
    image_format: str = 'webp',
    classifications: Union[exprs.Expr, list[exprs.Expr], dict[str, exprs.Expr], None] = None,
    detections: Union[exprs.Expr, list[exprs.Expr], dict[str, exprs.Expr], None] = None,
) -> 'fo.Dataset':
    """
    Export images from a Pixeltable table as a Voxel51 dataset. The data must consist of a single column
    (or expression) containing image data, along with optional additional columns containing labels. Currently, only
    classification and detection labels are supported.

    The [Working with Voxel51 in Pixeltable](https://docs.pixeltable.com/docs/working-with-voxel51) tutorial contains a
    fully worked example showing how to export data from a Pixeltable table and load it into Voxel51.

    Images in the dataset that already exist on disk will be exported directly, in whatever format they
    are stored in. Images that are not already on disk (such as frames extracted using a
    [`FrameIterator`][pixeltable.iterators.FrameIterator]) will first be written to disk in the specified
    `image_format`.

    The label parameters accept one or more sets of labels of each type. If a single `Expr` is provided, then it will
    be exported as a single set of labels with a default name such as `classifications`.
    (The single set of labels may still contain multiple individual labels; see below.)
    If a list of `Expr`s is provided, then each one will be exported as a separate set of labels with a default name
    such as `classifications`, `classifications_1`, etc. If a dictionary of `Expr`s is provided, then each entry will
    be exported as a set of labels with the specified name.

    __Requirements:__

    - `pip install fiftyone`

    Args:
        tbl: The table from which to export data.
        images: A column or expression that contains the images to export.
        image_format: The format to use when writing out images for export.
        classifications: Optional image classification labels. If a single `Expr` is provided, it must be a table
            column or an expression that evaluates to a list of dictionaries. Each dictionary in the list corresponds
            to an image class and must have the following structure:

            ```python
            {'label': 'zebra', 'confidence': 0.325}
            ```

            If multiple `Expr`s are provided, each one must evaluate to a list of such dictionaries.
        detections: Optional image detection labels. If a single `Expr` is provided, it must be a table column or an
            expression that evaluates to a list of dictionaries. Each dictionary in the list corresponds to an image
            detection, and must have the following structure:

            ```python
            {
                'label': 'giraffe',
                'confidence': 0.99,
                'bounding_box': [0.081, 0.836, 0.202, 0.136]  # [x, y, w, h], fractional coordinates
            }
            ```

            If multiple `Expr`s are provided, each one must evaluate to a list of such dictionaries.

    Returns:
        A Voxel51 dataset.

    Raises:
        excs.Error: If `images` is not an expression of image type.

    Example:
        Export the images in the `image` column of the table `tbl` as a Voxel51 dataset, using classification
        labels from `tbl.classifications`:

        >>> export_images_as_fo_dataset(
        ...     tbl,
        ...     tbl.image,
        ...     classifications=tbl.classifications
        ... )

        See the [Working with Voxel51 in Pixeltable](https://docs.pixeltable.com/docs/working-with-voxel51) tutorial
        for a fully worked example.
    """
    Env.get().require_package('fiftyone')

    # Imported after the package check, so fiftyone is only loaded once it is known to be installed.
    import fiftyone as fo

    from pixeltable.io.fiftyone import PxtImageDatasetImporter

    if not images.col_type.is_image_type():
        raise excs.Error(f'`images` must be an expression of type Image (got {images.col_type._to_base_str()})')

    return fo.Dataset.from_importer(PxtImageDatasetImporter(
        tbl, images, image_format, classifications=classifications, detections=detections
    ))
|
pixeltable/io/hf_datasets.py
CHANGED
pixeltable/io/label_studio.py
CHANGED
|
@@ -4,17 +4,17 @@ import logging
|
|
|
4
4
|
import os
|
|
5
5
|
from dataclasses import dataclass
|
|
6
6
|
from pathlib import Path
|
|
7
|
-
from typing import Any, Iterator, Optional,
|
|
7
|
+
from typing import Any, Iterator, Literal, Optional, cast
|
|
8
8
|
from xml.etree import ElementTree
|
|
9
9
|
|
|
10
|
+
import label_studio_sdk # type: ignore[import-untyped]
|
|
10
11
|
import PIL.Image
|
|
11
|
-
import label_studio_sdk
|
|
12
12
|
from requests.exceptions import HTTPError
|
|
13
13
|
|
|
14
14
|
import pixeltable as pxt
|
|
15
15
|
import pixeltable.env as env
|
|
16
16
|
import pixeltable.exceptions as excs
|
|
17
|
-
from pixeltable import
|
|
17
|
+
from pixeltable import Column, Table
|
|
18
18
|
from pixeltable.exprs import ColumnRef, DataRow, Expr
|
|
19
19
|
from pixeltable.io.external_store import Project, SyncStatus
|
|
20
20
|
from pixeltable.utils import coco
|
|
@@ -211,7 +211,7 @@ class LabelStudioProject(Project):
|
|
|
211
211
|
assert isinstance(row[media_col_idx], PIL.Image.Image)
|
|
212
212
|
file = env.Env.get().create_tmp_path(extension='.png')
|
|
213
213
|
row[media_col_idx].save(file, format='png')
|
|
214
|
-
task_id
|
|
214
|
+
task_id = self.project.import_tasks(file)[0]
|
|
215
215
|
os.remove(file)
|
|
216
216
|
|
|
217
217
|
# Update the task with `rowid` metadata
|
|
@@ -256,7 +256,7 @@ class LabelStudioProject(Project):
|
|
|
256
256
|
assert self.media_import_method == 'file'
|
|
257
257
|
if not col.col_type.is_media_type():
|
|
258
258
|
# Not a media column; query the data directly
|
|
259
|
-
expr_refs[col_name] = t[col_name]
|
|
259
|
+
expr_refs[col_name] = cast(ColumnRef, t[col_name])
|
|
260
260
|
elif col in self.stored_proxies:
|
|
261
261
|
# Media column that has a stored proxy; use it. We have to give it a name,
|
|
262
262
|
# since it's an anonymous column
|
|
@@ -267,7 +267,7 @@ class LabelStudioProject(Project):
|
|
|
267
267
|
# and we can just use the localpath
|
|
268
268
|
expr_refs[col_name] = t[col_name].localpath
|
|
269
269
|
|
|
270
|
-
df = t.select(*[t[col] for col in t_rl_cols], **expr_refs)
|
|
270
|
+
df = t.select(*[t[col.name] for col in t_rl_cols], **expr_refs)
|
|
271
271
|
# The following buffers will hold `DataRow` indices that correspond to each of the selected
|
|
272
272
|
# columns. `rl_col_idxs` holds the indices for the columns that map to RectangleLabels
|
|
273
273
|
# preannotations; `data_col_idxs` holds the indices for the columns that map to data fields.
|
pixeltable/io/parquet.py
CHANGED
|
@@ -7,24 +7,23 @@ import random
|
|
|
7
7
|
import typing
|
|
8
8
|
from collections import deque
|
|
9
9
|
from pathlib import Path
|
|
10
|
-
from typing import
|
|
10
|
+
from typing import Any, Optional
|
|
11
11
|
|
|
12
|
-
import PIL.Image
|
|
13
12
|
import numpy as np
|
|
13
|
+
import PIL.Image
|
|
14
14
|
|
|
15
15
|
import pixeltable.exceptions as exc
|
|
16
16
|
import pixeltable.type_system as ts
|
|
17
17
|
from pixeltable.utils.transactional_directory import transactional_directory
|
|
18
18
|
|
|
19
19
|
if typing.TYPE_CHECKING:
|
|
20
|
-
import pixeltable as pxt
|
|
21
20
|
import pyarrow as pa
|
|
22
|
-
|
|
21
|
+
import pixeltable as pxt
|
|
23
22
|
|
|
24
23
|
_logger = logging.getLogger(__name__)
|
|
25
24
|
|
|
26
25
|
|
|
27
|
-
def _write_batch(value_batch:
|
|
26
|
+
def _write_batch(value_batch: dict[str, deque], schema: pa.Schema, output_path: Path) -> None:
|
|
28
27
|
import pyarrow as pa
|
|
29
28
|
from pyarrow import parquet
|
|
30
29
|
|
|
@@ -37,7 +36,7 @@ def _write_batch(value_batch: Dict[str, deque], schema: pa.Schema, output_path:
|
|
|
37
36
|
pydict[field.name] = value_batch[field.name]
|
|
38
37
|
|
|
39
38
|
tab = pa.Table.from_pydict(pydict, schema=schema)
|
|
40
|
-
parquet.write_table(tab, output_path)
|
|
39
|
+
parquet.write_table(tab, str(output_path))
|
|
41
40
|
|
|
42
41
|
|
|
43
42
|
def save_parquet(df: pxt.DataFrame, dest_path: Path, partition_size_bytes: int = 100_000_000) -> None:
|
|
@@ -67,7 +66,7 @@ def save_parquet(df: pxt.DataFrame, dest_path: Path, partition_size_bytes: int =
|
|
|
67
66
|
json.dump(type_dict, (temp_path / '.pixeltable.column_types.json').open('w')) # keep type metadata
|
|
68
67
|
|
|
69
68
|
batch_num = 0
|
|
70
|
-
current_value_batch:
|
|
69
|
+
current_value_batch: dict[str, deque] = {k: deque() for k in df.schema.keys()}
|
|
71
70
|
current_byte_estimate = 0
|
|
72
71
|
|
|
73
72
|
for data_row in df._exec():
|
|
@@ -128,13 +127,14 @@ def save_parquet(df: pxt.DataFrame, dest_path: Path, partition_size_bytes: int =
|
|
|
128
127
|
_write_batch(current_value_batch, arrow_schema, temp_path / f'part-{batch_num:05d}.parquet')
|
|
129
128
|
|
|
130
129
|
|
|
131
|
-
def parquet_schema_to_pixeltable_schema(parquet_path: str) ->
|
|
130
|
+
def parquet_schema_to_pixeltable_schema(parquet_path: str) -> dict[str, Optional[ts.ColumnType]]:
    """Derive a default Pixeltable schema from the schema of the parquet dataset at `parquet_path`.

    Per the return annotation, a column maps to None when no Pixeltable type is known for it.
    """
    from pyarrow import parquet

    from pixeltable.utils.arrow import to_pixeltable_schema

    # `~` is expanded first; the dataset is then opened from the path's string form.
    resolved = str(Path(parquet_path).expanduser())
    arrow_schema = parquet.ParquetDataset(resolved).schema
    return to_pixeltable_schema(arrow_schema)
|
|
139
139
|
|
|
140
140
|
|
|
@@ -142,7 +142,7 @@ def import_parquet(
|
|
|
142
142
|
table_path: str,
|
|
143
143
|
*,
|
|
144
144
|
parquet_path: str,
|
|
145
|
-
schema_overrides: Optional[
|
|
145
|
+
schema_overrides: Optional[dict[str, ts.ColumnType]] = None,
|
|
146
146
|
**kwargs: Any,
|
|
147
147
|
) -> pxt.Table:
|
|
148
148
|
"""Creates a new base table from a Parquet file or set of files. Requires pyarrow to be installed.
|
|
@@ -159,12 +159,13 @@ def import_parquet(
|
|
|
159
159
|
Returns:
|
|
160
160
|
A handle to the newly created [`Table`][pixeltable.Table].
|
|
161
161
|
"""
|
|
162
|
-
import pixeltable as pxt
|
|
163
162
|
from pyarrow import parquet
|
|
163
|
+
|
|
164
|
+
import pixeltable as pxt
|
|
164
165
|
from pixeltable.utils.arrow import iter_tuples
|
|
165
166
|
|
|
166
167
|
input_path = Path(parquet_path).expanduser()
|
|
167
|
-
parquet_dataset = parquet.ParquetDataset(input_path)
|
|
168
|
+
parquet_dataset = parquet.ParquetDataset(str(input_path))
|
|
168
169
|
|
|
169
170
|
schema = parquet_schema_to_pixeltable_schema(parquet_path)
|
|
170
171
|
if schema_overrides is None:
|
|
@@ -181,7 +182,7 @@ def import_parquet(
|
|
|
181
182
|
try:
|
|
182
183
|
tmp_name = f'{table_path}_tmp_{random.randint(0, 100000000)}'
|
|
183
184
|
tab = pxt.create_table(tmp_name, schema, **kwargs)
|
|
184
|
-
for fragment in parquet_dataset.fragments:
|
|
185
|
+
for fragment in parquet_dataset.fragments: # type: ignore[attr-defined]
|
|
185
186
|
for batch in fragment.to_batches():
|
|
186
187
|
dict_batch = list(iter_tuples(batch))
|
|
187
188
|
tab.insert(dict_batch)
|
pixeltable/iterators/base.py
CHANGED
pixeltable/iterators/document.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import dataclasses
|
|
2
2
|
import enum
|
|
3
3
|
import logging
|
|
4
|
-
from typing import Any, Iterable, Iterator, Optional
|
|
4
|
+
from typing import Any, Iterable, Iterator, Optional, Union
|
|
5
5
|
|
|
6
6
|
import ftfy
|
|
7
7
|
|
|
@@ -152,7 +152,7 @@ class DocumentSplitter(ComponentIterator):
|
|
|
152
152
|
assert self._doc_handle.pdf_doc is not None
|
|
153
153
|
self._sections = self._pdf_sections()
|
|
154
154
|
else:
|
|
155
|
-
assert False, f'
|
|
155
|
+
assert False, f'Unsupported document format: {self._doc_handle.format}'
|
|
156
156
|
|
|
157
157
|
if Separator.SENTENCE in self._separators:
|
|
158
158
|
self._sections = self._sentence_sections(self._sections)
|
|
@@ -176,7 +176,7 @@ class DocumentSplitter(ComponentIterator):
|
|
|
176
176
|
|
|
177
177
|
@classmethod
|
|
178
178
|
def output_schema(cls, *args: Any, **kwargs: Any) -> tuple[dict[str, ColumnType], list[str]]:
|
|
179
|
-
schema = {'text': StringType()}
|
|
179
|
+
schema: dict[str, ColumnType] = {'text': StringType()}
|
|
180
180
|
md_fields = _parse_metadata(kwargs['metadata']) if 'metadata' in kwargs else []
|
|
181
181
|
|
|
182
182
|
for md_field in md_fields:
|
|
@@ -214,7 +214,7 @@ class DocumentSplitter(ComponentIterator):
|
|
|
214
214
|
section = next(self._sections)
|
|
215
215
|
if section.text is None:
|
|
216
216
|
continue
|
|
217
|
-
result = {'text': section.text}
|
|
217
|
+
result: dict[str, Any] = {'text': section.text}
|
|
218
218
|
for md_field in self._metadata_fields:
|
|
219
219
|
if md_field == ChunkMetadata.TITLE:
|
|
220
220
|
result[md_field.name.lower()] = self._doc_title
|
|
@@ -234,7 +234,7 @@ class DocumentSplitter(ComponentIterator):
|
|
|
234
234
|
emit_on_paragraph = Separator.PARAGRAPH in self._separators or Separator.SENTENCE in self._separators
|
|
235
235
|
emit_on_heading = Separator.HEADING in self._separators or emit_on_paragraph
|
|
236
236
|
# current state
|
|
237
|
-
accumulated_text = [] # currently accumulated text
|
|
237
|
+
accumulated_text: list[str] = [] # currently accumulated text
|
|
238
238
|
# accumulate pieces then join before emit to avoid quadratic complexity of string concatenation
|
|
239
239
|
|
|
240
240
|
headings: dict[str, str] = {} # current state of observed headings (level -> text)
|
|
@@ -260,9 +260,10 @@ class DocumentSplitter(ComponentIterator):
|
|
|
260
260
|
yield DocumentSection(text=full_text, metadata=md)
|
|
261
261
|
accumulated_text = []
|
|
262
262
|
|
|
263
|
-
def process_element(el: bs4.
|
|
263
|
+
def process_element(el: Union[bs4.element.Tag, bs4.NavigableString]) -> Iterator[DocumentSection]:
|
|
264
264
|
# process the element and emit sections as necessary
|
|
265
265
|
nonlocal accumulated_text, headings, sourceline, emit_on_heading, emit_on_paragraph
|
|
266
|
+
|
|
266
267
|
if el.name in self._skip_tags:
|
|
267
268
|
return
|
|
268
269
|
|
|
@@ -282,6 +283,7 @@ class DocumentSplitter(ComponentIterator):
|
|
|
282
283
|
yield from emit()
|
|
283
284
|
update_metadata(el)
|
|
284
285
|
for child in el.children:
|
|
286
|
+
assert isinstance(child, (bs4.element.Tag, bs4.NavigableString)), type(el)
|
|
285
287
|
yield from process_element(child)
|
|
286
288
|
|
|
287
289
|
yield from process_element(self._doc_handle.bs_doc)
|
|
@@ -293,7 +295,7 @@ class DocumentSplitter(ComponentIterator):
|
|
|
293
295
|
emit_on_paragraph = Separator.PARAGRAPH in self._separators or Separator.SENTENCE in self._separators
|
|
294
296
|
emit_on_heading = Separator.HEADING in self._separators or emit_on_paragraph
|
|
295
297
|
# current state
|
|
296
|
-
accumulated_text = [] # currently accumulated text
|
|
298
|
+
accumulated_text: list[str] = [] # currently accumulated text
|
|
297
299
|
# accumulate pieces then join before emit to avoid quadratic complexity of string concatenation
|
|
298
300
|
headings: dict[str, str] = {} # current state of observed headings (level -> text)
|
|
299
301
|
|
|
@@ -347,7 +349,7 @@ class DocumentSplitter(ComponentIterator):
|
|
|
347
349
|
|
|
348
350
|
def _pdf_sections(self) -> Iterator[DocumentSection]:
|
|
349
351
|
"""Create DocumentSections reflecting the pdf-specific separators"""
|
|
350
|
-
import fitz
|
|
352
|
+
import fitz # type: ignore[import-untyped]
|
|
351
353
|
doc: fitz.Document = self._doc_handle.pdf_doc
|
|
352
354
|
assert doc is not None
|
|
353
355
|
|