pixeltable 0.3.9__py3-none-any.whl → 0.3.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pixeltable/__init__.py +2 -3
- pixeltable/__version__.py +2 -2
- pixeltable/catalog/__init__.py +2 -1
- pixeltable/catalog/catalog.py +63 -36
- pixeltable/catalog/column.py +11 -4
- pixeltable/catalog/dir.py +5 -5
- pixeltable/catalog/globals.py +28 -14
- pixeltable/catalog/insertable_table.py +81 -43
- pixeltable/catalog/path.py +2 -2
- pixeltable/catalog/table.py +140 -109
- pixeltable/catalog/table_version.py +60 -43
- pixeltable/catalog/table_version_handle.py +3 -0
- pixeltable/catalog/table_version_path.py +1 -1
- pixeltable/catalog/view.py +17 -9
- pixeltable/dataframe.py +5 -3
- pixeltable/env.py +109 -43
- pixeltable/exec/__init__.py +2 -0
- pixeltable/exec/aggregation_node.py +6 -8
- pixeltable/exec/cache_prefetch_node.py +4 -7
- pixeltable/exec/component_iteration_node.py +1 -3
- pixeltable/exec/data_row_batch.py +1 -2
- pixeltable/exec/exec_context.py +1 -1
- pixeltable/exec/exec_node.py +2 -3
- pixeltable/exec/expr_eval/__init__.py +2 -0
- pixeltable/exec/expr_eval/evaluators.py +137 -20
- pixeltable/exec/expr_eval/expr_eval_node.py +43 -64
- pixeltable/exec/expr_eval/globals.py +68 -7
- pixeltable/exec/expr_eval/schedulers.py +25 -23
- pixeltable/exec/in_memory_data_node.py +8 -6
- pixeltable/exec/row_update_node.py +3 -4
- pixeltable/exec/sql_node.py +16 -17
- pixeltable/exprs/__init__.py +3 -2
- pixeltable/exprs/arithmetic_expr.py +2 -0
- pixeltable/exprs/column_property_ref.py +1 -1
- pixeltable/exprs/column_ref.py +39 -3
- pixeltable/exprs/compound_predicate.py +1 -1
- pixeltable/exprs/data_row.py +17 -1
- pixeltable/exprs/expr.py +51 -21
- pixeltable/exprs/function_call.py +34 -2
- pixeltable/exprs/globals.py +12 -0
- pixeltable/exprs/json_mapper.py +95 -48
- pixeltable/exprs/json_path.py +3 -10
- pixeltable/exprs/method_ref.py +2 -2
- pixeltable/exprs/object_ref.py +2 -2
- pixeltable/exprs/row_builder.py +33 -6
- pixeltable/exprs/similarity_expr.py +6 -21
- pixeltable/exprs/sql_element_cache.py +1 -1
- pixeltable/exprs/string_op.py +107 -0
- pixeltable/ext/__init__.py +1 -1
- pixeltable/ext/functions/__init__.py +1 -1
- pixeltable/ext/functions/whisperx.py +1 -1
- pixeltable/ext/functions/yolox.py +22 -65
- pixeltable/func/aggregate_function.py +1 -1
- pixeltable/func/callable_function.py +2 -5
- pixeltable/func/expr_template_function.py +22 -2
- pixeltable/func/function.py +4 -5
- pixeltable/func/function_registry.py +1 -1
- pixeltable/func/signature.py +1 -1
- pixeltable/func/tools.py +2 -2
- pixeltable/func/udf.py +2 -2
- pixeltable/functions/__init__.py +2 -2
- pixeltable/functions/anthropic.py +2 -2
- pixeltable/functions/audio.py +1 -1
- pixeltable/functions/deepseek.py +1 -1
- pixeltable/functions/fireworks.py +1 -1
- pixeltable/functions/globals.py +22 -11
- pixeltable/functions/huggingface.py +1 -1
- pixeltable/functions/image.py +1 -1
- pixeltable/functions/json.py +1 -1
- pixeltable/functions/llama_cpp.py +1 -1
- pixeltable/functions/math.py +1 -1
- pixeltable/functions/mistralai.py +1 -1
- pixeltable/functions/ollama.py +1 -1
- pixeltable/functions/openai.py +2 -2
- pixeltable/functions/replicate.py +1 -1
- pixeltable/functions/string.py +1 -1
- pixeltable/functions/timestamp.py +1 -1
- pixeltable/functions/together.py +1 -1
- pixeltable/functions/util.py +1 -1
- pixeltable/functions/video.py +2 -2
- pixeltable/functions/vision.py +2 -2
- pixeltable/globals.py +85 -33
- pixeltable/index/embedding_index.py +12 -1
- pixeltable/io/__init__.py +8 -5
- pixeltable/io/datarows.py +138 -0
- pixeltable/io/external_store.py +8 -5
- pixeltable/io/fiftyone.py +6 -7
- pixeltable/io/globals.py +7 -160
- pixeltable/io/hf_datasets.py +21 -98
- pixeltable/io/label_studio.py +21 -20
- pixeltable/io/pandas.py +35 -48
- pixeltable/io/parquet.py +17 -42
- pixeltable/io/table_data_conduit.py +569 -0
- pixeltable/io/utils.py +6 -21
- pixeltable/iterators/__init__.py +1 -1
- pixeltable/metadata/__init__.py +6 -4
- pixeltable/metadata/converters/convert_24.py +3 -3
- pixeltable/metadata/converters/convert_25.py +1 -1
- pixeltable/metadata/converters/convert_29.py +1 -1
- pixeltable/metadata/converters/convert_30.py +50 -0
- pixeltable/metadata/converters/util.py +26 -1
- pixeltable/metadata/notes.py +1 -0
- pixeltable/metadata/schema.py +3 -0
- pixeltable/store.py +2 -2
- pixeltable/type_system.py +19 -7
- pixeltable/utils/arrow.py +32 -7
- pixeltable/utils/console_output.py +3 -2
- pixeltable/utils/coroutine.py +3 -3
- pixeltable/utils/dbms.py +66 -0
- pixeltable/utils/documents.py +61 -67
- pixeltable/utils/filecache.py +1 -1
- pixeltable/utils/http_server.py +3 -2
- pixeltable/utils/pytorch.py +1 -1
- pixeltable/utils/sql.py +1 -1
- pixeltable-0.3.11.dist-info/METADATA +436 -0
- pixeltable-0.3.11.dist-info/RECORD +179 -0
- {pixeltable-0.3.9.dist-info → pixeltable-0.3.11.dist-info}/WHEEL +1 -1
- pixeltable/catalog/path_dict.py +0 -169
- pixeltable-0.3.9.dist-info/METADATA +0 -382
- pixeltable-0.3.9.dist-info/RECORD +0 -175
- {pixeltable-0.3.9.dist-info → pixeltable-0.3.11.dist-info}/LICENSE +0 -0
- {pixeltable-0.3.9.dist-info → pixeltable-0.3.11.dist-info}/entry_points.txt +0 -0
pixeltable/io/hf_datasets.py
CHANGED
@@ -1,41 +1,38 @@
 from __future__ import annotations

-import logging
-import math
-import random
 import typing
 from typing import Any, Optional, Union

 import pixeltable as pxt
 import pixeltable.type_system as ts
-from pixeltable import exceptions as excs
-
-from .utils import normalize_import_parameters, normalize_schema_names

 if typing.TYPE_CHECKING:
     import datasets  # type: ignore[import-untyped]

-_logger = logging.getLogger('pixeltable')
-
-# use 100MB as the batch size limit for loading a huggingface dataset into pixeltable.
-# The primary goal is to bound memory use, regardless of dataset size.
-# Second goal is to limit overhead. 100MB is presumed to be reasonable for a lot of storage systems.
-_K_BATCH_SIZE_BYTES = 100_000_000

-# note, there are many more types. we allow overrides in the
+# note, there are many more types. we allow overrides in the schema_overrides parameter
 # to handle cases where the appropriate type is not yet mapped, or to override this mapping.
 # https://huggingface.co/docs/datasets/v2.17.0/en/package_reference/main_classes#datasets.Value
 _hf_to_pxt: dict[str, ts.ColumnType] = {
-    'int32': ts.IntType(nullable=True),  # pixeltable widens to big int
-    'int64': ts.IntType(nullable=True),
     'bool': ts.BoolType(nullable=True),
+    'int8': ts.IntType(nullable=True),
+    'int16': ts.IntType(nullable=True),
+    'int32': ts.IntType(nullable=True),
+    'int64': ts.IntType(nullable=True),
+    'uint8': ts.IntType(nullable=True),
+    'uint16': ts.IntType(nullable=True),
+    'uint32': ts.IntType(nullable=True),
+    'uint64': ts.IntType(nullable=True),
+    'float16': ts.FloatType(nullable=True),
     'float32': ts.FloatType(nullable=True),
     'float64': ts.FloatType(nullable=True),
-    'large_string': ts.StringType(nullable=True),
     'string': ts.StringType(nullable=True),
+    'large_string': ts.StringType(nullable=True),
     'timestamp[s]': ts.TimestampType(nullable=True),
     'timestamp[ms]': ts.TimestampType(nullable=True),  # HF dataset iterator converts timestamps to datetime.datetime
     'timestamp[us]': ts.TimestampType(nullable=True),
+    'date32': ts.StringType(nullable=True),  # date32 is not supported in pixeltable, use string
+    'date64': ts.StringType(nullable=True),  # date64 is not supported in pixeltable, use string
 }


@@ -88,7 +85,6 @@ def import_huggingface_dataset(
     table_path: str,
     dataset: Union[datasets.Dataset, datasets.DatasetDict],
     *,
-    column_name_for_split: Optional[str] = None,
     schema_overrides: Optional[dict[str, Any]] = None,
     primary_key: Optional[Union[str, list[str]]] = None,
     **kwargs: Any,
@@ -101,91 +97,18 @@ def import_huggingface_dataset(
         dataset: Huggingface [`datasets.Dataset`](https://huggingface.co/docs/datasets/en/package_reference/main_classes#datasets.Dataset)
             or [`datasets.DatasetDict`](https://huggingface.co/docs/datasets/en/package_reference/main_classes#datasets.DatasetDict)
             to insert into the table.
-        column_name_for_split: column name to use for split information. If None, no split information will be stored.
         schema_overrides: If specified, then for each (name, type) pair in `schema_overrides`, the column with
-            name `name` will be given type `type`, instead of being inferred from the `Dataset` or `DatasetDict`.
-            `schema_overrides` should be the column names of the `Dataset` or `DatasetDict` (whether or not
-            Pixeltable identifiers).
+            name `name` will be given type `type`, instead of being inferred from the `Dataset` or `DatasetDict`.
+            The keys in `schema_overrides` should be the column names of the `Dataset` or `DatasetDict` (whether or not
+            they are valid Pixeltable identifiers).
         primary_key: The primary key of the table (see [`create_table()`][pixeltable.create_table]).
         kwargs: Additional arguments to pass to `create_table`.
+            An argument of `column_name_for_split` must be provided if the source is a DatasetDict.
+            This column name will contain the split information. If None, no split information will be stored.

     Returns:
         A handle to the newly created [`Table`][pixeltable.Table].
     """
-
-
-
-
-    if not isinstance(dataset, (datasets.Dataset, datasets.DatasetDict)):
-        raise excs.Error(f'`type(dataset)` must be `datasets.Dataset` or `datasets.DatasetDict`. Got {type(dataset)=}')
-
-    # Create the pixeltable schema from the huggingface schema
-    hf_schema_source = _get_hf_schema(dataset)
-    schema_overrides, primary_key = normalize_import_parameters(schema_overrides, primary_key)
-    hf_schema = huggingface_schema_to_pxt_schema(hf_schema_source, schema_overrides, primary_key)
-
-    # Add the split column to the schema if requested
-    if column_name_for_split is not None:
-        if column_name_for_split in hf_schema:
-            raise excs.Error(
-                f'Column name `{column_name_for_split}` already exists in dataset schema; provide a different `column_name_for_split`'
-            )
-        hf_schema[column_name_for_split] = ts.StringType(nullable=True)
-
-    schema, pxt_pk, _ = normalize_schema_names(hf_schema, primary_key, schema_overrides, True)
-
-    # Prepare to create table and insert data
-    if table_path in pxt.list_tables():
-        raise excs.Error(f'table {table_path} already exists')
-
-    if isinstance(dataset, datasets.Dataset):
-        # when loading an hf dataset partially, dataset.split._name is sometimes the form "train[0:1000]"
-        raw_name = dataset.split._name
-        split_name = raw_name.split('[')[0] if raw_name is not None else None
-        dataset_dict = {split_name: dataset}
-    else:
-        dataset_dict = dataset
-
-    # extract all class labels from the dataset to translate category ints to strings
-    categorical_features = {
-        feature_name: feature_type.names
-        for (feature_name, feature_type) in hf_schema_source.items()
-        if isinstance(feature_type, datasets.ClassLabel)
-    }
-
-    try:
-        # random tmp name
-        tmp_name = f'{table_path}_tmp_{random.randint(0, 100000000)}'
-        tab = pxt.create_table(tmp_name, schema, primary_key=pxt_pk, **kwargs)
-
-        def _translate_row(row: dict[str, Any], split_name: str) -> dict[str, Any]:
-            output_row = row.copy()
-            # map all class labels to strings
-            for field, values in categorical_features.items():
-                output_row[field] = values[row[field]]
-            # add split name to row
-            if column_name_for_split is not None:
-                output_row[column_name_for_split] = split_name
-            return output_row
-
-        for split_name, split_dataset in dataset_dict.items():
-            num_batches = split_dataset.size_in_bytes / _K_BATCH_SIZE_BYTES
-            tuples_per_batch = math.ceil(split_dataset.num_rows / num_batches)
-            assert tuples_per_batch > 0
-
-            batch = []
-            for row in split_dataset:
-                batch.append(_translate_row(row, split_name))
-                if len(batch) >= tuples_per_batch:
-                    tab.insert(batch)
-                    batch = []
-            # last batch
-            if len(batch) > 0:
-                tab.insert(batch)
-
-    except Exception as e:
-        _logger.error(f'Error while inserting dataset into table: {tmp_name}')
-        raise e
-
-    pxt.move(tmp_name, table_path)
-    return pxt.get_table(table_path)
+    return pxt.create_table(
+        table_path, source=dataset, schema_overrides=schema_overrides, primary_key=primary_key, extra_args=kwargs
+    )
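The net effect of this change: the hand-rolled batching and temp-table logic is gone, and import_huggingface_dataset becomes a thin wrapper over pxt.create_table with a source argument. A minimal usage sketch under that reading (the table names and the rotten_tomatoes dataset are illustrative, not taken from this diff):

import datasets
import pixeltable as pxt

ds = datasets.load_dataset('rotten_tomatoes', split='train')

# column_name_for_split now travels via **kwargs instead of a dedicated parameter.
t = pxt.io.import_huggingface_dataset('reviews', ds, column_name_for_split='split')

# Per the new function body, this should be equivalent to:
t2 = pxt.create_table('reviews_direct', source=ds)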
pixeltable/io/label_studio.py
CHANGED
@@ -5,16 +5,14 @@ import os
 from dataclasses import dataclass
 from pathlib import Path
 from typing import Any, Iterator, Literal, Optional, cast
-from xml.etree import ElementTree
+from xml.etree import ElementTree as ET

 import label_studio_sdk  # type: ignore[import-untyped]
 import PIL.Image
 from requests.exceptions import HTTPError

 import pixeltable as pxt
-import pixeltable.env as env
-import pixeltable.exceptions as excs
-from pixeltable import Column, Table
+from pixeltable import Column, Table, env, exceptions as excs
 from pixeltable.config import Config
 from pixeltable.exprs import ColumnRef, DataRow, Expr
 from pixeltable.io.external_store import Project, SyncStatus
@@ -140,7 +138,8 @@ class LabelStudioProject(Project):
             page += 1
         if unknown_task_count > 0:
             _logger.warning(
-                f'Skipped {unknown_task_count} unrecognized task(s) when syncing Label Studio project {self.project_title!r}.'
+                f'Skipped {unknown_task_count} unrecognized task(s) when syncing '
+                f'Label Studio project {self.project_title!r}.'
             )

     def __update_tasks(self, t: Table, existing_tasks: dict[tuple, dict]) -> SyncStatus:
@@ -174,11 +173,11 @@ class LabelStudioProject(Project):
             # Send media to Label Studio by HTTP post.
             assert len(t_data_cols) == 1  # This was verified when the project was set up
             return self.__update_tasks_by_post(t, existing_tasks, t_data_cols[0], t_rl_cols, rl_info)
-        elif self.media_import_method == 'file' or self.media_import_method == 'url':
+        elif self.media_import_method in ('file', 'url'):
             # Send media to Label Studio by file reference (local file or URL).
             return self.__update_tasks_by_files(t, existing_tasks, t_data_cols, t_rl_cols, rl_info)
         else:
-            assert False
+            raise AssertionError()

     def __update_tasks_by_post(
         self,
@@ -227,7 +226,7 @@ class LabelStudioProject(Project):
                 )
                 for i in range(len(coco_annotations))
             ]
-            _logger.debug(f'`predictions`: {predictions}')
+            _logger.debug('`predictions`: {%s}', predictions)
             self.project.create_predictions(predictions)
             tasks_created += 1
@@ -358,7 +357,7 @@ class LabelStudioProject(Project):
     def __localpath_to_lspath(cls, localpath: str) -> str:
         # Transform the local path into Label Studio's bespoke path format.
         relpath = Path(localpath).relative_to(Config.get().home)
-        return f'/data/local-files/?d={
+        return f'/data/local-files/?d={relpath}'

     def __delete_stale_tasks(
         self, existing_tasks: dict[tuple, dict], row_ids_in_pxt: set[tuple], tasks_created: int
@@ -405,7 +404,8 @@ class LabelStudioProject(Project):
         updates = [{'_rowid': rowid, local_annotations_col.name: ann} for rowid, ann in annotations.items()]
         if len(updates) > 0:
             _logger.info(
-                f'Updating table {t._name!r}, column {local_annotations_col.name!r} with {len(updates)} total annotations.'
+                f'Updating table {t._name!r}, column {local_annotations_col.name!r} '
+                f'with {len(updates)} total annotations.'
             )
             # batch_update currently doesn't propagate from views to base tables. As a workaround, we call
             # batch_update on the actual ancestor table that holds the annotations column.
@@ -451,7 +451,7 @@ class LabelStudioProject(Project):
        Parses a Label Studio XML config, extracting the names and Pixeltable types of
        all input variables.
        """
-        root: ElementTree.Element = ElementTree.fromstring(xml_config)
+        root: ET.Element = ET.fromstring(xml_config)
         if root.tag.lower() != 'view':
             raise excs.Error('Root of Label Studio config must be a `View`')
         config = _LabelStudioConfig(
@@ -461,7 +461,7 @@ class LabelStudioProject(Project):
         return config

     @classmethod
-    def __parse_data_keys_config(cls, root: ElementTree.Element) -> dict[str, '_DataKey']:
+    def __parse_data_keys_config(cls, root: ET.Element) -> dict[str, '_DataKey']:
         """Parses the data keys from a Label Studio XML config."""
         config: dict[str, '_DataKey'] = {}
         for element in root:
@@ -477,7 +477,7 @@ class LabelStudioProject(Project):
         return config

     @classmethod
-    def __parse_rectangle_labels_config(cls, root: ElementTree.Element) -> dict[str, '_RectangleLabel']:
+    def __parse_rectangle_labels_config(cls, root: ET.Element) -> dict[str, '_RectangleLabel']:
         """Parses the RectangleLabels from a Label Studio XML config."""
         config: dict[str, '_RectangleLabel'] = {}
         for element in root:
@@ -534,7 +534,7 @@ class LabelStudioProject(Project):
         _label_studio_client().delete_project(self.project_id)
         env.Env.get().console_logger.info(f'Deleted Label Studio project: {title}')

-    def __eq__(self, other) -> bool:
+    def __eq__(self, other: object) -> bool:
         return isinstance(other, LabelStudioProject) and self.project_id == other.project_id

     def __hash__(self) -> int:
@@ -576,7 +576,7 @@ class LabelStudioProject(Project):
             local_annotations_column = ANNOTATIONS_COLUMN
         else:
             local_annotations_column = next(k for k, v in col_mapping.items() if v == ANNOTATIONS_COLUMN)
-        if local_annotations_column not in t._schema.keys():
+        if local_annotations_column not in t._schema:
             t.add_columns({local_annotations_column: pxt.JsonType(nullable=True)})

         resolved_col_mapping = cls.validate_columns(
@@ -591,9 +591,9 @@ class LabelStudioProject(Project):
         if media_import_method != 'url':
             raise excs.Error("`s3_configuration` is only valid when `media_import_method == 'url'`")
         s3_configuration = copy.copy(s3_configuration)
-        if not 'bucket' in s3_configuration:
+        if 'bucket' not in s3_configuration:
             raise excs.Error('`s3_configuration` must contain a `bucket` field')
-        if not 'title' in s3_configuration:
+        if 'title' not in s3_configuration:
             s3_configuration['title'] = 'Pixeltable-S3-Import-Storage'
         if (
             'aws_access_key_id' not in s3_configuration
@@ -633,7 +633,8 @@ class LabelStudioProject(Project):
                 raise excs.Error(
                     '`media_import_method` is set to `file`, but your Label Studio server is not configured '
                     'for local file storage.\nPlease set the `LABEL_STUDIO_LOCAL_FILES_SERVING_ENABLED` '
-                    'environment variable to `true` in the environment where your Label Studio server is running.'
+                    'environment variable to `true` in the environment where your Label Studio server '
+                    'is running.'
                 ) from exc
             raise  # Handle any other exception type normally

@@ -663,7 +664,7 @@ class _LabelStudioConfig:
     rectangle_labels: dict[str, _RectangleLabel]

     def validate(self) -> None:
-        data_key_names = set(key.name for key in self.data_keys.values() if key.name is not None)
+        data_key_names = {key.name for key in self.data_keys.values() if key.name is not None}
         for name, rl in self.rectangle_labels.items():
             if rl.to_name not in data_key_names:
                 raise excs.Error(
@@ -674,7 +675,7 @@ class _LabelStudioConfig:
     @property
     def export_columns(self) -> dict[str, pxt.ColumnType]:
         data_key_cols = {key_id: key_info.column_type for key_id, key_info in self.data_keys.items()}
-        rl_cols = {name: pxt.JsonType() for name in self.rectangle_labels.keys()}
+        rl_cols = {name: pxt.JsonType() for name in self.rectangle_labels}
         return {**data_key_cols, **rl_cols}
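Most of these hunks are mechanical cleanups (string-literal splits, membership tests, type annotations); the recurring substantive one is the ElementTree import alias. A self-contained sketch of the XML-config parsing these hunks touch, with an invented config for illustration:

from xml.etree import ElementTree as ET

xml_config = '''
<View>
  <Image name="frame" value="$frame"/>
  <RectangleLabels name="label" toName="frame">
    <Label value="Defect"/>
  </RectangleLabels>
</View>
'''

root: ET.Element = ET.fromstring(xml_config)
# LabelStudioProject requires the root element to be a <View>, as in the hunk above.
if root.tag.lower() != 'view':
    raise ValueError('Root of Label Studio config must be a `View`')
for element in root:
    print(element.tag, element.attrib)  # Image, RectangleLabels, ...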
pixeltable/io/pandas.py
CHANGED
@@ -1,3 +1,4 @@
+import os
 from typing import Any, Optional, Union

 import numpy as np
@@ -7,9 +8,6 @@ from pandas.api.types import is_datetime64_any_dtype, is_extension_array_dtype

 import pixeltable as pxt
 import pixeltable.exceptions as excs
-from pixeltable import Table
-
-from .utils import find_or_create_table, normalize_import_parameters, normalize_schema_names


 def import_pandas(
@@ -43,30 +41,24 @@ def import_pandas(
     Returns:
         A handle to the newly created [`Table`][pixeltable.Table].
     """
-
-
-
-
-
-
-
-    tbl_rows = [__df_row_to_pxt_row(row, pd_schema, col_mapping) for row in df.itertuples()]
-
-    table = find_or_create_table(
-        tbl_name, schema, primary_key=pxt_pk, num_retained_versions=num_retained_versions, comment=comment
+    return pxt.create_table(
+        tbl_name,
+        source=df,
+        schema_overrides=schema_overrides,
+        primary_key=primary_key,
+        num_retained_versions=num_retained_versions,
+        comment=comment,
     )
-    table.insert(tbl_rows)
-    return table


 def import_csv(
     tbl_name: str,
-    filepath_or_buffer,
+    filepath_or_buffer: Union[str, os.PathLike],
     schema_overrides: Optional[dict[str, Any]] = None,
     primary_key: Optional[Union[str, list[str]]] = None,
     num_retained_versions: int = 10,
     comment: str = '',
-    **kwargs,
+    **kwargs: Any,
 ) -> pxt.Table:
     """
     Creates a new base table from a csv file. This is a convenience method and is equivalent
@@ -77,26 +69,26 @@ def import_csv(
     Returns:
         A handle to the newly created [`Table`][pixeltable.Table].
     """
-
-    return import_pandas(
+    return pxt.create_table(
         tbl_name,
-
+        source=filepath_or_buffer,
         schema_overrides=schema_overrides,
         primary_key=primary_key,
         num_retained_versions=num_retained_versions,
         comment=comment,
+        extra_args=kwargs,
     )


 def import_excel(
     tbl_name: str,
-    io,
-
+    io: Union[str, os.PathLike],
+    *,
     schema_overrides: Optional[dict[str, Any]] = None,
     primary_key: Optional[Union[str, list[str]]] = None,
     num_retained_versions: int = 10,
     comment: str = '',
-    **kwargs,
+    **kwargs: Any,
 ) -> pxt.Table:
     """
     Creates a new base table from an Excel (.xlsx) file. This is a convenience method and is
@@ -107,18 +99,18 @@ def import_excel(
     Returns:
         A handle to the newly created [`Table`][pixeltable.Table].
     """
-
-    return import_pandas(
+    return pxt.create_table(
         tbl_name,
-
+        source=io,
         schema_overrides=schema_overrides,
         primary_key=primary_key,
         num_retained_versions=num_retained_versions,
         comment=comment,
+        extra_args=kwargs,
    )


-def __df_check_primary_key_values(df: pd.DataFrame, primary_key: list[str]) -> None:
+def _df_check_primary_key_values(df: pd.DataFrame, primary_key: list[str]) -> None:
     for pd_name in primary_key:
         # This can be faster for large DataFrames
         has_nulls = df[pd_name].count() < len(df)
@@ -146,15 +138,6 @@ def df_infer_schema(
     return pd_schema


-"""
-# Check if a datetime64[ns, UTC] dtype
-def is_datetime_tz_utc(x: Any) -> bool:
-    if isinstance(x, pd.Timestamp) and x.tzinfo is not None and str(x.tzinfo) == 'UTC':
-        return True
-    return pd.api.types.is_datetime64tz_dtype(x) and str(x).endswith('UTC]')
-"""
-
-
 def __pd_dtype_to_pxt_type(pd_dtype: DtypeObj, nullable: bool) -> Optional[pxt.ColumnType]:
     """
     Determines a pixeltable ColumnType from a pandas dtype
@@ -165,7 +148,8 @@ def __pd_dtype_to_pxt_type(pd_dtype: DtypeObj, nullable: bool) -> Optional[pxt.ColumnType]:
     Returns:
         pxt.ColumnType: A pixeltable ColumnType
     """
-    # Pandas extension arrays / types (Int64, boolean, string[pyarrow], etc.) are not directly compatible with NumPy dtypes
+    # Pandas extension arrays / types (Int64, boolean, string[pyarrow], etc.) are not directly
+    # compatible with NumPy dtypes
     # The timezone-aware datetime64[ns, tz=] dtype is a pandas extension dtype
     if is_datetime64_any_dtype(pd_dtype):
         return pxt.TimestampType(nullable=nullable)
@@ -204,32 +188,35 @@ def __pd_coltype_to_pxt_type(pd_dtype: DtypeObj, data_col: pd.Series, nullable: bool) -> Optional[pxt.ColumnType]:
     raise excs.Error(f'Could not infer Pixeltable type of column: {data_col.name} (dtype: {pd_dtype})')


-def __df_row_to_pxt_row(
+def _df_row_to_pxt_row(
     row: tuple[Any, ...], schema: dict[str, pxt.ColumnType], col_mapping: Optional[dict[str, str]]
 ) -> dict[str, Any]:
     """Convert a row to insertable format"""
     pxt_row: dict[str, Any] = {}
     for val, (col_name, pxt_type) in zip(row[1:], schema.items()):
+        pxt_name = col_mapping.get(col_name, col_name)
+        nval: Any
         if pxt_type.is_float_type():
-
+            nval = float(val)
         elif isinstance(val, float) and np.isnan(val):
             # pandas uses NaN for empty cells, even for types other than float;
             # for any type but a float, convert these to None
-
+            nval = None
         elif pxt_type.is_int_type():
-
+            nval = int(val)
         elif pxt_type.is_bool_type():
-
+            nval = bool(val)
         elif pxt_type.is_string_type():
-
+            nval = str(val)
         elif pxt_type.is_timestamp_type():
             if pd.isnull(val):
                 # pandas has the bespoke 'NaT' type for a missing timestamp; postgres is very
                 # much not-ok with it. (But if we convert it to None and then load out the
                 # table contents as a pandas DataFrame, it will correctly restore the 'NaT'!)
-
+                nval = None
             else:
-
-
-
+                nval = pd.Timestamp(val).to_pydatetime()
+        else:
+            nval = val
+        pxt_row[pxt_name] = nval
     return pxt_row
pixeltable/io/parquet.py
CHANGED
@@ -4,7 +4,6 @@ import datetime
 import io
 import json
 import logging
-import random
 import typing
 from collections import deque
 from pathlib import Path
@@ -14,12 +13,10 @@ import numpy as np
 import PIL.Image

 import pixeltable as pxt
-import pixeltable.exceptions as exc
+import pixeltable.exceptions as excs
 from pixeltable.env import Env
 from pixeltable.utils.transactional_directory import transactional_directory

-from .utils import normalize_import_parameters, normalize_schema_names
-
 if typing.TYPE_CHECKING:
     import pyarrow as pa

@@ -78,7 +75,7 @@ def export_parquet(
     arrow_schema = to_arrow_schema(df.schema)

     if not inline_images and any(col_type.is_image_type() for col_type in df.schema.values()):
-        raise exc.Error('Cannot export Dataframe with image columns when inline_images is False')
+        raise excs.Error('Cannot export Dataframe with image columns when inline_images is False')

     # store the changes atomically
     with transactional_directory(parquet_path) as temp_path:
@@ -87,7 +84,7 @@
         json.dump(type_dict, (temp_path / '.pixeltable.column_types.json').open('w'))  # keep type metadata

         batch_num = 0
-        current_value_batch: dict[str, deque] = {k: deque() for k in df.schema.keys()}
+        current_value_batch: dict[str, deque] = {k: deque() for k in df.schema}
         current_byte_estimate = 0

         with Env.get().begin_xact():
@@ -111,7 +108,7 @@
                             val.save(buf, format='PNG')
                             val = buf.getvalue()
                         else:
-
+                            raise excs.Error(f'unknown image type {type(val)}')
                         length = len(val)
                     elif col_type.is_string_type():
                         length = len(val)
@@ -119,16 +116,14 @@
                         if data_row.file_paths is not None and data_row.file_paths[e.slot_idx] is not None:
                             val = data_row.file_paths[e.slot_idx]
                         else:
-
+                            raise excs.Error(f'unknown video type {type(val)}')
                         length = len(val)
                     elif col_type.is_json_type():
                         val = json.dumps(val)
                         length = len(val)
                     elif col_type.is_array_type():
                         length = val.nbytes
-                    elif col_type.is_int_type():
-                        length = 8
-                    elif col_type.is_float_type():
+                    elif col_type.is_int_type() or col_type.is_float_type():
                         length = 8
                     elif col_type.is_bool_type():
                         length = 1
@@ -136,7 +131,7 @@
                         val = val.astimezone(datetime.timezone.utc)
                         length = 8
                     else:
-
+                        raise excs.Error(f'unknown type {col_type} for {col_name}')

                     current_value_batch[col_name].append(val)
                     current_byte_estimate += length
@@ -144,7 +139,7 @@
                     assert batch_num < 100_000, 'wrote too many parquet files, unclear ordering'
                     _write_batch(current_value_batch, arrow_schema, temp_path / f'part-{batch_num:05d}.parquet')
                     batch_num += 1
-                    current_value_batch = {k: deque() for k in df.schema.keys()}
+                    current_value_batch = {k: deque() for k in df.schema}
                     current_byte_estimate = 0

         _write_batch(current_value_batch, arrow_schema, temp_path / f'part-{batch_num:05d}.parquet')
@@ -173,32 +168,12 @@ def import_parquet(
     Returns:
         A handle to the newly created table.
     """
-
-
-
-
-
-
-
-
-
-    schema, pxt_pk, col_mapping = normalize_schema_names(ar_schema, primary_key, schema_overrides, False)
-
-    if table in pxt.list_tables():
-        raise exc.Error(f'Table {table} already exists')
-
-    tmp_name = f'{table}_tmp_{random.randint(0, 100000000)}'
-    total_rows = 0
-    try:
-        tab = pxt.create_table(tmp_name, schema, primary_key=pxt_pk, **kwargs)
-        for fragment in parquet_dataset.fragments:  # type: ignore[attr-defined]
-            for batch in fragment.to_batches():
-                dict_batch = list(iter_tuples2(batch, col_mapping, schema))
-                total_rows += len(dict_batch)
-                tab.insert(dict_batch)
-    except Exception as e:
-        _logger.error(f'Error after inserting {total_rows} rows from Parquet file into table: {e}')
-        raise e
-
-    pxt.move(tmp_name, table)
-    return pxt.get_table(table)
+    value = kwargs.pop('source_format', None)
+    return pxt.create_table(
+        table,
+        source=parquet_path,
+        source_format=value,
+        schema_overrides=schema_overrides,
+        primary_key=primary_key,
+        extra_args=kwargs,
+    )
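import_parquet gets the same treatment, with one wrinkle: source_format is popped out of kwargs and forwarded explicitly, while the remaining kwargs become extra_args. A sketch of the resulting call surface (path and override names are illustrative; the full signature is not shown in this diff):

import pixeltable as pxt

t = pxt.io.import_parquet(
    'events',
    parquet_path='data/events.parquet',
    schema_overrides={'payload': pxt.Json},
)
# Per the new body, this roughly amounts to:
#   pxt.create_table('events', source='data/events.parquet', source_format=None,
#                    schema_overrides={'payload': pxt.Json}, primary_key=None, extra_args={})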