pixeltable 0.3.4__py3-none-any.whl → 0.3.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pixeltable might be problematic. Click here for more details.
- pixeltable/__init__.py +1 -0
- pixeltable/__version__.py +2 -2
- pixeltable/catalog/catalog.py +9 -2
- pixeltable/catalog/column.py +1 -1
- pixeltable/catalog/dir.py +1 -1
- pixeltable/catalog/table.py +3 -1
- pixeltable/catalog/table_version.py +12 -2
- pixeltable/catalog/table_version_path.py +2 -2
- pixeltable/catalog/view.py +64 -20
- pixeltable/dataframe.py +11 -6
- pixeltable/env.py +12 -0
- pixeltable/exec/expr_eval/evaluators.py +4 -2
- pixeltable/exec/expr_eval/expr_eval_node.py +4 -1
- pixeltable/exprs/comparison.py +8 -4
- pixeltable/exprs/data_row.py +9 -7
- pixeltable/exprs/expr.py +2 -2
- pixeltable/exprs/function_call.py +155 -313
- pixeltable/exprs/json_mapper.py +25 -8
- pixeltable/exprs/json_path.py +6 -5
- pixeltable/exprs/object_ref.py +16 -5
- pixeltable/exprs/row_builder.py +10 -3
- pixeltable/func/aggregate_function.py +29 -15
- pixeltable/func/callable_function.py +11 -8
- pixeltable/func/expr_template_function.py +3 -9
- pixeltable/func/function.py +148 -74
- pixeltable/func/signature.py +65 -30
- pixeltable/func/tools.py +26 -26
- pixeltable/func/udf.py +1 -1
- pixeltable/functions/__init__.py +1 -0
- pixeltable/functions/anthropic.py +9 -3
- pixeltable/functions/deepseek.py +121 -0
- pixeltable/functions/image.py +7 -7
- pixeltable/functions/openai.py +30 -13
- pixeltable/functions/video.py +14 -7
- pixeltable/globals.py +14 -3
- pixeltable/index/embedding_index.py +4 -13
- pixeltable/io/globals.py +88 -77
- pixeltable/io/hf_datasets.py +34 -34
- pixeltable/io/pandas.py +75 -76
- pixeltable/io/parquet.py +19 -27
- pixeltable/io/utils.py +115 -0
- pixeltable/iterators/audio.py +2 -1
- pixeltable/iterators/video.py +1 -1
- pixeltable/metadata/__init__.py +2 -1
- pixeltable/metadata/converters/convert_15.py +18 -8
- pixeltable/metadata/converters/convert_27.py +31 -0
- pixeltable/metadata/converters/convert_28.py +15 -0
- pixeltable/metadata/converters/convert_29.py +111 -0
- pixeltable/metadata/converters/util.py +12 -1
- pixeltable/metadata/notes.py +3 -0
- pixeltable/metadata/schema.py +8 -0
- pixeltable/share/__init__.py +1 -0
- pixeltable/share/packager.py +41 -13
- pixeltable/share/publish.py +97 -0
- pixeltable/type_system.py +40 -14
- pixeltable/utils/__init__.py +41 -0
- pixeltable/utils/arrow.py +40 -7
- pixeltable/utils/formatter.py +1 -1
- {pixeltable-0.3.4.dist-info → pixeltable-0.3.6.dist-info}/METADATA +34 -49
- {pixeltable-0.3.4.dist-info → pixeltable-0.3.6.dist-info}/RECORD +63 -57
- {pixeltable-0.3.4.dist-info → pixeltable-0.3.6.dist-info}/WHEEL +1 -1
- {pixeltable-0.3.4.dist-info → pixeltable-0.3.6.dist-info}/LICENSE +0 -0
- {pixeltable-0.3.4.dist-info → pixeltable-0.3.6.dist-info}/entry_points.txt +0 -0
|
@@ -99,10 +99,10 @@ class EmbeddingIndex(IndexBase):
|
|
|
99
99
|
# Now validate the return types of the embedding functions.
|
|
100
100
|
|
|
101
101
|
if self.string_embed is not None:
|
|
102
|
-
self._validate_embedding_fn(self.string_embed
|
|
102
|
+
self._validate_embedding_fn(self.string_embed)
|
|
103
103
|
|
|
104
104
|
if self.image_embed is not None:
|
|
105
|
-
self._validate_embedding_fn(self.image_embed
|
|
105
|
+
self._validate_embedding_fn(self.image_embed)
|
|
106
106
|
|
|
107
107
|
if c.col_type.is_string_type() and self.string_embed is None:
|
|
108
108
|
raise excs.Error(f"Text embedding function is required for column {c.name} (parameter 'string_embed')")
|
|
@@ -206,21 +206,12 @@ class EmbeddingIndex(IndexBase):
|
|
|
206
206
|
return None
|
|
207
207
|
|
|
208
208
|
@classmethod
|
|
209
|
-
def _validate_embedding_fn(cls, embed_fn: func.Function
|
|
209
|
+
def _validate_embedding_fn(cls, embed_fn: func.Function) -> None:
|
|
210
210
|
"""Validate the given embedding function."""
|
|
211
211
|
assert not embed_fn.is_polymorphic
|
|
212
|
-
sig = embed_fn.signature
|
|
213
212
|
|
|
214
|
-
|
|
215
|
-
param_name = sig.parameters_by_pos[0].name
|
|
216
|
-
if expected_type == ts.ColumnType.Type.STRING:
|
|
217
|
-
return_type = embed_fn.call_return_type([], {param_name: 'dummy'})
|
|
218
|
-
else:
|
|
219
|
-
assert expected_type == ts.ColumnType.Type.IMAGE
|
|
220
|
-
img = PIL.Image.new('RGB', (512, 512))
|
|
221
|
-
return_type = embed_fn.call_return_type([], {param_name: img})
|
|
213
|
+
return_type = embed_fn.signature.return_type
|
|
222
214
|
|
|
223
|
-
assert return_type is not None
|
|
224
215
|
if not isinstance(return_type, ts.ArrayType):
|
|
225
216
|
raise excs.Error(
|
|
226
217
|
f'The function `{embed_fn.name}` is not a valid embedding: '
|
pixeltable/io/globals.py
CHANGED
|
@@ -1,3 +1,7 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import urllib.parse
|
|
3
|
+
import urllib.request
|
|
4
|
+
from pathlib import Path
|
|
1
5
|
from typing import TYPE_CHECKING, Any, Literal, Optional, Union
|
|
2
6
|
|
|
3
7
|
import pixeltable as pxt
|
|
@@ -5,11 +9,61 @@ import pixeltable.exceptions as excs
|
|
|
5
9
|
from pixeltable import Table, exprs
|
|
6
10
|
from pixeltable.env import Env
|
|
7
11
|
from pixeltable.io.external_store import SyncStatus
|
|
12
|
+
from pixeltable.utils import parse_local_file_path
|
|
8
13
|
|
|
9
14
|
if TYPE_CHECKING:
|
|
10
15
|
import fiftyone as fo # type: ignore[import-untyped]
|
|
11
16
|
|
|
12
17
|
|
|
18
|
+
from .utils import find_or_create_table, normalize_import_parameters, normalize_schema_names
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def _infer_schema_from_rows(
|
|
22
|
+
rows: list[dict[str, Any]], schema_overrides: dict[str, Any], primary_key: list[str]
|
|
23
|
+
) -> dict[str, pxt.ColumnType]:
|
|
24
|
+
schema: dict[str, pxt.ColumnType] = {}
|
|
25
|
+
cols_with_nones: set[str] = set()
|
|
26
|
+
|
|
27
|
+
for n, row in enumerate(rows):
|
|
28
|
+
for col_name, value in row.items():
|
|
29
|
+
if col_name in schema_overrides:
|
|
30
|
+
# We do the insertion here; this will ensure that the column order matches the order
|
|
31
|
+
# in which the column names are encountered in the input data, even if `schema_overrides`
|
|
32
|
+
# is specified.
|
|
33
|
+
if col_name not in schema:
|
|
34
|
+
schema[col_name] = schema_overrides[col_name]
|
|
35
|
+
elif value is not None:
|
|
36
|
+
# If `key` is not in `schema_overrides`, then we infer its type from the data.
|
|
37
|
+
# The column type will always be nullable by default.
|
|
38
|
+
col_type = pxt.ColumnType.infer_literal_type(value, nullable=col_name not in primary_key)
|
|
39
|
+
if col_type is None:
|
|
40
|
+
raise excs.Error(
|
|
41
|
+
f'Could not infer type for column `{col_name}`; the value in row {n} has an unsupported type: {type(value)}'
|
|
42
|
+
)
|
|
43
|
+
if col_name not in schema:
|
|
44
|
+
schema[col_name] = col_type
|
|
45
|
+
else:
|
|
46
|
+
supertype = schema[col_name].supertype(col_type)
|
|
47
|
+
if supertype is None:
|
|
48
|
+
raise excs.Error(
|
|
49
|
+
f'Could not infer type of column `{col_name}`; the value in row {n} does not match preceding type {schema[col_name]}: {value!r}\n'
|
|
50
|
+
'Consider specifying the type explicitly in `schema_overrides`.'
|
|
51
|
+
)
|
|
52
|
+
schema[col_name] = supertype
|
|
53
|
+
else:
|
|
54
|
+
cols_with_nones.add(col_name)
|
|
55
|
+
|
|
56
|
+
entirely_none_cols = cols_with_nones - schema.keys()
|
|
57
|
+
if len(entirely_none_cols) > 0:
|
|
58
|
+
# A column can only end up in `entirely_none_cols` if it was not in `schema_overrides` and
|
|
59
|
+
# was not encountered in any row with a non-None value.
|
|
60
|
+
raise excs.Error(
|
|
61
|
+
f'The following columns have no non-null values: {", ".join(entirely_none_cols)}\n'
|
|
62
|
+
'Consider specifying the type(s) explicitly in `schema_overrides`.'
|
|
63
|
+
)
|
|
64
|
+
return schema
|
|
65
|
+
|
|
66
|
+
|
|
13
67
|
def create_label_studio_project(
|
|
14
68
|
t: Table,
|
|
15
69
|
label_config: str,
|
|
@@ -140,7 +194,7 @@ def import_rows(
|
|
|
140
194
|
tbl_path: str,
|
|
141
195
|
rows: list[dict[str, Any]],
|
|
142
196
|
*,
|
|
143
|
-
schema_overrides: Optional[dict[str,
|
|
197
|
+
schema_overrides: Optional[dict[str, Any]] = None,
|
|
144
198
|
primary_key: Optional[Union[str, list[str]]] = None,
|
|
145
199
|
num_retained_versions: int = 10,
|
|
146
200
|
comment: str = '',
|
|
@@ -169,67 +223,22 @@ def import_rows(
|
|
|
169
223
|
Returns:
|
|
170
224
|
A handle to the newly created [`Table`][pixeltable.Table].
|
|
171
225
|
"""
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
schema
|
|
175
|
-
cols_with_nones: set[str] = set()
|
|
226
|
+
schema_overrides, primary_key = normalize_import_parameters(schema_overrides, primary_key)
|
|
227
|
+
row_schema = _infer_schema_from_rows(rows, schema_overrides, primary_key)
|
|
228
|
+
schema, pxt_pk, _ = normalize_schema_names(row_schema, primary_key, schema_overrides, True)
|
|
176
229
|
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
if col_name in schema_overrides:
|
|
180
|
-
# We do the insertion here; this will ensure that the column order matches the order
|
|
181
|
-
# in which the column names are encountered in the input data, even if `schema_overrides`
|
|
182
|
-
# is specified.
|
|
183
|
-
if col_name not in schema:
|
|
184
|
-
schema[col_name] = schema_overrides[col_name]
|
|
185
|
-
elif value is not None:
|
|
186
|
-
# If `key` is not in `schema_overrides`, then we infer its type from the data.
|
|
187
|
-
# The column type will always be nullable by default.
|
|
188
|
-
col_type = pxt.ColumnType.infer_literal_type(value, nullable=True)
|
|
189
|
-
if col_type is None:
|
|
190
|
-
raise excs.Error(
|
|
191
|
-
f'Could not infer type for column `{col_name}`; the value in row {n} has an unsupported type: {type(value)}'
|
|
192
|
-
)
|
|
193
|
-
if col_name not in schema:
|
|
194
|
-
schema[col_name] = col_type
|
|
195
|
-
else:
|
|
196
|
-
supertype = schema[col_name].supertype(col_type)
|
|
197
|
-
if supertype is None:
|
|
198
|
-
raise excs.Error(
|
|
199
|
-
f'Could not infer type of column `{col_name}`; the value in row {n} does not match preceding type {schema[col_name]}: {value!r}\n'
|
|
200
|
-
'Consider specifying the type explicitly in `schema_overrides`.'
|
|
201
|
-
)
|
|
202
|
-
schema[col_name] = supertype
|
|
203
|
-
else:
|
|
204
|
-
cols_with_nones.add(col_name)
|
|
205
|
-
|
|
206
|
-
extraneous_keys = schema_overrides.keys() - schema.keys()
|
|
207
|
-
if len(extraneous_keys) > 0:
|
|
208
|
-
raise excs.Error(
|
|
209
|
-
f'The following columns specified in `schema_overrides` are not present in the data: {", ".join(extraneous_keys)}'
|
|
210
|
-
)
|
|
211
|
-
|
|
212
|
-
entirely_none_cols = cols_with_nones - schema.keys()
|
|
213
|
-
if len(entirely_none_cols) > 0:
|
|
214
|
-
# A column can only end up in `entirely_null_cols` if it was not in `schema_overrides` and
|
|
215
|
-
# was not encountered in any row with a non-None value.
|
|
216
|
-
raise excs.Error(
|
|
217
|
-
f'The following columns have no non-null values: {", ".join(entirely_none_cols)}\n'
|
|
218
|
-
'Consider specifying the type(s) explicitly in `schema_overrides`.'
|
|
219
|
-
)
|
|
220
|
-
|
|
221
|
-
t = pxt.create_table(
|
|
222
|
-
tbl_path, schema, primary_key=primary_key, num_retained_versions=num_retained_versions, comment=comment
|
|
230
|
+
table = find_or_create_table(
|
|
231
|
+
tbl_path, schema, primary_key=pxt_pk, num_retained_versions=num_retained_versions, comment=comment
|
|
223
232
|
)
|
|
224
|
-
|
|
225
|
-
return
|
|
233
|
+
table.insert(rows)
|
|
234
|
+
return table
|
|
226
235
|
|
|
227
236
|
|
|
228
237
|
def import_json(
|
|
229
238
|
tbl_path: str,
|
|
230
239
|
filepath_or_url: str,
|
|
231
240
|
*,
|
|
232
|
-
schema_overrides: Optional[dict[str,
|
|
241
|
+
schema_overrides: Optional[dict[str, Any]] = None,
|
|
233
242
|
primary_key: Optional[Union[str, list[str]]] = None,
|
|
234
243
|
num_retained_versions: int = 10,
|
|
235
244
|
comment: str = '',
|
|
@@ -253,33 +262,35 @@ def import_json(
|
|
|
253
262
|
Returns:
|
|
254
263
|
A handle to the newly created [`Table`][pixeltable.Table].
|
|
255
264
|
"""
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
if len(parsed.scheme) <= 1 or parsed.scheme == 'file':
|
|
263
|
-
# local file path
|
|
264
|
-
if len(parsed.scheme) <= 1:
|
|
265
|
-
filepath = filepath_or_url
|
|
266
|
-
else:
|
|
267
|
-
filepath = urllib.parse.unquote(urllib.request.url2pathname(parsed.path))
|
|
268
|
-
with open(filepath) as fp:
|
|
265
|
+
path = parse_local_file_path(filepath_or_url)
|
|
266
|
+
if path is None: # it's a URL
|
|
267
|
+
# TODO: This should read from S3 as well.
|
|
268
|
+
contents = urllib.request.urlopen(filepath_or_url).read()
|
|
269
|
+
else:
|
|
270
|
+
with open(path) as fp:
|
|
269
271
|
contents = fp.read()
|
|
272
|
+
|
|
273
|
+
rows = json.loads(contents, **kwargs)
|
|
274
|
+
|
|
275
|
+
schema_overrides, primary_key = normalize_import_parameters(schema_overrides, primary_key)
|
|
276
|
+
row_schema = _infer_schema_from_rows(rows, schema_overrides, primary_key)
|
|
277
|
+
schema, pxt_pk, col_mapping = normalize_schema_names(row_schema, primary_key, schema_overrides, False)
|
|
278
|
+
|
|
279
|
+
# Convert all rows to insertable format - not needed, misnamed columns and types are errors in the incoming row format
|
|
280
|
+
if col_mapping is not None:
|
|
281
|
+
tbl_rows = [
|
|
282
|
+
{field if col_mapping is None else col_mapping[field]: val for field, val in row.items()} for row in rows
|
|
283
|
+
]
|
|
270
284
|
else:
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
tbl_path,
|
|
276
|
-
data,
|
|
277
|
-
schema_overrides=schema_overrides,
|
|
278
|
-
primary_key=primary_key,
|
|
279
|
-
num_retained_versions=num_retained_versions,
|
|
280
|
-
comment=comment,
|
|
285
|
+
tbl_rows = rows
|
|
286
|
+
|
|
287
|
+
table = find_or_create_table(
|
|
288
|
+
tbl_path, schema, primary_key=pxt_pk, num_retained_versions=num_retained_versions, comment=comment
|
|
281
289
|
)
|
|
282
290
|
|
|
291
|
+
table.insert(tbl_rows)
|
|
292
|
+
return table
|
|
293
|
+
|
|
283
294
|
|
|
284
295
|
def export_images_as_fo_dataset(
|
|
285
296
|
tbl: pxt.Table,
|
pixeltable/io/hf_datasets.py
CHANGED
|
@@ -10,6 +10,8 @@ import pixeltable as pxt
|
|
|
10
10
|
import pixeltable.type_system as ts
|
|
11
11
|
from pixeltable import exceptions as excs
|
|
12
12
|
|
|
13
|
+
from .utils import normalize_import_parameters, normalize_schema_names
|
|
14
|
+
|
|
13
15
|
if typing.TYPE_CHECKING:
|
|
14
16
|
import datasets # type: ignore[import-untyped]
|
|
15
17
|
|
|
@@ -28,29 +30,33 @@ _hf_to_pxt: dict[str, ts.ColumnType] = {
|
|
|
28
30
|
'int64': ts.IntType(nullable=True),
|
|
29
31
|
'bool': ts.BoolType(nullable=True),
|
|
30
32
|
'float32': ts.FloatType(nullable=True),
|
|
33
|
+
'float64': ts.FloatType(nullable=True),
|
|
34
|
+
'large_string': ts.StringType(nullable=True),
|
|
31
35
|
'string': ts.StringType(nullable=True),
|
|
32
36
|
'timestamp[s]': ts.TimestampType(nullable=True),
|
|
33
37
|
'timestamp[ms]': ts.TimestampType(nullable=True), # HF dataset iterator converts timestamps to datetime.datetime
|
|
38
|
+
'timestamp[us]': ts.TimestampType(nullable=True),
|
|
34
39
|
}
|
|
35
40
|
|
|
36
41
|
|
|
37
|
-
def _to_pixeltable_type(feature_type: Any) -> Optional[ts.ColumnType]:
|
|
42
|
+
def _to_pixeltable_type(feature_type: Any, nullable: bool) -> Optional[ts.ColumnType]:
|
|
38
43
|
"""Convert a huggingface feature type to a pixeltable ColumnType if one is defined."""
|
|
39
44
|
import datasets
|
|
40
45
|
|
|
41
46
|
if isinstance(feature_type, datasets.ClassLabel):
|
|
42
47
|
# enum, example: ClassLabel(names=['neg', 'pos'], id=None)
|
|
43
|
-
return ts.StringType(nullable=
|
|
48
|
+
return ts.StringType(nullable=nullable)
|
|
44
49
|
elif isinstance(feature_type, datasets.Value):
|
|
45
50
|
# example: Value(dtype='int64', id=None)
|
|
46
|
-
|
|
51
|
+
pt = _hf_to_pxt.get(feature_type.dtype, None)
|
|
52
|
+
return pt.copy(nullable=nullable) if pt is not None else None
|
|
47
53
|
elif isinstance(feature_type, datasets.Sequence):
|
|
48
54
|
# example: cohere wiki. Sequence(feature=Value(dtype='float32', id=None), length=-1, id=None)
|
|
49
|
-
dtype = _to_pixeltable_type(feature_type.feature)
|
|
55
|
+
dtype = _to_pixeltable_type(feature_type.feature, nullable)
|
|
50
56
|
length = feature_type.length if feature_type.length != -1 else None
|
|
51
57
|
return ts.ArrayType(shape=(length,), dtype=dtype)
|
|
52
58
|
elif isinstance(feature_type, datasets.Image):
|
|
53
|
-
return ts.ImageType(nullable=
|
|
59
|
+
return ts.ImageType(nullable=nullable)
|
|
54
60
|
else:
|
|
55
61
|
return None
|
|
56
62
|
|
|
@@ -63,15 +69,17 @@ def _get_hf_schema(dataset: Union[datasets.Dataset, datasets.DatasetDict]) -> da
|
|
|
63
69
|
return first_dataset.features
|
|
64
70
|
|
|
65
71
|
|
|
66
|
-
def
|
|
67
|
-
|
|
72
|
+
def huggingface_schema_to_pxt_schema(
|
|
73
|
+
hf_schema: datasets.Features, schema_overrides: dict[str, Any], primary_key: list[str]
|
|
68
74
|
) -> dict[str, Optional[ts.ColumnType]]:
|
|
69
75
|
"""Generate a pixeltable schema from a huggingface dataset schema.
|
|
70
76
|
Columns without a known mapping are mapped to None
|
|
71
77
|
"""
|
|
72
|
-
hf_schema = _get_hf_schema(hf_dataset)
|
|
73
78
|
pixeltable_schema = {
|
|
74
|
-
column_name: _to_pixeltable_type(feature_type
|
|
79
|
+
column_name: _to_pixeltable_type(feature_type, column_name not in primary_key)
|
|
80
|
+
if column_name not in schema_overrides
|
|
81
|
+
else schema_overrides[column_name]
|
|
82
|
+
for column_name, feature_type in hf_schema.items()
|
|
75
83
|
}
|
|
76
84
|
return pixeltable_schema
|
|
77
85
|
|
|
@@ -82,6 +90,7 @@ def import_huggingface_dataset(
|
|
|
82
90
|
*,
|
|
83
91
|
column_name_for_split: Optional[str] = None,
|
|
84
92
|
schema_overrides: Optional[dict[str, Any]] = None,
|
|
93
|
+
primary_key: Optional[Union[str, list[str]]] = None,
|
|
85
94
|
**kwargs: Any,
|
|
86
95
|
) -> pxt.Table:
|
|
87
96
|
"""Create a new base table from a Huggingface dataset, or dataset dict with multiple splits.
|
|
@@ -97,6 +106,7 @@ def import_huggingface_dataset(
|
|
|
97
106
|
name `name` will be given type `type`, instead of being inferred from the `Dataset` or `DatasetDict`. The keys in
|
|
98
107
|
`schema_overrides` should be the column names of the `Dataset` or `DatasetDict` (whether or not they are valid
|
|
99
108
|
Pixeltable identifiers).
|
|
109
|
+
primary_key: The primary key of the table (see [`create_table()`][pixeltable.create_table]).
|
|
100
110
|
kwargs: Additional arguments to pass to `create_table`.
|
|
101
111
|
|
|
102
112
|
Returns:
|
|
@@ -106,57 +116,47 @@ def import_huggingface_dataset(
|
|
|
106
116
|
|
|
107
117
|
import pixeltable as pxt
|
|
108
118
|
|
|
109
|
-
if table_path in pxt.list_tables():
|
|
110
|
-
raise excs.Error(f'table {table_path} already exists')
|
|
111
|
-
|
|
112
119
|
if not isinstance(dataset, (datasets.Dataset, datasets.DatasetDict)):
|
|
113
120
|
raise excs.Error(f'`type(dataset)` must be `datasets.Dataset` or `datasets.DatasetDict`. Got {type(dataset)=}')
|
|
114
121
|
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
dataset_dict = {split_name: dataset}
|
|
120
|
-
else:
|
|
121
|
-
dataset_dict = dataset
|
|
122
|
-
|
|
123
|
-
pixeltable_schema = huggingface_schema_to_pixeltable_schema(dataset)
|
|
124
|
-
if schema_overrides is not None:
|
|
125
|
-
pixeltable_schema.update(schema_overrides)
|
|
122
|
+
# Create the pixeltable schema from the huggingface schema
|
|
123
|
+
hf_schema_source = _get_hf_schema(dataset)
|
|
124
|
+
schema_overrides, primary_key = normalize_import_parameters(schema_overrides, primary_key)
|
|
125
|
+
hf_schema = huggingface_schema_to_pxt_schema(hf_schema_source, schema_overrides, primary_key)
|
|
126
126
|
|
|
127
|
+
# Add the split column to the schema if requested
|
|
127
128
|
if column_name_for_split is not None:
|
|
128
|
-
if column_name_for_split in
|
|
129
|
+
if column_name_for_split in hf_schema:
|
|
129
130
|
raise excs.Error(
|
|
130
131
|
f'Column name `{column_name_for_split}` already exists in dataset schema; provide a different `column_name_for_split`'
|
|
131
132
|
)
|
|
132
|
-
|
|
133
|
+
hf_schema[column_name_for_split] = ts.StringType(nullable=True)
|
|
133
134
|
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
135
|
+
schema, pxt_pk, _ = normalize_schema_names(hf_schema, primary_key, schema_overrides, True)
|
|
136
|
+
|
|
137
|
+
# Prepare to create table and insert data
|
|
138
|
+
if table_path in pxt.list_tables():
|
|
139
|
+
raise excs.Error(f'table {table_path} already exists')
|
|
137
140
|
|
|
138
141
|
if isinstance(dataset, datasets.Dataset):
|
|
139
142
|
# when loading an hf dataset partially, dataset.split._name is sometimes the form "train[0:1000]"
|
|
140
143
|
raw_name = dataset.split._name
|
|
141
144
|
split_name = raw_name.split('[')[0] if raw_name is not None else None
|
|
142
145
|
dataset_dict = {split_name: dataset}
|
|
143
|
-
elif isinstance(dataset, datasets.DatasetDict):
|
|
144
|
-
dataset_dict = dataset
|
|
145
146
|
else:
|
|
146
|
-
|
|
147
|
+
dataset_dict = dataset
|
|
147
148
|
|
|
148
149
|
# extract all class labels from the dataset to translate category ints to strings
|
|
149
|
-
hf_schema = _get_hf_schema(dataset)
|
|
150
150
|
categorical_features = {
|
|
151
151
|
feature_name: feature_type.names
|
|
152
|
-
for (feature_name, feature_type) in
|
|
152
|
+
for (feature_name, feature_type) in hf_schema_source.items()
|
|
153
153
|
if isinstance(feature_type, datasets.ClassLabel)
|
|
154
154
|
}
|
|
155
155
|
|
|
156
156
|
try:
|
|
157
157
|
# random tmp name
|
|
158
158
|
tmp_name = f'{table_path}_tmp_{random.randint(0, 100000000)}'
|
|
159
|
-
tab = pxt.create_table(tmp_name,
|
|
159
|
+
tab = pxt.create_table(tmp_name, schema, primary_key=pxt_pk, **kwargs)
|
|
160
160
|
|
|
161
161
|
def _translate_row(row: dict[str, Any], split_name: str) -> dict[str, Any]:
|
|
162
162
|
output_row = row.copy()
|
pixeltable/io/pandas.py
CHANGED
|
@@ -2,17 +2,21 @@ from typing import Any, Optional, Union
|
|
|
2
2
|
|
|
3
3
|
import numpy as np
|
|
4
4
|
import pandas as pd
|
|
5
|
+
from pandas._typing import DtypeObj # For pandas dtype type hints
|
|
6
|
+
from pandas.api.types import is_datetime64_any_dtype, is_extension_array_dtype
|
|
5
7
|
|
|
6
8
|
import pixeltable as pxt
|
|
7
9
|
import pixeltable.exceptions as excs
|
|
8
|
-
|
|
10
|
+
from pixeltable import Table
|
|
11
|
+
|
|
12
|
+
from .utils import find_or_create_table, normalize_import_parameters, normalize_schema_names
|
|
9
13
|
|
|
10
14
|
|
|
11
15
|
def import_pandas(
|
|
12
16
|
tbl_name: str,
|
|
13
17
|
df: pd.DataFrame,
|
|
14
18
|
*,
|
|
15
|
-
schema_overrides: Optional[dict[str,
|
|
19
|
+
schema_overrides: Optional[dict[str, Any]] = None,
|
|
16
20
|
primary_key: Optional[Union[str, list[str]]] = None,
|
|
17
21
|
num_retained_versions: int = 10,
|
|
18
22
|
comment: str = '',
|
|
@@ -39,16 +43,16 @@ def import_pandas(
|
|
|
39
43
|
Returns:
|
|
40
44
|
A handle to the newly created [`Table`][pixeltable.Table].
|
|
41
45
|
"""
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
table =
|
|
46
|
+
schema_overrides, primary_key = normalize_import_parameters(schema_overrides, primary_key)
|
|
47
|
+
pd_schema = df_infer_schema(df, schema_overrides, primary_key)
|
|
48
|
+
schema, pxt_pk, col_mapping = normalize_schema_names(pd_schema, primary_key, schema_overrides, False)
|
|
49
|
+
|
|
50
|
+
__check_primary_key_values(df, primary_key)
|
|
51
|
+
|
|
52
|
+
# Convert all rows to insertable format
|
|
53
|
+
tbl_rows = [__df_row_to_pxt_row(row, pd_schema, col_mapping) for row in df.itertuples()]
|
|
54
|
+
|
|
55
|
+
table = find_or_create_table(
|
|
52
56
|
tbl_name, schema, primary_key=pxt_pk, num_retained_versions=num_retained_versions, comment=comment
|
|
53
57
|
)
|
|
54
58
|
table.insert(tbl_rows)
|
|
@@ -58,7 +62,7 @@ def import_pandas(
|
|
|
58
62
|
def import_csv(
|
|
59
63
|
tbl_name: str,
|
|
60
64
|
filepath_or_buffer,
|
|
61
|
-
schema_overrides: Optional[dict[str,
|
|
65
|
+
schema_overrides: Optional[dict[str, Any]] = None,
|
|
62
66
|
primary_key: Optional[Union[str, list[str]]] = None,
|
|
63
67
|
num_retained_versions: int = 10,
|
|
64
68
|
comment: str = '',
|
|
@@ -88,7 +92,7 @@ def import_excel(
|
|
|
88
92
|
tbl_name: str,
|
|
89
93
|
io,
|
|
90
94
|
*args,
|
|
91
|
-
schema_overrides: Optional[dict[str,
|
|
95
|
+
schema_overrides: Optional[dict[str, Any]] = None,
|
|
92
96
|
primary_key: Optional[Union[str, list[str]]] = None,
|
|
93
97
|
num_retained_versions: int = 10,
|
|
94
98
|
comment: str = '',
|
|
@@ -114,82 +118,73 @@ def import_excel(
|
|
|
114
118
|
)
|
|
115
119
|
|
|
116
120
|
|
|
117
|
-
def
|
|
121
|
+
def __check_primary_key_values(df: pd.DataFrame, primary_key: list[str]) -> None:
|
|
122
|
+
for pd_name in primary_key:
|
|
123
|
+
# This can be faster for large DataFrames
|
|
124
|
+
has_nulls = df[pd_name].count() < len(df)
|
|
125
|
+
if has_nulls:
|
|
126
|
+
raise excs.Error(f'Primary key column `{pd_name}` cannot contain null values.')
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def df_infer_schema(
|
|
118
130
|
df: pd.DataFrame, schema_overrides: dict[str, pxt.ColumnType], primary_key: list[str]
|
|
119
|
-
) ->
|
|
131
|
+
) -> dict[str, pxt.ColumnType]:
|
|
120
132
|
"""
|
|
121
133
|
Infers a Pixeltable schema from a Pandas DataFrame.
|
|
122
134
|
|
|
123
135
|
Returns:
|
|
124
136
|
A tuple containing a Pixeltable schema and a list of primary key column names.
|
|
125
137
|
"""
|
|
126
|
-
|
|
127
|
-
if pd_name not in df.columns:
|
|
128
|
-
raise excs.Error(
|
|
129
|
-
f'Column `{pd_name}` specified in `schema_overrides` does not exist in the given `DataFrame`.'
|
|
130
|
-
)
|
|
131
|
-
for pd_name in primary_key:
|
|
132
|
-
if pd_name not in df.columns:
|
|
133
|
-
raise excs.Error(f'Primary key column `{pd_name}` does not exist in the given `DataFrame`.')
|
|
134
|
-
|
|
135
|
-
schema: dict[str, pxt.ColumnType] = {}
|
|
136
|
-
col_mapping: dict[str, str] = {} # Maps Pandas column names to Pixeltable column names
|
|
137
|
-
|
|
138
|
+
pd_schema: dict[str, pxt.ColumnType] = {}
|
|
138
139
|
for pd_name, pd_dtype in zip(df.columns, df.dtypes):
|
|
139
140
|
if pd_name in schema_overrides:
|
|
140
141
|
pxt_type = schema_overrides[pd_name]
|
|
141
142
|
else:
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
n = 2
|
|
159
|
-
while f'{pxt_name}_{n}' in schema:
|
|
160
|
-
n += 1
|
|
161
|
-
pxt_name = f'{pxt_name}_{n}'
|
|
162
|
-
schema[pxt_name] = pxt_type
|
|
163
|
-
col_mapping[pd_name] = pxt_name
|
|
164
|
-
|
|
165
|
-
pxt_pk = [col_mapping[pk] for pk in primary_key]
|
|
166
|
-
return schema, pxt_pk
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
def __normalize_pxt_col_name(pd_name: str) -> str:
|
|
170
|
-
"""
|
|
171
|
-
Normalizes an arbitrary DataFrame column name into a valid Pixeltable identifier by:
|
|
172
|
-
- replacing any non-ascii or non-alphanumeric characters with an underscore _
|
|
173
|
-
- prefixing the result with the letter 'c' if it starts with an underscore or a number
|
|
143
|
+
pxt_type = __pd_coltype_to_pxt_type(pd_dtype, df[pd_name], pd_name not in primary_key)
|
|
144
|
+
pd_schema[pd_name] = pxt_type
|
|
145
|
+
|
|
146
|
+
return pd_schema
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
"""
|
|
150
|
+
# Check if a datetime64[ns, UTC] dtype
|
|
151
|
+
def is_datetime_tz_utc(x: Any) -> bool:
|
|
152
|
+
if isinstance(x, pd.Timestamp) and x.tzinfo is not None and str(x.tzinfo) == 'UTC':
|
|
153
|
+
return True
|
|
154
|
+
return pd.api.types.is_datetime64tz_dtype(x) and str(x).endswith('UTC]')
|
|
155
|
+
"""
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
def __pd_dtype_to_pxt_type(pd_dtype: DtypeObj, nullable: bool) -> Optional[pxt.ColumnType]:
|
|
174
159
|
"""
|
|
175
|
-
|
|
176
|
-
if id[0].isnumeric():
|
|
177
|
-
id = f'c_{id}'
|
|
178
|
-
elif id[0] == '_':
|
|
179
|
-
id = f'c{id}'
|
|
180
|
-
assert pxt.catalog.is_valid_identifier(id), id
|
|
181
|
-
return id
|
|
160
|
+
Determines a pixeltable ColumnType from a pandas dtype
|
|
182
161
|
|
|
162
|
+
Args:
|
|
163
|
+
pd_dtype: A pandas dtype object
|
|
183
164
|
|
|
184
|
-
|
|
165
|
+
Returns:
|
|
166
|
+
pxt.ColumnType: A pixeltable ColumnType
|
|
167
|
+
"""
|
|
168
|
+
# Pandas extension arrays / types (Int64, boolean, string[pyarrow], etc.) are not directly compatible with NumPy dtypes
|
|
169
|
+
# The timezone-aware datetime64[ns, tz=] dtype is a pandas extension dtype
|
|
170
|
+
if is_datetime64_any_dtype(pd_dtype):
|
|
171
|
+
return pxt.TimestampType(nullable=nullable)
|
|
172
|
+
if is_extension_array_dtype(pd_dtype):
|
|
173
|
+
return None
|
|
174
|
+
# Most other pandas dtypes are directly NumPy compatible
|
|
175
|
+
assert isinstance(pd_dtype, np.dtype)
|
|
176
|
+
return pxt.ArrayType.from_np_dtype(pd_dtype, nullable)
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
def __pd_coltype_to_pxt_type(pd_dtype: DtypeObj, data_col: pd.Series, nullable: bool) -> pxt.ColumnType:
|
|
185
180
|
"""
|
|
186
|
-
Infers a Pixeltable type based on a
|
|
181
|
+
Infers a Pixeltable type based on a pandas dtype.
|
|
187
182
|
"""
|
|
188
|
-
pxttype =
|
|
183
|
+
pxttype = __pd_dtype_to_pxt_type(pd_dtype, nullable)
|
|
189
184
|
if pxttype is not None:
|
|
190
185
|
return pxttype
|
|
191
186
|
|
|
192
|
-
if
|
|
187
|
+
if pd_dtype == np.object_:
|
|
193
188
|
# The `object_` dtype can mean all sorts of things; see if we can infer the Pixeltable type
|
|
194
189
|
# based on the actual data in `data_col`.
|
|
195
190
|
# First drop any null values (they don't contribute to type inference).
|
|
@@ -206,11 +201,14 @@ def __np_dtype_to_pxt_type(np_dtype: np.dtype, data_col: pd.Series, nullable: bo
|
|
|
206
201
|
else:
|
|
207
202
|
return inferred_type.copy(nullable=nullable)
|
|
208
203
|
|
|
209
|
-
raise excs.Error(f'Could not infer Pixeltable type of column: {data_col.name} (dtype: {
|
|
204
|
+
raise excs.Error(f'Could not infer Pixeltable type of column: {data_col.name} (dtype: {pd_dtype})')
|
|
210
205
|
|
|
211
206
|
|
|
212
|
-
def __df_row_to_pxt_row(
|
|
213
|
-
|
|
207
|
+
def __df_row_to_pxt_row(
|
|
208
|
+
row: tuple[Any, ...], schema: dict[str, pxt.ColumnType], col_mapping: Optional[dict[str, str]]
|
|
209
|
+
) -> dict[str, Any]:
|
|
210
|
+
"""Convert a row to insertable format"""
|
|
211
|
+
pxt_row: dict[str, Any] = {}
|
|
214
212
|
for val, (col_name, pxt_type) in zip(row[1:], schema.items()):
|
|
215
213
|
if pxt_type.is_float_type():
|
|
216
214
|
val = float(val)
|
|
@@ -232,5 +230,6 @@ def __df_row_to_pxt_row(row: tuple[Any, ...], schema: dict[str, pxt.ColumnType])
|
|
|
232
230
|
val = None
|
|
233
231
|
else:
|
|
234
232
|
val = pd.Timestamp(val).to_pydatetime()
|
|
235
|
-
|
|
236
|
-
|
|
233
|
+
pxt_name = col_name if col_mapping is None else col_mapping[col_name]
|
|
234
|
+
pxt_row[pxt_name] = val
|
|
235
|
+
return pxt_row
|