pixeltable 0.1.0__py3-none-any.whl → 0.2.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of pixeltable might be problematic.
- pixeltable/__init__.py +34 -6
- pixeltable/catalog/__init__.py +13 -0
- pixeltable/catalog/catalog.py +159 -0
- pixeltable/catalog/column.py +200 -0
- pixeltable/catalog/dir.py +32 -0
- pixeltable/catalog/globals.py +33 -0
- pixeltable/catalog/insertable_table.py +191 -0
- pixeltable/catalog/named_function.py +36 -0
- pixeltable/catalog/path.py +58 -0
- pixeltable/catalog/path_dict.py +139 -0
- pixeltable/catalog/schema_object.py +39 -0
- pixeltable/catalog/table.py +581 -0
- pixeltable/catalog/table_version.py +749 -0
- pixeltable/catalog/table_version_path.py +133 -0
- pixeltable/catalog/view.py +203 -0
- pixeltable/client.py +590 -30
- pixeltable/dataframe.py +540 -349
- pixeltable/env.py +359 -45
- pixeltable/exceptions.py +12 -21
- pixeltable/exec/__init__.py +9 -0
- pixeltable/exec/aggregation_node.py +78 -0
- pixeltable/exec/cache_prefetch_node.py +116 -0
- pixeltable/exec/component_iteration_node.py +79 -0
- pixeltable/exec/data_row_batch.py +95 -0
- pixeltable/exec/exec_context.py +22 -0
- pixeltable/exec/exec_node.py +61 -0
- pixeltable/exec/expr_eval_node.py +217 -0
- pixeltable/exec/in_memory_data_node.py +69 -0
- pixeltable/exec/media_validation_node.py +43 -0
- pixeltable/exec/sql_scan_node.py +225 -0
- pixeltable/exprs/__init__.py +24 -0
- pixeltable/exprs/arithmetic_expr.py +102 -0
- pixeltable/exprs/array_slice.py +71 -0
- pixeltable/exprs/column_property_ref.py +77 -0
- pixeltable/exprs/column_ref.py +105 -0
- pixeltable/exprs/comparison.py +77 -0
- pixeltable/exprs/compound_predicate.py +98 -0
- pixeltable/exprs/data_row.py +195 -0
- pixeltable/exprs/expr.py +586 -0
- pixeltable/exprs/expr_set.py +39 -0
- pixeltable/exprs/function_call.py +380 -0
- pixeltable/exprs/globals.py +69 -0
- pixeltable/exprs/image_member_access.py +115 -0
- pixeltable/exprs/image_similarity_predicate.py +58 -0
- pixeltable/exprs/inline_array.py +107 -0
- pixeltable/exprs/inline_dict.py +101 -0
- pixeltable/exprs/is_null.py +38 -0
- pixeltable/exprs/json_mapper.py +121 -0
- pixeltable/exprs/json_path.py +159 -0
- pixeltable/exprs/literal.py +54 -0
- pixeltable/exprs/object_ref.py +41 -0
- pixeltable/exprs/predicate.py +44 -0
- pixeltable/exprs/row_builder.py +355 -0
- pixeltable/exprs/rowid_ref.py +94 -0
- pixeltable/exprs/type_cast.py +53 -0
- pixeltable/exprs/variable.py +45 -0
- pixeltable/func/__init__.py +9 -0
- pixeltable/func/aggregate_function.py +194 -0
- pixeltable/func/batched_function.py +53 -0
- pixeltable/func/callable_function.py +69 -0
- pixeltable/func/expr_template_function.py +82 -0
- pixeltable/func/function.py +110 -0
- pixeltable/func/function_registry.py +227 -0
- pixeltable/func/globals.py +36 -0
- pixeltable/func/nos_function.py +202 -0
- pixeltable/func/signature.py +166 -0
- pixeltable/func/udf.py +163 -0
- pixeltable/functions/__init__.py +52 -103
- pixeltable/functions/eval.py +216 -0
- pixeltable/functions/fireworks.py +34 -0
- pixeltable/functions/huggingface.py +120 -0
- pixeltable/functions/image.py +16 -0
- pixeltable/functions/openai.py +256 -0
- pixeltable/functions/pil/image.py +148 -7
- pixeltable/functions/string.py +13 -0
- pixeltable/functions/together.py +122 -0
- pixeltable/functions/util.py +41 -0
- pixeltable/functions/video.py +62 -0
- pixeltable/iterators/__init__.py +3 -0
- pixeltable/iterators/base.py +48 -0
- pixeltable/iterators/document.py +311 -0
- pixeltable/iterators/video.py +89 -0
- pixeltable/metadata/__init__.py +54 -0
- pixeltable/metadata/converters/convert_10.py +18 -0
- pixeltable/metadata/schema.py +211 -0
- pixeltable/plan.py +656 -0
- pixeltable/store.py +418 -182
- pixeltable/tests/conftest.py +146 -88
- pixeltable/tests/functions/test_fireworks.py +42 -0
- pixeltable/tests/functions/test_functions.py +60 -0
- pixeltable/tests/functions/test_huggingface.py +158 -0
- pixeltable/tests/functions/test_openai.py +152 -0
- pixeltable/tests/functions/test_together.py +111 -0
- pixeltable/tests/test_audio.py +65 -0
- pixeltable/tests/test_catalog.py +27 -0
- pixeltable/tests/test_client.py +14 -14
- pixeltable/tests/test_component_view.py +370 -0
- pixeltable/tests/test_dataframe.py +439 -0
- pixeltable/tests/test_dirs.py +78 -62
- pixeltable/tests/test_document.py +120 -0
- pixeltable/tests/test_exprs.py +592 -135
- pixeltable/tests/test_function.py +297 -67
- pixeltable/tests/test_migration.py +43 -0
- pixeltable/tests/test_nos.py +54 -0
- pixeltable/tests/test_snapshot.py +208 -0
- pixeltable/tests/test_table.py +1195 -263
- pixeltable/tests/test_transactional_directory.py +42 -0
- pixeltable/tests/test_types.py +5 -11
- pixeltable/tests/test_video.py +151 -34
- pixeltable/tests/test_view.py +530 -0
- pixeltable/tests/utils.py +320 -45
- pixeltable/tool/create_test_db_dump.py +149 -0
- pixeltable/tool/create_test_video.py +81 -0
- pixeltable/type_system.py +445 -124
- pixeltable/utils/__init__.py +17 -46
- pixeltable/utils/arrow.py +98 -0
- pixeltable/utils/clip.py +12 -15
- pixeltable/utils/coco.py +136 -0
- pixeltable/utils/documents.py +39 -0
- pixeltable/utils/filecache.py +195 -0
- pixeltable/utils/help.py +11 -0
- pixeltable/utils/hf_datasets.py +157 -0
- pixeltable/utils/media_store.py +76 -0
- pixeltable/utils/parquet.py +167 -0
- pixeltable/utils/pytorch.py +91 -0
- pixeltable/utils/s3.py +13 -0
- pixeltable/utils/sql.py +17 -0
- pixeltable/utils/transactional_directory.py +35 -0
- pixeltable-0.2.4.dist-info/LICENSE +18 -0
- pixeltable-0.2.4.dist-info/METADATA +127 -0
- pixeltable-0.2.4.dist-info/RECORD +132 -0
- {pixeltable-0.1.0.dist-info → pixeltable-0.2.4.dist-info}/WHEEL +1 -1
- pixeltable/catalog.py +0 -1421
- pixeltable/exprs.py +0 -1745
- pixeltable/function.py +0 -269
- pixeltable/functions/clip.py +0 -10
- pixeltable/functions/pil/__init__.py +0 -23
- pixeltable/functions/tf.py +0 -21
- pixeltable/index.py +0 -57
- pixeltable/tests/test_dict.py +0 -24
- pixeltable/tests/test_functions.py +0 -11
- pixeltable/tests/test_tf.py +0 -69
- pixeltable/tf.py +0 -33
- pixeltable/utils/tf.py +0 -33
- pixeltable/utils/video.py +0 -32
- pixeltable-0.1.0.dist-info/METADATA +0 -34
- pixeltable-0.1.0.dist-info/RECORD +0 -36
pixeltable/utils/hf_datasets.py ADDED

@@ -0,0 +1,157 @@
+import datasets
+from typing import Union, Optional, List, Dict, Any
+import pixeltable.type_system as ts
+from pixeltable import exceptions as excs
+import math
+import logging
+import pixeltable
+import random
+
+_logger = logging.getLogger(__name__)
+
+# use 100MB as the batch size limit for loading a huggingface dataset into pixeltable.
+# The primary goal is to bound memory use, regardless of dataset size.
+# Second goal is to limit overhead. 100MB is presumed to be reasonable for a lot of storage systems.
+_K_BATCH_SIZE_BYTES = 100_000_000
+
+# note, there are many more types. we allow overrides in the schema_override parameter
+# to handle cases where the appropriate type is not yet mapped, or to override this mapping.
+# https://huggingface.co/docs/datasets/v2.17.0/en/package_reference/main_classes#datasets.Value
+_hf_to_pxt: Dict[str, ts.ColumnType] = {
+    'int32': ts.IntType(nullable=True),  # pixeltable widens to big int
+    'int64': ts.IntType(nullable=True),
+    'bool': ts.BoolType(nullable=True),
+    'float32': ts.FloatType(nullable=True),
+    'string': ts.StringType(nullable=True),
+    'timestamp[s]': ts.TimestampType(nullable=True),
+    'timestamp[ms]': ts.TimestampType(nullable=True),  # HF dataset iterator converts timestamps to datetime.datetime
+}
+
+def _to_pixeltable_type(
+    feature_type: Union[datasets.ClassLabel, datasets.Value, datasets.Sequence],
+) -> Optional[ts.ColumnType]:
+    """Convert a huggingface feature type to a pixeltable ColumnType if one is defined."""
+    if isinstance(feature_type, datasets.ClassLabel):
+        # enum, example: ClassLabel(names=['neg', 'pos'], id=None)
+        return ts.StringType(nullable=True)
+    elif isinstance(feature_type, datasets.Value):
+        # example: Value(dtype='int64', id=None)
+        return _hf_to_pxt.get(feature_type.dtype, None)
+    elif isinstance(feature_type, datasets.Sequence):
+        # example: cohere wiki. Sequence(feature=Value(dtype='float32', id=None), length=-1, id=None)
+        dtype = _to_pixeltable_type(feature_type.feature)
+        length = feature_type.length if feature_type.length != -1 else None
+        return ts.ArrayType(shape=(length,), dtype=dtype)
+    else:
+        return None
+
+def _get_hf_schema(dataset: Union[datasets.Dataset, datasets.DatasetDict]) -> datasets.Features:
+    """Get the schema of a huggingface dataset as a dictionary."""
+    first_dataset = dataset if isinstance(dataset, datasets.Dataset) else next(iter(dataset.values()))
+    return first_dataset.features
+
+def huggingface_schema_to_pixeltable_schema(
+    hf_dataset: Union[datasets.Dataset, datasets.DatasetDict],
+) -> Dict[str, Optional[ts.ColumnType]]:
+    """Generate a pixeltable schema from a huggingface dataset schema.
+    Columns without a known mapping are mapped to None
+    """
+    hf_schema = _get_hf_schema(hf_dataset)
+    pixeltable_schema = {
+        column_name: _to_pixeltable_type(feature_type) for column_name, feature_type in hf_schema.items()
+    }
+    return pixeltable_schema
+
+def import_huggingface_dataset(
+    cl: 'pixeltable.Client',
+    table_path: str,
+    dataset: Union[datasets.Dataset, datasets.DatasetDict],
+    *,
+    column_name_for_split: Optional[str],
+    schema_override: Optional[Dict[str, Any]],
+    **kwargs,
+) -> 'pixeltable.InsertableTable':
+    """See `pixeltable.Client.import_huggingface_dataset` for documentation"""
+    if table_path in cl.list_tables():
+        raise excs.Error(f'table {table_path} already exists')
+
+    if not isinstance(dataset, (datasets.Dataset, datasets.DatasetDict)):
+        raise excs.Error(f'`type(dataset)` must be `datasets.Dataset` or `datasets.DatasetDict`. Got {type(dataset)=}')
+
+    if isinstance(dataset, datasets.Dataset):
+        # when loading an hf dataset partially, dataset.split._name is sometimes the form "train[0:1000]"
+        raw_name = dataset.split._name
+        split_name = raw_name.split('[')[0] if raw_name is not None else None
+        dataset_dict = {split_name: dataset}
+    else:
+        dataset_dict = dataset
+
+    pixeltable_schema = huggingface_schema_to_pixeltable_schema(dataset)
+    if schema_override is not None:
+        pixeltable_schema.update(schema_override)
+
+    if column_name_for_split is not None:
+        if column_name_for_split in pixeltable_schema:
+            raise excs.Error(
+                f'Column name `{column_name_for_split}` already exists in dataset schema; provide a different `column_name_for_split`'
+            )
+        pixeltable_schema[column_name_for_split] = ts.StringType(nullable=True)
+
+    for field, column_type in pixeltable_schema.items():
+        if column_type is None:
+            raise excs.Error(f'Could not infer pixeltable type for feature `{field}` in huggingface dataset')
+
+    if isinstance(dataset, datasets.Dataset):
+        # when loading an hf dataset partially, dataset.split._name is sometimes the form "train[0:1000]"
+        raw_name = dataset.split._name
+        split_name = raw_name.split('[')[0] if raw_name is not None else None
+        dataset_dict = {split_name: dataset}
+    elif isinstance(dataset, datasets.DatasetDict):
+        dataset_dict = dataset
+    else:
+        raise excs.Error(f'`type(dataset)` must be `datasets.Dataset` or `datasets.DatasetDict`. Got {type(dataset)=}')
+
+    # extract all class labels from the dataset to translate category ints to strings
+    hf_schema = _get_hf_schema(dataset)
+    categorical_features = {
+        feature_name: feature_type.names
+        for (feature_name, feature_type) in hf_schema.items()
+        if isinstance(feature_type, datasets.ClassLabel)
+    }
+
+    try:
+        # random tmp name
+        tmp_name = f'{table_path}_tmp_{random.randint(0, 100000000)}'
+        tab = cl.create_table(tmp_name, pixeltable_schema, **kwargs)
+
+        def _translate_row(row: Dict[str, Any], split_name: str) -> Dict[str, Any]:
+            output_row = row.copy()
+            # map all class labels to strings
+            for field, values in categorical_features.items():
+                output_row[field] = values[row[field]]
+            # add split name to row
+            if column_name_for_split is not None:
+                output_row[column_name_for_split] = split_name
+            return output_row
+
+        for split_name, split_dataset in dataset_dict.items():
+            num_batches = split_dataset.size_in_bytes / _K_BATCH_SIZE_BYTES
+            tuples_per_batch = math.ceil(split_dataset.num_rows / num_batches)
+            assert tuples_per_batch > 0
+
+            batch = []
+            for row in split_dataset:
+                batch.append(_translate_row(row, split_name))
+                if len(batch) >= tuples_per_batch:
+                    tab.insert(batch)
+                    batch = []
+            # last batch
+            if len(batch) > 0:
+                tab.insert(batch)
+
+    except Exception as e:
+        _logger.error(f'Error while inserting dataset into table: {tmp_name}')
+        raise e
+
+    cl.move(tmp_name, table_path)
+    return cl.get_table(table_path)
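For orientation, a minimal usage sketch of the new loader (not part of the diff; the dataset name and table path are illustrative, and it assumes a working local pixeltable install):

import datasets
import pixeltable
from pixeltable.utils.hf_datasets import import_huggingface_dataset

cl = pixeltable.Client()
# 'rotten_tomatoes' and the table name are placeholder choices
ds = datasets.load_dataset('rotten_tomatoes', split='train[:100]')
tab = import_huggingface_dataset(
    cl, 'demo_rotten_tomatoes', ds,
    column_name_for_split='split',  # extra string column recording the split
    schema_override=None,
)

Rows are inserted in batches sized from the 100MB target: `size_in_bytes / _K_BATCH_SIZE_BYTES` batches, hence `num_rows / num_batches` rows per batch, which bounds memory use independent of dataset size.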
pixeltable/utils/media_store.py ADDED

@@ -0,0 +1,76 @@
+import glob
+import os
+import re
+import shutil
+import uuid
+from typing import Optional, List, Tuple, Dict
+from pathlib import Path
+from collections import defaultdict
+from uuid import UUID
+
+from pixeltable.env import Env
+
+
+class MediaStore:
+    """
+    Utilities to manage media files stored in Env.media_dir
+
+    Media file names are a composite of: table id, column id, version, uuid:
+    the table id/column id/version are redundant but useful for identifying all files for a table
+    or all files created for a particular version of a table
+    """
+    pattern = re.compile(r'([0-9a-fA-F]+)_(\d+)_(\d+)_([0-9a-fA-F]+)')  # tbl_id, col_id, version, uuid
+
+    @classmethod
+    def prepare_media_path(cls, tbl_id: UUID, col_id: int, version: int, ext: Optional[str] = None) -> Path:
+        """
+        Construct a new, unique Path name for a persisted media file, and create the parent directory
+        for the new Path if it does not already exist. The Path will reside in
+        the environment's media_dir.
+        """
+        id_hex = uuid.uuid4().hex
+        parent = Env.get().media_dir / tbl_id.hex / id_hex[0:2] / id_hex[0:4]
+        parent.mkdir(parents=True, exist_ok=True)
+        return parent / f'{tbl_id.hex}_{col_id}_{version}_{id_hex}{ext or ""}'
+
+    @classmethod
+    def delete(cls, tbl_id: UUID, version: Optional[int] = None) -> None:
+        """Delete all files belonging to tbl_id. If version is not None, delete
+        only those files belonging to the specified version."""
+        assert tbl_id is not None
+        if version is None:
+            # Remove the entire folder for this table id.
+            path = Env.get().media_dir / tbl_id.hex
+            if path.exists():
+                shutil.rmtree(path)
+        else:
+            # Remove only the elements for the specified version.
+            paths = glob.glob(str(Env.get().media_dir / tbl_id.hex) + f'/**/{tbl_id.hex}_*_{version}_*', recursive=True)
+            for path in paths:
+                os.remove(path)
+
+    @classmethod
+    def count(cls, tbl_id: UUID) -> int:
+        """
+        Return number of files for given tbl_id.
+        """
+        paths = glob.glob(str(Env.get().media_dir / tbl_id.hex) + f'/**/{tbl_id.hex}_*', recursive=True)
+        return len(paths)
+
+    @classmethod
+    def stats(cls) -> List[Tuple[int, int, int, int]]:
+        paths = glob.glob(str(Env.get().media_dir) + "/**", recursive=True)
+        # key: (tbl_id, col_id), value: (num_files, size)
+        d: Dict[Tuple[UUID, int], List[int]] = defaultdict(lambda: [0, 0])
+        for p in paths:
+            if not os.path.isdir(p):
+                matched = re.match(cls.pattern, Path(p).name)
+                assert matched is not None
+                tbl_id, col_id = UUID(hex=matched[1]), int(matched[2])
+                file_info = os.stat(p)
+                t = d[(tbl_id, col_id)]
+                t[0] += 1
+                t[1] += file_info.st_size
+        result = [(tbl_id, col_id, num_files, size) for (tbl_id, col_id), (num_files, size) in d.items()]
+        result.sort(key=lambda e: e[3], reverse=True)
+        return result
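A sketch of the naming scheme MediaStore encodes (assumes an initialized pixeltable environment so that `Env.get().media_dir` is set; the ids and extension are illustrative):

import uuid
from pixeltable.utils.media_store import MediaStore

tbl_id = uuid.uuid4()  # normally a table's id
p = MediaStore.prepare_media_path(tbl_id, col_id=2, version=5, ext='.jpg')
# p = <media_dir>/<tbl_id>/<xx>/<xxxx>/<tbl_id>_2_5_<uuid>.jpg
assert MediaStore.pattern.match(p.name) is not None
p.write_bytes(b'...')               # caller persists the media content
assert MediaStore.count(tbl_id) == 1

Embedding table id, column id, and version in the file name is what lets `delete(tbl_id, version)` work with a single glob instead of a catalog lookup.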
pixeltable/utils/parquet.py ADDED

@@ -0,0 +1,167 @@
+import io
+import json
+import logging
+from collections import deque
+from pathlib import Path
+from typing import Dict, List, Optional, Union
+
+import numpy as np
+import PIL.Image
+import pyarrow as pa
+import pyarrow.parquet
+
+import pixeltable.type_system as ts
+from pixeltable.utils.arrow import iter_tuples, to_arrow_schema, to_pixeltable_schema
+from pixeltable.utils.transactional_directory import transactional_directory
+import pixeltable.exceptions as exc
+
+import random
+
+_logger = logging.getLogger(__name__)
+
+
+def _write_batch(value_batch : Dict[str, deque], schema : pa.Schema, output_path : Path) -> None:
+    pydict = {}
+    for field in schema:
+        if isinstance(field.type, pa.FixedShapeTensorType):
+            stacked_arr = np.stack(value_batch[field.name])
+            pydict[field.name] = pa.FixedShapeTensorArray.from_numpy_ndarray(stacked_arr)
+        else:
+            pydict[field.name] = value_batch[field.name]
+
+    tab = pa.Table.from_pydict(pydict, schema=schema)
+    pa.parquet.write_table(tab, output_path)
+
+def save_parquet(df: 'pixeltable.DataFrame', dest_path: Path, partition_size_bytes: int = 100_000_000) -> None:
+    """
+    Internal method to stream dataframe data to parquet format.
+    Does not materialize the dataset to memory.
+
+    It preserves pixeltable type metadata in a json file, which would otherwise
+    not be available in the parquet format.
+
+    Images are stored inline in a compressed format in their parquet file.
+
+    Args:
+        df : dataframe to save.
+        dest_path : path to directory to save the parquet files to.
+        partition_size_bytes : maximum target size for each chunk. Default 100_000_000 bytes.
+    """
+    column_names = df.get_column_names()
+    column_types = df.get_column_types()
+    type_dict = {k: v.as_dict() for k, v in zip(column_names, column_types)}
+    arrow_schema = to_arrow_schema(dict(zip(column_names, column_types)))
+
+    # store the changes atomically
+    with transactional_directory(dest_path) as temp_path:
+        # dump metadata json file so we can inspect what was the source of the parquet file later on.
+        json.dump(df._as_dict(), (temp_path / '.pixeltable.json').open('w'))  # pylint: disable=protected-access
+        json.dump(type_dict, (temp_path / '.pixeltable.column_types.json').open('w'))  # keep type metadata
+
+        batch_num = 0
+        current_value_batch : Dict[str, deque] = {k:deque() for k in column_names}
+        current_byte_estimate = 0
+
+        for data_row in df._exec():  # pylint: disable=protected-access
+            for (col_name, col_type, e) in zip(column_names, column_types, df._select_list_exprs):  # pylint: disable=protected-access
+                val = data_row[e.slot_idx]
+                if val is None:
+                    current_value_batch[col_name].append(val)
+                    continue
+
+                assert val is not None
+                if col_type.is_image_type():
+                    # images get inlined into the parquet file
+                    if data_row.file_paths is not None and data_row.file_paths[e.slot_idx] is not None:
+                        # if there is a file, read directly to preserve information
+                        with open(data_row.file_paths[e.slot_idx], 'rb') as f:
+                            val = f.read()
+                    elif isinstance(val, PIL.Image.Image):
+                        # if no file available, eg. bc it is computed, convert to png
+                        buf = io.BytesIO()
+                        val.save(buf, format='PNG')
+                        val = buf.getvalue()
+                    else:
+                        assert False, f'unknown image type {type(val)}'
+                    length = len(val)
+                elif col_type.is_string_type():
+                    length = len(val)
+                elif col_type.is_video_type():
+                    if data_row.file_paths is not None and data_row.file_paths[e.slot_idx] is not None:
+                        val = data_row.file_paths[e.slot_idx]
+                    else:
+                        assert False, f'unknown video type {type(val)}'
+                    length = len(val)
+                elif col_type.is_json_type():
+                    val = json.dumps(val)
+                    length = len(val)
+                elif col_type.is_array_type():
+                    length = val.nbytes
+                elif col_type.is_int_type():
+                    length = 8
+                elif col_type.is_float_type():
+                    length = 8
+                elif col_type.is_bool_type():
+                    length = 1
+                elif col_type.is_timestamp_type():
+                    length = 8
+                else:
+                    assert False, f'unknown type {col_type} for {col_name}'
+
+                current_value_batch[col_name].append(val)
+                current_byte_estimate += length
+            if current_byte_estimate > partition_size_bytes:
+                assert batch_num < 100_000, 'wrote too many parquet files, unclear ordering'
+                _write_batch(current_value_batch, arrow_schema, temp_path / f'part-{batch_num:05d}.parquet')
+                batch_num += 1
+                current_value_batch = {k:deque() for k in column_names}
+                current_byte_estimate = 0
+
+        _write_batch(current_value_batch, arrow_schema, temp_path / f'part-{batch_num:05d}.parquet')
+
+
+def parquet_schema_to_pixeltable_schema(parquet_path: str) -> Dict[str, Optional[ts.ColumnType]]:
+    """Generate a default pixeltable schema for the given parquet file. Returns None for unknown types."""
+
+    input_path = Path(parquet_path).expanduser()
+    parquet_dataset = pa.parquet.ParquetDataset(input_path)
+    return to_pixeltable_schema(parquet_dataset.schema)
+
+
+def import_parquet(
+    cl: 'pixeltable.Client',
+    table_path: str,
+    *,
+    parquet_path: str,
+    schema_override: Optional[Dict[str, ts.ColumnType]],
+    **kwargs,
+) -> 'catalog.InsertableTable':
+    """See `pixeltable.Client.import_parquet` for documentation"""
+    input_path = Path(parquet_path).expanduser()
+    parquet_dataset = pa.parquet.ParquetDataset(input_path)
+
+    schema = parquet_schema_to_pixeltable_schema(parquet_path)
+    if schema_override is None:
+        schema_override = {}
+
+    schema.update(schema_override)
+    for k, v in schema.items():
+        if v is None:
+            raise exc.Error(f'Could not infer pixeltable type for column {k} from parquet file')
+
+    if table_path in cl.list_tables():
+        raise exc.Error(f'Table {table_path} already exists')
+
+    try:
+        tmp_name = f'{table_path}_tmp_{random.randint(0, 100000000)}'
+        tab = cl.create_table(tmp_name, schema, **kwargs)
+        for fragment in parquet_dataset.fragments:
+            for batch in fragment.to_batches():
+                dict_batch = list(iter_tuples(batch))
+                tab.insert(dict_batch)
+    except Exception as e:
+        _logger.error(f'Error while inserting Parquet file into table: {e}')
+        raise e
+
+    cl.move(tmp_name, table_path)
+    return cl.get_table(table_path)
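A hypothetical round trip through the new helpers (the table names and export path are made up, and it assumes `select()` on a table yields a `pixeltable.DataFrame`):

from pathlib import Path
import pixeltable
from pixeltable.utils.parquet import save_parquet, import_parquet

cl = pixeltable.Client()
df = cl.get_table('demo_images').select()   # assumed to yield a DataFrame
save_parquet(df, Path('/tmp/demo_export'))  # writes part-*.parquet plus json metadata

tab = import_parquet(
    cl, 'demo_images_copy',
    parquet_path='/tmp/demo_export', schema_override=None,
)

Because `save_parquet` writes through `transactional_directory`, a failed export leaves no partially written directory at `dest_path`.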
pixeltable/utils/pytorch.py ADDED

@@ -0,0 +1,91 @@
+import io
+import pyarrow as pa
+import pyarrow.parquet
+import torch
+import torch.utils.data
+from pathlib import Path
+import PIL.Image
+import json
+from typing import Dict, Iterator, Any
+import datetime
+
+from pixeltable.type_system import ColumnType
+import numpy as np
+
+class PixeltablePytorchDataset(torch.utils.data.IterableDataset):
+    """
+    PyTorch dataset interface for pixeltable data.
+    NB. This class must inherit from torch.utils.data.IterableDataset for it
+    to work with torch.utils.data.DataLoader.
+    """
+    def __init__(
+        self,
+        path: Path,
+        image_format: str,
+    ):
+        """
+        Args:
+            path: path to directory containing parquet files
+            image_format: 'np' or 'pt'. 'np' is RGB uint8 array,
+                'pt' is result of torchvision.transforms.ToTensor()
+        """
+        super().__init__()
+
+        self.path = path
+        self.image_format = image_format
+        assert image_format in ["np", "pt"]
+        column_type_path = path / '.pixeltable.column_types.json'
+        assert column_type_path.exists(), f"missing {column_type_path}"
+        with column_type_path.open() as f:
+            column_types = json.load(f)
+        self.column_types = {k: ColumnType.from_dict(v) for k, v in column_types.items()}
+        self.part_metadata = pa.parquet.ParquetDataset(path).files
+
+    def _unmarshall(self, k: str, v: Any) -> Any:
+        if self.column_types[k].is_image_type():
+            assert isinstance(v, bytes)
+            im = PIL.Image.open(io.BytesIO(v))
+            arr = np.array(im)  # will copy data to guarantee "WRITEABLE" flag assertion below.
+            assert arr.flags["WRITEABLE"]
+
+            if self.image_format == "np":
+                return arr
+
+            assert self.image_format == "pt"
+            import torchvision  # pylint: disable = import-outside-toplevel
+
+            # use arr instead of im in ToTensor() to guarantee array input
+            # to torch.from_numpy is writable. Using im is a suspected cause of
+            # https://github.com/pixeltable/pixeltable/issues/69
+            return torchvision.transforms.ToTensor()(arr)
+        elif self.column_types[k].is_json_type():
+            assert isinstance(v, str)
+            return json.loads(v)
+        elif self.column_types[k].is_array_type():
+            assert isinstance(v, np.ndarray)
+            if not v.flags["WRITEABLE"]:
+                v = v.copy()
+            assert v.flags["WRITEABLE"]
+            return v
+        elif self.column_types[k].is_timestamp_type():
+            # pytorch default collation only supports numeric types
+            assert isinstance(v, datetime.datetime)
+            return v.timestamp()
+        else:
+            assert not isinstance(v, np.ndarray)  # all array outputs should be handled above
+            return v
+
+    def __iter__(self) -> Iterator[Dict[str, Any]]:
+        import pixeltable.utils.arrow as arrow
+        worker_info = torch.utils.data.get_worker_info()
+
+        if worker_info is None:
+            part_list = range(len(self.part_metadata))
+        else:
+            part_list = [i for i in range(len(self.part_metadata)) if (i % worker_info.num_workers) == worker_info.id]
+
+        for part_no in part_list:
+            pqf = pa.parquet.ParquetFile(self.part_metadata[part_no])
+            for batch in pqf.iter_batches():
+                for tup in arrow.iter_tuples(batch):
+                    yield {k: self._unmarshall(k, v) for k, v in tup.items()}
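A sketch of consuming such an export from PyTorch (the path is illustrative and must contain the `.pixeltable.column_types.json` file written by `save_parquet`):

from pathlib import Path
import torch.utils.data
from pixeltable.utils.pytorch import PixeltablePytorchDataset

ds = PixeltablePytorchDataset(Path('/tmp/demo_export'), image_format='pt')
loader = torch.utils.data.DataLoader(ds, batch_size=32, num_workers=2)
for batch in loader:
    pass  # batch is a dict of collated column values; images arrive as float tensors

With multiple workers, parquet parts are sharded by `part_no % num_workers`, so each row is read by exactly one worker.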
pixeltable/utils/s3.py ADDED

@@ -0,0 +1,13 @@
+from typing import Any
+
+
+def get_client() -> Any:
+    import boto3
+    import botocore
+    try:
+        boto3.Session().get_credentials().get_frozen_credentials()
+        return boto3.client('s3')  # credentials are available
+    except AttributeError:
+        # No credentials available, use unsigned mode
+        config = botocore.config.Config(signature_version=botocore.UNSIGNED)
+        return boto3.client('s3', config=config)
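A sketch of the fallback behavior (the bucket and key are placeholders; boto3's standard credential chain decides which branch is taken):

from pixeltable.utils.s3 import get_client

s3 = get_client()  # unsigned (anonymous) client when no AWS credentials are configured
s3.download_file('some-public-bucket', 'videos/clip.mp4', '/tmp/clip.mp4')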
pixeltable/utils/sql.py ADDED

@@ -0,0 +1,17 @@
+import logging
+
+import sqlalchemy as sql
+
+
+def log_stmt(logger: logging.Logger, stmt) -> None:
+    logger.debug(f'executing {str(stmt.compile(dialect=sql.dialects.postgresql.dialect()))}')
+
+def log_explain(logger: logging.Logger, stmt: sql.sql.ClauseElement, conn: sql.engine.Connection) -> None:
+    try:
+        # don't set dialect=Env.get().engine.dialect: x % y turns into x %% y, which results in a syntax error
+        stmt_str = str(stmt.compile(compile_kwargs={'literal_binds': True}))
+        explain_result = conn.execute(sql.text(f'EXPLAIN {stmt_str}'))
+        explain_str = '\n'.join([str(row) for row in explain_result])
+        logger.debug(f'SqlScanNode explain:\n{explain_str}')
+    except Exception as e:
+        logger.warning(f'EXPLAIN failed: {e}')
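A sketch of the two helpers in use (the engine URL is a placeholder; `log_explain` assumes a Postgres connection, since it issues an EXPLAIN):

import logging
import sqlalchemy as sql
from pixeltable.utils.sql import log_stmt, log_explain

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger('pixeltable.demo')
engine = sql.create_engine('postgresql://localhost/pixeltable')  # placeholder URL
stmt = sql.select(sql.literal(1))
log_stmt(logger, stmt)               # logs the statement compiled for postgres
with engine.connect() as conn:
    log_explain(logger, stmt, conn)  # logs the EXPLAIN plan, warns on failure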
pixeltable/utils/transactional_directory.py ADDED

@@ -0,0 +1,35 @@
+import shutil
+from contextlib import contextmanager
+from pathlib import Path
+from typing import Any, Generator
+
+import pixeltable.exceptions as excs
+
+
+@contextmanager
+def transactional_directory(folder_path: Path) -> Generator[Path, Any, Any]:
+    """
+    Args:
+        folder_path: path to the folder we want to create
+
+    Yields:
+        A pathlib.Path to a hidden temporary folder, which can be used to accumulate changes.
+        If everything succeeds, the changes are committed via an atomic move operation upon exiting the 'with' block (os.replace)
+        If an exception occurred, no changes are visible in the original folder.
+
+    Example:
+        folder_path = pathlib.Path("path/to/folder")
+        with transactional_directory(folder_path) as temp_folder:
+            (temp_folder / "subfolder1").mkdir()
+            (temp_folder / "subfolder2").mkdir()
+    """
+    if folder_path.exists():
+        raise excs.Error(f"Folder {folder_path} already exists")
+
+    tmp_folder = folder_path.parent / f".tmp_{folder_path.name}"
+    # Remove the temporary folder if it already exists, eg if the previous run crashed
+    shutil.rmtree(str(tmp_folder), ignore_errors=True)
+    tmp_folder.mkdir(parents=True)
+    yield tmp_folder
+    # If everything succeeds, `commit' the changes by moving the temporary folder
+    tmp_folder.rename(folder_path)
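A sketch of the commit/rollback behavior (the path is illustrative):

from pathlib import Path
from pixeltable.utils.transactional_directory import transactional_directory

target = Path('/tmp/export_demo')
with transactional_directory(target) as tmp:
    (tmp / 'part-00000.parquet').write_bytes(b'...')
    # raising here would leave only /tmp/.tmp_export_demo behind
# on success the temp dir is renamed, so /tmp/export_demo appears atomically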
pixeltable-0.2.4.dist-info/LICENSE ADDED

@@ -0,0 +1,18 @@
+                              Apache License
+                        Version 2.0, January 2004
+                     http://www.apache.org/licenses/
+
+
+Copyright 2023 Marcel Kornacker
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.