pixeltable 0.3.14__py3-none-any.whl → 0.5.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pixeltable/__init__.py +42 -8
- pixeltable/{dataframe.py → _query.py} +470 -206
- pixeltable/_version.py +1 -0
- pixeltable/catalog/__init__.py +5 -4
- pixeltable/catalog/catalog.py +1785 -432
- pixeltable/catalog/column.py +190 -113
- pixeltable/catalog/dir.py +2 -4
- pixeltable/catalog/globals.py +19 -46
- pixeltable/catalog/insertable_table.py +191 -98
- pixeltable/catalog/path.py +63 -23
- pixeltable/catalog/schema_object.py +11 -15
- pixeltable/catalog/table.py +843 -436
- pixeltable/catalog/table_metadata.py +103 -0
- pixeltable/catalog/table_version.py +978 -657
- pixeltable/catalog/table_version_handle.py +72 -16
- pixeltable/catalog/table_version_path.py +112 -43
- pixeltable/catalog/tbl_ops.py +53 -0
- pixeltable/catalog/update_status.py +191 -0
- pixeltable/catalog/view.py +134 -90
- pixeltable/config.py +134 -22
- pixeltable/env.py +471 -157
- pixeltable/exceptions.py +6 -0
- pixeltable/exec/__init__.py +4 -1
- pixeltable/exec/aggregation_node.py +7 -8
- pixeltable/exec/cache_prefetch_node.py +83 -110
- pixeltable/exec/cell_materialization_node.py +268 -0
- pixeltable/exec/cell_reconstruction_node.py +168 -0
- pixeltable/exec/component_iteration_node.py +4 -3
- pixeltable/exec/data_row_batch.py +8 -65
- pixeltable/exec/exec_context.py +16 -4
- pixeltable/exec/exec_node.py +13 -36
- pixeltable/exec/expr_eval/evaluators.py +11 -7
- pixeltable/exec/expr_eval/expr_eval_node.py +27 -12
- pixeltable/exec/expr_eval/globals.py +8 -5
- pixeltable/exec/expr_eval/row_buffer.py +1 -2
- pixeltable/exec/expr_eval/schedulers.py +106 -56
- pixeltable/exec/globals.py +35 -0
- pixeltable/exec/in_memory_data_node.py +19 -19
- pixeltable/exec/object_store_save_node.py +293 -0
- pixeltable/exec/row_update_node.py +16 -9
- pixeltable/exec/sql_node.py +351 -84
- pixeltable/exprs/__init__.py +1 -1
- pixeltable/exprs/arithmetic_expr.py +27 -22
- pixeltable/exprs/array_slice.py +3 -3
- pixeltable/exprs/column_property_ref.py +36 -23
- pixeltable/exprs/column_ref.py +213 -89
- pixeltable/exprs/comparison.py +5 -5
- pixeltable/exprs/compound_predicate.py +5 -4
- pixeltable/exprs/data_row.py +164 -54
- pixeltable/exprs/expr.py +70 -44
- pixeltable/exprs/expr_dict.py +3 -3
- pixeltable/exprs/expr_set.py +17 -10
- pixeltable/exprs/function_call.py +100 -40
- pixeltable/exprs/globals.py +2 -2
- pixeltable/exprs/in_predicate.py +4 -4
- pixeltable/exprs/inline_expr.py +18 -32
- pixeltable/exprs/is_null.py +7 -3
- pixeltable/exprs/json_mapper.py +8 -8
- pixeltable/exprs/json_path.py +56 -22
- pixeltable/exprs/literal.py +27 -5
- pixeltable/exprs/method_ref.py +2 -2
- pixeltable/exprs/object_ref.py +2 -2
- pixeltable/exprs/row_builder.py +167 -67
- pixeltable/exprs/rowid_ref.py +25 -10
- pixeltable/exprs/similarity_expr.py +58 -40
- pixeltable/exprs/sql_element_cache.py +4 -4
- pixeltable/exprs/string_op.py +5 -5
- pixeltable/exprs/type_cast.py +3 -5
- pixeltable/func/__init__.py +1 -0
- pixeltable/func/aggregate_function.py +8 -8
- pixeltable/func/callable_function.py +9 -9
- pixeltable/func/expr_template_function.py +17 -11
- pixeltable/func/function.py +18 -20
- pixeltable/func/function_registry.py +6 -7
- pixeltable/func/globals.py +2 -3
- pixeltable/func/mcp.py +74 -0
- pixeltable/func/query_template_function.py +29 -27
- pixeltable/func/signature.py +46 -19
- pixeltable/func/tools.py +31 -13
- pixeltable/func/udf.py +18 -20
- pixeltable/functions/__init__.py +16 -0
- pixeltable/functions/anthropic.py +123 -77
- pixeltable/functions/audio.py +147 -10
- pixeltable/functions/bedrock.py +13 -6
- pixeltable/functions/date.py +7 -4
- pixeltable/functions/deepseek.py +35 -43
- pixeltable/functions/document.py +81 -0
- pixeltable/functions/fal.py +76 -0
- pixeltable/functions/fireworks.py +11 -20
- pixeltable/functions/gemini.py +195 -39
- pixeltable/functions/globals.py +142 -14
- pixeltable/functions/groq.py +108 -0
- pixeltable/functions/huggingface.py +1056 -24
- pixeltable/functions/image.py +115 -57
- pixeltable/functions/json.py +1 -1
- pixeltable/functions/llama_cpp.py +28 -13
- pixeltable/functions/math.py +67 -5
- pixeltable/functions/mistralai.py +18 -55
- pixeltable/functions/net.py +70 -0
- pixeltable/functions/ollama.py +20 -13
- pixeltable/functions/openai.py +240 -226
- pixeltable/functions/openrouter.py +143 -0
- pixeltable/functions/replicate.py +4 -4
- pixeltable/functions/reve.py +250 -0
- pixeltable/functions/string.py +239 -69
- pixeltable/functions/timestamp.py +16 -16
- pixeltable/functions/together.py +24 -84
- pixeltable/functions/twelvelabs.py +188 -0
- pixeltable/functions/util.py +6 -1
- pixeltable/functions/uuid.py +30 -0
- pixeltable/functions/video.py +1515 -107
- pixeltable/functions/vision.py +8 -8
- pixeltable/functions/voyageai.py +289 -0
- pixeltable/functions/whisper.py +16 -8
- pixeltable/functions/whisperx.py +179 -0
- pixeltable/{ext/functions → functions}/yolox.py +2 -4
- pixeltable/globals.py +362 -115
- pixeltable/index/base.py +17 -21
- pixeltable/index/btree.py +28 -22
- pixeltable/index/embedding_index.py +100 -118
- pixeltable/io/__init__.py +4 -2
- pixeltable/io/datarows.py +8 -7
- pixeltable/io/external_store.py +56 -105
- pixeltable/io/fiftyone.py +13 -13
- pixeltable/io/globals.py +31 -30
- pixeltable/io/hf_datasets.py +61 -16
- pixeltable/io/label_studio.py +74 -70
- pixeltable/io/lancedb.py +3 -0
- pixeltable/io/pandas.py +21 -12
- pixeltable/io/parquet.py +25 -105
- pixeltable/io/table_data_conduit.py +250 -123
- pixeltable/io/utils.py +4 -4
- pixeltable/iterators/__init__.py +2 -1
- pixeltable/iterators/audio.py +26 -25
- pixeltable/iterators/base.py +9 -3
- pixeltable/iterators/document.py +112 -78
- pixeltable/iterators/image.py +12 -15
- pixeltable/iterators/string.py +11 -4
- pixeltable/iterators/video.py +523 -120
- pixeltable/metadata/__init__.py +14 -3
- pixeltable/metadata/converters/convert_13.py +2 -2
- pixeltable/metadata/converters/convert_18.py +2 -2
- pixeltable/metadata/converters/convert_19.py +2 -2
- pixeltable/metadata/converters/convert_20.py +2 -2
- pixeltable/metadata/converters/convert_21.py +2 -2
- pixeltable/metadata/converters/convert_22.py +2 -2
- pixeltable/metadata/converters/convert_24.py +2 -2
- pixeltable/metadata/converters/convert_25.py +2 -2
- pixeltable/metadata/converters/convert_26.py +2 -2
- pixeltable/metadata/converters/convert_29.py +4 -4
- pixeltable/metadata/converters/convert_30.py +34 -21
- pixeltable/metadata/converters/convert_34.py +2 -2
- pixeltable/metadata/converters/convert_35.py +9 -0
- pixeltable/metadata/converters/convert_36.py +38 -0
- pixeltable/metadata/converters/convert_37.py +15 -0
- pixeltable/metadata/converters/convert_38.py +39 -0
- pixeltable/metadata/converters/convert_39.py +124 -0
- pixeltable/metadata/converters/convert_40.py +73 -0
- pixeltable/metadata/converters/convert_41.py +12 -0
- pixeltable/metadata/converters/convert_42.py +9 -0
- pixeltable/metadata/converters/convert_43.py +44 -0
- pixeltable/metadata/converters/util.py +20 -31
- pixeltable/metadata/notes.py +9 -0
- pixeltable/metadata/schema.py +140 -53
- pixeltable/metadata/utils.py +74 -0
- pixeltable/mypy/__init__.py +3 -0
- pixeltable/mypy/mypy_plugin.py +123 -0
- pixeltable/plan.py +382 -115
- pixeltable/share/__init__.py +1 -1
- pixeltable/share/packager.py +547 -83
- pixeltable/share/protocol/__init__.py +33 -0
- pixeltable/share/protocol/common.py +165 -0
- pixeltable/share/protocol/operation_types.py +33 -0
- pixeltable/share/protocol/replica.py +119 -0
- pixeltable/share/publish.py +257 -59
- pixeltable/store.py +311 -194
- pixeltable/type_system.py +373 -211
- pixeltable/utils/__init__.py +2 -3
- pixeltable/utils/arrow.py +131 -17
- pixeltable/utils/av.py +298 -0
- pixeltable/utils/azure_store.py +346 -0
- pixeltable/utils/coco.py +6 -6
- pixeltable/utils/code.py +3 -3
- pixeltable/utils/console_output.py +4 -1
- pixeltable/utils/coroutine.py +6 -23
- pixeltable/utils/dbms.py +32 -6
- pixeltable/utils/description_helper.py +4 -5
- pixeltable/utils/documents.py +7 -18
- pixeltable/utils/exception_handler.py +7 -30
- pixeltable/utils/filecache.py +6 -6
- pixeltable/utils/formatter.py +86 -48
- pixeltable/utils/gcs_store.py +295 -0
- pixeltable/utils/http.py +133 -0
- pixeltable/utils/http_server.py +2 -3
- pixeltable/utils/iceberg.py +1 -2
- pixeltable/utils/image.py +17 -0
- pixeltable/utils/lancedb.py +90 -0
- pixeltable/utils/local_store.py +322 -0
- pixeltable/utils/misc.py +5 -0
- pixeltable/utils/object_stores.py +573 -0
- pixeltable/utils/pydantic.py +60 -0
- pixeltable/utils/pytorch.py +5 -6
- pixeltable/utils/s3_store.py +527 -0
- pixeltable/utils/sql.py +26 -0
- pixeltable/utils/system.py +30 -0
- pixeltable-0.5.7.dist-info/METADATA +579 -0
- pixeltable-0.5.7.dist-info/RECORD +227 -0
- {pixeltable-0.3.14.dist-info → pixeltable-0.5.7.dist-info}/WHEEL +1 -1
- pixeltable-0.5.7.dist-info/entry_points.txt +2 -0
- pixeltable/__version__.py +0 -3
- pixeltable/catalog/named_function.py +0 -40
- pixeltable/ext/__init__.py +0 -17
- pixeltable/ext/functions/__init__.py +0 -11
- pixeltable/ext/functions/whisperx.py +0 -77
- pixeltable/utils/media_store.py +0 -77
- pixeltable/utils/s3.py +0 -17
- pixeltable-0.3.14.dist-info/METADATA +0 -434
- pixeltable-0.3.14.dist-info/RECORD +0 -186
- pixeltable-0.3.14.dist-info/entry_points.txt +0 -3
- {pixeltable-0.3.14.dist-info → pixeltable-0.5.7.dist-info/licenses}/LICENSE +0 -0
pixeltable/io/parquet.py
CHANGED
|
@@ -1,158 +1,78 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
-
import datetime
|
|
4
|
-
import io
|
|
5
3
|
import json
|
|
6
4
|
import logging
|
|
7
5
|
import typing
|
|
8
|
-
from collections import deque
|
|
9
6
|
from pathlib import Path
|
|
10
|
-
from typing import Any
|
|
11
|
-
|
|
12
|
-
import numpy as np
|
|
13
|
-
import PIL.Image
|
|
7
|
+
from typing import Any
|
|
14
8
|
|
|
15
9
|
import pixeltable as pxt
|
|
16
10
|
import pixeltable.exceptions as excs
|
|
17
|
-
from pixeltable.
|
|
11
|
+
from pixeltable.catalog import Catalog
|
|
18
12
|
from pixeltable.utils.transactional_directory import transactional_directory
|
|
19
13
|
|
|
20
14
|
if typing.TYPE_CHECKING:
|
|
21
|
-
import pyarrow as pa
|
|
22
|
-
|
|
23
15
|
import pixeltable as pxt
|
|
24
16
|
|
|
25
17
|
_logger = logging.getLogger('pixeltable')
|
|
26
18
|
|
|
27
19
|
|
|
28
|
-
def _write_batch(value_batch: dict[str, deque], schema: pa.Schema, output_path: Path) -> None:
|
|
29
|
-
import pyarrow as pa
|
|
30
|
-
from pyarrow import parquet
|
|
31
|
-
|
|
32
|
-
pydict = {}
|
|
33
|
-
for field in schema:
|
|
34
|
-
if isinstance(field.type, pa.FixedShapeTensorType):
|
|
35
|
-
stacked_arr = np.stack(value_batch[field.name])
|
|
36
|
-
pydict[field.name] = pa.FixedShapeTensorArray.from_numpy_ndarray(stacked_arr)
|
|
37
|
-
else:
|
|
38
|
-
pydict[field.name] = value_batch[field.name]
|
|
39
|
-
|
|
40
|
-
tab = pa.Table.from_pydict(pydict, schema=schema)
|
|
41
|
-
parquet.write_table(tab, str(output_path))
|
|
42
|
-
|
|
43
|
-
|
|
44
20
|
def export_parquet(
|
|
45
|
-
|
|
21
|
+
table_or_query: pxt.Table | pxt.Query,
|
|
46
22
|
parquet_path: Path,
|
|
47
23
|
partition_size_bytes: int = 100_000_000,
|
|
48
24
|
inline_images: bool = False,
|
|
49
25
|
) -> None:
|
|
50
26
|
"""
|
|
51
|
-
Exports a
|
|
27
|
+
Exports a Query's data to one or more Parquet files. Requires pyarrow to be installed.
|
|
52
28
|
|
|
53
29
|
It additionally writes the pixeltable metadata in a json file, which would otherwise
|
|
54
30
|
not be available in the parquet format.
|
|
55
31
|
|
|
56
32
|
Args:
|
|
57
|
-
|
|
33
|
+
table_or_query : Table or Query to export.
|
|
58
34
|
parquet_path : Path to directory to write the parquet files to.
|
|
59
35
|
partition_size_bytes : The maximum target size for each chunk. Default 100_000_000 bytes.
|
|
60
36
|
inline_images : If True, images are stored inline in the parquet file. This is useful
|
|
61
37
|
for small images, to be imported as pytorch dataset. But can be inefficient
|
|
62
38
|
for large images, and cannot be imported into pixeltable.
|
|
63
|
-
If False, will raise an error if the
|
|
39
|
+
If False, will raise an error if the Query has any image column.
|
|
64
40
|
Default False.
|
|
65
41
|
"""
|
|
66
|
-
|
|
42
|
+
import pyarrow as pa
|
|
67
43
|
|
|
68
|
-
|
|
69
|
-
if isinstance(table_or_df, pxt.catalog.Table):
|
|
70
|
-
df = table_or_df._df()
|
|
71
|
-
else:
|
|
72
|
-
df = table_or_df
|
|
44
|
+
from pixeltable.utils.arrow import to_record_batches
|
|
73
45
|
|
|
74
|
-
|
|
75
|
-
|
|
46
|
+
query: pxt.Query
|
|
47
|
+
if isinstance(table_or_query, pxt.catalog.Table):
|
|
48
|
+
query = table_or_query.select()
|
|
49
|
+
else:
|
|
50
|
+
query = table_or_query
|
|
76
51
|
|
|
77
|
-
if not inline_images and any(col_type.is_image_type() for col_type in
|
|
78
|
-
raise excs.Error('Cannot export
|
|
52
|
+
if not inline_images and any(col_type.is_image_type() for col_type in query.schema.values()):
|
|
53
|
+
raise excs.Error('Cannot export Query with image columns when inline_images is False')
|
|
79
54
|
|
|
80
55
|
# store the changes atomically
|
|
81
56
|
with transactional_directory(parquet_path) as temp_path:
|
|
82
57
|
# dump metadata json file so we can inspect what was the source of the parquet file later on.
|
|
83
|
-
json.dump(
|
|
58
|
+
json.dump(query.as_dict(), (temp_path / '.pixeltable.json').open('w'))
|
|
59
|
+
type_dict = {k: v.as_dict() for k, v in query.schema.items()}
|
|
84
60
|
json.dump(type_dict, (temp_path / '.pixeltable.column_types.json').open('w')) # keep type metadata
|
|
85
|
-
|
|
86
61
|
batch_num = 0
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
val = data_row[e.slot_idx]
|
|
94
|
-
if val is None:
|
|
95
|
-
current_value_batch[col_name].append(val)
|
|
96
|
-
continue
|
|
97
|
-
|
|
98
|
-
assert val is not None
|
|
99
|
-
if col_type.is_image_type():
|
|
100
|
-
# images get inlined into the parquet file
|
|
101
|
-
if data_row.file_paths is not None and data_row.file_paths[e.slot_idx] is not None:
|
|
102
|
-
# if there is a file, read directly to preserve information
|
|
103
|
-
with open(data_row.file_paths[e.slot_idx], 'rb') as f:
|
|
104
|
-
val = f.read()
|
|
105
|
-
elif isinstance(val, PIL.Image.Image):
|
|
106
|
-
# if no file available, eg. bc it is computed, convert to png
|
|
107
|
-
buf = io.BytesIO()
|
|
108
|
-
val.save(buf, format='PNG')
|
|
109
|
-
val = buf.getvalue()
|
|
110
|
-
else:
|
|
111
|
-
raise excs.Error(f'unknown image type {type(val)}')
|
|
112
|
-
length = len(val)
|
|
113
|
-
elif col_type.is_string_type():
|
|
114
|
-
length = len(val)
|
|
115
|
-
elif col_type.is_video_type():
|
|
116
|
-
if data_row.file_paths is not None and data_row.file_paths[e.slot_idx] is not None:
|
|
117
|
-
val = data_row.file_paths[e.slot_idx]
|
|
118
|
-
else:
|
|
119
|
-
raise excs.Error(f'unknown video type {type(val)}')
|
|
120
|
-
length = len(val)
|
|
121
|
-
elif col_type.is_json_type():
|
|
122
|
-
val = json.dumps(val)
|
|
123
|
-
length = len(val)
|
|
124
|
-
elif col_type.is_array_type():
|
|
125
|
-
length = val.nbytes
|
|
126
|
-
elif col_type.is_int_type() or col_type.is_float_type():
|
|
127
|
-
length = 8
|
|
128
|
-
elif col_type.is_bool_type():
|
|
129
|
-
length = 1
|
|
130
|
-
elif col_type.is_date_type():
|
|
131
|
-
length = 4
|
|
132
|
-
elif col_type.is_timestamp_type():
|
|
133
|
-
val = val.astimezone(datetime.timezone.utc)
|
|
134
|
-
length = 8
|
|
135
|
-
else:
|
|
136
|
-
raise excs.Error(f'unknown type {col_type} for {col_name}')
|
|
137
|
-
|
|
138
|
-
current_value_batch[col_name].append(val)
|
|
139
|
-
current_byte_estimate += length
|
|
140
|
-
if current_byte_estimate > partition_size_bytes:
|
|
141
|
-
assert batch_num < 100_000, 'wrote too many parquet files, unclear ordering'
|
|
142
|
-
_write_batch(current_value_batch, arrow_schema, temp_path / f'part-{batch_num:05d}.parquet')
|
|
143
|
-
batch_num += 1
|
|
144
|
-
current_value_batch = {k: deque() for k in df.schema}
|
|
145
|
-
current_byte_estimate = 0
|
|
146
|
-
|
|
147
|
-
_write_batch(current_value_batch, arrow_schema, temp_path / f'part-{batch_num:05d}.parquet')
|
|
62
|
+
with Catalog.get().begin_xact(for_write=False):
|
|
63
|
+
for record_batch in to_record_batches(query, partition_size_bytes):
|
|
64
|
+
output_path = temp_path / f'part-{batch_num:05d}.parquet'
|
|
65
|
+
arrow_tbl = pa.Table.from_batches([record_batch])
|
|
66
|
+
pa.parquet.write_table(arrow_tbl, str(output_path))
|
|
67
|
+
batch_num += 1
|
|
148
68
|
|
|
149
69
|
|
|
150
70
|
def import_parquet(
|
|
151
71
|
table: str,
|
|
152
72
|
*,
|
|
153
73
|
parquet_path: str,
|
|
154
|
-
schema_overrides:
|
|
155
|
-
primary_key:
|
|
74
|
+
schema_overrides: dict[str, Any] | None = None,
|
|
75
|
+
primary_key: str | list[str] | None = None,
|
|
156
76
|
**kwargs: Any,
|
|
157
77
|
) -> pxt.Table:
|
|
158
78
|
"""Creates a new base table from a Parquet file or set of files. Requires pyarrow to be installed.
|