pixeltable 0.2.26__py3-none-any.whl → 0.5.7__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
- pixeltable/__init__.py +83 -19
- pixeltable/_query.py +1444 -0
- pixeltable/_version.py +1 -0
- pixeltable/catalog/__init__.py +7 -4
- pixeltable/catalog/catalog.py +2394 -119
- pixeltable/catalog/column.py +225 -104
- pixeltable/catalog/dir.py +38 -9
- pixeltable/catalog/globals.py +53 -34
- pixeltable/catalog/insertable_table.py +265 -115
- pixeltable/catalog/path.py +80 -17
- pixeltable/catalog/schema_object.py +28 -43
- pixeltable/catalog/table.py +1270 -677
- pixeltable/catalog/table_metadata.py +103 -0
- pixeltable/catalog/table_version.py +1270 -751
- pixeltable/catalog/table_version_handle.py +109 -0
- pixeltable/catalog/table_version_path.py +137 -42
- pixeltable/catalog/tbl_ops.py +53 -0
- pixeltable/catalog/update_status.py +191 -0
- pixeltable/catalog/view.py +251 -134
- pixeltable/config.py +215 -0
- pixeltable/env.py +736 -285
- pixeltable/exceptions.py +26 -2
- pixeltable/exec/__init__.py +7 -2
- pixeltable/exec/aggregation_node.py +39 -21
- pixeltable/exec/cache_prefetch_node.py +87 -109
- pixeltable/exec/cell_materialization_node.py +268 -0
- pixeltable/exec/cell_reconstruction_node.py +168 -0
- pixeltable/exec/component_iteration_node.py +25 -28
- pixeltable/exec/data_row_batch.py +11 -46
- pixeltable/exec/exec_context.py +26 -11
- pixeltable/exec/exec_node.py +35 -27
- pixeltable/exec/expr_eval/__init__.py +3 -0
- pixeltable/exec/expr_eval/evaluators.py +365 -0
- pixeltable/exec/expr_eval/expr_eval_node.py +413 -0
- pixeltable/exec/expr_eval/globals.py +200 -0
- pixeltable/exec/expr_eval/row_buffer.py +74 -0
- pixeltable/exec/expr_eval/schedulers.py +413 -0
- pixeltable/exec/globals.py +35 -0
- pixeltable/exec/in_memory_data_node.py +35 -27
- pixeltable/exec/object_store_save_node.py +293 -0
- pixeltable/exec/row_update_node.py +44 -29
- pixeltable/exec/sql_node.py +414 -115
- pixeltable/exprs/__init__.py +8 -5
- pixeltable/exprs/arithmetic_expr.py +79 -45
- pixeltable/exprs/array_slice.py +5 -5
- pixeltable/exprs/column_property_ref.py +40 -26
- pixeltable/exprs/column_ref.py +254 -61
- pixeltable/exprs/comparison.py +14 -9
- pixeltable/exprs/compound_predicate.py +9 -10
- pixeltable/exprs/data_row.py +213 -72
- pixeltable/exprs/expr.py +270 -104
- pixeltable/exprs/expr_dict.py +6 -5
- pixeltable/exprs/expr_set.py +20 -11
- pixeltable/exprs/function_call.py +383 -284
- pixeltable/exprs/globals.py +18 -5
- pixeltable/exprs/in_predicate.py +7 -7
- pixeltable/exprs/inline_expr.py +37 -37
- pixeltable/exprs/is_null.py +8 -4
- pixeltable/exprs/json_mapper.py +120 -54
- pixeltable/exprs/json_path.py +90 -60
- pixeltable/exprs/literal.py +61 -16
- pixeltable/exprs/method_ref.py +7 -6
- pixeltable/exprs/object_ref.py +19 -8
- pixeltable/exprs/row_builder.py +238 -75
- pixeltable/exprs/rowid_ref.py +53 -15
- pixeltable/exprs/similarity_expr.py +65 -50
- pixeltable/exprs/sql_element_cache.py +5 -5
- pixeltable/exprs/string_op.py +107 -0
- pixeltable/exprs/type_cast.py +25 -13
- pixeltable/exprs/variable.py +2 -2
- pixeltable/func/__init__.py +9 -5
- pixeltable/func/aggregate_function.py +197 -92
- pixeltable/func/callable_function.py +119 -35
- pixeltable/func/expr_template_function.py +101 -48
- pixeltable/func/function.py +375 -62
- pixeltable/func/function_registry.py +20 -19
- pixeltable/func/globals.py +6 -5
- pixeltable/func/mcp.py +74 -0
- pixeltable/func/query_template_function.py +151 -35
- pixeltable/func/signature.py +178 -49
- pixeltable/func/tools.py +164 -0
- pixeltable/func/udf.py +176 -53
- pixeltable/functions/__init__.py +44 -4
- pixeltable/functions/anthropic.py +226 -47
- pixeltable/functions/audio.py +148 -11
- pixeltable/functions/bedrock.py +137 -0
- pixeltable/functions/date.py +188 -0
- pixeltable/functions/deepseek.py +113 -0
- pixeltable/functions/document.py +81 -0
- pixeltable/functions/fal.py +76 -0
- pixeltable/functions/fireworks.py +72 -20
- pixeltable/functions/gemini.py +249 -0
- pixeltable/functions/globals.py +208 -53
- pixeltable/functions/groq.py +108 -0
- pixeltable/functions/huggingface.py +1088 -95
- pixeltable/functions/image.py +155 -84
- pixeltable/functions/json.py +8 -11
- pixeltable/functions/llama_cpp.py +31 -19
- pixeltable/functions/math.py +169 -0
- pixeltable/functions/mistralai.py +50 -75
- pixeltable/functions/net.py +70 -0
- pixeltable/functions/ollama.py +29 -36
- pixeltable/functions/openai.py +548 -160
- pixeltable/functions/openrouter.py +143 -0
- pixeltable/functions/replicate.py +15 -14
- pixeltable/functions/reve.py +250 -0
- pixeltable/functions/string.py +310 -85
- pixeltable/functions/timestamp.py +37 -19
- pixeltable/functions/together.py +77 -120
- pixeltable/functions/twelvelabs.py +188 -0
- pixeltable/functions/util.py +7 -2
- pixeltable/functions/uuid.py +30 -0
- pixeltable/functions/video.py +1528 -117
- pixeltable/functions/vision.py +26 -26
- pixeltable/functions/voyageai.py +289 -0
- pixeltable/functions/whisper.py +19 -10
- pixeltable/functions/whisperx.py +179 -0
- pixeltable/functions/yolox.py +112 -0
- pixeltable/globals.py +716 -236
- pixeltable/index/__init__.py +3 -1
- pixeltable/index/base.py +17 -21
- pixeltable/index/btree.py +32 -22
- pixeltable/index/embedding_index.py +155 -92
- pixeltable/io/__init__.py +12 -7
- pixeltable/io/datarows.py +140 -0
- pixeltable/io/external_store.py +83 -125
- pixeltable/io/fiftyone.py +24 -33
- pixeltable/io/globals.py +47 -182
- pixeltable/io/hf_datasets.py +96 -127
- pixeltable/io/label_studio.py +171 -156
- pixeltable/io/lancedb.py +3 -0
- pixeltable/io/pandas.py +136 -115
- pixeltable/io/parquet.py +40 -153
- pixeltable/io/table_data_conduit.py +702 -0
- pixeltable/io/utils.py +100 -0
- pixeltable/iterators/__init__.py +8 -4
- pixeltable/iterators/audio.py +207 -0
- pixeltable/iterators/base.py +9 -3
- pixeltable/iterators/document.py +144 -87
- pixeltable/iterators/image.py +17 -38
- pixeltable/iterators/string.py +15 -12
- pixeltable/iterators/video.py +523 -127
- pixeltable/metadata/__init__.py +33 -8
- pixeltable/metadata/converters/convert_10.py +2 -3
- pixeltable/metadata/converters/convert_13.py +2 -2
- pixeltable/metadata/converters/convert_15.py +15 -11
- pixeltable/metadata/converters/convert_16.py +4 -5
- pixeltable/metadata/converters/convert_17.py +4 -5
- pixeltable/metadata/converters/convert_18.py +4 -6
- pixeltable/metadata/converters/convert_19.py +6 -9
- pixeltable/metadata/converters/convert_20.py +3 -6
- pixeltable/metadata/converters/convert_21.py +6 -8
- pixeltable/metadata/converters/convert_22.py +3 -2
- pixeltable/metadata/converters/convert_23.py +33 -0
- pixeltable/metadata/converters/convert_24.py +55 -0
- pixeltable/metadata/converters/convert_25.py +19 -0
- pixeltable/metadata/converters/convert_26.py +23 -0
- pixeltable/metadata/converters/convert_27.py +29 -0
- pixeltable/metadata/converters/convert_28.py +13 -0
- pixeltable/metadata/converters/convert_29.py +110 -0
- pixeltable/metadata/converters/convert_30.py +63 -0
- pixeltable/metadata/converters/convert_31.py +11 -0
- pixeltable/metadata/converters/convert_32.py +15 -0
- pixeltable/metadata/converters/convert_33.py +17 -0
- pixeltable/metadata/converters/convert_34.py +21 -0
- pixeltable/metadata/converters/convert_35.py +9 -0
- pixeltable/metadata/converters/convert_36.py +38 -0
- pixeltable/metadata/converters/convert_37.py +15 -0
- pixeltable/metadata/converters/convert_38.py +39 -0
- pixeltable/metadata/converters/convert_39.py +124 -0
- pixeltable/metadata/converters/convert_40.py +73 -0
- pixeltable/metadata/converters/convert_41.py +12 -0
- pixeltable/metadata/converters/convert_42.py +9 -0
- pixeltable/metadata/converters/convert_43.py +44 -0
- pixeltable/metadata/converters/util.py +44 -18
- pixeltable/metadata/notes.py +21 -0
- pixeltable/metadata/schema.py +185 -42
- pixeltable/metadata/utils.py +74 -0
- pixeltable/mypy/__init__.py +3 -0
- pixeltable/mypy/mypy_plugin.py +123 -0
- pixeltable/plan.py +616 -225
- pixeltable/share/__init__.py +3 -0
- pixeltable/share/packager.py +797 -0
- pixeltable/share/protocol/__init__.py +33 -0
- pixeltable/share/protocol/common.py +165 -0
- pixeltable/share/protocol/operation_types.py +33 -0
- pixeltable/share/protocol/replica.py +119 -0
- pixeltable/share/publish.py +349 -0
- pixeltable/store.py +398 -232
- pixeltable/type_system.py +730 -267
- pixeltable/utils/__init__.py +40 -0
- pixeltable/utils/arrow.py +201 -29
- pixeltable/utils/av.py +298 -0
- pixeltable/utils/azure_store.py +346 -0
- pixeltable/utils/coco.py +26 -27
- pixeltable/utils/code.py +4 -4
- pixeltable/utils/console_output.py +46 -0
- pixeltable/utils/coroutine.py +24 -0
- pixeltable/utils/dbms.py +92 -0
- pixeltable/utils/description_helper.py +11 -12
- pixeltable/utils/documents.py +60 -61
- pixeltable/utils/exception_handler.py +36 -0
- pixeltable/utils/filecache.py +38 -22
- pixeltable/utils/formatter.py +88 -51
- pixeltable/utils/gcs_store.py +295 -0
- pixeltable/utils/http.py +133 -0
- pixeltable/utils/http_server.py +14 -13
- pixeltable/utils/iceberg.py +13 -0
- pixeltable/utils/image.py +17 -0
- pixeltable/utils/lancedb.py +90 -0
- pixeltable/utils/local_store.py +322 -0
- pixeltable/utils/misc.py +5 -0
- pixeltable/utils/object_stores.py +573 -0
- pixeltable/utils/pydantic.py +60 -0
- pixeltable/utils/pytorch.py +20 -20
- pixeltable/utils/s3_store.py +527 -0
- pixeltable/utils/sql.py +32 -5
- pixeltable/utils/system.py +30 -0
- pixeltable/utils/transactional_directory.py +4 -3
- pixeltable-0.5.7.dist-info/METADATA +579 -0
- pixeltable-0.5.7.dist-info/RECORD +227 -0
- {pixeltable-0.2.26.dist-info → pixeltable-0.5.7.dist-info}/WHEEL +1 -1
- pixeltable-0.5.7.dist-info/entry_points.txt +2 -0
- pixeltable/__version__.py +0 -3
- pixeltable/catalog/named_function.py +0 -36
- pixeltable/catalog/path_dict.py +0 -141
- pixeltable/dataframe.py +0 -894
- pixeltable/exec/expr_eval_node.py +0 -232
- pixeltable/ext/__init__.py +0 -14
- pixeltable/ext/functions/__init__.py +0 -8
- pixeltable/ext/functions/whisperx.py +0 -77
- pixeltable/ext/functions/yolox.py +0 -157
- pixeltable/tool/create_test_db_dump.py +0 -311
- pixeltable/tool/create_test_video.py +0 -81
- pixeltable/tool/doc_plugins/griffe.py +0 -50
- pixeltable/tool/doc_plugins/mkdocstrings.py +0 -6
- pixeltable/tool/doc_plugins/templates/material/udf.html.jinja +0 -135
- pixeltable/tool/embed_udf.py +0 -9
- pixeltable/tool/mypy_plugin.py +0 -55
- pixeltable/utils/media_store.py +0 -76
- pixeltable/utils/s3.py +0 -16
- pixeltable-0.2.26.dist-info/METADATA +0 -400
- pixeltable-0.2.26.dist-info/RECORD +0 -156
- pixeltable-0.2.26.dist-info/entry_points.txt +0 -3
- {pixeltable-0.2.26.dist-info → pixeltable-0.5.7.dist-info/licenses}/LICENSE +0 -0
pixeltable/io/lancedb.py
ADDED
pixeltable/io/pandas.py
CHANGED
@@ -1,18 +1,26 @@
-
+import os
+import uuid
+from typing import Any

 import numpy as np
 import pandas as pd
+from pandas._typing import DtypeObj  # For pandas dtype type hints
+from pandas.api.types import is_datetime64_any_dtype, is_extension_array_dtype

 import pixeltable as pxt
 import pixeltable.exceptions as excs
 import pixeltable.type_system as ts
+from pixeltable.env import Env


 def import_pandas(
-    tbl_name: str,
-
+    tbl_name: str,
+    df: pd.DataFrame,
+    *,
+    schema_overrides: dict[str, Any] | None = None,
+    primary_key: str | list[str] | None = None,
     num_retained_versions: int = 10,
-    comment: str = ''
+    comment: str = '',
 ) -> pxt.Table:
     """Creates a new base table from a Pandas
     [`DataFrame`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html), with the
@@ -36,26 +44,24 @@ def import_pandas(
     Returns:
         A handle to the newly created [`Table`][pixeltable.Table].
     """
-
-
-
-
-
-
-
-
-    tbl_rows = (dict(__df_row_to_pxt_row(row, schema)) for row in df.itertuples())
-    table = pxt.create_table(tbl_name, schema, primary_key=pxt_pk, num_retained_versions=num_retained_versions, comment=comment)
-    table.insert(tbl_rows)
-    return table
+    return pxt.create_table(
+        tbl_name,
+        source=df,
+        schema_overrides=schema_overrides,
+        primary_key=primary_key,
+        num_retained_versions=num_retained_versions,
+        comment=comment,
+    )


 def import_csv(
-    tbl_name: str,
-
+    tbl_name: str,
+    filepath_or_buffer: str | os.PathLike,
+    schema_overrides: dict[str, Any] | None = None,
+    primary_key: str | list[str] | None = None,
     num_retained_versions: int = 10,
     comment: str = '',
-    **kwargs
+    **kwargs: Any,
 ) -> pxt.Table:
     """
     Creates a new base table from a csv file. This is a convenience method and is equivalent
@@ -66,16 +72,26 @@ def import_csv(
     Returns:
         A handle to the newly created [`Table`][pixeltable.Table].
     """
-
-
+    return pxt.create_table(
+        tbl_name,
+        source=filepath_or_buffer,
+        schema_overrides=schema_overrides,
+        primary_key=primary_key,
+        num_retained_versions=num_retained_versions,
+        comment=comment,
+        extra_args=kwargs,
+    )


 def import_excel(
-    tbl_name: str,
-
+    tbl_name: str,
+    io: str | os.PathLike,
+    *,
+    schema_overrides: dict[str, Any] | None = None,
+    primary_key: str | list[str] | None = None,
     num_retained_versions: int = 10,
     comment: str = '',
-    **kwargs
+    **kwargs: Any,
 ) -> pxt.Table:
     """
     Creates a new base table from an Excel (.xlsx) file. This is a convenience method and is
@@ -86,97 +102,77 @@ def import_excel(
     Returns:
         A handle to the newly created [`Table`][pixeltable.Table].
     """
-
-
+    return pxt.create_table(
+        tbl_name,
+        source=io,
+        schema_overrides=schema_overrides,
+        primary_key=primary_key,
+        num_retained_versions=num_retained_versions,
+        comment=comment,
+        extra_args=kwargs,
+    )
+
+
+def _df_check_primary_key_values(df: pd.DataFrame, primary_key: list[str]) -> None:
+    for pd_name in primary_key:
+        # This can be faster for large DataFrames
+        has_nulls = df[pd_name].count() < len(df)
+        if has_nulls:
+            raise excs.Error(f'Primary key column `{pd_name}` cannot contain null values.')


-def
-    df: pd.DataFrame, schema_overrides: dict[str,
-) ->
+def df_infer_schema(
+    df: pd.DataFrame, schema_overrides: dict[str, ts.ColumnType], primary_key: list[str]
+) -> dict[str, ts.ColumnType]:
     """
     Infers a Pixeltable schema from a Pandas DataFrame.

     Returns:
         A tuple containing a Pixeltable schema and a list of primary key column names.
     """
-
-        if pd_name not in df.columns:
-            raise excs.Error(
-                f'Column `{pd_name}` specified in `schema_overrides` does not exist in the given `DataFrame`.'
-            )
-    for pd_name in primary_key:
-        if pd_name not in df.columns:
-            raise excs.Error(f'Primary key column `{pd_name}` does not exist in the given `DataFrame`.')
-
-    schema: dict[str, pxt.ColumnType] = {}
-    col_mapping: dict[str, str] = {}  # Maps Pandas column names to Pixeltable column names
-
+    pd_schema: dict[str, ts.ColumnType] = {}
     for pd_name, pd_dtype in zip(df.columns, df.dtypes):
         if pd_name in schema_overrides:
+            assert isinstance(schema_overrides[pd_name], ts.ColumnType)
             pxt_type = schema_overrides[pd_name]
         else:
-
-
-            # isinstance(val, float) will check for NaN values in float columns *as well as* floats appearing
-            # in object columns (where Pandas uses NaN as a general null).
-            # np.issubdtype(pd_dtype, np.datetime64) checks for NaT values specifically in datetime columns.
-            has_na = any(
-                (isinstance(val, float) or np.issubdtype(pd_dtype, np.datetime64)) and pd.isna(val)
-                for val in df[pd_name]
-            )
-            if has_na and pd_name in primary_key:
-                raise excs.Error(f'Primary key column `{pd_name}` cannot contain null values.')
-            pxt_type = __np_dtype_to_pxt_type(pd_dtype, df[pd_name], pd_name not in primary_key)
-        pxt_name = __normalize_pxt_col_name(pd_name)
-        # Ensure that column names are unique by appending a distinguishing suffix
-        # to any collisions
-        if pxt_name in schema:
-            n = 2
-            while f'{pxt_name}_{n}' in schema:
-                n += 1
-            pxt_name = f'{pxt_name}_{n}'
-        schema[pxt_name] = pxt_type
-        col_mapping[pd_name] = pxt_name
-
-    pxt_pk = [col_mapping[pk] for pk in primary_key]
-    return schema, pxt_pk
-
-
-def __normalize_pxt_col_name(pd_name: str) -> str:
-    """
-    Normalizes an arbitrary DataFrame column name into a valid Pixeltable identifier by:
-    - replacing any non-ascii or non-alphanumeric characters with an underscore _
-    - prefixing the result with the letter 'c' if it starts with an underscore or a number
-    """
-    id = ''.join(ch if ch.isascii() and ch.isalnum() else '_' for ch in pd_name)
-    if id[0].isnumeric():
-        id = f'c_{id}'
-    elif id[0] == '_':
-        id = f'c{id}'
-    assert pxt.catalog.is_valid_identifier(id), id
-    return id
-
+            pxt_type = __pd_coltype_to_pxt_type(pd_dtype, df[pd_name], pd_name not in primary_key)
+        pd_schema[pd_name] = pxt_type

-
-    """
-    Infers a Pixeltable type based on a Numpy dtype.
-    """
-    if np.issubdtype(np_dtype, np.integer):
-        return pxt.IntType(nullable=nullable)
+    return pd_schema

-    if np.issubdtype(np_dtype, np.floating):
-        return pxt.FloatType(nullable=nullable)

-
-
+def __pd_dtype_to_pxt_type(pd_dtype: DtypeObj, nullable: bool) -> ts.ColumnType | None:
+    """
+    Determines a pixeltable ColumnType from a pandas dtype

-
-
+    Args:
+        pd_dtype: A pandas dtype object

-
-
+    Returns:
+        ts.ColumnType: A pixeltable ColumnType
+    """
+    # Pandas extension arrays / types (Int64, boolean, string[pyarrow], etc.) are not directly
+    # compatible with NumPy dtypes
+    # The timezone-aware datetime64[ns, tz=] dtype is a pandas extension dtype
+    if is_datetime64_any_dtype(pd_dtype):
+        return ts.TimestampType(nullable=nullable)
+    if is_extension_array_dtype(pd_dtype):
+        return None
+    # Most other pandas dtypes are directly NumPy compatible
+    assert isinstance(pd_dtype, np.dtype)
+    return ts.ColumnType.from_np_dtype(pd_dtype, nullable)
+
+
+def __pd_coltype_to_pxt_type(pd_dtype: DtypeObj, data_col: pd.Series, nullable: bool) -> ts.ColumnType:
+    """
+    Infers a Pixeltable type based on a pandas dtype.
+    """
+    pxttype = __pd_dtype_to_pxt_type(pd_dtype, nullable)
+    if pxttype is not None:
+        return pxttype

-    if
+    if pd_dtype == np.object_:
         # The `object_` dtype can mean all sorts of things; see if we can infer the Pixeltable type
         # based on the actual data in `data_col`.
         # First drop any null values (they don't contribute to type inference).
@@ -184,40 +180,65 @@ def __np_dtype_to_pxt_type(np_dtype: np.dtype, data_col: pd.Series, nullable: bo

         if len(data_col) == 0:
             # No non-null values; default to FloatType (the Pandas type of an all-NaN column)
-            return
+            return ts.FloatType(nullable=nullable)

-        inferred_type =
+        inferred_type = ts.ColumnType.infer_common_literal_type(data_col)
         if inferred_type is None:
             # Fallback on StringType if everything else fails
-            return
+            return ts.StringType(nullable=nullable)
         else:
             return inferred_type.copy(nullable=nullable)

-    raise excs.Error(f'Could not infer Pixeltable type of column: {data_col.name} (dtype: {
+    raise excs.Error(f'Could not infer Pixeltable type of column: {data_col.name} (dtype: {pd_dtype})')


-def
-
+def _df_row_to_pxt_row(
+    row: tuple[Any, ...], schema: dict[str, ts.ColumnType], col_mapping: dict[str, str] | None
+) -> dict[str, Any]:
+    """Convert a row to insertable format"""
+    pxt_row: dict[str, Any] = {}
     for val, (col_name, pxt_type) in zip(row[1:], schema.items()):
+        pxt_name = col_mapping.get(col_name, col_name)
+        nval: Any
         if pxt_type.is_float_type():
-
+            nval = float(val)
         elif isinstance(val, float) and np.isnan(val):
             # pandas uses NaN for empty cells, even for types other than float;
             # for any type but a float, convert these to None
-
+            nval = None
         elif pxt_type.is_int_type():
-
+            nval = int(val)
         elif pxt_type.is_bool_type():
-
+            nval = bool(val)
         elif pxt_type.is_string_type():
-
+            nval = str(val)
+        elif pxt_type.is_date_type():
+            if pd.isnull(val):
+                # pandas has the bespoke 'NaT' valud for a missing timestamp
+                # This is not supported by postgres, and must be converted to None
+                nval = None
+            else:
+                nval = pd.Timestamp(val).date()
         elif pxt_type.is_timestamp_type():
             if pd.isnull(val):
-                # pandas has the bespoke 'NaT'
-                #
-
-                val = None
+                # pandas has the bespoke 'NaT' value for a missing timestamp
+                # This is not supported by postgres, and must be converted to None
+                nval = None
             else:
-
-
-
+                tval = pd.Timestamp(val)
+                # pandas supports tz-aware and naive timestamps.
+                if tval.tz is None:
+                    nval = pd.Timestamp(tval).tz_localize(tz=Env.get().default_time_zone)
+                else:
+                    nval = tval.astimezone(Env.get().default_time_zone)
+        elif pxt_type.is_uuid_type():
+            if pd.isnull(val):
+                nval = None
+            elif isinstance(val, uuid.UUID):
+                nval = val
+            else:
+                nval = uuid.UUID(val)
+        else:
+            nval = val
+        pxt_row[pxt_name] = nval
+    return pxt_row
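As the diff above shows, import_pandas(), import_csv(), and import_excel() are now thin wrappers that forward to pxt.create_table() with a source argument. A minimal usage sketch (the table name and DataFrame below are made-up illustrations, assuming the 0.5.x API shown in this diff):

    import pandas as pd
    import pixeltable as pxt

    # hypothetical example data; any DataFrame works
    df = pd.DataFrame({'id': [1, 2], 'name': ['a', 'b']})

    # equivalent to calling the import_pandas() wrapper above:
    # the importer simply forwards to create_table() with source=df
    tbl = pxt.create_table('demo_tbl', source=df, primary_key='id')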
pixeltable/io/parquet.py
CHANGED
@@ -1,168 +1,78 @@
 from __future__ import annotations

-import io
 import json
 import logging
-import random
 import typing
-from collections import deque
 from pathlib import Path
-from typing import Any
-
-import numpy as np
-import PIL.Image
-import datetime
+from typing import Any

 import pixeltable as pxt
-
-
-import pixeltable.type_system as ts
+import pixeltable.exceptions as excs
+from pixeltable.catalog import Catalog
 from pixeltable.utils.transactional_directory import transactional_directory

 if typing.TYPE_CHECKING:
-    import pyarrow as pa
     import pixeltable as pxt

-_logger = logging.getLogger(
-
-
-def _write_batch(value_batch: dict[str, deque], schema: pa.Schema, output_path: Path) -> None:
-    import pyarrow as pa
-    from pyarrow import parquet
-
-    pydict = {}
-    for field in schema:
-        if isinstance(field.type, pa.FixedShapeTensorType):
-            stacked_arr = np.stack(value_batch[field.name])
-            pydict[field.name] = pa.FixedShapeTensorArray.from_numpy_ndarray(stacked_arr)
-        else:
-            pydict[field.name] = value_batch[field.name]
-
-    tab = pa.Table.from_pydict(pydict, schema=schema)
-    parquet.write_table(tab, str(output_path))
+_logger = logging.getLogger('pixeltable')


 def export_parquet(
-
-
-
-
-
+    table_or_query: pxt.Table | pxt.Query,
+    parquet_path: Path,
+    partition_size_bytes: int = 100_000_000,
+    inline_images: bool = False,
+) -> None:
     """
-    Exports a
+    Exports a Query's data to one or more Parquet files. Requires pyarrow to be installed.

     It additionally writes the pixeltable metadata in a json file, which would otherwise
     not be available in the parquet format.

     Args:
-
+        table_or_query : Table or Query to export.
         parquet_path : Path to directory to write the parquet files to.
         partition_size_bytes : The maximum target size for each chunk. Default 100_000_000 bytes.
         inline_images : If True, images are stored inline in the parquet file. This is useful
            for small images, to be imported as pytorch dataset. But can be inefficient
            for large images, and cannot be imported into pixeltable.
-            If False, will raise an error if the
+            If False, will raise an error if the Query has any image column.
            Default False.
     """
-
+    import pyarrow as pa

-
-    if isinstance(table_or_df, pxt.catalog.Table):
-        df = table_or_df._df()
-    else:
-        df = table_or_df
+    from pixeltable.utils.arrow import to_record_batches

-
-
+    query: pxt.Query
+    if isinstance(table_or_query, pxt.catalog.Table):
+        query = table_or_query.select()
+    else:
+        query = table_or_query

-    if not inline_images and any(col_type.is_image_type() for col_type in
-        raise
+    if not inline_images and any(col_type.is_image_type() for col_type in query.schema.values()):
+        raise excs.Error('Cannot export Query with image columns when inline_images is False')

     # store the changes atomically
     with transactional_directory(parquet_path) as temp_path:
         # dump metadata json file so we can inspect what was the source of the parquet file later on.
-        json.dump(
+        json.dump(query.as_dict(), (temp_path / '.pixeltable.json').open('w'))
+        type_dict = {k: v.as_dict() for k, v in query.schema.items()}
         json.dump(type_dict, (temp_path / '.pixeltable.column_types.json').open('w'))  # keep type metadata
-
         batch_num = 0
-
-
-
-
-
-            val = data_row[e.slot_idx]
-            if val is None:
-                current_value_batch[col_name].append(val)
-                continue
-
-            assert val is not None
-            if col_type.is_image_type():
-                # images get inlined into the parquet file
-                if data_row.file_paths is not None and data_row.file_paths[e.slot_idx] is not None:
-                    # if there is a file, read directly to preserve information
-                    with open(data_row.file_paths[e.slot_idx], 'rb') as f:
-                        val = f.read()
-                elif isinstance(val, PIL.Image.Image):
-                    # if no file available, eg. bc it is computed, convert to png
-                    buf = io.BytesIO()
-                    val.save(buf, format='PNG')
-                    val = buf.getvalue()
-                else:
-                    assert False, f'unknown image type {type(val)}'
-                length = len(val)
-            elif col_type.is_string_type():
-                length = len(val)
-            elif col_type.is_video_type():
-                if data_row.file_paths is not None and data_row.file_paths[e.slot_idx] is not None:
-                    val = data_row.file_paths[e.slot_idx]
-                else:
-                    assert False, f'unknown video type {type(val)}'
-                length = len(val)
-            elif col_type.is_json_type():
-                val = json.dumps(val)
-                length = len(val)
-            elif col_type.is_array_type():
-                length = val.nbytes
-            elif col_type.is_int_type():
-                length = 8
-            elif col_type.is_float_type():
-                length = 8
-            elif col_type.is_bool_type():
-                length = 1
-            elif col_type.is_timestamp_type():
-                val = val.astimezone(datetime.timezone.utc)
-                length = 8
-            else:
-                assert False, f'unknown type {col_type} for {col_name}'
-
-            current_value_batch[col_name].append(val)
-            current_byte_estimate += length
-            if current_byte_estimate > partition_size_bytes:
-                assert batch_num < 100_000, 'wrote too many parquet files, unclear ordering'
-                _write_batch(current_value_batch, arrow_schema, temp_path / f'part-{batch_num:05d}.parquet')
+        with Catalog.get().begin_xact(for_write=False):
+            for record_batch in to_record_batches(query, partition_size_bytes):
+                output_path = temp_path / f'part-{batch_num:05d}.parquet'
+                arrow_tbl = pa.Table.from_batches([record_batch])
+                pa.parquet.write_table(arrow_tbl, str(output_path))
                 batch_num += 1
-                current_value_batch = {k: deque() for k in df.schema.keys()}
-                current_byte_estimate = 0
-
-        _write_batch(current_value_batch, arrow_schema, temp_path / f'part-{batch_num:05d}.parquet')
-
-
-def parquet_schema_to_pixeltable_schema(parquet_path: str) -> dict[str, Optional[ts.ColumnType]]:
-    """Generate a default pixeltable schema for the given parquet file. Returns None for unknown types."""
-    from pyarrow import parquet
-
-    from pixeltable.utils.arrow import to_pixeltable_schema
-
-    input_path = Path(parquet_path).expanduser()
-    parquet_dataset = parquet.ParquetDataset(str(input_path))
-    return to_pixeltable_schema(parquet_dataset.schema)


 def import_parquet(
     table: str,
     *,
     parquet_path: str,
-    schema_overrides:
+    schema_overrides: dict[str, Any] | None = None,
+    primary_key: str | list[str] | None = None,
     **kwargs: Any,
 ) -> pxt.Table:
     """Creates a new base table from a Parquet file or set of files. Requires pyarrow to be installed.
@@ -174,41 +84,18 @@ def import_parquet(
            name `name` will be given type `type`, instead of being inferred from the Parquet dataset. The keys in
            `schema_overrides` should be the column names of the Parquet dataset (whether or not they are valid
            Pixeltable identifiers).
+        primary_key: The primary key of the table (see [`create_table()`][pixeltable.create_table]).
         kwargs: Additional arguments to pass to `create_table`.

     Returns:
         A handle to the newly created table.
     """
-
-
-
-
-
-
-
-
-
-    if schema_overrides is None:
-        schema_overrides = {}
-
-    schema.update(schema_overrides)
-    for k, v in schema.items():
-        if v is None:
-            raise exc.Error(f'Could not infer pixeltable type for column {k} from parquet file')
-
-    if table in pxt.list_tables():
-        raise exc.Error(f'Table {table} already exists')
-
-    try:
-        tmp_name = f'{table}_tmp_{random.randint(0, 100000000)}'
-        tab = pxt.create_table(tmp_name, schema, **kwargs)
-        for fragment in parquet_dataset.fragments:  # type: ignore[attr-defined]
-            for batch in fragment.to_batches():
-                dict_batch = list(iter_tuples(batch))
-                tab.insert(dict_batch)
-    except Exception as e:
-        _logger.error(f'Error while inserting Parquet file into table: {e}')
-        raise e
-
-    pxt.move(tmp_name, table)
-    return pxt.get_table(table)
+    value = kwargs.pop('source_format', None)
+    return pxt.create_table(
+        table,
+        source=parquet_path,
+        source_format=value,
+        schema_overrides=schema_overrides,
+        primary_key=primary_key,
+        extra_args=kwargs,
+    )
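Likewise, import_parquet() now defers to pxt.create_table() with the Parquet path as source, and export_parquet() accepts either a Table or a Query. A rough usage sketch (the paths, table name, and primary-key column are placeholders, assuming the 0.5.x API shown in this diff):

    from pathlib import Path

    from pixeltable.io.parquet import export_parquet, import_parquet

    # import: forwards to create_table(source=..., primary_key=..., extra_args=...)
    tbl = import_parquet('events', parquet_path='/data/events.parquet', primary_key='event_id')

    # export: a Table handle is turned into a Query via .select() and written out
    # as part-NNNNN.parquet files under the target directory
    export_parquet(tbl, Path('/tmp/events_parquet'), partition_size_bytes=100_000_000)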