pixeltable 0.3.8__py3-none-any.whl → 0.3.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pixeltable might be problematic. Click here for more details.
- pixeltable/__init__.py +1 -2
- pixeltable/__version__.py +2 -2
- pixeltable/catalog/catalog.py +509 -103
- pixeltable/catalog/column.py +5 -0
- pixeltable/catalog/dir.py +15 -6
- pixeltable/catalog/globals.py +16 -0
- pixeltable/catalog/insertable_table.py +82 -41
- pixeltable/catalog/path.py +15 -0
- pixeltable/catalog/schema_object.py +7 -12
- pixeltable/catalog/table.py +81 -67
- pixeltable/catalog/table_version.py +23 -7
- pixeltable/catalog/view.py +9 -6
- pixeltable/env.py +15 -9
- pixeltable/exec/exec_node.py +1 -1
- pixeltable/exprs/__init__.py +2 -1
- pixeltable/exprs/arithmetic_expr.py +2 -0
- pixeltable/exprs/column_ref.py +38 -2
- pixeltable/exprs/expr.py +61 -12
- pixeltable/exprs/function_call.py +1 -4
- pixeltable/exprs/globals.py +12 -0
- pixeltable/exprs/json_mapper.py +4 -4
- pixeltable/exprs/json_path.py +10 -11
- pixeltable/exprs/similarity_expr.py +5 -20
- pixeltable/exprs/string_op.py +107 -0
- pixeltable/ext/functions/yolox.py +21 -64
- pixeltable/func/callable_function.py +5 -2
- pixeltable/func/query_template_function.py +6 -18
- pixeltable/func/tools.py +2 -2
- pixeltable/functions/__init__.py +1 -1
- pixeltable/functions/globals.py +16 -5
- pixeltable/globals.py +172 -262
- pixeltable/io/__init__.py +3 -2
- pixeltable/io/datarows.py +138 -0
- pixeltable/io/external_store.py +8 -5
- pixeltable/io/globals.py +7 -160
- pixeltable/io/hf_datasets.py +21 -98
- pixeltable/io/pandas.py +29 -43
- pixeltable/io/parquet.py +17 -42
- pixeltable/io/table_data_conduit.py +569 -0
- pixeltable/io/utils.py +6 -21
- pixeltable/metadata/__init__.py +1 -1
- pixeltable/metadata/converters/convert_30.py +50 -0
- pixeltable/metadata/converters/util.py +26 -1
- pixeltable/metadata/notes.py +1 -0
- pixeltable/metadata/schema.py +3 -0
- pixeltable/utils/arrow.py +32 -7
- pixeltable/utils/coroutine.py +41 -0
- {pixeltable-0.3.8.dist-info → pixeltable-0.3.10.dist-info}/METADATA +1 -1
- {pixeltable-0.3.8.dist-info → pixeltable-0.3.10.dist-info}/RECORD +52 -47
- {pixeltable-0.3.8.dist-info → pixeltable-0.3.10.dist-info}/WHEEL +1 -1
- {pixeltable-0.3.8.dist-info → pixeltable-0.3.10.dist-info}/LICENSE +0 -0
- {pixeltable-0.3.8.dist-info → pixeltable-0.3.10.dist-info}/entry_points.txt +0 -0
pixeltable/io/pandas.py
CHANGED
|
@@ -7,9 +7,6 @@ from pandas.api.types import is_datetime64_any_dtype, is_extension_array_dtype
|
|
|
7
7
|
|
|
8
8
|
import pixeltable as pxt
|
|
9
9
|
import pixeltable.exceptions as excs
|
|
10
|
-
from pixeltable import Table
|
|
11
|
-
|
|
12
|
-
from .utils import find_or_create_table, normalize_import_parameters, normalize_schema_names
|
|
13
10
|
|
|
14
11
|
|
|
15
12
|
def import_pandas(
|
|
@@ -43,20 +40,14 @@ def import_pandas(
|
|
|
43
40
|
Returns:
|
|
44
41
|
A handle to the newly created [`Table`][pixeltable.Table].
|
|
45
42
|
"""
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
tbl_rows = [__df_row_to_pxt_row(row, pd_schema, col_mapping) for row in df.itertuples()]
|
|
54
|
-
|
|
55
|
-
table = find_or_create_table(
|
|
56
|
-
tbl_name, schema, primary_key=pxt_pk, num_retained_versions=num_retained_versions, comment=comment
|
|
43
|
+
return pxt.create_table(
|
|
44
|
+
tbl_name,
|
|
45
|
+
source=df,
|
|
46
|
+
schema_overrides=schema_overrides,
|
|
47
|
+
primary_key=primary_key,
|
|
48
|
+
num_retained_versions=num_retained_versions,
|
|
49
|
+
comment=comment,
|
|
57
50
|
)
|
|
58
|
-
table.insert(tbl_rows)
|
|
59
|
-
return table
|
|
60
51
|
|
|
61
52
|
|
|
62
53
|
def import_csv(
|
|
@@ -77,14 +68,14 @@ def import_csv(
|
|
|
77
68
|
Returns:
|
|
78
69
|
A handle to the newly created [`Table`][pixeltable.Table].
|
|
79
70
|
"""
|
|
80
|
-
|
|
81
|
-
return import_pandas(
|
|
71
|
+
return pxt.create_table(
|
|
82
72
|
tbl_name,
|
|
83
|
-
|
|
73
|
+
source=filepath_or_buffer,
|
|
84
74
|
schema_overrides=schema_overrides,
|
|
85
75
|
primary_key=primary_key,
|
|
86
76
|
num_retained_versions=num_retained_versions,
|
|
87
77
|
comment=comment,
|
|
78
|
+
extra_args=kwargs,
|
|
88
79
|
)
|
|
89
80
|
|
|
90
81
|
|
|
@@ -107,18 +98,18 @@ def import_excel(
|
|
|
107
98
|
Returns:
|
|
108
99
|
A handle to the newly created [`Table`][pixeltable.Table].
|
|
109
100
|
"""
|
|
110
|
-
|
|
111
|
-
return import_pandas(
|
|
101
|
+
return pxt.create_table(
|
|
112
102
|
tbl_name,
|
|
113
|
-
|
|
103
|
+
source=io,
|
|
114
104
|
schema_overrides=schema_overrides,
|
|
115
105
|
primary_key=primary_key,
|
|
116
106
|
num_retained_versions=num_retained_versions,
|
|
117
107
|
comment=comment,
|
|
108
|
+
extra_args=kwargs,
|
|
118
109
|
)
|
|
119
110
|
|
|
120
111
|
|
|
121
|
-
def
|
|
112
|
+
def _df_check_primary_key_values(df: pd.DataFrame, primary_key: list[str]) -> None:
|
|
122
113
|
for pd_name in primary_key:
|
|
123
114
|
# This can be faster for large DataFrames
|
|
124
115
|
has_nulls = df[pd_name].count() < len(df)
|
|
@@ -146,15 +137,6 @@ def df_infer_schema(
|
|
|
146
137
|
return pd_schema
|
|
147
138
|
|
|
148
139
|
|
|
149
|
-
"""
|
|
150
|
-
# Check if a datetime64[ns, UTC] dtype
|
|
151
|
-
def is_datetime_tz_utc(x: Any) -> bool:
|
|
152
|
-
if isinstance(x, pd.Timestamp) and x.tzinfo is not None and str(x.tzinfo) == 'UTC':
|
|
153
|
-
return True
|
|
154
|
-
return pd.api.types.is_datetime64tz_dtype(x) and str(x).endswith('UTC]')
|
|
155
|
-
"""
|
|
156
|
-
|
|
157
|
-
|
|
158
140
|
def __pd_dtype_to_pxt_type(pd_dtype: DtypeObj, nullable: bool) -> Optional[pxt.ColumnType]:
|
|
159
141
|
"""
|
|
160
142
|
Determines a pixeltable ColumnType from a pandas dtype
|
|
@@ -165,7 +147,8 @@ def __pd_dtype_to_pxt_type(pd_dtype: DtypeObj, nullable: bool) -> Optional[pxt.C
|
|
|
165
147
|
Returns:
|
|
166
148
|
pxt.ColumnType: A pixeltable ColumnType
|
|
167
149
|
"""
|
|
168
|
-
# Pandas extension arrays / types (Int64, boolean, string[pyarrow], etc.) are not directly
|
|
150
|
+
# Pandas extension arrays / types (Int64, boolean, string[pyarrow], etc.) are not directly
|
|
151
|
+
# compatible with NumPy dtypes
|
|
169
152
|
# The timezone-aware datetime64[ns, tz=] dtype is a pandas extension dtype
|
|
170
153
|
if is_datetime64_any_dtype(pd_dtype):
|
|
171
154
|
return pxt.TimestampType(nullable=nullable)
|
|
@@ -204,32 +187,35 @@ def __pd_coltype_to_pxt_type(pd_dtype: DtypeObj, data_col: pd.Series, nullable:
|
|
|
204
187
|
raise excs.Error(f'Could not infer Pixeltable type of column: {data_col.name} (dtype: {pd_dtype})')
|
|
205
188
|
|
|
206
189
|
|
|
207
|
-
def __df_row_to_pxt_row(
|
|
190
|
+
def _df_row_to_pxt_row(
|
|
208
191
|
row: tuple[Any, ...], schema: dict[str, pxt.ColumnType], col_mapping: Optional[dict[str, str]]
|
|
209
192
|
) -> dict[str, Any]:
|
|
210
193
|
"""Convert a row to insertable format"""
|
|
211
194
|
pxt_row: dict[str, Any] = {}
|
|
212
195
|
for val, (col_name, pxt_type) in zip(row[1:], schema.items()):
|
|
196
|
+
pxt_name = col_mapping.get(col_name, col_name)
|
|
197
|
+
nval: Any
|
|
213
198
|
if pxt_type.is_float_type():
|
|
214
|
-
|
|
199
|
+
nval = float(val)
|
|
215
200
|
elif isinstance(val, float) and np.isnan(val):
|
|
216
201
|
# pandas uses NaN for empty cells, even for types other than float;
|
|
217
202
|
# for any type but a float, convert these to None
|
|
218
|
-
|
|
203
|
+
nval = None
|
|
219
204
|
elif pxt_type.is_int_type():
|
|
220
|
-
|
|
205
|
+
nval = int(val)
|
|
221
206
|
elif pxt_type.is_bool_type():
|
|
222
|
-
|
|
207
|
+
nval = bool(val)
|
|
223
208
|
elif pxt_type.is_string_type():
|
|
224
|
-
|
|
209
|
+
nval = str(val)
|
|
225
210
|
elif pxt_type.is_timestamp_type():
|
|
226
211
|
if pd.isnull(val):
|
|
227
212
|
# pandas has the bespoke 'NaT' type for a missing timestamp; postgres is very
|
|
228
213
|
# much not-ok with it. (But if we convert it to None and then load out the
|
|
229
214
|
# table contents as a pandas DataFrame, it will correctly restore the 'NaT'!)
|
|
230
|
-
|
|
215
|
+
nval = None
|
|
231
216
|
else:
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
217
|
+
nval = pd.Timestamp(val).to_pydatetime()
|
|
218
|
+
else:
|
|
219
|
+
nval = val
|
|
220
|
+
pxt_row[pxt_name] = nval
|
|
235
221
|
return pxt_row
|
pixeltable/io/parquet.py
CHANGED
|
@@ -4,7 +4,6 @@ import datetime
|
|
|
4
4
|
import io
|
|
5
5
|
import json
|
|
6
6
|
import logging
|
|
7
|
-
import random
|
|
8
7
|
import typing
|
|
9
8
|
from collections import deque
|
|
10
9
|
from pathlib import Path
|
|
@@ -14,12 +13,10 @@ import numpy as np
|
|
|
14
13
|
import PIL.Image
|
|
15
14
|
|
|
16
15
|
import pixeltable as pxt
|
|
17
|
-
import pixeltable.exceptions as exc
|
|
16
|
+
import pixeltable.exceptions as excs
|
|
18
17
|
from pixeltable.env import Env
|
|
19
18
|
from pixeltable.utils.transactional_directory import transactional_directory
|
|
20
19
|
|
|
21
|
-
from .utils import normalize_import_parameters, normalize_schema_names
|
|
22
|
-
|
|
23
20
|
if typing.TYPE_CHECKING:
|
|
24
21
|
import pyarrow as pa
|
|
25
22
|
|
|
@@ -78,7 +75,7 @@ def export_parquet(
|
|
|
78
75
|
arrow_schema = to_arrow_schema(df.schema)
|
|
79
76
|
|
|
80
77
|
if not inline_images and any(col_type.is_image_type() for col_type in df.schema.values()):
|
|
81
|
-
raise
|
|
78
|
+
raise excs.Error('Cannot export Dataframe with image columns when inline_images is False')
|
|
82
79
|
|
|
83
80
|
# store the changes atomically
|
|
84
81
|
with transactional_directory(parquet_path) as temp_path:
|
|
@@ -87,7 +84,7 @@ def export_parquet(
|
|
|
87
84
|
json.dump(type_dict, (temp_path / '.pixeltable.column_types.json').open('w')) # keep type metadata
|
|
88
85
|
|
|
89
86
|
batch_num = 0
|
|
90
|
-
current_value_batch: dict[str, deque] = {k: deque() for k in df.schema
|
|
87
|
+
current_value_batch: dict[str, deque] = {k: deque() for k in df.schema}
|
|
91
88
|
current_byte_estimate = 0
|
|
92
89
|
|
|
93
90
|
with Env.get().begin_xact():
|
|
@@ -111,7 +108,7 @@ def export_parquet(
|
|
|
111
108
|
val.save(buf, format='PNG')
|
|
112
109
|
val = buf.getvalue()
|
|
113
110
|
else:
|
|
114
|
-
|
|
111
|
+
raise excs.Error(f'unknown image type {type(val)}')
|
|
115
112
|
length = len(val)
|
|
116
113
|
elif col_type.is_string_type():
|
|
117
114
|
length = len(val)
|
|
@@ -119,16 +116,14 @@ def export_parquet(
|
|
|
119
116
|
if data_row.file_paths is not None and data_row.file_paths[e.slot_idx] is not None:
|
|
120
117
|
val = data_row.file_paths[e.slot_idx]
|
|
121
118
|
else:
|
|
122
|
-
|
|
119
|
+
raise excs.Error(f'unknown video type {type(val)}')
|
|
123
120
|
length = len(val)
|
|
124
121
|
elif col_type.is_json_type():
|
|
125
122
|
val = json.dumps(val)
|
|
126
123
|
length = len(val)
|
|
127
124
|
elif col_type.is_array_type():
|
|
128
125
|
length = val.nbytes
|
|
129
|
-
elif col_type.is_int_type():
|
|
130
|
-
length = 8
|
|
131
|
-
elif col_type.is_float_type():
|
|
126
|
+
elif col_type.is_int_type() or col_type.is_float_type():
|
|
132
127
|
length = 8
|
|
133
128
|
elif col_type.is_bool_type():
|
|
134
129
|
length = 1
|
|
@@ -136,7 +131,7 @@ def export_parquet(
|
|
|
136
131
|
val = val.astimezone(datetime.timezone.utc)
|
|
137
132
|
length = 8
|
|
138
133
|
else:
|
|
139
|
-
|
|
134
|
+
raise excs.Error(f'unknown type {col_type} for {col_name}')
|
|
140
135
|
|
|
141
136
|
current_value_batch[col_name].append(val)
|
|
142
137
|
current_byte_estimate += length
|
|
@@ -144,7 +139,7 @@ def export_parquet(
|
|
|
144
139
|
assert batch_num < 100_000, 'wrote too many parquet files, unclear ordering'
|
|
145
140
|
_write_batch(current_value_batch, arrow_schema, temp_path / f'part-{batch_num:05d}.parquet')
|
|
146
141
|
batch_num += 1
|
|
147
|
-
current_value_batch = {k: deque() for k in df.schema
|
|
142
|
+
current_value_batch = {k: deque() for k in df.schema}
|
|
148
143
|
current_byte_estimate = 0
|
|
149
144
|
|
|
150
145
|
_write_batch(current_value_batch, arrow_schema, temp_path / f'part-{batch_num:05d}.parquet')
|
|
@@ -173,32 +168,12 @@ def import_parquet(
|
|
|
173
168
|
Returns:
|
|
174
169
|
A handle to the newly created table.
|
|
175
170
|
"""
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
schema, pxt_pk, col_mapping = normalize_schema_names(ar_schema, primary_key, schema_overrides, False)
|
|
186
|
-
|
|
187
|
-
if table in pxt.list_tables():
|
|
188
|
-
raise exc.Error(f'Table {table} already exists')
|
|
189
|
-
|
|
190
|
-
tmp_name = f'{table}_tmp_{random.randint(0, 100000000)}'
|
|
191
|
-
total_rows = 0
|
|
192
|
-
try:
|
|
193
|
-
tab = pxt.create_table(tmp_name, schema, primary_key=pxt_pk, **kwargs)
|
|
194
|
-
for fragment in parquet_dataset.fragments: # type: ignore[attr-defined]
|
|
195
|
-
for batch in fragment.to_batches():
|
|
196
|
-
dict_batch = list(iter_tuples2(batch, col_mapping, schema))
|
|
197
|
-
total_rows += len(dict_batch)
|
|
198
|
-
tab.insert(dict_batch)
|
|
199
|
-
except Exception as e:
|
|
200
|
-
_logger.error(f'Error after inserting {total_rows} rows from Parquet file into table: {e}')
|
|
201
|
-
raise e
|
|
202
|
-
|
|
203
|
-
pxt.move(tmp_name, table)
|
|
204
|
-
return pxt.get_table(table)
|
|
171
|
+
value = kwargs.pop('source_format', None)
|
|
172
|
+
return pxt.create_table(
|
|
173
|
+
table,
|
|
174
|
+
source=parquet_path,
|
|
175
|
+
source_format=value,
|
|
176
|
+
schema_overrides=schema_overrides,
|
|
177
|
+
primary_key=primary_key,
|
|
178
|
+
extra_args=kwargs,
|
|
179
|
+
)
|