pixeltable 0.3.1__py3-none-any.whl → 0.3.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pixeltable might be problematic. Click here for more details.
- pixeltable/__init__.py +64 -11
- pixeltable/__version__.py +2 -2
- pixeltable/catalog/__init__.py +1 -1
- pixeltable/catalog/catalog.py +50 -27
- pixeltable/catalog/column.py +27 -11
- pixeltable/catalog/dir.py +6 -4
- pixeltable/catalog/globals.py +8 -1
- pixeltable/catalog/insertable_table.py +25 -15
- pixeltable/catalog/named_function.py +10 -6
- pixeltable/catalog/path.py +3 -2
- pixeltable/catalog/path_dict.py +8 -6
- pixeltable/catalog/schema_object.py +2 -1
- pixeltable/catalog/table.py +123 -103
- pixeltable/catalog/table_version.py +292 -143
- pixeltable/catalog/table_version_path.py +8 -5
- pixeltable/catalog/view.py +68 -27
- pixeltable/dataframe.py +102 -72
- pixeltable/env.py +39 -23
- pixeltable/exec/__init__.py +2 -2
- pixeltable/exec/aggregation_node.py +10 -4
- pixeltable/exec/cache_prefetch_node.py +5 -3
- pixeltable/exec/component_iteration_node.py +9 -8
- pixeltable/exec/data_row_batch.py +21 -10
- pixeltable/exec/exec_context.py +10 -3
- pixeltable/exec/exec_node.py +23 -12
- pixeltable/exec/expr_eval/evaluators.py +18 -17
- pixeltable/exec/expr_eval/expr_eval_node.py +29 -16
- pixeltable/exec/expr_eval/globals.py +33 -11
- pixeltable/exec/expr_eval/row_buffer.py +5 -6
- pixeltable/exec/expr_eval/schedulers.py +170 -42
- pixeltable/exec/in_memory_data_node.py +8 -7
- pixeltable/exec/row_update_node.py +15 -5
- pixeltable/exec/sql_node.py +56 -27
- pixeltable/exprs/__init__.py +2 -2
- pixeltable/exprs/arithmetic_expr.py +57 -26
- pixeltable/exprs/array_slice.py +1 -1
- pixeltable/exprs/column_property_ref.py +2 -1
- pixeltable/exprs/column_ref.py +20 -15
- pixeltable/exprs/comparison.py +6 -2
- pixeltable/exprs/compound_predicate.py +1 -3
- pixeltable/exprs/data_row.py +2 -2
- pixeltable/exprs/expr.py +101 -72
- pixeltable/exprs/expr_dict.py +2 -1
- pixeltable/exprs/expr_set.py +3 -1
- pixeltable/exprs/function_call.py +39 -41
- pixeltable/exprs/globals.py +1 -0
- pixeltable/exprs/in_predicate.py +2 -2
- pixeltable/exprs/inline_expr.py +20 -17
- pixeltable/exprs/json_mapper.py +4 -2
- pixeltable/exprs/json_path.py +12 -18
- pixeltable/exprs/literal.py +5 -9
- pixeltable/exprs/method_ref.py +1 -0
- pixeltable/exprs/object_ref.py +1 -1
- pixeltable/exprs/row_builder.py +31 -16
- pixeltable/exprs/rowid_ref.py +14 -5
- pixeltable/exprs/similarity_expr.py +11 -6
- pixeltable/exprs/sql_element_cache.py +1 -1
- pixeltable/exprs/type_cast.py +24 -9
- pixeltable/ext/__init__.py +1 -0
- pixeltable/ext/functions/__init__.py +1 -0
- pixeltable/ext/functions/whisperx.py +2 -2
- pixeltable/ext/functions/yolox.py +11 -11
- pixeltable/func/aggregate_function.py +17 -13
- pixeltable/func/callable_function.py +6 -6
- pixeltable/func/expr_template_function.py +15 -14
- pixeltable/func/function.py +16 -16
- pixeltable/func/function_registry.py +11 -8
- pixeltable/func/globals.py +4 -2
- pixeltable/func/query_template_function.py +12 -13
- pixeltable/func/signature.py +18 -9
- pixeltable/func/tools.py +10 -17
- pixeltable/func/udf.py +106 -11
- pixeltable/functions/__init__.py +21 -2
- pixeltable/functions/anthropic.py +21 -15
- pixeltable/functions/fireworks.py +63 -5
- pixeltable/functions/gemini.py +13 -3
- pixeltable/functions/globals.py +18 -6
- pixeltable/functions/huggingface.py +20 -38
- pixeltable/functions/image.py +7 -3
- pixeltable/functions/json.py +1 -0
- pixeltable/functions/llama_cpp.py +1 -4
- pixeltable/functions/mistralai.py +31 -20
- pixeltable/functions/ollama.py +4 -18
- pixeltable/functions/openai.py +214 -109
- pixeltable/functions/replicate.py +11 -10
- pixeltable/functions/string.py +70 -7
- pixeltable/functions/timestamp.py +21 -8
- pixeltable/functions/together.py +66 -52
- pixeltable/functions/video.py +1 -0
- pixeltable/functions/vision.py +14 -11
- pixeltable/functions/whisper.py +2 -1
- pixeltable/globals.py +61 -28
- pixeltable/index/__init__.py +1 -1
- pixeltable/index/btree.py +5 -3
- pixeltable/index/embedding_index.py +15 -14
- pixeltable/io/__init__.py +1 -1
- pixeltable/io/external_store.py +30 -25
- pixeltable/io/fiftyone.py +6 -14
- pixeltable/io/globals.py +33 -27
- pixeltable/io/hf_datasets.py +3 -2
- pixeltable/io/label_studio.py +80 -71
- pixeltable/io/pandas.py +33 -9
- pixeltable/io/parquet.py +10 -13
- pixeltable/iterators/__init__.py +1 -0
- pixeltable/iterators/audio.py +205 -0
- pixeltable/iterators/document.py +19 -8
- pixeltable/iterators/image.py +6 -24
- pixeltable/iterators/string.py +3 -6
- pixeltable/iterators/video.py +1 -7
- pixeltable/metadata/__init__.py +9 -2
- pixeltable/metadata/converters/convert_10.py +2 -2
- pixeltable/metadata/converters/convert_15.py +1 -5
- pixeltable/metadata/converters/convert_16.py +2 -4
- pixeltable/metadata/converters/convert_17.py +2 -4
- pixeltable/metadata/converters/convert_18.py +2 -4
- pixeltable/metadata/converters/convert_19.py +2 -5
- pixeltable/metadata/converters/convert_20.py +1 -4
- pixeltable/metadata/converters/convert_21.py +4 -6
- pixeltable/metadata/converters/convert_22.py +1 -0
- pixeltable/metadata/converters/convert_23.py +5 -5
- pixeltable/metadata/converters/convert_24.py +12 -13
- pixeltable/metadata/converters/convert_26.py +23 -0
- pixeltable/metadata/converters/util.py +3 -4
- pixeltable/metadata/notes.py +1 -0
- pixeltable/metadata/schema.py +13 -2
- pixeltable/plan.py +173 -98
- pixeltable/store.py +42 -26
- pixeltable/type_system.py +130 -85
- pixeltable/utils/arrow.py +1 -7
- pixeltable/utils/coco.py +16 -17
- pixeltable/utils/code.py +1 -1
- pixeltable/utils/console_output.py +44 -0
- pixeltable/utils/description_helper.py +7 -7
- pixeltable/utils/documents.py +3 -1
- pixeltable/utils/filecache.py +13 -8
- pixeltable/utils/http_server.py +9 -8
- pixeltable/utils/media_store.py +2 -1
- pixeltable/utils/pytorch.py +11 -14
- pixeltable/utils/s3.py +1 -0
- pixeltable/utils/sql.py +1 -0
- pixeltable/utils/transactional_directory.py +2 -2
- {pixeltable-0.3.1.dist-info → pixeltable-0.3.3.dist-info}/METADATA +7 -8
- pixeltable-0.3.3.dist-info/RECORD +163 -0
- pixeltable-0.3.1.dist-info/RECORD +0 -160
- {pixeltable-0.3.1.dist-info → pixeltable-0.3.3.dist-info}/LICENSE +0 -0
- {pixeltable-0.3.1.dist-info → pixeltable-0.3.3.dist-info}/WHEEL +0 -0
- {pixeltable-0.3.1.dist-info → pixeltable-0.3.3.dist-info}/entry_points.txt +0 -0
pixeltable/store.py
CHANGED
|
@@ -32,6 +32,7 @@ class StoreBase:
|
|
|
32
32
|
- v_min: version at which the row was created
|
|
33
33
|
- v_max: version at which the row was deleted (or MAX_VERSION if it's still live)
|
|
34
34
|
"""
|
|
35
|
+
|
|
35
36
|
tbl_version: catalog.TableVersion
|
|
36
37
|
sa_md: sql.MetaData
|
|
37
38
|
sa_tbl: Optional[sql.Table]
|
|
@@ -65,8 +66,9 @@ class StoreBase:
|
|
|
65
66
|
"""Create and return system columns"""
|
|
66
67
|
rowid_cols = self._create_rowid_columns()
|
|
67
68
|
self.v_min_col = sql.Column('v_min', sql.BigInteger, nullable=False)
|
|
68
|
-
self.v_max_col =
|
|
69
|
-
|
|
69
|
+
self.v_max_col = sql.Column(
|
|
70
|
+
'v_max', sql.BigInteger, nullable=False, server_default=str(schema.Table.MAX_VERSION)
|
|
71
|
+
)
|
|
70
72
|
self._pk_cols = [*rowid_cols, self.v_min_col]
|
|
71
73
|
return [*rowid_cols, self.v_min_col, self.v_max_col]
|
|
72
74
|
|
|
@@ -134,7 +136,7 @@ class StoreBase:
|
|
|
134
136
|
return new_file_url
|
|
135
137
|
|
|
136
138
|
def _move_tmp_media_files(
|
|
137
|
-
|
|
139
|
+
self, table_rows: list[dict[str, Any]], media_cols: list[catalog.Column], v_min: int
|
|
138
140
|
) -> None:
|
|
139
141
|
"""Move tmp media files that we generated to a permanent location"""
|
|
140
142
|
for c in media_cols:
|
|
@@ -143,7 +145,7 @@ class StoreBase:
|
|
|
143
145
|
table_row[c.store_name()] = self._move_tmp_media_file(file_url, c, v_min)
|
|
144
146
|
|
|
145
147
|
def _create_table_row(
|
|
146
|
-
|
|
148
|
+
self, input_row: exprs.DataRow, row_builder: exprs.RowBuilder, exc_col_ids: set[int], pk: tuple[int, ...]
|
|
147
149
|
) -> tuple[dict[str, Any], int]:
|
|
148
150
|
"""Return Tuple[complete table row, # of exceptions] for insert()
|
|
149
151
|
Creates a row that includes the PK columns, with the values from input_row.pk.
|
|
@@ -193,11 +195,13 @@ class StoreBase:
|
|
|
193
195
|
added_storage_cols = [col.store_name()]
|
|
194
196
|
if col.records_errors:
|
|
195
197
|
# we also need to create the errormsg and errortype storage cols
|
|
196
|
-
stmt = sql.text(
|
|
197
|
-
|
|
198
|
+
stmt = sql.text(
|
|
199
|
+
f'ALTER TABLE {self._storage_name()} ADD COLUMN {col.errormsg_store_name()} VARCHAR DEFAULT NULL'
|
|
200
|
+
)
|
|
198
201
|
conn.execute(stmt)
|
|
199
|
-
stmt = sql.text(
|
|
200
|
-
|
|
202
|
+
stmt = sql.text(
|
|
203
|
+
f'ALTER TABLE {self._storage_name()} ADD COLUMN {col.errortype_store_name()} VARCHAR DEFAULT NULL'
|
|
204
|
+
)
|
|
201
205
|
conn.execute(stmt)
|
|
202
206
|
added_storage_cols.extend([col.errormsg_store_name(), col.errortype_store_name()])
|
|
203
207
|
self.create_sa_tbl()
|
|
@@ -219,7 +223,7 @@ class StoreBase:
|
|
|
219
223
|
exec_plan: ExecNode,
|
|
220
224
|
value_expr_slot_idx: int,
|
|
221
225
|
conn: sql.engine.Connection,
|
|
222
|
-
on_error: Literal['abort', 'ignore']
|
|
226
|
+
on_error: Literal['abort', 'ignore'],
|
|
223
227
|
) -> int:
|
|
224
228
|
"""Update store column of a computed column with values produced by an execution plan
|
|
225
229
|
|
|
@@ -295,10 +299,9 @@ class StoreBase:
|
|
|
295
299
|
update_stmt = update_stmt.where(pk_col == tmp_pk_col)
|
|
296
300
|
update_stmt = update_stmt.values({col.sa_col: tmp_val_col})
|
|
297
301
|
if col.records_errors:
|
|
298
|
-
update_stmt = update_stmt.values(
|
|
299
|
-
col.sa_errortype_col: tmp_errortype_col,
|
|
300
|
-
|
|
301
|
-
})
|
|
302
|
+
update_stmt = update_stmt.values(
|
|
303
|
+
{col.sa_errortype_col: tmp_errortype_col, col.sa_errormsg_col: tmp_errormsg_col}
|
|
304
|
+
)
|
|
302
305
|
log_explain(_logger, update_stmt, conn)
|
|
303
306
|
conn.execute(update_stmt)
|
|
304
307
|
|
|
@@ -308,8 +311,13 @@ class StoreBase:
|
|
|
308
311
|
return num_excs
|
|
309
312
|
|
|
310
313
|
def insert_rows(
|
|
311
|
-
|
|
312
|
-
|
|
314
|
+
self,
|
|
315
|
+
exec_plan: ExecNode,
|
|
316
|
+
conn: sql.engine.Connection,
|
|
317
|
+
v_min: Optional[int] = None,
|
|
318
|
+
show_progress: bool = True,
|
|
319
|
+
rowids: Optional[Iterator[int]] = None,
|
|
320
|
+
abort_on_exc: bool = False,
|
|
313
321
|
) -> tuple[int, int, set[int]]:
|
|
314
322
|
"""Insert rows into the store table and update the catalog table's md
|
|
315
323
|
Returns:
|
|
@@ -347,12 +355,12 @@ class StoreBase:
|
|
|
347
355
|
|
|
348
356
|
if show_progress:
|
|
349
357
|
if progress_bar is None:
|
|
350
|
-
warnings.simplefilter(
|
|
358
|
+
warnings.simplefilter('ignore', category=TqdmWarning)
|
|
351
359
|
progress_bar = tqdm(
|
|
352
360
|
desc=f'Inserting rows into `{self.tbl_version.name}`',
|
|
353
361
|
unit=' rows',
|
|
354
362
|
ncols=100,
|
|
355
|
-
file=sys.stdout
|
|
363
|
+
file=sys.stdout,
|
|
356
364
|
)
|
|
357
365
|
progress_bar.update(1)
|
|
358
366
|
|
|
@@ -379,8 +387,13 @@ class StoreBase:
|
|
|
379
387
|
return sql.and_(clause, self.base._versions_clause(versions[1:], match_on_vmin))
|
|
380
388
|
|
|
381
389
|
def delete_rows(
|
|
382
|
-
|
|
383
|
-
|
|
390
|
+
self,
|
|
391
|
+
current_version: int,
|
|
392
|
+
base_versions: list[Optional[int]],
|
|
393
|
+
match_on_vmin: bool,
|
|
394
|
+
where_clause: Optional[sql.ColumnElement[bool]],
|
|
395
|
+
conn: sql.engine.Connection,
|
|
396
|
+
) -> int:
|
|
384
397
|
"""Mark rows as deleted that are live and were created prior to current_version.
|
|
385
398
|
Also: populate the undo columns
|
|
386
399
|
Args:
|
|
@@ -394,12 +407,12 @@ class StoreBase:
|
|
|
394
407
|
"""
|
|
395
408
|
where_clause = sql.true() if where_clause is None else where_clause
|
|
396
409
|
where_clause = sql.and_(
|
|
397
|
-
self.v_min_col < current_version,
|
|
398
|
-
|
|
399
|
-
where_clause)
|
|
410
|
+
self.v_min_col < current_version, self.v_max_col == schema.Table.MAX_VERSION, where_clause
|
|
411
|
+
)
|
|
400
412
|
rowid_join_clause = self._rowid_join_predicate()
|
|
401
|
-
base_versions_clause =
|
|
402
|
-
else self.base._versions_clause(base_versions, match_on_vmin)
|
|
413
|
+
base_versions_clause = (
|
|
414
|
+
sql.true() if len(base_versions) == 0 else self.base._versions_clause(base_versions, match_on_vmin)
|
|
415
|
+
)
|
|
403
416
|
set_clause: dict[sql.Column, Union[int, sql.Column]] = {self.v_max_col: current_version}
|
|
404
417
|
for index_info in self.tbl_version.idxs_by_name.values():
|
|
405
418
|
# copy value column to undo column
|
|
@@ -450,7 +463,9 @@ class StoreView(StoreBase):
|
|
|
450
463
|
def _rowid_join_predicate(self) -> sql.ColumnElement[bool]:
|
|
451
464
|
return sql.and_(
|
|
452
465
|
self.base._rowid_join_predicate(),
|
|
453
|
-
*[c1 == c2 for c1, c2 in zip(self.rowid_columns(), self.base.rowid_columns())]
|
|
466
|
+
*[c1 == c2 for c1, c2 in zip(self.rowid_columns(), self.base.rowid_columns())],
|
|
467
|
+
)
|
|
468
|
+
|
|
454
469
|
|
|
455
470
|
class StoreComponentView(StoreView):
|
|
456
471
|
"""A view that stores components of its base, as produced by a ComponentIterator
|
|
@@ -482,4 +497,5 @@ class StoreComponentView(StoreView):
|
|
|
482
497
|
def _rowid_join_predicate(self) -> sql.ColumnElement[bool]:
|
|
483
498
|
return sql.and_(
|
|
484
499
|
self.base._rowid_join_predicate(),
|
|
485
|
-
*[c1 == c2 for c1, c2 in zip(self.rowid_columns()[:-1], self.base.rowid_columns())]
|
|
500
|
+
*[c1 == c2 for c1, c2 in zip(self.rowid_columns()[:-1], self.base.rowid_columns())],
|
|
501
|
+
)
|
pixeltable/type_system.py
CHANGED
|
@@ -9,17 +9,18 @@ import typing
|
|
|
9
9
|
import urllib.parse
|
|
10
10
|
import urllib.request
|
|
11
11
|
from pathlib import Path
|
|
12
|
+
|
|
13
|
+
from typing import _GenericAlias # type: ignore[attr-defined] # isort: skip
|
|
12
14
|
from typing import Any, Iterable, Mapping, Optional, Sequence, Union
|
|
13
15
|
|
|
14
|
-
import PIL.Image
|
|
15
16
|
import av # type: ignore
|
|
16
17
|
import jsonschema
|
|
17
18
|
import jsonschema.protocols
|
|
18
19
|
import jsonschema.validators
|
|
19
20
|
import numpy as np
|
|
21
|
+
import PIL.Image
|
|
20
22
|
import pydantic
|
|
21
23
|
import sqlalchemy as sql
|
|
22
|
-
from typing import _GenericAlias # type: ignore[attr-defined]
|
|
23
24
|
from typing_extensions import _AnnotatedAlias
|
|
24
25
|
|
|
25
26
|
import pixeltable.exceptions as excs
|
|
@@ -45,9 +46,11 @@ class ColumnType:
|
|
|
45
46
|
|
|
46
47
|
@classmethod
|
|
47
48
|
def supertype(
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
49
|
+
cls,
|
|
50
|
+
type1: 'ColumnType.Type',
|
|
51
|
+
type2: 'ColumnType.Type',
|
|
52
|
+
# we need to pass this in because we can't easily append it as a class member
|
|
53
|
+
common_supertypes: dict[tuple['ColumnType.Type', 'ColumnType.Type'], 'ColumnType.Type'],
|
|
51
54
|
) -> Optional['ColumnType.Type']:
|
|
52
55
|
if type1 == type2:
|
|
53
56
|
return type1
|
|
@@ -59,23 +62,23 @@ class ColumnType:
|
|
|
59
62
|
return t
|
|
60
63
|
return None
|
|
61
64
|
|
|
62
|
-
|
|
63
65
|
@enum.unique
|
|
64
66
|
class DType(enum.Enum):
|
|
65
67
|
"""
|
|
66
68
|
Base type used in images and arrays
|
|
67
69
|
"""
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
70
|
+
|
|
71
|
+
BOOL = (0,)
|
|
72
|
+
INT8 = (1,)
|
|
73
|
+
INT16 = (2,)
|
|
74
|
+
INT32 = (3,)
|
|
75
|
+
INT64 = (4,)
|
|
76
|
+
UINT8 = (5,)
|
|
77
|
+
UINT16 = (6,)
|
|
78
|
+
UINT32 = (7,)
|
|
79
|
+
UINT64 = (8,)
|
|
80
|
+
FLOAT16 = (9,)
|
|
81
|
+
FLOAT32 = (10,)
|
|
79
82
|
FLOAT64 = 11
|
|
80
83
|
|
|
81
84
|
scalar_types = {Type.STRING, Type.INT, Type.FLOAT, Type.BOOL, Type.TIMESTAMP}
|
|
@@ -113,10 +116,7 @@ class ColumnType:
|
|
|
113
116
|
return json.dumps([t.as_dict() for t in type_list])
|
|
114
117
|
|
|
115
118
|
def as_dict(self) -> dict:
|
|
116
|
-
return {
|
|
117
|
-
'_classname': self.__class__.__name__,
|
|
118
|
-
**self._as_dict(),
|
|
119
|
-
}
|
|
119
|
+
return {'_classname': self.__class__.__name__, **self._as_dict()}
|
|
120
120
|
|
|
121
121
|
def _as_dict(self) -> dict:
|
|
122
122
|
return {'nullable': self.nullable}
|
|
@@ -277,10 +277,7 @@ class ColumnType:
|
|
|
277
277
|
|
|
278
278
|
@classmethod
|
|
279
279
|
def from_python_type(
|
|
280
|
-
cls,
|
|
281
|
-
t: Union[type, _GenericAlias],
|
|
282
|
-
nullable_default: bool = False,
|
|
283
|
-
allow_builtin_types: bool = True
|
|
280
|
+
cls, t: Union[type, _GenericAlias], nullable_default: bool = False, allow_builtin_types: bool = True
|
|
284
281
|
) -> Optional[ColumnType]:
|
|
285
282
|
"""
|
|
286
283
|
Convert a Python type into a Pixeltable `ColumnType` instance.
|
|
@@ -309,9 +306,7 @@ class ColumnType:
|
|
|
309
306
|
required_args = typing.get_args(t)
|
|
310
307
|
assert len(required_args) == 1
|
|
311
308
|
return cls.from_python_type(
|
|
312
|
-
required_args[0],
|
|
313
|
-
nullable_default=False,
|
|
314
|
-
allow_builtin_types=allow_builtin_types
|
|
309
|
+
required_args[0], nullable_default=False, allow_builtin_types=allow_builtin_types
|
|
315
310
|
)
|
|
316
311
|
elif origin is typing.Annotated:
|
|
317
312
|
annotated_args = typing.get_args(t)
|
|
@@ -349,7 +344,7 @@ class ColumnType:
|
|
|
349
344
|
cls,
|
|
350
345
|
t: Union[ColumnType, type, _AnnotatedAlias],
|
|
351
346
|
nullable_default: bool = False,
|
|
352
|
-
allow_builtin_types: bool = True
|
|
347
|
+
allow_builtin_types: bool = True,
|
|
353
348
|
) -> ColumnType:
|
|
354
349
|
"""
|
|
355
350
|
Convert any type recognizable by Pixeltable to its corresponding ColumnType.
|
|
@@ -415,7 +410,7 @@ class ColumnType:
|
|
|
415
410
|
|
|
416
411
|
def _create_literal(self, val: Any) -> Any:
|
|
417
412
|
"""Create a literal of this type from val, including any needed conversions.
|
|
418
|
-
|
|
413
|
+
val is guaranteed to be non-None"""
|
|
419
414
|
return val
|
|
420
415
|
|
|
421
416
|
def create_literal(self, val: Any) -> Any:
|
|
@@ -484,12 +479,7 @@ class ColumnType:
|
|
|
484
479
|
|
|
485
480
|
def to_json_schema(self) -> dict[str, Any]:
|
|
486
481
|
if self.nullable:
|
|
487
|
-
return {
|
|
488
|
-
'anyOf': [
|
|
489
|
-
self._to_json_schema(),
|
|
490
|
-
{'type': 'null'},
|
|
491
|
-
]
|
|
492
|
-
}
|
|
482
|
+
return {'anyOf': [self._to_json_schema(), {'type': 'null'}]}
|
|
493
483
|
else:
|
|
494
484
|
return self._to_json_schema()
|
|
495
485
|
|
|
@@ -612,7 +602,6 @@ class TimestampType(ColumnType):
|
|
|
612
602
|
|
|
613
603
|
|
|
614
604
|
class JsonType(ColumnType):
|
|
615
|
-
|
|
616
605
|
json_schema: Optional[dict[str, Any]]
|
|
617
606
|
__validator: Optional[jsonschema.protocols.Validator]
|
|
618
607
|
|
|
@@ -699,8 +688,7 @@ class JsonType(ColumnType):
|
|
|
699
688
|
superschema = self.__superschema(self.json_schema, other.json_schema)
|
|
700
689
|
|
|
701
690
|
return JsonType(
|
|
702
|
-
json_schema=(None if len(superschema) == 0 else superschema),
|
|
703
|
-
nullable=(self.nullable or other.nullable)
|
|
691
|
+
json_schema=(None if len(superschema) == 0 else superschema), nullable=(self.nullable or other.nullable)
|
|
704
692
|
)
|
|
705
693
|
|
|
706
694
|
@classmethod
|
|
@@ -755,7 +743,7 @@ class JsonType(ColumnType):
|
|
|
755
743
|
a_type = a.get('type')
|
|
756
744
|
b_type = b.get('type')
|
|
757
745
|
|
|
758
|
-
if
|
|
746
|
+
if a_type in ('string', 'integer', 'number', 'boolean', 'object', 'array') and a_type == b_type:
|
|
759
747
|
# a and b both have the same type designation, but are not identical. This can happen if
|
|
760
748
|
# (for example) they have validators or other attributes that differ. In this case, we
|
|
761
749
|
# generalize to {'type': t}, where t is their shared type, with no other qualifications.
|
|
@@ -793,12 +781,29 @@ class JsonType(ColumnType):
|
|
|
793
781
|
|
|
794
782
|
|
|
795
783
|
class ArrayType(ColumnType):
|
|
796
|
-
|
|
784
|
+
shape: Optional[tuple[Optional[int], ...]]
|
|
785
|
+
pxt_dtype: Optional[ColumnType]
|
|
786
|
+
dtype: Optional[ColumnType.Type]
|
|
787
|
+
|
|
788
|
+
def __init__(
|
|
789
|
+
self,
|
|
790
|
+
shape: Optional[tuple[Optional[int], ...]] = None,
|
|
791
|
+
dtype: Optional[ColumnType] = None,
|
|
792
|
+
nullable: bool = False,
|
|
793
|
+
):
|
|
797
794
|
super().__init__(self.Type.ARRAY, nullable=nullable)
|
|
795
|
+
assert shape is None or dtype is not None, (shape, dtype) # cannot specify a shape without a dtype
|
|
796
|
+
assert (
|
|
797
|
+
dtype is None
|
|
798
|
+
or dtype.is_int_type()
|
|
799
|
+
or dtype.is_float_type()
|
|
800
|
+
or dtype.is_bool_type()
|
|
801
|
+
or dtype.is_string_type()
|
|
802
|
+
)
|
|
803
|
+
|
|
798
804
|
self.shape = shape
|
|
799
|
-
|
|
800
|
-
self.
|
|
801
|
-
self.dtype = dtype._type
|
|
805
|
+
self.pxt_dtype = dtype # we need this for copy() and __str__()
|
|
806
|
+
self.dtype = None if dtype is None else dtype._type
|
|
802
807
|
|
|
803
808
|
def copy(self, nullable: bool) -> ColumnType:
|
|
804
809
|
return ArrayType(self.shape, self.pxt_dtype, nullable=nullable)
|
|
@@ -812,41 +817,53 @@ class ArrayType(ColumnType):
|
|
|
812
817
|
def supertype(self, other: ColumnType) -> Optional[ArrayType]:
|
|
813
818
|
if not isinstance(other, ArrayType):
|
|
814
819
|
return None
|
|
820
|
+
super_dtype = self.Type.supertype(self.dtype, other.dtype, self.common_supertypes)
|
|
821
|
+
if super_dtype is None:
|
|
822
|
+
# if the dtypes are incompatible, then the supertype is a fully general array
|
|
823
|
+
return ArrayType(nullable=(self.nullable or other.nullable))
|
|
824
|
+
super_shape: Optional[tuple[Optional[int], ...]]
|
|
815
825
|
if len(self.shape) != len(other.shape):
|
|
816
|
-
|
|
817
|
-
|
|
818
|
-
|
|
819
|
-
|
|
820
|
-
shape = [n1 if n1 == n2 else None for n1, n2 in zip(self.shape, other.shape)]
|
|
821
|
-
return ArrayType(tuple(shape), self.make_type(base_type), nullable=(self.nullable or other.nullable))
|
|
826
|
+
super_shape = None
|
|
827
|
+
else:
|
|
828
|
+
super_shape = tuple(n1 if n1 == n2 else None for n1, n2 in zip(self.shape, other.shape))
|
|
829
|
+
return ArrayType(super_shape, self.make_type(super_dtype), nullable=(self.nullable or other.nullable))
|
|
822
830
|
|
|
823
831
|
def _as_dict(self) -> dict:
|
|
824
832
|
result = super()._as_dict()
|
|
825
|
-
|
|
833
|
+
shape_as_list = None if self.shape is None else list(self.shape)
|
|
834
|
+
dtype_value = None if self.dtype is None else self.dtype.value
|
|
835
|
+
result.update(shape=shape_as_list, dtype=dtype_value)
|
|
826
836
|
return result
|
|
827
837
|
|
|
828
838
|
def _to_base_str(self) -> str:
|
|
839
|
+
if self.shape is None and self.dtype is None:
|
|
840
|
+
return 'Array'
|
|
841
|
+
if self.shape is None:
|
|
842
|
+
return f'Array[{self.pxt_dtype}]'
|
|
843
|
+
assert self.dtype is not None
|
|
829
844
|
return f'Array[{self.shape}, {self.pxt_dtype}]'
|
|
830
845
|
|
|
831
846
|
@classmethod
|
|
832
847
|
def _from_dict(cls, d: dict) -> ColumnType:
|
|
833
848
|
assert 'shape' in d
|
|
834
849
|
assert 'dtype' in d
|
|
835
|
-
shape = tuple(d['shape'])
|
|
836
|
-
dtype = cls.make_type(cls.Type(d['dtype']))
|
|
850
|
+
shape = None if d['shape'] is None else tuple(d['shape'])
|
|
851
|
+
dtype = None if d['dtype'] is None else cls.make_type(cls.Type(d['dtype']))
|
|
837
852
|
return cls(shape, dtype, nullable=d['nullable'])
|
|
838
853
|
|
|
839
854
|
@classmethod
|
|
840
855
|
def from_literal(cls, val: np.ndarray, nullable: bool = False) -> Optional[ArrayType]:
|
|
841
856
|
# determine our dtype
|
|
842
857
|
assert isinstance(val, np.ndarray)
|
|
858
|
+
dtype: ColumnType
|
|
843
859
|
if np.issubdtype(val.dtype, np.integer):
|
|
844
|
-
dtype
|
|
860
|
+
dtype = IntType()
|
|
845
861
|
elif np.issubdtype(val.dtype, np.floating):
|
|
846
862
|
dtype = FloatType()
|
|
847
863
|
elif val.dtype == np.bool_:
|
|
848
864
|
dtype = BoolType()
|
|
849
|
-
elif val.dtype
|
|
865
|
+
elif np.issubdtype(val.dtype, np.str_):
|
|
866
|
+
# Note that this includes NumPy types like '<U1' -- arrays of single Unicode characters
|
|
850
867
|
dtype = StringType()
|
|
851
868
|
else:
|
|
852
869
|
return None
|
|
@@ -855,32 +872,49 @@ class ArrayType(ColumnType):
|
|
|
855
872
|
def is_valid_literal(self, val: np.ndarray) -> bool:
|
|
856
873
|
if not isinstance(val, np.ndarray):
|
|
857
874
|
return False
|
|
858
|
-
|
|
875
|
+
|
|
876
|
+
# If a dtype is specified, check that there's a match
|
|
877
|
+
if self.dtype is not None and not np.issubdtype(val.dtype, self.numpy_dtype()):
|
|
859
878
|
return False
|
|
860
|
-
|
|
861
|
-
|
|
862
|
-
|
|
863
|
-
|
|
864
|
-
|
|
865
|
-
|
|
866
|
-
|
|
867
|
-
|
|
879
|
+
|
|
880
|
+
# If no dtype is specified, we still need to check that the dtype is one of the supported types
|
|
881
|
+
if self.dtype is None and not any(
|
|
882
|
+
np.issubdtype(val.dtype, ndtype) for ndtype in [np.int64, np.float32, np.bool_, np.str_]
|
|
883
|
+
):
|
|
884
|
+
return False
|
|
885
|
+
|
|
886
|
+
# If a shape is specified, check that there's a match
|
|
887
|
+
if self.shape is not None:
|
|
888
|
+
if len(val.shape) != len(self.shape):
|
|
868
889
|
return False
|
|
869
|
-
|
|
890
|
+
# check that the shapes are compatible
|
|
891
|
+
for n1, n2 in zip(val.shape, self.shape):
|
|
892
|
+
assert n1 is not None # `val` must have a concrete shape
|
|
893
|
+
if n2 is None:
|
|
894
|
+
continue # wildcard
|
|
895
|
+
if n1 != n2:
|
|
896
|
+
return False
|
|
897
|
+
|
|
898
|
+
return True
|
|
870
899
|
|
|
871
900
|
def _to_json_schema(self) -> dict[str, Any]:
|
|
872
|
-
return {
|
|
873
|
-
'type': 'array',
|
|
874
|
-
'items': self.pxt_dtype._to_json_schema(),
|
|
875
|
-
}
|
|
901
|
+
return {'type': 'array', 'items': self.pxt_dtype._to_json_schema()}
|
|
876
902
|
|
|
877
903
|
def _validate_literal(self, val: Any) -> None:
|
|
878
904
|
if not isinstance(val, np.ndarray):
|
|
879
905
|
raise TypeError(f'Expected numpy.ndarray, got {val.__class__.__name__}')
|
|
880
906
|
if not self.is_valid_literal(val):
|
|
881
|
-
|
|
882
|
-
|
|
883
|
-
|
|
907
|
+
if self.shape is not None:
|
|
908
|
+
raise TypeError(
|
|
909
|
+
f'Expected numpy.ndarray({self.shape}, dtype={self.numpy_dtype()}), '
|
|
910
|
+
f'got numpy.ndarray({val.shape}, dtype={val.dtype})'
|
|
911
|
+
)
|
|
912
|
+
elif self.dtype is not None:
|
|
913
|
+
raise TypeError(
|
|
914
|
+
f'Expected numpy.ndarray of dtype {self.numpy_dtype()}, got numpy.ndarray of dtype {val.dtype}'
|
|
915
|
+
)
|
|
916
|
+
else:
|
|
917
|
+
raise TypeError(f'Unsupported dtype for numpy.ndarray: {val.dtype}')
|
|
884
918
|
|
|
885
919
|
def _create_literal(self, val: Any) -> Any:
|
|
886
920
|
if isinstance(val, (list, tuple)):
|
|
@@ -892,7 +926,9 @@ class ArrayType(ColumnType):
|
|
|
892
926
|
def to_sa_type(self) -> sql.types.TypeEngine:
|
|
893
927
|
return sql.LargeBinary()
|
|
894
928
|
|
|
895
|
-
def numpy_dtype(self) -> np.dtype:
|
|
929
|
+
def numpy_dtype(self) -> Optional[np.dtype]:
|
|
930
|
+
if self.dtype is None:
|
|
931
|
+
return None
|
|
896
932
|
if self.dtype == self.Type.INT:
|
|
897
933
|
return np.dtype(np.int64)
|
|
898
934
|
if self.dtype == self.Type.FLOAT:
|
|
@@ -901,20 +937,24 @@ class ArrayType(ColumnType):
|
|
|
901
937
|
return np.dtype(np.bool_)
|
|
902
938
|
if self.dtype == self.Type.STRING:
|
|
903
939
|
return np.dtype(np.str_)
|
|
904
|
-
assert False
|
|
940
|
+
assert False, self.dtype
|
|
905
941
|
|
|
906
942
|
|
|
907
943
|
class ImageType(ColumnType):
|
|
908
944
|
def __init__(
|
|
909
|
-
|
|
910
|
-
|
|
945
|
+
self,
|
|
946
|
+
width: Optional[int] = None,
|
|
947
|
+
height: Optional[int] = None,
|
|
948
|
+
size: Optional[tuple[int, int]] = None,
|
|
949
|
+
mode: Optional[str] = None,
|
|
950
|
+
nullable: bool = False,
|
|
911
951
|
):
|
|
912
952
|
"""
|
|
913
953
|
TODO: does it make sense to specify only width or height?
|
|
914
954
|
"""
|
|
915
955
|
super().__init__(self.Type.IMAGE, nullable=nullable)
|
|
916
|
-
assert not(width is not None and size is not None)
|
|
917
|
-
assert not(height is not None and size is not None)
|
|
956
|
+
assert not (width is not None and size is not None)
|
|
957
|
+
assert not (height is not None and size is not None)
|
|
918
958
|
if size is not None:
|
|
919
959
|
self.width = size[0]
|
|
920
960
|
self.height = size[1]
|
|
@@ -1104,6 +1144,7 @@ class DocumentType(ColumnType):
|
|
|
1104
1144
|
def validate_media(self, val: Any) -> None:
|
|
1105
1145
|
assert isinstance(val, str)
|
|
1106
1146
|
from pixeltable.utils.documents import get_document_handle
|
|
1147
|
+
|
|
1107
1148
|
dh = get_document_handle(val)
|
|
1108
1149
|
if dh is None:
|
|
1109
1150
|
raise excs.Error(f'Not a recognized document format: {val}')
|
|
@@ -1117,6 +1158,7 @@ class Required(typing.Generic[T]):
|
|
|
1117
1158
|
Marker class to indicate that a column is non-nullable in a schema definition. This has no meaning as a type hint,
|
|
1118
1159
|
and is intended only for schema declarations.
|
|
1119
1160
|
"""
|
|
1161
|
+
|
|
1120
1162
|
pass
|
|
1121
1163
|
|
|
1122
1164
|
|
|
@@ -1139,6 +1181,7 @@ class _PxtType:
|
|
|
1139
1181
|
`Image[(300, 300), 'RGB']`. The specialized forms resolve to `typing.Annotated` instances whose annotation is a
|
|
1140
1182
|
`ColumnType`.
|
|
1141
1183
|
"""
|
|
1184
|
+
|
|
1142
1185
|
def __init__(self):
|
|
1143
1186
|
raise TypeError(f'Type `{type(self)}` cannot be instantiated.')
|
|
1144
1187
|
|
|
@@ -1174,6 +1217,8 @@ class Array(np.ndarray, _PxtType):
|
|
|
1174
1217
|
params = item if isinstance(item, tuple) else (item,)
|
|
1175
1218
|
shape: Optional[tuple] = None
|
|
1176
1219
|
dtype: Optional[ColumnType] = None
|
|
1220
|
+
if not any(isinstance(param, (type, _AnnotatedAlias)) for param in params):
|
|
1221
|
+
raise TypeError('Array type parameter must include a dtype.')
|
|
1177
1222
|
for param in params:
|
|
1178
1223
|
if isinstance(param, tuple):
|
|
1179
1224
|
if not all(n is None or (isinstance(n, int) and n >= 1) for n in param):
|
|
@@ -1181,21 +1226,17 @@ class Array(np.ndarray, _PxtType):
|
|
|
1181
1226
|
if shape is not None:
|
|
1182
1227
|
raise TypeError(f'Duplicate Array type parameter: {param}')
|
|
1183
1228
|
shape = param
|
|
1184
|
-
elif isinstance(param, type
|
|
1229
|
+
elif isinstance(param, (type, _AnnotatedAlias)):
|
|
1185
1230
|
if dtype is not None:
|
|
1186
1231
|
raise TypeError(f'Duplicate Array type parameter: {param}')
|
|
1187
1232
|
dtype = ColumnType.normalize_type(param, allow_builtin_types=False)
|
|
1188
1233
|
else:
|
|
1189
1234
|
raise TypeError(f'Invalid Array type parameter: {param}')
|
|
1190
|
-
if shape is None:
|
|
1191
|
-
raise TypeError('Array type is missing parameter: shape')
|
|
1192
|
-
if dtype is None:
|
|
1193
|
-
raise TypeError('Array type is missing parameter: dtype')
|
|
1194
1235
|
return typing.Annotated[np.ndarray, ArrayType(shape=shape, dtype=dtype, nullable=False)]
|
|
1195
1236
|
|
|
1196
1237
|
@classmethod
|
|
1197
1238
|
def as_col_type(cls, nullable: bool) -> ColumnType:
|
|
1198
|
-
|
|
1239
|
+
return ArrayType(nullable=nullable)
|
|
1199
1240
|
|
|
1200
1241
|
|
|
1201
1242
|
class Image(PIL.Image.Image, _PxtType):
|
|
@@ -1219,7 +1260,11 @@ class Image(PIL.Image.Image, _PxtType):
|
|
|
1219
1260
|
mode: Optional[str] = None
|
|
1220
1261
|
for param in params:
|
|
1221
1262
|
if isinstance(param, tuple):
|
|
1222
|
-
if
|
|
1263
|
+
if (
|
|
1264
|
+
len(param) != 2
|
|
1265
|
+
or not isinstance(param[0], (int, type(None)))
|
|
1266
|
+
or not isinstance(param[1], (int, type(None)))
|
|
1267
|
+
):
|
|
1223
1268
|
raise TypeError(f'Invalid Image type parameter: {param}')
|
|
1224
1269
|
if size is not None:
|
|
1225
1270
|
raise TypeError(f'Duplicate Image type parameter: {param}')
|
pixeltable/utils/arrow.py
CHANGED
|
@@ -1,16 +1,10 @@
|
|
|
1
|
-
import
|
|
1
|
+
import datetime
|
|
2
2
|
from typing import Any, Iterator, Optional, Union
|
|
3
3
|
|
|
4
4
|
import numpy as np
|
|
5
5
|
import pyarrow as pa
|
|
6
|
-
import datetime
|
|
7
6
|
|
|
8
7
|
import pixeltable.type_system as ts
|
|
9
|
-
from pixeltable.env import Env
|
|
10
|
-
|
|
11
|
-
_tz_def = Env().get().default_time_zone
|
|
12
|
-
|
|
13
|
-
_logger = logging.getLogger(__name__)
|
|
14
8
|
|
|
15
9
|
_pa_to_pt: dict[pa.DataType, ts.ColumnType] = {
|
|
16
10
|
pa.string(): ts.StringType(nullable=True),
|