pixeltable 0.2.20__py3-none-any.whl → 0.2.22__py3-none-any.whl
This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
Potentially problematic release.
This version of pixeltable might be problematic. Click here for more details.
- pixeltable/__init__.py +7 -19
- pixeltable/__version__.py +2 -2
- pixeltable/catalog/__init__.py +7 -7
- pixeltable/catalog/column.py +37 -11
- pixeltable/catalog/globals.py +21 -0
- pixeltable/catalog/insertable_table.py +6 -4
- pixeltable/catalog/table.py +227 -148
- pixeltable/catalog/table_version.py +66 -28
- pixeltable/catalog/table_version_path.py +0 -8
- pixeltable/catalog/view.py +18 -19
- pixeltable/dataframe.py +16 -32
- pixeltable/env.py +6 -1
- pixeltable/exec/__init__.py +1 -2
- pixeltable/exec/aggregation_node.py +27 -17
- pixeltable/exec/cache_prefetch_node.py +1 -1
- pixeltable/exec/data_row_batch.py +9 -26
- pixeltable/exec/exec_node.py +36 -7
- pixeltable/exec/expr_eval_node.py +19 -11
- pixeltable/exec/in_memory_data_node.py +14 -11
- pixeltable/exec/sql_node.py +266 -138
- pixeltable/exprs/__init__.py +1 -0
- pixeltable/exprs/arithmetic_expr.py +3 -1
- pixeltable/exprs/array_slice.py +7 -7
- pixeltable/exprs/column_property_ref.py +37 -10
- pixeltable/exprs/column_ref.py +93 -14
- pixeltable/exprs/comparison.py +5 -5
- pixeltable/exprs/compound_predicate.py +8 -7
- pixeltable/exprs/data_row.py +56 -36
- pixeltable/exprs/expr.py +65 -63
- pixeltable/exprs/expr_dict.py +55 -0
- pixeltable/exprs/expr_set.py +26 -15
- pixeltable/exprs/function_call.py +53 -24
- pixeltable/exprs/globals.py +4 -1
- pixeltable/exprs/in_predicate.py +8 -7
- pixeltable/exprs/inline_expr.py +4 -4
- pixeltable/exprs/is_null.py +4 -4
- pixeltable/exprs/json_mapper.py +11 -12
- pixeltable/exprs/json_path.py +5 -10
- pixeltable/exprs/literal.py +5 -5
- pixeltable/exprs/method_ref.py +5 -4
- pixeltable/exprs/object_ref.py +2 -1
- pixeltable/exprs/row_builder.py +88 -36
- pixeltable/exprs/rowid_ref.py +14 -13
- pixeltable/exprs/similarity_expr.py +12 -7
- pixeltable/exprs/sql_element_cache.py +12 -6
- pixeltable/exprs/type_cast.py +8 -6
- pixeltable/exprs/variable.py +5 -4
- pixeltable/ext/functions/whisperx.py +7 -2
- pixeltable/func/aggregate_function.py +1 -1
- pixeltable/func/callable_function.py +2 -2
- pixeltable/func/function.py +11 -10
- pixeltable/func/function_registry.py +6 -7
- pixeltable/func/query_template_function.py +11 -12
- pixeltable/func/signature.py +17 -15
- pixeltable/func/udf.py +0 -4
- pixeltable/functions/__init__.py +2 -2
- pixeltable/functions/audio.py +4 -6
- pixeltable/functions/globals.py +84 -42
- pixeltable/functions/huggingface.py +31 -34
- pixeltable/functions/image.py +59 -45
- pixeltable/functions/json.py +0 -1
- pixeltable/functions/llama_cpp.py +106 -0
- pixeltable/functions/mistralai.py +2 -2
- pixeltable/functions/ollama.py +147 -0
- pixeltable/functions/openai.py +22 -25
- pixeltable/functions/replicate.py +72 -0
- pixeltable/functions/string.py +59 -50
- pixeltable/functions/timestamp.py +20 -20
- pixeltable/functions/together.py +2 -2
- pixeltable/functions/video.py +11 -20
- pixeltable/functions/whisper.py +2 -20
- pixeltable/globals.py +65 -74
- pixeltable/index/base.py +2 -2
- pixeltable/index/btree.py +20 -7
- pixeltable/index/embedding_index.py +12 -14
- pixeltable/io/__init__.py +1 -2
- pixeltable/io/external_store.py +11 -5
- pixeltable/io/fiftyone.py +178 -0
- pixeltable/io/globals.py +98 -2
- pixeltable/io/hf_datasets.py +1 -1
- pixeltable/io/label_studio.py +6 -6
- pixeltable/io/parquet.py +14 -13
- pixeltable/iterators/base.py +3 -2
- pixeltable/iterators/document.py +10 -8
- pixeltable/iterators/video.py +126 -60
- pixeltable/metadata/__init__.py +4 -3
- pixeltable/metadata/converters/convert_14.py +4 -2
- pixeltable/metadata/converters/convert_15.py +1 -1
- pixeltable/metadata/converters/convert_19.py +1 -0
- pixeltable/metadata/converters/convert_20.py +1 -1
- pixeltable/metadata/converters/convert_21.py +34 -0
- pixeltable/metadata/converters/util.py +54 -12
- pixeltable/metadata/notes.py +1 -0
- pixeltable/metadata/schema.py +40 -21
- pixeltable/plan.py +149 -165
- pixeltable/py.typed +0 -0
- pixeltable/store.py +57 -37
- pixeltable/tool/create_test_db_dump.py +6 -6
- pixeltable/tool/create_test_video.py +1 -1
- pixeltable/tool/doc_plugins/griffe.py +3 -34
- pixeltable/tool/embed_udf.py +1 -1
- pixeltable/tool/mypy_plugin.py +55 -0
- pixeltable/type_system.py +260 -61
- pixeltable/utils/arrow.py +10 -9
- pixeltable/utils/coco.py +4 -4
- pixeltable/utils/documents.py +16 -2
- pixeltable/utils/filecache.py +9 -9
- pixeltable/utils/formatter.py +10 -11
- pixeltable/utils/http_server.py +2 -5
- pixeltable/utils/media_store.py +6 -6
- pixeltable/utils/pytorch.py +10 -11
- pixeltable/utils/sql.py +2 -1
- {pixeltable-0.2.20.dist-info → pixeltable-0.2.22.dist-info}/METADATA +50 -13
- pixeltable-0.2.22.dist-info/RECORD +153 -0
- pixeltable/exec/media_validation_node.py +0 -43
- pixeltable/utils/help.py +0 -11
- pixeltable-0.2.20.dist-info/RECORD +0 -147
- {pixeltable-0.2.20.dist-info → pixeltable-0.2.22.dist-info}/LICENSE +0 -0
- {pixeltable-0.2.20.dist-info → pixeltable-0.2.22.dist-info}/WHEEL +0 -0
- {pixeltable-0.2.20.dist-info → pixeltable-0.2.22.dist-info}/entry_points.txt +0 -0
pixeltable/type_system.py
CHANGED
|
@@ -3,20 +3,22 @@ from __future__ import annotations
|
|
|
3
3
|
import abc
|
|
4
4
|
import datetime
|
|
5
5
|
import enum
|
|
6
|
+
import io
|
|
6
7
|
import json
|
|
7
8
|
import typing
|
|
8
9
|
import urllib.parse
|
|
9
10
|
import urllib.request
|
|
10
11
|
from pathlib import Path
|
|
11
|
-
from typing import Any,
|
|
12
|
+
from typing import Any, Iterable, Mapping, Optional, Sequence, Union
|
|
12
13
|
|
|
14
|
+
import PIL.Image
|
|
13
15
|
import av # type: ignore
|
|
14
16
|
import numpy as np
|
|
15
|
-
import PIL.Image
|
|
16
17
|
import sqlalchemy as sql
|
|
18
|
+
from typing import _GenericAlias # type: ignore[attr-defined]
|
|
19
|
+
from typing_extensions import _AnnotatedAlias
|
|
17
20
|
|
|
18
21
|
import pixeltable.exceptions as excs
|
|
19
|
-
from pixeltable.env import Env
|
|
20
22
|
|
|
21
23
|
|
|
22
24
|
class ColumnType:
|
|
@@ -41,7 +43,7 @@ class ColumnType:
|
|
|
41
43
|
def supertype(
|
|
42
44
|
cls, type1: 'ColumnType.Type', type2: 'ColumnType.Type',
|
|
43
45
|
# we need to pass this in because we can't easily append it as a class member
|
|
44
|
-
common_supertypes:
|
|
46
|
+
common_supertypes: dict[tuple['ColumnType.Type', 'ColumnType.Type'], 'ColumnType.Type']
|
|
45
47
|
) -> Optional['ColumnType.Type']:
|
|
46
48
|
if type1 == type2:
|
|
47
49
|
return type1
|
|
@@ -103,16 +105,16 @@ class ColumnType:
|
|
|
103
105
|
return self.__class__(nullable=nullable) # type: ignore[call-arg]
|
|
104
106
|
|
|
105
107
|
@classmethod
|
|
106
|
-
def serialize_list(cls, type_list:
|
|
108
|
+
def serialize_list(cls, type_list: list[ColumnType]) -> str:
|
|
107
109
|
return json.dumps([t.as_dict() for t in type_list])
|
|
108
110
|
|
|
109
|
-
def as_dict(self) ->
|
|
111
|
+
def as_dict(self) -> dict:
|
|
110
112
|
return {
|
|
111
113
|
'_classname': self.__class__.__name__,
|
|
112
114
|
**self._as_dict(),
|
|
113
115
|
}
|
|
114
116
|
|
|
115
|
-
def _as_dict(self) ->
|
|
117
|
+
def _as_dict(self) -> dict:
|
|
116
118
|
return {'nullable': self.nullable}
|
|
117
119
|
|
|
118
120
|
@classmethod
|
|
@@ -121,18 +123,18 @@ class ColumnType:
|
|
|
121
123
|
return cls.from_dict(type_dict)
|
|
122
124
|
|
|
123
125
|
@classmethod
|
|
124
|
-
def deserialize_list(cls, type_list_str: str) ->
|
|
126
|
+
def deserialize_list(cls, type_list_str: str) -> list[ColumnType]:
|
|
125
127
|
type_dict_list = json.loads(type_list_str)
|
|
126
128
|
return [cls.from_dict(type_dict) for type_dict in type_dict_list]
|
|
127
129
|
|
|
128
130
|
@classmethod
|
|
129
|
-
def from_dict(cls, type_dict:
|
|
131
|
+
def from_dict(cls, type_dict: dict) -> ColumnType:
|
|
130
132
|
assert '_classname' in type_dict
|
|
131
133
|
type_class = globals()[type_dict['_classname']]
|
|
132
134
|
return type_class._from_dict(type_dict)
|
|
133
135
|
|
|
134
136
|
@classmethod
|
|
135
|
-
def _from_dict(cls, d:
|
|
137
|
+
def _from_dict(cls, d: dict) -> ColumnType:
|
|
136
138
|
"""
|
|
137
139
|
Default implementation: simply invoke c'tor
|
|
138
140
|
"""
|
|
@@ -164,11 +166,28 @@ class ColumnType:
|
|
|
164
166
|
return DocumentType()
|
|
165
167
|
|
|
166
168
|
def __str__(self) -> str:
|
|
167
|
-
return self.
|
|
169
|
+
return self._to_str(as_schema=False)
|
|
170
|
+
|
|
171
|
+
def _to_str(self, as_schema: bool) -> str:
|
|
172
|
+
base_str = self._to_base_str()
|
|
173
|
+
if as_schema:
|
|
174
|
+
return base_str if self.nullable else f'Required[{base_str}]'
|
|
175
|
+
else:
|
|
176
|
+
return f'Optional[{base_str}]' if self.nullable else base_str
|
|
177
|
+
|
|
178
|
+
def _to_base_str(self) -> str:
|
|
179
|
+
"""
|
|
180
|
+
String representation of this type, disregarding nullability. Default implementation is to camel-case
|
|
181
|
+
the type name; subclasses may override.
|
|
182
|
+
"""
|
|
183
|
+
return self._type.name[0] + self._type.name[1:].lower()
|
|
168
184
|
|
|
169
185
|
def __eq__(self, other: object) -> bool:
|
|
170
186
|
return isinstance(other, ColumnType) and self.matches(other) and self.nullable == other.nullable
|
|
171
187
|
|
|
188
|
+
def __hash__(self) -> int:
|
|
189
|
+
return hash((self._type, self.nullable))
|
|
190
|
+
|
|
172
191
|
def is_supertype_of(self, other: ColumnType, ignore_nullable: bool = False) -> bool:
|
|
173
192
|
if ignore_nullable:
|
|
174
193
|
supertype = self.supertype(other)
|
|
@@ -253,39 +272,63 @@ class ColumnType:
|
|
|
253
272
|
return inferred_type
|
|
254
273
|
|
|
255
274
|
@classmethod
|
|
256
|
-
def from_python_type(cls, t: type) -> Optional[ColumnType]:
|
|
275
|
+
def from_python_type(cls, t: Union[type, _GenericAlias], nullable_default: bool = False) -> Optional[ColumnType]:
|
|
257
276
|
if typing.get_origin(t) is typing.Union:
|
|
258
277
|
union_args = typing.get_args(t)
|
|
259
|
-
if union_args
|
|
260
|
-
# `t` is a type of the form Optional[T] (equivalently, Union[T, None]).
|
|
278
|
+
if len(union_args) == 2 and type(None) in union_args:
|
|
279
|
+
# `t` is a type of the form Optional[T] (equivalently, Union[T, None] or Union[None, T]).
|
|
261
280
|
# We treat it as the underlying type but with nullable=True.
|
|
262
|
-
|
|
281
|
+
underlying_py_type = union_args[0] if union_args[1] is type(None) else union_args[1]
|
|
282
|
+
underlying = cls.from_python_type(underlying_py_type)
|
|
263
283
|
if underlying is not None:
|
|
264
|
-
underlying.
|
|
265
|
-
|
|
284
|
+
return underlying.copy(nullable=True)
|
|
285
|
+
elif typing.get_origin(t) is typing.Annotated:
|
|
286
|
+
annotated_args = typing.get_args(t)
|
|
287
|
+
origin = annotated_args[0]
|
|
288
|
+
parameters = annotated_args[1]
|
|
289
|
+
if isinstance(parameters, ColumnType):
|
|
290
|
+
return parameters.copy(nullable=nullable_default)
|
|
291
|
+
elif typing.get_origin(t) is Required:
|
|
292
|
+
required_args = typing.get_args(t)
|
|
293
|
+
assert len(required_args) == 1
|
|
294
|
+
return cls.from_python_type(required_args[0], nullable_default=False)
|
|
266
295
|
else:
|
|
267
296
|
# Discard type parameters to ensure that parameterized types such as `list[T]`
|
|
268
297
|
# are correctly mapped to Pixeltable types.
|
|
269
|
-
|
|
270
|
-
if
|
|
271
|
-
# No type parameters; the
|
|
272
|
-
|
|
273
|
-
if
|
|
274
|
-
return
|
|
275
|
-
if
|
|
276
|
-
return
|
|
277
|
-
if
|
|
278
|
-
return
|
|
279
|
-
if
|
|
280
|
-
return
|
|
281
|
-
if
|
|
282
|
-
return
|
|
283
|
-
if
|
|
284
|
-
return
|
|
285
|
-
if
|
|
286
|
-
return ImageType()
|
|
298
|
+
origin = typing.get_origin(t)
|
|
299
|
+
if origin is None:
|
|
300
|
+
# No type parameters; the origin type is just `t` itself
|
|
301
|
+
origin = t
|
|
302
|
+
if issubclass(origin, _PxtType):
|
|
303
|
+
return origin.as_col_type(nullable=nullable_default)
|
|
304
|
+
if origin is str:
|
|
305
|
+
return StringType(nullable=nullable_default)
|
|
306
|
+
if origin is int:
|
|
307
|
+
return IntType(nullable=nullable_default)
|
|
308
|
+
if origin is float:
|
|
309
|
+
return FloatType(nullable=nullable_default)
|
|
310
|
+
if origin is bool:
|
|
311
|
+
return BoolType(nullable=nullable_default)
|
|
312
|
+
if origin is datetime.datetime:
|
|
313
|
+
return TimestampType(nullable=nullable_default)
|
|
314
|
+
if origin is PIL.Image.Image:
|
|
315
|
+
return ImageType(nullable=nullable_default)
|
|
316
|
+
if issubclass(origin, Sequence) or issubclass(origin, Mapping):
|
|
317
|
+
return JsonType(nullable=nullable_default)
|
|
287
318
|
return None
|
|
288
319
|
|
|
320
|
+
@classmethod
|
|
321
|
+
def normalize_type(cls, t: Union[ColumnType, type, _AnnotatedAlias], nullable_default: bool = False) -> ColumnType:
|
|
322
|
+
"""
|
|
323
|
+
Convert any type recognizable by Pixeltable to its corresponding ColumnType.
|
|
324
|
+
"""
|
|
325
|
+
if isinstance(t, ColumnType):
|
|
326
|
+
return t
|
|
327
|
+
col_type = cls.from_python_type(t, nullable_default)
|
|
328
|
+
if col_type is None:
|
|
329
|
+
raise excs.Error(f'Unknown type: {t}')
|
|
330
|
+
return col_type
|
|
331
|
+
|
|
289
332
|
def validate_literal(self, val: Any) -> None:
|
|
290
333
|
"""Raise TypeError if val is not a valid literal for this type"""
|
|
291
334
|
if val is None:
|
|
@@ -491,7 +534,7 @@ class TimestampType(ColumnType):
|
|
|
491
534
|
|
|
492
535
|
class JsonType(ColumnType):
|
|
493
536
|
# TODO: type_spec also needs to be able to express lists
|
|
494
|
-
def __init__(self, type_spec: Optional[
|
|
537
|
+
def __init__(self, type_spec: Optional[dict[str, ColumnType]] = None, nullable: bool = False):
|
|
495
538
|
super().__init__(self.Type.JSON, nullable=nullable)
|
|
496
539
|
self.type_spec = type_spec
|
|
497
540
|
|
|
@@ -526,7 +569,7 @@ class JsonType(ColumnType):
|
|
|
526
569
|
type_spec[other_field_name] = field_type
|
|
527
570
|
return JsonType(type_spec, nullable=(self.nullable or other.nullable))
|
|
528
571
|
|
|
529
|
-
def _as_dict(self) ->
|
|
572
|
+
def _as_dict(self) -> dict:
|
|
530
573
|
result = super()._as_dict()
|
|
531
574
|
if self.type_spec is not None:
|
|
532
575
|
type_spec_dict = {field_name: field_type.serialize() for field_name, field_type in self.type_spec.items()}
|
|
@@ -534,7 +577,7 @@ class JsonType(ColumnType):
|
|
|
534
577
|
return result
|
|
535
578
|
|
|
536
579
|
@classmethod
|
|
537
|
-
def _from_dict(cls, d:
|
|
580
|
+
def _from_dict(cls, d: dict) -> ColumnType:
|
|
538
581
|
type_spec = None
|
|
539
582
|
if 'type_spec' in d:
|
|
540
583
|
type_spec = {
|
|
@@ -590,6 +633,9 @@ class ArrayType(ColumnType):
|
|
|
590
633
|
def matches(self, other: ColumnType) -> bool:
|
|
591
634
|
return isinstance(other, ArrayType) and self.shape == other.shape and self.dtype == other.dtype
|
|
592
635
|
|
|
636
|
+
def __hash__(self) -> int:
|
|
637
|
+
return hash((self._type, self.nullable, self.shape, self.dtype))
|
|
638
|
+
|
|
593
639
|
def supertype(self, other: ColumnType) -> Optional[ArrayType]:
|
|
594
640
|
if not isinstance(other, ArrayType):
|
|
595
641
|
return None
|
|
@@ -601,16 +647,16 @@ class ArrayType(ColumnType):
|
|
|
601
647
|
shape = [n1 if n1 == n2 else None for n1, n2 in zip(self.shape, other.shape)]
|
|
602
648
|
return ArrayType(tuple(shape), self.make_type(base_type), nullable=(self.nullable or other.nullable))
|
|
603
649
|
|
|
604
|
-
def _as_dict(self) ->
|
|
650
|
+
def _as_dict(self) -> dict:
|
|
605
651
|
result = super()._as_dict()
|
|
606
652
|
result.update(shape=list(self.shape), dtype=self.dtype.value)
|
|
607
653
|
return result
|
|
608
654
|
|
|
609
|
-
def
|
|
610
|
-
return f'{self.
|
|
655
|
+
def _to_base_str(self) -> str:
|
|
656
|
+
return f'Array[{self.shape}, {self.pxt_dtype}]'
|
|
611
657
|
|
|
612
658
|
@classmethod
|
|
613
|
-
def _from_dict(cls, d:
|
|
659
|
+
def _from_dict(cls, d: dict) -> ColumnType:
|
|
614
660
|
assert 'shape' in d
|
|
615
661
|
assert 'dtype' in d
|
|
616
662
|
shape = tuple(d['shape'])
|
|
@@ -681,7 +727,7 @@ class ArrayType(ColumnType):
|
|
|
681
727
|
|
|
682
728
|
class ImageType(ColumnType):
|
|
683
729
|
def __init__(
|
|
684
|
-
self, width: Optional[int] = None, height: Optional[int] = None, size: Optional[
|
|
730
|
+
self, width: Optional[int] = None, height: Optional[int] = None, size: Optional[tuple[int, int]] = None,
|
|
685
731
|
mode: Optional[str] = None, nullable: bool = False
|
|
686
732
|
):
|
|
687
733
|
"""
|
|
@@ -701,23 +747,17 @@ class ImageType(ColumnType):
|
|
|
701
747
|
def copy(self, nullable: bool) -> ColumnType:
|
|
702
748
|
return ImageType(self.width, self.height, mode=self.mode, nullable=nullable)
|
|
703
749
|
|
|
704
|
-
def
|
|
705
|
-
|
|
750
|
+
def _to_base_str(self) -> str:
|
|
751
|
+
params = []
|
|
752
|
+
if self.width is not None or self.height is not None:
|
|
753
|
+
params.append(f'({self.width}, {self.height})')
|
|
754
|
+
if self.mode is not None:
|
|
755
|
+
params.append(repr(self.mode))
|
|
756
|
+
if len(params) == 0:
|
|
706
757
|
params_str = ''
|
|
707
|
-
if self.width is not None:
|
|
708
|
-
params_str = f'width={self.width}'
|
|
709
|
-
if self.height is not None:
|
|
710
|
-
if len(params_str) > 0:
|
|
711
|
-
params_str += ', '
|
|
712
|
-
params_str += f'height={self.height}'
|
|
713
|
-
if self.mode is not None:
|
|
714
|
-
if len(params_str) > 0:
|
|
715
|
-
params_str += ', '
|
|
716
|
-
params_str += f'mode={self.mode}'
|
|
717
|
-
params_str = f'({params_str})'
|
|
718
758
|
else:
|
|
719
|
-
params_str = ''
|
|
720
|
-
return f'{
|
|
759
|
+
params_str = f'[{", ".join(params)}]'
|
|
760
|
+
return f'Image{params_str}'
|
|
721
761
|
|
|
722
762
|
def matches(self, other: ColumnType) -> bool:
|
|
723
763
|
return (
|
|
@@ -727,6 +767,9 @@ class ImageType(ColumnType):
|
|
|
727
767
|
and self.mode == other.mode
|
|
728
768
|
)
|
|
729
769
|
|
|
770
|
+
def __hash__(self) -> int:
|
|
771
|
+
return hash((self._type, self.nullable, self.size, self.mode))
|
|
772
|
+
|
|
730
773
|
def supertype(self, other: ColumnType) -> Optional[ImageType]:
|
|
731
774
|
if not isinstance(other, ImageType):
|
|
732
775
|
return None
|
|
@@ -736,18 +779,18 @@ class ImageType(ColumnType):
|
|
|
736
779
|
return ImageType(width=width, height=height, mode=mode, nullable=(self.nullable or other.nullable))
|
|
737
780
|
|
|
738
781
|
@property
|
|
739
|
-
def size(self) -> Optional[
|
|
782
|
+
def size(self) -> Optional[tuple[int, int]]:
|
|
740
783
|
if self.width is None or self.height is None:
|
|
741
784
|
return None
|
|
742
785
|
return (self.width, self.height)
|
|
743
786
|
|
|
744
|
-
def _as_dict(self) ->
|
|
787
|
+
def _as_dict(self) -> dict:
|
|
745
788
|
result = super()._as_dict()
|
|
746
789
|
result.update(width=self.width, height=self.height, mode=self.mode)
|
|
747
790
|
return result
|
|
748
791
|
|
|
749
792
|
@classmethod
|
|
750
|
-
def _from_dict(cls, d:
|
|
793
|
+
def _from_dict(cls, d: dict) -> ColumnType:
|
|
751
794
|
assert 'width' in d
|
|
752
795
|
assert 'height' in d
|
|
753
796
|
assert 'mode' in d
|
|
@@ -756,6 +799,20 @@ class ImageType(ColumnType):
|
|
|
756
799
|
def to_sa_type(self) -> sql.types.TypeEngine:
|
|
757
800
|
return sql.String()
|
|
758
801
|
|
|
802
|
+
def _create_literal(self, val: Any) -> Any:
|
|
803
|
+
if isinstance(val, str) and val.startswith('data:'):
|
|
804
|
+
# try parsing this as a `data:` URL, and if successful, decode the image immediately
|
|
805
|
+
try:
|
|
806
|
+
with urllib.request.urlopen(val) as response:
|
|
807
|
+
b = response.read()
|
|
808
|
+
img = PIL.Image.open(io.BytesIO(b))
|
|
809
|
+
img.load()
|
|
810
|
+
return img
|
|
811
|
+
except Exception as exc:
|
|
812
|
+
errormsg_val = val if len(val) < 50 else val[:50] + '...'
|
|
813
|
+
raise excs.Error(f'data URL could not be decoded into a valid image: {errormsg_val}') from exc
|
|
814
|
+
return val
|
|
815
|
+
|
|
759
816
|
def _validate_literal(self, val: Any) -> None:
|
|
760
817
|
if isinstance(val, PIL.Image.Image):
|
|
761
818
|
return
|
|
@@ -834,6 +891,7 @@ class DocumentType(ColumnType):
|
|
|
834
891
|
HTML = 0
|
|
835
892
|
MD = 1
|
|
836
893
|
PDF = 2
|
|
894
|
+
XML = 3
|
|
837
895
|
|
|
838
896
|
def __init__(self, nullable: bool = False, doc_formats: Optional[str] = None):
|
|
839
897
|
super().__init__(self.Type.DOCUMENT, nullable=nullable)
|
|
@@ -853,6 +911,9 @@ class DocumentType(ColumnType):
|
|
|
853
911
|
def matches(self, other: ColumnType) -> bool:
|
|
854
912
|
return isinstance(other, DocumentType) and self._doc_formats == other._doc_formats
|
|
855
913
|
|
|
914
|
+
def __hash__(self) -> int:
|
|
915
|
+
return hash((self._type, self.nullable, self._doc_formats))
|
|
916
|
+
|
|
856
917
|
def to_sa_type(self) -> sql.types.TypeEngine:
|
|
857
918
|
# stored as a file path
|
|
858
919
|
return sql.String()
|
|
@@ -866,3 +927,141 @@ class DocumentType(ColumnType):
|
|
|
866
927
|
dh = get_document_handle(val)
|
|
867
928
|
if dh is None:
|
|
868
929
|
raise excs.Error(f'Not a recognized document format: {val}')
|
|
930
|
+
|
|
931
|
+
|
|
932
|
+
T = typing.TypeVar('T')
|
|
933
|
+
|
|
934
|
+
|
|
935
|
+
class Required(typing.Generic[T]):
|
|
936
|
+
"""
|
|
937
|
+
Marker class to indicate that a column is non-nullable in a schema definition. This has no meaning as a type hint,
|
|
938
|
+
and is intended only for schema declarations.
|
|
939
|
+
"""
|
|
940
|
+
pass
|
|
941
|
+
|
|
942
|
+
|
|
943
|
+
String = typing.Annotated[str, StringType(nullable=False)]
|
|
944
|
+
Int = typing.Annotated[int, IntType(nullable=False)]
|
|
945
|
+
Float = typing.Annotated[float, FloatType(nullable=False)]
|
|
946
|
+
Bool = typing.Annotated[bool, BoolType(nullable=False)]
|
|
947
|
+
Timestamp = typing.Annotated[datetime.datetime, TimestampType(nullable=False)]
|
|
948
|
+
|
|
949
|
+
|
|
950
|
+
class _PxtType:
|
|
951
|
+
"""
|
|
952
|
+
Base class for the Pixeltable type-hint family. Subclasses of this class are meant to be used as type hints, both
|
|
953
|
+
in schema definitions and in UDF signatures. Whereas `ColumnType`s are instantiable and carry semantic information
|
|
954
|
+
about the Pixeltable type system, `_PxtType` subclasses are purely for convenience: they are not instantiable and
|
|
955
|
+
must be resolved to a `ColumnType` (by calling `ColumnType.from_python_type()`) in order to do anything meaningful
|
|
956
|
+
with them.
|
|
957
|
+
|
|
958
|
+
`_PxtType` subclasses can be specialized (as type hints) with type parameters; for example:
|
|
959
|
+
`Image[(300, 300), 'RGB']`. The specialized forms resolve to `typing.Annotated` instances whose annotation is a
|
|
960
|
+
`ColumnType`.
|
|
961
|
+
"""
|
|
962
|
+
def __init__(self):
|
|
963
|
+
raise TypeError(f'Type `{type(self)}` cannot be instantiated.')
|
|
964
|
+
|
|
965
|
+
@classmethod
|
|
966
|
+
def as_col_type(cls, nullable: bool) -> ColumnType:
|
|
967
|
+
raise NotImplementedError()
|
|
968
|
+
|
|
969
|
+
|
|
970
|
+
class Json(_PxtType):
|
|
971
|
+
@classmethod
|
|
972
|
+
def as_col_type(cls, nullable: bool) -> ColumnType:
|
|
973
|
+
return JsonType(nullable=nullable)
|
|
974
|
+
|
|
975
|
+
|
|
976
|
+
class Array(np.ndarray, _PxtType):
|
|
977
|
+
def __class_getitem__(cls, item: Any) -> _AnnotatedAlias:
|
|
978
|
+
"""
|
|
979
|
+
`item` (the type subscript) must be a tuple with exactly two elements (in any order):
|
|
980
|
+
- A tuple of `Optional[int]`s, specifying the shape of the array
|
|
981
|
+
- A type, specifying the dtype of the array
|
|
982
|
+
Example: Array[(3, None, 2), float]
|
|
983
|
+
"""
|
|
984
|
+
params = item if isinstance(item, tuple) else (item,)
|
|
985
|
+
shape: Optional[tuple] = None
|
|
986
|
+
dtype: Optional[ColumnType] = None
|
|
987
|
+
for param in params:
|
|
988
|
+
if isinstance(param, tuple):
|
|
989
|
+
if not all(n is None or (isinstance(n, int) and n >= 1) for n in param):
|
|
990
|
+
raise TypeError(f'Invalid Array type parameter: {param}')
|
|
991
|
+
if shape is not None:
|
|
992
|
+
raise TypeError(f'Duplicate Array type parameter: {param}')
|
|
993
|
+
shape = param
|
|
994
|
+
elif isinstance(param, type) or isinstance(param, _AnnotatedAlias):
|
|
995
|
+
if dtype is not None:
|
|
996
|
+
raise TypeError(f'Duplicate Array type parameter: {param}')
|
|
997
|
+
dtype = ColumnType.from_python_type(param)
|
|
998
|
+
else:
|
|
999
|
+
raise TypeError(f'Invalid Array type parameter: {param}')
|
|
1000
|
+
if shape is None:
|
|
1001
|
+
raise TypeError('Array type is missing parameter: shape')
|
|
1002
|
+
if dtype is None:
|
|
1003
|
+
raise TypeError('Array type is missing parameter: dtype')
|
|
1004
|
+
return typing.Annotated[np.ndarray, ArrayType(shape=shape, dtype=dtype, nullable=False)]
|
|
1005
|
+
|
|
1006
|
+
@classmethod
|
|
1007
|
+
def as_col_type(cls, nullable: bool) -> ColumnType:
|
|
1008
|
+
raise TypeError('Array type cannot be used without specifying shape and dtype')
|
|
1009
|
+
|
|
1010
|
+
|
|
1011
|
+
class Image(PIL.Image.Image, _PxtType):
|
|
1012
|
+
def __class_getitem__(cls, item: Any) -> _AnnotatedAlias:
|
|
1013
|
+
"""
|
|
1014
|
+
`item` (the type subscript) must be one of the following, or a tuple containing either or both in any order:
|
|
1015
|
+
- A 2-tuple of `int`s, specifying the size of the image
|
|
1016
|
+
- A string, specifying the mode of the image
|
|
1017
|
+
Example: Image[(300, 300), 'RGB']
|
|
1018
|
+
"""
|
|
1019
|
+
if isinstance(item, tuple) and all(n is None or isinstance(n, int) for n in item):
|
|
1020
|
+
# It's a tuple of the form (width, height)
|
|
1021
|
+
params = (item,)
|
|
1022
|
+
elif isinstance(item, tuple):
|
|
1023
|
+
# It's a compound tuple (multiple parameters)
|
|
1024
|
+
params = item
|
|
1025
|
+
else:
|
|
1026
|
+
# Not a tuple (single arg)
|
|
1027
|
+
params = (item,)
|
|
1028
|
+
size: Optional[tuple] = None
|
|
1029
|
+
mode: Optional[str] = None
|
|
1030
|
+
for param in params:
|
|
1031
|
+
if isinstance(param, tuple):
|
|
1032
|
+
if len(param) != 2 or not isinstance(param[0], (int, type(None))) or not isinstance(param[1], (int, type(None))):
|
|
1033
|
+
raise TypeError(f'Invalid Image type parameter: {param}')
|
|
1034
|
+
if size is not None:
|
|
1035
|
+
raise TypeError(f'Duplicate Image type parameter: {param}')
|
|
1036
|
+
size = param
|
|
1037
|
+
elif isinstance(param, str):
|
|
1038
|
+
if param not in PIL.Image.MODES:
|
|
1039
|
+
raise TypeError(f'Invalid Image type parameter: {param!r}')
|
|
1040
|
+
if mode is not None:
|
|
1041
|
+
raise TypeError(f'Duplicate Image type parameter: {param!r}')
|
|
1042
|
+
mode = param
|
|
1043
|
+
else:
|
|
1044
|
+
raise TypeError(f'Invalid Image type parameter: {param}')
|
|
1045
|
+
return typing.Annotated[PIL.Image.Image, ImageType(size=size, mode=mode, nullable=False)]
|
|
1046
|
+
|
|
1047
|
+
@classmethod
|
|
1048
|
+
def as_col_type(cls, nullable: bool) -> ColumnType:
|
|
1049
|
+
return ImageType(nullable=nullable)
|
|
1050
|
+
|
|
1051
|
+
|
|
1052
|
+
class Video(str, _PxtType):
|
|
1053
|
+
@classmethod
|
|
1054
|
+
def as_col_type(cls, nullable: bool) -> ColumnType:
|
|
1055
|
+
return VideoType(nullable=nullable)
|
|
1056
|
+
|
|
1057
|
+
|
|
1058
|
+
class Audio(str, _PxtType):
|
|
1059
|
+
@classmethod
|
|
1060
|
+
def as_col_type(cls, nullable: bool) -> ColumnType:
|
|
1061
|
+
return AudioType(nullable=nullable)
|
|
1062
|
+
|
|
1063
|
+
|
|
1064
|
+
class Document(str, _PxtType):
|
|
1065
|
+
@classmethod
|
|
1066
|
+
def as_col_type(cls, nullable: bool) -> ColumnType:
|
|
1067
|
+
return DocumentType(nullable=nullable)
|
pixeltable/utils/arrow.py
CHANGED
|
@@ -1,13 +1,14 @@
|
|
|
1
1
|
import logging
|
|
2
|
-
from typing import Any,
|
|
2
|
+
from typing import Any, Iterator, Optional, Union
|
|
3
3
|
|
|
4
|
+
import numpy as np
|
|
4
5
|
import pyarrow as pa
|
|
5
6
|
|
|
6
7
|
import pixeltable.type_system as ts
|
|
7
8
|
|
|
8
9
|
_logger = logging.getLogger(__name__)
|
|
9
10
|
|
|
10
|
-
_pa_to_pt:
|
|
11
|
+
_pa_to_pt: dict[pa.DataType, ts.ColumnType] = {
|
|
11
12
|
pa.string(): ts.StringType(nullable=True),
|
|
12
13
|
pa.timestamp('us'): ts.TimestampType(nullable=True),
|
|
13
14
|
pa.bool_(): ts.BoolType(nullable=True),
|
|
@@ -20,7 +21,7 @@ _pa_to_pt: Dict[pa.DataType, ts.ColumnType] = {
|
|
|
20
21
|
pa.float32(): ts.FloatType(nullable=True),
|
|
21
22
|
}
|
|
22
23
|
|
|
23
|
-
_pt_to_pa:
|
|
24
|
+
_pt_to_pa: dict[type[ts.ColumnType], pa.DataType] = {
|
|
24
25
|
ts.StringType: pa.string(),
|
|
25
26
|
ts.TimestampType: pa.timestamp('us'), # postgres timestamp is microseconds
|
|
26
27
|
ts.BoolType: pa.bool_(),
|
|
@@ -61,19 +62,19 @@ def to_arrow_type(pixeltable_type: ts.ColumnType) -> Optional[pa.DataType]:
|
|
|
61
62
|
return None
|
|
62
63
|
|
|
63
64
|
|
|
64
|
-
def to_pixeltable_schema(arrow_schema: pa.Schema) ->
|
|
65
|
+
def to_pixeltable_schema(arrow_schema: pa.Schema) -> dict[str, ts.ColumnType]:
|
|
65
66
|
return {field.name: to_pixeltable_type(field.type) for field in arrow_schema}
|
|
66
67
|
|
|
67
68
|
|
|
68
|
-
def to_arrow_schema(pixeltable_schema:
|
|
69
|
-
return pa.schema((name, to_arrow_type(typ)) for name, typ in pixeltable_schema.items())
|
|
69
|
+
def to_arrow_schema(pixeltable_schema: dict[str, Any]) -> pa.Schema:
|
|
70
|
+
return pa.schema((name, to_arrow_type(typ)) for name, typ in pixeltable_schema.items()) # type: ignore[misc]
|
|
70
71
|
|
|
71
72
|
|
|
72
|
-
def to_pydict(batch: pa.RecordBatch) ->
|
|
73
|
+
def to_pydict(batch: pa.RecordBatch) -> dict[str, Union[list, np.ndarray]]:
|
|
73
74
|
"""Convert a RecordBatch to a dictionary of lists, unlike pa.lib.RecordBatch.to_pydict,
|
|
74
75
|
this function will not convert numpy arrays to lists, and will preserve the original numpy dtype.
|
|
75
76
|
"""
|
|
76
|
-
out = {}
|
|
77
|
+
out: dict[str, Union[list, np.ndarray]] = {}
|
|
77
78
|
for k, name in enumerate(batch.schema.names):
|
|
78
79
|
col = batch.column(k)
|
|
79
80
|
if isinstance(col.type, pa.FixedShapeTensorType):
|
|
@@ -86,7 +87,7 @@ def to_pydict(batch: pa.RecordBatch) -> Dict[str, Iterable[Any]]:
|
|
|
86
87
|
return out
|
|
87
88
|
|
|
88
89
|
|
|
89
|
-
def iter_tuples(batch: pa.RecordBatch) -> Iterator[
|
|
90
|
+
def iter_tuples(batch: pa.RecordBatch) -> Iterator[dict[str, Any]]:
|
|
90
91
|
"""Convert a RecordBatch to an iterator of dictionaries. also works with pa.Table and pa.RowGroup"""
|
|
91
92
|
pydict = to_pydict(batch)
|
|
92
93
|
assert len(pydict) > 0, 'empty record batch'
|
pixeltable/utils/coco.py
CHANGED
|
@@ -1,12 +1,12 @@
|
|
|
1
|
-
from typing import List, Dict, Any, Set
|
|
2
|
-
from pathlib import Path
|
|
3
1
|
import json
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from typing import Any, Dict, List, Set
|
|
4
4
|
|
|
5
5
|
import PIL
|
|
6
6
|
|
|
7
|
+
import pixeltable as pxt
|
|
7
8
|
import pixeltable.exceptions as excs
|
|
8
9
|
|
|
9
|
-
|
|
10
10
|
format_msg = """
|
|
11
11
|
|
|
12
12
|
Required format:
|
|
@@ -48,7 +48,7 @@ def _verify_input_dict(input_dict: Dict[str, Any]) -> None:
|
|
|
48
48
|
if not isinstance(annotation['category'], (str, int)):
|
|
49
49
|
raise excs.Error(f'Value for "category" is not a str or int: {annotation}{format_msg}')
|
|
50
50
|
|
|
51
|
-
def write_coco_dataset(df:
|
|
51
|
+
def write_coco_dataset(df: pxt.DataFrame, dest_path: Path) -> Path:
|
|
52
52
|
"""Export a DataFrame result set as a COCO dataset in dest_path and return the path of the data.json file."""
|
|
53
53
|
# TODO: validate schema
|
|
54
54
|
if len(df._select_list_exprs) != 1 or not df._select_list_exprs[0].col_type.is_json_type():
|
pixeltable/utils/documents.py
CHANGED
|
@@ -2,7 +2,7 @@ import dataclasses
|
|
|
2
2
|
from typing import Optional
|
|
3
3
|
|
|
4
4
|
import bs4
|
|
5
|
-
import fitz #
|
|
5
|
+
import fitz # type: ignore[import-untyped]
|
|
6
6
|
import puremagic
|
|
7
7
|
|
|
8
8
|
import pixeltable.type_system as ts
|
|
@@ -35,6 +35,11 @@ def get_document_handle(path: str) -> Optional[DocumentHandle]:
|
|
|
35
35
|
if md_ast is not None:
|
|
36
36
|
return DocumentHandle(format=ts.DocumentType.DocumentFormat.MD, md_ast=md_ast)
|
|
37
37
|
|
|
38
|
+
if doc_format == '.xml':
|
|
39
|
+
bs_doc = get_xml_handle(path)
|
|
40
|
+
if bs_doc is not None:
|
|
41
|
+
return DocumentHandle(format=ts.DocumentType.DocumentFormat.XML, bs_doc=bs_doc)
|
|
42
|
+
|
|
38
43
|
return None
|
|
39
44
|
|
|
40
45
|
|
|
@@ -54,7 +59,16 @@ def get_pdf_handle(path: str) -> Optional[fitz.Document]:
|
|
|
54
59
|
def get_html_handle(path: str) -> Optional[bs4.BeautifulSoup]:
|
|
55
60
|
try:
|
|
56
61
|
with open(path, 'r', encoding='utf8') as fp:
|
|
57
|
-
doc = bs4.BeautifulSoup(fp, '
|
|
62
|
+
doc = bs4.BeautifulSoup(fp, 'lxml')
|
|
63
|
+
return doc if doc.find() is not None else None
|
|
64
|
+
except Exception:
|
|
65
|
+
return None
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def get_xml_handle(path: str) -> Optional[bs4.BeautifulSoup]:
|
|
69
|
+
try:
|
|
70
|
+
with open(path, 'r', encoding='utf8') as fp:
|
|
71
|
+
doc = bs4.BeautifulSoup(fp, 'xml')
|
|
58
72
|
return doc if doc.find() is not None else None
|
|
59
73
|
except Exception:
|
|
60
74
|
return None
|