pixeltable 0.2.20__py3-none-any.whl → 0.2.21__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pixeltable might be problematic. Click here for more details.
- pixeltable/__init__.py +7 -19
- pixeltable/__version__.py +2 -2
- pixeltable/catalog/__init__.py +7 -7
- pixeltable/catalog/globals.py +3 -0
- pixeltable/catalog/table.py +208 -145
- pixeltable/catalog/table_version.py +36 -18
- pixeltable/catalog/table_version_path.py +0 -8
- pixeltable/catalog/view.py +3 -3
- pixeltable/dataframe.py +9 -24
- pixeltable/env.py +1 -1
- pixeltable/exec/__init__.py +1 -1
- pixeltable/exec/aggregation_node.py +22 -15
- pixeltable/exec/data_row_batch.py +7 -7
- pixeltable/exec/exec_node.py +35 -7
- pixeltable/exec/expr_eval_node.py +2 -1
- pixeltable/exec/in_memory_data_node.py +9 -9
- pixeltable/exec/sql_node.py +265 -136
- pixeltable/exprs/__init__.py +1 -0
- pixeltable/exprs/data_row.py +30 -19
- pixeltable/exprs/expr.py +15 -14
- pixeltable/exprs/expr_dict.py +55 -0
- pixeltable/exprs/expr_set.py +21 -15
- pixeltable/exprs/function_call.py +21 -8
- pixeltable/exprs/rowid_ref.py +2 -2
- pixeltable/exprs/sql_element_cache.py +5 -1
- pixeltable/ext/functions/whisperx.py +7 -2
- pixeltable/func/callable_function.py +2 -2
- pixeltable/func/function_registry.py +6 -7
- pixeltable/func/query_template_function.py +11 -12
- pixeltable/func/signature.py +17 -15
- pixeltable/func/udf.py +0 -4
- pixeltable/functions/__init__.py +1 -1
- pixeltable/functions/audio.py +4 -6
- pixeltable/functions/globals.py +86 -42
- pixeltable/functions/huggingface.py +12 -14
- pixeltable/functions/image.py +59 -45
- pixeltable/functions/json.py +0 -1
- pixeltable/functions/mistralai.py +2 -2
- pixeltable/functions/openai.py +22 -25
- pixeltable/functions/string.py +50 -50
- pixeltable/functions/timestamp.py +20 -20
- pixeltable/functions/together.py +2 -2
- pixeltable/functions/video.py +11 -20
- pixeltable/functions/whisper.py +2 -20
- pixeltable/globals.py +55 -56
- pixeltable/index/base.py +2 -2
- pixeltable/index/btree.py +7 -7
- pixeltable/index/embedding_index.py +8 -10
- pixeltable/io/external_store.py +11 -5
- pixeltable/io/globals.py +2 -0
- pixeltable/io/hf_datasets.py +1 -1
- pixeltable/io/label_studio.py +6 -6
- pixeltable/io/parquet.py +14 -13
- pixeltable/iterators/document.py +9 -7
- pixeltable/iterators/video.py +10 -1
- pixeltable/metadata/__init__.py +3 -2
- pixeltable/metadata/converters/convert_14.py +4 -2
- pixeltable/metadata/converters/convert_15.py +1 -1
- pixeltable/metadata/converters/convert_19.py +1 -0
- pixeltable/metadata/converters/convert_20.py +1 -1
- pixeltable/metadata/converters/util.py +9 -8
- pixeltable/metadata/schema.py +32 -21
- pixeltable/plan.py +136 -154
- pixeltable/store.py +51 -36
- pixeltable/tool/create_test_db_dump.py +6 -6
- pixeltable/tool/doc_plugins/griffe.py +3 -34
- pixeltable/tool/mypy_plugin.py +32 -0
- pixeltable/type_system.py +243 -60
- pixeltable/utils/arrow.py +10 -9
- pixeltable/utils/coco.py +4 -4
- pixeltable/utils/documents.py +1 -1
- pixeltable/utils/filecache.py +9 -9
- pixeltable/utils/formatter.py +1 -1
- pixeltable/utils/http_server.py +2 -5
- pixeltable/utils/media_store.py +6 -6
- pixeltable/utils/pytorch.py +10 -11
- pixeltable/utils/sql.py +2 -1
- {pixeltable-0.2.20.dist-info → pixeltable-0.2.21.dist-info}/METADATA +6 -5
- pixeltable-0.2.21.dist-info/RECORD +148 -0
- pixeltable/utils/help.py +0 -11
- pixeltable-0.2.20.dist-info/RECORD +0 -147
- {pixeltable-0.2.20.dist-info → pixeltable-0.2.21.dist-info}/LICENSE +0 -0
- {pixeltable-0.2.20.dist-info → pixeltable-0.2.21.dist-info}/WHEEL +0 -0
- {pixeltable-0.2.20.dist-info → pixeltable-0.2.21.dist-info}/entry_points.txt +0 -0
|
@@ -149,18 +149,18 @@ class Dumper:
|
|
|
149
149
|
pxt.create_dir('views')
|
|
150
150
|
|
|
151
151
|
# simple view
|
|
152
|
-
v = pxt.create_view('views.view', t
|
|
152
|
+
v = pxt.create_view('views.view', t.where(t.c2 < 50))
|
|
153
153
|
self.__add_expr_columns(v, 'view')
|
|
154
154
|
|
|
155
155
|
# snapshot
|
|
156
|
-
_ = pxt.create_view('views.snapshot', t
|
|
156
|
+
_ = pxt.create_view('views.snapshot', t.where(t.c2 >= 75), is_snapshot=True)
|
|
157
157
|
|
|
158
158
|
# view of views
|
|
159
|
-
vv = pxt.create_view('views.view_of_views', v
|
|
159
|
+
vv = pxt.create_view('views.view_of_views', v.where(t.c2 >= 25))
|
|
160
160
|
self.__add_expr_columns(vv, 'view_of_views')
|
|
161
161
|
|
|
162
162
|
# empty view
|
|
163
|
-
e = pxt.create_view('views.empty_view', t
|
|
163
|
+
e = pxt.create_view('views.empty_view', t.where(t.c2 == 4171780))
|
|
164
164
|
assert e.count() == 0
|
|
165
165
|
self.__add_expr_columns(e, 'empty_view', include_expensive_functions=True)
|
|
166
166
|
|
|
@@ -278,13 +278,13 @@ class Dumper:
|
|
|
278
278
|
# this breaks; TODO: why?
|
|
279
279
|
#return t.where(t.c2 < i)
|
|
280
280
|
return t.where(t.c2 < i).select(t.c1, t.c2)
|
|
281
|
-
add_column('query_output', t.q1(t.c2))
|
|
281
|
+
add_column('query_output', t.queries.q1(t.c2))
|
|
282
282
|
|
|
283
283
|
@t.query
|
|
284
284
|
def q2(s: str):
|
|
285
285
|
sim = t[f'{col_prefix}_function_call'].similarity(s)
|
|
286
286
|
return t.order_by(sim, asc=False).select(t[f'{col_prefix}_function_call']).limit(5)
|
|
287
|
-
add_column('sim_output', t.q2(t.c1))
|
|
287
|
+
add_column('sim_output', t.queries.q2(t.c1))
|
|
288
288
|
|
|
289
289
|
|
|
290
290
|
@pxt.udf(_force_stored=True)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import ast
|
|
2
|
-
from typing import Optional, Union
|
|
3
2
|
import warnings
|
|
3
|
+
from typing import Optional, Union
|
|
4
4
|
|
|
5
5
|
import griffe
|
|
6
6
|
import griffe.expressions
|
|
@@ -39,7 +39,7 @@ class PxtGriffeExtension(Extension):
|
|
|
39
39
|
udf = griffe.dynamic_import(func.path)
|
|
40
40
|
assert isinstance(udf, pxt.Function)
|
|
41
41
|
# Convert the return type to a Pixeltable type reference
|
|
42
|
-
func.returns =
|
|
42
|
+
func.returns = str(udf.signature.get_return_type())
|
|
43
43
|
# Convert the parameter types to Pixeltable type references
|
|
44
44
|
for griffe_param in func.parameters:
|
|
45
45
|
assert isinstance(griffe_param.annotation, griffe.expressions.Expr)
|
|
@@ -47,35 +47,4 @@ class PxtGriffeExtension(Extension):
|
|
|
47
47
|
logger.warning(f'Parameter `{griffe_param.name}` not found in signature for UDF: {udf.display_name}')
|
|
48
48
|
continue
|
|
49
49
|
pxt_param = udf.signature.parameters[griffe_param.name]
|
|
50
|
-
griffe_param.annotation =
|
|
51
|
-
|
|
52
|
-
def __column_type_to_display_str(self, column_type: Optional[pxt.ColumnType]) -> str:
|
|
53
|
-
# TODO: When we enhance the Pixeltable type system, we may want to refactor some of this logic out.
|
|
54
|
-
# I'm putting it here for now though.
|
|
55
|
-
if column_type is None:
|
|
56
|
-
return 'None'
|
|
57
|
-
if column_type.is_string_type():
|
|
58
|
-
base = 'str'
|
|
59
|
-
elif column_type.is_int_type():
|
|
60
|
-
base = 'int'
|
|
61
|
-
elif column_type.is_float_type():
|
|
62
|
-
base = 'float'
|
|
63
|
-
elif column_type.is_bool_type():
|
|
64
|
-
base = 'bool'
|
|
65
|
-
elif column_type.is_timestamp_type():
|
|
66
|
-
base = 'datetime'
|
|
67
|
-
elif column_type.is_array_type():
|
|
68
|
-
base = 'ArrayT'
|
|
69
|
-
elif column_type.is_json_type():
|
|
70
|
-
base = 'JsonT'
|
|
71
|
-
elif column_type.is_image_type():
|
|
72
|
-
base = 'ImageT'
|
|
73
|
-
elif column_type.is_video_type():
|
|
74
|
-
base = 'VideoT'
|
|
75
|
-
elif column_type.is_audio_type():
|
|
76
|
-
base = 'AudioT'
|
|
77
|
-
elif column_type.is_document_type():
|
|
78
|
-
base = 'DocumentT'
|
|
79
|
-
else:
|
|
80
|
-
assert False
|
|
81
|
-
return f'Optional[{base}]' if column_type.nullable else base
|
|
50
|
+
griffe_param.annotation = str(pxt_param.col_type)
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
from typing import Callable, Optional
|
|
2
|
+
|
|
3
|
+
from mypy.plugin import AnalyzeTypeContext, Plugin
|
|
4
|
+
from mypy.types import Type
|
|
5
|
+
|
|
6
|
+
import pixeltable as pxt
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class PxtPlugin(Plugin):
|
|
10
|
+
__TYPE_MAP = {
|
|
11
|
+
pxt.Json: 'typing.Any',
|
|
12
|
+
pxt.Array: 'numpy.ndarray',
|
|
13
|
+
pxt.Image: 'PIL.Image.Image',
|
|
14
|
+
pxt.Video: 'builtins.str',
|
|
15
|
+
pxt.Audio: 'builtins.str',
|
|
16
|
+
pxt.Document: 'builtins.str',
|
|
17
|
+
}
|
|
18
|
+
__FULLNAME_MAP = {
|
|
19
|
+
f'{k.__module__}.{k.__name__}': v
|
|
20
|
+
for k, v in __TYPE_MAP.items()
|
|
21
|
+
}
|
|
22
|
+
|
|
23
|
+
def get_type_analyze_hook(self, fullname: str) -> Optional[Callable[[AnalyzeTypeContext], type]]:
|
|
24
|
+
if fullname in self.__FULLNAME_MAP:
|
|
25
|
+
subst_name = self.__FULLNAME_MAP[fullname]
|
|
26
|
+
return lambda ctx: pxt_hook(ctx, subst_name)
|
|
27
|
+
|
|
28
|
+
def plugin(version: str):
|
|
29
|
+
return PxtPlugin
|
|
30
|
+
|
|
31
|
+
def pxt_hook(ctx: AnalyzeTypeContext, subst_name: str) -> Type:
|
|
32
|
+
return ctx.api.named_type(subst_name)
|
pixeltable/type_system.py
CHANGED
|
@@ -8,15 +8,16 @@ import typing
|
|
|
8
8
|
import urllib.parse
|
|
9
9
|
import urllib.request
|
|
10
10
|
from pathlib import Path
|
|
11
|
-
from typing import Any,
|
|
11
|
+
from typing import Any, Iterable, Mapping, Optional, Sequence, Union
|
|
12
12
|
|
|
13
13
|
import av # type: ignore
|
|
14
14
|
import numpy as np
|
|
15
15
|
import PIL.Image
|
|
16
16
|
import sqlalchemy as sql
|
|
17
|
+
from typing import _GenericAlias # type: ignore[attr-defined]
|
|
18
|
+
from typing_extensions import _AnnotatedAlias
|
|
17
19
|
|
|
18
20
|
import pixeltable.exceptions as excs
|
|
19
|
-
from pixeltable.env import Env
|
|
20
21
|
|
|
21
22
|
|
|
22
23
|
class ColumnType:
|
|
@@ -41,7 +42,7 @@ class ColumnType:
|
|
|
41
42
|
def supertype(
|
|
42
43
|
cls, type1: 'ColumnType.Type', type2: 'ColumnType.Type',
|
|
43
44
|
# we need to pass this in because we can't easily append it as a class member
|
|
44
|
-
common_supertypes:
|
|
45
|
+
common_supertypes: dict[tuple['ColumnType.Type', 'ColumnType.Type'], 'ColumnType.Type']
|
|
45
46
|
) -> Optional['ColumnType.Type']:
|
|
46
47
|
if type1 == type2:
|
|
47
48
|
return type1
|
|
@@ -103,16 +104,16 @@ class ColumnType:
|
|
|
103
104
|
return self.__class__(nullable=nullable) # type: ignore[call-arg]
|
|
104
105
|
|
|
105
106
|
@classmethod
|
|
106
|
-
def serialize_list(cls, type_list:
|
|
107
|
+
def serialize_list(cls, type_list: list[ColumnType]) -> str:
|
|
107
108
|
return json.dumps([t.as_dict() for t in type_list])
|
|
108
109
|
|
|
109
|
-
def as_dict(self) ->
|
|
110
|
+
def as_dict(self) -> dict:
|
|
110
111
|
return {
|
|
111
112
|
'_classname': self.__class__.__name__,
|
|
112
113
|
**self._as_dict(),
|
|
113
114
|
}
|
|
114
115
|
|
|
115
|
-
def _as_dict(self) ->
|
|
116
|
+
def _as_dict(self) -> dict:
|
|
116
117
|
return {'nullable': self.nullable}
|
|
117
118
|
|
|
118
119
|
@classmethod
|
|
@@ -121,18 +122,18 @@ class ColumnType:
|
|
|
121
122
|
return cls.from_dict(type_dict)
|
|
122
123
|
|
|
123
124
|
@classmethod
|
|
124
|
-
def deserialize_list(cls, type_list_str: str) ->
|
|
125
|
+
def deserialize_list(cls, type_list_str: str) -> list[ColumnType]:
|
|
125
126
|
type_dict_list = json.loads(type_list_str)
|
|
126
127
|
return [cls.from_dict(type_dict) for type_dict in type_dict_list]
|
|
127
128
|
|
|
128
129
|
@classmethod
|
|
129
|
-
def from_dict(cls, type_dict:
|
|
130
|
+
def from_dict(cls, type_dict: dict) -> ColumnType:
|
|
130
131
|
assert '_classname' in type_dict
|
|
131
132
|
type_class = globals()[type_dict['_classname']]
|
|
132
133
|
return type_class._from_dict(type_dict)
|
|
133
134
|
|
|
134
135
|
@classmethod
|
|
135
|
-
def _from_dict(cls, d:
|
|
136
|
+
def _from_dict(cls, d: dict) -> ColumnType:
|
|
136
137
|
"""
|
|
137
138
|
Default implementation: simply invoke c'tor
|
|
138
139
|
"""
|
|
@@ -164,11 +165,28 @@ class ColumnType:
|
|
|
164
165
|
return DocumentType()
|
|
165
166
|
|
|
166
167
|
def __str__(self) -> str:
|
|
167
|
-
return self.
|
|
168
|
+
return self._to_str(as_schema=False)
|
|
169
|
+
|
|
170
|
+
def _to_str(self, as_schema: bool) -> str:
|
|
171
|
+
base_str = self._to_base_str()
|
|
172
|
+
if as_schema:
|
|
173
|
+
return base_str if self.nullable else f'Required[{base_str}]'
|
|
174
|
+
else:
|
|
175
|
+
return f'Optional[{base_str}]' if self.nullable else base_str
|
|
176
|
+
|
|
177
|
+
def _to_base_str(self) -> str:
|
|
178
|
+
"""
|
|
179
|
+
String representation of this type, disregarding nullability. Default implementation is to camel-case
|
|
180
|
+
the type name; subclasses may override.
|
|
181
|
+
"""
|
|
182
|
+
return self._type.name[0] + self._type.name[1:].lower()
|
|
168
183
|
|
|
169
184
|
def __eq__(self, other: object) -> bool:
|
|
170
185
|
return isinstance(other, ColumnType) and self.matches(other) and self.nullable == other.nullable
|
|
171
186
|
|
|
187
|
+
def __hash__(self) -> int:
|
|
188
|
+
return hash((self._type, self.nullable))
|
|
189
|
+
|
|
172
190
|
def is_supertype_of(self, other: ColumnType, ignore_nullable: bool = False) -> bool:
|
|
173
191
|
if ignore_nullable:
|
|
174
192
|
supertype = self.supertype(other)
|
|
@@ -253,39 +271,63 @@ class ColumnType:
|
|
|
253
271
|
return inferred_type
|
|
254
272
|
|
|
255
273
|
@classmethod
|
|
256
|
-
def from_python_type(cls, t: type) -> Optional[ColumnType]:
|
|
274
|
+
def from_python_type(cls, t: Union[type, _GenericAlias], nullable_default: bool = False) -> Optional[ColumnType]:
|
|
257
275
|
if typing.get_origin(t) is typing.Union:
|
|
258
276
|
union_args = typing.get_args(t)
|
|
259
|
-
if union_args
|
|
260
|
-
# `t` is a type of the form Optional[T] (equivalently, Union[T, None]).
|
|
277
|
+
if len(union_args) == 2 and type(None) in union_args:
|
|
278
|
+
# `t` is a type of the form Optional[T] (equivalently, Union[T, None] or Union[None, T]).
|
|
261
279
|
# We treat it as the underlying type but with nullable=True.
|
|
262
|
-
|
|
280
|
+
underlying_py_type = union_args[0] if union_args[1] is type(None) else union_args[1]
|
|
281
|
+
underlying = cls.from_python_type(underlying_py_type)
|
|
263
282
|
if underlying is not None:
|
|
264
|
-
underlying.
|
|
265
|
-
|
|
283
|
+
return underlying.copy(nullable=True)
|
|
284
|
+
elif typing.get_origin(t) is typing.Annotated:
|
|
285
|
+
annotated_args = typing.get_args(t)
|
|
286
|
+
origin = annotated_args[0]
|
|
287
|
+
parameters = annotated_args[1]
|
|
288
|
+
if isinstance(parameters, ColumnType):
|
|
289
|
+
return parameters.copy(nullable=nullable_default)
|
|
290
|
+
elif typing.get_origin(t) is Required:
|
|
291
|
+
required_args = typing.get_args(t)
|
|
292
|
+
assert len(required_args) == 1
|
|
293
|
+
return cls.from_python_type(required_args[0], nullable_default=False)
|
|
266
294
|
else:
|
|
267
295
|
# Discard type parameters to ensure that parameterized types such as `list[T]`
|
|
268
296
|
# are correctly mapped to Pixeltable types.
|
|
269
|
-
|
|
270
|
-
if
|
|
271
|
-
# No type parameters; the
|
|
272
|
-
|
|
273
|
-
if
|
|
274
|
-
return
|
|
275
|
-
if
|
|
276
|
-
return
|
|
277
|
-
if
|
|
278
|
-
return
|
|
279
|
-
if
|
|
280
|
-
return
|
|
281
|
-
if
|
|
282
|
-
return
|
|
283
|
-
if
|
|
284
|
-
return
|
|
285
|
-
if
|
|
286
|
-
return ImageType()
|
|
297
|
+
origin = typing.get_origin(t)
|
|
298
|
+
if origin is None:
|
|
299
|
+
# No type parameters; the origin type is just `t` itself
|
|
300
|
+
origin = t
|
|
301
|
+
if issubclass(origin, _PxtType):
|
|
302
|
+
return origin.as_col_type(nullable=nullable_default)
|
|
303
|
+
if origin is str:
|
|
304
|
+
return StringType(nullable=nullable_default)
|
|
305
|
+
if origin is int:
|
|
306
|
+
return IntType(nullable=nullable_default)
|
|
307
|
+
if origin is float:
|
|
308
|
+
return FloatType(nullable=nullable_default)
|
|
309
|
+
if origin is bool:
|
|
310
|
+
return BoolType(nullable=nullable_default)
|
|
311
|
+
if origin is datetime.datetime:
|
|
312
|
+
return TimestampType(nullable=nullable_default)
|
|
313
|
+
if origin is PIL.Image.Image:
|
|
314
|
+
return ImageType(nullable=nullable_default)
|
|
315
|
+
if issubclass(origin, Sequence) or issubclass(origin, Mapping):
|
|
316
|
+
return JsonType(nullable=nullable_default)
|
|
287
317
|
return None
|
|
288
318
|
|
|
319
|
+
@classmethod
|
|
320
|
+
def normalize_type(cls, t: Union[ColumnType, type, _AnnotatedAlias], nullable_default: bool = False) -> ColumnType:
|
|
321
|
+
"""
|
|
322
|
+
Convert any type recognizable by Pixeltable to its corresponding ColumnType.
|
|
323
|
+
"""
|
|
324
|
+
if isinstance(t, ColumnType):
|
|
325
|
+
return t
|
|
326
|
+
col_type = cls.from_python_type(t, nullable_default)
|
|
327
|
+
if col_type is None:
|
|
328
|
+
raise excs.Error(f'Unknown type: {t}')
|
|
329
|
+
return col_type
|
|
330
|
+
|
|
289
331
|
def validate_literal(self, val: Any) -> None:
|
|
290
332
|
"""Raise TypeError if val is not a valid literal for this type"""
|
|
291
333
|
if val is None:
|
|
@@ -491,7 +533,7 @@ class TimestampType(ColumnType):
|
|
|
491
533
|
|
|
492
534
|
class JsonType(ColumnType):
|
|
493
535
|
# TODO: type_spec also needs to be able to express lists
|
|
494
|
-
def __init__(self, type_spec: Optional[
|
|
536
|
+
def __init__(self, type_spec: Optional[dict[str, ColumnType]] = None, nullable: bool = False):
|
|
495
537
|
super().__init__(self.Type.JSON, nullable=nullable)
|
|
496
538
|
self.type_spec = type_spec
|
|
497
539
|
|
|
@@ -526,7 +568,7 @@ class JsonType(ColumnType):
|
|
|
526
568
|
type_spec[other_field_name] = field_type
|
|
527
569
|
return JsonType(type_spec, nullable=(self.nullable or other.nullable))
|
|
528
570
|
|
|
529
|
-
def _as_dict(self) ->
|
|
571
|
+
def _as_dict(self) -> dict:
|
|
530
572
|
result = super()._as_dict()
|
|
531
573
|
if self.type_spec is not None:
|
|
532
574
|
type_spec_dict = {field_name: field_type.serialize() for field_name, field_type in self.type_spec.items()}
|
|
@@ -534,7 +576,7 @@ class JsonType(ColumnType):
|
|
|
534
576
|
return result
|
|
535
577
|
|
|
536
578
|
@classmethod
|
|
537
|
-
def _from_dict(cls, d:
|
|
579
|
+
def _from_dict(cls, d: dict) -> ColumnType:
|
|
538
580
|
type_spec = None
|
|
539
581
|
if 'type_spec' in d:
|
|
540
582
|
type_spec = {
|
|
@@ -590,6 +632,9 @@ class ArrayType(ColumnType):
|
|
|
590
632
|
def matches(self, other: ColumnType) -> bool:
|
|
591
633
|
return isinstance(other, ArrayType) and self.shape == other.shape and self.dtype == other.dtype
|
|
592
634
|
|
|
635
|
+
def __hash__(self) -> int:
|
|
636
|
+
return hash((self._type, self.nullable, self.shape, self.dtype))
|
|
637
|
+
|
|
593
638
|
def supertype(self, other: ColumnType) -> Optional[ArrayType]:
|
|
594
639
|
if not isinstance(other, ArrayType):
|
|
595
640
|
return None
|
|
@@ -601,16 +646,16 @@ class ArrayType(ColumnType):
|
|
|
601
646
|
shape = [n1 if n1 == n2 else None for n1, n2 in zip(self.shape, other.shape)]
|
|
602
647
|
return ArrayType(tuple(shape), self.make_type(base_type), nullable=(self.nullable or other.nullable))
|
|
603
648
|
|
|
604
|
-
def _as_dict(self) ->
|
|
649
|
+
def _as_dict(self) -> dict:
|
|
605
650
|
result = super()._as_dict()
|
|
606
651
|
result.update(shape=list(self.shape), dtype=self.dtype.value)
|
|
607
652
|
return result
|
|
608
653
|
|
|
609
|
-
def
|
|
610
|
-
return f'{self.
|
|
654
|
+
def _to_base_str(self) -> str:
|
|
655
|
+
return f'Array[{self.shape}, {self.pxt_dtype}]'
|
|
611
656
|
|
|
612
657
|
@classmethod
|
|
613
|
-
def _from_dict(cls, d:
|
|
658
|
+
def _from_dict(cls, d: dict) -> ColumnType:
|
|
614
659
|
assert 'shape' in d
|
|
615
660
|
assert 'dtype' in d
|
|
616
661
|
shape = tuple(d['shape'])
|
|
@@ -681,7 +726,7 @@ class ArrayType(ColumnType):
|
|
|
681
726
|
|
|
682
727
|
class ImageType(ColumnType):
|
|
683
728
|
def __init__(
|
|
684
|
-
self, width: Optional[int] = None, height: Optional[int] = None, size: Optional[
|
|
729
|
+
self, width: Optional[int] = None, height: Optional[int] = None, size: Optional[tuple[int, int]] = None,
|
|
685
730
|
mode: Optional[str] = None, nullable: bool = False
|
|
686
731
|
):
|
|
687
732
|
"""
|
|
@@ -701,23 +746,17 @@ class ImageType(ColumnType):
|
|
|
701
746
|
def copy(self, nullable: bool) -> ColumnType:
|
|
702
747
|
return ImageType(self.width, self.height, mode=self.mode, nullable=nullable)
|
|
703
748
|
|
|
704
|
-
def
|
|
705
|
-
|
|
749
|
+
def _to_base_str(self) -> str:
|
|
750
|
+
params = []
|
|
751
|
+
if self.width is not None or self.height is not None:
|
|
752
|
+
params.append(f'({self.width}, {self.height})')
|
|
753
|
+
if self.mode is not None:
|
|
754
|
+
params.append(repr(self.mode))
|
|
755
|
+
if len(params) == 0:
|
|
706
756
|
params_str = ''
|
|
707
|
-
if self.width is not None:
|
|
708
|
-
params_str = f'width={self.width}'
|
|
709
|
-
if self.height is not None:
|
|
710
|
-
if len(params_str) > 0:
|
|
711
|
-
params_str += ', '
|
|
712
|
-
params_str += f'height={self.height}'
|
|
713
|
-
if self.mode is not None:
|
|
714
|
-
if len(params_str) > 0:
|
|
715
|
-
params_str += ', '
|
|
716
|
-
params_str += f'mode={self.mode}'
|
|
717
|
-
params_str = f'({params_str})'
|
|
718
757
|
else:
|
|
719
|
-
params_str = ''
|
|
720
|
-
return f'{
|
|
758
|
+
params_str = f'[{", ".join(params)}]'
|
|
759
|
+
return f'Image{params_str}'
|
|
721
760
|
|
|
722
761
|
def matches(self, other: ColumnType) -> bool:
|
|
723
762
|
return (
|
|
@@ -727,6 +766,9 @@ class ImageType(ColumnType):
|
|
|
727
766
|
and self.mode == other.mode
|
|
728
767
|
)
|
|
729
768
|
|
|
769
|
+
def __hash__(self) -> int:
|
|
770
|
+
return hash((self._type, self.nullable, self.size, self.mode))
|
|
771
|
+
|
|
730
772
|
def supertype(self, other: ColumnType) -> Optional[ImageType]:
|
|
731
773
|
if not isinstance(other, ImageType):
|
|
732
774
|
return None
|
|
@@ -736,18 +778,18 @@ class ImageType(ColumnType):
|
|
|
736
778
|
return ImageType(width=width, height=height, mode=mode, nullable=(self.nullable or other.nullable))
|
|
737
779
|
|
|
738
780
|
@property
|
|
739
|
-
def size(self) -> Optional[
|
|
781
|
+
def size(self) -> Optional[tuple[int, int]]:
|
|
740
782
|
if self.width is None or self.height is None:
|
|
741
783
|
return None
|
|
742
784
|
return (self.width, self.height)
|
|
743
785
|
|
|
744
|
-
def _as_dict(self) ->
|
|
786
|
+
def _as_dict(self) -> dict:
|
|
745
787
|
result = super()._as_dict()
|
|
746
788
|
result.update(width=self.width, height=self.height, mode=self.mode)
|
|
747
789
|
return result
|
|
748
790
|
|
|
749
791
|
@classmethod
|
|
750
|
-
def _from_dict(cls, d:
|
|
792
|
+
def _from_dict(cls, d: dict) -> ColumnType:
|
|
751
793
|
assert 'width' in d
|
|
752
794
|
assert 'height' in d
|
|
753
795
|
assert 'mode' in d
|
|
@@ -853,6 +895,9 @@ class DocumentType(ColumnType):
|
|
|
853
895
|
def matches(self, other: ColumnType) -> bool:
|
|
854
896
|
return isinstance(other, DocumentType) and self._doc_formats == other._doc_formats
|
|
855
897
|
|
|
898
|
+
def __hash__(self) -> int:
|
|
899
|
+
return hash((self._type, self.nullable, self._doc_formats))
|
|
900
|
+
|
|
856
901
|
def to_sa_type(self) -> sql.types.TypeEngine:
|
|
857
902
|
# stored as a file path
|
|
858
903
|
return sql.String()
|
|
@@ -866,3 +911,141 @@ class DocumentType(ColumnType):
|
|
|
866
911
|
dh = get_document_handle(val)
|
|
867
912
|
if dh is None:
|
|
868
913
|
raise excs.Error(f'Not a recognized document format: {val}')
|
|
914
|
+
|
|
915
|
+
|
|
916
|
+
T = typing.TypeVar('T')
|
|
917
|
+
|
|
918
|
+
|
|
919
|
+
class Required(typing.Generic[T]):
|
|
920
|
+
"""
|
|
921
|
+
Marker class to indicate that a column is non-nullable in a schema definition. This has no meaning as a type hint,
|
|
922
|
+
and is intended only for schema declarations.
|
|
923
|
+
"""
|
|
924
|
+
pass
|
|
925
|
+
|
|
926
|
+
|
|
927
|
+
String = typing.Annotated[str, StringType(nullable=False)]
|
|
928
|
+
Int = typing.Annotated[int, IntType(nullable=False)]
|
|
929
|
+
Float = typing.Annotated[float, FloatType(nullable=False)]
|
|
930
|
+
Bool = typing.Annotated[bool, BoolType(nullable=False)]
|
|
931
|
+
Timestamp = typing.Annotated[datetime.datetime, TimestampType(nullable=False)]
|
|
932
|
+
|
|
933
|
+
|
|
934
|
+
class _PxtType:
|
|
935
|
+
"""
|
|
936
|
+
Base class for the Pixeltable type-hint family. Subclasses of this class are meant to be used as type hints, both
|
|
937
|
+
in schema definitions and in UDF signatures. Whereas `ColumnType`s are instantiable and carry semantic information
|
|
938
|
+
about the Pixeltable type system, `_PxtType` subclasses are purely for convenience: they are not instantiable and
|
|
939
|
+
must be resolved to a `ColumnType` (by calling `ColumnType.from_python_type()`) in order to do anything meaningful
|
|
940
|
+
with them.
|
|
941
|
+
|
|
942
|
+
`_PxtType` subclasses can be specialized (as type hints) with type parameters; for example:
|
|
943
|
+
`Image[(300, 300), 'RGB']`. The specialized forms resolve to `typing.Annotated` instances whose annotation is a
|
|
944
|
+
`ColumnType`.
|
|
945
|
+
"""
|
|
946
|
+
def __init__(self):
|
|
947
|
+
raise TypeError(f'Type `{type(self)}` cannot be instantiated.')
|
|
948
|
+
|
|
949
|
+
@classmethod
|
|
950
|
+
def as_col_type(cls, nullable: bool) -> ColumnType:
|
|
951
|
+
raise NotImplementedError()
|
|
952
|
+
|
|
953
|
+
|
|
954
|
+
class Json(_PxtType):
|
|
955
|
+
@classmethod
|
|
956
|
+
def as_col_type(cls, nullable: bool) -> ColumnType:
|
|
957
|
+
return JsonType(nullable=nullable)
|
|
958
|
+
|
|
959
|
+
|
|
960
|
+
class Array(np.ndarray, _PxtType):
|
|
961
|
+
def __class_getitem__(cls, item: Any) -> _AnnotatedAlias:
|
|
962
|
+
"""
|
|
963
|
+
`item` (the type subscript) must be a tuple with exactly two elements (in any order):
|
|
964
|
+
- A tuple of `Optional[int]`s, specifying the shape of the array
|
|
965
|
+
- A type, specifying the dtype of the array
|
|
966
|
+
Example: Array[(3, None, 2), float]
|
|
967
|
+
"""
|
|
968
|
+
params = item if isinstance(item, tuple) else (item,)
|
|
969
|
+
shape: Optional[tuple] = None
|
|
970
|
+
dtype: Optional[ColumnType] = None
|
|
971
|
+
for param in params:
|
|
972
|
+
if isinstance(param, tuple):
|
|
973
|
+
if not all(n is None or (isinstance(n, int) and n >= 1) for n in param):
|
|
974
|
+
raise TypeError(f'Invalid Array type parameter: {param}')
|
|
975
|
+
if shape is not None:
|
|
976
|
+
raise TypeError(f'Duplicate Array type parameter: {param}')
|
|
977
|
+
shape = param
|
|
978
|
+
elif isinstance(param, type) or isinstance(param, _AnnotatedAlias):
|
|
979
|
+
if dtype is not None:
|
|
980
|
+
raise TypeError(f'Duplicate Array type parameter: {param}')
|
|
981
|
+
dtype = ColumnType.from_python_type(param)
|
|
982
|
+
else:
|
|
983
|
+
raise TypeError(f'Invalid Array type parameter: {param}')
|
|
984
|
+
if shape is None:
|
|
985
|
+
raise TypeError('Array type is missing parameter: shape')
|
|
986
|
+
if dtype is None:
|
|
987
|
+
raise TypeError('Array type is missing parameter: dtype')
|
|
988
|
+
return typing.Annotated[np.ndarray, ArrayType(shape=shape, dtype=dtype, nullable=False)]
|
|
989
|
+
|
|
990
|
+
@classmethod
|
|
991
|
+
def as_col_type(cls, nullable: bool) -> ColumnType:
|
|
992
|
+
raise TypeError('Array type cannot be used without specifying shape and dtype')
|
|
993
|
+
|
|
994
|
+
|
|
995
|
+
class Image(PIL.Image.Image, _PxtType):
|
|
996
|
+
def __class_getitem__(cls, item: Any) -> _AnnotatedAlias:
|
|
997
|
+
"""
|
|
998
|
+
`item` (the type subscript) must be one of the following, or a tuple containing either or both in any order:
|
|
999
|
+
- A 2-tuple of `int`s, specifying the size of the image
|
|
1000
|
+
- A string, specifying the mode of the image
|
|
1001
|
+
Example: Image[(300, 300), 'RGB']
|
|
1002
|
+
"""
|
|
1003
|
+
if isinstance(item, tuple) and all(n is None or isinstance(n, int) for n in item):
|
|
1004
|
+
# It's a tuple of the form (width, height)
|
|
1005
|
+
params = (item,)
|
|
1006
|
+
elif isinstance(item, tuple):
|
|
1007
|
+
# It's a compound tuple (multiple parameters)
|
|
1008
|
+
params = item
|
|
1009
|
+
else:
|
|
1010
|
+
# Not a tuple (single arg)
|
|
1011
|
+
params = (item,)
|
|
1012
|
+
size: Optional[tuple] = None
|
|
1013
|
+
mode: Optional[str] = None
|
|
1014
|
+
for param in params:
|
|
1015
|
+
if isinstance(param, tuple):
|
|
1016
|
+
if len(param) != 2 or not isinstance(param[0], (int, type(None))) or not isinstance(param[1], (int, type(None))):
|
|
1017
|
+
raise TypeError(f'Invalid Image type parameter: {param}')
|
|
1018
|
+
if size is not None:
|
|
1019
|
+
raise TypeError(f'Duplicate Image type parameter: {param}')
|
|
1020
|
+
size = param
|
|
1021
|
+
elif isinstance(param, str):
|
|
1022
|
+
if param not in PIL.Image.MODES:
|
|
1023
|
+
raise TypeError(f'Invalid Image type parameter: {param!r}')
|
|
1024
|
+
if mode is not None:
|
|
1025
|
+
raise TypeError(f'Duplicate Image type parameter: {param!r}')
|
|
1026
|
+
mode = param
|
|
1027
|
+
else:
|
|
1028
|
+
raise TypeError(f'Invalid Image type parameter: {param}')
|
|
1029
|
+
return typing.Annotated[PIL.Image.Image, ImageType(size=size, mode=mode, nullable=False)]
|
|
1030
|
+
|
|
1031
|
+
@classmethod
|
|
1032
|
+
def as_col_type(cls, nullable: bool) -> ColumnType:
|
|
1033
|
+
return ImageType(nullable=nullable)
|
|
1034
|
+
|
|
1035
|
+
|
|
1036
|
+
class Video(str, _PxtType):
|
|
1037
|
+
@classmethod
|
|
1038
|
+
def as_col_type(cls, nullable: bool) -> ColumnType:
|
|
1039
|
+
return VideoType(nullable=nullable)
|
|
1040
|
+
|
|
1041
|
+
|
|
1042
|
+
class Audio(str, _PxtType):
|
|
1043
|
+
@classmethod
|
|
1044
|
+
def as_col_type(cls, nullable: bool) -> ColumnType:
|
|
1045
|
+
return AudioType(nullable=nullable)
|
|
1046
|
+
|
|
1047
|
+
|
|
1048
|
+
class Document(str, _PxtType):
|
|
1049
|
+
@classmethod
|
|
1050
|
+
def as_col_type(cls, nullable: bool) -> ColumnType:
|
|
1051
|
+
return DocumentType(nullable=nullable)
|
pixeltable/utils/arrow.py
CHANGED
|
@@ -1,13 +1,14 @@
|
|
|
1
1
|
import logging
|
|
2
|
-
from typing import Any,
|
|
2
|
+
from typing import Any, Iterator, Optional, Union
|
|
3
3
|
|
|
4
|
+
import numpy as np
|
|
4
5
|
import pyarrow as pa
|
|
5
6
|
|
|
6
7
|
import pixeltable.type_system as ts
|
|
7
8
|
|
|
8
9
|
_logger = logging.getLogger(__name__)
|
|
9
10
|
|
|
10
|
-
_pa_to_pt:
|
|
11
|
+
_pa_to_pt: dict[pa.DataType, ts.ColumnType] = {
|
|
11
12
|
pa.string(): ts.StringType(nullable=True),
|
|
12
13
|
pa.timestamp('us'): ts.TimestampType(nullable=True),
|
|
13
14
|
pa.bool_(): ts.BoolType(nullable=True),
|
|
@@ -20,7 +21,7 @@ _pa_to_pt: Dict[pa.DataType, ts.ColumnType] = {
|
|
|
20
21
|
pa.float32(): ts.FloatType(nullable=True),
|
|
21
22
|
}
|
|
22
23
|
|
|
23
|
-
_pt_to_pa:
|
|
24
|
+
_pt_to_pa: dict[type[ts.ColumnType], pa.DataType] = {
|
|
24
25
|
ts.StringType: pa.string(),
|
|
25
26
|
ts.TimestampType: pa.timestamp('us'), # postgres timestamp is microseconds
|
|
26
27
|
ts.BoolType: pa.bool_(),
|
|
@@ -61,19 +62,19 @@ def to_arrow_type(pixeltable_type: ts.ColumnType) -> Optional[pa.DataType]:
|
|
|
61
62
|
return None
|
|
62
63
|
|
|
63
64
|
|
|
64
|
-
def to_pixeltable_schema(arrow_schema: pa.Schema) ->
|
|
65
|
+
def to_pixeltable_schema(arrow_schema: pa.Schema) -> dict[str, ts.ColumnType]:
|
|
65
66
|
return {field.name: to_pixeltable_type(field.type) for field in arrow_schema}
|
|
66
67
|
|
|
67
68
|
|
|
68
|
-
def to_arrow_schema(pixeltable_schema:
|
|
69
|
-
return pa.schema((name, to_arrow_type(typ)) for name, typ in pixeltable_schema.items())
|
|
69
|
+
def to_arrow_schema(pixeltable_schema: dict[str, Any]) -> pa.Schema:
|
|
70
|
+
return pa.schema((name, to_arrow_type(typ)) for name, typ in pixeltable_schema.items()) # type: ignore[misc]
|
|
70
71
|
|
|
71
72
|
|
|
72
|
-
def to_pydict(batch: pa.RecordBatch) ->
|
|
73
|
+
def to_pydict(batch: pa.RecordBatch) -> dict[str, Union[list, np.ndarray]]:
|
|
73
74
|
"""Convert a RecordBatch to a dictionary of lists, unlike pa.lib.RecordBatch.to_pydict,
|
|
74
75
|
this function will not convert numpy arrays to lists, and will preserve the original numpy dtype.
|
|
75
76
|
"""
|
|
76
|
-
out = {}
|
|
77
|
+
out: dict[str, Union[list, np.ndarray]] = {}
|
|
77
78
|
for k, name in enumerate(batch.schema.names):
|
|
78
79
|
col = batch.column(k)
|
|
79
80
|
if isinstance(col.type, pa.FixedShapeTensorType):
|
|
@@ -86,7 +87,7 @@ def to_pydict(batch: pa.RecordBatch) -> Dict[str, Iterable[Any]]:
|
|
|
86
87
|
return out
|
|
87
88
|
|
|
88
89
|
|
|
89
|
-
def iter_tuples(batch: pa.RecordBatch) -> Iterator[
|
|
90
|
+
def iter_tuples(batch: pa.RecordBatch) -> Iterator[dict[str, Any]]:
|
|
90
91
|
"""Convert a RecordBatch to an iterator of dictionaries. also works with pa.Table and pa.RowGroup"""
|
|
91
92
|
pydict = to_pydict(batch)
|
|
92
93
|
assert len(pydict) > 0, 'empty record batch'
|
pixeltable/utils/coco.py
CHANGED
|
@@ -1,12 +1,12 @@
|
|
|
1
|
-
from typing import List, Dict, Any, Set
|
|
2
|
-
from pathlib import Path
|
|
3
1
|
import json
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from typing import Any, Dict, List, Set
|
|
4
4
|
|
|
5
5
|
import PIL
|
|
6
6
|
|
|
7
|
+
import pixeltable as pxt
|
|
7
8
|
import pixeltable.exceptions as excs
|
|
8
9
|
|
|
9
|
-
|
|
10
10
|
format_msg = """
|
|
11
11
|
|
|
12
12
|
Required format:
|
|
@@ -48,7 +48,7 @@ def _verify_input_dict(input_dict: Dict[str, Any]) -> None:
|
|
|
48
48
|
if not isinstance(annotation['category'], (str, int)):
|
|
49
49
|
raise excs.Error(f'Value for "category" is not a str or int: {annotation}{format_msg}')
|
|
50
50
|
|
|
51
|
-
def write_coco_dataset(df:
|
|
51
|
+
def write_coco_dataset(df: pxt.DataFrame, dest_path: Path) -> Path:
|
|
52
52
|
"""Export a DataFrame result set as a COCO dataset in dest_path and return the path of the data.json file."""
|
|
53
53
|
# TODO: validate schema
|
|
54
54
|
if len(df._select_list_exprs) != 1 or not df._select_list_exprs[0].col_type.is_json_type():
|