pixeltable 0.1.1__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of pixeltable might be problematic.
- pixeltable/__init__.py +34 -6
- pixeltable/catalog/__init__.py +13 -0
- pixeltable/catalog/catalog.py +159 -0
- pixeltable/catalog/column.py +200 -0
- pixeltable/catalog/dir.py +32 -0
- pixeltable/catalog/globals.py +33 -0
- pixeltable/catalog/insertable_table.py +191 -0
- pixeltable/catalog/named_function.py +36 -0
- pixeltable/catalog/path.py +58 -0
- pixeltable/catalog/path_dict.py +139 -0
- pixeltable/catalog/schema_object.py +39 -0
- pixeltable/catalog/table.py +581 -0
- pixeltable/catalog/table_version.py +749 -0
- pixeltable/catalog/table_version_path.py +133 -0
- pixeltable/catalog/view.py +203 -0
- pixeltable/client.py +520 -30
- pixeltable/dataframe.py +540 -349
- pixeltable/env.py +373 -45
- pixeltable/exceptions.py +12 -21
- pixeltable/exec/__init__.py +9 -0
- pixeltable/exec/aggregation_node.py +78 -0
- pixeltable/exec/cache_prefetch_node.py +113 -0
- pixeltable/exec/component_iteration_node.py +79 -0
- pixeltable/exec/data_row_batch.py +95 -0
- pixeltable/exec/exec_context.py +22 -0
- pixeltable/exec/exec_node.py +61 -0
- pixeltable/exec/expr_eval_node.py +217 -0
- pixeltable/exec/in_memory_data_node.py +69 -0
- pixeltable/exec/media_validation_node.py +43 -0
- pixeltable/exec/sql_scan_node.py +225 -0
- pixeltable/exprs/__init__.py +24 -0
- pixeltable/exprs/arithmetic_expr.py +102 -0
- pixeltable/exprs/array_slice.py +71 -0
- pixeltable/exprs/column_property_ref.py +77 -0
- pixeltable/exprs/column_ref.py +105 -0
- pixeltable/exprs/comparison.py +77 -0
- pixeltable/exprs/compound_predicate.py +98 -0
- pixeltable/exprs/data_row.py +187 -0
- pixeltable/exprs/expr.py +586 -0
- pixeltable/exprs/expr_set.py +39 -0
- pixeltable/exprs/function_call.py +380 -0
- pixeltable/exprs/globals.py +69 -0
- pixeltable/exprs/image_member_access.py +115 -0
- pixeltable/exprs/image_similarity_predicate.py +58 -0
- pixeltable/exprs/inline_array.py +107 -0
- pixeltable/exprs/inline_dict.py +101 -0
- pixeltable/exprs/is_null.py +38 -0
- pixeltable/exprs/json_mapper.py +121 -0
- pixeltable/exprs/json_path.py +159 -0
- pixeltable/exprs/literal.py +54 -0
- pixeltable/exprs/object_ref.py +41 -0
- pixeltable/exprs/predicate.py +44 -0
- pixeltable/exprs/row_builder.py +355 -0
- pixeltable/exprs/rowid_ref.py +94 -0
- pixeltable/exprs/type_cast.py +53 -0
- pixeltable/exprs/variable.py +45 -0
- pixeltable/func/__init__.py +9 -0
- pixeltable/func/aggregate_function.py +194 -0
- pixeltable/func/batched_function.py +53 -0
- pixeltable/func/callable_function.py +69 -0
- pixeltable/func/expr_template_function.py +82 -0
- pixeltable/func/function.py +110 -0
- pixeltable/func/function_registry.py +227 -0
- pixeltable/func/globals.py +36 -0
- pixeltable/func/nos_function.py +202 -0
- pixeltable/func/signature.py +166 -0
- pixeltable/func/udf.py +163 -0
- pixeltable/functions/__init__.py +52 -103
- pixeltable/functions/eval.py +216 -0
- pixeltable/functions/fireworks.py +61 -0
- pixeltable/functions/huggingface.py +120 -0
- pixeltable/functions/image.py +16 -0
- pixeltable/functions/openai.py +88 -0
- pixeltable/functions/pil/image.py +148 -7
- pixeltable/functions/string.py +13 -0
- pixeltable/functions/together.py +27 -0
- pixeltable/functions/util.py +41 -0
- pixeltable/functions/video.py +62 -0
- pixeltable/iterators/__init__.py +3 -0
- pixeltable/iterators/base.py +48 -0
- pixeltable/iterators/document.py +311 -0
- pixeltable/iterators/video.py +89 -0
- pixeltable/metadata/__init__.py +54 -0
- pixeltable/metadata/converters/convert_10.py +18 -0
- pixeltable/metadata/schema.py +211 -0
- pixeltable/plan.py +656 -0
- pixeltable/store.py +413 -182
- pixeltable/tests/conftest.py +143 -87
- pixeltable/tests/test_audio.py +65 -0
- pixeltable/tests/test_catalog.py +27 -0
- pixeltable/tests/test_client.py +14 -14
- pixeltable/tests/test_component_view.py +372 -0
- pixeltable/tests/test_dataframe.py +433 -0
- pixeltable/tests/test_dirs.py +78 -62
- pixeltable/tests/test_document.py +117 -0
- pixeltable/tests/test_exprs.py +591 -135
- pixeltable/tests/test_function.py +297 -67
- pixeltable/tests/test_functions.py +283 -1
- pixeltable/tests/test_migration.py +43 -0
- pixeltable/tests/test_nos.py +54 -0
- pixeltable/tests/test_snapshot.py +208 -0
- pixeltable/tests/test_table.py +1085 -262
- pixeltable/tests/test_transactional_directory.py +42 -0
- pixeltable/tests/test_types.py +5 -11
- pixeltable/tests/test_video.py +149 -34
- pixeltable/tests/test_view.py +530 -0
- pixeltable/tests/utils.py +186 -45
- pixeltable/tool/create_test_db_dump.py +149 -0
- pixeltable/type_system.py +490 -126
- pixeltable/utils/__init__.py +17 -46
- pixeltable/utils/clip.py +12 -15
- pixeltable/utils/coco.py +136 -0
- pixeltable/utils/documents.py +39 -0
- pixeltable/utils/filecache.py +195 -0
- pixeltable/utils/help.py +11 -0
- pixeltable/utils/media_store.py +76 -0
- pixeltable/utils/parquet.py +126 -0
- pixeltable/utils/pytorch.py +172 -0
- pixeltable/utils/s3.py +13 -0
- pixeltable/utils/sql.py +17 -0
- pixeltable/utils/transactional_directory.py +35 -0
- pixeltable-0.2.0.dist-info/LICENSE +18 -0
- pixeltable-0.2.0.dist-info/METADATA +117 -0
- pixeltable-0.2.0.dist-info/RECORD +125 -0
- {pixeltable-0.1.1.dist-info → pixeltable-0.2.0.dist-info}/WHEEL +1 -1
- pixeltable/catalog.py +0 -1421
- pixeltable/exprs.py +0 -1745
- pixeltable/function.py +0 -269
- pixeltable/functions/clip.py +0 -10
- pixeltable/functions/pil/__init__.py +0 -23
- pixeltable/functions/tf.py +0 -21
- pixeltable/index.py +0 -57
- pixeltable/tests/test_dict.py +0 -24
- pixeltable/tests/test_tf.py +0 -69
- pixeltable/tf.py +0 -33
- pixeltable/utils/tf.py +0 -33
- pixeltable/utils/video.py +0 -32
- pixeltable-0.1.1.dist-info/METADATA +0 -31
- pixeltable-0.1.1.dist-info/RECORD +0 -36
pixeltable/type_system.py CHANGED
@@ -1,15 +1,21 @@
+from __future__ import annotations
+
 import abc
-from typing import Any, Optional, Tuple, Dict, Callable, List, Union
-import enum
 import datetime
+import enum
 import json
+import typing
+import urllib.parse
+from copy import copy
+from pathlib import Path
+from typing import Any, Optional, Tuple, Dict, Callable, List, Union
 
-import os
-os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
-import tensorflow as tf
 import PIL.Image
+import av
+import numpy as np
 import sqlalchemy as sql
 
+from pixeltable import exceptions as excs
 
 
 class ColumnType:
@@ -24,25 +30,16 @@ class ColumnType:
         ARRAY = 6
         IMAGE = 7
         VIDEO = 8
+        AUDIO = 9
+        DOCUMENT = 10
 
         # exprs that don't evaluate to a computable value in Pixeltable, such as an Image member function
-        INVALID =
-
-        def to_tf(self) -> tf.dtypes.DType:
-            if self == self.STRING:
-                return tf.string
-            if self == self.INT:
-                return tf.int64
-            if self == self.FLOAT:
-                return tf.float32
-            if self == self.BOOL:
-                return tf.bool
-            raise TypeError(f'Cannot convert {self} to TensorFlow')
+        INVALID = 255
 
         @classmethod
         def supertype(
                 cls, type1: 'Type', type2: 'Type',
-                # we need to pass this in because we can't easily
+                # we need to pass this in because we can't easily append it as a class member
                 common_supertypes: Dict[Tuple['Type', 'Type'], 'Type']
         ) -> Optional['Type']:
             if type1 == type2:
@@ -82,8 +79,9 @@ class ColumnType:
         (Type.INT, Type.FLOAT): Type.FLOAT,
     }
 
-    def __init__(self, t: Type):
+    def __init__(self, t: Type, nullable: bool = False):
         self._type = t
+        self.nullable = nullable
 
     @property
     def type_enum(self) -> Type:
@@ -93,7 +91,7 @@ class ColumnType:
         return json.dumps(self.as_dict())
 
     @classmethod
-    def serialize_list(cls, type_list: List[
+    def serialize_list(cls, type_list: List[ColumnType]) -> str:
         return json.dumps([t.as_dict() for t in type_list])
 
     def as_dict(self) -> Dict:
@@ -103,33 +101,34 @@ class ColumnType:
         }
 
     def _as_dict(self) -> Dict:
-        return {}
+        return {'nullable': self.nullable}
 
     @classmethod
-    def deserialize(cls, type_str: str) ->
+    def deserialize(cls, type_str: str) -> ColumnType:
         type_dict = json.loads(type_str)
         return cls.from_dict(type_dict)
 
     @classmethod
-    def deserialize_list(cls, type_list_str: str) -> List[
+    def deserialize_list(cls, type_list_str: str) -> List[ColumnType]:
         type_dict_list = json.loads(type_list_str)
         return [cls.from_dict(type_dict) for type_dict in type_dict_list]
 
     @classmethod
-    def from_dict(cls, type_dict: Dict) ->
+    def from_dict(cls, type_dict: Dict) -> ColumnType:
         assert '_classname' in type_dict
         type_class = globals()[type_dict['_classname']]
         return type_class._from_dict(type_dict)
 
     @classmethod
-    def _from_dict(cls, d: Dict) ->
+    def _from_dict(cls, d: Dict) -> ColumnType:
         """
-        Default implementation: simply invoke c'tor
+        Default implementation: simply invoke c'tor
         """
-
+        assert 'nullable' in d
+        return cls(nullable=d['nullable'])
 
     @classmethod
-    def make_type(cls, t: Type) ->
+    def make_type(cls, t: Type) -> ColumnType:
         assert t != cls.Type.INVALID and t != cls.Type.ARRAY
         if t == cls.Type.STRING:
             return StringType()
@@ -147,21 +146,44 @@ class ColumnType:
             return ImageType()
         if t == cls.Type.VIDEO:
             return VideoType()
+        if t == cls.Type.AUDIO:
+            return AudioType()
+        if t == cls.Type.DOCUMENT:
+            return AudioType()
 
     def __str__(self) -> str:
         return self._type.name.lower()
 
     def __eq__(self, other: object) -> bool:
+        return self.matches(other) and self.nullable == other.nullable
+
+    def is_supertype_of(self, other: ColumnType) -> bool:
+        if type(self) != type(other):
+            return False
+        if self.matches(other):
+            return True
+        return self._is_supertype_of(other)
+
+    @abc.abstractmethod
+    def _is_supertype_of(self, other: ColumnType) -> bool:
+        return False
+
+    def matches(self, other: object) -> bool:
+        """Two types match if they're equal, aside from nullability"""
+        if not isinstance(other, ColumnType):
+            pass
         assert isinstance(other, ColumnType)
-        if
+        if type(self) != type(other):
             return False
         for member_var in vars(self).keys():
+            if member_var == 'nullable':
+                continue
             if getattr(self, member_var) != getattr(other, member_var):
                 return False
         return True
 
     @classmethod
-    def supertype(cls, type1:
+    def supertype(cls, type1: ColumnType, type2: ColumnType) -> Optional[ColumnType]:
         if type1 == type2:
             return type1
 
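The hunk above changes equality semantics: `__eq__` now also compares the new `nullable` flag, while the new `matches()` helper deliberately ignores it. An illustrative sketch (not from the package itself), assuming the 0.2.0 classes behave exactly as shown in the diff:

```python
# Illustrative sketch only; assumes pixeltable 0.2.0's type_system behaves as in the hunks above.
from pixeltable.type_system import IntType, StringType

required = StringType()                  # nullable defaults to False
optional = StringType(nullable=True)

assert required != optional              # __eq__ also compares the nullable flag
assert required.matches(optional)        # matches() ignores nullability
assert not required.matches(IntType())   # different ColumnType subclasses never match
```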
@@ -183,16 +205,15 @@ class ColumnType:
 
     @classmethod
     @abc.abstractmethod
-    def _supertype(cls, type1:
+    def _supertype(cls, type1: ColumnType, type2: ColumnType) -> Optional[ColumnType]:
         """
         Class-specific implementation of determining the supertype. type1 and type2 are from the same subclass of
         ColumnType.
         """
         pass
 
-
     @classmethod
-    def
+    def infer_literal_type(cls, val: Any) -> Optional[ColumnType]:
         if isinstance(val, str):
             return StringType()
         if isinstance(val, int):
@@ -203,6 +224,85 @@ class ColumnType:
             return BoolType()
         if isinstance(val, datetime.datetime) or isinstance(val, datetime.date):
             return TimestampType()
+        if isinstance(val, np.ndarray):
+            col_type = ArrayType.from_literal(val)
+            if col_type is not None:
+                return col_type
+        # this could still be json-serializable
+        if isinstance(val, dict) or isinstance(val, np.ndarray):
+            try:
+                JsonType().validate_literal(val)
+                return JsonType()
+            except TypeError:
+                return None
+        return None
+
+
+    @classmethod
+    def from_python_type(cls, t: type) -> Optional[ColumnType]:
+        if t in _python_type_to_column_type:
+            return _python_type_to_column_type[t]
+        elif isinstance(t, typing._UnionGenericAlias) and t.__args__[1] is type(None):
+            # `t` is a type of the form Optional[T] (equivalently, Union[T, None]).
+            # We treat it as the underlying type but with nullable=True.
+            if t.__args__[0] in _python_type_to_column_type:
+                underlying = copy(_python_type_to_column_type[t.__args__[0]])
+                underlying.nullable = True
+                return underlying
+
+        return None
+
+
+    def validate_literal(self, val: Any) -> None:
+        """Raise TypeError if val is not a valid literal for this type"""
+        if val is None:
+            if not self.nullable:
+                raise TypeError('Expected non-None value')
+            else:
+                return
+        self._validate_literal(val)
+
+    def validate_media(self, val: Any) -> None:
+        """
+        Raise TypeError if val is not a path to a valid media file (or a valid in-memory byte sequence) for this type
+        """
+        if self.is_media_type():
+            raise NotImplementedError(f'validate_media() not implemented for {self.__class__.__name__}')
+
+    def _validate_file_path(self, val: Any) -> None:
+        """Raises TypeError if not a valid local file path or not a path/byte sequence"""
+        if isinstance(val, str):
+            parsed = urllib.parse.urlparse(val)
+            if parsed.scheme != '' and parsed.scheme != 'file':
+                return
+            path = Path(urllib.parse.unquote(parsed.path))
+            if not path.is_file():
+                raise TypeError(f'File not found: {str(path)}')
+        else:
+            if not isinstance(val, bytes):
+                raise TypeError(f'expected file path or bytes, got {type(val)}')
+
+    @abc.abstractmethod
+    def _validate_literal(self, val: Any) -> None:
+        """Raise TypeError if val is not a valid literal for this type"""
+        pass
+
+    @abc.abstractmethod
+    def _create_literal(self, val : Any) -> Any:
+        """Create a literal of this type from val, including any needed conversions.
+        val is guaranteed to be non-None"""
+        return val
+
+    def create_literal(self, val: Any) -> Any:
+        """Create a literal of this type from val or raise TypeError if not possible"""
+        if val is not None:
+            val = self._create_literal(val)
+
+        self.validate_literal(val)
+        return val
+
+    def print_value(self, val: Any) -> str:
+        return str(val)
 
     def is_scalar_type(self) -> bool:
         return self._type in self.scalar_types
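The new `from_python_type()` classmethod maps Python type hints onto `ColumnType`s, treating `Optional[T]` as the underlying type with `nullable=True`, and `validate_literal()` accepts `None` only for nullable types. An illustrative sketch (not from the package; assumes Python 3.9+, since the implementation checks `typing._UnionGenericAlias`):

```python
# Illustrative sketch only; assumes the classmethods added in the hunk above.
from typing import Optional

import numpy as np

from pixeltable.type_system import ColumnType

t = ColumnType.from_python_type(Optional[str])
assert t is not None and t.nullable               # Optional[str] -> StringType(nullable=True)
assert ColumnType.from_python_type(bytes) is None  # unmapped Python types yield None

arr_type = ColumnType.infer_literal_type(np.zeros((2, 3), dtype=np.float32))
print(arr_type)                                    # an ArrayType inferred from the ndarray
arr_type.validate_literal(None)                    # ok: from_literal() marks inferred arrays nullable
```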
@@ -240,6 +340,16 @@ class ColumnType:
     def is_video_type(self) -> bool:
         return self._type == self.Type.VIDEO
 
+    def is_audio_type(self) -> bool:
+        return self._type == self.Type.AUDIO
+
+    def is_document_type(self) -> bool:
+        return self._type == self.Type.DOCUMENT
+
+    def is_media_type(self) -> bool:
+        # types that refer to external media files
+        return self.is_image_type() or self.is_video_type() or self.is_audio_type() or self.is_document_type()
+
     @abc.abstractmethod
     def to_sql(self) -> str:
         """
@@ -273,6 +383,10 @@ class ColumnType:
             return sql.VARBINARY
         assert False
 
+    @abc.abstractmethod
+    def to_arrow_type(self) -> 'pyarrow.DataType':
+        assert False, f'Have not implemented {self.__class__.__name__} to Arrow'
+
     @staticmethod
     def no_conversion(v: Any) -> Any:
         """
@@ -281,21 +395,17 @@ class ColumnType:
         """
         assert False
 
-    def conversion_fn(self, target:
+    def conversion_fn(self, target: ColumnType) -> Optional[Callable[[Any], Any]]:
         """
         Return Callable that converts a column value of type self to a value of type 'target'.
         Returns None if conversion isn't possible.
         """
         return None
 
-    @abc.abstractmethod
-    def to_tf(self) -> Union[tf.TypeSpec, Dict[str, tf.TypeSpec]]:
-        pass
-
 
 class InvalidType(ColumnType):
-    def __init__(self):
-        super().__init__(self.Type.INVALID)
+    def __init__(self, nullable: bool = False):
+        super().__init__(self.Type.INVALID, nullable=nullable)
 
     def to_sql(self) -> str:
         assert False
@@ -303,13 +413,18 @@ class InvalidType(ColumnType):
     def to_sa_type(self) -> Any:
         assert False
 
-    def
-
+    def to_arrow_type(self) -> 'pyarrow.DataType':
+        assert False
 
+    def print_value(self, val: Any) -> str:
+        assert False
+
+    def _validate_literal(self, val: Any) -> None:
+        assert False
 
 class StringType(ColumnType):
-    def __init__(self):
-        super().__init__(self.Type.STRING)
+    def __init__(self, nullable: bool = False):
+        super().__init__(self.Type.STRING, nullable=nullable)
 
     def conversion_fn(self, target: ColumnType) -> Optional[Callable[[Any], Any]]:
         if not target.is_timestamp_type():
@@ -327,74 +442,111 @@ class StringType(ColumnType):
 
     def to_sa_type(self) -> str:
         return sql.String
+
+    def to_arrow_type(self) -> 'pyarrow.DataType':
+        import pyarrow as pa  # pylint: disable=import-outside-toplevel
+        return pa.string()
 
-    def
-        return
+    def print_value(self, val: Any) -> str:
+        return f"'{val}'"
+
+    def _validate_literal(self, val: Any) -> None:
+        if not isinstance(val, str):
+            raise TypeError(f'Expected string, got {val.__class__.__name__}')
 
 
 class IntType(ColumnType):
-    def __init__(self):
-        super().__init__(self.Type.INT)
+    def __init__(self, nullable: bool = False):
+        super().__init__(self.Type.INT, nullable=nullable)
 
     def to_sql(self) -> str:
-        return '
+        return 'BIGINT'
 
     def to_sa_type(self) -> str:
-        return sql.
+        return sql.BigInteger
+
+    def to_arrow_type(self) -> 'pyarrow.DataType':
+        import pyarrow as pa  # pylint: disable=import-outside-toplevel
+        return pa.int64()  # to be consistent with bigint above
 
-    def
-
-
+    def _validate_literal(self, val: Any) -> None:
+        if not isinstance(val, int):
+            raise TypeError(f'Expected int, got {val.__class__.__name__}')
 
 
 class FloatType(ColumnType):
-    def __init__(self):
-        super().__init__(self.Type.FLOAT)
+    def __init__(self, nullable: bool = False):
+        super().__init__(self.Type.FLOAT, nullable=nullable)
 
     def to_sql(self) -> str:
         return 'FLOAT'
 
     def to_sa_type(self) -> str:
         return sql.Float
+
+    def to_arrow_type(self) -> 'pyarrow.DataType':
+        import pyarrow as pa
+        return pa.float32()
 
-    def
-
-
+    def _validate_literal(self, val: Any) -> None:
+        if not isinstance(val, float):
+            raise TypeError(f'Expected float, got {val.__class__.__name__}')
 
+    def _create_literal(self, val: Any) -> Any:
+        if isinstance(val, int):
+            return float(val)
+        return val
 
 class BoolType(ColumnType):
-    def __init__(self):
-        super().__init__(self.Type.BOOL)
+    def __init__(self, nullable: bool = False):
+        super().__init__(self.Type.BOOL, nullable=nullable)
 
     def to_sql(self) -> str:
         return 'BOOLEAN'
 
     def to_sa_type(self) -> str:
         return sql.Boolean
+
+    def to_arrow_type(self) -> 'pyarrow.DataType':
+        import pyarrow as pa  # pylint: disable=import-outside-toplevel
+        return pa.bool_()
 
-    def
-
-
+    def _validate_literal(self, val: Any) -> None:
+        if not isinstance(val, bool):
+            raise TypeError(f'Expected bool, got {val.__class__.__name__}')
 
+    def _create_literal(self, val: Any) -> Any:
+        if isinstance(val, int):
+            return bool(val)
+        return val
 
 class TimestampType(ColumnType):
-    def __init__(self):
-        super().__init__(self.Type.TIMESTAMP)
+    def __init__(self, nullable: bool = False):
+        super().__init__(self.Type.TIMESTAMP, nullable=nullable)
 
     def to_sql(self) -> str:
         return 'INTEGER'
 
     def to_sa_type(self) -> str:
         return sql.TIMESTAMP
+
+    def to_arrow_type(self) -> 'pyarrow.DataType':
+        import pyarrow as pa  # pylint: disable=import-outside-toplevel
+        return pa.timestamp('us')  # postgres timestamp is microseconds
 
-    def
-
+    def _validate_literal(self, val: Any) -> None:
+        if not isinstance(val, datetime.datetime) and not isinstance(val, datetime.date):
+            raise TypeError(f'Expected datetime.datetime or datetime.date, got {val.__class__.__name__}')
 
+    def _create_literal(self, val: Any) -> Any:
+        if isinstance(val, str):
+            return datetime.datetime.fromisoformat(val)
+        return val
 
 class JsonType(ColumnType):
     # TODO: type_spec also needs to be able to express lists
-    def __init__(self, type_spec: Optional[Dict[str, ColumnType]] = None):
-        super().__init__(self.Type.JSON)
+    def __init__(self, type_spec: Optional[Dict[str, ColumnType]] = None, nullable: bool = False):
+        super().__init__(self.Type.JSON, nullable=nullable)
         self.type_spec = type_spec
 
     def _as_dict(self) -> Dict:
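The scalar types above now pair `_validate_literal()` with a `_create_literal()` coercion hook, and `create_literal()` chains them: coerce first, then validate. An illustrative sketch (not from the package) of the resulting behaviour:

```python
# Illustrative sketch only; assumes the 0.2.0 scalar types behave as in the hunk above.
import datetime

from pixeltable.type_system import BoolType, FloatType, TimestampType

assert FloatType().create_literal(3) == 3.0        # int is coerced to float before validation
assert BoolType().create_literal(1) is True        # int is coerced to bool
ts = TimestampType().create_literal('2024-01-31T12:00:00')
assert ts == datetime.datetime(2024, 1, 31, 12)    # ISO strings go through fromisoformat()

try:
    FloatType().create_literal('not a number')     # no str -> float coercion, so validation fails
except TypeError as e:
    print(e)                                       # Expected float, got str
```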
@@ -405,34 +557,52 @@ class JsonType(ColumnType):
         return result
 
     @classmethod
-    def _from_dict(cls, d: Dict) ->
+    def _from_dict(cls, d: Dict) -> ColumnType:
         type_spec = None
         if 'type_spec' in d:
             type_spec = {
                 field_name: cls.deserialize(field_type_dict) for field_name, field_type_dict in d['type_spec'].items()
             }
-        return cls(type_spec)
+        return cls(type_spec, nullable=d['nullable'])
 
     def to_sql(self) -> str:
         return 'JSONB'
 
     def to_sa_type(self) -> str:
         return sql.dialects.postgresql.JSONB
-
-    def
-
-
-
-
+
+    def to_arrow_type(self) -> 'pyarrow.DataType':
+        import pyarrow as pa  # pylint: disable=import-outside-toplevel
+        return pa.string()  # TODO: weight advantage of pa.struct type.
+
+    def print_value(self, val: Any) -> str:
+        val_type = self.infer_literal_type(val)
+        if val_type == self:
+            return str(val)
+        return val_type.print_value(val)
+
+    def _validate_literal(self, val: Any) -> None:
+        if not isinstance(val, dict) and not isinstance(val, list):
+            raise TypeError(f'Expected dict or list, got {val.__class__.__name__}')
+        try:
+            _ = json.dumps(val)
+        except TypeError as e:
+            raise TypeError(f'Expected JSON-serializable object, got {val}')
+
+    def _create_literal(self, val: Any) -> Any:
+        if isinstance(val, tuple):
+            val = list(val)
+        return val
 
 class ArrayType(ColumnType):
     def __init__(
-            self, shape: Tuple[Union[int, None], ...], dtype: ColumnType
-        super().__init__(self.Type.ARRAY)
+            self, shape: Tuple[Union[int, None], ...], dtype: ColumnType, nullable: bool = False):
+        super().__init__(self.Type.ARRAY, nullable=nullable)
         self.shape = shape
-
+        assert dtype.is_int_type() or dtype.is_float_type() or dtype.is_bool_type() or dtype.is_string_type()
+        self.dtype = dtype._type
 
-    def _supertype(cls, type1:
+    def _supertype(cls, type1: ArrayType, type2: ArrayType) -> Optional[ArrayType]:
         if len(type1.shape) != len(type2.shape):
             return None
         base_type = ColumnType.supertype(type1.dtype, type2.dtype)
@@ -447,53 +617,94 @@ class ArrayType(ColumnType):
         return result
 
     def __str__(self) -> str:
-        return f'{self.
+        return f'{self._type.name.lower()}({self.shape}, dtype={self.dtype.name})'
 
     @classmethod
-    def _from_dict(cls, d: Dict) ->
+    def _from_dict(cls, d: Dict) -> ColumnType:
         assert 'shape' in d
         assert 'dtype' in d
         shape = tuple(d['shape'])
-        dtype = cls.Type(d['dtype'])
-        return cls(shape, dtype)
-
-    def to_sql(self) -> str:
-        return 'BYTEA'
+        dtype = cls.make_type(cls.Type(d['dtype']))
+        return cls(shape, dtype, nullable=d['nullable'])
 
-
-
+    @classmethod
+    def from_literal(cls, val: np.ndarray) -> Optional[ArrayType]:
+        # determine our dtype
+        assert isinstance(val, np.ndarray)
+        if np.issubdtype(val.dtype, np.integer):
+            dtype = IntType()
+        elif np.issubdtype(val.dtype, np.floating):
+            dtype = FloatType()
+        elif val.dtype == np.bool_:
+            dtype = BoolType()
+        elif val.dtype == np.str_:
+            dtype = StringType()
+        else:
+            return None
+        return cls(val.shape, dtype=dtype, nullable=True)
 
-    def
-
+    def is_valid_literal(self, val: np.ndarray) -> bool:
+        if not isinstance(val, np.ndarray):
+            return False
+        if len(val.shape) != len(self.shape):
+            return False
+        # check that the shapes are compatible
+        for n1, n2 in zip(val.shape, self.shape):
+            if n1 is None:
+                return False
+            if n2 is None:
+                # wildcard
+                continue
+            if n1 != n2:
+                return False
+        return val.dtype == self.numpy_dtype()
 
+    def _validate_literal(self, val: Any) -> None:
+        if not isinstance(val, np.ndarray):
+            raise TypeError(f'Expected numpy.ndarray, got {val.__class__.__name__}')
+        if not self.is_valid_literal(val):
+            raise TypeError((
+                f'Expected ndarray({self.shape}, dtype={self.numpy_dtype()}), '
+                f'got ndarray({val.shape}, dtype={val.dtype})'))
 
-
-
-
-
-        RGB = 1
+    def _create_literal(self, val: Any) -> Any:
+        if isinstance(val, (list,tuple)):
+            return np.array(val)
+        return val
 
-
-
-        if pil_mode == 'L':
-            return cls.L
-        if pil_mode == 'RGB':
-            return cls.RGB
+    def to_sql(self) -> str:
+        return 'BYTEA'
 
-
-
+    def to_sa_type(self) -> str:
+        return sql.LargeBinary
+
+    def to_arrow_type(self) -> 'pyarrow.DataType':
+        import pyarrow as pa  # pylint: disable=import-outside-toplevel
+        if any([n is None for n in self.shape]):
+            raise TypeError(f'Cannot convert array with unknown shape to Arrow')
+        return pa.fixed_shape_tensor(pa.from_numpy_dtype(self.numpy_dtype()), self.shape)
+
+    def numpy_dtype(self) -> np.dtype:
+        if self.dtype == self.Type.INT:
+            return np.dtype(np.int64)
+        if self.dtype == self.Type.FLOAT:
+            return np.dtype(np.float32)
+        if self.dtype == self.Type.BOOL:
+            return np.dtype(np.bool_)
+        if self.dtype == self.Type.STRING:
+            return np.dtype(np.str_)
+        assert False
 
-    def num_channels(self) -> int:
-        return len(self.name)
 
+class ImageType(ColumnType):
     def __init__(
             self, width: Optional[int] = None, height: Optional[int] = None, size: Optional[Tuple[int, int]] = None,
-            mode: Optional[
+            mode: Optional[str] = None, nullable: bool = False
     ):
         """
         TODO: does it make sense to specify only width or height?
         """
-        super().__init__(self.Type.IMAGE)
+        super().__init__(self.Type.IMAGE, nullable=nullable)
         assert not(width is not None and size is not None)
         assert not(height is not None and size is not None)
         if size is not None:
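`ArrayType` now carries an element dtype, can be inferred from a NumPy literal via `from_literal()`, and treats `None` entries in the declared shape as wildcards during validation. An illustrative sketch (not from the package) under those assumptions:

```python
# Illustrative sketch only; assumes ArrayType behaves as in the hunk above.
import numpy as np

from pixeltable.type_system import ArrayType, IntType

arr = np.zeros((2, 3), dtype=np.int64)

inferred = ArrayType.from_literal(arr)      # dtype inferred as INT, nullable=True
print(inferred)                             # e.g. array((2, 3), dtype=INT)

wildcard = ArrayType((None, 3), dtype=IntType())
assert wildcard.is_valid_literal(arr)                         # None matches any extent
assert not wildcard.is_valid_literal(arr.astype(np.float32))  # dtype must equal numpy_dtype()
assert wildcard.create_literal(arr) is arr                    # valid ndarrays pass through unchanged
```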
@@ -504,22 +715,53 @@ class ImageType(ColumnType):
         self.height = height
         self.mode = mode
 
+    def __str__(self) -> str:
+        if self.width is not None or self.height is not None or self.mode is not None:
+            params_str = ''
+            if self.width is not None:
+                params_str = f'width={self.width}'
+            if self.height is not None:
+                if len(params_str) > 0:
+                    params_str += ', '
+                params_str += f'height={self.height}'
+            if self.mode is not None:
+                if len(params_str) > 0:
+                    params_str += ', '
+                params_str += f'mode={self.mode}'
+            params_str = f'({params_str})'
+        else:
+            params_str = ''
+        return f'{self._type.name.lower()}{params_str}'
+
+    def _is_supertype_of(self, other: ImageType) -> bool:
+        if self.mode != other.mode:
+            return False
+        if self.width is None and self.height is None:
+            return True
+        if self.width != other.width and self.height != other.height:
+            return False
+
+    @property
+    def size(self) -> Optional[Tuple[int, int]]:
+        if self.width is None or self.height is None:
+            return None
+        return (self.width, self.height)
+
     @property
     def num_channels(self) -> Optional[int]:
         return None if self.mode is None else self.mode.num_channels()
 
     def _as_dict(self) -> Dict:
         result = super()._as_dict()
-        result.update(width=self.width, height=self.height, mode=self.mode
+        result.update(width=self.width, height=self.height, mode=self.mode)
         return result
 
     @classmethod
-    def _from_dict(cls, d: Dict) ->
+    def _from_dict(cls, d: Dict) -> ColumnType:
         assert 'width' in d
         assert 'height' in d
         assert 'mode' in d
-
-        return cls(width=d['width'], height=d['height'], mode=cls.Mode(mode_val) if mode_val is not None else None)
+        return cls(width=d['width'], height=d['height'], mode=d['mode'], nullable=d['nullable'])
 
     def conversion_fn(self, target: ColumnType) -> Optional[Callable[[Any], Any]]:
         if not target.is_image_type():
@@ -546,22 +788,111 @@ class ImageType(ColumnType):
 
     def to_sa_type(self) -> str:
         return sql.String
+
+    def to_arrow_type(self) -> 'pyarrow.DataType':
+        import pyarrow as pa  # pylint: disable=import-outside-toplevel
+        return pa.binary()
+
+    def _validate_literal(self, val: Any) -> None:
+        if isinstance(val, PIL.Image.Image):
+            return
+        self._validate_file_path(val)
+
+    def validate_media(self, val: Any) -> None:
+        assert isinstance(val, str)
+        try:
+            _ = PIL.Image.open(val)
+        except PIL.UnidentifiedImageError:
+            raise excs.Error(f'Not a valid image: {val}') from None
 
-
-
+class VideoType(ColumnType):
+    def __init__(self, nullable: bool = False):
+        super().__init__(self.Type.VIDEO, nullable=nullable)
 
+    def to_sql(self) -> str:
+        # stored as a file path
+        return 'VARCHAR'
 
-
-
-
+    def to_sa_type(self) -> str:
+        return sql.String
+
+    def to_arrow_type(self) -> 'pyarrow.DataType':
+        import pyarrow as pa  # pylint: disable=import-outside-toplevel
+        return pa.string()
+
+    def _validate_literal(self, val: Any) -> None:
+        self._validate_file_path(val)
+
+    def validate_media(self, val: Any) -> None:
+        assert isinstance(val, str)
+        try:
+            with av.open(val, 'r') as fh:
+                if len(fh.streams.video) == 0:
+                    raise excs.Error(f'Not a valid video: {val}')
+                # decode a few frames to make sure it's playable
+                # TODO: decode all frames? but that's very slow
+                num_decoded = 0
+                for frame in fh.decode(video=0):
+                    _ = frame.to_image()
+                    num_decoded += 1
+                    if num_decoded == 10:
+                        break
+                if num_decoded < 2:
+                    # this is most likely an image file
+                    raise excs.Error(f'Not a valid video: {val}')
+        except av.AVError:
+            raise excs.Error(f'Not a valid video: {val}') from None
+
+class AudioType(ColumnType):
+    def __init__(self, nullable: bool = False):
+        super().__init__(self.Type.AUDIO, nullable=nullable)
 
-    def
-
-        return
+    def to_sql(self) -> str:
+        # stored as a file path
+        return 'VARCHAR'
 
-
-
-
+    def to_sa_type(self) -> str:
+        return sql.String
+
+    def to_arrow_type(self) -> 'pyarrow.DataType':
+        import pyarrow as pa  # pylint: disable=import-outside-toplevel
+        return pa.string()
+
+    def _validate_literal(self, val: Any) -> None:
+        self._validate_file_path(val)
+
+    def validate_media(self, val: Any) -> None:
+        try:
+            with av.open(val) as container:
+                if len(container.streams.audio) == 0:
+                    raise excs.Error(f'No audio stream in file: {val}')
+                audio_stream = container.streams.audio[0]
+
+                # decode everything to make sure it's playable
+                # TODO: is there some way to verify it's a playable audio file other than decoding all of it?
+                for packet in container.demux(audio_stream):
+                    for _ in packet.decode():
+                        pass
+        except av.AVError as e:
+            raise excs.Error(f'Not a valid audio file: {val}\n{e}') from None
+
+class DocumentType(ColumnType):
+    @enum.unique
+    class DocumentFormat(enum.Enum):
+        HTML = 0
+        MD = 1
+        PDF = 2
+
+    def __init__(self, nullable: bool = False, doc_formats: Optional[str] = None):
+        super().__init__(self.Type.DOCUMENT, nullable=nullable)
+        if doc_formats is not None:
+            type_strs = doc_formats.split(',')
+            for type_str in type_strs:
+                if not hasattr(self.DocumentFormat, type_str):
+                    raise ValueError(f'Invalid document type: {type_str}')
+            self._doc_formats = [self.DocumentFormat[type_str.upper()] for type_str in type_strs]
+        else:
+            self._doc_formats = [t for t in self.DocumentFormat]
 
     def to_sql(self) -> str:
         # stored as a file path
@@ -570,5 +901,38 @@ class VideoType(ColumnType):
     def to_sa_type(self) -> str:
         return sql.String
 
-    def
-
+    def to_arrow_type(self) -> 'pyarrow.DataType':
+        import pyarrow as pa  # pylint: disable=import-outside-toplevel
+        return pa.string()
+
+    def _validate_literal(self, val: Any) -> None:
+        self._validate_file_path(val)
+
+    def validate_media(self, val: Any) -> None:
+        assert isinstance(val, str)
+        from pixeltable.utils.documents import get_document_handle
+        with open(val, 'r') as fh:
+            try:
+                s = fh.read()
+                dh = get_document_handle(s)
+                if dh is None:
+                    raise excs.Error(f'Not a recognized document format: {val}')
+            except Exception as e:
+                raise excs.Error(f'Not a recognized document format: {val}') from None
+
+
+# A dictionary mapping various Python types to their respective ColumnTypes.
+# This can be used to infer Pixeltable ColumnTypes from type hints on Python
+# functions. (Since Python functions do not necessarily have type hints, this
+# should always be an optional/convenience inference.)
+_python_type_to_column_type: dict[type, ColumnType] = {
+    str: StringType(),
+    int: IntType(),
+    float: FloatType(),
+    bool: BoolType(),
+    datetime.datetime: TimestampType(),
+    datetime.date: TimestampType(),
+    list: JsonType(),
+    dict: JsonType(),
+    PIL.Image.Image: ImageType()
+}
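The new `DocumentType` validates its optional `doc_formats` argument against the `DocumentFormat` enum: a comma-separated list whose entries must pass a case-sensitive `hasattr()` check before being upper-cased for lookup. An illustrative sketch (not from the package):

```python
# Illustrative sketch only; assumes DocumentType behaves as in the hunk above.
from pixeltable.type_system import DocumentType

dt = DocumentType(doc_formats='HTML,MD')
print(dt._doc_formats)        # internal attribute, shown only for illustration:
                              # [<DocumentFormat.HTML: 0>, <DocumentFormat.MD: 1>]

try:
    DocumentType(doc_formats='DOCX')
except ValueError as e:
    print(e)                  # Invalid document type: DOCX
```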