pixeltable 0.1.2__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pixeltable might be problematic.
- pixeltable/__init__.py +21 -4
- pixeltable/catalog/__init__.py +13 -0
- pixeltable/catalog/catalog.py +159 -0
- pixeltable/catalog/column.py +200 -0
- pixeltable/catalog/dir.py +32 -0
- pixeltable/catalog/globals.py +33 -0
- pixeltable/catalog/insertable_table.py +191 -0
- pixeltable/catalog/named_function.py +36 -0
- pixeltable/catalog/path.py +58 -0
- pixeltable/catalog/path_dict.py +139 -0
- pixeltable/catalog/schema_object.py +39 -0
- pixeltable/catalog/table.py +581 -0
- pixeltable/catalog/table_version.py +749 -0
- pixeltable/catalog/table_version_path.py +133 -0
- pixeltable/catalog/view.py +203 -0
- pixeltable/client.py +520 -31
- pixeltable/dataframe.py +540 -349
- pixeltable/env.py +373 -48
- pixeltable/exceptions.py +12 -21
- pixeltable/exec/__init__.py +9 -0
- pixeltable/exec/aggregation_node.py +78 -0
- pixeltable/exec/cache_prefetch_node.py +113 -0
- pixeltable/exec/component_iteration_node.py +79 -0
- pixeltable/exec/data_row_batch.py +95 -0
- pixeltable/exec/exec_context.py +22 -0
- pixeltable/exec/exec_node.py +61 -0
- pixeltable/exec/expr_eval_node.py +217 -0
- pixeltable/exec/in_memory_data_node.py +69 -0
- pixeltable/exec/media_validation_node.py +43 -0
- pixeltable/exec/sql_scan_node.py +225 -0
- pixeltable/exprs/__init__.py +24 -0
- pixeltable/exprs/arithmetic_expr.py +102 -0
- pixeltable/exprs/array_slice.py +71 -0
- pixeltable/exprs/column_property_ref.py +77 -0
- pixeltable/exprs/column_ref.py +105 -0
- pixeltable/exprs/comparison.py +77 -0
- pixeltable/exprs/compound_predicate.py +98 -0
- pixeltable/exprs/data_row.py +187 -0
- pixeltable/exprs/expr.py +586 -0
- pixeltable/exprs/expr_set.py +39 -0
- pixeltable/exprs/function_call.py +380 -0
- pixeltable/exprs/globals.py +69 -0
- pixeltable/exprs/image_member_access.py +115 -0
- pixeltable/exprs/image_similarity_predicate.py +58 -0
- pixeltable/exprs/inline_array.py +107 -0
- pixeltable/exprs/inline_dict.py +101 -0
- pixeltable/exprs/is_null.py +38 -0
- pixeltable/exprs/json_mapper.py +121 -0
- pixeltable/exprs/json_path.py +159 -0
- pixeltable/exprs/literal.py +54 -0
- pixeltable/exprs/object_ref.py +41 -0
- pixeltable/exprs/predicate.py +44 -0
- pixeltable/exprs/row_builder.py +355 -0
- pixeltable/exprs/rowid_ref.py +94 -0
- pixeltable/exprs/type_cast.py +53 -0
- pixeltable/exprs/variable.py +45 -0
- pixeltable/func/__init__.py +9 -0
- pixeltable/func/aggregate_function.py +194 -0
- pixeltable/func/batched_function.py +53 -0
- pixeltable/func/callable_function.py +69 -0
- pixeltable/func/expr_template_function.py +82 -0
- pixeltable/func/function.py +110 -0
- pixeltable/func/function_registry.py +227 -0
- pixeltable/func/globals.py +36 -0
- pixeltable/func/nos_function.py +202 -0
- pixeltable/func/signature.py +166 -0
- pixeltable/func/udf.py +163 -0
- pixeltable/functions/__init__.py +52 -103
- pixeltable/functions/eval.py +216 -0
- pixeltable/functions/fireworks.py +61 -0
- pixeltable/functions/huggingface.py +120 -0
- pixeltable/functions/image.py +16 -0
- pixeltable/functions/openai.py +88 -0
- pixeltable/functions/pil/image.py +148 -7
- pixeltable/functions/string.py +13 -0
- pixeltable/functions/together.py +27 -0
- pixeltable/functions/util.py +41 -0
- pixeltable/functions/video.py +62 -0
- pixeltable/iterators/__init__.py +3 -0
- pixeltable/iterators/base.py +48 -0
- pixeltable/iterators/document.py +311 -0
- pixeltable/iterators/video.py +89 -0
- pixeltable/metadata/__init__.py +54 -0
- pixeltable/metadata/converters/convert_10.py +18 -0
- pixeltable/metadata/schema.py +211 -0
- pixeltable/plan.py +656 -0
- pixeltable/store.py +413 -182
- pixeltable/tests/conftest.py +143 -86
- pixeltable/tests/test_audio.py +65 -0
- pixeltable/tests/test_catalog.py +27 -0
- pixeltable/tests/test_client.py +14 -14
- pixeltable/tests/test_component_view.py +372 -0
- pixeltable/tests/test_dataframe.py +433 -0
- pixeltable/tests/test_dirs.py +78 -62
- pixeltable/tests/test_document.py +117 -0
- pixeltable/tests/test_exprs.py +591 -135
- pixeltable/tests/test_function.py +297 -67
- pixeltable/tests/test_functions.py +283 -1
- pixeltable/tests/test_migration.py +43 -0
- pixeltable/tests/test_nos.py +54 -0
- pixeltable/tests/test_snapshot.py +208 -0
- pixeltable/tests/test_table.py +1086 -258
- pixeltable/tests/test_transactional_directory.py +42 -0
- pixeltable/tests/test_types.py +5 -11
- pixeltable/tests/test_video.py +149 -34
- pixeltable/tests/test_view.py +530 -0
- pixeltable/tests/utils.py +186 -45
- pixeltable/tool/create_test_db_dump.py +149 -0
- pixeltable/type_system.py +490 -133
- pixeltable/utils/__init__.py +17 -46
- pixeltable/utils/clip.py +12 -15
- pixeltable/utils/coco.py +136 -0
- pixeltable/utils/documents.py +39 -0
- pixeltable/utils/filecache.py +195 -0
- pixeltable/utils/help.py +11 -0
- pixeltable/utils/media_store.py +76 -0
- pixeltable/utils/parquet.py +126 -0
- pixeltable/utils/pytorch.py +172 -0
- pixeltable/utils/s3.py +13 -0
- pixeltable/utils/sql.py +17 -0
- pixeltable/utils/transactional_directory.py +35 -0
- pixeltable-0.2.0.dist-info/LICENSE +18 -0
- pixeltable-0.2.0.dist-info/METADATA +117 -0
- pixeltable-0.2.0.dist-info/RECORD +125 -0
- {pixeltable-0.1.2.dist-info → pixeltable-0.2.0.dist-info}/WHEEL +1 -1
- pixeltable/catalog.py +0 -1421
- pixeltable/exprs.py +0 -1745
- pixeltable/function.py +0 -269
- pixeltable/functions/clip.py +0 -10
- pixeltable/functions/pil/__init__.py +0 -23
- pixeltable/functions/tf.py +0 -21
- pixeltable/index.py +0 -57
- pixeltable/tests/test_dict.py +0 -24
- pixeltable/tests/test_tf.py +0 -69
- pixeltable/tf.py +0 -33
- pixeltable/utils/tf.py +0 -33
- pixeltable/utils/video.py +0 -32
- pixeltable-0.1.2.dist-info/LICENSE +0 -201
- pixeltable-0.1.2.dist-info/METADATA +0 -89
- pixeltable-0.1.2.dist-info/RECORD +0 -37
pixeltable/type_system.py
CHANGED
@@ -1,15 +1,21 @@
+from __future__ import annotations
+
 import abc
-from typing import Any, Optional, Tuple, Dict, Callable, List, Union
-import enum
 import datetime
+import enum
 import json
+import typing
+import urllib.parse
+from copy import copy
+from pathlib import Path
+from typing import Any, Optional, Tuple, Dict, Callable, List, Union

-import os
-os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
-#import tensorflow as tf
 import PIL.Image
+import av
+import numpy as np
 import sqlalchemy as sql

+from pixeltable import exceptions as excs


 class ColumnType:
@@ -24,26 +30,16 @@ class ColumnType:
         ARRAY = 6
         IMAGE = 7
         VIDEO = 8
+        AUDIO = 9
+        DOCUMENT = 10

         # exprs that don't evaluate to a computable value in Pixeltable, such as an Image member function
-        INVALID = 9
-
-        def to_tf(self) -> 'tf.dtypes.DType':
-            import tensorflow as tf
-            if self == self.STRING:
-                return tf.string
-            if self == self.INT:
-                return tf.int64
-            if self == self.FLOAT:
-                return tf.float32
-            if self == self.BOOL:
-                return tf.bool
-            raise TypeError(f'Cannot convert {self} to TensorFlow')
+        INVALID = 255

         @classmethod
         def supertype(
                 cls, type1: 'Type', type2: 'Type',
-                # we need to pass this in because we can't easily
+                # we need to pass this in because we can't easily append it as a class member
                 common_supertypes: Dict[Tuple['Type', 'Type'], 'Type']
         ) -> Optional['Type']:
             if type1 == type2:
@@ -83,8 +79,9 @@ class ColumnType:
         (Type.INT, Type.FLOAT): Type.FLOAT,
     }

-    def __init__(self, t: Type):
+    def __init__(self, t: Type, nullable: bool = False):
         self._type = t
+        self.nullable = nullable

     @property
     def type_enum(self) -> Type:
@@ -94,7 +91,7 @@ class ColumnType:
         return json.dumps(self.as_dict())

     @classmethod
-    def serialize_list(cls, type_list: List['ColumnType']) -> str:
+    def serialize_list(cls, type_list: List[ColumnType]) -> str:
         return json.dumps([t.as_dict() for t in type_list])

     def as_dict(self) -> Dict:
@@ -104,33 +101,34 @@ class ColumnType:
     }

     def _as_dict(self) -> Dict:
-        return {}
+        return {'nullable': self.nullable}

     @classmethod
-    def deserialize(cls, type_str: str) -> 'ColumnType':
+    def deserialize(cls, type_str: str) -> ColumnType:
         type_dict = json.loads(type_str)
         return cls.from_dict(type_dict)

     @classmethod
-    def deserialize_list(cls, type_list_str: str) -> List['ColumnType']:
+    def deserialize_list(cls, type_list_str: str) -> List[ColumnType]:
         type_dict_list = json.loads(type_list_str)
         return [cls.from_dict(type_dict) for type_dict in type_dict_list]

     @classmethod
-    def from_dict(cls, type_dict: Dict) -> 'ColumnType':
+    def from_dict(cls, type_dict: Dict) -> ColumnType:
         assert '_classname' in type_dict
         type_class = globals()[type_dict['_classname']]
         return type_class._from_dict(type_dict)

     @classmethod
-    def _from_dict(cls, d: Dict) -> 'ColumnType':
+    def _from_dict(cls, d: Dict) -> ColumnType:
         """
         Default implementation: simply invoke c'tor
         """
-        return cls()
+        assert 'nullable' in d
+        return cls(nullable=d['nullable'])

     @classmethod
-    def make_type(cls, t: Type) -> 'ColumnType':
+    def make_type(cls, t: Type) -> ColumnType:
         assert t != cls.Type.INVALID and t != cls.Type.ARRAY
         if t == cls.Type.STRING:
             return StringType()
@@ -148,21 +146,44 @@ class ColumnType:
             return ImageType()
         if t == cls.Type.VIDEO:
             return VideoType()
+        if t == cls.Type.AUDIO:
+            return AudioType()
+        if t == cls.Type.DOCUMENT:
+            return AudioType()

     def __str__(self) -> str:
         return self._type.name.lower()

     def __eq__(self, other: object) -> bool:
+        return self.matches(other) and self.nullable == other.nullable
+
+    def is_supertype_of(self, other: ColumnType) -> bool:
+        if type(self) != type(other):
+            return False
+        if self.matches(other):
+            return True
+        return self._is_supertype_of(other)
+
+    @abc.abstractmethod
+    def _is_supertype_of(self, other: ColumnType) -> bool:
+        return False
+
+    def matches(self, other: object) -> bool:
+        """Two types match if they're equal, aside from nullability"""
+        if not isinstance(other, ColumnType):
+            pass
         assert isinstance(other, ColumnType)
-        if
+        if type(self) != type(other):
             return False
         for member_var in vars(self).keys():
+            if member_var == 'nullable':
+                continue
             if getattr(self, member_var) != getattr(other, member_var):
                 return False
         return True

     @classmethod
-    def supertype(cls, type1: 'ColumnType', type2: 'ColumnType') -> Optional['ColumnType']:
+    def supertype(cls, type1: ColumnType, type2: ColumnType) -> Optional[ColumnType]:
         if type1 == type2:
             return type1

@@ -184,16 +205,15 @@ class ColumnType:

     @classmethod
     @abc.abstractmethod
-    def _supertype(cls, type1: 'ColumnType', type2: 'ColumnType') -> Optional['ColumnType']:
+    def _supertype(cls, type1: ColumnType, type2: ColumnType) -> Optional[ColumnType]:
         """
         Class-specific implementation of determining the supertype. type1 and type2 are from the same subclass of
         ColumnType.
         """
         pass

-
     @classmethod
-    def
+    def infer_literal_type(cls, val: Any) -> Optional[ColumnType]:
         if isinstance(val, str):
             return StringType()
         if isinstance(val, int):
@@ -204,6 +224,85 @@ class ColumnType:
             return BoolType()
         if isinstance(val, datetime.datetime) or isinstance(val, datetime.date):
             return TimestampType()
+        if isinstance(val, np.ndarray):
+            col_type = ArrayType.from_literal(val)
+            if col_type is not None:
+                return col_type
+        # this could still be json-serializable
+        if isinstance(val, dict) or isinstance(val, np.ndarray):
+            try:
+                JsonType().validate_literal(val)
+                return JsonType()
+            except TypeError:
+                return None
+        return None
+
+
+    @classmethod
+    def from_python_type(cls, t: type) -> Optional[ColumnType]:
+        if t in _python_type_to_column_type:
+            return _python_type_to_column_type[t]
+        elif isinstance(t, typing._UnionGenericAlias) and t.__args__[1] is type(None):
+            # `t` is a type of the form Optional[T] (equivalently, Union[T, None]).
+            # We treat it as the underlying type but with nullable=True.
+            if t.__args__[0] in _python_type_to_column_type:
+                underlying = copy(_python_type_to_column_type[t.__args__[0]])
+                underlying.nullable = True
+                return underlying
+
+        return None
+
+
+    def validate_literal(self, val: Any) -> None:
+        """Raise TypeError if val is not a valid literal for this type"""
+        if val is None:
+            if not self.nullable:
+                raise TypeError('Expected non-None value')
+            else:
+                return
+        self._validate_literal(val)
+
+    def validate_media(self, val: Any) -> None:
+        """
+        Raise TypeError if val is not a path to a valid media file (or a valid in-memory byte sequence) for this type
+        """
+        if self.is_media_type():
+            raise NotImplementedError(f'validate_media() not implemented for {self.__class__.__name__}')
+
+    def _validate_file_path(self, val: Any) -> None:
+        """Raises TypeError if not a valid local file path or not a path/byte sequence"""
+        if isinstance(val, str):
+            parsed = urllib.parse.urlparse(val)
+            if parsed.scheme != '' and parsed.scheme != 'file':
+                return
+            path = Path(urllib.parse.unquote(parsed.path))
+            if not path.is_file():
+                raise TypeError(f'File not found: {str(path)}')
+        else:
+            if not isinstance(val, bytes):
+                raise TypeError(f'expected file path or bytes, got {type(val)}')
+
+    @abc.abstractmethod
+    def _validate_literal(self, val: Any) -> None:
+        """Raise TypeError if val is not a valid literal for this type"""
+        pass
+
+    @abc.abstractmethod
+    def _create_literal(self, val : Any) -> Any:
+        """Create a literal of this type from val, including any needed conversions.
+        val is guaranteed to be non-None"""
+        return val
+
+    def create_literal(self, val: Any) -> Any:
+        """Create a literal of this type from val or raise TypeError if not possible"""
+        if val is not None:
+            val = self._create_literal(val)
+
+        self.validate_literal(val)
+        return val
+
+    def print_value(self, val: Any) -> str:
+        return str(val)

     def is_scalar_type(self) -> bool:
         return self._type in self.scalar_types
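For orientation, here is a short usage sketch of the literal-validation and type-inference API added in the hunk above. It is not part of the diff; it assumes only the classes and signatures shown and the `pixeltable.type_system` module path.

```python
# Illustrative sketch only; mirrors the methods added in the hunk above.
from typing import Optional

from pixeltable.type_system import ColumnType, FloatType, StringType

# Optional[str] maps to the underlying type with nullable=True (see from_python_type()).
t = ColumnType.from_python_type(Optional[str])
assert isinstance(t, StringType) and t.nullable

# validate_literal() accepts None only for nullable types.
StringType(nullable=True).validate_literal(None)   # ok
try:
    StringType().validate_literal(None)
except TypeError as err:
    print(err)   # Expected non-None value

# create_literal() applies simple conversions before validating (e.g. int -> float).
assert FloatType().create_literal(3) == 3.0
```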
@@ -241,6 +340,16 @@ class ColumnType:
     def is_video_type(self) -> bool:
         return self._type == self.Type.VIDEO

+    def is_audio_type(self) -> bool:
+        return self._type == self.Type.AUDIO
+
+    def is_document_type(self) -> bool:
+        return self._type == self.Type.DOCUMENT
+
+    def is_media_type(self) -> bool:
+        # types that refer to external media files
+        return self.is_image_type() or self.is_video_type() or self.is_audio_type() or self.is_document_type()
+
     @abc.abstractmethod
     def to_sql(self) -> str:
         """
@@ -274,6 +383,10 @@ class ColumnType:
             return sql.VARBINARY
         assert False

+    @abc.abstractmethod
+    def to_arrow_type(self) -> 'pyarrow.DataType':
+        assert False, f'Have not implemented {self.__class__.__name__} to Arrow'
+
     @staticmethod
     def no_conversion(v: Any) -> Any:
         """
@@ -282,21 +395,17 @@ class ColumnType:
         """
         assert False

-    def conversion_fn(self, target: 'ColumnType') -> Optional[Callable[[Any], Any]]:
+    def conversion_fn(self, target: ColumnType) -> Optional[Callable[[Any], Any]]:
         """
         Return Callable that converts a column value of type self to a value of type 'target'.
         Returns None if conversion isn't possible.
         """
         return None

-    @abc.abstractmethod
-    def to_tf(self) -> Union['tf.TypeSpec', Dict[str, 'tf.TypeSpec']]:
-        pass
-

 class InvalidType(ColumnType):
-    def __init__(self):
-        super().__init__(self.Type.INVALID)
+    def __init__(self, nullable: bool = False):
+        super().__init__(self.Type.INVALID, nullable=nullable)

     def to_sql(self) -> str:
         assert False
@@ -304,13 +413,18 @@ class InvalidType(ColumnType):
     def to_sa_type(self) -> Any:
         assert False

-    def
-
+    def to_arrow_type(self) -> 'pyarrow.DataType':
+        assert False

+    def print_value(self, val: Any) -> str:
+        assert False
+
+    def _validate_literal(self, val: Any) -> None:
+        assert False

 class StringType(ColumnType):
-    def __init__(self):
-        super().__init__(self.Type.STRING)
+    def __init__(self, nullable: bool = False):
+        super().__init__(self.Type.STRING, nullable=nullable)

     def conversion_fn(self, target: ColumnType) -> Optional[Callable[[Any], Any]]:
         if not target.is_timestamp_type():
@@ -328,78 +442,111 @@ class StringType(ColumnType):

     def to_sa_type(self) -> str:
         return sql.String
+
+    def to_arrow_type(self) -> 'pyarrow.DataType':
+        import pyarrow as pa  # pylint: disable=import-outside-toplevel
+        return pa.string()

-    def
-
-
+    def print_value(self, val: Any) -> str:
+        return f"'{val}'"
+
+    def _validate_literal(self, val: Any) -> None:
+        if not isinstance(val, str):
+            raise TypeError(f'Expected string, got {val.__class__.__name__}')


 class IntType(ColumnType):
-    def __init__(self):
-        super().__init__(self.Type.INT)
+    def __init__(self, nullable: bool = False):
+        super().__init__(self.Type.INT, nullable=nullable)

     def to_sql(self) -> str:
-        return 'INTEGER'
+        return 'BIGINT'

     def to_sa_type(self) -> str:
-        return sql.Integer
+        return sql.BigInteger
+
+    def to_arrow_type(self) -> 'pyarrow.DataType':
+        import pyarrow as pa  # pylint: disable=import-outside-toplevel
+        return pa.int64()  # to be consistent with bigint above

-    def
-
-
-        return tf.TensorSpec(shape=(), dtype=tf.int64)
+    def _validate_literal(self, val: Any) -> None:
+        if not isinstance(val, int):
+            raise TypeError(f'Expected int, got {val.__class__.__name__}')


 class FloatType(ColumnType):
-    def __init__(self):
-        super().__init__(self.Type.FLOAT)
+    def __init__(self, nullable: bool = False):
+        super().__init__(self.Type.FLOAT, nullable=nullable)

     def to_sql(self) -> str:
         return 'FLOAT'

     def to_sa_type(self) -> str:
         return sql.Float
+
+    def to_arrow_type(self) -> 'pyarrow.DataType':
+        import pyarrow as pa
+        return pa.float32()

-    def
-
-
-        return tf.TensorSpec(shape=(), dtype=tf.float32)
+    def _validate_literal(self, val: Any) -> None:
+        if not isinstance(val, float):
+            raise TypeError(f'Expected float, got {val.__class__.__name__}')

+    def _create_literal(self, val: Any) -> Any:
+        if isinstance(val, int):
+            return float(val)
+        return val

 class BoolType(ColumnType):
-    def __init__(self):
-        super().__init__(self.Type.BOOL)
+    def __init__(self, nullable: bool = False):
+        super().__init__(self.Type.BOOL, nullable=nullable)

     def to_sql(self) -> str:
         return 'BOOLEAN'

     def to_sa_type(self) -> str:
         return sql.Boolean
+
+    def to_arrow_type(self) -> 'pyarrow.DataType':
+        import pyarrow as pa  # pylint: disable=import-outside-toplevel
+        return pa.bool_()

-    def
-
-
-        return tf.TensorSpec(shape=(), dtype=tf.bool)
+    def _validate_literal(self, val: Any) -> None:
+        if not isinstance(val, bool):
+            raise TypeError(f'Expected bool, got {val.__class__.__name__}')

+    def _create_literal(self, val: Any) -> Any:
+        if isinstance(val, int):
+            return bool(val)
+        return val

 class TimestampType(ColumnType):
-    def __init__(self):
-        super().__init__(self.Type.TIMESTAMP)
+    def __init__(self, nullable: bool = False):
+        super().__init__(self.Type.TIMESTAMP, nullable=nullable)

     def to_sql(self) -> str:
         return 'INTEGER'

     def to_sa_type(self) -> str:
         return sql.TIMESTAMP
+
+    def to_arrow_type(self) -> 'pyarrow.DataType':
+        import pyarrow as pa  # pylint: disable=import-outside-toplevel
+        return pa.timestamp('us')  # postgres timestamp is microseconds

-    def
-
+    def _validate_literal(self, val: Any) -> None:
+        if not isinstance(val, datetime.datetime) and not isinstance(val, datetime.date):
+            raise TypeError(f'Expected datetime.datetime or datetime.date, got {val.__class__.__name__}')

+    def _create_literal(self, val: Any) -> Any:
+        if isinstance(val, str):
+            return datetime.datetime.fromisoformat(val)
+        return val

 class JsonType(ColumnType):
     # TODO: type_spec also needs to be able to express lists
-    def __init__(self, type_spec: Optional[Dict[str, ColumnType]] = None):
-        super().__init__(self.Type.JSON)
+    def __init__(self, type_spec: Optional[Dict[str, ColumnType]] = None, nullable: bool = False):
+        super().__init__(self.Type.JSON, nullable=nullable)
         self.type_spec = type_spec

     def _as_dict(self) -> Dict:
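A brief sketch (not from the package) of the JsonType literal handling added in the following hunk and the validation pattern above: tuples are converted to lists by `_create_literal()`, and values that `json.dumps()` cannot serialize are rejected.

```python
# Hypothetical usage; assumes the JsonType methods shown in this diff.
from pixeltable.type_system import JsonType

jt = JsonType()
print(jt.create_literal((1, 2, 3)))      # [1, 2, 3]: tuple converted to a JSON-compatible list
try:
    jt.create_literal({'x': object()})   # not JSON-serializable
except TypeError as err:
    print(err)
```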
@@ -410,34 +557,52 @@ class JsonType(ColumnType):
         return result

     @classmethod
-    def _from_dict(cls, d: Dict) -> 'ColumnType':
+    def _from_dict(cls, d: Dict) -> ColumnType:
         type_spec = None
         if 'type_spec' in d:
             type_spec = {
                 field_name: cls.deserialize(field_type_dict) for field_name, field_type_dict in d['type_spec'].items()
             }
-        return cls(type_spec)
+        return cls(type_spec, nullable=d['nullable'])

     def to_sql(self) -> str:
         return 'JSONB'

     def to_sa_type(self) -> str:
         return sql.dialects.postgresql.JSONB
-
-    def
-
-
-
-
+
+    def to_arrow_type(self) -> 'pyarrow.DataType':
+        import pyarrow as pa  # pylint: disable=import-outside-toplevel
+        return pa.string()  # TODO: weight advantage of pa.struct type.
+
+    def print_value(self, val: Any) -> str:
+        val_type = self.infer_literal_type(val)
+        if val_type == self:
+            return str(val)
+        return val_type.print_value(val)
+
+    def _validate_literal(self, val: Any) -> None:
+        if not isinstance(val, dict) and not isinstance(val, list):
+            raise TypeError(f'Expected dict or list, got {val.__class__.__name__}')
+        try:
+            _ = json.dumps(val)
+        except TypeError as e:
+            raise TypeError(f'Expected JSON-serializable object, got {val}')
+
+    def _create_literal(self, val: Any) -> Any:
+        if isinstance(val, tuple):
+            val = list(val)
+        return val

 class ArrayType(ColumnType):
     def __init__(
-            self, shape: Tuple[Union[int, None], ...], dtype: ColumnType):
-        super().__init__(self.Type.ARRAY)
+            self, shape: Tuple[Union[int, None], ...], dtype: ColumnType, nullable: bool = False):
+        super().__init__(self.Type.ARRAY, nullable=nullable)
         self.shape = shape
-
+        assert dtype.is_int_type() or dtype.is_float_type() or dtype.is_bool_type() or dtype.is_string_type()
+        self.dtype = dtype._type

-    def _supertype(cls, type1: 'ArrayType', type2: 'ArrayType') -> Optional['ArrayType']:
+    def _supertype(cls, type1: ArrayType, type2: ArrayType) -> Optional[ArrayType]:
         if len(type1.shape) != len(type2.shape):
             return None
         base_type = ColumnType.supertype(type1.dtype, type2.dtype)
@@ -452,54 +617,94 @@ class ArrayType(ColumnType):
         return result

     def __str__(self) -> str:
-        return f'{self.
+        return f'{self._type.name.lower()}({self.shape}, dtype={self.dtype.name})'

     @classmethod
-    def _from_dict(cls, d: Dict) -> 'ColumnType':
+    def _from_dict(cls, d: Dict) -> ColumnType:
         assert 'shape' in d
         assert 'dtype' in d
         shape = tuple(d['shape'])
-        dtype = cls.Type(d['dtype'])
-        return cls(shape, dtype)
-
-    def to_sql(self) -> str:
-        return 'BYTEA'
+        dtype = cls.make_type(cls.Type(d['dtype']))
+        return cls(shape, dtype, nullable=d['nullable'])

-
-
+    @classmethod
+    def from_literal(cls, val: np.ndarray) -> Optional[ArrayType]:
+        # determine our dtype
+        assert isinstance(val, np.ndarray)
+        if np.issubdtype(val.dtype, np.integer):
+            dtype = IntType()
+        elif np.issubdtype(val.dtype, np.floating):
+            dtype = FloatType()
+        elif val.dtype == np.bool_:
+            dtype = BoolType()
+        elif val.dtype == np.str_:
+            dtype = StringType()
+        else:
+            return None
+        return cls(val.shape, dtype=dtype, nullable=True)

-    def
-
-
+    def is_valid_literal(self, val: np.ndarray) -> bool:
+        if not isinstance(val, np.ndarray):
+            return False
+        if len(val.shape) != len(self.shape):
+            return False
+        # check that the shapes are compatible
+        for n1, n2 in zip(val.shape, self.shape):
+            if n1 is None:
+                return False
+            if n2 is None:
+                # wildcard
+                continue
+            if n1 != n2:
+                return False
+        return val.dtype == self.numpy_dtype()

+    def _validate_literal(self, val: Any) -> None:
+        if not isinstance(val, np.ndarray):
+            raise TypeError(f'Expected numpy.ndarray, got {val.__class__.__name__}')
+        if not self.is_valid_literal(val):
+            raise TypeError((
+                f'Expected ndarray({self.shape}, dtype={self.numpy_dtype()}), '
+                f'got ndarray({val.shape}, dtype={val.dtype})'))

-
-
-
-
-        RGB = 1
+    def _create_literal(self, val: Any) -> Any:
+        if isinstance(val, (list,tuple)):
+            return np.array(val)
+        return val

-
-
-        if pil_mode == 'L':
-            return cls.L
-        if pil_mode == 'RGB':
-            return cls.RGB
+    def to_sql(self) -> str:
+        return 'BYTEA'

-
-
+    def to_sa_type(self) -> str:
+        return sql.LargeBinary
+
+    def to_arrow_type(self) -> 'pyarrow.DataType':
+        import pyarrow as pa  # pylint: disable=import-outside-toplevel
+        if any([n is None for n in self.shape]):
+            raise TypeError(f'Cannot convert array with unknown shape to Arrow')
+        return pa.fixed_shape_tensor(pa.from_numpy_dtype(self.numpy_dtype()), self.shape)
+
+    def numpy_dtype(self) -> np.dtype:
+        if self.dtype == self.Type.INT:
+            return np.dtype(np.int64)
+        if self.dtype == self.Type.FLOAT:
+            return np.dtype(np.float32)
+        if self.dtype == self.Type.BOOL:
+            return np.dtype(np.bool_)
+        if self.dtype == self.Type.STRING:
+            return np.dtype(np.str_)
+        assert False

-    def num_channels(self) -> int:
-        return len(self.name)

+class ImageType(ColumnType):
     def __init__(
             self, width: Optional[int] = None, height: Optional[int] = None, size: Optional[Tuple[int, int]] = None,
-            mode: Optional[
+            mode: Optional[str] = None, nullable: bool = False
     ):
         """
         TODO: does it make sense to specify only width or height?
         """
-        super().__init__(self.Type.IMAGE)
+        super().__init__(self.Type.IMAGE, nullable=nullable)
         assert not(width is not None and size is not None)
         assert not(height is not None and size is not None)
         if size is not None:
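The ArrayType changes above add literal inference (`from_literal()`) and shape/dtype validation. A small sketch, again not part of the diff and assuming the `pixeltable.type_system` import path:

```python
# Illustrative sketch of the new ArrayType helpers (from_literal, is_valid_literal, numpy_dtype).
import numpy as np

from pixeltable.type_system import ArrayType, FloatType

arr = np.zeros((2, 3), dtype=np.float32)

# from_literal() infers a concrete ArrayType from an ndarray literal.
t = ArrayType.from_literal(arr)
print(t)                 # array((2, 3), dtype=FLOAT)
print(t.numpy_dtype())   # float32

# A declared type with a wildcard (None) dimension accepts any length in that position,
# but the element dtype must still match.
wild = ArrayType((None, 3), dtype=FloatType())
print(wild.is_valid_literal(arr))                    # True
print(wild.is_valid_literal(arr.astype(np.int64)))   # False: dtype mismatch
```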
@@ -510,22 +715,53 @@ class ImageType(ColumnType):
         self.height = height
         self.mode = mode

+    def __str__(self) -> str:
+        if self.width is not None or self.height is not None or self.mode is not None:
+            params_str = ''
+            if self.width is not None:
+                params_str = f'width={self.width}'
+            if self.height is not None:
+                if len(params_str) > 0:
+                    params_str += ', '
+                params_str += f'height={self.height}'
+            if self.mode is not None:
+                if len(params_str) > 0:
+                    params_str += ', '
+                params_str += f'mode={self.mode}'
+            params_str = f'({params_str})'
+        else:
+            params_str = ''
+        return f'{self._type.name.lower()}{params_str}'
+
+    def _is_supertype_of(self, other: ImageType) -> bool:
+        if self.mode != other.mode:
+            return False
+        if self.width is None and self.height is None:
+            return True
+        if self.width != other.width and self.height != other.height:
+            return False
+
+    @property
+    def size(self) -> Optional[Tuple[int, int]]:
+        if self.width is None or self.height is None:
+            return None
+        return (self.width, self.height)
+
     @property
     def num_channels(self) -> Optional[int]:
         return None if self.mode is None else self.mode.num_channels()

     def _as_dict(self) -> Dict:
         result = super()._as_dict()
-        result.update(width=self.width, height=self.height, mode=self.mode
+        result.update(width=self.width, height=self.height, mode=self.mode)
         return result

     @classmethod
-    def _from_dict(cls, d: Dict) -> 'ColumnType':
+    def _from_dict(cls, d: Dict) -> ColumnType:
         assert 'width' in d
         assert 'height' in d
         assert 'mode' in d
-        mode_val = d['mode']
-        return cls(width=d['width'], height=d['height'], mode=cls.Mode(mode_val) if mode_val is not None else None)
+        return cls(width=d['width'], height=d['height'], mode=d['mode'], nullable=d['nullable'])

     def conversion_fn(self, target: ColumnType) -> Optional[Callable[[Any], Any]]:
         if not target.is_image_type():
@@ -552,23 +788,111 @@ class ImageType(ColumnType):

     def to_sa_type(self) -> str:
         return sql.String
+
+    def to_arrow_type(self) -> 'pyarrow.DataType':
+        import pyarrow as pa  # pylint: disable=import-outside-toplevel
+        return pa.binary()
+
+    def _validate_literal(self, val: Any) -> None:
+        if isinstance(val, PIL.Image.Image):
+            return
+        self._validate_file_path(val)
+
+    def validate_media(self, val: Any) -> None:
+        assert isinstance(val, str)
+        try:
+            _ = PIL.Image.open(val)
+        except PIL.UnidentifiedImageError:
+            raise excs.Error(f'Not a valid image: {val}') from None

-
-
-
+class VideoType(ColumnType):
+    def __init__(self, nullable: bool = False):
+        super().__init__(self.Type.VIDEO, nullable=nullable)

+    def to_sql(self) -> str:
+        # stored as a file path
+        return 'VARCHAR'

-
-
-
+    def to_sa_type(self) -> str:
+        return sql.String
+
+    def to_arrow_type(self) -> 'pyarrow.DataType':
+        import pyarrow as pa  # pylint: disable=import-outside-toplevel
+        return pa.string()
+
+    def _validate_literal(self, val: Any) -> None:
+        self._validate_file_path(val)
+
+    def validate_media(self, val: Any) -> None:
+        assert isinstance(val, str)
+        try:
+            with av.open(val, 'r') as fh:
+                if len(fh.streams.video) == 0:
+                    raise excs.Error(f'Not a valid video: {val}')
+                # decode a few frames to make sure it's playable
+                # TODO: decode all frames? but that's very slow
+                num_decoded = 0
+                for frame in fh.decode(video=0):
+                    _ = frame.to_image()
+                    num_decoded += 1
+                    if num_decoded == 10:
+                        break
+                if num_decoded < 2:
+                    # this is most likely an image file
+                    raise excs.Error(f'Not a valid video: {val}')
+        except av.AVError:
+            raise excs.Error(f'Not a valid video: {val}') from None
+
+class AudioType(ColumnType):
+    def __init__(self, nullable: bool = False):
+        super().__init__(self.Type.AUDIO, nullable=nullable)

-    def
-
-        return
+    def to_sql(self) -> str:
+        # stored as a file path
+        return 'VARCHAR'

-
-
-
+    def to_sa_type(self) -> str:
+        return sql.String
+
+    def to_arrow_type(self) -> 'pyarrow.DataType':
+        import pyarrow as pa  # pylint: disable=import-outside-toplevel
+        return pa.string()
+
+    def _validate_literal(self, val: Any) -> None:
+        self._validate_file_path(val)
+
+    def validate_media(self, val: Any) -> None:
+        try:
+            with av.open(val) as container:
+                if len(container.streams.audio) == 0:
+                    raise excs.Error(f'No audio stream in file: {val}')
+                audio_stream = container.streams.audio[0]
+
+                # decode everything to make sure it's playable
+                # TODO: is there some way to verify it's a playable audio file other than decoding all of it?
+                for packet in container.demux(audio_stream):
+                    for _ in packet.decode():
+                        pass
+        except av.AVError as e:
+            raise excs.Error(f'Not a valid audio file: {val}\n{e}') from None
+
+class DocumentType(ColumnType):
+    @enum.unique
+    class DocumentFormat(enum.Enum):
+        HTML = 0
+        MD = 1
+        PDF = 2
+
+    def __init__(self, nullable: bool = False, doc_formats: Optional[str] = None):
+        super().__init__(self.Type.DOCUMENT, nullable=nullable)
+        if doc_formats is not None:
+            type_strs = doc_formats.split(',')
+            for type_str in type_strs:
+                if not hasattr(self.DocumentFormat, type_str):
+                    raise ValueError(f'Invalid document type: {type_str}')
+            self._doc_formats = [self.DocumentFormat[type_str.upper()] for type_str in type_strs]
+        else:
+            self._doc_formats = [t for t in self.DocumentFormat]

     def to_sql(self) -> str:
         # stored as a file path
@@ -577,5 +901,38 @@ class VideoType(ColumnType):
     def to_sa_type(self) -> str:
         return sql.String

-    def
-
+    def to_arrow_type(self) -> 'pyarrow.DataType':
+        import pyarrow as pa  # pylint: disable=import-outside-toplevel
+        return pa.string()
+
+    def _validate_literal(self, val: Any) -> None:
+        self._validate_file_path(val)
+
+    def validate_media(self, val: Any) -> None:
+        assert isinstance(val, str)
+        from pixeltable.utils.documents import get_document_handle
+        with open(val, 'r') as fh:
+            try:
+                s = fh.read()
+                dh = get_document_handle(s)
+                if dh is None:
+                    raise excs.Error(f'Not a recognized document format: {val}')
+            except Exception as e:
+                raise excs.Error(f'Not a recognized document format: {val}') from None
+
+
+# A dictionary mapping various Python types to their respective ColumnTypes.
+# This can be used to infer Pixeltable ColumnTypes from type hints on Python
+# functions. (Since Python functions do not necessarily have type hints, this
+# should always be an optional/convenience inference.)
+_python_type_to_column_type: dict[type, ColumnType] = {
+    str: StringType(),
+    int: IntType(),
+    float: FloatType(),
+    bool: BoolType(),
+    datetime.datetime: TimestampType(),
+    datetime.date: TimestampType(),
+    list: JsonType(),
+    dict: JsonType(),
+    PIL.Image.Image: ImageType()
+}