pixeltable 0.1.0__py3-none-any.whl → 0.2.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pixeltable might be problematic.

Files changed (147)
  1. pixeltable/__init__.py +34 -6
  2. pixeltable/catalog/__init__.py +13 -0
  3. pixeltable/catalog/catalog.py +159 -0
  4. pixeltable/catalog/column.py +200 -0
  5. pixeltable/catalog/dir.py +32 -0
  6. pixeltable/catalog/globals.py +33 -0
  7. pixeltable/catalog/insertable_table.py +191 -0
  8. pixeltable/catalog/named_function.py +36 -0
  9. pixeltable/catalog/path.py +58 -0
  10. pixeltable/catalog/path_dict.py +139 -0
  11. pixeltable/catalog/schema_object.py +39 -0
  12. pixeltable/catalog/table.py +581 -0
  13. pixeltable/catalog/table_version.py +749 -0
  14. pixeltable/catalog/table_version_path.py +133 -0
  15. pixeltable/catalog/view.py +203 -0
  16. pixeltable/client.py +590 -30
  17. pixeltable/dataframe.py +540 -349
  18. pixeltable/env.py +359 -45
  19. pixeltable/exceptions.py +12 -21
  20. pixeltable/exec/__init__.py +9 -0
  21. pixeltable/exec/aggregation_node.py +78 -0
  22. pixeltable/exec/cache_prefetch_node.py +116 -0
  23. pixeltable/exec/component_iteration_node.py +79 -0
  24. pixeltable/exec/data_row_batch.py +95 -0
  25. pixeltable/exec/exec_context.py +22 -0
  26. pixeltable/exec/exec_node.py +61 -0
  27. pixeltable/exec/expr_eval_node.py +217 -0
  28. pixeltable/exec/in_memory_data_node.py +69 -0
  29. pixeltable/exec/media_validation_node.py +43 -0
  30. pixeltable/exec/sql_scan_node.py +225 -0
  31. pixeltable/exprs/__init__.py +24 -0
  32. pixeltable/exprs/arithmetic_expr.py +102 -0
  33. pixeltable/exprs/array_slice.py +71 -0
  34. pixeltable/exprs/column_property_ref.py +77 -0
  35. pixeltable/exprs/column_ref.py +105 -0
  36. pixeltable/exprs/comparison.py +77 -0
  37. pixeltable/exprs/compound_predicate.py +98 -0
  38. pixeltable/exprs/data_row.py +195 -0
  39. pixeltable/exprs/expr.py +586 -0
  40. pixeltable/exprs/expr_set.py +39 -0
  41. pixeltable/exprs/function_call.py +380 -0
  42. pixeltable/exprs/globals.py +69 -0
  43. pixeltable/exprs/image_member_access.py +115 -0
  44. pixeltable/exprs/image_similarity_predicate.py +58 -0
  45. pixeltable/exprs/inline_array.py +107 -0
  46. pixeltable/exprs/inline_dict.py +101 -0
  47. pixeltable/exprs/is_null.py +38 -0
  48. pixeltable/exprs/json_mapper.py +121 -0
  49. pixeltable/exprs/json_path.py +159 -0
  50. pixeltable/exprs/literal.py +54 -0
  51. pixeltable/exprs/object_ref.py +41 -0
  52. pixeltable/exprs/predicate.py +44 -0
  53. pixeltable/exprs/row_builder.py +355 -0
  54. pixeltable/exprs/rowid_ref.py +94 -0
  55. pixeltable/exprs/type_cast.py +53 -0
  56. pixeltable/exprs/variable.py +45 -0
  57. pixeltable/func/__init__.py +9 -0
  58. pixeltable/func/aggregate_function.py +194 -0
  59. pixeltable/func/batched_function.py +53 -0
  60. pixeltable/func/callable_function.py +69 -0
  61. pixeltable/func/expr_template_function.py +82 -0
  62. pixeltable/func/function.py +110 -0
  63. pixeltable/func/function_registry.py +227 -0
  64. pixeltable/func/globals.py +36 -0
  65. pixeltable/func/nos_function.py +202 -0
  66. pixeltable/func/signature.py +166 -0
  67. pixeltable/func/udf.py +163 -0
  68. pixeltable/functions/__init__.py +52 -103
  69. pixeltable/functions/eval.py +216 -0
  70. pixeltable/functions/fireworks.py +34 -0
  71. pixeltable/functions/huggingface.py +120 -0
  72. pixeltable/functions/image.py +16 -0
  73. pixeltable/functions/openai.py +256 -0
  74. pixeltable/functions/pil/image.py +148 -7
  75. pixeltable/functions/string.py +13 -0
  76. pixeltable/functions/together.py +122 -0
  77. pixeltable/functions/util.py +41 -0
  78. pixeltable/functions/video.py +62 -0
  79. pixeltable/iterators/__init__.py +3 -0
  80. pixeltable/iterators/base.py +48 -0
  81. pixeltable/iterators/document.py +311 -0
  82. pixeltable/iterators/video.py +89 -0
  83. pixeltable/metadata/__init__.py +54 -0
  84. pixeltable/metadata/converters/convert_10.py +18 -0
  85. pixeltable/metadata/schema.py +211 -0
  86. pixeltable/plan.py +656 -0
  87. pixeltable/store.py +418 -182
  88. pixeltable/tests/conftest.py +146 -88
  89. pixeltable/tests/functions/test_fireworks.py +42 -0
  90. pixeltable/tests/functions/test_functions.py +60 -0
  91. pixeltable/tests/functions/test_huggingface.py +158 -0
  92. pixeltable/tests/functions/test_openai.py +152 -0
  93. pixeltable/tests/functions/test_together.py +111 -0
  94. pixeltable/tests/test_audio.py +65 -0
  95. pixeltable/tests/test_catalog.py +27 -0
  96. pixeltable/tests/test_client.py +14 -14
  97. pixeltable/tests/test_component_view.py +370 -0
  98. pixeltable/tests/test_dataframe.py +439 -0
  99. pixeltable/tests/test_dirs.py +78 -62
  100. pixeltable/tests/test_document.py +120 -0
  101. pixeltable/tests/test_exprs.py +592 -135
  102. pixeltable/tests/test_function.py +297 -67
  103. pixeltable/tests/test_migration.py +43 -0
  104. pixeltable/tests/test_nos.py +54 -0
  105. pixeltable/tests/test_snapshot.py +208 -0
  106. pixeltable/tests/test_table.py +1195 -263
  107. pixeltable/tests/test_transactional_directory.py +42 -0
  108. pixeltable/tests/test_types.py +5 -11
  109. pixeltable/tests/test_video.py +151 -34
  110. pixeltable/tests/test_view.py +530 -0
  111. pixeltable/tests/utils.py +320 -45
  112. pixeltable/tool/create_test_db_dump.py +149 -0
  113. pixeltable/tool/create_test_video.py +81 -0
  114. pixeltable/type_system.py +445 -124
  115. pixeltable/utils/__init__.py +17 -46
  116. pixeltable/utils/arrow.py +98 -0
  117. pixeltable/utils/clip.py +12 -15
  118. pixeltable/utils/coco.py +136 -0
  119. pixeltable/utils/documents.py +39 -0
  120. pixeltable/utils/filecache.py +195 -0
  121. pixeltable/utils/help.py +11 -0
  122. pixeltable/utils/hf_datasets.py +157 -0
  123. pixeltable/utils/media_store.py +76 -0
  124. pixeltable/utils/parquet.py +167 -0
  125. pixeltable/utils/pytorch.py +91 -0
  126. pixeltable/utils/s3.py +13 -0
  127. pixeltable/utils/sql.py +17 -0
  128. pixeltable/utils/transactional_directory.py +35 -0
  129. pixeltable-0.2.4.dist-info/LICENSE +18 -0
  130. pixeltable-0.2.4.dist-info/METADATA +127 -0
  131. pixeltable-0.2.4.dist-info/RECORD +132 -0
  132. {pixeltable-0.1.0.dist-info → pixeltable-0.2.4.dist-info}/WHEEL +1 -1
  133. pixeltable/catalog.py +0 -1421
  134. pixeltable/exprs.py +0 -1745
  135. pixeltable/function.py +0 -269
  136. pixeltable/functions/clip.py +0 -10
  137. pixeltable/functions/pil/__init__.py +0 -23
  138. pixeltable/functions/tf.py +0 -21
  139. pixeltable/index.py +0 -57
  140. pixeltable/tests/test_dict.py +0 -24
  141. pixeltable/tests/test_functions.py +0 -11
  142. pixeltable/tests/test_tf.py +0 -69
  143. pixeltable/tf.py +0 -33
  144. pixeltable/utils/tf.py +0 -33
  145. pixeltable/utils/video.py +0 -32
  146. pixeltable-0.1.0.dist-info/METADATA +0 -34
  147. pixeltable-0.1.0.dist-info/RECORD +0 -36
pixeltable/type_system.py CHANGED
@@ -1,15 +1,20 @@
1
+ from __future__ import annotations
2
+
1
3
  import abc
2
- from typing import Any, Optional, Tuple, Dict, Callable, List, Union
3
- import enum
4
4
  import datetime
5
+ import enum
5
6
  import json
7
+ import typing
8
+ import urllib.parse
9
+ from pathlib import Path
10
+ from typing import Any, Optional, Tuple, Dict, Callable, List, Union, Sequence, Mapping
6
11
 
7
- import os
8
- os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
9
- import tensorflow as tf
10
12
  import PIL.Image
13
+ import av
14
+ import numpy as np
11
15
  import sqlalchemy as sql
12
16
 
17
+ from pixeltable import exceptions as excs
13
18
 
14
19
 
15
20
  class ColumnType:
@@ -24,25 +29,16 @@ class ColumnType:
24
29
  ARRAY = 6
25
30
  IMAGE = 7
26
31
  VIDEO = 8
32
+ AUDIO = 9
33
+ DOCUMENT = 10
27
34
 
28
35
  # exprs that don't evaluate to a computable value in Pixeltable, such as an Image member function
29
- INVALID = 9
30
-
31
- def to_tf(self) -> tf.dtypes.DType:
32
- if self == self.STRING:
33
- return tf.string
34
- if self == self.INT:
35
- return tf.int64
36
- if self == self.FLOAT:
37
- return tf.float32
38
- if self == self.BOOL:
39
- return tf.bool
40
- raise TypeError(f'Cannot convert {self} to TensorFlow')
36
+ INVALID = 255
41
37
 
42
38
  @classmethod
43
39
  def supertype(
44
40
  cls, type1: 'Type', type2: 'Type',
45
- # we need to pass this in because we can't easily add it as a class member
41
+ # we need to pass this in because we can't easily append it as a class member
46
42
  common_supertypes: Dict[Tuple['Type', 'Type'], 'Type']
47
43
  ) -> Optional['Type']:
48
44
  if type1 == type2:
@@ -82,8 +78,9 @@ class ColumnType:
82
78
  (Type.INT, Type.FLOAT): Type.FLOAT,
83
79
  }
84
80
 
85
- def __init__(self, t: Type):
81
+ def __init__(self, t: Type, nullable: bool = False):
86
82
  self._type = t
83
+ self.nullable = nullable
87
84
 
88
85
  @property
89
86
  def type_enum(self) -> Type:
@@ -93,7 +90,7 @@ class ColumnType:
93
90
  return json.dumps(self.as_dict())
94
91
 
95
92
  @classmethod
96
- def serialize_list(cls, type_list: List['ColumnType']) -> str:
93
+ def serialize_list(cls, type_list: List[ColumnType]) -> str:
97
94
  return json.dumps([t.as_dict() for t in type_list])
98
95
 
99
96
  def as_dict(self) -> Dict:
@@ -103,33 +100,34 @@ class ColumnType:
103
100
  }
104
101
 
105
102
  def _as_dict(self) -> Dict:
106
- return {}
103
+ return {'nullable': self.nullable}
107
104
 
108
105
  @classmethod
109
- def deserialize(cls, type_str: str) -> 'ColumnType':
106
+ def deserialize(cls, type_str: str) -> ColumnType:
110
107
  type_dict = json.loads(type_str)
111
108
  return cls.from_dict(type_dict)
112
109
 
113
110
  @classmethod
114
- def deserialize_list(cls, type_list_str: str) -> List['ColumnType']:
111
+ def deserialize_list(cls, type_list_str: str) -> List[ColumnType]:
115
112
  type_dict_list = json.loads(type_list_str)
116
113
  return [cls.from_dict(type_dict) for type_dict in type_dict_list]
117
114
 
118
115
  @classmethod
119
- def from_dict(cls, type_dict: Dict) -> 'ColumnType':
116
+ def from_dict(cls, type_dict: Dict) -> ColumnType:
120
117
  assert '_classname' in type_dict
121
118
  type_class = globals()[type_dict['_classname']]
122
119
  return type_class._from_dict(type_dict)
123
120
 
124
121
  @classmethod
125
- def _from_dict(cls, d: Dict) -> 'ColumnType':
122
+ def _from_dict(cls, d: Dict) -> ColumnType:
126
123
  """
127
- Default implementation: simply invoke c'tor without arguments
124
+ Default implementation: simply invoke c'tor
128
125
  """
129
- return cls()
126
+ assert 'nullable' in d
127
+ return cls(nullable=d['nullable'])
130
128
 
131
129
  @classmethod
132
- def make_type(cls, t: Type) -> 'ColumnType':
130
+ def make_type(cls, t: Type) -> ColumnType:
133
131
  assert t != cls.Type.INVALID and t != cls.Type.ARRAY
134
132
  if t == cls.Type.STRING:
135
133
  return StringType()
@@ -147,21 +145,44 @@ class ColumnType:
147
145
  return ImageType()
148
146
  if t == cls.Type.VIDEO:
149
147
  return VideoType()
148
+ if t == cls.Type.AUDIO:
149
+ return AudioType()
150
+ if t == cls.Type.DOCUMENT:
151
+ return AudioType()
150
152
 
151
153
  def __str__(self) -> str:
152
154
  return self._type.name.lower()
153
155
 
154
156
  def __eq__(self, other: object) -> bool:
157
+ return self.matches(other) and self.nullable == other.nullable
158
+
159
+ def is_supertype_of(self, other: ColumnType) -> bool:
160
+ if type(self) != type(other):
161
+ return False
162
+ if self.matches(other):
163
+ return True
164
+ return self._is_supertype_of(other)
165
+
166
+ @abc.abstractmethod
167
+ def _is_supertype_of(self, other: ColumnType) -> bool:
168
+ return False
169
+
170
+ def matches(self, other: object) -> bool:
171
+ """Two types match if they're equal, aside from nullability"""
172
+ if not isinstance(other, ColumnType):
173
+ pass
155
174
  assert isinstance(other, ColumnType)
156
- if False and type(self) != type(other):
175
+ if type(self) != type(other):
157
176
  return False
158
177
  for member_var in vars(self).keys():
178
+ if member_var == 'nullable':
179
+ continue
159
180
  if getattr(self, member_var) != getattr(other, member_var):
160
181
  return False
161
182
  return True
162
183
 
163
184
  @classmethod
164
- def supertype(cls, type1: 'ColumnType', type2: 'ColumnType') -> Optional['ColumnType']:
185
+ def supertype(cls, type1: ColumnType, type2: ColumnType) -> Optional[ColumnType]:
165
186
  if type1 == type2:
166
187
  return type1
167
188
 
@@ -183,16 +204,15 @@ class ColumnType:
183
204
 
184
205
  @classmethod
185
206
  @abc.abstractmethod
186
- def _supertype(cls, type1: 'ColumnType', type2: 'ColumnType') -> Optional['ColumnType']:
207
+ def _supertype(cls, type1: ColumnType, type2: ColumnType) -> Optional[ColumnType]:
187
208
  """
188
209
  Class-specific implementation of determining the supertype. type1 and type2 are from the same subclass of
189
210
  ColumnType.
190
211
  """
191
212
  pass
192
213
 
193
-
194
214
  @classmethod
195
- def get_value_type(cls, val: Any) -> 'ColumnType':
215
+ def infer_literal_type(cls, val: Any) -> Optional[ColumnType]:
196
216
  if isinstance(val, str):
197
217
  return StringType()
198
218
  if isinstance(val, int):
@@ -203,6 +223,104 @@ class ColumnType:
203
223
  return BoolType()
204
224
  if isinstance(val, datetime.datetime) or isinstance(val, datetime.date):
205
225
  return TimestampType()
226
+ if isinstance(val, np.ndarray):
227
+ col_type = ArrayType.from_literal(val)
228
+ if col_type is not None:
229
+ return col_type
230
+ # this could still be json-serializable
231
+ if isinstance(val, dict) or isinstance(val, np.ndarray):
232
+ try:
233
+ JsonType().validate_literal(val)
234
+ return JsonType()
235
+ except TypeError:
236
+ return None
237
+ return None
238
+
239
+
240
+ @classmethod
241
+ def from_python_type(cls, t: type) -> Optional[ColumnType]:
242
+ if typing.get_origin(t) is typing.Union:
243
+ union_args = typing.get_args(t)
244
+ if union_args[1] is type(None):
245
+ # `t` is a type of the form Optional[T] (equivalently, Union[T, None]).
246
+ # We treat it as the underlying type but with nullable=True.
247
+ underlying = cls.from_python_type(union_args[0])
248
+ if underlying is not None:
249
+ underlying.nullable = True
250
+ return underlying
251
+ else:
252
+ # Discard type parameters to ensure that parameterized types such as `list[T]`
253
+ # are correctly mapped to Pixeltable types.
254
+ base = typing.get_origin(t)
255
+ if base is None:
256
+ # No type parameters; the base type is just `t` itself
257
+ base = t
258
+ if base is str:
259
+ return StringType()
260
+ if base is int:
261
+ return IntType()
262
+ if base is float:
263
+ return FloatType()
264
+ if base is bool:
265
+ return BoolType()
266
+ if base is datetime.date or base is datetime.datetime:
267
+ return TimestampType()
268
+ if issubclass(base, Sequence) or issubclass(base, Mapping):
269
+ return JsonType()
270
+ if issubclass(base, PIL.Image.Image):
271
+ return ImageType()
272
+ return None
273
+
274
+ def validate_literal(self, val: Any) -> None:
275
+ """Raise TypeError if val is not a valid literal for this type"""
276
+ if val is None:
277
+ if not self.nullable:
278
+ raise TypeError('Expected non-None value')
279
+ else:
280
+ return
281
+ self._validate_literal(val)
282
+
283
+ def validate_media(self, val: Any) -> None:
284
+ """
285
+ Raise TypeError if val is not a path to a valid media file (or a valid in-memory byte sequence) for this type
286
+ """
287
+ if self.is_media_type():
288
+ raise NotImplementedError(f'validate_media() not implemented for {self.__class__.__name__}')
289
+
290
+ def _validate_file_path(self, val: Any) -> None:
291
+ """Raises TypeError if not a valid local file path or not a path/byte sequence"""
292
+ if isinstance(val, str):
293
+ parsed = urllib.parse.urlparse(val)
294
+ if parsed.scheme != '' and parsed.scheme != 'file':
295
+ return
296
+ path = Path(urllib.parse.unquote(parsed.path))
297
+ if not path.is_file():
298
+ raise TypeError(f'File not found: {str(path)}')
299
+ else:
300
+ if not isinstance(val, bytes):
301
+ raise TypeError(f'expected file path or bytes, got {type(val)}')
302
+
303
+ @abc.abstractmethod
304
+ def _validate_literal(self, val: Any) -> None:
305
+ """Raise TypeError if val is not a valid literal for this type"""
306
+ pass
307
+
308
+ @abc.abstractmethod
309
+ def _create_literal(self, val : Any) -> Any:
310
+ """Create a literal of this type from val, including any needed conversions.
311
+ val is guaranteed to be non-None"""
312
+ return val
313
+
314
+ def create_literal(self, val: Any) -> Any:
315
+ """Create a literal of this type from val or raise TypeError if not possible"""
316
+ if val is not None:
317
+ val = self._create_literal(val)
318
+
319
+ self.validate_literal(val)
320
+ return val
321
+
322
+ def print_value(self, val: Any) -> str:
323
+ return str(val)
206
324
 
207
325
  def is_scalar_type(self) -> bool:
208
326
  return self._type in self.scalar_types
@@ -240,6 +358,16 @@ class ColumnType:
240
358
  def is_video_type(self) -> bool:
241
359
  return self._type == self.Type.VIDEO
242
360
 
361
+ def is_audio_type(self) -> bool:
362
+ return self._type == self.Type.AUDIO
363
+
364
+ def is_document_type(self) -> bool:
365
+ return self._type == self.Type.DOCUMENT
366
+
367
+ def is_media_type(self) -> bool:
368
+ # types that refer to external media files
369
+ return self.is_image_type() or self.is_video_type() or self.is_audio_type() or self.is_document_type()
370
+
243
371
  @abc.abstractmethod
244
372
  def to_sql(self) -> str:
245
373
  """
@@ -281,21 +409,17 @@ class ColumnType:
281
409
  """
282
410
  assert False
283
411
 
284
- def conversion_fn(self, target: 'ColumnType') -> Optional[Callable[[Any], Any]]:
412
+ def conversion_fn(self, target: ColumnType) -> Optional[Callable[[Any], Any]]:
285
413
  """
286
414
  Return Callable that converts a column value of type self to a value of type 'target'.
287
415
  Returns None if conversion isn't possible.
288
416
  """
289
417
  return None
290
418
 
291
- @abc.abstractmethod
292
- def to_tf(self) -> Union[tf.TypeSpec, Dict[str, tf.TypeSpec]]:
293
- pass
294
-
295
419
 
296
420
  class InvalidType(ColumnType):
297
- def __init__(self):
298
- super().__init__(self.Type.INVALID)
421
+ def __init__(self, nullable: bool = False):
422
+ super().__init__(self.Type.INVALID, nullable=nullable)
299
423
 
300
424
  def to_sql(self) -> str:
301
425
  assert False
@@ -303,13 +427,15 @@ class InvalidType(ColumnType):
303
427
  def to_sa_type(self) -> Any:
304
428
  assert False
305
429
 
306
- def to_tf(self) -> Union[tf.TypeSpec, Dict[str, tf.TypeSpec]]:
307
- raise TypeError(f'Invalid type cannot be converted to Tensorflow')
430
+ def print_value(self, val: Any) -> str:
431
+ assert False
308
432
 
433
+ def _validate_literal(self, val: Any) -> None:
434
+ assert False
309
435
 
310
436
  class StringType(ColumnType):
311
- def __init__(self):
312
- super().__init__(self.Type.STRING)
437
+ def __init__(self, nullable: bool = False):
438
+ super().__init__(self.Type.STRING, nullable=nullable)
313
439
 
314
440
  def conversion_fn(self, target: ColumnType) -> Optional[Callable[[Any], Any]]:
315
441
  if not target.is_timestamp_type():
@@ -328,28 +454,39 @@ class StringType(ColumnType):
328
454
  def to_sa_type(self) -> str:
329
455
  return sql.String
330
456
 
331
- def to_tf(self) -> Union[tf.TypeSpec, Dict[str, tf.TypeSpec]]:
332
- return tf.TensorSpec(shape=(), dtype=tf.string)
457
+ def print_value(self, val: Any) -> str:
458
+ return f"'{val}'"
459
+
460
+ def _validate_literal(self, val: Any) -> None:
461
+ if not isinstance(val, str):
462
+ raise TypeError(f'Expected string, got {val.__class__.__name__}')
333
463
 
464
+ def _create_literal(self, val: Any) -> Any:
465
+ # Replace null byte within python string with space to avoid issues with Postgres.
466
+ # Use a space to avoid merging words.
467
+ # TODO(orm): this will also be an issue with JSON inputs, would space still be a good replacement?
468
+ if isinstance(val, str) and '\x00' in val:
469
+ return val.replace('\x00', ' ')
470
+ return val
334
471
 
335
472
  class IntType(ColumnType):
336
- def __init__(self):
337
- super().__init__(self.Type.INT)
473
+ def __init__(self, nullable: bool = False):
474
+ super().__init__(self.Type.INT, nullable=nullable)
338
475
 
339
476
  def to_sql(self) -> str:
340
- return 'INTEGER'
477
+ return 'BIGINT'
341
478
 
342
479
  def to_sa_type(self) -> str:
343
- return sql.Integer
480
+ return sql.BigInteger
344
481
 
345
- def to_tf(self) -> Union[tf.TypeSpec, Dict[str, tf.TypeSpec]]:
346
- # TODO: how to specify the correct int subtype?
347
- return tf.TensorSpec(shape=(), dtype=tf.int64)
482
+ def _validate_literal(self, val: Any) -> None:
483
+ if not isinstance(val, int):
484
+ raise TypeError(f'Expected int, got {val.__class__.__name__}')
348
485
 
349
486
 
350
487
  class FloatType(ColumnType):
351
- def __init__(self):
352
- super().__init__(self.Type.FLOAT)
488
+ def __init__(self, nullable: bool = False):
489
+ super().__init__(self.Type.FLOAT, nullable=nullable)
353
490
 
354
491
  def to_sql(self) -> str:
355
492
  return 'FLOAT'
@@ -357,14 +494,18 @@ class FloatType(ColumnType):
357
494
  def to_sa_type(self) -> str:
358
495
  return sql.Float
359
496
 
360
- def to_tf(self) -> Union[tf.TypeSpec, Dict[str, tf.TypeSpec]]:
361
- # TODO: how to specify the correct float subtype?
362
- return tf.TensorSpec(shape=(), dtype=tf.float32)
497
+ def _validate_literal(self, val: Any) -> None:
498
+ if not isinstance(val, float):
499
+ raise TypeError(f'Expected float, got {val.__class__.__name__}')
363
500
 
501
+ def _create_literal(self, val: Any) -> Any:
502
+ if isinstance(val, int):
503
+ return float(val)
504
+ return val
364
505
 
365
506
  class BoolType(ColumnType):
366
- def __init__(self):
367
- super().__init__(self.Type.BOOL)
507
+ def __init__(self, nullable: bool = False):
508
+ super().__init__(self.Type.BOOL, nullable=nullable)
368
509
 
369
510
  def to_sql(self) -> str:
370
511
  return 'BOOLEAN'
@@ -372,14 +513,18 @@ class BoolType(ColumnType):
372
513
  def to_sa_type(self) -> str:
373
514
  return sql.Boolean
374
515
 
375
- def to_tf(self) -> Union[tf.TypeSpec, Dict[str, tf.TypeSpec]]:
376
- # TODO: how to specify the correct int subtype?
377
- return tf.TensorSpec(shape=(), dtype=tf.bool)
516
+ def _validate_literal(self, val: Any) -> None:
517
+ if not isinstance(val, bool):
518
+ raise TypeError(f'Expected bool, got {val.__class__.__name__}')
378
519
 
520
+ def _create_literal(self, val: Any) -> Any:
521
+ if isinstance(val, int):
522
+ return bool(val)
523
+ return val
379
524
 
380
525
  class TimestampType(ColumnType):
381
- def __init__(self):
382
- super().__init__(self.Type.TIMESTAMP)
526
+ def __init__(self, nullable: bool = False):
527
+ super().__init__(self.Type.TIMESTAMP, nullable=nullable)
383
528
 
384
529
  def to_sql(self) -> str:
385
530
  return 'INTEGER'
@@ -387,14 +532,19 @@ class TimestampType(ColumnType):
387
532
  def to_sa_type(self) -> str:
388
533
  return sql.TIMESTAMP
389
534
 
390
- def to_tf(self) -> Union[tf.TypeSpec, Dict[str, tf.TypeSpec]]:
391
- raise TypeError(f'Timestamp type cannot be converted to Tensorflow')
535
+ def _validate_literal(self, val: Any) -> None:
536
+ if not isinstance(val, datetime.datetime) and not isinstance(val, datetime.date):
537
+ raise TypeError(f'Expected datetime.datetime or datetime.date, got {val.__class__.__name__}')
392
538
 
539
+ def _create_literal(self, val: Any) -> Any:
540
+ if isinstance(val, str):
541
+ return datetime.datetime.fromisoformat(val)
542
+ return val
393
543
 
394
544
  class JsonType(ColumnType):
395
545
  # TODO: type_spec also needs to be able to express lists
396
- def __init__(self, type_spec: Optional[Dict[str, ColumnType]] = None):
397
- super().__init__(self.Type.JSON)
546
+ def __init__(self, type_spec: Optional[Dict[str, ColumnType]] = None, nullable: bool = False):
547
+ super().__init__(self.Type.JSON, nullable=nullable)
398
548
  self.type_spec = type_spec
399
549
 
400
550
  def _as_dict(self) -> Dict:
@@ -405,13 +555,13 @@ class JsonType(ColumnType):
405
555
  return result
406
556
 
407
557
  @classmethod
408
- def _from_dict(cls, d: Dict) -> 'ColumnType':
558
+ def _from_dict(cls, d: Dict) -> ColumnType:
409
559
  type_spec = None
410
560
  if 'type_spec' in d:
411
561
  type_spec = {
412
562
  field_name: cls.deserialize(field_type_dict) for field_name, field_type_dict in d['type_spec'].items()
413
563
  }
414
- return cls(type_spec)
564
+ return cls(type_spec, nullable=d['nullable'])
415
565
 
416
566
  def to_sql(self) -> str:
417
567
  return 'JSONB'
@@ -419,20 +569,34 @@ class JsonType(ColumnType):
419
569
  def to_sa_type(self) -> str:
420
570
  return sql.dialects.postgresql.JSONB
421
571
 
422
- def to_tf(self) -> Union[tf.TypeSpec, Dict[str, tf.TypeSpec]]:
423
- if self.type_spec is None:
424
- raise TypeError(f'Cannot convert {self.__class__.__name__} with missing type spec to TensorFlow')
425
- return {k: v.to_tf() for k, v in self.type_spec.items()}
426
-
572
+ def print_value(self, val: Any) -> str:
573
+ val_type = self.infer_literal_type(val)
574
+ if val_type == self:
575
+ return str(val)
576
+ return val_type.print_value(val)
577
+
578
+ def _validate_literal(self, val: Any) -> None:
579
+ if not isinstance(val, dict) and not isinstance(val, list):
580
+ raise TypeError(f'Expected dict or list, got {val.__class__.__name__}')
581
+ try:
582
+ _ = json.dumps(val)
583
+ except TypeError as e:
584
+ raise TypeError(f'Expected JSON-serializable object, got {val}')
585
+
586
+ def _create_literal(self, val: Any) -> Any:
587
+ if isinstance(val, tuple):
588
+ val = list(val)
589
+ return val
427
590
 
428
591
  class ArrayType(ColumnType):
429
592
  def __init__(
430
- self, shape: Tuple[Union[int, None], ...], dtype: ColumnType.Type):
431
- super().__init__(self.Type.ARRAY)
593
+ self, shape: Tuple[Union[int, None], ...], dtype: ColumnType, nullable: bool = False):
594
+ super().__init__(self.Type.ARRAY, nullable=nullable)
432
595
  self.shape = shape
433
- self.dtype = dtype
596
+ assert dtype.is_int_type() or dtype.is_float_type() or dtype.is_bool_type() or dtype.is_string_type()
597
+ self.dtype = dtype._type
434
598
 
435
- def _supertype(cls, type1: 'ArrayType', type2: 'ArrayType') -> Optional['ArrayType']:
599
+ def _supertype(cls, type1: ArrayType, type2: ArrayType) -> Optional[ArrayType]:
436
600
  if len(type1.shape) != len(type2.shape):
437
601
  return None
438
602
  base_type = ColumnType.supertype(type1.dtype, type2.dtype)
@@ -447,53 +611,90 @@ class ArrayType(ColumnType):
447
611
  return result
448
612
 
449
613
  def __str__(self) -> str:
450
- return f'{self.__class__.__name__}({self.shape}, dtype={self.dtype.name})'
614
+ return f'{self._type.name.lower()}({self.shape}, dtype={self.dtype.name})'
451
615
 
452
616
  @classmethod
453
- def _from_dict(cls, d: Dict) -> 'ColumnType':
617
+ def _from_dict(cls, d: Dict) -> ColumnType:
454
618
  assert 'shape' in d
455
619
  assert 'dtype' in d
456
620
  shape = tuple(d['shape'])
457
- dtype = cls.Type(d['dtype'])
458
- return cls(shape, dtype)
621
+ dtype = cls.make_type(cls.Type(d['dtype']))
622
+ return cls(shape, dtype, nullable=d['nullable'])
623
+
624
+ @classmethod
625
+ def from_literal(cls, val: np.ndarray) -> Optional[ArrayType]:
626
+ # determine our dtype
627
+ assert isinstance(val, np.ndarray)
628
+ if np.issubdtype(val.dtype, np.integer):
629
+ dtype = IntType()
630
+ elif np.issubdtype(val.dtype, np.floating):
631
+ dtype = FloatType()
632
+ elif val.dtype == np.bool_:
633
+ dtype = BoolType()
634
+ elif val.dtype == np.str_:
635
+ dtype = StringType()
636
+ else:
637
+ return None
638
+ return cls(val.shape, dtype=dtype, nullable=True)
639
+
640
+ def is_valid_literal(self, val: np.ndarray) -> bool:
641
+ if not isinstance(val, np.ndarray):
642
+ return False
643
+ if len(val.shape) != len(self.shape):
644
+ return False
645
+ # check that the shapes are compatible
646
+ for n1, n2 in zip(val.shape, self.shape):
647
+ if n1 is None:
648
+ return False
649
+ if n2 is None:
650
+ # wildcard
651
+ continue
652
+ if n1 != n2:
653
+ return False
654
+ return val.dtype == self.numpy_dtype()
655
+
656
+ def _validate_literal(self, val: Any) -> None:
657
+ if not isinstance(val, np.ndarray):
658
+ raise TypeError(f'Expected numpy.ndarray, got {val.__class__.__name__}')
659
+ if not self.is_valid_literal(val):
660
+ raise TypeError((
661
+ f'Expected ndarray({self.shape}, dtype={self.numpy_dtype()}), '
662
+ f'got ndarray({val.shape}, dtype={val.dtype})'))
663
+
664
+ def _create_literal(self, val: Any) -> Any:
665
+ if isinstance(val, (list,tuple)):
666
+ # map python float to whichever numpy float is
667
+ # declared for this type, rather than assume float64
668
+ return np.array(val, dtype=self.numpy_dtype())
669
+ return val
459
670
 
460
671
  def to_sql(self) -> str:
461
672
  return 'BYTEA'
462
673
 
463
674
  def to_sa_type(self) -> str:
464
- return sql.VARBINARY
465
-
466
- def to_tf(self) -> Union[tf.TypeSpec, Dict[str, tf.TypeSpec]]:
467
- return tf.TensorSpec(shape=self.shape, dtype=self.dtype.to_tf())
675
+ return sql.LargeBinary
676
+
677
+ def numpy_dtype(self) -> np.dtype:
678
+ if self.dtype == self.Type.INT:
679
+ return np.dtype(np.int64)
680
+ if self.dtype == self.Type.FLOAT:
681
+ return np.dtype(np.float32)
682
+ if self.dtype == self.Type.BOOL:
683
+ return np.dtype(np.bool_)
684
+ if self.dtype == self.Type.STRING:
685
+ return np.dtype(np.str_)
686
+ assert False
468
687
 
469
688
 
470
689
  class ImageType(ColumnType):
471
- @enum.unique
472
- class Mode(enum.Enum):
473
- L = 0,
474
- RGB = 1
475
-
476
- @classmethod
477
- def from_pil(cls, pil_mode: str) -> 'Mode':
478
- if pil_mode == 'L':
479
- return cls.L
480
- if pil_mode == 'RGB':
481
- return cls.RGB
482
-
483
- def to_pil(self) -> str:
484
- return self.name
485
-
486
- def num_channels(self) -> int:
487
- return len(self.name)
488
-
489
690
  def __init__(
490
691
  self, width: Optional[int] = None, height: Optional[int] = None, size: Optional[Tuple[int, int]] = None,
491
- mode: Optional[Mode] = None
692
+ mode: Optional[str] = None, nullable: bool = False
492
693
  ):
493
694
  """
494
695
  TODO: does it make sense to specify only width or height?
495
696
  """
496
- super().__init__(self.Type.IMAGE)
697
+ super().__init__(self.Type.IMAGE, nullable=nullable)
497
698
  assert not(width is not None and size is not None)
498
699
  assert not(height is not None and size is not None)
499
700
  if size is not None:
@@ -504,22 +705,53 @@ class ImageType(ColumnType):
504
705
  self.height = height
505
706
  self.mode = mode
506
707
 
708
def __str__(self) -> str:
    """Return the lower-cased type name followed by the parameters that are set.

    Only width, height and mode values that are not None are listed, in that
    order, e.g. ``image(width=10, mode=RGB)``. If none of them is set the
    result is just the type name.
    """
    labelled = (('width', self.width), ('height', self.height), ('mode', self.mode))
    parts = [f'{label}={value}' for label, value in labelled if value is not None]
    suffix = f'({", ".join(parts)})' if parts else ''
    return f'{self._type.name.lower()}{suffix}'
725
+
726
def _is_supertype_of(self, other: ImageType) -> bool:
    """Return True if this image type is a supertype of `other`.

    The modes must be equal. If this type sets neither width nor height, any
    dimensions of `other` are accepted; otherwise both width and height must
    equal those of `other`.

    Args:
        other: the image type to compare against.

    Returns:
        True or False; always an explicit bool.
    """
    if self.mode != other.mode:
        return False
    if self.width is None and self.height is None:
        # this type places no constraint on the image size
        return True
    # a mismatch in either dimension disqualifies; matching dimensions must
    # yield True rather than falling off the end of the function
    return self.width == other.width and self.height == other.height
733
+
734
@property
def size(self) -> Optional[Tuple[int, int]]:
    """(width, height) when both dimensions are set, otherwise None."""
    if self.width is not None and self.height is not None:
        return (self.width, self.height)
    return None
739
+
507
740
@property
def num_channels(self) -> Optional[int]:
    """Number of bands of this type's image mode, or None if no mode is set.

    NOTE(review): assumes `mode` is a PIL mode name (e.g. 'L', 'RGB'), as the
    `Optional[str]` constructor parameter suggests -- confirm against callers.
    """
    if self.mode is None:
        return None
    # mode is a plain string and has no num_channels() method of its own;
    # let PIL report the band count for the mode name
    return PIL.Image.getmodebands(self.mode)
510
743
 
511
744
def _as_dict(self) -> Dict:
    """Return the superclass dict extended with width, height and mode."""
    serialized = super()._as_dict()
    serialized.update({'width': self.width, 'height': self.height, 'mode': self.mode})
    return serialized
515
748
 
516
749
@classmethod
def _from_dict(cls, d: Dict) -> ColumnType:
    """Build an instance from a dict with width, height, mode and nullable entries.

    The presence of 'width', 'height' and 'mode' is asserted; 'nullable' is
    read directly.
    """
    for required_key in ('width', 'height', 'mode'):
        assert required_key in d
    ctor_args = {key: d[key] for key in ('width', 'height', 'mode', 'nullable')}
    return cls(**ctor_args)
523
755
 
524
756
  def conversion_fn(self, target: ColumnType) -> Optional[Callable[[Any], Any]]:
525
757
  if not target.is_image_type():
@@ -547,21 +779,55 @@ class ImageType(ColumnType):
547
779
def to_sa_type(self) -> str:
    """Return the type object used for this column type, `sql.String`.

    NOTE(review): `sql` is presumably the file's SQLAlchemy import, in which
    case the `str` return annotation does not match the returned type class.
    """
    sa_type = sql.String
    return sa_type
549
781
 
550
- def to_tf(self) -> Union[tf.TypeSpec, Dict[str, tf.TypeSpec]]:
551
- return tf.TensorSpec(shape=(self.height, self.width, self.num_channels), dtype=tf.uint8)
782
def _validate_literal(self, val: Any) -> None:
    """Accept PIL image objects as-is; check any other value as a file path."""
    if not isinstance(val, PIL.Image.Image):
        self._validate_file_path(val)
552
786
 
787
def validate_media(self, val: Any) -> None:
    """Check that PIL can identify the file at path `val` as an image.

    Args:
        val: path of the file to check; must be a str.

    Raises:
        excs.Error: if PIL raises UnidentifiedImageError for the file.
    """
    assert isinstance(val, str)
    try:
        # Image.open() leaves the underlying file open; we only need to know
        # whether it can be identified, so close it again right away
        with PIL.Image.open(val):
            pass
    except PIL.UnidentifiedImageError:
        raise excs.Error(f'Not a valid image: {val}') from None
553
793
 
554
794
class VideoType(ColumnType):
    """Column type for videos; per to_sql(), values are stored as file paths."""

    def __init__(self, nullable: bool = False):
        type_tag = self.Type.VIDEO
        super().__init__(type_tag, nullable=nullable)

    def to_sql(self) -> str:
        """Return the SQL type name for this column type."""
        # stored as a file path
        return 'VARCHAR'

    def to_sa_type(self) -> str:
        """Return the type object used for this column type, `sql.String`."""
        return sql.String

    def _validate_literal(self, val: Any) -> None:
        """Check the literal with the file-path validator."""
        self._validate_file_path(val)

    def validate_media(self, val: Any) -> None:
        """Check that the file at path `val` opens as a video and decodes.

        Raises:
            excs.Error: if the file has no video stream, yields fewer than two
                decoded frames, or av raises AVError.
        """
        assert isinstance(val, str)
        invalid_msg = f'Not a valid video: {val}'
        try:
            with av.open(val, 'r') as container:
                if len(container.streams.video) == 0:
                    raise excs.Error(invalid_msg)
                # decode a few frames (at most 10) to make sure it's playable
                # TODO: decode all frames? but that's very slow
                frames_seen = 0
                for frames_seen, frame in enumerate(container.decode(video=0), start=1):
                    frame.to_image()
                    if frames_seen == 10:
                        break
                if frames_seen < 2:
                    # this is most likely an image file
                    raise excs.Error(invalid_msg)
        except av.AVError:
            raise excs.Error(invalid_msg) from None
827
+
828
+ class AudioType(ColumnType):
829
def __init__(self, nullable: bool = False):
    """Initialize the audio column type.

    Args:
        nullable: passed through to the base-class initializer.
    """
    type_tag = self.Type.AUDIO
    super().__init__(type_tag, nullable=nullable)
565
831
 
566
832
  def to_sql(self) -> str:
567
833
  # stored as a file path
@@ -570,5 +836,60 @@ class VideoType(ColumnType):
570
836
def to_sa_type(self) -> str:
    """Return the type object used for this column type, `sql.String`.

    NOTE(review): `sql` is presumably the file's SQLAlchemy import, in which
    case the `str` return annotation does not match the returned type class.
    """
    sa_type = sql.String
    return sa_type
572
838
 
573
- def to_tf(self) -> Union[tf.TypeSpec, Dict[str, tf.TypeSpec]]:
574
- assert False
839
def _validate_literal(self, val: Any) -> None:
    """Check the literal with the file-path validator."""
    check_path = self._validate_file_path
    check_path(val)
841
+
842
def validate_media(self, val: Any) -> None:
    """Check that the file `val` has an audio stream that decodes fully.

    Raises:
        excs.Error: if the file has no audio stream, or av raises AVError
            while opening or decoding it.
    """
    try:
        with av.open(val) as container:
            audio_streams = container.streams.audio
            if len(audio_streams) == 0:
                raise excs.Error(f'No audio stream in file: {val}')
            first_stream = audio_streams[0]

            # decode everything to make sure it's playable
            # TODO: is there some way to verify it's a playable audio file other than decoding all of it?
            for packet in container.demux(first_stream):
                for _frame in packet.decode():
                    pass
    except av.AVError as exc:
        raise excs.Error(f'Not a valid audio file: {val}\n{exc}') from None
856
+
857
class DocumentType(ColumnType):
    """Column type for documents; per to_sql(), values are stored as file paths."""

    @enum.unique
    class DocumentFormat(enum.Enum):
        HTML = 0
        MD = 1
        PDF = 2

    def __init__(self, nullable: bool = False, doc_formats: Optional[str] = None):
        """
        Args:
            nullable: passed through to the base-class initializer.
            doc_formats: comma-separated DocumentFormat names; matching is
                case-insensitive and ignores surrounding whitespace. None
                selects every format.

        Raises:
            ValueError: if doc_formats contains a name that is not a
                DocumentFormat member.
        """
        super().__init__(self.Type.DOCUMENT, nullable=nullable)
        if doc_formats is None:
            self._doc_formats = list(self.DocumentFormat)
            return
        members = self.DocumentFormat.__members__
        formats = []
        for type_str in doc_formats.split(','):
            # validate the same normalized key that is used for the lookup;
            # checking membership (rather than hasattr) also rejects enum
            # attributes that are not formats, such as 'name'
            key = type_str.strip().upper()
            if key not in members:
                raise ValueError(f'Invalid document type: {type_str}')
            formats.append(members[key])
        self._doc_formats = formats

    def to_sql(self) -> str:
        """Return the SQL type name for this column type."""
        # stored as a file path
        return 'VARCHAR'

    def to_sa_type(self) -> str:
        """Return the type object used for this column type, `sql.String`."""
        return sql.String

    def _validate_literal(self, val: Any) -> None:
        """Check the literal with the file-path validator."""
        self._validate_file_path(val)

    def validate_media(self, val: Any) -> None:
        """Check that the file at path `val` is a recognized document.

        Raises:
            excs.Error: if reading the file or get_document_handle() raises,
                or if get_document_handle() returns None.
        """
        assert isinstance(val, str)
        from pixeltable.utils.documents import get_document_handle
        unrecognized_msg = f'Not a recognized document format: {val}'
        # NOTE(review): the file is read as UTF-8 text; a binary format such as
        # PDF would presumably fail to decode and be reported as unrecognized --
        # confirm this is intended
        with open(val, 'r', encoding='utf8') as fh:
            try:
                s = fh.read()
                dh = get_document_handle(s)
            except Exception:
                # any failure to read or parse means we can't treat it as a document
                raise excs.Error(unrecognized_msg) from None
        if dh is None:
            raise excs.Error(unrecognized_msg)