pixeltable 0.1.2__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pixeltable might be problematic.

Files changed (140)
  1. pixeltable/__init__.py +21 -4
  2. pixeltable/catalog/__init__.py +13 -0
  3. pixeltable/catalog/catalog.py +159 -0
  4. pixeltable/catalog/column.py +200 -0
  5. pixeltable/catalog/dir.py +32 -0
  6. pixeltable/catalog/globals.py +33 -0
  7. pixeltable/catalog/insertable_table.py +191 -0
  8. pixeltable/catalog/named_function.py +36 -0
  9. pixeltable/catalog/path.py +58 -0
  10. pixeltable/catalog/path_dict.py +139 -0
  11. pixeltable/catalog/schema_object.py +39 -0
  12. pixeltable/catalog/table.py +581 -0
  13. pixeltable/catalog/table_version.py +749 -0
  14. pixeltable/catalog/table_version_path.py +133 -0
  15. pixeltable/catalog/view.py +203 -0
  16. pixeltable/client.py +520 -31
  17. pixeltable/dataframe.py +540 -349
  18. pixeltable/env.py +373 -48
  19. pixeltable/exceptions.py +12 -21
  20. pixeltable/exec/__init__.py +9 -0
  21. pixeltable/exec/aggregation_node.py +78 -0
  22. pixeltable/exec/cache_prefetch_node.py +113 -0
  23. pixeltable/exec/component_iteration_node.py +79 -0
  24. pixeltable/exec/data_row_batch.py +95 -0
  25. pixeltable/exec/exec_context.py +22 -0
  26. pixeltable/exec/exec_node.py +61 -0
  27. pixeltable/exec/expr_eval_node.py +217 -0
  28. pixeltable/exec/in_memory_data_node.py +69 -0
  29. pixeltable/exec/media_validation_node.py +43 -0
  30. pixeltable/exec/sql_scan_node.py +225 -0
  31. pixeltable/exprs/__init__.py +24 -0
  32. pixeltable/exprs/arithmetic_expr.py +102 -0
  33. pixeltable/exprs/array_slice.py +71 -0
  34. pixeltable/exprs/column_property_ref.py +77 -0
  35. pixeltable/exprs/column_ref.py +105 -0
  36. pixeltable/exprs/comparison.py +77 -0
  37. pixeltable/exprs/compound_predicate.py +98 -0
  38. pixeltable/exprs/data_row.py +187 -0
  39. pixeltable/exprs/expr.py +586 -0
  40. pixeltable/exprs/expr_set.py +39 -0
  41. pixeltable/exprs/function_call.py +380 -0
  42. pixeltable/exprs/globals.py +69 -0
  43. pixeltable/exprs/image_member_access.py +115 -0
  44. pixeltable/exprs/image_similarity_predicate.py +58 -0
  45. pixeltable/exprs/inline_array.py +107 -0
  46. pixeltable/exprs/inline_dict.py +101 -0
  47. pixeltable/exprs/is_null.py +38 -0
  48. pixeltable/exprs/json_mapper.py +121 -0
  49. pixeltable/exprs/json_path.py +159 -0
  50. pixeltable/exprs/literal.py +54 -0
  51. pixeltable/exprs/object_ref.py +41 -0
  52. pixeltable/exprs/predicate.py +44 -0
  53. pixeltable/exprs/row_builder.py +355 -0
  54. pixeltable/exprs/rowid_ref.py +94 -0
  55. pixeltable/exprs/type_cast.py +53 -0
  56. pixeltable/exprs/variable.py +45 -0
  57. pixeltable/func/__init__.py +9 -0
  58. pixeltable/func/aggregate_function.py +194 -0
  59. pixeltable/func/batched_function.py +53 -0
  60. pixeltable/func/callable_function.py +69 -0
  61. pixeltable/func/expr_template_function.py +82 -0
  62. pixeltable/func/function.py +110 -0
  63. pixeltable/func/function_registry.py +227 -0
  64. pixeltable/func/globals.py +36 -0
  65. pixeltable/func/nos_function.py +202 -0
  66. pixeltable/func/signature.py +166 -0
  67. pixeltable/func/udf.py +163 -0
  68. pixeltable/functions/__init__.py +52 -103
  69. pixeltable/functions/eval.py +216 -0
  70. pixeltable/functions/fireworks.py +61 -0
  71. pixeltable/functions/huggingface.py +120 -0
  72. pixeltable/functions/image.py +16 -0
  73. pixeltable/functions/openai.py +88 -0
  74. pixeltable/functions/pil/image.py +148 -7
  75. pixeltable/functions/string.py +13 -0
  76. pixeltable/functions/together.py +27 -0
  77. pixeltable/functions/util.py +41 -0
  78. pixeltable/functions/video.py +62 -0
  79. pixeltable/iterators/__init__.py +3 -0
  80. pixeltable/iterators/base.py +48 -0
  81. pixeltable/iterators/document.py +311 -0
  82. pixeltable/iterators/video.py +89 -0
  83. pixeltable/metadata/__init__.py +54 -0
  84. pixeltable/metadata/converters/convert_10.py +18 -0
  85. pixeltable/metadata/schema.py +211 -0
  86. pixeltable/plan.py +656 -0
  87. pixeltable/store.py +413 -182
  88. pixeltable/tests/conftest.py +143 -86
  89. pixeltable/tests/test_audio.py +65 -0
  90. pixeltable/tests/test_catalog.py +27 -0
  91. pixeltable/tests/test_client.py +14 -14
  92. pixeltable/tests/test_component_view.py +372 -0
  93. pixeltable/tests/test_dataframe.py +433 -0
  94. pixeltable/tests/test_dirs.py +78 -62
  95. pixeltable/tests/test_document.py +117 -0
  96. pixeltable/tests/test_exprs.py +591 -135
  97. pixeltable/tests/test_function.py +297 -67
  98. pixeltable/tests/test_functions.py +283 -1
  99. pixeltable/tests/test_migration.py +43 -0
  100. pixeltable/tests/test_nos.py +54 -0
  101. pixeltable/tests/test_snapshot.py +208 -0
  102. pixeltable/tests/test_table.py +1086 -258
  103. pixeltable/tests/test_transactional_directory.py +42 -0
  104. pixeltable/tests/test_types.py +5 -11
  105. pixeltable/tests/test_video.py +149 -34
  106. pixeltable/tests/test_view.py +530 -0
  107. pixeltable/tests/utils.py +186 -45
  108. pixeltable/tool/create_test_db_dump.py +149 -0
  109. pixeltable/type_system.py +490 -133
  110. pixeltable/utils/__init__.py +17 -46
  111. pixeltable/utils/clip.py +12 -15
  112. pixeltable/utils/coco.py +136 -0
  113. pixeltable/utils/documents.py +39 -0
  114. pixeltable/utils/filecache.py +195 -0
  115. pixeltable/utils/help.py +11 -0
  116. pixeltable/utils/media_store.py +76 -0
  117. pixeltable/utils/parquet.py +126 -0
  118. pixeltable/utils/pytorch.py +172 -0
  119. pixeltable/utils/s3.py +13 -0
  120. pixeltable/utils/sql.py +17 -0
  121. pixeltable/utils/transactional_directory.py +35 -0
  122. pixeltable-0.2.0.dist-info/LICENSE +18 -0
  123. pixeltable-0.2.0.dist-info/METADATA +117 -0
  124. pixeltable-0.2.0.dist-info/RECORD +125 -0
  125. {pixeltable-0.1.2.dist-info → pixeltable-0.2.0.dist-info}/WHEEL +1 -1
  126. pixeltable/catalog.py +0 -1421
  127. pixeltable/exprs.py +0 -1745
  128. pixeltable/function.py +0 -269
  129. pixeltable/functions/clip.py +0 -10
  130. pixeltable/functions/pil/__init__.py +0 -23
  131. pixeltable/functions/tf.py +0 -21
  132. pixeltable/index.py +0 -57
  133. pixeltable/tests/test_dict.py +0 -24
  134. pixeltable/tests/test_tf.py +0 -69
  135. pixeltable/tf.py +0 -33
  136. pixeltable/utils/tf.py +0 -33
  137. pixeltable/utils/video.py +0 -32
  138. pixeltable-0.1.2.dist-info/LICENSE +0 -201
  139. pixeltable-0.1.2.dist-info/METADATA +0 -89
  140. pixeltable-0.1.2.dist-info/RECORD +0 -37
pixeltable/type_system.py CHANGED
@@ -1,15 +1,21 @@
+ from __future__ import annotations
+
  import abc
- from typing import Any, Optional, Tuple, Dict, Callable, List, Union
- import enum
  import datetime
+ import enum
  import json
+ import typing
+ import urllib.parse
+ from copy import copy
+ from pathlib import Path
+ from typing import Any, Optional, Tuple, Dict, Callable, List, Union

- import os
- os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
- #import tensorflow as tf
  import PIL.Image
+ import av
+ import numpy as np
  import sqlalchemy as sql

+ from pixeltable import exceptions as excs


  class ColumnType:
@@ -24,26 +30,16 @@ class ColumnType:
  ARRAY = 6
  IMAGE = 7
  VIDEO = 8
+ AUDIO = 9
+ DOCUMENT = 10

  # exprs that don't evaluate to a computable value in Pixeltable, such as an Image member function
- INVALID = 9
-
- def to_tf(self) -> 'tf.dtypes.DType':
- import tensorflow as tf
- if self == self.STRING:
- return tf.string
- if self == self.INT:
- return tf.int64
- if self == self.FLOAT:
- return tf.float32
- if self == self.BOOL:
- return tf.bool
- raise TypeError(f'Cannot convert {self} to TensorFlow')
+ INVALID = 255

  @classmethod
  def supertype(
  cls, type1: 'Type', type2: 'Type',
- # we need to pass this in because we can't easily add it as a class member
+ # we need to pass this in because we can't easily append it as a class member
  common_supertypes: Dict[Tuple['Type', 'Type'], 'Type']
  ) -> Optional['Type']:
  if type1 == type2:
@@ -83,8 +79,9 @@ class ColumnType:
  (Type.INT, Type.FLOAT): Type.FLOAT,
  }

- def __init__(self, t: Type):
+ def __init__(self, t: Type, nullable: bool = False):
  self._type = t
+ self.nullable = nullable

  @property
  def type_enum(self) -> Type:
@@ -94,7 +91,7 @@ class ColumnType:
  return json.dumps(self.as_dict())

  @classmethod
- def serialize_list(cls, type_list: List['ColumnType']) -> str:
+ def serialize_list(cls, type_list: List[ColumnType]) -> str:
  return json.dumps([t.as_dict() for t in type_list])

  def as_dict(self) -> Dict:
@@ -104,33 +101,34 @@ class ColumnType:
  }

  def _as_dict(self) -> Dict:
- return {}
+ return {'nullable': self.nullable}

  @classmethod
- def deserialize(cls, type_str: str) -> 'ColumnType':
+ def deserialize(cls, type_str: str) -> ColumnType:
  type_dict = json.loads(type_str)
  return cls.from_dict(type_dict)

  @classmethod
- def deserialize_list(cls, type_list_str: str) -> List['ColumnType']:
+ def deserialize_list(cls, type_list_str: str) -> List[ColumnType]:
  type_dict_list = json.loads(type_list_str)
  return [cls.from_dict(type_dict) for type_dict in type_dict_list]

  @classmethod
- def from_dict(cls, type_dict: Dict) -> 'ColumnType':
+ def from_dict(cls, type_dict: Dict) -> ColumnType:
  assert '_classname' in type_dict
  type_class = globals()[type_dict['_classname']]
  return type_class._from_dict(type_dict)

  @classmethod
- def _from_dict(cls, d: Dict) -> 'ColumnType':
+ def _from_dict(cls, d: Dict) -> ColumnType:
  """
- Default implementation: simply invoke c'tor without arguments
+ Default implementation: simply invoke c'tor
  """
- return cls()
+ assert 'nullable' in d
+ return cls(nullable=d['nullable'])

  @classmethod
- def make_type(cls, t: Type) -> 'ColumnType':
+ def make_type(cls, t: Type) -> ColumnType:
  assert t != cls.Type.INVALID and t != cls.Type.ARRAY
  if t == cls.Type.STRING:
  return StringType()
@@ -148,21 +146,44 @@ class ColumnType:
  return ImageType()
  if t == cls.Type.VIDEO:
  return VideoType()
+ if t == cls.Type.AUDIO:
+ return AudioType()
+ if t == cls.Type.DOCUMENT:
+ return AudioType()

  def __str__(self) -> str:
  return self._type.name.lower()

  def __eq__(self, other: object) -> bool:
+ return self.matches(other) and self.nullable == other.nullable
+
+ def is_supertype_of(self, other: ColumnType) -> bool:
+ if type(self) != type(other):
+ return False
+ if self.matches(other):
+ return True
+ return self._is_supertype_of(other)
+
+ @abc.abstractmethod
+ def _is_supertype_of(self, other: ColumnType) -> bool:
+ return False
+
+ def matches(self, other: object) -> bool:
+ """Two types match if they're equal, aside from nullability"""
+ if not isinstance(other, ColumnType):
+ pass
  assert isinstance(other, ColumnType)
- if False and type(self) != type(other):
+ if type(self) != type(other):
  return False
  for member_var in vars(self).keys():
+ if member_var == 'nullable':
+ continue
  if getattr(self, member_var) != getattr(other, member_var):
  return False
  return True

  @classmethod
- def supertype(cls, type1: 'ColumnType', type2: 'ColumnType') -> Optional['ColumnType']:
+ def supertype(cls, type1: ColumnType, type2: ColumnType) -> Optional[ColumnType]:
  if type1 == type2:
  return type1

@@ -184,16 +205,15 @@ class ColumnType:

  @classmethod
  @abc.abstractmethod
- def _supertype(cls, type1: 'ColumnType', type2: 'ColumnType') -> Optional['ColumnType']:
+ def _supertype(cls, type1: ColumnType, type2: ColumnType) -> Optional[ColumnType]:
  """
  Class-specific implementation of determining the supertype. type1 and type2 are from the same subclass of
  ColumnType.
  """
  pass

-
  @classmethod
- def get_value_type(cls, val: Any) -> 'ColumnType':
+ def infer_literal_type(cls, val: Any) -> Optional[ColumnType]:
  if isinstance(val, str):
  return StringType()
  if isinstance(val, int):
@@ -204,6 +224,85 @@ class ColumnType:
  return BoolType()
  if isinstance(val, datetime.datetime) or isinstance(val, datetime.date):
  return TimestampType()
+ if isinstance(val, np.ndarray):
+ col_type = ArrayType.from_literal(val)
+ if col_type is not None:
+ return col_type
+ # this could still be json-serializable
+ if isinstance(val, dict) or isinstance(val, np.ndarray):
+ try:
+ JsonType().validate_literal(val)
+ return JsonType()
+ except TypeError:
+ return None
+ return None
+
+
+ @classmethod
+ def from_python_type(cls, t: type) -> Optional[ColumnType]:
+ if t in _python_type_to_column_type:
+ return _python_type_to_column_type[t]
+ elif isinstance(t, typing._UnionGenericAlias) and t.__args__[1] is type(None):
+ # `t` is a type of the form Optional[T] (equivalently, Union[T, None]).
+ # We treat it as the underlying type but with nullable=True.
+ if t.__args__[0] in _python_type_to_column_type:
+ underlying = copy(_python_type_to_column_type[t.__args__[0]])
+ underlying.nullable = True
+ return underlying
+
+ return None
+
+
+ def validate_literal(self, val: Any) -> None:
+ """Raise TypeError if val is not a valid literal for this type"""
+ if val is None:
+ if not self.nullable:
+ raise TypeError('Expected non-None value')
+ else:
+ return
+ self._validate_literal(val)
+
+ def validate_media(self, val: Any) -> None:
+ """
+ Raise TypeError if val is not a path to a valid media file (or a valid in-memory byte sequence) for this type
+ """
+ if self.is_media_type():
+ raise NotImplementedError(f'validate_media() not implemented for {self.__class__.__name__}')
+
+ def _validate_file_path(self, val: Any) -> None:
+ """Raises TypeError if not a valid local file path or not a path/byte sequence"""
+ if isinstance(val, str):
+ parsed = urllib.parse.urlparse(val)
+ if parsed.scheme != '' and parsed.scheme != 'file':
+ return
+ path = Path(urllib.parse.unquote(parsed.path))
+ if not path.is_file():
+ raise TypeError(f'File not found: {str(path)}')
+ else:
+ if not isinstance(val, bytes):
+ raise TypeError(f'expected file path or bytes, got {type(val)}')
+
+ @abc.abstractmethod
+ def _validate_literal(self, val: Any) -> None:
+ """Raise TypeError if val is not a valid literal for this type"""
+ pass
+
+ @abc.abstractmethod
+ def _create_literal(self, val : Any) -> Any:
+ """Create a literal of this type from val, including any needed conversions.
+ val is guaranteed to be non-None"""
+ return val
+
+ def create_literal(self, val: Any) -> Any:
+ """Create a literal of this type from val or raise TypeError if not possible"""
+ if val is not None:
+ val = self._create_literal(val)
+
+ self.validate_literal(val)
+ return val
+
+ def print_value(self, val: Any) -> str:
+ return str(val)

  def is_scalar_type(self) -> bool:
  return self._type in self.scalar_types
@@ -241,6 +340,16 @@ class ColumnType:
  def is_video_type(self) -> bool:
  return self._type == self.Type.VIDEO

+ def is_audio_type(self) -> bool:
+ return self._type == self.Type.AUDIO
+
+ def is_document_type(self) -> bool:
+ return self._type == self.Type.DOCUMENT
+
+ def is_media_type(self) -> bool:
+ # types that refer to external media files
+ return self.is_image_type() or self.is_video_type() or self.is_audio_type() or self.is_document_type()
+
  @abc.abstractmethod
  def to_sql(self) -> str:
  """
@@ -274,6 +383,10 @@ class ColumnType:
  return sql.VARBINARY
  assert False

+ @abc.abstractmethod
+ def to_arrow_type(self) -> 'pyarrow.DataType':
+ assert False, f'Have not implemented {self.__class__.__name__} to Arrow'
+
  @staticmethod
  def no_conversion(v: Any) -> Any:
  """
@@ -282,21 +395,17 @@ class ColumnType:
  """
  assert False

- def conversion_fn(self, target: 'ColumnType') -> Optional[Callable[[Any], Any]]:
+ def conversion_fn(self, target: ColumnType) -> Optional[Callable[[Any], Any]]:
  """
  Return Callable that converts a column value of type self to a value of type 'target'.
  Returns None if conversion isn't possible.
  """
  return None

- @abc.abstractmethod
- def to_tf(self) -> Union['tf.TypeSpec', Dict[str, 'tf.TypeSpec']]:
- pass
-

  class InvalidType(ColumnType):
- def __init__(self):
- super().__init__(self.Type.INVALID)
+ def __init__(self, nullable: bool = False):
+ super().__init__(self.Type.INVALID, nullable=nullable)

  def to_sql(self) -> str:
  assert False
@@ -304,13 +413,18 @@ class InvalidType(ColumnType):
  def to_sa_type(self) -> Any:
  assert False

- def to_tf(self) -> Union['tf.TypeSpec', Dict[str, 'tf.TypeSpec']]:
- raise TypeError(f'Invalid type cannot be converted to Tensorflow')
+ def to_arrow_type(self) -> 'pyarrow.DataType':
+ assert False

+ def print_value(self, val: Any) -> str:
+ assert False
+
+ def _validate_literal(self, val: Any) -> None:
+ assert False

  class StringType(ColumnType):
- def __init__(self):
- super().__init__(self.Type.STRING)
+ def __init__(self, nullable: bool = False):
+ super().__init__(self.Type.STRING, nullable=nullable)

  def conversion_fn(self, target: ColumnType) -> Optional[Callable[[Any], Any]]:
  if not target.is_timestamp_type():
@@ -328,78 +442,111 @@ class StringType(ColumnType):

  def to_sa_type(self) -> str:
  return sql.String
+
+ def to_arrow_type(self) -> 'pyarrow.DataType':
+ import pyarrow as pa # pylint: disable=import-outside-toplevel
+ return pa.string()

- def to_tf(self) -> Union['tf.TypeSpec', Dict[str, 'tf.TypeSpec']]:
- import tensorflow as tf
- return tf.TensorSpec(shape=(), dtype=tf.string)
+ def print_value(self, val: Any) -> str:
+ return f"'{val}'"
+
+ def _validate_literal(self, val: Any) -> None:
+ if not isinstance(val, str):
+ raise TypeError(f'Expected string, got {val.__class__.__name__}')


  class IntType(ColumnType):
- def __init__(self):
- super().__init__(self.Type.INT)
+ def __init__(self, nullable: bool = False):
+ super().__init__(self.Type.INT, nullable=nullable)

  def to_sql(self) -> str:
- return 'INTEGER'
+ return 'BIGINT'

  def to_sa_type(self) -> str:
- return sql.Integer
+ return sql.BigInteger
+
+ def to_arrow_type(self) -> 'pyarrow.DataType':
+ import pyarrow as pa # pylint: disable=import-outside-toplevel
+ return pa.int64() # to be consistent with bigint above

- def to_tf(self) -> Union['tf.TypeSpec', Dict[str, 'tf.TypeSpec']]:
- # TODO: how to specify the correct int subtype?
- import tensorflow as tf
- return tf.TensorSpec(shape=(), dtype=tf.int64)
+ def _validate_literal(self, val: Any) -> None:
+ if not isinstance(val, int):
+ raise TypeError(f'Expected int, got {val.__class__.__name__}')


  class FloatType(ColumnType):
- def __init__(self):
- super().__init__(self.Type.FLOAT)
+ def __init__(self, nullable: bool = False):
+ super().__init__(self.Type.FLOAT, nullable=nullable)

  def to_sql(self) -> str:
  return 'FLOAT'

  def to_sa_type(self) -> str:
  return sql.Float
+
+ def to_arrow_type(self) -> 'pyarrow.DataType':
+ import pyarrow as pa
+ return pa.float32()

- def to_tf(self) -> Union['tf.TypeSpec', Dict[str, 'tf.TypeSpec']]:
- import tensorflow as tf
- # TODO: how to specify the correct float subtype?
- return tf.TensorSpec(shape=(), dtype=tf.float32)
+ def _validate_literal(self, val: Any) -> None:
+ if not isinstance(val, float):
+ raise TypeError(f'Expected float, got {val.__class__.__name__}')

+ def _create_literal(self, val: Any) -> Any:
+ if isinstance(val, int):
+ return float(val)
+ return val

  class BoolType(ColumnType):
- def __init__(self):
- super().__init__(self.Type.BOOL)
+ def __init__(self, nullable: bool = False):
+ super().__init__(self.Type.BOOL, nullable=nullable)

  def to_sql(self) -> str:
  return 'BOOLEAN'

  def to_sa_type(self) -> str:
  return sql.Boolean
+
+ def to_arrow_type(self) -> 'pyarrow.DataType':
+ import pyarrow as pa # pylint: disable=import-outside-toplevel
+ return pa.bool_()

- def to_tf(self) -> Union['tf.TypeSpec', Dict[str, 'tf.TypeSpec']]:
- import tensorflow as tf
- # TODO: how to specify the correct int subtype?
- return tf.TensorSpec(shape=(), dtype=tf.bool)
+ def _validate_literal(self, val: Any) -> None:
+ if not isinstance(val, bool):
+ raise TypeError(f'Expected bool, got {val.__class__.__name__}')

+ def _create_literal(self, val: Any) -> Any:
+ if isinstance(val, int):
+ return bool(val)
+ return val

  class TimestampType(ColumnType):
- def __init__(self):
- super().__init__(self.Type.TIMESTAMP)
+ def __init__(self, nullable: bool = False):
+ super().__init__(self.Type.TIMESTAMP, nullable=nullable)

  def to_sql(self) -> str:
  return 'INTEGER'

  def to_sa_type(self) -> str:
  return sql.TIMESTAMP
+
+ def to_arrow_type(self) -> 'pyarrow.DataType':
+ import pyarrow as pa # pylint: disable=import-outside-toplevel
+ return pa.timestamp('us') # postgres timestamp is microseconds

- def to_tf(self) -> Union['tf.TypeSpec', Dict[str, 'tf.TypeSpec']]:
- raise TypeError(f'Timestamp type cannot be converted to Tensorflow')
+ def _validate_literal(self, val: Any) -> None:
+ if not isinstance(val, datetime.datetime) and not isinstance(val, datetime.date):
+ raise TypeError(f'Expected datetime.datetime or datetime.date, got {val.__class__.__name__}')

+ def _create_literal(self, val: Any) -> Any:
+ if isinstance(val, str):
+ return datetime.datetime.fromisoformat(val)
+ return val

  class JsonType(ColumnType):
  # TODO: type_spec also needs to be able to express lists
- def __init__(self, type_spec: Optional[Dict[str, ColumnType]] = None):
- super().__init__(self.Type.JSON)
+ def __init__(self, type_spec: Optional[Dict[str, ColumnType]] = None, nullable: bool = False):
+ super().__init__(self.Type.JSON, nullable=nullable)
  self.type_spec = type_spec

  def _as_dict(self) -> Dict:
@@ -410,34 +557,52 @@ class JsonType(ColumnType):
  return result

  @classmethod
- def _from_dict(cls, d: Dict) -> 'ColumnType':
+ def _from_dict(cls, d: Dict) -> ColumnType:
  type_spec = None
  if 'type_spec' in d:
  type_spec = {
  field_name: cls.deserialize(field_type_dict) for field_name, field_type_dict in d['type_spec'].items()
  }
- return cls(type_spec)
+ return cls(type_spec, nullable=d['nullable'])

  def to_sql(self) -> str:
  return 'JSONB'

  def to_sa_type(self) -> str:
  return sql.dialects.postgresql.JSONB
-
- def to_tf(self) -> Union['tf.TypeSpec', Dict[str, 'tf.TypeSpec']]:
- if self.type_spec is None:
- raise TypeError(f'Cannot convert {self.__class__.__name__} with missing type spec to TensorFlow')
- return {k: v.to_tf() for k, v in self.type_spec.items()}
-
+
+ def to_arrow_type(self) -> 'pyarrow.DataType':
+ import pyarrow as pa # pylint: disable=import-outside-toplevel
+ return pa.string() # TODO: weight advantage of pa.struct type.
+
+ def print_value(self, val: Any) -> str:
+ val_type = self.infer_literal_type(val)
+ if val_type == self:
+ return str(val)
+ return val_type.print_value(val)
+
+ def _validate_literal(self, val: Any) -> None:
+ if not isinstance(val, dict) and not isinstance(val, list):
+ raise TypeError(f'Expected dict or list, got {val.__class__.__name__}')
+ try:
+ _ = json.dumps(val)
+ except TypeError as e:
+ raise TypeError(f'Expected JSON-serializable object, got {val}')
+
+ def _create_literal(self, val: Any) -> Any:
+ if isinstance(val, tuple):
+ val = list(val)
+ return val

  class ArrayType(ColumnType):
  def __init__(
- self, shape: Tuple[Union[int, None], ...], dtype: ColumnType.Type):
- super().__init__(self.Type.ARRAY)
+ self, shape: Tuple[Union[int, None], ...], dtype: ColumnType, nullable: bool = False):
+ super().__init__(self.Type.ARRAY, nullable=nullable)
  self.shape = shape
- self.dtype = dtype
+ assert dtype.is_int_type() or dtype.is_float_type() or dtype.is_bool_type() or dtype.is_string_type()
+ self.dtype = dtype._type

- def _supertype(cls, type1: 'ArrayType', type2: 'ArrayType') -> Optional['ArrayType']:
+ def _supertype(cls, type1: ArrayType, type2: ArrayType) -> Optional[ArrayType]:
  if len(type1.shape) != len(type2.shape):
  return None
  base_type = ColumnType.supertype(type1.dtype, type2.dtype)
@@ -452,54 +617,94 @@ class ArrayType(ColumnType):
  return result

  def __str__(self) -> str:
- return f'{self.__class__.__name__}({self.shape}, dtype={self.dtype.name})'
+ return f'{self._type.name.lower()}({self.shape}, dtype={self.dtype.name})'

  @classmethod
- def _from_dict(cls, d: Dict) -> 'ColumnType':
+ def _from_dict(cls, d: Dict) -> ColumnType:
  assert 'shape' in d
  assert 'dtype' in d
  shape = tuple(d['shape'])
- dtype = cls.Type(d['dtype'])
- return cls(shape, dtype)
-
- def to_sql(self) -> str:
- return 'BYTEA'
+ dtype = cls.make_type(cls.Type(d['dtype']))
+ return cls(shape, dtype, nullable=d['nullable'])

- def to_sa_type(self) -> str:
- return sql.VARBINARY
+ @classmethod
+ def from_literal(cls, val: np.ndarray) -> Optional[ArrayType]:
+ # determine our dtype
+ assert isinstance(val, np.ndarray)
+ if np.issubdtype(val.dtype, np.integer):
+ dtype = IntType()
+ elif np.issubdtype(val.dtype, np.floating):
+ dtype = FloatType()
+ elif val.dtype == np.bool_:
+ dtype = BoolType()
+ elif val.dtype == np.str_:
+ dtype = StringType()
+ else:
+ return None
+ return cls(val.shape, dtype=dtype, nullable=True)

- def to_tf(self) -> Union['tf.TypeSpec', Dict[str, 'tf.TypeSpec']]:
- import tensorflow as tf
- return tf.TensorSpec(shape=self.shape, dtype=self.dtype.to_tf())
+ def is_valid_literal(self, val: np.ndarray) -> bool:
+ if not isinstance(val, np.ndarray):
+ return False
+ if len(val.shape) != len(self.shape):
+ return False
+ # check that the shapes are compatible
+ for n1, n2 in zip(val.shape, self.shape):
+ if n1 is None:
+ return False
+ if n2 is None:
+ # wildcard
+ continue
+ if n1 != n2:
+ return False
+ return val.dtype == self.numpy_dtype()

+ def _validate_literal(self, val: Any) -> None:
+ if not isinstance(val, np.ndarray):
+ raise TypeError(f'Expected numpy.ndarray, got {val.__class__.__name__}')
+ if not self.is_valid_literal(val):
+ raise TypeError((
+ f'Expected ndarray({self.shape}, dtype={self.numpy_dtype()}), '
+ f'got ndarray({val.shape}, dtype={val.dtype})'))

- class ImageType(ColumnType):
- @enum.unique
- class Mode(enum.Enum):
- L = 0,
- RGB = 1
+ def _create_literal(self, val: Any) -> Any:
+ if isinstance(val, (list,tuple)):
+ return np.array(val)
+ return val

- @classmethod
- def from_pil(cls, pil_mode: str) -> 'Mode':
- if pil_mode == 'L':
- return cls.L
- if pil_mode == 'RGB':
- return cls.RGB
+ def to_sql(self) -> str:
+ return 'BYTEA'

- def to_pil(self) -> str:
- return self.name
+ def to_sa_type(self) -> str:
+ return sql.LargeBinary
+
+ def to_arrow_type(self) -> 'pyarrow.DataType':
+ import pyarrow as pa # pylint: disable=import-outside-toplevel
+ if any([n is None for n in self.shape]):
+ raise TypeError(f'Cannot convert array with unknown shape to Arrow')
+ return pa.fixed_shape_tensor(pa.from_numpy_dtype(self.numpy_dtype()), self.shape)
+
+ def numpy_dtype(self) -> np.dtype:
+ if self.dtype == self.Type.INT:
+ return np.dtype(np.int64)
+ if self.dtype == self.Type.FLOAT:
+ return np.dtype(np.float32)
+ if self.dtype == self.Type.BOOL:
+ return np.dtype(np.bool_)
+ if self.dtype == self.Type.STRING:
+ return np.dtype(np.str_)
+ assert False

- def num_channels(self) -> int:
- return len(self.name)


+ class ImageType(ColumnType):
  def __init__(
  self, width: Optional[int] = None, height: Optional[int] = None, size: Optional[Tuple[int, int]] = None,
- mode: Optional[Mode] = None
+ mode: Optional[str] = None, nullable: bool = False
  ):
  """
  TODO: does it make sense to specify only width or height?
  """
- super().__init__(self.Type.IMAGE)
+ super().__init__(self.Type.IMAGE, nullable=nullable)
  assert not(width is not None and size is not None)
  assert not(height is not None and size is not None)
  if size is not None:
@@ -510,22 +715,53 @@ class ImageType(ColumnType):
  self.height = height
  self.mode = mode

+ def __str__(self) -> str:
+ if self.width is not None or self.height is not None or self.mode is not None:
+ params_str = ''
+ if self.width is not None:
+ params_str = f'width={self.width}'
+ if self.height is not None:
+ if len(params_str) > 0:
+ params_str += ', '
+ params_str += f'height={self.height}'
+ if self.mode is not None:
+ if len(params_str) > 0:
+ params_str += ', '
+ params_str += f'mode={self.mode}'
+ params_str = f'({params_str})'
+ else:
+ params_str = ''
+ return f'{self._type.name.lower()}{params_str}'
+
+ def _is_supertype_of(self, other: ImageType) -> bool:
+ if self.mode != other.mode:
+ return False
+ if self.width is None and self.height is None:
+ return True
+ if self.width != other.width and self.height != other.height:
+ return False
+
+ @property
+ def size(self) -> Optional[Tuple[int, int]]:
+ if self.width is None or self.height is None:
+ return None
+ return (self.width, self.height)
+
  @property
  def num_channels(self) -> Optional[int]:
  return None if self.mode is None else self.mode.num_channels()

  def _as_dict(self) -> Dict:
  result = super()._as_dict()
- result.update(width=self.width, height=self.height, mode=self.mode.value if self.mode is not None else None)
+ result.update(width=self.width, height=self.height, mode=self.mode)
  return result

  @classmethod
- def _from_dict(cls, d: Dict) -> 'ColumnType':
+ def _from_dict(cls, d: Dict) -> ColumnType:
  assert 'width' in d
  assert 'height' in d
  assert 'mode' in d
- mode_val = d['mode']
- return cls(width=d['width'], height=d['height'], mode=cls.Mode(mode_val) if mode_val is not None else None)
+ return cls(width=d['width'], height=d['height'], mode=d['mode'], nullable=d['nullable'])

  def conversion_fn(self, target: ColumnType) -> Optional[Callable[[Any], Any]]:
  if not target.is_image_type():
@@ -552,23 +788,111 @@ class ImageType(ColumnType):

  def to_sa_type(self) -> str:
  return sql.String
+
+ def to_arrow_type(self) -> 'pyarrow.DataType':
+ import pyarrow as pa # pylint: disable=import-outside-toplevel
+ return pa.binary()
+
+ def _validate_literal(self, val: Any) -> None:
+ if isinstance(val, PIL.Image.Image):
+ return
+ self._validate_file_path(val)
+
+ def validate_media(self, val: Any) -> None:
+ assert isinstance(val, str)
+ try:
+ _ = PIL.Image.open(val)
+ except PIL.UnidentifiedImageError:
+ raise excs.Error(f'Not a valid image: {val}') from None

- def to_tf(self) -> Union['tf.TypeSpec', Dict[str, 'tf.TypeSpec']]:
- import tensorflow as tf
- return tf.TensorSpec(shape=(self.height, self.width, self.num_channels), dtype=tf.uint8)
+ class VideoType(ColumnType):
+ def __init__(self, nullable: bool = False):
+ super().__init__(self.Type.VIDEO, nullable=nullable)

+ def to_sql(self) -> str:
+ # stored as a file path
+ return 'VARCHAR'

- class VideoType(ColumnType):
- def __init__(self):
- super().__init__(self.Type.VIDEO)
+ def to_sa_type(self) -> str:
+ return sql.String
+
+ def to_arrow_type(self) -> 'pyarrow.DataType':
+ import pyarrow as pa # pylint: disable=import-outside-toplevel
+ return pa.string()
+
+ def _validate_literal(self, val: Any) -> None:
+ self._validate_file_path(val)
+
+ def validate_media(self, val: Any) -> None:
+ assert isinstance(val, str)
+ try:
+ with av.open(val, 'r') as fh:
+ if len(fh.streams.video) == 0:
+ raise excs.Error(f'Not a valid video: {val}')
+ # decode a few frames to make sure it's playable
+ # TODO: decode all frames? but that's very slow
+ num_decoded = 0
+ for frame in fh.decode(video=0):
+ _ = frame.to_image()
+ num_decoded += 1
+ if num_decoded == 10:
+ break
+ if num_decoded < 2:
+ # this is most likely an image file
+ raise excs.Error(f'Not a valid video: {val}')
+ except av.AVError:
+ raise excs.Error(f'Not a valid video: {val}') from None
+
+ class AudioType(ColumnType):
+ def __init__(self, nullable: bool = False):
+ super().__init__(self.Type.AUDIO, nullable=nullable)

- def _as_dict(self) -> Dict:
- result = super()._as_dict()
- return result
+ def to_sql(self) -> str:
+ # stored as a file path
+ return 'VARCHAR'

- @classmethod
- def _from_dict(cls, d: Dict) -> 'ColumnType':
- return cls()
+ def to_sa_type(self) -> str:
+ return sql.String
+
+ def to_arrow_type(self) -> 'pyarrow.DataType':
+ import pyarrow as pa # pylint: disable=import-outside-toplevel
+ return pa.string()
+
+ def _validate_literal(self, val: Any) -> None:
+ self._validate_file_path(val)
+
+ def validate_media(self, val: Any) -> None:
+ try:
+ with av.open(val) as container:
+ if len(container.streams.audio) == 0:
+ raise excs.Error(f'No audio stream in file: {val}')
+ audio_stream = container.streams.audio[0]
+
+ # decode everything to make sure it's playable
+ # TODO: is there some way to verify it's a playable audio file other than decoding all of it?
+ for packet in container.demux(audio_stream):
+ for _ in packet.decode():
+ pass
+ except av.AVError as e:
+ raise excs.Error(f'Not a valid audio file: {val}\n{e}') from None
+
+ class DocumentType(ColumnType):
+ @enum.unique
+ class DocumentFormat(enum.Enum):
+ HTML = 0
+ MD = 1
+ PDF = 2
+
+ def __init__(self, nullable: bool = False, doc_formats: Optional[str] = None):
+ super().__init__(self.Type.DOCUMENT, nullable=nullable)
+ if doc_formats is not None:
+ type_strs = doc_formats.split(',')
+ for type_str in type_strs:
+ if not hasattr(self.DocumentFormat, type_str):
+ raise ValueError(f'Invalid document type: {type_str}')
+ self._doc_formats = [self.DocumentFormat[type_str.upper()] for type_str in type_strs]
+ else:
+ self._doc_formats = [t for t in self.DocumentFormat]

  def to_sql(self) -> str:
  # stored as a file path
@@ -577,5 +901,38 @@ class VideoType(ColumnType):
  def to_sa_type(self) -> str:
  return sql.String

- def to_tf(self) -> Union['tf.TypeSpec', Dict[str, 'tf.TypeSpec']]:
- assert False
+ def to_arrow_type(self) -> 'pyarrow.DataType':
+ import pyarrow as pa # pylint: disable=import-outside-toplevel
+ return pa.string()
+
+ def _validate_literal(self, val: Any) -> None:
+ self._validate_file_path(val)
+
+ def validate_media(self, val: Any) -> None:
+ assert isinstance(val, str)
+ from pixeltable.utils.documents import get_document_handle
+ with open(val, 'r') as fh:
+ try:
+ s = fh.read()
+ dh = get_document_handle(s)
+ if dh is None:
+ raise excs.Error(f'Not a recognized document format: {val}')
+ except Exception as e:
+ raise excs.Error(f'Not a recognized document format: {val}') from None
+
+
+ # A dictionary mapping various Python types to their respective ColumnTypes.
+ # This can be used to infer Pixeltable ColumnTypes from type hints on Python
+ # functions. (Since Python functions do not necessarily have type hints, this
+ # should always be an optional/convenience inference.)
+ _python_type_to_column_type: dict[type, ColumnType] = {
+ str: StringType(),
+ int: IntType(),
+ float: FloatType(),
+ bool: BoolType(),
+ datetime.datetime: TimestampType(),
+ datetime.date: TimestampType(),
+ list: JsonType(),
+ dict: JsonType(),
+ PIL.Image.Image: ImageType()
+ }
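
The type_system rework shown above drops the TensorFlow conversion hooks in favor of Arrow, adds AUDIO and DOCUMENT media types, and threads a nullable flag plus literal validation and conversion through every ColumnType. The short sketch below is not part of the package; it only illustrates how the methods visible in this diff might be exercised, and the pixeltable.type_system import path is inferred from the file name above rather than from package documentation.

from typing import Optional

from pixeltable.type_system import ColumnType, FloatType, StringType

# Column types now carry nullability directly; nullable types accept None.
float_col = FloatType(nullable=True)
float_col.validate_literal(None)    # ok: the type is nullable
float_col.validate_literal(1.5)     # ok: a float literal
# float_col.validate_literal('x')   # would raise TypeError via _validate_literal()

# create_literal() applies the conversions added in this release,
# e.g. int -> float for FloatType (and ISO strings for TimestampType).
assert float_col.create_literal(3) == 3.0

# Optional[T] type hints map to the underlying type with nullable=True,
# via from_python_type() and the _python_type_to_column_type table.
inferred = ColumnType.from_python_type(Optional[str])
assert isinstance(inferred, StringType) and inferred.nullable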