pixeltable 0.1.1-py3-none-any.whl → 0.2.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of pixeltable might be problematic.

Files changed (139)
  1. pixeltable/__init__.py +34 -6
  2. pixeltable/catalog/__init__.py +13 -0
  3. pixeltable/catalog/catalog.py +159 -0
  4. pixeltable/catalog/column.py +200 -0
  5. pixeltable/catalog/dir.py +32 -0
  6. pixeltable/catalog/globals.py +33 -0
  7. pixeltable/catalog/insertable_table.py +191 -0
  8. pixeltable/catalog/named_function.py +36 -0
  9. pixeltable/catalog/path.py +58 -0
  10. pixeltable/catalog/path_dict.py +139 -0
  11. pixeltable/catalog/schema_object.py +39 -0
  12. pixeltable/catalog/table.py +581 -0
  13. pixeltable/catalog/table_version.py +749 -0
  14. pixeltable/catalog/table_version_path.py +133 -0
  15. pixeltable/catalog/view.py +203 -0
  16. pixeltable/client.py +520 -30
  17. pixeltable/dataframe.py +540 -349
  18. pixeltable/env.py +373 -45
  19. pixeltable/exceptions.py +12 -21
  20. pixeltable/exec/__init__.py +9 -0
  21. pixeltable/exec/aggregation_node.py +78 -0
  22. pixeltable/exec/cache_prefetch_node.py +113 -0
  23. pixeltable/exec/component_iteration_node.py +79 -0
  24. pixeltable/exec/data_row_batch.py +95 -0
  25. pixeltable/exec/exec_context.py +22 -0
  26. pixeltable/exec/exec_node.py +61 -0
  27. pixeltable/exec/expr_eval_node.py +217 -0
  28. pixeltable/exec/in_memory_data_node.py +69 -0
  29. pixeltable/exec/media_validation_node.py +43 -0
  30. pixeltable/exec/sql_scan_node.py +225 -0
  31. pixeltable/exprs/__init__.py +24 -0
  32. pixeltable/exprs/arithmetic_expr.py +102 -0
  33. pixeltable/exprs/array_slice.py +71 -0
  34. pixeltable/exprs/column_property_ref.py +77 -0
  35. pixeltable/exprs/column_ref.py +105 -0
  36. pixeltable/exprs/comparison.py +77 -0
  37. pixeltable/exprs/compound_predicate.py +98 -0
  38. pixeltable/exprs/data_row.py +187 -0
  39. pixeltable/exprs/expr.py +586 -0
  40. pixeltable/exprs/expr_set.py +39 -0
  41. pixeltable/exprs/function_call.py +380 -0
  42. pixeltable/exprs/globals.py +69 -0
  43. pixeltable/exprs/image_member_access.py +115 -0
  44. pixeltable/exprs/image_similarity_predicate.py +58 -0
  45. pixeltable/exprs/inline_array.py +107 -0
  46. pixeltable/exprs/inline_dict.py +101 -0
  47. pixeltable/exprs/is_null.py +38 -0
  48. pixeltable/exprs/json_mapper.py +121 -0
  49. pixeltable/exprs/json_path.py +159 -0
  50. pixeltable/exprs/literal.py +54 -0
  51. pixeltable/exprs/object_ref.py +41 -0
  52. pixeltable/exprs/predicate.py +44 -0
  53. pixeltable/exprs/row_builder.py +355 -0
  54. pixeltable/exprs/rowid_ref.py +94 -0
  55. pixeltable/exprs/type_cast.py +53 -0
  56. pixeltable/exprs/variable.py +45 -0
  57. pixeltable/func/__init__.py +9 -0
  58. pixeltable/func/aggregate_function.py +194 -0
  59. pixeltable/func/batched_function.py +53 -0
  60. pixeltable/func/callable_function.py +69 -0
  61. pixeltable/func/expr_template_function.py +82 -0
  62. pixeltable/func/function.py +110 -0
  63. pixeltable/func/function_registry.py +227 -0
  64. pixeltable/func/globals.py +36 -0
  65. pixeltable/func/nos_function.py +202 -0
  66. pixeltable/func/signature.py +166 -0
  67. pixeltable/func/udf.py +163 -0
  68. pixeltable/functions/__init__.py +52 -103
  69. pixeltable/functions/eval.py +216 -0
  70. pixeltable/functions/fireworks.py +61 -0
  71. pixeltable/functions/huggingface.py +120 -0
  72. pixeltable/functions/image.py +16 -0
  73. pixeltable/functions/openai.py +88 -0
  74. pixeltable/functions/pil/image.py +148 -7
  75. pixeltable/functions/string.py +13 -0
  76. pixeltable/functions/together.py +27 -0
  77. pixeltable/functions/util.py +41 -0
  78. pixeltable/functions/video.py +62 -0
  79. pixeltable/iterators/__init__.py +3 -0
  80. pixeltable/iterators/base.py +48 -0
  81. pixeltable/iterators/document.py +311 -0
  82. pixeltable/iterators/video.py +89 -0
  83. pixeltable/metadata/__init__.py +54 -0
  84. pixeltable/metadata/converters/convert_10.py +18 -0
  85. pixeltable/metadata/schema.py +211 -0
  86. pixeltable/plan.py +656 -0
  87. pixeltable/store.py +413 -182
  88. pixeltable/tests/conftest.py +143 -87
  89. pixeltable/tests/test_audio.py +65 -0
  90. pixeltable/tests/test_catalog.py +27 -0
  91. pixeltable/tests/test_client.py +14 -14
  92. pixeltable/tests/test_component_view.py +372 -0
  93. pixeltable/tests/test_dataframe.py +433 -0
  94. pixeltable/tests/test_dirs.py +78 -62
  95. pixeltable/tests/test_document.py +117 -0
  96. pixeltable/tests/test_exprs.py +591 -135
  97. pixeltable/tests/test_function.py +297 -67
  98. pixeltable/tests/test_functions.py +283 -1
  99. pixeltable/tests/test_migration.py +43 -0
  100. pixeltable/tests/test_nos.py +54 -0
  101. pixeltable/tests/test_snapshot.py +208 -0
  102. pixeltable/tests/test_table.py +1085 -262
  103. pixeltable/tests/test_transactional_directory.py +42 -0
  104. pixeltable/tests/test_types.py +5 -11
  105. pixeltable/tests/test_video.py +149 -34
  106. pixeltable/tests/test_view.py +530 -0
  107. pixeltable/tests/utils.py +186 -45
  108. pixeltable/tool/create_test_db_dump.py +149 -0
  109. pixeltable/type_system.py +490 -126
  110. pixeltable/utils/__init__.py +17 -46
  111. pixeltable/utils/clip.py +12 -15
  112. pixeltable/utils/coco.py +136 -0
  113. pixeltable/utils/documents.py +39 -0
  114. pixeltable/utils/filecache.py +195 -0
  115. pixeltable/utils/help.py +11 -0
  116. pixeltable/utils/media_store.py +76 -0
  117. pixeltable/utils/parquet.py +126 -0
  118. pixeltable/utils/pytorch.py +172 -0
  119. pixeltable/utils/s3.py +13 -0
  120. pixeltable/utils/sql.py +17 -0
  121. pixeltable/utils/transactional_directory.py +35 -0
  122. pixeltable-0.2.0.dist-info/LICENSE +18 -0
  123. pixeltable-0.2.0.dist-info/METADATA +117 -0
  124. pixeltable-0.2.0.dist-info/RECORD +125 -0
  125. {pixeltable-0.1.1.dist-info → pixeltable-0.2.0.dist-info}/WHEEL +1 -1
  126. pixeltable/catalog.py +0 -1421
  127. pixeltable/exprs.py +0 -1745
  128. pixeltable/function.py +0 -269
  129. pixeltable/functions/clip.py +0 -10
  130. pixeltable/functions/pil/__init__.py +0 -23
  131. pixeltable/functions/tf.py +0 -21
  132. pixeltable/index.py +0 -57
  133. pixeltable/tests/test_dict.py +0 -24
  134. pixeltable/tests/test_tf.py +0 -69
  135. pixeltable/tf.py +0 -33
  136. pixeltable/utils/tf.py +0 -33
  137. pixeltable/utils/video.py +0 -32
  138. pixeltable-0.1.1.dist-info/METADATA +0 -31
  139. pixeltable-0.1.1.dist-info/RECORD +0 -36
pixeltable/type_system.py CHANGED
@@ -1,15 +1,21 @@
+ from __future__ import annotations
+
  import abc
- from typing import Any, Optional, Tuple, Dict, Callable, List, Union
- import enum
  import datetime
+ import enum
  import json
+ import typing
+ import urllib.parse
+ from copy import copy
+ from pathlib import Path
+ from typing import Any, Optional, Tuple, Dict, Callable, List, Union

- import os
- os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
- import tensorflow as tf
  import PIL.Image
+ import av
+ import numpy as np
  import sqlalchemy as sql

+ from pixeltable import exceptions as excs


  class ColumnType:
@@ -24,25 +30,16 @@ class ColumnType:
          ARRAY = 6
          IMAGE = 7
          VIDEO = 8
+         AUDIO = 9
+         DOCUMENT = 10

          # exprs that don't evaluate to a computable value in Pixeltable, such as an Image member function
-         INVALID = 9
-
-         def to_tf(self) -> tf.dtypes.DType:
-             if self == self.STRING:
-                 return tf.string
-             if self == self.INT:
-                 return tf.int64
-             if self == self.FLOAT:
-                 return tf.float32
-             if self == self.BOOL:
-                 return tf.bool
-             raise TypeError(f'Cannot convert {self} to TensorFlow')
+         INVALID = 255

          @classmethod
          def supertype(
                  cls, type1: 'Type', type2: 'Type',
-                 # we need to pass this in because we can't easily add it as a class member
+                 # we need to pass this in because we can't easily append it as a class member
                  common_supertypes: Dict[Tuple['Type', 'Type'], 'Type']
          ) -> Optional['Type']:
              if type1 == type2:
@@ -82,8 +79,9 @@ class ColumnType:
          (Type.INT, Type.FLOAT): Type.FLOAT,
      }

-     def __init__(self, t: Type):
+     def __init__(self, t: Type, nullable: bool = False):
          self._type = t
+         self.nullable = nullable

      @property
      def type_enum(self) -> Type:
@@ -93,7 +91,7 @@ class ColumnType:
          return json.dumps(self.as_dict())

      @classmethod
-     def serialize_list(cls, type_list: List['ColumnType']) -> str:
+     def serialize_list(cls, type_list: List[ColumnType]) -> str:
          return json.dumps([t.as_dict() for t in type_list])

      def as_dict(self) -> Dict:
@@ -103,33 +101,34 @@ class ColumnType:
          }

      def _as_dict(self) -> Dict:
-         return {}
+         return {'nullable': self.nullable}

      @classmethod
-     def deserialize(cls, type_str: str) -> 'ColumnType':
+     def deserialize(cls, type_str: str) -> ColumnType:
          type_dict = json.loads(type_str)
          return cls.from_dict(type_dict)

      @classmethod
-     def deserialize_list(cls, type_list_str: str) -> List['ColumnType']:
+     def deserialize_list(cls, type_list_str: str) -> List[ColumnType]:
          type_dict_list = json.loads(type_list_str)
          return [cls.from_dict(type_dict) for type_dict in type_dict_list]

      @classmethod
-     def from_dict(cls, type_dict: Dict) -> 'ColumnType':
+     def from_dict(cls, type_dict: Dict) -> ColumnType:
          assert '_classname' in type_dict
          type_class = globals()[type_dict['_classname']]
          return type_class._from_dict(type_dict)

      @classmethod
-     def _from_dict(cls, d: Dict) -> 'ColumnType':
+     def _from_dict(cls, d: Dict) -> ColumnType:
          """
-         Default implementation: simply invoke c'tor without arguments
+         Default implementation: simply invoke c'tor
          """
-         return cls()
+         assert 'nullable' in d
+         return cls(nullable=d['nullable'])

      @classmethod
-     def make_type(cls, t: Type) -> 'ColumnType':
+     def make_type(cls, t: Type) -> ColumnType:
          assert t != cls.Type.INVALID and t != cls.Type.ARRAY
          if t == cls.Type.STRING:
              return StringType()
@@ -147,21 +146,44 @@ class ColumnType:
              return ImageType()
          if t == cls.Type.VIDEO:
              return VideoType()
+         if t == cls.Type.AUDIO:
+             return AudioType()
+         if t == cls.Type.DOCUMENT:
+             return DocumentType()

      def __str__(self) -> str:
          return self._type.name.lower()

      def __eq__(self, other: object) -> bool:
+         return self.matches(other) and self.nullable == other.nullable
+
+     def is_supertype_of(self, other: ColumnType) -> bool:
+         if type(self) != type(other):
+             return False
+         if self.matches(other):
+             return True
+         return self._is_supertype_of(other)
+
+     @abc.abstractmethod
+     def _is_supertype_of(self, other: ColumnType) -> bool:
+         return False
+
+     def matches(self, other: object) -> bool:
+         """Two types match if they're equal, aside from nullability"""
+         if not isinstance(other, ColumnType):
+             return False
          assert isinstance(other, ColumnType)
-         if False and type(self) != type(other):
+         if type(self) != type(other):
              return False
          for member_var in vars(self).keys():
+             if member_var == 'nullable':
+                 continue
              if getattr(self, member_var) != getattr(other, member_var):
                  return False
          return True

      @classmethod
-     def supertype(cls, type1: 'ColumnType', type2: 'ColumnType') -> Optional['ColumnType']:
+     def supertype(cls, type1: ColumnType, type2: ColumnType) -> Optional[ColumnType]:
          if type1 == type2:
              return type1

@@ -183,16 +205,15 @@ class ColumnType:

      @classmethod
      @abc.abstractmethod
-     def _supertype(cls, type1: 'ColumnType', type2: 'ColumnType') -> Optional['ColumnType']:
+     def _supertype(cls, type1: ColumnType, type2: ColumnType) -> Optional[ColumnType]:
          """
          Class-specific implementation of determining the supertype. type1 and type2 are from the same subclass of
          ColumnType.
          """
          pass

-
      @classmethod
-     def get_value_type(cls, val: Any) -> 'ColumnType':
+     def infer_literal_type(cls, val: Any) -> Optional[ColumnType]:
          if isinstance(val, str):
              return StringType()
          if isinstance(val, int):
@@ -203,6 +224,85 @@ class ColumnType:
              return BoolType()
          if isinstance(val, datetime.datetime) or isinstance(val, datetime.date):
              return TimestampType()
+         if isinstance(val, np.ndarray):
+             col_type = ArrayType.from_literal(val)
+             if col_type is not None:
+                 return col_type
+             # this could still be json-serializable
+         if isinstance(val, dict) or isinstance(val, np.ndarray):
+             try:
+                 JsonType().validate_literal(val)
+                 return JsonType()
+             except TypeError:
+                 return None
+         return None
+
+
+     @classmethod
+     def from_python_type(cls, t: type) -> Optional[ColumnType]:
+         if t in _python_type_to_column_type:
+             return _python_type_to_column_type[t]
+         elif isinstance(t, typing._UnionGenericAlias) and t.__args__[1] is type(None):
+             # `t` is a type of the form Optional[T] (equivalently, Union[T, None]).
+             # We treat it as the underlying type but with nullable=True.
+             if t.__args__[0] in _python_type_to_column_type:
+                 underlying = copy(_python_type_to_column_type[t.__args__[0]])
+                 underlying.nullable = True
+                 return underlying
+
+         return None
+
+
+     def validate_literal(self, val: Any) -> None:
+         """Raise TypeError if val is not a valid literal for this type"""
+         if val is None:
+             if not self.nullable:
+                 raise TypeError('Expected non-None value')
+             else:
+                 return
+         self._validate_literal(val)
+
+     def validate_media(self, val: Any) -> None:
+         """
+         Raise TypeError if val is not a path to a valid media file (or a valid in-memory byte sequence) for this type
+         """
+         if self.is_media_type():
+             raise NotImplementedError(f'validate_media() not implemented for {self.__class__.__name__}')
+
+     def _validate_file_path(self, val: Any) -> None:
+         """Raises TypeError if not a valid local file path or not a path/byte sequence"""
+         if isinstance(val, str):
+             parsed = urllib.parse.urlparse(val)
+             if parsed.scheme != '' and parsed.scheme != 'file':
+                 return
+             path = Path(urllib.parse.unquote(parsed.path))
+             if not path.is_file():
+                 raise TypeError(f'File not found: {str(path)}')
+         else:
+             if not isinstance(val, bytes):
+                 raise TypeError(f'expected file path or bytes, got {type(val)}')
+
+     @abc.abstractmethod
+     def _validate_literal(self, val: Any) -> None:
+         """Raise TypeError if val is not a valid literal for this type"""
+         pass
+
+     @abc.abstractmethod
+     def _create_literal(self, val : Any) -> Any:
+         """Create a literal of this type from val, including any needed conversions.
+         val is guaranteed to be non-None"""
+         return val
+
+     def create_literal(self, val: Any) -> Any:
+         """Create a literal of this type from val or raise TypeError if not possible"""
+         if val is not None:
+             val = self._create_literal(val)
+
+         self.validate_literal(val)
+         return val
+
+     def print_value(self, val: Any) -> str:
+         return str(val)

      def is_scalar_type(self) -> bool:
          return self._type in self.scalar_types
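
The hunk above adds literal validation and Python-type-hint inference to the base ColumnType. A minimal sketch of how these additions might be exercised, assuming pixeltable.type_system exposes ColumnType and its subclasses as shown in this diff (the Optional[...] branch depends on the running Python's typing internals):

    from typing import Optional
    from pixeltable.type_system import ColumnType

    # plain hints map through the module-level _python_type_to_column_type table
    t = ColumnType.from_python_type(str)              # StringType(), nullable=False
    print(t, t.nullable)

    # Optional[int] is Union[int, None]: a copy of IntType() with nullable=True
    opt_t = ColumnType.from_python_type(Optional[int])
    print(opt_t, opt_t.nullable)

    # validate_literal() raises TypeError for None unless the type is nullable
    opt_t.validate_literal(None)
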
@@ -240,6 +340,16 @@ class ColumnType:
      def is_video_type(self) -> bool:
          return self._type == self.Type.VIDEO

+     def is_audio_type(self) -> bool:
+         return self._type == self.Type.AUDIO
+
+     def is_document_type(self) -> bool:
+         return self._type == self.Type.DOCUMENT
+
+     def is_media_type(self) -> bool:
+         # types that refer to external media files
+         return self.is_image_type() or self.is_video_type() or self.is_audio_type() or self.is_document_type()
+
      @abc.abstractmethod
      def to_sql(self) -> str:
          """
@@ -273,6 +383,10 @@ class ColumnType:
              return sql.VARBINARY
          assert False

+     @abc.abstractmethod
+     def to_arrow_type(self) -> 'pyarrow.DataType':
+         assert False, f'Have not implemented {self.__class__.__name__} to Arrow'
+
      @staticmethod
      def no_conversion(v: Any) -> Any:
          """
@@ -281,21 +395,17 @@ class ColumnType:
          """
          assert False

-     def conversion_fn(self, target: 'ColumnType') -> Optional[Callable[[Any], Any]]:
+     def conversion_fn(self, target: ColumnType) -> Optional[Callable[[Any], Any]]:
          """
          Return Callable that converts a column value of type self to a value of type 'target'.
          Returns None if conversion isn't possible.
          """
          return None

-     @abc.abstractmethod
-     def to_tf(self) -> Union[tf.TypeSpec, Dict[str, tf.TypeSpec]]:
-         pass
-


  class InvalidType(ColumnType):
-     def __init__(self):
-         super().__init__(self.Type.INVALID)
+     def __init__(self, nullable: bool = False):
+         super().__init__(self.Type.INVALID, nullable=nullable)

      def to_sql(self) -> str:
          assert False
@@ -303,13 +413,18 @@ class InvalidType(ColumnType):
      def to_sa_type(self) -> Any:
          assert False

-     def to_tf(self) -> Union[tf.TypeSpec, Dict[str, tf.TypeSpec]]:
-         raise TypeError(f'Invalid type cannot be converted to Tensorflow')
+     def to_arrow_type(self) -> 'pyarrow.DataType':
+         assert False

+     def print_value(self, val: Any) -> str:
+         assert False
+
+     def _validate_literal(self, val: Any) -> None:
+         assert False

  class StringType(ColumnType):
-     def __init__(self):
-         super().__init__(self.Type.STRING)
+     def __init__(self, nullable: bool = False):
+         super().__init__(self.Type.STRING, nullable=nullable)

      def conversion_fn(self, target: ColumnType) -> Optional[Callable[[Any], Any]]:
          if not target.is_timestamp_type():
@@ -327,74 +442,111 @@ class StringType(ColumnType):

      def to_sa_type(self) -> str:
          return sql.String
+
+     def to_arrow_type(self) -> 'pyarrow.DataType':
+         import pyarrow as pa  # pylint: disable=import-outside-toplevel
+         return pa.string()

-     def to_tf(self) -> Union[tf.TypeSpec, Dict[str, tf.TypeSpec]]:
-         return tf.TensorSpec(shape=(), dtype=tf.string)
+     def print_value(self, val: Any) -> str:
+         return f"'{val}'"
+
+     def _validate_literal(self, val: Any) -> None:
+         if not isinstance(val, str):
+             raise TypeError(f'Expected string, got {val.__class__.__name__}')


  class IntType(ColumnType):
-     def __init__(self):
-         super().__init__(self.Type.INT)
+     def __init__(self, nullable: bool = False):
+         super().__init__(self.Type.INT, nullable=nullable)

      def to_sql(self) -> str:
-         return 'INTEGER'
+         return 'BIGINT'

      def to_sa_type(self) -> str:
-         return sql.Integer
+         return sql.BigInteger
+
+     def to_arrow_type(self) -> 'pyarrow.DataType':
+         import pyarrow as pa  # pylint: disable=import-outside-toplevel
+         return pa.int64()  # to be consistent with bigint above

-     def to_tf(self) -> Union[tf.TypeSpec, Dict[str, tf.TypeSpec]]:
-         # TODO: how to specify the correct int subtype?
-         return tf.TensorSpec(shape=(), dtype=tf.int64)
+     def _validate_literal(self, val: Any) -> None:
+         if not isinstance(val, int):
+             raise TypeError(f'Expected int, got {val.__class__.__name__}')


  class FloatType(ColumnType):
-     def __init__(self):
-         super().__init__(self.Type.FLOAT)
+     def __init__(self, nullable: bool = False):
+         super().__init__(self.Type.FLOAT, nullable=nullable)

      def to_sql(self) -> str:
          return 'FLOAT'

      def to_sa_type(self) -> str:
          return sql.Float
+
+     def to_arrow_type(self) -> 'pyarrow.DataType':
+         import pyarrow as pa
+         return pa.float32()

-     def to_tf(self) -> Union[tf.TypeSpec, Dict[str, tf.TypeSpec]]:
-         # TODO: how to specify the correct float subtype?
-         return tf.TensorSpec(shape=(), dtype=tf.float32)
+     def _validate_literal(self, val: Any) -> None:
+         if not isinstance(val, float):
+             raise TypeError(f'Expected float, got {val.__class__.__name__}')

+     def _create_literal(self, val: Any) -> Any:
+         if isinstance(val, int):
+             return float(val)
+         return val

  class BoolType(ColumnType):
-     def __init__(self):
-         super().__init__(self.Type.BOOL)
+     def __init__(self, nullable: bool = False):
+         super().__init__(self.Type.BOOL, nullable=nullable)

      def to_sql(self) -> str:
          return 'BOOLEAN'

      def to_sa_type(self) -> str:
          return sql.Boolean
+
+     def to_arrow_type(self) -> 'pyarrow.DataType':
+         import pyarrow as pa  # pylint: disable=import-outside-toplevel
+         return pa.bool_()

-     def to_tf(self) -> Union[tf.TypeSpec, Dict[str, tf.TypeSpec]]:
-         # TODO: how to specify the correct int subtype?
-         return tf.TensorSpec(shape=(), dtype=tf.bool)
+     def _validate_literal(self, val: Any) -> None:
+         if not isinstance(val, bool):
+             raise TypeError(f'Expected bool, got {val.__class__.__name__}')

+     def _create_literal(self, val: Any) -> Any:
+         if isinstance(val, int):
+             return bool(val)
+         return val

  class TimestampType(ColumnType):
-     def __init__(self):
-         super().__init__(self.Type.TIMESTAMP)
+     def __init__(self, nullable: bool = False):
+         super().__init__(self.Type.TIMESTAMP, nullable=nullable)

      def to_sql(self) -> str:
          return 'INTEGER'

      def to_sa_type(self) -> str:
          return sql.TIMESTAMP
+
+     def to_arrow_type(self) -> 'pyarrow.DataType':
+         import pyarrow as pa  # pylint: disable=import-outside-toplevel
+         return pa.timestamp('us')  # postgres timestamp is microseconds

-     def to_tf(self) -> Union[tf.TypeSpec, Dict[str, tf.TypeSpec]]:
-         raise TypeError(f'Timestamp type cannot be converted to Tensorflow')
+     def _validate_literal(self, val: Any) -> None:
+         if not isinstance(val, datetime.datetime) and not isinstance(val, datetime.date):
+             raise TypeError(f'Expected datetime.datetime or datetime.date, got {val.__class__.__name__}')

+     def _create_literal(self, val: Any) -> Any:
+         if isinstance(val, str):
+             return datetime.datetime.fromisoformat(val)
+         return val

  class JsonType(ColumnType):
      # TODO: type_spec also needs to be able to express lists
-     def __init__(self, type_spec: Optional[Dict[str, ColumnType]] = None):
-         super().__init__(self.Type.JSON)
+     def __init__(self, type_spec: Optional[Dict[str, ColumnType]] = None, nullable: bool = False):
+         super().__init__(self.Type.JSON, nullable=nullable)
          self.type_spec = type_spec

      def _as_dict(self) -> Dict:
@@ -405,34 +557,52 @@ class JsonType(ColumnType):
          return result

      @classmethod
-     def _from_dict(cls, d: Dict) -> 'ColumnType':
+     def _from_dict(cls, d: Dict) -> ColumnType:
          type_spec = None
          if 'type_spec' in d:
              type_spec = {
                  field_name: cls.deserialize(field_type_dict) for field_name, field_type_dict in d['type_spec'].items()
              }
-         return cls(type_spec)
+         return cls(type_spec, nullable=d['nullable'])

      def to_sql(self) -> str:
          return 'JSONB'

      def to_sa_type(self) -> str:
          return sql.dialects.postgresql.JSONB
-
-     def to_tf(self) -> Union[tf.TypeSpec, Dict[str, tf.TypeSpec]]:
-         if self.type_spec is None:
-             raise TypeError(f'Cannot convert {self.__class__.__name__} with missing type spec to TensorFlow')
-         return {k: v.to_tf() for k, v in self.type_spec.items()}
-
+
+     def to_arrow_type(self) -> 'pyarrow.DataType':
+         import pyarrow as pa  # pylint: disable=import-outside-toplevel
+         return pa.string()  # TODO: weight advantage of pa.struct type.
+
+     def print_value(self, val: Any) -> str:
+         val_type = self.infer_literal_type(val)
+         if val_type == self:
+             return str(val)
+         return val_type.print_value(val)
+
+     def _validate_literal(self, val: Any) -> None:
+         if not isinstance(val, dict) and not isinstance(val, list):
+             raise TypeError(f'Expected dict or list, got {val.__class__.__name__}')
+         try:
+             _ = json.dumps(val)
+         except TypeError as e:
+             raise TypeError(f'Expected JSON-serializable object, got {val}')
+
+     def _create_literal(self, val: Any) -> Any:
+         if isinstance(val, tuple):
+             val = list(val)
+         return val

  class ArrayType(ColumnType):
      def __init__(
-             self, shape: Tuple[Union[int, None], ...], dtype: ColumnType.Type):
-         super().__init__(self.Type.ARRAY)
+             self, shape: Tuple[Union[int, None], ...], dtype: ColumnType, nullable: bool = False):
+         super().__init__(self.Type.ARRAY, nullable=nullable)
          self.shape = shape
-         self.dtype = dtype
+         assert dtype.is_int_type() or dtype.is_float_type() or dtype.is_bool_type() or dtype.is_string_type()
+         self.dtype = dtype._type

-     def _supertype(cls, type1: 'ArrayType', type2: 'ArrayType') -> Optional['ArrayType']:
+     def _supertype(cls, type1: ArrayType, type2: ArrayType) -> Optional[ArrayType]:
          if len(type1.shape) != len(type2.shape):
              return None
          base_type = ColumnType.supertype(type1.dtype, type2.dtype)
@@ -447,53 +617,94 @@ class ArrayType(ColumnType):
          return result

      def __str__(self) -> str:
-         return f'{self.__class__.__name__}({self.shape}, dtype={self.dtype.name})'
+         return f'{self._type.name.lower()}({self.shape}, dtype={self.dtype.name})'

      @classmethod
-     def _from_dict(cls, d: Dict) -> 'ColumnType':
+     def _from_dict(cls, d: Dict) -> ColumnType:
          assert 'shape' in d
          assert 'dtype' in d
          shape = tuple(d['shape'])
-         dtype = cls.Type(d['dtype'])
-         return cls(shape, dtype)
-
-     def to_sql(self) -> str:
-         return 'BYTEA'
+         dtype = cls.make_type(cls.Type(d['dtype']))
+         return cls(shape, dtype, nullable=d['nullable'])

-     def to_sa_type(self) -> str:
-         return sql.VARBINARY
+     @classmethod
+     def from_literal(cls, val: np.ndarray) -> Optional[ArrayType]:
+         # determine our dtype
+         assert isinstance(val, np.ndarray)
+         if np.issubdtype(val.dtype, np.integer):
+             dtype = IntType()
+         elif np.issubdtype(val.dtype, np.floating):
+             dtype = FloatType()
+         elif val.dtype == np.bool_:
+             dtype = BoolType()
+         elif val.dtype == np.str_:
+             dtype = StringType()
+         else:
+             return None
+         return cls(val.shape, dtype=dtype, nullable=True)

-     def to_tf(self) -> Union[tf.TypeSpec, Dict[str, tf.TypeSpec]]:
-         return tf.TensorSpec(shape=self.shape, dtype=self.dtype.to_tf())
+     def is_valid_literal(self, val: np.ndarray) -> bool:
+         if not isinstance(val, np.ndarray):
+             return False
+         if len(val.shape) != len(self.shape):
+             return False
+         # check that the shapes are compatible
+         for n1, n2 in zip(val.shape, self.shape):
+             if n1 is None:
+                 return False
+             if n2 is None:
+                 # wildcard
+                 continue
+             if n1 != n2:
+                 return False
+         return val.dtype == self.numpy_dtype()

+     def _validate_literal(self, val: Any) -> None:
+         if not isinstance(val, np.ndarray):
+             raise TypeError(f'Expected numpy.ndarray, got {val.__class__.__name__}')
+         if not self.is_valid_literal(val):
+             raise TypeError((
+                 f'Expected ndarray({self.shape}, dtype={self.numpy_dtype()}), '
+                 f'got ndarray({val.shape}, dtype={val.dtype})'))

- class ImageType(ColumnType):
-     @enum.unique
-     class Mode(enum.Enum):
-         L = 0,
-         RGB = 1
+     def _create_literal(self, val: Any) -> Any:
+         if isinstance(val, (list,tuple)):
+             return np.array(val)
+         return val

-         @classmethod
-         def from_pil(cls, pil_mode: str) -> 'Mode':
-             if pil_mode == 'L':
-                 return cls.L
-             if pil_mode == 'RGB':
-                 return cls.RGB
+     def to_sql(self) -> str:
+         return 'BYTEA'

-         def to_pil(self) -> str:
-             return self.name
+     def to_sa_type(self) -> str:
+         return sql.LargeBinary
+
+     def to_arrow_type(self) -> 'pyarrow.DataType':
+         import pyarrow as pa  # pylint: disable=import-outside-toplevel
+         if any([n is None for n in self.shape]):
+             raise TypeError(f'Cannot convert array with unknown shape to Arrow')
+         return pa.fixed_shape_tensor(pa.from_numpy_dtype(self.numpy_dtype()), self.shape)
+
+     def numpy_dtype(self) -> np.dtype:
+         if self.dtype == self.Type.INT:
+             return np.dtype(np.int64)
+         if self.dtype == self.Type.FLOAT:
+             return np.dtype(np.float32)
+         if self.dtype == self.Type.BOOL:
+             return np.dtype(np.bool_)
+         if self.dtype == self.Type.STRING:
+             return np.dtype(np.str_)
+         assert False

-         def num_channels(self) -> int:
-             return len(self.name)

+ class ImageType(ColumnType):
      def __init__(
              self, width: Optional[int] = None, height: Optional[int] = None, size: Optional[Tuple[int, int]] = None,
-             mode: Optional[Mode] = None
+             mode: Optional[str] = None, nullable: bool = False
      ):
          """
          TODO: does it make sense to specify only width or height?
          """
-         super().__init__(self.Type.IMAGE)
+         super().__init__(self.Type.IMAGE, nullable=nullable)
          assert not(width is not None and size is not None)
          assert not(height is not None and size is not None)
          if size is not None:
@@ -504,22 +715,53 @@ class ImageType(ColumnType):
          self.height = height
          self.mode = mode

+     def __str__(self) -> str:
+         if self.width is not None or self.height is not None or self.mode is not None:
+             params_str = ''
+             if self.width is not None:
+                 params_str = f'width={self.width}'
+             if self.height is not None:
+                 if len(params_str) > 0:
+                     params_str += ', '
+                 params_str += f'height={self.height}'
+             if self.mode is not None:
+                 if len(params_str) > 0:
+                     params_str += ', '
+                 params_str += f'mode={self.mode}'
+             params_str = f'({params_str})'
+         else:
+             params_str = ''
+         return f'{self._type.name.lower()}{params_str}'
+
+     def _is_supertype_of(self, other: ImageType) -> bool:
+         if self.mode != other.mode:
+             return False
+         if self.width is None and self.height is None:
+             return True
+         if self.width != other.width and self.height != other.height:
+             return False
+
+     @property
+     def size(self) -> Optional[Tuple[int, int]]:
+         if self.width is None or self.height is None:
+             return None
+         return (self.width, self.height)
+
      @property
      def num_channels(self) -> Optional[int]:
          return None if self.mode is None else self.mode.num_channels()

      def _as_dict(self) -> Dict:
          result = super()._as_dict()
-         result.update(width=self.width, height=self.height, mode=self.mode.value if self.mode is not None else None)
+         result.update(width=self.width, height=self.height, mode=self.mode)
          return result

      @classmethod
-     def _from_dict(cls, d: Dict) -> 'ColumnType':
+     def _from_dict(cls, d: Dict) -> ColumnType:
          assert 'width' in d
          assert 'height' in d
          assert 'mode' in d
-         mode_val = d['mode']
-         return cls(width=d['width'], height=d['height'], mode=cls.Mode(mode_val) if mode_val is not None else None)
+         return cls(width=d['width'], height=d['height'], mode=d['mode'], nullable=d['nullable'])

      def conversion_fn(self, target: ColumnType) -> Optional[Callable[[Any], Any]]:
          if not target.is_image_type():
@@ -546,22 +788,111 @@ class ImageType(ColumnType):

      def to_sa_type(self) -> str:
          return sql.String
+
+     def to_arrow_type(self) -> 'pyarrow.DataType':
+         import pyarrow as pa  # pylint: disable=import-outside-toplevel
+         return pa.binary()
+
+     def _validate_literal(self, val: Any) -> None:
+         if isinstance(val, PIL.Image.Image):
+             return
+         self._validate_file_path(val)
+
+     def validate_media(self, val: Any) -> None:
+         assert isinstance(val, str)
+         try:
+             _ = PIL.Image.open(val)
+         except PIL.UnidentifiedImageError:
+             raise excs.Error(f'Not a valid image: {val}') from None

-     def to_tf(self) -> Union[tf.TypeSpec, Dict[str, tf.TypeSpec]]:
-         return tf.TensorSpec(shape=(self.height, self.width, self.num_channels), dtype=tf.uint8)
+ class VideoType(ColumnType):
+     def __init__(self, nullable: bool = False):
+         super().__init__(self.Type.VIDEO, nullable=nullable)

+     def to_sql(self) -> str:
+         # stored as a file path
+         return 'VARCHAR'

- class VideoType(ColumnType):
-     def __init__(self):
-         super().__init__(self.Type.VIDEO)
+     def to_sa_type(self) -> str:
+         return sql.String
+
+     def to_arrow_type(self) -> 'pyarrow.DataType':
+         import pyarrow as pa  # pylint: disable=import-outside-toplevel
+         return pa.string()
+
+     def _validate_literal(self, val: Any) -> None:
+         self._validate_file_path(val)
+
+     def validate_media(self, val: Any) -> None:
+         assert isinstance(val, str)
+         try:
+             with av.open(val, 'r') as fh:
+                 if len(fh.streams.video) == 0:
+                     raise excs.Error(f'Not a valid video: {val}')
+                 # decode a few frames to make sure it's playable
+                 # TODO: decode all frames? but that's very slow
+                 num_decoded = 0
+                 for frame in fh.decode(video=0):
+                     _ = frame.to_image()
+                     num_decoded += 1
+                     if num_decoded == 10:
+                         break
+                 if num_decoded < 2:
+                     # this is most likely an image file
+                     raise excs.Error(f'Not a valid video: {val}')
+         except av.AVError:
+             raise excs.Error(f'Not a valid video: {val}') from None
+
+ class AudioType(ColumnType):
+     def __init__(self, nullable: bool = False):
+         super().__init__(self.Type.AUDIO, nullable=nullable)

-     def _as_dict(self) -> Dict:
-         result = super()._as_dict()
-         return result
+     def to_sql(self) -> str:
+         # stored as a file path
+         return 'VARCHAR'

-     @classmethod
-     def _from_dict(cls, d: Dict) -> 'ColumnType':
-         return cls()
+     def to_sa_type(self) -> str:
+         return sql.String
+
+     def to_arrow_type(self) -> 'pyarrow.DataType':
+         import pyarrow as pa  # pylint: disable=import-outside-toplevel
+         return pa.string()
+
+     def _validate_literal(self, val: Any) -> None:
+         self._validate_file_path(val)
+
+     def validate_media(self, val: Any) -> None:
+         try:
+             with av.open(val) as container:
+                 if len(container.streams.audio) == 0:
+                     raise excs.Error(f'No audio stream in file: {val}')
+                 audio_stream = container.streams.audio[0]
+
+                 # decode everything to make sure it's playable
+                 # TODO: is there some way to verify it's a playable audio file other than decoding all of it?
+                 for packet in container.demux(audio_stream):
+                     for _ in packet.decode():
+                         pass
+         except av.AVError as e:
+             raise excs.Error(f'Not a valid audio file: {val}\n{e}') from None
+
+ class DocumentType(ColumnType):
+     @enum.unique
+     class DocumentFormat(enum.Enum):
+         HTML = 0
+         MD = 1
+         PDF = 2
+
+     def __init__(self, nullable: bool = False, doc_formats: Optional[str] = None):
+         super().__init__(self.Type.DOCUMENT, nullable=nullable)
+         if doc_formats is not None:
+             type_strs = doc_formats.split(',')
+             for type_str in type_strs:
+                 if not hasattr(self.DocumentFormat, type_str):
+                     raise ValueError(f'Invalid document type: {type_str}')
+             self._doc_formats = [self.DocumentFormat[type_str.upper()] for type_str in type_strs]
+         else:
+             self._doc_formats = [t for t in self.DocumentFormat]

      def to_sql(self) -> str:
          # stored as a file path
@@ -570,5 +901,38 @@ class VideoType(ColumnType):
      def to_sa_type(self) -> str:
          return sql.String

-     def to_tf(self) -> Union[tf.TypeSpec, Dict[str, tf.TypeSpec]]:
-         assert False
+     def to_arrow_type(self) -> 'pyarrow.DataType':
+         import pyarrow as pa  # pylint: disable=import-outside-toplevel
+         return pa.string()
+
+     def _validate_literal(self, val: Any) -> None:
+         self._validate_file_path(val)
+
+     def validate_media(self, val: Any) -> None:
+         assert isinstance(val, str)
+         from pixeltable.utils.documents import get_document_handle
+         with open(val, 'r') as fh:
+             try:
+                 s = fh.read()
+                 dh = get_document_handle(s)
+                 if dh is None:
+                     raise excs.Error(f'Not a recognized document format: {val}')
+             except Exception as e:
+                 raise excs.Error(f'Not a recognized document format: {val}') from None
+
+
+ # A dictionary mapping various Python types to their respective ColumnTypes.
+ # This can be used to infer Pixeltable ColumnTypes from type hints on Python
+ # functions. (Since Python functions do not necessarily have type hints, this
+ # should always be an optional/convenience inference.)
+ _python_type_to_column_type: dict[type, ColumnType] = {
+     str: StringType(),
+     int: IntType(),
+     float: FloatType(),
+     bool: BoolType(),
+     datetime.datetime: TimestampType(),
+     datetime.date: TimestampType(),
+     list: JsonType(),
+     dict: JsonType(),
+     PIL.Image.Image: ImageType()
+ }
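
Taken together, the new type_system.py drops the TensorFlow-specific conversions in favor of to_arrow_type(), threads a nullable flag through every ColumnType constructor, and adds AudioType and DocumentType alongside the existing media types. A rough usage sketch of the resulting constructor surface, assuming the signatures shown in the hunks above:

    from pixeltable.type_system import ArrayType, DocumentType, FloatType, ImageType

    img_t = ImageType(width=640, height=480, mode='RGB', nullable=True)
    print(img_t)                  # image(width=640, height=480, mode=RGB)

    arr_t = ArrayType((None, 3), dtype=FloatType())   # None acts as a wildcard dimension
    print(arr_t)                  # array((None, 3), dtype=FLOAT)
    print(arr_t.numpy_dtype())    # float32

    doc_t = DocumentType(doc_formats='HTML,MD')       # restrict the accepted document formats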