pixeltable 0.2.19__py3-none-any.whl → 0.2.21__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pixeltable might be problematic. Click here for more details.

Files changed (88)
  1. pixeltable/__init__.py +7 -19
  2. pixeltable/__version__.py +2 -2
  3. pixeltable/catalog/__init__.py +7 -7
  4. pixeltable/catalog/globals.py +3 -0
  5. pixeltable/catalog/insertable_table.py +9 -7
  6. pixeltable/catalog/table.py +220 -143
  7. pixeltable/catalog/table_version.py +36 -18
  8. pixeltable/catalog/table_version_path.py +0 -8
  9. pixeltable/catalog/view.py +3 -3
  10. pixeltable/dataframe.py +9 -24
  11. pixeltable/env.py +107 -36
  12. pixeltable/exceptions.py +7 -4
  13. pixeltable/exec/__init__.py +1 -1
  14. pixeltable/exec/aggregation_node.py +22 -15
  15. pixeltable/exec/component_iteration_node.py +62 -41
  16. pixeltable/exec/data_row_batch.py +7 -7
  17. pixeltable/exec/exec_node.py +35 -7
  18. pixeltable/exec/expr_eval_node.py +2 -1
  19. pixeltable/exec/in_memory_data_node.py +9 -9
  20. pixeltable/exec/sql_node.py +265 -136
  21. pixeltable/exprs/__init__.py +1 -0
  22. pixeltable/exprs/data_row.py +30 -19
  23. pixeltable/exprs/expr.py +15 -14
  24. pixeltable/exprs/expr_dict.py +55 -0
  25. pixeltable/exprs/expr_set.py +21 -15
  26. pixeltable/exprs/function_call.py +21 -8
  27. pixeltable/exprs/json_path.py +3 -6
  28. pixeltable/exprs/rowid_ref.py +2 -2
  29. pixeltable/exprs/sql_element_cache.py +5 -1
  30. pixeltable/ext/functions/whisperx.py +7 -2
  31. pixeltable/func/callable_function.py +2 -2
  32. pixeltable/func/function_registry.py +6 -7
  33. pixeltable/func/query_template_function.py +11 -12
  34. pixeltable/func/signature.py +17 -15
  35. pixeltable/func/udf.py +0 -4
  36. pixeltable/functions/__init__.py +1 -1
  37. pixeltable/functions/audio.py +4 -6
  38. pixeltable/functions/globals.py +86 -42
  39. pixeltable/functions/huggingface.py +12 -14
  40. pixeltable/functions/image.py +59 -45
  41. pixeltable/functions/json.py +0 -1
  42. pixeltable/functions/mistralai.py +2 -2
  43. pixeltable/functions/openai.py +22 -25
  44. pixeltable/functions/string.py +50 -50
  45. pixeltable/functions/timestamp.py +20 -20
  46. pixeltable/functions/together.py +26 -12
  47. pixeltable/functions/video.py +11 -20
  48. pixeltable/functions/whisper.py +2 -20
  49. pixeltable/globals.py +57 -56
  50. pixeltable/index/base.py +2 -2
  51. pixeltable/index/btree.py +7 -7
  52. pixeltable/index/embedding_index.py +8 -10
  53. pixeltable/io/external_store.py +11 -5
  54. pixeltable/io/globals.py +3 -1
  55. pixeltable/io/hf_datasets.py +4 -4
  56. pixeltable/io/label_studio.py +6 -6
  57. pixeltable/io/parquet.py +14 -13
  58. pixeltable/iterators/document.py +10 -8
  59. pixeltable/iterators/video.py +10 -1
  60. pixeltable/metadata/__init__.py +3 -2
  61. pixeltable/metadata/converters/convert_14.py +4 -2
  62. pixeltable/metadata/converters/convert_15.py +1 -1
  63. pixeltable/metadata/converters/convert_19.py +1 -0
  64. pixeltable/metadata/converters/convert_20.py +1 -1
  65. pixeltable/metadata/converters/util.py +9 -8
  66. pixeltable/metadata/schema.py +32 -21
  67. pixeltable/plan.py +136 -154
  68. pixeltable/store.py +51 -36
  69. pixeltable/tool/create_test_db_dump.py +7 -7
  70. pixeltable/tool/doc_plugins/griffe.py +3 -34
  71. pixeltable/tool/mypy_plugin.py +32 -0
  72. pixeltable/type_system.py +243 -60
  73. pixeltable/utils/arrow.py +10 -9
  74. pixeltable/utils/coco.py +4 -4
  75. pixeltable/utils/documents.py +1 -1
  76. pixeltable/utils/filecache.py +131 -84
  77. pixeltable/utils/formatter.py +1 -1
  78. pixeltable/utils/http_server.py +2 -5
  79. pixeltable/utils/media_store.py +6 -6
  80. pixeltable/utils/pytorch.py +10 -11
  81. pixeltable/utils/sql.py +2 -1
  82. {pixeltable-0.2.19.dist-info → pixeltable-0.2.21.dist-info}/METADATA +16 -7
  83. pixeltable-0.2.21.dist-info/RECORD +148 -0
  84. pixeltable/utils/help.py +0 -11
  85. pixeltable-0.2.19.dist-info/RECORD +0 -147
  86. {pixeltable-0.2.19.dist-info → pixeltable-0.2.21.dist-info}/LICENSE +0 -0
  87. {pixeltable-0.2.19.dist-info → pixeltable-0.2.21.dist-info}/WHEEL +0 -0
  88. {pixeltable-0.2.19.dist-info → pixeltable-0.2.21.dist-info}/entry_points.txt +0 -0
pixeltable/tool/create_test_db_dump.py CHANGED
@@ -36,7 +36,7 @@ class Dumper:
36
36
  mock_home_dir = self.output_dir / '.pixeltable'
37
37
  mock_home_dir.mkdir(parents=True, exist_ok=True)
38
38
  os.environ['PIXELTABLE_HOME'] = str(mock_home_dir)
39
- os.environ['PIXELTABLE_CONFIG'] = str(shared_home / 'config.yaml')
39
+ os.environ['PIXELTABLE_CONFIG'] = str(shared_home / 'config.toml')
40
40
  os.environ['PIXELTABLE_DB'] = db_name
41
41
  os.environ['PIXELTABLE_PGDATA'] = str(shared_home / 'pgdata')
42
42
 
@@ -149,18 +149,18 @@ class Dumper:
149
149
  pxt.create_dir('views')
150
150
 
151
151
  # simple view
152
- v = pxt.create_view('views.view', t, filter=(t.c2 < 50))
152
+ v = pxt.create_view('views.view', t.where(t.c2 < 50))
153
153
  self.__add_expr_columns(v, 'view')
154
154
 
155
155
  # snapshot
156
- _ = pxt.create_view('views.snapshot', t, filter=(t.c2 >= 75), is_snapshot=True)
156
+ _ = pxt.create_view('views.snapshot', t.where(t.c2 >= 75), is_snapshot=True)
157
157
 
158
158
  # view of views
159
- vv = pxt.create_view('views.view_of_views', v, filter=(t.c2 >= 25))
159
+ vv = pxt.create_view('views.view_of_views', v.where(t.c2 >= 25))
160
160
  self.__add_expr_columns(vv, 'view_of_views')
161
161
 
162
162
  # empty view
163
- e = pxt.create_view('views.empty_view', t, filter=t.c2 == 4171780)
163
+ e = pxt.create_view('views.empty_view', t.where(t.c2 == 4171780))
164
164
  assert e.count() == 0
165
165
  self.__add_expr_columns(e, 'empty_view', include_expensive_functions=True)
166
166
 
@@ -278,13 +278,13 @@ class Dumper:
278
278
  # this breaks; TODO: why?
279
279
  #return t.where(t.c2 < i)
280
280
  return t.where(t.c2 < i).select(t.c1, t.c2)
281
- add_column('query_output', t.q1(t.c2))
281
+ add_column('query_output', t.queries.q1(t.c2))
282
282
 
283
283
  @t.query
284
284
  def q2(s: str):
285
285
  sim = t[f'{col_prefix}_function_call'].similarity(s)
286
286
  return t.order_by(sim, asc=False).select(t[f'{col_prefix}_function_call']).limit(5)
287
- add_column('sim_output', t.q2(t.c1))
287
+ add_column('sim_output', t.queries.q2(t.c1))
288
288
 
289
289
 
290
290
  @pxt.udf(_force_stored=True)
pixeltable/tool/doc_plugins/griffe.py CHANGED
@@ -1,6 +1,6 @@
1
1
  import ast
2
- from typing import Optional, Union
3
2
  import warnings
3
+ from typing import Optional, Union
4
4
 
5
5
  import griffe
6
6
  import griffe.expressions
@@ -39,7 +39,7 @@ class PxtGriffeExtension(Extension):
39
39
  udf = griffe.dynamic_import(func.path)
40
40
  assert isinstance(udf, pxt.Function)
41
41
  # Convert the return type to a Pixeltable type reference
42
- func.returns = self.__column_type_to_display_str(udf.signature.get_return_type())
42
+ func.returns = str(udf.signature.get_return_type())
43
43
  # Convert the parameter types to Pixeltable type references
44
44
  for griffe_param in func.parameters:
45
45
  assert isinstance(griffe_param.annotation, griffe.expressions.Expr)
@@ -47,35 +47,4 @@ class PxtGriffeExtension(Extension):
47
47
  logger.warning(f'Parameter `{griffe_param.name}` not found in signature for UDF: {udf.display_name}')
48
48
  continue
49
49
  pxt_param = udf.signature.parameters[griffe_param.name]
50
- griffe_param.annotation = self.__column_type_to_display_str(pxt_param.col_type)
51
-
52
- def __column_type_to_display_str(self, column_type: Optional[pxt.ColumnType]) -> str:
53
- # TODO: When we enhance the Pixeltable type system, we may want to refactor some of this logic out.
54
- # I'm putting it here for now though.
55
- if column_type is None:
56
- return 'None'
57
- if column_type.is_string_type():
58
- base = 'str'
59
- elif column_type.is_int_type():
60
- base = 'int'
61
- elif column_type.is_float_type():
62
- base = 'float'
63
- elif column_type.is_bool_type():
64
- base = 'bool'
65
- elif column_type.is_timestamp_type():
66
- base = 'datetime'
67
- elif column_type.is_array_type():
68
- base = 'ArrayT'
69
- elif column_type.is_json_type():
70
- base = 'JsonT'
71
- elif column_type.is_image_type():
72
- base = 'ImageT'
73
- elif column_type.is_video_type():
74
- base = 'VideoT'
75
- elif column_type.is_audio_type():
76
- base = 'AudioT'
77
- elif column_type.is_document_type():
78
- base = 'DocumentT'
79
- else:
80
- assert False
81
- return f'Optional[{base}]' if column_type.nullable else base
50
+ griffe_param.annotation = str(pxt_param.col_type)
pixeltable/tool/mypy_plugin.py ADDED
@@ -0,0 +1,32 @@
1
+ from typing import Callable, Optional
2
+
3
+ from mypy.plugin import AnalyzeTypeContext, Plugin
4
+ from mypy.types import Type
5
+
6
+ import pixeltable as pxt
7
+
8
+
9
+ class PxtPlugin(Plugin):
10
+ __TYPE_MAP = {
11
+ pxt.Json: 'typing.Any',
12
+ pxt.Array: 'numpy.ndarray',
13
+ pxt.Image: 'PIL.Image.Image',
14
+ pxt.Video: 'builtins.str',
15
+ pxt.Audio: 'builtins.str',
16
+ pxt.Document: 'builtins.str',
17
+ }
18
+ __FULLNAME_MAP = {
19
+ f'{k.__module__}.{k.__name__}': v
20
+ for k, v in __TYPE_MAP.items()
21
+ }
22
+
23
+ def get_type_analyze_hook(self, fullname: str) -> Optional[Callable[[AnalyzeTypeContext], type]]:
24
+ if fullname in self.__FULLNAME_MAP:
25
+ subst_name = self.__FULLNAME_MAP[fullname]
26
+ return lambda ctx: pxt_hook(ctx, subst_name)
27
+
28
+ def plugin(version: str):
29
+ return PxtPlugin
30
+
31
+ def pxt_hook(ctx: AnalyzeTypeContext, subst_name: str) -> Type:
32
+ return ctx.api.named_type(subst_name)
pixeltable/type_system.py CHANGED
@@ -8,15 +8,16 @@ import typing
8
8
  import urllib.parse
9
9
  import urllib.request
10
10
  from pathlib import Path
11
- from typing import Any, Dict, Iterable, List, Mapping, Optional, Sequence, Tuple, Union
11
+ from typing import Any, Iterable, Mapping, Optional, Sequence, Union
12
12
 
13
13
  import av # type: ignore
14
14
  import numpy as np
15
15
  import PIL.Image
16
16
  import sqlalchemy as sql
17
+ from typing import _GenericAlias # type: ignore[attr-defined]
18
+ from typing_extensions import _AnnotatedAlias
17
19
 
18
20
  import pixeltable.exceptions as excs
19
- from pixeltable.env import Env
20
21
 
21
22
 
22
23
  class ColumnType:
@@ -41,7 +42,7 @@ class ColumnType:
41
42
  def supertype(
42
43
  cls, type1: 'ColumnType.Type', type2: 'ColumnType.Type',
43
44
  # we need to pass this in because we can't easily append it as a class member
44
- common_supertypes: Dict[Tuple['ColumnType.Type', 'ColumnType.Type'], 'ColumnType.Type']
45
+ common_supertypes: dict[tuple['ColumnType.Type', 'ColumnType.Type'], 'ColumnType.Type']
45
46
  ) -> Optional['ColumnType.Type']:
46
47
  if type1 == type2:
47
48
  return type1
@@ -103,16 +104,16 @@ class ColumnType:
103
104
  return self.__class__(nullable=nullable) # type: ignore[call-arg]
104
105
 
105
106
  @classmethod
106
- def serialize_list(cls, type_list: List[ColumnType]) -> str:
107
+ def serialize_list(cls, type_list: list[ColumnType]) -> str:
107
108
  return json.dumps([t.as_dict() for t in type_list])
108
109
 
109
- def as_dict(self) -> Dict:
110
+ def as_dict(self) -> dict:
110
111
  return {
111
112
  '_classname': self.__class__.__name__,
112
113
  **self._as_dict(),
113
114
  }
114
115
 
115
- def _as_dict(self) -> Dict:
116
+ def _as_dict(self) -> dict:
116
117
  return {'nullable': self.nullable}
117
118
 
118
119
  @classmethod
@@ -121,18 +122,18 @@ class ColumnType:
121
122
  return cls.from_dict(type_dict)
122
123
 
123
124
  @classmethod
124
- def deserialize_list(cls, type_list_str: str) -> List[ColumnType]:
125
+ def deserialize_list(cls, type_list_str: str) -> list[ColumnType]:
125
126
  type_dict_list = json.loads(type_list_str)
126
127
  return [cls.from_dict(type_dict) for type_dict in type_dict_list]
127
128
 
128
129
  @classmethod
129
- def from_dict(cls, type_dict: Dict) -> ColumnType:
130
+ def from_dict(cls, type_dict: dict) -> ColumnType:
130
131
  assert '_classname' in type_dict
131
132
  type_class = globals()[type_dict['_classname']]
132
133
  return type_class._from_dict(type_dict)
133
134
 
134
135
  @classmethod
135
- def _from_dict(cls, d: Dict) -> ColumnType:
136
+ def _from_dict(cls, d: dict) -> ColumnType:
136
137
  """
137
138
  Default implementation: simply invoke c'tor
138
139
  """
@@ -164,11 +165,28 @@ class ColumnType:
164
165
  return DocumentType()
165
166
 
166
167
  def __str__(self) -> str:
167
- return self._type.name.lower()
168
+ return self._to_str(as_schema=False)
169
+
170
+ def _to_str(self, as_schema: bool) -> str:
171
+ base_str = self._to_base_str()
172
+ if as_schema:
173
+ return base_str if self.nullable else f'Required[{base_str}]'
174
+ else:
175
+ return f'Optional[{base_str}]' if self.nullable else base_str
176
+
177
+ def _to_base_str(self) -> str:
178
+ """
179
+ String representation of this type, disregarding nullability. Default implementation is to camel-case
180
+ the type name; subclasses may override.
181
+ """
182
+ return self._type.name[0] + self._type.name[1:].lower()
168
183
 
169
184
  def __eq__(self, other: object) -> bool:
170
185
  return isinstance(other, ColumnType) and self.matches(other) and self.nullable == other.nullable
171
186
 
187
+ def __hash__(self) -> int:
188
+ return hash((self._type, self.nullable))
189
+
172
190
  def is_supertype_of(self, other: ColumnType, ignore_nullable: bool = False) -> bool:
173
191
  if ignore_nullable:
174
192
  supertype = self.supertype(other)
@@ -253,39 +271,63 @@ class ColumnType:
253
271
  return inferred_type
254
272
 
255
273
  @classmethod
256
- def from_python_type(cls, t: type) -> Optional[ColumnType]:
274
+ def from_python_type(cls, t: Union[type, _GenericAlias], nullable_default: bool = False) -> Optional[ColumnType]:
257
275
  if typing.get_origin(t) is typing.Union:
258
276
  union_args = typing.get_args(t)
259
- if union_args[1] is type(None):
260
- # `t` is a type of the form Optional[T] (equivalently, Union[T, None]).
277
+ if len(union_args) == 2 and type(None) in union_args:
278
+ # `t` is a type of the form Optional[T] (equivalently, Union[T, None] or Union[None, T]).
261
279
  # We treat it as the underlying type but with nullable=True.
262
- underlying = cls.from_python_type(union_args[0])
280
+ underlying_py_type = union_args[0] if union_args[1] is type(None) else union_args[1]
281
+ underlying = cls.from_python_type(underlying_py_type)
263
282
  if underlying is not None:
264
- underlying._nullable = True
265
- return underlying
283
+ return underlying.copy(nullable=True)
284
+ elif typing.get_origin(t) is typing.Annotated:
285
+ annotated_args = typing.get_args(t)
286
+ origin = annotated_args[0]
287
+ parameters = annotated_args[1]
288
+ if isinstance(parameters, ColumnType):
289
+ return parameters.copy(nullable=nullable_default)
290
+ elif typing.get_origin(t) is Required:
291
+ required_args = typing.get_args(t)
292
+ assert len(required_args) == 1
293
+ return cls.from_python_type(required_args[0], nullable_default=False)
266
294
  else:
267
295
  # Discard type parameters to ensure that parameterized types such as `list[T]`
268
296
  # are correctly mapped to Pixeltable types.
269
- base = typing.get_origin(t)
270
- if base is None:
271
- # No type parameters; the base type is just `t` itself
272
- base = t
273
- if base is str:
274
- return StringType()
275
- if base is int:
276
- return IntType()
277
- if base is float:
278
- return FloatType()
279
- if base is bool:
280
- return BoolType()
281
- if base is datetime.datetime:
282
- return TimestampType()
283
- if issubclass(base, Sequence) or issubclass(base, Mapping):
284
- return JsonType()
285
- if issubclass(base, PIL.Image.Image):
286
- return ImageType()
297
+ origin = typing.get_origin(t)
298
+ if origin is None:
299
+ # No type parameters; the origin type is just `t` itself
300
+ origin = t
301
+ if issubclass(origin, _PxtType):
302
+ return origin.as_col_type(nullable=nullable_default)
303
+ if origin is str:
304
+ return StringType(nullable=nullable_default)
305
+ if origin is int:
306
+ return IntType(nullable=nullable_default)
307
+ if origin is float:
308
+ return FloatType(nullable=nullable_default)
309
+ if origin is bool:
310
+ return BoolType(nullable=nullable_default)
311
+ if origin is datetime.datetime:
312
+ return TimestampType(nullable=nullable_default)
313
+ if origin is PIL.Image.Image:
314
+ return ImageType(nullable=nullable_default)
315
+ if issubclass(origin, Sequence) or issubclass(origin, Mapping):
316
+ return JsonType(nullable=nullable_default)
287
317
  return None
288
318
 
319
+ @classmethod
320
+ def normalize_type(cls, t: Union[ColumnType, type, _AnnotatedAlias], nullable_default: bool = False) -> ColumnType:
321
+ """
322
+ Convert any type recognizable by Pixeltable to its corresponding ColumnType.
323
+ """
324
+ if isinstance(t, ColumnType):
325
+ return t
326
+ col_type = cls.from_python_type(t, nullable_default)
327
+ if col_type is None:
328
+ raise excs.Error(f'Unknown type: {t}')
329
+ return col_type
330
+
289
331
  def validate_literal(self, val: Any) -> None:
290
332
  """Raise TypeError if val is not a valid literal for this type"""
291
333
  if val is None:
@@ -491,7 +533,7 @@ class TimestampType(ColumnType):
491
533
 
492
534
  class JsonType(ColumnType):
493
535
  # TODO: type_spec also needs to be able to express lists
494
- def __init__(self, type_spec: Optional[Dict[str, ColumnType]] = None, nullable: bool = False):
536
+ def __init__(self, type_spec: Optional[dict[str, ColumnType]] = None, nullable: bool = False):
495
537
  super().__init__(self.Type.JSON, nullable=nullable)
496
538
  self.type_spec = type_spec
497
539
 
@@ -526,7 +568,7 @@ class JsonType(ColumnType):
526
568
  type_spec[other_field_name] = field_type
527
569
  return JsonType(type_spec, nullable=(self.nullable or other.nullable))
528
570
 
529
- def _as_dict(self) -> Dict:
571
+ def _as_dict(self) -> dict:
530
572
  result = super()._as_dict()
531
573
  if self.type_spec is not None:
532
574
  type_spec_dict = {field_name: field_type.serialize() for field_name, field_type in self.type_spec.items()}
@@ -534,7 +576,7 @@ class JsonType(ColumnType):
534
576
  return result
535
577
 
536
578
  @classmethod
537
- def _from_dict(cls, d: Dict) -> ColumnType:
579
+ def _from_dict(cls, d: dict) -> ColumnType:
538
580
  type_spec = None
539
581
  if 'type_spec' in d:
540
582
  type_spec = {
@@ -590,6 +632,9 @@ class ArrayType(ColumnType):
590
632
  def matches(self, other: ColumnType) -> bool:
591
633
  return isinstance(other, ArrayType) and self.shape == other.shape and self.dtype == other.dtype
592
634
 
635
+ def __hash__(self) -> int:
636
+ return hash((self._type, self.nullable, self.shape, self.dtype))
637
+
593
638
  def supertype(self, other: ColumnType) -> Optional[ArrayType]:
594
639
  if not isinstance(other, ArrayType):
595
640
  return None
@@ -601,16 +646,16 @@ class ArrayType(ColumnType):
601
646
  shape = [n1 if n1 == n2 else None for n1, n2 in zip(self.shape, other.shape)]
602
647
  return ArrayType(tuple(shape), self.make_type(base_type), nullable=(self.nullable or other.nullable))
603
648
 
604
- def _as_dict(self) -> Dict:
649
+ def _as_dict(self) -> dict:
605
650
  result = super()._as_dict()
606
651
  result.update(shape=list(self.shape), dtype=self.dtype.value)
607
652
  return result
608
653
 
609
- def __str__(self) -> str:
610
- return f'{self._type.name.lower()}({self.shape}, dtype={self.dtype.name})'
654
+ def _to_base_str(self) -> str:
655
+ return f'Array[{self.shape}, {self.pxt_dtype}]'
611
656
 
612
657
  @classmethod
613
- def _from_dict(cls, d: Dict) -> ColumnType:
658
+ def _from_dict(cls, d: dict) -> ColumnType:
614
659
  assert 'shape' in d
615
660
  assert 'dtype' in d
616
661
  shape = tuple(d['shape'])
@@ -681,7 +726,7 @@ class ArrayType(ColumnType):
681
726
 
682
727
  class ImageType(ColumnType):
683
728
  def __init__(
684
- self, width: Optional[int] = None, height: Optional[int] = None, size: Optional[Tuple[int, int]] = None,
729
+ self, width: Optional[int] = None, height: Optional[int] = None, size: Optional[tuple[int, int]] = None,
685
730
  mode: Optional[str] = None, nullable: bool = False
686
731
  ):
687
732
  """
@@ -701,23 +746,17 @@ class ImageType(ColumnType):
701
746
  def copy(self, nullable: bool) -> ColumnType:
702
747
  return ImageType(self.width, self.height, mode=self.mode, nullable=nullable)
703
748
 
704
- def __str__(self) -> str:
705
- if self.width is not None or self.height is not None or self.mode is not None:
749
+ def _to_base_str(self) -> str:
750
+ params = []
751
+ if self.width is not None or self.height is not None:
752
+ params.append(f'({self.width}, {self.height})')
753
+ if self.mode is not None:
754
+ params.append(repr(self.mode))
755
+ if len(params) == 0:
706
756
  params_str = ''
707
- if self.width is not None:
708
- params_str = f'width={self.width}'
709
- if self.height is not None:
710
- if len(params_str) > 0:
711
- params_str += ', '
712
- params_str += f'height={self.height}'
713
- if self.mode is not None:
714
- if len(params_str) > 0:
715
- params_str += ', '
716
- params_str += f'mode={self.mode}'
717
- params_str = f'({params_str})'
718
757
  else:
719
- params_str = ''
720
- return f'{self._type.name.lower()}{params_str}'
758
+ params_str = f'[{", ".join(params)}]'
759
+ return f'Image{params_str}'
721
760
 
722
761
  def matches(self, other: ColumnType) -> bool:
723
762
  return (
@@ -727,6 +766,9 @@ class ImageType(ColumnType):
727
766
  and self.mode == other.mode
728
767
  )
729
768
 
769
+ def __hash__(self) -> int:
770
+ return hash((self._type, self.nullable, self.size, self.mode))
771
+
730
772
  def supertype(self, other: ColumnType) -> Optional[ImageType]:
731
773
  if not isinstance(other, ImageType):
732
774
  return None
@@ -736,18 +778,18 @@ class ImageType(ColumnType):
736
778
  return ImageType(width=width, height=height, mode=mode, nullable=(self.nullable or other.nullable))
737
779
 
738
780
  @property
739
- def size(self) -> Optional[Tuple[int, int]]:
781
+ def size(self) -> Optional[tuple[int, int]]:
740
782
  if self.width is None or self.height is None:
741
783
  return None
742
784
  return (self.width, self.height)
743
785
 
744
- def _as_dict(self) -> Dict:
786
+ def _as_dict(self) -> dict:
745
787
  result = super()._as_dict()
746
788
  result.update(width=self.width, height=self.height, mode=self.mode)
747
789
  return result
748
790
 
749
791
  @classmethod
750
- def _from_dict(cls, d: Dict) -> ColumnType:
792
+ def _from_dict(cls, d: dict) -> ColumnType:
751
793
  assert 'width' in d
752
794
  assert 'height' in d
753
795
  assert 'mode' in d
@@ -853,6 +895,9 @@ class DocumentType(ColumnType):
853
895
  def matches(self, other: ColumnType) -> bool:
854
896
  return isinstance(other, DocumentType) and self._doc_formats == other._doc_formats
855
897
 
898
+ def __hash__(self) -> int:
899
+ return hash((self._type, self.nullable, self._doc_formats))
900
+
856
901
  def to_sa_type(self) -> sql.types.TypeEngine:
857
902
  # stored as a file path
858
903
  return sql.String()
@@ -866,3 +911,141 @@ class DocumentType(ColumnType):
866
911
  dh = get_document_handle(val)
867
912
  if dh is None:
868
913
  raise excs.Error(f'Not a recognized document format: {val}')
914
+
915
+
916
+ T = typing.TypeVar('T')
917
+
918
+
919
+ class Required(typing.Generic[T]):
920
+ """
921
+ Marker class to indicate that a column is non-nullable in a schema definition. This has no meaning as a type hint,
922
+ and is intended only for schema declarations.
923
+ """
924
+ pass
925
+
926
+
927
+ String = typing.Annotated[str, StringType(nullable=False)]
928
+ Int = typing.Annotated[int, IntType(nullable=False)]
929
+ Float = typing.Annotated[float, FloatType(nullable=False)]
930
+ Bool = typing.Annotated[bool, BoolType(nullable=False)]
931
+ Timestamp = typing.Annotated[datetime.datetime, TimestampType(nullable=False)]
932
+
933
+
934
+ class _PxtType:
935
+ """
936
+ Base class for the Pixeltable type-hint family. Subclasses of this class are meant to be used as type hints, both
937
+ in schema definitions and in UDF signatures. Whereas `ColumnType`s are instantiable and carry semantic information
938
+ about the Pixeltable type system, `_PxtType` subclasses are purely for convenience: they are not instantiable and
939
+ must be resolved to a `ColumnType` (by calling `ColumnType.from_python_type()`) in order to do anything meaningful
940
+ with them.
941
+
942
+ `_PxtType` subclasses can be specialized (as type hints) with type parameters; for example:
943
+ `Image[(300, 300), 'RGB']`. The specialized forms resolve to `typing.Annotated` instances whose annotation is a
944
+ `ColumnType`.
945
+ """
946
+ def __init__(self):
947
+ raise TypeError(f'Type `{type(self)}` cannot be instantiated.')
948
+
949
+ @classmethod
950
+ def as_col_type(cls, nullable: bool) -> ColumnType:
951
+ raise NotImplementedError()
952
+
953
+
954
+ class Json(_PxtType):
955
+ @classmethod
956
+ def as_col_type(cls, nullable: bool) -> ColumnType:
957
+ return JsonType(nullable=nullable)
958
+
959
+
960
+ class Array(np.ndarray, _PxtType):
961
+ def __class_getitem__(cls, item: Any) -> _AnnotatedAlias:
962
+ """
963
+ `item` (the type subscript) must be a tuple with exactly two elements (in any order):
964
+ - A tuple of `Optional[int]`s, specifying the shape of the array
965
+ - A type, specifying the dtype of the array
966
+ Example: Array[(3, None, 2), float]
967
+ """
968
+ params = item if isinstance(item, tuple) else (item,)
969
+ shape: Optional[tuple] = None
970
+ dtype: Optional[ColumnType] = None
971
+ for param in params:
972
+ if isinstance(param, tuple):
973
+ if not all(n is None or (isinstance(n, int) and n >= 1) for n in param):
974
+ raise TypeError(f'Invalid Array type parameter: {param}')
975
+ if shape is not None:
976
+ raise TypeError(f'Duplicate Array type parameter: {param}')
977
+ shape = param
978
+ elif isinstance(param, type) or isinstance(param, _AnnotatedAlias):
979
+ if dtype is not None:
980
+ raise TypeError(f'Duplicate Array type parameter: {param}')
981
+ dtype = ColumnType.from_python_type(param)
982
+ else:
983
+ raise TypeError(f'Invalid Array type parameter: {param}')
984
+ if shape is None:
985
+ raise TypeError('Array type is missing parameter: shape')
986
+ if dtype is None:
987
+ raise TypeError('Array type is missing parameter: dtype')
988
+ return typing.Annotated[np.ndarray, ArrayType(shape=shape, dtype=dtype, nullable=False)]
989
+
990
+ @classmethod
991
+ def as_col_type(cls, nullable: bool) -> ColumnType:
992
+ raise TypeError('Array type cannot be used without specifying shape and dtype')
993
+
994
+
995
+ class Image(PIL.Image.Image, _PxtType):
996
+ def __class_getitem__(cls, item: Any) -> _AnnotatedAlias:
997
+ """
998
+ `item` (the type subscript) must be one of the following, or a tuple containing either or both in any order:
999
+ - A 2-tuple of `int`s, specifying the size of the image
1000
+ - A string, specifying the mode of the image
1001
+ Example: Image[(300, 300), 'RGB']
1002
+ """
1003
+ if isinstance(item, tuple) and all(n is None or isinstance(n, int) for n in item):
1004
+ # It's a tuple of the form (width, height)
1005
+ params = (item,)
1006
+ elif isinstance(item, tuple):
1007
+ # It's a compound tuple (multiple parameters)
1008
+ params = item
1009
+ else:
1010
+ # Not a tuple (single arg)
1011
+ params = (item,)
1012
+ size: Optional[tuple] = None
1013
+ mode: Optional[str] = None
1014
+ for param in params:
1015
+ if isinstance(param, tuple):
1016
+ if len(param) != 2 or not isinstance(param[0], (int, type(None))) or not isinstance(param[1], (int, type(None))):
1017
+ raise TypeError(f'Invalid Image type parameter: {param}')
1018
+ if size is not None:
1019
+ raise TypeError(f'Duplicate Image type parameter: {param}')
1020
+ size = param
1021
+ elif isinstance(param, str):
1022
+ if param not in PIL.Image.MODES:
1023
+ raise TypeError(f'Invalid Image type parameter: {param!r}')
1024
+ if mode is not None:
1025
+ raise TypeError(f'Duplicate Image type parameter: {param!r}')
1026
+ mode = param
1027
+ else:
1028
+ raise TypeError(f'Invalid Image type parameter: {param}')
1029
+ return typing.Annotated[PIL.Image.Image, ImageType(size=size, mode=mode, nullable=False)]
1030
+
1031
+ @classmethod
1032
+ def as_col_type(cls, nullable: bool) -> ColumnType:
1033
+ return ImageType(nullable=nullable)
1034
+
1035
+
1036
+ class Video(str, _PxtType):
1037
+ @classmethod
1038
+ def as_col_type(cls, nullable: bool) -> ColumnType:
1039
+ return VideoType(nullable=nullable)
1040
+
1041
+
1042
+ class Audio(str, _PxtType):
1043
+ @classmethod
1044
+ def as_col_type(cls, nullable: bool) -> ColumnType:
1045
+ return AudioType(nullable=nullable)
1046
+
1047
+
1048
+ class Document(str, _PxtType):
1049
+ @classmethod
1050
+ def as_col_type(cls, nullable: bool) -> ColumnType:
1051
+ return DocumentType(nullable=nullable)
pixeltable/utils/arrow.py CHANGED
@@ -1,13 +1,14 @@
1
1
  import logging
2
- from typing import Any, Dict, Iterable, Iterator, Optional
2
+ from typing import Any, Iterator, Optional, Union
3
3
 
4
+ import numpy as np
4
5
  import pyarrow as pa
5
6
 
6
7
  import pixeltable.type_system as ts
7
8
 
8
9
  _logger = logging.getLogger(__name__)
9
10
 
10
- _pa_to_pt: Dict[pa.DataType, ts.ColumnType] = {
11
+ _pa_to_pt: dict[pa.DataType, ts.ColumnType] = {
11
12
  pa.string(): ts.StringType(nullable=True),
12
13
  pa.timestamp('us'): ts.TimestampType(nullable=True),
13
14
  pa.bool_(): ts.BoolType(nullable=True),
@@ -20,7 +21,7 @@ _pa_to_pt: Dict[pa.DataType, ts.ColumnType] = {
20
21
  pa.float32(): ts.FloatType(nullable=True),
21
22
  }
22
23
 
23
- _pt_to_pa: Dict[ts.ColumnType, pa.DataType] = {
24
+ _pt_to_pa: dict[type[ts.ColumnType], pa.DataType] = {
24
25
  ts.StringType: pa.string(),
25
26
  ts.TimestampType: pa.timestamp('us'), # postgres timestamp is microseconds
26
27
  ts.BoolType: pa.bool_(),
@@ -61,19 +62,19 @@ def to_arrow_type(pixeltable_type: ts.ColumnType) -> Optional[pa.DataType]:
61
62
  return None
62
63
 
63
64
 
64
- def to_pixeltable_schema(arrow_schema: pa.Schema) -> Dict[str, ts.ColumnType]:
65
+ def to_pixeltable_schema(arrow_schema: pa.Schema) -> dict[str, ts.ColumnType]:
65
66
  return {field.name: to_pixeltable_type(field.type) for field in arrow_schema}
66
67
 
67
68
 
68
- def to_arrow_schema(pixeltable_schema: Dict[str, Any]) -> pa.Schema:
69
- return pa.schema((name, to_arrow_type(typ)) for name, typ in pixeltable_schema.items())
69
+ def to_arrow_schema(pixeltable_schema: dict[str, Any]) -> pa.Schema:
70
+ return pa.schema((name, to_arrow_type(typ)) for name, typ in pixeltable_schema.items()) # type: ignore[misc]
70
71
 
71
72
 
72
- def to_pydict(batch: pa.RecordBatch) -> Dict[str, Iterable[Any]]:
73
+ def to_pydict(batch: pa.RecordBatch) -> dict[str, Union[list, np.ndarray]]:
73
74
  """Convert a RecordBatch to a dictionary of lists, unlike pa.lib.RecordBatch.to_pydict,
74
75
  this function will not convert numpy arrays to lists, and will preserve the original numpy dtype.
75
76
  """
76
- out = {}
77
+ out: dict[str, Union[list, np.ndarray]] = {}
77
78
  for k, name in enumerate(batch.schema.names):
78
79
  col = batch.column(k)
79
80
  if isinstance(col.type, pa.FixedShapeTensorType):
@@ -86,7 +87,7 @@ def to_pydict(batch: pa.RecordBatch) -> Dict[str, Iterable[Any]]:
86
87
  return out
87
88
 
88
89
 
89
- def iter_tuples(batch: pa.RecordBatch) -> Iterator[Dict[str, Any]]:
90
+ def iter_tuples(batch: pa.RecordBatch) -> Iterator[dict[str, Any]]:
90
91
  """Convert a RecordBatch to an iterator of dictionaries. also works with pa.Table and pa.RowGroup"""
91
92
  pydict = to_pydict(batch)
92
93
  assert len(pydict) > 0, 'empty record batch'