pixeltable 0.2.20__py3-none-any.whl → 0.2.22__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pixeltable might be problematic. Click here for more details.

Files changed (120) hide show
  1. pixeltable/__init__.py +7 -19
  2. pixeltable/__version__.py +2 -2
  3. pixeltable/catalog/__init__.py +7 -7
  4. pixeltable/catalog/column.py +37 -11
  5. pixeltable/catalog/globals.py +21 -0
  6. pixeltable/catalog/insertable_table.py +6 -4
  7. pixeltable/catalog/table.py +227 -148
  8. pixeltable/catalog/table_version.py +66 -28
  9. pixeltable/catalog/table_version_path.py +0 -8
  10. pixeltable/catalog/view.py +18 -19
  11. pixeltable/dataframe.py +16 -32
  12. pixeltable/env.py +6 -1
  13. pixeltable/exec/__init__.py +1 -2
  14. pixeltable/exec/aggregation_node.py +27 -17
  15. pixeltable/exec/cache_prefetch_node.py +1 -1
  16. pixeltable/exec/data_row_batch.py +9 -26
  17. pixeltable/exec/exec_node.py +36 -7
  18. pixeltable/exec/expr_eval_node.py +19 -11
  19. pixeltable/exec/in_memory_data_node.py +14 -11
  20. pixeltable/exec/sql_node.py +266 -138
  21. pixeltable/exprs/__init__.py +1 -0
  22. pixeltable/exprs/arithmetic_expr.py +3 -1
  23. pixeltable/exprs/array_slice.py +7 -7
  24. pixeltable/exprs/column_property_ref.py +37 -10
  25. pixeltable/exprs/column_ref.py +93 -14
  26. pixeltable/exprs/comparison.py +5 -5
  27. pixeltable/exprs/compound_predicate.py +8 -7
  28. pixeltable/exprs/data_row.py +56 -36
  29. pixeltable/exprs/expr.py +65 -63
  30. pixeltable/exprs/expr_dict.py +55 -0
  31. pixeltable/exprs/expr_set.py +26 -15
  32. pixeltable/exprs/function_call.py +53 -24
  33. pixeltable/exprs/globals.py +4 -1
  34. pixeltable/exprs/in_predicate.py +8 -7
  35. pixeltable/exprs/inline_expr.py +4 -4
  36. pixeltable/exprs/is_null.py +4 -4
  37. pixeltable/exprs/json_mapper.py +11 -12
  38. pixeltable/exprs/json_path.py +5 -10
  39. pixeltable/exprs/literal.py +5 -5
  40. pixeltable/exprs/method_ref.py +5 -4
  41. pixeltable/exprs/object_ref.py +2 -1
  42. pixeltable/exprs/row_builder.py +88 -36
  43. pixeltable/exprs/rowid_ref.py +14 -13
  44. pixeltable/exprs/similarity_expr.py +12 -7
  45. pixeltable/exprs/sql_element_cache.py +12 -6
  46. pixeltable/exprs/type_cast.py +8 -6
  47. pixeltable/exprs/variable.py +5 -4
  48. pixeltable/ext/functions/whisperx.py +7 -2
  49. pixeltable/func/aggregate_function.py +1 -1
  50. pixeltable/func/callable_function.py +2 -2
  51. pixeltable/func/function.py +11 -10
  52. pixeltable/func/function_registry.py +6 -7
  53. pixeltable/func/query_template_function.py +11 -12
  54. pixeltable/func/signature.py +17 -15
  55. pixeltable/func/udf.py +0 -4
  56. pixeltable/functions/__init__.py +2 -2
  57. pixeltable/functions/audio.py +4 -6
  58. pixeltable/functions/globals.py +84 -42
  59. pixeltable/functions/huggingface.py +31 -34
  60. pixeltable/functions/image.py +59 -45
  61. pixeltable/functions/json.py +0 -1
  62. pixeltable/functions/llama_cpp.py +106 -0
  63. pixeltable/functions/mistralai.py +2 -2
  64. pixeltable/functions/ollama.py +147 -0
  65. pixeltable/functions/openai.py +22 -25
  66. pixeltable/functions/replicate.py +72 -0
  67. pixeltable/functions/string.py +59 -50
  68. pixeltable/functions/timestamp.py +20 -20
  69. pixeltable/functions/together.py +2 -2
  70. pixeltable/functions/video.py +11 -20
  71. pixeltable/functions/whisper.py +2 -20
  72. pixeltable/globals.py +65 -74
  73. pixeltable/index/base.py +2 -2
  74. pixeltable/index/btree.py +20 -7
  75. pixeltable/index/embedding_index.py +12 -14
  76. pixeltable/io/__init__.py +1 -2
  77. pixeltable/io/external_store.py +11 -5
  78. pixeltable/io/fiftyone.py +178 -0
  79. pixeltable/io/globals.py +98 -2
  80. pixeltable/io/hf_datasets.py +1 -1
  81. pixeltable/io/label_studio.py +6 -6
  82. pixeltable/io/parquet.py +14 -13
  83. pixeltable/iterators/base.py +3 -2
  84. pixeltable/iterators/document.py +10 -8
  85. pixeltable/iterators/video.py +126 -60
  86. pixeltable/metadata/__init__.py +4 -3
  87. pixeltable/metadata/converters/convert_14.py +4 -2
  88. pixeltable/metadata/converters/convert_15.py +1 -1
  89. pixeltable/metadata/converters/convert_19.py +1 -0
  90. pixeltable/metadata/converters/convert_20.py +1 -1
  91. pixeltable/metadata/converters/convert_21.py +34 -0
  92. pixeltable/metadata/converters/util.py +54 -12
  93. pixeltable/metadata/notes.py +1 -0
  94. pixeltable/metadata/schema.py +40 -21
  95. pixeltable/plan.py +149 -165
  96. pixeltable/py.typed +0 -0
  97. pixeltable/store.py +57 -37
  98. pixeltable/tool/create_test_db_dump.py +6 -6
  99. pixeltable/tool/create_test_video.py +1 -1
  100. pixeltable/tool/doc_plugins/griffe.py +3 -34
  101. pixeltable/tool/embed_udf.py +1 -1
  102. pixeltable/tool/mypy_plugin.py +55 -0
  103. pixeltable/type_system.py +260 -61
  104. pixeltable/utils/arrow.py +10 -9
  105. pixeltable/utils/coco.py +4 -4
  106. pixeltable/utils/documents.py +16 -2
  107. pixeltable/utils/filecache.py +9 -9
  108. pixeltable/utils/formatter.py +10 -11
  109. pixeltable/utils/http_server.py +2 -5
  110. pixeltable/utils/media_store.py +6 -6
  111. pixeltable/utils/pytorch.py +10 -11
  112. pixeltable/utils/sql.py +2 -1
  113. {pixeltable-0.2.20.dist-info → pixeltable-0.2.22.dist-info}/METADATA +50 -13
  114. pixeltable-0.2.22.dist-info/RECORD +153 -0
  115. pixeltable/exec/media_validation_node.py +0 -43
  116. pixeltable/utils/help.py +0 -11
  117. pixeltable-0.2.20.dist-info/RECORD +0 -147
  118. {pixeltable-0.2.20.dist-info → pixeltable-0.2.22.dist-info}/LICENSE +0 -0
  119. {pixeltable-0.2.20.dist-info → pixeltable-0.2.22.dist-info}/WHEEL +0 -0
  120. {pixeltable-0.2.20.dist-info → pixeltable-0.2.22.dist-info}/entry_points.txt +0 -0
pixeltable/type_system.py CHANGED
@@ -3,20 +3,22 @@ from __future__ import annotations
3
3
  import abc
4
4
  import datetime
5
5
  import enum
6
+ import io
6
7
  import json
7
8
  import typing
8
9
  import urllib.parse
9
10
  import urllib.request
10
11
  from pathlib import Path
11
- from typing import Any, Dict, Iterable, List, Mapping, Optional, Sequence, Tuple, Union
12
+ from typing import Any, Iterable, Mapping, Optional, Sequence, Union
12
13
 
14
+ import PIL.Image
13
15
  import av # type: ignore
14
16
  import numpy as np
15
- import PIL.Image
16
17
  import sqlalchemy as sql
18
+ from typing import _GenericAlias # type: ignore[attr-defined]
19
+ from typing_extensions import _AnnotatedAlias
17
20
 
18
21
  import pixeltable.exceptions as excs
19
- from pixeltable.env import Env
20
22
 
21
23
 
22
24
  class ColumnType:
@@ -41,7 +43,7 @@ class ColumnType:
41
43
  def supertype(
42
44
  cls, type1: 'ColumnType.Type', type2: 'ColumnType.Type',
43
45
  # we need to pass this in because we can't easily append it as a class member
44
- common_supertypes: Dict[Tuple['ColumnType.Type', 'ColumnType.Type'], 'ColumnType.Type']
46
+ common_supertypes: dict[tuple['ColumnType.Type', 'ColumnType.Type'], 'ColumnType.Type']
45
47
  ) -> Optional['ColumnType.Type']:
46
48
  if type1 == type2:
47
49
  return type1
@@ -103,16 +105,16 @@ class ColumnType:
103
105
  return self.__class__(nullable=nullable) # type: ignore[call-arg]
104
106
 
105
107
  @classmethod
106
- def serialize_list(cls, type_list: List[ColumnType]) -> str:
108
+ def serialize_list(cls, type_list: list[ColumnType]) -> str:
107
109
  return json.dumps([t.as_dict() for t in type_list])
108
110
 
109
- def as_dict(self) -> Dict:
111
+ def as_dict(self) -> dict:
110
112
  return {
111
113
  '_classname': self.__class__.__name__,
112
114
  **self._as_dict(),
113
115
  }
114
116
 
115
- def _as_dict(self) -> Dict:
117
+ def _as_dict(self) -> dict:
116
118
  return {'nullable': self.nullable}
117
119
 
118
120
  @classmethod
@@ -121,18 +123,18 @@ class ColumnType:
121
123
  return cls.from_dict(type_dict)
122
124
 
123
125
  @classmethod
124
- def deserialize_list(cls, type_list_str: str) -> List[ColumnType]:
126
+ def deserialize_list(cls, type_list_str: str) -> list[ColumnType]:
125
127
  type_dict_list = json.loads(type_list_str)
126
128
  return [cls.from_dict(type_dict) for type_dict in type_dict_list]
127
129
 
128
130
  @classmethod
129
- def from_dict(cls, type_dict: Dict) -> ColumnType:
131
+ def from_dict(cls, type_dict: dict) -> ColumnType:
130
132
  assert '_classname' in type_dict
131
133
  type_class = globals()[type_dict['_classname']]
132
134
  return type_class._from_dict(type_dict)
133
135
 
134
136
  @classmethod
135
- def _from_dict(cls, d: Dict) -> ColumnType:
137
+ def _from_dict(cls, d: dict) -> ColumnType:
136
138
  """
137
139
  Default implementation: simply invoke c'tor
138
140
  """
@@ -164,11 +166,28 @@ class ColumnType:
164
166
  return DocumentType()
165
167
 
166
168
  def __str__(self) -> str:
167
- return self._type.name.lower()
169
+ return self._to_str(as_schema=False)
170
+
171
+ def _to_str(self, as_schema: bool) -> str:
172
+ base_str = self._to_base_str()
173
+ if as_schema:
174
+ return base_str if self.nullable else f'Required[{base_str}]'
175
+ else:
176
+ return f'Optional[{base_str}]' if self.nullable else base_str
177
+
178
+ def _to_base_str(self) -> str:
179
+ """
180
+ String representation of this type, disregarding nullability. Default implementation is to camel-case
181
+ the type name; subclasses may override.
182
+ """
183
+ return self._type.name[0] + self._type.name[1:].lower()
168
184
 
169
185
  def __eq__(self, other: object) -> bool:
170
186
  return isinstance(other, ColumnType) and self.matches(other) and self.nullable == other.nullable
171
187
 
188
+ def __hash__(self) -> int:
189
+ return hash((self._type, self.nullable))
190
+
172
191
  def is_supertype_of(self, other: ColumnType, ignore_nullable: bool = False) -> bool:
173
192
  if ignore_nullable:
174
193
  supertype = self.supertype(other)
@@ -253,39 +272,63 @@ class ColumnType:
253
272
  return inferred_type
254
273
 
255
274
  @classmethod
256
- def from_python_type(cls, t: type) -> Optional[ColumnType]:
275
+ def from_python_type(cls, t: Union[type, _GenericAlias], nullable_default: bool = False) -> Optional[ColumnType]:
257
276
  if typing.get_origin(t) is typing.Union:
258
277
  union_args = typing.get_args(t)
259
- if union_args[1] is type(None):
260
- # `t` is a type of the form Optional[T] (equivalently, Union[T, None]).
278
+ if len(union_args) == 2 and type(None) in union_args:
279
+ # `t` is a type of the form Optional[T] (equivalently, Union[T, None] or Union[None, T]).
261
280
  # We treat it as the underlying type but with nullable=True.
262
- underlying = cls.from_python_type(union_args[0])
281
+ underlying_py_type = union_args[0] if union_args[1] is type(None) else union_args[1]
282
+ underlying = cls.from_python_type(underlying_py_type)
263
283
  if underlying is not None:
264
- underlying._nullable = True
265
- return underlying
284
+ return underlying.copy(nullable=True)
285
+ elif typing.get_origin(t) is typing.Annotated:
286
+ annotated_args = typing.get_args(t)
287
+ origin = annotated_args[0]
288
+ parameters = annotated_args[1]
289
+ if isinstance(parameters, ColumnType):
290
+ return parameters.copy(nullable=nullable_default)
291
+ elif typing.get_origin(t) is Required:
292
+ required_args = typing.get_args(t)
293
+ assert len(required_args) == 1
294
+ return cls.from_python_type(required_args[0], nullable_default=False)
266
295
  else:
267
296
  # Discard type parameters to ensure that parameterized types such as `list[T]`
268
297
  # are correctly mapped to Pixeltable types.
269
- base = typing.get_origin(t)
270
- if base is None:
271
- # No type parameters; the base type is just `t` itself
272
- base = t
273
- if base is str:
274
- return StringType()
275
- if base is int:
276
- return IntType()
277
- if base is float:
278
- return FloatType()
279
- if base is bool:
280
- return BoolType()
281
- if base is datetime.datetime:
282
- return TimestampType()
283
- if issubclass(base, Sequence) or issubclass(base, Mapping):
284
- return JsonType()
285
- if issubclass(base, PIL.Image.Image):
286
- return ImageType()
298
+ origin = typing.get_origin(t)
299
+ if origin is None:
300
+ # No type parameters; the origin type is just `t` itself
301
+ origin = t
302
+ if issubclass(origin, _PxtType):
303
+ return origin.as_col_type(nullable=nullable_default)
304
+ if origin is str:
305
+ return StringType(nullable=nullable_default)
306
+ if origin is int:
307
+ return IntType(nullable=nullable_default)
308
+ if origin is float:
309
+ return FloatType(nullable=nullable_default)
310
+ if origin is bool:
311
+ return BoolType(nullable=nullable_default)
312
+ if origin is datetime.datetime:
313
+ return TimestampType(nullable=nullable_default)
314
+ if origin is PIL.Image.Image:
315
+ return ImageType(nullable=nullable_default)
316
+ if issubclass(origin, Sequence) or issubclass(origin, Mapping):
317
+ return JsonType(nullable=nullable_default)
287
318
  return None
288
319
 
320
@classmethod
def normalize_type(cls, t: Union[ColumnType, type, _AnnotatedAlias], nullable_default: bool = False) -> ColumnType:
    """
    Resolve `t` to a `ColumnType`.

    A `ColumnType` instance is returned unchanged. Anything else is passed to `from_python_type()` together
    with `nullable_default`; if that yields no result, an `excs.Error` is raised.
    """
    if not isinstance(t, ColumnType):
        resolved = cls.from_python_type(t, nullable_default)
        if resolved is None:
            raise excs.Error(f'Unknown type: {t}')
        return resolved
    return t
331
+
289
332
  def validate_literal(self, val: Any) -> None:
290
333
  """Raise TypeError if val is not a valid literal for this type"""
291
334
  if val is None:
@@ -491,7 +534,7 @@ class TimestampType(ColumnType):
491
534
 
492
535
  class JsonType(ColumnType):
493
536
  # TODO: type_spec also needs to be able to express lists
494
- def __init__(self, type_spec: Optional[Dict[str, ColumnType]] = None, nullable: bool = False):
537
+ def __init__(self, type_spec: Optional[dict[str, ColumnType]] = None, nullable: bool = False):
495
538
  super().__init__(self.Type.JSON, nullable=nullable)
496
539
  self.type_spec = type_spec
497
540
 
@@ -526,7 +569,7 @@ class JsonType(ColumnType):
526
569
  type_spec[other_field_name] = field_type
527
570
  return JsonType(type_spec, nullable=(self.nullable or other.nullable))
528
571
 
529
- def _as_dict(self) -> Dict:
572
+ def _as_dict(self) -> dict:
530
573
  result = super()._as_dict()
531
574
  if self.type_spec is not None:
532
575
  type_spec_dict = {field_name: field_type.serialize() for field_name, field_type in self.type_spec.items()}
@@ -534,7 +577,7 @@ class JsonType(ColumnType):
534
577
  return result
535
578
 
536
579
  @classmethod
537
- def _from_dict(cls, d: Dict) -> ColumnType:
580
+ def _from_dict(cls, d: dict) -> ColumnType:
538
581
  type_spec = None
539
582
  if 'type_spec' in d:
540
583
  type_spec = {
@@ -590,6 +633,9 @@ class ArrayType(ColumnType):
590
633
  def matches(self, other: ColumnType) -> bool:
591
634
  return isinstance(other, ArrayType) and self.shape == other.shape and self.dtype == other.dtype
592
635
 
636
+ def __hash__(self) -> int:
637
+ return hash((self._type, self.nullable, self.shape, self.dtype))
638
+
593
639
  def supertype(self, other: ColumnType) -> Optional[ArrayType]:
594
640
  if not isinstance(other, ArrayType):
595
641
  return None
@@ -601,16 +647,16 @@ class ArrayType(ColumnType):
601
647
  shape = [n1 if n1 == n2 else None for n1, n2 in zip(self.shape, other.shape)]
602
648
  return ArrayType(tuple(shape), self.make_type(base_type), nullable=(self.nullable or other.nullable))
603
649
 
604
- def _as_dict(self) -> Dict:
650
+ def _as_dict(self) -> dict:
605
651
  result = super()._as_dict()
606
652
  result.update(shape=list(self.shape), dtype=self.dtype.value)
607
653
  return result
608
654
 
609
- def __str__(self) -> str:
610
- return f'{self._type.name.lower()}({self.shape}, dtype={self.dtype.name})'
655
+ def _to_base_str(self) -> str:
656
+ return f'Array[{self.shape}, {self.pxt_dtype}]'
611
657
 
612
658
  @classmethod
613
- def _from_dict(cls, d: Dict) -> ColumnType:
659
+ def _from_dict(cls, d: dict) -> ColumnType:
614
660
  assert 'shape' in d
615
661
  assert 'dtype' in d
616
662
  shape = tuple(d['shape'])
@@ -681,7 +727,7 @@ class ArrayType(ColumnType):
681
727
 
682
728
  class ImageType(ColumnType):
683
729
  def __init__(
684
- self, width: Optional[int] = None, height: Optional[int] = None, size: Optional[Tuple[int, int]] = None,
730
+ self, width: Optional[int] = None, height: Optional[int] = None, size: Optional[tuple[int, int]] = None,
685
731
  mode: Optional[str] = None, nullable: bool = False
686
732
  ):
687
733
  """
@@ -701,23 +747,17 @@ class ImageType(ColumnType):
701
747
  def copy(self, nullable: bool) -> ColumnType:
702
748
  return ImageType(self.width, self.height, mode=self.mode, nullable=nullable)
703
749
 
704
- def __str__(self) -> str:
705
- if self.width is not None or self.height is not None or self.mode is not None:
750
+ def _to_base_str(self) -> str:
751
+ params = []
752
+ if self.width is not None or self.height is not None:
753
+ params.append(f'({self.width}, {self.height})')
754
+ if self.mode is not None:
755
+ params.append(repr(self.mode))
756
+ if len(params) == 0:
706
757
  params_str = ''
707
- if self.width is not None:
708
- params_str = f'width={self.width}'
709
- if self.height is not None:
710
- if len(params_str) > 0:
711
- params_str += ', '
712
- params_str += f'height={self.height}'
713
- if self.mode is not None:
714
- if len(params_str) > 0:
715
- params_str += ', '
716
- params_str += f'mode={self.mode}'
717
- params_str = f'({params_str})'
718
758
  else:
719
- params_str = ''
720
- return f'{self._type.name.lower()}{params_str}'
759
+ params_str = f'[{", ".join(params)}]'
760
+ return f'Image{params_str}'
721
761
 
722
762
  def matches(self, other: ColumnType) -> bool:
723
763
  return (
@@ -727,6 +767,9 @@ class ImageType(ColumnType):
727
767
  and self.mode == other.mode
728
768
  )
729
769
 
770
+ def __hash__(self) -> int:
771
+ return hash((self._type, self.nullable, self.size, self.mode))
772
+
730
773
  def supertype(self, other: ColumnType) -> Optional[ImageType]:
731
774
  if not isinstance(other, ImageType):
732
775
  return None
@@ -736,18 +779,18 @@ class ImageType(ColumnType):
736
779
  return ImageType(width=width, height=height, mode=mode, nullable=(self.nullable or other.nullable))
737
780
 
738
781
  @property
739
- def size(self) -> Optional[Tuple[int, int]]:
782
+ def size(self) -> Optional[tuple[int, int]]:
740
783
  if self.width is None or self.height is None:
741
784
  return None
742
785
  return (self.width, self.height)
743
786
 
744
- def _as_dict(self) -> Dict:
787
+ def _as_dict(self) -> dict:
745
788
  result = super()._as_dict()
746
789
  result.update(width=self.width, height=self.height, mode=self.mode)
747
790
  return result
748
791
 
749
792
  @classmethod
750
- def _from_dict(cls, d: Dict) -> ColumnType:
793
+ def _from_dict(cls, d: dict) -> ColumnType:
751
794
  assert 'width' in d
752
795
  assert 'height' in d
753
796
  assert 'mode' in d
@@ -756,6 +799,20 @@ class ImageType(ColumnType):
756
799
  def to_sa_type(self) -> sql.types.TypeEngine:
757
800
  return sql.String()
758
801
 
802
+ def _create_literal(self, val: Any) -> Any:
803
+ if isinstance(val, str) and val.startswith('data:'):
804
+ # try parsing this as a `data:` URL, and if successful, decode the image immediately
805
+ try:
806
+ with urllib.request.urlopen(val) as response:
807
+ b = response.read()
808
+ img = PIL.Image.open(io.BytesIO(b))
809
+ img.load()
810
+ return img
811
+ except Exception as exc:
812
+ errormsg_val = val if len(val) < 50 else val[:50] + '...'
813
+ raise excs.Error(f'data URL could not be decoded into a valid image: {errormsg_val}') from exc
814
+ return val
815
+
759
816
  def _validate_literal(self, val: Any) -> None:
760
817
  if isinstance(val, PIL.Image.Image):
761
818
  return
@@ -834,6 +891,7 @@ class DocumentType(ColumnType):
834
891
  HTML = 0
835
892
  MD = 1
836
893
  PDF = 2
894
+ XML = 3
837
895
 
838
896
  def __init__(self, nullable: bool = False, doc_formats: Optional[str] = None):
839
897
  super().__init__(self.Type.DOCUMENT, nullable=nullable)
@@ -853,6 +911,9 @@ class DocumentType(ColumnType):
853
911
  def matches(self, other: ColumnType) -> bool:
854
912
  return isinstance(other, DocumentType) and self._doc_formats == other._doc_formats
855
913
 
914
+ def __hash__(self) -> int:
915
+ return hash((self._type, self.nullable, self._doc_formats))
916
+
856
917
  def to_sa_type(self) -> sql.types.TypeEngine:
857
918
  # stored as a file path
858
919
  return sql.String()
@@ -866,3 +927,141 @@ class DocumentType(ColumnType):
866
927
  dh = get_document_handle(val)
867
928
  if dh is None:
868
929
  raise excs.Error(f'Not a recognized document format: {val}')
930
+
931
+
932
+ T = typing.TypeVar('T')
933
+
934
+
935
class Required(typing.Generic[T]):
    """
    Schema-declaration marker: `Required[X]` declares a column of type `X` that is non-nullable.

    It is meant only for schema definitions and carries no meaning as a type hint.
    """
941
+
942
+
943
+ String = typing.Annotated[str, StringType(nullable=False)]
944
+ Int = typing.Annotated[int, IntType(nullable=False)]
945
+ Float = typing.Annotated[float, FloatType(nullable=False)]
946
+ Bool = typing.Annotated[bool, BoolType(nullable=False)]
947
+ Timestamp = typing.Annotated[datetime.datetime, TimestampType(nullable=False)]
948
+
949
+
950
+ class _PxtType:
951
+ """
952
+ Base class for the Pixeltable type-hint family. Subclasses of this class are meant to be used as type hints, both
953
+ in schema definitions and in UDF signatures. Whereas `ColumnType`s are instantiable and carry semantic information
954
+ about the Pixeltable type system, `_PxtType` subclasses are purely for convenience: they are not instantiable and
955
+ must be resolved to a `ColumnType` (by calling `ColumnType.from_python_type()`) in order to do anything meaningful
956
+ with them.
957
+
958
+ `_PxtType` subclasses can be specialized (as type hints) with type parameters; for example:
959
+ `Image[(300, 300), 'RGB']`. The specialized forms resolve to `typing.Annotated` instances whose annotation is a
960
+ `ColumnType`.
961
+ """
962
+ def __init__(self):
963
+ raise TypeError(f'Type `{type(self)}` cannot be instantiated.')
964
+
965
+ @classmethod
966
+ def as_col_type(cls, nullable: bool) -> ColumnType:
967
+ raise NotImplementedError()
968
+
969
+
970
class Json(_PxtType):
    """Type hint for JSON columns; resolves to a `JsonType`."""

    @classmethod
    def as_col_type(cls, nullable: bool) -> ColumnType:
        """Return a `JsonType` with the given nullability."""
        col_type = JsonType(nullable=nullable)
        return col_type
974
+
975
+
976
class Array(np.ndarray, _PxtType):
    """
    Type hint for array columns. It must be specialized with a shape and a dtype, e.g.
    `Array[(3, None, 2), float]`; the specialized form is a `typing.Annotated` alias carrying a non-nullable
    `ArrayType`.
    """

    def __class_getitem__(cls, item: Any) -> _AnnotatedAlias:
        """
        `item` (the type subscript) must be a tuple with exactly two elements (in any order):
        - A tuple of `Optional[int]`s, specifying the shape of the array (each int must be >= 1)
        - A type, specifying the dtype of the array
        Example: Array[(3, None, 2), float]

        Raises:
            TypeError: if a parameter is malformed, given twice, or missing, or if the dtype cannot be mapped
                to a Pixeltable type.
        """
        params = item if isinstance(item, tuple) else (item,)
        shape: Optional[tuple] = None
        dtype: Optional[ColumnType] = None
        for param in params:
            if isinstance(param, tuple):
                if not all(n is None or (isinstance(n, int) and n >= 1) for n in param):
                    raise TypeError(f'Invalid Array type parameter: {param}')
                if shape is not None:
                    raise TypeError(f'Duplicate Array type parameter: {param}')
                shape = param
            elif isinstance(param, (type, _AnnotatedAlias)):
                if dtype is not None:
                    raise TypeError(f'Duplicate Array type parameter: {param}')
                dtype = ColumnType.from_python_type(param)
                if dtype is None:
                    # from_python_type() returns None for a type it cannot map. Reject it here, naming the
                    # offending parameter, instead of later reporting the dtype as missing.
                    raise TypeError(f'Invalid Array type parameter: {param}')
            else:
                raise TypeError(f'Invalid Array type parameter: {param}')
        if shape is None:
            raise TypeError('Array type is missing parameter: shape')
        if dtype is None:
            raise TypeError('Array type is missing parameter: dtype')
        return typing.Annotated[np.ndarray, ArrayType(shape=shape, dtype=dtype, nullable=False)]

    @classmethod
    def as_col_type(cls, nullable: bool) -> ColumnType:
        """Always raises TypeError: an array hint is only meaningful with a shape and a dtype."""
        raise TypeError('Array type cannot be used without specifying shape and dtype')
1009
+
1010
+
1011
class Image(PIL.Image.Image, _PxtType):
    """
    Type hint for image columns. Used bare, it resolves to an `ImageType` with no size or mode; subscripted
    (e.g. `Image[(300, 300), 'RGB']`), it yields a `typing.Annotated` alias carrying a non-nullable `ImageType`.
    """

    def __class_getitem__(cls, item: Any) -> _AnnotatedAlias:
        """
        Specialize the hint with a size and/or a mode.

        `item` (the type subscript) is a size, a mode, or a tuple holding one of each in either order:
        - size: a 2-tuple whose elements are each an `int` or `None`
        - mode: a string that is one of `PIL.Image.MODES`
        Example: Image[(300, 300), 'RGB']

        Raises TypeError for a malformed, unrecognized, or repeated parameter.
        """
        # A tuple made up solely of ints/Nones is itself a size (e.g. `Image[300, 300]`); any other tuple is
        # a list of parameters. Anything that is not a tuple is a single parameter.
        is_param_list = isinstance(item, tuple) and not all(n is None or isinstance(n, int) for n in item)
        params = item if is_param_list else (item,)

        size: Optional[tuple] = None
        mode: Optional[str] = None
        dim_types = (int, type(None))
        for param in params:
            if isinstance(param, str):
                if param not in PIL.Image.MODES:
                    raise TypeError(f'Invalid Image type parameter: {param!r}')
                if mode is not None:
                    raise TypeError(f'Duplicate Image type parameter: {param!r}')
                mode = param
            elif isinstance(param, tuple):
                well_formed = len(param) == 2 and all(isinstance(dim, dim_types) for dim in param)
                if not well_formed:
                    raise TypeError(f'Invalid Image type parameter: {param}')
                if size is not None:
                    raise TypeError(f'Duplicate Image type parameter: {param}')
                size = param
            else:
                raise TypeError(f'Invalid Image type parameter: {param}')
        col_type = ImageType(size=size, mode=mode, nullable=False)
        return typing.Annotated[PIL.Image.Image, col_type]

    @classmethod
    def as_col_type(cls, nullable: bool) -> ColumnType:
        """Return an `ImageType` with the given nullability and no size or mode."""
        return ImageType(nullable=nullable)
1050
+
1051
+
1052
class Video(str, _PxtType):
    """Type hint for video columns; resolves to a `VideoType`."""

    @classmethod
    def as_col_type(cls, nullable: bool) -> ColumnType:
        """Return a `VideoType` with the given nullability."""
        col_type = VideoType(nullable=nullable)
        return col_type
1056
+
1057
+
1058
class Audio(str, _PxtType):
    """Type hint for audio columns; resolves to an `AudioType`."""

    @classmethod
    def as_col_type(cls, nullable: bool) -> ColumnType:
        """Return an `AudioType` with the given nullability."""
        col_type = AudioType(nullable=nullable)
        return col_type
1062
+
1063
+
1064
class Document(str, _PxtType):
    """Type hint for document columns; resolves to a `DocumentType`."""

    @classmethod
    def as_col_type(cls, nullable: bool) -> ColumnType:
        """Return a `DocumentType` with the given nullability."""
        col_type = DocumentType(nullable=nullable)
        return col_type
pixeltable/utils/arrow.py CHANGED
@@ -1,13 +1,14 @@
1
1
  import logging
2
- from typing import Any, Dict, Iterable, Iterator, Optional
2
+ from typing import Any, Iterator, Optional, Union
3
3
 
4
+ import numpy as np
4
5
  import pyarrow as pa
5
6
 
6
7
  import pixeltable.type_system as ts
7
8
 
8
9
  _logger = logging.getLogger(__name__)
9
10
 
10
- _pa_to_pt: Dict[pa.DataType, ts.ColumnType] = {
11
+ _pa_to_pt: dict[pa.DataType, ts.ColumnType] = {
11
12
  pa.string(): ts.StringType(nullable=True),
12
13
  pa.timestamp('us'): ts.TimestampType(nullable=True),
13
14
  pa.bool_(): ts.BoolType(nullable=True),
@@ -20,7 +21,7 @@ _pa_to_pt: Dict[pa.DataType, ts.ColumnType] = {
20
21
  pa.float32(): ts.FloatType(nullable=True),
21
22
  }
22
23
 
23
- _pt_to_pa: Dict[ts.ColumnType, pa.DataType] = {
24
+ _pt_to_pa: dict[type[ts.ColumnType], pa.DataType] = {
24
25
  ts.StringType: pa.string(),
25
26
  ts.TimestampType: pa.timestamp('us'), # postgres timestamp is microseconds
26
27
  ts.BoolType: pa.bool_(),
@@ -61,19 +62,19 @@ def to_arrow_type(pixeltable_type: ts.ColumnType) -> Optional[pa.DataType]:
61
62
  return None
62
63
 
63
64
 
64
- def to_pixeltable_schema(arrow_schema: pa.Schema) -> Dict[str, ts.ColumnType]:
65
+ def to_pixeltable_schema(arrow_schema: pa.Schema) -> dict[str, ts.ColumnType]:
65
66
  return {field.name: to_pixeltable_type(field.type) for field in arrow_schema}
66
67
 
67
68
 
68
- def to_arrow_schema(pixeltable_schema: Dict[str, Any]) -> pa.Schema:
69
- return pa.schema((name, to_arrow_type(typ)) for name, typ in pixeltable_schema.items())
69
+ def to_arrow_schema(pixeltable_schema: dict[str, Any]) -> pa.Schema:
70
+ return pa.schema((name, to_arrow_type(typ)) for name, typ in pixeltable_schema.items()) # type: ignore[misc]
70
71
 
71
72
 
72
- def to_pydict(batch: pa.RecordBatch) -> Dict[str, Iterable[Any]]:
73
+ def to_pydict(batch: pa.RecordBatch) -> dict[str, Union[list, np.ndarray]]:
73
74
  """Convert a RecordBatch to a dictionary of lists, unlike pa.lib.RecordBatch.to_pydict,
74
75
  this function will not convert numpy arrays to lists, and will preserve the original numpy dtype.
75
76
  """
76
- out = {}
77
+ out: dict[str, Union[list, np.ndarray]] = {}
77
78
  for k, name in enumerate(batch.schema.names):
78
79
  col = batch.column(k)
79
80
  if isinstance(col.type, pa.FixedShapeTensorType):
@@ -86,7 +87,7 @@ def to_pydict(batch: pa.RecordBatch) -> Dict[str, Iterable[Any]]:
86
87
  return out
87
88
 
88
89
 
89
- def iter_tuples(batch: pa.RecordBatch) -> Iterator[Dict[str, Any]]:
90
+ def iter_tuples(batch: pa.RecordBatch) -> Iterator[dict[str, Any]]:
90
91
  """Convert a RecordBatch to an iterator of dictionaries. also works with pa.Table and pa.RowGroup"""
91
92
  pydict = to_pydict(batch)
92
93
  assert len(pydict) > 0, 'empty record batch'
pixeltable/utils/coco.py CHANGED
@@ -1,12 +1,12 @@
1
- from typing import List, Dict, Any, Set
2
- from pathlib import Path
3
1
  import json
2
+ from pathlib import Path
3
+ from typing import Any, Dict, List, Set
4
4
 
5
5
  import PIL
6
6
 
7
+ import pixeltable as pxt
7
8
  import pixeltable.exceptions as excs
8
9
 
9
-
10
10
  format_msg = """
11
11
 
12
12
  Required format:
@@ -48,7 +48,7 @@ def _verify_input_dict(input_dict: Dict[str, Any]) -> None:
48
48
  if not isinstance(annotation['category'], (str, int)):
49
49
  raise excs.Error(f'Value for "category" is not a str or int: {annotation}{format_msg}')
50
50
 
51
- def write_coco_dataset(df: 'pixeltable.DataFrame', dest_path: Path) -> Path:
51
+ def write_coco_dataset(df: pxt.DataFrame, dest_path: Path) -> Path:
52
52
  """Export a DataFrame result set as a COCO dataset in dest_path and return the path of the data.json file."""
53
53
  # TODO: validate schema
54
54
  if len(df._select_list_exprs) != 1 or not df._select_list_exprs[0].col_type.is_json_type():
@@ -2,7 +2,7 @@ import dataclasses
2
2
  from typing import Optional
3
3
 
4
4
  import bs4
5
- import fitz # (pymupdf)
5
+ import fitz # type: ignore[import-untyped]
6
6
  import puremagic
7
7
 
8
8
  import pixeltable.type_system as ts
@@ -35,6 +35,11 @@ def get_document_handle(path: str) -> Optional[DocumentHandle]:
35
35
  if md_ast is not None:
36
36
  return DocumentHandle(format=ts.DocumentType.DocumentFormat.MD, md_ast=md_ast)
37
37
 
38
+ if doc_format == '.xml':
39
+ bs_doc = get_xml_handle(path)
40
+ if bs_doc is not None:
41
+ return DocumentHandle(format=ts.DocumentType.DocumentFormat.XML, bs_doc=bs_doc)
42
+
38
43
  return None
39
44
 
40
45
 
@@ -54,7 +59,16 @@ def get_pdf_handle(path: str) -> Optional[fitz.Document]:
54
59
  def get_html_handle(path: str) -> Optional[bs4.BeautifulSoup]:
55
60
  try:
56
61
  with open(path, 'r', encoding='utf8') as fp:
57
- doc = bs4.BeautifulSoup(fp, 'html.parser')
62
+ doc = bs4.BeautifulSoup(fp, 'lxml')
63
+ return doc if doc.find() is not None else None
64
+ except Exception:
65
+ return None
66
+
67
+
68
def get_xml_handle(path: str) -> Optional[bs4.BeautifulSoup]:
    """
    Try to parse the file at `path` as XML, using BeautifulSoup's 'xml' parser.

    Returns the parsed document if it contains at least one element. Returns None if it contains none, or
    if reading or parsing the file raises (best-effort format detection).
    """
    try:
        with open(path, 'r', encoding='utf8') as xml_file:
            soup = bs4.BeautifulSoup(xml_file, 'xml')
        if soup.find() is None:
            return None
        return soup
    except Exception:
        return None