pixeltable 0.2.26__py3-none-any.whl → 0.5.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (245)
  1. pixeltable/__init__.py +83 -19
  2. pixeltable/_query.py +1444 -0
  3. pixeltable/_version.py +1 -0
  4. pixeltable/catalog/__init__.py +7 -4
  5. pixeltable/catalog/catalog.py +2394 -119
  6. pixeltable/catalog/column.py +225 -104
  7. pixeltable/catalog/dir.py +38 -9
  8. pixeltable/catalog/globals.py +53 -34
  9. pixeltable/catalog/insertable_table.py +265 -115
  10. pixeltable/catalog/path.py +80 -17
  11. pixeltable/catalog/schema_object.py +28 -43
  12. pixeltable/catalog/table.py +1270 -677
  13. pixeltable/catalog/table_metadata.py +103 -0
  14. pixeltable/catalog/table_version.py +1270 -751
  15. pixeltable/catalog/table_version_handle.py +109 -0
  16. pixeltable/catalog/table_version_path.py +137 -42
  17. pixeltable/catalog/tbl_ops.py +53 -0
  18. pixeltable/catalog/update_status.py +191 -0
  19. pixeltable/catalog/view.py +251 -134
  20. pixeltable/config.py +215 -0
  21. pixeltable/env.py +736 -285
  22. pixeltable/exceptions.py +26 -2
  23. pixeltable/exec/__init__.py +7 -2
  24. pixeltable/exec/aggregation_node.py +39 -21
  25. pixeltable/exec/cache_prefetch_node.py +87 -109
  26. pixeltable/exec/cell_materialization_node.py +268 -0
  27. pixeltable/exec/cell_reconstruction_node.py +168 -0
  28. pixeltable/exec/component_iteration_node.py +25 -28
  29. pixeltable/exec/data_row_batch.py +11 -46
  30. pixeltable/exec/exec_context.py +26 -11
  31. pixeltable/exec/exec_node.py +35 -27
  32. pixeltable/exec/expr_eval/__init__.py +3 -0
  33. pixeltable/exec/expr_eval/evaluators.py +365 -0
  34. pixeltable/exec/expr_eval/expr_eval_node.py +413 -0
  35. pixeltable/exec/expr_eval/globals.py +200 -0
  36. pixeltable/exec/expr_eval/row_buffer.py +74 -0
  37. pixeltable/exec/expr_eval/schedulers.py +413 -0
  38. pixeltable/exec/globals.py +35 -0
  39. pixeltable/exec/in_memory_data_node.py +35 -27
  40. pixeltable/exec/object_store_save_node.py +293 -0
  41. pixeltable/exec/row_update_node.py +44 -29
  42. pixeltable/exec/sql_node.py +414 -115
  43. pixeltable/exprs/__init__.py +8 -5
  44. pixeltable/exprs/arithmetic_expr.py +79 -45
  45. pixeltable/exprs/array_slice.py +5 -5
  46. pixeltable/exprs/column_property_ref.py +40 -26
  47. pixeltable/exprs/column_ref.py +254 -61
  48. pixeltable/exprs/comparison.py +14 -9
  49. pixeltable/exprs/compound_predicate.py +9 -10
  50. pixeltable/exprs/data_row.py +213 -72
  51. pixeltable/exprs/expr.py +270 -104
  52. pixeltable/exprs/expr_dict.py +6 -5
  53. pixeltable/exprs/expr_set.py +20 -11
  54. pixeltable/exprs/function_call.py +383 -284
  55. pixeltable/exprs/globals.py +18 -5
  56. pixeltable/exprs/in_predicate.py +7 -7
  57. pixeltable/exprs/inline_expr.py +37 -37
  58. pixeltable/exprs/is_null.py +8 -4
  59. pixeltable/exprs/json_mapper.py +120 -54
  60. pixeltable/exprs/json_path.py +90 -60
  61. pixeltable/exprs/literal.py +61 -16
  62. pixeltable/exprs/method_ref.py +7 -6
  63. pixeltable/exprs/object_ref.py +19 -8
  64. pixeltable/exprs/row_builder.py +238 -75
  65. pixeltable/exprs/rowid_ref.py +53 -15
  66. pixeltable/exprs/similarity_expr.py +65 -50
  67. pixeltable/exprs/sql_element_cache.py +5 -5
  68. pixeltable/exprs/string_op.py +107 -0
  69. pixeltable/exprs/type_cast.py +25 -13
  70. pixeltable/exprs/variable.py +2 -2
  71. pixeltable/func/__init__.py +9 -5
  72. pixeltable/func/aggregate_function.py +197 -92
  73. pixeltable/func/callable_function.py +119 -35
  74. pixeltable/func/expr_template_function.py +101 -48
  75. pixeltable/func/function.py +375 -62
  76. pixeltable/func/function_registry.py +20 -19
  77. pixeltable/func/globals.py +6 -5
  78. pixeltable/func/mcp.py +74 -0
  79. pixeltable/func/query_template_function.py +151 -35
  80. pixeltable/func/signature.py +178 -49
  81. pixeltable/func/tools.py +164 -0
  82. pixeltable/func/udf.py +176 -53
  83. pixeltable/functions/__init__.py +44 -4
  84. pixeltable/functions/anthropic.py +226 -47
  85. pixeltable/functions/audio.py +148 -11
  86. pixeltable/functions/bedrock.py +137 -0
  87. pixeltable/functions/date.py +188 -0
  88. pixeltable/functions/deepseek.py +113 -0
  89. pixeltable/functions/document.py +81 -0
  90. pixeltable/functions/fal.py +76 -0
  91. pixeltable/functions/fireworks.py +72 -20
  92. pixeltable/functions/gemini.py +249 -0
  93. pixeltable/functions/globals.py +208 -53
  94. pixeltable/functions/groq.py +108 -0
  95. pixeltable/functions/huggingface.py +1088 -95
  96. pixeltable/functions/image.py +155 -84
  97. pixeltable/functions/json.py +8 -11
  98. pixeltable/functions/llama_cpp.py +31 -19
  99. pixeltable/functions/math.py +169 -0
  100. pixeltable/functions/mistralai.py +50 -75
  101. pixeltable/functions/net.py +70 -0
  102. pixeltable/functions/ollama.py +29 -36
  103. pixeltable/functions/openai.py +548 -160
  104. pixeltable/functions/openrouter.py +143 -0
  105. pixeltable/functions/replicate.py +15 -14
  106. pixeltable/functions/reve.py +250 -0
  107. pixeltable/functions/string.py +310 -85
  108. pixeltable/functions/timestamp.py +37 -19
  109. pixeltable/functions/together.py +77 -120
  110. pixeltable/functions/twelvelabs.py +188 -0
  111. pixeltable/functions/util.py +7 -2
  112. pixeltable/functions/uuid.py +30 -0
  113. pixeltable/functions/video.py +1528 -117
  114. pixeltable/functions/vision.py +26 -26
  115. pixeltable/functions/voyageai.py +289 -0
  116. pixeltable/functions/whisper.py +19 -10
  117. pixeltable/functions/whisperx.py +179 -0
  118. pixeltable/functions/yolox.py +112 -0
  119. pixeltable/globals.py +716 -236
  120. pixeltable/index/__init__.py +3 -1
  121. pixeltable/index/base.py +17 -21
  122. pixeltable/index/btree.py +32 -22
  123. pixeltable/index/embedding_index.py +155 -92
  124. pixeltable/io/__init__.py +12 -7
  125. pixeltable/io/datarows.py +140 -0
  126. pixeltable/io/external_store.py +83 -125
  127. pixeltable/io/fiftyone.py +24 -33
  128. pixeltable/io/globals.py +47 -182
  129. pixeltable/io/hf_datasets.py +96 -127
  130. pixeltable/io/label_studio.py +171 -156
  131. pixeltable/io/lancedb.py +3 -0
  132. pixeltable/io/pandas.py +136 -115
  133. pixeltable/io/parquet.py +40 -153
  134. pixeltable/io/table_data_conduit.py +702 -0
  135. pixeltable/io/utils.py +100 -0
  136. pixeltable/iterators/__init__.py +8 -4
  137. pixeltable/iterators/audio.py +207 -0
  138. pixeltable/iterators/base.py +9 -3
  139. pixeltable/iterators/document.py +144 -87
  140. pixeltable/iterators/image.py +17 -38
  141. pixeltable/iterators/string.py +15 -12
  142. pixeltable/iterators/video.py +523 -127
  143. pixeltable/metadata/__init__.py +33 -8
  144. pixeltable/metadata/converters/convert_10.py +2 -3
  145. pixeltable/metadata/converters/convert_13.py +2 -2
  146. pixeltable/metadata/converters/convert_15.py +15 -11
  147. pixeltable/metadata/converters/convert_16.py +4 -5
  148. pixeltable/metadata/converters/convert_17.py +4 -5
  149. pixeltable/metadata/converters/convert_18.py +4 -6
  150. pixeltable/metadata/converters/convert_19.py +6 -9
  151. pixeltable/metadata/converters/convert_20.py +3 -6
  152. pixeltable/metadata/converters/convert_21.py +6 -8
  153. pixeltable/metadata/converters/convert_22.py +3 -2
  154. pixeltable/metadata/converters/convert_23.py +33 -0
  155. pixeltable/metadata/converters/convert_24.py +55 -0
  156. pixeltable/metadata/converters/convert_25.py +19 -0
  157. pixeltable/metadata/converters/convert_26.py +23 -0
  158. pixeltable/metadata/converters/convert_27.py +29 -0
  159. pixeltable/metadata/converters/convert_28.py +13 -0
  160. pixeltable/metadata/converters/convert_29.py +110 -0
  161. pixeltable/metadata/converters/convert_30.py +63 -0
  162. pixeltable/metadata/converters/convert_31.py +11 -0
  163. pixeltable/metadata/converters/convert_32.py +15 -0
  164. pixeltable/metadata/converters/convert_33.py +17 -0
  165. pixeltable/metadata/converters/convert_34.py +21 -0
  166. pixeltable/metadata/converters/convert_35.py +9 -0
  167. pixeltable/metadata/converters/convert_36.py +38 -0
  168. pixeltable/metadata/converters/convert_37.py +15 -0
  169. pixeltable/metadata/converters/convert_38.py +39 -0
  170. pixeltable/metadata/converters/convert_39.py +124 -0
  171. pixeltable/metadata/converters/convert_40.py +73 -0
  172. pixeltable/metadata/converters/convert_41.py +12 -0
  173. pixeltable/metadata/converters/convert_42.py +9 -0
  174. pixeltable/metadata/converters/convert_43.py +44 -0
  175. pixeltable/metadata/converters/util.py +44 -18
  176. pixeltable/metadata/notes.py +21 -0
  177. pixeltable/metadata/schema.py +185 -42
  178. pixeltable/metadata/utils.py +74 -0
  179. pixeltable/mypy/__init__.py +3 -0
  180. pixeltable/mypy/mypy_plugin.py +123 -0
  181. pixeltable/plan.py +616 -225
  182. pixeltable/share/__init__.py +3 -0
  183. pixeltable/share/packager.py +797 -0
  184. pixeltable/share/protocol/__init__.py +33 -0
  185. pixeltable/share/protocol/common.py +165 -0
  186. pixeltable/share/protocol/operation_types.py +33 -0
  187. pixeltable/share/protocol/replica.py +119 -0
  188. pixeltable/share/publish.py +349 -0
  189. pixeltable/store.py +398 -232
  190. pixeltable/type_system.py +730 -267
  191. pixeltable/utils/__init__.py +40 -0
  192. pixeltable/utils/arrow.py +201 -29
  193. pixeltable/utils/av.py +298 -0
  194. pixeltable/utils/azure_store.py +346 -0
  195. pixeltable/utils/coco.py +26 -27
  196. pixeltable/utils/code.py +4 -4
  197. pixeltable/utils/console_output.py +46 -0
  198. pixeltable/utils/coroutine.py +24 -0
  199. pixeltable/utils/dbms.py +92 -0
  200. pixeltable/utils/description_helper.py +11 -12
  201. pixeltable/utils/documents.py +60 -61
  202. pixeltable/utils/exception_handler.py +36 -0
  203. pixeltable/utils/filecache.py +38 -22
  204. pixeltable/utils/formatter.py +88 -51
  205. pixeltable/utils/gcs_store.py +295 -0
  206. pixeltable/utils/http.py +133 -0
  207. pixeltable/utils/http_server.py +14 -13
  208. pixeltable/utils/iceberg.py +13 -0
  209. pixeltable/utils/image.py +17 -0
  210. pixeltable/utils/lancedb.py +90 -0
  211. pixeltable/utils/local_store.py +322 -0
  212. pixeltable/utils/misc.py +5 -0
  213. pixeltable/utils/object_stores.py +573 -0
  214. pixeltable/utils/pydantic.py +60 -0
  215. pixeltable/utils/pytorch.py +20 -20
  216. pixeltable/utils/s3_store.py +527 -0
  217. pixeltable/utils/sql.py +32 -5
  218. pixeltable/utils/system.py +30 -0
  219. pixeltable/utils/transactional_directory.py +4 -3
  220. pixeltable-0.5.7.dist-info/METADATA +579 -0
  221. pixeltable-0.5.7.dist-info/RECORD +227 -0
  222. {pixeltable-0.2.26.dist-info → pixeltable-0.5.7.dist-info}/WHEEL +1 -1
  223. pixeltable-0.5.7.dist-info/entry_points.txt +2 -0
  224. pixeltable/__version__.py +0 -3
  225. pixeltable/catalog/named_function.py +0 -36
  226. pixeltable/catalog/path_dict.py +0 -141
  227. pixeltable/dataframe.py +0 -894
  228. pixeltable/exec/expr_eval_node.py +0 -232
  229. pixeltable/ext/__init__.py +0 -14
  230. pixeltable/ext/functions/__init__.py +0 -8
  231. pixeltable/ext/functions/whisperx.py +0 -77
  232. pixeltable/ext/functions/yolox.py +0 -157
  233. pixeltable/tool/create_test_db_dump.py +0 -311
  234. pixeltable/tool/create_test_video.py +0 -81
  235. pixeltable/tool/doc_plugins/griffe.py +0 -50
  236. pixeltable/tool/doc_plugins/mkdocstrings.py +0 -6
  237. pixeltable/tool/doc_plugins/templates/material/udf.html.jinja +0 -135
  238. pixeltable/tool/embed_udf.py +0 -9
  239. pixeltable/tool/mypy_plugin.py +0 -55
  240. pixeltable/utils/media_store.py +0 -76
  241. pixeltable/utils/s3.py +0 -16
  242. pixeltable-0.2.26.dist-info/METADATA +0 -400
  243. pixeltable-0.2.26.dist-info/RECORD +0 -156
  244. pixeltable-0.2.26.dist-info/entry_points.txt +0 -3
  245. {pixeltable-0.2.26.dist-info → pixeltable-0.5.7.dist-info/licenses}/LICENSE +0 -0
pixeltable/type_system.py CHANGED
@@ -9,17 +9,25 @@ import types
  import typing
  import urllib.parse
  import urllib.request
+ import uuid
  from pathlib import Path
- from typing import Any, Iterable, Mapping, Optional, Sequence, Union
+ from typing import Any, ClassVar, Iterable, Literal, Mapping, Sequence, Union

- import PIL.Image
- import av # type: ignore
+ from typing import _GenericAlias # type: ignore[attr-defined] # isort: skip
+
+ import av
+ import jsonschema
+ import jsonschema.protocols
+ import jsonschema.validators
  import numpy as np
+ import PIL.Image
+ import pydantic
  import sqlalchemy as sql
- from typing import _GenericAlias # type: ignore[attr-defined]
  from typing_extensions import _AnnotatedAlias

  import pixeltable.exceptions as excs
+ from pixeltable.env import Env
+ from pixeltable.utils import parse_local_file_path


  class ColumnType:
@@ -36,16 +44,21 @@ class ColumnType:
  VIDEO = 8
  AUDIO = 9
  DOCUMENT = 10
+ DATE = 11
+ UUID = 12
+ BINARY = 13

  # exprs that don't evaluate to a computable value in Pixeltable, such as an Image member function
  INVALID = 255

  @classmethod
  def supertype(
- cls, type1: 'ColumnType.Type', type2: 'ColumnType.Type',
- # we need to pass this in because we can't easily append it as a class member
- common_supertypes: dict[tuple['ColumnType.Type', 'ColumnType.Type'], 'ColumnType.Type']
- ) -> Optional['ColumnType.Type']:
+ cls,
+ type1: 'ColumnType.Type' | None,
+ type2: 'ColumnType.Type' | None,
+ # we need to pass this in because we can't easily append it as a class member
+ common_supertypes: dict[tuple['ColumnType.Type', 'ColumnType.Type'], 'ColumnType.Type'],
+ ) -> 'ColumnType.Type' | None:
  if type1 == type2:
  return type1
  t = common_supertypes.get((type1, type2))
@@ -56,28 +69,10 @@ class ColumnType:
  return t
  return None

-
- @enum.unique
- class DType(enum.Enum):
- """
- Base type used in images and arrays
- """
- BOOL = 0,
- INT8 = 1,
- INT16 = 2,
- INT32 = 3,
- INT64 = 4,
- UINT8 = 5,
- UINT16 = 6,
- UINT32 = 7,
- UINT64 = 8,
- FLOAT16 = 9,
- FLOAT32 = 10,
- FLOAT64 = 11
-
- scalar_types = {Type.STRING, Type.INT, Type.FLOAT, Type.BOOL, Type.TIMESTAMP}
- numeric_types = {Type.INT, Type.FLOAT}
- common_supertypes: dict[tuple[Type, Type], Type] = {
+ scalar_json_types: ClassVar[set[Type]] = {Type.STRING, Type.INT, Type.FLOAT, Type.BOOL}
+ scalar_types: ClassVar[set[Type]] = scalar_json_types | {Type.TIMESTAMP, Type.DATE, Type.UUID}
+ numeric_types: ClassVar[set[Type]] = {Type.INT, Type.FLOAT}
+ common_supertypes: ClassVar[dict[tuple[Type, Type], Type]] = {
  (Type.BOOL, Type.INT): Type.INT,
  (Type.BOOL, Type.FLOAT): Type.FLOAT,
  (Type.INT, Type.FLOAT): Type.FLOAT,
@@ -87,6 +82,9 @@ class ColumnType:
  self._type = t
  self._nullable = nullable

+ def has_supertype(self) -> bool:
+ return True
+
  @property
  def nullable(self) -> bool:
  return self._nullable
@@ -110,10 +108,7 @@ class ColumnType:
  return json.dumps([t.as_dict() for t in type_list])

  def as_dict(self) -> dict:
- return {
- '_classname': self.__class__.__name__,
- **self._as_dict(),
- }
+ return {'_classname': self.__class__.__name__, **self._as_dict()}

  def _as_dict(self) -> dict:
  return {'nullable': self.nullable}
@@ -144,27 +139,37 @@ class ColumnType:

  @classmethod
  def make_type(cls, t: Type) -> ColumnType:
- assert t != cls.Type.INVALID and t != cls.Type.ARRAY
- if t == cls.Type.STRING:
- return StringType()
- if t == cls.Type.INT:
- return IntType()
- if t == cls.Type.FLOAT:
- return FloatType()
- if t == cls.Type.BOOL:
- return BoolType()
- if t == cls.Type.TIMESTAMP:
- return TimestampType()
- if t == cls.Type.JSON:
- return JsonType()
- if t == cls.Type.IMAGE:
- return ImageType()
- if t == cls.Type.VIDEO:
- return VideoType()
- if t == cls.Type.AUDIO:
- return AudioType()
- if t == cls.Type.DOCUMENT:
- return DocumentType()
+ match t:
+ case cls.Type.STRING:
+ return StringType()
+ case cls.Type.INT:
+ return IntType()
+ case cls.Type.FLOAT:
+ return FloatType()
+ case cls.Type.BOOL:
+ return BoolType()
+ case cls.Type.TIMESTAMP:
+ return TimestampType()
+ case cls.Type.JSON:
+ return JsonType()
+ case cls.Type.ARRAY:
+ return ArrayType()
+ case cls.Type.IMAGE:
+ return ImageType()
+ case cls.Type.VIDEO:
+ return VideoType()
+ case cls.Type.AUDIO:
+ return AudioType()
+ case cls.Type.DOCUMENT:
+ return DocumentType()
+ case cls.Type.DATE:
+ return DateType()
+ case cls.Type.UUID:
+ return UUIDType()
+ case cls.Type.BINARY:
+ return BinaryType()
+ case _:
+ raise AssertionError(t)

  def __repr__(self) -> str:
  return self._to_str(as_schema=False)
@@ -174,7 +179,7 @@ class ColumnType:
  if as_schema:
  return base_str if self.nullable else f'Required[{base_str}]'
  else:
- return f'Optional[{base_str}]' if self.nullable else base_str
+ return f'{base_str} | None' if self.nullable else base_str

  def _to_base_str(self) -> str:
  """
@@ -203,27 +208,41 @@ class ColumnType:
  # Default: just compare base types (this works for all types whose only parameter is nullable)
  return self._type == other._type

- def supertype(self, other: ColumnType) -> Optional[ColumnType]:
+ def supertype(self, other: ColumnType, for_inference: bool = False) -> ColumnType | None:
+ """
+ Returns the most specific type that is a supertype of both `self` and `other`.
+
+ If `for_inference=True`, then we disallow certain type relationships that are technically correct, but may
+ be confusing for schema inference during data imports.
+ """
  if self == other:
  return self
  if self.matches(other):
  return self.copy(nullable=(self.nullable or other.nullable))

  if self.is_invalid_type():
- return other
+ return other.copy(nullable=(self.nullable or other.nullable))
  if other.is_invalid_type():
- return self
+ return self.copy(nullable=(self.nullable or other.nullable))

  if self.is_scalar_type() and other.is_scalar_type():
  t = self.Type.supertype(self._type, other._type, self.common_supertypes)
  if t is not None:
  return self.make_type(t).copy(nullable=(self.nullable or other.nullable))
- return None
+
+ # If we see a mix of JSON and/or JSON-compatible scalar types, resolve to JSON.
+ # (For JSON+JSON, we return None to allow JsonType to handle merging the type schemas.)
+ if not for_inference and (
+ (self.is_json_type() and other.is_scalar_json_type())
+ or (self.is_scalar_json_type() and other.is_json_type())
+ or (self.is_scalar_json_type() and other.is_scalar_json_type())
+ ):
+ return JsonType(nullable=(self.nullable or other.nullable))

  return None

  @classmethod
- def infer_literal_type(cls, val: Any, nullable: bool = False) -> Optional[ColumnType]:
+ def infer_literal_type(cls, val: Any, nullable: bool = False) -> ColumnType | None:
  if val is None:
  return InvalidType(nullable=True)
  if isinstance(val, str):
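A quick sketch of what the reworked supertype resolution in the hunk above implies; the results in the comments are inferred from the code shown, not captured from a run:

    from pixeltable.type_system import FloatType, IntType, JsonType, StringType

    IntType().supertype(FloatType())                        # numeric widening via common_supertypes -> FloatType
    StringType().supertype(IntType())                       # mixed JSON-compatible scalars -> JsonType
    StringType().supertype(IntType(), for_inference=True)   # disallowed during schema inference -> None
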
@@ -235,16 +254,24 @@ class ColumnType:
  return IntType(nullable=nullable)
  if isinstance(val, float):
  return FloatType(nullable=nullable)
+ # When checking types of dates / timestamps, be aware that a datetime is also a date,
+ # but a date is not a datetime. So check for datetime first.
  if isinstance(val, datetime.datetime):
  return TimestampType(nullable=nullable)
+ if isinstance(val, datetime.date):
+ return DateType(nullable=nullable)
+ if isinstance(val, uuid.UUID):
+ return UUIDType(nullable=nullable)
+ if isinstance(val, bytes):
+ return BinaryType(nullable=nullable)
  if isinstance(val, PIL.Image.Image):
  return ImageType(width=val.width, height=val.height, mode=val.mode, nullable=nullable)
  if isinstance(val, np.ndarray):
  col_type = ArrayType.from_literal(val, nullable=nullable)
  if col_type is not None:
  return col_type
- # this could still be json-serializable
- if isinstance(val, dict) or isinstance(val, list) or isinstance(val, np.ndarray):
+ # this could still be json-serializable
+ if isinstance(val, (list, tuple, dict, np.ndarray, pydantic.BaseModel)):
  try:
  JsonType().validate_literal(val)
  return JsonType(nullable=nullable)
@@ -253,7 +280,7 @@ class ColumnType:
  return None

  @classmethod
- def infer_common_literal_type(cls, vals: Iterable[Any]) -> Optional[ColumnType]:
+ def infer_common_literal_type(cls, vals: Iterable[Any]) -> ColumnType | None:
  """
  Returns the most specific type that is a supertype of all literals in `vals`. If no such type
  exists, returns None.
@@ -261,24 +288,27 @@ class ColumnType:
  Args:
  vals: A collection of literals.
  """
- inferred_type: Optional[ColumnType] = None
+ inferred_type: ColumnType | None = None
  for val in vals:
  val_type = cls.infer_literal_type(val)
  if inferred_type is None:
  inferred_type = val_type
  else:
- inferred_type = inferred_type.supertype(val_type)
- if inferred_type is None:
- return None
+ inferred_type = inferred_type.supertype(val_type, for_inference=True)
+ if inferred_type is None:
+ return None
+ if not inferred_type.has_supertype():
+ return inferred_type
  return inferred_type

  @classmethod
  def from_python_type(
  cls,
- t: Union[type, _GenericAlias],
+ t: type | _GenericAlias,
  nullable_default: bool = False,
- allow_builtin_types: bool = True
- ) -> Optional[ColumnType]:
+ allow_builtin_types: bool = True,
+ infer_pydantic_json: bool = False,
+ ) -> ColumnType | None:
  """
  Convert a Python type into a Pixeltable `ColumnType` instance.

@@ -290,34 +320,34 @@ class ColumnType:
  allowed (as in UDF definitions). If False, then only Pixeltable types such as `pxt.String`,
  `pxt.Int`, etc., will be allowed (as in schema definitions). `Optional` and `Required`
  designations will be allowed regardless.
+ infer_pydantic_json: If True, accepts an extended set of built-ins (eg, Enum, Path) and returns the type to
+ which pydantic.BaseModel.model_dump(mode='json') serializes it.
  """
  origin = typing.get_origin(t)
- if origin is typing.Union:
- # Check if `t` has the form Optional[T].
- union_args = typing.get_args(t)
- if len(union_args) == 2 and type(None) in union_args:
- # `t` is a type of the form Optional[T] (equivalently, Union[T, None] or Union[None, T]).
+ type_args = typing.get_args(t)
+ if origin in (typing.Union, types.UnionType):
+ # Check if `t` has the form T | None.
+ if len(type_args) == 2 and type(None) in type_args:
+ # `t` is a type of the form T | None (equivalently, T | None or None | T).
  # We treat it as the underlying type but with nullable=True.
- underlying_py_type = union_args[0] if union_args[1] is type(None) else union_args[1]
- underlying = cls.from_python_type(underlying_py_type, allow_builtin_types=allow_builtin_types)
+ underlying_py_type = type_args[0] if type_args[1] is type(None) else type_args[1]
+ underlying = cls.from_python_type(
+ underlying_py_type, allow_builtin_types=allow_builtin_types, infer_pydantic_json=infer_pydantic_json
+ )
  if underlying is not None:
  return underlying.copy(nullable=True)
  elif origin is Required:
- required_args = typing.get_args(t)
- assert len(required_args) == 1
+ assert len(type_args) == 1
  return cls.from_python_type(
- required_args[0],
- nullable_default=False,
- allow_builtin_types=allow_builtin_types
- )
+ type_args[0], nullable_default=False, allow_builtin_types=allow_builtin_types
+ ).copy(nullable=False)
  elif origin is typing.Annotated:
- annotated_args = typing.get_args(t)
- origin = annotated_args[0]
- parameters = annotated_args[1]
+ origin = type_args[0]
+ parameters = type_args[1]
  if isinstance(parameters, ColumnType):
  return parameters.copy(nullable=nullable_default)
  else:
- # It's something other than Optional[T], Required[T], or an explicitly annotated type.
+ # It's something other than T | None, Required[T], or an explicitly annotated type.
  if origin is not None:
  # Discard type parameters to ensure that parameterized types such as `list[T]`
  # are correctly mapped to Pixeltable types.
@@ -325,6 +355,18 @@ class ColumnType:
  if isinstance(t, type) and issubclass(t, _PxtType):
  return t.as_col_type(nullable=nullable_default)
  elif allow_builtin_types:
+ if t is Literal and len(type_args) > 0:
+ literal_type = cls.infer_common_literal_type(type_args)
+ if literal_type is None:
+ return None
+ return literal_type.copy(nullable=(literal_type.nullable or nullable_default))
+ if infer_pydantic_json and isinstance(t, type) and issubclass(t, enum.Enum):
+ literal_type = cls.infer_common_literal_type(member.value for member in t)
+ if literal_type is None:
+ return None
+ return literal_type.copy(nullable=(literal_type.nullable or nullable_default))
+ if infer_pydantic_json and t is Path:
+ return StringType(nullable=nullable_default)
  if t is str:
  return StringType(nullable=nullable_default)
  if t is int:
@@ -335,18 +377,21 @@ class ColumnType:
  return BoolType(nullable=nullable_default)
  if t is datetime.datetime:
  return TimestampType(nullable=nullable_default)
+ if t is datetime.date:
+ return DateType(nullable=nullable_default)
+ if t is uuid.UUID:
+ return UUIDType(nullable=nullable_default)
+ if t is bytes:
+ return BinaryType(nullable=nullable_default)
  if t is PIL.Image.Image:
  return ImageType(nullable=nullable_default)
- if issubclass(t, Sequence) or issubclass(t, Mapping):
+ if isinstance(t, type) and issubclass(t, (Sequence, Mapping, pydantic.BaseModel)):
  return JsonType(nullable=nullable_default)
  return None

  @classmethod
  def normalize_type(
- cls,
- t: Union[ColumnType, type, _AnnotatedAlias],
- nullable_default: bool = False,
- allow_builtin_types: bool = True
+ cls, t: ColumnType | type | _AnnotatedAlias, nullable_default: bool = False, allow_builtin_types: bool = True
  ) -> ColumnType:
  """
  Convert any type recognizable by Pixeltable to its corresponding ColumnType.
@@ -358,25 +403,58 @@ class ColumnType:
  cls.__raise_exc_for_invalid_type(t)
  return col_type

- __TYPE_SUGGESTIONS: list[tuple[type, str]] = [
+ __TYPE_SUGGESTIONS: ClassVar[list[tuple[type, str]]] = [
  (str, 'pxt.String'),
  (bool, 'pxt.Bool'),
  (int, 'pxt.Int'),
  (float, 'pxt.Float'),
  (datetime.datetime, 'pxt.Timestamp'),
+ (datetime.date, 'pxt.Date'),
+ (uuid.UUID, 'pxt.UUID'),
  (PIL.Image.Image, 'pxt.Image'),
+ (bytes, 'pxt.Binary'),
  (Sequence, 'pxt.Json'),
  (Mapping, 'pxt.Json'),
  ]

  @classmethod
- def __raise_exc_for_invalid_type(cls, t: Union[type, _AnnotatedAlias]) -> None:
+ def __raise_exc_for_invalid_type(cls, t: type | _AnnotatedAlias) -> None:
  for builtin_type, suggestion in cls.__TYPE_SUGGESTIONS:
  if t is builtin_type or (isinstance(t, type) and issubclass(t, builtin_type)):
  name = t.__name__ if t.__module__ == 'builtins' else f'{t.__module__}.{t.__name__}'
  raise excs.Error(f'Standard Python type `{name}` cannot be used here; use `{suggestion}` instead')
  raise excs.Error(f'Unknown type: {t}')

+ @classmethod
+ def from_json_schema(cls, schema: dict[str, Any]) -> ColumnType | None:
+ # We first express the JSON schema as a Python type, and then convert it to a Pixeltable type.
+ # TODO: Is there a meaningful fallback if one of these operations fails? (Maybe another use case for a pxt Any
+ # type?)
+ py_type = cls.__json_schema_to_py_type(schema)
+ return cls.from_python_type(py_type) if py_type is not None else None
+
+ @classmethod
+ def __json_schema_to_py_type(cls, schema: dict[str, Any]) -> type | _GenericAlias | None:
+ if 'type' in schema:
+ if schema['type'] == 'null':
+ return type(None)
+ if schema['type'] == 'string':
+ return str
+ if schema['type'] == 'integer':
+ return int
+ if schema['type'] == 'number':
+ return float
+ if schema['type'] == 'boolean':
+ return bool
+ if schema['type'] in ('array', 'object'):
+ return list
+ elif 'anyOf' in schema:
+ subscripts = tuple(cls.__json_schema_to_py_type(subschema) for subschema in schema['anyOf'])
+ if all(subscript is not None for subscript in subscripts):
+ return Union[subscripts]
+
+ return None
+
  def validate_literal(self, val: Any) -> None:
  """Raise TypeError if val is not a valid literal for this type"""
  if val is None:
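The new from_json_schema path above converts a JSON schema to a Python type and then to a ColumnType. A rough sketch of what that mapping implies, with illustrative input schemas and results inferred from the code in this hunk:

    from pixeltable.type_system import ColumnType

    ColumnType.from_json_schema({'type': 'integer'})                                # -> IntType
    ColumnType.from_json_schema({'anyOf': [{'type': 'string'}, {'type': 'null'}]})  # -> nullable StringType
    ColumnType.from_json_schema({'type': 'object'})                                 # 'object'/'array' map to list -> JsonType
    ColumnType.from_json_schema({'not': {'type': 'string'}})                        # unsupported construct -> None
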
@@ -396,15 +474,11 @@ class ColumnType:
  def _validate_file_path(self, val: Any) -> None:
  """Raises TypeError if not a valid local file path or not a path/byte sequence"""
  if isinstance(val, str):
- parsed = urllib.parse.urlparse(val)
- if parsed.scheme != '' and parsed.scheme != 'file':
- return
- path = Path(urllib.parse.unquote(urllib.request.url2pathname(parsed.path)))
- if not path.is_file():
- raise TypeError(f'File not found: {str(path)}')
- else:
- if not isinstance(val, bytes):
- raise TypeError(f'expected file path or bytes, got {type(val)}')
+ path = parse_local_file_path(val)
+ if path is not None and not path.is_file():
+ raise TypeError(f'File not found: {path}')
+ elif not isinstance(val, bytes):
+ raise TypeError(f'expected file path or bytes, got {type(val)}')

  @abc.abstractmethod
  def _validate_literal(self, val: Any) -> None:
@@ -412,7 +486,7 @@ class ColumnType:

  def _create_literal(self, val: Any) -> Any:
  """Create a literal of this type from val, including any needed conversions.
- val is guaranteed to be non-None"""
+ val is guaranteed to be non-None"""
  return val

  def create_literal(self, val: Any) -> Any:
@@ -429,6 +503,9 @@ class ColumnType:
  def is_scalar_type(self) -> bool:
  return self._type in self.scalar_types

+ def is_scalar_json_type(self) -> bool:
+ return self._type in self.scalar_json_types
+
  def is_numeric_type(self) -> bool:
  return self._type in self.numeric_types

@@ -450,12 +527,21 @@ class ColumnType:
  def is_timestamp_type(self) -> bool:
  return self._type == self.Type.TIMESTAMP

+ def is_date_type(self) -> bool:
+ return self._type == self.Type.DATE
+
+ def is_uuid_type(self) -> bool:
+ return self._type == self.Type.UUID
+
  def is_json_type(self) -> bool:
  return self._type == self.Type.JSON

  def is_array_type(self) -> bool:
  return self._type == self.Type.ARRAY

+ def is_binary_type(self) -> bool:
+ return self._type == self.Type.BINARY
+
  def is_image_type(self) -> bool:
  return self._type == self.Type.IMAGE

@@ -472,35 +558,85 @@ class ColumnType:
  # types that refer to external media files
  return self.is_image_type() or self.is_video_type() or self.is_audio_type() or self.is_document_type()

+ def supports_file_offloading(self) -> bool:
+ # types that can be offloaded to file-based storage via a CellMaterializationNode
+ return self.is_array_type() or self.is_json_type() or self.is_binary_type()
+
+ @classmethod
  @abc.abstractmethod
- def to_sa_type(self) -> sql.types.TypeEngine:
+ def to_sa_type(cls) -> sql.types.TypeEngine:
  """
  Return corresponding SQLAlchemy type.
  """
- pass
+
+ def to_json_schema(self) -> dict[str, Any]:
+ if self.nullable:
+ return {'anyOf': [self._to_json_schema(), {'type': 'null'}]}
+ else:
+ return self._to_json_schema()
+
+ def _to_json_schema(self) -> dict[str, Any]:
+ raise excs.Error(f'Pixeltable type {self} is not a valid JSON type')
+
+ @classmethod
+ def from_np_dtype(cls, dtype: np.dtype, nullable: bool) -> ColumnType | None:
+ """
+ Return pixeltable type corresponding to a given simple numpy dtype
+ """
+ if np.issubdtype(dtype, np.integer):
+ return IntType(nullable=nullable)
+
+ if np.issubdtype(dtype, np.floating):
+ return FloatType(nullable=nullable)
+
+ if dtype == np.bool_:
+ return BoolType(nullable=nullable)
+
+ if np.issubdtype(dtype, np.str_):
+ return StringType(nullable=nullable)
+
+ if np.issubdtype(dtype, np.character):
+ return StringType(nullable=nullable)
+
+ if np.issubdtype(dtype, np.datetime64):
+ unit, _ = np.datetime_data(dtype)
+ if unit in ('D', 'M', 'Y'):
+ return DateType(nullable=nullable)
+ else:
+ return TimestampType(nullable=nullable)
+
+ return None


  class InvalidType(ColumnType):
  def __init__(self, nullable: bool = False):
  super().__init__(self.Type.INVALID, nullable=nullable)

- def to_sa_type(self) -> sql.types.TypeEngine:
- assert False
+ @classmethod
+ def to_sa_type(cls) -> sql.types.TypeEngine:
+ return sql.types.NullType()

  def print_value(self, val: Any) -> str:
  return str(val)

  def _validate_literal(self, val: Any) -> None:
- assert False
+ raise AssertionError()


  class StringType(ColumnType):
  def __init__(self, nullable: bool = False):
  super().__init__(self.Type.STRING, nullable=nullable)

- def to_sa_type(self) -> sql.types.TypeEngine:
+ def has_supertype(self) -> bool:
+ return not self.nullable
+
+ @classmethod
+ def to_sa_type(cls) -> sql.types.TypeEngine:
  return sql.String()

+ def _to_json_schema(self) -> dict[str, Any]:
+ return {'type': 'string'}
+
  def print_value(self, val: Any) -> str:
  return f"'{val}'"
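The from_np_dtype and to_json_schema helpers introduced above suggest mappings like the following (a sketch based only on the code in this hunk, not exhaustive):

    import numpy as np
    from pixeltable.type_system import ColumnType, StringType

    ColumnType.from_np_dtype(np.dtype(np.int32), nullable=False)          # integer dtypes -> IntType
    ColumnType.from_np_dtype(np.dtype('datetime64[D]'), nullable=False)   # day/month/year units -> DateType
    ColumnType.from_np_dtype(np.dtype('datetime64[ns]'), nullable=False)  # finer units -> TimestampType

    StringType(nullable=True).to_json_schema()
    # -> {'anyOf': [{'type': 'string'}, {'type': 'null'}]}
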
@@ -521,11 +657,17 @@ class IntType(ColumnType):
  def __init__(self, nullable: bool = False):
  super().__init__(self.Type.INT, nullable=nullable)

- def to_sa_type(self) -> sql.types.TypeEngine:
+ @classmethod
+ def to_sa_type(cls) -> sql.types.TypeEngine:
  return sql.BigInteger()

+ def _to_json_schema(self) -> dict[str, Any]:
+ return {'type': 'integer'}
+
  def _validate_literal(self, val: Any) -> None:
- if not isinstance(val, int):
+ # bool is a subclass of int, so we need to check for it
+ # explicitly first
+ if isinstance(val, bool) or not isinstance(val, int):
  raise TypeError(f'Expected int, got {val.__class__.__name__}')


@@ -533,9 +675,13 @@ class FloatType(ColumnType):
  def __init__(self, nullable: bool = False):
  super().__init__(self.Type.FLOAT, nullable=nullable)

- def to_sa_type(self) -> sql.types.TypeEngine:
+ @classmethod
+ def to_sa_type(cls) -> sql.types.TypeEngine:
  return sql.Float()

+ def _to_json_schema(self) -> dict[str, Any]:
+ return {'type': 'number'}
+
  def _validate_literal(self, val: Any) -> None:
  if not isinstance(val, float):
  raise TypeError(f'Expected float, got {val.__class__.__name__}')
@@ -550,9 +696,13 @@ class BoolType(ColumnType):
  def __init__(self, nullable: bool = False):
  super().__init__(self.Type.BOOL, nullable=nullable)

- def to_sa_type(self) -> sql.types.TypeEngine:
+ @classmethod
+ def to_sa_type(cls) -> sql.types.TypeEngine:
  return sql.Boolean()

+ def _to_json_schema(self) -> dict[str, Any]:
+ return {'type': 'boolean'}
+
  def _validate_literal(self, val: Any) -> None:
  if not isinstance(val, bool):
  raise TypeError(f'Expected bool, got {val.__class__.__name__}')
@@ -567,7 +717,11 @@ class TimestampType(ColumnType):
  def __init__(self, nullable: bool = False):
  super().__init__(self.Type.TIMESTAMP, nullable=nullable)

- def to_sa_type(self) -> sql.types.TypeEngine:
+ def has_supertype(self) -> bool:
+ return not self.nullable
+
+ @classmethod
+ def to_sa_type(cls) -> sql.types.TypeEngine:
  return sql.TIMESTAMP(timezone=True)

  def _validate_literal(self, val: Any) -> None:
@@ -577,65 +731,120 @@ class TimestampType(ColumnType):
  def _create_literal(self, val: Any) -> Any:
  if isinstance(val, str):
  return datetime.datetime.fromisoformat(val)
+ # Place naive timestamps in the default time zone
+ if isinstance(val, datetime.datetime) and val.tzinfo is None:
+ return val.replace(tzinfo=Env.get().default_time_zone)
+ return val
+
+
+ class DateType(ColumnType):
+ def __init__(self, nullable: bool = False):
+ super().__init__(self.Type.DATE, nullable=nullable)
+
+ def has_supertype(self) -> bool:
+ return not self.nullable
+
+ @classmethod
+ def to_sa_type(cls) -> sql.types.TypeEngine:
+ return sql.Date()
+
+ def _validate_literal(self, val: Any) -> None:
+ if not isinstance(val, datetime.date):
+ raise TypeError(f'Expected datetime.date, got {val.__class__.__name__}')
+
+ def _create_literal(self, val: Any) -> Any:
+ if isinstance(val, str):
+ return datetime.datetime.fromisoformat(val).date()
+ if isinstance(val, datetime.date):
+ return val
+ return val
+
+
+ class UUIDType(ColumnType):
+ def __init__(self, nullable: bool = False):
+ super().__init__(self.Type.UUID, nullable=nullable)
+
+ def has_supertype(self) -> bool:
+ return not self.nullable
+
+ @classmethod
+ def to_sa_type(cls) -> sql.types.TypeEngine:
+ return sql.UUID(as_uuid=True)
+
+ def _to_json_schema(self) -> dict[str, Any]:
+ return {'type': 'string', 'format': 'uuid'}
+
+ def print_value(self, val: Any) -> str:
+ return f"'{val}'"
+
+ def _to_base_str(self) -> str:
+ return 'UUID'
+
+ def _validate_literal(self, val: Any) -> None:
+ if not isinstance(val, uuid.UUID):
+ raise TypeError(f'Expected uuid.UUID, got {val.__class__.__name__}')
+
+ def _create_literal(self, val: Any) -> Any:
+ if isinstance(val, str):
+ return uuid.UUID(val)
  return val


+ class BinaryType(ColumnType):
+ def __init__(self, nullable: bool = False):
+ super().__init__(self.Type.BINARY, nullable=nullable)
+
+ @classmethod
+ def to_sa_type(cls) -> sql.types.TypeEngine:
+ return sql.LargeBinary()
+
+ def _to_base_str(self) -> str:
+ return 'Binary'
+
+ def _validate_literal(self, val: Any) -> None:
+ if not isinstance(val, bytes):
+ raise TypeError(f'Expected `bytes`, got `{val.__class__.__name__}`')
+
+
  class JsonType(ColumnType):
- # TODO: type_spec also needs to be able to express lists
- def __init__(self, type_spec: Optional[dict[str, ColumnType]] = None, nullable: bool = False):
+ json_schema: dict[str, Any] | None
+ __validator: jsonschema.protocols.Validator | None
+
+ def __init__(self, json_schema: dict[str, Any] | None = None, nullable: bool = False):
  super().__init__(self.Type.JSON, nullable=nullable)
- self.type_spec = type_spec
+ self.json_schema = json_schema
+ if json_schema is None:
+ self.__validator = None
+ else:
+ validator_cls = jsonschema.validators.validator_for(json_schema)
+ validator_cls.check_schema(json_schema)
+ self.__validator = validator_cls(json_schema)

  def copy(self, nullable: bool) -> ColumnType:
- return JsonType(self.type_spec, nullable=nullable)
+ return JsonType(json_schema=self.json_schema, nullable=nullable)

  def matches(self, other: ColumnType) -> bool:
- return isinstance(other, JsonType) and self.type_spec == other.type_spec
-
- def supertype(self, other: ColumnType) -> Optional[JsonType]:
- if not isinstance(other, JsonType):
- return None
- if self.type_spec is None:
- # we don't have a type spec and can accept anything accepted by other
- return JsonType(nullable=(self.nullable or other.nullable))
- if other.type_spec is None:
- # we have a type spec but other doesn't
- return JsonType(nullable=(self.nullable or other.nullable))
-
- # we both have type specs; the supertype's type spec is the union of the two
- type_spec: dict[str, ColumnType] = {}
- type_spec.update(self.type_spec)
- for other_field_name, other_field_type in other.type_spec.items():
- if other_field_name not in type_spec:
- type_spec[other_field_name] = other_field_type
- else:
- # both type specs have this field
- field_type = type_spec[other_field_name].supertype(other_field_type)
- if field_type is None:
- # conflicting types
- return JsonType(nullable=(self.nullable or other.nullable))
- type_spec[other_field_name] = field_type
- return JsonType(type_spec, nullable=(self.nullable or other.nullable))
+ return isinstance(other, JsonType) and self.json_schema == other.json_schema

  def _as_dict(self) -> dict:
  result = super()._as_dict()
- if self.type_spec is not None:
- type_spec_dict = {field_name: field_type.serialize() for field_name, field_type in self.type_spec.items()}
- result.update({'type_spec': type_spec_dict})
+ if self.json_schema is not None:
+ result.update({'json_schema': self.json_schema})
  return result

  @classmethod
  def _from_dict(cls, d: dict) -> ColumnType:
- type_spec = None
- if 'type_spec' in d:
- type_spec = {
- field_name: cls.deserialize(field_type_dict) for field_name, field_type_dict in d['type_spec'].items()
- }
- return cls(type_spec, nullable=d['nullable'])
-
- def to_sa_type(self) -> sql.types.TypeEngine:
+ return cls(json_schema=d.get('json_schema'), nullable=d['nullable'])
+
+ @classmethod
+ def to_sa_type(cls) -> sql.types.TypeEngine:
  return sql.dialects.postgresql.JSONB()

+ def _to_json_schema(self) -> dict[str, Any]:
+ if self.json_schema is None:
+ return {}
+ return self.json_schema
+
  def print_value(self, val: Any) -> str:
  val_type = self.infer_literal_type(val)
  if val_type is None:
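JsonType now carries an optional JSON schema and compiles a jsonschema validator for it in __init__; _validate_literal (shown in the next hunk) runs that validator after normalizing tuples and pydantic models. A small sketch of the intended usage, with a hypothetical schema; behavior is inferred from the code in this diff:

    from pixeltable.type_system import JsonType

    person = JsonType(json_schema={
        'type': 'object',
        'properties': {'name': {'type': 'string'}, 'age': {'type': 'integer'}},
        'required': ['name'],
    })

    person.validate_literal({'name': 'Alice', 'age': 3})  # passes the structural check and the schema validator
    person.validate_literal({'age': 'three'})             # fails jsonschema validation ('name' missing, 'age' not an integer)
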
@@ -645,38 +854,200 @@ class JsonType(ColumnType):
645
854
  return val_type.print_value(val)
646
855
 
647
856
  def _validate_literal(self, val: Any) -> None:
648
- if not isinstance(val, dict) and not isinstance(val, list):
649
- # TODO In the future we should accept scalars too, which would enable us to remove this top-level check
650
- raise TypeError(f'Expected dict or list, got {val.__class__.__name__}')
651
- if not self.__is_valid_literal(val):
857
+ if isinstance(val, tuple):
858
+ val = list(val)
859
+ if isinstance(val, pydantic.BaseModel):
860
+ val = val.model_dump()
861
+ if not self.__is_valid_json(val):
652
862
  raise TypeError(f'That literal is not a valid Pixeltable JSON object: {val}')
863
+ if self.__validator is not None:
864
+ self.__validator.validate(val)
653
865
 
654
866
  @classmethod
655
- def __is_valid_literal(cls, val: Any) -> bool:
656
- if val is None or isinstance(val, (str, int, float, bool)):
867
+ def __is_valid_json(cls, val: Any) -> bool:
868
+ if val is None or isinstance(val, (str, int, float, bool, np.ndarray, PIL.Image.Image, bytes)):
657
869
  return True
658
870
  if isinstance(val, (list, tuple)):
659
- return all(cls.__is_valid_literal(v) for v in val)
871
+ return all(cls.__is_valid_json(v) for v in val)
660
872
  if isinstance(val, dict):
661
- return all(isinstance(k, str) and cls.__is_valid_literal(v) for k, v in val.items())
873
+ return all(isinstance(k, str) and cls.__is_valid_json(v) for k, v in val.items())
662
874
  return False
663
875
 
664
876
  def _create_literal(self, val: Any) -> Any:
665
877
  if isinstance(val, tuple):
666
878
  val = list(val)
879
+ if isinstance(val, pydantic.BaseModel):
880
+ return val.model_dump()
667
881
  return val
668
882
 
883
+ def supertype(self, other: ColumnType, for_inference: bool = False) -> JsonType | None:
884
+ # Try using the (much faster) supertype logic in ColumnType first. That will work if, for example, the types
885
+ # are identical except for nullability. If that doesn't work and both types are JsonType, then we will need to
886
+ # merge their schemas.
887
+ basic_supertype = super().supertype(other)
888
+ if basic_supertype is not None:
889
+ assert isinstance(basic_supertype, JsonType)
890
+ return basic_supertype
891
+
892
+ if not isinstance(other, JsonType):
893
+ return None
894
+
895
+ if self.json_schema is None or other.json_schema is None:
896
+ return JsonType(nullable=(self.nullable or other.nullable))
897
+
898
+ superschema = self.__superschema(self.json_schema, other.json_schema)
899
+
900
+ return JsonType(
901
+ json_schema=(None if len(superschema) == 0 else superschema), nullable=(self.nullable or other.nullable)
902
+ )
903
+
904
+ @classmethod
905
+ def __superschema(cls, a: dict[str, Any], b: dict[str, Any]) -> dict[str, Any] | None:
906
+ # Defining a general type hierarchy over all JSON schemas would be a challenging problem. In order to keep
907
+ # things manageable, we only define a hierarchy among "conforming" schemas, which provides enough generality
908
+ # for the most important use cases (unions for type inference, validation of inline exprs). A schema is
909
+ # considered to be conforming if either:
910
+ # (i) it is a scalar (string, integer, number, boolean) or dictionary (object) type; or
911
+ # (ii) it is an "anyOf" schema of one of the above types and the exact schema {'type': 'null'}.
912
+ # Conforming schemas are organized into a type hierarchy in an internally consistent way. Nonconforming
913
+ # schemas are allowed, but they are isolates in the type hierarchy: a nonconforming schema has no proper
914
+ # subtypes, and its only proper supertype is an unconstrained JsonType().
915
+ #
916
+ # There is some subtlety in the handling of nullable fields. Nullable fields are represented in JSON
917
+ # schemas as (for example) {'anyOf': [{'type': 'string'}, {'type': 'null'}]}. When finding the supertype
918
+ # of schemas that might be nullable, we first unpack the 'anyOf's, find the supertype of the underlyings,
919
+ # then reapply the 'anyOf' if appropriate. The top-level schema (i.e., JsonType.json_schema) is presumed
920
+ # to NOT be in this form (since nullability is indicated by the `nullable` field of the JsonType object),
921
+ # so this subtlety is applicable only to types that occur in subfields.
922
+ #
923
+ # There is currently no special handling of lists; distinct schemas with type 'array' will union to the
924
+ # generic {'type': 'array'} schema. This could be a TODO item if there is a need for it in the future.
925
+
926
+ if a == b:
927
+ return a
928
+
929
+ if 'properties' in a and 'properties' in b:
930
+ a_props = a['properties']
931
+ b_props = b['properties']
932
+ a_req = a.get('required', [])
933
+ b_req = b.get('required', [])
934
+ super_props = {}
935
+ super_req = []
936
+ for key, a_prop_schema in a_props.items():
937
+ if key in b_props: # in both a and b
938
+ prop_schema = cls.__superschema_with_nulls(a_prop_schema, b_props[key])
939
+ super_props[key] = prop_schema
940
+ if key in a_req and key in b_req:
941
+ super_req.append(key)
942
+ else: # in a but not b
943
+ # Add it to the supertype schema as optional (regardless of its status in a)
944
+ super_props[key] = a_prop_schema
945
+ for key, b_prop_schema in b_props.items():
946
+ if key not in a_props: # in b but not a
947
+ super_props[key] = b_prop_schema
948
+ schema = {'type': 'object', 'properties': super_props}
949
+ if len(super_req) > 0:
950
+ schema['required'] = super_req
951
+ return schema
952
+
953
+ a_type = a.get('type')
954
+ b_type = b.get('type')
955
+
956
+ if a_type in ('string', 'integer', 'number', 'boolean', 'object', 'array') and a_type == b_type:
957
+ # a and b both have the same type designation, but are not identical. This can happen if
958
+ # (for example) they have validators or other attributes that differ. In this case, we
959
+ # generalize to {'type': t}, where t is their shared type, with no other qualifications.
960
+ return {'type': a_type}
961
+
962
+ return {} # Unresolvable type conflict; the supertype is an unrestricted JsonType.
963
+
964
+ @classmethod
965
+ def __superschema_with_nulls(cls, a: dict[str, Any], b: dict[str, Any]) -> dict[str, Any] | None:
966
+ a, a_nullable = cls.__unpack_null_from_schema(a)
967
+ b, b_nullable = cls.__unpack_null_from_schema(b)
968
+
969
+ result = cls.__superschema(a, b)
970
+ if len(result) > 0 and (a_nullable or b_nullable):
971
+ # if len(result) == 0, then null is implicitly accepted; otherwise, we need to explicitly allow it
972
+ return {'anyOf': [result, {'type': 'null'}]}
973
+ return result
974
+
975
+ @classmethod
976
+ def __unpack_null_from_schema(cls, s: dict[str, Any]) -> tuple[dict[str, Any], bool]:
977
+ if 'anyOf' in s and len(s['anyOf']) == 2 and {'type': 'null'} in s['anyOf']:
978
+ try:
979
+ return next(s for s in s['anyOf'] if s != {'type': 'null'}), True
980
+ except StopIteration:
981
+ pass
982
+ return s, False
983
+
984
+ def _to_base_str(self) -> str:
985
+ if self.json_schema is None:
986
+ return 'Json'
987
+ elif 'title' in self.json_schema:
988
+ return f'Json[{self.json_schema["title"]}]'
989
+ else:
990
+ return f'Json[{self.json_schema}]'
991
+
992
+
993
+ ARRAY_SUPPORTED_NUMPY_DTYPES = [
994
+ np.bool_,
995
+ np.uint8,
996
+ np.uint16,
997
+ np.uint32,
998
+ np.uint64,
999
+ np.int8,
1000
+ np.int16,
1001
+ np.int32,
1002
+ np.int64,
1003
+ np.float16,
1004
+ np.float32,
1005
+ np.float64,
1006
+ np.str_,
1007
+ ]
1008
+
669
1009
 
670
1010
  class ArrayType(ColumnType):
671
- def __init__(self, shape: tuple[Union[int, None], ...], dtype: ColumnType, nullable: bool = False):
1011
+ pxt_dtype_to_numpy_dtype: ClassVar[dict[ColumnType.Type, np.dtype]] = {
1012
+ ColumnType.Type.INT: np.dtype(np.int64),
1013
+ ColumnType.Type.FLOAT: np.dtype(np.float32),
1014
+ ColumnType.Type.BOOL: np.dtype(np.bool_),
1015
+ ColumnType.Type.STRING: np.dtype(np.str_),
1016
+ }
1017
+
1018
+ shape: tuple[int | None, ...] | None
1019
+ dtype: np.dtype | None
1020
+
1021
+ def __init__(
1022
+ self,
1023
+ shape: tuple[int | None, ...] | None = None,
1024
+ dtype: ColumnType | np.dtype | None = None,
1025
+ nullable: bool = False,
1026
+ ):
672
1027
  super().__init__(self.Type.ARRAY, nullable=nullable)
1028
+ assert shape is None or dtype is not None, (shape, dtype) # cannot specify a shape without a dtype
673
1029
  self.shape = shape
674
- assert dtype.is_int_type() or dtype.is_float_type() or dtype.is_bool_type() or dtype.is_string_type()
675
- self.pxt_dtype = dtype
676
- self.dtype = dtype._type
1030
+ if dtype is None:
1031
+ self.dtype = None
1032
+ elif isinstance(dtype, np.dtype):
1033
+ # Numpy string has some specifications (endianness, max length, encoding) that we don't support, so we just
1034
+ # strip them out.
1035
+ if dtype.type == np.str_:
1036
+ self.dtype = np.dtype(np.str_)
1037
+ else:
1038
+ if dtype not in ARRAY_SUPPORTED_NUMPY_DTYPES:
1039
+ raise ValueError(f'Unsupported dtype: {dtype}')
1040
+ self.dtype = dtype
1041
+ elif isinstance(dtype, ColumnType):
1042
+ self.dtype = self.pxt_dtype_to_numpy_dtype.get(dtype._type, None)
1043
+ if self.dtype is None:
1044
+ raise ValueError(f'Unsupported dtype: {dtype}')
1045
+ assert self.dtype in ARRAY_SUPPORTED_NUMPY_DTYPES
1046
+ else:
1047
+ raise ValueError(f'Unsupported dtype: {dtype}')
677
1048
 
678
1049
  def copy(self, nullable: bool) -> ColumnType:
679
- return ArrayType(self.shape, self.pxt_dtype, nullable=nullable)
1050
+ return ArrayType(self.shape, self.dtype, nullable=nullable)
680
1051
 
681
1052
  def matches(self, other: ColumnType) -> bool:
682
1053
  return isinstance(other, ArrayType) and self.shape == other.shape and self.dtype == other.dtype
@@ -684,106 +1055,133 @@ class ArrayType(ColumnType):
     def __hash__(self) -> int:
         return hash((self._type, self.nullable, self.shape, self.dtype))
 
-    def supertype(self, other: ColumnType) -> Optional[ArrayType]:
+    def supertype(self, other: ColumnType, for_inference: bool = False) -> ArrayType | None:
+        basic_supertype = super().supertype(other)
+        if basic_supertype is not None:
+            assert isinstance(basic_supertype, ArrayType)
+            return basic_supertype
+
         if not isinstance(other, ArrayType):
             return None
-        if len(self.shape) != len(other.shape):
-            return None
-        base_type = self.Type.supertype(self.dtype, other.dtype, self.common_supertypes)
-        if base_type is None:
-            return None
-        shape = [n1 if n1 == n2 else None for n1, n2 in zip(self.shape, other.shape)]
-        return ArrayType(tuple(shape), self.make_type(base_type), nullable=(self.nullable or other.nullable))
+
+        # Supertype has dtype only if dtypes are identical. We can change this behavior to consider casting rules or
+        # something else if there's demand for it.
+        if self.dtype != other.dtype:
+            return ArrayType(nullable=(self.nullable or other.nullable))
+        super_dtype = self.dtype
+
+        # Determine the shape of the supertype
+        super_shape: tuple[int | None, ...] | None
+        if self.shape is None or other.shape is None or len(self.shape) != len(other.shape):
+            super_shape = None
+        else:
+            super_shape = tuple(n1 if n1 == n2 else None for n1, n2 in zip(self.shape, other.shape))
+        return ArrayType(super_shape, super_dtype, nullable=(self.nullable or other.nullable))
 
     def _as_dict(self) -> dict:
         result = super()._as_dict()
-        result.update(shape=list(self.shape), dtype=self.dtype.value)
+        shape_as_list = None if self.shape is None else list(self.shape)
+        result.update(shape=shape_as_list)
+
+        if self.dtype is None:
+            result.update(numpy_dtype=None)
+        elif self.dtype == np.str_:
+            # str(np.str_) would be something like '<U', but since we don't support the string specifications, just use
+            # 'str' instead to avoid confusion.
+            result.update(numpy_dtype='str')
+        else:
+            result.update(numpy_dtype=str(self.dtype))
         return result
 
     def _to_base_str(self) -> str:
-        return f'Array[{self.shape}, {self.pxt_dtype}]'
+        if self.shape is None and self.dtype is None:
+            return 'Array'
+        if self.shape is None:
+            return f'Array[{self.dtype.name}]'
+        assert self.dtype is not None
+        return f'Array[{self.shape}, {self.dtype.name}]'
 
     @classmethod
     def _from_dict(cls, d: dict) -> ColumnType:
+        assert 'numpy_dtype' in d
+        dtype = None if d['numpy_dtype'] is None else np.dtype(d['numpy_dtype'])
         assert 'shape' in d
-        assert 'dtype' in d
-        shape = tuple(d['shape'])
-        dtype = cls.make_type(cls.Type(d['dtype']))
+        shape = None if d['shape'] is None else tuple(d['shape'])
         return cls(shape, dtype, nullable=d['nullable'])
 
     @classmethod
-    def from_literal(cls, val: np.ndarray, nullable: bool = False) -> Optional[ArrayType]:
-        # determine our dtype
+    def from_literal(cls, val: np.ndarray, nullable: bool = False) -> ArrayType | None:
         assert isinstance(val, np.ndarray)
-        if np.issubdtype(val.dtype, np.integer):
-            dtype: ColumnType = IntType()
-        elif np.issubdtype(val.dtype, np.floating):
-            dtype = FloatType()
-        elif val.dtype == np.bool_:
-            dtype = BoolType()
-        elif val.dtype == np.str_:
-            dtype = StringType()
-        else:
+        if val.dtype.type not in ARRAY_SUPPORTED_NUMPY_DTYPES:
             return None
-        return cls(val.shape, dtype=dtype, nullable=nullable)
+        return cls(val.shape, dtype=val.dtype, nullable=nullable)
 
-    def is_valid_literal(self, val: np.ndarray) -> bool:
-        if not isinstance(val, np.ndarray):
-            return False
-        if len(val.shape) != len(self.shape):
-            return False
-        # check that the shapes are compatible
-        for n1, n2 in zip(val.shape, self.shape):
-            if n1 is None:
-                return False
-            if n2 is None:
-                # wildcard
-                continue
-            if n1 != n2:
-                return False
-        return val.dtype == self.numpy_dtype()
+    def _to_json_schema(self) -> dict[str, Any]:
+        schema: dict[str, Any] = {'type': 'array'}
+        if self.dtype == np.str_:
+            schema.update({'items': {'type': 'str'}})
+        elif self.dtype is not None:
+            schema.update({'items': {'type': str(self.dtype)}})
+        return schema
 
     def _validate_literal(self, val: Any) -> None:
         if not isinstance(val, np.ndarray):
             raise TypeError(f'Expected numpy.ndarray, got {val.__class__.__name__}')
-        if not self.is_valid_literal(val):
-            raise TypeError((
-                f'Expected ndarray({self.shape}, dtype={self.numpy_dtype()}), '
-                f'got ndarray({val.shape}, dtype={val.dtype})'))
+
+        # If column type has a dtype, check if it matches
+        if self.dtype == np.str_:
+            if val.dtype.type != np.str_:
+                raise TypeError(f'Expected numpy.ndarray of dtype {self.dtype}, got numpy.ndarray of dtype {val.dtype}')
+        elif self.dtype is not None and self.dtype != val.dtype:
+            raise TypeError(f'Expected numpy.ndarray of dtype {self.dtype}, got numpy.ndarray of dtype {val.dtype}')
+
+        # Check that the dtype is one of the supported types
+        if val.dtype.type != np.str_ and val.dtype not in ARRAY_SUPPORTED_NUMPY_DTYPES:
+            raise TypeError(f'Unsupported dtype {val.dtype}')
+
+        # If a shape is specified, check that there's a match
+        if self.shape is not None:
+            if len(val.shape) != len(self.shape):
+                raise TypeError(
+                    f'Expected numpy.ndarray({self.shape}, dtype={self.dtype}), '
+                    f'got numpy.ndarray({val.shape}, dtype={val.dtype})'
+                )
+            # check that the shapes are compatible
+            for n1, n2 in zip(val.shape, self.shape):
+                assert n1 is not None  # `val` must have a concrete shape
+                if n2 is None:
+                    continue  # wildcard
+                if n1 != n2:
+                    raise TypeError(
+                        f'Expected numpy.ndarray({self.shape}, dtype={self.dtype}), '
+                        f'got numpy.ndarray({val.shape}, dtype={val.dtype})'
+                    )
 
     def _create_literal(self, val: Any) -> Any:
-        if isinstance(val, (list,tuple)):
+        if isinstance(val, (list, tuple)):
             # map python float to whichever numpy float is
             # declared for this type, rather than assume float64
-            return np.array(val, dtype=self.numpy_dtype())
+            return np.array(val, dtype=self.dtype)
         return val
 
-    def to_sa_type(self) -> sql.types.TypeEngine:
+    @classmethod
+    def to_sa_type(cls) -> sql.types.TypeEngine:
         return sql.LargeBinary()
 
-    def numpy_dtype(self) -> np.dtype:
-        if self.dtype == self.Type.INT:
-            return np.dtype(np.int64)
-        if self.dtype == self.Type.FLOAT:
-            return np.dtype(np.float32)
-        if self.dtype == self.Type.BOOL:
-            return np.dtype(np.bool_)
-        if self.dtype == self.Type.STRING:
-            return np.dtype(np.str_)
-        assert False
-
 
 class ImageType(ColumnType):
     def __init__(
-        self, width: Optional[int] = None, height: Optional[int] = None, size: Optional[tuple[int, int]] = None,
-        mode: Optional[str] = None, nullable: bool = False
+        self,
+        width: int | None = None,
+        height: int | None = None,
+        size: tuple[int, int] | None = None,
+        mode: str | None = None,
+        nullable: bool = False,
     ):
-        """
-        TODO: does it make sense to specify only width or height?
-        """
+        # TODO: does it make sense to specify only width or height?
         super().__init__(self.Type.IMAGE, nullable=nullable)
-        assert not(width is not None and size is not None)
-        assert not(height is not None and size is not None)
+        assert not (width is not None and size is not None)
+        assert not (height is not None and size is not None)
         if size is not None:
             self.width = size[0]
             self.height = size[1]
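
Stepping back from the hunk above: the reworked ArrayType.supertype only keeps a dtype when both sides agree, and widens mismatched dimensions to wildcards. An illustrative reading of that logic (not part of the diff; it assumes the base-class supertype check falls through for two distinct parameterized array types):

import numpy as np
from pixeltable.type_system import ArrayType  # assumed import path

a = ArrayType((3, 4), np.dtype(np.float32))
b = ArrayType((3, 8), np.dtype(np.float32))
c = ArrayType((3, 4), np.dtype(np.int64))

# same dtype, one mismatched dimension -> that dimension becomes a wildcard (None):
# a.supertype(b) == ArrayType((3, None), np.dtype(np.float32))

# differing dtypes -> a fully generic Array (no shape, no dtype):
# a.supertype(c) == ArrayType()
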
@@ -818,16 +1216,22 @@ class ImageType(ColumnType):
     def __hash__(self) -> int:
         return hash((self._type, self.nullable, self.size, self.mode))
 
-    def supertype(self, other: ColumnType) -> Optional[ImageType]:
+    def supertype(self, other: ColumnType, for_inference: bool = False) -> ImageType | None:
+        basic_supertype = super().supertype(other)
+        if basic_supertype is not None:
+            assert isinstance(basic_supertype, ImageType)
+            return basic_supertype
+
         if not isinstance(other, ImageType):
             return None
+
         width = self.width if self.width == other.width else None
         height = self.height if self.height == other.height else None
         mode = self.mode if self.mode == other.mode else None
         return ImageType(width=width, height=height, mode=mode, nullable=(self.nullable or other.nullable))
 
     @property
-    def size(self) -> Optional[tuple[int, int]]:
+    def size(self) -> tuple[int, int] | None:
         if self.width is None or self.height is None:
             return None
         return (self.width, self.height)
@@ -844,7 +1248,8 @@ class ImageType(ColumnType):
         assert 'mode' in d
         return cls(width=d['width'], height=d['height'], mode=d['mode'], nullable=d['nullable'])
 
-    def to_sa_type(self) -> sql.types.TypeEngine:
+    @classmethod
+    def to_sa_type(cls) -> sql.types.TypeEngine:
         return sql.String()
 
     def _create_literal(self, val: Any) -> Any:
@@ -857,8 +1262,8 @@ class ImageType(ColumnType):
                 img.load()
                 return img
             except Exception as exc:
-                errormsg_val = val if len(val) < 50 else val[:50] + '...'
-                raise excs.Error(f'data URL could not be decoded into a valid image: {errormsg_val}') from exc
+                error_msg_val = val if len(val) < 50 else val[:50] + '...'
+                raise excs.Error(f'data URL could not be decoded into a valid image: {error_msg_val}') from exc
         return val
 
     def _validate_literal(self, val: Any) -> None:
@@ -878,7 +1283,8 @@ class VideoType(ColumnType):
     def __init__(self, nullable: bool = False):
         super().__init__(self.Type.VIDEO, nullable=nullable)
 
-    def to_sa_type(self) -> sql.types.TypeEngine:
+    @classmethod
+    def to_sa_type(cls) -> sql.types.TypeEngine:
         # stored as a file path
         return sql.String()
 
@@ -902,7 +1308,7 @@ class VideoType(ColumnType):
             if num_decoded < 2:
                 # this is most likely an image file
                 raise excs.Error(f'Not a valid video: {val}')
-        except av.AVError:
+        except av.FFmpegError:
             raise excs.Error(f'Not a valid video: {val}') from None
 
 
@@ -910,7 +1316,8 @@ class AudioType(ColumnType):
     def __init__(self, nullable: bool = False):
         super().__init__(self.Type.AUDIO, nullable=nullable)
 
-    def to_sa_type(self) -> sql.types.TypeEngine:
+    @classmethod
+    def to_sa_type(cls) -> sql.types.TypeEngine:
         # stored as a file path
         return sql.String()
 
@@ -929,7 +1336,7 @@ class AudioType(ColumnType):
             for packet in container.demux(audio_stream):
                 for _ in packet.decode():
                     pass
-        except av.AVError as e:
+        except av.FFmpegError as e:
             raise excs.Error(f'Not a valid audio file: {val}\n{e}') from None
 
 
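
Both media validators above now catch av.FFmpegError, the base error class exposed by current PyAV releases, instead of the removed av.AVError alias. A hedged compatibility sketch for code that still has to run against older PyAV (the alias and probe function below are illustrative, not part of the package):

import av

try:
    _AVError = av.FFmpegError  # PyAV releases where AVError has been removed
except AttributeError:
    _AVError = av.AVError  # older PyAV releases

def probe_media(path: str) -> bool:
    """Return True if at least one packet decodes cleanly."""
    try:
        with av.open(path) as container:
            for packet in container.demux():
                for _ in packet.decode():
                    return True
        return False
    except _AVError:
        return False
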
@@ -940,8 +1347,23 @@ class DocumentType(ColumnType):
         MD = 1
         PDF = 2
         XML = 3
+        TXT = 4
+
+        @classmethod
+        def from_extension(cls, ext: str) -> 'DocumentType.DocumentFormat' | None:
+            if ext in ('.htm', '.html'):
+                return cls.HTML
+            if ext == '.md':
+                return cls.MD
+            if ext == '.pdf':
+                return cls.PDF
+            if ext == '.xml':
+                return cls.XML
+            if ext == '.txt':
+                return cls.TXT
+            return None
 
-    def __init__(self, nullable: bool = False, doc_formats: Optional[str] = None):
+    def __init__(self, nullable: bool = False, doc_formats: str | None = None):
         super().__init__(self.Type.DOCUMENT, nullable=nullable)
         self.doc_formats = doc_formats
         if doc_formats is not None:
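
A quick illustration of the new DocumentFormat.from_extension helper added above (the expectations follow directly from the branches shown; the import path is assumed):

from pixeltable.type_system import DocumentType

fmt = DocumentType.DocumentFormat.from_extension('.md')
assert fmt is DocumentType.DocumentFormat.MD

# unrecognized extensions fall through to None
assert DocumentType.DocumentFormat.from_extension('.docx') is None
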
@@ -951,7 +1373,7 @@ class DocumentType(ColumnType):
                     raise ValueError(f'Invalid document type: {type_str}')
             self._doc_formats = [self.DocumentFormat[type_str.upper()] for type_str in type_strs]
         else:
-            self._doc_formats = [t for t in self.DocumentFormat]
+            self._doc_formats = list(self.DocumentFormat)
 
     def copy(self, nullable: bool) -> ColumnType:
         return DocumentType(doc_formats=self.doc_formats, nullable=nullable)
@@ -962,7 +1384,8 @@ class DocumentType(ColumnType):
     def __hash__(self) -> int:
         return hash((self._type, self.nullable, self._doc_formats))
 
-    def to_sa_type(self) -> sql.types.TypeEngine:
+    @classmethod
+    def to_sa_type(cls) -> sql.types.TypeEngine:
         # stored as a file path
         return sql.String()
 
@@ -972,9 +1395,8 @@ class DocumentType(ColumnType):
     def validate_media(self, val: Any) -> None:
         assert isinstance(val, str)
         from pixeltable.utils.documents import get_document_handle
-        dh = get_document_handle(val)
-        if dh is None:
-            raise excs.Error(f'Not a recognized document format: {val}')
+
+        _ = get_document_handle(val)
 
 
 T = typing.TypeVar('T')
@@ -985,6 +1407,7 @@ class Required(typing.Generic[T]):
     Marker class to indicate that a column is non-nullable in a schema definition. This has no meaning as a type hint,
     and is intended only for schema declarations.
     """
+
     pass
 
 
@@ -993,6 +1416,9 @@ Int = typing.Annotated[int, IntType(nullable=False)]
 Float = typing.Annotated[float, FloatType(nullable=False)]
 Bool = typing.Annotated[bool, BoolType(nullable=False)]
 Timestamp = typing.Annotated[datetime.datetime, TimestampType(nullable=False)]
+Date = typing.Annotated[datetime.date, DateType(nullable=False)]
+UUID = typing.Annotated[uuid.UUID, UUIDType(nullable=False)]
+Binary = typing.Annotated[bytes, BinaryType(nullable=False)]
 
 
 class _PxtType:
@@ -1007,7 +1433,8 @@ class _PxtType:
     `Image[(300, 300), 'RGB']`. The specialized forms resolve to `typing.Annotated` instances whose annotation is a
     `ColumnType`.
     """
-    def __init__(self):
+
+    def __init__(self) -> None:
         raise TypeError(f'Type `{type(self)}` cannot be instantiated.')
 
     @classmethod
@@ -1016,6 +1443,16 @@ class _PxtType:
 
 
 class Json(_PxtType):
+    def __class_getitem__(cls, item: Any) -> _AnnotatedAlias:
+        """
+        `item` (the type subscript) must be a `dict` representing a valid JSON Schema.
+        """
+        if not isinstance(item, dict):
+            raise TypeError('Json type parameter must be a dict')
+
+        # The JsonType initializer will validate the JSON Schema.
+        return typing.Annotated[Any, JsonType(json_schema=item, nullable=False)]
+
     @classmethod
     def as_col_type(cls, nullable: bool) -> ColumnType:
         return JsonType(nullable=nullable)
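
With the __class_getitem__ added above, a JSON Schema dict can be attached directly to a Json column declaration. A minimal sketch, assuming the usual pxt.create_table(path, schema) entry point and that Json is re-exported as pxt.Json (both assumptions about the public API, not part of this hunk):

import pixeltable as pxt

t = pxt.create_table(
    'events',
    {
        # resolves to typing.Annotated[Any, JsonType(json_schema=..., nullable=False)]
        'payload': pxt.Json[{
            'type': 'object',
            'properties': {'kind': {'type': 'string'}, 'count': {'type': 'integer'}},
            'required': ['kind'],
        }],
    },
)
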
@@ -1024,14 +1461,19 @@ class Json(_PxtType):
 class Array(np.ndarray, _PxtType):
     def __class_getitem__(cls, item: Any) -> _AnnotatedAlias:
         """
-        `item` (the type subscript) must be a tuple with exactly two elements (in any order):
-        - A tuple of `Optional[int]`s, specifying the shape of the array
-        - A type, specifying the dtype of the array
-        Example: Array[(3, None, 2), pxt.Float]
+        `item` (the type subscript) must be a tuple with at most two elements (in any order):
+        - An optional tuple of `int | None`s, specifying the shape of the array
+        - A type (`ColumnType | np.dtype`), specifying the dtype of the array
+        Examples:
+        * Array[(3, None, 2), pxt.Float]
+        * Array[(4, 4), np.uint8]
+        * Array[np.bool]
         """
         params = item if isinstance(item, tuple) else (item,)
-        shape: Optional[tuple] = None
-        dtype: Optional[ColumnType] = None
+        shape: tuple | None = None
+        dtype: ColumnType | np.dtype | None = None
+        if not any(isinstance(param, (type, _AnnotatedAlias)) for param in params):
+            raise TypeError('Array type parameter must include a dtype.')
         for param in params:
             if isinstance(param, tuple):
                 if not all(n is None or (isinstance(n, int) and n >= 1) for n in param):
@@ -1039,21 +1481,20 @@ class Array(np.ndarray, _PxtType):
                 if shape is not None:
                     raise TypeError(f'Duplicate Array type parameter: {param}')
                 shape = param
-            elif isinstance(param, type) or isinstance(param, _AnnotatedAlias):
+            elif isinstance(param, (type, _AnnotatedAlias)):
                 if dtype is not None:
                     raise TypeError(f'Duplicate Array type parameter: {param}')
-                dtype = ColumnType.normalize_type(param, allow_builtin_types=False)
+                if isinstance(param, type) and param in ARRAY_SUPPORTED_NUMPY_DTYPES:
+                    dtype = np.dtype(param)
+                else:
+                    dtype = ColumnType.normalize_type(param, allow_builtin_types=False)
             else:
                 raise TypeError(f'Invalid Array type parameter: {param}')
-        if shape is None:
-            raise TypeError('Array type is missing parameter: shape')
-        if dtype is None:
-            raise TypeError('Array type is missing parameter: dtype')
         return typing.Annotated[np.ndarray, ArrayType(shape=shape, dtype=dtype, nullable=False)]
 
     @classmethod
     def as_col_type(cls, nullable: bool) -> ColumnType:
-        raise TypeError('Array type cannot be used without specifying shape and dtype')
+        return ArrayType(nullable=nullable)
 
 
 class Image(PIL.Image.Image, _PxtType):
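
Per the Array subscript changes above, a bare numpy dtype is now accepted and a shape is no longer required. A small schema sketch of the accepted forms (column names are illustrative; pxt.Array and pxt.Float are assumed to be the package-level re-exports):

import numpy as np
import pixeltable as pxt

schema = {
    'embedding': pxt.Array[(768,), pxt.Float],  # fixed shape, pixeltable dtype
    'mask': pxt.Array[(None, None), np.uint8],  # wildcard dimensions, numpy dtype
    'raw': pxt.Array[np.uint8],                 # dtype only, any shape
}
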
@@ -1073,11 +1514,15 @@ class Image(PIL.Image.Image, _PxtType):
         else:
             # Not a tuple (single arg)
             params = (item,)
-        size: Optional[tuple] = None
-        mode: Optional[str] = None
+        size: tuple | None = None
+        mode: str | None = None
         for param in params:
             if isinstance(param, tuple):
-                if len(param) != 2 or not isinstance(param[0], (int, type(None))) or not isinstance(param[1], (int, type(None))):
+                if (
+                    len(param) != 2
+                    or not isinstance(param[0], (int, type(None)))
+                    or not isinstance(param[1], (int, type(None)))
+                ):
                     raise TypeError(f'Invalid Image type parameter: {param}')
                 if size is not None:
                     raise TypeError(f'Duplicate Image type parameter: {param}')
@@ -1113,3 +1558,21 @@ class Document(str, _PxtType):
     @classmethod
     def as_col_type(cls, nullable: bool) -> ColumnType:
         return DocumentType(nullable=nullable)
+
+
+ALL_PIXELTABLE_TYPES = (
+    String,
+    Bool,
+    Int,
+    Float,
+    Timestamp,
+    Json,
+    Array,
+    Image,
+    Video,
+    Audio,
+    Document,
+    Date,
+    UUID,
+    Binary,
+)
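
The Date, UUID, and Binary aliases added above, together with the ALL_PIXELTABLE_TYPES tuple, round out the set of schema type markers. A hedged sketch of how the new aliases could appear in a schema declaration, assuming they are re-exported at the package level like the existing ones:

import pixeltable as pxt

schema = {
    'id': pxt.Required[pxt.UUID],  # non-nullable uuid.UUID column
    'created_on': pxt.Date,        # datetime.date column
    'thumbnail': pxt.Binary,       # raw bytes column
}
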