pixeltable 0.2.28__py3-none-any.whl → 0.2.30__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pixeltable might be problematic.

Files changed (62)
  1. pixeltable/__init__.py +1 -1
  2. pixeltable/__version__.py +2 -2
  3. pixeltable/catalog/__init__.py +1 -1
  4. pixeltable/catalog/dir.py +6 -0
  5. pixeltable/catalog/globals.py +25 -0
  6. pixeltable/catalog/named_function.py +4 -0
  7. pixeltable/catalog/path_dict.py +37 -11
  8. pixeltable/catalog/schema_object.py +6 -0
  9. pixeltable/catalog/table.py +96 -19
  10. pixeltable/catalog/table_version.py +22 -8
  11. pixeltable/dataframe.py +201 -3
  12. pixeltable/env.py +9 -3
  13. pixeltable/exec/expr_eval_node.py +1 -1
  14. pixeltable/exec/sql_node.py +2 -2
  15. pixeltable/exprs/function_call.py +134 -29
  16. pixeltable/exprs/inline_expr.py +22 -2
  17. pixeltable/exprs/row_builder.py +1 -1
  18. pixeltable/exprs/similarity_expr.py +9 -2
  19. pixeltable/func/__init__.py +1 -0
  20. pixeltable/func/aggregate_function.py +151 -68
  21. pixeltable/func/callable_function.py +50 -16
  22. pixeltable/func/expr_template_function.py +62 -24
  23. pixeltable/func/function.py +191 -23
  24. pixeltable/func/function_registry.py +2 -1
  25. pixeltable/func/query_template_function.py +11 -6
  26. pixeltable/func/signature.py +64 -7
  27. pixeltable/func/tools.py +116 -0
  28. pixeltable/func/udf.py +57 -35
  29. pixeltable/functions/__init__.py +2 -2
  30. pixeltable/functions/anthropic.py +36 -2
  31. pixeltable/functions/globals.py +54 -34
  32. pixeltable/functions/json.py +3 -8
  33. pixeltable/functions/math.py +67 -0
  34. pixeltable/functions/ollama.py +4 -4
  35. pixeltable/functions/openai.py +31 -2
  36. pixeltable/functions/timestamp.py +1 -1
  37. pixeltable/functions/video.py +2 -8
  38. pixeltable/functions/vision.py +1 -1
  39. pixeltable/globals.py +347 -79
  40. pixeltable/index/embedding_index.py +44 -24
  41. pixeltable/metadata/__init__.py +1 -1
  42. pixeltable/metadata/converters/convert_16.py +2 -1
  43. pixeltable/metadata/converters/convert_17.py +2 -1
  44. pixeltable/metadata/converters/convert_23.py +35 -0
  45. pixeltable/metadata/converters/convert_24.py +47 -0
  46. pixeltable/metadata/converters/util.py +4 -2
  47. pixeltable/metadata/notes.py +2 -0
  48. pixeltable/metadata/schema.py +1 -0
  49. pixeltable/type_system.py +192 -48
  50. {pixeltable-0.2.28.dist-info → pixeltable-0.2.30.dist-info}/METADATA +4 -2
  51. {pixeltable-0.2.28.dist-info → pixeltable-0.2.30.dist-info}/RECORD +54 -57
  52. pixeltable-0.2.30.dist-info/entry_points.txt +3 -0
  53. pixeltable/tool/create_test_db_dump.py +0 -311
  54. pixeltable/tool/create_test_video.py +0 -81
  55. pixeltable/tool/doc_plugins/griffe.py +0 -50
  56. pixeltable/tool/doc_plugins/mkdocstrings.py +0 -6
  57. pixeltable/tool/doc_plugins/templates/material/udf.html.jinja +0 -135
  58. pixeltable/tool/embed_udf.py +0 -9
  59. pixeltable/tool/mypy_plugin.py +0 -55
  60. pixeltable-0.2.28.dist-info/entry_points.txt +0 -3
  61. {pixeltable-0.2.28.dist-info → pixeltable-0.2.30.dist-info}/LICENSE +0 -0
  62. {pixeltable-0.2.28.dist-info → pixeltable-0.2.30.dist-info}/WHEEL +0 -0
pixeltable/index/embedding_index.py CHANGED
@@ -37,6 +37,14 @@ class EmbeddingIndex(IndexBase):
         Metric.L2: 'vector_l2_ops'
     }
 
+    metric: Metric
+    value_expr: exprs.FunctionCall
+    string_embed: Optional[func.Function]
+    image_embed: Optional[func.Function]
+    string_embed_signature_idx: int
+    image_embed_signature_idx: int
+    index_col_type: pgvector.sqlalchemy.Vector
+
     def __init__(
             self, c: catalog.Column, metric: str, string_embed: Optional[func.Function] = None,
             image_embed: Optional[func.Function] = None):
@@ -49,18 +57,22 @@ class EmbeddingIndex(IndexBase):
             raise excs.Error(f"Text embedding function is required for column {c.name} (parameter 'string_embed')")
         if c.col_type.is_image_type() and image_embed is None:
             raise excs.Error(f"Image embedding function is required for column {c.name} (parameter 'image_embed')")
-        if string_embed is not None:
-            # verify signature
-            self._validate_embedding_fn(string_embed, 'string_embed', ts.ColumnType.Type.STRING)
-        if image_embed is not None:
-            # verify signature
-            self._validate_embedding_fn(image_embed, 'image_embed', ts.ColumnType.Type.IMAGE)
+
+        if string_embed is None:
+            self.string_embed = None
+        else:
+            # verify signature and convert to a monomorphic function
+            self.string_embed = self._validate_embedding_fn(string_embed, 'string_embed', ts.ColumnType.Type.STRING)
+
+        if image_embed is None:
+            self.image_embed = None
+        else:
+            # verify signature and convert to a monomorphic function
+            self.image_embed = self._validate_embedding_fn(image_embed, 'image_embed', ts.ColumnType.Type.IMAGE)
 
         self.metric = self.Metric[metric.upper()]
         self.value_expr = string_embed(exprs.ColumnRef(c)) if c.col_type.is_string_type() else image_embed(exprs.ColumnRef(c))
         assert isinstance(self.value_expr.col_type, ts.ArrayType)
-        self.string_embed = string_embed
-        self.image_embed = image_embed
         vector_size = self.value_expr.col_type.shape[0]
         assert vector_size is not None
         self.index_col_type = pgvector.sqlalchemy.Vector(vector_size)
@@ -91,10 +103,10 @@ class EmbeddingIndex(IndexBase):
         assert isinstance(item, (str, PIL.Image.Image))
         if isinstance(item, str):
             assert self.string_embed is not None
-            embedding = self.string_embed.exec(item)
+            embedding = self.string_embed.exec([item], {})
         if isinstance(item, PIL.Image.Image):
             assert self.image_embed is not None
-            embedding = self.image_embed.exec(item)
+            embedding = self.image_embed.exec([item], {})
 
         if self.metric == self.Metric.COSINE:
             return val_column.sa_col.cosine_distance(embedding) * -1 + 1
@@ -110,10 +122,10 @@ class EmbeddingIndex(IndexBase):
         embedding: Optional[np.ndarray] = None
         if isinstance(item, str):
             assert self.string_embed is not None
-            embedding = self.string_embed.exec(item)
+            embedding = self.string_embed.exec([item], {})
         if isinstance(item, PIL.Image.Image):
             assert self.image_embed is not None
-            embedding = self.image_embed.exec(item)
+            embedding = self.image_embed.exec([item], {})
         assert embedding is not None
 
         if self.metric == self.Metric.COSINE:
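
Note on the two hunks above: with multi-signature Functions, func.Function.exec is now called with the positional arguments as a list and the keyword arguments as a dict, rather than with native Python arguments. A toy stand-in for the new calling convention (not pixeltable code, just the shape of the call):

    # exec previously received the item directly: exec(item)
    # it now receives an args list and a kwargs dict: exec([item], {})
    def exec_(args: list, kwargs: dict) -> str:
        (item,) = args  # exactly one positional arg: the string or image to embed
        return f'embedding({item!r})'

    assert exec_(['a query'], {}) == "embedding('a query')"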
@@ -132,27 +144,33 @@ class EmbeddingIndex(IndexBase):
         return 'embedding'
 
     @classmethod
-    def _validate_embedding_fn(cls, embed_fn: func.Function, name: str, expected_type: ts.ColumnType.Type) -> None:
-        """Validate the signature"""
+    def _validate_embedding_fn(cls, embed_fn: func.Function, name: str, expected_type: ts.ColumnType.Type) -> func.Function:
+        """Validate that the Function has a matching signature, and return the corresponding monomorphic function."""
         assert isinstance(embed_fn, func.Function)
-        sig = embed_fn.signature
 
-        # The embedding function must be a 1-ary function of the correct type. But it's ok if the function signature
-        # has more than one parameter, as long as it has at most one *required* parameter.
-        if (len(sig.parameters) == 0
-                or len(sig.required_parameters) > 1
-                or sig.parameters_by_pos[0].col_type.type_enum != expected_type):
-            raise excs.Error(
-                f'{name} must take a single {expected_type.name.lower()} parameter, but has signature {sig}')
+        signature_idx: int = -1
+        for idx, sig in enumerate(embed_fn.signatures):
+            # The embedding function must be a 1-ary function of the correct type. But it's ok if the function signature
+            # has more than one parameter, as long as it has at most one *required* parameter.
+            if (len(sig.parameters) >= 1
+                    and len(sig.required_parameters) <= 1
+                    and sig.parameters_by_pos[0].col_type.type_enum == expected_type):
+                signature_idx = idx
+                break
+
+        if signature_idx == -1:
+            raise excs.Error(f'{name} must take a single {expected_type.name.lower()} parameter')
+
+        resolved_fn = embed_fn._resolved_fns[signature_idx]
 
         # validate return type
         param_name = sig.parameters_by_pos[0].name
         if expected_type == ts.ColumnType.Type.STRING:
-            return_type = embed_fn.call_return_type({param_name: 'dummy'})
+            return_type = resolved_fn.call_return_type([], {param_name: 'dummy'})
         else:
             assert expected_type == ts.ColumnType.Type.IMAGE
             img = PIL.Image.new('RGB', (512, 512))
-            return_type = embed_fn.call_return_type({param_name: img})
+            return_type = resolved_fn.call_return_type([], {param_name: img})
         assert return_type is not None
         if not isinstance(return_type, ts.ArrayType):
             raise excs.Error(f'{name} must return an array, but returns {return_type}')
@@ -161,6 +179,8 @@ class EmbeddingIndex(IndexBase):
         if len(shape) != 1 or shape[0] == None:
             raise excs.Error(f'{name} must return a 1D array of a specific length, but returns {return_type}')
 
+        return resolved_fn
+
     def as_dict(self) -> dict:
         return {
             'metric': self.metric.name.lower(),
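
For context, _validate_embedding_fn now scans all signatures of a (possibly polymorphic) Function and accepts any signature whose first parameter matches the indexed column type, that has at most one required parameter, and that returns a fixed-length 1D array. A hedged sketch of a qualifying embedding UDF and index creation, assuming the 0.2.x public API (the UDF is a deterministic toy, not a real model):

    import numpy as np
    import pixeltable as pxt

    @pxt.udf
    def toy_embed(text: str) -> pxt.Array[(8,), pxt.Float]:
        # 1-ary, returns a 1D array of known length -> passes _validate_embedding_fn
        rng = np.random.default_rng(abs(hash(text)) % (2**32))
        return rng.random(8, dtype=np.float32)

    t = pxt.create_table('docs', {'text': pxt.String})
    t.add_embedding_index('text', string_embed=toy_embed, metric='cosine')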
pixeltable/metadata/__init__.py CHANGED
@@ -10,7 +10,7 @@ import sqlalchemy.orm as orm
 from .schema import SystemInfo, SystemInfoMd
 
 # current version of the metadata; this is incremented whenever the metadata schema changes
-VERSION = 23
+VERSION = 25
 
 
 def create_system_info(engine: sql.engine.Engine) -> None:
pixeltable/metadata/converters/convert_16.py CHANGED
@@ -1,3 +1,4 @@
+from uuid import UUID
 import sqlalchemy as sql
 
 from pixeltable.metadata import register_converter
@@ -12,7 +13,7 @@ def _(engine: sql.engine.Engine) -> None:
     )
 
 
-def __update_table_md(table_md: dict) -> None:
+def __update_table_md(table_md: dict, table_id: UUID) -> None:
     # External stores are not migratable; just drop them
     del table_md['remotes']
     table_md['external_stores'] = {}
pixeltable/metadata/converters/convert_17.py CHANGED
@@ -1,3 +1,4 @@
+from uuid import UUID
 import sqlalchemy as sql
 
 from pixeltable.metadata import register_converter
@@ -12,7 +13,7 @@ def _(engine: sql.engine.Engine) -> None:
     )
 
 
-def __update_table_md(table_md: dict) -> None:
+def __update_table_md(table_md: dict, table_id: UUID) -> None:
     # key changes in IndexMd.init_args: img_embed -> image_embed, txt_embed -> string_embed
     if len(table_md['index_md']) == 0:
         return
pixeltable/metadata/converters/convert_23.py ADDED
@@ -0,0 +1,35 @@
+import logging
+from typing import Any, Optional
+from uuid import UUID
+import sqlalchemy as sql
+
+from pixeltable.metadata import register_converter
+from pixeltable.metadata.converters.util import convert_table_md
+from pixeltable.metadata.schema import Table
+
+_logger = logging.getLogger('pixeltable')
+
+@register_converter(version=23)
+def _(engine: sql.engine.Engine) -> None:
+    convert_table_md(
+        engine,
+        table_md_updater=__update_table_md
+    )
+
+def __update_table_md(table_md: dict, table_id: UUID) -> None:
+    """update the index metadata to add indexed_col_tbl_id column if it is missing
+
+    Args:
+        table_md (dict): copy of the original table metadata. this gets updated in place.
+        table_id (UUID): the table id
+
+    """
+    if len(table_md['index_md']) == 0:
+        return
+    for idx_md in table_md['index_md'].values():
+        if 'indexed_col_tbl_id' not in idx_md:
+            # index metadata is missing indexed_col_tbl_id
+            # assume that the indexed column is in the same table
+            # and update the index metadata.
+            _logger.info(f'Updating index metadata for table: {table_id} index: {idx_md["id"]}')
+            idx_md['indexed_col_tbl_id'] = str(table_id)
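
The updater's effect is easy to check in isolation. A toy run of the same logic on a minimal TableMd-like dict (structure simplified to the fields the converter touches):

    from uuid import uuid4

    table_id = uuid4()
    table_md = {'index_md': {'0': {'id': 0, 'name': 'idx0'}}}  # pre-v24 md: no indexed_col_tbl_id

    for idx_md in table_md['index_md'].values():
        if 'indexed_col_tbl_id' not in idx_md:
            idx_md['indexed_col_tbl_id'] = str(table_id)  # assume the indexed column is in the same table

    assert table_md['index_md']['0']['indexed_col_tbl_id'] == str(table_id)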
pixeltable/metadata/converters/convert_24.py ADDED
@@ -0,0 +1,47 @@
+import importlib
+from typing import Any, Optional
+
+import sqlalchemy as sql
+
+from pixeltable.metadata import register_converter
+from pixeltable.metadata.converters.util import convert_table_md
+
+
+@register_converter(version=24)
+def _(engine: sql.engine.Engine) -> None:
+    convert_table_md(engine, substitution_fn=__substitute_md)
+
+
+def __substitute_md(k: Optional[str], v: Any) -> Optional[tuple[Optional[str], Any]]:
+    from pixeltable import func
+    from pixeltable.func.globals import resolve_symbol
+
+    if (isinstance(v, dict) and
+            '_classpath' in v and
+            v['_classpath'] in ['pixeltable.func.callable_function.CallableFunction',
+                                'pixeltable.func.aggregate_function.AggregateFunction',
+                                'pixeltable.func.expr_template_function.ExprTemplateFunction']):
+        if 'path' in v:
+            assert 'signature' not in v
+            f = resolve_symbol(v['path'])
+            assert isinstance(f, func.Function)
+            v['signature'] = f.signatures[0].as_dict()
+        return k, v
+
+    if isinstance(v, dict) and '_classname' in v and v['_classname'] == 'FunctionCall':
+        # Correct an older serialization mechanism where Expr elements of FunctionCall args and
+        # kwargs were indicated with idx == -1 rather than None. This was fixed for InlineList
+        # and InlineDict back in convert_20, but not for FunctionCall.
+        assert 'args' in v and isinstance(v['args'], list)
+        assert 'kwargs' in v and isinstance(v['kwargs'], dict)
+        v['args'] = [
+            (None, arg) if idx == -1 else (idx, arg)
+            for idx, arg in v['args']
+        ]
+        v['kwargs'] = {
+            k: (None, arg) if idx == -1 else (idx, arg)
+            for k, (idx, arg) in v['kwargs'].items()
+        }
+        return k, v
+
+    return None
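
The FunctionCall fix is mechanical: (idx, arg) pairs that used the old -1 sentinel are rewritten to use None, mirroring what convert_20 did for InlineList/InlineDict. A toy illustration with invented values:

    args = [(-1, 'a literal'), (2, None)]  # hypothetical serialized FunctionCall args
    kwargs = {'limit': (-1, 10)}           # hypothetical serialized kwargs

    args = [(None, arg) if idx == -1 else (idx, arg) for idx, arg in args]
    kwargs = {k: (None, arg) if idx == -1 else (idx, arg) for k, (idx, arg) in kwargs.items()}

    assert args == [(None, 'a literal'), (2, None)]
    assert kwargs == {'limit': (None, 10)}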
pixeltable/metadata/converters/util.py CHANGED
@@ -1,6 +1,7 @@
 import copy
 import logging
 from typing import Any, Callable, Optional
+from uuid import UUID
 
 import sqlalchemy as sql
 
@@ -11,7 +12,7 @@ __logger = logging.getLogger('pixeltable')
 
 def convert_table_md(
     engine: sql.engine.Engine,
-    table_md_updater: Optional[Callable[[dict], None]] = None,
+    table_md_updater: Optional[Callable[[dict, UUID], None]] = None,
     column_md_updater: Optional[Callable[[dict], None]] = None,
     external_store_md_updater: Optional[Callable[[dict], None]] = None,
     substitution_fn: Optional[Callable[[Optional[str], Any], Optional[tuple[Optional[str], Any]]]] = None
@@ -22,6 +23,7 @@ def convert_table_md(
     Args:
         engine: The SQLAlchemy engine.
         table_md_updater: A function that updates schema.TableMd dicts in place.
+            It takes two arguments: the metadata dict (new values) and the table id.
        column_md_updater: A function that updates schema.ColumnMd dicts in place.
        external_store_md_updater: A function that updates the external store metadata in place.
        substitution_fn: A function that substitutes metadata values. If specified, all metadata will be traversed
@@ -37,7 +39,7 @@ def convert_table_md(
         assert isinstance(table_md, dict)
         updated_table_md = copy.deepcopy(table_md)
         if table_md_updater is not None:
-            table_md_updater(updated_table_md)
+            table_md_updater(updated_table_md, id)
         if column_md_updater is not None:
             __update_column_md(updated_table_md, column_md_updater)
         if external_store_md_updater is not None:
pixeltable/metadata/notes.py CHANGED
@@ -2,6 +2,8 @@
 # rather than as a comment, so that the existence of a description can be enforced by
 # the unit tests when new versions are added.
 VERSION_NOTES = {
+    25: 'Functions with multiple signatures',
+    24: 'Added TableMd/IndexMd.indexed_col_tbl_id',
     23: 'DataFrame.from_clause',
     22: 'TableMd/ColumnMd.media_validation',
     21: 'Separate InlineArray and InlineList',
pixeltable/metadata/schema.py CHANGED
@@ -112,6 +112,7 @@ class IndexMd:
     """
     id: int
    name: str
+    indexed_col_tbl_id: str  # UUID of the table (as string) that contains the column being indexed
     indexed_col_id: int  # column being indexed
     index_val_col_id: int  # column holding the values to be indexed
     index_val_undo_col_id: int  # column holding index values for deleted rows
pixeltable/type_system.py CHANGED
@@ -5,7 +5,6 @@ import datetime
 import enum
 import io
 import json
-import types
 import typing
 import urllib.parse
 import urllib.request
@@ -14,7 +13,11 @@ from typing import Any, Iterable, Mapping, Optional, Sequence, Union
 
 import PIL.Image
 import av  # type: ignore
+import jsonschema
+import jsonschema.protocols
+import jsonschema.validators
 import numpy as np
+import pydantic
 import sqlalchemy as sql
 from typing import _GenericAlias  # type: ignore[attr-defined]
 from typing_extensions import _AnnotatedAlias
@@ -244,7 +247,7 @@ class ColumnType:
         if col_type is not None:
             return col_type
         # this could still be json-serializable
-        if isinstance(val, dict) or isinstance(val, list) or isinstance(val, np.ndarray):
+        if isinstance(val, dict) or isinstance(val, list) or isinstance(val, np.ndarray) or isinstance(val, pydantic.BaseModel):
             try:
                 JsonType().validate_literal(val)
                 return JsonType(nullable=nullable)
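
This hunk, the from_python_type change below, and the _create_literal change further down together extend literal handling to pydantic models: a pydantic.BaseModel instance is now inferred as a JSON value and stored via model_dump(). A minimal sketch:

    import pydantic

    class Point(pydantic.BaseModel):
        x: int
        y: int

    # what JsonType._create_literal now stores for a BaseModel instance
    assert Point(x=1, y=2).model_dump() == {'x': 1, 'y': 2}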
@@ -337,7 +340,7 @@ class ColumnType:
             return TimestampType(nullable=nullable_default)
         if t is PIL.Image.Image:
             return ImageType(nullable=nullable_default)
-        if issubclass(t, Sequence) or issubclass(t, Mapping):
+        if issubclass(t, Sequence) or issubclass(t, Mapping) or issubclass(t, pydantic.BaseModel):
             return JsonType(nullable=nullable_default)
         return None
 
@@ -479,6 +482,20 @@ class ColumnType:
         """
         pass
 
+    def to_json_schema(self) -> dict[str, Any]:
+        if self.nullable:
+            return {
+                'anyOf': [
+                    self._to_json_schema(),
+                    {'type': 'null'},
+                ]
+            }
+        else:
+            return self._to_json_schema()
+
+    def _to_json_schema(self) -> dict[str, Any]:
+        raise excs.Error(f'Pixeltable type {self} is not a valid JSON type')
+
 
 class InvalidType(ColumnType):
     def __init__(self, nullable: bool = False):
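
Expected behavior of the nullable wrapper, assuming the per-type _to_json_schema implementations added in the hunks below:

    from pixeltable.type_system import IntType

    print(IntType(nullable=False).to_json_schema())  # {'type': 'integer'}
    print(IntType(nullable=True).to_json_schema())   # {'anyOf': [{'type': 'integer'}, {'type': 'null'}]}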
@@ -501,6 +518,9 @@ class StringType(ColumnType):
     def to_sa_type(self) -> sql.types.TypeEngine:
         return sql.String()
 
+    def _to_json_schema(self) -> dict[str, Any]:
+        return {'type': 'string'}
+
     def print_value(self, val: Any) -> str:
         return f"'{val}'"
 
@@ -524,8 +544,13 @@ class IntType(ColumnType):
     def to_sa_type(self) -> sql.types.TypeEngine:
         return sql.BigInteger()
 
+    def _to_json_schema(self) -> dict[str, Any]:
+        return {'type': 'integer'}
+
     def _validate_literal(self, val: Any) -> None:
-        if not isinstance(val, int):
+        # bool is a subclass of int, so we need to check for it
+        # explicitly first
+        if isinstance(val, bool) or not isinstance(val, int):
             raise TypeError(f'Expected int, got {val.__class__.__name__}')
 
 
@@ -536,6 +561,9 @@ class FloatType(ColumnType):
     def to_sa_type(self) -> sql.types.TypeEngine:
         return sql.Float()
 
+    def _to_json_schema(self) -> dict[str, Any]:
+        return {'type': 'number'}
+
     def _validate_literal(self, val: Any) -> None:
         if not isinstance(val, float):
             raise TypeError(f'Expected float, got {val.__class__.__name__}')
@@ -553,6 +581,9 @@ class BoolType(ColumnType):
     def to_sa_type(self) -> sql.types.TypeEngine:
         return sql.Boolean()
 
+    def _to_json_schema(self) -> dict[str, Any]:
+        return {'type': 'boolean'}
+
     def _validate_literal(self, val: Any) -> None:
         if not isinstance(val, bool):
             raise TypeError(f'Expected bool, got {val.__class__.__name__}')
@@ -581,61 +612,44 @@ class TimestampType(ColumnType):
 
 
 class JsonType(ColumnType):
-    # TODO: type_spec also needs to be able to express lists
-    def __init__(self, type_spec: Optional[dict[str, ColumnType]] = None, nullable: bool = False):
+
+    json_schema: Optional[dict[str, Any]]
+    __validator: Optional[jsonschema.protocols.Validator]
+
+    def __init__(self, json_schema: Optional[dict[str, Any]] = None, nullable: bool = False):
         super().__init__(self.Type.JSON, nullable=nullable)
-        self.type_spec = type_spec
+        self.json_schema = json_schema
+        if json_schema is None:
+            self.__validator = None
+        else:
+            validator_cls = jsonschema.validators.validator_for(json_schema)
+            validator_cls.check_schema(json_schema)
+            self.__validator = validator_cls(json_schema)
 
     def copy(self, nullable: bool) -> ColumnType:
-        return JsonType(self.type_spec, nullable=nullable)
+        return JsonType(json_schema=self.json_schema, nullable=nullable)
 
     def matches(self, other: ColumnType) -> bool:
-        return isinstance(other, JsonType) and self.type_spec == other.type_spec
-
-    def supertype(self, other: ColumnType) -> Optional[JsonType]:
-        if not isinstance(other, JsonType):
-            return None
-        if self.type_spec is None:
-            # we don't have a type spec and can accept anything accepted by other
-            return JsonType(nullable=(self.nullable or other.nullable))
-        if other.type_spec is None:
-            # we have a type spec but other doesn't
-            return JsonType(nullable=(self.nullable or other.nullable))
-
-        # we both have type specs; the supertype's type spec is the union of the two
-        type_spec: dict[str, ColumnType] = {}
-        type_spec.update(self.type_spec)
-        for other_field_name, other_field_type in other.type_spec.items():
-            if other_field_name not in type_spec:
-                type_spec[other_field_name] = other_field_type
-            else:
-                # both type specs have this field
-                field_type = type_spec[other_field_name].supertype(other_field_type)
-                if field_type is None:
-                    # conflicting types
-                    return JsonType(nullable=(self.nullable or other.nullable))
-                type_spec[other_field_name] = field_type
-        return JsonType(type_spec, nullable=(self.nullable or other.nullable))
+        return isinstance(other, JsonType) and self.json_schema == other.json_schema
 
     def _as_dict(self) -> dict:
         result = super()._as_dict()
-        if self.type_spec is not None:
-            type_spec_dict = {field_name: field_type.serialize() for field_name, field_type in self.type_spec.items()}
-            result.update({'type_spec': type_spec_dict})
+        if self.json_schema is not None:
+            result.update({'json_schema': self.json_schema})
         return result
 
     @classmethod
     def _from_dict(cls, d: dict) -> ColumnType:
-        type_spec = None
-        if 'type_spec' in d:
-            type_spec = {
-                field_name: cls.deserialize(field_type_dict) for field_name, field_type_dict in d['type_spec'].items()
-            }
-        return cls(type_spec, nullable=d['nullable'])
+        return cls(json_schema=d.get('json_schema'), nullable=d['nullable'])
 
     def to_sa_type(self) -> sql.types.TypeEngine:
         return sql.dialects.postgresql.JSONB()
 
+    def _to_json_schema(self) -> dict[str, Any]:
+        if self.json_schema is None:
+            return {}
+        return self.json_schema
+
     def print_value(self, val: Any) -> str:
         val_type = self.infer_literal_type(val)
         if val_type is None:
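
A sketch of the new schema-aware validation, using only the JsonType API in this diff: structurally valid JSON that violates the attached schema now raises from the jsonschema validator built in __init__:

    import jsonschema
    from pixeltable.type_system import JsonType

    schema = {'type': 'object', 'properties': {'name': {'type': 'string'}}, 'required': ['name']}
    jt = JsonType(json_schema=schema)

    jt.validate_literal({'name': 'alice'})  # ok: valid JSON and schema-conformant
    try:
        jt.validate_literal({'name': 42})   # valid JSON, but violates the schema
    except jsonschema.ValidationError as e:
        print(e.message)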
@@ -645,27 +659,141 @@ class JsonType(ColumnType):
         return val_type.print_value(val)
 
     def _validate_literal(self, val: Any) -> None:
-        if not isinstance(val, dict) and not isinstance(val, list):
+        if not isinstance(val, (dict, list)):
             # TODO In the future we should accept scalars too, which would enable us to remove this top-level check
             raise TypeError(f'Expected dict or list, got {val.__class__.__name__}')
-        if not self.__is_valid_literal(val):
+        if not self.__is_valid_json(val):
             raise TypeError(f'That literal is not a valid Pixeltable JSON object: {val}')
+        if self.__validator is not None:
+            self.__validator.validate(val)
 
     @classmethod
-    def __is_valid_literal(cls, val: Any) -> bool:
+    def __is_valid_json(cls, val: Any) -> bool:
         if val is None or isinstance(val, (str, int, float, bool)):
             return True
         if isinstance(val, (list, tuple)):
-            return all(cls.__is_valid_literal(v) for v in val)
+            return all(cls.__is_valid_json(v) for v in val)
         if isinstance(val, dict):
-            return all(isinstance(k, str) and cls.__is_valid_literal(v) for k, v in val.items())
+            return all(isinstance(k, str) and cls.__is_valid_json(v) for k, v in val.items())
         return False
 
     def _create_literal(self, val: Any) -> Any:
         if isinstance(val, tuple):
             val = list(val)
+        if isinstance(val, pydantic.BaseModel):
+            return val.model_dump()
         return val
 
+    def supertype(self, other: ColumnType) -> Optional[JsonType]:
+        # Try using the (much faster) supertype logic in ColumnType first. That will work if, for example, the types
+        # are identical except for nullability. If that doesn't work and both types are JsonType, then we will need to
+        # merge their schemas.
+        basic_supertype = super().supertype(other)
+        if basic_supertype is not None:
+            assert isinstance(basic_supertype, JsonType)
+            return basic_supertype
+
+        if not isinstance(other, JsonType):
+            return None
+
+        if self.json_schema is None or other.json_schema is None:
+            return JsonType(nullable=(self.nullable or other.nullable))
+
+        superschema = self.__superschema(self.json_schema, other.json_schema)
+
+        return JsonType(
+            json_schema=(None if len(superschema) == 0 else superschema),
+            nullable=(self.nullable or other.nullable)
+        )
+
+    @classmethod
+    def __superschema(cls, a: dict[str, Any], b: dict[str, Any]) -> Optional[dict[str, Any]]:
+        # Defining a general type hierarchy over all JSON schemas would be a challenging problem. In order to keep
+        # things manageable, we only define a hierarchy among "conforming" schemas, which provides enough generality
+        # for the most important use cases (unions for type inference, validation of inline exprs). A schema is
+        # considered to be conforming if either:
+        # (i) it is a scalar (string, integer, number, boolean) or dictionary (object) type; or
+        # (ii) it is an "anyOf" schema of one of the above types and the exact schema {'type': 'null'}.
+        # Conforming schemas are organized into a type hierarchy in an internally consistent way. Nonconforming
+        # schemas are allowed, but they are isolates in the type hierarchy: a nonconforming schema has no proper
+        # subtypes, and its only proper supertype is an unconstrained JsonType().
+        #
+        # There is some subtlety in the handling of nullable fields. Nullable fields are represented in JSON
+        # schemas as (for example) {'anyOf': [{'type': 'string'}, {'type': 'null'}]}. When finding the supertype
+        # of schemas that might be nullable, we first unpack the 'anyOf's, find the supertype of the underlyings,
+        # then reapply the 'anyOf' if appropriate. The top-level schema (i.e., JsonType.json_schema) is presumed
+        # to NOT be in this form (since nullability is indicated by the `nullable` field of the JsonType object),
+        # so this subtlety is applicable only to types that occur in subfields.
+        #
+        # There is currently no special handling of lists; distinct schemas with type 'array' will union to the
+        # generic {'type': 'array'} schema. This could be a TODO item if there is a need for it in the future.
+
+        if a == b:
+            return a
+
+        if 'properties' in a and 'properties' in b:
+            a_props = a['properties']
+            b_props = b['properties']
+            a_req = a.get('required', [])
+            b_req = b.get('required', [])
+            super_props = {}
+            super_req = []
+            for key, a_prop_schema in a_props.items():
+                if key in b_props:  # in both a and b
+                    prop_schema = cls.__superschema_with_nulls(a_prop_schema, b_props[key])
+                    super_props[key] = prop_schema
+                    if key in a_req and key in b_req:
+                        super_req.append(key)
+                else:  # in a but not b
+                    # Add it to the supertype schema as optional (regardless of its status in a)
+                    super_props[key] = a_prop_schema
+            for key, b_prop_schema in b_props.items():
+                if key not in a_props:  # in b but not a
+                    super_props[key] = b_prop_schema
+            schema = {'type': 'object', 'properties': super_props}
+            if len(super_req) > 0:
+                schema['required'] = super_req
+            return schema
+
+        a_type = a.get('type')
+        b_type = b.get('type')
+
+        if (a_type in ('string', 'integer', 'number', 'boolean', 'object', 'array') and a_type == b_type):
+            # a and b both have the same type designation, but are not identical. This can happen if
+            # (for example) they have validators or other attributes that differ. In this case, we
+            # generalize to {'type': t}, where t is their shared type, with no other qualifications.
+            return {'type': a_type}
+
+        return {}  # Unresolvable type conflict; the supertype is an unrestricted JsonType.
+
+    @classmethod
+    def __superschema_with_nulls(cls, a: dict[str, Any], b: dict[str, Any]) -> Optional[dict[str, Any]]:
+        a, a_nullable = cls.__unpack_null_from_schema(a)
+        b, b_nullable = cls.__unpack_null_from_schema(b)
+
+        result = cls.__superschema(a, b)
+        if len(result) > 0 and (a_nullable or b_nullable):
+            # if len(result) == 0, then null is implicitly accepted; otherwise, we need to explicitly allow it
+            return {'anyOf': [result, {'type': 'null'}]}
+        return result
+
+    @classmethod
+    def __unpack_null_from_schema(cls, s: dict[str, Any]) -> tuple[dict[str, Any], bool]:
+        if 'anyOf' in s and len(s['anyOf']) == 2 and {'type': 'null'} in s['anyOf']:
+            try:
+                return next(s for s in s['anyOf'] if s != {'type': 'null'}), True
+            except StopIteration:
+                pass
+        return s, False
+
+    def _to_base_str(self) -> str:
+        if self.json_schema is None:
+            return 'Json'
+        elif 'title' in self.json_schema:
+            return f'Json[{self.json_schema["title"]}]'
+        else:
+            return f'Json[{self.json_schema}]'
+
 
 class ArrayType(ColumnType):
     def __init__(self, shape: tuple[Union[int, None], ...], dtype: ColumnType, nullable: bool = False):
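
To make the merge rules concrete, here is a sketch of the supertype of two object schemas under the logic above (expected output per __superschema: shared required keys stay required, one-sided keys become optional):

    from pixeltable.type_system import JsonType

    a = JsonType(json_schema={
        'type': 'object',
        'properties': {'id': {'type': 'integer'}, 'tag': {'type': 'string'}},
        'required': ['id', 'tag'],
    })
    b = JsonType(json_schema={
        'type': 'object',
        'properties': {'id': {'type': 'integer'}},
        'required': ['id'],
    })

    print(a.supertype(b).json_schema)
    # {'type': 'object',
    #  'properties': {'id': {'type': 'integer'}, 'tag': {'type': 'string'}},
    #  'required': ['id']}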
@@ -743,6 +871,12 @@ class ArrayType(ColumnType):
             return False
         return val.dtype == self.numpy_dtype()
 
+    def _to_json_schema(self) -> dict[str, Any]:
+        return {
+            'type': 'array',
+            'items': self.pxt_dtype._to_json_schema(),
+        }
+
     def _validate_literal(self, val: Any) -> None:
         if not isinstance(val, np.ndarray):
             raise TypeError(f'Expected numpy.ndarray, got {val.__class__.__name__}')
@@ -1017,6 +1151,16 @@ class _PxtType:
 
 
 class Json(_PxtType):
+    def __class_getitem__(cls, item: Any) -> _AnnotatedAlias:
+        """
+        `item` (the type subscript) must be a `dict` representing a valid JSON Schema.
+        """
+        if not isinstance(item, dict):
+            raise TypeError('Json type parameter must be a dict')
+
+        # The JsonType initializer will validate the JSON Schema.
+        return typing.Annotated[Any, JsonType(json_schema=item, nullable=False)]
+
     @classmethod
     def as_col_type(cls, nullable: bool) -> ColumnType:
         return JsonType(nullable=nullable)
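
With __class_getitem__ in place, a JSON Schema can be attached to a column type directly via a type subscript. A hedged sketch of the intended usage (table and column names are illustrative; schema validation on insert follows from the validate_literal behavior shown above):

    import pixeltable as pxt

    person_schema = {
        'type': 'object',
        'properties': {'name': {'type': 'string'}, 'age': {'type': 'integer'}},
        'required': ['name'],
    }
    t = pxt.create_table('people', {'info': pxt.Json[person_schema]})
    t.insert([{'info': {'name': 'alice', 'age': 37}}])  # validated against the schema
    # t.insert([{'info': {'name': 42}}])                # would fail schema validation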