pixeltable 0.2.28__py3-none-any.whl → 0.2.30__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pixeltable might be problematic. Click here for more details.
- pixeltable/__init__.py +1 -1
- pixeltable/__version__.py +2 -2
- pixeltable/catalog/__init__.py +1 -1
- pixeltable/catalog/dir.py +6 -0
- pixeltable/catalog/globals.py +25 -0
- pixeltable/catalog/named_function.py +4 -0
- pixeltable/catalog/path_dict.py +37 -11
- pixeltable/catalog/schema_object.py +6 -0
- pixeltable/catalog/table.py +96 -19
- pixeltable/catalog/table_version.py +22 -8
- pixeltable/dataframe.py +201 -3
- pixeltable/env.py +9 -3
- pixeltable/exec/expr_eval_node.py +1 -1
- pixeltable/exec/sql_node.py +2 -2
- pixeltable/exprs/function_call.py +134 -29
- pixeltable/exprs/inline_expr.py +22 -2
- pixeltable/exprs/row_builder.py +1 -1
- pixeltable/exprs/similarity_expr.py +9 -2
- pixeltable/func/__init__.py +1 -0
- pixeltable/func/aggregate_function.py +151 -68
- pixeltable/func/callable_function.py +50 -16
- pixeltable/func/expr_template_function.py +62 -24
- pixeltable/func/function.py +191 -23
- pixeltable/func/function_registry.py +2 -1
- pixeltable/func/query_template_function.py +11 -6
- pixeltable/func/signature.py +64 -7
- pixeltable/func/tools.py +116 -0
- pixeltable/func/udf.py +57 -35
- pixeltable/functions/__init__.py +2 -2
- pixeltable/functions/anthropic.py +36 -2
- pixeltable/functions/globals.py +54 -34
- pixeltable/functions/json.py +3 -8
- pixeltable/functions/math.py +67 -0
- pixeltable/functions/ollama.py +4 -4
- pixeltable/functions/openai.py +31 -2
- pixeltable/functions/timestamp.py +1 -1
- pixeltable/functions/video.py +2 -8
- pixeltable/functions/vision.py +1 -1
- pixeltable/globals.py +347 -79
- pixeltable/index/embedding_index.py +44 -24
- pixeltable/metadata/__init__.py +1 -1
- pixeltable/metadata/converters/convert_16.py +2 -1
- pixeltable/metadata/converters/convert_17.py +2 -1
- pixeltable/metadata/converters/convert_23.py +35 -0
- pixeltable/metadata/converters/convert_24.py +47 -0
- pixeltable/metadata/converters/util.py +4 -2
- pixeltable/metadata/notes.py +2 -0
- pixeltable/metadata/schema.py +1 -0
- pixeltable/type_system.py +192 -48
- {pixeltable-0.2.28.dist-info → pixeltable-0.2.30.dist-info}/METADATA +4 -2
- {pixeltable-0.2.28.dist-info → pixeltable-0.2.30.dist-info}/RECORD +54 -57
- pixeltable-0.2.30.dist-info/entry_points.txt +3 -0
- pixeltable/tool/create_test_db_dump.py +0 -311
- pixeltable/tool/create_test_video.py +0 -81
- pixeltable/tool/doc_plugins/griffe.py +0 -50
- pixeltable/tool/doc_plugins/mkdocstrings.py +0 -6
- pixeltable/tool/doc_plugins/templates/material/udf.html.jinja +0 -135
- pixeltable/tool/embed_udf.py +0 -9
- pixeltable/tool/mypy_plugin.py +0 -55
- pixeltable-0.2.28.dist-info/entry_points.txt +0 -3
- {pixeltable-0.2.28.dist-info → pixeltable-0.2.30.dist-info}/LICENSE +0 -0
- {pixeltable-0.2.28.dist-info → pixeltable-0.2.30.dist-info}/WHEEL +0 -0
|
@@ -37,6 +37,14 @@ class EmbeddingIndex(IndexBase):
|
|
|
37
37
|
Metric.L2: 'vector_l2_ops'
|
|
38
38
|
}
|
|
39
39
|
|
|
40
|
+
metric: Metric
|
|
41
|
+
value_expr: exprs.FunctionCall
|
|
42
|
+
string_embed: Optional[func.Function]
|
|
43
|
+
image_embed: Optional[func.Function]
|
|
44
|
+
string_embed_signature_idx: int
|
|
45
|
+
image_embed_signature_idx: int
|
|
46
|
+
index_col_type: pgvector.sqlalchemy.Vector
|
|
47
|
+
|
|
40
48
|
def __init__(
|
|
41
49
|
self, c: catalog.Column, metric: str, string_embed: Optional[func.Function] = None,
|
|
42
50
|
image_embed: Optional[func.Function] = None):
|
|
@@ -49,18 +57,22 @@ class EmbeddingIndex(IndexBase):
|
|
|
49
57
|
raise excs.Error(f"Text embedding function is required for column {c.name} (parameter 'string_embed')")
|
|
50
58
|
if c.col_type.is_image_type() and image_embed is None:
|
|
51
59
|
raise excs.Error(f"Image embedding function is required for column {c.name} (parameter 'image_embed')")
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
self.
|
|
55
|
-
|
|
56
|
-
# verify signature
|
|
57
|
-
self._validate_embedding_fn(
|
|
60
|
+
|
|
61
|
+
if string_embed is None:
|
|
62
|
+
self.string_embed = None
|
|
63
|
+
else:
|
|
64
|
+
# verify signature and convert to a monomorphic function
|
|
65
|
+
self.string_embed = self._validate_embedding_fn(string_embed, 'string_embed', ts.ColumnType.Type.STRING)
|
|
66
|
+
|
|
67
|
+
if image_embed is None:
|
|
68
|
+
self.image_embed = None
|
|
69
|
+
else:
|
|
70
|
+
# verify signature and convert to a monomorphic function
|
|
71
|
+
self.image_embed = self._validate_embedding_fn(image_embed, 'image_embed', ts.ColumnType.Type.IMAGE)
|
|
58
72
|
|
|
59
73
|
self.metric = self.Metric[metric.upper()]
|
|
60
74
|
self.value_expr = string_embed(exprs.ColumnRef(c)) if c.col_type.is_string_type() else image_embed(exprs.ColumnRef(c))
|
|
61
75
|
assert isinstance(self.value_expr.col_type, ts.ArrayType)
|
|
62
|
-
self.string_embed = string_embed
|
|
63
|
-
self.image_embed = image_embed
|
|
64
76
|
vector_size = self.value_expr.col_type.shape[0]
|
|
65
77
|
assert vector_size is not None
|
|
66
78
|
self.index_col_type = pgvector.sqlalchemy.Vector(vector_size)
|
|
@@ -91,10 +103,10 @@ class EmbeddingIndex(IndexBase):
|
|
|
91
103
|
assert isinstance(item, (str, PIL.Image.Image))
|
|
92
104
|
if isinstance(item, str):
|
|
93
105
|
assert self.string_embed is not None
|
|
94
|
-
embedding = self.string_embed.exec(item)
|
|
106
|
+
embedding = self.string_embed.exec([item], {})
|
|
95
107
|
if isinstance(item, PIL.Image.Image):
|
|
96
108
|
assert self.image_embed is not None
|
|
97
|
-
embedding = self.image_embed.exec(item)
|
|
109
|
+
embedding = self.image_embed.exec([item], {})
|
|
98
110
|
|
|
99
111
|
if self.metric == self.Metric.COSINE:
|
|
100
112
|
return val_column.sa_col.cosine_distance(embedding) * -1 + 1
|
|
@@ -110,10 +122,10 @@ class EmbeddingIndex(IndexBase):
|
|
|
110
122
|
embedding: Optional[np.ndarray] = None
|
|
111
123
|
if isinstance(item, str):
|
|
112
124
|
assert self.string_embed is not None
|
|
113
|
-
embedding = self.string_embed.exec(item)
|
|
125
|
+
embedding = self.string_embed.exec([item], {})
|
|
114
126
|
if isinstance(item, PIL.Image.Image):
|
|
115
127
|
assert self.image_embed is not None
|
|
116
|
-
embedding = self.image_embed.exec(item)
|
|
128
|
+
embedding = self.image_embed.exec([item], {})
|
|
117
129
|
assert embedding is not None
|
|
118
130
|
|
|
119
131
|
if self.metric == self.Metric.COSINE:
|
|
@@ -132,27 +144,33 @@ class EmbeddingIndex(IndexBase):
|
|
|
132
144
|
return 'embedding'
|
|
133
145
|
|
|
134
146
|
@classmethod
|
|
135
|
-
def _validate_embedding_fn(cls, embed_fn: func.Function, name: str, expected_type: ts.ColumnType.Type) ->
|
|
136
|
-
"""Validate the signature"""
|
|
147
|
+
def _validate_embedding_fn(cls, embed_fn: func.Function, name: str, expected_type: ts.ColumnType.Type) -> func.Function:
|
|
148
|
+
"""Validate that the Function has a matching signature, and return the corresponding monomorphic function."""
|
|
137
149
|
assert isinstance(embed_fn, func.Function)
|
|
138
|
-
sig = embed_fn.signature
|
|
139
150
|
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
151
|
+
signature_idx: int = -1
|
|
152
|
+
for idx, sig in enumerate(embed_fn.signatures):
|
|
153
|
+
# The embedding function must be a 1-ary function of the correct type. But it's ok if the function signature
|
|
154
|
+
# has more than one parameter, as long as it has at most one *required* parameter.
|
|
155
|
+
if (len(sig.parameters) >= 1
|
|
156
|
+
and len(sig.required_parameters) <= 1
|
|
157
|
+
and sig.parameters_by_pos[0].col_type.type_enum == expected_type):
|
|
158
|
+
signature_idx = idx
|
|
159
|
+
break
|
|
160
|
+
|
|
161
|
+
if signature_idx == -1:
|
|
162
|
+
raise excs.Error(f'{name} must take a single {expected_type.name.lower()} parameter')
|
|
163
|
+
|
|
164
|
+
resolved_fn = embed_fn._resolved_fns[signature_idx]
|
|
147
165
|
|
|
148
166
|
# validate return type
|
|
149
167
|
param_name = sig.parameters_by_pos[0].name
|
|
150
168
|
if expected_type == ts.ColumnType.Type.STRING:
|
|
151
|
-
return_type =
|
|
169
|
+
return_type = resolved_fn.call_return_type([], {param_name: 'dummy'})
|
|
152
170
|
else:
|
|
153
171
|
assert expected_type == ts.ColumnType.Type.IMAGE
|
|
154
172
|
img = PIL.Image.new('RGB', (512, 512))
|
|
155
|
-
return_type =
|
|
173
|
+
return_type = resolved_fn.call_return_type([], {param_name: img})
|
|
156
174
|
assert return_type is not None
|
|
157
175
|
if not isinstance(return_type, ts.ArrayType):
|
|
158
176
|
raise excs.Error(f'{name} must return an array, but returns {return_type}')
|
|
@@ -161,6 +179,8 @@ class EmbeddingIndex(IndexBase):
|
|
|
161
179
|
if len(shape) != 1 or shape[0] == None:
|
|
162
180
|
raise excs.Error(f'{name} must return a 1D array of a specific length, but returns {return_type}')
|
|
163
181
|
|
|
182
|
+
return resolved_fn
|
|
183
|
+
|
|
164
184
|
def as_dict(self) -> dict:
|
|
165
185
|
return {
|
|
166
186
|
'metric': self.metric.name.lower(),
|
pixeltable/metadata/__init__.py
CHANGED
|
@@ -10,7 +10,7 @@ import sqlalchemy.orm as orm
|
|
|
10
10
|
from .schema import SystemInfo, SystemInfoMd
|
|
11
11
|
|
|
12
12
|
# current version of the metadata; this is incremented whenever the metadata schema changes
|
|
13
|
-
VERSION =
|
|
13
|
+
VERSION = 25
|
|
14
14
|
|
|
15
15
|
|
|
16
16
|
def create_system_info(engine: sql.engine.Engine) -> None:
|
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
from uuid import UUID
|
|
1
2
|
import sqlalchemy as sql
|
|
2
3
|
|
|
3
4
|
from pixeltable.metadata import register_converter
|
|
@@ -12,7 +13,7 @@ def _(engine: sql.engine.Engine) -> None:
|
|
|
12
13
|
)
|
|
13
14
|
|
|
14
15
|
|
|
15
|
-
def __update_table_md(table_md: dict) -> None:
|
|
16
|
+
def __update_table_md(table_md: dict, table_id: UUID) -> None:
|
|
16
17
|
# External stores are not migratable; just drop them
|
|
17
18
|
del table_md['remotes']
|
|
18
19
|
table_md['external_stores'] = {}
|
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
from uuid import UUID
|
|
1
2
|
import sqlalchemy as sql
|
|
2
3
|
|
|
3
4
|
from pixeltable.metadata import register_converter
|
|
@@ -12,7 +13,7 @@ def _(engine: sql.engine.Engine) -> None:
|
|
|
12
13
|
)
|
|
13
14
|
|
|
14
15
|
|
|
15
|
-
def __update_table_md(table_md: dict) -> None:
|
|
16
|
+
def __update_table_md(table_md: dict, table_id: UUID) -> None:
|
|
16
17
|
# key changes in IndexMd.init_args: img_embed -> image_embed, txt_embed -> string_embed
|
|
17
18
|
if len(table_md['index_md']) == 0:
|
|
18
19
|
return
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from typing import Any, Optional
|
|
3
|
+
from uuid import UUID
|
|
4
|
+
import sqlalchemy as sql
|
|
5
|
+
|
|
6
|
+
from pixeltable.metadata import register_converter
|
|
7
|
+
from pixeltable.metadata.converters.util import convert_table_md
|
|
8
|
+
from pixeltable.metadata.schema import Table
|
|
9
|
+
|
|
10
|
+
_logger = logging.getLogger('pixeltable')
|
|
11
|
+
|
|
12
|
+
@register_converter(version=23)
|
|
13
|
+
def _(engine: sql.engine.Engine) -> None:
|
|
14
|
+
convert_table_md(
|
|
15
|
+
engine,
|
|
16
|
+
table_md_updater=__update_table_md
|
|
17
|
+
)
|
|
18
|
+
|
|
19
|
+
def __update_table_md(table_md: dict, table_id: UUID) -> None:
|
|
20
|
+
"""update the index metadata to add indexed_col_tbl_id column if it is missing
|
|
21
|
+
|
|
22
|
+
Args:
|
|
23
|
+
table_md (dict): copy of the original table metadata. this gets updated in place.
|
|
24
|
+
table_id (UUID): the table id
|
|
25
|
+
|
|
26
|
+
"""
|
|
27
|
+
if len(table_md['index_md']) == 0:
|
|
28
|
+
return
|
|
29
|
+
for idx_md in table_md['index_md'].values():
|
|
30
|
+
if 'indexed_col_tbl_id' not in idx_md:
|
|
31
|
+
# index metadata is missing indexed_col_tbl_id
|
|
32
|
+
# assume that the indexed column is in the same table
|
|
33
|
+
# and update the index metadata.
|
|
34
|
+
_logger.info(f'Updating index metadata for table: {table_id} index: {idx_md["id"]}')
|
|
35
|
+
idx_md['indexed_col_tbl_id'] = str(table_id)
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
import importlib
|
|
2
|
+
from typing import Any, Optional
|
|
3
|
+
|
|
4
|
+
import sqlalchemy as sql
|
|
5
|
+
|
|
6
|
+
from pixeltable.metadata import register_converter
|
|
7
|
+
from pixeltable.metadata.converters.util import convert_table_md
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
@register_converter(version=24)
|
|
11
|
+
def _(engine: sql.engine.Engine) -> None:
|
|
12
|
+
convert_table_md(engine, substitution_fn=__substitute_md)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def __substitute_md(k: Optional[str], v: Any) -> Optional[tuple[Optional[str], Any]]:
|
|
16
|
+
from pixeltable import func
|
|
17
|
+
from pixeltable.func.globals import resolve_symbol
|
|
18
|
+
|
|
19
|
+
if (isinstance(v, dict) and
|
|
20
|
+
'_classpath' in v and
|
|
21
|
+
v['_classpath'] in ['pixeltable.func.callable_function.CallableFunction',
|
|
22
|
+
'pixeltable.func.aggregate_function.AggregateFunction',
|
|
23
|
+
'pixeltable.func.expr_template_function.ExprTemplateFunction']):
|
|
24
|
+
if 'path' in v:
|
|
25
|
+
assert 'signature' not in v
|
|
26
|
+
f = resolve_symbol(v['path'])
|
|
27
|
+
assert isinstance(f, func.Function)
|
|
28
|
+
v['signature'] = f.signatures[0].as_dict()
|
|
29
|
+
return k, v
|
|
30
|
+
|
|
31
|
+
if isinstance(v, dict) and '_classname' in v and v['_classname'] == 'FunctionCall':
|
|
32
|
+
# Correct an older serialization mechanism where Expr elements of FunctionCall args and
|
|
33
|
+
# kwargs were indicated with idx == -1 rather than None. This was fixed for InlineList
|
|
34
|
+
# and InlineDict back in convert_20, but not for FunctionCall.
|
|
35
|
+
assert 'args' in v and isinstance(v['args'], list)
|
|
36
|
+
assert 'kwargs' in v and isinstance(v['kwargs'], dict)
|
|
37
|
+
v['args'] = [
|
|
38
|
+
(None, arg) if idx == -1 else (idx, arg)
|
|
39
|
+
for idx, arg in v['args']
|
|
40
|
+
]
|
|
41
|
+
v['kwargs'] = {
|
|
42
|
+
k: (None, arg) if idx == -1 else (idx, arg)
|
|
43
|
+
for k, (idx, arg) in v['kwargs'].items()
|
|
44
|
+
}
|
|
45
|
+
return k, v
|
|
46
|
+
|
|
47
|
+
return None
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
import copy
|
|
2
2
|
import logging
|
|
3
3
|
from typing import Any, Callable, Optional
|
|
4
|
+
from uuid import UUID
|
|
4
5
|
|
|
5
6
|
import sqlalchemy as sql
|
|
6
7
|
|
|
@@ -11,7 +12,7 @@ __logger = logging.getLogger('pixeltable')
|
|
|
11
12
|
|
|
12
13
|
def convert_table_md(
|
|
13
14
|
engine: sql.engine.Engine,
|
|
14
|
-
table_md_updater: Optional[Callable[[dict], None]] = None,
|
|
15
|
+
table_md_updater: Optional[Callable[[dict, UUID], None]] = None,
|
|
15
16
|
column_md_updater: Optional[Callable[[dict], None]] = None,
|
|
16
17
|
external_store_md_updater: Optional[Callable[[dict], None]] = None,
|
|
17
18
|
substitution_fn: Optional[Callable[[Optional[str], Any], Optional[tuple[Optional[str], Any]]]] = None
|
|
@@ -22,6 +23,7 @@ def convert_table_md(
|
|
|
22
23
|
Args:
|
|
23
24
|
engine: The SQLAlchemy engine.
|
|
24
25
|
table_md_updater: A function that updates schema.TableMd dicts in place.
|
|
26
|
+
It takes two arguments: the metadata dict (new values) and the table id.
|
|
25
27
|
column_md_updater: A function that updates schema.ColumnMd dicts in place.
|
|
26
28
|
external_store_md_updater: A function that updates the external store metadata in place.
|
|
27
29
|
substitution_fn: A function that substitutes metadata values. If specified, all metadata will be traversed
|
|
@@ -37,7 +39,7 @@ def convert_table_md(
|
|
|
37
39
|
assert isinstance(table_md, dict)
|
|
38
40
|
updated_table_md = copy.deepcopy(table_md)
|
|
39
41
|
if table_md_updater is not None:
|
|
40
|
-
table_md_updater(updated_table_md)
|
|
42
|
+
table_md_updater(updated_table_md, id)
|
|
41
43
|
if column_md_updater is not None:
|
|
42
44
|
__update_column_md(updated_table_md, column_md_updater)
|
|
43
45
|
if external_store_md_updater is not None:
|
pixeltable/metadata/notes.py
CHANGED
|
@@ -2,6 +2,8 @@
|
|
|
2
2
|
# rather than as a comment, so that the existence of a description can be enforced by
|
|
3
3
|
# the unit tests when new versions are added.
|
|
4
4
|
VERSION_NOTES = {
|
|
5
|
+
25: 'Functions with multiple signatures',
|
|
6
|
+
24: 'Added TableMd/IndexMd.indexed_col_tbl_id',
|
|
5
7
|
23: 'DataFrame.from_clause',
|
|
6
8
|
22: 'TableMd/ColumnMd.media_validation',
|
|
7
9
|
21: 'Separate InlineArray and InlineList',
|
pixeltable/metadata/schema.py
CHANGED
|
@@ -112,6 +112,7 @@ class IndexMd:
|
|
|
112
112
|
"""
|
|
113
113
|
id: int
|
|
114
114
|
name: str
|
|
115
|
+
indexed_col_tbl_id: str # UUID of the table (as string) that contains column being indexed
|
|
115
116
|
indexed_col_id: int # column being indexed
|
|
116
117
|
index_val_col_id: int # column holding the values to be indexed
|
|
117
118
|
index_val_undo_col_id: int # column holding index values for deleted rows
|
pixeltable/type_system.py
CHANGED
|
@@ -5,7 +5,6 @@ import datetime
|
|
|
5
5
|
import enum
|
|
6
6
|
import io
|
|
7
7
|
import json
|
|
8
|
-
import types
|
|
9
8
|
import typing
|
|
10
9
|
import urllib.parse
|
|
11
10
|
import urllib.request
|
|
@@ -14,7 +13,11 @@ from typing import Any, Iterable, Mapping, Optional, Sequence, Union
|
|
|
14
13
|
|
|
15
14
|
import PIL.Image
|
|
16
15
|
import av # type: ignore
|
|
16
|
+
import jsonschema
|
|
17
|
+
import jsonschema.protocols
|
|
18
|
+
import jsonschema.validators
|
|
17
19
|
import numpy as np
|
|
20
|
+
import pydantic
|
|
18
21
|
import sqlalchemy as sql
|
|
19
22
|
from typing import _GenericAlias # type: ignore[attr-defined]
|
|
20
23
|
from typing_extensions import _AnnotatedAlias
|
|
@@ -244,7 +247,7 @@ class ColumnType:
|
|
|
244
247
|
if col_type is not None:
|
|
245
248
|
return col_type
|
|
246
249
|
# this could still be json-serializable
|
|
247
|
-
if isinstance(val, dict) or isinstance(val, list) or isinstance(val, np.ndarray):
|
|
250
|
+
if isinstance(val, dict) or isinstance(val, list) or isinstance(val, np.ndarray) or isinstance(val, pydantic.BaseModel):
|
|
248
251
|
try:
|
|
249
252
|
JsonType().validate_literal(val)
|
|
250
253
|
return JsonType(nullable=nullable)
|
|
@@ -337,7 +340,7 @@ class ColumnType:
|
|
|
337
340
|
return TimestampType(nullable=nullable_default)
|
|
338
341
|
if t is PIL.Image.Image:
|
|
339
342
|
return ImageType(nullable=nullable_default)
|
|
340
|
-
if issubclass(t, Sequence) or issubclass(t, Mapping):
|
|
343
|
+
if issubclass(t, Sequence) or issubclass(t, Mapping) or issubclass(t, pydantic.BaseModel):
|
|
341
344
|
return JsonType(nullable=nullable_default)
|
|
342
345
|
return None
|
|
343
346
|
|
|
@@ -479,6 +482,20 @@ class ColumnType:
|
|
|
479
482
|
"""
|
|
480
483
|
pass
|
|
481
484
|
|
|
485
|
+
def to_json_schema(self) -> dict[str, Any]:
|
|
486
|
+
if self.nullable:
|
|
487
|
+
return {
|
|
488
|
+
'anyOf': [
|
|
489
|
+
self._to_json_schema(),
|
|
490
|
+
{'type': 'null'},
|
|
491
|
+
]
|
|
492
|
+
}
|
|
493
|
+
else:
|
|
494
|
+
return self._to_json_schema()
|
|
495
|
+
|
|
496
|
+
def _to_json_schema(self) -> dict[str, Any]:
|
|
497
|
+
raise excs.Error(f'Pixeltable type {self} is not a valid JSON type')
|
|
498
|
+
|
|
482
499
|
|
|
483
500
|
class InvalidType(ColumnType):
|
|
484
501
|
def __init__(self, nullable: bool = False):
|
|
@@ -501,6 +518,9 @@ class StringType(ColumnType):
|
|
|
501
518
|
def to_sa_type(self) -> sql.types.TypeEngine:
|
|
502
519
|
return sql.String()
|
|
503
520
|
|
|
521
|
+
def _to_json_schema(self) -> dict[str, Any]:
|
|
522
|
+
return {'type': 'string'}
|
|
523
|
+
|
|
504
524
|
def print_value(self, val: Any) -> str:
|
|
505
525
|
return f"'{val}'"
|
|
506
526
|
|
|
@@ -524,8 +544,13 @@ class IntType(ColumnType):
|
|
|
524
544
|
def to_sa_type(self) -> sql.types.TypeEngine:
|
|
525
545
|
return sql.BigInteger()
|
|
526
546
|
|
|
547
|
+
def _to_json_schema(self) -> dict[str, Any]:
|
|
548
|
+
return {'type': 'integer'}
|
|
549
|
+
|
|
527
550
|
def _validate_literal(self, val: Any) -> None:
|
|
528
|
-
|
|
551
|
+
# bool is a subclass of int, so we need to check for it
|
|
552
|
+
# explicitly first
|
|
553
|
+
if isinstance(val, bool) or not isinstance(val, int):
|
|
529
554
|
raise TypeError(f'Expected int, got {val.__class__.__name__}')
|
|
530
555
|
|
|
531
556
|
|
|
@@ -536,6 +561,9 @@ class FloatType(ColumnType):
|
|
|
536
561
|
def to_sa_type(self) -> sql.types.TypeEngine:
|
|
537
562
|
return sql.Float()
|
|
538
563
|
|
|
564
|
+
def _to_json_schema(self) -> dict[str, Any]:
|
|
565
|
+
return {'type': 'number'}
|
|
566
|
+
|
|
539
567
|
def _validate_literal(self, val: Any) -> None:
|
|
540
568
|
if not isinstance(val, float):
|
|
541
569
|
raise TypeError(f'Expected float, got {val.__class__.__name__}')
|
|
@@ -553,6 +581,9 @@ class BoolType(ColumnType):
|
|
|
553
581
|
def to_sa_type(self) -> sql.types.TypeEngine:
|
|
554
582
|
return sql.Boolean()
|
|
555
583
|
|
|
584
|
+
def _to_json_schema(self) -> dict[str, Any]:
|
|
585
|
+
return {'type': 'boolean'}
|
|
586
|
+
|
|
556
587
|
def _validate_literal(self, val: Any) -> None:
|
|
557
588
|
if not isinstance(val, bool):
|
|
558
589
|
raise TypeError(f'Expected bool, got {val.__class__.__name__}')
|
|
@@ -581,61 +612,44 @@ class TimestampType(ColumnType):
|
|
|
581
612
|
|
|
582
613
|
|
|
583
614
|
class JsonType(ColumnType):
|
|
584
|
-
|
|
585
|
-
|
|
615
|
+
|
|
616
|
+
json_schema: Optional[dict[str, Any]]
|
|
617
|
+
__validator: Optional[jsonschema.protocols.Validator]
|
|
618
|
+
|
|
619
|
+
def __init__(self, json_schema: Optional[dict[str, Any]] = None, nullable: bool = False):
|
|
586
620
|
super().__init__(self.Type.JSON, nullable=nullable)
|
|
587
|
-
self.
|
|
621
|
+
self.json_schema = json_schema
|
|
622
|
+
if json_schema is None:
|
|
623
|
+
self.__validator = None
|
|
624
|
+
else:
|
|
625
|
+
validator_cls = jsonschema.validators.validator_for(json_schema)
|
|
626
|
+
validator_cls.check_schema(json_schema)
|
|
627
|
+
self.__validator = validator_cls(json_schema)
|
|
588
628
|
|
|
589
629
|
def copy(self, nullable: bool) -> ColumnType:
|
|
590
|
-
return JsonType(self.
|
|
630
|
+
return JsonType(json_schema=self.json_schema, nullable=nullable)
|
|
591
631
|
|
|
592
632
|
def matches(self, other: ColumnType) -> bool:
|
|
593
|
-
return isinstance(other, JsonType) and self.
|
|
594
|
-
|
|
595
|
-
def supertype(self, other: ColumnType) -> Optional[JsonType]:
|
|
596
|
-
if not isinstance(other, JsonType):
|
|
597
|
-
return None
|
|
598
|
-
if self.type_spec is None:
|
|
599
|
-
# we don't have a type spec and can accept anything accepted by other
|
|
600
|
-
return JsonType(nullable=(self.nullable or other.nullable))
|
|
601
|
-
if other.type_spec is None:
|
|
602
|
-
# we have a type spec but other doesn't
|
|
603
|
-
return JsonType(nullable=(self.nullable or other.nullable))
|
|
604
|
-
|
|
605
|
-
# we both have type specs; the supertype's type spec is the union of the two
|
|
606
|
-
type_spec: dict[str, ColumnType] = {}
|
|
607
|
-
type_spec.update(self.type_spec)
|
|
608
|
-
for other_field_name, other_field_type in other.type_spec.items():
|
|
609
|
-
if other_field_name not in type_spec:
|
|
610
|
-
type_spec[other_field_name] = other_field_type
|
|
611
|
-
else:
|
|
612
|
-
# both type specs have this field
|
|
613
|
-
field_type = type_spec[other_field_name].supertype(other_field_type)
|
|
614
|
-
if field_type is None:
|
|
615
|
-
# conflicting types
|
|
616
|
-
return JsonType(nullable=(self.nullable or other.nullable))
|
|
617
|
-
type_spec[other_field_name] = field_type
|
|
618
|
-
return JsonType(type_spec, nullable=(self.nullable or other.nullable))
|
|
633
|
+
return isinstance(other, JsonType) and self.json_schema == other.json_schema
|
|
619
634
|
|
|
620
635
|
def _as_dict(self) -> dict:
|
|
621
636
|
result = super()._as_dict()
|
|
622
|
-
if self.
|
|
623
|
-
|
|
624
|
-
result.update({'type_spec': type_spec_dict})
|
|
637
|
+
if self.json_schema is not None:
|
|
638
|
+
result.update({'json_schema': self.json_schema})
|
|
625
639
|
return result
|
|
626
640
|
|
|
627
641
|
@classmethod
|
|
628
642
|
def _from_dict(cls, d: dict) -> ColumnType:
|
|
629
|
-
|
|
630
|
-
if 'type_spec' in d:
|
|
631
|
-
type_spec = {
|
|
632
|
-
field_name: cls.deserialize(field_type_dict) for field_name, field_type_dict in d['type_spec'].items()
|
|
633
|
-
}
|
|
634
|
-
return cls(type_spec, nullable=d['nullable'])
|
|
643
|
+
return cls(json_schema=d.get('json_schema'), nullable=d['nullable'])
|
|
635
644
|
|
|
636
645
|
def to_sa_type(self) -> sql.types.TypeEngine:
|
|
637
646
|
return sql.dialects.postgresql.JSONB()
|
|
638
647
|
|
|
648
|
+
def _to_json_schema(self) -> dict[str, Any]:
|
|
649
|
+
if self.json_schema is None:
|
|
650
|
+
return {}
|
|
651
|
+
return self.json_schema
|
|
652
|
+
|
|
639
653
|
def print_value(self, val: Any) -> str:
|
|
640
654
|
val_type = self.infer_literal_type(val)
|
|
641
655
|
if val_type is None:
|
|
@@ -645,27 +659,141 @@ class JsonType(ColumnType):
|
|
|
645
659
|
return val_type.print_value(val)
|
|
646
660
|
|
|
647
661
|
def _validate_literal(self, val: Any) -> None:
|
|
648
|
-
if not isinstance(val, dict
|
|
662
|
+
if not isinstance(val, (dict, list)):
|
|
649
663
|
# TODO In the future we should accept scalars too, which would enable us to remove this top-level check
|
|
650
664
|
raise TypeError(f'Expected dict or list, got {val.__class__.__name__}')
|
|
651
|
-
if not self.
|
|
665
|
+
if not self.__is_valid_json(val):
|
|
652
666
|
raise TypeError(f'That literal is not a valid Pixeltable JSON object: {val}')
|
|
667
|
+
if self.__validator is not None:
|
|
668
|
+
self.__validator.validate(val)
|
|
653
669
|
|
|
654
670
|
@classmethod
|
|
655
|
-
def
|
|
671
|
+
def __is_valid_json(cls, val: Any) -> bool:
|
|
656
672
|
if val is None or isinstance(val, (str, int, float, bool)):
|
|
657
673
|
return True
|
|
658
674
|
if isinstance(val, (list, tuple)):
|
|
659
|
-
return all(cls.
|
|
675
|
+
return all(cls.__is_valid_json(v) for v in val)
|
|
660
676
|
if isinstance(val, dict):
|
|
661
|
-
return all(isinstance(k, str) and cls.
|
|
677
|
+
return all(isinstance(k, str) and cls.__is_valid_json(v) for k, v in val.items())
|
|
662
678
|
return False
|
|
663
679
|
|
|
664
680
|
def _create_literal(self, val: Any) -> Any:
|
|
665
681
|
if isinstance(val, tuple):
|
|
666
682
|
val = list(val)
|
|
683
|
+
if isinstance(val, pydantic.BaseModel):
|
|
684
|
+
return val.model_dump()
|
|
667
685
|
return val
|
|
668
686
|
|
|
687
|
+
def supertype(self, other: ColumnType) -> Optional[JsonType]:
|
|
688
|
+
# Try using the (much faster) supertype logic in ColumnType first. That will work if, for example, the types
|
|
689
|
+
# are identical except for nullability. If that doesn't work and both types are JsonType, then we will need to
|
|
690
|
+
# merge their schemas.
|
|
691
|
+
basic_supertype = super().supertype(other)
|
|
692
|
+
if basic_supertype is not None:
|
|
693
|
+
assert isinstance(basic_supertype, JsonType)
|
|
694
|
+
return basic_supertype
|
|
695
|
+
|
|
696
|
+
if not isinstance(other, JsonType):
|
|
697
|
+
return None
|
|
698
|
+
|
|
699
|
+
if self.json_schema is None or other.json_schema is None:
|
|
700
|
+
return JsonType(nullable=(self.nullable or other.nullable))
|
|
701
|
+
|
|
702
|
+
superschema = self.__superschema(self.json_schema, other.json_schema)
|
|
703
|
+
|
|
704
|
+
return JsonType(
|
|
705
|
+
json_schema=(None if len(superschema) == 0 else superschema),
|
|
706
|
+
nullable=(self.nullable or other.nullable)
|
|
707
|
+
)
|
|
708
|
+
|
|
709
|
+
@classmethod
|
|
710
|
+
def __superschema(cls, a: dict[str, Any], b: dict[str, Any]) -> Optional[dict[str, Any]]:
|
|
711
|
+
# Defining a general type hierarchy over all JSON schemas would be a challenging problem. In order to keep
|
|
712
|
+
# things manageable, we only define a hierarchy among "conforming" schemas, which provides enough generality
|
|
713
|
+
# for the most important use cases (unions for type inference, validation of inline exprs). A schema is
|
|
714
|
+
# considered to be conforming if either:
|
|
715
|
+
# (i) it is a scalar (string, integer, number, boolean) or dictionary (object) type; or
|
|
716
|
+
# (ii) it is an "anyOf" schema of one of the above types and the exact schema {'type': 'null'}.
|
|
717
|
+
# Conforming schemas are organized into a type hierarchy in an internally consistent way. Nonconforming
|
|
718
|
+
# schemas are allowed, but they are isolates in the type hierarchy: a nonconforming schema has no proper
|
|
719
|
+
# subtypes, and its only proper supertype is an unconstrained JsonType().
|
|
720
|
+
#
|
|
721
|
+
# There is some subtlety in the handling of nullable fields. Nullable fields are represented in JSON
|
|
722
|
+
# schemas as (for example) {'anyOf': [{'type': 'string'}, {'type': 'null'}]}. When finding the supertype
|
|
723
|
+
# of schemas that might be nullable, we first unpack the 'anyOf's, find the supertype of the underlyings,
|
|
724
|
+
# then reapply the 'anyOf' if appropriate. The top-level schema (i.e., JsonType.json_schema) is presumed
|
|
725
|
+
# to NOT be in this form (since nullability is indicated by the `nullable` field of the JsonType object),
|
|
726
|
+
# so this subtlety is applicable only to types that occur in subfields.
|
|
727
|
+
#
|
|
728
|
+
# There is currently no special handling of lists; distinct schemas with type 'array' will union to the
|
|
729
|
+
# generic {'type': 'array'} schema. This could be a TODO item if there is a need for it in the future.
|
|
730
|
+
|
|
731
|
+
if a == b:
|
|
732
|
+
return a
|
|
733
|
+
|
|
734
|
+
if 'properties' in a and 'properties' in b:
|
|
735
|
+
a_props = a['properties']
|
|
736
|
+
b_props = b['properties']
|
|
737
|
+
a_req = a.get('required', [])
|
|
738
|
+
b_req = b.get('required', [])
|
|
739
|
+
super_props = {}
|
|
740
|
+
super_req = []
|
|
741
|
+
for key, a_prop_schema in a_props.items():
|
|
742
|
+
if key in b_props: # in both a and b
|
|
743
|
+
prop_schema = cls.__superschema_with_nulls(a_prop_schema, b_props[key])
|
|
744
|
+
super_props[key] = prop_schema
|
|
745
|
+
if key in a_req and key in b_req:
|
|
746
|
+
super_req.append(key)
|
|
747
|
+
else: # in a but not b
|
|
748
|
+
# Add it to the supertype schema as optional (regardless of its status in a)
|
|
749
|
+
super_props[key] = a_prop_schema
|
|
750
|
+
for key, b_prop_schema in b_props.items():
|
|
751
|
+
if key not in a_props: # in b but not a
|
|
752
|
+
super_props[key] = b_prop_schema
|
|
753
|
+
schema = {'type': 'object', 'properties': super_props}
|
|
754
|
+
if len(super_req) > 0:
|
|
755
|
+
schema['required'] = super_req
|
|
756
|
+
return schema
|
|
757
|
+
|
|
758
|
+
a_type = a.get('type')
|
|
759
|
+
b_type = b.get('type')
|
|
760
|
+
|
|
761
|
+
if (a_type in ('string', 'integer', 'number', 'boolean', 'object', 'array') and a_type == b_type):
|
|
762
|
+
# a and b both have the same type designation, but are not identical. This can happen if
|
|
763
|
+
# (for example) they have validators or other attributes that differ. In this case, we
|
|
764
|
+
# generalize to {'type': t}, where t is their shared type, with no other qualifications.
|
|
765
|
+
return {'type': a_type}
|
|
766
|
+
|
|
767
|
+
return {} # Unresolvable type conflict; the supertype is an unrestricted JsonType.
|
|
768
|
+
|
|
769
|
+
@classmethod
|
|
770
|
+
def __superschema_with_nulls(cls, a: dict[str, Any], b: dict[str, Any]) -> Optional[dict[str, Any]]:
|
|
771
|
+
a, a_nullable = cls.__unpack_null_from_schema(a)
|
|
772
|
+
b, b_nullable = cls.__unpack_null_from_schema(b)
|
|
773
|
+
|
|
774
|
+
result = cls.__superschema(a, b)
|
|
775
|
+
if len(result) > 0 and (a_nullable or b_nullable):
|
|
776
|
+
# if len(result) == 0, then null is implicitly accepted; otherwise, we need to explicitly allow it
|
|
777
|
+
return {'anyOf': [result, {'type': 'null'}]}
|
|
778
|
+
return result
|
|
779
|
+
|
|
780
|
+
@classmethod
|
|
781
|
+
def __unpack_null_from_schema(cls, s: dict[str, Any]) -> tuple[dict[str, Any], bool]:
|
|
782
|
+
if 'anyOf' in s and len(s['anyOf']) == 2 and {'type': 'null'} in s['anyOf']:
|
|
783
|
+
try:
|
|
784
|
+
return next(s for s in s['anyOf'] if s != {'type': 'null'}), True
|
|
785
|
+
except StopIteration:
|
|
786
|
+
pass
|
|
787
|
+
return s, False
|
|
788
|
+
|
|
789
|
+
def _to_base_str(self) -> str:
|
|
790
|
+
if self.json_schema is None:
|
|
791
|
+
return 'Json'
|
|
792
|
+
elif 'title' in self.json_schema:
|
|
793
|
+
return f'Json[{self.json_schema["title"]}]'
|
|
794
|
+
else:
|
|
795
|
+
return f'Json[{self.json_schema}]'
|
|
796
|
+
|
|
669
797
|
|
|
670
798
|
class ArrayType(ColumnType):
|
|
671
799
|
def __init__(self, shape: tuple[Union[int, None], ...], dtype: ColumnType, nullable: bool = False):
|
|
@@ -743,6 +871,12 @@ class ArrayType(ColumnType):
|
|
|
743
871
|
return False
|
|
744
872
|
return val.dtype == self.numpy_dtype()
|
|
745
873
|
|
|
874
|
+
def _to_json_schema(self) -> dict[str, Any]:
|
|
875
|
+
return {
|
|
876
|
+
'type': 'array',
|
|
877
|
+
'items': self.pxt_dtype._to_json_schema(),
|
|
878
|
+
}
|
|
879
|
+
|
|
746
880
|
def _validate_literal(self, val: Any) -> None:
|
|
747
881
|
if not isinstance(val, np.ndarray):
|
|
748
882
|
raise TypeError(f'Expected numpy.ndarray, got {val.__class__.__name__}')
|
|
@@ -1017,6 +1151,16 @@ class _PxtType:
|
|
|
1017
1151
|
|
|
1018
1152
|
|
|
1019
1153
|
class Json(_PxtType):
|
|
1154
|
+
def __class_getitem__(cls, item: Any) -> _AnnotatedAlias:
|
|
1155
|
+
"""
|
|
1156
|
+
`item` (the type subscript) must be a `dict` representing a valid JSON Schema.
|
|
1157
|
+
"""
|
|
1158
|
+
if not isinstance(item, dict):
|
|
1159
|
+
raise TypeError('Json type parameter must be a dict')
|
|
1160
|
+
|
|
1161
|
+
# The JsonType initializer will validate the JSON Schema.
|
|
1162
|
+
return typing.Annotated[Any, JsonType(json_schema=item, nullable=False)]
|
|
1163
|
+
|
|
1020
1164
|
@classmethod
|
|
1021
1165
|
def as_col_type(cls, nullable: bool) -> ColumnType:
|
|
1022
1166
|
return JsonType(nullable=nullable)
|