pixeltable 0.4.15__py3-none-any.whl → 0.4.17__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pixeltable might be problematic. Click here for more details.
- pixeltable/__init__.py +4 -0
- pixeltable/catalog/catalog.py +125 -63
- pixeltable/catalog/column.py +7 -2
- pixeltable/catalog/table.py +1 -0
- pixeltable/catalog/table_metadata.py +4 -0
- pixeltable/catalog/table_version.py +174 -117
- pixeltable/catalog/table_version_handle.py +4 -1
- pixeltable/catalog/table_version_path.py +0 -11
- pixeltable/catalog/view.py +6 -0
- pixeltable/config.py +7 -0
- pixeltable/dataframe.py +10 -5
- pixeltable/env.py +56 -19
- pixeltable/exec/__init__.py +2 -0
- pixeltable/exec/cell_materialization_node.py +231 -0
- pixeltable/exec/cell_reconstruction_node.py +135 -0
- pixeltable/exec/exec_node.py +1 -1
- pixeltable/exec/expr_eval/evaluators.py +1 -0
- pixeltable/exec/expr_eval/expr_eval_node.py +3 -0
- pixeltable/exec/expr_eval/globals.py +2 -0
- pixeltable/exec/globals.py +32 -0
- pixeltable/exec/object_store_save_node.py +1 -4
- pixeltable/exec/row_update_node.py +16 -9
- pixeltable/exec/sql_node.py +107 -14
- pixeltable/exprs/__init__.py +1 -1
- pixeltable/exprs/arithmetic_expr.py +23 -18
- pixeltable/exprs/column_property_ref.py +10 -10
- pixeltable/exprs/column_ref.py +2 -2
- pixeltable/exprs/data_row.py +106 -37
- pixeltable/exprs/expr.py +9 -0
- pixeltable/exprs/expr_set.py +14 -7
- pixeltable/exprs/inline_expr.py +2 -19
- pixeltable/exprs/json_path.py +45 -12
- pixeltable/exprs/row_builder.py +54 -22
- pixeltable/functions/__init__.py +1 -0
- pixeltable/functions/bedrock.py +7 -0
- pixeltable/functions/deepseek.py +11 -4
- pixeltable/functions/llama_cpp.py +7 -0
- pixeltable/functions/math.py +1 -1
- pixeltable/functions/ollama.py +7 -0
- pixeltable/functions/openai.py +4 -4
- pixeltable/functions/openrouter.py +143 -0
- pixeltable/functions/video.py +110 -28
- pixeltable/globals.py +10 -4
- pixeltable/io/globals.py +18 -17
- pixeltable/io/parquet.py +1 -1
- pixeltable/io/table_data_conduit.py +47 -22
- pixeltable/iterators/document.py +61 -23
- pixeltable/iterators/video.py +126 -53
- pixeltable/metadata/__init__.py +1 -1
- pixeltable/metadata/converters/convert_40.py +73 -0
- pixeltable/metadata/notes.py +1 -0
- pixeltable/plan.py +175 -46
- pixeltable/share/packager.py +155 -26
- pixeltable/store.py +2 -3
- pixeltable/type_system.py +5 -3
- pixeltable/utils/arrow.py +6 -6
- pixeltable/utils/av.py +65 -0
- pixeltable/utils/console_output.py +4 -1
- pixeltable/utils/exception_handler.py +5 -28
- pixeltable/utils/image.py +7 -0
- pixeltable/utils/misc.py +5 -0
- pixeltable/utils/object_stores.py +16 -1
- pixeltable/utils/s3_store.py +44 -11
- {pixeltable-0.4.15.dist-info → pixeltable-0.4.17.dist-info}/METADATA +29 -28
- {pixeltable-0.4.15.dist-info → pixeltable-0.4.17.dist-info}/RECORD +68 -61
- {pixeltable-0.4.15.dist-info → pixeltable-0.4.17.dist-info}/WHEEL +0 -0
- {pixeltable-0.4.15.dist-info → pixeltable-0.4.17.dist-info}/entry_points.txt +0 -0
- {pixeltable-0.4.15.dist-info → pixeltable-0.4.17.dist-info}/licenses/LICENSE +0 -0
pixeltable/exprs/data_row.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
+
import dataclasses
|
|
3
4
|
import datetime
|
|
4
5
|
import io
|
|
5
6
|
import urllib.parse
|
|
@@ -13,15 +14,72 @@ import PIL
|
|
|
13
14
|
import PIL.Image
|
|
14
15
|
import sqlalchemy as sql
|
|
15
16
|
|
|
17
|
+
import pixeltable.utils.image as image_utils
|
|
16
18
|
from pixeltable import catalog, env
|
|
17
19
|
from pixeltable.utils.local_store import TempStore
|
|
20
|
+
from pixeltable.utils.misc import non_none_dict_factory
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
@dataclasses.dataclass
|
|
24
|
+
class ArrayMd:
|
|
25
|
+
"""
|
|
26
|
+
Metadata for array cells that are stored externally.
|
|
27
|
+
"""
|
|
28
|
+
|
|
29
|
+
start: int
|
|
30
|
+
end: int
|
|
31
|
+
|
|
32
|
+
# we store bool arrays as packed bits (uint8 arrays), and need to record the shape to reconstruct the array
|
|
33
|
+
is_bool: bool = False
|
|
34
|
+
shape: tuple[int, ...] | None = None
|
|
35
|
+
|
|
36
|
+
def as_dict(self) -> dict:
|
|
37
|
+
# dict_factory: suppress Nones
|
|
38
|
+
x = dataclasses.asdict(self, dict_factory=non_none_dict_factory)
|
|
39
|
+
return x
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
@dataclasses.dataclass
|
|
43
|
+
class CellMd:
|
|
44
|
+
"""
|
|
45
|
+
Content of the cellmd column.
|
|
46
|
+
|
|
47
|
+
All fields are optional, to minimize storage.
|
|
48
|
+
"""
|
|
49
|
+
|
|
50
|
+
errortype: str | None = None
|
|
51
|
+
errormsg: str | None = None
|
|
52
|
+
|
|
53
|
+
# a list of file urls that are used to store images and arrays; only set for json and array columns
|
|
54
|
+
# for json columns: a list of all urls referenced in the column value
|
|
55
|
+
# for array columns: a single url
|
|
56
|
+
file_urls: list[str] | None = None
|
|
57
|
+
|
|
58
|
+
array_md: ArrayMd | None = None
|
|
59
|
+
|
|
60
|
+
@classmethod
|
|
61
|
+
def from_dict(cls, d: dict) -> CellMd:
|
|
62
|
+
x: CellMd
|
|
63
|
+
if 'array_md' in d:
|
|
64
|
+
d2 = d.copy()
|
|
65
|
+
del d2['array_md']
|
|
66
|
+
x = cls(**d2, array_md=ArrayMd(**d['array_md']))
|
|
67
|
+
else:
|
|
68
|
+
x = cls(**d)
|
|
69
|
+
return x
|
|
70
|
+
|
|
71
|
+
def as_dict(self) -> dict:
|
|
72
|
+
x = dataclasses.asdict(self, dict_factory=non_none_dict_factory)
|
|
73
|
+
return x
|
|
18
74
|
|
|
19
75
|
|
|
20
76
|
class DataRow:
|
|
21
77
|
"""
|
|
22
78
|
Encapsulates all data and execution state needed by RowBuilder and DataRowBatch:
|
|
23
79
|
- state for in-memory computation
|
|
24
|
-
- state for
|
|
80
|
+
- state needed for expression evaluation
|
|
81
|
+
- containers for output column values
|
|
82
|
+
|
|
25
83
|
This is not meant to be a black-box abstraction.
|
|
26
84
|
|
|
27
85
|
In-memory representations by column type:
|
|
@@ -39,79 +97,92 @@ class DataRow:
|
|
|
39
97
|
- DocumentType: local path if available, otherwise url
|
|
40
98
|
"""
|
|
41
99
|
|
|
100
|
+
# expr evaluation state; indexed by slot idx
|
|
42
101
|
vals: np.ndarray # of object
|
|
43
102
|
has_val: np.ndarray # of bool
|
|
44
103
|
excs: np.ndarray # of object
|
|
45
|
-
|
|
46
|
-
# If `may_have_exc` is False, then we guarantee that no slot has an exception set. This is used to optimize
|
|
47
|
-
# exception handling under normal operation.
|
|
48
|
-
_may_have_exc: bool
|
|
49
|
-
|
|
50
|
-
# expr evaluation state; indexed by slot idx
|
|
51
104
|
missing_slots: np.ndarray # of bool; number of missing dependencies
|
|
52
105
|
missing_dependents: np.ndarray # of int16; number of missing dependents
|
|
53
106
|
is_scheduled: np.ndarray # of bool; True if this slot is scheduled for evaluation
|
|
54
107
|
|
|
55
|
-
#
|
|
56
|
-
|
|
57
|
-
media_slot_idxs: list[int]
|
|
58
|
-
array_slot_idxs: list[int]
|
|
59
|
-
|
|
60
|
-
# the primary key of a store row is a sequence of ints (the number is different for table vs view)
|
|
61
|
-
pk: Optional[tuple[int, ...]]
|
|
108
|
+
# CellMd needed for query execution; needs to be indexed by slot idx, not column id, to work for joins
|
|
109
|
+
slot_md: dict[int, CellMd]
|
|
62
110
|
|
|
63
111
|
# file_urls:
|
|
64
112
|
# - stored url of file for media in vals[i]
|
|
65
113
|
# - None if vals[i] is not media type
|
|
66
114
|
# - not None if file_paths[i] is not None
|
|
115
|
+
# TODO: this is a sparse vector; should it be a dict[int, str]?
|
|
67
116
|
file_urls: np.ndarray # of str
|
|
68
117
|
|
|
69
118
|
# file_paths:
|
|
70
119
|
# - local path of media file in vals[i]; points to the file cache if file_urls[i] is remote
|
|
71
120
|
# - None if vals[i] is not a media type or if there is no local file yet for file_urls[i]
|
|
121
|
+
# TODO: this is a sparse vector; should it be a dict[int, str]?
|
|
72
122
|
file_paths: np.ndarray # of str
|
|
73
123
|
|
|
124
|
+
# If `may_have_exc` is False, then we guarantee that no slot has an exception set. This is used to optimize
|
|
125
|
+
# exception handling under normal operation.
|
|
126
|
+
_may_have_exc: bool
|
|
127
|
+
|
|
128
|
+
# the primary key of a store row is a sequence of ints (the number is different for table vs view)
|
|
129
|
+
pk: Optional[tuple[int, ...]]
|
|
74
130
|
# for nested rows (ie, those produced by JsonMapperDispatcher)
|
|
75
131
|
parent_row: Optional[DataRow]
|
|
76
132
|
parent_slot_idx: Optional[int]
|
|
77
133
|
|
|
134
|
+
# state for table output (insert()/update()); key: column id
|
|
135
|
+
cell_vals: dict[int, Any] # materialized values of output columns, in the format required for the column
|
|
136
|
+
cell_md: dict[int, CellMd]
|
|
137
|
+
|
|
138
|
+
# control structures that are shared across all DataRows in a batch
|
|
139
|
+
img_slot_idxs: list[int]
|
|
140
|
+
media_slot_idxs: list[int]
|
|
141
|
+
array_slot_idxs: list[int]
|
|
142
|
+
json_slot_idxs: list[int]
|
|
143
|
+
|
|
78
144
|
def __init__(
|
|
79
145
|
self,
|
|
80
146
|
size: int,
|
|
81
147
|
img_slot_idxs: list[int],
|
|
82
148
|
media_slot_idxs: list[int],
|
|
83
149
|
array_slot_idxs: list[int],
|
|
150
|
+
json_slot_idxs: list[int],
|
|
84
151
|
parent_row: Optional[DataRow] = None,
|
|
85
152
|
parent_slot_idx: Optional[int] = None,
|
|
86
153
|
):
|
|
87
|
-
self.img_slot_idxs = img_slot_idxs
|
|
88
|
-
self.media_slot_idxs = media_slot_idxs
|
|
89
|
-
self.array_slot_idxs = array_slot_idxs
|
|
90
154
|
self.init(size)
|
|
91
155
|
self.parent_row = parent_row
|
|
92
156
|
self.parent_slot_idx = parent_slot_idx
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
self.
|
|
96
|
-
self.
|
|
97
|
-
|
|
157
|
+
self.img_slot_idxs = img_slot_idxs
|
|
158
|
+
self.media_slot_idxs = media_slot_idxs
|
|
159
|
+
self.array_slot_idxs = array_slot_idxs
|
|
160
|
+
self.json_slot_idxs = json_slot_idxs
|
|
161
|
+
|
|
162
|
+
def init(self, size: int) -> None:
|
|
163
|
+
self.vals = np.full(size, None, dtype=object)
|
|
164
|
+
self.has_val = np.zeros(size, dtype=bool)
|
|
165
|
+
self.excs = np.full(size, None, dtype=object)
|
|
166
|
+
self.missing_slots = np.zeros(size, dtype=bool)
|
|
167
|
+
self.missing_dependents = np.zeros(size, dtype=np.int16)
|
|
168
|
+
self.is_scheduled = np.zeros(size, dtype=bool)
|
|
169
|
+
self.slot_md = {}
|
|
170
|
+
self.file_urls = np.full(size, None, dtype=object)
|
|
171
|
+
self.file_paths = np.full(size, None, dtype=object)
|
|
98
172
|
self._may_have_exc = False
|
|
99
|
-
self.
|
|
100
|
-
self.
|
|
101
|
-
self.is_scheduled = np.zeros(num_slots, dtype=bool)
|
|
173
|
+
self.cell_vals = {}
|
|
174
|
+
self.cell_md = {}
|
|
102
175
|
self.pk = None
|
|
103
|
-
self.file_urls = np.full(num_slots, None, dtype=object)
|
|
104
|
-
self.file_paths = np.full(num_slots, None, dtype=object)
|
|
105
176
|
self.parent_row = None
|
|
106
177
|
self.parent_slot_idx = None
|
|
107
178
|
|
|
108
|
-
def clear(self,
|
|
109
|
-
if
|
|
110
|
-
self.has_val[
|
|
111
|
-
self.vals[
|
|
112
|
-
self.excs[
|
|
113
|
-
self.file_urls[
|
|
114
|
-
self.file_paths[
|
|
179
|
+
def clear(self, slot_idxs: Optional[np.ndarray] = None) -> None:
|
|
180
|
+
if slot_idxs is not None:
|
|
181
|
+
self.has_val[slot_idxs] = False
|
|
182
|
+
self.vals[slot_idxs] = None
|
|
183
|
+
self.excs[slot_idxs] = None
|
|
184
|
+
self.file_urls[slot_idxs] = None
|
|
185
|
+
self.file_paths[slot_idxs] = None
|
|
115
186
|
else:
|
|
116
187
|
self.init(len(self.vals))
|
|
117
188
|
|
|
@@ -292,9 +363,7 @@ class DataRow:
|
|
|
292
363
|
val = self.vals[index]
|
|
293
364
|
format = None
|
|
294
365
|
if isinstance(val, PIL.Image.Image):
|
|
295
|
-
|
|
296
|
-
# In that case, use WebP instead.
|
|
297
|
-
format = 'webp' if val.has_transparency_data else 'jpeg'
|
|
366
|
+
format = image_utils.default_format(val)
|
|
298
367
|
filepath, url = TempStore.save_media_object(val, col, format=format)
|
|
299
368
|
self.file_paths[index] = str(filepath) if filepath is not None else None
|
|
300
369
|
self.vals[index] = None
|
pixeltable/exprs/expr.py
CHANGED
|
@@ -368,6 +368,15 @@ class Expr(abc.ABC):
|
|
|
368
368
|
for e in expr_list:
|
|
369
369
|
yield from e.subexprs(expr_class=expr_class, filter=filter, traverse_matches=traverse_matches)
|
|
370
370
|
|
|
371
|
+
@classmethod
|
|
372
|
+
def list_contains(
|
|
373
|
+
cls,
|
|
374
|
+
expr_list: Iterable[Expr],
|
|
375
|
+
expr_class: type[Expr] | None = None,
|
|
376
|
+
filter: Callable[[Expr], bool] | None = None,
|
|
377
|
+
) -> bool:
|
|
378
|
+
return any(e._contains(expr_class, filter) for e in expr_list)
|
|
379
|
+
|
|
371
380
|
def _contains(self, cls: Optional[type[Expr]] = None, filter: Optional[Callable[[Expr], bool]] = None) -> bool:
|
|
372
381
|
"""
|
|
373
382
|
Returns True if any subexpr is an instance of cls and/or matches filter.
|
pixeltable/exprs/expr_set.py
CHANGED
|
@@ -9,26 +9,33 @@ T = TypeVar('T', bound='Expr')
|
|
|
9
9
|
|
|
10
10
|
class ExprSet(Generic[T]):
|
|
11
11
|
"""
|
|
12
|
-
|
|
12
|
+
An ordered set that also supports indexed lookup (by slot_idx and Expr.id). Exprs are uniquely identified by
|
|
13
|
+
Expr.id.
|
|
13
14
|
"""
|
|
14
15
|
|
|
15
16
|
exprs: dict[int, T] # key: Expr.id
|
|
17
|
+
expr_offsets: dict[int, int] # key: Expr.id, value: offset into self.exprs.keys()
|
|
16
18
|
exprs_by_idx: dict[int, T] # key: slot_idx
|
|
17
19
|
|
|
18
20
|
def __init__(self, elements: Optional[Iterable[T]] = None):
|
|
19
21
|
self.exprs = {}
|
|
22
|
+
self.expr_offsets = {}
|
|
20
23
|
self.exprs_by_idx = {}
|
|
21
24
|
if elements is not None:
|
|
22
25
|
for e in elements:
|
|
23
26
|
self.add(e)
|
|
24
27
|
|
|
25
|
-
def add(self, expr: T) ->
|
|
26
|
-
|
|
27
|
-
|
|
28
|
+
def add(self, expr: T) -> int:
|
|
29
|
+
"""Returns offset corresponding to iteration order"""
|
|
30
|
+
offset = self.expr_offsets.get(expr.id)
|
|
31
|
+
if offset is not None:
|
|
32
|
+
return offset
|
|
33
|
+
offset = len(self.exprs)
|
|
28
34
|
self.exprs[expr.id] = expr
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
35
|
+
self.expr_offsets[expr.id] = offset
|
|
36
|
+
if expr.slot_idx is not None:
|
|
37
|
+
self.exprs_by_idx[expr.slot_idx] = expr
|
|
38
|
+
return offset
|
|
32
39
|
|
|
33
40
|
def update(self, *others: Iterable[T]) -> None:
|
|
34
41
|
for other in others:
|
pixeltable/exprs/inline_expr.py
CHANGED
|
@@ -98,13 +98,7 @@ class InlineList(Expr):
|
|
|
98
98
|
def __init__(self, elements: Iterable):
|
|
99
99
|
exprs = [Expr.from_object(el) for el in elements]
|
|
100
100
|
|
|
101
|
-
|
|
102
|
-
'type': 'array',
|
|
103
|
-
'prefixItems': [expr.col_type.to_json_schema() for expr in exprs],
|
|
104
|
-
'items': False, # No additional items (fixed length)
|
|
105
|
-
}
|
|
106
|
-
|
|
107
|
-
super().__init__(ts.JsonType(json_schema))
|
|
101
|
+
super().__init__(ts.JsonType())
|
|
108
102
|
self.components.extend(exprs)
|
|
109
103
|
self.id = self._create_id()
|
|
110
104
|
|
|
@@ -150,18 +144,7 @@ class InlineDict(Expr):
|
|
|
150
144
|
self.keys.append(key)
|
|
151
145
|
exprs.append(Expr.from_object(val))
|
|
152
146
|
|
|
153
|
-
|
|
154
|
-
try:
|
|
155
|
-
json_schema = {
|
|
156
|
-
'type': 'object',
|
|
157
|
-
'properties': {key: expr.col_type.to_json_schema() for key, expr in zip(self.keys, exprs)},
|
|
158
|
-
}
|
|
159
|
-
except excs.Error:
|
|
160
|
-
# InlineDicts are used to store iterator arguments, which are not required to be valid JSON types,
|
|
161
|
-
# so we can't always construct a valid schema.
|
|
162
|
-
json_schema = None
|
|
163
|
-
|
|
164
|
-
super().__init__(ts.JsonType(json_schema))
|
|
147
|
+
super().__init__(ts.JsonType())
|
|
165
148
|
self.components.extend(exprs)
|
|
166
149
|
self.id = self._create_id()
|
|
167
150
|
|
pixeltable/exprs/json_path.py
CHANGED
|
@@ -1,5 +1,7 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
+
import io
|
|
4
|
+
from pathlib import Path
|
|
3
5
|
from typing import Any, Optional
|
|
4
6
|
|
|
5
7
|
import jmespath
|
|
@@ -7,6 +9,7 @@ import sqlalchemy as sql
|
|
|
7
9
|
|
|
8
10
|
from pixeltable import catalog, exceptions as excs, type_system as ts
|
|
9
11
|
|
|
12
|
+
from .column_ref import ColumnRef
|
|
10
13
|
from .data_row import DataRow
|
|
11
14
|
from .expr import Expr
|
|
12
15
|
from .globals import print_slice
|
|
@@ -23,6 +26,11 @@ class JsonPath(Expr):
|
|
|
23
26
|
(0: indicates the immediately preceding JsonMapper, -1: the parent of the immediately preceding mapper, ...)
|
|
24
27
|
"""
|
|
25
28
|
|
|
29
|
+
path_elements: list[str | int | slice]
|
|
30
|
+
compiled_path: jmespath.parser.ParsedResult | None
|
|
31
|
+
scope_idx: int
|
|
32
|
+
file_handles: dict[Path, io.BufferedReader] # key: file path
|
|
33
|
+
|
|
26
34
|
def __init__(
|
|
27
35
|
self, anchor: Optional[Expr], path_elements: Optional[list[str | int | slice]] = None, scope_idx: int = 0
|
|
28
36
|
) -> None:
|
|
@@ -31,16 +39,22 @@ class JsonPath(Expr):
|
|
|
31
39
|
super().__init__(ts.JsonType(nullable=True)) # JsonPath expressions are always nullable
|
|
32
40
|
if anchor is not None:
|
|
33
41
|
self.components = [anchor]
|
|
34
|
-
self.path_elements
|
|
42
|
+
self.path_elements = path_elements
|
|
35
43
|
self.compiled_path = jmespath.compile(self._json_path()) if len(path_elements) > 0 else None
|
|
36
44
|
self.scope_idx = scope_idx
|
|
37
45
|
# NOTE: the _create_id() result will change if set_anchor() gets called;
|
|
38
46
|
# this is not a problem, because _create_id() shouldn't be called after init()
|
|
39
47
|
self.id = self._create_id()
|
|
48
|
+
self.file_handles = {}
|
|
49
|
+
|
|
50
|
+
def release(self) -> None:
|
|
51
|
+
for fh in self.file_handles.values():
|
|
52
|
+
fh.close()
|
|
53
|
+
self.file_handles.clear()
|
|
40
54
|
|
|
41
55
|
def __repr__(self) -> str:
|
|
42
56
|
# else 'R': the anchor is RELATIVE_PATH_ROOT
|
|
43
|
-
anchor_str = str(self.
|
|
57
|
+
anchor_str = str(self.anchor) if self.anchor is not None else 'R'
|
|
44
58
|
if len(self.path_elements) == 0:
|
|
45
59
|
return anchor_str
|
|
46
60
|
return f'{anchor_str}{"." if isinstance(self.path_elements[0], str) else ""}{self._json_path()}'
|
|
@@ -67,7 +81,7 @@ class JsonPath(Expr):
|
|
|
67
81
|
return cls(anchor, path_elements, d['scope_idx'])
|
|
68
82
|
|
|
69
83
|
@property
|
|
70
|
-
def
|
|
84
|
+
def anchor(self) -> Optional[Expr]:
|
|
71
85
|
return None if len(self.components) == 0 else self.components[0]
|
|
72
86
|
|
|
73
87
|
def set_anchor(self, anchor: Expr) -> None:
|
|
@@ -75,7 +89,7 @@ class JsonPath(Expr):
|
|
|
75
89
|
self.components = [anchor]
|
|
76
90
|
|
|
77
91
|
def is_relative_path(self) -> bool:
|
|
78
|
-
return self.
|
|
92
|
+
return self.anchor is None
|
|
79
93
|
|
|
80
94
|
def _has_relative_path(self) -> bool:
|
|
81
95
|
return self.is_relative_path() or super()._has_relative_path()
|
|
@@ -85,7 +99,7 @@ class JsonPath(Expr):
|
|
|
85
99
|
# TODO: take scope_idx into account
|
|
86
100
|
self.set_anchor(mapper.scope_anchor)
|
|
87
101
|
else:
|
|
88
|
-
self.
|
|
102
|
+
self.anchor._bind_rel_paths(mapper)
|
|
89
103
|
|
|
90
104
|
def __call__(self, *args: object, **kwargs: object) -> 'JsonPath':
|
|
91
105
|
"""
|
|
@@ -99,15 +113,15 @@ class JsonPath(Expr):
|
|
|
99
113
|
|
|
100
114
|
def __getattr__(self, name: str) -> 'JsonPath':
|
|
101
115
|
assert isinstance(name, str)
|
|
102
|
-
return JsonPath(self.
|
|
116
|
+
return JsonPath(self.anchor, [*self.path_elements, name])
|
|
103
117
|
|
|
104
118
|
def __getitem__(self, index: object) -> 'JsonPath':
|
|
105
119
|
if isinstance(index, (int, slice, str)):
|
|
106
|
-
return JsonPath(self.
|
|
120
|
+
return JsonPath(self.anchor, [*self.path_elements, index])
|
|
107
121
|
raise excs.Error(f'Invalid json list index: {index}')
|
|
108
122
|
|
|
109
123
|
def default_column_name(self) -> Optional[str]:
|
|
110
|
-
anchor_name = self.
|
|
124
|
+
anchor_name = self.anchor.default_column_name() if self.anchor is not None else ''
|
|
111
125
|
ret_name = f'{anchor_name}.{self._json_path()}'
|
|
112
126
|
|
|
113
127
|
def cleanup_char(s: str) -> str:
|
|
@@ -159,12 +173,31 @@ class JsonPath(Expr):
|
|
|
159
173
|
result.append(f'[{print_slice(element)}]')
|
|
160
174
|
return ''.join(result)
|
|
161
175
|
|
|
162
|
-
def eval(self,
|
|
163
|
-
assert self.
|
|
164
|
-
val =
|
|
176
|
+
def eval(self, row: DataRow, row_builder: RowBuilder) -> None:
|
|
177
|
+
assert self.anchor is not None, self
|
|
178
|
+
val = row[self.anchor.slot_idx]
|
|
165
179
|
if self.compiled_path is not None:
|
|
166
180
|
val = self.compiled_path.search(val)
|
|
167
|
-
|
|
181
|
+
row[self.slot_idx] = val
|
|
182
|
+
if val is None or self.anchor is None or not isinstance(self.anchor, ColumnRef):
|
|
183
|
+
return
|
|
184
|
+
|
|
185
|
+
# the origin of val is a json-typed column, which might stored inlined objects
|
|
186
|
+
if self.anchor.slot_idx not in row.slot_md:
|
|
187
|
+
# we can infer that there aren't any inlined objects because our execution plan doesn't include
|
|
188
|
+
# materializing the cellmd (eg, insert plans)
|
|
189
|
+
# TODO: have the planner pass that fact into ExprEvalNode explicitly to streamline this path a bit more
|
|
190
|
+
return
|
|
191
|
+
|
|
192
|
+
# defer import until it's needed
|
|
193
|
+
from pixeltable.exec.cell_reconstruction_node import json_has_inlined_objs, reconstruct_json
|
|
194
|
+
|
|
195
|
+
cell_md = row.slot_md[self.anchor.slot_idx]
|
|
196
|
+
if cell_md is None or cell_md.file_urls is None or not json_has_inlined_objs(val):
|
|
197
|
+
# val doesn't contain inlined objects
|
|
198
|
+
return
|
|
199
|
+
|
|
200
|
+
row.vals[self.slot_idx] = reconstruct_json(val, cell_md.file_urls, self.file_handles)
|
|
168
201
|
|
|
169
202
|
|
|
170
203
|
RELATIVE_PATH_ROOT = JsonPath(None)
|
pixeltable/exprs/row_builder.py
CHANGED
|
@@ -1,15 +1,17 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
+
import dataclasses
|
|
3
4
|
import sys
|
|
4
5
|
import time
|
|
5
|
-
from dataclasses import dataclass
|
|
6
6
|
from typing import Any, Iterable, NamedTuple, Optional, Sequence
|
|
7
7
|
from uuid import UUID
|
|
8
8
|
|
|
9
9
|
import numpy as np
|
|
10
|
+
import sqlalchemy as sql
|
|
10
11
|
|
|
11
12
|
from pixeltable import catalog, exceptions as excs, exprs, utils
|
|
12
13
|
from pixeltable.env import Env
|
|
14
|
+
from pixeltable.utils.misc import non_none_dict_factory
|
|
13
15
|
|
|
14
16
|
from .data_row import DataRow
|
|
15
17
|
from .expr import Expr, ExprScope
|
|
@@ -68,7 +70,7 @@ class RowBuilder:
|
|
|
68
70
|
input_exprs: ExprSet
|
|
69
71
|
|
|
70
72
|
tbl: Optional[catalog.TableVersion] # reference table of the RowBuilder; used to identify pk columns for writes
|
|
71
|
-
table_columns:
|
|
73
|
+
table_columns: dict[catalog.Column, int | None] # value: slot idx, if the result of an expr
|
|
72
74
|
default_eval_ctx: EvalCtx
|
|
73
75
|
unstored_iter_args: dict[UUID, Expr]
|
|
74
76
|
|
|
@@ -92,10 +94,9 @@ class RowBuilder:
|
|
|
92
94
|
img_slot_idxs: list[int] # Indices of image slots
|
|
93
95
|
media_slot_idxs: list[int] # Indices of non-image media slots
|
|
94
96
|
array_slot_idxs: list[int] # Indices of array slots
|
|
95
|
-
|
|
96
|
-
stored_media_cols: list[exprs.ColumnSlotIdx]
|
|
97
|
+
json_slot_idxs: list[int] # Indices of json slots
|
|
97
98
|
|
|
98
|
-
@dataclass
|
|
99
|
+
@dataclasses.dataclass
|
|
99
100
|
class EvalCtx:
|
|
100
101
|
"""Context for evaluating a set of target exprs"""
|
|
101
102
|
|
|
@@ -113,8 +114,6 @@ class RowBuilder:
|
|
|
113
114
|
):
|
|
114
115
|
self.unique_exprs: ExprSet[Expr] = ExprSet() # dependencies precede their dependents
|
|
115
116
|
self.next_slot_idx = 0
|
|
116
|
-
self.stored_img_cols = []
|
|
117
|
-
self.stored_media_cols = []
|
|
118
117
|
|
|
119
118
|
# record input and output exprs; make copies to avoid reusing execution state
|
|
120
119
|
unique_input_exprs = [self._record_unique_expr(e.copy(), recursive=False) for e in input_exprs]
|
|
@@ -138,7 +137,7 @@ class RowBuilder:
|
|
|
138
137
|
from .column_ref import ColumnRef
|
|
139
138
|
|
|
140
139
|
self.tbl = tbl
|
|
141
|
-
self.table_columns
|
|
140
|
+
self.table_columns = {}
|
|
142
141
|
self.input_exprs = ExprSet()
|
|
143
142
|
validating_colrefs: dict[Expr, Expr] = {} # key: non-validating colref, value: corresp. validating colref
|
|
144
143
|
for col in columns:
|
|
@@ -245,17 +244,27 @@ class RowBuilder:
|
|
|
245
244
|
e.slot_idx for e in self.unique_exprs if e.col_type.is_media_type() and not e.col_type.is_image_type()
|
|
246
245
|
]
|
|
247
246
|
self.array_slot_idxs = [e.slot_idx for e in self.unique_exprs if e.col_type.is_array_type()]
|
|
247
|
+
self.json_slot_idxs = [e.slot_idx for e in self.unique_exprs if e.col_type.is_json_type()]
|
|
248
248
|
|
|
249
249
|
def add_table_column(self, col: catalog.Column, slot_idx: int) -> None:
|
|
250
|
-
"""Record
|
|
250
|
+
"""Record an output column for which the value is produced via expr evaluation"""
|
|
251
251
|
assert self.tbl is not None
|
|
252
252
|
assert col.is_stored
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
253
|
+
self.table_columns[col] = slot_idx
|
|
254
|
+
|
|
255
|
+
def add_table_columns(self, cols: list[catalog.Column]) -> None:
|
|
256
|
+
"""Record output columns whose values are materialized into DataRow.cell_vals"""
|
|
257
|
+
for col in cols:
|
|
258
|
+
self.table_columns[col] = None
|
|
259
|
+
|
|
260
|
+
@property
|
|
261
|
+
def media_output_col_info(self) -> list[ColumnSlotIdx]:
|
|
262
|
+
"""Return slot idxs for media output columns whose values are produced by expr evaluation"""
|
|
263
|
+
return [
|
|
264
|
+
ColumnSlotIdx(col, slot_idx)
|
|
265
|
+
for col, slot_idx in self.table_columns.items()
|
|
266
|
+
if col.col_type.is_media_type() and slot_idx is not None
|
|
267
|
+
]
|
|
259
268
|
|
|
260
269
|
@property
|
|
261
270
|
def num_materialized(self) -> int:
|
|
@@ -462,13 +471,30 @@ class RowBuilder:
|
|
|
462
471
|
|
|
463
472
|
num_excs = 0
|
|
464
473
|
table_row: list[Any] = list(pk)
|
|
465
|
-
|
|
474
|
+
# Nulls in JSONB columns need to be stored as sql.sql.null(), otherwise it stores a json 'null'
|
|
475
|
+
for col, slot_idx in self.table_columns.items():
|
|
476
|
+
if col.id in data_row.cell_vals:
|
|
477
|
+
table_row.append(data_row.cell_vals[col.id])
|
|
478
|
+
if col.stores_cellmd:
|
|
479
|
+
if data_row.cell_md[col.id] is None:
|
|
480
|
+
table_row.append(sql.sql.null())
|
|
481
|
+
else:
|
|
482
|
+
# we want to minimize the size of the stored dict and use dict_factory to remove Nones
|
|
483
|
+
md = dataclasses.asdict(data_row.cell_md[col.id], dict_factory=non_none_dict_factory)
|
|
484
|
+
assert len(md) > 0
|
|
485
|
+
table_row.append(md)
|
|
486
|
+
if slot_idx is not None and data_row.has_exc(slot_idx):
|
|
487
|
+
num_excs += 1
|
|
488
|
+
if cols_with_excs is not None:
|
|
489
|
+
cols_with_excs.add(col.id)
|
|
490
|
+
continue
|
|
491
|
+
|
|
466
492
|
if data_row.has_exc(slot_idx):
|
|
467
493
|
exc = data_row.get_exc(slot_idx)
|
|
468
494
|
num_excs += 1
|
|
469
495
|
if cols_with_excs is not None:
|
|
470
496
|
cols_with_excs.add(col.id)
|
|
471
|
-
table_row.append(None)
|
|
497
|
+
table_row.append(sql.sql.null() if col.col_type.is_json_type() else None)
|
|
472
498
|
if col.stores_cellmd:
|
|
473
499
|
# exceptions get stored in the errortype/-msg properties of the cellmd column
|
|
474
500
|
table_row.append(ColumnPropertyRef.create_cellmd_exc(exc))
|
|
@@ -476,7 +502,7 @@ class RowBuilder:
|
|
|
476
502
|
val = data_row.get_stored_val(slot_idx, col.get_sa_col_type())
|
|
477
503
|
table_row.append(val)
|
|
478
504
|
if col.stores_cellmd:
|
|
479
|
-
table_row.append(
|
|
505
|
+
table_row.append(sql.sql.null()) # placeholder for cellmd column
|
|
480
506
|
|
|
481
507
|
return table_row, num_excs
|
|
482
508
|
|
|
@@ -490,12 +516,18 @@ class RowBuilder:
|
|
|
490
516
|
store_col_names: list[str] = [pk_col.name for pk_col in self.tbl.store_tbl.pk_columns()]
|
|
491
517
|
|
|
492
518
|
for col in self.table_columns:
|
|
493
|
-
store_col_names.append(col.
|
|
494
|
-
if col.
|
|
495
|
-
store_col_names.append(col.
|
|
519
|
+
store_col_names.append(col.store_name())
|
|
520
|
+
if col.stores_cellmd:
|
|
521
|
+
store_col_names.append(col.cellmd_store_name())
|
|
496
522
|
|
|
497
523
|
return store_col_names
|
|
498
524
|
|
|
499
525
|
def make_row(self) -> exprs.DataRow:
|
|
500
526
|
"""Creates a new DataRow with the current row_builder's configuration."""
|
|
501
|
-
return exprs.DataRow(
|
|
527
|
+
return exprs.DataRow(
|
|
528
|
+
size=self.num_materialized,
|
|
529
|
+
img_slot_idxs=self.img_slot_idxs,
|
|
530
|
+
media_slot_idxs=self.media_slot_idxs,
|
|
531
|
+
array_slot_idxs=self.array_slot_idxs,
|
|
532
|
+
json_slot_idxs=self.json_slot_idxs,
|
|
533
|
+
)
|
pixeltable/functions/__init__.py
CHANGED
pixeltable/functions/bedrock.py
CHANGED
|
@@ -1,3 +1,10 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Pixeltable UDFs for AWS Bedrock AI models.
|
|
3
|
+
|
|
4
|
+
Provides integration with AWS Bedrock for accessing various foundation models
|
|
5
|
+
including Anthropic Claude, Amazon Titan, and other providers.
|
|
6
|
+
"""
|
|
7
|
+
|
|
1
8
|
import logging
|
|
2
9
|
from typing import TYPE_CHECKING, Any, Optional
|
|
3
10
|
|
pixeltable/functions/deepseek.py
CHANGED
|
@@ -1,3 +1,10 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Pixeltable UDFs for Deepseek AI models.
|
|
3
|
+
|
|
4
|
+
Provides integration with Deepseek's language models for chat completions
|
|
5
|
+
and other AI capabilities.
|
|
6
|
+
"""
|
|
7
|
+
|
|
1
8
|
import json
|
|
2
9
|
from typing import TYPE_CHECKING, Any, Optional
|
|
3
10
|
|
|
@@ -67,10 +74,10 @@ async def chat_completions(
|
|
|
67
74
|
of the table `tbl`:
|
|
68
75
|
|
|
69
76
|
>>> messages = [
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
77
|
+
... {'role': 'system', 'content': 'You are a helpful assistant.'},
|
|
78
|
+
... {'role': 'user', 'content': tbl.prompt}
|
|
79
|
+
... ]
|
|
80
|
+
>>> tbl.add_computed_column(response=chat_completions(messages, model='deepseek-chat'))
|
|
74
81
|
"""
|
|
75
82
|
if model_kwargs is None:
|
|
76
83
|
model_kwargs = {}
|
|
@@ -1,3 +1,10 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Pixeltable UDFs for llama.cpp models.
|
|
3
|
+
|
|
4
|
+
Provides integration with llama.cpp for running quantized language models locally,
|
|
5
|
+
supporting chat completions and embeddings with GGUF format models.
|
|
6
|
+
"""
|
|
7
|
+
|
|
1
8
|
from pathlib import Path
|
|
2
9
|
from typing import TYPE_CHECKING, Any, Optional
|
|
3
10
|
|
pixeltable/functions/math.py
CHANGED
|
@@ -97,7 +97,7 @@ def _(self: sql.ColumnElement, digits: Optional[sql.ColumnElement] = None) -> sq
|
|
|
97
97
|
if digits is None:
|
|
98
98
|
return sql.func.round(self)
|
|
99
99
|
else:
|
|
100
|
-
return sql.func.round(
|
|
100
|
+
return sql.cast(sql.func.round(sql.cast(self, sql.Numeric), sql.cast(digits, sql.Integer)), sql.Float)
|
|
101
101
|
|
|
102
102
|
|
|
103
103
|
@pxt.udf(is_method=True)
|