pixeltable 0.4.15__py3-none-any.whl → 0.4.17__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pixeltable might be problematic. Click here for more details.

Files changed (68)
  1. pixeltable/__init__.py +4 -0
  2. pixeltable/catalog/catalog.py +125 -63
  3. pixeltable/catalog/column.py +7 -2
  4. pixeltable/catalog/table.py +1 -0
  5. pixeltable/catalog/table_metadata.py +4 -0
  6. pixeltable/catalog/table_version.py +174 -117
  7. pixeltable/catalog/table_version_handle.py +4 -1
  8. pixeltable/catalog/table_version_path.py +0 -11
  9. pixeltable/catalog/view.py +6 -0
  10. pixeltable/config.py +7 -0
  11. pixeltable/dataframe.py +10 -5
  12. pixeltable/env.py +56 -19
  13. pixeltable/exec/__init__.py +2 -0
  14. pixeltable/exec/cell_materialization_node.py +231 -0
  15. pixeltable/exec/cell_reconstruction_node.py +135 -0
  16. pixeltable/exec/exec_node.py +1 -1
  17. pixeltable/exec/expr_eval/evaluators.py +1 -0
  18. pixeltable/exec/expr_eval/expr_eval_node.py +3 -0
  19. pixeltable/exec/expr_eval/globals.py +2 -0
  20. pixeltable/exec/globals.py +32 -0
  21. pixeltable/exec/object_store_save_node.py +1 -4
  22. pixeltable/exec/row_update_node.py +16 -9
  23. pixeltable/exec/sql_node.py +107 -14
  24. pixeltable/exprs/__init__.py +1 -1
  25. pixeltable/exprs/arithmetic_expr.py +23 -18
  26. pixeltable/exprs/column_property_ref.py +10 -10
  27. pixeltable/exprs/column_ref.py +2 -2
  28. pixeltable/exprs/data_row.py +106 -37
  29. pixeltable/exprs/expr.py +9 -0
  30. pixeltable/exprs/expr_set.py +14 -7
  31. pixeltable/exprs/inline_expr.py +2 -19
  32. pixeltable/exprs/json_path.py +45 -12
  33. pixeltable/exprs/row_builder.py +54 -22
  34. pixeltable/functions/__init__.py +1 -0
  35. pixeltable/functions/bedrock.py +7 -0
  36. pixeltable/functions/deepseek.py +11 -4
  37. pixeltable/functions/llama_cpp.py +7 -0
  38. pixeltable/functions/math.py +1 -1
  39. pixeltable/functions/ollama.py +7 -0
  40. pixeltable/functions/openai.py +4 -4
  41. pixeltable/functions/openrouter.py +143 -0
  42. pixeltable/functions/video.py +110 -28
  43. pixeltable/globals.py +10 -4
  44. pixeltable/io/globals.py +18 -17
  45. pixeltable/io/parquet.py +1 -1
  46. pixeltable/io/table_data_conduit.py +47 -22
  47. pixeltable/iterators/document.py +61 -23
  48. pixeltable/iterators/video.py +126 -53
  49. pixeltable/metadata/__init__.py +1 -1
  50. pixeltable/metadata/converters/convert_40.py +73 -0
  51. pixeltable/metadata/notes.py +1 -0
  52. pixeltable/plan.py +175 -46
  53. pixeltable/share/packager.py +155 -26
  54. pixeltable/store.py +2 -3
  55. pixeltable/type_system.py +5 -3
  56. pixeltable/utils/arrow.py +6 -6
  57. pixeltable/utils/av.py +65 -0
  58. pixeltable/utils/console_output.py +4 -1
  59. pixeltable/utils/exception_handler.py +5 -28
  60. pixeltable/utils/image.py +7 -0
  61. pixeltable/utils/misc.py +5 -0
  62. pixeltable/utils/object_stores.py +16 -1
  63. pixeltable/utils/s3_store.py +44 -11
  64. {pixeltable-0.4.15.dist-info → pixeltable-0.4.17.dist-info}/METADATA +29 -28
  65. {pixeltable-0.4.15.dist-info → pixeltable-0.4.17.dist-info}/RECORD +68 -61
  66. {pixeltable-0.4.15.dist-info → pixeltable-0.4.17.dist-info}/WHEEL +0 -0
  67. {pixeltable-0.4.15.dist-info → pixeltable-0.4.17.dist-info}/entry_points.txt +0 -0
  68. {pixeltable-0.4.15.dist-info → pixeltable-0.4.17.dist-info}/licenses/LICENSE +0 -0
pixeltable/exprs/data_row.py CHANGED
@@ -1,5 +1,6 @@
1
1
  from __future__ import annotations
2
2
 
3
+ import dataclasses
3
4
  import datetime
4
5
  import io
5
6
  import urllib.parse
@@ -13,15 +14,72 @@ import PIL
13
14
  import PIL.Image
14
15
  import sqlalchemy as sql
15
16
 
17
+ import pixeltable.utils.image as image_utils
16
18
  from pixeltable import catalog, env
17
19
  from pixeltable.utils.local_store import TempStore
20
+ from pixeltable.utils.misc import non_none_dict_factory
21
+
22
+
23
+ @dataclasses.dataclass
24
+ class ArrayMd:
25
+ """
26
+ Metadata for array cells that are stored externally.
27
+ """
28
+
29
+ start: int
30
+ end: int
31
+
32
+ # we store bool arrays as packed bits (uint8 arrays), and need to record the shape to reconstruct the array
33
+ is_bool: bool = False
34
+ shape: tuple[int, ...] | None = None
35
+
36
+ def as_dict(self) -> dict:
37
+ # dict_factory: suppress Nones
38
+ x = dataclasses.asdict(self, dict_factory=non_none_dict_factory)
39
+ return x
40
+
41
+
42
+ @dataclasses.dataclass
43
+ class CellMd:
44
+ """
45
+ Content of the cellmd column.
46
+
47
+ All fields are optional, to minimize storage.
48
+ """
49
+
50
+ errortype: str | None = None
51
+ errormsg: str | None = None
52
+
53
+ # a list of file urls that are used to store images and arrays; only set for json and array columns
54
+ # for json columns: a list of all urls referenced in the column value
55
+ # for array columns: a single url
56
+ file_urls: list[str] | None = None
57
+
58
+ array_md: ArrayMd | None = None
59
+
60
+ @classmethod
61
+ def from_dict(cls, d: dict) -> CellMd:
62
+ x: CellMd
63
+ if 'array_md' in d:
64
+ d2 = d.copy()
65
+ del d2['array_md']
66
+ x = cls(**d2, array_md=ArrayMd(**d['array_md']))
67
+ else:
68
+ x = cls(**d)
69
+ return x
70
+
71
+ def as_dict(self) -> dict:
72
+ x = dataclasses.asdict(self, dict_factory=non_none_dict_factory)
73
+ return x
18
74
 
19
75
 
20
76
  class DataRow:
21
77
  """
22
78
  Encapsulates all data and execution state needed by RowBuilder and DataRowBatch:
23
79
  - state for in-memory computation
24
- - state for storing the data
80
+ - state needed for expression evaluation
81
+ - containers for output column values
82
+
25
83
  This is not meant to be a black-box abstraction.
26
84
 
27
85
  In-memory representations by column type:
@@ -39,79 +97,92 @@ class DataRow:
39
97
  - DocumentType: local path if available, otherwise url
40
98
  """
41
99
 
100
+ # expr evaluation state; indexed by slot idx
42
101
  vals: np.ndarray # of object
43
102
  has_val: np.ndarray # of bool
44
103
  excs: np.ndarray # of object
45
-
46
- # If `may_have_exc` is False, then we guarantee that no slot has an exception set. This is used to optimize
47
- # exception handling under normal operation.
48
- _may_have_exc: bool
49
-
50
- # expr evaluation state; indexed by slot idx
51
104
  missing_slots: np.ndarray # of bool; number of missing dependencies
52
105
  missing_dependents: np.ndarray # of int16; number of missing dependents
53
106
  is_scheduled: np.ndarray # of bool; True if this slot is scheduled for evaluation
54
107
 
55
- # control structures that are shared across all DataRows in a batch
56
- img_slot_idxs: list[int]
57
- media_slot_idxs: list[int]
58
- array_slot_idxs: list[int]
59
-
60
- # the primary key of a store row is a sequence of ints (the number is different for table vs view)
61
- pk: Optional[tuple[int, ...]]
108
+ # CellMd needed for query execution; needs to be indexed by slot idx, not column id, to work for joins
109
+ slot_md: dict[int, CellMd]
62
110
 
63
111
  # file_urls:
64
112
  # - stored url of file for media in vals[i]
65
113
  # - None if vals[i] is not media type
66
114
  # - not None if file_paths[i] is not None
115
+ # TODO: this is a sparse vector; should it be a dict[int, str]?
67
116
  file_urls: np.ndarray # of str
68
117
 
69
118
  # file_paths:
70
119
  # - local path of media file in vals[i]; points to the file cache if file_urls[i] is remote
71
120
  # - None if vals[i] is not a media type or if there is no local file yet for file_urls[i]
121
+ # TODO: this is a sparse vector; should it be a dict[int, str]?
72
122
  file_paths: np.ndarray # of str
73
123
 
124
+ # If `may_have_exc` is False, then we guarantee that no slot has an exception set. This is used to optimize
125
+ # exception handling under normal operation.
126
+ _may_have_exc: bool
127
+
128
+ # the primary key of a store row is a sequence of ints (the number is different for table vs view)
129
+ pk: Optional[tuple[int, ...]]
74
130
  # for nested rows (ie, those produced by JsonMapperDispatcher)
75
131
  parent_row: Optional[DataRow]
76
132
  parent_slot_idx: Optional[int]
77
133
 
134
+ # state for table output (insert()/update()); key: column id
135
+ cell_vals: dict[int, Any] # materialized values of output columns, in the format required for the column
136
+ cell_md: dict[int, CellMd]
137
+
138
+ # control structures that are shared across all DataRows in a batch
139
+ img_slot_idxs: list[int]
140
+ media_slot_idxs: list[int]
141
+ array_slot_idxs: list[int]
142
+ json_slot_idxs: list[int]
143
+
78
144
  def __init__(
79
145
  self,
80
146
  size: int,
81
147
  img_slot_idxs: list[int],
82
148
  media_slot_idxs: list[int],
83
149
  array_slot_idxs: list[int],
150
+ json_slot_idxs: list[int],
84
151
  parent_row: Optional[DataRow] = None,
85
152
  parent_slot_idx: Optional[int] = None,
86
153
  ):
87
- self.img_slot_idxs = img_slot_idxs
88
- self.media_slot_idxs = media_slot_idxs
89
- self.array_slot_idxs = array_slot_idxs
90
154
  self.init(size)
91
155
  self.parent_row = parent_row
92
156
  self.parent_slot_idx = parent_slot_idx
93
-
94
- def init(self, num_slots: int) -> None:
95
- self.vals = np.full(num_slots, None, dtype=object)
96
- self.has_val = np.zeros(num_slots, dtype=bool)
97
- self.excs = np.full(num_slots, None, dtype=object)
157
+ self.img_slot_idxs = img_slot_idxs
158
+ self.media_slot_idxs = media_slot_idxs
159
+ self.array_slot_idxs = array_slot_idxs
160
+ self.json_slot_idxs = json_slot_idxs
161
+
162
+ def init(self, size: int) -> None:
163
+ self.vals = np.full(size, None, dtype=object)
164
+ self.has_val = np.zeros(size, dtype=bool)
165
+ self.excs = np.full(size, None, dtype=object)
166
+ self.missing_slots = np.zeros(size, dtype=bool)
167
+ self.missing_dependents = np.zeros(size, dtype=np.int16)
168
+ self.is_scheduled = np.zeros(size, dtype=bool)
169
+ self.slot_md = {}
170
+ self.file_urls = np.full(size, None, dtype=object)
171
+ self.file_paths = np.full(size, None, dtype=object)
98
172
  self._may_have_exc = False
99
- self.missing_slots = np.zeros(num_slots, dtype=bool)
100
- self.missing_dependents = np.zeros(num_slots, dtype=np.int16)
101
- self.is_scheduled = np.zeros(num_slots, dtype=bool)
173
+ self.cell_vals = {}
174
+ self.cell_md = {}
102
175
  self.pk = None
103
- self.file_urls = np.full(num_slots, None, dtype=object)
104
- self.file_paths = np.full(num_slots, None, dtype=object)
105
176
  self.parent_row = None
106
177
  self.parent_slot_idx = None
107
178
 
108
- def clear(self, idxs: Optional[np.ndarray] = None) -> None:
109
- if idxs is not None:
110
- self.has_val[idxs] = False
111
- self.vals[idxs] = None
112
- self.excs[idxs] = None
113
- self.file_urls[idxs] = None
114
- self.file_paths[idxs] = None
179
+ def clear(self, slot_idxs: Optional[np.ndarray] = None) -> None:
180
+ if slot_idxs is not None:
181
+ self.has_val[slot_idxs] = False
182
+ self.vals[slot_idxs] = None
183
+ self.excs[slot_idxs] = None
184
+ self.file_urls[slot_idxs] = None
185
+ self.file_paths[slot_idxs] = None
115
186
  else:
116
187
  self.init(len(self.vals))
117
188
 
@@ -292,9 +363,7 @@ class DataRow:
292
363
  val = self.vals[index]
293
364
  format = None
294
365
  if isinstance(val, PIL.Image.Image):
295
- # Default to JPEG unless the image has a transparency layer (which isn't supported by JPEG).
296
- # In that case, use WebP instead.
297
- format = 'webp' if val.has_transparency_data else 'jpeg'
366
+ format = image_utils.default_format(val)
298
367
  filepath, url = TempStore.save_media_object(val, col, format=format)
299
368
  self.file_paths[index] = str(filepath) if filepath is not None else None
300
369
  self.vals[index] = None
pixeltable/exprs/expr.py CHANGED
@@ -368,6 +368,15 @@ class Expr(abc.ABC):
368
368
  for e in expr_list:
369
369
  yield from e.subexprs(expr_class=expr_class, filter=filter, traverse_matches=traverse_matches)
370
370
 
371
+ @classmethod
372
+ def list_contains(
373
+ cls,
374
+ expr_list: Iterable[Expr],
375
+ expr_class: type[Expr] | None = None,
376
+ filter: Callable[[Expr], bool] | None = None,
377
+ ) -> bool:
378
+ return any(e._contains(expr_class, filter) for e in expr_list)
379
+
371
380
  def _contains(self, cls: Optional[type[Expr]] = None, filter: Optional[Callable[[Expr], bool]] = None) -> bool:
372
381
  """
373
382
  Returns True if any subexpr is an instance of cls and/or matches filter.
pixeltable/exprs/expr_set.py CHANGED
@@ -9,26 +9,33 @@ T = TypeVar('T', bound='Expr')
9
9
 
10
10
  class ExprSet(Generic[T]):
11
11
  """
12
- A set that also supports indexed lookup (by slot_idx and Expr.id). Exprs are uniquely identified by Expr.id.
12
+ An ordered set that also supports indexed lookup (by slot_idx and Expr.id). Exprs are uniquely identified by
13
+ Expr.id.
13
14
  """
14
15
 
15
16
  exprs: dict[int, T] # key: Expr.id
17
+ expr_offsets: dict[int, int] # key: Expr.id, value: offset into self.exprs.keys()
16
18
  exprs_by_idx: dict[int, T] # key: slot_idx
17
19
 
18
20
  def __init__(self, elements: Optional[Iterable[T]] = None):
19
21
  self.exprs = {}
22
+ self.expr_offsets = {}
20
23
  self.exprs_by_idx = {}
21
24
  if elements is not None:
22
25
  for e in elements:
23
26
  self.add(e)
24
27
 
25
- def add(self, expr: T) -> None:
26
- if expr.id in self.exprs:
27
- return
28
+ def add(self, expr: T) -> int:
29
+ """Returns offset corresponding to iteration order"""
30
+ offset = self.expr_offsets.get(expr.id)
31
+ if offset is not None:
32
+ return offset
33
+ offset = len(self.exprs)
28
34
  self.exprs[expr.id] = expr
29
- if expr.slot_idx is None:
30
- return
31
- self.exprs_by_idx[expr.slot_idx] = expr
35
+ self.expr_offsets[expr.id] = offset
36
+ if expr.slot_idx is not None:
37
+ self.exprs_by_idx[expr.slot_idx] = expr
38
+ return offset
32
39
 
33
40
  def update(self, *others: Iterable[T]) -> None:
34
41
  for other in others:
pixeltable/exprs/inline_expr.py CHANGED
@@ -98,13 +98,7 @@ class InlineList(Expr):
98
98
  def __init__(self, elements: Iterable):
99
99
  exprs = [Expr.from_object(el) for el in elements]
100
100
 
101
- json_schema = {
102
- 'type': 'array',
103
- 'prefixItems': [expr.col_type.to_json_schema() for expr in exprs],
104
- 'items': False, # No additional items (fixed length)
105
- }
106
-
107
- super().__init__(ts.JsonType(json_schema))
101
+ super().__init__(ts.JsonType())
108
102
  self.components.extend(exprs)
109
103
  self.id = self._create_id()
110
104
 
@@ -150,18 +144,7 @@ class InlineDict(Expr):
150
144
  self.keys.append(key)
151
145
  exprs.append(Expr.from_object(val))
152
146
 
153
- json_schema: Optional[dict[str, Any]]
154
- try:
155
- json_schema = {
156
- 'type': 'object',
157
- 'properties': {key: expr.col_type.to_json_schema() for key, expr in zip(self.keys, exprs)},
158
- }
159
- except excs.Error:
160
- # InlineDicts are used to store iterator arguments, which are not required to be valid JSON types,
161
- # so we can't always construct a valid schema.
162
- json_schema = None
163
-
164
- super().__init__(ts.JsonType(json_schema))
147
+ super().__init__(ts.JsonType())
165
148
  self.components.extend(exprs)
166
149
  self.id = self._create_id()
167
150
 
pixeltable/exprs/json_path.py CHANGED
@@ -1,5 +1,7 @@
1
1
  from __future__ import annotations
2
2
 
3
+ import io
4
+ from pathlib import Path
3
5
  from typing import Any, Optional
4
6
 
5
7
  import jmespath
@@ -7,6 +9,7 @@ import sqlalchemy as sql
7
9
 
8
10
  from pixeltable import catalog, exceptions as excs, type_system as ts
9
11
 
12
+ from .column_ref import ColumnRef
10
13
  from .data_row import DataRow
11
14
  from .expr import Expr
12
15
  from .globals import print_slice
@@ -23,6 +26,11 @@ class JsonPath(Expr):
23
26
  (0: indicates the immediately preceding JsonMapper, -1: the parent of the immediately preceding mapper, ...)
24
27
  """
25
28
 
29
+ path_elements: list[str | int | slice]
30
+ compiled_path: jmespath.parser.ParsedResult | None
31
+ scope_idx: int
32
+ file_handles: dict[Path, io.BufferedReader] # key: file path
33
+
26
34
  def __init__(
27
35
  self, anchor: Optional[Expr], path_elements: Optional[list[str | int | slice]] = None, scope_idx: int = 0
28
36
  ) -> None:
@@ -31,16 +39,22 @@ class JsonPath(Expr):
31
39
  super().__init__(ts.JsonType(nullable=True)) # JsonPath expressions are always nullable
32
40
  if anchor is not None:
33
41
  self.components = [anchor]
34
- self.path_elements: list[str | int | slice] = path_elements
42
+ self.path_elements = path_elements
35
43
  self.compiled_path = jmespath.compile(self._json_path()) if len(path_elements) > 0 else None
36
44
  self.scope_idx = scope_idx
37
45
  # NOTE: the _create_id() result will change if set_anchor() gets called;
38
46
  # this is not a problem, because _create_id() shouldn't be called after init()
39
47
  self.id = self._create_id()
48
+ self.file_handles = {}
49
+
50
+ def release(self) -> None:
51
+ for fh in self.file_handles.values():
52
+ fh.close()
53
+ self.file_handles.clear()
40
54
 
41
55
  def __repr__(self) -> str:
42
56
  # else 'R': the anchor is RELATIVE_PATH_ROOT
43
- anchor_str = str(self._anchor) if self._anchor is not None else 'R'
57
+ anchor_str = str(self.anchor) if self.anchor is not None else 'R'
44
58
  if len(self.path_elements) == 0:
45
59
  return anchor_str
46
60
  return f'{anchor_str}{"." if isinstance(self.path_elements[0], str) else ""}{self._json_path()}'
@@ -67,7 +81,7 @@ class JsonPath(Expr):
67
81
  return cls(anchor, path_elements, d['scope_idx'])
68
82
 
69
83
  @property
70
- def _anchor(self) -> Optional[Expr]:
84
+ def anchor(self) -> Optional[Expr]:
71
85
  return None if len(self.components) == 0 else self.components[0]
72
86
 
73
87
  def set_anchor(self, anchor: Expr) -> None:
@@ -75,7 +89,7 @@ class JsonPath(Expr):
75
89
  self.components = [anchor]
76
90
 
77
91
  def is_relative_path(self) -> bool:
78
- return self._anchor is None
92
+ return self.anchor is None
79
93
 
80
94
  def _has_relative_path(self) -> bool:
81
95
  return self.is_relative_path() or super()._has_relative_path()
@@ -85,7 +99,7 @@ class JsonPath(Expr):
85
99
  # TODO: take scope_idx into account
86
100
  self.set_anchor(mapper.scope_anchor)
87
101
  else:
88
- self._anchor._bind_rel_paths(mapper)
102
+ self.anchor._bind_rel_paths(mapper)
89
103
 
90
104
  def __call__(self, *args: object, **kwargs: object) -> 'JsonPath':
91
105
  """
@@ -99,15 +113,15 @@ class JsonPath(Expr):
99
113
 
100
114
  def __getattr__(self, name: str) -> 'JsonPath':
101
115
  assert isinstance(name, str)
102
- return JsonPath(self._anchor, [*self.path_elements, name])
116
+ return JsonPath(self.anchor, [*self.path_elements, name])
103
117
 
104
118
  def __getitem__(self, index: object) -> 'JsonPath':
105
119
  if isinstance(index, (int, slice, str)):
106
- return JsonPath(self._anchor, [*self.path_elements, index])
120
+ return JsonPath(self.anchor, [*self.path_elements, index])
107
121
  raise excs.Error(f'Invalid json list index: {index}')
108
122
 
109
123
  def default_column_name(self) -> Optional[str]:
110
- anchor_name = self._anchor.default_column_name() if self._anchor is not None else ''
124
+ anchor_name = self.anchor.default_column_name() if self.anchor is not None else ''
111
125
  ret_name = f'{anchor_name}.{self._json_path()}'
112
126
 
113
127
  def cleanup_char(s: str) -> str:
@@ -159,12 +173,31 @@ class JsonPath(Expr):
159
173
  result.append(f'[{print_slice(element)}]')
160
174
  return ''.join(result)
161
175
 
162
- def eval(self, data_row: DataRow, row_builder: RowBuilder) -> None:
163
- assert self._anchor is not None, self
164
- val = data_row[self._anchor.slot_idx]
176
+ def eval(self, row: DataRow, row_builder: RowBuilder) -> None:
177
+ assert self.anchor is not None, self
178
+ val = row[self.anchor.slot_idx]
165
179
  if self.compiled_path is not None:
166
180
  val = self.compiled_path.search(val)
167
- data_row[self.slot_idx] = val
181
+ row[self.slot_idx] = val
182
+ if val is None or self.anchor is None or not isinstance(self.anchor, ColumnRef):
183
+ return
184
+
185
+ # the origin of val is a json-typed column, which might stored inlined objects
186
+ if self.anchor.slot_idx not in row.slot_md:
187
+ # we can infer that there aren't any inlined objects because our execution plan doesn't include
188
+ # materializing the cellmd (eg, insert plans)
189
+ # TODO: have the planner pass that fact into ExprEvalNode explicitly to streamline this path a bit more
190
+ return
191
+
192
+ # defer import until it's needed
193
+ from pixeltable.exec.cell_reconstruction_node import json_has_inlined_objs, reconstruct_json
194
+
195
+ cell_md = row.slot_md[self.anchor.slot_idx]
196
+ if cell_md is None or cell_md.file_urls is None or not json_has_inlined_objs(val):
197
+ # val doesn't contain inlined objects
198
+ return
199
+
200
+ row.vals[self.slot_idx] = reconstruct_json(val, cell_md.file_urls, self.file_handles)
168
201
 
169
202
 
170
203
  RELATIVE_PATH_ROOT = JsonPath(None)
pixeltable/exprs/row_builder.py CHANGED
@@ -1,15 +1,17 @@
1
1
  from __future__ import annotations
2
2
 
3
+ import dataclasses
3
4
  import sys
4
5
  import time
5
- from dataclasses import dataclass
6
6
  from typing import Any, Iterable, NamedTuple, Optional, Sequence
7
7
  from uuid import UUID
8
8
 
9
9
  import numpy as np
10
+ import sqlalchemy as sql
10
11
 
11
12
  from pixeltable import catalog, exceptions as excs, exprs, utils
12
13
  from pixeltable.env import Env
14
+ from pixeltable.utils.misc import non_none_dict_factory
13
15
 
14
16
  from .data_row import DataRow
15
17
  from .expr import Expr, ExprScope
@@ -68,7 +70,7 @@ class RowBuilder:
68
70
  input_exprs: ExprSet
69
71
 
70
72
  tbl: Optional[catalog.TableVersion] # reference table of the RowBuilder; used to identify pk columns for writes
71
- table_columns: list[ColumnSlotIdx]
73
+ table_columns: dict[catalog.Column, int | None] # value: slot idx, if the result of an expr
72
74
  default_eval_ctx: EvalCtx
73
75
  unstored_iter_args: dict[UUID, Expr]
74
76
 
@@ -92,10 +94,9 @@ class RowBuilder:
92
94
  img_slot_idxs: list[int] # Indices of image slots
93
95
  media_slot_idxs: list[int] # Indices of non-image media slots
94
96
  array_slot_idxs: list[int] # Indices of array slots
95
- stored_img_cols: list[exprs.ColumnSlotIdx]
96
- stored_media_cols: list[exprs.ColumnSlotIdx]
97
+ json_slot_idxs: list[int] # Indices of json slots
97
98
 
98
- @dataclass
99
+ @dataclasses.dataclass
99
100
  class EvalCtx:
100
101
  """Context for evaluating a set of target exprs"""
101
102
 
@@ -113,8 +114,6 @@ class RowBuilder:
113
114
  ):
114
115
  self.unique_exprs: ExprSet[Expr] = ExprSet() # dependencies precede their dependents
115
116
  self.next_slot_idx = 0
116
- self.stored_img_cols = []
117
- self.stored_media_cols = []
118
117
 
119
118
  # record input and output exprs; make copies to avoid reusing execution state
120
119
  unique_input_exprs = [self._record_unique_expr(e.copy(), recursive=False) for e in input_exprs]
@@ -138,7 +137,7 @@ class RowBuilder:
138
137
  from .column_ref import ColumnRef
139
138
 
140
139
  self.tbl = tbl
141
- self.table_columns: list[ColumnSlotIdx] = []
140
+ self.table_columns = {}
142
141
  self.input_exprs = ExprSet()
143
142
  validating_colrefs: dict[Expr, Expr] = {} # key: non-validating colref, value: corresp. validating colref
144
143
  for col in columns:
@@ -245,17 +244,27 @@ class RowBuilder:
245
244
  e.slot_idx for e in self.unique_exprs if e.col_type.is_media_type() and not e.col_type.is_image_type()
246
245
  ]
247
246
  self.array_slot_idxs = [e.slot_idx for e in self.unique_exprs if e.col_type.is_array_type()]
247
+ self.json_slot_idxs = [e.slot_idx for e in self.unique_exprs if e.col_type.is_json_type()]
248
248
 
249
249
  def add_table_column(self, col: catalog.Column, slot_idx: int) -> None:
250
- """Record a column that is part of the table row"""
250
+ """Record an output column for which the value is produced via expr evaluation"""
251
251
  assert self.tbl is not None
252
252
  assert col.is_stored
253
- info = ColumnSlotIdx(col, slot_idx)
254
- self.table_columns.append(info)
255
- if col.col_type.is_media_type():
256
- self.stored_media_cols.append(info)
257
- if col.col_type.is_image_type():
258
- self.stored_img_cols.append(info)
253
+ self.table_columns[col] = slot_idx
254
+
255
+ def add_table_columns(self, cols: list[catalog.Column]) -> None:
256
+ """Record output columns whose values are materialized into DataRow.cell_vals"""
257
+ for col in cols:
258
+ self.table_columns[col] = None
259
+
260
+ @property
261
+ def media_output_col_info(self) -> list[ColumnSlotIdx]:
262
+ """Return slot idxs for media output columns whose values are produced by expr evaluation"""
263
+ return [
264
+ ColumnSlotIdx(col, slot_idx)
265
+ for col, slot_idx in self.table_columns.items()
266
+ if col.col_type.is_media_type() and slot_idx is not None
267
+ ]
259
268
 
260
269
  @property
261
270
  def num_materialized(self) -> int:
@@ -462,13 +471,30 @@ class RowBuilder:
462
471
 
463
472
  num_excs = 0
464
473
  table_row: list[Any] = list(pk)
465
- for col, slot_idx in self.table_columns:
474
+ # Nulls in JSONB columns need to be stored as sql.sql.null(), otherwise it stores a json 'null'
475
+ for col, slot_idx in self.table_columns.items():
476
+ if col.id in data_row.cell_vals:
477
+ table_row.append(data_row.cell_vals[col.id])
478
+ if col.stores_cellmd:
479
+ if data_row.cell_md[col.id] is None:
480
+ table_row.append(sql.sql.null())
481
+ else:
482
+ # we want to minimize the size of the stored dict and use dict_factory to remove Nones
483
+ md = dataclasses.asdict(data_row.cell_md[col.id], dict_factory=non_none_dict_factory)
484
+ assert len(md) > 0
485
+ table_row.append(md)
486
+ if slot_idx is not None and data_row.has_exc(slot_idx):
487
+ num_excs += 1
488
+ if cols_with_excs is not None:
489
+ cols_with_excs.add(col.id)
490
+ continue
491
+
466
492
  if data_row.has_exc(slot_idx):
467
493
  exc = data_row.get_exc(slot_idx)
468
494
  num_excs += 1
469
495
  if cols_with_excs is not None:
470
496
  cols_with_excs.add(col.id)
471
- table_row.append(None)
497
+ table_row.append(sql.sql.null() if col.col_type.is_json_type() else None)
472
498
  if col.stores_cellmd:
473
499
  # exceptions get stored in the errortype/-msg properties of the cellmd column
474
500
  table_row.append(ColumnPropertyRef.create_cellmd_exc(exc))
@@ -476,7 +502,7 @@ class RowBuilder:
476
502
  val = data_row.get_stored_val(slot_idx, col.get_sa_col_type())
477
503
  table_row.append(val)
478
504
  if col.stores_cellmd:
479
- table_row.append(None) # placeholder for cellmd column
505
+ table_row.append(sql.sql.null()) # placeholder for cellmd column
480
506
 
481
507
  return table_row, num_excs
482
508
 
@@ -490,12 +516,18 @@ class RowBuilder:
490
516
  store_col_names: list[str] = [pk_col.name for pk_col in self.tbl.store_tbl.pk_columns()]
491
517
 
492
518
  for col in self.table_columns:
493
- store_col_names.append(col.col.store_name())
494
- if col.col.stores_cellmd:
495
- store_col_names.append(col.col.cellmd_store_name())
519
+ store_col_names.append(col.store_name())
520
+ if col.stores_cellmd:
521
+ store_col_names.append(col.cellmd_store_name())
496
522
 
497
523
  return store_col_names
498
524
 
499
525
  def make_row(self) -> exprs.DataRow:
500
526
  """Creates a new DataRow with the current row_builder's configuration."""
501
- return exprs.DataRow(self.num_materialized, self.img_slot_idxs, self.media_slot_idxs, self.array_slot_idxs)
527
+ return exprs.DataRow(
528
+ size=self.num_materialized,
529
+ img_slot_idxs=self.img_slot_idxs,
530
+ media_slot_idxs=self.media_slot_idxs,
531
+ array_slot_idxs=self.array_slot_idxs,
532
+ json_slot_idxs=self.json_slot_idxs,
533
+ )
pixeltable/functions/__init__.py CHANGED
@@ -19,6 +19,7 @@ from . import (
19
19
  mistralai,
20
20
  ollama,
21
21
  openai,
22
+ openrouter,
22
23
  replicate,
23
24
  string,
24
25
  timestamp,
pixeltable/functions/bedrock.py CHANGED
@@ -1,3 +1,10 @@
1
+ """
2
+ Pixeltable UDFs for AWS Bedrock AI models.
3
+
4
+ Provides integration with AWS Bedrock for accessing various foundation models
5
+ including Anthropic Claude, Amazon Titan, and other providers.
6
+ """
7
+
1
8
  import logging
2
9
  from typing import TYPE_CHECKING, Any, Optional
3
10
 
pixeltable/functions/deepseek.py CHANGED
@@ -1,3 +1,10 @@
1
+ """
2
+ Pixeltable UDFs for Deepseek AI models.
3
+
4
+ Provides integration with Deepseek's language models for chat completions
5
+ and other AI capabilities.
6
+ """
7
+
1
8
  import json
2
9
  from typing import TYPE_CHECKING, Any, Optional
3
10
 
@@ -67,10 +74,10 @@ async def chat_completions(
67
74
  of the table `tbl`:
68
75
 
69
76
  >>> messages = [
70
- {'role': 'system', 'content': 'You are a helpful assistant.'},
71
- {'role': 'user', 'content': tbl.prompt}
72
- ]
73
- tbl.add_computed_column(response=chat_completions(messages, model='deepseek-chat'))
77
+ ... {'role': 'system', 'content': 'You are a helpful assistant.'},
78
+ ... {'role': 'user', 'content': tbl.prompt}
79
+ ... ]
80
+ >>> tbl.add_computed_column(response=chat_completions(messages, model='deepseek-chat'))
74
81
  """
75
82
  if model_kwargs is None:
76
83
  model_kwargs = {}
pixeltable/functions/llama_cpp.py CHANGED
@@ -1,3 +1,10 @@
1
+ """
2
+ Pixeltable UDFs for llama.cpp models.
3
+
4
+ Provides integration with llama.cpp for running quantized language models locally,
5
+ supporting chat completions and embeddings with GGUF format models.
6
+ """
7
+
1
8
  from pathlib import Path
2
9
  from typing import TYPE_CHECKING, Any, Optional
3
10
 
pixeltable/functions/math.py CHANGED
@@ -97,7 +97,7 @@ def _(self: sql.ColumnElement, digits: Optional[sql.ColumnElement] = None) -> sq
97
97
  if digits is None:
98
98
  return sql.func.round(self)
99
99
  else:
100
- return sql.func.round(self.cast(sql.Numeric), digits.cast(sql.Integer))
100
+ return sql.cast(sql.func.round(sql.cast(self, sql.Numeric), sql.cast(digits, sql.Integer)), sql.Float)
101
101
 
102
102
 
103
103
  @pxt.udf(is_method=True)