pixeltable 0.3.14__py3-none-any.whl → 0.5.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (220)
  1. pixeltable/__init__.py +42 -8
  2. pixeltable/{dataframe.py → _query.py} +470 -206
  3. pixeltable/_version.py +1 -0
  4. pixeltable/catalog/__init__.py +5 -4
  5. pixeltable/catalog/catalog.py +1785 -432
  6. pixeltable/catalog/column.py +190 -113
  7. pixeltable/catalog/dir.py +2 -4
  8. pixeltable/catalog/globals.py +19 -46
  9. pixeltable/catalog/insertable_table.py +191 -98
  10. pixeltable/catalog/path.py +63 -23
  11. pixeltable/catalog/schema_object.py +11 -15
  12. pixeltable/catalog/table.py +843 -436
  13. pixeltable/catalog/table_metadata.py +103 -0
  14. pixeltable/catalog/table_version.py +978 -657
  15. pixeltable/catalog/table_version_handle.py +72 -16
  16. pixeltable/catalog/table_version_path.py +112 -43
  17. pixeltable/catalog/tbl_ops.py +53 -0
  18. pixeltable/catalog/update_status.py +191 -0
  19. pixeltable/catalog/view.py +134 -90
  20. pixeltable/config.py +134 -22
  21. pixeltable/env.py +471 -157
  22. pixeltable/exceptions.py +6 -0
  23. pixeltable/exec/__init__.py +4 -1
  24. pixeltable/exec/aggregation_node.py +7 -8
  25. pixeltable/exec/cache_prefetch_node.py +83 -110
  26. pixeltable/exec/cell_materialization_node.py +268 -0
  27. pixeltable/exec/cell_reconstruction_node.py +168 -0
  28. pixeltable/exec/component_iteration_node.py +4 -3
  29. pixeltable/exec/data_row_batch.py +8 -65
  30. pixeltable/exec/exec_context.py +16 -4
  31. pixeltable/exec/exec_node.py +13 -36
  32. pixeltable/exec/expr_eval/evaluators.py +11 -7
  33. pixeltable/exec/expr_eval/expr_eval_node.py +27 -12
  34. pixeltable/exec/expr_eval/globals.py +8 -5
  35. pixeltable/exec/expr_eval/row_buffer.py +1 -2
  36. pixeltable/exec/expr_eval/schedulers.py +106 -56
  37. pixeltable/exec/globals.py +35 -0
  38. pixeltable/exec/in_memory_data_node.py +19 -19
  39. pixeltable/exec/object_store_save_node.py +293 -0
  40. pixeltable/exec/row_update_node.py +16 -9
  41. pixeltable/exec/sql_node.py +351 -84
  42. pixeltable/exprs/__init__.py +1 -1
  43. pixeltable/exprs/arithmetic_expr.py +27 -22
  44. pixeltable/exprs/array_slice.py +3 -3
  45. pixeltable/exprs/column_property_ref.py +36 -23
  46. pixeltable/exprs/column_ref.py +213 -89
  47. pixeltable/exprs/comparison.py +5 -5
  48. pixeltable/exprs/compound_predicate.py +5 -4
  49. pixeltable/exprs/data_row.py +164 -54
  50. pixeltable/exprs/expr.py +70 -44
  51. pixeltable/exprs/expr_dict.py +3 -3
  52. pixeltable/exprs/expr_set.py +17 -10
  53. pixeltable/exprs/function_call.py +100 -40
  54. pixeltable/exprs/globals.py +2 -2
  55. pixeltable/exprs/in_predicate.py +4 -4
  56. pixeltable/exprs/inline_expr.py +18 -32
  57. pixeltable/exprs/is_null.py +7 -3
  58. pixeltable/exprs/json_mapper.py +8 -8
  59. pixeltable/exprs/json_path.py +56 -22
  60. pixeltable/exprs/literal.py +27 -5
  61. pixeltable/exprs/method_ref.py +2 -2
  62. pixeltable/exprs/object_ref.py +2 -2
  63. pixeltable/exprs/row_builder.py +167 -67
  64. pixeltable/exprs/rowid_ref.py +25 -10
  65. pixeltable/exprs/similarity_expr.py +58 -40
  66. pixeltable/exprs/sql_element_cache.py +4 -4
  67. pixeltable/exprs/string_op.py +5 -5
  68. pixeltable/exprs/type_cast.py +3 -5
  69. pixeltable/func/__init__.py +1 -0
  70. pixeltable/func/aggregate_function.py +8 -8
  71. pixeltable/func/callable_function.py +9 -9
  72. pixeltable/func/expr_template_function.py +17 -11
  73. pixeltable/func/function.py +18 -20
  74. pixeltable/func/function_registry.py +6 -7
  75. pixeltable/func/globals.py +2 -3
  76. pixeltable/func/mcp.py +74 -0
  77. pixeltable/func/query_template_function.py +29 -27
  78. pixeltable/func/signature.py +46 -19
  79. pixeltable/func/tools.py +31 -13
  80. pixeltable/func/udf.py +18 -20
  81. pixeltable/functions/__init__.py +16 -0
  82. pixeltable/functions/anthropic.py +123 -77
  83. pixeltable/functions/audio.py +147 -10
  84. pixeltable/functions/bedrock.py +13 -6
  85. pixeltable/functions/date.py +7 -4
  86. pixeltable/functions/deepseek.py +35 -43
  87. pixeltable/functions/document.py +81 -0
  88. pixeltable/functions/fal.py +76 -0
  89. pixeltable/functions/fireworks.py +11 -20
  90. pixeltable/functions/gemini.py +195 -39
  91. pixeltable/functions/globals.py +142 -14
  92. pixeltable/functions/groq.py +108 -0
  93. pixeltable/functions/huggingface.py +1056 -24
  94. pixeltable/functions/image.py +115 -57
  95. pixeltable/functions/json.py +1 -1
  96. pixeltable/functions/llama_cpp.py +28 -13
  97. pixeltable/functions/math.py +67 -5
  98. pixeltable/functions/mistralai.py +18 -55
  99. pixeltable/functions/net.py +70 -0
  100. pixeltable/functions/ollama.py +20 -13
  101. pixeltable/functions/openai.py +240 -226
  102. pixeltable/functions/openrouter.py +143 -0
  103. pixeltable/functions/replicate.py +4 -4
  104. pixeltable/functions/reve.py +250 -0
  105. pixeltable/functions/string.py +239 -69
  106. pixeltable/functions/timestamp.py +16 -16
  107. pixeltable/functions/together.py +24 -84
  108. pixeltable/functions/twelvelabs.py +188 -0
  109. pixeltable/functions/util.py +6 -1
  110. pixeltable/functions/uuid.py +30 -0
  111. pixeltable/functions/video.py +1515 -107
  112. pixeltable/functions/vision.py +8 -8
  113. pixeltable/functions/voyageai.py +289 -0
  114. pixeltable/functions/whisper.py +16 -8
  115. pixeltable/functions/whisperx.py +179 -0
  116. pixeltable/{ext/functions → functions}/yolox.py +2 -4
  117. pixeltable/globals.py +362 -115
  118. pixeltable/index/base.py +17 -21
  119. pixeltable/index/btree.py +28 -22
  120. pixeltable/index/embedding_index.py +100 -118
  121. pixeltable/io/__init__.py +4 -2
  122. pixeltable/io/datarows.py +8 -7
  123. pixeltable/io/external_store.py +56 -105
  124. pixeltable/io/fiftyone.py +13 -13
  125. pixeltable/io/globals.py +31 -30
  126. pixeltable/io/hf_datasets.py +61 -16
  127. pixeltable/io/label_studio.py +74 -70
  128. pixeltable/io/lancedb.py +3 -0
  129. pixeltable/io/pandas.py +21 -12
  130. pixeltable/io/parquet.py +25 -105
  131. pixeltable/io/table_data_conduit.py +250 -123
  132. pixeltable/io/utils.py +4 -4
  133. pixeltable/iterators/__init__.py +2 -1
  134. pixeltable/iterators/audio.py +26 -25
  135. pixeltable/iterators/base.py +9 -3
  136. pixeltable/iterators/document.py +112 -78
  137. pixeltable/iterators/image.py +12 -15
  138. pixeltable/iterators/string.py +11 -4
  139. pixeltable/iterators/video.py +523 -120
  140. pixeltable/metadata/__init__.py +14 -3
  141. pixeltable/metadata/converters/convert_13.py +2 -2
  142. pixeltable/metadata/converters/convert_18.py +2 -2
  143. pixeltable/metadata/converters/convert_19.py +2 -2
  144. pixeltable/metadata/converters/convert_20.py +2 -2
  145. pixeltable/metadata/converters/convert_21.py +2 -2
  146. pixeltable/metadata/converters/convert_22.py +2 -2
  147. pixeltable/metadata/converters/convert_24.py +2 -2
  148. pixeltable/metadata/converters/convert_25.py +2 -2
  149. pixeltable/metadata/converters/convert_26.py +2 -2
  150. pixeltable/metadata/converters/convert_29.py +4 -4
  151. pixeltable/metadata/converters/convert_30.py +34 -21
  152. pixeltable/metadata/converters/convert_34.py +2 -2
  153. pixeltable/metadata/converters/convert_35.py +9 -0
  154. pixeltable/metadata/converters/convert_36.py +38 -0
  155. pixeltable/metadata/converters/convert_37.py +15 -0
  156. pixeltable/metadata/converters/convert_38.py +39 -0
  157. pixeltable/metadata/converters/convert_39.py +124 -0
  158. pixeltable/metadata/converters/convert_40.py +73 -0
  159. pixeltable/metadata/converters/convert_41.py +12 -0
  160. pixeltable/metadata/converters/convert_42.py +9 -0
  161. pixeltable/metadata/converters/convert_43.py +44 -0
  162. pixeltable/metadata/converters/util.py +20 -31
  163. pixeltable/metadata/notes.py +9 -0
  164. pixeltable/metadata/schema.py +140 -53
  165. pixeltable/metadata/utils.py +74 -0
  166. pixeltable/mypy/__init__.py +3 -0
  167. pixeltable/mypy/mypy_plugin.py +123 -0
  168. pixeltable/plan.py +382 -115
  169. pixeltable/share/__init__.py +1 -1
  170. pixeltable/share/packager.py +547 -83
  171. pixeltable/share/protocol/__init__.py +33 -0
  172. pixeltable/share/protocol/common.py +165 -0
  173. pixeltable/share/protocol/operation_types.py +33 -0
  174. pixeltable/share/protocol/replica.py +119 -0
  175. pixeltable/share/publish.py +257 -59
  176. pixeltable/store.py +311 -194
  177. pixeltable/type_system.py +373 -211
  178. pixeltable/utils/__init__.py +2 -3
  179. pixeltable/utils/arrow.py +131 -17
  180. pixeltable/utils/av.py +298 -0
  181. pixeltable/utils/azure_store.py +346 -0
  182. pixeltable/utils/coco.py +6 -6
  183. pixeltable/utils/code.py +3 -3
  184. pixeltable/utils/console_output.py +4 -1
  185. pixeltable/utils/coroutine.py +6 -23
  186. pixeltable/utils/dbms.py +32 -6
  187. pixeltable/utils/description_helper.py +4 -5
  188. pixeltable/utils/documents.py +7 -18
  189. pixeltable/utils/exception_handler.py +7 -30
  190. pixeltable/utils/filecache.py +6 -6
  191. pixeltable/utils/formatter.py +86 -48
  192. pixeltable/utils/gcs_store.py +295 -0
  193. pixeltable/utils/http.py +133 -0
  194. pixeltable/utils/http_server.py +2 -3
  195. pixeltable/utils/iceberg.py +1 -2
  196. pixeltable/utils/image.py +17 -0
  197. pixeltable/utils/lancedb.py +90 -0
  198. pixeltable/utils/local_store.py +322 -0
  199. pixeltable/utils/misc.py +5 -0
  200. pixeltable/utils/object_stores.py +573 -0
  201. pixeltable/utils/pydantic.py +60 -0
  202. pixeltable/utils/pytorch.py +5 -6
  203. pixeltable/utils/s3_store.py +527 -0
  204. pixeltable/utils/sql.py +26 -0
  205. pixeltable/utils/system.py +30 -0
  206. pixeltable-0.5.7.dist-info/METADATA +579 -0
  207. pixeltable-0.5.7.dist-info/RECORD +227 -0
  208. {pixeltable-0.3.14.dist-info → pixeltable-0.5.7.dist-info}/WHEEL +1 -1
  209. pixeltable-0.5.7.dist-info/entry_points.txt +2 -0
  210. pixeltable/__version__.py +0 -3
  211. pixeltable/catalog/named_function.py +0 -40
  212. pixeltable/ext/__init__.py +0 -17
  213. pixeltable/ext/functions/__init__.py +0 -11
  214. pixeltable/ext/functions/whisperx.py +0 -77
  215. pixeltable/utils/media_store.py +0 -77
  216. pixeltable/utils/s3.py +0 -17
  217. pixeltable-0.3.14.dist-info/METADATA +0 -434
  218. pixeltable-0.3.14.dist-info/RECORD +0 -186
  219. pixeltable-0.3.14.dist-info/entry_points.txt +0 -3
  220. {pixeltable-0.3.14.dist-info → pixeltable-0.5.7.dist-info/licenses}/LICENSE +0 -0
@@ -1,7 +1,9 @@
1
1
  from __future__ import annotations
2
2
 
3
+ import base64
3
4
  import datetime
4
- from typing import Any, Optional
5
+ import uuid
6
+ from typing import Any
5
7
 
6
8
  import numpy as np
7
9
  import sqlalchemy as sql
@@ -16,7 +18,9 @@ from .sql_element_cache import SqlElementCache
16
18
 
17
19
 
18
20
  class Literal(Expr):
19
- def __init__(self, val: Any, col_type: Optional[ts.ColumnType] = None):
21
+ val: Any
22
+
23
+ def __init__(self, val: Any, col_type: ts.ColumnType | None = None):
20
24
  if col_type is not None:
21
25
  val = col_type.create_literal(val)
22
26
  else:
@@ -40,7 +44,7 @@ class Literal(Expr):
40
44
  self.val = val
41
45
  self.id = self._create_id()
42
46
 
43
- def default_column_name(self) -> Optional[str]:
47
+ def default_column_name(self) -> str | None:
44
48
  return 'Literal'
45
49
 
46
50
  def __str__(self) -> str:
@@ -53,6 +57,9 @@ class Literal(Expr):
53
57
  if self.col_type.is_date_type():
54
58
  assert isinstance(self.val, datetime.date)
55
59
  return f"'{self.val.isoformat()}'"
60
+ if self.col_type.is_uuid_type():
61
+ assert isinstance(self.val, uuid.UUID)
62
+ return f"'{self.val}'"
56
63
  if self.col_type.is_array_type():
57
64
  assert isinstance(self.val, np.ndarray)
58
65
  return str(self.val.tolist())
@@ -67,7 +74,7 @@ class Literal(Expr):
67
74
  def _id_attrs(self) -> list[tuple[str, Any]]:
68
75
  return [*super()._id_attrs(), ('val', self.val)]
69
76
 
70
- def sql_expr(self, _: SqlElementCache) -> Optional[sql.ColumnElement]:
77
+ def sql_expr(self, _: SqlElementCache) -> sql.ColumnElement | None:
71
78
  # Return a sql object so that constants can participate in SQL expressions
72
79
  return sql.sql.expression.literal(self.val, type_=self.col_type.to_sa_type())
73
80
 
@@ -89,13 +96,21 @@ class Literal(Expr):
89
96
  assert isinstance(self.val, datetime.date)
90
97
  encoded_val = self.val.isoformat()
91
98
  return {'val': encoded_val, 'val_t': self.col_type._type.name, **super()._as_dict()}
99
+ elif self.col_type.is_uuid_type():
100
+ assert isinstance(self.val, uuid.UUID)
101
+ encoded_val = str(self.val)
102
+ return {'val': encoded_val, 'val_t': self.col_type._type.name, **super()._as_dict()}
103
+ elif self.col_type.is_binary_type():
104
+ assert isinstance(self.val, bytes)
105
+ encoded_val = base64.b64encode(self.val).decode('utf-8')
106
+ return {'val': encoded_val, 'val_t': self.col_type._type.name, **super()._as_dict()}
92
107
  elif self.col_type.is_array_type():
93
108
  assert isinstance(self.val, np.ndarray)
94
109
  return {'val': self.val.tolist(), 'val_t': self.col_type._type.name, **super()._as_dict()}
95
110
  else:
96
111
  return {'val': self.val, **super()._as_dict()}
97
112
 
98
- def as_literal(self) -> Optional[Literal]:
113
+ def as_literal(self) -> Literal | None:
99
114
  return self
100
115
 
101
116
  @classmethod
@@ -110,6 +125,13 @@ class Literal(Expr):
110
125
  dt = datetime.datetime.fromisoformat(d['val'])
111
126
  assert dt.tzinfo == datetime.timezone.utc # Must be UTC in the database
112
127
  return cls(dt)
128
+ elif val_t == ts.ColumnType.Type.UUID.name:
129
+ uuid_val = uuid.UUID(d['val'])
130
+ return cls(uuid_val)
131
+ elif val_t == ts.ColumnType.Type.BINARY.name:
132
+ assert isinstance(d['val'], str)
133
+ bytes_val = base64.b64decode(d['val'].encode('utf-8'))
134
+ return cls(bytes_val)
113
135
  elif val_t == ts.ColumnType.Type.ARRAY.name:
114
136
  arrays = np.array(d['val'])
115
137
  return cls(arrays)
@@ -1,4 +1,4 @@
1
- from typing import Any, Optional
1
+ from typing import Any
2
2
 
3
3
  import sqlalchemy as sql
4
4
 
@@ -55,7 +55,7 @@ class MethodRef(Expr):
55
55
  def _id_attrs(self) -> list[tuple[str, Any]]:
56
56
  return [*super()._id_attrs(), ('method_name', self.method_name)]
57
57
 
58
- def sql_expr(self, _: SqlElementCache) -> Optional[sql.ColumnElement]:
58
+ def sql_expr(self, _: SqlElementCache) -> sql.ColumnElement | None:
59
59
  return None
60
60
 
61
61
  def eval(self, data_row: DataRow, row_builder: RowBuilder) -> None:
@@ -1,6 +1,6 @@
1
1
  from __future__ import annotations
2
2
 
3
- from typing import Any, Optional
3
+ from typing import Any
4
4
 
5
5
  import sqlalchemy as sql
6
6
 
@@ -43,7 +43,7 @@ class ObjectRef(Expr):
43
43
  def _equals(self, other: ObjectRef) -> bool:
44
44
  return self.id == other.id
45
45
 
46
- def sql_expr(self, _: SqlElementCache) -> Optional[sql.ColumnElement]:
46
+ def sql_expr(self, _: SqlElementCache) -> sql.ColumnElement | None:
47
47
  return None
48
48
 
49
49
  def eval(self, data_row: DataRow, row_builder: RowBuilder) -> None:
@@ -1,21 +1,25 @@
1
1
  from __future__ import annotations
2
2
 
3
+ import dataclasses
3
4
  import sys
4
5
  import time
5
- from dataclasses import dataclass
6
- from typing import Any, Iterable, Optional, Sequence
6
+ from typing import TYPE_CHECKING, Any, Iterable, NamedTuple, Sequence, TypeVar
7
7
  from uuid import UUID
8
8
 
9
9
  import numpy as np
10
+ import sqlalchemy as sql
10
11
 
11
- from pixeltable import catalog, exceptions as excs, utils
12
+ from pixeltable import catalog, exceptions as excs, exprs, utils
12
13
  from pixeltable.env import Env
13
- from pixeltable.utils.media_store import MediaStore
14
+ from pixeltable.utils.misc import non_none_dict_factory
14
15
 
15
16
  from .data_row import DataRow
16
17
  from .expr import Expr, ExprScope
17
18
  from .expr_set import ExprSet
18
19
 
20
+ if TYPE_CHECKING:
21
+ from .column_ref import ColumnRef
22
+
19
23
 
20
24
  class ExecProfile:
21
25
  def __init__(self, row_builder: RowBuilder):
@@ -35,8 +39,7 @@ class ExecProfile:
35
39
  )
36
40
 
37
41
 
38
- @dataclass
39
- class ColumnSlotIdx:
42
+ class ColumnSlotIdx(NamedTuple):
40
43
  """Info for how to locate materialized column in DataRow
41
44
  TODO: can this be integrated into RowBuilder directly?
42
45
  """
@@ -50,6 +53,12 @@ class RowBuilder:
50
53
 
51
54
  For ColumnRefs to unstored iterator columns:
52
55
  - in order for them to be executable, we also record the iterator args and pass them to the ColumnRef
56
+
57
+ Args:
58
+ output_exprs: list of Exprs to be evaluated
59
+ columns: list of columns to be materialized
60
+ input_exprs: list of Exprs that are excluded from evaluation (because they're already materialized)
61
+ TODO: enforce that output_exprs doesn't overlap with input_exprs?
53
62
  """
54
63
 
55
64
  unique_exprs: ExprSet
@@ -63,9 +72,13 @@ class RowBuilder:
63
72
 
64
73
  input_exprs: ExprSet
65
74
 
66
- table_columns: list[ColumnSlotIdx]
75
+ tbl: catalog.TableVersion | None # reference table of the RowBuilder; used to identify pk columns for writes
76
+ for_view_load: bool # True if this RowBuilder represents a view load
77
+
78
+ table_columns: dict[catalog.Column, int | None] # value: slot idx, if the result of an expr
67
79
  default_eval_ctx: EvalCtx
68
80
  unstored_iter_args: dict[UUID, Expr]
81
+ unstored_iter_outputs: dict[UUID, list['ColumnRef']]
69
82
 
70
83
  # transitive dependents for the purpose of exception propagation: an exception for slot i is propagated to
71
84
  # _exc_dependents[i]
@@ -84,7 +97,12 @@ class RowBuilder:
84
97
  # (a subexpr can be shared across multiple output exprs)
85
98
  output_expr_ids: list[set[int]]
86
99
 
87
- @dataclass
100
+ img_slot_idxs: list[int] # Indices of image slots
101
+ media_slot_idxs: list[int] # Indices of non-image media slots
102
+ array_slot_idxs: list[int] # Indices of array slots
103
+ json_slot_idxs: list[int] # Indices of json slots
104
+
105
+ @dataclasses.dataclass
88
106
  class EvalCtx:
89
107
  """Context for evaluating a set of target exprs"""
90
108
 
@@ -93,41 +111,45 @@ class RowBuilder:
93
111
  target_slot_idxs: list[int] # slot idxs of target exprs; might contain duplicates
94
112
  target_exprs: list[Expr] # exprs corresponding to target_slot_idxs
95
113
 
96
- def __init__(self, output_exprs: Sequence[Expr], columns: Sequence[catalog.Column], input_exprs: Iterable[Expr]):
97
- """
98
- Args:
99
- output_exprs: list of Exprs to be evaluated
100
- columns: list of columns to be materialized
101
- input_exprs: list of Exprs that are excluded from evaluation (because they're already materialized)
102
- TODO: enforce that output_exprs doesn't overlap with input_exprs?
103
- """
114
+ def __init__(
115
+ self,
116
+ output_exprs: Sequence[Expr],
117
+ columns: Sequence[catalog.Column],
118
+ input_exprs: Iterable[Expr],
119
+ tbl: catalog.TableVersion | None = None,
120
+ for_view_load: bool = False,
121
+ ):
122
+ from .column_property_ref import ColumnPropertyRef
123
+ from .column_ref import ColumnRef
124
+
104
125
  self.unique_exprs: ExprSet[Expr] = ExprSet() # dependencies precede their dependents
105
126
  self.next_slot_idx = 0
106
127
 
107
- # record input and output exprs; make copies to avoid reusing execution state
128
+ # record input exprs; make copies to avoid reusing execution state
108
129
  unique_input_exprs = [self._record_unique_expr(e.copy(), recursive=False) for e in input_exprs]
130
+
109
131
  self.input_expr_slot_idxs = {e.slot_idx for e in unique_input_exprs}
110
132
 
111
133
  resolve_cols = set(columns)
112
134
  self.output_exprs = ExprSet(
113
- [
114
- self._record_unique_expr(e.copy().resolve_computed_cols(resolve_cols=resolve_cols), recursive=True)
115
- for e in output_exprs
116
- ]
135
+ self._record_unique_expr(e.copy().resolve_computed_cols(resolve_cols=resolve_cols), recursive=True)
136
+ for e in output_exprs
117
137
  )
118
138
 
119
139
  # if init(columns):
120
- # - we are creating table rows and need to record columns for create_table_row()
140
+ # - we are creating table rows and need to record columns for create_store_table_row()
121
141
  # - output_exprs materialize those columns
122
142
  # - input_exprs are ColumnRefs of the non-computed columns (ie, what needs to be provided as input)
123
143
  # - media validation:
124
144
  # * for write-validated columns, we need to create validating ColumnRefs
125
145
  # * further references to that column (eg, computed cols) need to resolve to the validating ColumnRef
126
- from .column_ref import ColumnRef
127
146
 
128
- self.table_columns: list[ColumnSlotIdx] = []
147
+ self.for_view_load = for_view_load
148
+ self.tbl = tbl
149
+ self.table_columns = {}
129
150
  self.input_exprs = ExprSet()
130
151
  validating_colrefs: dict[Expr, Expr] = {} # key: non-validating colref, value: corresp. validating colref
152
+
131
153
  for col in columns:
132
154
  expr: Expr
133
155
  if col.is_computed:
@@ -168,24 +190,39 @@ class RowBuilder:
168
190
  # because that would cause them to be evaluated for every single row
169
191
  # - the separate eval ctx allows the ColumnRef to materialize the iterator args only when the underlying
170
192
  # iterated object changes
193
+
171
194
  col_refs = [e for e in self.unique_exprs if isinstance(e, ColumnRef)]
172
195
 
173
196
  def refs_unstored_iter_col(col_ref: ColumnRef) -> bool:
174
- tbl = col_ref.col.tbl
175
- return (
176
- tbl.get().is_component_view and tbl.get().is_iterator_column(col_ref.col) and not col_ref.col.is_stored
177
- )
197
+ tbl = col_ref.col.get_tbl()
198
+ return tbl.is_component_view and tbl.is_iterator_column(col_ref.col) and not col_ref.col.is_stored
178
199
 
179
200
  unstored_iter_col_refs = [col_ref for col_ref in col_refs if refs_unstored_iter_col(col_ref)]
180
- component_views = [col_ref.col.tbl for col_ref in unstored_iter_col_refs]
181
- unstored_iter_args = {view.id: view.get().iterator_args.copy() for view in component_views}
201
+ component_views = [col_ref.col.get_tbl() for col_ref in unstored_iter_col_refs]
202
+ unstored_iter_args = {view.id: view.iterator_args.copy() for view in component_views}
203
+
204
+ # the *stored* output columns of the unstored iterators
205
+ self.unstored_iter_outputs = {
206
+ view.id: [
207
+ self._record_unique_expr(ColumnRef(col), recursive=True)
208
+ for col in view.iterator_columns()
209
+ if col.is_stored
210
+ ]
211
+ for view in component_views
212
+ }
213
+
182
214
  self.unstored_iter_args = {
183
- id: self._record_unique_expr(arg, recursive=True) for id, arg in unstored_iter_args.items()
215
+ id: self._record_unique_expr(args, recursive=True) for id, args in unstored_iter_args.items()
184
216
  }
185
217
 
218
+ unstored_iter_col_refs = [
219
+ self._record_unique_expr(col_ref, recursive=True) for col_ref in unstored_iter_col_refs
220
+ ]
221
+
186
222
  for col_ref in unstored_iter_col_refs:
187
- iter_arg_ctx = self.create_eval_ctx([unstored_iter_args[col_ref.col.tbl.id]])
188
- col_ref.set_iter_arg_ctx(iter_arg_ctx)
223
+ iter_arg_ctx = self.create_eval_ctx([self.unstored_iter_args[col_ref.col.get_tbl().id]])
224
+ iter_outputs = self.unstored_iter_outputs[col_ref.col.get_tbl().id]
225
+ col_ref.set_iter_arg_ctx(iter_arg_ctx, iter_outputs)
189
226
 
190
227
  # we guarantee that we can compute the expr DAG in a single front-to-back pass
191
228
  for i, expr in enumerate(self.unique_exprs):
@@ -196,14 +233,13 @@ class RowBuilder:
196
233
  # self.dependents = np.zeros((self.num_materialized, self.num_materialized), dtype=bool)
197
234
  self.dependencies = np.zeros((self.num_materialized, self.num_materialized), dtype=bool)
198
235
  exc_dependencies: list[set[int]] = [set() for _ in range(self.num_materialized)]
199
- from .column_property_ref import ColumnPropertyRef
200
236
 
201
237
  for expr in self.unique_exprs:
202
238
  if expr.slot_idx in self.input_expr_slot_idxs:
203
239
  # this is input and therefore doesn't depend on other exprs
204
240
  continue
205
241
  # error properties don't have exceptions themselves
206
- if isinstance(expr, ColumnPropertyRef) and expr.is_error_prop():
242
+ if isinstance(expr, ColumnPropertyRef) and expr.is_cellmd_prop():
207
243
  continue
208
244
  dependency_idxs = [d.slot_idx for d in expr.dependencies()]
209
245
  self.dependencies[expr.slot_idx, dependency_idxs] = True
@@ -229,13 +265,32 @@ class RowBuilder:
229
265
  for e in self.output_exprs:
230
266
  self._record_output_expr_id(e, e.slot_idx)
231
267
 
268
+ self.img_slot_idxs = [e.slot_idx for e in self.unique_exprs if e.col_type.is_image_type()]
269
+ self.media_slot_idxs = [
270
+ e.slot_idx for e in self.unique_exprs if e.col_type.is_media_type() and not e.col_type.is_image_type()
271
+ ]
272
+ self.array_slot_idxs = [e.slot_idx for e in self.unique_exprs if e.col_type.is_array_type()]
273
+ self.json_slot_idxs = [e.slot_idx for e in self.unique_exprs if e.col_type.is_json_type()]
274
+
232
275
  def add_table_column(self, col: catalog.Column, slot_idx: int) -> None:
233
- """Record a column that is part of the table row"""
234
- self.table_columns.append(ColumnSlotIdx(col, slot_idx))
276
+ """Record an output column for which the value is produced via expr evaluation"""
277
+ assert self.tbl is not None
278
+ assert col.is_stored
279
+ self.table_columns[col] = slot_idx
235
280
 
236
- def output_slot_idxs(self) -> list[ColumnSlotIdx]:
237
- """Return ColumnSlotIdx for output columns"""
238
- return self.table_columns
281
+ def add_table_columns(self, cols: list[catalog.Column]) -> None:
282
+ """Record output columns whose values are materialized into DataRow.cell_vals"""
283
+ for col in cols:
284
+ self.table_columns[col] = None
285
+
286
+ @property
287
+ def media_output_col_info(self) -> list[ColumnSlotIdx]:
288
+ """Return slot idxs for media output columns whose values are produced by expr evaluation"""
289
+ return [
290
+ ColumnSlotIdx(col, slot_idx)
291
+ for col, slot_idx in self.table_columns.items()
292
+ if col.col_type.is_media_type() and slot_idx is not None
293
+ ]
239
294
 
240
295
  @property
241
296
  def num_materialized(self) -> int:
@@ -250,7 +305,9 @@ class RowBuilder:
250
305
  self.next_slot_idx += 1
251
306
  return result
252
307
 
253
- def _record_unique_expr(self, expr: Expr, recursive: bool) -> Expr:
308
+ T = TypeVar('T', bound=Expr)
309
+
310
+ def _record_unique_expr(self, expr: T, recursive: bool) -> T:
254
311
  """Records the expr if it's not a duplicate and assigns a slot idx to expr and its components"
255
312
  Returns:
256
313
  the unique expr
@@ -279,7 +336,7 @@ class RowBuilder:
279
336
  self._record_output_expr_id(d, output_expr_id)
280
337
 
281
338
  def _compute_dependencies(
282
- self, target_slot_idxs: list[int], excluded_slot_idxs: list[int], target_scope: Optional[ExprScope] = None
339
+ self, target_slot_idxs: list[int], excluded_slot_idxs: list[int], target_scope: ExprScope | None = None
283
340
  ) -> list[int]:
284
341
  """Compute exprs needed to materialize the given target slots, excluding 'excluded_slot_idxs'
285
342
 
@@ -333,7 +390,7 @@ class RowBuilder:
333
390
  self.__set_slot_idxs_aux(c)
334
391
 
335
392
  def get_dependencies(
336
- self, targets: Iterable[Expr], exclude: Optional[Iterable[Expr]] = None, limit_scope: bool = True
393
+ self, targets: Iterable[Expr], exclude: Iterable[Expr] | None = None, limit_scope: bool = True
337
394
  ) -> list[Expr]:
338
395
  """
339
396
  Return list of dependencies needed to evaluate the given target exprs (expressed as slot idxs).
@@ -351,7 +408,7 @@ class RowBuilder:
351
408
  return []
352
409
  # make sure we only refer to recorded exprs
353
410
  targets = [self.unique_exprs[e] for e in targets]
354
- target_scope: Optional[ExprScope] = None
411
+ target_scope: ExprScope | None = None
355
412
  if limit_scope:
356
413
  # make sure all targets are from the same scope
357
414
  target_scopes = {e.scope() for e in targets}
@@ -369,7 +426,7 @@ class RowBuilder:
369
426
  return [self.unique_exprs[id] for id in result_ids]
370
427
 
371
428
  def create_eval_ctx(
372
- self, targets: Iterable[Expr], exclude: Optional[Iterable[Expr]] = None, limit_scope: bool = True
429
+ self, targets: Iterable[Expr], exclude: Iterable[Expr] | None = None, limit_scope: bool = True
373
430
  ) -> EvalCtx:
374
431
  """Return EvalCtx for targets"""
375
432
  targets = list(targets)
@@ -398,9 +455,9 @@ class RowBuilder:
398
455
  self,
399
456
  data_row: DataRow,
400
457
  ctx: EvalCtx,
401
- profile: Optional[ExecProfile] = None,
458
+ profile: ExecProfile | None = None,
402
459
  ignore_errors: bool = False,
403
- force_eval: Optional[ExprScope] = None,
460
+ force_eval: ExprScope | None = None,
404
461
  ) -> None:
405
462
  """
406
463
  Populates the slots in data_row given in ctx.
@@ -429,33 +486,76 @@ class RowBuilder:
429
486
  expr, f'expression {expr}', data_row.get_exc(expr.slot_idx), exc_tb, input_vals, 0
430
487
  ) from exc
431
488
 
432
- def create_table_row(self, data_row: DataRow, exc_col_ids: set[int]) -> tuple[dict[str, Any], int]:
433
- """Create a table row from the slots that have an output column assigned
489
+ def create_store_table_row(
490
+ self, data_row: DataRow, cols_with_excs: set[int] | None, pk: tuple[int, ...]
491
+ ) -> tuple[list[Any], int]:
492
+ """Create a store table row from the slots that have an output column assigned
434
493
 
435
- Return tuple[dict that represents a stored row (can be passed to sql.insert()), # of exceptions]
494
+ Return tuple[list of row values in `self.table_columns` order, # of exceptions]
436
495
  This excludes system columns.
496
+ Row values are converted to their store type.
437
497
  """
498
+ from pixeltable.exprs.column_property_ref import ColumnPropertyRef
499
+
438
500
  num_excs = 0
439
- table_row: dict[str, Any] = {}
440
- for info in self.table_columns:
441
- col, slot_idx = info.col, info.slot_idx
501
+ table_row: list[Any] = list(pk)
502
+ # Nulls in JSONB columns need to be stored as sql.sql.null(), otherwise it stores a json 'null'
503
+ for col, slot_idx in self.table_columns.items():
504
+ if col.id in data_row.cell_vals:
505
+ table_row.append(data_row.cell_vals[col.id])
506
+ if col.stores_cellmd:
507
+ if data_row.cell_md[col.id] is None:
508
+ table_row.append(sql.sql.null())
509
+ else:
510
+ # we want to minimize the size of the stored dict and use dict_factory to remove Nones
511
+ md = dataclasses.asdict(data_row.cell_md[col.id], dict_factory=non_none_dict_factory)
512
+ assert len(md) > 0
513
+ table_row.append(md)
514
+ if slot_idx is not None and data_row.has_exc(slot_idx):
515
+ num_excs += 1
516
+ if cols_with_excs is not None:
517
+ cols_with_excs.add(col.id)
518
+ continue
519
+
442
520
  if data_row.has_exc(slot_idx):
443
- # exceptions get stored in the errortype/-msg columns
444
521
  exc = data_row.get_exc(slot_idx)
445
522
  num_excs += 1
446
- exc_col_ids.add(col.id)
447
- table_row[col.store_name()] = None
448
- table_row[col.errortype_store_name()] = type(exc).__name__
449
- table_row[col.errormsg_store_name()] = str(exc)
523
+ if cols_with_excs is not None:
524
+ cols_with_excs.add(col.id)
525
+ table_row.append(sql.sql.null() if col.col_type.is_json_type() else None)
526
+ if col.stores_cellmd:
527
+ # exceptions get stored in the errortype/-msg properties of the cellmd column
528
+ table_row.append(ColumnPropertyRef.create_cellmd_exc(exc))
450
529
  else:
451
- if col.col_type.is_image_type() and data_row.file_urls[slot_idx] is None:
452
- # we have yet to store this image
453
- filepath = str(MediaStore.prepare_media_path(col.tbl.id, col.id, col.tbl.get().version))
454
- data_row.flush_img(slot_idx, filepath)
455
- val = data_row.get_stored_val(slot_idx, col.sa_col.type)
456
- table_row[col.store_name()] = val
457
- # we unfortunately need to set these, even if there are no errors
458
- table_row[col.errortype_store_name()] = None
459
- table_row[col.errormsg_store_name()] = None
530
+ val = data_row.get_stored_val(slot_idx, col.sa_col_type)
531
+ table_row.append(val)
532
+ if col.stores_cellmd:
533
+ table_row.append(sql.sql.null()) # placeholder for cellmd column
460
534
 
461
535
  return table_row, num_excs
536
+
537
+ def store_column_names(self) -> list[str]:
538
+ """
539
+ Returns the list of store column names corresponding to the table_columns of this RowBuilder.
540
+ The second tuple element of the return value is a dictionary containing all media columns in the
541
+ table; it's the mapping {list_index: column}.
542
+ """
543
+ assert self.tbl is not None, self.table_columns
544
+ store_col_names: list[str] = [pk_col.name for pk_col in self.tbl.store_tbl.pk_columns()]
545
+
546
+ for col in self.table_columns:
547
+ store_col_names.append(col.store_name())
548
+ if col.stores_cellmd:
549
+ store_col_names.append(col.cellmd_store_name())
550
+
551
+ return store_col_names
552
+
553
+ def make_row(self) -> exprs.DataRow:
554
+ """Creates a new DataRow with the current row_builder's configuration."""
555
+ return exprs.DataRow(
556
+ size=self.num_materialized,
557
+ img_slot_idxs=self.img_slot_idxs,
558
+ media_slot_idxs=self.media_slot_idxs,
559
+ array_slot_idxs=self.array_slot_idxs,
560
+ json_slot_idxs=self.json_slot_idxs,
561
+ )
@@ -1,17 +1,21 @@
1
1
  from __future__ import annotations
2
2
 
3
- from typing import Any, Optional, cast
3
+ import logging
4
+ from typing import Any, cast
4
5
  from uuid import UUID
5
6
 
6
7
  import sqlalchemy as sql
7
8
 
8
9
  from pixeltable import catalog, type_system as ts
10
+ from pixeltable.catalog.table_version import TableVersionKey
9
11
 
10
12
  from .data_row import DataRow
11
13
  from .expr import Expr
12
14
  from .row_builder import RowBuilder
13
15
  from .sql_element_cache import SqlElementCache
14
16
 
17
+ _logger = logging.getLogger('pixeltable')
18
+
15
19
 
16
20
  class RowidRef(Expr):
17
21
  """A reference to a part of a table rowid
@@ -22,18 +26,18 @@ class RowidRef(Expr):
22
26
  (with and without a TableVersion).
23
27
  """
24
28
 
25
- tbl: Optional[catalog.TableVersionHandle]
26
- normalized_base: Optional[catalog.TableVersionHandle]
29
+ tbl: catalog.TableVersionHandle | None
30
+ normalized_base: catalog.TableVersionHandle | None
27
31
  tbl_id: UUID
28
32
  normalized_base_id: UUID
29
33
  rowid_component_idx: int
30
34
 
31
35
  def __init__(
32
36
  self,
33
- tbl: Optional[catalog.TableVersionHandle],
37
+ tbl: catalog.TableVersionHandle | None,
34
38
  idx: int,
35
- tbl_id: Optional[UUID] = None,
36
- normalized_base_id: Optional[UUID] = None,
39
+ tbl_id: UUID | None = None,
40
+ normalized_base_id: UUID | None = None,
37
41
  ):
38
42
  super().__init__(ts.IntType(nullable=False))
39
43
  self.tbl = tbl
@@ -54,7 +58,7 @@ class RowidRef(Expr):
54
58
  self.rowid_component_idx = idx
55
59
  self.id = self._create_id()
56
60
 
57
- def default_column_name(self) -> Optional[str]:
61
+ def default_column_name(self) -> str | None:
58
62
  return str(self)
59
63
 
60
64
  def _equals(self, other: RowidRef) -> bool:
@@ -74,7 +78,11 @@ class RowidRef(Expr):
74
78
  # check if this is the pos column of a component view
75
79
  from pixeltable import store
76
80
 
77
- tbl = self.tbl.get() if self.tbl is not None else catalog.Catalog.get().get_tbl_version(self.tbl_id, None)
81
+ tbl = (
82
+ self.tbl.get()
83
+ if self.tbl is not None
84
+ else catalog.Catalog.get().get_tbl_version(TableVersionKey(self.tbl_id, None, None))
85
+ )
78
86
  if (
79
87
  tbl.is_component_view
80
88
  and self.rowid_component_idx == cast(store.StoreComponentView, tbl.store_tbl).pos_col_idx
@@ -95,8 +103,13 @@ class RowidRef(Expr):
95
103
  self.tbl = tbl.tbl_version
96
104
  self.tbl_id = self.tbl.id
97
105
 
98
- def sql_expr(self, _: SqlElementCache) -> Optional[sql.ColumnElement]:
99
- tbl = self.tbl.get() if self.tbl is not None else catalog.Catalog.get().get_tbl_version(self.tbl_id, None)
106
+ def sql_expr(self, _: SqlElementCache) -> sql.ColumnElement | None:
107
+ tbl = (
108
+ self.tbl.get()
109
+ if self.tbl is not None
110
+ else catalog.Catalog.get().get_tbl_version(TableVersionKey(self.tbl_id, None, None))
111
+ )
112
+ assert tbl.is_validated
100
113
  rowid_cols = tbl.store_tbl.rowid_columns()
101
114
  assert self.rowid_component_idx <= len(rowid_cols), (
102
115
  f'{self.rowid_component_idx} not consistent with {rowid_cols}'
@@ -107,6 +120,8 @@ class RowidRef(Expr):
107
120
  data_row[self.slot_idx] = data_row.pk[self.rowid_component_idx]
108
121
 
109
122
  def _as_dict(self) -> dict:
123
+ # TODO: Serialize the full TableVersionHandle, not just the UUID
124
+ assert self.tbl is None or self.tbl.anchor_tbl_id is None # TODO: support anchor_tbl_id for view-over-replica
110
125
  return {
111
126
  'tbl_id': str(self.tbl_id),
112
127
  'normalized_base_id': str(self.normalized_base_id),