pixeltable 0.2.26__py3-none-any.whl → 0.5.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (245) hide show
  1. pixeltable/__init__.py +83 -19
  2. pixeltable/_query.py +1444 -0
  3. pixeltable/_version.py +1 -0
  4. pixeltable/catalog/__init__.py +7 -4
  5. pixeltable/catalog/catalog.py +2394 -119
  6. pixeltable/catalog/column.py +225 -104
  7. pixeltable/catalog/dir.py +38 -9
  8. pixeltable/catalog/globals.py +53 -34
  9. pixeltable/catalog/insertable_table.py +265 -115
  10. pixeltable/catalog/path.py +80 -17
  11. pixeltable/catalog/schema_object.py +28 -43
  12. pixeltable/catalog/table.py +1270 -677
  13. pixeltable/catalog/table_metadata.py +103 -0
  14. pixeltable/catalog/table_version.py +1270 -751
  15. pixeltable/catalog/table_version_handle.py +109 -0
  16. pixeltable/catalog/table_version_path.py +137 -42
  17. pixeltable/catalog/tbl_ops.py +53 -0
  18. pixeltable/catalog/update_status.py +191 -0
  19. pixeltable/catalog/view.py +251 -134
  20. pixeltable/config.py +215 -0
  21. pixeltable/env.py +736 -285
  22. pixeltable/exceptions.py +26 -2
  23. pixeltable/exec/__init__.py +7 -2
  24. pixeltable/exec/aggregation_node.py +39 -21
  25. pixeltable/exec/cache_prefetch_node.py +87 -109
  26. pixeltable/exec/cell_materialization_node.py +268 -0
  27. pixeltable/exec/cell_reconstruction_node.py +168 -0
  28. pixeltable/exec/component_iteration_node.py +25 -28
  29. pixeltable/exec/data_row_batch.py +11 -46
  30. pixeltable/exec/exec_context.py +26 -11
  31. pixeltable/exec/exec_node.py +35 -27
  32. pixeltable/exec/expr_eval/__init__.py +3 -0
  33. pixeltable/exec/expr_eval/evaluators.py +365 -0
  34. pixeltable/exec/expr_eval/expr_eval_node.py +413 -0
  35. pixeltable/exec/expr_eval/globals.py +200 -0
  36. pixeltable/exec/expr_eval/row_buffer.py +74 -0
  37. pixeltable/exec/expr_eval/schedulers.py +413 -0
  38. pixeltable/exec/globals.py +35 -0
  39. pixeltable/exec/in_memory_data_node.py +35 -27
  40. pixeltable/exec/object_store_save_node.py +293 -0
  41. pixeltable/exec/row_update_node.py +44 -29
  42. pixeltable/exec/sql_node.py +414 -115
  43. pixeltable/exprs/__init__.py +8 -5
  44. pixeltable/exprs/arithmetic_expr.py +79 -45
  45. pixeltable/exprs/array_slice.py +5 -5
  46. pixeltable/exprs/column_property_ref.py +40 -26
  47. pixeltable/exprs/column_ref.py +254 -61
  48. pixeltable/exprs/comparison.py +14 -9
  49. pixeltable/exprs/compound_predicate.py +9 -10
  50. pixeltable/exprs/data_row.py +213 -72
  51. pixeltable/exprs/expr.py +270 -104
  52. pixeltable/exprs/expr_dict.py +6 -5
  53. pixeltable/exprs/expr_set.py +20 -11
  54. pixeltable/exprs/function_call.py +383 -284
  55. pixeltable/exprs/globals.py +18 -5
  56. pixeltable/exprs/in_predicate.py +7 -7
  57. pixeltable/exprs/inline_expr.py +37 -37
  58. pixeltable/exprs/is_null.py +8 -4
  59. pixeltable/exprs/json_mapper.py +120 -54
  60. pixeltable/exprs/json_path.py +90 -60
  61. pixeltable/exprs/literal.py +61 -16
  62. pixeltable/exprs/method_ref.py +7 -6
  63. pixeltable/exprs/object_ref.py +19 -8
  64. pixeltable/exprs/row_builder.py +238 -75
  65. pixeltable/exprs/rowid_ref.py +53 -15
  66. pixeltable/exprs/similarity_expr.py +65 -50
  67. pixeltable/exprs/sql_element_cache.py +5 -5
  68. pixeltable/exprs/string_op.py +107 -0
  69. pixeltable/exprs/type_cast.py +25 -13
  70. pixeltable/exprs/variable.py +2 -2
  71. pixeltable/func/__init__.py +9 -5
  72. pixeltable/func/aggregate_function.py +197 -92
  73. pixeltable/func/callable_function.py +119 -35
  74. pixeltable/func/expr_template_function.py +101 -48
  75. pixeltable/func/function.py +375 -62
  76. pixeltable/func/function_registry.py +20 -19
  77. pixeltable/func/globals.py +6 -5
  78. pixeltable/func/mcp.py +74 -0
  79. pixeltable/func/query_template_function.py +151 -35
  80. pixeltable/func/signature.py +178 -49
  81. pixeltable/func/tools.py +164 -0
  82. pixeltable/func/udf.py +176 -53
  83. pixeltable/functions/__init__.py +44 -4
  84. pixeltable/functions/anthropic.py +226 -47
  85. pixeltable/functions/audio.py +148 -11
  86. pixeltable/functions/bedrock.py +137 -0
  87. pixeltable/functions/date.py +188 -0
  88. pixeltable/functions/deepseek.py +113 -0
  89. pixeltable/functions/document.py +81 -0
  90. pixeltable/functions/fal.py +76 -0
  91. pixeltable/functions/fireworks.py +72 -20
  92. pixeltable/functions/gemini.py +249 -0
  93. pixeltable/functions/globals.py +208 -53
  94. pixeltable/functions/groq.py +108 -0
  95. pixeltable/functions/huggingface.py +1088 -95
  96. pixeltable/functions/image.py +155 -84
  97. pixeltable/functions/json.py +8 -11
  98. pixeltable/functions/llama_cpp.py +31 -19
  99. pixeltable/functions/math.py +169 -0
  100. pixeltable/functions/mistralai.py +50 -75
  101. pixeltable/functions/net.py +70 -0
  102. pixeltable/functions/ollama.py +29 -36
  103. pixeltable/functions/openai.py +548 -160
  104. pixeltable/functions/openrouter.py +143 -0
  105. pixeltable/functions/replicate.py +15 -14
  106. pixeltable/functions/reve.py +250 -0
  107. pixeltable/functions/string.py +310 -85
  108. pixeltable/functions/timestamp.py +37 -19
  109. pixeltable/functions/together.py +77 -120
  110. pixeltable/functions/twelvelabs.py +188 -0
  111. pixeltable/functions/util.py +7 -2
  112. pixeltable/functions/uuid.py +30 -0
  113. pixeltable/functions/video.py +1528 -117
  114. pixeltable/functions/vision.py +26 -26
  115. pixeltable/functions/voyageai.py +289 -0
  116. pixeltable/functions/whisper.py +19 -10
  117. pixeltable/functions/whisperx.py +179 -0
  118. pixeltable/functions/yolox.py +112 -0
  119. pixeltable/globals.py +716 -236
  120. pixeltable/index/__init__.py +3 -1
  121. pixeltable/index/base.py +17 -21
  122. pixeltable/index/btree.py +32 -22
  123. pixeltable/index/embedding_index.py +155 -92
  124. pixeltable/io/__init__.py +12 -7
  125. pixeltable/io/datarows.py +140 -0
  126. pixeltable/io/external_store.py +83 -125
  127. pixeltable/io/fiftyone.py +24 -33
  128. pixeltable/io/globals.py +47 -182
  129. pixeltable/io/hf_datasets.py +96 -127
  130. pixeltable/io/label_studio.py +171 -156
  131. pixeltable/io/lancedb.py +3 -0
  132. pixeltable/io/pandas.py +136 -115
  133. pixeltable/io/parquet.py +40 -153
  134. pixeltable/io/table_data_conduit.py +702 -0
  135. pixeltable/io/utils.py +100 -0
  136. pixeltable/iterators/__init__.py +8 -4
  137. pixeltable/iterators/audio.py +207 -0
  138. pixeltable/iterators/base.py +9 -3
  139. pixeltable/iterators/document.py +144 -87
  140. pixeltable/iterators/image.py +17 -38
  141. pixeltable/iterators/string.py +15 -12
  142. pixeltable/iterators/video.py +523 -127
  143. pixeltable/metadata/__init__.py +33 -8
  144. pixeltable/metadata/converters/convert_10.py +2 -3
  145. pixeltable/metadata/converters/convert_13.py +2 -2
  146. pixeltable/metadata/converters/convert_15.py +15 -11
  147. pixeltable/metadata/converters/convert_16.py +4 -5
  148. pixeltable/metadata/converters/convert_17.py +4 -5
  149. pixeltable/metadata/converters/convert_18.py +4 -6
  150. pixeltable/metadata/converters/convert_19.py +6 -9
  151. pixeltable/metadata/converters/convert_20.py +3 -6
  152. pixeltable/metadata/converters/convert_21.py +6 -8
  153. pixeltable/metadata/converters/convert_22.py +3 -2
  154. pixeltable/metadata/converters/convert_23.py +33 -0
  155. pixeltable/metadata/converters/convert_24.py +55 -0
  156. pixeltable/metadata/converters/convert_25.py +19 -0
  157. pixeltable/metadata/converters/convert_26.py +23 -0
  158. pixeltable/metadata/converters/convert_27.py +29 -0
  159. pixeltable/metadata/converters/convert_28.py +13 -0
  160. pixeltable/metadata/converters/convert_29.py +110 -0
  161. pixeltable/metadata/converters/convert_30.py +63 -0
  162. pixeltable/metadata/converters/convert_31.py +11 -0
  163. pixeltable/metadata/converters/convert_32.py +15 -0
  164. pixeltable/metadata/converters/convert_33.py +17 -0
  165. pixeltable/metadata/converters/convert_34.py +21 -0
  166. pixeltable/metadata/converters/convert_35.py +9 -0
  167. pixeltable/metadata/converters/convert_36.py +38 -0
  168. pixeltable/metadata/converters/convert_37.py +15 -0
  169. pixeltable/metadata/converters/convert_38.py +39 -0
  170. pixeltable/metadata/converters/convert_39.py +124 -0
  171. pixeltable/metadata/converters/convert_40.py +73 -0
  172. pixeltable/metadata/converters/convert_41.py +12 -0
  173. pixeltable/metadata/converters/convert_42.py +9 -0
  174. pixeltable/metadata/converters/convert_43.py +44 -0
  175. pixeltable/metadata/converters/util.py +44 -18
  176. pixeltable/metadata/notes.py +21 -0
  177. pixeltable/metadata/schema.py +185 -42
  178. pixeltable/metadata/utils.py +74 -0
  179. pixeltable/mypy/__init__.py +3 -0
  180. pixeltable/mypy/mypy_plugin.py +123 -0
  181. pixeltable/plan.py +616 -225
  182. pixeltable/share/__init__.py +3 -0
  183. pixeltable/share/packager.py +797 -0
  184. pixeltable/share/protocol/__init__.py +33 -0
  185. pixeltable/share/protocol/common.py +165 -0
  186. pixeltable/share/protocol/operation_types.py +33 -0
  187. pixeltable/share/protocol/replica.py +119 -0
  188. pixeltable/share/publish.py +349 -0
  189. pixeltable/store.py +398 -232
  190. pixeltable/type_system.py +730 -267
  191. pixeltable/utils/__init__.py +40 -0
  192. pixeltable/utils/arrow.py +201 -29
  193. pixeltable/utils/av.py +298 -0
  194. pixeltable/utils/azure_store.py +346 -0
  195. pixeltable/utils/coco.py +26 -27
  196. pixeltable/utils/code.py +4 -4
  197. pixeltable/utils/console_output.py +46 -0
  198. pixeltable/utils/coroutine.py +24 -0
  199. pixeltable/utils/dbms.py +92 -0
  200. pixeltable/utils/description_helper.py +11 -12
  201. pixeltable/utils/documents.py +60 -61
  202. pixeltable/utils/exception_handler.py +36 -0
  203. pixeltable/utils/filecache.py +38 -22
  204. pixeltable/utils/formatter.py +88 -51
  205. pixeltable/utils/gcs_store.py +295 -0
  206. pixeltable/utils/http.py +133 -0
  207. pixeltable/utils/http_server.py +14 -13
  208. pixeltable/utils/iceberg.py +13 -0
  209. pixeltable/utils/image.py +17 -0
  210. pixeltable/utils/lancedb.py +90 -0
  211. pixeltable/utils/local_store.py +322 -0
  212. pixeltable/utils/misc.py +5 -0
  213. pixeltable/utils/object_stores.py +573 -0
  214. pixeltable/utils/pydantic.py +60 -0
  215. pixeltable/utils/pytorch.py +20 -20
  216. pixeltable/utils/s3_store.py +527 -0
  217. pixeltable/utils/sql.py +32 -5
  218. pixeltable/utils/system.py +30 -0
  219. pixeltable/utils/transactional_directory.py +4 -3
  220. pixeltable-0.5.7.dist-info/METADATA +579 -0
  221. pixeltable-0.5.7.dist-info/RECORD +227 -0
  222. {pixeltable-0.2.26.dist-info → pixeltable-0.5.7.dist-info}/WHEEL +1 -1
  223. pixeltable-0.5.7.dist-info/entry_points.txt +2 -0
  224. pixeltable/__version__.py +0 -3
  225. pixeltable/catalog/named_function.py +0 -36
  226. pixeltable/catalog/path_dict.py +0 -141
  227. pixeltable/dataframe.py +0 -894
  228. pixeltable/exec/expr_eval_node.py +0 -232
  229. pixeltable/ext/__init__.py +0 -14
  230. pixeltable/ext/functions/__init__.py +0 -8
  231. pixeltable/ext/functions/whisperx.py +0 -77
  232. pixeltable/ext/functions/yolox.py +0 -157
  233. pixeltable/tool/create_test_db_dump.py +0 -311
  234. pixeltable/tool/create_test_video.py +0 -81
  235. pixeltable/tool/doc_plugins/griffe.py +0 -50
  236. pixeltable/tool/doc_plugins/mkdocstrings.py +0 -6
  237. pixeltable/tool/doc_plugins/templates/material/udf.html.jinja +0 -135
  238. pixeltable/tool/embed_udf.py +0 -9
  239. pixeltable/tool/mypy_plugin.py +0 -55
  240. pixeltable/utils/media_store.py +0 -76
  241. pixeltable/utils/s3.py +0 -16
  242. pixeltable-0.2.26.dist-info/METADATA +0 -400
  243. pixeltable-0.2.26.dist-info/RECORD +0 -156
  244. pixeltable-0.2.26.dist-info/entry_points.txt +0 -3
  245. {pixeltable-0.2.26.dist-info → pixeltable-0.5.7.dist-info/licenses}/LICENSE +0 -0
@@ -1,21 +1,29 @@
1
1
  from __future__ import annotations
2
2
 
3
- from typing import Any, Optional, Sequence
3
+ import warnings
4
+ from typing import TYPE_CHECKING, Any, Sequence, cast
4
5
  from uuid import UUID
5
6
 
7
+ import PIL.Image
6
8
  import sqlalchemy as sql
7
9
 
8
- import pixeltable as pxt
9
10
  import pixeltable.catalog as catalog
10
11
  import pixeltable.exceptions as excs
11
12
  import pixeltable.iterators as iters
13
+ import pixeltable.type_system as ts
14
+ from pixeltable.catalog.table_version import TableVersionKey
12
15
 
13
16
  from ..utils.description_helper import DescriptionHelper
17
+ from ..utils.filecache import FileCache
14
18
  from .data_row import DataRow
15
19
  from .expr import Expr
20
+ from .literal import Literal
16
21
  from .row_builder import RowBuilder
17
22
  from .sql_element_cache import SqlElementCache
18
23
 
24
+ if TYPE_CHECKING:
25
+ from pixeltable._query import Query, ResultSet
26
+
19
27
 
20
28
  class ColumnRef(Expr):
21
29
  """A reference to a table column
@@ -32,34 +40,48 @@ class ColumnRef(Expr):
32
40
  - in that case, the ColumnRef also instantiates a second non-validating ColumnRef as a component (= dependency)
33
41
  - the non-validating ColumnRef is used for SQL translation
34
42
 
43
+ A ColumnRef may have an optional reference table, which carries the context of the ColumnRef resolution. Thus
44
+ if `v` is a view of `t` (for example), then `v.my_col` and `t.my_col` refer to the same underlying column, but
45
+ their reference tables will be `v` and `t`, respectively. This is to ensure correct behavior of expressions such
46
+ as `v.my_col.head()`.
47
+
35
48
  TODO:
36
49
  separate Exprs (like validating ColumnRefs) from the logical expression tree and instead have RowBuilder
37
50
  insert them into the EvalCtxs as needed
38
51
  """
39
52
 
40
- col: catalog.Column
53
+ col: catalog.Column # TODO: merge with col_handle
54
+ col_handle: catalog.ColumnHandle
55
+ reference_tbl: catalog.TableVersionPath | None
41
56
  is_unstored_iter_col: bool
42
- iter_arg_ctx: Optional[RowBuilder.EvalCtx]
43
- base_rowid_len: int
44
- base_rowid: Sequence[Optional[Any]]
45
- iterator: Optional[iters.ComponentIterator]
46
- pos_idx: Optional[int]
47
- id: int
48
57
  perform_validation: bool # if True, performs media validation
49
-
50
- def __init__(self, col: catalog.Column, perform_validation: Optional[bool] = None):
58
+ iter_arg_ctx: RowBuilder.EvalCtx | None
59
+ iter_outputs: list[ColumnRef] | None
60
+ base_rowid_len: int # number of rowid columns in the base table
61
+
62
+ # execution state
63
+ base_rowid: Sequence[Any | None]
64
+ iterator: iters.ComponentIterator | None
65
+ pos_idx: int
66
+
67
+ def __init__(
68
+ self,
69
+ col: catalog.Column,
70
+ reference_tbl: catalog.TableVersionPath | None = None,
71
+ perform_validation: bool | None = None,
72
+ ):
51
73
  super().__init__(col.col_type)
52
- assert col.tbl is not None
53
74
  self.col = col
54
- self.is_unstored_iter_col = \
55
- col.tbl.is_component_view() and col.tbl.is_iterator_column(col) and not col.is_stored
75
+ self.reference_tbl = reference_tbl
76
+ self.col_handle = col.handle
77
+
78
+ self.is_unstored_iter_col = col.is_iterator_col and not col.is_stored
56
79
  self.iter_arg_ctx = None
57
- # number of rowid columns in the base table
58
- self.base_rowid_len = col.tbl.base.num_rowid_columns() if self.is_unstored_iter_col else 0
59
- self.base_rowid = [None] * self.base_rowid_len
80
+ self.iter_outputs = None
81
+ self.base_rowid_len = 0
82
+ self.base_rowid = []
60
83
  self.iterator = None
61
- # index of the position column in the view's primary key; don't try to reference tbl.store_tbl here
62
- self.pos_idx = col.tbl.num_rowid_columns() - 1 if self.is_unstored_iter_col else None
84
+ self.pos_idx = 0
63
85
 
64
86
  self.perform_validation = False
65
87
  if col.col_type.is_media_type():
@@ -78,34 +100,50 @@ class ColumnRef(Expr):
78
100
  self.components = [non_validating_col_ref]
79
101
  self.id = self._create_id()
80
102
 
81
- def set_iter_arg_ctx(self, iter_arg_ctx: RowBuilder.EvalCtx) -> None:
103
+ def set_iter_arg_ctx(self, iter_arg_ctx: RowBuilder.EvalCtx, iter_outputs: list[ColumnRef]) -> None:
82
104
  self.iter_arg_ctx = iter_arg_ctx
105
+ self.iter_outputs = iter_outputs
106
+ # If this is an unstored iterator column, then the iterator outputs may be needed in order to properly set the
107
+ # iterator position. Therefore, we need to add them as components in order to ensure they're marked as
108
+ # eval dependencies.
109
+ self.components.extend(iter_outputs)
83
110
  assert len(self.iter_arg_ctx.target_slot_idxs) == 1 # a single inline dict
84
111
 
85
112
  def _id_attrs(self) -> list[tuple[str, Any]]:
86
- return (
87
- super()._id_attrs()
88
- + [('tbl_id', self.col.tbl.id), ('col_id', self.col.id), ('perform_validation', self.perform_validation)]
89
- )
113
+ return [
114
+ *super()._id_attrs(),
115
+ ('tbl_id', self.col.tbl_handle.id),
116
+ ('col_id', self.col.id),
117
+ ('perform_validation', self.perform_validation),
118
+ ]
90
119
 
91
120
  # override
92
121
  def _retarget(self, tbl_versions: dict[UUID, catalog.TableVersion]) -> ColumnRef:
93
- target = tbl_versions[self.col.tbl.id]
94
- assert self.col.id in target.cols_by_id
122
+ target = tbl_versions[self.col.tbl_handle.id]
123
+ assert self.col.id in target.cols_by_id, f'{target}: {self.col.id} not in {list(target.cols_by_id.keys())}'
95
124
  col = target.cols_by_id[self.col.id]
96
- return ColumnRef(col)
125
+ return ColumnRef(col, self.reference_tbl)
97
126
 
98
127
  def __getattr__(self, name: str) -> Expr:
99
128
  from .column_property_ref import ColumnPropertyRef
100
129
 
101
130
  # resolve column properties
102
- if name == ColumnPropertyRef.Property.ERRORTYPE.name.lower() \
103
- or name == ColumnPropertyRef.Property.ERRORMSG.name.lower():
104
- if not (self.col.is_computed and self.col.is_stored) and not self.col.col_type.is_media_type():
131
+ if name == ColumnPropertyRef.Property.CELLMD.name.lower():
132
+ # This is not user accessible, but used internally to store cell metadata
133
+ return super().__getattr__(name)
134
+
135
+ if (
136
+ name == ColumnPropertyRef.Property.ERRORTYPE.name.lower()
137
+ or name == ColumnPropertyRef.Property.ERRORMSG.name.lower()
138
+ ):
139
+ is_valid = (self.col.is_computed or self.col.col_type.is_media_type()) and self.col.is_stored
140
+ if not is_valid:
105
141
  raise excs.Error(f'{name} only valid for a stored computed or media column: {self}')
106
142
  return ColumnPropertyRef(self, ColumnPropertyRef.Property[name.upper()])
107
- if name == ColumnPropertyRef.Property.FILEURL.name.lower() \
108
- or name == ColumnPropertyRef.Property.LOCALPATH.name.lower():
143
+ if (
144
+ name == ColumnPropertyRef.Property.FILEURL.name.lower()
145
+ or name == ColumnPropertyRef.Property.LOCALPATH.name.lower()
146
+ ):
109
147
  if not self.col.col_type.is_media_type():
110
148
  raise excs.Error(f'{name} only valid for image/video/audio/document columns: {self}')
111
149
  if self.col.is_computed and not self.col.is_stored:
@@ -114,35 +152,161 @@ class ColumnRef(Expr):
114
152
 
115
153
  if self.col_type.is_json_type():
116
154
  from .json_path import JsonPath
155
+
117
156
  return JsonPath(self, [name])
118
157
 
119
158
  return super().__getattr__(name)
120
159
 
121
- def similarity(self, item: Any, *, idx: Optional[str] = None) -> Expr:
160
+ def recompute(self, *, cascade: bool = True, errors_only: bool = False) -> catalog.UpdateStatus:
161
+ cat = catalog.Catalog.get()
162
+ # lock_mutable_tree=True: we need to be able to see whether any transitive view has column dependents
163
+ with cat.begin_xact(tbl=self.reference_tbl, for_write=True, lock_mutable_tree=True):
164
+ tbl_version = self.col_handle.tbl_version.get()
165
+ if tbl_version.id != self.reference_tbl.tbl_id:
166
+ raise excs.Error('Cannot recompute column of a base.')
167
+ if tbl_version.is_snapshot:
168
+ raise excs.Error('Cannot recompute column of a snapshot.')
169
+ col_name = self.col_handle.get().name
170
+ status = tbl_version.recompute_columns([col_name], errors_only=errors_only, cascade=cascade)
171
+ FileCache.get().emit_eviction_warnings()
172
+ return status
173
+
174
+ def similarity(
175
+ self,
176
+ item: Any = None,
177
+ *,
178
+ string: str | None = None,
179
+ image: PIL.Image.Image | None = None,
180
+ audio: str | None = None,
181
+ video: str | None = None,
182
+ idx: str | None = None,
183
+ ) -> Expr:
122
184
  from .similarity_expr import SimilarityExpr
123
- return SimilarityExpr(self, item, idx_name=idx)
124
185
 
125
- def default_column_name(self) -> Optional[str]:
126
- return str(self)
186
+ if item is not None:
187
+ warnings.warn(
188
+ 'Use of similarity() without specifying an explicit modality is deprecated -- '
189
+ 'since version 0.5.7. Please use one of the following instead:\n'
190
+ ' .similarity(string=...)\n'
191
+ ' .similarity(image=...)\n'
192
+ ' .similarity(audio=...)\n'
193
+ ' .similarity(video=...)',
194
+ DeprecationWarning,
195
+ stacklevel=2,
196
+ )
197
+
198
+ arg_count = (string is not None) + (image is not None) + (audio is not None) + (video is not None)
199
+
200
+ if item is not None and arg_count != 0:
201
+ raise excs.Error('similarity(): `item` is deprecated and cannot be used together with modality arguments')
202
+
203
+ if arg_count > 1:
204
+ raise excs.Error('similarity(): expected exactly one of string=..., image=..., audio=..., video=...')
205
+
206
+ expr: Expr
207
+
208
+ if item is not None:
209
+ if isinstance(item, Expr): # This can happen when using similarity() with @query
210
+ if not (item.col_type.is_string_type() or item.col_type.is_image_type()):
211
+ raise excs.Error(f'similarity(): expected `String` or `Image`; got `{item.col_type}`')
212
+ expr = item
213
+ else:
214
+ if not isinstance(item, (str, PIL.Image.Image)):
215
+ raise excs.Error(f'similarity(): expected `str` or `PIL.Image.Image`; got `{type(item).__name__}`')
216
+ expr = Expr.from_object(item)
217
+ assert expr.col_type.is_string_type() or expr.col_type.is_image_type()
218
+
219
+ if string is not None:
220
+ if isinstance(string, Expr):
221
+ if not string.col_type.is_string_type():
222
+ raise excs.Error(f'similarity(string=...): expected `String`; got `{expr.col_type}`')
223
+ expr = string
224
+ else:
225
+ if not isinstance(string, str):
226
+ raise excs.Error(f'similarity(string=...): expected `str`; got `{type(string).__name__}`')
227
+ expr = Expr.from_object(string)
228
+ assert expr.col_type.is_string_type()
229
+
230
+ if image is not None:
231
+ if isinstance(image, Expr):
232
+ if not image.col_type.is_image_type():
233
+ raise excs.Error(f'similarity(image=...): expected `Image`; got `{image.col_type}`')
234
+ expr = image
235
+ else:
236
+ if not isinstance(image, PIL.Image.Image):
237
+ raise excs.Error(f'similarity(image=...): expected `PIL.Image.Image`; got `{type(image).__name__}`')
238
+ expr = Expr.from_object(image)
239
+ assert expr.col_type.is_image_type()
240
+
241
+ if audio is not None:
242
+ if isinstance(audio, Expr):
243
+ if not audio.col_type.is_audio_type():
244
+ raise excs.Error(f'similarity(audio=...): expected `Audio`; got `{audio.col_type}`')
245
+ expr = audio
246
+ else:
247
+ if not isinstance(audio, str):
248
+ raise excs.Error(
249
+ f'similarity(audio=...): expected `str` (path to audio file); got `{type(audio).__name__}`'
250
+ )
251
+ expr = Literal(audio, ts.AudioType())
252
+
253
+ if video is not None:
254
+ if isinstance(video, Expr):
255
+ if not video.col_type.is_video_type():
256
+ raise excs.Error(f'similarity(video=...): expected `Video`; got `{video.col_type}`')
257
+ expr = video
258
+ else:
259
+ if not isinstance(video, str):
260
+ raise excs.Error(
261
+ f'similarity(video=...): expected `str` (path to video file); got `{type(video).__name__}`'
262
+ )
263
+ expr = Literal(video, ts.VideoType())
264
+
265
+ return SimilarityExpr(self, expr, idx_name=idx)
266
+
267
+ def embedding(self, *, idx: str | None = None) -> ColumnRef:
268
+ from pixeltable.index import EmbeddingIndex
269
+
270
+ idx_info = self.tbl.get().get_idx(self.col, idx, EmbeddingIndex)
271
+ return ColumnRef(idx_info.val_col)
272
+
273
+ @property
274
+ def tbl(self) -> catalog.TableVersionHandle:
275
+ return self.reference_tbl.tbl_version if self.reference_tbl is not None else self.col.tbl_handle
276
+
277
+ def default_column_name(self) -> str | None:
278
+ return self.col.name if self.col is not None else None
127
279
 
128
280
  def _equals(self, other: ColumnRef) -> bool:
129
281
  return self.col == other.col and self.perform_validation == other.perform_validation
130
282
 
131
- def _df(self) -> 'pxt.dataframe.DataFrame':
132
- tbl = catalog.Catalog.get().tbls[self.col.tbl.id]
133
- return tbl.select(self)
283
+ def select(self) -> 'Query':
284
+ import pixeltable.plan as plan
285
+ from pixeltable._query import Query
286
+
287
+ if self.reference_tbl is None:
288
+ # No reference table; use the current version of the table to which the column belongs
289
+ tbl = catalog.Catalog.get().get_table_by_id(self.col.tbl_handle.id)
290
+ return tbl.select(self)
291
+ else:
292
+ # Explicit reference table; construct a Query directly from it
293
+ return Query(plan.FromClause([self.reference_tbl])).select(self)
134
294
 
135
- def show(self, *args, **kwargs) -> 'pxt.dataframe.DataFrameResultSet':
136
- return self._df().show(*args, **kwargs)
295
+ def show(self, *args: Any, **kwargs: Any) -> 'ResultSet':
296
+ return self.select().show(*args, **kwargs)
137
297
 
138
- def head(self, *args, **kwargs) -> 'pxt.dataframe.DataFrameResultSet':
139
- return self._df().head(*args, **kwargs)
298
+ def head(self, *args: Any, **kwargs: Any) -> 'ResultSet':
299
+ return self.select().head(*args, **kwargs)
140
300
 
141
- def tail(self, *args, **kwargs) -> 'pxt.dataframe.DataFrameResultSet':
142
- return self._df().tail(*args, **kwargs)
301
+ def tail(self, *args: Any, **kwargs: Any) -> 'ResultSet':
302
+ return self.select().tail(*args, **kwargs)
143
303
 
144
304
  def count(self) -> int:
145
- return self._df().count()
305
+ return self.select().count()
306
+
307
+ def distinct(self) -> 'Query':
308
+ """Return distinct values in this column."""
309
+ return self.select().distinct()
146
310
 
147
311
  def __str__(self) -> str:
148
312
  if self.col.name is None:
@@ -157,17 +321,32 @@ class ColumnRef(Expr):
157
321
  return self._descriptors().to_html()
158
322
 
159
323
  def _descriptors(self) -> DescriptionHelper:
160
- tbl = catalog.Catalog.get().tbls[self.col.tbl.id]
324
+ with catalog.Catalog.get().begin_xact():
325
+ tbl = catalog.Catalog.get().get_table_by_id(self.col.tbl_handle.id)
161
326
  helper = DescriptionHelper()
162
- helper.append(f'Column\n{self.col.name!r}\n(of table {tbl._path!r})')
327
+ helper.append(f'Column\n{self.col.name!r}\n(of table {tbl._path()!r})')
163
328
  helper.append(tbl._col_descriptor([self.col.name]))
164
329
  idxs = tbl._index_descriptor([self.col.name])
165
330
  if len(idxs) > 0:
166
331
  helper.append(idxs)
167
332
  return helper
168
333
 
169
- def sql_expr(self, _: SqlElementCache) -> Optional[sql.ColumnElement]:
170
- return None if self.perform_validation else self.col.sa_col
334
+ def prepare(self) -> None:
335
+ from pixeltable import store
336
+
337
+ if not self.is_unstored_iter_col:
338
+ return
339
+ col = self.col_handle.get()
340
+ self.base_rowid_len = col.get_tbl().base.get().num_rowid_columns()
341
+ self.base_rowid = [None] * self.base_rowid_len
342
+ assert isinstance(col.get_tbl().store_tbl, store.StoreComponentView)
343
+ self.pos_idx = cast(store.StoreComponentView, col.get_tbl().store_tbl).pos_col_idx
344
+
345
+ def sql_expr(self, _: SqlElementCache) -> sql.ColumnElement | None:
346
+ if self.perform_validation:
347
+ return None
348
+ self.col = self.col_handle.get()
349
+ return self.col.sa_col
171
350
 
172
351
  def eval(self, data_row: DataRow, row_builder: RowBuilder) -> None:
173
352
  if self.perform_validation:
@@ -205,31 +384,44 @@ class ColumnRef(Expr):
205
384
  return
206
385
 
207
386
  # if this is a new base row, we need to instantiate a new iterator
208
- if self.base_rowid != data_row.pk[:self.base_rowid_len]:
387
+ if self.base_rowid != data_row.pk[: self.base_rowid_len]:
388
+ assert self.iter_arg_ctx is not None
209
389
  row_builder.eval(data_row, self.iter_arg_ctx)
210
390
  iterator_args = data_row[self.iter_arg_ctx.target_slot_idxs[0]]
211
- self.iterator = self.col.tbl.iterator_cls(**iterator_args)
212
- self.base_rowid = data_row.pk[:self.base_rowid_len]
213
- self.iterator.set_pos(data_row.pk[self.pos_idx])
391
+ self.iterator = self.col.get_tbl().iterator_cls(**iterator_args)
392
+ self.base_rowid = data_row.pk[: self.base_rowid_len]
393
+ stored_outputs = {col_ref.col.name: data_row[col_ref.slot_idx] for col_ref in self.iter_outputs}
394
+ assert all(name is not None for name in stored_outputs)
395
+ self.iterator.set_pos(data_row.pk[self.pos_idx], **stored_outputs)
214
396
  res = next(self.iterator)
215
397
  data_row[self.slot_idx] = res[self.col.name]
216
398
 
217
399
  def _as_dict(self) -> dict:
218
- tbl = self.col.tbl
219
- version = tbl.version if tbl.is_snapshot else None
400
+ tbl_handle = self.col.tbl_handle
220
401
  # we omit self.components, even if this is a validating ColumnRef, because init() will recreate the
221
402
  # non-validating component ColumnRef
403
+ assert tbl_handle.anchor_tbl_id is None # TODO: support anchor_tbl_id for view-over-replica
222
404
  return {
223
- 'tbl_id': str(tbl.id),
224
- 'tbl_version': version,
405
+ 'tbl_id': str(tbl_handle.id),
406
+ 'tbl_version': tbl_handle.effective_version,
225
407
  'col_id': self.col.id,
226
- 'perform_validation': self.perform_validation
408
+ 'reference_tbl': self.reference_tbl.as_dict() if self.reference_tbl is not None else None,
409
+ 'perform_validation': self.perform_validation,
227
410
  }
228
411
 
412
+ @classmethod
413
+ def get_column_id(cls, d: dict) -> catalog.QColumnId:
414
+ tbl_id, col_id = UUID(d['tbl_id']), d['col_id']
415
+ return catalog.QColumnId(tbl_id, col_id)
416
+
229
417
  @classmethod
230
418
  def get_column(cls, d: dict) -> catalog.Column:
231
419
  tbl_id, version, col_id = UUID(d['tbl_id']), d['tbl_version'], d['col_id']
232
- tbl_version = catalog.Catalog.get().tbl_versions[(tbl_id, version)]
420
+ # validate_initialized=False: this gets called as part of TableVersion.init()
421
+ # TODO: When we have views on replicas, we will need to store anchor_tbl_id in metadata as well.
422
+ tbl_version = catalog.Catalog.get().get_tbl_version(
423
+ TableVersionKey(tbl_id, version, None), validate_initialized=False
424
+ )
233
425
  # don't use tbl_version.cols_by_id here, this might be a snapshot reference to a column that was then dropped
234
426
  col = next(col for col in tbl_version.cols if col.id == col_id)
235
427
  return col
@@ -237,5 +429,6 @@ class ColumnRef(Expr):
237
429
  @classmethod
238
430
  def _from_dict(cls, d: dict, _: list[Expr]) -> ColumnRef:
239
431
  col = cls.get_column(d)
432
+ reference_tbl = None if d['reference_tbl'] is None else catalog.TableVersionPath.from_dict(d['reference_tbl'])
240
433
  perform_validation = d['perform_validation']
241
- return cls(col, perform_validation=perform_validation)
434
+ return cls(col, reference_tbl, perform_validation=perform_validation)
@@ -1,11 +1,10 @@
1
1
  from __future__ import annotations
2
2
 
3
- from typing import Any, Optional
3
+ from typing import Any
4
4
 
5
5
  import sqlalchemy as sql
6
6
 
7
7
  import pixeltable.exceptions as excs
8
- import pixeltable.index as index
9
8
  import pixeltable.type_system as ts
10
9
 
11
10
  from .column_ref import ColumnRef
@@ -22,6 +21,8 @@ class Comparison(Expr):
22
21
  operator: ComparisonOperator
23
22
 
24
23
  def __init__(self, operator: ComparisonOperator, op1: Expr, op2: Expr):
24
+ from pixeltable import index
25
+
25
26
  super().__init__(ts.BoolType())
26
27
  self.operator = operator
27
28
 
@@ -38,9 +39,11 @@ class Comparison(Expr):
38
39
  self.is_search_arg_comparison = False
39
40
  self.components = [op1, op2]
40
41
 
41
- import pixeltable.index as index
42
- if self.is_search_arg_comparison and self._op2.col_type.is_string_type() \
43
- and len(self._op2.val) >= index.BtreeIndex.MAX_STRING_LEN:
42
+ if (
43
+ self.is_search_arg_comparison
44
+ and self._op2.col_type.is_string_type()
45
+ and len(self._op2.val) >= index.BtreeIndex.MAX_STRING_LEN
46
+ ):
44
47
  # we can't use an index for this after all
45
48
  raise excs.Error(
46
49
  f'String literal too long for comparison against indexed column {self._op1.col.name!r} '
@@ -56,7 +59,7 @@ class Comparison(Expr):
56
59
  return self.operator == other.operator
57
60
 
58
61
  def _id_attrs(self) -> list[tuple[str, Any]]:
59
- return super()._id_attrs() + [('operator', self.operator.value)]
62
+ return [*super()._id_attrs(), ('operator', self.operator.value)]
60
63
 
61
64
  @property
62
65
  def _op1(self) -> Expr:
@@ -66,7 +69,9 @@ class Comparison(Expr):
66
69
  def _op2(self) -> Expr:
67
70
  return self.components[1]
68
71
 
69
- def sql_expr(self, sql_elements: SqlElementCache) -> Optional[sql.ColumnElement]:
72
+ def sql_expr(self, sql_elements: SqlElementCache) -> sql.ColumnElement | None:
73
+ import pixeltable.index as index
74
+
70
75
  if str(self._op1.col_type.to_sa_type()) != str(self._op2.col_type.to_sa_type()):
71
76
  # Comparing columns of different SQL types (e.g., string vs. json); this can only be done in Python
72
77
  # TODO(aaron-siegel): We may be able to handle some cases in SQL by casting one side to the other's type
@@ -76,9 +81,9 @@ class Comparison(Expr):
76
81
  if self.is_search_arg_comparison:
77
82
  # reference the index value column if there is an index and this is not a snapshot
78
83
  # (indices don't apply to snapshots)
79
- tbl = self._op1.col.tbl
84
+ tbl = self._op1.col.get_tbl()
80
85
  idx_info = [
81
- info for info in self._op1.col.get_idx_info().values() if isinstance(info.idx, index.BtreeIndex)
86
+ info for info in tbl.idxs_by_col.get(self._op1.col.qid, []) if isinstance(info.idx, index.BtreeIndex)
82
87
  ]
83
88
  if len(idx_info) > 0 and not tbl.is_snapshot:
84
89
  # there shouldn't be multiple B-tree indices on a column
@@ -1,11 +1,11 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  import operator
4
- from typing import Any, Callable, Optional
4
+ from typing import Any, Callable
5
5
 
6
6
  import sqlalchemy as sql
7
7
 
8
- import pixeltable.type_system as ts
8
+ from pixeltable import type_system as ts
9
9
 
10
10
  from .data_row import DataRow
11
11
  from .expr import Expr
@@ -36,7 +36,8 @@ class CompoundPredicate(Expr):
36
36
  return f' {self.operator} '.join([f'({e})' for e in self.components])
37
37
 
38
38
  @classmethod
39
- def make_conjunction(cls, operands: list[Expr]) -> Optional[Expr]:
39
+ def make_conjunction(cls, operands: list[Expr | None]) -> Expr | None:
40
+ operands = [e for e in operands if e is not None]
40
41
  if len(operands) == 0:
41
42
  return None
42
43
  if len(operands) == 1:
@@ -58,17 +59,16 @@ class CompoundPredicate(Expr):
58
59
  return self.operator == other.operator
59
60
 
60
61
  def _id_attrs(self) -> list[tuple[str, Any]]:
61
- return super()._id_attrs() + [('operator', self.operator.value)]
62
+ return [*super()._id_attrs(), ('operator', self.operator.value)]
62
63
 
63
- def split_conjuncts(
64
- self, condition: Callable[[Expr], bool]) -> tuple[list[Expr], Optional[Expr]]:
65
- if self.operator == LogicalOperator.OR or self.operator == LogicalOperator.NOT:
64
+ def split_conjuncts(self, condition: Callable[[Expr], bool]) -> tuple[list[Expr], Expr | None]:
65
+ if self.operator in (LogicalOperator.OR, LogicalOperator.NOT):
66
66
  return super().split_conjuncts(condition)
67
67
  matches = [op for op in self.components if condition(op)]
68
68
  non_matches = [op for op in self.components if not condition(op)]
69
69
  return (matches, self.make_conjunction(non_matches))
70
70
 
71
- def sql_expr(self, sql_elements: SqlElementCache) -> Optional[sql.ColumnElement]:
71
+ def sql_expr(self, sql_elements: SqlElementCache) -> sql.ColumnElement | None:
72
72
  sql_exprs = [sql_elements.get(op) for op in self.components]
73
73
  if any(e is None for e in sql_exprs):
74
74
  return None
@@ -84,7 +84,7 @@ class CompoundPredicate(Expr):
84
84
  if self.operator == LogicalOperator.NOT:
85
85
  data_row[self.slot_idx] = not data_row[self.components[0].slot_idx]
86
86
  else:
87
- val = True if self.operator == LogicalOperator.AND else False
87
+ val = self.operator == LogicalOperator.AND
88
88
  op_function = operator.and_ if self.operator == LogicalOperator.AND else operator.or_
89
89
  for op in self.components:
90
90
  val = op_function(val, data_row[op.slot_idx])
@@ -97,4 +97,3 @@ class CompoundPredicate(Expr):
97
97
  def _from_dict(cls, d: dict, components: list[Expr]) -> CompoundPredicate:
98
98
  assert 'operator' in d
99
99
  return cls(LogicalOperator(d['operator']), components)
100
-