pixeltable 0.4.0rc3__py3-none-any.whl → 0.4.20__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pixeltable might be problematic. Click here for more details.

Files changed (202):
  1. pixeltable/__init__.py +23 -5
  2. pixeltable/_version.py +1 -0
  3. pixeltable/catalog/__init__.py +5 -3
  4. pixeltable/catalog/catalog.py +1318 -404
  5. pixeltable/catalog/column.py +186 -115
  6. pixeltable/catalog/dir.py +1 -2
  7. pixeltable/catalog/globals.py +11 -43
  8. pixeltable/catalog/insertable_table.py +167 -79
  9. pixeltable/catalog/path.py +61 -23
  10. pixeltable/catalog/schema_object.py +9 -10
  11. pixeltable/catalog/table.py +626 -308
  12. pixeltable/catalog/table_metadata.py +101 -0
  13. pixeltable/catalog/table_version.py +713 -569
  14. pixeltable/catalog/table_version_handle.py +37 -6
  15. pixeltable/catalog/table_version_path.py +42 -29
  16. pixeltable/catalog/tbl_ops.py +50 -0
  17. pixeltable/catalog/update_status.py +191 -0
  18. pixeltable/catalog/view.py +108 -94
  19. pixeltable/config.py +128 -22
  20. pixeltable/dataframe.py +188 -100
  21. pixeltable/env.py +407 -136
  22. pixeltable/exceptions.py +6 -0
  23. pixeltable/exec/__init__.py +3 -0
  24. pixeltable/exec/aggregation_node.py +7 -8
  25. pixeltable/exec/cache_prefetch_node.py +83 -110
  26. pixeltable/exec/cell_materialization_node.py +231 -0
  27. pixeltable/exec/cell_reconstruction_node.py +135 -0
  28. pixeltable/exec/component_iteration_node.py +4 -3
  29. pixeltable/exec/data_row_batch.py +8 -65
  30. pixeltable/exec/exec_context.py +16 -4
  31. pixeltable/exec/exec_node.py +13 -36
  32. pixeltable/exec/expr_eval/evaluators.py +7 -6
  33. pixeltable/exec/expr_eval/expr_eval_node.py +27 -12
  34. pixeltable/exec/expr_eval/globals.py +8 -5
  35. pixeltable/exec/expr_eval/row_buffer.py +1 -2
  36. pixeltable/exec/expr_eval/schedulers.py +190 -30
  37. pixeltable/exec/globals.py +32 -0
  38. pixeltable/exec/in_memory_data_node.py +18 -18
  39. pixeltable/exec/object_store_save_node.py +293 -0
  40. pixeltable/exec/row_update_node.py +16 -9
  41. pixeltable/exec/sql_node.py +206 -101
  42. pixeltable/exprs/__init__.py +1 -1
  43. pixeltable/exprs/arithmetic_expr.py +27 -22
  44. pixeltable/exprs/array_slice.py +3 -3
  45. pixeltable/exprs/column_property_ref.py +34 -30
  46. pixeltable/exprs/column_ref.py +92 -96
  47. pixeltable/exprs/comparison.py +5 -5
  48. pixeltable/exprs/compound_predicate.py +5 -4
  49. pixeltable/exprs/data_row.py +152 -55
  50. pixeltable/exprs/expr.py +62 -43
  51. pixeltable/exprs/expr_dict.py +3 -3
  52. pixeltable/exprs/expr_set.py +17 -10
  53. pixeltable/exprs/function_call.py +75 -37
  54. pixeltable/exprs/globals.py +1 -2
  55. pixeltable/exprs/in_predicate.py +4 -4
  56. pixeltable/exprs/inline_expr.py +10 -27
  57. pixeltable/exprs/is_null.py +1 -3
  58. pixeltable/exprs/json_mapper.py +8 -8
  59. pixeltable/exprs/json_path.py +56 -22
  60. pixeltable/exprs/literal.py +5 -5
  61. pixeltable/exprs/method_ref.py +2 -2
  62. pixeltable/exprs/object_ref.py +2 -2
  63. pixeltable/exprs/row_builder.py +127 -53
  64. pixeltable/exprs/rowid_ref.py +8 -12
  65. pixeltable/exprs/similarity_expr.py +50 -25
  66. pixeltable/exprs/sql_element_cache.py +4 -4
  67. pixeltable/exprs/string_op.py +5 -5
  68. pixeltable/exprs/type_cast.py +3 -5
  69. pixeltable/func/__init__.py +1 -0
  70. pixeltable/func/aggregate_function.py +8 -8
  71. pixeltable/func/callable_function.py +9 -9
  72. pixeltable/func/expr_template_function.py +10 -10
  73. pixeltable/func/function.py +18 -20
  74. pixeltable/func/function_registry.py +6 -7
  75. pixeltable/func/globals.py +2 -3
  76. pixeltable/func/mcp.py +74 -0
  77. pixeltable/func/query_template_function.py +20 -18
  78. pixeltable/func/signature.py +43 -16
  79. pixeltable/func/tools.py +23 -13
  80. pixeltable/func/udf.py +18 -20
  81. pixeltable/functions/__init__.py +6 -0
  82. pixeltable/functions/anthropic.py +93 -33
  83. pixeltable/functions/audio.py +114 -10
  84. pixeltable/functions/bedrock.py +13 -6
  85. pixeltable/functions/date.py +1 -1
  86. pixeltable/functions/deepseek.py +20 -9
  87. pixeltable/functions/fireworks.py +2 -2
  88. pixeltable/functions/gemini.py +28 -11
  89. pixeltable/functions/globals.py +13 -13
  90. pixeltable/functions/groq.py +108 -0
  91. pixeltable/functions/huggingface.py +1046 -23
  92. pixeltable/functions/image.py +9 -18
  93. pixeltable/functions/llama_cpp.py +23 -8
  94. pixeltable/functions/math.py +3 -4
  95. pixeltable/functions/mistralai.py +4 -15
  96. pixeltable/functions/ollama.py +16 -9
  97. pixeltable/functions/openai.py +104 -82
  98. pixeltable/functions/openrouter.py +143 -0
  99. pixeltable/functions/replicate.py +2 -2
  100. pixeltable/functions/reve.py +250 -0
  101. pixeltable/functions/string.py +21 -28
  102. pixeltable/functions/timestamp.py +13 -14
  103. pixeltable/functions/together.py +4 -6
  104. pixeltable/functions/twelvelabs.py +92 -0
  105. pixeltable/functions/util.py +6 -1
  106. pixeltable/functions/video.py +1388 -106
  107. pixeltable/functions/vision.py +7 -7
  108. pixeltable/functions/whisper.py +15 -7
  109. pixeltable/functions/whisperx.py +179 -0
  110. pixeltable/{ext/functions → functions}/yolox.py +2 -4
  111. pixeltable/globals.py +332 -105
  112. pixeltable/index/base.py +13 -22
  113. pixeltable/index/btree.py +23 -22
  114. pixeltable/index/embedding_index.py +32 -44
  115. pixeltable/io/__init__.py +4 -2
  116. pixeltable/io/datarows.py +7 -6
  117. pixeltable/io/external_store.py +49 -77
  118. pixeltable/io/fiftyone.py +11 -11
  119. pixeltable/io/globals.py +29 -28
  120. pixeltable/io/hf_datasets.py +17 -9
  121. pixeltable/io/label_studio.py +70 -66
  122. pixeltable/io/lancedb.py +3 -0
  123. pixeltable/io/pandas.py +12 -11
  124. pixeltable/io/parquet.py +13 -93
  125. pixeltable/io/table_data_conduit.py +71 -47
  126. pixeltable/io/utils.py +3 -3
  127. pixeltable/iterators/__init__.py +2 -1
  128. pixeltable/iterators/audio.py +21 -11
  129. pixeltable/iterators/document.py +116 -55
  130. pixeltable/iterators/image.py +5 -2
  131. pixeltable/iterators/video.py +293 -13
  132. pixeltable/metadata/__init__.py +4 -2
  133. pixeltable/metadata/converters/convert_18.py +2 -2
  134. pixeltable/metadata/converters/convert_19.py +2 -2
  135. pixeltable/metadata/converters/convert_20.py +2 -2
  136. pixeltable/metadata/converters/convert_21.py +2 -2
  137. pixeltable/metadata/converters/convert_22.py +2 -2
  138. pixeltable/metadata/converters/convert_24.py +2 -2
  139. pixeltable/metadata/converters/convert_25.py +2 -2
  140. pixeltable/metadata/converters/convert_26.py +2 -2
  141. pixeltable/metadata/converters/convert_29.py +4 -4
  142. pixeltable/metadata/converters/convert_34.py +2 -2
  143. pixeltable/metadata/converters/convert_36.py +2 -2
  144. pixeltable/metadata/converters/convert_37.py +15 -0
  145. pixeltable/metadata/converters/convert_38.py +39 -0
  146. pixeltable/metadata/converters/convert_39.py +124 -0
  147. pixeltable/metadata/converters/convert_40.py +73 -0
  148. pixeltable/metadata/converters/util.py +13 -12
  149. pixeltable/metadata/notes.py +4 -0
  150. pixeltable/metadata/schema.py +79 -42
  151. pixeltable/metadata/utils.py +74 -0
  152. pixeltable/mypy/__init__.py +3 -0
  153. pixeltable/mypy/mypy_plugin.py +123 -0
  154. pixeltable/plan.py +274 -223
  155. pixeltable/share/__init__.py +1 -1
  156. pixeltable/share/packager.py +259 -129
  157. pixeltable/share/protocol/__init__.py +34 -0
  158. pixeltable/share/protocol/common.py +170 -0
  159. pixeltable/share/protocol/operation_types.py +33 -0
  160. pixeltable/share/protocol/replica.py +109 -0
  161. pixeltable/share/publish.py +213 -57
  162. pixeltable/store.py +238 -175
  163. pixeltable/type_system.py +104 -63
  164. pixeltable/utils/__init__.py +2 -3
  165. pixeltable/utils/arrow.py +108 -13
  166. pixeltable/utils/av.py +298 -0
  167. pixeltable/utils/azure_store.py +305 -0
  168. pixeltable/utils/code.py +3 -3
  169. pixeltable/utils/console_output.py +4 -1
  170. pixeltable/utils/coroutine.py +6 -23
  171. pixeltable/utils/dbms.py +31 -5
  172. pixeltable/utils/description_helper.py +4 -5
  173. pixeltable/utils/documents.py +5 -6
  174. pixeltable/utils/exception_handler.py +7 -30
  175. pixeltable/utils/filecache.py +6 -6
  176. pixeltable/utils/formatter.py +4 -6
  177. pixeltable/utils/gcs_store.py +283 -0
  178. pixeltable/utils/http_server.py +2 -3
  179. pixeltable/utils/iceberg.py +1 -2
  180. pixeltable/utils/image.py +17 -0
  181. pixeltable/utils/lancedb.py +88 -0
  182. pixeltable/utils/local_store.py +316 -0
  183. pixeltable/utils/misc.py +5 -0
  184. pixeltable/utils/object_stores.py +528 -0
  185. pixeltable/utils/pydantic.py +60 -0
  186. pixeltable/utils/pytorch.py +5 -6
  187. pixeltable/utils/s3_store.py +392 -0
  188. pixeltable-0.4.20.dist-info/METADATA +587 -0
  189. pixeltable-0.4.20.dist-info/RECORD +218 -0
  190. {pixeltable-0.4.0rc3.dist-info → pixeltable-0.4.20.dist-info}/WHEEL +1 -1
  191. pixeltable-0.4.20.dist-info/entry_points.txt +2 -0
  192. pixeltable/__version__.py +0 -3
  193. pixeltable/ext/__init__.py +0 -17
  194. pixeltable/ext/functions/__init__.py +0 -11
  195. pixeltable/ext/functions/whisperx.py +0 -77
  196. pixeltable/utils/media_store.py +0 -77
  197. pixeltable/utils/s3.py +0 -17
  198. pixeltable/utils/sample.py +0 -25
  199. pixeltable-0.4.0rc3.dist-info/METADATA +0 -435
  200. pixeltable-0.4.0rc3.dist-info/RECORD +0 -189
  201. pixeltable-0.4.0rc3.dist-info/entry_points.txt +0 -3
  202. {pixeltable-0.4.0rc3.dist-info → pixeltable-0.4.20.dist-info/licenses}/LICENSE +0 -0
pixeltable/store.py CHANGED
@@ -2,23 +2,22 @@ from __future__ import annotations
2
2
 
3
3
  import abc
4
4
  import logging
5
- import os
6
5
  import sys
7
- import urllib.parse
8
- import urllib.request
6
+ import time
9
7
  import warnings
10
- from typing import Any, Iterable, Iterator, Literal, Optional, Union
8
+ from typing import Any, Iterable, Iterator
11
9
 
12
10
  import more_itertools
11
+ import psycopg
13
12
  import sqlalchemy as sql
14
13
  from tqdm import TqdmWarning, tqdm
15
14
 
16
- from pixeltable import catalog, exceptions as excs, exprs
15
+ from pixeltable import catalog, exceptions as excs
16
+ from pixeltable.catalog.update_status import RowCountStats
17
17
  from pixeltable.env import Env
18
18
  from pixeltable.exec import ExecNode
19
19
  from pixeltable.metadata import schema
20
20
  from pixeltable.utils.exception_handler import run_cleanup
21
- from pixeltable.utils.media_store import MediaStore
22
21
  from pixeltable.utils.sql import log_explain, log_stmt
23
22
 
24
23
  _logger = logging.getLogger('pixeltable')
@@ -35,13 +34,16 @@ class StoreBase:
35
34
 
36
35
  tbl_version: catalog.TableVersionHandle
37
36
  sa_md: sql.MetaData
38
- sa_tbl: Optional[sql.Table]
37
+ sa_tbl: sql.Table | None
39
38
  _pk_cols: list[sql.Column]
40
39
  v_min_col: sql.Column
41
40
  v_max_col: sql.Column
42
- base: Optional[StoreBase]
41
+ base: StoreBase | None
43
42
 
44
- __INSERT_BATCH_SIZE = 1000
43
+ # In my cursory experiments this was the optimal batch size: it was an improvement over 5_000 and there was no real
44
+ # benefit to going higher.
45
+ # TODO: Perform more rigorous experiments with different table structures and OS environments to refine this.
46
+ __INSERT_BATCH_SIZE = 10_000
45
47
 
46
48
  def __init__(self, tbl_version: catalog.TableVersion):
47
49
  self.tbl_version = catalog.TableVersionHandle(
@@ -78,20 +80,20 @@ class StoreBase:
78
80
  self._pk_cols = [*rowid_cols, self.v_min_col]
79
81
  return [*rowid_cols, self.v_min_col, self.v_max_col]
80
82
 
81
- def create_sa_tbl(self, tbl_version: Optional[catalog.TableVersion] = None) -> None:
83
+ def create_sa_tbl(self, tbl_version: catalog.TableVersion | None = None) -> None:
82
84
  """Create self.sa_tbl from self.tbl_version."""
83
85
  if tbl_version is None:
84
86
  tbl_version = self.tbl_version.get()
85
87
  system_cols = self._create_system_columns()
86
88
  all_cols = system_cols.copy()
89
+ # we captured all columns, including dropped ones: they're still part of the physical table
87
90
  for col in [c for c in tbl_version.cols if c.is_stored]:
88
91
  # re-create sql.Column for each column, regardless of whether it already has sa_col set: it was bound
89
92
  # to the last sql.Table version we created and cannot be reused
90
93
  col.create_sa_cols()
91
94
  all_cols.append(col.sa_col)
92
- if col.records_errors:
93
- all_cols.append(col.sa_errormsg_col)
94
- all_cols.append(col.sa_errortype_col)
95
+ if col.stores_cellmd:
96
+ all_cols.append(col.sa_cellmd_col)
95
97
 
96
98
  if self.sa_tbl is not None:
97
99
  # if we're called in response to a schema change, we need to remove the old table first
@@ -122,51 +124,6 @@ class StoreBase:
122
124
  def _storage_name(self) -> str:
123
125
  """Return the name of the data store table"""
124
126
 
125
- def _move_tmp_media_file(self, file_url: Optional[str], col: catalog.Column, v_min: int) -> str:
126
- """Move tmp media file with given url to Env.media_dir and return new url, or given url if not a tmp_dir file"""
127
- pxt_tmp_dir = str(Env.get().tmp_dir)
128
- if file_url is None:
129
- return None
130
- parsed = urllib.parse.urlparse(file_url)
131
- # We should never be passed a local file path here. The "len > 1" ensures that Windows
132
- # file paths aren't mistaken for URLs with a single-character scheme.
133
- assert len(parsed.scheme) > 1
134
- if parsed.scheme != 'file':
135
- # remote url
136
- return file_url
137
- file_path = urllib.parse.unquote(urllib.request.url2pathname(parsed.path))
138
- if not file_path.startswith(pxt_tmp_dir):
139
- # not a tmp file
140
- return file_url
141
- _, ext = os.path.splitext(file_path)
142
- new_path = str(MediaStore.prepare_media_path(self.tbl_version.id, col.id, v_min, ext=ext))
143
- os.rename(file_path, new_path)
144
- new_file_url = urllib.parse.urljoin('file:', urllib.request.pathname2url(new_path))
145
- return new_file_url
146
-
147
- def _move_tmp_media_files(
148
- self, table_rows: list[dict[str, Any]], media_cols: list[catalog.Column], v_min: int
149
- ) -> None:
150
- """Move tmp media files that we generated to a permanent location"""
151
- for c in media_cols:
152
- for table_row in table_rows:
153
- file_url = table_row[c.store_name()]
154
- table_row[c.store_name()] = self._move_tmp_media_file(file_url, c, v_min)
155
-
156
- def _create_table_row(
157
- self, input_row: exprs.DataRow, row_builder: exprs.RowBuilder, exc_col_ids: set[int], pk: tuple[int, ...]
158
- ) -> tuple[dict[str, Any], int]:
159
- """Return Tuple[complete table row, # of exceptions] for insert()
160
- Creates a row that includes the PK columns, with the values from input_row.pk.
161
- Returns:
162
- Tuple[complete table row, # of exceptions]
163
- """
164
- table_row, num_excs = row_builder.create_table_row(input_row, exc_col_ids)
165
- assert len(pk) == len(self._pk_cols)
166
- for pk_col, pk_val in zip(self._pk_cols, pk):
167
- table_row[pk_col.name] = pk_val
168
- return table_row, num_excs
169
-
170
127
  def count(self) -> int:
171
128
  """Return the number of rows visible in self.tbl_version"""
172
129
  stmt = (
@@ -180,15 +137,123 @@ class StoreBase:
180
137
  assert isinstance(result, int)
181
138
  return result
182
139
 
140
+ def _exec_if_not_exists(self, stmt: str, wait_for_table: bool) -> None:
141
+ """
142
+ Execute a statement containing 'IF NOT EXISTS' and ignore any duplicate object-related errors.
143
+
144
+ The statement needs to run in a separate transaction, because the expected error conditions will abort the
145
+ enclosing transaction (and the ability to run additional statements in that same transaction).
146
+ """
147
+ while True:
148
+ with Env.get().begin_xact(for_write=True) as conn:
149
+ try:
150
+ if wait_for_table and not Env.get().is_using_cockroachdb:
151
+ # Try to lock the table to make sure that it exists. This needs to run in the same transaction
152
+ # as 'stmt' to avoid a race condition.
153
+ # TODO: adapt this for CockroachDB
154
+ lock_stmt = f'LOCK TABLE {self._storage_name()} IN ACCESS EXCLUSIVE MODE'
155
+ conn.execute(sql.text(lock_stmt))
156
+ conn.execute(sql.text(stmt))
157
+ return
158
+ except (sql.exc.IntegrityError, sql.exc.ProgrammingError) as e:
159
+ Env.get().console_logger.info(f'{stmt} failed with: {e}')
160
+ if (
161
+ isinstance(e.orig, psycopg.errors.UniqueViolation)
162
+ and 'duplicate key value violates unique constraint' in str(e.orig)
163
+ ) or (
164
+ isinstance(e.orig, (psycopg.errors.DuplicateObject, psycopg.errors.DuplicateTable))
165
+ and 'already exists' in str(e.orig)
166
+ ):
167
+ # table already exists
168
+ return
169
+ elif isinstance(e.orig, psycopg.errors.UndefinedTable):
170
+ # the Lock Table failed because the table doesn't exist yet; try again
171
+ time.sleep(1)
172
+ continue
173
+ else:
174
+ raise
175
+
176
+ def _store_tbl_exists(self) -> bool:
177
+ """Returns True if the store table exists, False otherwise."""
178
+ with Env.get().begin_xact(for_write=False) as conn:
179
+ q = (
180
+ 'SELECT COUNT(*) FROM pg_catalog.pg_tables '
181
+ f"WHERE schemaname = 'public' AND tablename = {self._storage_name()!r}"
182
+ )
183
+ res = conn.execute(sql.text(q)).scalar_one()
184
+ return res == 1
185
+
183
186
  def create(self) -> None:
184
- conn = Env.get().conn
185
- self.sa_md.create_all(bind=conn)
187
+ """
188
+ Create or update store table to bring it in sync with self.sa_tbl. Idempotent.
189
+
190
+ This runs a sequence of DDL statements (Create Table, Alter Table Add Column, Create Index), each of which
191
+ is run in its own transaction.
192
+
193
+ The exception to that are local replicas, for which TableRestorer creates an enclosing transaction. In theory,
194
+ this should avoid the potential for race conditions that motivate the error handling present in
195
+ _exec_if_not_exists() (meaning: we shouldn't see those errors when creating local replicas).
196
+ TODO: remove the special case for local replicas in order to make the logic easier to reason about.
197
+ """
198
+ postgres_dialect = sql.dialects.postgresql.dialect()
199
+
200
+ if not self._store_tbl_exists():
201
+ # run Create Table If Not Exists; we always need If Not Exists to avoid race conditions between concurrent
202
+ # Pixeltable processes
203
+ create_stmt = sql.schema.CreateTable(self.sa_tbl, if_not_exists=True).compile(dialect=postgres_dialect)
204
+ self._exec_if_not_exists(str(create_stmt), wait_for_table=False)
205
+ else:
206
+ # ensure that all columns exist by running Alter Table Add Column If Not Exists for all columns
207
+ for col in self.sa_tbl.columns:
208
+ stmt = self._add_column_stmt(col)
209
+ self._exec_if_not_exists(stmt, wait_for_table=True)
210
+ # TODO: do we also need to ensure that these columns are now visible (ie, is there another potential race
211
+ # condition here?)
212
+
213
+ # ensure that all system indices exist by running Create Index If Not Exists
214
+ for idx in self.sa_tbl.indexes:
215
+ create_idx_stmt = sql.schema.CreateIndex(idx, if_not_exists=True).compile(dialect=postgres_dialect)
216
+ self._exec_if_not_exists(str(create_idx_stmt), wait_for_table=True)
217
+
218
+ # ensure that all visible non-system indices exist by running appropriate create statements
219
+ for id in self.tbl_version.get().idxs:
220
+ self.create_index(id)
221
+
222
+ def create_index(self, idx_id: int) -> None:
223
+ """Create If Not Exists for this index"""
224
+ idx_info = self.tbl_version.get().idxs[idx_id]
225
+ stmt = idx_info.idx.sa_create_stmt(self.tbl_version.get()._store_idx_name(idx_id), idx_info.val_col.sa_col)
226
+ self._exec_if_not_exists(str(stmt), wait_for_table=True)
227
+
228
+ def validate(self) -> None:
229
+ """Validate store table against self.table_version"""
230
+ with Env.get().begin_xact() as conn:
231
+ # check that all columns are present
232
+ q = f'SELECT column_name FROM information_schema.columns WHERE table_name = {self._storage_name()!r}'
233
+ store_col_info = {row[0] for row in conn.execute(sql.text(q)).fetchall()}
234
+ tbl_col_info = {col.store_name() for col in self.tbl_version.get().cols if col.is_stored}
235
+ assert tbl_col_info.issubset(store_col_info)
236
+
237
+ # check that all visible indices are present
238
+ q = f'SELECT indexname FROM pg_indexes WHERE tablename = {self._storage_name()!r}'
239
+ store_idx_names = {row[0] for row in conn.execute(sql.text(q)).fetchall()}
240
+ tbl_index_names = {
241
+ self.tbl_version.get()._store_idx_name(info.id) for info in self.tbl_version.get().idxs.values()
242
+ }
243
+ assert tbl_index_names.issubset(store_idx_names)
186
244
 
187
245
  def drop(self) -> None:
188
246
  """Drop store table"""
189
247
  conn = Env.get().conn
190
248
  self.sa_md.drop_all(bind=conn)
191
249
 
250
+ def _add_column_stmt(self, sa_col: sql.Column) -> str:
251
+ col_type_str = sa_col.type.compile(dialect=sql.dialects.postgresql.dialect())
252
+ return (
253
+ f'ALTER TABLE {self._storage_name()} ADD COLUMN IF NOT EXISTS '
254
+ f'{sa_col.name} {col_type_str} {"NOT " if not sa_col.nullable else ""} NULL'
255
+ )
256
+
192
257
  def add_column(self, col: catalog.Column) -> None:
193
258
  """Add column(s) to the store-resident table based on a catalog column
194
259
 
@@ -197,14 +262,13 @@ class StoreBase:
197
262
  """
198
263
  assert col.is_stored
199
264
  conn = Env.get().conn
200
- col_type_str = col.get_sa_col_type().compile(dialect=conn.dialect)
265
+ col_type_str = col.sa_col_type.compile(dialect=conn.dialect)
201
266
  s_txt = f'ALTER TABLE {self._storage_name()} ADD COLUMN {col.store_name()} {col_type_str} NULL'
202
267
  added_storage_cols = [col.store_name()]
203
- if col.records_errors:
204
- # we also need to create the errormsg and errortype storage cols
205
- s_txt += f' , ADD COLUMN {col.errormsg_store_name()} VARCHAR DEFAULT NULL'
206
- s_txt += f' , ADD COLUMN {col.errortype_store_name()} VARCHAR DEFAULT NULL'
207
- added_storage_cols.extend([col.errormsg_store_name(), col.errortype_store_name()])
268
+ if col.stores_cellmd:
269
+ cellmd_type_str = col.sa_cellmd_type().compile(dialect=conn.dialect)
270
+ s_txt += f' , ADD COLUMN {col.cellmd_store_name()} {cellmd_type_str} DEFAULT NULL'
271
+ added_storage_cols.append(col.cellmd_store_name())
208
272
 
209
273
  stmt = sql.text(s_txt)
210
274
  log_stmt(_logger, stmt)
@@ -215,25 +279,13 @@ class StoreBase:
215
279
  def drop_column(self, col: catalog.Column) -> None:
216
280
  """Execute Alter Table Drop Column statement"""
217
281
  s_txt = f'ALTER TABLE {self._storage_name()} DROP COLUMN {col.store_name()}'
218
- if col.records_errors:
219
- s_txt += f' , DROP COLUMN {col.errormsg_store_name()}'
220
- s_txt += f' , DROP COLUMN {col.errortype_store_name()}'
282
+ if col.stores_cellmd:
283
+ s_txt += f' , DROP COLUMN {col.cellmd_store_name()}'
221
284
  stmt = sql.text(s_txt)
222
285
  log_stmt(_logger, stmt)
223
286
  Env.get().conn.execute(stmt)
224
287
 
225
- def ensure_columns_exist(self, cols: Iterable[catalog.Column]) -> None:
226
- conn = Env.get().conn
227
- sql_text = f'SELECT column_name FROM information_schema.columns WHERE table_name = {self._storage_name()!r}'
228
- result = conn.execute(sql.text(sql_text))
229
- existing_cols = {row[0] for row in result}
230
- for col in cols:
231
- if col.store_name() not in existing_cols:
232
- self.add_column(col)
233
-
234
- def load_column(
235
- self, col: catalog.Column, exec_plan: ExecNode, value_expr_slot_idx: int, on_error: Literal['abort', 'ignore']
236
- ) -> int:
288
+ def load_column(self, col: catalog.Column, exec_plan: ExecNode, abort_on_exc: bool) -> int:
237
289
  """Update store column of a computed column with values produced by an execution plan
238
290
 
239
291
  Returns:
@@ -242,84 +294,69 @@ class StoreBase:
242
294
  sql.exc.DBAPIError if there was a SQL error during execution
243
295
  excs.Error if on_error='abort' and there was an exception during row evaluation
244
296
  """
245
- assert col.tbl.id == self.tbl_version.id
297
+ assert col.get_tbl().id == self.tbl_version.id
246
298
  num_excs = 0
247
299
  num_rows = 0
248
300
  # create temp table to store output of exec_plan, with the same primary key as the store table
249
301
  tmp_name = f'temp_{self._storage_name()}'
250
- tmp_pk_cols = [sql.Column(col.name, col.type, primary_key=True) for col in self.pk_columns()]
251
- tmp_cols = tmp_pk_cols.copy()
302
+ tmp_pk_cols = tuple(sql.Column(col.name, col.type, primary_key=True) for col in self.pk_columns())
252
303
  tmp_val_col = sql.Column(col.sa_col.name, col.sa_col.type)
253
- tmp_cols.append(tmp_val_col)
304
+ tmp_cols = [*tmp_pk_cols, tmp_val_col]
254
305
  # add error columns if the store column records errors
255
- if col.records_errors:
256
- tmp_errortype_col = sql.Column(col.sa_errortype_col.name, col.sa_errortype_col.type)
257
- tmp_cols.append(tmp_errortype_col)
258
- tmp_errormsg_col = sql.Column(col.sa_errormsg_col.name, col.sa_errormsg_col.type)
259
- tmp_cols.append(tmp_errormsg_col)
306
+ if col.stores_cellmd:
307
+ tmp_cellmd_col = sql.Column(col.sa_cellmd_col.name, col.sa_cellmd_col.type)
308
+ tmp_cols.append(tmp_cellmd_col)
309
+ tmp_col_names = [col.name for col in tmp_cols]
310
+
260
311
  tmp_tbl = sql.Table(tmp_name, self.sa_md, *tmp_cols, prefixes=['TEMPORARY'])
261
312
  conn = Env.get().conn
262
313
  tmp_tbl.create(bind=conn)
263
314
 
315
+ row_builder = exec_plan.row_builder
316
+
264
317
  try:
318
+ table_rows: list[tuple[Any]] = []
319
+
265
320
  # insert rows from exec_plan into temp table
266
- # TODO: unify the table row construction logic with RowBuilder.create_table_row()
267
321
  for row_batch in exec_plan:
268
322
  num_rows += len(row_batch)
269
- tbl_rows: list[dict[str, Any]] = []
270
- for result_row in row_batch:
271
- tbl_row: dict[str, Any] = {}
272
- for pk_col, pk_val in zip(self.pk_columns(), result_row.pk):
273
- tbl_row[pk_col.name] = pk_val
274
-
275
- if col.is_computed:
276
- if result_row.has_exc(value_expr_slot_idx):
277
- num_excs += 1
278
- value_exc = result_row.get_exc(value_expr_slot_idx)
279
- if on_error == 'abort':
280
- raise excs.Error(
281
- f'Error while evaluating computed column `{col.name}`:\n{value_exc}'
282
- ) from value_exc
283
- # we store a NULL value and record the exception/exc type
284
- error_type = type(value_exc).__name__
285
- error_msg = str(value_exc)
286
- tbl_row[col.sa_col.name] = None
287
- tbl_row[col.sa_errortype_col.name] = error_type
288
- tbl_row[col.sa_errormsg_col.name] = error_msg
289
- else:
290
- if col.col_type.is_image_type() and result_row.file_urls[value_expr_slot_idx] is None:
291
- # we have yet to store this image
292
- filepath = str(MediaStore.prepare_media_path(col.tbl.id, col.id, col.tbl.version))
293
- result_row.flush_img(value_expr_slot_idx, filepath)
294
- val = result_row.get_stored_val(value_expr_slot_idx, col.sa_col.type)
295
- if col.col_type.is_media_type():
296
- val = self._move_tmp_media_file(val, col, result_row.pk[-1])
297
- tbl_row[col.sa_col.name] = val
298
- if col.records_errors:
299
- tbl_row[col.sa_errortype_col.name] = None
300
- tbl_row[col.sa_errormsg_col.name] = None
301
-
302
- tbl_rows.append(tbl_row)
303
- conn.execute(sql.insert(tmp_tbl), tbl_rows)
323
+ batch_table_rows: list[tuple[Any]] = []
324
+
325
+ for row in row_batch:
326
+ if abort_on_exc and row.has_exc():
327
+ exc = row.get_first_exc()
328
+ raise excs.Error(f'Error while evaluating computed column {col.name!r}:\n{exc}') from exc
329
+ table_row, num_row_exc = row_builder.create_store_table_row(row, None, row.pk)
330
+ num_excs += num_row_exc
331
+ batch_table_rows.append(tuple(table_row))
332
+
333
+ table_rows.extend(batch_table_rows)
334
+
335
+ if len(table_rows) >= self.__INSERT_BATCH_SIZE:
336
+ self.sql_insert(tmp_tbl, tmp_col_names, table_rows)
337
+ table_rows.clear()
338
+
339
+ if len(table_rows) > 0:
340
+ self.sql_insert(tmp_tbl, tmp_col_names, table_rows)
304
341
 
305
342
  # update store table with values from temp table
306
343
  update_stmt = sql.update(self.sa_tbl)
307
344
  for pk_col, tmp_pk_col in zip(self.pk_columns(), tmp_pk_cols):
308
345
  update_stmt = update_stmt.where(pk_col == tmp_pk_col)
309
346
  update_stmt = update_stmt.values({col.sa_col: tmp_val_col})
310
- if col.records_errors:
311
- update_stmt = update_stmt.values(
312
- {col.sa_errortype_col: tmp_errortype_col, col.sa_errormsg_col: tmp_errormsg_col}
313
- )
347
+ if col.stores_cellmd:
348
+ update_stmt = update_stmt.values({col.sa_cellmd_col: tmp_cellmd_col})
314
349
  log_explain(_logger, update_stmt, conn)
315
350
  conn.execute(update_stmt)
351
+
316
352
  finally:
317
353
 
318
354
  def remove_tmp_tbl() -> None:
319
355
  self.sa_md.remove(tmp_tbl)
320
356
  tmp_tbl.drop(bind=conn)
321
357
 
322
- run_cleanup(remove_tmp_tbl, raise_error=True)
358
+ run_cleanup(remove_tmp_tbl, raise_error=False)
359
+
323
360
  return num_excs
324
361
 
325
362
  def insert_rows(
@@ -327,9 +364,9 @@ class StoreBase:
327
364
  exec_plan: ExecNode,
328
365
  v_min: int,
329
366
  show_progress: bool = True,
330
- rowids: Optional[Iterator[int]] = None,
367
+ rowids: Iterator[int] | None = None,
331
368
  abort_on_exc: bool = False,
332
- ) -> tuple[int, int, set[int]]:
369
+ ) -> tuple[set[int], RowCountStats]:
333
370
  """Insert rows into the store table and update the catalog table's md
334
371
  Returns:
335
372
  number of inserted rows, number of exceptions, set of column ids that have exceptions
@@ -339,53 +376,80 @@ class StoreBase:
339
376
  num_excs = 0
340
377
  num_rows = 0
341
378
  cols_with_excs: set[int] = set()
342
- progress_bar: Optional[tqdm] = None # create this only after we started executing
379
+ progress_bar: tqdm | None = None # create this only after we started executing
343
380
  row_builder = exec_plan.row_builder
344
- media_cols = [info.col for info in row_builder.table_columns if info.col.col_type.is_media_type()]
345
- conn = Env.get().conn
381
+
382
+ store_col_names = row_builder.store_column_names()
346
383
 
347
384
  try:
385
+ table_rows: list[tuple[Any]] = []
348
386
  exec_plan.open()
387
+
349
388
  for row_batch in exec_plan:
350
389
  num_rows += len(row_batch)
351
- for batch_start_idx in range(0, len(row_batch), self.__INSERT_BATCH_SIZE):
352
- # compute batch of rows and convert them into table rows
353
- table_rows: list[dict[str, Any]] = []
354
- batch_stop_idx = min(batch_start_idx + self.__INSERT_BATCH_SIZE, len(row_batch))
355
- for row_idx in range(batch_start_idx, batch_stop_idx):
356
- row = row_batch[row_idx]
357
- # if abort_on_exc == True, we need to check for media validation exceptions
358
- if abort_on_exc and row.has_exc():
359
- exc = row.get_first_exc()
360
- raise exc
361
-
362
- rowid = (next(rowids),) if rowids is not None else row.pk[:-1]
363
- pk = (*rowid, v_min)
364
- table_row, num_row_exc = self._create_table_row(row, row_builder, cols_with_excs, pk=pk)
365
- num_excs += num_row_exc
366
- table_rows.append(table_row)
367
-
368
- if show_progress:
369
- if progress_bar is None:
370
- warnings.simplefilter('ignore', category=TqdmWarning)
371
- progress_bar = tqdm(
372
- desc=f'Inserting rows into `{self.tbl_version.get().name}`',
373
- unit=' rows',
374
- ncols=100,
375
- file=sys.stdout,
376
- )
377
- progress_bar.update(1)
378
-
379
- # insert batch of rows
380
- self._move_tmp_media_files(table_rows, media_cols, v_min)
381
- conn.execute(sql.insert(self.sa_tbl), table_rows)
390
+ batch_table_rows: list[tuple[Any]] = []
391
+
392
+ # compute batch of rows and convert them into table rows
393
+ for row in row_batch:
394
+ # if abort_on_exc == True, we need to check for media validation exceptions
395
+ if abort_on_exc and row.has_exc():
396
+ exc = row.get_first_exc()
397
+ raise exc
398
+
399
+ rowid = (next(rowids),) if rowids is not None else row.pk[:-1]
400
+ pk = (*rowid, v_min)
401
+ assert len(pk) == len(self._pk_cols)
402
+ table_row, num_row_exc = row_builder.create_store_table_row(row, cols_with_excs, pk)
403
+ num_excs += num_row_exc
404
+
405
+ if show_progress and Env.get().verbosity >= 1:
406
+ if progress_bar is None:
407
+ warnings.simplefilter('ignore', category=TqdmWarning)
408
+ progress_bar = tqdm(
409
+ desc=f'Inserting rows into `{self.tbl_version.get().name}`',
410
+ unit=' rows',
411
+ ncols=100,
412
+ file=sys.stdout,
413
+ )
414
+ progress_bar.update(1)
415
+
416
+ batch_table_rows.append(tuple(table_row))
417
+
418
+ table_rows.extend(batch_table_rows)
419
+
420
+ # if a batch is ready for insertion into the database, insert it
421
+ if len(table_rows) >= self.__INSERT_BATCH_SIZE:
422
+ self.sql_insert(self.sa_tbl, store_col_names, table_rows)
423
+ table_rows.clear()
424
+
425
+ # insert any remaining rows
426
+ if len(table_rows) > 0:
427
+ self.sql_insert(self.sa_tbl, store_col_names, table_rows)
428
+
382
429
  if progress_bar is not None:
383
430
  progress_bar.close()
384
- return num_rows, num_excs, cols_with_excs
431
+ computed_values = exec_plan.ctx.num_computed_exprs * num_rows
432
+ row_counts = RowCountStats(ins_rows=num_rows, num_excs=num_excs, computed_values=computed_values)
433
+
434
+ return cols_with_excs, row_counts
385
435
  finally:
386
436
  exec_plan.close()
387
437
 
388
- def _versions_clause(self, versions: list[Optional[int]], match_on_vmin: bool) -> sql.ColumnElement[bool]:
438
+ @classmethod
439
+ def sql_insert(cls, sa_tbl: sql.Table, store_col_names: list[str], table_rows: list[tuple[Any]]) -> None:
440
+ assert len(table_rows) > 0
441
+ conn = Env.get().conn
442
+ conn.execute(sql.insert(sa_tbl), [dict(zip(store_col_names, table_row)) for table_row in table_rows])
443
+
444
+ # TODO: Inserting directly via psycopg delivers a small performance benefit, but is somewhat fraught due to
445
+ # differences in the data representation that SQLAlchemy/psycopg expect. The below code will do the
446
+ # insertion in psycopg and can be used if/when we decide to pursue that optimization.
447
+ # col_names_str = ", ".join(store_col_names)
448
+ # placeholders_str = ", ".join('%s' for _ in store_col_names)
449
+ # stmt_text = f'INSERT INTO {self.sa_tbl.name} ({col_names_str}) VALUES ({placeholders_str})'
450
+ # conn.exec_driver_sql(stmt_text, table_rows)
451
+
452
+ def _versions_clause(self, versions: list[int | None], match_on_vmin: bool) -> sql.ColumnElement[bool]:
389
453
  """Return filter for base versions"""
390
454
  v = versions[0]
391
455
  if v is None:
@@ -403,9 +467,9 @@ class StoreBase:
403
467
  def delete_rows(
404
468
  self,
405
469
  current_version: int,
406
- base_versions: list[Optional[int]],
470
+ base_versions: list[int | None],
407
471
  match_on_vmin: bool,
408
- where_clause: Optional[sql.ColumnElement[bool]],
472
+ where_clause: sql.ColumnElement[bool] | None,
409
473
  ) -> int:
410
474
  """Mark rows as deleted that are live and were created prior to current_version.
411
475
  Also: populate the undo columns
@@ -424,7 +488,7 @@ class StoreBase:
424
488
  base_versions_clause = (
425
489
  sql.true() if len(base_versions) == 0 else self.base._versions_clause(base_versions, match_on_vmin)
426
490
  )
427
- set_clause: dict[sql.Column, Union[int, sql.Column]] = {self.v_max_col: current_version}
491
+ set_clause: dict[sql.Column, int | sql.Column] = {self.v_max_col: current_version}
428
492
  for index_info in self.tbl_version.get().idxs_by_name.values():
429
493
  # copy value column to undo column
430
494
  set_clause[index_info.undo_col.sa_col] = index_info.val_col.sa_col
@@ -451,8 +515,7 @@ class StoreBase:
451
515
  *[c1 == c2 for c1, c2 in zip(self.rowid_columns(), filter_view.rowid_columns())],
452
516
  )
453
517
  stmt = (
454
- sql.select('*') # TODO: Use a more specific list of columns?
455
- .select_from(self.sa_tbl)
518
+ sql.select(self.sa_tbl)
456
519
  .where(self.v_min_col <= version)
457
520
  .where(self.v_max_col > version)
458
521
  .where(sql.exists().where(filter_predicate))
@@ -532,7 +595,7 @@ class StoreComponentView(StoreView):
532
595
  self.rowid_cols.append(self.pos_col)
533
596
  return self.rowid_cols
534
597
 
535
- def create_sa_tbl(self, tbl_version: Optional[catalog.TableVersion] = None) -> None:
598
+ def create_sa_tbl(self, tbl_version: catalog.TableVersion | None = None) -> None:
536
599
  if tbl_version is None:
537
600
  tbl_version = self.tbl_version.get()
538
601
  super().create_sa_tbl(tbl_version)