pixeltable 0.2.26__py3-none-any.whl → 0.5.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (245)
  1. pixeltable/__init__.py +83 -19
  2. pixeltable/_query.py +1444 -0
  3. pixeltable/_version.py +1 -0
  4. pixeltable/catalog/__init__.py +7 -4
  5. pixeltable/catalog/catalog.py +2394 -119
  6. pixeltable/catalog/column.py +225 -104
  7. pixeltable/catalog/dir.py +38 -9
  8. pixeltable/catalog/globals.py +53 -34
  9. pixeltable/catalog/insertable_table.py +265 -115
  10. pixeltable/catalog/path.py +80 -17
  11. pixeltable/catalog/schema_object.py +28 -43
  12. pixeltable/catalog/table.py +1270 -677
  13. pixeltable/catalog/table_metadata.py +103 -0
  14. pixeltable/catalog/table_version.py +1270 -751
  15. pixeltable/catalog/table_version_handle.py +109 -0
  16. pixeltable/catalog/table_version_path.py +137 -42
  17. pixeltable/catalog/tbl_ops.py +53 -0
  18. pixeltable/catalog/update_status.py +191 -0
  19. pixeltable/catalog/view.py +251 -134
  20. pixeltable/config.py +215 -0
  21. pixeltable/env.py +736 -285
  22. pixeltable/exceptions.py +26 -2
  23. pixeltable/exec/__init__.py +7 -2
  24. pixeltable/exec/aggregation_node.py +39 -21
  25. pixeltable/exec/cache_prefetch_node.py +87 -109
  26. pixeltable/exec/cell_materialization_node.py +268 -0
  27. pixeltable/exec/cell_reconstruction_node.py +168 -0
  28. pixeltable/exec/component_iteration_node.py +25 -28
  29. pixeltable/exec/data_row_batch.py +11 -46
  30. pixeltable/exec/exec_context.py +26 -11
  31. pixeltable/exec/exec_node.py +35 -27
  32. pixeltable/exec/expr_eval/__init__.py +3 -0
  33. pixeltable/exec/expr_eval/evaluators.py +365 -0
  34. pixeltable/exec/expr_eval/expr_eval_node.py +413 -0
  35. pixeltable/exec/expr_eval/globals.py +200 -0
  36. pixeltable/exec/expr_eval/row_buffer.py +74 -0
  37. pixeltable/exec/expr_eval/schedulers.py +413 -0
  38. pixeltable/exec/globals.py +35 -0
  39. pixeltable/exec/in_memory_data_node.py +35 -27
  40. pixeltable/exec/object_store_save_node.py +293 -0
  41. pixeltable/exec/row_update_node.py +44 -29
  42. pixeltable/exec/sql_node.py +414 -115
  43. pixeltable/exprs/__init__.py +8 -5
  44. pixeltable/exprs/arithmetic_expr.py +79 -45
  45. pixeltable/exprs/array_slice.py +5 -5
  46. pixeltable/exprs/column_property_ref.py +40 -26
  47. pixeltable/exprs/column_ref.py +254 -61
  48. pixeltable/exprs/comparison.py +14 -9
  49. pixeltable/exprs/compound_predicate.py +9 -10
  50. pixeltable/exprs/data_row.py +213 -72
  51. pixeltable/exprs/expr.py +270 -104
  52. pixeltable/exprs/expr_dict.py +6 -5
  53. pixeltable/exprs/expr_set.py +20 -11
  54. pixeltable/exprs/function_call.py +383 -284
  55. pixeltable/exprs/globals.py +18 -5
  56. pixeltable/exprs/in_predicate.py +7 -7
  57. pixeltable/exprs/inline_expr.py +37 -37
  58. pixeltable/exprs/is_null.py +8 -4
  59. pixeltable/exprs/json_mapper.py +120 -54
  60. pixeltable/exprs/json_path.py +90 -60
  61. pixeltable/exprs/literal.py +61 -16
  62. pixeltable/exprs/method_ref.py +7 -6
  63. pixeltable/exprs/object_ref.py +19 -8
  64. pixeltable/exprs/row_builder.py +238 -75
  65. pixeltable/exprs/rowid_ref.py +53 -15
  66. pixeltable/exprs/similarity_expr.py +65 -50
  67. pixeltable/exprs/sql_element_cache.py +5 -5
  68. pixeltable/exprs/string_op.py +107 -0
  69. pixeltable/exprs/type_cast.py +25 -13
  70. pixeltable/exprs/variable.py +2 -2
  71. pixeltable/func/__init__.py +9 -5
  72. pixeltable/func/aggregate_function.py +197 -92
  73. pixeltable/func/callable_function.py +119 -35
  74. pixeltable/func/expr_template_function.py +101 -48
  75. pixeltable/func/function.py +375 -62
  76. pixeltable/func/function_registry.py +20 -19
  77. pixeltable/func/globals.py +6 -5
  78. pixeltable/func/mcp.py +74 -0
  79. pixeltable/func/query_template_function.py +151 -35
  80. pixeltable/func/signature.py +178 -49
  81. pixeltable/func/tools.py +164 -0
  82. pixeltable/func/udf.py +176 -53
  83. pixeltable/functions/__init__.py +44 -4
  84. pixeltable/functions/anthropic.py +226 -47
  85. pixeltable/functions/audio.py +148 -11
  86. pixeltable/functions/bedrock.py +137 -0
  87. pixeltable/functions/date.py +188 -0
  88. pixeltable/functions/deepseek.py +113 -0
  89. pixeltable/functions/document.py +81 -0
  90. pixeltable/functions/fal.py +76 -0
  91. pixeltable/functions/fireworks.py +72 -20
  92. pixeltable/functions/gemini.py +249 -0
  93. pixeltable/functions/globals.py +208 -53
  94. pixeltable/functions/groq.py +108 -0
  95. pixeltable/functions/huggingface.py +1088 -95
  96. pixeltable/functions/image.py +155 -84
  97. pixeltable/functions/json.py +8 -11
  98. pixeltable/functions/llama_cpp.py +31 -19
  99. pixeltable/functions/math.py +169 -0
  100. pixeltable/functions/mistralai.py +50 -75
  101. pixeltable/functions/net.py +70 -0
  102. pixeltable/functions/ollama.py +29 -36
  103. pixeltable/functions/openai.py +548 -160
  104. pixeltable/functions/openrouter.py +143 -0
  105. pixeltable/functions/replicate.py +15 -14
  106. pixeltable/functions/reve.py +250 -0
  107. pixeltable/functions/string.py +310 -85
  108. pixeltable/functions/timestamp.py +37 -19
  109. pixeltable/functions/together.py +77 -120
  110. pixeltable/functions/twelvelabs.py +188 -0
  111. pixeltable/functions/util.py +7 -2
  112. pixeltable/functions/uuid.py +30 -0
  113. pixeltable/functions/video.py +1528 -117
  114. pixeltable/functions/vision.py +26 -26
  115. pixeltable/functions/voyageai.py +289 -0
  116. pixeltable/functions/whisper.py +19 -10
  117. pixeltable/functions/whisperx.py +179 -0
  118. pixeltable/functions/yolox.py +112 -0
  119. pixeltable/globals.py +716 -236
  120. pixeltable/index/__init__.py +3 -1
  121. pixeltable/index/base.py +17 -21
  122. pixeltable/index/btree.py +32 -22
  123. pixeltable/index/embedding_index.py +155 -92
  124. pixeltable/io/__init__.py +12 -7
  125. pixeltable/io/datarows.py +140 -0
  126. pixeltable/io/external_store.py +83 -125
  127. pixeltable/io/fiftyone.py +24 -33
  128. pixeltable/io/globals.py +47 -182
  129. pixeltable/io/hf_datasets.py +96 -127
  130. pixeltable/io/label_studio.py +171 -156
  131. pixeltable/io/lancedb.py +3 -0
  132. pixeltable/io/pandas.py +136 -115
  133. pixeltable/io/parquet.py +40 -153
  134. pixeltable/io/table_data_conduit.py +702 -0
  135. pixeltable/io/utils.py +100 -0
  136. pixeltable/iterators/__init__.py +8 -4
  137. pixeltable/iterators/audio.py +207 -0
  138. pixeltable/iterators/base.py +9 -3
  139. pixeltable/iterators/document.py +144 -87
  140. pixeltable/iterators/image.py +17 -38
  141. pixeltable/iterators/string.py +15 -12
  142. pixeltable/iterators/video.py +523 -127
  143. pixeltable/metadata/__init__.py +33 -8
  144. pixeltable/metadata/converters/convert_10.py +2 -3
  145. pixeltable/metadata/converters/convert_13.py +2 -2
  146. pixeltable/metadata/converters/convert_15.py +15 -11
  147. pixeltable/metadata/converters/convert_16.py +4 -5
  148. pixeltable/metadata/converters/convert_17.py +4 -5
  149. pixeltable/metadata/converters/convert_18.py +4 -6
  150. pixeltable/metadata/converters/convert_19.py +6 -9
  151. pixeltable/metadata/converters/convert_20.py +3 -6
  152. pixeltable/metadata/converters/convert_21.py +6 -8
  153. pixeltable/metadata/converters/convert_22.py +3 -2
  154. pixeltable/metadata/converters/convert_23.py +33 -0
  155. pixeltable/metadata/converters/convert_24.py +55 -0
  156. pixeltable/metadata/converters/convert_25.py +19 -0
  157. pixeltable/metadata/converters/convert_26.py +23 -0
  158. pixeltable/metadata/converters/convert_27.py +29 -0
  159. pixeltable/metadata/converters/convert_28.py +13 -0
  160. pixeltable/metadata/converters/convert_29.py +110 -0
  161. pixeltable/metadata/converters/convert_30.py +63 -0
  162. pixeltable/metadata/converters/convert_31.py +11 -0
  163. pixeltable/metadata/converters/convert_32.py +15 -0
  164. pixeltable/metadata/converters/convert_33.py +17 -0
  165. pixeltable/metadata/converters/convert_34.py +21 -0
  166. pixeltable/metadata/converters/convert_35.py +9 -0
  167. pixeltable/metadata/converters/convert_36.py +38 -0
  168. pixeltable/metadata/converters/convert_37.py +15 -0
  169. pixeltable/metadata/converters/convert_38.py +39 -0
  170. pixeltable/metadata/converters/convert_39.py +124 -0
  171. pixeltable/metadata/converters/convert_40.py +73 -0
  172. pixeltable/metadata/converters/convert_41.py +12 -0
  173. pixeltable/metadata/converters/convert_42.py +9 -0
  174. pixeltable/metadata/converters/convert_43.py +44 -0
  175. pixeltable/metadata/converters/util.py +44 -18
  176. pixeltable/metadata/notes.py +21 -0
  177. pixeltable/metadata/schema.py +185 -42
  178. pixeltable/metadata/utils.py +74 -0
  179. pixeltable/mypy/__init__.py +3 -0
  180. pixeltable/mypy/mypy_plugin.py +123 -0
  181. pixeltable/plan.py +616 -225
  182. pixeltable/share/__init__.py +3 -0
  183. pixeltable/share/packager.py +797 -0
  184. pixeltable/share/protocol/__init__.py +33 -0
  185. pixeltable/share/protocol/common.py +165 -0
  186. pixeltable/share/protocol/operation_types.py +33 -0
  187. pixeltable/share/protocol/replica.py +119 -0
  188. pixeltable/share/publish.py +349 -0
  189. pixeltable/store.py +398 -232
  190. pixeltable/type_system.py +730 -267
  191. pixeltable/utils/__init__.py +40 -0
  192. pixeltable/utils/arrow.py +201 -29
  193. pixeltable/utils/av.py +298 -0
  194. pixeltable/utils/azure_store.py +346 -0
  195. pixeltable/utils/coco.py +26 -27
  196. pixeltable/utils/code.py +4 -4
  197. pixeltable/utils/console_output.py +46 -0
  198. pixeltable/utils/coroutine.py +24 -0
  199. pixeltable/utils/dbms.py +92 -0
  200. pixeltable/utils/description_helper.py +11 -12
  201. pixeltable/utils/documents.py +60 -61
  202. pixeltable/utils/exception_handler.py +36 -0
  203. pixeltable/utils/filecache.py +38 -22
  204. pixeltable/utils/formatter.py +88 -51
  205. pixeltable/utils/gcs_store.py +295 -0
  206. pixeltable/utils/http.py +133 -0
  207. pixeltable/utils/http_server.py +14 -13
  208. pixeltable/utils/iceberg.py +13 -0
  209. pixeltable/utils/image.py +17 -0
  210. pixeltable/utils/lancedb.py +90 -0
  211. pixeltable/utils/local_store.py +322 -0
  212. pixeltable/utils/misc.py +5 -0
  213. pixeltable/utils/object_stores.py +573 -0
  214. pixeltable/utils/pydantic.py +60 -0
  215. pixeltable/utils/pytorch.py +20 -20
  216. pixeltable/utils/s3_store.py +527 -0
  217. pixeltable/utils/sql.py +32 -5
  218. pixeltable/utils/system.py +30 -0
  219. pixeltable/utils/transactional_directory.py +4 -3
  220. pixeltable-0.5.7.dist-info/METADATA +579 -0
  221. pixeltable-0.5.7.dist-info/RECORD +227 -0
  222. {pixeltable-0.2.26.dist-info → pixeltable-0.5.7.dist-info}/WHEEL +1 -1
  223. pixeltable-0.5.7.dist-info/entry_points.txt +2 -0
  224. pixeltable/__version__.py +0 -3
  225. pixeltable/catalog/named_function.py +0 -36
  226. pixeltable/catalog/path_dict.py +0 -141
  227. pixeltable/dataframe.py +0 -894
  228. pixeltable/exec/expr_eval_node.py +0 -232
  229. pixeltable/ext/__init__.py +0 -14
  230. pixeltable/ext/functions/__init__.py +0 -8
  231. pixeltable/ext/functions/whisperx.py +0 -77
  232. pixeltable/ext/functions/yolox.py +0 -157
  233. pixeltable/tool/create_test_db_dump.py +0 -311
  234. pixeltable/tool/create_test_video.py +0 -81
  235. pixeltable/tool/doc_plugins/griffe.py +0 -50
  236. pixeltable/tool/doc_plugins/mkdocstrings.py +0 -6
  237. pixeltable/tool/doc_plugins/templates/material/udf.html.jinja +0 -135
  238. pixeltable/tool/embed_udf.py +0 -9
  239. pixeltable/tool/mypy_plugin.py +0 -55
  240. pixeltable/utils/media_store.py +0 -76
  241. pixeltable/utils/s3.py +0 -16
  242. pixeltable-0.2.26.dist-info/METADATA +0 -400
  243. pixeltable-0.2.26.dist-info/RECORD +0 -156
  244. pixeltable-0.2.26.dist-info/entry_points.txt +0 -3
  245. {pixeltable-0.2.26.dist-info → pixeltable-0.5.7.dist-info/licenses}/LICENSE +0 -0
pixeltable/store.py CHANGED
@@ -2,23 +2,23 @@ from __future__ import annotations
 
 import abc
 import logging
-import os
 import sys
-import urllib.parse
-import urllib.request
+import time
 import warnings
-from typing import Any, Iterator, Literal, Optional, Union
+from typing import Any, Iterable, Iterator
+from uuid import UUID
 
+import more_itertools
+import psycopg
 import sqlalchemy as sql
 from tqdm import TqdmWarning, tqdm
 
-import pixeltable.catalog as catalog
-import pixeltable.env as env
-import pixeltable.exceptions as excs
-from pixeltable import exprs
+from pixeltable import catalog, exceptions as excs
+from pixeltable.catalog.update_status import RowCountStats
+from pixeltable.env import Env
 from pixeltable.exec import ExecNode
 from pixeltable.metadata import schema
-from pixeltable.utils.media_store import MediaStore
+from pixeltable.utils.exception_handler import run_cleanup
 from pixeltable.utils.sql import log_explain, log_stmt
 
 _logger = logging.getLogger('pixeltable')
@@ -32,24 +32,49 @@ class StoreBase:
     - v_min: version at which the row was created
     - v_max: version at which the row was deleted (or MAX_VERSION if it's still live)
     """
-    tbl_version: catalog.TableVersion
+
+    tbl_version: catalog.TableVersionHandle
     sa_md: sql.MetaData
-    sa_tbl: Optional[sql.Table]
+    sa_tbl: sql.Table | None
     _pk_cols: list[sql.Column]
     v_min_col: sql.Column
     v_max_col: sql.Column
-    base: Optional[StoreBase]
 
-    __INSERT_BATCH_SIZE = 1000
+    # We need to declare a `base` variable here, even though it's only defined for instances of `StoreView`,
+    # since it's referenced by various methods of `StoreBase`
+    _base: StoreBase | None
+
+    # In my cursory experiments this was the optimal batch size: it was an improvement over 5_000 and there was no real
+    # benefit to going higher.
+    # TODO: Perform more rigorous experiments with different table structures and OS environments to refine this.
+    __INSERT_BATCH_SIZE = 10_000
 
     def __init__(self, tbl_version: catalog.TableVersion):
-        self.tbl_version = tbl_version
+        self.tbl_version = tbl_version.handle
         self.sa_md = sql.MetaData()
         self.sa_tbl = None
-        # We need to declare a `base` variable here, even though it's only defined for instances of `StoreView`,
-        # since it's referenced by various methods of `StoreBase`
-        self.base = None if tbl_version.base is None else tbl_version.base.store_tbl
-        self.create_sa_tbl()
+        self._pk_cols = []
+
+        # we initialize _base lazily, because the base may not exist anymore at this point
+        # (but we might still need sa_table to access our store table); do this before create_sa_tbl()
+        self._base = None
+
+        # we're passing in tbl_version to avoid a circular call to TableVersionHandle.get()
+        self.create_sa_tbl(tbl_version)
+
+    @property
+    def base(self) -> StoreBase | None:
+        if self._base is None:
+            tv = self.tbl_version.get()
+            self._base = tv.base.get().store_tbl if tv.base is not None else None
+        return self._base
+
+    @classmethod
+    def storage_name(cls, tbl_id: UUID, is_view: bool) -> str:
+        return f'{"view" if is_view else "tbl"}_{tbl_id.hex}'
+
+    def system_columns(self) -> list[sql.Column]:
+        return [*self._pk_cols, self.v_max_col]
 
     def pk_columns(self) -> list[sql.Column]:
         return self._pk_cols
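
The class docstring above describes a simple MVCC scheme: every store row carries v_min/v_max system columns, and a row is visible at version V exactly when v_min <= V < v_max; live rows keep v_max at the MAX_VERSION sentinel. As a minimal sketch, independent of Pixeltable's classes (the table layout and the sentinel value below are illustrative assumptions), the two visibility predicates look like this in plain SQLAlchemy:

    import sqlalchemy as sql

    MAX_VERSION = 2**63 - 1  # illustrative stand-in for schema.Table.MAX_VERSION

    def visible_at(tbl: sql.Table, version: int) -> sql.Select:
        # a row created at v_min and deleted at v_max is visible at `version`
        # exactly when v_min <= version < v_max
        return sql.select(tbl).where(tbl.c.v_min <= version, tbl.c.v_max > version)

    def live_rows(tbl: sql.Table, current_version: int) -> sql.Select:
        # live rows were never deleted: v_max still holds the sentinel
        return sql.select(tbl).where(tbl.c.v_min <= current_version, tbl.c.v_max == MAX_VERSION)

count() and _versions_clause() in the hunks below apply exactly these predicates to self.sa_tbl.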
@@ -63,25 +88,44 @@ class StoreBase:
 
     def _create_system_columns(self) -> list[sql.Column]:
         """Create and return system columns"""
-        rowid_cols = self._create_rowid_columns()
+        rowid_cols: list[sql.Column]
+        if self._store_tbl_exists():
+            # derive our rowid Columns from the existing table, without having to access self.base.store_tbl:
+            # self.base may not exist anymore (both this table and our base got dropped in the same transaction, and
+            # the base was finalized before this table)
+            with Env.get().begin_xact(for_write=False) as conn:
+                q = (
+                    f'SELECT column_name FROM information_schema.columns WHERE table_name = {self._storage_name()!r} '
+                    'ORDER BY ordinal_position'
+                )
+                col_names = [row[0] for row in conn.execute(sql.text(q)).fetchall()]
+                num_rowid_cols = col_names.index('v_min')
+                rowid_cols = [
+                    sql.Column(col_name, sql.BigInteger, nullable=False) for col_name in col_names[:num_rowid_cols]
+                ]
+        else:
+            rowid_cols = self._create_rowid_columns()
         self.v_min_col = sql.Column('v_min', sql.BigInteger, nullable=False)
-        self.v_max_col = \
-            sql.Column('v_max', sql.BigInteger, nullable=False, server_default=str(schema.Table.MAX_VERSION))
+        self.v_max_col = sql.Column(
+            'v_max', sql.BigInteger, nullable=False, server_default=str(schema.Table.MAX_VERSION)
+        )
         self._pk_cols = [*rowid_cols, self.v_min_col]
         return [*rowid_cols, self.v_min_col, self.v_max_col]
 
-    def create_sa_tbl(self) -> None:
+    def create_sa_tbl(self, tbl_version: catalog.TableVersion | None = None) -> None:
         """Create self.sa_tbl from self.tbl_version."""
+        if tbl_version is None:
+            tbl_version = self.tbl_version.get()
         system_cols = self._create_system_columns()
         all_cols = system_cols.copy()
-        for col in [c for c in self.tbl_version.cols if c.is_stored]:
+        # we captured all columns, including dropped ones: they're still part of the physical table
+        for col in [c for c in tbl_version.cols if c.is_stored]:
             # re-create sql.Column for each column, regardless of whether it already has sa_col set: it was bound
             # to the last sql.Table version we created and cannot be reused
             col.create_sa_cols()
             all_cols.append(col.sa_col)
-            if col.records_errors:
-                all_cols.append(col.sa_errormsg_col)
-                all_cols.append(col.sa_errortype_col)
+            if col.stores_cellmd:
+                all_cols.append(col.sa_cellmd_col)
 
         if self.sa_tbl is not None:
             # if we're called in response to a schema change, we need to remove the old table first
@@ -92,16 +136,17 @@ class StoreBase:
         # - base x view joins can be executed as merge joins
         # - speeds up ORDER BY rowid DESC
         # - allows filtering for a particular table version in index scan
-        idx_name = f'sys_cols_idx_{self.tbl_version.id.hex}'
+        idx_name = f'sys_cols_idx_{tbl_version.id.hex}'
         idxs.append(sql.Index(idx_name, *system_cols))
 
         # v_min/v_max indices: speeds up base table scans needed to propagate a base table insert or delete
-        idx_name = f'vmin_idx_{self.tbl_version.id.hex}'
-        idxs.append(sql.Index(idx_name, self.v_min_col, postgresql_using='brin'))
-        idx_name = f'vmax_idx_{self.tbl_version.id.hex}'
-        idxs.append(sql.Index(idx_name, self.v_max_col, postgresql_using='brin'))
+        idx_name = f'vmin_idx_{tbl_version.id.hex}'
+        idxs.append(sql.Index(idx_name, self.v_min_col, postgresql_using=Env.get().dbms.version_index_type))
+        idx_name = f'vmax_idx_{tbl_version.id.hex}'
+        idxs.append(sql.Index(idx_name, self.v_max_col, postgresql_using=Env.get().dbms.version_index_type))
 
         self.sa_tbl = sql.Table(self._storage_name(), self.sa_md, *all_cols, *idxs)
+        # _logger.debug(f'created sa tbl for {tbl_version.id!s} (sa_tbl={id(self.sa_tbl):x}, tv={id(tbl_version):x})')
 
     @abc.abstractmethod
     def _rowid_join_predicate(self) -> sql.ColumnElement[bool]:
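
The hunk above also replaces the hard-coded BRIN index type with Env.get().dbms.version_index_type, so that backends other than Postgres can substitute a supported index type. For reference, a self-contained sketch of how a BRIN index on a version column is declared in SQLAlchemy against Postgres (table and index names here are illustrative, not Pixeltable's):

    import sqlalchemy as sql

    md = sql.MetaData()
    tbl = sql.Table('tbl_example', md, sql.Column('v_min', sql.BigInteger, nullable=False))

    # BRIN suits v_min: versions grow roughly monotonically with physical row order,
    # so per-block min/max summaries stay selective while the index stays tiny
    idx = sql.Index('vmin_idx_example', tbl.c.v_min, postgresql_using='brin')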
@@ -111,116 +156,169 @@ class StoreBase:
     def _storage_name(self) -> str:
         """Return the name of the data store table"""
 
-    def _move_tmp_media_file(self, file_url: Optional[str], col: catalog.Column, v_min: int) -> str:
-        """Move tmp media file with given url to Env.media_dir and return new url, or given url if not a tmp_dir file"""
-        pxt_tmp_dir = str(env.Env.get().tmp_dir)
-        if file_url is None:
-            return None
-        parsed = urllib.parse.urlparse(file_url)
-        # We should never be passed a local file path here. The "len > 1" ensures that Windows
-        # file paths aren't mistaken for URLs with a single-character scheme.
-        assert len(parsed.scheme) > 1
-        if parsed.scheme != 'file':
-            # remote url
-            return file_url
-        file_path = urllib.parse.unquote(urllib.request.url2pathname(parsed.path))
-        if not file_path.startswith(pxt_tmp_dir):
-            # not a tmp file
-            return file_url
-        _, ext = os.path.splitext(file_path)
-        new_path = str(MediaStore.prepare_media_path(self.tbl_version.id, col.id, v_min, ext=ext))
-        os.rename(file_path, new_path)
-        new_file_url = urllib.parse.urljoin('file:', urllib.request.pathname2url(new_path))
-        return new_file_url
-
-    def _move_tmp_media_files(
-        self, table_rows: list[dict[str, Any]], media_cols: list[catalog.Column], v_min: int
-    ) -> None:
-        """Move tmp media files that we generated to a permanent location"""
-        for c in media_cols:
-            for table_row in table_rows:
-                file_url = table_row[c.store_name()]
-                table_row[c.store_name()] = self._move_tmp_media_file(file_url, c, v_min)
-
-    def _create_table_row(
-        self, input_row: exprs.DataRow, row_builder: exprs.RowBuilder, exc_col_ids: set[int], pk: tuple[int, ...]
-    ) -> tuple[dict[str, Any], int]:
-        """Return Tuple[complete table row, # of exceptions] for insert()
-        Creates a row that includes the PK columns, with the values from input_row.pk.
-        Returns:
-            Tuple[complete table row, # of exceptions]
-        """
-        table_row, num_excs = row_builder.create_table_row(input_row, exc_col_ids)
-        assert len(pk) == len(self._pk_cols)
-        for pk_col, pk_val in zip(self._pk_cols, pk):
-            table_row[pk_col.name] = pk_val
-        return table_row, num_excs
-
-    def count(self, conn: Optional[sql.engine.Connection] = None) -> int:
+    def count(self) -> int:
         """Return the number of rows visible in self.tbl_version"""
         stmt = (
             sql.select(sql.func.count('*'))
             .select_from(self.sa_tbl)
-            .where(self.v_min_col <= self.tbl_version.version)
-            .where(self.v_max_col > self.tbl_version.version)
+            .where(self.v_min_col <= self.tbl_version.get().version)
+            .where(self.v_max_col > self.tbl_version.get().version)
         )
-        if conn is None:
-            with env.Env.get().engine.connect() as conn:
-                result = conn.execute(stmt).scalar_one()
-        else:
-            result = conn.execute(stmt).scalar_one()
+        conn = Env.get().conn
+        result = conn.execute(stmt).scalar_one()
         assert isinstance(result, int)
         return result
 
-    def create(self, conn: sql.engine.Connection) -> None:
-        self.sa_md.create_all(bind=conn)
+    def _exec_if_not_exists(self, stmt: str, wait_for_table: bool) -> None:
+        """
+        Execute a statement containing 'IF NOT EXISTS' and ignore any duplicate object-related errors.
+
+        The statement needs to run in a separate transaction, because the expected error conditions will abort the
+        enclosing transaction (and the ability to run additional statements in that same transaction).
+        """
+        while True:
+            with Env.get().begin_xact(for_write=True) as conn:
+                try:
+                    if wait_for_table and not Env.get().is_using_cockroachdb:
+                        # Try to lock the table to make sure that it exists. This needs to run in the same transaction
+                        # as 'stmt' to avoid a race condition.
+                        # TODO: adapt this for CockroachDB
+                        lock_stmt = f'LOCK TABLE {self._storage_name()} IN ACCESS EXCLUSIVE MODE'
+                        conn.execute(sql.text(lock_stmt))
+                    conn.execute(sql.text(stmt))
+                    return
+                except (sql.exc.IntegrityError, sql.exc.ProgrammingError) as e:
+                    Env.get().console_logger.info(f'{stmt} failed with: {e}')
+                    if (
+                        isinstance(e.orig, psycopg.errors.UniqueViolation)
+                        and 'duplicate key value violates unique constraint' in str(e.orig)
+                    ) or (
+                        isinstance(e.orig, (psycopg.errors.DuplicateObject, psycopg.errors.DuplicateTable))
+                        and 'already exists' in str(e.orig)
+                    ):
+                        # table already exists
+                        return
+                    elif isinstance(e.orig, psycopg.errors.UndefinedTable):
+                        # the Lock Table failed because the table doesn't exist yet; try again
+                        time.sleep(1)
+                        continue
+                    else:
+                        raise
+
+    def _store_tbl_exists(self) -> bool:
+        """Returns True if the store table exists, False otherwise."""
+        with Env.get().begin_xact(for_write=False) as conn:
+            q = (
+                'SELECT COUNT(*) FROM pg_catalog.pg_tables '
+                f"WHERE schemaname = 'public' AND tablename = {self._storage_name()!r}"
+            )
+            res = conn.execute(sql.text(q)).scalar_one()
+            return res == 1
+
+    def create(self) -> None:
+        """
+        Create or update store table to bring it in sync with self.sa_tbl. Idempotent.
 
-    def drop(self, conn: sql.engine.Connection) -> None:
+        This runs a sequence of DDL statements (Create Table, Alter Table Add Column, Create Index), each of which
+        is run in its own transaction.
+
+        The exception to that are local replicas, for which TableRestorer creates an enclosing transaction. In theory,
+        this should avoid the potential for race conditions that motivate the error handling present in
+        _exec_if_not_exists() (meaning: we shouldn't see those errors when creating local replicas).
+        TODO: remove the special case for local replicas in order to make the logic easier to reason about.
+        """
+        postgres_dialect = sql.dialects.postgresql.dialect()
+
+        if not self._store_tbl_exists():
+            # run Create Table If Not Exists; we always need If Not Exists to avoid race conditions between concurrent
+            # Pixeltable processes
+            create_stmt = sql.schema.CreateTable(self.sa_tbl, if_not_exists=True).compile(dialect=postgres_dialect)
+            self._exec_if_not_exists(str(create_stmt), wait_for_table=False)
+        else:
+            # ensure that all columns exist by running Alter Table Add Column If Not Exists for all columns
+            for col in self.sa_tbl.columns:
+                stmt = self._add_column_stmt(col)
+                self._exec_if_not_exists(stmt, wait_for_table=True)
+            # TODO: do we also need to ensure that these columns are now visible (ie, is there another potential race
+            # condition here?)
+
+        # ensure that all system indices exist by running Create Index If Not Exists
+        for idx in self.sa_tbl.indexes:
+            create_idx_stmt = sql.schema.CreateIndex(idx, if_not_exists=True).compile(dialect=postgres_dialect)
+            self._exec_if_not_exists(str(create_idx_stmt), wait_for_table=True)
+
+        # ensure that all visible non-system indices exist by running appropriate create statements
+        for id in self.tbl_version.get().idxs:
+            self.create_index(id)
+
+    def create_index(self, idx_id: int) -> None:
+        """Create If Not Exists for this index"""
+        idx_info = self.tbl_version.get().idxs[idx_id]
+        stmt = idx_info.idx.sa_create_stmt(self.tbl_version.get()._store_idx_name(idx_id), idx_info.val_col.sa_col)
+        self._exec_if_not_exists(str(stmt), wait_for_table=True)
+
+    def validate(self) -> None:
+        """Validate store table against self.table_version"""
+        with Env.get().begin_xact() as conn:
+            # check that all columns are present
+            q = f'SELECT column_name FROM information_schema.columns WHERE table_name = {self._storage_name()!r}'
+            store_col_info = {row[0] for row in conn.execute(sql.text(q)).fetchall()}
+            tbl_col_info = {col.store_name() for col in self.tbl_version.get().cols if col.is_stored}
+            assert tbl_col_info.issubset(store_col_info)
+
+            # check that all visible indices are present
+            q = f'SELECT indexname FROM pg_indexes WHERE tablename = {self._storage_name()!r}'
+            store_idx_names = {row[0] for row in conn.execute(sql.text(q)).fetchall()}
+            tbl_index_names = {
+                self.tbl_version.get()._store_idx_name(info.id) for info in self.tbl_version.get().idxs.values()
+            }
+            assert tbl_index_names.issubset(store_idx_names)
+
+    def drop(self) -> None:
         """Drop store table"""
-        self.sa_md.drop_all(bind=conn)
+        conn = Env.get().conn
+        drop_stmt = f'DROP TABLE IF EXISTS {self._storage_name()}'
+        conn.execute(sql.text(drop_stmt))
+
+    def _add_column_stmt(self, sa_col: sql.Column) -> str:
+        col_type_str = sa_col.type.compile(dialect=sql.dialects.postgresql.dialect())
+        return (
+            f'ALTER TABLE {self._storage_name()} ADD COLUMN IF NOT EXISTS '
+            f'{sa_col.name} {col_type_str} {"NOT " if not sa_col.nullable else ""} NULL'
+        )
 
-    def add_column(self, col: catalog.Column, conn: sql.engine.Connection) -> None:
+    def add_column(self, col: catalog.Column) -> None:
         """Add column(s) to the store-resident table based on a catalog column
 
         Note that a computed catalog column will require two extra columns (for the computed value and for the error
         message).
         """
         assert col.is_stored
-        col_type_str = col.get_sa_col_type().compile(dialect=conn.dialect)
-        stmt = sql.text(f'ALTER TABLE {self._storage_name()} ADD COLUMN {col.store_name()} {col_type_str} NULL')
+        conn = Env.get().conn
+        col_type_str = col.sa_col_type.compile(dialect=conn.dialect)
+        s_txt = f'ALTER TABLE {self._storage_name()} ADD COLUMN {col.store_name()} {col_type_str} NULL'
+        added_storage_cols = [col.store_name()]
+        if col.stores_cellmd:
+            cellmd_type_str = col.sa_cellmd_type().compile(dialect=conn.dialect)
+            s_txt += f' , ADD COLUMN {col.cellmd_store_name()} {cellmd_type_str} DEFAULT NULL'
+            added_storage_cols.append(col.cellmd_store_name())
+
+        stmt = sql.text(s_txt)
         log_stmt(_logger, stmt)
         conn.execute(stmt)
-        added_storage_cols = [col.store_name()]
-        if col.records_errors:
-            # we also need to create the errormsg and errortype storage cols
-            stmt = sql.text(f'ALTER TABLE {self._storage_name()} '
-                            f'ADD COLUMN {col.errormsg_store_name()} VARCHAR DEFAULT NULL')
-            conn.execute(stmt)
-            stmt = sql.text(f'ALTER TABLE {self._storage_name()} '
-                            f'ADD COLUMN {col.errortype_store_name()} VARCHAR DEFAULT NULL')
-            conn.execute(stmt)
-            added_storage_cols.extend([col.errormsg_store_name(), col.errortype_store_name()])
         self.create_sa_tbl()
         _logger.info(f'Added columns {added_storage_cols} to storage table {self._storage_name()}')
 
-    def drop_column(self, col: catalog.Column, conn: sql.engine.Connection) -> None:
+    def drop_column(self, col: catalog.Column) -> None:
         """Execute Alter Table Drop Column statement"""
-        stmt = f'ALTER TABLE {self._storage_name()} DROP COLUMN {col.store_name()}'
-        conn.execute(sql.text(stmt))
-        if col.records_errors:
-            stmt = f'ALTER TABLE {self._storage_name()} DROP COLUMN {col.errormsg_store_name()}'
-            conn.execute(sql.text(stmt))
-            stmt = f'ALTER TABLE {self._storage_name()} DROP COLUMN {col.errortype_store_name()}'
-            conn.execute(sql.text(stmt))
-
-    def load_column(
-        self,
-        col: catalog.Column,
-        exec_plan: ExecNode,
-        value_expr_slot_idx: int,
-        conn: sql.engine.Connection,
-        on_error: Literal['abort', 'ignore']
-    ) -> int:
+        s_txt = f'ALTER TABLE {self._storage_name()} DROP COLUMN {col.store_name()}'
+        if col.stores_cellmd:
+            s_txt += f' , DROP COLUMN {col.cellmd_store_name()}'
+        stmt = sql.text(s_txt)
+        log_stmt(_logger, stmt)
+        Env.get().conn.execute(stmt)
+
+    def load_column(self, col: catalog.Column, exec_plan: ExecNode, abort_on_exc: bool) -> int:
         """Update store column of a computed column with values produced by an execution plan
 
         Returns:
@@ -229,142 +327,169 @@ class StoreBase:
             sql.exc.DBAPIError if there was a SQL error during execution
             excs.Error if on_error='abort' and there was an exception during row evaluation
         """
+        assert col.get_tbl().id == self.tbl_version.id
         num_excs = 0
         num_rows = 0
-
         # create temp table to store output of exec_plan, with the same primary key as the store table
         tmp_name = f'temp_{self._storage_name()}'
-        tmp_pk_cols = [sql.Column(col.name, col.type, primary_key=True) for col in self.pk_columns()]
-        tmp_cols = tmp_pk_cols.copy()
+        tmp_pk_cols = tuple(sql.Column(col.name, col.type, primary_key=True) for col in self.pk_columns())
         tmp_val_col = sql.Column(col.sa_col.name, col.sa_col.type)
-        tmp_cols.append(tmp_val_col)
+        tmp_cols = [*tmp_pk_cols, tmp_val_col]
         # add error columns if the store column records errors
-        if col.records_errors:
-            tmp_errortype_col = sql.Column(col.sa_errortype_col.name, col.sa_errortype_col.type)
-            tmp_cols.append(tmp_errortype_col)
-            tmp_errormsg_col = sql.Column(col.sa_errormsg_col.name, col.sa_errormsg_col.type)
-            tmp_cols.append(tmp_errormsg_col)
+        if col.stores_cellmd:
+            tmp_cellmd_col = sql.Column(col.sa_cellmd_col.name, col.sa_cellmd_col.type)
+            tmp_cols.append(tmp_cellmd_col)
+        tmp_col_names = [col.name for col in tmp_cols]
+
         tmp_tbl = sql.Table(tmp_name, self.sa_md, *tmp_cols, prefixes=['TEMPORARY'])
+        conn = Env.get().conn
         tmp_tbl.create(bind=conn)
 
+        row_builder = exec_plan.row_builder
+
         try:
+            table_rows: list[tuple[Any]] = []
+
             # insert rows from exec_plan into temp table
             for row_batch in exec_plan:
                 num_rows += len(row_batch)
-                tbl_rows: list[dict[str, Any]] = []
-                for result_row in row_batch:
-                    tbl_row: dict[str, Any] = {}
-                    for pk_col, pk_val in zip(self.pk_columns(), result_row.pk):
-                        tbl_row[pk_col.name] = pk_val
-
-                    if col.is_computed:
-                        if result_row.has_exc(value_expr_slot_idx):
-                            num_excs += 1
-                            value_exc = result_row.get_exc(value_expr_slot_idx)
-                            if on_error == 'abort':
-                                raise excs.Error(
-                                    f'Error while evaluating computed column `{col.name}`:\n{value_exc}'
-                                ) from value_exc
-                            # we store a NULL value and record the exception/exc type
-                            error_type = type(value_exc).__name__
-                            error_msg = str(value_exc)
-                            tbl_row[col.sa_col.name] = None
-                            tbl_row[col.sa_errortype_col.name] = error_type
-                            tbl_row[col.sa_errormsg_col.name] = error_msg
-                        else:
-                            val = result_row.get_stored_val(value_expr_slot_idx, col.sa_col.type)
-                            if col.col_type.is_media_type():
-                                val = self._move_tmp_media_file(val, col, result_row.pk[-1])
-                            tbl_row[col.sa_col.name] = val
-                            if col.records_errors:
-                                tbl_row[col.sa_errortype_col.name] = None
-                                tbl_row[col.sa_errormsg_col.name] = None
-
-                    tbl_rows.append(tbl_row)
-                conn.execute(sql.insert(tmp_tbl), tbl_rows)
+                batch_table_rows: list[tuple[Any]] = []
+
+                for row in row_batch:
+                    if abort_on_exc and row.has_exc():
+                        exc = row.get_first_exc()
+                        raise excs.Error(f'Error while evaluating computed column {col.name!r}:\n{exc}') from exc
+                    table_row, num_row_exc = row_builder.create_store_table_row(row, None, row.pk)
+                    num_excs += num_row_exc
+                    batch_table_rows.append(tuple(table_row))
+
+                table_rows.extend(batch_table_rows)
+
+                if len(table_rows) >= self.__INSERT_BATCH_SIZE:
+                    self.sql_insert(tmp_tbl, tmp_col_names, table_rows)
+                    table_rows.clear()
+
+            if len(table_rows) > 0:
+                self.sql_insert(tmp_tbl, tmp_col_names, table_rows)
 
             # update store table with values from temp table
             update_stmt = sql.update(self.sa_tbl)
             for pk_col, tmp_pk_col in zip(self.pk_columns(), tmp_pk_cols):
                 update_stmt = update_stmt.where(pk_col == tmp_pk_col)
             update_stmt = update_stmt.values({col.sa_col: tmp_val_col})
-            if col.records_errors:
-                update_stmt = update_stmt.values({
-                    col.sa_errortype_col: tmp_errortype_col,
-                    col.sa_errormsg_col: tmp_errormsg_col
-                })
+            if col.stores_cellmd:
+                update_stmt = update_stmt.values({col.sa_cellmd_col: tmp_cellmd_col})
             log_explain(_logger, update_stmt, conn)
             conn.execute(update_stmt)
 
         finally:
-            tmp_tbl.drop(bind=conn)
-            self.sa_md.remove(tmp_tbl)
+
+            def remove_tmp_tbl() -> None:
+                self.sa_md.remove(tmp_tbl)
+                tmp_tbl.drop(bind=conn)
+
+            run_cleanup(remove_tmp_tbl, raise_error=False)
+
         return num_excs
 
     def insert_rows(
-        self, exec_plan: ExecNode, conn: sql.engine.Connection, v_min: Optional[int] = None,
-        show_progress: bool = True, rowids: Optional[Iterator[int]] = None, abort_on_exc: bool = False
-    ) -> tuple[int, int, set[int]]:
+        self,
+        exec_plan: ExecNode,
+        v_min: int,
+        show_progress: bool = True,
+        rowids: Iterator[int] | None = None,
+        abort_on_exc: bool = False,
+    ) -> tuple[set[int], RowCountStats]:
         """Insert rows into the store table and update the catalog table's md
         Returns:
             number of inserted rows, number of exceptions, set of column ids that have exceptions
         """
         assert v_min is not None
-        exec_plan.ctx.set_conn(conn)
         # TODO: total?
         num_excs = 0
         num_rows = 0
         cols_with_excs: set[int] = set()
-        progress_bar: Optional[tqdm] = None  # create this only after we started executing
+        progress_bar: tqdm | None = None  # create this only after we started executing
         row_builder = exec_plan.row_builder
-        media_cols = [info.col for info in row_builder.table_columns if info.col.col_type.is_media_type()]
+
+        store_col_names = row_builder.store_column_names()
+
         try:
+            table_rows: list[tuple[Any]] = []
             exec_plan.open()
+
             for row_batch in exec_plan:
                 num_rows += len(row_batch)
-                for batch_start_idx in range(0, len(row_batch), self.__INSERT_BATCH_SIZE):
-                    # compute batch of rows and convert them into table rows
-                    table_rows: list[dict[str, Any]] = []
-                    batch_stop_idx = min(batch_start_idx + self.__INSERT_BATCH_SIZE, len(row_batch))
-                    for row_idx in range(batch_start_idx, batch_stop_idx):
-                        row = row_batch[row_idx]
-                        # if abort_on_exc == True, we need to check for media validation exceptions
-                        if abort_on_exc and row.has_exc():
-                            exc = row.get_first_exc()
-                            raise exc
-
-                        rowid = (next(rowids),) if rowids is not None else row.pk[:-1]
-                        pk = rowid + (v_min,)
-                        table_row, num_row_exc = self._create_table_row(row, row_builder, cols_with_excs, pk=pk)
-                        num_excs += num_row_exc
-                        table_rows.append(table_row)
-
-                        if show_progress:
-                            if progress_bar is None:
-                                warnings.simplefilter("ignore", category=TqdmWarning)
-                                progress_bar = tqdm(
-                                    desc=f'Inserting rows into `{self.tbl_version.name}`',
-                                    unit=' rows',
-                                    ncols=100,
-                                    file=sys.stdout
-                                )
-                            progress_bar.update(1)
-
-                    # insert batch of rows
-                    self._move_tmp_media_files(table_rows, media_cols, v_min)
-                    conn.execute(sql.insert(self.sa_tbl), table_rows)
+                batch_table_rows: list[tuple[Any]] = []
+
+                # compute batch of rows and convert them into table rows
+                for row in row_batch:
+                    # if abort_on_exc == True, we need to check for media validation exceptions
+                    if abort_on_exc and row.has_exc():
+                        exc = row.get_first_exc()
+                        raise exc
+
+                    rowid = (next(rowids),) if rowids is not None else row.pk[:-1]
+                    pk = (*rowid, v_min)
+                    assert len(pk) == len(self._pk_cols)
+                    table_row, num_row_exc = row_builder.create_store_table_row(row, cols_with_excs, pk)
+                    num_excs += num_row_exc
+
+                    if show_progress and Env.get().verbosity >= 1:
+                        if progress_bar is None:
+                            warnings.simplefilter('ignore', category=TqdmWarning)
+                            progress_bar = tqdm(
+                                desc=f'Inserting rows into `{self.tbl_version.get().name}`',
+                                unit=' rows',
+                                ncols=100,
+                                file=sys.stdout,
+                            )
+                        progress_bar.update(1)
+
+                    batch_table_rows.append(tuple(table_row))
+
+                table_rows.extend(batch_table_rows)
+
+                # if a batch is ready for insertion into the database, insert it
+                if len(table_rows) >= self.__INSERT_BATCH_SIZE:
+                    self.sql_insert(self.sa_tbl, store_col_names, table_rows)
+                    table_rows.clear()
+
+            # insert any remaining rows
+            if len(table_rows) > 0:
+                self.sql_insert(self.sa_tbl, store_col_names, table_rows)
+
             if progress_bar is not None:
                 progress_bar.close()
-            return num_rows, num_excs, cols_with_excs
+            computed_values = exec_plan.ctx.num_computed_exprs * num_rows
+            row_counts = RowCountStats(ins_rows=num_rows, num_excs=num_excs, computed_values=computed_values)
+
+            return cols_with_excs, row_counts
         finally:
             exec_plan.close()
 
-    def _versions_clause(self, versions: list[Optional[int]], match_on_vmin: bool) -> sql.ColumnElement[bool]:
+    @classmethod
+    def sql_insert(cls, sa_tbl: sql.Table, store_col_names: list[str], table_rows: list[tuple[Any]]) -> None:
+        assert len(table_rows) > 0
+        conn = Env.get().conn
+        conn.execute(sql.insert(sa_tbl), [dict(zip(store_col_names, table_row)) for table_row in table_rows])
+
+        # TODO: Inserting directly via psycopg delivers a small performance benefit, but is somewhat fraught due to
+        # differences in the data representation that SQLAlchemy/psycopg expect. The below code will do the
+        # insertion in psycopg and can be used if/when we decide to pursue that optimization.
+        # col_names_str = ", ".join(store_col_names)
+        # placeholders_str = ", ".join('%s' for _ in store_col_names)
+        # stmt_text = f'INSERT INTO {self.sa_tbl.name} ({col_names_str}) VALUES ({placeholders_str})'
+        # conn.exec_driver_sql(stmt_text, table_rows)
+
+    def _versions_clause(self, versions: list[int | None], match_on_vmin: bool) -> sql.ColumnElement[bool]:
         """Return filter for base versions"""
         v = versions[0]
         if v is None:
             # we're looking at live rows
-            clause = sql.and_(self.v_min_col <= self.tbl_version.version, self.v_max_col == schema.Table.MAX_VERSION)
+            clause = sql.and_(
+                self.v_min_col <= self.tbl_version.get().version, self.v_max_col == schema.Table.MAX_VERSION
+            )
         else:
             # we're looking at a specific version
             clause = self.v_min_col == v if match_on_vmin else self.v_max_col == v
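
Both load_column() and insert_rows() above now share one accumulate-and-flush structure: converted rows are buffered across exec_plan batches and handed to sql_insert() whenever the buffer reaches __INSERT_BATCH_SIZE, with a final flush for the remainder. Reduced to its essentials, the pattern is (names here are illustrative, not Pixeltable's API):

    from typing import Any, Callable, Iterable

    BATCH_SIZE = 10_000  # mirrors __INSERT_BATCH_SIZE

    def flush_in_batches(
        rows: Iterable[tuple[Any, ...]], flush: Callable[[list[tuple[Any, ...]]], None]
    ) -> None:
        buf: list[tuple[Any, ...]] = []
        for row in rows:
            buf.append(row)
            if len(buf) >= BATCH_SIZE:
                flush(buf)  # one executemany-style round trip per full batch
                buf.clear()
        if buf:
            flush(buf)  # don't drop the final partial batch

Flushing on a fixed row count, rather than once per exec_plan batch as the 0.2.26 code did, keeps the number of INSERT round trips independent of how the plan chunks its output.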
@@ -373,8 +498,12 @@ class StoreBase:
         return sql.and_(clause, self.base._versions_clause(versions[1:], match_on_vmin))
 
     def delete_rows(
-        self, current_version: int, base_versions: list[Optional[int]], match_on_vmin: bool,
-        where_clause: Optional[sql.ColumnElement[bool]], conn: sql.engine.Connection) -> int:
+        self,
+        current_version: int,
+        base_versions: list[int | None],
+        match_on_vmin: bool,
+        where_clause: sql.ColumnElement[bool] | None,
+    ) -> int:
         """Mark rows as deleted that are live and were created prior to current_version.
         Also: populate the undo columns
         Args:
@@ -387,34 +516,63 @@ class StoreBase:
             number of deleted rows
         """
         where_clause = sql.true() if where_clause is None else where_clause
-        where_clause = sql.and_(
-            self.v_min_col < current_version,
-            self.v_max_col == schema.Table.MAX_VERSION,
-            where_clause)
+        version_clause = sql.and_(self.v_min_col < current_version, self.v_max_col == schema.Table.MAX_VERSION)
         rowid_join_clause = self._rowid_join_predicate()
-        base_versions_clause = sql.true() if len(base_versions) == 0 \
-            else self.base._versions_clause(base_versions, match_on_vmin)
-        set_clause: dict[sql.Column, Union[int, sql.Column]] = {self.v_max_col: current_version}
-        for index_info in self.tbl_version.idxs_by_name.values():
+        base_versions_clause = (
+            sql.true() if len(base_versions) == 0 else self.base._versions_clause(base_versions, match_on_vmin)
+        )
+        set_clause: dict[sql.Column, int | sql.Column] = {self.v_max_col: current_version}
+        for index_info in self.tbl_version.get().idxs_by_name.values():
             # copy value column to undo column
             set_clause[index_info.undo_col.sa_col] = index_info.val_col.sa_col
             # set value column to NULL
             set_clause[index_info.val_col.sa_col] = None
+
         stmt = (
             sql.update(self.sa_tbl)
             .values(set_clause)
             .where(where_clause)
+            .where(version_clause)
             .where(rowid_join_clause)
             .where(base_versions_clause)
         )
+        conn = Env.get().conn
         log_explain(_logger, stmt, conn)
         status = conn.execute(stmt)
         return status.rowcount
 
+    def dump_rows(self, version: int, filter_view: StoreBase, filter_view_version: int) -> Iterator[dict[str, Any]]:
+        filter_predicate = sql.and_(
+            filter_view.v_min_col <= filter_view_version,
+            filter_view.v_max_col > filter_view_version,
+            *[c1 == c2 for c1, c2 in zip(self.rowid_columns(), filter_view.rowid_columns())],
+        )
+        stmt = (
+            sql.select(self.sa_tbl)
+            .where(self.v_min_col <= version)
+            .where(self.v_max_col > version)
+            .where(sql.exists().where(filter_predicate))
+        )
+        conn = Env.get().conn
+        _logger.debug(stmt)
+        log_explain(_logger, stmt, conn)
+        result = conn.execute(stmt)
+        for row in result:
+            yield dict(zip(result.keys(), row))
+
+    def load_rows(self, rows: Iterable[dict[str, Any]], batch_size: int = 10_000) -> None:
+        """
+        When instantiating a replica, we can't rely on the usual insertion code path, which contains error handling
+        and other logic that doesn't apply.
+        """
+        conn = Env.get().conn
+        for batch in more_itertools.batched(rows, batch_size):
+            conn.execute(sql.insert(self.sa_tbl), batch)
+
 
 class StoreTable(StoreBase):
     def __init__(self, tbl_version: catalog.TableVersion):
-        assert not tbl_version.is_view()
+        assert not tbl_version.is_view
         super().__init__(tbl_version)
 
     def _create_rowid_columns(self) -> list[sql.Column]:
@@ -430,7 +588,7 @@ class StoreTable(StoreBase):
 
 class StoreView(StoreBase):
     def __init__(self, catalog_view: catalog.TableVersion):
-        assert catalog_view.is_view()
+        assert catalog_view.is_view
         super().__init__(catalog_view)
 
     def _create_rowid_columns(self) -> list[sql.Column]:
@@ -444,7 +602,9 @@ class StoreView(StoreBase):
     def _rowid_join_predicate(self) -> sql.ColumnElement[bool]:
         return sql.and_(
             self.base._rowid_join_predicate(),
-            *[c1 == c2 for c1, c2 in zip(self.rowid_columns(), self.base.rowid_columns())])
+            *[c1 == c2 for c1, c2 in zip(self.rowid_columns(), self.base.rowid_columns())],
+        )
+
 
 class StoreComponentView(StoreView):
     """A view that stores components of its base, as produced by a ComponentIterator
@@ -452,28 +612,34 @@ class StoreComponentView(StoreView):
     PK: now also includes pos, the position returned by the ComponentIterator for the base row identified by base_rowid
     """
 
-    rowid_cols: list[sql.Column]
-    pos_col: sql.Column
-    pos_col_idx: int
-
     def __init__(self, catalog_view: catalog.TableVersion):
         super().__init__(catalog_view)
 
     def _create_rowid_columns(self) -> list[sql.Column]:
         # each base row is expanded into n view rows
-        self.rowid_cols = [sql.Column(c.name, c.type) for c in self.base.rowid_columns()]
+        rowid_cols = [sql.Column(c.name, c.type) for c in self.base.rowid_columns()]
         # name of pos column: avoid collisions with bases' pos columns
-        self.pos_col = sql.Column(f'pos_{len(self.rowid_cols) - 1}', sql.BigInteger, nullable=False)
-        self.pos_col_idx = len(self.rowid_cols)
-        self.rowid_cols.append(self.pos_col)
-        return self.rowid_cols
-
-    def create_sa_tbl(self) -> None:
-        super().create_sa_tbl()
+        pos_col = sql.Column(f'pos_{len(rowid_cols) - 1}', sql.BigInteger, nullable=False)
+        rowid_cols.append(pos_col)
+        return rowid_cols
+
+    @property
+    def pos_col(self) -> sql.Column:
+        return self.rowid_columns()[-1]
+
+    @property
+    def pos_col_idx(self) -> int:
+        return len(self.rowid_columns()) - 1
+
+    def create_sa_tbl(self, tbl_version: catalog.TableVersion | None = None) -> None:
+        if tbl_version is None:
+            tbl_version = self.tbl_version.get()
+        super().create_sa_tbl(tbl_version)
         # we need to fix up the 'pos' column in TableVersion
-        self.tbl_version.cols_by_name['pos'].sa_col = self.pos_col
+        tbl_version.cols_by_name['pos'].sa_col = self.pos_col
 
     def _rowid_join_predicate(self) -> sql.ColumnElement[bool]:
         return sql.and_(
             self.base._rowid_join_predicate(),
-            *[c1 == c2 for c1, c2 in zip(self.rowid_columns()[:-1], self.base.rowid_columns())])
+            *[c1 == c2 for c1, c2 in zip(self.rowid_columns()[:-1], self.base.rowid_columns())],
+        )