pixeltable 0.2.26__py3-none-any.whl → 0.5.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (245)
  1. pixeltable/__init__.py +83 -19
  2. pixeltable/_query.py +1444 -0
  3. pixeltable/_version.py +1 -0
  4. pixeltable/catalog/__init__.py +7 -4
  5. pixeltable/catalog/catalog.py +2394 -119
  6. pixeltable/catalog/column.py +225 -104
  7. pixeltable/catalog/dir.py +38 -9
  8. pixeltable/catalog/globals.py +53 -34
  9. pixeltable/catalog/insertable_table.py +265 -115
  10. pixeltable/catalog/path.py +80 -17
  11. pixeltable/catalog/schema_object.py +28 -43
  12. pixeltable/catalog/table.py +1270 -677
  13. pixeltable/catalog/table_metadata.py +103 -0
  14. pixeltable/catalog/table_version.py +1270 -751
  15. pixeltable/catalog/table_version_handle.py +109 -0
  16. pixeltable/catalog/table_version_path.py +137 -42
  17. pixeltable/catalog/tbl_ops.py +53 -0
  18. pixeltable/catalog/update_status.py +191 -0
  19. pixeltable/catalog/view.py +251 -134
  20. pixeltable/config.py +215 -0
  21. pixeltable/env.py +736 -285
  22. pixeltable/exceptions.py +26 -2
  23. pixeltable/exec/__init__.py +7 -2
  24. pixeltable/exec/aggregation_node.py +39 -21
  25. pixeltable/exec/cache_prefetch_node.py +87 -109
  26. pixeltable/exec/cell_materialization_node.py +268 -0
  27. pixeltable/exec/cell_reconstruction_node.py +168 -0
  28. pixeltable/exec/component_iteration_node.py +25 -28
  29. pixeltable/exec/data_row_batch.py +11 -46
  30. pixeltable/exec/exec_context.py +26 -11
  31. pixeltable/exec/exec_node.py +35 -27
  32. pixeltable/exec/expr_eval/__init__.py +3 -0
  33. pixeltable/exec/expr_eval/evaluators.py +365 -0
  34. pixeltable/exec/expr_eval/expr_eval_node.py +413 -0
  35. pixeltable/exec/expr_eval/globals.py +200 -0
  36. pixeltable/exec/expr_eval/row_buffer.py +74 -0
  37. pixeltable/exec/expr_eval/schedulers.py +413 -0
  38. pixeltable/exec/globals.py +35 -0
  39. pixeltable/exec/in_memory_data_node.py +35 -27
  40. pixeltable/exec/object_store_save_node.py +293 -0
  41. pixeltable/exec/row_update_node.py +44 -29
  42. pixeltable/exec/sql_node.py +414 -115
  43. pixeltable/exprs/__init__.py +8 -5
  44. pixeltable/exprs/arithmetic_expr.py +79 -45
  45. pixeltable/exprs/array_slice.py +5 -5
  46. pixeltable/exprs/column_property_ref.py +40 -26
  47. pixeltable/exprs/column_ref.py +254 -61
  48. pixeltable/exprs/comparison.py +14 -9
  49. pixeltable/exprs/compound_predicate.py +9 -10
  50. pixeltable/exprs/data_row.py +213 -72
  51. pixeltable/exprs/expr.py +270 -104
  52. pixeltable/exprs/expr_dict.py +6 -5
  53. pixeltable/exprs/expr_set.py +20 -11
  54. pixeltable/exprs/function_call.py +383 -284
  55. pixeltable/exprs/globals.py +18 -5
  56. pixeltable/exprs/in_predicate.py +7 -7
  57. pixeltable/exprs/inline_expr.py +37 -37
  58. pixeltable/exprs/is_null.py +8 -4
  59. pixeltable/exprs/json_mapper.py +120 -54
  60. pixeltable/exprs/json_path.py +90 -60
  61. pixeltable/exprs/literal.py +61 -16
  62. pixeltable/exprs/method_ref.py +7 -6
  63. pixeltable/exprs/object_ref.py +19 -8
  64. pixeltable/exprs/row_builder.py +238 -75
  65. pixeltable/exprs/rowid_ref.py +53 -15
  66. pixeltable/exprs/similarity_expr.py +65 -50
  67. pixeltable/exprs/sql_element_cache.py +5 -5
  68. pixeltable/exprs/string_op.py +107 -0
  69. pixeltable/exprs/type_cast.py +25 -13
  70. pixeltable/exprs/variable.py +2 -2
  71. pixeltable/func/__init__.py +9 -5
  72. pixeltable/func/aggregate_function.py +197 -92
  73. pixeltable/func/callable_function.py +119 -35
  74. pixeltable/func/expr_template_function.py +101 -48
  75. pixeltable/func/function.py +375 -62
  76. pixeltable/func/function_registry.py +20 -19
  77. pixeltable/func/globals.py +6 -5
  78. pixeltable/func/mcp.py +74 -0
  79. pixeltable/func/query_template_function.py +151 -35
  80. pixeltable/func/signature.py +178 -49
  81. pixeltable/func/tools.py +164 -0
  82. pixeltable/func/udf.py +176 -53
  83. pixeltable/functions/__init__.py +44 -4
  84. pixeltable/functions/anthropic.py +226 -47
  85. pixeltable/functions/audio.py +148 -11
  86. pixeltable/functions/bedrock.py +137 -0
  87. pixeltable/functions/date.py +188 -0
  88. pixeltable/functions/deepseek.py +113 -0
  89. pixeltable/functions/document.py +81 -0
  90. pixeltable/functions/fal.py +76 -0
  91. pixeltable/functions/fireworks.py +72 -20
  92. pixeltable/functions/gemini.py +249 -0
  93. pixeltable/functions/globals.py +208 -53
  94. pixeltable/functions/groq.py +108 -0
  95. pixeltable/functions/huggingface.py +1088 -95
  96. pixeltable/functions/image.py +155 -84
  97. pixeltable/functions/json.py +8 -11
  98. pixeltable/functions/llama_cpp.py +31 -19
  99. pixeltable/functions/math.py +169 -0
  100. pixeltable/functions/mistralai.py +50 -75
  101. pixeltable/functions/net.py +70 -0
  102. pixeltable/functions/ollama.py +29 -36
  103. pixeltable/functions/openai.py +548 -160
  104. pixeltable/functions/openrouter.py +143 -0
  105. pixeltable/functions/replicate.py +15 -14
  106. pixeltable/functions/reve.py +250 -0
  107. pixeltable/functions/string.py +310 -85
  108. pixeltable/functions/timestamp.py +37 -19
  109. pixeltable/functions/together.py +77 -120
  110. pixeltable/functions/twelvelabs.py +188 -0
  111. pixeltable/functions/util.py +7 -2
  112. pixeltable/functions/uuid.py +30 -0
  113. pixeltable/functions/video.py +1528 -117
  114. pixeltable/functions/vision.py +26 -26
  115. pixeltable/functions/voyageai.py +289 -0
  116. pixeltable/functions/whisper.py +19 -10
  117. pixeltable/functions/whisperx.py +179 -0
  118. pixeltable/functions/yolox.py +112 -0
  119. pixeltable/globals.py +716 -236
  120. pixeltable/index/__init__.py +3 -1
  121. pixeltable/index/base.py +17 -21
  122. pixeltable/index/btree.py +32 -22
  123. pixeltable/index/embedding_index.py +155 -92
  124. pixeltable/io/__init__.py +12 -7
  125. pixeltable/io/datarows.py +140 -0
  126. pixeltable/io/external_store.py +83 -125
  127. pixeltable/io/fiftyone.py +24 -33
  128. pixeltable/io/globals.py +47 -182
  129. pixeltable/io/hf_datasets.py +96 -127
  130. pixeltable/io/label_studio.py +171 -156
  131. pixeltable/io/lancedb.py +3 -0
  132. pixeltable/io/pandas.py +136 -115
  133. pixeltable/io/parquet.py +40 -153
  134. pixeltable/io/table_data_conduit.py +702 -0
  135. pixeltable/io/utils.py +100 -0
  136. pixeltable/iterators/__init__.py +8 -4
  137. pixeltable/iterators/audio.py +207 -0
  138. pixeltable/iterators/base.py +9 -3
  139. pixeltable/iterators/document.py +144 -87
  140. pixeltable/iterators/image.py +17 -38
  141. pixeltable/iterators/string.py +15 -12
  142. pixeltable/iterators/video.py +523 -127
  143. pixeltable/metadata/__init__.py +33 -8
  144. pixeltable/metadata/converters/convert_10.py +2 -3
  145. pixeltable/metadata/converters/convert_13.py +2 -2
  146. pixeltable/metadata/converters/convert_15.py +15 -11
  147. pixeltable/metadata/converters/convert_16.py +4 -5
  148. pixeltable/metadata/converters/convert_17.py +4 -5
  149. pixeltable/metadata/converters/convert_18.py +4 -6
  150. pixeltable/metadata/converters/convert_19.py +6 -9
  151. pixeltable/metadata/converters/convert_20.py +3 -6
  152. pixeltable/metadata/converters/convert_21.py +6 -8
  153. pixeltable/metadata/converters/convert_22.py +3 -2
  154. pixeltable/metadata/converters/convert_23.py +33 -0
  155. pixeltable/metadata/converters/convert_24.py +55 -0
  156. pixeltable/metadata/converters/convert_25.py +19 -0
  157. pixeltable/metadata/converters/convert_26.py +23 -0
  158. pixeltable/metadata/converters/convert_27.py +29 -0
  159. pixeltable/metadata/converters/convert_28.py +13 -0
  160. pixeltable/metadata/converters/convert_29.py +110 -0
  161. pixeltable/metadata/converters/convert_30.py +63 -0
  162. pixeltable/metadata/converters/convert_31.py +11 -0
  163. pixeltable/metadata/converters/convert_32.py +15 -0
  164. pixeltable/metadata/converters/convert_33.py +17 -0
  165. pixeltable/metadata/converters/convert_34.py +21 -0
  166. pixeltable/metadata/converters/convert_35.py +9 -0
  167. pixeltable/metadata/converters/convert_36.py +38 -0
  168. pixeltable/metadata/converters/convert_37.py +15 -0
  169. pixeltable/metadata/converters/convert_38.py +39 -0
  170. pixeltable/metadata/converters/convert_39.py +124 -0
  171. pixeltable/metadata/converters/convert_40.py +73 -0
  172. pixeltable/metadata/converters/convert_41.py +12 -0
  173. pixeltable/metadata/converters/convert_42.py +9 -0
  174. pixeltable/metadata/converters/convert_43.py +44 -0
  175. pixeltable/metadata/converters/util.py +44 -18
  176. pixeltable/metadata/notes.py +21 -0
  177. pixeltable/metadata/schema.py +185 -42
  178. pixeltable/metadata/utils.py +74 -0
  179. pixeltable/mypy/__init__.py +3 -0
  180. pixeltable/mypy/mypy_plugin.py +123 -0
  181. pixeltable/plan.py +616 -225
  182. pixeltable/share/__init__.py +3 -0
  183. pixeltable/share/packager.py +797 -0
  184. pixeltable/share/protocol/__init__.py +33 -0
  185. pixeltable/share/protocol/common.py +165 -0
  186. pixeltable/share/protocol/operation_types.py +33 -0
  187. pixeltable/share/protocol/replica.py +119 -0
  188. pixeltable/share/publish.py +349 -0
  189. pixeltable/store.py +398 -232
  190. pixeltable/type_system.py +730 -267
  191. pixeltable/utils/__init__.py +40 -0
  192. pixeltable/utils/arrow.py +201 -29
  193. pixeltable/utils/av.py +298 -0
  194. pixeltable/utils/azure_store.py +346 -0
  195. pixeltable/utils/coco.py +26 -27
  196. pixeltable/utils/code.py +4 -4
  197. pixeltable/utils/console_output.py +46 -0
  198. pixeltable/utils/coroutine.py +24 -0
  199. pixeltable/utils/dbms.py +92 -0
  200. pixeltable/utils/description_helper.py +11 -12
  201. pixeltable/utils/documents.py +60 -61
  202. pixeltable/utils/exception_handler.py +36 -0
  203. pixeltable/utils/filecache.py +38 -22
  204. pixeltable/utils/formatter.py +88 -51
  205. pixeltable/utils/gcs_store.py +295 -0
  206. pixeltable/utils/http.py +133 -0
  207. pixeltable/utils/http_server.py +14 -13
  208. pixeltable/utils/iceberg.py +13 -0
  209. pixeltable/utils/image.py +17 -0
  210. pixeltable/utils/lancedb.py +90 -0
  211. pixeltable/utils/local_store.py +322 -0
  212. pixeltable/utils/misc.py +5 -0
  213. pixeltable/utils/object_stores.py +573 -0
  214. pixeltable/utils/pydantic.py +60 -0
  215. pixeltable/utils/pytorch.py +20 -20
  216. pixeltable/utils/s3_store.py +527 -0
  217. pixeltable/utils/sql.py +32 -5
  218. pixeltable/utils/system.py +30 -0
  219. pixeltable/utils/transactional_directory.py +4 -3
  220. pixeltable-0.5.7.dist-info/METADATA +579 -0
  221. pixeltable-0.5.7.dist-info/RECORD +227 -0
  222. {pixeltable-0.2.26.dist-info → pixeltable-0.5.7.dist-info}/WHEEL +1 -1
  223. pixeltable-0.5.7.dist-info/entry_points.txt +2 -0
  224. pixeltable/__version__.py +0 -3
  225. pixeltable/catalog/named_function.py +0 -36
  226. pixeltable/catalog/path_dict.py +0 -141
  227. pixeltable/dataframe.py +0 -894
  228. pixeltable/exec/expr_eval_node.py +0 -232
  229. pixeltable/ext/__init__.py +0 -14
  230. pixeltable/ext/functions/__init__.py +0 -8
  231. pixeltable/ext/functions/whisperx.py +0 -77
  232. pixeltable/ext/functions/yolox.py +0 -157
  233. pixeltable/tool/create_test_db_dump.py +0 -311
  234. pixeltable/tool/create_test_video.py +0 -81
  235. pixeltable/tool/doc_plugins/griffe.py +0 -50
  236. pixeltable/tool/doc_plugins/mkdocstrings.py +0 -6
  237. pixeltable/tool/doc_plugins/templates/material/udf.html.jinja +0 -135
  238. pixeltable/tool/embed_udf.py +0 -9
  239. pixeltable/tool/mypy_plugin.py +0 -55
  240. pixeltable/utils/media_store.py +0 -76
  241. pixeltable/utils/s3.py +0 -16
  242. pixeltable-0.2.26.dist-info/METADATA +0 -400
  243. pixeltable-0.2.26.dist-info/RECORD +0 -156
  244. pixeltable-0.2.26.dist-info/entry_points.txt +0 -3
  245. {pixeltable-0.2.26.dist-info → pixeltable-0.5.7.dist-info/licenses}/LICENSE +0 -0
@@ -1,160 +1,2435 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  import dataclasses
4
+ import functools
4
5
  import logging
5
- from typing import Optional
6
+ import random
7
+ import time
8
+ from collections import defaultdict
9
+ from contextlib import contextmanager
10
+ from enum import Enum
11
+ from typing import TYPE_CHECKING, Any, Callable, Iterator, TypeVar
6
12
  from uuid import UUID
7
13
 
14
+ import psycopg
8
15
  import sqlalchemy as sql
9
- import sqlalchemy.orm as orm
16
+ import sqlalchemy.exc as sql_exc
10
17
 
11
- from .table_version import TableVersion
12
- from .table_version_path import TableVersionPath
18
+ from pixeltable import exceptions as excs
19
+ from pixeltable.env import Env
20
+ from pixeltable.iterators import ComponentIterator
21
+ from pixeltable.metadata import schema
22
+ from pixeltable.utils.exception_handler import run_cleanup
23
+
24
+ from .column import Column
25
+ from .dir import Dir
26
+ from .globals import IfExistsParam, IfNotExistsParam, MediaValidation, QColumnId
27
+ from .insertable_table import InsertableTable
28
+ from .path import Path
29
+ from .schema_object import SchemaObject
13
30
  from .table import Table
14
- from .path_dict import PathDict
31
+ from .table_version import TableVersion, TableVersionKey, TableVersionMd
32
+ from .table_version_handle import TableVersionHandle
33
+ from .table_version_path import TableVersionPath
34
+ from .tbl_ops import TableOp
35
+ from .update_status import UpdateStatus
36
+ from .view import View
37
+
38
+ if TYPE_CHECKING:
39
+ from pixeltable.plan import SampleClause
40
+
41
+ from .. import exprs
15
42
 
16
- import pixeltable.env as env
17
- import pixeltable.metadata.schema as schema
18
43
 
19
44
  _logger = logging.getLogger('pixeltable')
20
45
 
46
+
47
+ def _unpack_row(row: sql.engine.Row | None, entities: list[type[sql.orm.decl_api.DeclarativeBase]]) -> list[Any] | None:
48
+ """Convert a Row result into a list of entity instances.
49
+
50
+ Assumes that the query contains a select() of exactly those entities.
51
+ """
52
+ if row is None:
53
+ return None
54
+
55
+ result: list[sql.orm.decl_api.DeclarativeBase] = []
56
+ column_offset = 0
57
+
58
+ for entity in entities:
59
+ num_cols = len(entity.__table__.columns)
60
+ data = {name: row[column_offset + i] for i, name in enumerate(entity.__table__.columns.keys())}
61
+ inst = entity(**data)
62
+ result.append(inst)
63
+ column_offset += num_cols
64
+
65
+ return result
66
+
67
+
68
+ def md_dict_factory(data: list[tuple[str, Any]]) -> dict:
69
+ """Use this to serialize TableMd instances with asdict()"""
70
+ # serialize enums to their values
71
+ return {k: v.value if isinstance(v, Enum) else v for k, v in data}
72
+
73
+
74
+ # -1: unlimited
75
+ # for now, we don't limit the number of retries, because we haven't seen situations where the actual number of retries
76
+ # grows uncontrollably
77
+ _MAX_RETRIES = -1
78
+
79
+ T = TypeVar('T')
80
+
81
+
82
+ def retry_loop(
83
+ *, tbl: TableVersionPath | None = None, for_write: bool, lock_mutable_tree: bool = False
84
+ ) -> Callable[[Callable[..., T]], Callable[..., T]]:
85
+ def decorator(op: Callable[..., T]) -> Callable[..., T]:
86
+ @functools.wraps(op)
87
+ def loop(*args: Any, **kwargs: Any) -> T:
88
+ cat = Catalog.get()
89
+ # retry_loop() is reentrant
90
+ if cat._in_retry_loop:
91
+ return op(*args, **kwargs)
92
+
93
+ num_retries = 0
94
+ while True:
95
+ cat._in_retry_loop = True
96
+ try:
97
+ # in order for retry to work, we need to make sure that there aren't any prior db updates
98
+ # that are part of an ongoing transaction
99
+ assert not Env.get().in_xact
100
+ with Catalog.get().begin_xact(
101
+ tbl=tbl,
102
+ for_write=for_write,
103
+ convert_db_excs=False,
104
+ lock_mutable_tree=lock_mutable_tree,
105
+ finalize_pending_ops=True,
106
+ ):
107
+ return op(*args, **kwargs)
108
+ except PendingTableOpsError as e:
109
+ Env.get().console_logger.debug(f'retry_loop(): finalizing pending ops for {e.tbl_id}')
110
+ Catalog.get()._finalize_pending_ops(e.tbl_id)
111
+ except (sql_exc.DBAPIError, sql_exc.OperationalError) as e:
112
+ # TODO: what other exceptions should we be looking for?
113
+ if isinstance(
114
+ # TODO: Investigate whether DeadlockDetected points to a bug in our locking protocol,
115
+ # which is supposed to be deadlock-free.
116
+ e.orig,
117
+ (
118
+ psycopg.errors.SerializationFailure,
119
+ psycopg.errors.LockNotAvailable,
120
+ psycopg.errors.DeadlockDetected,
121
+ ),
122
+ ):
123
+ if num_retries < _MAX_RETRIES or _MAX_RETRIES == -1:
124
+ num_retries += 1
125
+ _logger.debug(f'Retrying ({num_retries}) after {type(e.orig)}')
126
+ time.sleep(random.uniform(0.1, 0.5))
127
+ else:
128
+ raise excs.Error(f'Serialization retry limit ({_MAX_RETRIES}) exceeded') from e
129
+ else:
130
+ raise
131
+ except Exception as e:
132
+ # for informational/debugging purposes
133
+ _logger.debug(f'retry_loop(): passing along {e}')
134
+ raise
135
+ finally:
136
+ cat._in_retry_loop = False
137
+
138
+ return loop
139
+
140
+ return decorator
141
+
142
+
143
+ class PendingTableOpsError(Exception):
144
+ tbl_id: UUID
145
+
146
+ def __init__(self, tbl_id: UUID) -> None:
147
+ self.tbl_id = tbl_id
148
+
149
+
21
150
  class Catalog:
22
- """A repository of catalog objects"""
23
- _instance: Optional[Catalog] = None
151
+ """The functional interface to getting access to catalog objects
152
+
153
+ All interface functions must be called in the context of a transaction, started with Catalog.begin_xact() or
154
+ via retry_loop().
155
+
156
+ When calling functions that involve Table or TableVersion instances, the catalog needs to get a chance to finalize
157
+ pending ops against those tables. To that end,
158
+ - use begin_xact(tbl) or begin_xact(tbl_id) if only accessing a single table
159
+ - use retry_loop() when accessing multiple tables (eg, pxt.ls())
160
+
161
+ Caching and invalidation of metadata:
162
+ - Catalog caches TableVersion instances in order to avoid excessive metadata loading
163
+ - for any specific table version (ie, combination of id and effective version) there can be only a single
164
+ Tableversion instance in circulation; the reason is that each TV instance has its own store_tbl.sa_tbl, and
165
+ mixing multiple instances of sqlalchemy Table objects in the same query (for the same underlying table) leads to
166
+ duplicate references to that table in the From clause (ie, incorrect Cartesian products)
167
+ - in order to allow multiple concurrent Python processes to perform updates (data and/or schema) against a shared
168
+ Pixeltable instance, Catalog needs to reload metadata from the store when there are changes
169
+ - concurrent changes are detected by comparing TableVersion.version/view_sn with the stored current version
170
+ (TableMd.current_version/view_sn)
171
+ - cached live TableVersion instances (those with effective_version == None) are validated against the stored
172
+ metadata on transaction boundaries; this is recorded in TableVersion.is_validated
173
+ - metadata validation is only needed for live TableVersion instances (snapshot instances are immutable)
174
+ """
175
+
176
+ _instance: Catalog | None = None
177
+
178
+ # cached TableVersion instances; key: [id, version, anchor_tbl_id]
179
+ # - mutable version of a table: version == None (even though TableVersion.version is set correctly)
180
+ # - snapshot versions: records the version of the snapshot
181
+ # - anchored versions: records the tbl_id of the anchor table (used when the table is a replica)
182
+ _tbl_versions: dict[TableVersionKey, TableVersion]
183
+ _tbls: dict[tuple[UUID, int | None], Table]
184
+ _in_write_xact: bool # True if we're in a write transaction
185
+ _x_locked_tbl_ids: set[UUID] # non-empty for write transactions
186
+ _modified_tvs: set[TableVersionHandle] # TableVersion instances modified in the current transaction
187
+ _roll_forward_ids: set[UUID] # ids of Tables that have pending TableOps
188
+ _undo_actions: list[Callable[[], None]]
189
+ _in_retry_loop: bool
190
+
191
+ # cached column dependencies
192
+ # - key: table id, value: mapping from column id to its dependencies
193
+ # - only maintained for dependencies between non-snapshot table versions
194
+ # - can contain stale entries (stemming from invalidated TV instances)
195
+ _column_dependencies: dict[UUID, dict[QColumnId, set[QColumnId]]]
196
+
197
+ # column dependents are recomputed at the beginning of every write transaction and only reflect the locked tree
198
+ _column_dependents: dict[QColumnId, set[QColumnId]] | None
24
199
 
25
200
  @classmethod
26
201
  def get(cls) -> Catalog:
27
202
  if cls._instance is None:
28
203
  cls._instance = cls()
29
- with orm.Session(env.Env.get().engine, future=True) as session:
30
- cls._instance._load_table_versions(session)
31
- #cls._instance._load_functions(session)
32
204
  return cls._instance
33
205
 
34
206
  @classmethod
35
207
  def clear(cls) -> None:
36
208
  """Remove the instance. Used for testing."""
209
+ if cls._instance is not None:
210
+ # invalidate all existing instances to force reloading of metadata
211
+ for tbl_version in cls._instance._tbl_versions.values():
212
+ tbl_version.is_validated = False
37
213
  cls._instance = None
38
214
 
39
215
  def __init__(self) -> None:
40
- # key: [id, version]
41
- # - mutable version of a table: version == None (even though TableVersion.version is set correctly)
42
- # - snapshot versions: records the version of the snapshot
43
- self.tbl_versions: dict[tuple[UUID, Optional[int]], TableVersion] = {}
216
+ self._tbl_versions = {}
217
+ self._tbls = {} # don't use a defaultdict here, it doesn't cooperate with the debugger
218
+ self._in_write_xact = False
219
+ self._x_locked_tbl_ids = set()
220
+ self._modified_tvs = set()
221
+ self._roll_forward_ids = set()
222
+ self._undo_actions = []
223
+ self._in_retry_loop = False
224
+ self._column_dependencies = {}
225
+ self._column_dependents = None
226
+ self._init_store()
44
227
 
45
- self.tbls: dict[UUID, Table] = {} # don't use a defaultdict here, it doesn't cooperate with the debugger
46
- self.tbl_dependents: dict[UUID, list[Table]] = {}
228
+ def _active_tbl_clause(
229
+ self, *, tbl_id: UUID | None = None, dir_id: UUID | None = None, tbl_name: str | None = None
230
+ ) -> sql.ColumnElement[bool]:
231
+ """Create a clause that filters out dropped tables in addition to the specified conditions."""
232
+ # avoid tables that are in the process of getting dropped
233
+ clause = sql.func.coalesce(schema.Table.md['pending_stmt'].astext, '-1') != str(
234
+ schema.TableStatement.DROP_TABLE.value
235
+ )
236
+ if tbl_id is not None:
237
+ clause = sql.and_(schema.Table.id == tbl_id, clause)
238
+ if dir_id is not None:
239
+ clause = sql.and_(schema.Table.dir_id == dir_id, clause)
240
+ if tbl_name is not None:
241
+ clause = sql.and_(schema.Table.md['name'].astext == tbl_name, clause)
242
+ return clause
47
243
 
48
- self._init_store()
49
- self.paths = PathDict() # do this after _init_catalog()
244
+ def _dropped_tbl_error_msg(self, tbl_id: UUID) -> str:
245
+ return f'Table was dropped (no record found for {tbl_id})'
246
+
247
+ def validate(self) -> None:
248
+ """Validate structural consistency of cached metadata"""
249
+ for (tbl_id, effective_version, anchor_tbl_id), tbl_version in self._tbl_versions.items():
250
+ assert tbl_id == tbl_version.id, f'{tbl_id} != {tbl_version.id}'
251
+ assert effective_version is None or anchor_tbl_id is None
252
+ assert tbl_version.effective_version == tbl_version.version or tbl_version.effective_version is None, (
253
+ f'{tbl_version.effective_version} != {tbl_version.version} for id {tbl_id}'
254
+ )
255
+ assert effective_version == tbl_version.effective_version, (
256
+ f'{effective_version} != {tbl_version.effective_version} for id {tbl_id}'
257
+ )
258
+ assert len(tbl_version.mutable_views) == 0 or tbl_version.is_mutable, (
259
+ f'snapshot_id={tbl_version.id} mutable_views={tbl_version.mutable_views}'
260
+ )
261
+
262
+ assert anchor_tbl_id is None or tbl_version.is_replica
263
+
264
+ if tbl_version.is_view and tbl_version.is_mutable and tbl_version.is_validated:
265
+ # make sure this mutable view is recorded in a mutable base
266
+ base = tbl_version.base
267
+ assert base is not None
268
+ if base.effective_version is None:
269
+ key = TableVersionKey(base.id, None, None)
270
+ assert key in self._tbl_versions
271
+ base_tv = self._tbl_versions[key]
272
+ if not base_tv.is_validated:
273
+ continue
274
+ mutable_view_ids = ', '.join(str(tv.id) for tv in self._tbl_versions[key].mutable_views)
275
+ mutable_view_names = ', '.join(
276
+ tv._tbl_version.name
277
+ for tv in self._tbl_versions[key].mutable_views
278
+ if tv._tbl_version is not None
279
+ )
280
+ assert tbl_version.handle in self._tbl_versions[key].mutable_views, (
281
+ f'{tbl_version.name} ({tbl_version.id}) missing in {mutable_view_ids} ({mutable_view_names})'
282
+ )
283
+
284
+ if len(tbl_version.mutable_views) > 0:
285
+ # make sure we also loaded mutable view metadata, which is needed to detect column dependencies
286
+ for v in tbl_version.mutable_views:
287
+ assert v.effective_version is None, f'{v.id}:{v.effective_version}'
288
+
289
+ def mark_modified_tvs(self, *handle: TableVersionHandle) -> None:
290
+ """Record that the given TableVersion instances were modified in the current transaction"""
291
+ assert Env.get().in_xact
292
+ self._modified_tvs.update(handle)
293
+
294
+ @contextmanager
295
+ def begin_xact(
296
+ self,
297
+ *,
298
+ tbl: TableVersionPath | None = None,
299
+ tbl_id: UUID | None = None,
300
+ for_write: bool = False,
301
+ lock_mutable_tree: bool = False,
302
+ convert_db_excs: bool = True,
303
+ finalize_pending_ops: bool = True,
304
+ ) -> Iterator[sql.Connection]:
305
+ """
306
+ Return a context manager that yields a connection to the database. Idempotent.
307
+
308
+ It is mandatory to call this method, not Env.begin_xact(), if the transaction accesses any table data
309
+ or metadata.
310
+
311
+ If tbl != None, follows this locking protocol:
312
+ - validates/reloads the TableVersion instances of tbl's ancestors (in the hope that this reduces potential
313
+ SerializationErrors later on)
314
+ - if for_write == True, x-locks Table record (by updating Table.lock_dummy; see _acquire_tbl_lock())
315
+ - if for_write == False, validates TableVersion instance
316
+ - if lock_mutable_tree == True, also x-locks all mutable views of the table
317
+ - this needs to be done in a retry loop, because Postgres can decide to abort the transaction
318
+ (SerializationFailure, LockNotAvailable)
319
+ - for that reason, we do all lock acquisition prior to doing any real work (eg, compute column values),
320
+ to minimize the probability of losing that work due to a forced abort
321
+
322
+ If convert_db_excs == True, converts DBAPIErrors into excs.Errors.
323
+ """
324
+ assert tbl is None or tbl_id is None # at most one can be specified
325
+ if Env.get().in_xact:
326
+ # make sure that we requested the required table lock at the beginning of the transaction
327
+ if for_write:
328
+ if tbl is not None:
329
+ assert tbl.tbl_id in self._x_locked_tbl_ids, f'{tbl.tbl_id} not in {self._x_locked_tbl_ids}'
330
+ elif tbl_id is not None:
331
+ assert tbl_id in self._x_locked_tbl_ids, f'{tbl_id} not in {self._x_locked_tbl_ids}'
332
+ yield Env.get().conn
333
+ return
334
+
335
+ # tv_msg = '\n'.join(
336
+ # [
337
+ # f'{tv.id}:{tv.effective_version} : tv={id(tv):x} sa_tbl={id(tv.store_tbl.sa_tbl):x}'
338
+ # for tv in self._tbl_versions.values()
339
+ # ]
340
+ # )
341
+ # _logger.debug(f'begin_xact(): {tv_msg}')
342
+ num_retries = 0
343
+ pending_ops_tbl_id: UUID | None = None
344
+ has_exc = False # True if we exited the 'with ...begin_xact()' block with an exception
345
+ while True:
346
+ if pending_ops_tbl_id is not None:
347
+ Env.get().console_logger.debug(f'begin_xact(): finalizing pending ops for {pending_ops_tbl_id}')
348
+ self._finalize_pending_ops(pending_ops_tbl_id)
349
+ pending_ops_tbl_id = None
350
+
351
+ try:
352
+ self._in_write_xact = for_write
353
+ self._x_locked_tbl_ids = set()
354
+ self._modified_tvs = set()
355
+ self._column_dependents = None
356
+ has_exc = False
357
+
358
+ assert not self._undo_actions
359
+ with Env.get().begin_xact(for_write=for_write) as conn:
360
+ if tbl is not None or tbl_id is not None:
361
+ try:
362
+ target: TableVersionHandle | None = None
363
+ if tbl is not None:
364
+ if self._acquire_path_locks(
365
+ tbl=tbl,
366
+ for_write=for_write,
367
+ lock_mutable_tree=lock_mutable_tree,
368
+ check_pending_ops=finalize_pending_ops,
369
+ ):
370
+ target = tbl.tbl_version
371
+ else:
372
+ target = self._acquire_tbl_lock(
373
+ tbl_id=tbl_id,
374
+ for_write=for_write,
375
+ lock_mutable_tree=lock_mutable_tree,
376
+ raise_if_not_exists=True,
377
+ check_pending_ops=finalize_pending_ops,
378
+ )
379
+
380
+ if target is None:
381
+ # didn't get the write lock
382
+ for_write = False
383
+ elif for_write:
384
+ # we know at this point that target is mutable because we got the X-lock
385
+ if lock_mutable_tree and not target.is_snapshot:
386
+ self._x_locked_tbl_ids = self._get_mutable_tree(target.id)
387
+ self._compute_column_dependents(self._x_locked_tbl_ids)
388
+ else:
389
+ self._x_locked_tbl_ids = {target.id}
390
+ if _logger.isEnabledFor(logging.DEBUG):
391
+ # validate only when we don't see errors
392
+ self.validate()
393
+
394
+ except PendingTableOpsError as e:
395
+ has_exc = True
396
+ if finalize_pending_ops:
397
+ # we remember which table id to finalize
398
+ pending_ops_tbl_id = e.tbl_id
399
+ # raise to abort the transaction
400
+ raise
401
+
402
+ except (sql_exc.DBAPIError, sql_exc.OperationalError) as e:
403
+ has_exc = True
404
+ if isinstance(
405
+ e.orig, (psycopg.errors.SerializationFailure, psycopg.errors.LockNotAvailable)
406
+ ) and (num_retries < _MAX_RETRIES or _MAX_RETRIES == -1):
407
+ num_retries += 1
408
+ _logger.debug(f'Retrying ({num_retries}) after {type(e.orig)}')
409
+ time.sleep(random.uniform(0.1, 0.5))
410
+ assert not self._undo_actions # We should not have any undo actions at this point
411
+ continue
412
+ else:
413
+ raise
414
+
415
+ assert not self._undo_actions
416
+ yield conn
417
+ return
418
+
419
+ except PendingTableOpsError:
420
+ has_exc = True
421
+ if pending_ops_tbl_id is not None:
422
+ # the next iteration of the loop will deal with pending ops for this table id
423
+ continue
424
+ else:
425
+ # we got this exception after getting the initial table locks and therefore need to abort
426
+ raise
427
+
428
+ except (sql_exc.DBAPIError, sql_exc.OperationalError, sql_exc.InternalError) as e:
429
+ has_exc = True
430
+ self.convert_sql_exc(e, tbl_id, tbl.tbl_version if tbl is not None else None, convert_db_excs)
431
+ raise # re-raise the error if it didn't convert to a pxt.Error
432
+
433
+ except (Exception, KeyboardInterrupt) as e:
434
+ has_exc = True
435
+ _logger.debug(f'Caught {e.__class__}')
436
+ raise
437
+
438
+ finally:
439
+ self._in_write_xact = False
440
+ self._x_locked_tbl_ids.clear()
441
+ self._column_dependents = None
442
+
443
+ # invalidate cached current TableVersion instances
444
+ for tv in self._tbl_versions.values():
445
+ if tv.effective_version is None:
446
+ _logger.debug(f'invalidating table version {tv} (0x{id(tv):x})')
447
+ tv.is_validated = False
448
+
449
+ if has_exc:
450
+ # Execute undo actions in reverse order (LIFO)
451
+ for hook in reversed(self._undo_actions):
452
+ run_cleanup(hook, raise_error=False)
453
+ # purge all modified TableVersion instances; we can't guarantee they are still consistent with the
454
+ # stored metadata
455
+ for handle in self._modified_tvs:
456
+ self._clear_tv_cache(handle.key)
457
+ # Clear potentially corrupted cached metadata
458
+ if tbl is not None:
459
+ tbl.clear_cached_md()
460
+
461
+ self._undo_actions.clear()
462
+ self._modified_tvs.clear()
463
+
def register_undo_action(self, func: Callable[[], None]) -> Callable[[], None]:
    """Register `func` as an undo action for the current write transaction.

    Undo actions run only when the transaction fails with an exception, in reverse order of
    registration (LIFO). They should not raise; exceptions they do raise are logged and ignored.

    Returns `func` unchanged, so that this method can also be applied as a decorator.
    """
    assert self.in_write_xact
    undo_stack = self._undo_actions
    undo_stack.append(func)
    return func
476
+
def convert_sql_exc(
    self,
    e: sql_exc.StatementError,
    tbl_id: UUID | None = None,
    tbl: TableVersionHandle | None = None,
    convert_db_excs: bool = True,
) -> None:
    """Convert a database error raised during the actual operation into an `excs.Error`, if appropriate.

    Args:
        e: the SQLAlchemy exception; its `orig` attribute is inspected.
        tbl_id: id of the affected table; only used in the debug log message.
        tbl: handle of the affected table; used for the table name in messages.
        convert_db_excs: if False, concurrency-related errors are not converted.

    Raises:
        excs.Error: if `e` was converted. Returns None otherwise, leaving it to the caller to re-raise.
    """
    # UndefinedTable is always converted (it can't be retried): the table got dropped mid-operation
    if isinstance(e.orig, psycopg.errors.UndefinedTable) and tbl is not None:
        tbl_name = tbl.get().name
        _logger.debug(f'Exception: undefined table {tbl_name!r}: Caught {type(e.orig)}: {e!r}')
        raise excs.Error(f'Table was dropped: {tbl_name}') from None

    # TODO: Investigate whether DeadlockDetected points to a bug in our locking protocol,
    # which is supposed to be deadlock-free.
    conflict_excs = (
        psycopg.errors.SerializationFailure,  # serialization error despite getting x-locks
        psycopg.errors.InFailedSqlTransaction,  # can happen after tx fails for another reason
        psycopg.errors.DuplicateColumn,  # if a different process added a column concurrently
        psycopg.errors.DeadlockDetected,  # locking protocol contention
    )
    if not (isinstance(e.orig, conflict_excs) and convert_db_excs):
        return

    if tbl is not None:
        msg = f'{tbl.get().name} ({tbl.id})'
    elif tbl_id is not None:
        msg = f'{tbl_id}'
    else:
        msg = ''
    _logger.debug(f'Exception: {e.orig.__class__}: {msg} ({e})')

    # Suppress the underlying SQL exception unless DEBUG is enabled
    cause = e if _logger.isEnabledFor(logging.DEBUG) else None
    raise excs.Error(
        'That Pixeltable operation could not be completed because it conflicted with another '
        'operation that was run on a different process.\n'
        'Please re-run the operation.'
    ) from cause
522
+
@property
def in_write_xact(self) -> bool:
    """The current value of the `_in_write_xact` flag."""
    return self._in_write_xact
526
+
def _acquire_path_locks(
    self,
    *,
    tbl: TableVersionPath,
    for_write: bool = False,
    lock_mutable_tree: bool = False,
    check_pending_ops: bool = True,
) -> bool:
    """
    Path locking protocol:
    - refresh the cached TableVersions of the ancestors (needed even during inserts, for computed columns
      that reference the base tables)
    - for reads, also refresh the cached TableVersion of tbl itself; for writes, X-lock tbl instead
    - if lock_mutable_tree, also X-lock all mutable views of tbl

    Raises Error if tbl doesn't exist.
    Returns False if the lock couldn't be acquired (X-lock on a non-mutable table), True otherwise.
    """
    handles = tbl.get_tbl_versions()
    # position 0 is tbl itself; when writing it is refreshed only after its lock has been acquired
    to_refresh = handles[1:] if for_write else handles
    for ancestor in reversed(to_refresh):
        self.get_tbl_version(ancestor.key, validate_initialized=True)  # update cache

    if for_write:
        locked = self._acquire_tbl_lock(
            tbl_id=tbl.tbl_id,
            for_write=True,
            lock_mutable_tree=lock_mutable_tree,
            raise_if_not_exists=True,
            check_pending_ops=check_pending_ops,
        )
        self.get_tbl_version(handles[0].key, validate_initialized=True)  # update cache
        return locked is not None

    return True  # nothing left to lock
562
+
def _acquire_tbl_lock(
    self,
    *,
    for_write: bool,
    tbl_id: UUID | None = None,
    dir_id: UUID | None = None,
    tbl_name: str | None = None,
    lock_mutable_tree: bool = False,
    raise_if_not_exists: bool = True,
    check_pending_ops: bool = True,
) -> TableVersionHandle | None:
    """
    For writes: force acquisition of an X-lock on a Table record via a blind update.

    Either tbl_id or dir_id/tbl_name need to be specified. When looking the table up by name, the
    environment's user (if set) is added to the filter.
    If lock_mutable_tree, recursively locks all mutable views of the table.

    Returns a handle to what was locked, or None if
    - the table doesn't exist and raise_if_not_exists is False
    - the lock couldn't be acquired (eg, X-lock on a non-mutable table)

    Raises:
        excs.Error: the table doesn't exist and raise_if_not_exists is True
        PendingTableOpsError: check_pending_ops is True and the table has pending ops
    """
    assert (tbl_id is not None) != (dir_id is not None and tbl_name is not None)
    assert (dir_id is None) == (tbl_name is None)
    where_clause: sql.ColumnElement
    if tbl_id is not None:
        where_clause = schema.Table.id == tbl_id
    else:
        where_clause = sql.and_(schema.Table.dir_id == dir_id, schema.Table.md['name'].astext == tbl_name)
        user = Env.get().user
        if user is not None:
            where_clause = sql.and_(where_clause, schema.Table.md['user'].astext == user)

    conn = Env.get().conn
    q = sql.select(schema.Table).where(where_clause)
    if for_write:
        # NOWAIT: fail immediately instead of blocking if another transaction holds the row lock
        q = q.with_for_update(nowait=True)
    row = conn.execute(q).one_or_none()
    if row is None:
        if raise_if_not_exists:
            raise excs.Error(self._dropped_tbl_error_msg(tbl_id))
        return None  # nothing to lock
    tbl_md = schema.md_from_dict(schema.TableMd, row.md)
    if for_write and tbl_md.is_mutable:
        # blind update of the record
        conn.execute(sql.update(schema.Table).values(lock_dummy=1).where(where_clause))

    if check_pending_ops:
        # check for pending ops after getting table lock
        pending_ops_q = sql.select(sql.func.count()).where(schema.PendingTableOp.tbl_id == row.id)
        has_pending_ops = conn.execute(pending_ops_q).scalar() > 0
        if has_pending_ops:
            raise PendingTableOpsError(row.id)

    # TODO: properly handle concurrency for replicas with live views (once they are supported)
    if for_write and not tbl_md.is_mutable:
        return None  # nothing to lock

    # Use the id of the record we actually found: tbl_id is None when the table was looked up via
    # dir_id/tbl_name, which would otherwise produce a key (and handle) without a table id.
    key = TableVersionKey(row.id, tbl_md.current_version if tbl_md.is_snapshot else None, None)
    if tbl_md.is_mutable and lock_mutable_tree:
        # also lock mutable views
        tv = self.get_tbl_version(key, validate_initialized=True)
        for view in tv.mutable_views:
            self._acquire_tbl_lock(
                for_write=for_write,
                tbl_id=view.id,
                lock_mutable_tree=lock_mutable_tree,
                raise_if_not_exists=raise_if_not_exists,
                check_pending_ops=check_pending_ops,
            )
    return TableVersionHandle(key)
631
+
def _roll_forward(self) -> None:
    """Finalize the pending ops of every table in self._roll_forward_ids and drop its cached TableVersion."""
    for pending_id in self._roll_forward_ids:
        self._finalize_pending_ops(pending_id)
        # TODO: handle replicas
        stale_key = TableVersionKey(pending_id, None, None)
        self._clear_tv_cache(stale_key)
638
+
def _finalize_pending_ops(self, tbl_id: UUID) -> None:
    """Finalizes all pending ops for the given table.

    Each loop iteration handles the pending op with the lowest op_sn:
    - ops with `needs_xact` are executed inside the transaction that read them
    - all other ops are executed outside of a transaction; a second transaction then removes the op record
    After the last op (op_sn == num_ops - 1) the table's state is reset to LIVE and its pending_stmt cleared.

    SerializationFailure and LockNotAvailable errors are retried after a short random sleep; the number of
    retries is not bounded (num_retries is only used for logging).
    """
    num_retries = 0
    while True:
        try:
            tbl_version: int
            # reset on every iteration, so that the except clause below can tell whether an op was read
            op: TableOp | None = None
            delete_next_op_stmt: sql.Delete
            reset_state_stmt: sql.Update
            with self.begin_xact(
                tbl_id=tbl_id, for_write=True, convert_db_excs=False, finalize_pending_ops=False
            ) as conn:
                # fetch (and row-lock) the next pending op together with the table's metadata
                q = (
                    sql.select(schema.Table.md, schema.PendingTableOp)
                    .select_from(schema.Table)
                    .join(schema.PendingTableOp)
                    .where(schema.Table.id == tbl_id)
                    .where(schema.PendingTableOp.tbl_id == tbl_id)
                    .order_by(schema.PendingTableOp.op_sn)
                    .limit(1)
                    .with_for_update()
                )
                row = conn.execute(q).one_or_none()
                if row is None:
                    # no pending ops left
                    return
                view_md = row.md.get('view_md')
                is_snapshot = False if view_md is None else view_md.get('is_snapshot')
                assert is_snapshot is not None
                # snapshots are addressed by their version, everything else by version None
                tbl_version = row.md.get('current_version') if is_snapshot else None
                op = schema.md_from_dict(TableOp, row.op)
                delete_next_op_stmt = sql.delete(schema.PendingTableOp).where(
                    schema.PendingTableOp.tbl_id == tbl_id, schema.PendingTableOp.op_sn == row.op_sn
                )
                # '||' merges the given keys into the stored md
                reset_state_stmt = (
                    sql.update(schema.Table)
                    .where(schema.Table.id == tbl_id)
                    .values(
                        md=schema.Table.md.op('||')(
                            {'tbl_state': schema.TableState.LIVE.value, 'pending_stmt': None}
                        )
                    )
                )
                _logger.debug(f'finalize_pending_ops({tbl_id}): finalizing op {op!s}')

                if op.needs_xact:
                    if op.delete_table_md_op is not None:
                        self.delete_tbl_md(tbl_id)
                    else:
                        tv = self.get_tbl_version(
                            TableVersionKey(tbl_id, tbl_version, None),
                            check_pending_ops=False,
                            validate_initialized=True,
                        )
                        # TODO: The above TableVersionKey instance will need to be updated if we see a replica here.
                        # For now, just assert that we don't.
                        assert not tv.is_replica
                        tv.exec_op(op)

                    # the op and the removal of its record happen in the same transaction
                    conn.execute(delete_next_op_stmt)
                    if op.op_sn == op.num_ops - 1:
                        conn.execute(reset_state_stmt)
                        return
                    # move on to the next pending op (this skips the num_retries reset at the end of the loop)
                    continue

            # this op runs outside of a transaction
            tv = self.get_tbl_version(
                TableVersionKey(tbl_id, tbl_version, None), check_pending_ops=False, validate_initialized=True
            )
            tv.exec_op(op)
            # record completion of the op in a separate transaction
            with self.begin_xact(
                tbl_id=tbl_id, for_write=True, convert_db_excs=False, finalize_pending_ops=False
            ) as conn:
                conn.execute(delete_next_op_stmt)
                if op.op_sn == op.num_ops - 1:
                    conn.execute(reset_state_stmt)
                    return

        except (sql_exc.DBAPIError, sql_exc.OperationalError) as e:
            # TODO: why are we still seeing these here, instead of them getting taken care of by the retry
            # logic of begin_xact()?
            if isinstance(e.orig, (psycopg.errors.SerializationFailure, psycopg.errors.LockNotAvailable)):
                num_retries += 1
                log_msg: str
                if op is not None:
                    log_msg = f'finalize_pending_ops(): retrying ({num_retries}) op {op!s} after {type(e.orig)}'
                else:
                    log_msg = f'finalize_pending_ops(): retrying ({num_retries}) after {type(e.orig)}'
                Env.get().console_logger.debug(log_msg)
                # random back-off before the next attempt
                time.sleep(random.uniform(0.1, 0.5))
                continue
            else:
                raise
        except Exception as e:
            Env.get().console_logger.debug(f'finalize_pending_ops(): caught {e}')
            raise

        # an op that runs outside of a transaction was finalized and more ops remain
        num_retries = 0
736
+
def _debug_str(self) -> str:
    """Return a multi-line listing of the keys of self._tbl_versions and self._tbls (for assertion messages)."""
    tv_keys = '\n'.join(map(str, self._tbl_versions))
    tbl_keys = '\n'.join(map(str, self._tbls))
    return '\n'.join(['tbl_versions:', tv_keys, 'tbls:', tbl_keys])
741
+
def _get_mutable_tree(self, tbl_id: UUID) -> set[UUID]:
    """Collect the ids of the tree of mutable views rooted at tbl_id (the root's id is part of the result)."""
    key = TableVersionKey(tbl_id, None, None)
    assert key in self._tbl_versions, f'{key} not in {self._tbl_versions.keys()}\n{self._debug_str()}'
    root = self.get_tbl_version(key, validate_initialized=True)
    assert not root.is_replica
    tree_ids: set[UUID] = {root.id}
    # depth-first: every mutable view contributes its own subtree
    for child in root.mutable_views:
        tree_ids |= self._get_mutable_tree(child.id)
    return tree_ids
752
+
def _compute_column_dependents(self, mutable_tree: set[UUID]) -> None:
    """Populate self._column_dependents for all tables in mutable_tree.

    Inverts self._column_dependencies (column -> columns it depends on) into a mapping from a column to
    the columns that depend on it. Dependencies on columns of tables outside of mutable_tree are skipped.
    """
    assert self._column_dependents is None
    inverted = defaultdict(set)
    self._column_dependents = inverted
    for tbl_id in mutable_tree:
        assert tbl_id in self._column_dependencies, (
            f'{tbl_id} not in {self._column_dependencies.keys()}\n{self._debug_str()}'
        )
        for dependent_col, refd_cols in self._column_dependencies[tbl_id].items():
            for refd_col in refd_cols:
                if refd_col.tbl_id in mutable_tree:
                    inverted[refd_col].add(dependent_col)
767
+
def record_column_dependencies(self, tbl_version: TableVersion) -> None:
    """Replace the entry for tbl_version in self._column_dependencies. Only valid for mutable versions.

    The entry maps each column that has a value_expr_dict to the ids of the columns referenced by it.
    """
    from pixeltable.exprs import Expr

    assert tbl_version.is_mutable
    self._column_dependencies[tbl_version.id] = {
        QColumnId(tbl_version.id, col.id): Expr.get_refd_column_ids(col.value_expr_dict)
        for col in tbl_version.cols_by_id.values()
        if col.value_expr_dict is not None
    }
779
+
def get_column_dependents(self, tbl_id: UUID, col_id: int) -> set[Column]:
    """Return the Columns recorded in self._column_dependents for the given column.

    Each recorded column id is resolved against the TableVersion (version None) of its table.
    """
    assert self._column_dependents is not None
    dependent_ids = self._column_dependents[QColumnId(tbl_id, col_id)]
    return {
        self.get_tbl_version(
            TableVersionKey(dependent_id.tbl_id, None, None), validate_initialized=True
        ).cols_by_id[dependent_id.col_id]
        for dependent_id in dependent_ids
    }
790
+
def _acquire_dir_xlock(
    self, *, parent_id: UUID | None = None, dir_id: UUID | None = None, dir_name: str | None = None
) -> None:
    """Force acquisition of an X-lock on a Dir record via a blind update.

    Exactly one of dir_id and dir_name must be given.
    If dir_id is given, it is the only filter. Otherwise the record is identified by parent_id
    (where parent_id == None is a valid condition), dir_name and, if set, the user from the environment.
    """
    assert (dir_id is None) != (dir_name is None)
    assert parent_id is None or dir_name is not None
    user = Env.get().user
    assert self._in_write_xact

    stmt = sql.update(schema.Dir).values(lock_dummy=1)
    if dir_id is None:
        stmt = stmt.where(schema.Dir.parent_id == parent_id)
        if dir_name is not None:
            stmt = stmt.where(schema.Dir.md['name'].astext == dir_name)
        if user is not None:
            stmt = stmt.where(schema.Dir.md['user'].astext == user)
    else:
        stmt = stmt.where(schema.Dir.id == dir_id)
    Env.get().conn.execute(stmt)
814
+
def get_dir_path(self, dir_id: UUID) -> Path:
    """Return the path of the directory with the given id.

    Follows the parent_id links upwards, one query per level, until it reaches a directory with an
    empty name.
    """
    assert isinstance(dir_id, UUID)
    conn = Env.get().conn
    leaf_to_root: list[str] = []
    current_id = dir_id
    while True:
        stmt = sql.select(schema.Dir).where(schema.Dir.id == current_id)
        record = schema.Dir(**conn.execute(stmt).one()._mapping)
        dir_name = record.md['name']
        if dir_name == '':
            break
        leaf_to_root.append(dir_name)
        current_id = record.parent_id
    # names were collected bottom-up; the path lists them top-down
    return Path.parse('.'.join(reversed(leaf_to_root)), allow_empty_path=True, allow_system_path=True)
829
+
@dataclasses.dataclass
class DirEntry:
    """An entry of a directory listing, as produced by _get_dir_contents().

    _get_dir_contents() creates entries with `dir` set and `table=None` for subdirectories, and entries
    with `table` set, `dir=None` and empty `dir_entries` for tables.
    """

    # the Dir record, if this entry is a subdirectory
    dir: schema.Dir | None
    # contents of the subdirectory, keyed by entry name; only populated for recursive listings
    dir_entries: dict[str, Catalog.DirEntry]
    # the Table record, if this entry is a table
    table: schema.Table | None
835
+
@retry_loop(for_write=False)
def get_dir_contents(self, dir_path: Path, recursive: bool = False) -> dict[str, DirEntry]:
    """Return the entries of the directory at dir_path, keyed by name; raises Error if it doesn't exist."""
    dir_obj = self._get_schema_object(dir_path, expected=Dir, raise_if_not_exists=True)
    contents = self._get_dir_contents(dir_obj._id, recursive=recursive)
    return contents
840
+
def _get_dir_contents(self, dir_id: UUID, recursive: bool = False) -> dict[str, DirEntry]:
    """Returns a dict mapping the names of the entries of directory dir_id to DirEntry objects.

    Subdirectories are added first (with their own contents if recursive), followed by the tables
    selected by _active_tbl_clause().
    """
    conn = Env.get().conn
    entries: dict[str, Catalog.DirEntry] = {}

    subdir_stmt = sql.select(schema.Dir).where(schema.Dir.parent_id == dir_id)
    for subdir_row in conn.execute(subdir_stmt).all():
        subdir = schema.Dir(**subdir_row._mapping)
        nested = self._get_dir_contents(subdir.id, recursive=True) if recursive else {}
        entries[subdir.md['name']] = self.DirEntry(dir=subdir, dir_entries=nested, table=None)

    tbl_stmt = sql.select(schema.Table).where(self._active_tbl_clause(dir_id=dir_id))
    for tbl_row in conn.execute(tbl_stmt).all():
        tbl_record = schema.Table(**tbl_row._mapping)
        entries[tbl_record.md['name']] = self.DirEntry(dir=None, dir_entries={}, table=tbl_record)

    return entries
862
+
@retry_loop(for_write=True)
def move(self, path: Path, new_path: Path, if_exists: IfExistsParam, if_not_exists: IfNotExistsParam) -> None:
    """Move the schema object at path to new_path; runs _move() inside the write retry loop."""
    self._move(path=path, new_path=new_path, if_exists=if_exists, if_not_exists=if_not_exists)
866
+
def _move(self, path: Path, new_path: Path, if_exists: IfExistsParam, if_not_exists: IfNotExistsParam) -> None:
    """Move the entry at path to new_path.

    Does nothing if the destination already exists (only possible with if_exists == IGNORE) or if the
    source doesn't exist (only possible with if_not_exists == IGNORE).
    """
    dest_obj, dest_dir, src_obj = self._prepare_dir_op(
        add_dir_path=new_path.parent,
        add_name=new_path.name,
        drop_dir_path=path.parent,
        drop_name=path.name,
        raise_if_exists=(if_exists == IfExistsParam.ERROR),
        raise_if_not_exists=(if_not_exists == IfNotExistsParam.ERROR),
    )
    assert dest_obj is None or if_exists == IfExistsParam.IGNORE
    assert src_obj is not None or if_not_exists == IfNotExistsParam.IGNORE

    dest_exists = dest_obj is not None  # `if_exists='ignore'` and the destination already exists
    src_missing = src_obj is None  # `if_not_exists='ignore'` and the source doesn't exist
    if dest_exists or src_missing:
        return
    src_obj._move(new_path.name, dest_dir._id)
883
+
def _prepare_dir_op(
    self,
    add_dir_path: Path | None = None,
    add_name: str | None = None,
    drop_dir_path: Path | None = None,
    drop_name: str | None = None,
    drop_expected: type[SchemaObject] | None = None,
    raise_if_exists: bool = False,
    raise_if_not_exists: bool = False,
) -> tuple[SchemaObject | None, Dir | None, SchemaObject | None]:
    """
    Validates paths and acquires the locks needed for a directory operation, ie, add/drop/rename
    (add + drop) of a directory entry, which is either a table or a directory.

    The operation can include
    - adding an entry (<add_dir_path>.<add_name>)
    - dropping an entry (<drop_dir_path>.<drop_name>)

    Returns: (existing SchemaObject of add path, Dir of add path, existing SchemaObject of drop path)

    Locking protocol:
    - X locks on the immediate parent directories of the added/dropped entries; this prevents concurrent
      modifications of the parent
    - lock parent before child
    - if both add and drop (= two directories are involved), lock the directories in a pre-determined order
      (in this case, by name) in order to prevent deadlocks between concurrent directory modifications
    """
    assert drop_expected in (None, Table, Dir), drop_expected
    assert (add_dir_path is None) == (add_name is None)
    assert (drop_dir_path is None) == (drop_name is None)

    # a set, so that a shared parent directory is only locked once
    parent_paths: set[Path] = {p for p in (add_dir_path, drop_dir_path) if p is not None}

    add_dir: schema.Dir | None = None
    drop_dir: schema.Dir | None = None
    for p in sorted(parent_paths):
        dir_record = self._get_dir(p, lock_dir=True)
        if dir_record is None:
            # Dir does not exist; raise an appropriate error.
            if add_dir_path is not None or add_name is not None:
                raise excs.Error(f'Directory {p!r} does not exist. Create it first with:\npxt.create_dir({p!r})')
            raise excs.Error(f'Directory {p!r} does not exist.')
        if p == add_dir_path:
            add_dir = dir_record
        if p == drop_dir_path:
            drop_dir = dir_record

    # parents are locked at this point; now lock and look up the entries themselves
    add_obj: SchemaObject | None = None
    if add_dir is not None:
        add_obj = self._get_dir_entry(add_dir.id, add_name, lock_entry=True)
        if raise_if_exists and add_obj is not None:
            add_path = add_dir_path.append(add_name)
            raise excs.Error(f'Path {add_path!r} already exists.')

    drop_obj: SchemaObject | None = None
    if drop_dir is not None:
        drop_path = drop_dir_path.append(drop_name)
        drop_obj = self._get_dir_entry(drop_dir.id, drop_name, lock_entry=True)
        if drop_obj is None:
            if raise_if_not_exists:
                raise excs.Error(f'Path {drop_path!r} does not exist.')
        elif drop_expected is not None and not isinstance(drop_obj, drop_expected):
            expected_name = 'table' if drop_expected is Table else 'directory'
            raise excs.Error(f'{drop_path!r} needs to be a {expected_name} but is a {drop_obj._display_name()}')

    if add_dir is None:
        add_dir_obj = None
    else:
        add_dir_obj = Dir(add_dir.id, add_dir.parent_id, add_dir.md['name'])
    return add_obj, add_dir_obj, drop_obj
954
+
def _get_dir_entry(
    self, dir_id: UUID, name: str, version: int | None = None, lock_entry: bool = False
) -> SchemaObject | None:
    """Return the subdirectory or table called `name` in directory dir_id, or None if there is no such entry.

    Subdirectories are checked before tables. If lock_entry is True, locks are requested on the
    candidate Dir and Table records before they are read. `version` is passed on to get_table_by_id().
    """
    user = Env.get().user
    conn = Env.get().conn

    # check for subdirectory
    if lock_entry:
        self._acquire_dir_xlock(parent_id=dir_id, dir_id=None, dir_name=name)
    dir_stmt = sql.select(schema.Dir).where(
        schema.Dir.parent_id == dir_id, schema.Dir.md['name'].astext == name, schema.Dir.md['user'].astext == user
    )
    dir_rows = conn.execute(dir_stmt).all()
    # More than one match can occur if there is a synchronization failure across multiple processes.
    # It indicates database inconsistency.
    if len(dir_rows) > 1:
        raise AssertionError(dir_rows)
    if dir_rows:
        dir_record = schema.Dir(**dir_rows[0]._mapping)
        return Dir(dir_record.id, dir_record.parent_id, name)

    # check for table
    if lock_entry:
        self._acquire_tbl_lock(for_write=True, dir_id=dir_id, raise_if_not_exists=False, tbl_name=name)
    tbl_stmt = sql.select(schema.Table.id).where(
        self._active_tbl_clause(dir_id=dir_id, tbl_name=name), schema.Table.md['user'].astext == user
    )
    tbl_ids = conn.execute(tbl_stmt).scalars().all()
    assert len(tbl_ids) <= 1, name
    if not tbl_ids:
        return None
    return self.get_table_by_id(tbl_ids[0], version)
988
+
def _get_schema_object(
    self,
    path: Path,
    expected: type[SchemaObject] | None = None,
    raise_if_exists: bool = False,
    raise_if_not_exists: bool = False,
    lock_parent: bool = False,
    lock_obj: bool = False,
) -> SchemaObject | None:
    """Look up the schema object at the given path; returns None if it doesn't exist.

    Raises Error if
    - the parent directory doesn't exist
    - raise_if_exists is True and the path exists
    - raise_if_not_exists is True and the path does not exist
    - expected is not None and the existing object has a different type
    """
    assert expected in (None, Table, Dir), expected

    if path.is_root:
        # the root dir
        if expected is not None and expected is not Dir:
            raise excs.Error(f'{path!r} needs to be a table but is a dir')
        root_record = self._get_dir(path, lock_dir=lock_obj)
        if root_record is None:
            raise excs.Error(f'Unknown user: {Env.get().user}')
        return Dir(root_record.id, root_record.parent_id, root_record.md['name'])

    parent_path = path.parent
    parent_dir = self._get_dir(parent_path, lock_dir=lock_parent)
    if parent_dir is None:
        raise excs.Error(f'Directory {parent_path!r} does not exist.')
    obj = self._get_dir_entry(parent_dir.id, path.name, path.version, lock_entry=lock_obj)

    if obj is None:
        if raise_if_not_exists:
            raise excs.Error(f'Path {path!r} does not exist.')
        return None

    # the path exists
    if raise_if_exists:
        raise excs.Error(f'Path {path!r} is an existing {obj._display_name()}.')
    if expected is not None and not isinstance(obj, expected):
        expected_name = 'table' if expected is Table else 'directory'
        raise excs.Error(f'{path!r} needs to be a {expected_name} but is a {obj._display_name()}.')
    return obj
1031
+
def get_table_by_id(
    self, tbl_id: UUID, version: int | None = None, ignore_if_dropped: bool = False
) -> Table | None:
    """Return the Table for (tbl_id, version), loading it if it isn't in self._tbls yet.

    Must be executed inside a transaction. Might raise PendingTableOpsError.
    ignore_if_dropped is only passed on when loading the current version (version is None).
    """
    cache_key = (tbl_id, version)
    if cache_key in self._tbls:
        return self._tbls.get(cache_key)
    if version is None:
        return self._load_tbl(tbl_id, ignore_pending_drop=ignore_if_dropped)
    return self._load_tbl_at_version(tbl_id, version)
1042
+
def create_table(
    self,
    path: Path,
    schema: dict[str, Any],
    if_exists: IfExistsParam,
    primary_key: list[str] | None,
    num_retained_versions: int,
    comment: str,
    media_validation: MediaValidation,
    create_default_idxs: bool,
) -> tuple[Table, bool]:
    """
    Creates a new InsertableTable at the given path.

    If `if_exists == IfExistsParam.IGNORE` and a table `t` already exists at the given path, returns `t, False`.

    Otherwise, creates a new table `t` and returns `t, True` (or raises an exception if the operation fails).

    The "Created table" log and console messages are only emitted when a table was actually created.
    """

    @retry_loop(for_write=True)
    def create_fn() -> tuple[UUID, bool]:
        # the `schema` parameter shadows the metadata schema module, hence the fully qualified import
        import pixeltable.metadata.schema

        existing = self._handle_path_collision(path, InsertableTable, False, if_exists)
        if existing is not None:
            assert isinstance(existing, Table)
            return existing._id, False

        parent_dir = self._get_schema_object(path.parent, expected=Dir, raise_if_not_exists=True)
        assert parent_dir is not None

        md, ops = InsertableTable._create(
            path.name,
            schema,
            primary_key=primary_key,
            num_retained_versions=num_retained_versions,
            comment=comment,
            media_validation=media_validation,
            create_default_idxs=create_default_idxs,
        )
        tbl_id = UUID(md.tbl_md.tbl_id)
        md.tbl_md.pending_stmt = pixeltable.metadata.schema.TableStatement.CREATE_TABLE
        self.write_tbl_md(tbl_id, parent_dir._id, md.tbl_md, md.version_md, md.schema_version_md, ops)
        return tbl_id, True

    self._roll_forward_ids.clear()
    tbl_id, is_created = create_fn()
    # finalize the pending ops for all tables in self._roll_forward_ids
    self._roll_forward()
    with self.begin_xact(tbl_id=tbl_id, for_write=True):
        tbl = self.get_table_by_id(tbl_id)
        if is_created:
            # don't claim that a table was created when an existing one is being returned
            _logger.info(f'Created table {tbl._name!r}, id={tbl._id}')
            Env.get().console_logger.info(f'Created table {tbl._name!r}.')
        return tbl, is_created
1096
+
def create_view(
    self,
    path: Path,
    base: TableVersionPath,
    select_list: list[tuple[exprs.Expr, str | None]] | None,
    where: exprs.Expr | None,
    sample_clause: 'SampleClause' | None,
    additional_columns: dict[str, Any] | None,
    is_snapshot: bool,
    create_default_idxs: bool,
    iterator: tuple[type[ComponentIterator], dict[str, Any]] | None,
    num_retained_versions: int,
    comment: str,
    media_validation: MediaValidation,
    if_exists: IfExistsParam,
) -> Table:
    """
    Creates a view (or snapshot, if is_snapshot) of `base` at the given path and returns it.

    If the path collision check yields an existing View, that view is returned instead of creating a new one.
    The metadata is written with pending_stmt == CREATE_VIEW inside a retried write transaction; pending
    ops are then finalized via _roll_forward() before the table is loaded in a separate transaction.
    """

    @retry_loop(for_write=True)
    def create_fn() -> UUID:
        if not is_snapshot and base.is_mutable():
            # this is a mutable view of a mutable base; X-lock the base and advance its view_sn before adding
            # the view
            self._acquire_tbl_lock(tbl_id=base.tbl_id, for_write=True)
            base_tv = self.get_tbl_version(TableVersionKey(base.tbl_id, None, None), validate_initialized=True)
            base_tv.tbl_md.view_sn += 1
            # persist the updated base metadata
            result = Env.get().conn.execute(
                sql.update(schema.Table)
                .values({schema.Table.md: dataclasses.asdict(base_tv.tbl_md, dict_factory=md_dict_factory)})
                .where(schema.Table.id == base.tbl_id)
            )
            assert result.rowcount == 1, result.rowcount

        # note: this runs after the base's view_sn has already been advanced
        existing = self._handle_path_collision(path, View, is_snapshot, if_exists, base=base)
        if existing is not None:
            assert isinstance(existing, View)
            return existing._id

        dir = self._get_schema_object(path.parent, expected=Dir, raise_if_not_exists=True)
        assert dir is not None
        if iterator is None:
            iterator_class, iterator_args = None, None
        else:
            iterator_class, iterator_args = iterator
        md, ops = View._create(
            dir._id,
            path.name,
            base=base,
            select_list=select_list,
            additional_columns=additional_columns,
            predicate=where,
            sample_clause=sample_clause,
            is_snapshot=is_snapshot,
            create_default_idxs=create_default_idxs,
            iterator_cls=iterator_class,
            iterator_args=iterator_args,
            num_retained_versions=num_retained_versions,
            comment=comment,
            media_validation=media_validation,
        )
        tbl_id = UUID(md.tbl_md.tbl_id)
        md.tbl_md.pending_stmt = schema.TableStatement.CREATE_VIEW
        self.write_tbl_md(tbl_id, dir._id, md.tbl_md, md.version_md, md.schema_version_md, ops)
        return tbl_id

    self._roll_forward_ids.clear()
    view_id = create_fn()
    if not is_snapshot and base.is_mutable():
        # invalidate base's TableVersion instance, so that it gets reloaded with the new mutable view
        self._clear_tv_cache(base.tbl_version.key)
        # base_tv = self.get_tbl_version(base.tbl_id, base.tbl_version.effective_version, validate_initialized=True)
        # view_handle = TableVersionHandle(view_id, effective_version=None)
        # base_tv.mutable_views.add(view_handle)

    # finalize the pending ops for all tables in self._roll_forward_ids
    self._roll_forward()
    with self.begin_xact(tbl_id=view_id, for_write=True):
        return self.get_table_by_id(view_id)
1172
+
def _clear_tv_cache(self, key: TableVersionKey) -> None:
    """Remove the entry for `key` from self._tbl_versions, marking the removed instance as not validated.

    Does nothing if there is no entry for `key`.
    """
    cache = self._tbl_versions
    if key not in cache:
        return
    # anyone still holding on to the instance can see that it is no longer validated
    cache[key].is_validated = False
    del cache[key]
1178
+
1179
def create_replica(self, path: Path, md: list[TableVersionMd], create_store_tbls: bool = True) -> None:
    """
    Creates table, table_version, and table_schema_version records for a replica with the given metadata.

    The metadata should be presented in standard "ancestor order", with the table being replicated at
    list position 0 and the (root) base table at list position -1.

    Args:
        path: catalog path under which the replicated table should end up.
        md: metadata of the replicated table (position 0) followed by the metadata of its ancestors.
        create_store_tbls: passed through to `__store_replica_md()` for the replicated table only; the
            ancestors are stored with that method's default.

    Raises:
        excs.Error: if a different table is found at `path`, or if the table is already present in the catalog
            under a different, non-system path.
    """
    assert self.in_write_xact

    # Lock every table of the hierarchy that might already exist, walking from the base table to the replica.
    for ancestor_md in reversed(md):
        self._acquire_tbl_lock(for_write=True, tbl_id=UUID(ancestor_md.tbl_md.tbl_id), raise_if_not_exists=False)

    tbl_id = UUID(md[0].tbl_md.tbl_id)

    existing = self._handle_path_collision(path, Table, False, if_exists=IfExistsParam.IGNORE)  # type: ignore[type-abstract]
    if existing is not None and existing._id != tbl_id:
        raise excs.Error(
            f'An attempt was made to create a replica table at {path!r}, '
            'but a different table already exists at that location.'
        )

    # The system directory must be present before any anonymous ancestor can be stored in it.
    self.__ensure_system_dir_exists()

    # Is this table already known to the catalog?
    existing = self.get_table_by_id(tbl_id)
    if existing is not None:
        existing_path = Path.parse(existing._path(), allow_system_path=True)
        if existing_path != path and not existing_path.is_system_path:
            # Known, and visible under some other user path.
            raise excs.Error(
                f'That table has already been replicated as {existing_path!r}.\n'
                f'Drop the existing replica if you wish to re-create it.'
            )

    # Store the metadata of the proper ancestors, beginning with the base table and ending with the immediate
    # parent of the table being replicated. Ancestors that are not yet in the store become anonymous system
    # tables.
    for ancestor_md in reversed(md[1:]):
        ancestor_id = UUID(ancestor_md.tbl_md.tbl_id)
        replica = self.get_table_by_id(ancestor_id)
        # Unknown table: give it a fresh anonymous system path. Known table: keep whatever path it has,
        # whether that is a system path (anonymous base of some other table) or a user path (a snapshot the
        # user replicated directly at some point).
        replica_path_str = f'_system.replica_{ancestor_id.hex}' if replica is None else replica._path()
        replica_path: Path = Path.parse(replica_path_str, allow_system_path=True)

        # This is either a new version (a new record gets created, including a new TableVersion record) or a
        # known version (the received metadata gets validated against the stored metadata).
        self.__store_replica_md(replica_path, ancestor_md)

        # Drop the ancestor's cached metadata so that the next table operation picks up the new TableVersion
        # instance: computed columns of descendant tables might reference ancestor columns that only exist in
        # the new version.
        replica = self.get_table_by_id(ancestor_id)
        # assert replica is not None  # If it didn't exist before, it must have been created by now.
        if replica is not None:
            replica._tbl_version_path.clear_cached_md()

    # Store the metadata of the replicated table itself; again either a new or a known version. A new version
    # results in a TableVersion record unless the table is a pure snapshot.
    self.__store_replica_md(path, md[0], create_store_tbls)

    # The table may have been present all along as an anonymous system table that was hidden at the time of the
    # earlier check and only became visible through this import. If so, move it to the requested path. This
    # has to happen last, because the table first needs a non-fragment table version before it can be
    # instantiated as a schema object.
    existing = self.get_table_by_id(tbl_id)
    assert existing is not None
    existing_path = Path.parse(existing._path(), allow_system_path=True)
    if existing_path != path:
        assert existing_path.is_system_path
        self._move(existing_path, path, IfExistsParam.ERROR, IfNotExistsParam.ERROR)
1260
+
1261
def __ensure_system_dir_exists(self) -> Dir:
    """Return the `_system` directory, creating it via `_create_dir()` with `if_exists=IGNORE`."""
    return self._create_dir(
        Path.parse('_system', allow_system_path=True), if_exists=IfExistsParam.IGNORE, parents=False
    )
1264
+
1265
def __store_replica_md(self, path: Path, md: TableVersionMd, create_store_tbl: bool = True) -> None:
    """
    Record the metadata of one replica table (or replica ancestor) in the catalog.

    A Table record is inserted if none exists for the table id. The version and schema version metadata are
    either queued for writing (when unknown) or checked against the records already stored. All queued
    metadata is then handed to `write_tbl_md()`.

    Args:
        path: path of the replica; its parent must resolve to an existing directory and its name is stored
            as the table name.
        md: metadata to store.
        create_store_tbl: passed through to `TableVersion.create_replica()`.

    Raises:
        excs.Error: if the table already exists as a non-replica, or if stored version / schema version
            metadata differs from the received metadata.
    """
    _logger.info(f'Creating replica table at {path!r} with ID: {md.tbl_md.tbl_id}')
    parent_dir = self._get_schema_object(path.parent, expected=Dir, raise_if_not_exists=True)
    assert parent_dir is not None
    assert self._in_write_xact

    conn = Env.get().conn
    tbl_id = md.tbl_md.tbl_id

    # Metadata records to be written at the end; None means "leave the stored record alone".
    new_tbl_md: schema.TableMd | None = None
    new_version_md: schema.VersionMd | None = None
    new_schema_version_md: schema.SchemaVersionMd | None = None
    is_new_tbl_version: bool = False

    # We need to ensure that the table metadata in the catalog always reflects the latest observed version of
    # this table. (In particular, if this is a base table, then its table metadata need to be consistent
    # with the latest version of this table having a replicated view somewhere in the catalog.)
    # TODO: handle concurrent drop() of an existing replica; if we just ignore that Table record here, we can end
    # up with a duplicate key violation; in principle, we should wait for the concurrent drop() to finish
    tbl_md_stmt: sql.Executable = sql.select(schema.Table.md).where(schema.Table.id == tbl_id)
    existing_md_row = conn.execute(tbl_md_stmt).one_or_none()

    # Stamp the metadata with the given name, the current user, and the is_replica flag.
    replica_tbl_md = dataclasses.replace(md.tbl_md, name=path.name, user=Env.get().user, is_replica=True)
    md = dataclasses.replace(md, tbl_md=replica_tbl_md)

    if existing_md_row is None:
        # The table is unknown so far: create its record.
        insert_stmt = sql.insert(schema.Table.__table__).values(
            id=tbl_id, dir_id=parent_dir._id, md=dataclasses.asdict(md.tbl_md, dict_factory=md_dict_factory)
        )
        conn.execute(insert_stmt)
    else:
        if not existing_md_row.md['is_replica']:
            raise excs.Error(
                'An attempt was made to replicate a view whose base table already exists in the local catalog '
                'in its original form.\n'
                'If this is intentional, you must first drop the existing base table:\n'
                f' pxt.drop_table({str(path)!r})'
            )
        if md.tbl_md.current_version > existing_md_row.md['current_version']:
            # The received metadata is more recent than what the DB holds; the record gets updated in place.
            new_tbl_md = md.tbl_md

    # Look for a TableVersion record of this table version. Insert it if missing; otherwise the stored record
    # must match the received one.
    version_stmt = (
        sql.select(schema.TableVersion.md)
        .where(schema.TableVersion.tbl_id == tbl_id)
        .where(schema.TableVersion.md['version'].cast(sql.Integer) == md.version_md.version)
    )
    existing_version_md_row = conn.execute(version_stmt).one_or_none()
    if existing_version_md_row is None:
        new_version_md = md.version_md
        is_new_tbl_version = True
    else:
        existing_version_md = schema.md_from_dict(schema.VersionMd, existing_version_md_row.md)
        # is_fragment and additional_md are allowed to differ, so they are overwritten with the received
        # values before comparing.
        comparable_version_md = dataclasses.replace(
            existing_version_md,
            is_fragment=md.version_md.is_fragment,
            additional_md=md.version_md.additional_md,
        )
        if comparable_version_md != md.version_md:
            raise excs.Error(
                f'The version metadata for the replica {path!r}:{md.version_md.version} is inconsistent with '
                'the metadata recorded from a prior replica.\n'
                'This is likely due to data corruption in the replicated table.'
            )
        if existing_version_md.is_fragment and not md.version_md.is_fragment:
            # Stored as a fragment, but a complete copy of the same version is being imported:
            # the stored is_fragment flag gets cleared.
            new_version_md = md.version_md

    # Same procedure for TableSchemaVersion.
    schema_version_stmt = (
        sql.select(schema.TableSchemaVersion.md)
        .where(schema.TableSchemaVersion.tbl_id == tbl_id)
        .where(
            schema.TableSchemaVersion.md['schema_version'].cast(sql.Integer) == md.schema_version_md.schema_version
        )
    )
    existing_schema_version_md_row = conn.execute(schema_version_stmt).one_or_none()
    if existing_schema_version_md_row is None:
        new_schema_version_md = md.schema_version_md
    else:
        existing_schema_version_md = schema.md_from_dict(schema.SchemaVersionMd, existing_schema_version_md_row.md)
        # Here the stored and the received metadata must be identical.
        if existing_schema_version_md != md.schema_version_md:
            raise excs.Error(
                f'The schema version metadata for the replica {path!r}:{md.schema_version_md.schema_version} '
                'is inconsistent with the metadata recorded from a prior replica.\n'
                'This is likely due to data corruption in the replicated table.'
            )

    self.write_tbl_md(UUID(tbl_id), None, new_tbl_md, new_version_md, new_schema_version_md)

    if is_new_tbl_version and not md.is_pure_snapshot:
        # A new version of a table with a physical store: a TableVersion instance has to be created.
        TableVersion.create_replica(md, create_store_tbl)
1368
+
1369
def get_additional_md(self, tbl_id: UUID) -> dict[str, Any]:
    """
    Return the additional_md field of the given table.

    Must be called inside a transaction. The lookup uses `one()`, so it fails unless exactly one row matches
    the active-table clause for `tbl_id`.
    """
    assert Env.get().in_xact
    conn = Env.get().conn
    stmt = sql.select(schema.Table.additional_md).where(self._active_tbl_clause(tbl_id=tbl_id))
    # TODO: handle concurrent drop()
    additional_md = conn.execute(stmt).one()[0]
    assert isinstance(additional_md, dict)
    return additional_md
1378
+
1379
def update_additional_md(self, tbl_id: UUID, additional_md: dict[str, Any]) -> None:
    """
    Update the additional_md field of the given table. The new additional_md is merged with the
    existing one via a JSON dictionary merge, giving preference to the new values.

    Must be called inside a write transaction; exactly one row is expected to be updated.
    """
    assert self._in_write_xact
    conn = Env.get().conn
    # the merge is expressed with the `||` operator applied to the stored column
    merged_md = schema.Table.additional_md.op('||')(additional_md)
    stmt = sql.update(schema.Table).where(schema.Table.id == str(tbl_id)).values({schema.Table.additional_md: merged_md})
    result = conn.execute(stmt)
    assert result.rowcount == 1, result.rowcount
1393
+
1394
@retry_loop(for_write=False)
def get_table(self, path: Path, if_not_exists: IfNotExistsParam) -> Table | None:
    """
    Look up the table at `path`.

    `raise_if_not_exists` is set for the lookup only when `if_not_exists` is `IfNotExistsParam.ERROR`.
    Returns None (after logging) when the lookup yields nothing.
    """
    must_exist = if_not_exists == IfNotExistsParam.ERROR
    tbl = Catalog.get()._get_schema_object(path, expected=Table, raise_if_not_exists=must_exist)
    if tbl is not None:
        assert isinstance(tbl, Table)
        # Another process may have changed the schema in the meantime, so the metadata cached in
        # tbl_version_path is discarded.
        tbl._tbl_version_path.clear_cached_md()
        return tbl

    _logger.info(f'Skipped table {path!r} (does not exist).')
    return None
1408
+
1409
def drop_table(self, path: Path, if_not_exists: IfNotExistsParam, force: bool) -> None:
    """
    Drop the table at `path`.

    The drop itself runs inside a `retry_loop(for_write=True)` wrapper; `_roll_forward_ids` is cleared before
    and `_roll_forward()` is called after it. A missing table is only reported as an error by the lookup
    when `if_not_exists` is ERROR and `force` is False.
    """
    must_exist = if_not_exists == IfNotExistsParam.ERROR and not force

    @retry_loop(for_write=True)
    def drop_fn() -> None:
        tbl = self._get_schema_object(
            path, expected=Table, raise_if_not_exists=must_exist, lock_parent=True, lock_obj=False
        )
        if tbl is None:
            _logger.info(f'Skipped table {path!r} (does not exist).')
            return
        assert isinstance(tbl, Table)

        if isinstance(tbl, View):
            tvp = tbl._tbl_version_path
            if tvp.is_mutable() and tvp.base.is_mutable():
                # a mutable view of a mutable base: the base gets locked before the view, in order to avoid
                # deadlocks with concurrent inserts/updates
                self._acquire_tbl_lock(tbl_id=tvp.base.tbl_id, for_write=True, lock_mutable_tree=False)

        self._drop_tbl(tbl, force=force, is_replace=False)

    self._roll_forward_ids.clear()
    drop_fn()
    self._roll_forward()
1435
+
1436
def _drop_tbl(self, tbl: Table | TableVersionPath, force: bool, is_replace: bool) -> None:
    """
    Drop the table (and recursively its views, if force == True).

    `tbl` can be an instance of `Table` for a user table, or `TableVersionPath` for a hidden (system) table.

    Args:
        tbl: the table to drop.
        force: if True, views of the table are dropped first; otherwise a table with views is either hidden
            (replica) or rejected (non-replica).
        is_replace: only selects the wording of the error raised for a non-replica table with dependents.

    Returns:
        None. Dropped tables are removed from `self._tbls` as a side effect.

    Raises:
        excs.Error: if the table has dependents, is not a replica, and `force` is False.

    Locking protocol:
    - X-lock base before X-locking any view
    - deadlock-free wrt to TableVersion.insert() (insert propagation also proceeds top-down)
    - X-locks parent dir prior to calling TableVersion.drop(): prevent concurrent creation of another SchemaObject
      in the same directory with the same name (which could lead to duplicate names if we get aborted)
    """
    is_pure_snapshot: bool
    if isinstance(tbl, TableVersionPath):
        # hidden (system) table: there is no Table object to work with
        tvp = tbl
        tbl_id = tvp.tbl_id
        tbl = None
        is_pure_snapshot = False
    else:
        tvp = tbl._tbl_version_path
        tbl_id = tbl._id
        is_pure_snapshot = tbl._tbl_version is None

    if tbl is not None:
        self._acquire_dir_xlock(dir_id=tbl._dir_id)
    self._acquire_tbl_lock(tbl_id=tbl_id, for_write=True, lock_mutable_tree=False)

    view_ids = self.get_view_ids(tbl_id, for_update=True)
    is_replica = tvp.is_replica()
    do_drop = True

    _logger.debug(f'Preparing to drop table {tbl_id} (force={force!r}, is_replica={is_replica}).')

    if len(view_ids) > 0:
        if force:
            # recursively drop views first
            for view_id in view_ids:
                view = self.get_table_by_id(view_id, ignore_if_dropped=True)
                if view is not None:
                    self._drop_tbl(view, force=force, is_replace=is_replace)

        elif is_replica:
            # Dropping a replica with dependents and no 'force': just rename it to be a hidden table;
            # the actual table will not be dropped.
            assert tbl is not None  # can only occur for a user table
            system_dir = self.__ensure_system_dir_exists()
            new_name = f'replica_{tbl_id.hex}'
            _logger.debug(f'{tbl._path()!r} is a replica with dependents; renaming to {new_name!r}.')
            tbl._move(new_name, system_dir._id)
            do_drop = False  # don't actually clear the catalog for this table

        else:
            # It has dependents but is not a replica and no 'force', so it's an error to drop it.
            assert tbl is not None  # can only occur for a user table
            msg: str
            if is_replace:
                msg = (
                    f'{tbl._display_str()} already exists and has dependents. '
                    "Use `if_exists='replace_force'` to replace it."
                )
            else:
                msg = f'{tbl._display_str()} has dependents.'
            raise excs.Error(msg)

    # if this is a mutable view of a mutable base, advance the base's view_sn
    if isinstance(tbl, View) and tvp.is_mutable() and tvp.base.is_mutable():
        base_id = tvp.base.tbl_id
        base_tv = self.get_tbl_version(TableVersionKey(base_id, None, None), validate_initialized=True)
        base_tv.tbl_md.view_sn += 1
        result = Env.get().conn.execute(
            sql.update(schema.Table.__table__)
            .values({schema.Table.md: dataclasses.asdict(base_tv.tbl_md, dict_factory=md_dict_factory)})
            .where(schema.Table.id == base_id)
        )
        assert result.rowcount == 1, result.rowcount
        # force reload of base TV instance in order to make its state consistent with the stored metadata
        self._clear_tv_cache(base_tv.key)

    if do_drop:
        if is_pure_snapshot:
            # there is no physical table, but we still need to delete the Table record; we can do that right now
            # as part of the current transaction
            self.delete_tbl_md(tbl_id)
        else:
            # invalidate the TableVersion instance when we're done so that existing references to it can find out it
            # has been dropped
            self.mark_modified_tvs(tvp.tbl_version)

            # write TableOps to execute the drop, plus the updated Table record
            tv = tvp.tbl_version.get()
            tv.tbl_md.pending_stmt = schema.TableStatement.DROP_TABLE
            drop_ops = tv.drop()
            self.write_tbl_md(
                tv.id,
                dir_id=None,
                tbl_md=tv.tbl_md,
                version_md=None,
                schema_version_md=None,
                pending_ops=drop_ops,
                remove_from_dir=True,
            )

        tvp.clear_cached_md()

    assert (
        is_replica
        or (tbl_id, None) in self._tbls  # non-replica tables must have an entry with effective_version=None
    )

    # Remove visible Table references (we do this even for a replica that was just renamed).
    # The versions are collected first so that self._tbls is not modified while it is being iterated.
    versions = [version for cached_id, version in self._tbls if cached_id == tbl_id]
    for version in versions:
        del self._tbls[tbl_id, version]

    _logger.info(f'Dropped table {tbl_id if tbl is None else repr(tbl._path())}.')

    if (
        is_replica  # if this is a replica,
        and do_drop  # and it was actually dropped (not just renamed),
        and tvp.base is not None  # and it has a base table,
    ):
        base_tbl = self.get_table_by_id(tvp.base.tbl_id)
        base_tbl_path = None if base_tbl is None else Path.parse(base_tbl._path(), allow_system_path=True)
        if (
            (base_tbl_path is None or base_tbl_path.is_system_path)  # and the base table is hidden,
            and len(self.get_view_ids(tvp.base.tbl_id, for_update=True)) == 0  # and has no other dependents,
        ):
            # then drop the base table as well (possibly recursively).
            _logger.debug(f'Dropping hidden base table {tvp.base.tbl_id} of dropped replica {tbl_id}.')
            # we just dropped the anchor on `tvp.base`; we need to clear the anchor so that we can actually
            # load the TableVersion instance in order to drop it
            self._drop_tbl(tvp.base.anchor_to(None), force=False, is_replace=False)
1571
+
1572
@retry_loop(for_write=True)
def create_dir(self, path: Path, if_exists: IfExistsParam, parents: bool) -> Dir:
    """Create the directory at `path` by delegating to `_create_dir()` inside a write retry loop."""
    created = self._create_dir(path, if_exists, parents)
    return created
1575
+
1576
def _create_dir(self, path: Path, if_exists: IfExistsParam, parents: bool) -> Dir:
    """
    Create the directory at `path` and return it.

    Args:
        path: path of the directory to create.
        if_exists: passed to `_handle_path_collision()`, which decides what happens when `path` is taken.
        parents: if True, every entry of `path.ancestors()` that does not resolve to a directory is created
            on the way down; if False, only `path.parent` is looked up.

    Returns:
        The directory reported by `_handle_path_collision()` if there is one, otherwise the newly created one.
    """
    if parents:
        # start walking down from the root
        last_parent: SchemaObject | None = None
        for ancestor in path.ancestors():
            ancestor_obj = self._get_schema_object(ancestor, expected=Dir)
            # a missing ancestor can only be created underneath an ancestor that was already resolved
            assert ancestor_obj is not None or last_parent is not None
            last_parent = Dir._create(last_parent._id, ancestor.name) if ancestor_obj is None else ancestor_obj
        parent = last_parent
    else:
        parent = self._get_schema_object(path.parent)

    existing = self._handle_path_collision(path, Dir, False, if_exists)
    if existing is not None:
        assert isinstance(existing, Dir)
        return existing

    assert parent is not None
    new_dir = Dir._create(parent._id, path.name)
    Env.get().console_logger.info(f'Created directory {path!r}.')
    return new_dir
1606
+
1607
def drop_dir(self, path: Path, if_not_exists: IfNotExistsParam, force: bool) -> None:
    """
    Drop the directory at `path`.

    The drop itself runs inside a `retry_loop(for_write=True)` wrapper; `_roll_forward_ids` is cleared before
    and `_roll_forward()` is called after it. A missing directory is only treated as an error by
    `_prepare_dir_op()` when `if_not_exists` is ERROR and `force` is False.
    """
    must_exist = if_not_exists == IfNotExistsParam.ERROR and not force

    @retry_loop(for_write=True)
    def drop_fn() -> None:
        _, _, target_dir = self._prepare_dir_op(
            drop_dir_path=path.parent, drop_name=path.name, drop_expected=Dir, raise_if_not_exists=must_exist
        )
        if target_dir is not None:
            self._drop_dir(target_dir._id, path, force=force)
            return
        _logger.info(f'Directory {path!r} does not exist; skipped drop_dir().')

    self._roll_forward_ids.clear()
    drop_fn()
    self._roll_forward()
1624
+
1625
def _drop_dir(self, dir_id: UUID, dir_path: Path, force: bool = False) -> None:
    """
    Delete the directory record `dir_id`, after dropping its subdirectories and tables.

    Args:
        dir_id: id of the directory to remove.
        dir_path: path of that directory; used for messages and to build the paths of subdirectories.
        force: if False, the directory must not contain subdirectories or active tables.

    Raises:
        excs.Error: if `force` is False and the directory is not empty.
    """
    conn = Env.get().conn

    if not force:
        # refuse to touch a directory that still has entries
        subdir_count_stmt = sql.select(sql.func.count()).select_from(schema.Dir).where(schema.Dir.parent_id == dir_id)
        num_subdirs = conn.execute(subdir_count_stmt).scalar()
        tbl_count_stmt = (
            sql.select(sql.func.count()).select_from(schema.Table).where(self._active_tbl_clause(dir_id=dir_id))
        )
        num_tbls = conn.execute(tbl_count_stmt).scalar()
        if num_subdirs + num_tbls > 0:
            raise excs.Error(f'Directory {dir_path!r} is not empty.')

    # subdirectories go first; the recursion always uses force=True
    self._acquire_dir_xlock(dir_id=dir_id)
    subdir_rows = conn.execute(sql.select(schema.Dir).where(schema.Dir.parent_id == dir_id)).all()
    for subdir_row in subdir_rows:
        self._drop_dir(subdir_row.id, dir_path.append(subdir_row.md['name']), force=True)

    # then the tables that are still active in this directory
    tbl_stmt = sql.select(schema.Table).where(self._active_tbl_clause(dir_id=dir_id)).with_for_update()
    for tbl_row in conn.execute(tbl_stmt).all():
        tbl = self.get_table_by_id(tbl_row.id, ignore_if_dropped=True)
        if tbl is None:
            # already gone: it was a view of a base that was dropped earlier
            continue
        self._drop_tbl(tbl, force=True, is_replace=False)

    # finally the directory record itself
    conn.execute(sql.delete(schema.Dir).where(schema.Dir.id == dir_id))
    _logger.info(f'Removed directory {dir_path!r}.')
1653
+
1654
def get_view_ids(self, tbl_id: UUID, for_update: bool = False) -> list[UUID]:
    """
    Return the ids of views that directly reference the given table.

    Args:
        tbl_id: id of the table whose direct views are wanted.
        for_update: if True, the view query is issued with `with_for_update()`.

    Raises:
        excs.Error: if no active Table record matches `tbl_id`.
    """
    conn = Env.get().conn

    # make sure the table itself is still around
    exists_stmt = sql.select(sql.func.count()).select_from(schema.Table).where(self._active_tbl_clause(tbl_id=tbl_id))
    if conn.execute(exists_stmt).scalar() == 0:
        raise excs.Error(self._dropped_tbl_error_msg(tbl_id))

    # a direct view names the table as the first entry of view_md.base_versions
    first_base_id = schema.Table.md['view_md']['base_versions'][0][0].astext
    views_stmt = sql.select(schema.Table.id).where(first_base_id == tbl_id.hex).where(self._active_tbl_clause())
    if for_update:
        views_stmt = views_stmt.with_for_update()

    view_ids = []
    for view_row in conn.execute(views_stmt).all():
        view_ids.append(view_row[0])
    return view_ids
1671
+
1672
def get_tbl_version(
    self, key: TableVersionKey, *, check_pending_ops: bool = True, validate_initialized: bool = False
) -> TableVersion | None:
    """
    Returns the TableVersion instance for the given table and version and updates the cache.

    If present in the cache and the instance isn't validated, validates version and view_sn against the stored
    metadata.

    Args:
        key: identifies the table, effective version and anchor of the requested instance.
        check_pending_ops: passed through to `_load_tbl_version()`; if True, validation also restricts the
            stored records to active tables.
        validate_initialized: if True, additionally asserts that the returned instance is initialized.

    Raises:
        excs.Error: if validation finds no matching stored record for the table.
    """
    # we need a transaction here, if we're not already in one; if this starts a new transaction,
    # the returned TableVersion instance will not be validated
    with self.begin_xact(for_write=False) as conn:
        tv = self._tbl_versions.get(key)
        if tv is None:
            tv = self._load_tbl_version(key, check_pending_ops=check_pending_ops)
        elif not tv.is_validated:
            # only live instances are invalidated
            assert key.effective_version is None
            # _logger.debug(f'validating metadata for table {tbl_id}:{tv.version} ({id(tv):x})')
            where_clause: sql.ColumnElement[bool]
            if check_pending_ops:
                # if we don't want to see pending ops, we also don't want to see dropped tables
                where_clause = self._active_tbl_clause(tbl_id=key.tbl_id)
            else:
                where_clause = schema.Table.id == key.tbl_id
            q = sql.select(schema.Table.md).where(where_clause)
            row = conn.execute(q).one_or_none()
            if row is None:
                raise excs.Error(self._dropped_tbl_error_msg(key.tbl_id))

            reload = False

            if tv.anchor_tbl_id is None:
                # live non-replica table; compare our cached TableMd.current_version/view_sn to what's stored
                # (the Table.md row fetched above is reused rather than queried a second time)
                current_version, view_sn = row.md['current_version'], row.md['view_sn']
                if current_version != tv.version or view_sn != tv.tbl_md.view_sn:
                    _logger.debug(
                        f'reloading metadata for live table {key.tbl_id} '
                        f'(cached/current version: {tv.version}/{current_version}, '
                        f'cached/current view_sn: {tv.tbl_md.view_sn}/{view_sn})'
                    )
                    reload = True

            else:
                # live replica table; use the anchored version
                anchor_tbl_version_md = self.head_version_md(tv.anchor_tbl_id)
                assert anchor_tbl_version_md is not None
                q = sql.select(schema.TableVersion.md)
                if check_pending_ops:
                    q = q.join(schema.Table, schema.Table.id == schema.TableVersion.tbl_id).where(
                        self._active_tbl_clause(tbl_id=key.tbl_id)
                    )
                # pick the most recent version created no later than the anchor's head version
                q = (
                    q.where(schema.TableVersion.tbl_id == key.tbl_id)
                    .where(schema.TableVersion.md['created_at'].cast(sql.Float) <= anchor_tbl_version_md.created_at)
                    .order_by(schema.TableVersion.md['created_at'].cast(sql.Float).desc())
                    .limit(1)
                )
                row = conn.execute(q).one_or_none()
                if row is None:
                    raise excs.Error(self._dropped_tbl_error_msg(key.tbl_id))
                version = row.md['version']
                if version != tv.version:  # TODO: How will view_sn work for replicas?
                    _logger.debug(
                        f'reloading metadata for replica table {key.tbl_id} (anchor {key.anchor_tbl_id}) '
                        f'(cached/anchored version: {tv.version}/{version})'
                    )
                    reload = True

            # the stored version can be behind TableVersion.version, because we don't roll back the in-memory
            # metadata changes after a failed update operation
            if reload:
                # the cached metadata is invalid
                tv = self._load_tbl_version(key, check_pending_ops=check_pending_ops)
            else:
                # the cached metadata is valid
                tv.is_validated = True

        assert tv.anchor_tbl_id == key.anchor_tbl_id
        assert tv.is_validated, f'{key} not validated\n{tv.__dict__}\n{self._debug_str()}'
        if validate_initialized:
            assert tv.is_initialized, f'{key} not initialized\n{tv.__dict__}\n{self._debug_str()}'
        return tv
1759
+
1760
def remove_tbl_version(self, key: TableVersionKey) -> None:
    """Evict the TableVersion cached under `key`; the key must be present in the cache."""
    assert isinstance(key, TableVersionKey)
    assert key in self._tbl_versions
    self._tbl_versions.pop(key)
1764
+
1765
def get_dir(self, dir_id: UUID, for_update: bool = False) -> Dir | None:
    """
    Return the Dir with the given id, or None if it doesn't exist.

    If `for_update` is True, `_acquire_dir_xlock()` is called for the directory before it is read.
    """
    conn = Env.get().conn
    if for_update:
        self._acquire_dir_xlock(dir_id=dir_id)

    row = conn.execute(sql.select(schema.Dir).where(schema.Dir.id == dir_id)).one_or_none()
    if row is not None:
        record = schema.Dir(**row._mapping)
        return Dir(record.id, record.parent_id, record.md['name'])
    return None
1776
+
1777
def _get_dir(self, path: Path, lock_dir: bool = False) -> schema.Dir | None:
    """
    Resolve `path` to its stored directory record for the current user, or None if any component is missing.

    lock_dir: if True, X-locks target (but not the ancestors)
    """
    user = Env.get().user
    conn = Env.get().conn

    stmt: sql.Executable
    if path.is_root:
        if lock_dir:
            self._acquire_dir_xlock(dir_name='')
        # the root is looked up as the user's directory without a parent
        stmt = sql.select(schema.Dir).where(schema.Dir.parent_id.is_(None), schema.Dir.md['user'].astext == user)
    else:
        # resolve the parent first (never locked), then look the target up by name underneath it
        parent_dir = self._get_dir(path.parent, lock_dir=False)
        if parent_dir is None:
            return None
        if lock_dir:
            self._acquire_dir_xlock(parent_id=parent_dir.id, dir_name=path.name)
        stmt = sql.select(schema.Dir).where(
            schema.Dir.parent_id == parent_dir.id,
            schema.Dir.md['name'].astext == path.name,
            schema.Dir.md['user'].astext == user,
        )

    row = conn.execute(stmt).one_or_none()
    if row is None:
        return None
    return schema.Dir(**row._mapping)
1802
+
1803
def _load_tbl(self, tbl_id: UUID, ignore_pending_drop: bool = False) -> Table | None:
    """
    Loads metadata for the table with the given id and caches it.

    Args:
        tbl_id: id of the table to load.
        ignore_pending_drop: if True, a table whose stored `pending_stmt` is DROP_TABLE yields None before
            the pending-ops check is made.

    Returns:
        An `InsertableTable` for a non-replica table without view metadata, otherwise a `View`; None if the
        table cannot be loaded (no matching record, pending drop, or no head version for the replica anchor).

    Raises:
        PendingTableOpsError: if PendingTableOp records exist for the table.
    """
    from .insertable_table import InsertableTable
    from .view import View

    assert tbl_id is not None
    _logger.info(f'Loading table {tbl_id}')

    conn = Env.get().conn

    if ignore_pending_drop:
        # is this table in the process of being dropped?
        pending_stmt_q: sql.Executable = sql.select(schema.Table.md).where(schema.Table.id == tbl_id)
        md_row = conn.execute(pending_stmt_q).one()
        if md_row.md['pending_stmt'] == schema.TableStatement.DROP_TABLE.value:
            return None

    # pending ops have to be dealt with before the table can be loaded
    pending_ops_q = sql.select(sql.func.count()).where(schema.PendingTableOp.tbl_id == tbl_id)
    if conn.execute(pending_ops_q).scalar() > 0:
        raise PendingTableOpsError(tbl_id)

    # fetch the Table record together with the schema version record of its current schema version
    tbl_q = (
        sql.select(schema.Table, schema.TableSchemaVersion)
        .join(schema.TableSchemaVersion)
        .where(schema.Table.id == schema.TableSchemaVersion.tbl_id)
        .where(
            schema.Table.md['current_schema_version'].cast(sql.Integer) == schema.TableSchemaVersion.schema_version
        )
        .where(schema.Table.id == tbl_id)
    )
    row = conn.execute(tbl_q).one_or_none()
    if row is None:
        return None
    tbl_record, _ = _unpack_row(row, [schema.Table, schema.TableSchemaVersion])

    tbl_md = schema.md_from_dict(schema.TableMd, tbl_record.md)
    view_md = tbl_md.view_md

    if view_md is None and not tbl_md.is_replica:
        # this is a base, non-replica table
        key = TableVersionKey(tbl_id, None, None)
        if key not in self._tbl_versions:
            _ = self._load_tbl_version(key)
        tbl = InsertableTable(tbl_record.dir_id, TableVersionHandle(key))
        self._tbls[tbl_id, None] = tbl
        return tbl

    # this is a view; determine the sequence of TableVersions to load
    tbl_version_path: list[tuple[UUID, int | None]] = []
    anchor_tbl_id = UUID(tbl_md.tbl_id) if tbl_md.is_replica else None
    if not tbl_md.is_pure_snapshot:
        # there is a physical table backing this one, so it is part of the path itself
        # (a pure snapshot has none; for it, only the bases are needed)
        is_snapshot = view_md is not None and view_md.is_snapshot
        effective_version = 0 if is_snapshot else None  # snapshots only have version 0
        tbl_version_path.append((tbl_id, effective_version))

    if view_md is not None:
        for ancestor_id, ancestor_version in view_md.base_versions:
            tbl_version_path.append((UUID(ancestor_id), ancestor_version))

    if anchor_tbl_id is not None and self.head_version_md(anchor_tbl_id) is None:
        return None

    # load TableVersions, starting at the root
    base_path: TableVersionPath | None = None
    view_path: TableVersionPath | None = None
    for path_tbl_id, effective_version in reversed(tbl_version_path):
        # anchor the path elements that have effective_version == None
        anchor = anchor_tbl_id if effective_version is None else None
        key = TableVersionKey(path_tbl_id, effective_version, anchor)
        if key not in self._tbl_versions:
            _ = self._load_tbl_version(key)
        view_path = TableVersionPath(TableVersionHandle(key), base=base_path)
        base_path = view_path

    view = View(tbl_id, tbl_record.dir_id, tbl_md.name, view_path, snapshot_only=tbl_md.is_pure_snapshot)
    self._tbls[tbl_id, None] = view
    return view
1883
+
1884
def _load_tbl_at_version(self, tbl_id: UUID, version: int) -> Table | None:
    """
    Load the given table at a specific version as a snapshot-only `View` and cache it in `self._tbls`.

    Returns None if no Table/TableVersion record pair exists for `tbl_id` and `version`.
    """
    from .view import View

    # Fetch the Table record joined with the TableVersion record of the requested version.
    conn = Env.get().conn
    stmt: sql.Executable = (
        sql.select(schema.Table, schema.TableVersion)
        .join(schema.TableVersion)
        .where(schema.Table.id == tbl_id)
        .where(schema.Table.id == schema.TableVersion.tbl_id)
        .where(schema.TableVersion.version == version)
    )
    row = conn.execute(stmt).one_or_none()
    if row is None:
        return None

    tbl_record, version_record = _unpack_row(row, [schema.Table, schema.TableVersion])
    tbl_md = schema.md_from_dict(schema.TableMd, tbl_record.md)
    version_md = schema.md_from_dict(schema.VersionMd, version_record.md)

    # the path is built from the table's ancestors and the creation time of the requested version
    tvp = self.construct_tvp(tbl_id, version, tbl_md.ancestors, version_md.created_at)
    versioned_view = View(tbl_id, tbl_record.dir_id, tbl_md.name, tvp, snapshot_only=True)
    self._tbls[tbl_id, version] = versioned_view
    return versioned_view
1907
+
1908
+ def construct_tvp(
1909
+ self, tbl_id: UUID, version: int, ancestors_of_live_tbl: schema.TableVersionPath, created_at: float
1910
+ ) -> TableVersionPath:
1911
+ """
1912
+ Construct the TableVersionPath for the specified version of the given table. Here `ancestors_of_live_tbl` is the
1913
+ list of ancestor table IDs and fixed versions (if any) from the table's metadata. The constructed
1914
+ TableVersionPath will preserve any fixed versions from `ancestors_of_live_tbl` (corresponding to a view-over-snapshot
1915
+ scenario), while "filling in" the implied versions for any `None` versions.
1916
+ """
1917
+ # TODO: Currently, we reconstruct the ancestors by inspecting the created_at timestamps of the table and its
1918
+ # ancestors' versions. In the future, we should store the relevant TableVersionPaths in the database, so
1919
+ # that we don't need to rely on timestamps (which might be nondeterministic in distributed execution
1920
+ # scenarios).
1921
+
1922
+ assert Env.get().conn is not None
1923
+
1924
+ # Build the list of ancestor versions, starting with the given table and traversing back to the base table.
1925
+ # For each proper ancestor,
1926
+ # - If it's an ancestor with a fixed version (view-over-snapshot scenario), we keep the given fixed version.
1927
+ # - If it's an ancestor with a live (floating) version, we use the version whose created_at timestamp equals
1928
+ # or most nearly precedes the given TableVersion's created_at timestamp.
1929
+ ancestors: list[tuple[UUID, int]] = [(tbl_id, version)]
1930
+ for ancestor_id, ancestor_version in ancestors_of_live_tbl:
1931
+ if ancestor_version is not None:
1932
+ # fixed version; just use it
1933
+ ancestors.append((UUID(ancestor_id), ancestor_version))
1934
+ continue
1935
+
1936
+ q = (
1937
+ sql.select(schema.TableVersion)
1938
+ .where(schema.TableVersion.tbl_id == ancestor_id)
1939
+ .where(schema.TableVersion.md['created_at'].cast(sql.Float) <= created_at)
1940
+ .order_by(schema.TableVersion.md['created_at'].cast(sql.Float).desc())
1941
+ .limit(1)
1942
+ )
1943
+ row = Env.get().conn.execute(q).one_or_none()
1944
+ if row is None:
1945
+ # This can happen if an ancestor version is garbage collected; it can also happen in
1946
+ # rare circumstances involving table versions created specifically with Pixeltable 0.4.3.
1947
+ _logger.info(f'Ancestor {ancestor_id} not found for table {tbl_id}:{version}')
1948
+ raise excs.Error('The specified table version is no longer valid and cannot be retrieved.')
1949
+ ancestor_version_record = _unpack_row(row, [schema.TableVersion])[0]
1950
+ ancestor_version_md = schema.md_from_dict(schema.VersionMd, ancestor_version_record.md)
1951
+ assert ancestor_version_md.created_at <= created_at
1952
+ ancestors.append((UUID(ancestor_id), ancestor_version_md.version))
1953
+
1954
+ # Force any ancestors to be loaded (base table first).
1955
+ for anc_id, anc_version in ancestors[::-1]:
1956
+ key = TableVersionKey(anc_id, anc_version, None)
1957
+ if key not in self._tbl_versions:
1958
+ _ = self._load_tbl_version(key)
1959
+
1960
+ # Now reconstruct the relevant TableVersionPath instance from the ancestor versions.
1961
+ tvp: TableVersionPath | None = None
1962
+ for anc_id, anc_version in ancestors[::-1]:
1963
+ tvp = TableVersionPath(TableVersionHandle(TableVersionKey(anc_id, anc_version, None)), base=tvp)
1964
+
1965
+ return tvp
1966
+
1967
+ @retry_loop(for_write=False)
1968
+ def collect_tbl_history(self, tbl_id: UUID, n: int | None) -> list[TableVersionMd]:
1969
+ return self._collect_tbl_history(tbl_id, n)
1970
+
1971
+ def _collect_tbl_history(self, tbl_id: UUID, n: int | None) -> list[TableVersionMd]:
1972
+ """
1973
+ Returns the history of up to n versions of the table with the given UUID.
1974
+
1975
+ Args:
1976
+ tbl_id: the UUID of the table to collect history for.
1977
+ n: Optional limit on the maximum number of versions returned.
1978
+
1979
+ Returns:
1980
+ A list of TableVersionMd records, ordered by descending version number (most recent first).
1981
+ Each record contains the table metadata, version metadata, and schema version metadata.
1982
+ """
1983
+ q = (
1984
+ sql.select(schema.Table, schema.TableVersion, schema.TableSchemaVersion)
1985
+ .where(self._active_tbl_clause(tbl_id=tbl_id))
1986
+ .join(schema.TableVersion)
1987
+ .where(schema.TableVersion.tbl_id == tbl_id)
1988
+ .join(schema.TableSchemaVersion)
1989
+ .where(schema.TableSchemaVersion.tbl_id == tbl_id)
1990
+ .where(
1991
+ schema.TableVersion.md['schema_version'].cast(sql.Integer) == schema.TableSchemaVersion.schema_version
1992
+ )
1993
+ .order_by(schema.TableVersion.version.desc())
1994
+ )
1995
+ if n is not None:
1996
+ q = q.limit(n)
1997
+ src_rows = Env.get().session.execute(q).fetchall()
1998
+ return [
1999
+ TableVersionMd(
2000
+ tbl_md=schema.md_from_dict(schema.TableMd, row.Table.md),
2001
+ version_md=schema.md_from_dict(schema.VersionMd, row.TableVersion.md),
2002
+ schema_version_md=schema.md_from_dict(schema.SchemaVersionMd, row.TableSchemaVersion.md),
2003
+ )
2004
+ for row in src_rows
2005
+ ]
2006
+
2007
+ def head_version_md(self, tbl_id: UUID) -> schema.VersionMd | None:
2008
+ """
2009
+ Returns the VersionMd for the most recent non-fragment version of the given table, or None if there is none.
2010
+ """
2011
+ conn = Env.get().conn
2012
+
2013
+ q = (
2014
+ sql.select(schema.TableVersion.md)
2015
+ .where(schema.TableVersion.tbl_id == tbl_id)
2016
+ .where(schema.TableVersion.md['is_fragment'].astext == 'false')
2017
+ .order_by(schema.TableVersion.md['version'].cast(sql.Integer).desc())
2018
+ .limit(1)
2019
+ )
2020
+ row = conn.execute(q).one_or_none()
2021
+ if row is None:
2022
+ return None
2023
+ assert isinstance(row[0], dict)
2024
+ return schema.md_from_dict(schema.VersionMd, row[0])
2025
+
2026
+ def load_tbl_md(self, key: TableVersionKey) -> TableVersionMd:
2027
+ """
2028
+ Loads metadata from the store for a given table UUID and version.
2029
+ """
2030
+ anchor_timestamp: float | None = None
2031
+ if key.anchor_tbl_id is not None:
2032
+ anchored_version_md = self.head_version_md(key.anchor_tbl_id)
2033
+ # `anchor_tbl_id` must exist and have at least one non-fragment version, or else this isn't
2034
+ # a valid TableVersion specification.
2035
+ assert anchored_version_md is not None
2036
+ anchor_timestamp = anchored_version_md.created_at
2037
+
2038
+ # _logger.info(f'Loading metadata for table version: {tbl_id}:{effective_version}')
2039
+ conn = Env.get().conn
2040
+
2041
+ q = (
2042
+ sql.select(schema.Table, schema.TableVersion, schema.TableSchemaVersion)
2043
+ .select_from(schema.Table)
2044
+ .where(schema.Table.id == key.tbl_id)
2045
+ .join(schema.TableVersion)
2046
+ .where(schema.TableVersion.tbl_id == key.tbl_id)
2047
+ .join(schema.TableSchemaVersion)
2048
+ .where(schema.TableSchemaVersion.tbl_id == key.tbl_id)
2049
+ )
2050
+
2051
+ if key.effective_version is not None:
2052
+ # we are loading a specific version
2053
+ # SELECT *
2054
+ # FROM Table t
2055
+ # JOIN TableVersion tv ON (tv.tbl_id = tbl_id AND tv.version = effective_version)
2056
+ # JOIN TableSchemaVersion tsv ON (tsv.tbl_id = tbl_id AND tv.md.schema_version = tsv.schema_version)
2057
+ # WHERE t.id = tbl_id
2058
+ q = q.where(
2059
+ schema.TableVersion.md['version'].cast(sql.Integer) == key.effective_version,
2060
+ schema.TableVersion.md['schema_version'].cast(sql.Integer) == schema.TableSchemaVersion.schema_version,
2061
+ )
2062
+ elif anchor_timestamp is not None:
2063
+ # we are loading the version that is anchored to the head version of another table (see TableVersion
2064
+ # docstring for details)
2065
+ # SELECT *
2066
+ # FROM Table t
2067
+ # JOIN TableVersion tv ON (tv.tbl_id = tbl_id)
2068
+ # JOIN TableSchemaVersion tsv ON (tsv.tbl_id = tbl_id AND tv.md.schema_version = tsv.schema_version)
2069
+ # WHERE t.id = tbl_id AND tv.md.created_at <= anchor_timestamp
2070
+ # ORDER BY tv.md.created_at DESC
2071
+ # LIMIT 1
2072
+ q = (
2073
+ q.where(
2074
+ schema.TableVersion.md['created_at'].cast(sql.Float) <= anchor_timestamp,
2075
+ schema.TableVersion.md['schema_version'].cast(sql.Integer)
2076
+ == schema.TableSchemaVersion.schema_version,
2077
+ )
2078
+ .order_by(schema.TableVersion.md['created_at'].cast(sql.Float).desc())
2079
+ .limit(1)
2080
+ )
2081
+ else:
2082
+ # we are loading the current version
2083
+ # SELECT *
2084
+ # FROM Table t
2085
+ # JOIN TableVersion tv ON (tv.tbl_id = tbl_id AND t.current_version = tv.version)
2086
+ # JOIN TableSchemaVersion tsv ON (tsv.tbl_id = tbl_id AND t.current_schema_version = tsv.schema_version)
2087
+ # WHERE t.id = tbl_id
2088
+ q = q.where(
2089
+ schema.Table.md['current_version'].cast(sql.Integer) == schema.TableVersion.version,
2090
+ schema.Table.md['current_schema_version'].cast(sql.Integer) == schema.TableSchemaVersion.schema_version,
2091
+ )
2092
+
2093
+ row = conn.execute(q).one_or_none()
2094
+ if row is None:
2095
+ raise excs.Error(self._dropped_tbl_error_msg(key.tbl_id))
2096
+ tbl_record, version_record, schema_version_record = _unpack_row(
2097
+ row, [schema.Table, schema.TableVersion, schema.TableSchemaVersion]
2098
+ )
2099
+ assert tbl_record.id == key.tbl_id
2100
+ tbl_md = schema.md_from_dict(schema.TableMd, tbl_record.md)
2101
+ version_md = schema.md_from_dict(schema.VersionMd, version_record.md)
2102
+ schema_version_md = schema.md_from_dict(schema.SchemaVersionMd, schema_version_record.md)
2103
+
2104
+ return TableVersionMd(tbl_md, version_md, schema_version_md)
2105
+
2106
+ def write_tbl_md(
2107
+ self,
2108
+ tbl_id: UUID,
2109
+ dir_id: UUID | None,
2110
+ tbl_md: schema.TableMd | None,
2111
+ version_md: schema.VersionMd | None,
2112
+ schema_version_md: schema.SchemaVersionMd | None,
2113
+ pending_ops: list[TableOp] | None = None,
2114
+ remove_from_dir: bool = False,
2115
+ ) -> None:
2116
+ """
2117
+ Stores metadata to the DB and adds tbl_id to self._roll_forward_ids if pending_ops is specified.
2118
+
2119
+ Args:
2120
+ tbl_id: UUID of the table to store metadata for.
2121
+ dir_id: If specified, the tbl_md will be added to the given directory; if None, the table must already exist
2122
+ tbl_md: If specified, `tbl_md` will be inserted, or updated (only one such record can exist per UUID)
2123
+ version_md: inserted as a new record if present
2124
+ schema_version_md: will be inserted as a new record if present
2125
+
2126
+ If inserting `version_md` or `schema_version_md` would be a primary key violation, an exception will be raised.
2127
+ """
2128
+ assert self._in_write_xact
2129
+ assert version_md is None or version_md.created_at > 0.0
2130
+ assert pending_ops is None or len(pending_ops) > 0
2131
+ assert pending_ops is None or tbl_md is not None # if we write pending ops, we must also write new tbl_md
2132
+ session = Env.get().session
2133
+
2134
+ # Construct and insert or update table record if requested.
2135
+ if tbl_md is not None:
2136
+ assert tbl_md.tbl_id == str(tbl_id)
2137
+ if version_md is not None:
2138
+ assert tbl_md.current_version == version_md.version
2139
+ assert tbl_md.current_schema_version == version_md.schema_version
2140
+ if schema_version_md is not None:
2141
+ assert tbl_md.current_schema_version == schema_version_md.schema_version
2142
+ if pending_ops is not None:
2143
+ assert tbl_md.pending_stmt is not None
2144
+ assert all(op.tbl_id == str(tbl_id) for op in pending_ops)
2145
+ assert all(op.op_sn == i for i, op in enumerate(pending_ops))
2146
+ assert all(op.num_ops == len(pending_ops) for op in pending_ops)
2147
+ tbl_md.tbl_state = schema.TableState.ROLLFORWARD
2148
+ self._roll_forward_ids.add(tbl_id)
2149
+
2150
+ if dir_id is not None:
2151
+ # We are inserting a record while creating a new table.
2152
+ tbl_record = schema.Table(
2153
+ id=tbl_id, dir_id=dir_id, md=dataclasses.asdict(tbl_md, dict_factory=md_dict_factory)
2154
+ )
2155
+ session.add(tbl_record)
2156
+ else:
2157
+ # Update the existing table record.
2158
+ values: dict[Any, Any] = {schema.Table.md: dataclasses.asdict(tbl_md, dict_factory=md_dict_factory)}
2159
+ if remove_from_dir:
2160
+ values.update({schema.Table.dir_id: None})
2161
+ result = session.execute(
2162
+ sql.update(schema.Table.__table__).values(values).where(schema.Table.id == tbl_id)
2163
+ )
2164
+ assert isinstance(result, sql.CursorResult)
2165
+ assert result.rowcount == 1, result.rowcount
2166
+
2167
+ # Construct and insert new table version record if requested.
2168
+ if version_md is not None:
2169
+ assert version_md.tbl_id == str(tbl_id)
2170
+ if schema_version_md is not None:
2171
+ assert version_md.schema_version == schema_version_md.schema_version
2172
+ version_rows = (
2173
+ session.query(schema.TableVersion)
2174
+ .filter(schema.TableVersion.tbl_id == tbl_id, schema.TableVersion.version == version_md.version)
2175
+ .all()
2176
+ )
2177
+ if len(version_rows) == 0:
2178
+ # It's a new table version; insert a new record in the DB for it.
2179
+ tbl_version_record = schema.TableVersion(
2180
+ tbl_id=tbl_id, version=version_md.version, md=dataclasses.asdict(version_md)
2181
+ )
2182
+ session.add(tbl_version_record)
2183
+ else:
2184
+ # This table version already exists; update it.
2185
+ assert len(version_rows) == 1 # must be unique
2186
+ version_record = version_rows[0]
2187
+ # Validate that the only fields that can change are 'is_fragment' and 'additional_md'.
2188
+ assert version_record.md == dataclasses.asdict(
2189
+ dataclasses.replace(
2190
+ version_md,
2191
+ is_fragment=version_record.md['is_fragment'],
2192
+ additional_md=version_record.md['additional_md'],
2193
+ )
2194
+ )
2195
+ result = session.execute(
2196
+ sql.update(schema.TableVersion.__table__)
2197
+ .values({schema.TableVersion.md: dataclasses.asdict(version_md)})
2198
+ .where(schema.TableVersion.tbl_id == tbl_id, schema.TableVersion.version == version_md.version)
2199
+ )
2200
+ assert isinstance(result, sql.CursorResult)
2201
+ assert result.rowcount == 1, result.rowcount
2202
+
2203
+ # Construct and insert a new schema version record if requested.
2204
+ if schema_version_md is not None:
2205
+ assert schema_version_md.tbl_id == str(tbl_id)
2206
+ schema_version_record = schema.TableSchemaVersion(
2207
+ tbl_id=tbl_id, schema_version=schema_version_md.schema_version, md=dataclasses.asdict(schema_version_md)
2208
+ )
2209
+ session.add(schema_version_record)
2210
+
2211
+ # make sure we don't have any pending ops
2212
+ assert session.query(schema.PendingTableOp).filter(schema.PendingTableOp.tbl_id == tbl_id).count() == 0
2213
+
2214
+ if pending_ops is not None:
2215
+ for op in pending_ops:
2216
+ op_record = schema.PendingTableOp(tbl_id=tbl_id, op_sn=op.op_sn, op=dataclasses.asdict(op))
2217
+ session.add(op_record)
2218
+
2219
+ session.flush() # Inform SQLAlchemy that we want to write these changes to the DB.
2220
+
2221
+ def store_update_status(self, tbl_id: UUID, version: int, status: UpdateStatus) -> None:
2222
+ """Update the TableVersion.md.update_status field"""
2223
+ assert self._in_write_xact
2224
+ conn = Env.get().conn
2225
+
2226
+ stmt = (
2227
+ sql.update(schema.TableVersion)
2228
+ .where(schema.TableVersion.tbl_id == tbl_id, schema.TableVersion.version == version)
2229
+ .values(md=schema.TableVersion.md.op('||')({'update_status': dataclasses.asdict(status)}))
2230
+ )
2231
+
2232
+ res = conn.execute(stmt)
2233
+ assert res.rowcount == 1, res.rowcount
2234
+
2235
+ def delete_tbl_md(self, tbl_id: UUID) -> None:
2236
+ """
2237
+ Deletes all table metadata from the store for the given table UUID.
2238
+ """
2239
+ conn = Env.get().conn
2240
+ _logger.info(f'delete_tbl_md({tbl_id})')
2241
+ conn.execute(sql.delete(schema.TableSchemaVersion.__table__).where(schema.TableSchemaVersion.tbl_id == tbl_id))
2242
+ conn.execute(sql.delete(schema.TableVersion.__table__).where(schema.TableVersion.tbl_id == tbl_id))
2243
+ conn.execute(sql.delete(schema.PendingTableOp.__table__).where(schema.PendingTableOp.tbl_id == tbl_id))
2244
+ conn.execute(sql.delete(schema.Table.__table__).where(schema.Table.id == tbl_id))
2245
+
2246
+ def load_replica_md(self, tbl: Table) -> list[TableVersionMd]:
2247
+ """
2248
+ Load metadata for the given table along with all its ancestors. The values of TableMd.current_version and
2249
+ TableMd.current_schema_version will be adjusted to ensure that the metadata represent a valid (internally
2250
+ consistent) table state.
2251
+ """
2252
+ # TODO: First acquire X-locks for all relevant metadata entries
2253
+ # TODO: handle concurrent drop()
2254
+
2255
+ # Load metadata for every table in the TableVersionPath for `tbl`.
2256
+ md = [self.load_tbl_md(tv.key) for tv in tbl._tbl_version_path.get_tbl_versions()]
2257
+
2258
+ # If `tbl` is a named pure snapshot, we're not quite done, since the snapshot metadata won't appear in the
2259
+ # TableVersionPath. We need to prepend it separately.
2260
+ if isinstance(tbl, View) and tbl._is_named_pure_snapshot():
2261
+ snapshot_md = self.load_tbl_md(TableVersionKey(tbl._id, 0, None))
2262
+ md = [snapshot_md, *md]
2263
+
2264
+ for ancestor_md in md:
2265
+ # Set the `is_replica` flag on every ancestor's TableMd.
2266
+ ancestor_md.tbl_md.is_replica = True
2267
+ # For replica metadata, we guarantee that the current_version and current_schema_version of TableMd
2268
+ # match the corresponding values in TableVersionMd and TableSchemaVersionMd. This is to ensure that,
2269
+ # when the metadata is later stored in the catalog of a different Pixeltable instance, the values of
2270
+ # current_version and current_schema_version will always point to versions that are known to the
2271
+ # destination catalog.
2272
+ ancestor_md.tbl_md.current_version = ancestor_md.version_md.version
2273
+ ancestor_md.tbl_md.current_schema_version = ancestor_md.schema_version_md.schema_version
2274
+
2275
+ for ancestor_md in md[1:]:
2276
+ # Also, the table version of every proper ancestor is ephemeral; it does not represent a queryable
2277
+ # table version (the data might be incomplete, since we have only retrieved one of its views, not
2278
+ # the table itself).
2279
+ ancestor_md.version_md.is_fragment = True
2280
+
2281
+ return md
2282
+
2283
+ def _load_tbl_version(self, key: TableVersionKey, *, check_pending_ops: bool = True) -> TableVersion | None:
2284
+ """Creates TableVersion instance from stored metadata and registers it in _tbl_versions."""
2285
+ tv_md = self.load_tbl_md(key)
2286
+ tbl_md = tv_md.tbl_md
2287
+ version_md = tv_md.version_md
2288
+ schema_version_md = tv_md.schema_version_md
2289
+ view_md = tbl_md.view_md
2290
+
2291
+ conn = Env.get().conn
2292
+
2293
+ if check_pending_ops:
2294
+ # if we care about pending ops, we also care whether the table is in the process of getting dropped
2295
+ if tbl_md.pending_stmt == schema.TableStatement.DROP_TABLE:
2296
+ raise excs.Error(self._dropped_tbl_error_msg(key.tbl_id))
2297
+
2298
+ pending_ops_q = (
2299
+ sql.select(sql.func.count())
2300
+ .select_from(schema.Table)
2301
+ .join(schema.PendingTableOp)
2302
+ .where(schema.PendingTableOp.tbl_id == key.tbl_id)
2303
+ .where(schema.Table.id == key.tbl_id)
2304
+ )
2305
+ if key.effective_version is not None:
2306
+ # we only care about pending ops if the requested version is the current version
2307
+ pending_ops_q = pending_ops_q.where(
2308
+ sql.text(f"({schema.Table.__table__}.md->>'current_version')::int = {key.effective_version}")
2309
+ )
2310
+ has_pending_ops = conn.execute(pending_ops_q).scalar() > 0
2311
+ if has_pending_ops:
2312
+ raise PendingTableOpsError(key.tbl_id)
2313
+
2314
+ # load mutable view ids for mutable TableVersions
2315
+ mutable_view_ids: list[UUID] = []
2316
+ if key.effective_version is None and key.anchor_tbl_id is None and not tbl_md.is_replica:
2317
+ q = (
2318
+ sql.select(schema.Table.id)
2319
+ .where(schema.Table.md['view_md']['base_versions'][0][0].astext == key.tbl_id.hex)
2320
+ .where(schema.Table.md['view_md']['base_versions'][0][1].astext == None)
2321
+ )
2322
+ mutable_view_ids = [r[0] for r in conn.execute(q).all()]
2323
+
2324
+ mutable_views = [TableVersionHandle(TableVersionKey(id, None, None)) for id in mutable_view_ids]
2325
+
2326
+ tbl_version: TableVersion
2327
+ if view_md is None:
2328
+ # this is a base table
2329
+ tbl_version = TableVersion(key, tbl_md, version_md, schema_version_md, mutable_views)
2330
+ else:
2331
+ assert len(view_md.base_versions) > 0 # a view needs to have a base
2332
+ assert (
2333
+ not tv_md.is_pure_snapshot
2334
+ ) # a pure snapshot doesn't have a physical table backing it, no point in loading it
2335
+
2336
+ base: TableVersionHandle
2337
+ base_path: TableVersionPath | None = None # needed for live view
2338
+ if view_md.is_snapshot:
2339
+ base = TableVersionHandle(
2340
+ TableVersionKey(UUID(view_md.base_versions[0][0]), view_md.base_versions[0][1], key.anchor_tbl_id)
2341
+ )
2342
+ else:
2343
+ base_path = TableVersionPath.from_md(tbl_md.view_md.base_versions)
2344
+ base = base_path.tbl_version
2345
+
2346
+ tbl_version = TableVersion(
2347
+ key, tbl_md, version_md, schema_version_md, mutable_views, base_path=base_path, base=base
2348
+ )
2349
+
2350
+ # register the instance before init()
2351
+ self._tbl_versions[key] = tbl_version
2352
+ # register this instance as modified, so that it gets purged if the transaction fails (it may not be
2353
+ # fully initialized at that point)
2354
+ self.mark_modified_tvs(tbl_version.handle)
2355
+ tbl_version.init()
2356
+ return tbl_version
50
2357
 
51
2358
  def _init_store(self) -> None:
52
2359
  """One-time initialization of the stored catalog. Idempotent."""
53
- with orm.Session(env.Env.get().engine, future=True) as session:
54
- if session.query(sql.func.count(schema.Dir.id)).scalar() > 0:
2360
+ self.create_user(None)
2361
+ _logger.info('Initialized catalog.')
2362
+
2363
+ def create_user(self, user: str | None) -> None:
2364
+ """
2365
+ Creates a catalog record (root directory) for the specified user, if one does not already exist.
2366
+ """
2367
+ with Env.get().begin_xact():
2368
+ session = Env.get().session
2369
+ # See if there are any directories in the catalog matching the specified user.
2370
+ if session.query(schema.Dir).where(schema.Dir.md['user'].astext == user).count() > 0:
2371
+ # At least one such directory exists; no need to create a new one.
55
2372
  return
56
- # create a top-level directory, so that every schema object has a directory
57
- dir_md = schema.DirMd(name='')
2373
+
2374
+ dir_md = schema.DirMd(name='', user=user, additional_md={})
58
2375
  dir_record = schema.Dir(parent_id=None, md=dataclasses.asdict(dir_md))
59
2376
  session.add(dir_record)
60
2377
  session.flush()
61
- session.commit()
62
- _logger.info(f'Initialized catalog')
63
-
64
- def _load_snapshot_version(
65
- self, tbl_id: UUID, version: int, base: Optional[TableVersion], session: orm.Session
66
- ) -> TableVersion:
67
- q = session.query(schema.Table, schema.TableSchemaVersion) \
68
- .select_from(schema.Table) \
69
- .join(schema.TableVersion) \
70
- .join(schema.TableSchemaVersion) \
71
- .where(schema.Table.id == tbl_id) \
72
- .where(sql.text(f"({schema.TableVersion.__table__}.md->>'version')::int = {version}")) \
73
- .where(sql.text((
74
- f"({schema.TableVersion.__table__}.md->>'schema_version')::int = "
75
- f"{schema.TableSchemaVersion.__table__}.{schema.TableSchemaVersion.schema_version.name}")))
76
- tbl_record, schema_version_record = q.one()
77
- tbl_md = schema.md_from_dict(schema.TableMd, tbl_record.md)
78
- schema_version_md = schema.md_from_dict(schema.TableSchemaVersionMd, schema_version_record.md)
79
- # we ignore tbl_record.base_tbl_id/base_snapshot_id and use 'base' instead: if the base is a snapshot
80
- # we'd have to look that up first
81
- return TableVersion(tbl_record.id, tbl_md, version, schema_version_md, is_snapshot=True, base=base)
2378
+ _logger.info(f'Added root directory record for user: {user!r}')
82
2379
 
83
- def _load_table_versions(self, session: orm.Session) -> None:
84
- from .insertable_table import InsertableTable
85
- from .view import View
2380
+ def _handle_path_collision(
2381
+ self,
2382
+ path: Path,
2383
+ expected_obj_type: type[SchemaObject],
2384
+ expected_snapshot: bool,
2385
+ if_exists: IfExistsParam,
2386
+ *,
2387
+ base: TableVersionPath | None = None,
2388
+ ) -> SchemaObject | None:
2389
+ obj, _, _ = self._prepare_dir_op(add_dir_path=path.parent, add_name=path.name)
86
2390
 
87
- # load tables/views;
88
- # do this in ascending order of creation ts so that we can resolve base references in one pass
89
- q = session.query(schema.Table, schema.TableSchemaVersion) \
90
- .select_from(schema.Table) \
91
- .join(schema.TableVersion) \
92
- .join(schema.TableSchemaVersion) \
93
- .where(sql.text(f"({schema.TableVersion.__table__}.md->>'version')::int = 0")) \
94
- .where(sql.text((
95
- f"({schema.Table.__table__}.md->>'current_schema_version')::int = "
96
- f"{schema.TableSchemaVersion.__table__}.{schema.TableSchemaVersion.schema_version.name}"))) \
97
- .order_by(sql.text(f"({schema.TableVersion.__table__}.md->>'created_at')::float"))
98
-
99
- for tbl_record, schema_version_record in q.all():
100
- tbl_md = schema.md_from_dict(schema.TableMd, tbl_record.md)
101
- schema_version_md = schema.md_from_dict(schema.TableSchemaVersionMd, schema_version_record.md)
102
- view_md = tbl_md.view_md
103
-
104
- if view_md is not None:
105
- assert len(view_md.base_versions) > 0
106
- # construct a TableVersionPath for the view
107
- refd_versions = [(UUID(tbl_id), version) for tbl_id, version in view_md.base_versions]
108
- base_path: Optional[TableVersionPath] = None
109
- base: Optional[TableVersion] = None
110
- # go through the versions in reverse order, so we can construct TableVersionPaths
111
- for base_id, version in refd_versions[::-1]:
112
- base_version = self.tbl_versions.get((base_id, version), None)
113
- if base_version is None:
114
- if version is None:
115
- # debugging
116
- pass
117
- # if this is a reference to a mutable table, we should have loaded it already
118
- assert version is not None
119
- base_version = self._load_snapshot_version(base_id, version, base, session)
120
- base_path = TableVersionPath(base_version, base=base_path)
121
- base = base_version
122
- assert base_path is not None
123
-
124
- base_tbl_id = base_path.tbl_id()
125
- is_snapshot = view_md is not None and view_md.is_snapshot
126
- snapshot_only = is_snapshot and view_md.predicate is None and len(schema_version_md.columns) == 0
127
- if snapshot_only:
128
- # this is a pure snapshot, without a physical table backing it
129
- view_path = base_path
2391
+ if if_exists == IfExistsParam.ERROR and obj is not None:
2392
+ raise excs.Error(f'Path {path!r} is an existing {obj._display_name()}')
2393
+ else:
2394
+ is_snapshot = isinstance(obj, View) and obj._tbl_version_path.is_snapshot()
2395
+ if obj is not None and (not isinstance(obj, expected_obj_type) or (expected_snapshot and not is_snapshot)):
2396
+ if expected_obj_type is Dir:
2397
+ obj_type_str = 'directory'
2398
+ elif expected_obj_type is InsertableTable:
2399
+ obj_type_str = 'table'
2400
+ elif expected_obj_type is View:
2401
+ obj_type_str = 'snapshot' if expected_snapshot else 'view'
130
2402
  else:
131
- tbl_version = TableVersion(
132
- tbl_record.id, tbl_md, tbl_md.current_version, schema_version_md, is_snapshot=is_snapshot,
133
- base=base_path.tbl_version if is_snapshot else None,
134
- base_path=base_path if not is_snapshot else None)
135
- view_path = TableVersionPath(tbl_version, base=base_path)
2403
+ raise AssertionError()
2404
+ raise excs.Error(
2405
+ f'Path {path!r} already exists but is not a {obj_type_str}. Cannot {if_exists.name.lower()} it.'
2406
+ )
136
2407
 
137
- tbl: Table = View(
138
- tbl_record.id, tbl_record.dir_id, tbl_md.name, view_path, base_tbl_id,
139
- snapshot_only=snapshot_only)
140
- self.tbl_dependents[base_tbl_id].append(tbl)
2408
+ if obj is None:
2409
+ return None
2410
+ if if_exists == IfExistsParam.IGNORE:
2411
+ return obj
141
2412
 
142
- else:
143
- tbl_version = TableVersion(tbl_record.id, tbl_md, tbl_md.current_version, schema_version_md)
144
- tbl = InsertableTable(tbl_record.dir_id, tbl_version)
145
-
146
- self.tbls[tbl._id] = tbl
147
- self.tbl_dependents[tbl._id] = []
148
- self.paths.add_schema_obj(tbl._dir_id, tbl_md.name, tbl)
149
-
150
- # def _load_functions(self, session: orm.Session) -> None:
151
- # # load Function metadata; doesn't load the actual callable, which can be large and is only done on-demand by the
152
- # # FunctionRegistry
153
- # q = session.query(schema.Function.id, schema.Function.dir_id, schema.Function.md) \
154
- # .where(sql.text(f"({schema.Function.__table__}.md->>'name')::text IS NOT NULL"))
155
- # for id, dir_id, md in q.all():
156
- # assert 'name' in md
157
- # name = md['name']
158
- # assert name is not None
159
- # named_fn = NamedFunction(id, dir_id, name)
160
- # self.paths.add_schema_obj(dir_id, name, named_fn)
2413
+ assert if_exists in (IfExistsParam.REPLACE, IfExistsParam.REPLACE_FORCE)
2414
+
2415
+ # Check for circularity
2416
+ if obj is not None and base is not None:
2417
+ assert isinstance(obj, Table) # or else it would have been caught above
2418
+ if obj._id in tuple(version.id for version in base.get_tbl_versions()):
2419
+ raise excs.Error(
2420
+ "Cannot use if_exists='replace' with the same name as one of the view's own ancestors."
2421
+ )
2422
+
2423
+ # drop the existing schema object
2424
+ if isinstance(obj, Dir):
2425
+ dir_contents = self._get_dir_contents(obj._id)
2426
+ if len(dir_contents) > 0 and if_exists == IfExistsParam.REPLACE:
2427
+ raise excs.Error(
2428
+ f'Directory {path!r} already exists and is not empty. '
2429
+ 'Use `if_exists="replace_force"` to replace it.'
2430
+ )
2431
+ self._drop_dir(obj._id, path, force=True)
2432
+ else:
2433
+ assert isinstance(obj, Table)
2434
+ self._drop_tbl(obj, force=if_exists == IfExistsParam.REPLACE_FORCE, is_replace=True)
2435
+ return None