py-data-engine 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (200) hide show
  1. data_engine/__init__.py +37 -0
  2. data_engine/application/__init__.py +39 -0
  3. data_engine/application/actions.py +42 -0
  4. data_engine/application/catalog.py +151 -0
  5. data_engine/application/control.py +213 -0
  6. data_engine/application/details.py +73 -0
  7. data_engine/application/runtime.py +449 -0
  8. data_engine/application/workspace.py +62 -0
  9. data_engine/authoring/__init__.py +14 -0
  10. data_engine/authoring/builder.py +31 -0
  11. data_engine/authoring/execution/__init__.py +6 -0
  12. data_engine/authoring/execution/app.py +6 -0
  13. data_engine/authoring/execution/context.py +82 -0
  14. data_engine/authoring/execution/continuous.py +176 -0
  15. data_engine/authoring/execution/grouped.py +106 -0
  16. data_engine/authoring/execution/logging.py +83 -0
  17. data_engine/authoring/execution/polling.py +135 -0
  18. data_engine/authoring/execution/runner.py +210 -0
  19. data_engine/authoring/execution/single.py +171 -0
  20. data_engine/authoring/flow.py +361 -0
  21. data_engine/authoring/helpers.py +160 -0
  22. data_engine/authoring/model.py +59 -0
  23. data_engine/authoring/primitives.py +430 -0
  24. data_engine/authoring/services.py +42 -0
  25. data_engine/devtools/__init__.py +3 -0
  26. data_engine/devtools/project_ast_map.py +503 -0
  27. data_engine/docs/__init__.py +1 -0
  28. data_engine/docs/sphinx_source/_static/custom.css +13 -0
  29. data_engine/docs/sphinx_source/api.rst +42 -0
  30. data_engine/docs/sphinx_source/conf.py +37 -0
  31. data_engine/docs/sphinx_source/guides/app-runtime-and-workspaces.md +397 -0
  32. data_engine/docs/sphinx_source/guides/authoring-flow-modules.md +215 -0
  33. data_engine/docs/sphinx_source/guides/configuring-flows.md +185 -0
  34. data_engine/docs/sphinx_source/guides/core-concepts.md +208 -0
  35. data_engine/docs/sphinx_source/guides/database-methods.md +107 -0
  36. data_engine/docs/sphinx_source/guides/duckdb-helpers.md +462 -0
  37. data_engine/docs/sphinx_source/guides/flow-context.md +538 -0
  38. data_engine/docs/sphinx_source/guides/flow-methods.md +206 -0
  39. data_engine/docs/sphinx_source/guides/getting-started.md +271 -0
  40. data_engine/docs/sphinx_source/guides/project-inventory.md +5683 -0
  41. data_engine/docs/sphinx_source/guides/project-map.md +118 -0
  42. data_engine/docs/sphinx_source/guides/recipes.md +268 -0
  43. data_engine/docs/sphinx_source/index.rst +22 -0
  44. data_engine/domain/__init__.py +92 -0
  45. data_engine/domain/actions.py +69 -0
  46. data_engine/domain/catalog.py +128 -0
  47. data_engine/domain/details.py +214 -0
  48. data_engine/domain/diagnostics.py +56 -0
  49. data_engine/domain/errors.py +104 -0
  50. data_engine/domain/inspection.py +99 -0
  51. data_engine/domain/logs.py +118 -0
  52. data_engine/domain/operations.py +172 -0
  53. data_engine/domain/operator.py +72 -0
  54. data_engine/domain/runs.py +155 -0
  55. data_engine/domain/runtime.py +279 -0
  56. data_engine/domain/source_state.py +17 -0
  57. data_engine/domain/support.py +54 -0
  58. data_engine/domain/time.py +23 -0
  59. data_engine/domain/workspace.py +159 -0
  60. data_engine/flow_modules/__init__.py +1 -0
  61. data_engine/flow_modules/flow_module_compiler.py +179 -0
  62. data_engine/flow_modules/flow_module_loader.py +201 -0
  63. data_engine/helpers/__init__.py +25 -0
  64. data_engine/helpers/duckdb.py +705 -0
  65. data_engine/hosts/__init__.py +1 -0
  66. data_engine/hosts/daemon/__init__.py +23 -0
  67. data_engine/hosts/daemon/app.py +221 -0
  68. data_engine/hosts/daemon/bootstrap.py +69 -0
  69. data_engine/hosts/daemon/client.py +465 -0
  70. data_engine/hosts/daemon/commands.py +64 -0
  71. data_engine/hosts/daemon/composition.py +310 -0
  72. data_engine/hosts/daemon/constants.py +15 -0
  73. data_engine/hosts/daemon/entrypoints.py +97 -0
  74. data_engine/hosts/daemon/lifecycle.py +191 -0
  75. data_engine/hosts/daemon/manager.py +272 -0
  76. data_engine/hosts/daemon/ownership.py +126 -0
  77. data_engine/hosts/daemon/runtime_commands.py +188 -0
  78. data_engine/hosts/daemon/runtime_control.py +31 -0
  79. data_engine/hosts/daemon/server.py +84 -0
  80. data_engine/hosts/daemon/shared_state.py +147 -0
  81. data_engine/hosts/daemon/state_sync.py +101 -0
  82. data_engine/platform/__init__.py +1 -0
  83. data_engine/platform/identity.py +35 -0
  84. data_engine/platform/local_settings.py +146 -0
  85. data_engine/platform/theme.py +259 -0
  86. data_engine/platform/workspace_models.py +190 -0
  87. data_engine/platform/workspace_policy.py +333 -0
  88. data_engine/runtime/__init__.py +1 -0
  89. data_engine/runtime/file_watch.py +185 -0
  90. data_engine/runtime/ledger_models.py +116 -0
  91. data_engine/runtime/runtime_db.py +938 -0
  92. data_engine/runtime/shared_state.py +523 -0
  93. data_engine/services/__init__.py +49 -0
  94. data_engine/services/daemon.py +64 -0
  95. data_engine/services/daemon_state.py +40 -0
  96. data_engine/services/flow_catalog.py +102 -0
  97. data_engine/services/flow_execution.py +48 -0
  98. data_engine/services/ledger.py +85 -0
  99. data_engine/services/logs.py +65 -0
  100. data_engine/services/runtime_binding.py +105 -0
  101. data_engine/services/runtime_execution.py +126 -0
  102. data_engine/services/runtime_history.py +62 -0
  103. data_engine/services/settings.py +58 -0
  104. data_engine/services/shared_state.py +28 -0
  105. data_engine/services/theme.py +59 -0
  106. data_engine/services/workspace_provisioning.py +224 -0
  107. data_engine/services/workspaces.py +74 -0
  108. data_engine/ui/__init__.py +3 -0
  109. data_engine/ui/cli/__init__.py +19 -0
  110. data_engine/ui/cli/app.py +161 -0
  111. data_engine/ui/cli/commands_doctor.py +178 -0
  112. data_engine/ui/cli/commands_run.py +80 -0
  113. data_engine/ui/cli/commands_start.py +100 -0
  114. data_engine/ui/cli/commands_workspace.py +97 -0
  115. data_engine/ui/cli/dependencies.py +44 -0
  116. data_engine/ui/cli/parser.py +56 -0
  117. data_engine/ui/gui/__init__.py +25 -0
  118. data_engine/ui/gui/app.py +116 -0
  119. data_engine/ui/gui/bootstrap.py +487 -0
  120. data_engine/ui/gui/bootstrapper.py +140 -0
  121. data_engine/ui/gui/cache_models.py +23 -0
  122. data_engine/ui/gui/control_support.py +185 -0
  123. data_engine/ui/gui/controllers/__init__.py +6 -0
  124. data_engine/ui/gui/controllers/flows.py +439 -0
  125. data_engine/ui/gui/controllers/runtime.py +245 -0
  126. data_engine/ui/gui/dialogs/__init__.py +12 -0
  127. data_engine/ui/gui/dialogs/messages.py +88 -0
  128. data_engine/ui/gui/dialogs/previews.py +222 -0
  129. data_engine/ui/gui/helpers/__init__.py +62 -0
  130. data_engine/ui/gui/helpers/inspection.py +81 -0
  131. data_engine/ui/gui/helpers/lifecycle.py +112 -0
  132. data_engine/ui/gui/helpers/scroll.py +28 -0
  133. data_engine/ui/gui/helpers/theming.py +87 -0
  134. data_engine/ui/gui/icons/dark_light.svg +12 -0
  135. data_engine/ui/gui/icons/documentation.svg +1 -0
  136. data_engine/ui/gui/icons/failed.svg +3 -0
  137. data_engine/ui/gui/icons/group.svg +4 -0
  138. data_engine/ui/gui/icons/home.svg +2 -0
  139. data_engine/ui/gui/icons/manual.svg +2 -0
  140. data_engine/ui/gui/icons/poll.svg +2 -0
  141. data_engine/ui/gui/icons/schedule.svg +4 -0
  142. data_engine/ui/gui/icons/settings.svg +2 -0
  143. data_engine/ui/gui/icons/started.svg +3 -0
  144. data_engine/ui/gui/icons/success.svg +3 -0
  145. data_engine/ui/gui/icons/view-log.svg +3 -0
  146. data_engine/ui/gui/icons.py +50 -0
  147. data_engine/ui/gui/launcher.py +48 -0
  148. data_engine/ui/gui/presenters/__init__.py +72 -0
  149. data_engine/ui/gui/presenters/docs.py +140 -0
  150. data_engine/ui/gui/presenters/logs.py +58 -0
  151. data_engine/ui/gui/presenters/runtime_projection.py +29 -0
  152. data_engine/ui/gui/presenters/sidebar.py +88 -0
  153. data_engine/ui/gui/presenters/steps.py +148 -0
  154. data_engine/ui/gui/presenters/workspace.py +39 -0
  155. data_engine/ui/gui/presenters/workspace_binding.py +75 -0
  156. data_engine/ui/gui/presenters/workspace_settings.py +182 -0
  157. data_engine/ui/gui/preview_models.py +37 -0
  158. data_engine/ui/gui/render_support.py +241 -0
  159. data_engine/ui/gui/rendering/__init__.py +12 -0
  160. data_engine/ui/gui/rendering/artifacts.py +95 -0
  161. data_engine/ui/gui/rendering/icons.py +50 -0
  162. data_engine/ui/gui/runtime.py +47 -0
  163. data_engine/ui/gui/state_support.py +193 -0
  164. data_engine/ui/gui/support.py +214 -0
  165. data_engine/ui/gui/surface.py +209 -0
  166. data_engine/ui/gui/theme.py +720 -0
  167. data_engine/ui/gui/widgets/__init__.py +34 -0
  168. data_engine/ui/gui/widgets/config.py +41 -0
  169. data_engine/ui/gui/widgets/logs.py +62 -0
  170. data_engine/ui/gui/widgets/panels.py +507 -0
  171. data_engine/ui/gui/widgets/sidebar.py +130 -0
  172. data_engine/ui/gui/widgets/steps.py +84 -0
  173. data_engine/ui/tui/__init__.py +5 -0
  174. data_engine/ui/tui/app.py +222 -0
  175. data_engine/ui/tui/bootstrap.py +475 -0
  176. data_engine/ui/tui/bootstrapper.py +117 -0
  177. data_engine/ui/tui/controllers/__init__.py +6 -0
  178. data_engine/ui/tui/controllers/flows.py +349 -0
  179. data_engine/ui/tui/controllers/runtime.py +167 -0
  180. data_engine/ui/tui/runtime.py +34 -0
  181. data_engine/ui/tui/state_support.py +141 -0
  182. data_engine/ui/tui/support.py +63 -0
  183. data_engine/ui/tui/theme.py +204 -0
  184. data_engine/ui/tui/widgets.py +123 -0
  185. data_engine/views/__init__.py +109 -0
  186. data_engine/views/actions.py +80 -0
  187. data_engine/views/artifacts.py +58 -0
  188. data_engine/views/flow_display.py +69 -0
  189. data_engine/views/logs.py +54 -0
  190. data_engine/views/models.py +96 -0
  191. data_engine/views/presentation.py +133 -0
  192. data_engine/views/runs.py +62 -0
  193. data_engine/views/state.py +39 -0
  194. data_engine/views/status.py +13 -0
  195. data_engine/views/text.py +109 -0
  196. py_data_engine-0.1.0.dist-info/METADATA +330 -0
  197. py_data_engine-0.1.0.dist-info/RECORD +200 -0
  198. py_data_engine-0.1.0.dist-info/WHEEL +5 -0
  199. py_data_engine-0.1.0.dist-info/entry_points.txt +2 -0
  200. py_data_engine-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,705 @@
1
+ """Public one-shot DuckDB helpers for flow authoring."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import hashlib
6
+ from pathlib import Path
7
+
8
+ import duckdb
9
+ import polars as pl
10
+
11
+
12
+ def _quote_identifier(value: str) -> str:
13
+ text = str(value).strip()
14
+ if not text:
15
+ raise ValueError("Identifier must be non-empty.")
16
+ return '"' + text.replace('"', '""') + '"'
17
+
18
+
19
+ def _quote_table_ref(value: str) -> str:
20
+ parts = [part.strip() for part in str(value).split(".")]
21
+ if not parts or any(not part for part in parts):
22
+ raise ValueError("Table name must be non-empty.")
23
+ return ".".join(_quote_identifier(part) for part in parts)
24
+
25
+
26
+ def _schema_ref(value: str) -> str | None:
27
+ parts = [part.strip() for part in str(value).split(".")]
28
+ if len(parts) <= 1:
29
+ return None
30
+ return ".".join(_quote_identifier(part) for part in parts[:-1])
31
+
32
+
33
+ def _join_predicate(*, left_alias: str, right_alias: str, columns: tuple[str, ...]) -> str:
34
+ return " AND ".join(
35
+ f'{left_alias}.{_quote_identifier(column)} IS NOT DISTINCT FROM {right_alias}.{_quote_identifier(column)}'
36
+ for column in columns
37
+ )
38
+
39
+
40
+ def _ordered_columns(columns: tuple[str, ...]) -> str:
41
+ return ", ".join(_quote_identifier(column) for column in columns)
42
+
43
+
44
+ def _qualified_columns(alias: str, columns: tuple[str, ...]) -> str:
45
+ return ", ".join(f"{alias}.{_quote_identifier(column)}" for column in columns)
46
+
47
+
48
+ def _index_name(*, table: str, columns: tuple[str, ...]) -> str:
49
+ digest = hashlib.sha1(f"{table}|{'|'.join(columns)}".encode("utf-8")).hexdigest()[:10]
50
+ return f"uq_dim_{digest}"
51
+
52
+
53
def _existing_table_columns(connection, table: str) -> list[tuple[int, str, str, bool, object, bool]]:
    """Return the rows of ``PRAGMA table_info`` for *table* on *connection*.

    Callers in this module read the second field of each row as the column
    name and the third as its type.
    """
    if _schema_ref(table) is None:
        # Unqualified reference: quote only the bare table name.
        bare_name = str(table).split(".")[-1].strip()
        target = _quote_identifier(bare_name)
    else:
        target = _quote_table_ref(table)
    return connection.execute(f"PRAGMA table_info({target})").fetchall()
59
+
60
+
61
def _table_column_names(connection, table: str) -> tuple[str, ...]:
    """Return the column names of *table* in the order ``table_info`` reports them."""
    names = []
    for _position, column_name, *_rest in _existing_table_columns(connection, table):
        names.append(column_name)
    return tuple(names)
63
+
64
+
65
+ def _normalize_selected_columns(select: str | list[str] | tuple[str, ...]) -> tuple[str, ...]:
66
+ if isinstance(select, str):
67
+ normalized = (select.strip(),)
68
+ else:
69
+ normalized = tuple(str(value).strip() for value in select)
70
+ if not normalized or any(not value for value in normalized):
71
+ raise ValueError("select must include at least one non-empty column name.")
72
+ return normalized
73
+
74
+
75
+ def _normalize_key_columns(on: str | list[str] | tuple[str, ...]) -> tuple[str, ...]:
76
+ if isinstance(on, str):
77
+ normalized = (on.strip(),)
78
+ else:
79
+ normalized = tuple(str(value).strip() for value in on)
80
+ if not normalized or any(not value for value in normalized):
81
+ raise ValueError("on must include at least one non-empty column name.")
82
+ return normalized
83
+
84
+
85
+ def _normalize_optional_limit(limit: int | None) -> int | None:
86
+ if limit is None:
87
+ return None
88
+ normalized = int(limit)
89
+ if normalized < 0:
90
+ raise ValueError("limit must be non-negative.")
91
+ return normalized
92
+
93
+
94
def build_dimension(
    db_path: str | Path,
    table: str,
    *,
    df: pl.DataFrame,
    key_column: str = "dimension_key",
    return_df: bool = True,
) -> pl.DataFrame | None:
    """Build or extend one dimension table from unique incoming row combinations.

    The incoming dataframe is treated as the natural key definition: every incoming
    column participates in uniqueness. The helper ensures the dimension table
    exists, inserts only new combinations, assigns deterministic surrogate keys,
    and optionally returns the natural-key-to-surrogate-key mapping.

    Args:
        db_path: DuckDB database file; its parent directory is created if missing.
        table: Target table name, optionally qualified with dots.
        df: Incoming rows; all of its columns form the natural key.
        key_column: Name of the BIGINT surrogate key column.
        return_df: When true, return the mapping for the incoming combinations.

    Returns:
        The natural columns plus ``key_column`` for every distinct incoming
        combination, ordered by the natural columns, or ``None`` when
        ``return_df`` is false.

    Raises:
        TypeError: If ``df`` is not a Polars DataFrame.
        ValueError: If ``df`` has no columns, ``key_column`` is blank, or
            ``key_column`` collides with a column of ``df``.
    """

    if not isinstance(df, pl.DataFrame):
        raise TypeError("df must be a Polars DataFrame.")

    natural_columns = tuple(df.columns)
    if not natural_columns:
        raise ValueError("df must include at least one column.")

    normalized_key_column = str(key_column).strip()
    if not normalized_key_column:
        raise ValueError("key_column must be non-empty.")
    if normalized_key_column in natural_columns:
        raise ValueError(f'key_column {normalized_key_column!r} must not already exist in df columns.')

    quoted_table = _quote_table_ref(table)
    quoted_schema = _schema_ref(table)
    quoted_key_column = _quote_identifier(normalized_key_column)
    quoted_natural_columns = _ordered_columns(natural_columns)
    qualified_mapping_columns = _qualified_columns("mapping", natural_columns)
    # Qualify the key-assignment ordering with the new_rows alias so a natural
    # column that happens to be called "max_existing_key" cannot be ambiguous
    # against the current_keys subquery.
    new_row_order_columns = _qualified_columns("new_rows", natural_columns)
    natural_join = _join_predicate(left_alias="candidate", right_alias="existing", columns=natural_columns)
    mapping_join = _join_predicate(left_alias="mapping", right_alias="incoming_distinct", columns=natural_columns)

    temp_view = "__data_engine_dimension_incoming"
    temp_distinct = "__data_engine_dimension_incoming_distinct"
    temp_new_rows = "__data_engine_dimension_new_rows"
    unique_index_name = _quote_identifier(_index_name(table=table, columns=natural_columns))

    resolved_db_path = Path(db_path).expanduser().resolve()
    resolved_db_path.parent.mkdir(parents=True, exist_ok=True)

    connection = duckdb.connect(resolved_db_path)
    try:
        connection.execute("BEGIN TRANSACTION")
        if quoted_schema is not None:
            connection.execute(f"CREATE SCHEMA IF NOT EXISTS {quoted_schema}")
        connection.register(temp_view, df)
        connection.execute(f"CREATE OR REPLACE TEMP TABLE {temp_distinct} AS SELECT DISTINCT * FROM {temp_view}")
        # "WHERE 1 = 0" copies only the column layout, so a fresh table starts empty.
        connection.execute(
            f"""
            CREATE TABLE IF NOT EXISTS {quoted_table} AS
            SELECT
                CAST(NULL AS BIGINT) AS {quoted_key_column},
                *
            FROM {temp_distinct}
            WHERE 1 = 0
            """
        )
        connection.execute(
            f"CREATE UNIQUE INDEX IF NOT EXISTS {unique_index_name} ON {quoted_table} ({quoted_natural_columns})"
        )
        # Anti-join: keep only incoming combinations with no existing row.
        connection.execute(
            f"""
            CREATE OR REPLACE TEMP TABLE {temp_new_rows} AS
            SELECT candidate.*
            FROM {temp_distinct} AS candidate
            LEFT JOIN {quoted_table} AS existing
                ON {natural_join}
            WHERE existing.{quoted_key_column} IS NULL
            """
        )
        # New keys continue after the current maximum, numbered in natural-column order.
        connection.execute(
            f"""
            INSERT INTO {quoted_table} ({quoted_key_column}, {quoted_natural_columns})
            SELECT
                current_keys.max_existing_key + ROW_NUMBER() OVER (ORDER BY {new_row_order_columns}) AS {quoted_key_column},
                new_rows.*
            FROM {temp_new_rows} AS new_rows
            CROSS JOIN (
                SELECT COALESCE(MAX({quoted_key_column}), 0) AS max_existing_key
                FROM {quoted_table}
            ) AS current_keys
            """
        )

        mapping = None
        if return_df:
            mapping = connection.execute(
                f"""
                SELECT {qualified_mapping_columns}, mapping.{quoted_key_column}
                FROM {quoted_table} AS mapping
                INNER JOIN {temp_distinct} AS incoming_distinct
                    ON {mapping_join}
                ORDER BY {quoted_natural_columns}
                """
            ).pl()
        connection.execute("COMMIT")
        return mapping
    except Exception:
        # Best-effort rollback; the original error is what the caller needs to see.
        try:
            connection.execute("ROLLBACK")
        except Exception:
            pass
        raise
    finally:
        connection.close()
207
+
208
+
209
def replace_rows_by_file(
    db_path: str | Path,
    table: str,
    *,
    df: pl.DataFrame,
    file_hash: str,
    file_hash_column: str = "file_key",
    return_df: bool = True,
) -> pl.DataFrame | None:
    """Atomically replace one file's fact rows and append the current batch.

    A literal ``file_hash`` column is added to ``df``. Inside one transaction
    the target table is created if missing, widened with any incoming columns
    it lacks, cleared of rows carrying the same hash, and then loaded with the
    batch.

    Args:
        db_path: DuckDB database file; its parent directory is created if missing.
        table: Target table name, optionally qualified with dots.
        df: Rows belonging to one source file.
        file_hash: Value identifying the source file.
        file_hash_column: Name of the column that stores ``file_hash``.
        return_df: When true, return ``df`` with the hash column appended.

    Returns:
        The batch including the hash column, or ``None`` when ``return_df`` is false.

    Raises:
        TypeError: If ``df`` is not a Polars DataFrame.
        ValueError: If ``file_hash`` or ``file_hash_column`` is blank, if
            ``file_hash_column`` already exists in ``df``, or if ``df`` has no columns.
    """

    if not isinstance(df, pl.DataFrame):
        raise TypeError("df must be a Polars DataFrame.")

    normalized_file_hash = str(file_hash).strip()
    if not normalized_file_hash:
        raise ValueError("file_hash must be non-empty.")

    normalized_file_hash_column = str(file_hash_column).strip()
    if not normalized_file_hash_column:
        raise ValueError("file_hash_column must be non-empty.")
    if normalized_file_hash_column in df.columns:
        raise ValueError(f'file_hash_column {normalized_file_hash_column!r} must not already exist in df columns.')
    # Check the caller's columns before the hash column is appended; afterwards
    # the frame always has at least one column and this guard could never fire.
    if not df.columns:
        raise ValueError("df must include at least one column.")

    incoming_with_hash = df.with_columns(pl.lit(normalized_file_hash).alias(normalized_file_hash_column))
    incoming_columns = tuple(incoming_with_hash.columns)

    quoted_table = _quote_table_ref(table)
    quoted_schema = _schema_ref(table)
    quoted_file_hash_column = _quote_identifier(normalized_file_hash_column)
    quoted_incoming_columns = _ordered_columns(incoming_columns)

    temp_view = "__data_engine_incremental_incoming"
    temp_table = "__data_engine_incremental_incoming_table"

    resolved_db_path = Path(db_path).expanduser().resolve()
    resolved_db_path.parent.mkdir(parents=True, exist_ok=True)

    connection = duckdb.connect(resolved_db_path)
    try:
        connection.execute("BEGIN TRANSACTION")
        if quoted_schema is not None:
            connection.execute(f"CREATE SCHEMA IF NOT EXISTS {quoted_schema}")
        connection.register(temp_view, incoming_with_hash)
        connection.execute(f"CREATE OR REPLACE TEMP TABLE {temp_table} AS SELECT * FROM {temp_view}")
        # "WHERE 1 = 0" copies only the column layout, so a fresh table starts empty.
        connection.execute(f"CREATE TABLE IF NOT EXISTS {quoted_table} AS SELECT * FROM {temp_table} WHERE 1 = 0")

        # Widen the target with any incoming column it does not have yet.
        existing_columns = {name for _, name, _dtype, *_ in _existing_table_columns(connection, table)}
        incoming_info = connection.execute(f"PRAGMA table_info({temp_table})").fetchall()
        for _, name, dtype, *_ in incoming_info:
            if name in existing_columns:
                continue
            connection.execute(f"ALTER TABLE {quoted_table} ADD COLUMN {_quote_identifier(name)} {dtype}")

        connection.execute(
            f"DELETE FROM {quoted_table} WHERE {quoted_file_hash_column} = ?",
            [normalized_file_hash],
        )
        connection.execute(
            f"""
            INSERT INTO {quoted_table} ({quoted_incoming_columns})
            SELECT {quoted_incoming_columns}
            FROM {temp_table}
            """
        )

        connection.execute("COMMIT")
        return incoming_with_hash if return_df else None
    except Exception:
        # Best-effort rollback; the original error is what the caller needs to see.
        try:
            connection.execute("ROLLBACK")
        except Exception:
            pass
        raise
    finally:
        connection.close()
291
+
292
+
293
def replace_rows_by_values(
    db_path: str | Path,
    table: str,
    *,
    df: pl.DataFrame,
    column: str,
    return_df: bool = True,
) -> pl.DataFrame | None:
    """Atomically replace one value-slice of rows and append the current batch.

    Every existing row whose ``column`` value appears in ``df[column]`` is
    deleted, then ``df`` is inserted, all inside one transaction. NULL counts
    as a slice value: when the batch contains NULL in ``column``, existing rows
    with NULL there are replaced too.

    Args:
        db_path: DuckDB database file; its parent directory is created if missing.
        table: Target table name, optionally qualified with dots.
        df: Replacement rows; must contain at least one row.
        column: Column of ``df`` whose distinct values define the slice.
        return_df: When true, return ``df`` unchanged.

    Returns:
        ``df``, or ``None`` when ``return_df`` is false.

    Raises:
        TypeError: If ``df`` is not a Polars DataFrame.
        ValueError: If ``df`` is empty, ``column`` is blank, or ``column`` is
            not a column of ``df``.
    """

    if not isinstance(df, pl.DataFrame):
        raise TypeError("df must be a Polars DataFrame.")
    if df.is_empty():
        raise ValueError("df must include at least one row.")

    normalized_column = str(column).strip()
    if not normalized_column:
        raise ValueError("column must be non-empty.")
    if normalized_column not in df.columns:
        raise ValueError(f'column {normalized_column!r} must exist in df columns.')

    # df is non-empty here, so there is always at least one slice value.
    lookup = df.select(pl.col(normalized_column)).unique(maintain_order=True)

    quoted_table = _quote_table_ref(table)
    quoted_schema = _schema_ref(table)
    quoted_column = _quote_identifier(normalized_column)
    quoted_df_columns = _ordered_columns(tuple(df.columns))

    temp_view = "__data_engine_replace_values_df"
    temp_table = "__data_engine_replace_values_df_table"
    temp_lookup_view = "__data_engine_replace_values_lookup"
    temp_lookup_table = "__data_engine_replace_values_lookup_table"

    resolved_db_path = Path(db_path).expanduser().resolve()
    resolved_db_path.parent.mkdir(parents=True, exist_ok=True)

    connection = duckdb.connect(resolved_db_path)
    try:
        connection.execute("BEGIN TRANSACTION")
        if quoted_schema is not None:
            connection.execute(f"CREATE SCHEMA IF NOT EXISTS {quoted_schema}")

        connection.register(temp_view, df)
        connection.execute(f"CREATE OR REPLACE TEMP TABLE {temp_table} AS SELECT * FROM {temp_view}")
        # "WHERE 1 = 0" copies only the column layout, so a fresh table starts empty.
        connection.execute(f"CREATE TABLE IF NOT EXISTS {quoted_table} AS SELECT * FROM {temp_table} WHERE 1 = 0")

        # Widen the target with any incoming column it does not have yet.
        existing_columns = {name for _, name, _dtype, *_ in _existing_table_columns(connection, table)}
        incoming_info = connection.execute(f"PRAGMA table_info({temp_table})").fetchall()
        for _, name, dtype, *_ in incoming_info:
            if name in existing_columns:
                continue
            connection.execute(f"ALTER TABLE {quoted_table} ADD COLUMN {_quote_identifier(name)} {dtype}")

        connection.register(temp_lookup_view, lookup)
        connection.execute(
            f"""
            CREATE OR REPLACE TEMP TABLE {temp_lookup_table} AS
            SELECT {quoted_column} AS lookup_value
            FROM {temp_lookup_view}
            """
        )

        # "x IN (...)" is never true for NULL, so a plain IN would leave old
        # NULL-valued rows behind and duplicate them on every rerun. The second
        # branch removes them when the batch itself carries a NULL slice value.
        connection.execute(
            f"""
            DELETE FROM {quoted_table}
            WHERE {quoted_column} IN (
                SELECT lookup_value
                FROM {temp_lookup_table}
            )
            OR (
                {quoted_column} IS NULL
                AND EXISTS (
                    SELECT 1
                    FROM {temp_lookup_table}
                    WHERE lookup_value IS NULL
                )
            )
            """
        )
        connection.execute(
            f"""
            INSERT INTO {quoted_table} ({quoted_df_columns})
            SELECT {quoted_df_columns}
            FROM {temp_table}
            """
        )

        connection.execute("COMMIT")
        return df if return_df else None
    except Exception:
        # Best-effort rollback; the original error is what the caller needs to see.
        try:
            connection.execute("ROLLBACK")
        except Exception:
            pass
        raise
    finally:
        connection.close()
388
+
389
+
390
def attach_dimension(
    db_path: str | Path,
    table: str,
    *,
    df: pl.DataFrame,
    on: str | list[str] | tuple[str, ...],
    key_column: str = "dimension_key",
    drop_key: bool = False,
) -> pl.DataFrame:
    """Attach an existing surrogate key mapping table to an input dataframe.

    Rows of ``table`` are fetched by the values of the first ``on`` column,
    de-duplicated on all ``on`` columns, and left-joined onto ``df``.

    Args:
        db_path: DuckDB database file.
        table: Dimension table name, optionally qualified with dots.
        df: Frame containing the ``on`` columns.
        on: Natural-key column name(s) shared by ``df`` and ``table``.
        key_column: Surrogate key column to bring over from ``table``.
        drop_key: When true, the ``on`` columns are removed from the result,
            leaving the surrogate key in their place.

    Returns:
        ``df`` with ``key_column`` attached.

    Raises:
        TypeError: If ``df`` is not a Polars DataFrame.
        ValueError: If ``on`` is empty or blank, names columns missing from
            ``df``, or ``key_column`` is blank.
    """

    if not isinstance(df, pl.DataFrame):
        raise TypeError("df must be a Polars DataFrame.")

    join_columns = _normalize_key_columns(on)
    missing_columns = [column for column in join_columns if column not in df.columns]
    if missing_columns:
        raise ValueError(f"on columns must exist in df: {missing_columns!r}")

    # Validate here so a blank key_column is reported as such, rather than as
    # the unrelated "select must include ..." error from the read helper.
    normalized_key_column = str(key_column).strip()
    if not normalized_key_column:
        raise ValueError("key_column must be non-empty.")

    first_column = join_columns[0]
    mapping = read_rows_by_values(
        db_path,
        table,
        column=first_column,
        is_in=df.get_column(first_column).unique().to_list(),
        select=[*join_columns, normalized_key_column],
    ).unique(subset=list(join_columns), maintain_order=True)

    # NOTE(review): build_dimension matches NULL natural keys with
    # IS NOT DISTINCT FROM; confirm whether this Polars join should also treat
    # NULL keys as equal so such rows receive their surrogate key.
    normalized = df.join(mapping, on=list(join_columns), how="left", validate="m:1")
    if drop_key:
        normalized = normalized.drop(list(join_columns))
    return normalized
421
+
422
+
423
def denormalize_columns(
    db_path: str | Path,
    table: str,
    *,
    df: pl.DataFrame,
    key_column: str = "dimension_key",
    select: str | list[str] | tuple[str, ...] = "*",
    drop_key: bool = False,
) -> pl.DataFrame:
    """Attach natural columns from an existing dimension table onto a keyed dataframe.

    Args:
        db_path: DuckDB database file; its parent directory is created if missing.
        table: Dimension table name, optionally qualified with dots.
        df: Frame that already carries ``key_column``.
        key_column: Surrogate key column present in both ``df`` and ``table``.
        select: Columns to bring over; ``"*"`` means every table column except the key.
        drop_key: When true, ``key_column`` is removed from the result.

    Returns:
        ``df`` left-joined with the selected dimension columns.

    Raises:
        TypeError: If ``df`` is not a Polars DataFrame.
        ValueError: If the key or selected columns are blank, missing, or
            include the key column itself.
    """

    if not isinstance(df, pl.DataFrame):
        raise TypeError("df must be a Polars DataFrame.")

    surrogate = str(key_column).strip()
    if not surrogate:
        raise ValueError("key_column must be non-empty.")
    if surrogate not in df.columns:
        raise ValueError(f"key_column {surrogate!r} must exist in df.")

    database_file = Path(db_path).expanduser().resolve()
    database_file.parent.mkdir(parents=True, exist_ok=True)

    # Short-lived connection used only to discover the table's columns.
    connection = duckdb.connect(database_file)
    try:
        available = _table_column_names(connection, table)
    finally:
        connection.close()

    if not available:
        raise ValueError(f"Table {table!r} does not exist or has no columns.")
    if surrogate not in available:
        raise ValueError(f"key_column {surrogate!r} must exist in table {table!r}.")

    if select == "*":
        wanted = tuple(name for name in available if name != surrogate)
    else:
        wanted = _normalize_selected_columns(select)
        absent = [name for name in wanted if name not in available]
        if absent:
            raise ValueError(f"select columns must exist in table {table!r}: {absent!r}")
        if surrogate in wanted:
            raise ValueError(f"select must not include key_column {surrogate!r}.")

    if not wanted:
        raise ValueError("select must include at least one non-key column.")

    key_values = df.get_column(surrogate).unique().to_list()
    dimension_rows = read_rows_by_values(
        db_path,
        table,
        column=surrogate,
        is_in=key_values,
        select=[surrogate, *wanted],
    )
    dimension_rows = dimension_rows.unique(subset=[surrogate], maintain_order=True)

    joined = df.join(dimension_rows, on=[surrogate], how="left", validate="m:1")
    return joined.drop([surrogate]) if drop_key else joined
482
+
483
+
484
def normalize_columns(
    db_path: str | Path,
    table: str,
    *,
    df: pl.DataFrame,
    on: str | list[str] | tuple[str, ...],
    key_column: str = "dimension_key",
    drop_key: bool = True,
    returns: str | None = "df",
) -> pl.DataFrame | None:
    """Build missing surrogate keys and attach them back onto the input dataframe.

    Args:
        db_path: DuckDB database file.
        table: Dimension table name, optionally qualified with dots.
        df: Frame containing the ``on`` columns.
        on: Natural-key column name(s).
        key_column: Surrogate key column name.
        drop_key: Forwarded to ``attach_dimension``; when true the ``on``
            columns are removed from the returned frame.
        returns: ``"df"`` for the keyed frame, ``"map"`` for the
            natural-key-to-surrogate-key mapping, or ``None`` to only update
            the dimension table.

    Returns:
        The keyed frame, the mapping, or ``None``, depending on ``returns``.

    Raises:
        TypeError: If ``df`` is not a Polars DataFrame.
        ValueError: If ``returns`` is not one of the accepted values or ``on``
            is empty or blank.
        RuntimeError: If the mapping was requested but not produced.
    """

    if returns not in {"df", "map", None}:
        raise ValueError('returns must be "df", "map", or None.')
    # Fail early with the same message the other helpers use.
    if not isinstance(df, pl.DataFrame):
        raise TypeError("df must be a Polars DataFrame.")

    join_columns = _normalize_key_columns(on)
    natural_key_df = df.select(list(join_columns)).unique(maintain_order=True)

    # Only "map" uses the mapping; the other modes skip the extra query.
    wants_mapping = returns == "map"
    mapping = build_dimension(
        db_path,
        table,
        df=natural_key_df,
        key_column=key_column,
        return_df=wants_mapping,
    )

    if wants_mapping:
        if mapping is None:
            raise RuntimeError("build_dimension() unexpectedly returned no mapping.")
        return mapping
    if returns is None:
        return None

    return attach_dimension(
        db_path,
        table,
        df=df,
        on=join_columns,
        key_column=key_column,
        drop_key=drop_key,
    )
524
+
525
+
526
def read_rows_by_values(
    db_path: str | Path,
    table: str,
    *,
    column: str,
    is_in: list[object] | tuple[object, ...],
    select: str | list[str] | tuple[str, ...],
) -> pl.DataFrame:
    """Return selected columns for rows whose one column matches any provided value.

    Matching uses ``IS NOT DISTINCT FROM``, so a ``None`` in ``is_in`` matches
    NULL in the table. An empty ``is_in`` yields an empty result with the
    selected columns.

    Raises:
        ValueError: If ``column`` is blank or ``select`` has no usable names.
    """

    filter_column = str(column).strip()
    if not filter_column:
        raise ValueError("column must be non-empty.")

    projection = _qualified_columns("source_rows", _normalize_selected_columns(select))
    table_sql = _quote_table_ref(table)
    column_sql = _quote_identifier(filter_column)

    database_file = Path(db_path).expanduser().resolve()
    database_file.parent.mkdir(parents=True, exist_ok=True)

    connection = duckdb.connect(database_file)
    try:
        if is_in:
            # Stage the distinct lookup values as a temp table to join against.
            values = pl.DataFrame({"lookup_value": list(is_in)}).unique(maintain_order=True)
            connection.register("__data_engine_lookup_values", values)
            connection.execute(
                "CREATE OR REPLACE TEMP TABLE __data_engine_lookup_values_table AS SELECT * FROM __data_engine_lookup_values"
            )
            query = f"""
                SELECT {projection}
                FROM {table_sql} AS source_rows
                INNER JOIN __data_engine_lookup_values_table AS lookup
                    ON source_rows.{column_sql} IS NOT DISTINCT FROM lookup.lookup_value
                """
        else:
            # Nothing to match: still run a query so the result has the right columns.
            query = f"""
                SELECT {projection}
                FROM {table_sql} AS source_rows
                WHERE 1 = 0
                """
        return connection.execute(query).pl()
    finally:
        connection.close()
574
+
575
+
576
def read_sql(
    db_path: str | Path,
    *,
    sql: str,
) -> pl.DataFrame:
    """Run one SQL query and return the result as a Polars DataFrame.

    The database's parent directory is created if missing, and the connection
    is closed whether or not the query succeeds.

    Raises:
        ValueError: If ``sql`` is blank.
    """

    statement = str(sql).strip()
    if statement == "":
        raise ValueError("sql must be non-empty.")

    database_file = Path(db_path).expanduser().resolve()
    database_file.parent.mkdir(parents=True, exist_ok=True)

    connection = duckdb.connect(database_file)
    try:
        frame = connection.execute(statement).pl()
    finally:
        connection.close()
    return frame
595
+
596
+
597
def read_table(
    db_path: str | Path,
    table: str,
    *,
    select: str | list[str] | tuple[str, ...] = "*",
    where: str | None = None,
    limit: int | None = None,
) -> pl.DataFrame:
    """Read rows from one table with optional column selection, filter, and limit.

    ``where`` is placed into the query verbatim, so it must be trusted SQL.

    Raises:
        ValueError: If the table name, a selected column, or ``limit`` is invalid.
    """

    table_sql = _quote_table_ref(table)
    filter_sql = str(where).strip() if where is not None else None
    row_cap = _normalize_optional_limit(limit)

    projection = "*" if select == "*" else _ordered_columns(_normalize_selected_columns(select))

    clauses = [f"SELECT {projection}", f"FROM {table_sql}"]
    if filter_sql:
        clauses.append(f"WHERE {filter_sql}")
    if row_cap is not None:
        clauses.append(f"LIMIT {row_cap}")

    return read_sql(db_path, sql="\n".join(clauses))
624
+
625
+
626
def replace_table(
    db_path: str | Path,
    table: str,
    *,
    df: pl.DataFrame,
    return_df: bool = True,
) -> pl.DataFrame | None:
    """Replace one table wholesale from a dataframe, expanding to the current df schema.

    Inside one transaction the table is created if missing, widened with any
    ``df`` column it lacks, emptied, and reloaded from ``df``.

    Returns:
        ``df``, or ``None`` when ``return_df`` is false.

    Raises:
        TypeError: If ``df`` is not a Polars DataFrame.
        ValueError: If ``df`` has no columns.
    """

    if not isinstance(df, pl.DataFrame):
        raise TypeError("df must be a Polars DataFrame.")

    frame_columns = tuple(df.columns)
    if len(frame_columns) == 0:
        raise ValueError("df must include at least one column.")

    target = _quote_table_ref(table)
    schema = _schema_ref(table)
    column_list = _ordered_columns(frame_columns)

    staged_view = "__data_engine_replace_table_df"
    staged_table = "__data_engine_replace_table_df_table"

    database_file = Path(db_path).expanduser().resolve()
    database_file.parent.mkdir(parents=True, exist_ok=True)

    connection = duckdb.connect(database_file)
    try:
        connection.execute("BEGIN TRANSACTION")
        if schema is not None:
            connection.execute(f"CREATE SCHEMA IF NOT EXISTS {schema}")

        connection.register(staged_view, df)
        connection.execute(f"CREATE OR REPLACE TEMP TABLE {staged_table} AS SELECT * FROM {staged_view}")
        # "WHERE 1 = 0" copies only the column layout, so a fresh table starts empty.
        connection.execute(f"CREATE TABLE IF NOT EXISTS {target} AS SELECT * FROM {staged_table} WHERE 1 = 0")

        # Widen the target with any staged column it does not have yet.
        known_columns = {name for _, name, _dtype, *_ in _existing_table_columns(connection, table)}
        staged_info = connection.execute(f"PRAGMA table_info({staged_table})").fetchall()
        for _, name, dtype, *_ in staged_info:
            if name not in known_columns:
                connection.execute(f"ALTER TABLE {target} ADD COLUMN {_quote_identifier(name)} {dtype}")

        connection.execute(f"DELETE FROM {target}")
        connection.execute(
            f"""
            INSERT INTO {target} ({column_list})
            SELECT {column_list}
            FROM {staged_table}
            """
        )

        connection.execute("COMMIT")
        return df if return_df else None
    except Exception:
        # Best-effort rollback; the original error is what the caller needs to see.
        try:
            connection.execute("ROLLBACK")
        except Exception:
            pass
        raise
    finally:
        connection.close()
692
+
693
+
694
# Public helper API of this module, listed alphabetically.
__all__ = [
    "attach_dimension", "build_dimension", "denormalize_columns", "normalize_columns",
    "read_rows_by_values", "read_sql", "read_table",
    "replace_rows_by_file", "replace_rows_by_values", "replace_table",
]
@@ -0,0 +1 @@
1
+ """Host surfaces and process adapters."""