py-data-engine 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data_engine/__init__.py +37 -0
- data_engine/application/__init__.py +39 -0
- data_engine/application/actions.py +42 -0
- data_engine/application/catalog.py +151 -0
- data_engine/application/control.py +213 -0
- data_engine/application/details.py +73 -0
- data_engine/application/runtime.py +449 -0
- data_engine/application/workspace.py +62 -0
- data_engine/authoring/__init__.py +14 -0
- data_engine/authoring/builder.py +31 -0
- data_engine/authoring/execution/__init__.py +6 -0
- data_engine/authoring/execution/app.py +6 -0
- data_engine/authoring/execution/context.py +82 -0
- data_engine/authoring/execution/continuous.py +176 -0
- data_engine/authoring/execution/grouped.py +106 -0
- data_engine/authoring/execution/logging.py +83 -0
- data_engine/authoring/execution/polling.py +135 -0
- data_engine/authoring/execution/runner.py +210 -0
- data_engine/authoring/execution/single.py +171 -0
- data_engine/authoring/flow.py +361 -0
- data_engine/authoring/helpers.py +160 -0
- data_engine/authoring/model.py +59 -0
- data_engine/authoring/primitives.py +430 -0
- data_engine/authoring/services.py +42 -0
- data_engine/devtools/__init__.py +3 -0
- data_engine/devtools/project_ast_map.py +503 -0
- data_engine/docs/__init__.py +1 -0
- data_engine/docs/sphinx_source/_static/custom.css +13 -0
- data_engine/docs/sphinx_source/api.rst +42 -0
- data_engine/docs/sphinx_source/conf.py +37 -0
- data_engine/docs/sphinx_source/guides/app-runtime-and-workspaces.md +397 -0
- data_engine/docs/sphinx_source/guides/authoring-flow-modules.md +215 -0
- data_engine/docs/sphinx_source/guides/configuring-flows.md +185 -0
- data_engine/docs/sphinx_source/guides/core-concepts.md +208 -0
- data_engine/docs/sphinx_source/guides/database-methods.md +107 -0
- data_engine/docs/sphinx_source/guides/duckdb-helpers.md +462 -0
- data_engine/docs/sphinx_source/guides/flow-context.md +538 -0
- data_engine/docs/sphinx_source/guides/flow-methods.md +206 -0
- data_engine/docs/sphinx_source/guides/getting-started.md +271 -0
- data_engine/docs/sphinx_source/guides/project-inventory.md +5683 -0
- data_engine/docs/sphinx_source/guides/project-map.md +118 -0
- data_engine/docs/sphinx_source/guides/recipes.md +268 -0
- data_engine/docs/sphinx_source/index.rst +22 -0
- data_engine/domain/__init__.py +92 -0
- data_engine/domain/actions.py +69 -0
- data_engine/domain/catalog.py +128 -0
- data_engine/domain/details.py +214 -0
- data_engine/domain/diagnostics.py +56 -0
- data_engine/domain/errors.py +104 -0
- data_engine/domain/inspection.py +99 -0
- data_engine/domain/logs.py +118 -0
- data_engine/domain/operations.py +172 -0
- data_engine/domain/operator.py +72 -0
- data_engine/domain/runs.py +155 -0
- data_engine/domain/runtime.py +279 -0
- data_engine/domain/source_state.py +17 -0
- data_engine/domain/support.py +54 -0
- data_engine/domain/time.py +23 -0
- data_engine/domain/workspace.py +159 -0
- data_engine/flow_modules/__init__.py +1 -0
- data_engine/flow_modules/flow_module_compiler.py +179 -0
- data_engine/flow_modules/flow_module_loader.py +201 -0
- data_engine/helpers/__init__.py +25 -0
- data_engine/helpers/duckdb.py +705 -0
- data_engine/hosts/__init__.py +1 -0
- data_engine/hosts/daemon/__init__.py +23 -0
- data_engine/hosts/daemon/app.py +221 -0
- data_engine/hosts/daemon/bootstrap.py +69 -0
- data_engine/hosts/daemon/client.py +465 -0
- data_engine/hosts/daemon/commands.py +64 -0
- data_engine/hosts/daemon/composition.py +310 -0
- data_engine/hosts/daemon/constants.py +15 -0
- data_engine/hosts/daemon/entrypoints.py +97 -0
- data_engine/hosts/daemon/lifecycle.py +191 -0
- data_engine/hosts/daemon/manager.py +272 -0
- data_engine/hosts/daemon/ownership.py +126 -0
- data_engine/hosts/daemon/runtime_commands.py +188 -0
- data_engine/hosts/daemon/runtime_control.py +31 -0
- data_engine/hosts/daemon/server.py +84 -0
- data_engine/hosts/daemon/shared_state.py +147 -0
- data_engine/hosts/daemon/state_sync.py +101 -0
- data_engine/platform/__init__.py +1 -0
- data_engine/platform/identity.py +35 -0
- data_engine/platform/local_settings.py +146 -0
- data_engine/platform/theme.py +259 -0
- data_engine/platform/workspace_models.py +190 -0
- data_engine/platform/workspace_policy.py +333 -0
- data_engine/runtime/__init__.py +1 -0
- data_engine/runtime/file_watch.py +185 -0
- data_engine/runtime/ledger_models.py +116 -0
- data_engine/runtime/runtime_db.py +938 -0
- data_engine/runtime/shared_state.py +523 -0
- data_engine/services/__init__.py +49 -0
- data_engine/services/daemon.py +64 -0
- data_engine/services/daemon_state.py +40 -0
- data_engine/services/flow_catalog.py +102 -0
- data_engine/services/flow_execution.py +48 -0
- data_engine/services/ledger.py +85 -0
- data_engine/services/logs.py +65 -0
- data_engine/services/runtime_binding.py +105 -0
- data_engine/services/runtime_execution.py +126 -0
- data_engine/services/runtime_history.py +62 -0
- data_engine/services/settings.py +58 -0
- data_engine/services/shared_state.py +28 -0
- data_engine/services/theme.py +59 -0
- data_engine/services/workspace_provisioning.py +224 -0
- data_engine/services/workspaces.py +74 -0
- data_engine/ui/__init__.py +3 -0
- data_engine/ui/cli/__init__.py +19 -0
- data_engine/ui/cli/app.py +161 -0
- data_engine/ui/cli/commands_doctor.py +178 -0
- data_engine/ui/cli/commands_run.py +80 -0
- data_engine/ui/cli/commands_start.py +100 -0
- data_engine/ui/cli/commands_workspace.py +97 -0
- data_engine/ui/cli/dependencies.py +44 -0
- data_engine/ui/cli/parser.py +56 -0
- data_engine/ui/gui/__init__.py +25 -0
- data_engine/ui/gui/app.py +116 -0
- data_engine/ui/gui/bootstrap.py +487 -0
- data_engine/ui/gui/bootstrapper.py +140 -0
- data_engine/ui/gui/cache_models.py +23 -0
- data_engine/ui/gui/control_support.py +185 -0
- data_engine/ui/gui/controllers/__init__.py +6 -0
- data_engine/ui/gui/controllers/flows.py +439 -0
- data_engine/ui/gui/controllers/runtime.py +245 -0
- data_engine/ui/gui/dialogs/__init__.py +12 -0
- data_engine/ui/gui/dialogs/messages.py +88 -0
- data_engine/ui/gui/dialogs/previews.py +222 -0
- data_engine/ui/gui/helpers/__init__.py +62 -0
- data_engine/ui/gui/helpers/inspection.py +81 -0
- data_engine/ui/gui/helpers/lifecycle.py +112 -0
- data_engine/ui/gui/helpers/scroll.py +28 -0
- data_engine/ui/gui/helpers/theming.py +87 -0
- data_engine/ui/gui/icons/dark_light.svg +12 -0
- data_engine/ui/gui/icons/documentation.svg +1 -0
- data_engine/ui/gui/icons/failed.svg +3 -0
- data_engine/ui/gui/icons/group.svg +4 -0
- data_engine/ui/gui/icons/home.svg +2 -0
- data_engine/ui/gui/icons/manual.svg +2 -0
- data_engine/ui/gui/icons/poll.svg +2 -0
- data_engine/ui/gui/icons/schedule.svg +4 -0
- data_engine/ui/gui/icons/settings.svg +2 -0
- data_engine/ui/gui/icons/started.svg +3 -0
- data_engine/ui/gui/icons/success.svg +3 -0
- data_engine/ui/gui/icons/view-log.svg +3 -0
- data_engine/ui/gui/icons.py +50 -0
- data_engine/ui/gui/launcher.py +48 -0
- data_engine/ui/gui/presenters/__init__.py +72 -0
- data_engine/ui/gui/presenters/docs.py +140 -0
- data_engine/ui/gui/presenters/logs.py +58 -0
- data_engine/ui/gui/presenters/runtime_projection.py +29 -0
- data_engine/ui/gui/presenters/sidebar.py +88 -0
- data_engine/ui/gui/presenters/steps.py +148 -0
- data_engine/ui/gui/presenters/workspace.py +39 -0
- data_engine/ui/gui/presenters/workspace_binding.py +75 -0
- data_engine/ui/gui/presenters/workspace_settings.py +182 -0
- data_engine/ui/gui/preview_models.py +37 -0
- data_engine/ui/gui/render_support.py +241 -0
- data_engine/ui/gui/rendering/__init__.py +12 -0
- data_engine/ui/gui/rendering/artifacts.py +95 -0
- data_engine/ui/gui/rendering/icons.py +50 -0
- data_engine/ui/gui/runtime.py +47 -0
- data_engine/ui/gui/state_support.py +193 -0
- data_engine/ui/gui/support.py +214 -0
- data_engine/ui/gui/surface.py +209 -0
- data_engine/ui/gui/theme.py +720 -0
- data_engine/ui/gui/widgets/__init__.py +34 -0
- data_engine/ui/gui/widgets/config.py +41 -0
- data_engine/ui/gui/widgets/logs.py +62 -0
- data_engine/ui/gui/widgets/panels.py +507 -0
- data_engine/ui/gui/widgets/sidebar.py +130 -0
- data_engine/ui/gui/widgets/steps.py +84 -0
- data_engine/ui/tui/__init__.py +5 -0
- data_engine/ui/tui/app.py +222 -0
- data_engine/ui/tui/bootstrap.py +475 -0
- data_engine/ui/tui/bootstrapper.py +117 -0
- data_engine/ui/tui/controllers/__init__.py +6 -0
- data_engine/ui/tui/controllers/flows.py +349 -0
- data_engine/ui/tui/controllers/runtime.py +167 -0
- data_engine/ui/tui/runtime.py +34 -0
- data_engine/ui/tui/state_support.py +141 -0
- data_engine/ui/tui/support.py +63 -0
- data_engine/ui/tui/theme.py +204 -0
- data_engine/ui/tui/widgets.py +123 -0
- data_engine/views/__init__.py +109 -0
- data_engine/views/actions.py +80 -0
- data_engine/views/artifacts.py +58 -0
- data_engine/views/flow_display.py +69 -0
- data_engine/views/logs.py +54 -0
- data_engine/views/models.py +96 -0
- data_engine/views/presentation.py +133 -0
- data_engine/views/runs.py +62 -0
- data_engine/views/state.py +39 -0
- data_engine/views/status.py +13 -0
- data_engine/views/text.py +109 -0
- py_data_engine-0.1.0.dist-info/METADATA +330 -0
- py_data_engine-0.1.0.dist-info/RECORD +200 -0
- py_data_engine-0.1.0.dist-info/WHEEL +5 -0
- py_data_engine-0.1.0.dist-info/entry_points.txt +2 -0
- py_data_engine-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,462 @@
|
|
|
1
|
+
# DuckDB Helpers
|
|
2
|
+
|
|
3
|
+
`data_engine.helpers.duckdb` is the first public helper layer for common warehouse-style authoring patterns.
|
|
4
|
+
|
|
5
|
+
These helpers are intentionally:
|
|
6
|
+
|
|
7
|
+
- one-shot
|
|
8
|
+
- explicit about the database path
|
|
9
|
+
- explicit about the target table
|
|
10
|
+
- responsible for their own connection lifecycle
|
|
11
|
+
|
|
12
|
+
That means each helper:
|
|
13
|
+
|
|
14
|
+
- opens DuckDB
|
|
15
|
+
- does one job
|
|
16
|
+
- commits or rolls back
|
|
17
|
+
- closes the connection
|
|
18
|
+
|
|
19
|
+
They are designed for flow code that wants less repeated SQL plumbing without hiding too much behavior.
|
|
20
|
+
|
|
21
|
+
## Import style
|
|
22
|
+
|
|
23
|
+
```python
|
|
24
|
+
from data_engine.helpers.duckdb import build_dimension
|
|
25
|
+
from data_engine.helpers.duckdb import attach_dimension
|
|
26
|
+
from data_engine.helpers.duckdb import denormalize_columns
|
|
27
|
+
from data_engine.helpers.duckdb import normalize_columns
|
|
28
|
+
from data_engine.helpers.duckdb import read_rows_by_values
|
|
29
|
+
from data_engine.helpers.duckdb import read_sql
|
|
30
|
+
from data_engine.helpers.duckdb import read_table
|
|
31
|
+
from data_engine.helpers.duckdb import replace_rows_by_file
|
|
32
|
+
from data_engine.helpers.duckdb import replace_rows_by_values
|
|
33
|
+
from data_engine.helpers.duckdb import replace_table
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
The expected path pattern is:
|
|
37
|
+
|
|
38
|
+
```python
|
|
39
|
+
db_path = context.database("claims/analytics.duckdb")
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
You can also use a mirrored output path or any other DuckDB file path you control. The helpers do not require `context`; they only require a path.
|
|
43
|
+
|
|
44
|
+
## Shared conventions
|
|
45
|
+
|
|
46
|
+
The current helper family uses a shared shape:
|
|
47
|
+
|
|
48
|
+
```python
|
|
49
|
+
helper_name(
|
|
50
|
+
db_path,
|
|
51
|
+
table,
|
|
52
|
+
*,
|
|
53
|
+
...
|
|
54
|
+
)
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
Notes:
|
|
58
|
+
|
|
59
|
+
- `db_path` is positional and required
|
|
60
|
+
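- `table` is positional and required for every helper except `read_sql(...)`, which takes no `table` and receives the full query through the `sql` keyword instead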
|
|
61
|
+
- `df` is the incoming Polars dataframe when the helper works on one
|
|
62
|
+
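- `return_df=True` means "return the dataframe result for this helper"; `normalize_columns(...)` uses the `returns` argument (`"df"`, `"map"`, or `None`) for the same purpose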
|
|
63
|
+
- identifiers such as table names and column names are quoted safely, including reserved words such as `group`
|
|
64
|
+
- schema-qualified tables such as `"mart.fact_claim"` are supported
|
|
65
|
+
|
|
66
|
+
## `build_dimension(...)`
|
|
67
|
+
|
|
68
|
+
Use this helper when you already have a dataframe trimmed down to only the natural-key columns and want to persist or extend a surrogate-key table.
|
|
69
|
+
|
|
70
|
+
Signature:
|
|
71
|
+
|
|
72
|
+
```python
|
|
73
|
+
build_dimension(
|
|
74
|
+
db_path,
|
|
75
|
+
table,
|
|
76
|
+
*,
|
|
77
|
+
df,
|
|
78
|
+
key_column="dimension_key",
|
|
79
|
+
return_df=True,
|
|
80
|
+
)
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
Behavior:
|
|
84
|
+
|
|
85
|
+
- treats every column in `df` as part of the natural key
|
|
86
|
+
- creates the table if it does not exist
|
|
87
|
+
- inserts only missing unique combinations
|
|
88
|
+
- assigns deterministic integer surrogate keys
|
|
89
|
+
- returns the natural-key-to-surrogate-key mapping when `return_df=True`
|
|
90
|
+
|
|
91
|
+
Example:
|
|
92
|
+
|
|
93
|
+
```python
|
|
94
|
+
mapping = build_dimension(
|
|
95
|
+
context.database("warehouse.duckdb"),
|
|
96
|
+
"mart.dim_member",
|
|
97
|
+
df=member_keys_df,
|
|
98
|
+
key_column="member_key",
|
|
99
|
+
)
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
Returned mapping:
|
|
103
|
+
|
|
104
|
+
```text
|
|
105
|
+
member_id | lob | member_key
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
## `attach_dimension(...)`
|
|
109
|
+
|
|
110
|
+
Use this helper when the surrogate-key table already exists and you only want to join the key back onto a dataframe.
|
|
111
|
+
|
|
112
|
+
Signature:
|
|
113
|
+
|
|
114
|
+
```python
|
|
115
|
+
attach_dimension(
|
|
116
|
+
db_path,
|
|
117
|
+
table,
|
|
118
|
+
*,
|
|
119
|
+
df,
|
|
120
|
+
on,
|
|
121
|
+
key_column="dimension_key",
|
|
122
|
+
drop_key=False,
|
|
123
|
+
)
|
|
124
|
+
```
|
|
125
|
+
|
|
126
|
+
Key arguments:
|
|
127
|
+
|
|
128
|
+
- `on` can be one column name or a list of column names
|
|
129
|
+
- `drop_key=False` keeps the natural-key columns by default
|
|
130
|
+
- set `drop_key=True` when you want the attached surrogate key without the original key columns
|
|
131
|
+
|
|
132
|
+
Example:
|
|
133
|
+
|
|
134
|
+
```python
|
|
135
|
+
attached = attach_dimension(
|
|
136
|
+
context.database("warehouse.duckdb"),
|
|
137
|
+
"mart.dim_member",
|
|
138
|
+
df=claims_df,
|
|
139
|
+
on=["member_id", "lob"],
|
|
140
|
+
key_column="member_key",
|
|
141
|
+
)
|
|
142
|
+
```
|
|
143
|
+
|
|
144
|
+
## `normalize_columns(...)`
|
|
145
|
+
|
|
146
|
+
Use this helper when you want to build missing surrogate keys and immediately attach them back onto the full dataframe.
|
|
147
|
+
|
|
148
|
+
Signature:
|
|
149
|
+
|
|
150
|
+
```python
|
|
151
|
+
normalize_columns(
|
|
152
|
+
db_path,
|
|
153
|
+
table,
|
|
154
|
+
*,
|
|
155
|
+
df,
|
|
156
|
+
on,
|
|
157
|
+
key_column="dimension_key",
|
|
158
|
+
drop_key=True,
|
|
159
|
+
returns="df",
|
|
160
|
+
)
|
|
161
|
+
```
|
|
162
|
+
|
|
163
|
+
Key arguments:
|
|
164
|
+
|
|
165
|
+
- `on` can be one column name or a list of column names
|
|
166
|
+
- `drop_key=True` removes the natural-key columns after the surrogate key is joined back
|
|
167
|
+
- `returns="df"` returns the normalized dataframe
|
|
168
|
+
- `returns="map"` returns only the persisted mapping
|
|
169
|
+
- `returns=None` performs side effects only
|
|
170
|
+
|
|
171
|
+
Example:
|
|
172
|
+
|
|
173
|
+
```python
|
|
174
|
+
normalized = normalize_columns(
|
|
175
|
+
context.database("warehouse.duckdb"),
|
|
176
|
+
"mart.dim_member",
|
|
177
|
+
df=claims_df,
|
|
178
|
+
on=["member_id", "lob"],
|
|
179
|
+
key_column="member_key",
|
|
180
|
+
)
|
|
181
|
+
```
|
|
182
|
+
|
|
183
|
+
If `claims_df` starts with:
|
|
184
|
+
|
|
185
|
+
```text
|
|
186
|
+
member_id | lob | amount
|
|
187
|
+
```
|
|
188
|
+
|
|
189
|
+
Then `normalized` becomes:
|
|
190
|
+
|
|
191
|
+
```text
|
|
192
|
+
amount | member_key
|
|
193
|
+
```
|
|
194
|
+
|
|
195
|
+
This helper uses `build_dimension(...)` and `attach_dimension(...)` internally.
|
|
196
|
+
|
|
197
|
+
## `denormalize_columns(...)`
|
|
198
|
+
|
|
199
|
+
Use this helper when your dataframe already has a surrogate key and you want to attach the natural columns back from the persisted dimension table.
|
|
200
|
+
|
|
201
|
+
Signature:
|
|
202
|
+
|
|
203
|
+
```python
|
|
204
|
+
denormalize_columns(
|
|
205
|
+
db_path,
|
|
206
|
+
table,
|
|
207
|
+
*,
|
|
208
|
+
df,
|
|
209
|
+
key_column="dimension_key",
|
|
210
|
+
select="*",
|
|
211
|
+
drop_key=False,
|
|
212
|
+
)
|
|
213
|
+
```
|
|
214
|
+
|
|
215
|
+
Key arguments:
|
|
216
|
+
|
|
217
|
+
- `key_column` is the surrogate key used to join from `df` into the dimension table
|
|
218
|
+
- `select="*"` attaches every non-key column from the dimension table
|
|
219
|
+
- `select=[...]` lets you attach only a subset of natural columns
|
|
220
|
+
- `drop_key=False` keeps the surrogate key by default
|
|
221
|
+
|
|
222
|
+
Example:
|
|
223
|
+
|
|
224
|
+
```python
|
|
225
|
+
denormalized = denormalize_columns(
|
|
226
|
+
context.database("warehouse.duckdb"),
|
|
227
|
+
"mart.dim_member",
|
|
228
|
+
df=fact_df,
|
|
229
|
+
key_column="member_key",
|
|
230
|
+
)
|
|
231
|
+
```
|
|
232
|
+
|
|
233
|
+
## `replace_rows_by_file(...)`
|
|
234
|
+
|
|
235
|
+
Use this helper when one incoming dataframe represents the full current contents for one source file.
|
|
236
|
+
|
|
237
|
+
Signature:
|
|
238
|
+
|
|
239
|
+
```python
|
|
240
|
+
replace_rows_by_file(
|
|
241
|
+
db_path,
|
|
242
|
+
table,
|
|
243
|
+
*,
|
|
244
|
+
df,
|
|
245
|
+
file_hash,
|
|
246
|
+
file_hash_column="file_key",
|
|
247
|
+
return_df=True,
|
|
248
|
+
)
|
|
249
|
+
```
|
|
250
|
+
|
|
251
|
+
Behavior:
|
|
252
|
+
|
|
253
|
+
- adds a constant file-hash column to `df`
|
|
254
|
+
- creates the table if it does not exist
|
|
255
|
+
- expands the table schema when new columns appear
|
|
256
|
+
- deletes existing rows for that file hash
|
|
257
|
+
- appends the current batch
|
|
258
|
+
|
|
259
|
+
Example:
|
|
260
|
+
|
|
261
|
+
```python
|
|
262
|
+
updated = replace_rows_by_file(
|
|
263
|
+
context.database("warehouse.duckdb"),
|
|
264
|
+
"canon.claim_rows",
|
|
265
|
+
df=claims_df,
|
|
266
|
+
file_hash=context.metadata["file_hash"],
|
|
267
|
+
)
|
|
268
|
+
```
|
|
269
|
+
|
|
270
|
+
This is the usual pattern for canon-style "replace one file slice" loading.
|
|
271
|
+
|
|
272
|
+
## `replace_rows_by_values(...)`
|
|
273
|
+
|
|
274
|
+
Use this helper when one incoming dataframe represents the full current contents for one logical value slice instead of one file.
|
|
275
|
+
|
|
276
|
+
Signature:
|
|
277
|
+
|
|
278
|
+
```python
|
|
279
|
+
replace_rows_by_values(
|
|
280
|
+
db_path,
|
|
281
|
+
table,
|
|
282
|
+
*,
|
|
283
|
+
df,
|
|
284
|
+
column,
|
|
285
|
+
return_df=True,
|
|
286
|
+
)
|
|
287
|
+
```
|
|
288
|
+
|
|
289
|
+
Behavior:
|
|
290
|
+
|
|
291
|
+
- takes the distinct values from `df[column]`
|
|
292
|
+
- deletes existing rows in the target table where `column` matches any of those values
|
|
293
|
+
- appends the current batch
|
|
294
|
+
- creates and expands the table as needed
|
|
295
|
+
|
|
296
|
+
Example:
|
|
297
|
+
|
|
298
|
+
```python
|
|
299
|
+
updated = replace_rows_by_values(
|
|
300
|
+
context.database("warehouse.duckdb"),
|
|
301
|
+
"mart.fact_claim",
|
|
302
|
+
df=claims_for_open_status,
|
|
303
|
+
column="status",
|
|
304
|
+
)
|
|
305
|
+
```
|
|
306
|
+
|
|
307
|
+
In other words: delete every persisted row whose `status` value appears in this batch, then insert this batch.
|
|
308
|
+
|
|
309
|
+
## `read_rows_by_values(...)`
|
|
310
|
+
|
|
311
|
+
Use this helper when you want a small filtered lookup out of DuckDB as a Polars dataframe.
|
|
312
|
+
|
|
313
|
+
Signature:
|
|
314
|
+
|
|
315
|
+
```python
|
|
316
|
+
read_rows_by_values(
|
|
317
|
+
db_path,
|
|
318
|
+
table,
|
|
319
|
+
*,
|
|
320
|
+
column,
|
|
321
|
+
is_in,
|
|
322
|
+
select,
|
|
323
|
+
)
|
|
324
|
+
```
|
|
325
|
+
|
|
326
|
+
Behavior:
|
|
327
|
+
|
|
328
|
+
- returns rows where `column` matches one of the provided values
|
|
329
|
+
- returns only the selected columns
|
|
330
|
+
- uses a temporary lookup table internally, which works better than manually assembling long SQL `IN (...)` strings
|
|
331
|
+
|
|
332
|
+
Example:
|
|
333
|
+
|
|
334
|
+
```python
|
|
335
|
+
existing = read_rows_by_values(
|
|
336
|
+
context.database("warehouse.duckdb"),
|
|
337
|
+
"mart.fact_claim",
|
|
338
|
+
column="claim_id",
|
|
339
|
+
is_in=[1001, 1002, 1003],
|
|
340
|
+
select=["claim_id", "member_key", "amount"],
|
|
341
|
+
)
|
|
342
|
+
```
|
|
343
|
+
|
|
344
|
+
## `read_sql(...)`
|
|
345
|
+
|
|
346
|
+
Use this helper when you already have the exact DuckDB query you want and just need the result as a Polars dataframe.
|
|
347
|
+
|
|
348
|
+
Signature:
|
|
349
|
+
|
|
350
|
+
```python
|
|
351
|
+
read_sql(
|
|
352
|
+
db_path,
|
|
353
|
+
*,
|
|
354
|
+
sql,
|
|
355
|
+
)
|
|
356
|
+
```
|
|
357
|
+
|
|
358
|
+
Example:
|
|
359
|
+
|
|
360
|
+
```python
|
|
361
|
+
result = read_sql(
|
|
362
|
+
context.database("warehouse.duckdb"),
|
|
363
|
+
sql="""
|
|
364
|
+
SELECT claim_id, amount
|
|
365
|
+
FROM mart.fact_claim
|
|
366
|
+
WHERE amount >= 100
|
|
367
|
+
""",
|
|
368
|
+
)
|
|
369
|
+
```
|
|
370
|
+
|
|
371
|
+
This is the most direct read helper. If you already know the SQL you want, use this.
|
|
372
|
+
|
|
373
|
+
## `read_table(...)`
|
|
374
|
+
|
|
375
|
+
Use this helper when you want a lightweight table reader without writing the whole SQL statement.
|
|
376
|
+
|
|
377
|
+
Signature:
|
|
378
|
+
|
|
379
|
+
```python
|
|
380
|
+
read_table(
|
|
381
|
+
db_path,
|
|
382
|
+
table,
|
|
383
|
+
*,
|
|
384
|
+
select="*",
|
|
385
|
+
where=None,
|
|
386
|
+
limit=None,
|
|
387
|
+
)
|
|
388
|
+
```
|
|
389
|
+
|
|
390
|
+
Example:
|
|
391
|
+
|
|
392
|
+
```python
|
|
393
|
+
result = read_table(
|
|
394
|
+
context.database("warehouse.duckdb"),
|
|
395
|
+
"mart.fact_claim",
|
|
396
|
+
select=["claim_id", "amount"],
|
|
397
|
+
where='"amount" >= 100',
|
|
398
|
+
limit=100,
|
|
399
|
+
)
|
|
400
|
+
```
|
|
401
|
+
|
|
402
|
+
This helper is intentionally small:
|
|
403
|
+
|
|
404
|
+
- `select` can be `"*"` or a list of column names
|
|
405
|
+
- `where` is a SQL predicate passed through as written, without the `WHERE` keyword (for example `'"amount" >= 100'`)
|
|
406
|
+
- `limit` is optional
|
|
407
|
+
|
|
408
|
+
## `replace_table(...)`
|
|
409
|
+
|
|
410
|
+
Use this helper when you want to replace the entire contents of one table with the current dataframe.
|
|
411
|
+
|
|
412
|
+
Signature:
|
|
413
|
+
|
|
414
|
+
```python
|
|
415
|
+
replace_table(
|
|
416
|
+
db_path,
|
|
417
|
+
table,
|
|
418
|
+
*,
|
|
419
|
+
df,
|
|
420
|
+
return_df=True,
|
|
421
|
+
)
|
|
422
|
+
```
|
|
423
|
+
|
|
424
|
+
Behavior:
|
|
425
|
+
|
|
426
|
+
- creates the table if it does not exist
|
|
427
|
+
- expands the table schema when new columns appear
|
|
428
|
+
- deletes all existing rows
|
|
429
|
+
- inserts the current dataframe
|
|
430
|
+
|
|
431
|
+
Example:
|
|
432
|
+
|
|
433
|
+
```python
|
|
434
|
+
replace_table(
|
|
435
|
+
context.database("warehouse.duckdb"),
|
|
436
|
+
"mart.current_snapshot",
|
|
437
|
+
df=snapshot_df,
|
|
438
|
+
)
|
|
439
|
+
```
|
|
440
|
+
|
|
441
|
+
This is the simplest full-refresh write helper in the current set.
|
|
442
|
+
|
|
443
|
+
## Design guidance
|
|
444
|
+
|
|
445
|
+
These helpers are best when:
|
|
446
|
+
|
|
447
|
+
- the database path is stable
|
|
448
|
+
- table ownership is clear
|
|
449
|
+
- the dataframe shape is already mostly what you want
|
|
450
|
+
- you want predictable transactional behavior
|
|
451
|
+
|
|
452
|
+
These helpers are not trying to replace normal SQL authoring. If a step needs custom joins, custom window logic, or highly specific query behavior, using plain DuckDB directly is still the right choice.
|
|
453
|
+
|
|
454
|
+
## When to use direct DuckDB instead
|
|
455
|
+
|
|
456
|
+
Prefer direct DuckDB code when:
|
|
457
|
+
|
|
458
|
+
- the operation is highly custom
|
|
459
|
+
- you need several SQL statements that do not fit one helper
|
|
460
|
+
- you want full manual control over relation registration, temp tables, or query flow
|
|
461
|
+
|
|
462
|
+
The helpers are there to remove repeated boilerplate, not to become a second query language.
|