py-data-engine 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data_engine/__init__.py +37 -0
- data_engine/application/__init__.py +39 -0
- data_engine/application/actions.py +42 -0
- data_engine/application/catalog.py +151 -0
- data_engine/application/control.py +213 -0
- data_engine/application/details.py +73 -0
- data_engine/application/runtime.py +449 -0
- data_engine/application/workspace.py +62 -0
- data_engine/authoring/__init__.py +14 -0
- data_engine/authoring/builder.py +31 -0
- data_engine/authoring/execution/__init__.py +6 -0
- data_engine/authoring/execution/app.py +6 -0
- data_engine/authoring/execution/context.py +82 -0
- data_engine/authoring/execution/continuous.py +176 -0
- data_engine/authoring/execution/grouped.py +106 -0
- data_engine/authoring/execution/logging.py +83 -0
- data_engine/authoring/execution/polling.py +135 -0
- data_engine/authoring/execution/runner.py +210 -0
- data_engine/authoring/execution/single.py +171 -0
- data_engine/authoring/flow.py +361 -0
- data_engine/authoring/helpers.py +160 -0
- data_engine/authoring/model.py +59 -0
- data_engine/authoring/primitives.py +430 -0
- data_engine/authoring/services.py +42 -0
- data_engine/devtools/__init__.py +3 -0
- data_engine/devtools/project_ast_map.py +503 -0
- data_engine/docs/__init__.py +1 -0
- data_engine/docs/sphinx_source/_static/custom.css +13 -0
- data_engine/docs/sphinx_source/api.rst +42 -0
- data_engine/docs/sphinx_source/conf.py +37 -0
- data_engine/docs/sphinx_source/guides/app-runtime-and-workspaces.md +397 -0
- data_engine/docs/sphinx_source/guides/authoring-flow-modules.md +215 -0
- data_engine/docs/sphinx_source/guides/configuring-flows.md +185 -0
- data_engine/docs/sphinx_source/guides/core-concepts.md +208 -0
- data_engine/docs/sphinx_source/guides/database-methods.md +107 -0
- data_engine/docs/sphinx_source/guides/duckdb-helpers.md +462 -0
- data_engine/docs/sphinx_source/guides/flow-context.md +538 -0
- data_engine/docs/sphinx_source/guides/flow-methods.md +206 -0
- data_engine/docs/sphinx_source/guides/getting-started.md +271 -0
- data_engine/docs/sphinx_source/guides/project-inventory.md +5683 -0
- data_engine/docs/sphinx_source/guides/project-map.md +118 -0
- data_engine/docs/sphinx_source/guides/recipes.md +268 -0
- data_engine/docs/sphinx_source/index.rst +22 -0
- data_engine/domain/__init__.py +92 -0
- data_engine/domain/actions.py +69 -0
- data_engine/domain/catalog.py +128 -0
- data_engine/domain/details.py +214 -0
- data_engine/domain/diagnostics.py +56 -0
- data_engine/domain/errors.py +104 -0
- data_engine/domain/inspection.py +99 -0
- data_engine/domain/logs.py +118 -0
- data_engine/domain/operations.py +172 -0
- data_engine/domain/operator.py +72 -0
- data_engine/domain/runs.py +155 -0
- data_engine/domain/runtime.py +279 -0
- data_engine/domain/source_state.py +17 -0
- data_engine/domain/support.py +54 -0
- data_engine/domain/time.py +23 -0
- data_engine/domain/workspace.py +159 -0
- data_engine/flow_modules/__init__.py +1 -0
- data_engine/flow_modules/flow_module_compiler.py +179 -0
- data_engine/flow_modules/flow_module_loader.py +201 -0
- data_engine/helpers/__init__.py +25 -0
- data_engine/helpers/duckdb.py +705 -0
- data_engine/hosts/__init__.py +1 -0
- data_engine/hosts/daemon/__init__.py +23 -0
- data_engine/hosts/daemon/app.py +221 -0
- data_engine/hosts/daemon/bootstrap.py +69 -0
- data_engine/hosts/daemon/client.py +465 -0
- data_engine/hosts/daemon/commands.py +64 -0
- data_engine/hosts/daemon/composition.py +310 -0
- data_engine/hosts/daemon/constants.py +15 -0
- data_engine/hosts/daemon/entrypoints.py +97 -0
- data_engine/hosts/daemon/lifecycle.py +191 -0
- data_engine/hosts/daemon/manager.py +272 -0
- data_engine/hosts/daemon/ownership.py +126 -0
- data_engine/hosts/daemon/runtime_commands.py +188 -0
- data_engine/hosts/daemon/runtime_control.py +31 -0
- data_engine/hosts/daemon/server.py +84 -0
- data_engine/hosts/daemon/shared_state.py +147 -0
- data_engine/hosts/daemon/state_sync.py +101 -0
- data_engine/platform/__init__.py +1 -0
- data_engine/platform/identity.py +35 -0
- data_engine/platform/local_settings.py +146 -0
- data_engine/platform/theme.py +259 -0
- data_engine/platform/workspace_models.py +190 -0
- data_engine/platform/workspace_policy.py +333 -0
- data_engine/runtime/__init__.py +1 -0
- data_engine/runtime/file_watch.py +185 -0
- data_engine/runtime/ledger_models.py +116 -0
- data_engine/runtime/runtime_db.py +938 -0
- data_engine/runtime/shared_state.py +523 -0
- data_engine/services/__init__.py +49 -0
- data_engine/services/daemon.py +64 -0
- data_engine/services/daemon_state.py +40 -0
- data_engine/services/flow_catalog.py +102 -0
- data_engine/services/flow_execution.py +48 -0
- data_engine/services/ledger.py +85 -0
- data_engine/services/logs.py +65 -0
- data_engine/services/runtime_binding.py +105 -0
- data_engine/services/runtime_execution.py +126 -0
- data_engine/services/runtime_history.py +62 -0
- data_engine/services/settings.py +58 -0
- data_engine/services/shared_state.py +28 -0
- data_engine/services/theme.py +59 -0
- data_engine/services/workspace_provisioning.py +224 -0
- data_engine/services/workspaces.py +74 -0
- data_engine/ui/__init__.py +3 -0
- data_engine/ui/cli/__init__.py +19 -0
- data_engine/ui/cli/app.py +161 -0
- data_engine/ui/cli/commands_doctor.py +178 -0
- data_engine/ui/cli/commands_run.py +80 -0
- data_engine/ui/cli/commands_start.py +100 -0
- data_engine/ui/cli/commands_workspace.py +97 -0
- data_engine/ui/cli/dependencies.py +44 -0
- data_engine/ui/cli/parser.py +56 -0
- data_engine/ui/gui/__init__.py +25 -0
- data_engine/ui/gui/app.py +116 -0
- data_engine/ui/gui/bootstrap.py +487 -0
- data_engine/ui/gui/bootstrapper.py +140 -0
- data_engine/ui/gui/cache_models.py +23 -0
- data_engine/ui/gui/control_support.py +185 -0
- data_engine/ui/gui/controllers/__init__.py +6 -0
- data_engine/ui/gui/controllers/flows.py +439 -0
- data_engine/ui/gui/controllers/runtime.py +245 -0
- data_engine/ui/gui/dialogs/__init__.py +12 -0
- data_engine/ui/gui/dialogs/messages.py +88 -0
- data_engine/ui/gui/dialogs/previews.py +222 -0
- data_engine/ui/gui/helpers/__init__.py +62 -0
- data_engine/ui/gui/helpers/inspection.py +81 -0
- data_engine/ui/gui/helpers/lifecycle.py +112 -0
- data_engine/ui/gui/helpers/scroll.py +28 -0
- data_engine/ui/gui/helpers/theming.py +87 -0
- data_engine/ui/gui/icons/dark_light.svg +12 -0
- data_engine/ui/gui/icons/documentation.svg +1 -0
- data_engine/ui/gui/icons/failed.svg +3 -0
- data_engine/ui/gui/icons/group.svg +4 -0
- data_engine/ui/gui/icons/home.svg +2 -0
- data_engine/ui/gui/icons/manual.svg +2 -0
- data_engine/ui/gui/icons/poll.svg +2 -0
- data_engine/ui/gui/icons/schedule.svg +4 -0
- data_engine/ui/gui/icons/settings.svg +2 -0
- data_engine/ui/gui/icons/started.svg +3 -0
- data_engine/ui/gui/icons/success.svg +3 -0
- data_engine/ui/gui/icons/view-log.svg +3 -0
- data_engine/ui/gui/icons.py +50 -0
- data_engine/ui/gui/launcher.py +48 -0
- data_engine/ui/gui/presenters/__init__.py +72 -0
- data_engine/ui/gui/presenters/docs.py +140 -0
- data_engine/ui/gui/presenters/logs.py +58 -0
- data_engine/ui/gui/presenters/runtime_projection.py +29 -0
- data_engine/ui/gui/presenters/sidebar.py +88 -0
- data_engine/ui/gui/presenters/steps.py +148 -0
- data_engine/ui/gui/presenters/workspace.py +39 -0
- data_engine/ui/gui/presenters/workspace_binding.py +75 -0
- data_engine/ui/gui/presenters/workspace_settings.py +182 -0
- data_engine/ui/gui/preview_models.py +37 -0
- data_engine/ui/gui/render_support.py +241 -0
- data_engine/ui/gui/rendering/__init__.py +12 -0
- data_engine/ui/gui/rendering/artifacts.py +95 -0
- data_engine/ui/gui/rendering/icons.py +50 -0
- data_engine/ui/gui/runtime.py +47 -0
- data_engine/ui/gui/state_support.py +193 -0
- data_engine/ui/gui/support.py +214 -0
- data_engine/ui/gui/surface.py +209 -0
- data_engine/ui/gui/theme.py +720 -0
- data_engine/ui/gui/widgets/__init__.py +34 -0
- data_engine/ui/gui/widgets/config.py +41 -0
- data_engine/ui/gui/widgets/logs.py +62 -0
- data_engine/ui/gui/widgets/panels.py +507 -0
- data_engine/ui/gui/widgets/sidebar.py +130 -0
- data_engine/ui/gui/widgets/steps.py +84 -0
- data_engine/ui/tui/__init__.py +5 -0
- data_engine/ui/tui/app.py +222 -0
- data_engine/ui/tui/bootstrap.py +475 -0
- data_engine/ui/tui/bootstrapper.py +117 -0
- data_engine/ui/tui/controllers/__init__.py +6 -0
- data_engine/ui/tui/controllers/flows.py +349 -0
- data_engine/ui/tui/controllers/runtime.py +167 -0
- data_engine/ui/tui/runtime.py +34 -0
- data_engine/ui/tui/state_support.py +141 -0
- data_engine/ui/tui/support.py +63 -0
- data_engine/ui/tui/theme.py +204 -0
- data_engine/ui/tui/widgets.py +123 -0
- data_engine/views/__init__.py +109 -0
- data_engine/views/actions.py +80 -0
- data_engine/views/artifacts.py +58 -0
- data_engine/views/flow_display.py +69 -0
- data_engine/views/logs.py +54 -0
- data_engine/views/models.py +96 -0
- data_engine/views/presentation.py +133 -0
- data_engine/views/runs.py +62 -0
- data_engine/views/state.py +39 -0
- data_engine/views/status.py +13 -0
- data_engine/views/text.py +109 -0
- py_data_engine-0.1.0.dist-info/METADATA +330 -0
- py_data_engine-0.1.0.dist-info/RECORD +200 -0
- py_data_engine-0.1.0.dist-info/WHEEL +5 -0
- py_data_engine-0.1.0.dist-info/entry_points.txt +2 -0
- py_data_engine-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,538 @@
|
|
|
1
|
+
# FlowContext
|
|
2
|
+
|
|
3
|
+
`FlowContext` is the runtime object passed to every step.
|
|
4
|
+
|
|
5
|
+
It is the main place where the runtime meets your step code.
|
|
6
|
+
|
|
7
|
+
If you are authoring flows day to day, this is the surface you will use most often.
|
|
8
|
+
|
|
9
|
+
## What `FlowContext` contains
|
|
10
|
+
|
|
11
|
+
Common fields and helpers you will read directly:
|
|
12
|
+
|
|
13
|
+
- `flow_name`
|
|
14
|
+
- `group`
|
|
15
|
+
- `source`
|
|
16
|
+
- `mirror`
|
|
17
|
+
- `config`
|
|
18
|
+
- `database(...)`
|
|
19
|
+
- `current`
|
|
20
|
+
- `objects`
|
|
21
|
+
- `metadata`
|
|
22
|
+
- `source_metadata()`
|
|
23
|
+
|
|
24
|
+
Example:
|
|
25
|
+
|
|
26
|
+
```python
|
|
27
|
+
def inspect_context(context):
|
|
28
|
+
print(context.flow_name)
|
|
29
|
+
print(context.group)
|
|
30
|
+
print(context.current)
|
|
31
|
+
if context.source is not None:
|
|
32
|
+
print(context.source.path)
|
|
33
|
+
return context.current
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
## The three most important ideas
|
|
37
|
+
|
|
38
|
+
When in doubt, remember these three ideas:
|
|
39
|
+
|
|
40
|
+
1. `current` is the moving value in the pipeline.
|
|
41
|
+
2. `objects` is the named stash of saved intermediates.
|
|
42
|
+
3. `source` and `mirror` are path namespaces, not open files or connections.
|
|
43
|
+
|
|
44
|
+
Everything else in `FlowContext` builds on those ideas.
|
|
45
|
+
|
|
46
|
+
## `flow_name` and `group`
|
|
47
|
+
|
|
48
|
+
These are the flow identity fields available at runtime.
|
|
49
|
+
|
|
50
|
+
- `flow_name` comes from the flow-module filename
|
|
51
|
+
- `group` comes from `Flow(group=...)`
|
|
52
|
+
|
|
53
|
+
They are useful when you want to:
|
|
54
|
+
|
|
55
|
+
- stamp metadata
|
|
56
|
+
- label outputs
|
|
57
|
+
- branch behavior lightly by flow identity
|
|
58
|
+
- emit operator-facing details into `context.metadata`
|
|
59
|
+
|
|
60
|
+
## `current`
|
|
61
|
+
|
|
62
|
+
`context.current` is the moving runtime slot.
|
|
63
|
+
|
|
64
|
+
- before the first manual or scheduled step, it is `None`
|
|
65
|
+
- after each step, it becomes that step's return value
|
|
66
|
+
- if `use=` is set, the runtime loads the named object into `current` before running the step
|
|
67
|
+
|
|
68
|
+
This is why most steps are so small:
|
|
69
|
+
|
|
70
|
+
```python
|
|
71
|
+
def clean_claims(context):
|
|
72
|
+
return context.current.filter(...)
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
The step does not need to fetch some external hidden pipeline object. The runtime always hands it the current value.
|
|
76
|
+
|
|
77
|
+
## `objects`
|
|
78
|
+
|
|
79
|
+
Saved objects live in `context.objects`.
|
|
80
|
+
|
|
81
|
+
That is what `save_as=` and `use=` operate on.
|
|
82
|
+
|
|
83
|
+
Example:
|
|
84
|
+
|
|
85
|
+
```python
|
|
86
|
+
(
|
|
87
|
+
Flow(group="Claims")
|
|
88
|
+
.step(read_claims, save_as="raw_df")
|
|
89
|
+
.step(clean_claims, use="raw_df", save_as="clean_df")
|
|
90
|
+
.step(write_output, use="clean_df")
|
|
91
|
+
)
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
Inside a step you can also read those values directly:
|
|
95
|
+
|
|
96
|
+
```python
|
|
97
|
+
def compare_versions(context):
|
|
98
|
+
raw_df = context.objects["raw_df"]
|
|
99
|
+
clean_df = context.objects["clean_df"]
|
|
100
|
+
...
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
This is especially useful when a later step needs more than one previously saved object.
|
|
104
|
+
|
|
105
|
+
## `metadata`
|
|
106
|
+
|
|
107
|
+
`context.metadata` is a free-form runtime metadata dictionary.
|
|
108
|
+
|
|
109
|
+
Use it when a step wants to publish details about what happened during execution.
|
|
110
|
+
|
|
111
|
+
The runtime also seeds a few values automatically:
|
|
112
|
+
|
|
113
|
+
- `started_at_utc`
|
|
114
|
+
- `run_id`
|
|
115
|
+
- `step_outputs`
|
|
116
|
+
- `file_hash` when the run is bound to a concrete source file
|
|
117
|
+
|
|
118
|
+
`file_hash` is a stable SHA-1 hash of the source-relative path when one exists. For single-file bindings, it falls back to the concrete source path text.
|
|
119
|
+
|
|
120
|
+
Examples of details a step might publish:
|
|
121
|
+
|
|
122
|
+
- row counts
|
|
123
|
+
- source metadata
|
|
124
|
+
- selected config values
|
|
125
|
+
- warning flags
|
|
126
|
+
- lightweight operator diagnostics
|
|
127
|
+
|
|
128
|
+
Example:
|
|
129
|
+
|
|
130
|
+
```python
|
|
131
|
+
def capture_stats(context):
|
|
132
|
+
context.metadata["row_count"] = len(context.current)
|
|
133
|
+
context.metadata["flow_name"] = context.flow_name
|
|
134
|
+
return context.current
|
|
135
|
+
```
|
|
136
|
+
|
|
137
|
+
The runtime also records step output paths here when a step returns an existing `Path`.
|
|
138
|
+
|
|
139
|
+
That is what powers the UI `Inspect` button for a step: if a step writes a file and returns its existing path, the UI can enable inspection for that step.
|
|
140
|
+
|
|
141
|
+
## `config`
|
|
142
|
+
|
|
143
|
+
`context.config` provides lazy, read-only access to the `config/*.toml` files in the current authored workspace.
|
|
144
|
+
|
|
145
|
+
Available helpers are:
|
|
146
|
+
|
|
147
|
+
```python
|
|
148
|
+
context.config.get("claims")
|
|
149
|
+
context.config.require("claims")
|
|
150
|
+
context.config.names()
|
|
151
|
+
context.config.all()
|
|
152
|
+
```
|
|
153
|
+
|
|
154
|
+
### `get(name)`
|
|
155
|
+
|
|
156
|
+
Returns a parsed `dict` or `None`.
|
|
157
|
+
|
|
158
|
+
Use this when the config file is optional:
|
|
159
|
+
|
|
160
|
+
```python
|
|
161
|
+
def apply_runtime_config(context):
|
|
162
|
+
cfg = context.config.get("claims")
|
|
163
|
+
if cfg is None:
|
|
164
|
+
return context.current
|
|
165
|
+
batch_size = cfg.get("runtime", {}).get("batch_size", 5000)
|
|
166
|
+
context.metadata["batch_size"] = batch_size
|
|
167
|
+
return context.current
|
|
168
|
+
```
|
|
169
|
+
|
|
170
|
+
### `require(name)`
|
|
171
|
+
|
|
172
|
+
Returns the parsed `dict` or raises when the file is missing.
|
|
173
|
+
|
|
174
|
+
Use this when the config is part of the flow's contract:
|
|
175
|
+
|
|
176
|
+
```python
|
|
177
|
+
def load_required_settings(context):
|
|
178
|
+
cfg = context.config.require("database")
|
|
179
|
+
dsn = cfg["connection"]["dsn"]
|
|
180
|
+
context.metadata["dsn"] = dsn
|
|
181
|
+
return context.current
|
|
182
|
+
```
|
|
183
|
+
|
|
184
|
+
### `names()`
|
|
185
|
+
|
|
186
|
+
Returns available config stems such as:
|
|
187
|
+
|
|
188
|
+
```python
|
|
189
|
+
("claims", "runtime")
|
|
190
|
+
```
|
|
191
|
+
|
|
192
|
+
This is mostly useful for introspection or diagnostics.
|
|
193
|
+
|
|
194
|
+
### `all()`
|
|
195
|
+
|
|
196
|
+
Returns every parsed config mapping keyed by file stem.
|
|
197
|
+
|
|
198
|
+
Example:
|
|
199
|
+
|
|
200
|
+
```python
|
|
201
|
+
all_config = context.config.all()
|
|
202
|
+
```
|
|
203
|
+
|
|
204
|
+
### What `config` is good for
|
|
205
|
+
|
|
206
|
+
`context.config` is a good fit for:
|
|
207
|
+
|
|
208
|
+
- file names and folder names
|
|
209
|
+
- thresholds and batch sizes
|
|
210
|
+
- optional feature flags
|
|
211
|
+
- SQL parameters
|
|
212
|
+
- external table names
|
|
213
|
+
|
|
214
|
+
It is not a replacement for the `Flow(...)` chain. The orchestration shape still belongs in the fluent flow definition.
|
|
215
|
+
|
|
216
|
+
## `database(...)`
|
|
217
|
+
|
|
218
|
+
`context.database(...)` returns a write-ready path beneath `databases/` in the current authored workspace.
|
|
219
|
+
|
|
220
|
+
Example:
|
|
221
|
+
|
|
222
|
+
```python
|
|
223
|
+
db_path = context.database("claims/db.duckdb")
|
|
224
|
+
```
|
|
225
|
+
|
|
226
|
+
That resolves to:
|
|
227
|
+
|
|
228
|
+
- `workspaces/<workspace_id>/databases/claims/db.duckdb`
|
|
229
|
+
|
|
230
|
+
Rules:
|
|
231
|
+
|
|
232
|
+
- the path must be relative
|
|
233
|
+
- parent directories are created automatically
|
|
234
|
+
- the helper is only available for authored workspace flows
|
|
235
|
+
- it returns a `Path`, not a database connection
|
|
236
|
+
|
|
237
|
+
Typical usage:
|
|
238
|
+
|
|
239
|
+
```python
|
|
240
|
+
import duckdb
|
|
241
|
+
|
|
242
|
+
|
|
243
|
+
def write_summary(context):
|
|
244
|
+
db_path = context.database("claims/analytics.duckdb")
|
|
245
|
+
conn = duckdb.connect(db_path)
|
|
246
|
+
try:
|
|
247
|
+
...
|
|
248
|
+
finally:
|
|
249
|
+
conn.close()
|
|
250
|
+
```
|
|
251
|
+
|
|
252
|
+
This is intentionally simple. Data Engine gives you the path and leaves connection ownership to your code.
|
|
253
|
+
|
|
254
|
+
## `source_metadata()`
|
|
255
|
+
|
|
256
|
+
`context.source_metadata()` returns basic filesystem metadata for the current source file when one exists.
|
|
257
|
+
|
|
258
|
+
It gives you:
|
|
259
|
+
|
|
260
|
+
- path
|
|
261
|
+
- file name
|
|
262
|
+
- size in bytes
|
|
263
|
+
- modified time in UTC
|
|
264
|
+
|
|
265
|
+
Example:
|
|
266
|
+
|
|
267
|
+
```python
|
|
268
|
+
def capture_source_info(context):
|
|
269
|
+
metadata = context.source_metadata()
|
|
270
|
+
if metadata is not None:
|
|
271
|
+
context.metadata["source_name"] = metadata.name
|
|
272
|
+
context.metadata["source_size_bytes"] = metadata.size_bytes
|
|
273
|
+
return context.current
|
|
274
|
+
```
|
|
275
|
+
|
|
276
|
+
This is useful for audit trails, diagnostics, and output manifests.
|
|
277
|
+
|
|
278
|
+
## `source`
|
|
279
|
+
|
|
280
|
+
`context.source` is the input-side namespace for the active source.
|
|
281
|
+
|
|
282
|
+
It is usually present for poll flows and for scheduled flows that bind a source.
|
|
283
|
+
|
|
284
|
+
It may be `None` for manual flows or scheduled flows that build data entirely in memory.
|
|
285
|
+
|
|
286
|
+
Core helpers are:
|
|
287
|
+
|
|
288
|
+
```python
|
|
289
|
+
context.source.path
|
|
290
|
+
context.source.dir
|
|
291
|
+
context.source.folder
|
|
292
|
+
context.source.with_extension(".json")
|
|
293
|
+
context.source.with_suffix(".json")
|
|
294
|
+
context.source.file("notes.json")
|
|
295
|
+
context.source.namespaced_file("notes.json")
|
|
296
|
+
context.source.root_file("lookup.csv")
|
|
297
|
+
```
|
|
298
|
+
|
|
299
|
+
### `path`
|
|
300
|
+
|
|
301
|
+
The concrete active source file path.
|
|
302
|
+
|
|
303
|
+
This is the simplest and most direct read-side path:
|
|
304
|
+
|
|
305
|
+
```python
|
|
306
|
+
def read_claims(context):
|
|
307
|
+
return pl.read_excel(context.source.path)
|
|
308
|
+
```
|
|
309
|
+
|
|
310
|
+
### `dir`
|
|
311
|
+
|
|
312
|
+
The namespace directory for files derived from the active source.
|
|
313
|
+
|
|
314
|
+
### `folder`
|
|
315
|
+
|
|
316
|
+
The active source file's parent folder.
|
|
317
|
+
|
|
318
|
+
### `with_extension(...)` and `with_suffix(...)`
|
|
319
|
+
|
|
320
|
+
These give you the same source-relative file with a new extension.
|
|
321
|
+
|
|
322
|
+
```python
|
|
323
|
+
def find_json_sidecar(context):
|
|
324
|
+
return context.source.with_extension(".json")
|
|
325
|
+
```
|
|
326
|
+
|
|
327
|
+
### `file(...)`
|
|
328
|
+
|
|
329
|
+
Gives you a path in the active source file's parent folder.
|
|
330
|
+
|
|
331
|
+
```python
|
|
332
|
+
def find_notes(context):
|
|
333
|
+
return context.source.file("notes.json")
|
|
334
|
+
```
|
|
335
|
+
|
|
336
|
+
### `namespaced_file(...)`
|
|
337
|
+
|
|
338
|
+
Gives you a path under the active source file's namespace.
|
|
339
|
+
|
|
340
|
+
```python
|
|
341
|
+
def find_namespaced_notes(context):
|
|
342
|
+
return context.source.namespaced_file("notes.json")
|
|
343
|
+
```
|
|
344
|
+
|
|
345
|
+
### `root_file(...)`
|
|
346
|
+
|
|
347
|
+
Gives you a path directly under the source root.
|
|
348
|
+
|
|
349
|
+
```python
|
|
350
|
+
def load_lookup(context):
|
|
351
|
+
return context.source.root_file("lookup.csv")
|
|
352
|
+
```
|
|
353
|
+
|
|
354
|
+
### Common `source` patterns
|
|
355
|
+
|
|
356
|
+
Use `source` when you need:
|
|
357
|
+
|
|
358
|
+
- the active input file
|
|
359
|
+
- a sidecar file near that input
|
|
360
|
+
- a lookup file under the watched source root
|
|
361
|
+
- namespace-aware paths derived from the current source item
|
|
362
|
+
|
|
363
|
+
## `mirror`
|
|
364
|
+
|
|
365
|
+
`context.mirror` is the mirrored output namespace for the active source.
|
|
366
|
+
|
|
367
|
+
It is present when the flow uses `mirror(root=...)`.
|
|
368
|
+
|
|
369
|
+
Core helpers are:
|
|
370
|
+
|
|
371
|
+
```python
|
|
372
|
+
context.mirror.root
|
|
373
|
+
context.mirror.dir
|
|
374
|
+
context.mirror.folder
|
|
375
|
+
context.mirror.with_extension(".parquet")
|
|
376
|
+
context.mirror.with_suffix(".parquet")
|
|
377
|
+
context.mirror.file("open_claims.parquet")
|
|
378
|
+
context.mirror.namespaced_file("open_claims.parquet")
|
|
379
|
+
context.mirror.root_file("analytics.duckdb")
|
|
380
|
+
```
|
|
381
|
+
|
|
382
|
+
### `with_extension(...)` and `with_suffix(...)`
|
|
383
|
+
|
|
384
|
+
These are for the common "mirror this source file into another format" case.
|
|
385
|
+
|
|
386
|
+
```python
|
|
387
|
+
def write_target(context):
|
|
388
|
+
output = context.mirror.with_extension(".parquet")
|
|
389
|
+
context.current.write_parquet(output)
|
|
390
|
+
return output
|
|
391
|
+
```
|
|
392
|
+
|
|
393
|
+
Returning that written `Path` is what makes the step inspectable in the UI.
|
|
394
|
+
|
|
395
|
+
### `file(...)`
|
|
396
|
+
|
|
397
|
+
Use this for a custom file name in the mirrored source folder:
|
|
398
|
+
|
|
399
|
+
```python
|
|
400
|
+
def write_summary(context):
|
|
401
|
+
summary_path = context.mirror.file("summary.json")
|
|
402
|
+
summary_path.write_text("{}", encoding="utf-8")
|
|
403
|
+
return summary_path
|
|
404
|
+
```
|
|
405
|
+
|
|
406
|
+
### `namespaced_file(...)`
|
|
407
|
+
|
|
408
|
+
Use this for multiple outputs derived from one source:
|
|
409
|
+
|
|
410
|
+
```python
|
|
411
|
+
def write_outputs(context):
|
|
412
|
+
open_path = context.mirror.namespaced_file("open_claims.parquet")
|
|
413
|
+
closed_path = context.mirror.namespaced_file("closed_claims.parquet")
|
|
414
|
+
...
|
|
415
|
+
```
|
|
416
|
+
|
|
417
|
+
### `root_file(...)`
|
|
418
|
+
|
|
419
|
+
Use this when you want one stable artifact under the mirror root rather than one file per source item.
|
|
420
|
+
|
|
421
|
+
```python
|
|
422
|
+
def write_snapshot(context):
|
|
423
|
+
snapshot = context.mirror.root_file("artifacts/latest.parquet")
|
|
424
|
+
context.current.write_parquet(snapshot)
|
|
425
|
+
return snapshot
|
|
426
|
+
```
|
|
427
|
+
|
|
428
|
+
### Common `mirror` patterns
|
|
429
|
+
|
|
430
|
+
Use `mirror` when you want to:
|
|
431
|
+
|
|
432
|
+
- preserve source-relative output structure
|
|
433
|
+
- create many derived outputs from one source
|
|
434
|
+
- write stable summary artifacts under one output root
|
|
435
|
+
- avoid hand-building output folder math
|
|
436
|
+
|
|
437
|
+
All helpers return write-ready paths, so callers do not need to create parent directories themselves.
|
|
438
|
+
|
|
439
|
+
## When `source` or `mirror` may be missing
|
|
440
|
+
|
|
441
|
+
Not every flow has every context surface available.
|
|
442
|
+
|
|
443
|
+
Examples:
|
|
444
|
+
|
|
445
|
+
- a manual flow may have no `source`
|
|
446
|
+
- a purely in-memory scheduled flow may have no `source`
|
|
447
|
+
- a flow with no `mirror(root=...)` has no `mirror`
|
|
448
|
+
|
|
449
|
+
So it is reasonable to write defensive code when the flow shape allows those cases:
|
|
450
|
+
|
|
451
|
+
```python
|
|
452
|
+
def maybe_capture_source(context):
|
|
453
|
+
if context.source is None:
|
|
454
|
+
return context.current
|
|
455
|
+
context.metadata["source_path"] = str(context.source.path)
|
|
456
|
+
return context.current
|
|
457
|
+
```
|
|
458
|
+
|
|
459
|
+
## Batch values
|
|
460
|
+
|
|
461
|
+
`Flow.collect(...)` returns a `Batch` of `FileRef` items instead of a raw list.
|
|
462
|
+
|
|
463
|
+
That means later steps can work with:
|
|
464
|
+
|
|
465
|
+
- `file_ref.name`
|
|
466
|
+
- `file_ref.path`
|
|
467
|
+
- `file_ref.stem`
|
|
468
|
+
- `file_ref.suffix`
|
|
469
|
+
- `file_ref.parent`
|
|
470
|
+
|
|
471
|
+
Example:
|
|
472
|
+
|
|
473
|
+
```python
|
|
474
|
+
def read_claims(file_ref):
|
|
475
|
+
return pl.read_excel(file_ref.path)
|
|
476
|
+
```
|
|
477
|
+
|
|
478
|
+
In a mapped step, the function receives one batch item (here a `FileRef`) rather than the full `context`. The item is often simpler than the full `context`, and that is by design.
|
|
479
|
+
|
|
480
|
+
## A practical context walkthrough
|
|
481
|
+
|
|
482
|
+
Here is a representative flow using several parts of the context together:
|
|
483
|
+
|
|
484
|
+
```python
|
|
485
|
+
import duckdb
|
|
486
|
+
import polars as pl
|
|
487
|
+
|
|
488
|
+
from data_engine import Flow
|
|
489
|
+
|
|
490
|
+
|
|
491
|
+
def read_claims(file_ref):
|
|
492
|
+
return pl.read_excel(file_ref.path)
|
|
493
|
+
|
|
494
|
+
|
|
495
|
+
def combine_claims(context):
|
|
496
|
+
cfg = context.config.get("claims") or {}
|
|
497
|
+
batch_size = cfg.get("runtime", {}).get("batch_size", 5000)
|
|
498
|
+
context.metadata["batch_size"] = batch_size
|
|
499
|
+
return pl.concat(context.current, how="vertical_relaxed")
|
|
500
|
+
|
|
501
|
+
|
|
502
|
+
def summarize(context):
|
|
503
|
+
db_path = context.database("claims/analytics.duckdb")
|
|
504
|
+
conn = duckdb.connect(db_path)
|
|
505
|
+
try:
|
|
506
|
+
conn.register("input", context.current)
|
|
507
|
+
summary = conn.sql("select count(*) as row_count from input").pl()
|
|
508
|
+
finally:
|
|
509
|
+
conn.close()
|
|
510
|
+
output = context.mirror.file("summary.parquet")
|
|
511
|
+
summary.write_parquet(output)
|
|
512
|
+
context.metadata["summary_path"] = str(output)
|
|
513
|
+
return output
|
|
514
|
+
|
|
515
|
+
|
|
516
|
+
def build():
|
|
517
|
+
return (
|
|
518
|
+
Flow(group="Claims")
|
|
519
|
+
.watch(mode="schedule", run_as="batch", interval="15m", source="../../example_data/Input/claims_flat")
|
|
520
|
+
.mirror(root="../../example_data/Output/example_summary")
|
|
521
|
+
.collect([".xlsx"], save_as="claim_files")
|
|
522
|
+
.map(read_claims, use="claim_files", save_as="claim_frames")
|
|
523
|
+
.step(combine_claims, use="claim_frames", save_as="raw_df")
|
|
524
|
+
.step(summarize, use="raw_df")
|
|
525
|
+
)
|
|
526
|
+
```
|
|
527
|
+
|
|
528
|
+
That one flow uses:
|
|
529
|
+
|
|
530
|
+
- `Batch` and `FileRef`
|
|
531
|
+
- `current`
|
|
532
|
+
- `objects`
|
|
533
|
+
- `config`
|
|
534
|
+
- `database(...)`
|
|
535
|
+
- `mirror`
|
|
536
|
+
- `metadata`
|
|
537
|
+
|
|
538
|
+
That is the intended shape of the authoring model: small runtime helpers that make native Python data work easier to organize.
|