py-data-engine 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (200) hide show
  1. data_engine/__init__.py +37 -0
  2. data_engine/application/__init__.py +39 -0
  3. data_engine/application/actions.py +42 -0
  4. data_engine/application/catalog.py +151 -0
  5. data_engine/application/control.py +213 -0
  6. data_engine/application/details.py +73 -0
  7. data_engine/application/runtime.py +449 -0
  8. data_engine/application/workspace.py +62 -0
  9. data_engine/authoring/__init__.py +14 -0
  10. data_engine/authoring/builder.py +31 -0
  11. data_engine/authoring/execution/__init__.py +6 -0
  12. data_engine/authoring/execution/app.py +6 -0
  13. data_engine/authoring/execution/context.py +82 -0
  14. data_engine/authoring/execution/continuous.py +176 -0
  15. data_engine/authoring/execution/grouped.py +106 -0
  16. data_engine/authoring/execution/logging.py +83 -0
  17. data_engine/authoring/execution/polling.py +135 -0
  18. data_engine/authoring/execution/runner.py +210 -0
  19. data_engine/authoring/execution/single.py +171 -0
  20. data_engine/authoring/flow.py +361 -0
  21. data_engine/authoring/helpers.py +160 -0
  22. data_engine/authoring/model.py +59 -0
  23. data_engine/authoring/primitives.py +430 -0
  24. data_engine/authoring/services.py +42 -0
  25. data_engine/devtools/__init__.py +3 -0
  26. data_engine/devtools/project_ast_map.py +503 -0
  27. data_engine/docs/__init__.py +1 -0
  28. data_engine/docs/sphinx_source/_static/custom.css +13 -0
  29. data_engine/docs/sphinx_source/api.rst +42 -0
  30. data_engine/docs/sphinx_source/conf.py +37 -0
  31. data_engine/docs/sphinx_source/guides/app-runtime-and-workspaces.md +397 -0
  32. data_engine/docs/sphinx_source/guides/authoring-flow-modules.md +215 -0
  33. data_engine/docs/sphinx_source/guides/configuring-flows.md +185 -0
  34. data_engine/docs/sphinx_source/guides/core-concepts.md +208 -0
  35. data_engine/docs/sphinx_source/guides/database-methods.md +107 -0
  36. data_engine/docs/sphinx_source/guides/duckdb-helpers.md +462 -0
  37. data_engine/docs/sphinx_source/guides/flow-context.md +538 -0
  38. data_engine/docs/sphinx_source/guides/flow-methods.md +206 -0
  39. data_engine/docs/sphinx_source/guides/getting-started.md +271 -0
  40. data_engine/docs/sphinx_source/guides/project-inventory.md +5683 -0
  41. data_engine/docs/sphinx_source/guides/project-map.md +118 -0
  42. data_engine/docs/sphinx_source/guides/recipes.md +268 -0
  43. data_engine/docs/sphinx_source/index.rst +22 -0
  44. data_engine/domain/__init__.py +92 -0
  45. data_engine/domain/actions.py +69 -0
  46. data_engine/domain/catalog.py +128 -0
  47. data_engine/domain/details.py +214 -0
  48. data_engine/domain/diagnostics.py +56 -0
  49. data_engine/domain/errors.py +104 -0
  50. data_engine/domain/inspection.py +99 -0
  51. data_engine/domain/logs.py +118 -0
  52. data_engine/domain/operations.py +172 -0
  53. data_engine/domain/operator.py +72 -0
  54. data_engine/domain/runs.py +155 -0
  55. data_engine/domain/runtime.py +279 -0
  56. data_engine/domain/source_state.py +17 -0
  57. data_engine/domain/support.py +54 -0
  58. data_engine/domain/time.py +23 -0
  59. data_engine/domain/workspace.py +159 -0
  60. data_engine/flow_modules/__init__.py +1 -0
  61. data_engine/flow_modules/flow_module_compiler.py +179 -0
  62. data_engine/flow_modules/flow_module_loader.py +201 -0
  63. data_engine/helpers/__init__.py +25 -0
  64. data_engine/helpers/duckdb.py +705 -0
  65. data_engine/hosts/__init__.py +1 -0
  66. data_engine/hosts/daemon/__init__.py +23 -0
  67. data_engine/hosts/daemon/app.py +221 -0
  68. data_engine/hosts/daemon/bootstrap.py +69 -0
  69. data_engine/hosts/daemon/client.py +465 -0
  70. data_engine/hosts/daemon/commands.py +64 -0
  71. data_engine/hosts/daemon/composition.py +310 -0
  72. data_engine/hosts/daemon/constants.py +15 -0
  73. data_engine/hosts/daemon/entrypoints.py +97 -0
  74. data_engine/hosts/daemon/lifecycle.py +191 -0
  75. data_engine/hosts/daemon/manager.py +272 -0
  76. data_engine/hosts/daemon/ownership.py +126 -0
  77. data_engine/hosts/daemon/runtime_commands.py +188 -0
  78. data_engine/hosts/daemon/runtime_control.py +31 -0
  79. data_engine/hosts/daemon/server.py +84 -0
  80. data_engine/hosts/daemon/shared_state.py +147 -0
  81. data_engine/hosts/daemon/state_sync.py +101 -0
  82. data_engine/platform/__init__.py +1 -0
  83. data_engine/platform/identity.py +35 -0
  84. data_engine/platform/local_settings.py +146 -0
  85. data_engine/platform/theme.py +259 -0
  86. data_engine/platform/workspace_models.py +190 -0
  87. data_engine/platform/workspace_policy.py +333 -0
  88. data_engine/runtime/__init__.py +1 -0
  89. data_engine/runtime/file_watch.py +185 -0
  90. data_engine/runtime/ledger_models.py +116 -0
  91. data_engine/runtime/runtime_db.py +938 -0
  92. data_engine/runtime/shared_state.py +523 -0
  93. data_engine/services/__init__.py +49 -0
  94. data_engine/services/daemon.py +64 -0
  95. data_engine/services/daemon_state.py +40 -0
  96. data_engine/services/flow_catalog.py +102 -0
  97. data_engine/services/flow_execution.py +48 -0
  98. data_engine/services/ledger.py +85 -0
  99. data_engine/services/logs.py +65 -0
  100. data_engine/services/runtime_binding.py +105 -0
  101. data_engine/services/runtime_execution.py +126 -0
  102. data_engine/services/runtime_history.py +62 -0
  103. data_engine/services/settings.py +58 -0
  104. data_engine/services/shared_state.py +28 -0
  105. data_engine/services/theme.py +59 -0
  106. data_engine/services/workspace_provisioning.py +224 -0
  107. data_engine/services/workspaces.py +74 -0
  108. data_engine/ui/__init__.py +3 -0
  109. data_engine/ui/cli/__init__.py +19 -0
  110. data_engine/ui/cli/app.py +161 -0
  111. data_engine/ui/cli/commands_doctor.py +178 -0
  112. data_engine/ui/cli/commands_run.py +80 -0
  113. data_engine/ui/cli/commands_start.py +100 -0
  114. data_engine/ui/cli/commands_workspace.py +97 -0
  115. data_engine/ui/cli/dependencies.py +44 -0
  116. data_engine/ui/cli/parser.py +56 -0
  117. data_engine/ui/gui/__init__.py +25 -0
  118. data_engine/ui/gui/app.py +116 -0
  119. data_engine/ui/gui/bootstrap.py +487 -0
  120. data_engine/ui/gui/bootstrapper.py +140 -0
  121. data_engine/ui/gui/cache_models.py +23 -0
  122. data_engine/ui/gui/control_support.py +185 -0
  123. data_engine/ui/gui/controllers/__init__.py +6 -0
  124. data_engine/ui/gui/controllers/flows.py +439 -0
  125. data_engine/ui/gui/controllers/runtime.py +245 -0
  126. data_engine/ui/gui/dialogs/__init__.py +12 -0
  127. data_engine/ui/gui/dialogs/messages.py +88 -0
  128. data_engine/ui/gui/dialogs/previews.py +222 -0
  129. data_engine/ui/gui/helpers/__init__.py +62 -0
  130. data_engine/ui/gui/helpers/inspection.py +81 -0
  131. data_engine/ui/gui/helpers/lifecycle.py +112 -0
  132. data_engine/ui/gui/helpers/scroll.py +28 -0
  133. data_engine/ui/gui/helpers/theming.py +87 -0
  134. data_engine/ui/gui/icons/dark_light.svg +12 -0
  135. data_engine/ui/gui/icons/documentation.svg +1 -0
  136. data_engine/ui/gui/icons/failed.svg +3 -0
  137. data_engine/ui/gui/icons/group.svg +4 -0
  138. data_engine/ui/gui/icons/home.svg +2 -0
  139. data_engine/ui/gui/icons/manual.svg +2 -0
  140. data_engine/ui/gui/icons/poll.svg +2 -0
  141. data_engine/ui/gui/icons/schedule.svg +4 -0
  142. data_engine/ui/gui/icons/settings.svg +2 -0
  143. data_engine/ui/gui/icons/started.svg +3 -0
  144. data_engine/ui/gui/icons/success.svg +3 -0
  145. data_engine/ui/gui/icons/view-log.svg +3 -0
  146. data_engine/ui/gui/icons.py +50 -0
  147. data_engine/ui/gui/launcher.py +48 -0
  148. data_engine/ui/gui/presenters/__init__.py +72 -0
  149. data_engine/ui/gui/presenters/docs.py +140 -0
  150. data_engine/ui/gui/presenters/logs.py +58 -0
  151. data_engine/ui/gui/presenters/runtime_projection.py +29 -0
  152. data_engine/ui/gui/presenters/sidebar.py +88 -0
  153. data_engine/ui/gui/presenters/steps.py +148 -0
  154. data_engine/ui/gui/presenters/workspace.py +39 -0
  155. data_engine/ui/gui/presenters/workspace_binding.py +75 -0
  156. data_engine/ui/gui/presenters/workspace_settings.py +182 -0
  157. data_engine/ui/gui/preview_models.py +37 -0
  158. data_engine/ui/gui/render_support.py +241 -0
  159. data_engine/ui/gui/rendering/__init__.py +12 -0
  160. data_engine/ui/gui/rendering/artifacts.py +95 -0
  161. data_engine/ui/gui/rendering/icons.py +50 -0
  162. data_engine/ui/gui/runtime.py +47 -0
  163. data_engine/ui/gui/state_support.py +193 -0
  164. data_engine/ui/gui/support.py +214 -0
  165. data_engine/ui/gui/surface.py +209 -0
  166. data_engine/ui/gui/theme.py +720 -0
  167. data_engine/ui/gui/widgets/__init__.py +34 -0
  168. data_engine/ui/gui/widgets/config.py +41 -0
  169. data_engine/ui/gui/widgets/logs.py +62 -0
  170. data_engine/ui/gui/widgets/panels.py +507 -0
  171. data_engine/ui/gui/widgets/sidebar.py +130 -0
  172. data_engine/ui/gui/widgets/steps.py +84 -0
  173. data_engine/ui/tui/__init__.py +5 -0
  174. data_engine/ui/tui/app.py +222 -0
  175. data_engine/ui/tui/bootstrap.py +475 -0
  176. data_engine/ui/tui/bootstrapper.py +117 -0
  177. data_engine/ui/tui/controllers/__init__.py +6 -0
  178. data_engine/ui/tui/controllers/flows.py +349 -0
  179. data_engine/ui/tui/controllers/runtime.py +167 -0
  180. data_engine/ui/tui/runtime.py +34 -0
  181. data_engine/ui/tui/state_support.py +141 -0
  182. data_engine/ui/tui/support.py +63 -0
  183. data_engine/ui/tui/theme.py +204 -0
  184. data_engine/ui/tui/widgets.py +123 -0
  185. data_engine/views/__init__.py +109 -0
  186. data_engine/views/actions.py +80 -0
  187. data_engine/views/artifacts.py +58 -0
  188. data_engine/views/flow_display.py +69 -0
  189. data_engine/views/logs.py +54 -0
  190. data_engine/views/models.py +96 -0
  191. data_engine/views/presentation.py +133 -0
  192. data_engine/views/runs.py +62 -0
  193. data_engine/views/state.py +39 -0
  194. data_engine/views/status.py +13 -0
  195. data_engine/views/text.py +109 -0
  196. py_data_engine-0.1.0.dist-info/METADATA +330 -0
  197. py_data_engine-0.1.0.dist-info/RECORD +200 -0
  198. py_data_engine-0.1.0.dist-info/WHEEL +5 -0
  199. py_data_engine-0.1.0.dist-info/entry_points.txt +2 -0
  200. py_data_engine-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,538 @@
1
+ # FlowContext
2
+
3
+ `FlowContext` is the runtime object passed to every step.
4
+
5
+ It is the main place where the runtime meets your step code.
6
+
7
+ If you are authoring flows day to day, this is the surface you will use most often.
8
+
9
+ ## What `FlowContext` contains
10
+
11
+ Common fields and helpers you will read directly:
12
+
13
+ - `flow_name`
14
+ - `group`
15
+ - `source`
16
+ - `mirror`
17
+ - `config`
18
+ - `database(...)`
19
+ - `current`
20
+ - `objects`
21
+ - `metadata`
22
+ - `source_metadata()`
23
+
24
+ Example:
25
+
26
+ ```python
27
+ def inspect_context(context):
28
+ print(context.flow_name)
29
+ print(context.group)
30
+ print(context.current)
31
+ if context.source is not None:
32
+ print(context.source.path)
33
+ return context.current
34
+ ```
35
+
36
+ ## The three most important ideas
37
+
38
+ When in doubt, remember these three ideas:
39
+
40
+ 1. `current` is the moving value in the pipeline.
41
+ 2. `objects` is the named stash of saved intermediates.
42
+ 3. `source` and `mirror` are path namespaces, not open files or connections.
43
+
44
+ Everything else in `FlowContext` builds on those ideas.
45
+
46
+ ## `flow_name` and `group`
47
+
48
+ These are the flow identity fields available at runtime.
49
+
50
+ - `flow_name` comes from the flow-module filename
51
+ - `group` comes from `Flow(group=...)`
52
+
53
+ They are useful when you want to:
54
+
55
+ - stamp metadata
56
+ - label outputs
57
+ - branch behavior lightly by flow identity
58
+ - emit operator-facing details into `context.metadata`
59
+
60
+ ## `current`
61
+
62
+ `context.current` is the moving runtime slot.
63
+
64
+ - before the first manual or scheduled step, it is `None`
65
+ - after each step, it becomes that step's return value
66
+ - if `use=` is set, the runtime loads the named object into `current` before running the step
67
+
68
+ This is why most steps are so small:
69
+
70
+ ```python
71
+ def clean_claims(context):
72
+ return context.current.filter(...)
73
+ ```
74
+
75
+ The step does not need to look up a hidden pipeline object from somewhere else; the runtime always hands it the current value.
76
+
77
+ ## `objects`
78
+
79
+ Saved objects live in `context.objects`.
80
+
81
+ That is what `save_as=` and `use=` operate on.
82
+
83
+ Example:
84
+
85
+ ```python
86
+ (
87
+ Flow(group="Claims")
88
+ .step(read_claims, save_as="raw_df")
89
+ .step(clean_claims, use="raw_df", save_as="clean_df")
90
+ .step(write_output, use="clean_df")
91
+ )
92
+ ```
93
+
94
+ Inside a step you can also read those values directly:
95
+
96
+ ```python
97
+ def compare_versions(context):
98
+ raw_df = context.objects["raw_df"]
99
+ clean_df = context.objects["clean_df"]
100
+ ...
101
+ ```
102
+
103
+ This is especially useful when a later step needs more than one previously saved object.
104
+
105
+ ## `metadata`
106
+
107
+ `context.metadata` is a free-form runtime metadata dictionary.
108
+
109
+ Use it when a step wants to publish details about what happened during execution.
110
+
111
+ The runtime also seeds a few values automatically:
112
+
113
+ - `started_at_utc`
114
+ - `run_id`
115
+ - `step_outputs`
116
+ - `file_hash` when the run is bound to a concrete source file
117
+
118
+ Despite its name, `file_hash` is derived from the path, not from the file contents: it is a stable SHA-1 hash of the source-relative path when one exists. For single-file bindings, it falls back to the concrete source path text.
119
+
120
+ Examples:
121
+
122
+ - row counts
123
+ - source metadata
124
+ - selected config values
125
+ - warning flags
126
+ - lightweight operator diagnostics
127
+
128
+ Example:
129
+
130
+ ```python
131
+ def capture_stats(context):
132
+ context.metadata["row_count"] = len(context.current)
133
+ context.metadata["flow_name"] = context.flow_name
134
+ return context.current
135
+ ```
136
+
137
+ The runtime also records step output paths here when a step returns an existing `Path`.
138
+
139
+ That is what powers the UI `Inspect` button for a step: if a step writes a file and returns its existing path, the UI can enable inspection for that step.
140
+
141
+ ## `config`
142
+
143
+ `context.config` provides lazy, read-only access to the `config/*.toml` files in the current authored workspace.
144
+
145
+ Available helpers are:
146
+
147
+ ```python
148
+ context.config.get("claims")
149
+ context.config.require("claims")
150
+ context.config.names()
151
+ context.config.all()
152
+ ```
153
+
154
+ ### `get(name)`
155
+
156
+ Returns a parsed `dict` or `None`.
157
+
158
+ Use this when the config file is optional:
159
+
160
+ ```python
161
+ def apply_runtime_config(context):
162
+ cfg = context.config.get("claims")
163
+ if cfg is None:
164
+ return context.current
165
+ batch_size = cfg.get("runtime", {}).get("batch_size", 5000)
166
+ context.metadata["batch_size"] = batch_size
167
+ return context.current
168
+ ```
169
+
170
+ ### `require(name)`
171
+
172
+ Returns the parsed `dict` or raises when the file is missing.
173
+
174
+ Use this when the config is part of the flow's contract:
175
+
176
+ ```python
177
+ def load_required_settings(context):
178
+ cfg = context.config.require("database")
179
+ dsn = cfg["connection"]["dsn"]
180
+ context.metadata["dsn"] = dsn
181
+ return context.current
182
+ ```
183
+
184
+ ### `names()`
185
+
186
+ Returns available config stems such as:
187
+
188
+ ```python
189
+ ("claims", "runtime")
190
+ ```
191
+
192
+ This is mostly useful for introspection or diagnostics.
193
+
194
+ ### `all()`
195
+
196
+ Returns every parsed config mapping keyed by file stem.
197
+
198
+ Example:
199
+
200
+ ```python
201
+ all_config = context.config.all()
202
+ ```
203
+
204
+ ### What `config` is good for
205
+
206
+ `context.config` is a good fit for:
207
+
208
+ - file names and folder names
209
+ - thresholds and batch sizes
210
+ - optional feature flags
211
+ - SQL parameters
212
+ - external table names
213
+
214
+ It is not a replacement for the `Flow(...)` chain. The orchestration shape still belongs in the fluent flow definition.
215
+
216
+ ## `database(...)`
217
+
218
+ `context.database(...)` returns a write-ready path beneath `databases/` in the current authored workspace.
219
+
220
+ Example:
221
+
222
+ ```python
223
+ db_path = context.database("claims/db.duckdb")
224
+ ```
225
+
226
+ That resolves to:
227
+
228
+ - `workspaces/<workspace_id>/databases/claims/db.duckdb`
229
+
230
+ Rules:
231
+
232
+ - the path must be relative
233
+ - parent directories are created automatically
234
+ - the helper is only available for authored workspace flows
235
+ - it returns a `Path`, not a database connection
236
+
237
+ Typical usage:
238
+
239
+ ```python
240
+ import duckdb
241
+
242
+
243
+ def write_summary(context):
244
+ db_path = context.database("claims/analytics.duckdb")
245
+ conn = duckdb.connect(db_path)
246
+ try:
247
+ ...
248
+ finally:
249
+ conn.close()
250
+ ```
251
+
252
+ This is intentionally simple. Data Engine gives you the path and leaves connection ownership to your code.
253
+
254
+ ## `source_metadata()`
255
+
256
+ `context.source_metadata()` returns basic filesystem metadata for the current source file when one exists.
257
+
258
+ It gives you:
259
+
260
+ - path
261
+ - file name
262
+ - size in bytes
263
+ - modified time in UTC
264
+
265
+ Example:
266
+
267
+ ```python
268
+ def capture_source_info(context):
269
+ metadata = context.source_metadata()
270
+ if metadata is not None:
271
+ context.metadata["source_name"] = metadata.name
272
+ context.metadata["source_size_bytes"] = metadata.size_bytes
273
+ return context.current
274
+ ```
275
+
276
+ This is useful for audit trails, diagnostics, and output manifests.
277
+
278
+ ## `source`
279
+
280
+ `context.source` is the input-side namespace for the active source.
281
+
282
+ It is usually present for poll flows and for scheduled flows that bind a source.
283
+
284
+ It may be `None` for manual flows or scheduled flows that build data entirely in memory.
285
+
286
+ Core helpers are:
287
+
288
+ ```python
289
+ context.source.path
290
+ context.source.dir
291
+ context.source.folder
292
+ context.source.with_extension(".json")
293
+ context.source.with_suffix(".json")
294
+ context.source.file("notes.json")
295
+ context.source.namespaced_file("notes.json")
296
+ context.source.root_file("lookup.csv")
297
+ ```
298
+
299
+ ### `path`
300
+
301
+ The concrete active source file path.
302
+
303
+ This is the simplest and most direct read-side path:
304
+
305
+ ```python
306
+ def read_claims(context):
307
+ return pl.read_excel(context.source.path)
308
+ ```
309
+
310
+ ### `dir`
311
+
312
+ The namespace directory for files derived from the active source.
313
+
314
+ ### `folder`
315
+
316
+ The active source file's parent folder.
317
+
318
+ ### `with_extension(...)` and `with_suffix(...)`
319
+
320
+ These give you the same source-relative file with a new extension.
321
+
322
+ ```python
323
+ def find_json_sidecar(context):
324
+ return context.source.with_extension(".json")
325
+ ```
326
+
327
+ ### `file(...)`
328
+
329
+ Gives you a path in the active source file's parent folder.
330
+
331
+ ```python
332
+ def find_notes(context):
333
+ return context.source.file("notes.json")
334
+ ```
335
+
336
+ ### `namespaced_file(...)`
337
+
338
+ Gives you a path under the active source file's namespace.
339
+
340
+ ```python
341
+ def find_namespaced_notes(context):
342
+ return context.source.namespaced_file("notes.json")
343
+ ```
344
+
345
+ ### `root_file(...)`
346
+
347
+ Gives you a path directly under the source root.
348
+
349
+ ```python
350
+ def load_lookup(context):
351
+ return context.source.root_file("lookup.csv")
352
+ ```
353
+
354
+ ### Common `source` patterns
355
+
356
+ Use `source` when you need:
357
+
358
+ - the active input file
359
+ - a sidecar file near that input
360
+ - a lookup file under the watched source root
361
+ - namespace-aware paths derived from the current source item
362
+
363
+ ## `mirror`
364
+
365
+ `context.mirror` is the mirrored output namespace for the active source.
366
+
367
+ It is present when the flow uses `mirror(root=...)`.
368
+
369
+ Core helpers are:
370
+
371
+ ```python
372
+ context.mirror.root
373
+ context.mirror.dir
374
+ context.mirror.folder
375
+ context.mirror.with_extension(".parquet")
376
+ context.mirror.with_suffix(".parquet")
377
+ context.mirror.file("open_claims.parquet")
378
+ context.mirror.namespaced_file("open_claims.parquet")
379
+ context.mirror.root_file("analytics.duckdb")
380
+ ```
381
+
382
+ ### `with_extension(...)` and `with_suffix(...)`
383
+
384
+ These are for the common "mirror this source file into another format" case.
385
+
386
+ ```python
387
+ def write_target(context):
388
+ output = context.mirror.with_extension(".parquet")
389
+ context.current.write_parquet(output)
390
+ return output
391
+ ```
392
+
393
+ Returning that written `Path` is what makes the step inspectable in the UI.
394
+
395
+ ### `file(...)`
396
+
397
+ Use this for a custom file name in the mirrored source folder:
398
+
399
+ ```python
400
+ def write_summary(context):
401
+ summary_path = context.mirror.file("summary.json")
402
+ summary_path.write_text("{}", encoding="utf-8")
403
+ return summary_path
404
+ ```
405
+
406
+ ### `namespaced_file(...)`
407
+
408
+ Use this for multiple outputs derived from one source:
409
+
410
+ ```python
411
+ def write_outputs(context):
412
+ open_path = context.mirror.namespaced_file("open_claims.parquet")
413
+ closed_path = context.mirror.namespaced_file("closed_claims.parquet")
414
+ ...
415
+ ```
416
+
417
+ ### `root_file(...)`
418
+
419
+ Use this when you want one stable artifact under the mirror root rather than one file per source item.
420
+
421
+ ```python
422
+ def write_snapshot(context):
423
+ snapshot = context.mirror.root_file("artifacts/latest.parquet")
424
+ context.current.write_parquet(snapshot)
425
+ return snapshot
426
+ ```
427
+
428
+ ### Common `mirror` patterns
429
+
430
+ Use `mirror` when you want to:
431
+
432
+ - preserve source-relative output structure
433
+ - create many derived outputs from one source
434
+ - write stable summary artifacts under one output root
435
+ - avoid hand-building output folder math
436
+
437
+ All `mirror` helpers return write-ready paths, so your step code does not need to create parent directories itself.
438
+
439
+ ## When `source` or `mirror` may be missing
440
+
441
+ Not every flow has every context surface available.
442
+
443
+ Examples:
444
+
445
+ - a manual flow may have no `source`
446
+ - a purely in-memory scheduled flow may have no `source`
447
+ - a flow with no `mirror(root=...)` has no `mirror`
448
+
449
+ So it is reasonable to write defensive code when the flow shape allows those cases:
450
+
451
+ ```python
452
+ def maybe_capture_source(context):
453
+ if context.source is None:
454
+ return context.current
455
+ context.metadata["source_path"] = str(context.source.path)
456
+ return context.current
457
+ ```
458
+
459
+ ## Batch values
460
+
461
+ `Flow.collect(...)` returns a `Batch` of `FileRef` items instead of a raw list.
462
+
463
+ That means later steps can work with:
464
+
465
+ - `file_ref.name`
466
+ - `file_ref.path`
467
+ - `file_ref.stem`
468
+ - `file_ref.suffix`
469
+ - `file_ref.parent`
470
+
471
+ Example:
472
+
473
+ ```python
474
+ def read_claims(file_ref):
475
+ return pl.read_excel(file_ref.path)
476
+ ```
477
+
478
+ A mapped step receives one item from the batch (here, a `FileRef`) rather than the full `context`. The item is often simpler than the full `context`, and that is by design.
479
+
480
+ ## A practical context walkthrough
481
+
482
+ Here is a representative flow using several parts of the context together:
483
+
484
+ ```python
485
+ import duckdb
486
+ import polars as pl
487
+
488
+ from data_engine import Flow
489
+
490
+
491
+ def read_claims(file_ref):
492
+ return pl.read_excel(file_ref.path)
493
+
494
+
495
+ def combine_claims(context):
496
+ cfg = context.config.get("claims") or {}
497
+ batch_size = cfg.get("runtime", {}).get("batch_size", 5000)
498
+ context.metadata["batch_size"] = batch_size
499
+ return pl.concat(context.current, how="vertical_relaxed")
500
+
501
+
502
+ def summarize(context):
503
+ db_path = context.database("claims/analytics.duckdb")
504
+ conn = duckdb.connect(db_path)
505
+ try:
506
+ conn.register("input", context.current)
507
+ summary = conn.sql("select count(*) as row_count from input").pl()
508
+ finally:
509
+ conn.close()
510
+ output = context.mirror.file("summary.parquet")
511
+ summary.write_parquet(output)
512
+ context.metadata["summary_path"] = str(output)
513
+ return output
514
+
515
+
516
+ def build():
517
+ return (
518
+ Flow(group="Claims")
519
+ .watch(mode="schedule", run_as="batch", interval="15m", source="../../example_data/Input/claims_flat")
520
+ .mirror(root="../../example_data/Output/example_summary")
521
+ .collect([".xlsx"], save_as="claim_files")
522
+ .map(read_claims, use="claim_files", save_as="claim_frames")
523
+ .step(combine_claims, use="claim_frames", save_as="raw_df")
524
+ .step(summarize, use="raw_df")
525
+ )
526
+ ```
527
+
528
+ That one flow uses:
529
+
530
+ - `Batch` and `FileRef`
531
+ - `current`
532
+ - `objects`
533
+ - `config`
534
+ - `database(...)`
535
+ - `mirror`
536
+ - `metadata`
537
+
538
+ That is the intended shape of the authoring model: small runtime helpers that make native Python data work easier to organize.