py-data-engine 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (200) hide show
  1. data_engine/__init__.py +37 -0
  2. data_engine/application/__init__.py +39 -0
  3. data_engine/application/actions.py +42 -0
  4. data_engine/application/catalog.py +151 -0
  5. data_engine/application/control.py +213 -0
  6. data_engine/application/details.py +73 -0
  7. data_engine/application/runtime.py +449 -0
  8. data_engine/application/workspace.py +62 -0
  9. data_engine/authoring/__init__.py +14 -0
  10. data_engine/authoring/builder.py +31 -0
  11. data_engine/authoring/execution/__init__.py +6 -0
  12. data_engine/authoring/execution/app.py +6 -0
  13. data_engine/authoring/execution/context.py +82 -0
  14. data_engine/authoring/execution/continuous.py +176 -0
  15. data_engine/authoring/execution/grouped.py +106 -0
  16. data_engine/authoring/execution/logging.py +83 -0
  17. data_engine/authoring/execution/polling.py +135 -0
  18. data_engine/authoring/execution/runner.py +210 -0
  19. data_engine/authoring/execution/single.py +171 -0
  20. data_engine/authoring/flow.py +361 -0
  21. data_engine/authoring/helpers.py +160 -0
  22. data_engine/authoring/model.py +59 -0
  23. data_engine/authoring/primitives.py +430 -0
  24. data_engine/authoring/services.py +42 -0
  25. data_engine/devtools/__init__.py +3 -0
  26. data_engine/devtools/project_ast_map.py +503 -0
  27. data_engine/docs/__init__.py +1 -0
  28. data_engine/docs/sphinx_source/_static/custom.css +13 -0
  29. data_engine/docs/sphinx_source/api.rst +42 -0
  30. data_engine/docs/sphinx_source/conf.py +37 -0
  31. data_engine/docs/sphinx_source/guides/app-runtime-and-workspaces.md +397 -0
  32. data_engine/docs/sphinx_source/guides/authoring-flow-modules.md +215 -0
  33. data_engine/docs/sphinx_source/guides/configuring-flows.md +185 -0
  34. data_engine/docs/sphinx_source/guides/core-concepts.md +208 -0
  35. data_engine/docs/sphinx_source/guides/database-methods.md +107 -0
  36. data_engine/docs/sphinx_source/guides/duckdb-helpers.md +462 -0
  37. data_engine/docs/sphinx_source/guides/flow-context.md +538 -0
  38. data_engine/docs/sphinx_source/guides/flow-methods.md +206 -0
  39. data_engine/docs/sphinx_source/guides/getting-started.md +271 -0
  40. data_engine/docs/sphinx_source/guides/project-inventory.md +5683 -0
  41. data_engine/docs/sphinx_source/guides/project-map.md +118 -0
  42. data_engine/docs/sphinx_source/guides/recipes.md +268 -0
  43. data_engine/docs/sphinx_source/index.rst +22 -0
  44. data_engine/domain/__init__.py +92 -0
  45. data_engine/domain/actions.py +69 -0
  46. data_engine/domain/catalog.py +128 -0
  47. data_engine/domain/details.py +214 -0
  48. data_engine/domain/diagnostics.py +56 -0
  49. data_engine/domain/errors.py +104 -0
  50. data_engine/domain/inspection.py +99 -0
  51. data_engine/domain/logs.py +118 -0
  52. data_engine/domain/operations.py +172 -0
  53. data_engine/domain/operator.py +72 -0
  54. data_engine/domain/runs.py +155 -0
  55. data_engine/domain/runtime.py +279 -0
  56. data_engine/domain/source_state.py +17 -0
  57. data_engine/domain/support.py +54 -0
  58. data_engine/domain/time.py +23 -0
  59. data_engine/domain/workspace.py +159 -0
  60. data_engine/flow_modules/__init__.py +1 -0
  61. data_engine/flow_modules/flow_module_compiler.py +179 -0
  62. data_engine/flow_modules/flow_module_loader.py +201 -0
  63. data_engine/helpers/__init__.py +25 -0
  64. data_engine/helpers/duckdb.py +705 -0
  65. data_engine/hosts/__init__.py +1 -0
  66. data_engine/hosts/daemon/__init__.py +23 -0
  67. data_engine/hosts/daemon/app.py +221 -0
  68. data_engine/hosts/daemon/bootstrap.py +69 -0
  69. data_engine/hosts/daemon/client.py +465 -0
  70. data_engine/hosts/daemon/commands.py +64 -0
  71. data_engine/hosts/daemon/composition.py +310 -0
  72. data_engine/hosts/daemon/constants.py +15 -0
  73. data_engine/hosts/daemon/entrypoints.py +97 -0
  74. data_engine/hosts/daemon/lifecycle.py +191 -0
  75. data_engine/hosts/daemon/manager.py +272 -0
  76. data_engine/hosts/daemon/ownership.py +126 -0
  77. data_engine/hosts/daemon/runtime_commands.py +188 -0
  78. data_engine/hosts/daemon/runtime_control.py +31 -0
  79. data_engine/hosts/daemon/server.py +84 -0
  80. data_engine/hosts/daemon/shared_state.py +147 -0
  81. data_engine/hosts/daemon/state_sync.py +101 -0
  82. data_engine/platform/__init__.py +1 -0
  83. data_engine/platform/identity.py +35 -0
  84. data_engine/platform/local_settings.py +146 -0
  85. data_engine/platform/theme.py +259 -0
  86. data_engine/platform/workspace_models.py +190 -0
  87. data_engine/platform/workspace_policy.py +333 -0
  88. data_engine/runtime/__init__.py +1 -0
  89. data_engine/runtime/file_watch.py +185 -0
  90. data_engine/runtime/ledger_models.py +116 -0
  91. data_engine/runtime/runtime_db.py +938 -0
  92. data_engine/runtime/shared_state.py +523 -0
  93. data_engine/services/__init__.py +49 -0
  94. data_engine/services/daemon.py +64 -0
  95. data_engine/services/daemon_state.py +40 -0
  96. data_engine/services/flow_catalog.py +102 -0
  97. data_engine/services/flow_execution.py +48 -0
  98. data_engine/services/ledger.py +85 -0
  99. data_engine/services/logs.py +65 -0
  100. data_engine/services/runtime_binding.py +105 -0
  101. data_engine/services/runtime_execution.py +126 -0
  102. data_engine/services/runtime_history.py +62 -0
  103. data_engine/services/settings.py +58 -0
  104. data_engine/services/shared_state.py +28 -0
  105. data_engine/services/theme.py +59 -0
  106. data_engine/services/workspace_provisioning.py +224 -0
  107. data_engine/services/workspaces.py +74 -0
  108. data_engine/ui/__init__.py +3 -0
  109. data_engine/ui/cli/__init__.py +19 -0
  110. data_engine/ui/cli/app.py +161 -0
  111. data_engine/ui/cli/commands_doctor.py +178 -0
  112. data_engine/ui/cli/commands_run.py +80 -0
  113. data_engine/ui/cli/commands_start.py +100 -0
  114. data_engine/ui/cli/commands_workspace.py +97 -0
  115. data_engine/ui/cli/dependencies.py +44 -0
  116. data_engine/ui/cli/parser.py +56 -0
  117. data_engine/ui/gui/__init__.py +25 -0
  118. data_engine/ui/gui/app.py +116 -0
  119. data_engine/ui/gui/bootstrap.py +487 -0
  120. data_engine/ui/gui/bootstrapper.py +140 -0
  121. data_engine/ui/gui/cache_models.py +23 -0
  122. data_engine/ui/gui/control_support.py +185 -0
  123. data_engine/ui/gui/controllers/__init__.py +6 -0
  124. data_engine/ui/gui/controllers/flows.py +439 -0
  125. data_engine/ui/gui/controllers/runtime.py +245 -0
  126. data_engine/ui/gui/dialogs/__init__.py +12 -0
  127. data_engine/ui/gui/dialogs/messages.py +88 -0
  128. data_engine/ui/gui/dialogs/previews.py +222 -0
  129. data_engine/ui/gui/helpers/__init__.py +62 -0
  130. data_engine/ui/gui/helpers/inspection.py +81 -0
  131. data_engine/ui/gui/helpers/lifecycle.py +112 -0
  132. data_engine/ui/gui/helpers/scroll.py +28 -0
  133. data_engine/ui/gui/helpers/theming.py +87 -0
  134. data_engine/ui/gui/icons/dark_light.svg +12 -0
  135. data_engine/ui/gui/icons/documentation.svg +1 -0
  136. data_engine/ui/gui/icons/failed.svg +3 -0
  137. data_engine/ui/gui/icons/group.svg +4 -0
  138. data_engine/ui/gui/icons/home.svg +2 -0
  139. data_engine/ui/gui/icons/manual.svg +2 -0
  140. data_engine/ui/gui/icons/poll.svg +2 -0
  141. data_engine/ui/gui/icons/schedule.svg +4 -0
  142. data_engine/ui/gui/icons/settings.svg +2 -0
  143. data_engine/ui/gui/icons/started.svg +3 -0
  144. data_engine/ui/gui/icons/success.svg +3 -0
  145. data_engine/ui/gui/icons/view-log.svg +3 -0
  146. data_engine/ui/gui/icons.py +50 -0
  147. data_engine/ui/gui/launcher.py +48 -0
  148. data_engine/ui/gui/presenters/__init__.py +72 -0
  149. data_engine/ui/gui/presenters/docs.py +140 -0
  150. data_engine/ui/gui/presenters/logs.py +58 -0
  151. data_engine/ui/gui/presenters/runtime_projection.py +29 -0
  152. data_engine/ui/gui/presenters/sidebar.py +88 -0
  153. data_engine/ui/gui/presenters/steps.py +148 -0
  154. data_engine/ui/gui/presenters/workspace.py +39 -0
  155. data_engine/ui/gui/presenters/workspace_binding.py +75 -0
  156. data_engine/ui/gui/presenters/workspace_settings.py +182 -0
  157. data_engine/ui/gui/preview_models.py +37 -0
  158. data_engine/ui/gui/render_support.py +241 -0
  159. data_engine/ui/gui/rendering/__init__.py +12 -0
  160. data_engine/ui/gui/rendering/artifacts.py +95 -0
  161. data_engine/ui/gui/rendering/icons.py +50 -0
  162. data_engine/ui/gui/runtime.py +47 -0
  163. data_engine/ui/gui/state_support.py +193 -0
  164. data_engine/ui/gui/support.py +214 -0
  165. data_engine/ui/gui/surface.py +209 -0
  166. data_engine/ui/gui/theme.py +720 -0
  167. data_engine/ui/gui/widgets/__init__.py +34 -0
  168. data_engine/ui/gui/widgets/config.py +41 -0
  169. data_engine/ui/gui/widgets/logs.py +62 -0
  170. data_engine/ui/gui/widgets/panels.py +507 -0
  171. data_engine/ui/gui/widgets/sidebar.py +130 -0
  172. data_engine/ui/gui/widgets/steps.py +84 -0
  173. data_engine/ui/tui/__init__.py +5 -0
  174. data_engine/ui/tui/app.py +222 -0
  175. data_engine/ui/tui/bootstrap.py +475 -0
  176. data_engine/ui/tui/bootstrapper.py +117 -0
  177. data_engine/ui/tui/controllers/__init__.py +6 -0
  178. data_engine/ui/tui/controllers/flows.py +349 -0
  179. data_engine/ui/tui/controllers/runtime.py +167 -0
  180. data_engine/ui/tui/runtime.py +34 -0
  181. data_engine/ui/tui/state_support.py +141 -0
  182. data_engine/ui/tui/support.py +63 -0
  183. data_engine/ui/tui/theme.py +204 -0
  184. data_engine/ui/tui/widgets.py +123 -0
  185. data_engine/views/__init__.py +109 -0
  186. data_engine/views/actions.py +80 -0
  187. data_engine/views/artifacts.py +58 -0
  188. data_engine/views/flow_display.py +69 -0
  189. data_engine/views/logs.py +54 -0
  190. data_engine/views/models.py +96 -0
  191. data_engine/views/presentation.py +133 -0
  192. data_engine/views/runs.py +62 -0
  193. data_engine/views/state.py +39 -0
  194. data_engine/views/status.py +13 -0
  195. data_engine/views/text.py +109 -0
  196. py_data_engine-0.1.0.dist-info/METADATA +330 -0
  197. py_data_engine-0.1.0.dist-info/RECORD +200 -0
  198. py_data_engine-0.1.0.dist-info/WHEEL +5 -0
  199. py_data_engine-0.1.0.dist-info/entry_points.txt +2 -0
  200. py_data_engine-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,206 @@
1
+ # Flow Methods
2
+
3
+ This page covers the small author-facing `Flow` surface.
4
+
5
+ ```python
6
+ from data_engine import Flow
7
+ ```
8
+
9
+ ## `Flow(group)`
10
+
11
+ Create a new immutable flow definition.
12
+
13
+ ```python
14
+ flow = Flow(group="Claims")
15
+ ```
16
+
17
+ Rules:
18
+
19
+ - `group` must be a non-empty string
20
+ - the flow-module filename provides the flow identity
21
+ - the returned object is immutable, so each fluent call returns a new `Flow`
22
+
23
+ Immutability matters because it keeps authoring predictable. Each chained call produces a new flow definition rather than mutating hidden shared state.
24
+
25
+ ## `watch(...)`
26
+
27
+ Configure a runtime trigger for manual, poll, or schedule execution.
28
+
29
+ ```python
30
+ flow = flow.watch(
31
+ mode="poll",
32
+ source="../../example_data/Input/claims_flat",
33
+ interval="5s",
34
+ extensions=[".xlsx", ".xlsm"],
35
+ settle=1,
36
+ )
37
+ ```
38
+
39
+ ```python
40
+ flow = flow.watch(
41
+ mode="poll",
42
+ source="../../example_data/Settings/single_watch.xlsx",
43
+ interval="5s",
44
+ )
45
+ ```
46
+
47
+ ```python
48
+ flow = flow.watch(mode="schedule", run_as="batch", interval="15m")
49
+ flow = flow.watch(mode="schedule", run_as="batch", interval="15m", source="../../example_data/Input/claims_flat")
50
+ flow = flow.watch(mode="schedule", run_as="batch", time="10:31", source="../../example_data/Settings/single_watch.xlsx")
51
+ flow = flow.watch(mode="schedule", run_as="batch", time=["08:15", "14:45"])
52
+ flow = flow.watch(mode="manual")
53
+ ```
54
+
55
+ Rules:
56
+
57
+ - `mode` must be one of `manual`, `poll`, or `schedule`
58
+ - `run_as` defaults to `individual`
59
+ - `run_as="individual"` means one run per concrete source file
60
+ - `run_as="batch"` means a single run that treats the watched root as one unit
61
+ - poll flows require `source=` and `interval=`
62
+ - schedule flows accept exactly one of `interval=` or `time=`
63
+ - `time` accepts either one `HH:MM` string or a collection of `HH:MM` strings
64
+ - `extensions` and `settle` are poll-only options
65
+ - missing or invalid paths fail immediately; the flow recovers once the path becomes valid
66
+ - poll freshness compares the current source file signature against the runtime ledger
67
+
68
+ Practical guidance:
69
+
70
+ - use `manual` for explicit button-driven flows
71
+ - use `poll` when the source changing should be the trigger
72
+ - use `schedule` when time should be the trigger
73
+ - use `run_as="batch"` when the flow should reason about a folder or root as one unit
74
+ - use `run_as="individual"` when each source file should become its own run
75
+
76
+ `watch(...)` is where you describe orchestration intent, not transformation logic.
77
+
78
+ ## `mirror(root=...)`
79
+
80
+ Bind a mirrored output namespace rooted at one directory.
81
+
82
+ ```python
83
+ flow = flow.mirror(root="../../example_data/Output/example_mirror")
84
+ ```
85
+
86
+ `mirror(...)` does not write files. It defines the output namespace exposed later through `context.mirror`.
87
+
88
+ You can omit `mirror(...)` entirely if the flow has no need for a mirrored output namespace.
89
+
90
+ ## `step(fn, use=None, save_as=None, label=None)`
91
+
92
+ Add one generic callable step.
93
+
94
+ ```python
95
+ flow = flow.step(read_claims, save_as="raw_df")
96
+ flow = flow.step(clean_claims, use="raw_df", save_as="clean_df")
97
+ flow = flow.step(write_output, use="clean_df", label="Write Parquet")
98
+ ```
99
+
100
+ Rules:
101
+
102
+ - `fn` must be callable
103
+ - `fn` must accept exactly one `context` parameter
104
+ - `use=` selects a previously saved object
105
+ - `save_as=` stores the returned object
106
+ - `label=` overrides the UI display name
107
+
108
+ The return value always becomes `context.current`.
109
+
110
+ This is the default workhorse method. Flows are usually easiest to read when they consist mainly of `step(...)` calls, with an occasional `collect(...)` or `map(...)` where batching is truly needed.
111
+
112
+ ## `map(fn, use=None, save_as=None, label=None)`
113
+
114
+ Map one callable across the current batch.
115
+
116
+ ```python
117
+ flow = flow.collect([".pdf"])
118
+ flow = flow.map(validate_pdf)
119
+ flow = flow.map(validate_pdf_with_context, label="Validate Pdf")
120
+ ```
121
+
122
+ ```python
123
+ def validate_pdf(file_ref):
124
+ return {"name": file_ref.name, "ok": file_ref.exists()}
125
+
126
+
127
+ def validate_pdf_with_context(context, file_ref):
128
+ return {"flow": context.flow_name, "name": file_ref.name}
129
+ ```
130
+
131
+ Rules:
132
+
133
+ - `map()` expects the current value to be iterable
134
+ - `fn` may accept either `(item)` or `(context, item)`
135
+ - the mapped results are returned as a `Batch`
136
+ - `map()` raises when the current batch is empty
137
+ - `use=`, `save_as=`, and `label=` work the same way they do for `step()`
138
+
139
+ Reach for `map(...)` when the same callable should run once per collected item. If the callable should reason about the whole collection, switch back to a normal `step(...)`.
140
+
141
+ ## `step_each(fn, use=None, save_as=None, label=None)`
142
+
143
+ `step_each(...)` is an alias for `map(...)`.
144
+
145
+ Use whichever reads better in the flow module:
146
+
147
+ ```python
148
+ flow = flow.map(read_claims)
149
+ flow = flow.step_each(read_claims)
150
+ ```
151
+
152
+ ## `collect(extensions, root=None, recursive=False, use=None, save_as=None, label=None)`
153
+
154
+ Collect matching files into a `Batch` of `FileRef` items.
155
+
156
+ ```python
157
+ flow = flow.collect([".xlsx"])
158
+ flow = flow.collect([".pdf"], recursive=True)
159
+ ```
160
+
161
+ Behavior:
162
+
163
+ - uses `root=` when provided
164
+ - otherwise falls back to `context.source.root`
165
+ - returns a `Batch`, not a raw list
166
+ - each item exposes `.name`, `.path`, `.stem`, `.suffix`, and `.parent`
167
+
168
+ If `root=` is omitted, the runtime falls back to the current source root. That is often the cleanest choice for poll or scheduled batch flows already bound to a source.
169
+
170
+ ## `run_once()`
171
+
172
+ Run the flow one time and return the completed contexts.
173
+
174
+ Use this when you want a one-off Python-driven execution rather than continuous watching.
175
+
176
+ ## `run()`
177
+
178
+ Start continuous execution for watched poll or schedule flows.
179
+
180
+ This is the entrypoint behind long-lived runtime behavior.
181
+
182
+ ## `preview(use=None)`
183
+
184
+ Run one flow for notebook inspection and return a real object.
185
+
186
+ ```python
187
+ build().preview()
188
+ build().preview(use="raw_df").head(10)
189
+ build().preview(use="claim_frames")
190
+ ```
191
+
192
+ Behavior:
193
+
194
+ - without `use=`, returns the final `context.current`
195
+ - with `use="name"`, runs only until `save_as="name"` exists
196
+ - returns the real saved object, so dataframe methods like `.head(10)` work naturally
197
+ - avoids running later write/debug steps once the requested saved object is available
198
+ - if a poll flow would have several startup source files, preview uses the first deterministic source candidate for notebook inspection rather than trying to preview every file at once
199
+
200
+ `preview(...)` is especially useful while authoring notebook-backed flows because it lets you stop at a meaningful intermediate instead of running the whole flow to the final writer step every time.
201
+
202
+ ## `show()`
203
+
204
+ Preview the single current result from a one-off flow.
205
+
206
+ Use this for quick interactive inspection when the final current value itself is the thing you want to see.
@@ -0,0 +1,271 @@
1
+ # Getting Started
2
+
3
+ This guide is for someone new to the code-defined Data Engine API and desktop app.
4
+
5
+ By the end, you should understand:
6
+
7
+ - what a flow is
8
+ - where flow modules live
9
+ - what a workspace contains
10
+ - how discovery and runtime execution work at a high level
11
+ - how to run a first flow end to end
12
+ - how batch workflows fit into the model
13
+
14
+ ## The mental model
15
+
16
+ Data Engine has one source of truth for per-flow behavior: the `Flow` returned by `build()`.
17
+
18
+ In practice:
19
+
20
+ - the flow module defines the flow name, group, runtime mode, and ordered steps
21
+ - step functions do real work with native libraries such as Polars, DuckDB, and plain Python
22
+ - the desktop app discovers those flow modules inside the selected workspace and shows them as configurable runnable flows
23
+
24
+ The engine does not hide the real work behind a DSL. The fluent API owns orchestration, while the step callables own your actual business logic.
25
+
26
+ ## The basic workspace layout
27
+
28
+ A typical authored workspace looks like this:
29
+
30
+ ```text
31
+ workspaces/
32
+ example_workspace/
33
+ flow_modules/
34
+ flow_modules/flow_helpers/
35
+ config/
36
+ databases/
37
+ .workspace_state/
38
+ ```
39
+
40
+ The parts you will usually author directly are:
41
+
42
+ - `flow_modules/`: runnable flows in `.py` or `.ipynb`
43
+ - `flow_modules/flow_helpers/`: reusable helper modules imported from flows
44
+ - `config/`: workspace-local TOML files available through `context.config`
45
+ - `databases/`: a conventional home for workspace-local databases used through `context.database(...)`
46
+
47
+ The app can provision that shape for you without overwriting existing content.
48
+
49
+ ## Where flow module sources live
50
+
51
+ Flow module sources are authored in:
52
+
53
+ - `workspaces/<workspace_id>/flow_modules/<name>.ipynb`
54
+ - `workspaces/<workspace_id>/flow_modules/<name>.py`
55
+
56
+ Reusable helper modules live in:
57
+
58
+ - `workspaces/<workspace_id>/flow_modules/flow_helpers/<name>.py`
59
+
60
+ Compiled runtime modules are generated into machine-local artifacts rather than into the authored workspace itself.
61
+
62
+ Each flow module should export:
63
+
64
+ - optional `DESCRIPTION`
65
+ - `build() -> Flow`
66
+
67
+ Display titles come from `Flow(label=...)` when provided. Otherwise the UI derives them from the flow-module filename.
68
+
69
+ ## Your first flow
70
+
71
+ A minimal scheduled flow can create data in memory and write it out:
72
+
73
+ ```python
74
+ from data_engine import Flow
75
+ import polars as pl
76
+
77
+
78
+ def build_dates(context):
79
+ return pl.DataFrame({"day": [1, 2, 3]})
80
+
81
+
82
+ def write_dates(context):
83
+ output = context.mirror.file("dates.parquet")
84
+ context.current.write_parquet(output)
85
+ return output
86
+
87
+
88
+ def build():
89
+ return (
90
+ Flow(group="Reference")
91
+ .watch(mode="schedule", run_as="batch", interval="1h")
92
+ .mirror(root="../../example_data/Output/date_dimension")
93
+ .step(build_dates, save_as="dates_df")
94
+ .step(write_dates, use="dates_df", label="Write Parquet")
95
+ )
96
+ ```
97
+
98
+ That example shows the full shape:
99
+
100
+ 1. create `Flow(group=...)`
101
+ 2. attach a runtime mode with `watch(...)`
102
+ 3. optionally attach `mirror(...)`
103
+ 4. add ordered `step(...)` callables
104
+ 5. return the built flow from `build()`
105
+
106
+ The return value from each step becomes `context.current`, so later steps can keep operating on the current object or reach back to previously saved objects through `use=`.
107
+
108
+ ## What the app actually does with that flow
109
+
110
+ Once the flow is discovered, the desktop app uses it for:
111
+
112
+ - grouping and labels in the home view
113
+ - deciding whether the flow is manual, poll, or schedule
114
+ - deciding whether the flow participates in the engine
115
+ - rendering step names and inspectable outputs
116
+ - manual runs and engine runs for the selected workspace
117
+
118
+ The app itself binds to one workspace at a time, so when you switch workspaces, the discovered flows, runtime ledger, daemon state, and visible runs all switch with it.
119
+
120
+ ## A starter-style polling flow
121
+
122
+ This shape maps directly to starter flows such as `example_mirror` and `example_poll`:
123
+
124
+ ```python
125
+ from data_engine import Flow
126
+ import polars as pl
127
+
128
+
129
+ def read_claims(context):
130
+ return pl.read_excel(context.source.path)
131
+
132
+
133
+ def keep_open(context):
134
+ return context.current.filter(pl.col("status") == "OPEN")
135
+
136
+
137
+ def write_target(context):
138
+ output = context.mirror.with_suffix(".parquet")
139
+ context.current.write_parquet(output)
140
+ return output
141
+
142
+
143
+ def build():
144
+ return (
145
+ Flow(group="Claims")
146
+ .watch(
147
+ mode="poll",
148
+ source="../../example_data/Input/claims_dated",
149
+ interval="5s",
150
+ extensions=[".xlsx", ".xlsm"],
151
+ settle=1,
152
+ )
153
+ .mirror(root="../../example_data/Output/example_poll")
154
+ .step(read_claims, save_as="raw_df")
155
+ .step(keep_open, use="raw_df", save_as="filtered_df")
156
+ .step(write_target, use="filtered_df", label="Write Parquet")
157
+ )
158
+ ```
159
+
160
+ This is a good first mental model for source-driven flows:
161
+
162
+ - `watch(...)` tells the runtime what to listen to
163
+ - `context.source` tells the step which concrete file is active
164
+ - `mirror(...)` defines where mirrored outputs belong
165
+ - returning the written path makes the result inspectable in the UI
166
+
167
+ ## Batch-oriented files
168
+
169
+ When you want to treat a folder of files as one runtime object, use `Flow.collect(...)` together with either `Flow.map(...)` or `Flow.step_each(...)`.
170
+
171
+ ```python
172
+ from data_engine import Flow
173
+
174
+
175
+ def validate_pdf(file_ref):
176
+ return {"name": file_ref.name, "ok": file_ref.exists()}
177
+
178
+
179
+ def summarize_results(context):
180
+ return tuple(item["name"] for item in context.current if item["ok"])
181
+
182
+
183
+ def build():
184
+ return (
185
+ Flow(group="Claims")
186
+ .watch(mode="schedule", run_as="batch", interval="15m", source="../../example_data/Input/pdfs")
187
+ .collect([".pdf"], save_as="pdf_files")
188
+ .map(validate_pdf, use="pdf_files", save_as="pdf_results")
189
+ .step(summarize_results, use="pdf_results")
190
+ )
191
+ ```
192
+
193
+ `Flow.collect(...)` returns a `Batch` of `FileRef` items.
194
+
195
+ `Flow.map(...)` runs one callable per item and returns a new `Batch`.
196
+
197
+ `Flow.step_each(...)` is the same operation with a name that can read more clearly in some flows.
198
+
199
+ If the batch is empty, both forms raise immediately. That behavior is intentional so batch flows fail loudly instead of silently producing ambiguous "nothing happened" results.
200
+
201
+ ## Running flows from Python
202
+
203
+ Load one discovered flow:
204
+
205
+ ```python
206
+ from data_engine import load_flow
207
+
208
+ built = load_flow("example_poll")
209
+ results = built.run_once()
210
+ ```
211
+
212
+ Discover everything the workspace exposes:
213
+
214
+ ```python
215
+ from data_engine import discover_flows, run
216
+
217
+ flows = discover_flows()
218
+ run(*flows)
219
+ ```
220
+
221
+ Notebook-authored flows also support preview-oriented authoring:
222
+
223
+ ```python
224
+ build().preview()
225
+ build().preview(use="raw_df")
226
+ ```
227
+
228
+ That is often the fastest way to sanity-check a flow while you are still writing it.
229
+
230
+ For poll flows that watch a folder, `preview(...)` uses one deterministic startup source as a representative notebook preview rather than trying to run every discovered file.
231
+
232
+ ## Manual, poll, and schedule at a glance
233
+
234
+ ### Manual
235
+
236
+ - `watch(mode="manual")`
237
+ - `context.current` starts as `None`
238
+ - useful for ad hoc or UI-driven runs
239
+ - does not require a source binding
240
+
241
+ ### Poll
242
+
243
+ - `watch(mode="poll", ...)`
244
+ - watches either one file or a directory of source files
245
+ - the first step receives the active source through `context.source`
246
+ - freshness compares the current source file signature against the runtime ledger
247
+ - `extensions=` and `settle=` only apply here
248
+
249
+ ### Schedule
250
+
251
+ - `watch(mode="schedule", ...)`
252
+ - runs on an interval or on one or more wall-clock times
253
+ - supports one `time="HH:MM"` value or a collection of times
254
+ - often starts by building data in memory or loading from a known source root
255
+
256
+ ## A few good habits early
257
+
258
+ - keep import-time code side-effect free
259
+ - keep expensive work inside steps, not at module import
260
+ - return output paths from writer steps when you want the result to be inspectable through the UI `Inspect` action
261
+ - move reusable SQL, parsing helpers, and constants into `flow_modules/flow_helpers/`
262
+ - use `context.config` for workspace-local TOML configuration rather than inventing ad hoc config loading in every flow
263
+ - use `context.database(...)` when you want a conventional workspace-local database path
264
+
265
+ ## Next steps
266
+
267
+ - Read [Core Concepts](core-concepts.md)
268
+ - Read [Authoring Flow Modules](authoring-flow-modules.md)
269
+ - Read [Flow Methods](flow-methods.md)
270
+ - Read [FlowContext](flow-context.md)
271
+ - Read [App Runtime and Workspaces](app-runtime-and-workspaces.md)