py-data-engine 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (200)
  1. data_engine/__init__.py +37 -0
  2. data_engine/application/__init__.py +39 -0
  3. data_engine/application/actions.py +42 -0
  4. data_engine/application/catalog.py +151 -0
  5. data_engine/application/control.py +213 -0
  6. data_engine/application/details.py +73 -0
  7. data_engine/application/runtime.py +449 -0
  8. data_engine/application/workspace.py +62 -0
  9. data_engine/authoring/__init__.py +14 -0
  10. data_engine/authoring/builder.py +31 -0
  11. data_engine/authoring/execution/__init__.py +6 -0
  12. data_engine/authoring/execution/app.py +6 -0
  13. data_engine/authoring/execution/context.py +82 -0
  14. data_engine/authoring/execution/continuous.py +176 -0
  15. data_engine/authoring/execution/grouped.py +106 -0
  16. data_engine/authoring/execution/logging.py +83 -0
  17. data_engine/authoring/execution/polling.py +135 -0
  18. data_engine/authoring/execution/runner.py +210 -0
  19. data_engine/authoring/execution/single.py +171 -0
  20. data_engine/authoring/flow.py +361 -0
  21. data_engine/authoring/helpers.py +160 -0
  22. data_engine/authoring/model.py +59 -0
  23. data_engine/authoring/primitives.py +430 -0
  24. data_engine/authoring/services.py +42 -0
  25. data_engine/devtools/__init__.py +3 -0
  26. data_engine/devtools/project_ast_map.py +503 -0
  27. data_engine/docs/__init__.py +1 -0
  28. data_engine/docs/sphinx_source/_static/custom.css +13 -0
  29. data_engine/docs/sphinx_source/api.rst +42 -0
  30. data_engine/docs/sphinx_source/conf.py +37 -0
  31. data_engine/docs/sphinx_source/guides/app-runtime-and-workspaces.md +397 -0
  32. data_engine/docs/sphinx_source/guides/authoring-flow-modules.md +215 -0
  33. data_engine/docs/sphinx_source/guides/configuring-flows.md +185 -0
  34. data_engine/docs/sphinx_source/guides/core-concepts.md +208 -0
  35. data_engine/docs/sphinx_source/guides/database-methods.md +107 -0
  36. data_engine/docs/sphinx_source/guides/duckdb-helpers.md +462 -0
  37. data_engine/docs/sphinx_source/guides/flow-context.md +538 -0
  38. data_engine/docs/sphinx_source/guides/flow-methods.md +206 -0
  39. data_engine/docs/sphinx_source/guides/getting-started.md +271 -0
  40. data_engine/docs/sphinx_source/guides/project-inventory.md +5683 -0
  41. data_engine/docs/sphinx_source/guides/project-map.md +118 -0
  42. data_engine/docs/sphinx_source/guides/recipes.md +268 -0
  43. data_engine/docs/sphinx_source/index.rst +22 -0
  44. data_engine/domain/__init__.py +92 -0
  45. data_engine/domain/actions.py +69 -0
  46. data_engine/domain/catalog.py +128 -0
  47. data_engine/domain/details.py +214 -0
  48. data_engine/domain/diagnostics.py +56 -0
  49. data_engine/domain/errors.py +104 -0
  50. data_engine/domain/inspection.py +99 -0
  51. data_engine/domain/logs.py +118 -0
  52. data_engine/domain/operations.py +172 -0
  53. data_engine/domain/operator.py +72 -0
  54. data_engine/domain/runs.py +155 -0
  55. data_engine/domain/runtime.py +279 -0
  56. data_engine/domain/source_state.py +17 -0
  57. data_engine/domain/support.py +54 -0
  58. data_engine/domain/time.py +23 -0
  59. data_engine/domain/workspace.py +159 -0
  60. data_engine/flow_modules/__init__.py +1 -0
  61. data_engine/flow_modules/flow_module_compiler.py +179 -0
  62. data_engine/flow_modules/flow_module_loader.py +201 -0
  63. data_engine/helpers/__init__.py +25 -0
  64. data_engine/helpers/duckdb.py +705 -0
  65. data_engine/hosts/__init__.py +1 -0
  66. data_engine/hosts/daemon/__init__.py +23 -0
  67. data_engine/hosts/daemon/app.py +221 -0
  68. data_engine/hosts/daemon/bootstrap.py +69 -0
  69. data_engine/hosts/daemon/client.py +465 -0
  70. data_engine/hosts/daemon/commands.py +64 -0
  71. data_engine/hosts/daemon/composition.py +310 -0
  72. data_engine/hosts/daemon/constants.py +15 -0
  73. data_engine/hosts/daemon/entrypoints.py +97 -0
  74. data_engine/hosts/daemon/lifecycle.py +191 -0
  75. data_engine/hosts/daemon/manager.py +272 -0
  76. data_engine/hosts/daemon/ownership.py +126 -0
  77. data_engine/hosts/daemon/runtime_commands.py +188 -0
  78. data_engine/hosts/daemon/runtime_control.py +31 -0
  79. data_engine/hosts/daemon/server.py +84 -0
  80. data_engine/hosts/daemon/shared_state.py +147 -0
  81. data_engine/hosts/daemon/state_sync.py +101 -0
  82. data_engine/platform/__init__.py +1 -0
  83. data_engine/platform/identity.py +35 -0
  84. data_engine/platform/local_settings.py +146 -0
  85. data_engine/platform/theme.py +259 -0
  86. data_engine/platform/workspace_models.py +190 -0
  87. data_engine/platform/workspace_policy.py +333 -0
  88. data_engine/runtime/__init__.py +1 -0
  89. data_engine/runtime/file_watch.py +185 -0
  90. data_engine/runtime/ledger_models.py +116 -0
  91. data_engine/runtime/runtime_db.py +938 -0
  92. data_engine/runtime/shared_state.py +523 -0
  93. data_engine/services/__init__.py +49 -0
  94. data_engine/services/daemon.py +64 -0
  95. data_engine/services/daemon_state.py +40 -0
  96. data_engine/services/flow_catalog.py +102 -0
  97. data_engine/services/flow_execution.py +48 -0
  98. data_engine/services/ledger.py +85 -0
  99. data_engine/services/logs.py +65 -0
  100. data_engine/services/runtime_binding.py +105 -0
  101. data_engine/services/runtime_execution.py +126 -0
  102. data_engine/services/runtime_history.py +62 -0
  103. data_engine/services/settings.py +58 -0
  104. data_engine/services/shared_state.py +28 -0
  105. data_engine/services/theme.py +59 -0
  106. data_engine/services/workspace_provisioning.py +224 -0
  107. data_engine/services/workspaces.py +74 -0
  108. data_engine/ui/__init__.py +3 -0
  109. data_engine/ui/cli/__init__.py +19 -0
  110. data_engine/ui/cli/app.py +161 -0
  111. data_engine/ui/cli/commands_doctor.py +178 -0
  112. data_engine/ui/cli/commands_run.py +80 -0
  113. data_engine/ui/cli/commands_start.py +100 -0
  114. data_engine/ui/cli/commands_workspace.py +97 -0
  115. data_engine/ui/cli/dependencies.py +44 -0
  116. data_engine/ui/cli/parser.py +56 -0
  117. data_engine/ui/gui/__init__.py +25 -0
  118. data_engine/ui/gui/app.py +116 -0
  119. data_engine/ui/gui/bootstrap.py +487 -0
  120. data_engine/ui/gui/bootstrapper.py +140 -0
  121. data_engine/ui/gui/cache_models.py +23 -0
  122. data_engine/ui/gui/control_support.py +185 -0
  123. data_engine/ui/gui/controllers/__init__.py +6 -0
  124. data_engine/ui/gui/controllers/flows.py +439 -0
  125. data_engine/ui/gui/controllers/runtime.py +245 -0
  126. data_engine/ui/gui/dialogs/__init__.py +12 -0
  127. data_engine/ui/gui/dialogs/messages.py +88 -0
  128. data_engine/ui/gui/dialogs/previews.py +222 -0
  129. data_engine/ui/gui/helpers/__init__.py +62 -0
  130. data_engine/ui/gui/helpers/inspection.py +81 -0
  131. data_engine/ui/gui/helpers/lifecycle.py +112 -0
  132. data_engine/ui/gui/helpers/scroll.py +28 -0
  133. data_engine/ui/gui/helpers/theming.py +87 -0
  134. data_engine/ui/gui/icons/dark_light.svg +12 -0
  135. data_engine/ui/gui/icons/documentation.svg +1 -0
  136. data_engine/ui/gui/icons/failed.svg +3 -0
  137. data_engine/ui/gui/icons/group.svg +4 -0
  138. data_engine/ui/gui/icons/home.svg +2 -0
  139. data_engine/ui/gui/icons/manual.svg +2 -0
  140. data_engine/ui/gui/icons/poll.svg +2 -0
  141. data_engine/ui/gui/icons/schedule.svg +4 -0
  142. data_engine/ui/gui/icons/settings.svg +2 -0
  143. data_engine/ui/gui/icons/started.svg +3 -0
  144. data_engine/ui/gui/icons/success.svg +3 -0
  145. data_engine/ui/gui/icons/view-log.svg +3 -0
  146. data_engine/ui/gui/icons.py +50 -0
  147. data_engine/ui/gui/launcher.py +48 -0
  148. data_engine/ui/gui/presenters/__init__.py +72 -0
  149. data_engine/ui/gui/presenters/docs.py +140 -0
  150. data_engine/ui/gui/presenters/logs.py +58 -0
  151. data_engine/ui/gui/presenters/runtime_projection.py +29 -0
  152. data_engine/ui/gui/presenters/sidebar.py +88 -0
  153. data_engine/ui/gui/presenters/steps.py +148 -0
  154. data_engine/ui/gui/presenters/workspace.py +39 -0
  155. data_engine/ui/gui/presenters/workspace_binding.py +75 -0
  156. data_engine/ui/gui/presenters/workspace_settings.py +182 -0
  157. data_engine/ui/gui/preview_models.py +37 -0
  158. data_engine/ui/gui/render_support.py +241 -0
  159. data_engine/ui/gui/rendering/__init__.py +12 -0
  160. data_engine/ui/gui/rendering/artifacts.py +95 -0
  161. data_engine/ui/gui/rendering/icons.py +50 -0
  162. data_engine/ui/gui/runtime.py +47 -0
  163. data_engine/ui/gui/state_support.py +193 -0
  164. data_engine/ui/gui/support.py +214 -0
  165. data_engine/ui/gui/surface.py +209 -0
  166. data_engine/ui/gui/theme.py +720 -0
  167. data_engine/ui/gui/widgets/__init__.py +34 -0
  168. data_engine/ui/gui/widgets/config.py +41 -0
  169. data_engine/ui/gui/widgets/logs.py +62 -0
  170. data_engine/ui/gui/widgets/panels.py +507 -0
  171. data_engine/ui/gui/widgets/sidebar.py +130 -0
  172. data_engine/ui/gui/widgets/steps.py +84 -0
  173. data_engine/ui/tui/__init__.py +5 -0
  174. data_engine/ui/tui/app.py +222 -0
  175. data_engine/ui/tui/bootstrap.py +475 -0
  176. data_engine/ui/tui/bootstrapper.py +117 -0
  177. data_engine/ui/tui/controllers/__init__.py +6 -0
  178. data_engine/ui/tui/controllers/flows.py +349 -0
  179. data_engine/ui/tui/controllers/runtime.py +167 -0
  180. data_engine/ui/tui/runtime.py +34 -0
  181. data_engine/ui/tui/state_support.py +141 -0
  182. data_engine/ui/tui/support.py +63 -0
  183. data_engine/ui/tui/theme.py +204 -0
  184. data_engine/ui/tui/widgets.py +123 -0
  185. data_engine/views/__init__.py +109 -0
  186. data_engine/views/actions.py +80 -0
  187. data_engine/views/artifacts.py +58 -0
  188. data_engine/views/flow_display.py +69 -0
  189. data_engine/views/logs.py +54 -0
  190. data_engine/views/models.py +96 -0
  191. data_engine/views/presentation.py +133 -0
  192. data_engine/views/runs.py +62 -0
  193. data_engine/views/state.py +39 -0
  194. data_engine/views/status.py +13 -0
  195. data_engine/views/text.py +109 -0
  196. py_data_engine-0.1.0.dist-info/METADATA +330 -0
  197. py_data_engine-0.1.0.dist-info/RECORD +200 -0
  198. py_data_engine-0.1.0.dist-info/WHEEL +5 -0
  199. py_data_engine-0.1.0.dist-info/entry_points.txt +2 -0
  200. py_data_engine-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,185 @@
1
+ # Configuring Flows
2
+
3
+ Per-flow configuration lives in the fluent `Flow` chain, not in TOML.
4
+
5
+ That is an important design choice:
6
+
7
+ - the runtime shape of a flow belongs in the authored `Flow(...)` definition
8
+ - workspace-local TOML in `config/` is for runtime parameters consumed by your step logic
9
+ - there is no separate "expand this flow into several configured variants" layer after `build()`
10
+
11
+ ## Core fields
12
+
13
+ ```python
14
+ Flow(group="Claims")
15
+ ```
16
+
17
+ `group` is author-defined. The flow-module filename provides the flow identity.
18
+
19
+ Use `group` to cluster related flows in the UI and runtime model. A good rule of thumb is that a group should mean "these flows belong to the same operator-facing area of work."
20
+
21
+ ## Watching
22
+
23
+ Single-file polling:
24
+
25
+ ```python
26
+ Flow(group="Settings").watch(
27
+ mode="poll",
28
+ source="../../example_data/Settings/single_watch.xlsx",
29
+ interval="5s",
30
+ ).mirror(
31
+ root="../../example_data/Output/example_single_watch",
32
+ )
33
+ ```
34
+
35
+ Directory polling:
36
+
37
+ ```python
38
+ Flow(group="Claims").watch(
39
+ mode="poll",
40
+ source="../../example_data/Input/claims_flat",
41
+ interval="5s",
42
+ extensions=[".xlsx", ".xls", ".xlsm"],
43
+ settle=1,
44
+ ).mirror(
45
+ root="../../example_data/Output/example_mirror",
46
+ )
47
+ ```
48
+
49
+ Scheduled batch runs:
50
+
51
+ ```python
52
+ Flow(group="Analytics").watch(
53
+ mode="schedule",
54
+ run_as="batch",
55
+ interval="15m",
56
+ source="../../example_data/Input/claims_flat",
57
+ ).mirror(
58
+ root="../../example_data/Output/example_summary",
59
+ )
60
+
61
+ Flow(group="Settings").watch(
62
+ mode="schedule",
63
+ run_as="batch",
64
+ time="10:31",
65
+ source="../../example_data/Settings/single_watch.xlsx",
66
+ ).mirror(
67
+ root="../../example_data/Output/example_schedule",
68
+ )
69
+
70
+ Flow(group="Settings").watch(
71
+ mode="schedule",
72
+ run_as="batch",
73
+ time=["08:15", "14:45"],
74
+ source="../../example_data/Settings/single_watch.xlsx",
75
+ )
76
+ ```
77
+
78
+ What watching owns:
79
+
80
+ - source selection
81
+ - ledger-backed source freshness tracking
82
+ - extension filtering for poll directory sources
83
+ - settle/debounce behavior for poll flows
84
+ - whether runtime executes per file or as one root-level batch via `run_as=`
85
+
86
+ What watching does not own:
87
+
88
+ - dataframe reads
89
+ - dataframe transforms
90
+ - database work
91
+ - output writing
92
+
93
+ That separation is what keeps `watch(...)` readable. It tells the engine when and why to run, not how to do the underlying data work.
94
+
95
+ `watch(mode="schedule", ...)` requires exactly one scheduling argument — either `interval=` or `time=` — in one of these forms:
96
+
97
+ - `interval="10m"`
98
+ - `time="HH:MM"`
99
+ - `time=["08:15", "14:45"]`
100
+
101
+ It may also bind an optional `source=...` path for recurring jobs.
102
+
103
+ ### `run_as`
104
+
105
+ `run_as` controls what the runtime treats as one unit of work.
106
+
107
+ Common values are:
108
+
109
+ - `run_as="individual"`: one run per concrete source file
110
+ - `run_as="batch"`: one run at the watched root
111
+
112
+ Use `individual` when each source file should be processed independently.
113
+
114
+ Use `batch` when the flow should reason about the watched source as one collection, such as "all current workbooks in this folder."
115
+
116
+ ### Poll-specific options
117
+
118
+ `extensions=` limits which files in a polled directory participate in freshness checks and execution.
119
+
120
+ `settle=` adds debounce behavior so the engine does not immediately react to a file that is still being written by another process.
121
+
122
+ ## Mirror bindings
123
+
124
+ Use `mirror(root=...)` when a flow needs source-relative output routing.
125
+
126
+ Inside steps:
127
+
128
+ ```python
129
+ context.mirror.with_suffix(".parquet")
130
+ context.mirror.file("summary.json")
131
+ context.mirror.namespaced_file("open_claims.parquet")
132
+ context.mirror.root_file("analytics.duckdb")
133
+ ```
134
+
135
+ `mirror(...)` does not write files. It only defines the output namespace available at runtime.
136
+
137
+ If a flow has no natural mirrored outputs, you do not need `mirror(...)`.
138
+
139
+ If a flow writes several related outputs, `mirror(...)` is usually the cleanest way to keep them organized without scattering path math through your steps.
140
+
141
+ ## Batch workflows
142
+
143
+ Use `collect(...)` and `map(...)` or `step_each(...)` together for folder-oriented processing:
144
+
145
+ ```python
146
+ Flow(group="Analytics") \
147
+ .watch(mode="schedule", run_as="batch", interval="15m", source="../../example_data/Input/claims_flat") \
148
+ .collect([".xlsx"], save_as="claim_files") \
149
+ .map(read_claims, use="claim_files", save_as="claim_frames")
150
+ ```
151
+
152
+ `map(...)` is the per-item stage in that pipeline, and `step_each(...)` is the equivalent alias. Both raise immediately when the batch is empty.
153
+
154
+ This is the standard batch shape:
155
+
156
+ 1. watch a directory or scheduled source root
157
+ 2. collect matching files into a `Batch`
158
+ 3. map one callable across each file
159
+ 4. switch back to `step(...)` once you want to reason about the combined result
160
+
161
+ ## Configuring step labels and saved objects
162
+
163
+ Flow configuration also includes the names and labels you assign in the chain.
164
+
165
+ Examples:
166
+
167
+ ```python
168
+ (
169
+ Flow(group="Claims")
170
+ .step(read_claims, save_as="raw_df")
171
+ .step(clean_claims, use="raw_df", save_as="clean_df")
172
+ .step(write_output, use="clean_df", label="Write Parquet")
173
+ )
174
+ ```
175
+
176
+ Those fields affect the authoring experience directly:
177
+
178
+ - `save_as=` creates stable names for later steps and notebook previews
179
+ - `use=` loads one of those saved names into `context.current`
180
+ - `label=` controls the display name in the UI
181
+
182
+ If you are deciding where a piece of information belongs:
183
+
184
+ - if it shapes orchestration, put it in the `Flow` chain
185
+ - if it shapes step logic, put it in your code or in `context.config`
@@ -0,0 +1,208 @@
1
+ # Core Concepts
2
+
3
+ ## Flow
4
+
5
+ A `Flow` is an immutable definition with:
6
+
7
+ - `group`
8
+ - an optional trigger via `watch(...)`
9
+ - an optional mirrored output binding via `mirror(...)`
10
+ - ordered generic steps
11
+
12
+ ```python
13
+ from data_engine import Flow
14
+
15
+ flow = Flow(group="Claims")
16
+ ```
17
+
18
+ The flow-module filename is the flow identity used for discovery and runtime bookkeeping. `group` is the author-controlled grouping visible in the UI.
19
+
20
+ ## Runtime modes
21
+
22
+ Manual:
23
+
24
+ - no trigger configured
25
+ - `run_once()` executes the steps once with `context.current = None`
26
+ - useful for button-driven operator runs or preview-oriented flows
27
+
28
+ Poll:
29
+
30
+ - source-driven execution over either one file or a directory of files
31
+ - the runtime compares the current source file signature against the persisted runtime ledger
32
+ - the first step sees the active input through `context.source`
33
+ - startup backlog handling is based on persisted ledger state for each source version
34
+ - intermediate saved objects do not participate in staleness checks
35
+
36
+ Schedule:
37
+
38
+ - interval-driven via `watch(mode="schedule", interval="15m")`
39
+ - or wall-clock via `watch(mode="schedule", time="10:31")`
40
+ - `time` may also be a collection such as `["08:15", "14:45"]`
41
+ - may optionally bind a `source=...` path for recurring jobs
42
+
43
+ The distinction between poll and schedule is important:
44
+
45
+ - poll is source freshness driven
46
+ - schedule is time driven
47
+
48
+ You can combine scheduled execution with a source binding when the flow should run on a schedule but still read from a known source root.
49
+
50
+ ## Step
51
+
52
+ Each `step(...)` is one callable:
53
+
54
+ ```python
55
+ def step(context) -> object:
56
+ ...
57
+ ```
58
+
59
+ The return value always becomes `context.current`.
60
+
61
+ This is the main design boundary:
62
+
63
+ - the fluent API orchestrates runtime behavior
64
+ - native libraries perform the actual data and file work
65
+
66
+ That means Data Engine is intentionally not trying to replace Polars, DuckDB, pathlib, or your Python helper code. It coordinates them.
67
+
68
+ ## Saved objects
69
+
70
+ Steps can save and reuse values:
71
+
72
+ ```python
73
+ (
74
+ Flow(group="Docs")
75
+ .step(read_claims, save_as="raw_df")
76
+ .step(clean_claims, use="raw_df", save_as="clean_df")
77
+ .step(write_output, use="clean_df")
78
+ )
79
+ ```
80
+
81
+ - `use="name"` loads `context.objects["name"]` into `context.current`
82
+ - `save_as="name"` stores the returned value into `context.objects["name"]`
83
+
84
+ In notebooks, those saved names are also the easiest way to inspect intermediates:
85
+
86
+ ```python
87
+ build().preview(use="clean_df").head(10)
88
+ ```
89
+
90
+ This is one of the most useful parts of the authoring model:
91
+
92
+ - `current` gives you the current object in the pipeline
93
+ - `objects` gives you stable named waypoints
94
+
95
+ That makes it easy to structure flows around a few explicit intermediate states rather than one long opaque chain.
96
+
97
+ ## Batch mapping
98
+
99
+ `collect(...)` and `map(...)` are the batch-oriented authoring tools.
100
+
101
+ ```python
102
+ def read_claims(file_ref):
103
+ return pl.read_excel(file_ref.path)
104
+
105
+
106
+ def combine_claims(context):
107
+ return pl.concat(context.current, how="vertical_relaxed")
108
+
109
+
110
+ flow = (
111
+ Flow(group="Analytics")
112
+ .watch(mode="schedule", run_as="batch", interval="15m", source="../../example_data/Input/claims_flat")
113
+ .collect([".xlsx"], save_as="claim_files")
114
+ .map(read_claims, use="claim_files", save_as="claim_frames")
115
+ .step(combine_claims, use="claim_frames")
116
+ )
117
+ ```
118
+
119
+ Use `map(...)` when the same callable should run once per batch item instead of once per whole flow. `map(...)` raises immediately when the batch is empty.
120
+
121
+ Batch mapping is especially useful when you want to:
122
+
123
+ - read many files into many dataframes
124
+ - validate one file at a time
125
+ - emit one lightweight record per source item before combining
126
+
127
+ Use a normal `step(...)` when the callable should reason about the batch as a whole.
128
+
129
+ ## Source and mirror namespaces
130
+
131
+ The runtime exposes two structured path namespaces:
132
+
133
+ - `context.source`
134
+ - `context.mirror`
135
+
136
+ Examples:
137
+
138
+ ```python
139
+ context.source.path
140
+ context.source.with_extension(".json")
141
+ context.source.with_suffix(".json")
142
+ context.source.file("notes.json")
143
+ context.source.namespaced_file("notes.json")
144
+ context.source.root_file("lookup.csv")
145
+
146
+ context.mirror.with_extension(".parquet")
147
+ context.mirror.with_suffix(".parquet")
148
+ context.mirror.file("open_claims.parquet")
149
+ context.mirror.namespaced_file("open_claims.parquet")
150
+ context.mirror.root_file("analytics.duckdb")
151
+ ```
152
+
153
+ `context.source` resolves read-side paths. `context.mirror` resolves write-ready output paths.
154
+
155
+ The important difference is:
156
+
157
+ - `source` is about where the active input lives
158
+ - `mirror` is about where outputs for that input should go
159
+
160
+ That lets you keep path logic readable and source-aware without hand-building relative paths in every step.
161
+
162
+ Examples of common patterns:
163
+
164
+ - read a sidecar file beside the current source with `context.source.file("notes.json")`
165
+ - write one mirrored parquet beside the source shape with `context.mirror.with_suffix(".parquet")`
166
+ - write multiple outputs for the same source with `context.mirror.namespaced_file(...)`
167
+ - write a stable root-level artifact such as a snapshot or DuckDB file with `context.mirror.root_file(...)`
168
+
169
+ ## Discovery
170
+
171
+ The desktop UI and Python entrypoints discover flows from compiled flow modules.
172
+
173
+ Each discovered flow module contributes:
174
+
175
+ - a module name
176
+ - optional `DESCRIPTION`
177
+ - `build() -> Flow`
178
+
179
+ The flow-module filename/module name is the flow identity surfaced in discovery and execution. The UI uses `Flow.label` when present, otherwise it derives a readable title from that internal name.
180
+
181
+ That discovered `Flow` object is what the UI inspects for:
182
+
183
+ - grouping
184
+ - step labels
185
+ - runtime mode
186
+ - source and mirror bindings
187
+
188
+ The app does not maintain a second hidden config layer that mutates flow behavior after discovery. The authored `Flow` is the real contract the runtime and UI are looking at.
189
+
190
+ ## Workspaces
191
+
192
+ Flows do not exist in isolation. They are discovered from the currently selected authored workspace.
193
+
194
+ An authored workspace typically contains:
195
+
196
+ - `flow_modules/`
197
+ - `flow_modules/flow_helpers/`
198
+ - `config/`
199
+ - `databases/`
200
+
201
+ The desktop app binds to one workspace at a time. When the selected workspace changes, the app reloads:
202
+
203
+ - discovered flows
204
+ - local runtime state
205
+ - daemon control state
206
+ - visible runs and logs
207
+
208
+ For the control and state model behind that, see [App Runtime and Workspaces](app-runtime-and-workspaces.md).
@@ -0,0 +1,107 @@
1
+ # Database Methods
2
+
3
+ There is no first-class database sub-chain. Use DuckDB directly inside step callables, usually with a workspace-local database path from `context.database(...)`.
4
+
5
+ If you want common warehouse-style shortcuts, see [DuckDB Helpers](duckdb-helpers.md). That helper layer covers several repeated patterns without taking over general SQL authoring.
6
+
7
+ The absence of a database sub-chain is intentional. The core API deliberately avoids hiding connection ownership, transactions, or query semantics behind a special fluent DSL.
8
+
9
+ In practice, that means:
10
+
11
+ - Data Engine gives you a conventional path
12
+ - your step opens and closes the database connection
13
+ - normal DuckDB and Python rules apply
14
+
15
+ ## `context.database(...)`
16
+
17
+ `context.database(name)` returns a path beneath the current authored workspace's `databases/` folder.
18
+
19
+ Examples:
20
+
21
+ ```python
22
+ context.database("analytics.duckdb")
23
+ context.database("claims/analytics.duckdb")
24
+ ```
25
+
26
+ Those resolve to:
27
+
28
+ - `workspaces/<workspace_id>/databases/analytics.duckdb`
29
+ - `workspaces/<workspace_id>/databases/claims/analytics.duckdb`
30
+
31
+ Rules:
32
+
33
+ - the path must be relative
34
+ - parent directories are created automatically
35
+ - the helper is only available for authored workspace flows
36
+ - it does not create a connection for you
37
+
38
+ That last point is important. Returning the path avoids hidden connection lifetime issues and keeps the behavior obvious.
39
+
40
+ ## Example
41
+
42
+ ```python
43
+ import duckdb
44
+ import polars as pl
45
+
46
+ from data_engine import Flow
47
+
48
+
49
+ def read_claims(file_ref):
50
+ return pl.read_excel(file_ref.path)
51
+
52
+
53
+ def build_source(context):
54
+ return pl.concat(context.current, how="vertical_relaxed")
55
+
56
+
57
+ def summarize(context):
58
+ conn = duckdb.connect(context.database("analytics.duckdb"))
59
+ try:
60
+ conn.register("input", context.current)
61
+ return conn.sql(
62
+ """
63
+ select workflow, count(*) as row_count
64
+ from input
65
+ group by workflow
66
+ """
67
+ ).pl()
68
+ finally:
69
+ conn.close()
70
+
71
+
72
+ def build():
73
+ return (
74
+ Flow(group="Analytics")
75
+ .watch(
76
+ mode="schedule",
77
+ run_as="batch",
78
+ interval="15m",
79
+ source="../../example_data/Input/claims_flat",
80
+ )
81
+ .mirror(root="../../example_data/Output/example_summary")
82
+ .collect([".xlsx"], save_as="claim_files")
83
+ .map(read_claims, use="claim_files", save_as="claim_frames")
84
+ .step(build_source, use="claim_frames", save_as="raw_df")
85
+ .step(summarize, use="raw_df", save_as="summary_df")
86
+ )
87
+ ```
88
+
89
+ This keeps the flow API small while still letting flow modules use native SQL and native DuckDB connections.
90
+
91
+ ## Good patterns
92
+
93
+ - open the connection inside the step that needs it
94
+ - close the connection in `finally:`
95
+ - keep the path stable when you want incremental or append-oriented databases
96
+ - use subfolders such as `claims/analytics.duckdb` when one workspace owns several related databases
97
+
98
+ ## A note on mirror vs database paths
99
+
100
+ If the database is a durable workspace-local asset, prefer `context.database(...)`.
101
+
102
+ If the database is really just another output artifact produced by one mirrored source flow, `context.mirror.root_file("analytics.duckdb")` can still be appropriate.
103
+
104
+ The difference is mostly semantic:
105
+
106
+ - `context.database(...)` says "this belongs to the workspace as a local database"
107
+ - `context.mirror...` says "this belongs to this flow's output namespace"