goldenflow 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (147) hide show
  1. goldenflow-1.0.0/.clinerules +298 -0
  2. goldenflow-1.0.0/.cursorrules +298 -0
  3. goldenflow-1.0.0/.github/ISSUE_TEMPLATE/bug_report.yml +56 -0
  4. goldenflow-1.0.0/.github/ISSUE_TEMPLATE/config.yml +5 -0
  5. goldenflow-1.0.0/.github/ISSUE_TEMPLATE/feature_request.yml +38 -0
  6. goldenflow-1.0.0/.github/copilot-instructions.md +298 -0
  7. goldenflow-1.0.0/.github/workflows/pages.yml +44 -0
  8. goldenflow-1.0.0/.github/workflows/publish.yml +21 -0
  9. goldenflow-1.0.0/.github/workflows/test.yml +46 -0
  10. goldenflow-1.0.0/.gitignore +26 -0
  11. goldenflow-1.0.0/.windsurfrules +298 -0
  12. goldenflow-1.0.0/AGENTS.md +298 -0
  13. goldenflow-1.0.0/CHANGELOG.md +31 -0
  14. goldenflow-1.0.0/CLAUDE.md +306 -0
  15. goldenflow-1.0.0/CODE_OF_CONDUCT.md +36 -0
  16. goldenflow-1.0.0/CONTRIBUTING.md +233 -0
  17. goldenflow-1.0.0/Dockerfile.mcp +6 -0
  18. goldenflow-1.0.0/PKG-INFO +695 -0
  19. goldenflow-1.0.0/README.md +635 -0
  20. goldenflow-1.0.0/benchmarks/datasets/generate_datasets.py +523 -0
  21. goldenflow-1.0.0/benchmarks/datasets/goldenflow_bench/data.csv +5001 -0
  22. goldenflow-1.0.0/benchmarks/datasets/goldenflow_bench/ground_truth.json +211 -0
  23. goldenflow-1.0.0/benchmarks/quality_benchmark.py +424 -0
  24. goldenflow-1.0.0/benchmarks/schema_mapping_benchmark.py +256 -0
  25. goldenflow-1.0.0/benchmarks/speed_benchmark.py +146 -0
  26. goldenflow-1.0.0/docs/_config.yml +22 -0
  27. goldenflow-1.0.0/docs/index.md +30 -0
  28. goldenflow-1.0.0/docs/llms-full.txt +280 -0
  29. goldenflow-1.0.0/docs/llms.txt +18 -0
  30. goldenflow-1.0.0/docs/superpowers/plans/2026-03-25-goldenflow-implementation.md +4697 -0
  31. goldenflow-1.0.0/docs/superpowers/specs/2026-03-25-goldenflow-design.md +425 -0
  32. goldenflow-1.0.0/examples/README.md +13 -0
  33. goldenflow-1.0.0/examples/config_based.py +34 -0
  34. goldenflow-1.0.0/examples/domain_pack.py +36 -0
  35. goldenflow-1.0.0/examples/transform_basic.py +28 -0
  36. goldenflow-1.0.0/golden-suite.json +59 -0
  37. goldenflow-1.0.0/goldenflow/__init__.py +113 -0
  38. goldenflow-1.0.0/goldenflow/a2a/__init__.py +1 -0
  39. goldenflow-1.0.0/goldenflow/a2a/server.py +135 -0
  40. goldenflow-1.0.0/goldenflow/api/__init__.py +0 -0
  41. goldenflow-1.0.0/goldenflow/api/server.py +80 -0
  42. goldenflow-1.0.0/goldenflow/cli/__init__.py +0 -0
  43. goldenflow-1.0.0/goldenflow/cli/errors.py +32 -0
  44. goldenflow-1.0.0/goldenflow/cli/init_wizard.py +93 -0
  45. goldenflow-1.0.0/goldenflow/cli/main.py +426 -0
  46. goldenflow-1.0.0/goldenflow/cli/schedule.py +61 -0
  47. goldenflow-1.0.0/goldenflow/cli/watch.py +53 -0
  48. goldenflow-1.0.0/goldenflow/config/__init__.py +0 -0
  49. goldenflow-1.0.0/goldenflow/config/learner.py +27 -0
  50. goldenflow-1.0.0/goldenflow/config/loader.py +31 -0
  51. goldenflow-1.0.0/goldenflow/config/schema.py +44 -0
  52. goldenflow-1.0.0/goldenflow/connectors/__init__.py +3 -0
  53. goldenflow-1.0.0/goldenflow/connectors/database.py +21 -0
  54. goldenflow-1.0.0/goldenflow/connectors/file.py +55 -0
  55. goldenflow-1.0.0/goldenflow/connectors/gcs.py +67 -0
  56. goldenflow-1.0.0/goldenflow/connectors/s3.py +63 -0
  57. goldenflow-1.0.0/goldenflow/domains/__init__.py +23 -0
  58. goldenflow-1.0.0/goldenflow/domains/base.py +12 -0
  59. goldenflow-1.0.0/goldenflow/domains/ecommerce.py +27 -0
  60. goldenflow-1.0.0/goldenflow/domains/finance.py +37 -0
  61. goldenflow-1.0.0/goldenflow/domains/healthcare.py +50 -0
  62. goldenflow-1.0.0/goldenflow/domains/people_hr.py +64 -0
  63. goldenflow-1.0.0/goldenflow/domains/real_estate.py +27 -0
  64. goldenflow-1.0.0/goldenflow/engine/__init__.py +0 -0
  65. goldenflow-1.0.0/goldenflow/engine/differ.py +54 -0
  66. goldenflow-1.0.0/goldenflow/engine/manifest.py +74 -0
  67. goldenflow-1.0.0/goldenflow/engine/profiler_bridge.py +201 -0
  68. goldenflow-1.0.0/goldenflow/engine/selector.py +61 -0
  69. goldenflow-1.0.0/goldenflow/engine/transformer.py +275 -0
  70. goldenflow-1.0.0/goldenflow/history.py +57 -0
  71. goldenflow-1.0.0/goldenflow/llm/__init__.py +0 -0
  72. goldenflow-1.0.0/goldenflow/llm/corrector.py +102 -0
  73. goldenflow-1.0.0/goldenflow/mapping/__init__.py +0 -0
  74. goldenflow-1.0.0/goldenflow/mapping/name_similarity.py +44 -0
  75. goldenflow-1.0.0/goldenflow/mapping/profile_similarity.py +34 -0
  76. goldenflow-1.0.0/goldenflow/mapping/schema_mapper.py +90 -0
  77. goldenflow-1.0.0/goldenflow/mcp/__init__.py +0 -0
  78. goldenflow-1.0.0/goldenflow/mcp/server.py +321 -0
  79. goldenflow-1.0.0/goldenflow/notebook.py +104 -0
  80. goldenflow-1.0.0/goldenflow/py.typed +0 -0
  81. goldenflow-1.0.0/goldenflow/reporters/__init__.py +0 -0
  82. goldenflow-1.0.0/goldenflow/reporters/json_reporter.py +9 -0
  83. goldenflow-1.0.0/goldenflow/reporters/rich_console.py +73 -0
  84. goldenflow-1.0.0/goldenflow/streaming.py +61 -0
  85. goldenflow-1.0.0/goldenflow/transforms/__init__.py +64 -0
  86. goldenflow-1.0.0/goldenflow/transforms/address.py +134 -0
  87. goldenflow-1.0.0/goldenflow/transforms/auto_correct.py +110 -0
  88. goldenflow-1.0.0/goldenflow/transforms/categorical.py +121 -0
  89. goldenflow-1.0.0/goldenflow/transforms/dates.py +86 -0
  90. goldenflow-1.0.0/goldenflow/transforms/names.py +119 -0
  91. goldenflow-1.0.0/goldenflow/transforms/numeric.py +57 -0
  92. goldenflow-1.0.0/goldenflow/transforms/phone.py +74 -0
  93. goldenflow-1.0.0/goldenflow/transforms/text.py +87 -0
  94. goldenflow-1.0.0/goldenflow/tui/__init__.py +0 -0
  95. goldenflow-1.0.0/goldenflow/tui/app.py +463 -0
  96. goldenflow-1.0.0/llms.txt +17 -0
  97. goldenflow-1.0.0/pyproject.toml +61 -0
  98. goldenflow-1.0.0/server.json +28 -0
  99. goldenflow-1.0.0/smithery.yaml +26 -0
  100. goldenflow-1.0.0/tests/__init__.py +0 -0
  101. goldenflow-1.0.0/tests/api/__init__.py +0 -0
  102. goldenflow-1.0.0/tests/api/test_server.py +31 -0
  103. goldenflow-1.0.0/tests/cli/__init__.py +0 -0
  104. goldenflow-1.0.0/tests/cli/test_cli.py +47 -0
  105. goldenflow-1.0.0/tests/cli/test_cli_polish.py +22 -0
  106. goldenflow-1.0.0/tests/config/__init__.py +0 -0
  107. goldenflow-1.0.0/tests/config/test_loader.py +50 -0
  108. goldenflow-1.0.0/tests/config/test_schema.py +51 -0
  109. goldenflow-1.0.0/tests/conftest.py +46 -0
  110. goldenflow-1.0.0/tests/connectors/__init__.py +0 -0
  111. goldenflow-1.0.0/tests/connectors/test_cloud.py +48 -0
  112. goldenflow-1.0.0/tests/connectors/test_database.py +15 -0
  113. goldenflow-1.0.0/tests/connectors/test_file.py +51 -0
  114. goldenflow-1.0.0/tests/domains/__init__.py +0 -0
  115. goldenflow-1.0.0/tests/domains/test_all_domains.py +25 -0
  116. goldenflow-1.0.0/tests/domains/test_people_hr.py +25 -0
  117. goldenflow-1.0.0/tests/engine/__init__.py +0 -0
  118. goldenflow-1.0.0/tests/engine/test_differ.py +31 -0
  119. goldenflow-1.0.0/tests/engine/test_manifest.py +48 -0
  120. goldenflow-1.0.0/tests/engine/test_profiler_bridge.py +42 -0
  121. goldenflow-1.0.0/tests/engine/test_selector.py +74 -0
  122. goldenflow-1.0.0/tests/engine/test_transformer.py +110 -0
  123. goldenflow-1.0.0/tests/fixtures/messy.csv +7 -0
  124. goldenflow-1.0.0/tests/llm/__init__.py +0 -0
  125. goldenflow-1.0.0/tests/llm/test_corrector.py +69 -0
  126. goldenflow-1.0.0/tests/mapping/__init__.py +0 -0
  127. goldenflow-1.0.0/tests/mapping/test_schema_mapper.py +42 -0
  128. goldenflow-1.0.0/tests/mcp/__init__.py +0 -0
  129. goldenflow-1.0.0/tests/mcp/test_mcp.py +72 -0
  130. goldenflow-1.0.0/tests/test_a2a.py +74 -0
  131. goldenflow-1.0.0/tests/test_history.py +26 -0
  132. goldenflow-1.0.0/tests/test_integration.py +127 -0
  133. goldenflow-1.0.0/tests/test_notebook.py +11 -0
  134. goldenflow-1.0.0/tests/test_public_api.py +54 -0
  135. goldenflow-1.0.0/tests/test_streaming.py +35 -0
  136. goldenflow-1.0.0/tests/transforms/__init__.py +0 -0
  137. goldenflow-1.0.0/tests/transforms/test_address.py +60 -0
  138. goldenflow-1.0.0/tests/transforms/test_auto_correct.py +55 -0
  139. goldenflow-1.0.0/tests/transforms/test_categorical.py +59 -0
  140. goldenflow-1.0.0/tests/transforms/test_dates.py +47 -0
  141. goldenflow-1.0.0/tests/transforms/test_names.py +56 -0
  142. goldenflow-1.0.0/tests/transforms/test_numeric.py +32 -0
  143. goldenflow-1.0.0/tests/transforms/test_phone.py +34 -0
  144. goldenflow-1.0.0/tests/transforms/test_registry.py +68 -0
  145. goldenflow-1.0.0/tests/transforms/test_text.py +73 -0
  146. goldenflow-1.0.0/tests/tui/__init__.py +0 -0
  147. goldenflow-1.0.0/tests/tui/test_tui.py +71 -0
@@ -0,0 +1,298 @@
1
+ # GoldenFlow -- Cline Rules
2
+
3
+ Data transformation toolkit -- standardize, reshape, and normalize messy data. DQBench Transform Score: 100/100.
4
+
5
+ ## Related Projects
6
+
7
+ - **GoldenCheck:** `D:\show_case\goldencheck` -- Data validation.
8
+ - **GoldenMatch:** `D:\show_case\goldenmatch` -- Entity resolution.
9
+ - **GitHub:** `benzsevern/goldenflow`, `benzsevern/goldencheck`, `benzsevern/goldenmatch`
10
+
11
+ ## Branch & Merge SOP (all Golden Suite repos)
12
+
13
+ - Feature work goes on `feature/<name>` branches, never directly to main
14
+ - Merge via **squash merge PR** (watchers see PR activity, history stays clean)
15
+ - PR title format: `feat: <description>` or `fix: <description>`
16
+ - PR body: summary bullets + test plan
17
+ - Merge when: tests pass and docs are updated. Aim to merge within days, not weeks.
18
+ - After merge: delete remote branch
19
+
20
+ ## Environment
21
+
22
+ - Windows 11, bash shell (Git Bash)
23
+ - Python 3.12 at `C:\Users\bsevern\AppData\Local\Programs\Python\Python312\python.exe`
24
+ - Two GitHub accounts: `benzsevern` (personal) and `benzsevern-mjh` (work)
25
+ - MUST `gh auth switch --user benzsevern` before push, switch back to `benzsevern-mjh` after
26
+
27
+ ## Commands
28
+
29
+ ```bash
30
+ pip install -e ".[dev]" # Dev install
31
+ pip install -e ".[check]" # With GoldenCheck integration
32
+ pip install -e ".[mcp]" # With MCP server
33
+ pip install -e ".[all]" # Everything
34
+ pytest --tb=short -v # Run tests (158 passing)
35
+ ruff check . # Lint
36
+ ruff check . --fix # Auto-fix lint
37
+ ```
38
+
39
+ 14 CLI commands:
40
+ ```bash
41
+ goldenflow transform data.csv # Zero-config: auto-detect and fix
42
+ goldenflow transform data.csv -c goldenflow.yaml # Apply saved config
43
+ goldenflow transform data.csv --domain healthcare # Use a domain pack
44
+ goldenflow transform data.csv --strict # Fail on any transform error
45
+ goldenflow transform data.csv --llm # Enable LLM-enhanced transforms
46
+ goldenflow data.csv # Shorthand: auto-routes to transform
47
+ goldenflow map -s a.csv -t b.csv # Auto-map schemas between files
48
+ goldenflow learn data.csv -o config.yaml # Generate config from data patterns
49
+ goldenflow validate data.csv # Dry-run: show what would change
50
+ goldenflow diff before.csv after.csv # Compare pre/post transform
51
+ goldenflow profile data.csv # Show column profiles
52
+ goldenflow watch ./data/ # Auto-transform new/changed files
53
+ goldenflow schedule data.csv --every 1h # Run on a schedule
54
+ goldenflow stream large_file.csv # Stream-process in batches
55
+ goldenflow init data.csv # Interactive setup wizard
56
+ goldenflow demo # Generate sample data to try
57
+ goldenflow history # Show recent transform runs
58
+ goldenflow interactive data.csv # Launch TUI
59
+ goldenflow serve # REST API for real-time transforms
60
+ goldenflow mcp-serve # MCP server for Claude Desktop
61
+ ```
62
+
63
+ ## Architecture
64
+
65
+ ```
66
+ goldenflow/
67
+ ├── cli/ # Typer CLI (main.py -- all 14 commands; errors.py, init_wizard.py, watch.py, schedule.py)
68
+ ├── engine/ # TransformEngine, Manifest, profiler_bridge, selector, differ
69
+ ├── transforms/ # Transform library: text, phone, names, address, dates, categorical, numeric, auto_correct
70
+ ├── mapping/ # Schema mapping: name_similarity, profile_similarity, schema_mapper
71
+ ├── config/ # GoldenFlowConfig (Pydantic), YAML loader, config learner
72
+ ├── connectors/ # file.py (CSV/Excel/Parquet), database.py (connectorx), s3.py, gcs.py
73
+ ├── domains/ # Domain packs: base.py, people_hr.py, healthcare.py, finance.py, ecommerce.py, real_estate.py
74
+ ├── llm/ # LLM-assisted config correction (corrector.py) -- wired via --llm flag
75
+ ├── mcp/ # MCP server (server.py)
76
+ ├── reporters/ # rich_console.py, json_reporter.py
77
+ ├── tui/ # Textual TUI (app.py)
78
+ ├── streaming.py # StreamProcessor -- batch/incremental processing
79
+ ├── history.py # Run history tracking (~/.goldenflow/history/)
80
+ └── notebook.py # Jupyter _repr_html_ for TransformResult, Manifest, DatasetProfile
81
+ ```
82
+
83
+ ## Pipeline Flow
84
+
85
+ ```
86
+ read_file (connectors) -> profile_dataframe (profiler_bridge)
87
+ -> select_transforms (selector, by inferred type + auto_apply flag)
88
+ -> apply transforms (TransformEngine.transform_df)
89
+ -> record changes in Manifest
90
+ -> write output + manifest.json
91
+ -> save_run (history.py)
92
+ ```
93
+
94
+ Zero-config mode: `profile_dataframe` infers a type per column, `select_transforms` picks `auto_apply=True` transforms that match the type, sorted by priority descending.
95
+
96
+ ## Transform Registry
97
+
98
+ Transforms live in `goldenflow/transforms/` and self-register via decorator:
99
+
100
+ ```python
101
+ from goldenflow.transforms import register_transform
102
+
103
+ @register_transform(
104
+ name="phone_e164",
105
+ input_types=["phone"],
106
+ auto_apply=True,
107
+ priority=70,
108
+ mode="series",
109
+ )
110
+ def phone_e164(series: pl.Series) -> pl.Series:
111
+ ...
112
+ ```
113
+
114
+ All transform modules are imported in `goldenflow/__init__.py` at package load time -- that is the only registration mechanism. If you add a new module, add an import there.
115
+
116
+ ## Hybrid expr / series / dataframe Mode System
117
+
118
+ The `mode` field on `TransformInfo` controls how the engine applies a transform:
119
+
120
+ | mode | Input | Output | When to use |
121
+ |------|-------|--------|-------------|
122
+ | `"expr"` | `pl.Expr` | `pl.Expr` | Pure Polars operations (strip, lowercase). Stays in Rust; fastest. |
123
+ | `"series"` | `pl.Series` | `pl.Series` | Python logic per column (phone parsing, date parsing). Uses `map_batches` internally. |
124
+ | `"dataframe"` | `pl.DataFrame` | `pl.DataFrame` | Multi-column transforms (split_name, split_address). Receives and returns full frame. |
125
+
126
+ The engine in `engine/transformer.py` dispatches based on `TransformInfo.mode` -- do not add mode-specific logic anywhere else.
127
+
128
+ ## Streaming Module (streaming.py)
129
+
130
+ `StreamProcessor` wraps `TransformEngine` for incremental processing:
131
+ - `transform_one(record: dict)` -- single record, returns `TransformResult`
132
+ - `transform_batch(df: pl.DataFrame)` -- one batch
133
+ - `stream_file(path, chunk_size=10_000)` -- yields `TransformResult` per chunk
134
+ - `batches_processed` property -- count of batches completed
135
+
136
+ ## Cloud Connectors
137
+
138
+ - `connectors/s3.py` -- `read_s3(uri)` / `write_s3(df, uri)` using boto3
139
+ - `connectors/gcs.py` -- `read_gcs(uri)` / `write_gcs(df, uri)` using google-cloud-storage
140
+ - The file connector (`connectors/file.py`) detects `s3://` and `gs://` prefixes and delegates automatically.
141
+
142
+ ## History Module (history.py)
143
+
144
+ - Stores `RunRecord` JSON files in `~/.goldenflow/history/<run_id>.json`
145
+ - `save_run(record)` -- called by `TransformEngine.transform_file` after each run
146
+ - `list_runs(limit=20)` -- returns newest-first list of `RunRecord` objects
147
+ - `RunRecord` fields: `run_id`, `source`, `timestamp`, `rows`, `columns`, `transforms_applied`, `errors`, `duration_seconds`, `config_hash`, `manifest_path`
148
+
149
+ ## Notebook Module (notebook.py)
150
+
151
+ Monkey-patches `_repr_html_` onto three classes at import time:
152
+ - `TransformResult._repr_html_` -- summary table + transform list + DataFrame preview
153
+ - `Manifest._repr_html_` -- transform audit trail with before/after samples
154
+ - `DatasetProfile._repr_html_` -- column profile table
155
+
156
+ Imported in `goldenflow/__init__.py` as a side-effect import (no symbols exported).
157
+
158
+ ## LLM Corrector (llm/corrector.py)
159
+
160
+ Registers an additional transform that calls an LLM API for categorical correction. Activated by:
161
+ 1. Setting `GOLDENFLOW_LLM=1` environment variable
162
+ 2. Passing the `--llm` flag on the CLI (which both sets the env var and performs the import for you)
163
+
164
+ Requires `OPENAI_API_KEY` or `ANTHROPIC_API_KEY`. Gracefully skips if no key is found.
165
+
166
+ ## Domain Packs (All 5 Implemented)
167
+
168
+ Each domain pack lives in `goldenflow/domains/<name>.py` and subclasses `DomainPack` from `base.py`:
169
+
170
+ | Module | `load_domain()` key | Focus |
171
+ |--------|---------------------|-------|
172
+ | `people_hr.py` | `"people_hr"` | Names, SSNs, employment dates, gender |
173
+ | `healthcare.py` | `"healthcare"` | Patient IDs, diagnosis codes, clinical dates |
174
+ | `finance.py` | `"finance"` | Currency, account numbers, transaction dates |
175
+ | `ecommerce.py` | `"ecommerce"` | SKUs, prices, order dates, addresses |
176
+ | `real_estate.py` | `"real_estate"` | Property addresses, listing dates, prices |
177
+
178
+ `load_domain(name)` is exported from `goldenflow/domains/__init__.py` and returns the pack or `None`.
179
+
180
+ ## CLI Modules
181
+
182
+ - `cli/main.py` -- all 14 commands (Typer app)
183
+ - `cli/errors.py` -- `cli_error_handler()` context manager for friendly error messages
184
+ - `cli/init_wizard.py` -- `run_wizard()` interactive setup wizard
185
+ - `cli/watch.py` -- `watch_directory()` polling loop
186
+ - `cli/schedule.py` -- `run_schedule()` interval parser + loop
187
+
188
+ ## Key Patterns
189
+
190
+ - **All transforms use `@register_transform`** -- never add to `_REGISTRY` directly
191
+ - **`TransformResult`** is a dataclass with `.df` (clean Polars DataFrame) and `.manifest` (Manifest)
192
+ - **`Manifest`** tracks every `TransformRecord`: column, transform name, rows affected, before/after samples
193
+ - **Polars-native** -- all data ops use Polars, never pandas
194
+ - **`parse_transform_name("truncate:50")`** splits parameterized transform strings into `("truncate", ["50"])`
195
+ - **`select_from_findings`** in `engine/selector.py` maps GoldenCheck finding check names to transform names (the `--from-findings` CLI flag)
196
+
197
+ ## Config Schema (goldenflow.yaml)
198
+
199
+ ```yaml
200
+ source: customers.csv
201
+ output: customers_clean.csv
202
+
203
+ transforms:
204
+ - column: phone
205
+ ops: [phone_e164]
206
+
207
+ renames:
208
+ email_address: email
209
+
210
+ drop: [internal_id]
211
+
212
+ dedup:
213
+ columns: [email]
214
+ keep: first
215
+ ```
216
+
217
+ Config is a `GoldenFlowConfig` Pydantic model (`config/schema.py`). `config/learner.py` auto-generates it from data profiles.
218
+
219
+ ## Integration with GoldenCheck and GoldenMatch
220
+
221
+ GoldenFlow sits in the middle of the Golden Suite pipeline:
222
+
223
+ ```
224
+ Raw Data -> GoldenCheck (profile & discover quality issues)
225
+ -> GoldenFlow (fix issues, standardize, reshape)
226
+ -> GoldenMatch (deduplicate, match, create golden records)
227
+ -> Production
228
+ ```
229
+
230
+ **GoldenCheck integration** (`pip install goldenflow[check]`):
231
+ - `engine/profiler_bridge.py` calls GoldenCheck's scanner to get column profiles
232
+ - `engine/selector.py:select_from_findings()` maps GoldenCheck finding checks to transform names
233
+ - CLI flag `goldenflow transform data.csv --from-findings findings.json`
234
+
235
+ **GoldenMatch integration**:
236
+ - GoldenFlow's output (clean CSV + manifest) feeds directly into `goldenmatch dedupe`
237
+ - Schema mapping (`goldenflow map`) resolves column name mismatches before matching
238
+
239
+ **Pipeline shorthand**:
240
+ ```bash
241
+ goldencheck scan data.csv | goldenflow transform --from-findings | goldenmatch dedupe
242
+ ```
243
+
244
+ ## Testing
245
+
246
+ - TDD: tests first, then implementation
247
+ - 158 tests passing
248
+ - Fixtures: `tests/fixtures/` (CSV files are gitignored; add a `!tests/fixtures/*.csv` exception to `.gitignore` if needed)
249
+ - Convention: `tests/{module}/test_{file}.py`
250
+ - Integration tests: `tests/test_integration.py`, `tests/test_public_api.py`
251
+ - Commit messages: conventional commits (`feat:`, `fix:`, `test:`, `docs:`, `chore:`)
252
+
253
+ ## Environment / Auth
254
+
255
+ API keys for LLM testing live in `.testing/.env` (gitignored):
256
+ ```bash
257
+ source .testing/.env # loads OPENAI_API_KEY, ANTHROPIC_API_KEY, TWINE credentials
258
+ ```
259
+
260
+ GitHub auth on Windows (Credential Manager ignores `gh auth switch`):
261
+ ```bash
262
+ gh auth switch --user benzsevern
263
+ GIT_ASKPASS=$(which echo) git -c credential.helper="!gh auth git-credential" push origin main
264
+ gh auth switch --user benzsevern-mjh # switch back after
265
+ ```
266
+
267
+ ## Benchmarks
268
+
269
+ ```bash
270
+ pip install dqbench && dqbench run goldenflow # DQBench transform benchmark (100/100)
271
+ dqbench run all # Compare against other tools
272
+ ```
273
+
274
+ ## Publishing
275
+
276
+ ```bash
277
+ python -m build && source .testing/.env && python -m twine upload dist/*
278
+ ```
279
+
280
+ ## Gotchas
281
+
282
+ - `*.csv` is in `.gitignore` -- test fixtures need `!tests/fixtures/*.csv` exception
283
+ - `__version__` is defined ONLY in `goldenflow/__init__.py` -- don't add a second copy in `cli/main.py`
284
+ - Transform module imports in `__init__.py` are load-order sensitive -- a module that depends on another (e.g. `auto_correct` depends on `categorical`) must be imported after the module it depends on
285
+ - `mode="dataframe"` transforms receive the **entire** DataFrame and must return one with the same or more columns -- do not drop columns silently
286
+ - `category_auto_correct` is suppressed for high-cardinality columns (>10% unique values) by `selector.py` -- this is intentional
287
+ - Ruff line length: 100 chars
288
+ - `config/learner.py` generates a YAML config from profiles; `config/loader.py` reads it back -- keep the Pydantic schema in `config/schema.py` as the single source of truth
289
+ - Cloud connectors (s3.py, gcs.py) have optional dependencies -- `pip install goldenflow[s3]` or `pip install goldenflow[gcs]`; they raise `ImportError` with a helpful message if the dependency is missing
290
+ - `streaming.py` currently reads the full file before splitting it into batches -- for truly out-of-core processing, use Polars LazyFrame directly
291
+ - `history.py` stores runs in `~/.goldenflow/history/` -- this directory is created on first run and is not cleaned up automatically
292
+
293
+ ## Remote MCP Server
294
+ - Endpoint: https://goldenflow-mcp-production.up.railway.app/mcp/
295
+ - Smithery: https://smithery.ai/servers/benzsevern/goldenflow
296
+ - 10 tools, Streamable HTTP transport
297
+ - Dockerfile: Dockerfile.mcp
298
+ - Local HTTP: goldenflow mcp-serve --transport http --port 8150
@@ -0,0 +1,298 @@
1
+ # GoldenFlow -- Cursor Rules
2
+
3
+ Data transformation toolkit -- standardize, reshape, and normalize messy data. DQBench Transform Score: 100/100.
4
+
5
+ ## Related Projects
6
+
7
+ - **GoldenCheck:** `D:\show_case\goldencheck` -- Data validation.
8
+ - **GoldenMatch:** `D:\show_case\goldenmatch` -- Entity resolution.
9
+ - **GitHub:** `benzsevern/goldenflow`, `benzsevern/goldencheck`, `benzsevern/goldenmatch`
10
+
11
+ ## Branch & Merge SOP (all Golden Suite repos)
12
+
13
+ - Feature work goes on `feature/<name>` branches, never directly to main
14
+ - Merge via **squash merge PR** (watchers see PR activity, history stays clean)
15
+ - PR title format: `feat: <description>` or `fix: <description>`
16
+ - PR body: summary bullets + test plan
17
+ - Merge when: tests pass and docs are updated. Aim to merge within days, not weeks.
18
+ - After merge: delete remote branch
19
+
20
+ ## Environment
21
+
22
+ - Windows 11, bash shell (Git Bash)
23
+ - Python 3.12 at `C:\Users\bsevern\AppData\Local\Programs\Python\Python312\python.exe`
24
+ - Two GitHub accounts: `benzsevern` (personal) and `benzsevern-mjh` (work)
25
+ - MUST `gh auth switch --user benzsevern` before push, switch back to `benzsevern-mjh` after
26
+
27
+ ## Commands
28
+
29
+ ```bash
30
+ pip install -e ".[dev]" # Dev install
31
+ pip install -e ".[check]" # With GoldenCheck integration
32
+ pip install -e ".[mcp]" # With MCP server
33
+ pip install -e ".[all]" # Everything
34
+ pytest --tb=short -v # Run tests (158 passing)
35
+ ruff check . # Lint
36
+ ruff check . --fix # Auto-fix lint
37
+ ```
38
+
39
+ 14 CLI commands:
40
+ ```bash
41
+ goldenflow transform data.csv # Zero-config: auto-detect and fix
42
+ goldenflow transform data.csv -c goldenflow.yaml # Apply saved config
43
+ goldenflow transform data.csv --domain healthcare # Use a domain pack
44
+ goldenflow transform data.csv --strict # Fail on any transform error
45
+ goldenflow transform data.csv --llm # Enable LLM-enhanced transforms
46
+ goldenflow data.csv # Shorthand: auto-routes to transform
47
+ goldenflow map -s a.csv -t b.csv # Auto-map schemas between files
48
+ goldenflow learn data.csv -o config.yaml # Generate config from data patterns
49
+ goldenflow validate data.csv # Dry-run: show what would change
50
+ goldenflow diff before.csv after.csv # Compare pre/post transform
51
+ goldenflow profile data.csv # Show column profiles
52
+ goldenflow watch ./data/ # Auto-transform new/changed files
53
+ goldenflow schedule data.csv --every 1h # Run on a schedule
54
+ goldenflow stream large_file.csv # Stream-process in batches
55
+ goldenflow init data.csv # Interactive setup wizard
56
+ goldenflow demo # Generate sample data to try
57
+ goldenflow history # Show recent transform runs
58
+ goldenflow interactive data.csv # Launch TUI
59
+ goldenflow serve # REST API for real-time transforms
60
+ goldenflow mcp-serve # MCP server for Claude Desktop
61
+ ```
62
+
63
+ ## Architecture
64
+
65
+ ```
66
+ goldenflow/
67
+ ├── cli/ # Typer CLI (main.py -- all 14 commands; errors.py, init_wizard.py, watch.py, schedule.py)
68
+ ├── engine/ # TransformEngine, Manifest, profiler_bridge, selector, differ
69
+ ├── transforms/ # Transform library: text, phone, names, address, dates, categorical, numeric, auto_correct
70
+ ├── mapping/ # Schema mapping: name_similarity, profile_similarity, schema_mapper
71
+ ├── config/ # GoldenFlowConfig (Pydantic), YAML loader, config learner
72
+ ├── connectors/ # file.py (CSV/Excel/Parquet), database.py (connectorx), s3.py, gcs.py
73
+ ├── domains/ # Domain packs: base.py, people_hr.py, healthcare.py, finance.py, ecommerce.py, real_estate.py
74
+ ├── llm/ # LLM-assisted config correction (corrector.py) -- wired via --llm flag
75
+ ├── mcp/ # MCP server (server.py)
76
+ ├── reporters/ # rich_console.py, json_reporter.py
77
+ ├── tui/ # Textual TUI (app.py)
78
+ ├── streaming.py # StreamProcessor -- batch/incremental processing
79
+ ├── history.py # Run history tracking (~/.goldenflow/history/)
80
+ └── notebook.py # Jupyter _repr_html_ for TransformResult, Manifest, DatasetProfile
81
+ ```
82
+
83
+ ## Pipeline Flow
84
+
85
+ ```
86
+ read_file (connectors) -> profile_dataframe (profiler_bridge)
87
+ -> select_transforms (selector, by inferred type + auto_apply flag)
88
+ -> apply transforms (TransformEngine.transform_df)
89
+ -> record changes in Manifest
90
+ -> write output + manifest.json
91
+ -> save_run (history.py)
92
+ ```
93
+
94
+ Zero-config mode: `profile_dataframe` infers a type per column, `select_transforms` picks `auto_apply=True` transforms that match the type, sorted by priority descending.
95
+
96
+ ## Transform Registry
97
+
98
+ Transforms live in `goldenflow/transforms/` and self-register via decorator:
99
+
100
+ ```python
101
+ from goldenflow.transforms import register_transform
102
+
103
+ @register_transform(
104
+ name="phone_e164",
105
+ input_types=["phone"],
106
+ auto_apply=True,
107
+ priority=70,
108
+ mode="series",
109
+ )
110
+ def phone_e164(series: pl.Series) -> pl.Series:
111
+ ...
112
+ ```
113
+
114
+ All transform modules are imported in `goldenflow/__init__.py` at package load time -- that is the only registration mechanism. If you add a new module, add an import there.
115
+
116
+ ## Hybrid expr / series / dataframe Mode System
117
+
118
+ The `mode` field on `TransformInfo` controls how the engine applies a transform:
119
+
120
+ | mode | Input | Output | When to use |
121
+ |------|-------|--------|-------------|
122
+ | `"expr"` | `pl.Expr` | `pl.Expr` | Pure Polars operations (strip, lowercase). Stays in Rust; fastest. |
123
+ | `"series"` | `pl.Series` | `pl.Series` | Python logic per column (phone parsing, date parsing). Uses `map_batches` internally. |
124
+ | `"dataframe"` | `pl.DataFrame` | `pl.DataFrame` | Multi-column transforms (split_name, split_address). Receives and returns full frame. |
125
+
126
+ The engine in `engine/transformer.py` dispatches based on `TransformInfo.mode` -- do not add mode-specific logic anywhere else.
127
+
128
+ ## Streaming Module (streaming.py)
129
+
130
+ `StreamProcessor` wraps `TransformEngine` for incremental processing:
131
+ - `transform_one(record: dict)` -- single record, returns `TransformResult`
132
+ - `transform_batch(df: pl.DataFrame)` -- one batch
133
+ - `stream_file(path, chunk_size=10_000)` -- yields `TransformResult` per chunk
134
+ - `batches_processed` property -- count of batches completed
135
+
136
+ ## Cloud Connectors
137
+
138
+ - `connectors/s3.py` -- `read_s3(uri)` / `write_s3(df, uri)` using boto3
139
+ - `connectors/gcs.py` -- `read_gcs(uri)` / `write_gcs(df, uri)` using google-cloud-storage
140
+ - The file connector (`connectors/file.py`) detects `s3://` and `gs://` prefixes and delegates automatically.
141
+
142
+ ## History Module (history.py)
143
+
144
+ - Stores `RunRecord` JSON files in `~/.goldenflow/history/<run_id>.json`
145
+ - `save_run(record)` -- called by `TransformEngine.transform_file` after each run
146
+ - `list_runs(limit=20)` -- returns newest-first list of `RunRecord` objects
147
+ - `RunRecord` fields: `run_id`, `source`, `timestamp`, `rows`, `columns`, `transforms_applied`, `errors`, `duration_seconds`, `config_hash`, `manifest_path`
148
+
149
+ ## Notebook Module (notebook.py)
150
+
151
+ Monkey-patches `_repr_html_` onto three classes at import time:
152
+ - `TransformResult._repr_html_` -- summary table + transform list + DataFrame preview
153
+ - `Manifest._repr_html_` -- transform audit trail with before/after samples
154
+ - `DatasetProfile._repr_html_` -- column profile table
155
+
156
+ Imported in `goldenflow/__init__.py` as a side-effect import (no symbols exported).
157
+
158
+ ## LLM Corrector (llm/corrector.py)
159
+
160
+ Registers an additional transform that calls an LLM API for categorical correction. Activated by:
161
+ 1. Setting `GOLDENFLOW_LLM=1` environment variable
162
+ 2. Passing the `--llm` flag on the CLI (which both sets the env var and performs the import for you)
163
+
164
+ Requires `OPENAI_API_KEY` or `ANTHROPIC_API_KEY`. Gracefully skips if no key is found.
165
+
166
+ ## Domain Packs (All 5 Implemented)
167
+
168
+ Each domain pack lives in `goldenflow/domains/<name>.py` and subclasses `DomainPack` from `base.py`:
169
+
170
+ | Module | `load_domain()` key | Focus |
171
+ |--------|---------------------|-------|
172
+ | `people_hr.py` | `"people_hr"` | Names, SSNs, employment dates, gender |
173
+ | `healthcare.py` | `"healthcare"` | Patient IDs, diagnosis codes, clinical dates |
174
+ | `finance.py` | `"finance"` | Currency, account numbers, transaction dates |
175
+ | `ecommerce.py` | `"ecommerce"` | SKUs, prices, order dates, addresses |
176
+ | `real_estate.py` | `"real_estate"` | Property addresses, listing dates, prices |
177
+
178
+ `load_domain(name)` is exported from `goldenflow/domains/__init__.py` and returns the pack or `None`.
179
+
180
+ ## CLI Modules
181
+
182
+ - `cli/main.py` -- all 14 commands (Typer app)
183
+ - `cli/errors.py` -- `cli_error_handler()` context manager for friendly error messages
184
+ - `cli/init_wizard.py` -- `run_wizard()` interactive setup wizard
185
+ - `cli/watch.py` -- `watch_directory()` polling loop
186
+ - `cli/schedule.py` -- `run_schedule()` interval parser + loop
187
+
188
+ ## Key Patterns
189
+
190
+ - **All transforms use `@register_transform`** -- never add to `_REGISTRY` directly
191
+ - **`TransformResult`** is a dataclass with `.df` (clean Polars DataFrame) and `.manifest` (Manifest)
192
+ - **`Manifest`** tracks every `TransformRecord`: column, transform name, rows affected, before/after samples
193
+ - **Polars-native** -- all data ops use Polars, never pandas
194
+ - **`parse_transform_name("truncate:50")`** splits parameterized transform strings into `("truncate", ["50"])`
195
+ - **`select_from_findings`** in `engine/selector.py` maps GoldenCheck finding check names to transform names (the `--from-findings` CLI flag)
196
+
197
+ ## Config Schema (goldenflow.yaml)
198
+
199
+ ```yaml
200
+ source: customers.csv
201
+ output: customers_clean.csv
202
+
203
+ transforms:
204
+ - column: phone
205
+ ops: [phone_e164]
206
+
207
+ renames:
208
+ email_address: email
209
+
210
+ drop: [internal_id]
211
+
212
+ dedup:
213
+ columns: [email]
214
+ keep: first
215
+ ```
216
+
217
+ Config is a `GoldenFlowConfig` Pydantic model (`config/schema.py`). `config/learner.py` auto-generates it from data profiles.
218
+
219
+ ## Integration with GoldenCheck and GoldenMatch
220
+
221
+ GoldenFlow sits in the middle of the Golden Suite pipeline:
222
+
223
+ ```
224
+ Raw Data -> GoldenCheck (profile & discover quality issues)
225
+ -> GoldenFlow (fix issues, standardize, reshape)
226
+ -> GoldenMatch (deduplicate, match, create golden records)
227
+ -> Production
228
+ ```
229
+
230
+ **GoldenCheck integration** (`pip install goldenflow[check]`):
231
+ - `engine/profiler_bridge.py` calls GoldenCheck's scanner to get column profiles
232
+ - `engine/selector.py:select_from_findings()` maps GoldenCheck finding checks to transform names
233
+ - CLI flag `goldenflow transform data.csv --from-findings findings.json`
234
+
235
+ **GoldenMatch integration**:
236
+ - GoldenFlow's output (clean CSV + manifest) feeds directly into `goldenmatch dedupe`
237
+ - Schema mapping (`goldenflow map`) resolves column name mismatches before matching
238
+
239
+ **Pipeline shorthand**:
240
+ ```bash
241
+ goldencheck scan data.csv | goldenflow transform --from-findings | goldenmatch dedupe
242
+ ```
243
+
244
+ ## Testing
245
+
246
+ - TDD: tests first, then implementation
247
+ - 158 tests passing
248
+ - Fixtures: `tests/fixtures/` (CSV files gitignored; add `!tests/fixtures/*.csv` exception if needed)
249
+ - Convention: `tests/{module}/test_{file}.py`
250
+ - Integration tests: `tests/test_integration.py`, `tests/test_public_api.py`
251
+ - Commit messages: conventional commits (`feat:`, `fix:`, `test:`, `docs:`, `chore:`)
252
+
253
+ ## Environment / Auth
254
+
255
+ API keys for LLM testing live in `.testing/.env` (gitignored):
256
+ ```bash
257
+ source .testing/.env # loads OPENAI_API_KEY, ANTHROPIC_API_KEY, TWINE credentials
258
+ ```
259
+
260
+ GitHub auth on Windows (Credential Manager ignores `gh auth switch`):
261
+ ```bash
262
+ gh auth switch --user benzsevern
263
+ GIT_ASKPASS=$(which echo) git -c credential.helper="!gh auth git-credential" push origin main
264
+ gh auth switch --user benzsevern-mjh # switch back after
265
+ ```
266
+
267
+ ## Benchmarks
268
+
269
+ ```bash
270
+ pip install dqbench && dqbench run goldenflow # DQBench transform benchmark (100/100)
271
+ dqbench run all # Compare against other tools
272
+ ```
273
+
274
+ ## Publishing
275
+
276
+ ```bash
277
+ python -m build && source .testing/.env && python -m twine upload dist/*
278
+ ```
279
+
280
+ ## Gotchas
281
+
282
+ - `*.csv` is in `.gitignore` -- test fixtures need `!tests/fixtures/*.csv` exception
283
+ - `__version__` is defined ONLY in `goldenflow/__init__.py` -- don't add a second copy in `cli/main.py`
284
+ - Transform module imports in `__init__.py` are load-order sensitive -- modules that depend on others (e.g. `auto_correct` depends on `categorical`) must be imported after their dependencies
285
+ - `mode="dataframe"` transforms receive the **entire** DataFrame and must return one with the same or more columns -- do not drop columns silently
286
+ - `category_auto_correct` is suppressed for high-cardinality columns (>10% unique values) by `selector.py` -- this is intentional
287
+ - Ruff line length: 100 chars
288
+ - `config/learner.py` generates a YAML config from profiles; `config/loader.py` reads it back -- keep the Pydantic schema in `config/schema.py` as the single source of truth
289
+ - Cloud connectors (s3.py, gcs.py) have optional dependencies -- `pip install goldenflow[s3]` or `pip install goldenflow[gcs]`; they raise `ImportError` with a helpful message if the dependency is missing
290
+ - `streaming.py` currently reads the full file before batching -- for truly out-of-core processing, use Polars LazyFrame directly
291
+ - `history.py` stores runs in `~/.goldenflow/history/` -- this directory is created on first run and is not cleaned up automatically
292
+
293
+ ## Remote MCP Server
294
+ - Endpoint: https://goldenflow-mcp-production.up.railway.app/mcp/
295
+ - Smithery: https://smithery.ai/servers/benzsevern/goldenflow
296
+ - 10 tools, Streamable HTTP transport
297
+ - Dockerfile: Dockerfile.mcp
298
+ - Local HTTP: `goldenflow mcp-serve --transport http --port 8150`
@@ -0,0 +1,56 @@
1
+ name: Bug Report
2
+ description: Report something that isn't working correctly
3
+ labels: ["bug"]
4
+ body:
5
+ - type: markdown
6
+ attributes:
7
+ value: |
8
+ Thanks for reporting a bug! Please fill in the details below.
9
+ - type: input
10
+ id: version
11
+ attributes:
12
+ label: GoldenFlow version
13
+ description: "Run `goldenflow --version` or `pip show goldenflow`"
14
+ placeholder: "1.0.0"
15
+ validations:
16
+ required: true
17
+ - type: textarea
18
+ id: description
19
+ attributes:
20
+ label: What happened?
21
+ description: "Clear description of the bug"
22
+ validations:
23
+ required: true
24
+ - type: textarea
25
+ id: reproduce
26
+ attributes:
27
+ label: Steps to reproduce
28
+ description: "Minimal steps or config to reproduce the issue"
29
+ placeholder: |
30
+ 1. Run `goldenflow transform data.csv --domain healthcare`
31
+ 2. See error...
32
+ validations:
33
+ required: true
34
+ - type: textarea
35
+ id: config
36
+ attributes:
37
+ label: Config (if applicable)
38
+ description: "Paste your YAML config"
39
+ render: yaml
40
+ - type: textarea
41
+ id: error
42
+ attributes:
43
+ label: Error output
44
+ description: "Full error message or traceback"
45
+ render: shell
46
+ - type: dropdown
47
+ id: os
48
+ attributes:
49
+ label: Operating System
50
+ options:
51
+ - Windows
52
+ - macOS
53
+ - Linux
54
+ - Other
55
+ validations:
56
+ required: true
@@ -0,0 +1,5 @@
1
+ blank_issues_enabled: false
2
+ contact_links:
3
+ - name: Ask a Question
4
+ url: https://github.com/benzsevern/goldenflow/discussions
5
+ about: Get help from the community in Discussions