PyPI - goldenflow - Versions diffs - 1.0.0__tar.gz - Mend

goldenflow 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (147) hide show

goldenflow-1.0.0/.clinerules +298 -0
goldenflow-1.0.0/.cursorrules +298 -0
goldenflow-1.0.0/.github/ISSUE_TEMPLATE/bug_report.yml +56 -0
goldenflow-1.0.0/.github/ISSUE_TEMPLATE/config.yml +5 -0
goldenflow-1.0.0/.github/ISSUE_TEMPLATE/feature_request.yml +38 -0
goldenflow-1.0.0/.github/copilot-instructions.md +298 -0
goldenflow-1.0.0/.github/workflows/pages.yml +44 -0
goldenflow-1.0.0/.github/workflows/publish.yml +21 -0
goldenflow-1.0.0/.github/workflows/test.yml +46 -0
goldenflow-1.0.0/.gitignore +26 -0
goldenflow-1.0.0/.windsurfrules +298 -0
goldenflow-1.0.0/AGENTS.md +298 -0
goldenflow-1.0.0/CHANGELOG.md +31 -0
goldenflow-1.0.0/CLAUDE.md +306 -0
goldenflow-1.0.0/CODE_OF_CONDUCT.md +36 -0
goldenflow-1.0.0/CONTRIBUTING.md +233 -0
goldenflow-1.0.0/Dockerfile.mcp +6 -0
goldenflow-1.0.0/PKG-INFO +695 -0
goldenflow-1.0.0/README.md +635 -0
goldenflow-1.0.0/benchmarks/datasets/generate_datasets.py +523 -0
goldenflow-1.0.0/benchmarks/datasets/goldenflow_bench/data.csv +5001 -0
goldenflow-1.0.0/benchmarks/datasets/goldenflow_bench/ground_truth.json +211 -0
goldenflow-1.0.0/benchmarks/quality_benchmark.py +424 -0
goldenflow-1.0.0/benchmarks/schema_mapping_benchmark.py +256 -0
goldenflow-1.0.0/benchmarks/speed_benchmark.py +146 -0
goldenflow-1.0.0/docs/_config.yml +22 -0
goldenflow-1.0.0/docs/index.md +30 -0
goldenflow-1.0.0/docs/llms-full.txt +280 -0
goldenflow-1.0.0/docs/llms.txt +18 -0
goldenflow-1.0.0/docs/superpowers/plans/2026-03-25-goldenflow-implementation.md +4697 -0
goldenflow-1.0.0/docs/superpowers/specs/2026-03-25-goldenflow-design.md +425 -0
goldenflow-1.0.0/examples/README.md +13 -0
goldenflow-1.0.0/examples/config_based.py +34 -0
goldenflow-1.0.0/examples/domain_pack.py +36 -0
goldenflow-1.0.0/examples/transform_basic.py +28 -0
goldenflow-1.0.0/golden-suite.json +59 -0
goldenflow-1.0.0/goldenflow/__init__.py +113 -0
goldenflow-1.0.0/goldenflow/a2a/__init__.py +1 -0
goldenflow-1.0.0/goldenflow/a2a/server.py +135 -0
goldenflow-1.0.0/goldenflow/api/__init__.py +0 -0
goldenflow-1.0.0/goldenflow/api/server.py +80 -0
goldenflow-1.0.0/goldenflow/cli/__init__.py +0 -0
goldenflow-1.0.0/goldenflow/cli/errors.py +32 -0
goldenflow-1.0.0/goldenflow/cli/init_wizard.py +93 -0
goldenflow-1.0.0/goldenflow/cli/main.py +426 -0
goldenflow-1.0.0/goldenflow/cli/schedule.py +61 -0
goldenflow-1.0.0/goldenflow/cli/watch.py +53 -0
goldenflow-1.0.0/goldenflow/config/__init__.py +0 -0
goldenflow-1.0.0/goldenflow/config/learner.py +27 -0
goldenflow-1.0.0/goldenflow/config/loader.py +31 -0
goldenflow-1.0.0/goldenflow/config/schema.py +44 -0
goldenflow-1.0.0/goldenflow/connectors/__init__.py +3 -0
goldenflow-1.0.0/goldenflow/connectors/database.py +21 -0
goldenflow-1.0.0/goldenflow/connectors/file.py +55 -0
goldenflow-1.0.0/goldenflow/connectors/gcs.py +67 -0
goldenflow-1.0.0/goldenflow/connectors/s3.py +63 -0
goldenflow-1.0.0/goldenflow/domains/__init__.py +23 -0
goldenflow-1.0.0/goldenflow/domains/base.py +12 -0
goldenflow-1.0.0/goldenflow/domains/ecommerce.py +27 -0
goldenflow-1.0.0/goldenflow/domains/finance.py +37 -0
goldenflow-1.0.0/goldenflow/domains/healthcare.py +50 -0
goldenflow-1.0.0/goldenflow/domains/people_hr.py +64 -0
goldenflow-1.0.0/goldenflow/domains/real_estate.py +27 -0
goldenflow-1.0.0/goldenflow/engine/__init__.py +0 -0
goldenflow-1.0.0/goldenflow/engine/differ.py +54 -0
goldenflow-1.0.0/goldenflow/engine/manifest.py +74 -0
goldenflow-1.0.0/goldenflow/engine/profiler_bridge.py +201 -0
goldenflow-1.0.0/goldenflow/engine/selector.py +61 -0
goldenflow-1.0.0/goldenflow/engine/transformer.py +275 -0
goldenflow-1.0.0/goldenflow/history.py +57 -0
goldenflow-1.0.0/goldenflow/llm/__init__.py +0 -0
goldenflow-1.0.0/goldenflow/llm/corrector.py +102 -0
goldenflow-1.0.0/goldenflow/mapping/__init__.py +0 -0
goldenflow-1.0.0/goldenflow/mapping/name_similarity.py +44 -0
goldenflow-1.0.0/goldenflow/mapping/profile_similarity.py +34 -0
goldenflow-1.0.0/goldenflow/mapping/schema_mapper.py +90 -0
goldenflow-1.0.0/goldenflow/mcp/__init__.py +0 -0
goldenflow-1.0.0/goldenflow/mcp/server.py +321 -0
goldenflow-1.0.0/goldenflow/notebook.py +104 -0
goldenflow-1.0.0/goldenflow/py.typed +0 -0
goldenflow-1.0.0/goldenflow/reporters/__init__.py +0 -0
goldenflow-1.0.0/goldenflow/reporters/json_reporter.py +9 -0
goldenflow-1.0.0/goldenflow/reporters/rich_console.py +73 -0
goldenflow-1.0.0/goldenflow/streaming.py +61 -0
goldenflow-1.0.0/goldenflow/transforms/__init__.py +64 -0
goldenflow-1.0.0/goldenflow/transforms/address.py +134 -0
goldenflow-1.0.0/goldenflow/transforms/auto_correct.py +110 -0
goldenflow-1.0.0/goldenflow/transforms/categorical.py +121 -0
goldenflow-1.0.0/goldenflow/transforms/dates.py +86 -0
goldenflow-1.0.0/goldenflow/transforms/names.py +119 -0
goldenflow-1.0.0/goldenflow/transforms/numeric.py +57 -0
goldenflow-1.0.0/goldenflow/transforms/phone.py +74 -0
goldenflow-1.0.0/goldenflow/transforms/text.py +87 -0
goldenflow-1.0.0/goldenflow/tui/__init__.py +0 -0
goldenflow-1.0.0/goldenflow/tui/app.py +463 -0
goldenflow-1.0.0/llms.txt +17 -0
goldenflow-1.0.0/pyproject.toml +61 -0
goldenflow-1.0.0/server.json +28 -0
goldenflow-1.0.0/smithery.yaml +26 -0
goldenflow-1.0.0/tests/__init__.py +0 -0
goldenflow-1.0.0/tests/api/__init__.py +0 -0
goldenflow-1.0.0/tests/api/test_server.py +31 -0
goldenflow-1.0.0/tests/cli/__init__.py +0 -0
goldenflow-1.0.0/tests/cli/test_cli.py +47 -0
goldenflow-1.0.0/tests/cli/test_cli_polish.py +22 -0
goldenflow-1.0.0/tests/config/__init__.py +0 -0
goldenflow-1.0.0/tests/config/test_loader.py +50 -0
goldenflow-1.0.0/tests/config/test_schema.py +51 -0
goldenflow-1.0.0/tests/conftest.py +46 -0
goldenflow-1.0.0/tests/connectors/__init__.py +0 -0
goldenflow-1.0.0/tests/connectors/test_cloud.py +48 -0
goldenflow-1.0.0/tests/connectors/test_database.py +15 -0
goldenflow-1.0.0/tests/connectors/test_file.py +51 -0
goldenflow-1.0.0/tests/domains/__init__.py +0 -0
goldenflow-1.0.0/tests/domains/test_all_domains.py +25 -0
goldenflow-1.0.0/tests/domains/test_people_hr.py +25 -0
goldenflow-1.0.0/tests/engine/__init__.py +0 -0
goldenflow-1.0.0/tests/engine/test_differ.py +31 -0
goldenflow-1.0.0/tests/engine/test_manifest.py +48 -0
goldenflow-1.0.0/tests/engine/test_profiler_bridge.py +42 -0
goldenflow-1.0.0/tests/engine/test_selector.py +74 -0
goldenflow-1.0.0/tests/engine/test_transformer.py +110 -0
goldenflow-1.0.0/tests/fixtures/messy.csv +7 -0
goldenflow-1.0.0/tests/llm/__init__.py +0 -0
goldenflow-1.0.0/tests/llm/test_corrector.py +69 -0
goldenflow-1.0.0/tests/mapping/__init__.py +0 -0
goldenflow-1.0.0/tests/mapping/test_schema_mapper.py +42 -0
goldenflow-1.0.0/tests/mcp/__init__.py +0 -0
goldenflow-1.0.0/tests/mcp/test_mcp.py +72 -0
goldenflow-1.0.0/tests/test_a2a.py +74 -0
goldenflow-1.0.0/tests/test_history.py +26 -0
goldenflow-1.0.0/tests/test_integration.py +127 -0
goldenflow-1.0.0/tests/test_notebook.py +11 -0
goldenflow-1.0.0/tests/test_public_api.py +54 -0
goldenflow-1.0.0/tests/test_streaming.py +35 -0
goldenflow-1.0.0/tests/transforms/__init__.py +0 -0
goldenflow-1.0.0/tests/transforms/test_address.py +60 -0
goldenflow-1.0.0/tests/transforms/test_auto_correct.py +55 -0
goldenflow-1.0.0/tests/transforms/test_categorical.py +59 -0
goldenflow-1.0.0/tests/transforms/test_dates.py +47 -0
goldenflow-1.0.0/tests/transforms/test_names.py +56 -0
goldenflow-1.0.0/tests/transforms/test_numeric.py +32 -0
goldenflow-1.0.0/tests/transforms/test_phone.py +34 -0
goldenflow-1.0.0/tests/transforms/test_registry.py +68 -0
goldenflow-1.0.0/tests/transforms/test_text.py +73 -0
goldenflow-1.0.0/tests/tui/__init__.py +0 -0
goldenflow-1.0.0/tests/tui/test_tui.py +71 -0

goldenflow-1.0.0/.clinerules ADDED Viewed

@@ -0,0 +1,298 @@
+# GoldenFlow -- Cline Rules
+Data transformation toolkit -- standardize, reshape, and normalize messy data. DQBench Transform Score: 100/100.
+## Related Projects
+- **GoldenCheck:** `D:\show_case\goldencheck` -- Data validation.
+- **GoldenMatch:** `D:\show_case\goldenmatch` -- Entity resolution.
+- **GitHub:** `benzsevern/goldenflow`, `benzsevern/goldencheck`, `benzsevern/goldenmatch`
+## Branch & Merge SOP (all Golden Suite repos)
+- Feature work goes on `feature/<name>` branches, never directly to main
+- Merge via **squash merge PR** (watchers see PR activity, history stays clean)
+- PR title format: `feat: <description>` or `fix: <description>`
+- PR body: summary bullets + test plan
+- Merge as soon as tests pass and docs are updated -- branches should live for days, not weeks.
+- After merge: delete remote branch
+## Environment
+- Windows 11, bash shell (Git Bash)
+- Python 3.12 at `C:\Users\bsevern\AppData\Local\Programs\Python\Python312\python.exe`
+- Two GitHub accounts: `benzsevern` (personal) and `benzsevern-mjh` (work)
+- MUST `gh auth switch --user benzsevern` before push, switch back to `benzsevern-mjh` after
+## Commands
+```bash
+pip install -e ".[dev]"             # Dev install
+pip install -e ".[check]"           # With GoldenCheck integration
+pip install -e ".[mcp]"             # With MCP server
+pip install -e ".[all]"             # Everything
+pytest --tb=short -v                # Run tests (158 passing)
+ruff check .                        # Lint
+ruff check . --fix                  # Auto-fix lint
+```
+14 CLI commands:
+```bash
+goldenflow transform data.csv                    # Zero-config: auto-detect and fix
+goldenflow transform data.csv -c goldenflow.yaml # Apply saved config
+goldenflow transform data.csv --domain healthcare # Use a domain pack
+goldenflow transform data.csv --strict           # Fail on any transform error
+goldenflow transform data.csv --llm              # Enable LLM-enhanced transforms
+goldenflow data.csv                              # Shorthand: auto-routes to transform
+goldenflow map -s a.csv -t b.csv                 # Auto-map schemas between files
+goldenflow learn data.csv -o config.yaml         # Generate config from data patterns
+goldenflow validate data.csv                     # Dry-run: show what would change
+goldenflow diff before.csv after.csv             # Compare pre/post transform
+goldenflow profile data.csv                      # Show column profiles
+goldenflow watch ./data/                         # Auto-transform new/changed files
+goldenflow schedule data.csv --every 1h          # Run on a schedule
+goldenflow stream large_file.csv                 # Stream-process in batches
+goldenflow init data.csv                         # Interactive setup wizard
+goldenflow demo                                  # Generate sample data to try
+goldenflow history                               # Show recent transform runs
+goldenflow interactive data.csv                  # Launch TUI
+goldenflow serve                                 # REST API for real-time transforms
+goldenflow mcp-serve                             # MCP server for Claude Desktop
+```
+## Architecture
+```
+goldenflow/
+├── cli/           # Typer CLI (main.py -- all 14 commands; errors.py, init_wizard.py, watch.py, schedule.py)
+├── engine/        # TransformEngine, Manifest, profiler_bridge, selector, differ
+├── transforms/    # Transform library: text, phone, names, address, dates, categorical, numeric, auto_correct
+├── mapping/       # Schema mapping: name_similarity, profile_similarity, schema_mapper
+├── config/        # GoldenFlowConfig (Pydantic), YAML loader, config learner
+├── connectors/    # file.py (CSV/Excel/Parquet), database.py (connectorx), s3.py, gcs.py
+├── domains/       # Domain packs: base.py, people_hr.py, healthcare.py, finance.py, ecommerce.py, real_estate.py
+├── llm/           # LLM-assisted categorical correction (corrector.py) -- wired via --llm flag
+├── mcp/           # MCP server (server.py)
+├── reporters/     # rich_console.py, json_reporter.py
+├── tui/           # Textual TUI (app.py)
+├── streaming.py   # StreamProcessor -- batch/incremental processing
+├── history.py     # Run history tracking (~/.goldenflow/history/)
+└── notebook.py    # Jupyter _repr_html_ for TransformResult, Manifest, DatasetProfile
+```
+## Pipeline Flow
+```
+read_file (connectors) -> profile_dataframe (profiler_bridge)
+-> select_transforms (selector, by inferred type + auto_apply flag)
+-> apply transforms (TransformEngine.transform_df)
+-> record changes in Manifest
+-> write output + manifest.json
+-> save_run (history.py)
+```
+Zero-config mode: `profile_dataframe` infers a type per column, `select_transforms` picks `auto_apply=True` transforms that match the type, sorted by priority descending.
+## Transform Registry
+Transforms live in `goldenflow/transforms/` and self-register via decorator:
+```python
+from goldenflow.transforms import register_transform
+@register_transform(
+    name="phone_e164",
+    input_types=["phone"],
+    auto_apply=True,
+    priority=70,
+    mode="series",
+)
+def phone_e164(series: pl.Series) -> pl.Series:
+    ...
+```
+All transform modules are imported in `goldenflow/__init__.py` at package load time -- that is the only registration mechanism. If you add a new module, add an import there.
+## Hybrid expr / series / dataframe Mode System
+The `mode` field on `TransformInfo` controls how the engine applies a transform:
+| mode | Input | Output | When to use |
+|------|-------|--------|-------------|
+| `"expr"` | `pl.Expr` | `pl.Expr` | Pure Polars operations (strip, lowercase). Stays in Rust; fastest. |
+| `"series"` | `pl.Series` | `pl.Series` | Python logic per column (phone parsing, date parsing). Uses `map_batches` internally. |
+| `"dataframe"` | `pl.DataFrame` | `pl.DataFrame` | Multi-column transforms (split_name, split_address). Receives and returns full frame. |
+The engine in `engine/transformer.py` dispatches based on `TransformInfo.mode` -- do not add mode-specific logic anywhere else.
+## Streaming Module (streaming.py)
+`StreamProcessor` wraps `TransformEngine` for incremental processing:
+- `transform_one(record: dict)` -- single record, returns `TransformResult`
+- `transform_batch(df: pl.DataFrame)` -- one batch
+- `stream_file(path, chunk_size=10_000)` -- yields `TransformResult` per chunk
+- `batches_processed` property -- count of batches completed
+## Cloud Connectors
+- `connectors/s3.py` -- `read_s3(uri)` / `write_s3(df, uri)` using boto3
+- `connectors/gcs.py` -- `read_gcs(uri)` / `write_gcs(df, uri)` using google-cloud-storage
+- The file connector (`connectors/file.py`) detects `s3://` and `gs://` prefixes and delegates automatically.
+## History Module (history.py)
+- Stores `RunRecord` JSON files in `~/.goldenflow/history/<run_id>.json`
+- `save_run(record)` -- called by `TransformEngine.transform_file` after each run
+- `list_runs(limit=20)` -- returns newest-first list of `RunRecord` objects
+- `RunRecord` fields: `run_id`, `source`, `timestamp`, `rows`, `columns`, `transforms_applied`, `errors`, `duration_seconds`, `config_hash`, `manifest_path`
+## Notebook Module (notebook.py)
+Monkey-patches `_repr_html_` onto three classes at import time:
+- `TransformResult._repr_html_` -- summary table + transform list + DataFrame preview
+- `Manifest._repr_html_` -- transform audit trail with before/after samples
+- `DatasetProfile._repr_html_` -- column profile table
+Imported in `goldenflow/__init__.py` as a side-effect import (no symbols exported).
+## LLM Corrector (llm/corrector.py)
+Registers an additional transform that calls an LLM API for categorical correction. Activated by:
+1. Setting the `GOLDENFLOW_LLM=1` environment variable
+2. Using the `--llm` flag on the CLI (which both sets the env var and performs the import)
+Requires `OPENAI_API_KEY` or `ANTHROPIC_API_KEY`. Gracefully skips if no key is found.
+## Domain Packs (All 5 Implemented)
+Each domain pack lives in `goldenflow/domains/<name>.py` and subclasses `DomainPack` from `base.py`:
+| Module | `load_domain()` key | Focus |
+|--------|---------------------|-------|
+| `people_hr.py` | `"people_hr"` | Names, SSNs, employment dates, gender |
+| `healthcare.py` | `"healthcare"` | Patient IDs, diagnosis codes, clinical dates |
+| `finance.py` | `"finance"` | Currency, account numbers, transaction dates |
+| `ecommerce.py` | `"ecommerce"` | SKUs, prices, order dates, addresses |
+| `real_estate.py` | `"real_estate"` | Property addresses, listing dates, prices |
+`load_domain(name)` is exported from `goldenflow/domains/__init__.py` and returns the pack or `None`.
+## CLI Modules
+- `cli/main.py` -- all 14 commands (Typer app)
+- `cli/errors.py` -- `cli_error_handler()` context manager for friendly error messages
+- `cli/init_wizard.py` -- `run_wizard()` interactive setup wizard
+- `cli/watch.py` -- `watch_directory()` polling loop
+- `cli/schedule.py` -- `run_schedule()` interval parser + loop
+## Key Patterns
+- **All transforms use `@register_transform`** -- never add to `_REGISTRY` directly
+- **`TransformResult`** is a dataclass with `.df` (clean Polars DataFrame) and `.manifest` (Manifest)
+- **`Manifest`** tracks every `TransformRecord`: column, transform name, rows affected, before/after samples
+- **Polars-native** -- all data ops use Polars, never pandas
+- **`parse_transform_name("truncate:50")`** splits parameterized transform strings into `("truncate", ["50"])`
+- **`select_from_findings`** in `engine/selector.py` maps GoldenCheck finding check names to transform names (the `--from-findings` CLI flag)
+## Config Schema (goldenflow.yaml)
+```yaml
+source: customers.csv
+output: customers_clean.csv
+transforms:
+  - column: phone
+    ops: [phone_e164]
+renames:
+  email_address: email
+drop: [internal_id]
+dedup:
+  columns: [email]
+  keep: first
+```
+Config is a `GoldenFlowConfig` Pydantic model (`config/schema.py`). `config/learner.py` auto-generates it from data profiles.
+## Integration with GoldenCheck and GoldenMatch
+GoldenFlow sits in the middle of the Golden Suite pipeline:
+```
+Raw Data -> GoldenCheck (profile & discover quality issues)
+         -> GoldenFlow (fix issues, standardize, reshape)
+         -> GoldenMatch (deduplicate, match, create golden records)
+         -> Production
+```
+**GoldenCheck integration** (`pip install goldenflow[check]`):
+- `engine/profiler_bridge.py` calls GoldenCheck's scanner to get column profiles
+- `engine/selector.py:select_from_findings()` maps GoldenCheck finding checks to transform names
+- CLI flag `goldenflow transform data.csv --from-findings findings.json`
+**GoldenMatch integration**:
+- GoldenFlow's output (clean CSV + manifest) feeds directly into `goldenmatch dedupe`
+- Schema mapping (`goldenflow map`) resolves column name mismatches before matching
+**Pipeline shorthand**:
+```bash
+goldencheck scan data.csv | goldenflow transform --from-findings | goldenmatch dedupe
+```
+## Testing
+- TDD: tests first, then implementation
+- 158 tests passing
+- Fixtures: `tests/fixtures/` (CSV files gitignored; add `!tests/fixtures/*.csv` exception if needed)
+- Convention: `tests/{module}/test_{file}.py`
+- Integration tests: `tests/test_integration.py`, `tests/test_public_api.py`
+- Commit messages: conventional commits (`feat:`, `fix:`, `test:`, `docs:`, `chore:`)
+## Environment / Auth
+API keys for LLM testing live in `.testing/.env` (gitignored):
+```bash
+source .testing/.env   # loads OPENAI_API_KEY, ANTHROPIC_API_KEY, TWINE credentials
+```
+GitHub auth on Windows (Credential Manager ignores `gh auth switch`):
+```bash
+gh auth switch --user benzsevern
+GIT_ASKPASS=$(which echo) git -c credential.helper="!gh auth git-credential" push origin main
+gh auth switch --user benzsevern-mjh   # switch back after
+```
+## Benchmarks
+```bash
+pip install dqbench && dqbench run goldenflow   # DQBench transform benchmark (100/100)
+dqbench run all                                  # Compare against other tools
+```
+## Publishing
+```bash
+python -m build && source .testing/.env && python -m twine upload dist/*
+```
+## Gotchas
+- `*.csv` is in `.gitignore` -- test fixtures need `!tests/fixtures/*.csv` exception
+- `__version__` is defined ONLY in `goldenflow/__init__.py` -- don't add a second copy in `cli/main.py`
+- Transform module imports in `__init__.py` are load-order sensitive -- modules that depend on others (e.g. `auto_correct` depends on `categorical`) must be imported after the modules they depend on
+- `mode="dataframe"` transforms receive the **entire** DataFrame and must return one with the same or more columns -- do not drop columns silently
+- `category_auto_correct` is suppressed for high-cardinality columns (>10% unique values) by `selector.py` -- this is intentional
+- Ruff line length: 100 chars
+- `config/learner.py` generates a YAML config from profiles; `config/loader.py` reads it back -- keep the Pydantic schema in `config/schema.py` as the single source of truth
+- Cloud connectors (s3.py, gcs.py) have optional dependencies -- `pip install goldenflow[s3]` or `pip install goldenflow[gcs]`; they raise `ImportError` with a helpful message if the dependency is missing
+- `streaming.py` currently reads the full file into memory before batching -- for truly out-of-core processing, use Polars LazyFrame directly
+- `history.py` stores runs in `~/.goldenflow/history/` -- this directory is created on first run and is not cleaned up automatically
+## Remote MCP Server
+- Endpoint: https://goldenflow-mcp-production.up.railway.app/mcp/
+- Smithery: https://smithery.ai/servers/benzsevern/goldenflow
+- 10 tools, Streamable HTTP transport
+- Dockerfile: Dockerfile.mcp
+- Local HTTP: goldenflow mcp-serve --transport http --port 8150

goldenflow-1.0.0/.cursorrules ADDED Viewed

@@ -0,0 +1,298 @@
+# GoldenFlow -- Cursor Rules
+Data transformation toolkit -- standardize, reshape, and normalize messy data. DQBench Transform Score: 100/100.
+## Related Projects
+- **GoldenCheck:** `D:\show_case\goldencheck` -- Data validation.
+- **GoldenMatch:** `D:\show_case\goldenmatch` -- Entity resolution.
+- **GitHub:** `benzsevern/goldenflow`, `benzsevern/goldencheck`, `benzsevern/goldenmatch`
+## Branch & Merge SOP (all Golden Suite repos)
+- Feature work goes on `feature/<name>` branches, never directly to main
+- Merge via **squash merge PR** (watchers see PR activity, history stays clean)
+- PR title format: `feat: <description>` or `fix: <description>`
+- PR body: summary bullets + test plan
+- Merge as soon as tests pass and docs are updated -- branches should live for days, not weeks.
+- After merge: delete remote branch
+## Environment
+- Windows 11, bash shell (Git Bash)
+- Python 3.12 at `C:\Users\bsevern\AppData\Local\Programs\Python\Python312\python.exe`
+- Two GitHub accounts: `benzsevern` (personal) and `benzsevern-mjh` (work)
+- MUST `gh auth switch --user benzsevern` before push, switch back to `benzsevern-mjh` after
+## Commands
+```bash
+pip install -e ".[dev]"             # Dev install
+pip install -e ".[check]"           # With GoldenCheck integration
+pip install -e ".[mcp]"             # With MCP server
+pip install -e ".[all]"             # Everything
+pytest --tb=short -v                # Run tests (158 passing)
+ruff check .                        # Lint
+ruff check . --fix                  # Auto-fix lint
+```
+14 CLI commands:
+```bash
+goldenflow transform data.csv                    # Zero-config: auto-detect and fix
+goldenflow transform data.csv -c goldenflow.yaml # Apply saved config
+goldenflow transform data.csv --domain healthcare # Use a domain pack
+goldenflow transform data.csv --strict           # Fail on any transform error
+goldenflow transform data.csv --llm              # Enable LLM-enhanced transforms
+goldenflow data.csv                              # Shorthand: auto-routes to transform
+goldenflow map -s a.csv -t b.csv                 # Auto-map schemas between files
+goldenflow learn data.csv -o config.yaml         # Generate config from data patterns
+goldenflow validate data.csv                     # Dry-run: show what would change
+goldenflow diff before.csv after.csv             # Compare pre/post transform
+goldenflow profile data.csv                      # Show column profiles
+goldenflow watch ./data/                         # Auto-transform new/changed files
+goldenflow schedule data.csv --every 1h          # Run on a schedule
+goldenflow stream large_file.csv                 # Stream-process in batches
+goldenflow init data.csv                         # Interactive setup wizard
+goldenflow demo                                  # Generate sample data to try
+goldenflow history                               # Show recent transform runs
+goldenflow interactive data.csv                  # Launch TUI
+goldenflow serve                                 # REST API for real-time transforms
+goldenflow mcp-serve                             # MCP server for Claude Desktop
+```
+## Architecture
+```
+goldenflow/
+├── cli/           # Typer CLI (main.py -- all 14 commands; errors.py, init_wizard.py, watch.py, schedule.py)
+├── engine/        # TransformEngine, Manifest, profiler_bridge, selector, differ
+├── transforms/    # Transform library: text, phone, names, address, dates, categorical, numeric, auto_correct
+├── mapping/       # Schema mapping: name_similarity, profile_similarity, schema_mapper
+├── config/        # GoldenFlowConfig (Pydantic), YAML loader, config learner
+├── connectors/    # file.py (CSV/Excel/Parquet), database.py (connectorx), s3.py, gcs.py
+├── domains/       # Domain packs: base.py, people_hr.py, healthcare.py, finance.py, ecommerce.py, real_estate.py
+├── llm/           # LLM-assisted categorical correction (corrector.py) -- wired via --llm flag
+├── mcp/           # MCP server (server.py)
+├── reporters/     # rich_console.py, json_reporter.py
+├── tui/           # Textual TUI (app.py)
+├── streaming.py   # StreamProcessor -- batch/incremental processing
+├── history.py     # Run history tracking (~/.goldenflow/history/)
+└── notebook.py    # Jupyter _repr_html_ for TransformResult, Manifest, DatasetProfile
+```
+## Pipeline Flow
+```
+read_file (connectors) -> profile_dataframe (profiler_bridge)
+-> select_transforms (selector, by inferred type + auto_apply flag)
+-> apply transforms (TransformEngine.transform_df)
+-> record changes in Manifest
+-> write output + manifest.json
+-> save_run (history.py)
+```
+Zero-config mode: `profile_dataframe` infers a type per column, `select_transforms` picks `auto_apply=True` transforms that match the type, sorted by priority descending.
+## Transform Registry
+Transforms live in `goldenflow/transforms/` and self-register via decorator:
+```python
+from goldenflow.transforms import register_transform
+@register_transform(
+    name="phone_e164",
+    input_types=["phone"],
+    auto_apply=True,
+    priority=70,
+    mode="series",
+)
+def phone_e164(series: pl.Series) -> pl.Series:
+    ...
+```
+All transform modules are imported in `goldenflow/__init__.py` at package load time -- that is the only registration mechanism. If you add a new module, add an import there.
+## Hybrid expr / series / dataframe Mode System
+The `mode` field on `TransformInfo` controls how the engine applies a transform:
+| mode | Input | Output | When to use |
+|------|-------|--------|-------------|
+| `"expr"` | `pl.Expr` | `pl.Expr` | Pure Polars operations (strip, lowercase). Stays in Rust; fastest. |
+| `"series"` | `pl.Series` | `pl.Series` | Python logic per column (phone parsing, date parsing). Uses `map_batches` internally. |
+| `"dataframe"` | `pl.DataFrame` | `pl.DataFrame` | Multi-column transforms (split_name, split_address). Receives and returns full frame. |
+The engine in `engine/transformer.py` dispatches based on `TransformInfo.mode` -- do not add mode-specific logic anywhere else.
+## Streaming Module (streaming.py)
+`StreamProcessor` wraps `TransformEngine` for incremental processing:
+- `transform_one(record: dict)` -- single record, returns `TransformResult`
+- `transform_batch(df: pl.DataFrame)` -- one batch
+- `stream_file(path, chunk_size=10_000)` -- yields `TransformResult` per chunk
+- `batches_processed` property -- count of batches completed
+## Cloud Connectors
+- `connectors/s3.py` -- `read_s3(uri)` / `write_s3(df, uri)` using boto3
+- `connectors/gcs.py` -- `read_gcs(uri)` / `write_gcs(df, uri)` using google-cloud-storage
+- The file connector (`connectors/file.py`) detects `s3://` and `gs://` prefixes and delegates automatically.
+## History Module (history.py)
+- Stores `RunRecord` JSON files in `~/.goldenflow/history/<run_id>.json`
+- `save_run(record)` -- called by `TransformEngine.transform_file` after each run
+- `list_runs(limit=20)` -- returns newest-first list of `RunRecord` objects
+- `RunRecord` fields: `run_id`, `source`, `timestamp`, `rows`, `columns`, `transforms_applied`, `errors`, `duration_seconds`, `config_hash`, `manifest_path`
+## Notebook Module (notebook.py)
+Monkey-patches `_repr_html_` onto three classes at import time:
+- `TransformResult._repr_html_` -- summary table + transform list + DataFrame preview
+- `Manifest._repr_html_` -- transform audit trail with before/after samples
+- `DatasetProfile._repr_html_` -- column profile table
+Imported in `goldenflow/__init__.py` as a side-effect import (no symbols exported).
+## LLM Corrector (llm/corrector.py)
+Registers an additional transform that calls an LLM API for categorical correction. Activated by:
+1. Setting the `GOLDENFLOW_LLM=1` environment variable
+2. Using the `--llm` flag on the CLI (which both sets the env var and performs the import)
+Requires `OPENAI_API_KEY` or `ANTHROPIC_API_KEY`. Gracefully skips if no key is found.
+## Domain Packs (All 5 Implemented)
+Each domain pack lives in `goldenflow/domains/<name>.py` and subclasses `DomainPack` from `base.py`:
+| Module | `load_domain()` key | Focus |
+|--------|---------------------|-------|
+| `people_hr.py` | `"people_hr"` | Names, SSNs, employment dates, gender |
+| `healthcare.py` | `"healthcare"` | Patient IDs, diagnosis codes, clinical dates |
+| `finance.py` | `"finance"` | Currency, account numbers, transaction dates |
+| `ecommerce.py` | `"ecommerce"` | SKUs, prices, order dates, addresses |
+| `real_estate.py` | `"real_estate"` | Property addresses, listing dates, prices |
+`load_domain(name)` is exported from `goldenflow/domains/__init__.py` and returns the pack or `None`.
+## CLI Modules
+- `cli/main.py` -- all 14 commands (Typer app)
+- `cli/errors.py` -- `cli_error_handler()` context manager for friendly error messages
+- `cli/init_wizard.py` -- `run_wizard()` interactive setup wizard
+- `cli/watch.py` -- `watch_directory()` polling loop
+- `cli/schedule.py` -- `run_schedule()` interval parser + loop
+## Key Patterns
+- **All transforms use `@register_transform`** -- never add to `_REGISTRY` directly
+- **`TransformResult`** is a dataclass with `.df` (clean Polars DataFrame) and `.manifest` (Manifest)
+- **`Manifest`** tracks every `TransformRecord`: column, transform name, rows affected, before/after samples
+- **Polars-native** -- all data ops use Polars, never pandas
+- **`parse_transform_name("truncate:50")`** splits parameterized transform strings into `("truncate", ["50"])`
+- **`select_from_findings`** in `engine/selector.py` maps GoldenCheck finding check names to transform names (the `--from-findings` CLI flag)
+## Config Schema (goldenflow.yaml)
+```yaml
+source: customers.csv
+output: customers_clean.csv
+transforms:
+  - column: phone
+    ops: [phone_e164]
+renames:
+  email_address: email
+drop: [internal_id]
+dedup:
+  columns: [email]
+  keep: first
+```
+Config is a `GoldenFlowConfig` Pydantic model (`config/schema.py`). `config/learner.py` auto-generates it from data profiles.
+## Integration with GoldenCheck and GoldenMatch
+GoldenFlow sits in the middle of the Golden Suite pipeline:
+```
+Raw Data -> GoldenCheck (profile & discover quality issues)
+         -> GoldenFlow (fix issues, standardize, reshape)
+         -> GoldenMatch (deduplicate, match, create golden records)
+         -> Production
+```
+**GoldenCheck integration** (`pip install goldenflow[check]`):
+- `engine/profiler_bridge.py` calls GoldenCheck's scanner to get column profiles
+- `engine/selector.py:select_from_findings()` maps GoldenCheck finding checks to transform names
+- CLI flag `goldenflow transform data.csv --from-findings findings.json`
+**GoldenMatch integration**:
+- GoldenFlow's output (clean CSV + manifest) feeds directly into `goldenmatch dedupe`
+- Schema mapping (`goldenflow map`) resolves column name mismatches before matching
+**Pipeline shorthand**:
+```bash
+goldencheck scan data.csv | goldenflow transform --from-findings | goldenmatch dedupe
+```
+## Testing
+- TDD: tests first, then implementation
+- 158 tests passing
+- Fixtures: `tests/fixtures/` (CSV files gitignored; add `!tests/fixtures/*.csv` exception if needed)
+- Convention: `tests/{module}/test_{file}.py`
+- Integration tests: `tests/test_integration.py`, `tests/test_public_api.py`
+- Commit messages: conventional commits (`feat:`, `fix:`, `test:`, `docs:`, `chore:`)
+## Environment / Auth
+API keys for LLM testing live in `.testing/.env` (gitignored):
+```bash
+source .testing/.env   # loads OPENAI_API_KEY, ANTHROPIC_API_KEY, TWINE credentials
+```
+GitHub auth on Windows (Credential Manager ignores `gh auth switch`):
+```bash
+gh auth switch --user benzsevern
+GIT_ASKPASS=$(which echo) git -c credential.helper="!gh auth git-credential" push origin main
+gh auth switch --user benzsevern-mjh   # switch back after
+```
+## Benchmarks
+```bash
+pip install dqbench && dqbench run goldenflow   # DQBench transform benchmark (100/100)
+dqbench run all                                  # Compare against other tools
+```
+## Publishing
+```bash
+python -m build && source .testing/.env && python -m twine upload dist/*
+```
+## Gotchas
+- `*.csv` is in `.gitignore` -- test fixtures need `!tests/fixtures/*.csv` exception
+- `__version__` is defined ONLY in `goldenflow/__init__.py` -- don't add a second copy in `cli/main.py`
+- Transform module imports in `__init__.py` are load-order sensitive -- modules that depend on others (e.g. `auto_correct` depends on `categorical`) must be imported after the modules they depend on
+- `mode="dataframe"` transforms receive the **entire** DataFrame and must return one with the same or more columns -- do not drop columns silently
+- `category_auto_correct` is suppressed for high-cardinality columns (>10% unique values) by `selector.py` -- this is intentional
+- Ruff line length: 100 chars
+- `config/learner.py` generates a YAML config from profiles; `config/loader.py` reads it back -- keep the Pydantic schema in `config/schema.py` as the single source of truth
+- Cloud connectors (s3.py, gcs.py) have optional dependencies -- `pip install goldenflow[s3]` or `pip install goldenflow[gcs]`; they raise `ImportError` with a helpful message if the dependency is missing
+- `streaming.py` currently reads the full file into memory before batching -- for truly out-of-core processing, use Polars LazyFrame directly
+- `history.py` stores runs in `~/.goldenflow/history/` -- this directory is created on first run and is not cleaned up automatically
+## Remote MCP Server
+- Endpoint: https://goldenflow-mcp-production.up.railway.app/mcp/
+- Smithery: https://smithery.ai/servers/benzsevern/goldenflow
+- 10 tools, Streamable HTTP transport
+- Dockerfile: Dockerfile.mcp
+- Local HTTP: goldenflow mcp-serve --transport http --port 8150

goldenflow-1.0.0/.github/ISSUE_TEMPLATE/bug_report.yml ADDED Viewed

@@ -0,0 +1,56 @@
+name: Bug Report
+description: Report something that isn't working correctly
+labels: ["bug"]
+body:
+  - type: markdown
+    attributes:
+      value: |
+        Thanks for reporting a bug! Please fill in the details below.
+  - type: input
+    id: version
+    attributes:
+      label: GoldenFlow version
+      description: "Run `goldenflow --version` or `pip show goldenflow`"
+      placeholder: "0.1.0"
+    validations:
+      required: true
+  - type: textarea
+    id: description
+    attributes:
+      label: What happened?
+      description: "Clear description of the bug"
+    validations:
+      required: true
+  - type: textarea
+    id: reproduce
+    attributes:
+      label: Steps to reproduce
+      description: "Minimal steps or config to reproduce the issue"
+      placeholder: |
+        1. Run `goldenflow transform data.csv --domain healthcare`
+        2. See error...
+    validations:
+      required: true
+  - type: textarea
+    id: config
+    attributes:
+      label: Config (if applicable)
+      description: "Paste your YAML config"
+      render: yaml
+  - type: textarea
+    id: error
+    attributes:
+      label: Error output
+      description: "Full error message or traceback"
+      render: shell
+  - type: dropdown
+    id: os
+    attributes:
+      label: Operating System
+      options:
+        - Windows
+        - macOS
+        - Linux
+        - Other
+    validations:
+      required: true

goldenflow-1.0.0/.github/ISSUE_TEMPLATE/config.yml ADDED Viewed

@@ -0,0 +1,5 @@
+blank_issues_enabled: false
+contact_links:
+  - name: Ask a Question
+    url: https://github.com/benzsevern/goldenflow/discussions
+    about: Get help from the community in Discussions