data-morph-gemma 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39) hide show
  1. data_morph_gemma-0.1.0.dist-info/METADATA +177 -0
  2. data_morph_gemma-0.1.0.dist-info/RECORD +39 -0
  3. data_morph_gemma-0.1.0.dist-info/WHEEL +4 -0
  4. data_morph_gemma-0.1.0.dist-info/entry_points.txt +2 -0
  5. data_morph_gemma-0.1.0.dist-info/licenses/LICENSE +25 -0
  6. datamorph/__init__.py +19 -0
  7. datamorph/cli.py +84 -0
  8. datamorph/convert.py +146 -0
  9. datamorph/data/__init__.py +1 -0
  10. datamorph/data/collect.py +221 -0
  11. datamorph/data/envelope.py +20 -0
  12. datamorph/data/generators/__init__.py +1 -0
  13. datamorph/data/generators/base.py +48 -0
  14. datamorph/data/generators/uc1_csv_to_json.py +64 -0
  15. datamorph/data/generators/uc2_json_to_csv.py +59 -0
  16. datamorph/data/generators/uc3_txt_log_to_csv.py +64 -0
  17. datamorph/data/generators/uc4_csv_to_txt_report.py +62 -0
  18. datamorph/data/generators/uc5_schema_migration.py +49 -0
  19. datamorph/data/sandbox.py +95 -0
  20. datamorph/data/teacher_script.py +114 -0
  21. datamorph/evaluation/__init__.py +0 -0
  22. datamorph/evaluation/metrics.py +264 -0
  23. datamorph/evaluation/output_cleanup.py +116 -0
  24. datamorph/evaluation/runner.py +218 -0
  25. datamorph/evaluation/teacher.py +193 -0
  26. datamorph/extractor/__init__.py +15 -0
  27. datamorph/extractor/base.py +26 -0
  28. datamorph/extractor/csv_extractor.py +515 -0
  29. datamorph/extractor/json_extractor.py +447 -0
  30. datamorph/extractor/json_walker.py +217 -0
  31. datamorph/extractor/sampler.py +68 -0
  32. datamorph/extractor/txt_extractor.py +199 -0
  33. datamorph/extractor/warning_rules.py +473 -0
  34. datamorph/features/__init__.py +1 -0
  35. datamorph/features/format_pairs.py +57 -0
  36. datamorph/model.py +63 -0
  37. datamorph/models/__init__.py +0 -0
  38. datamorph/models/gemma_mlx.py +163 -0
  39. datamorph/models/gemma_script_teacher.py +100 -0
@@ -0,0 +1,177 @@
1
+ Metadata-Version: 2.4
2
+ Name: data-morph-gemma
3
+ Version: 0.1.0
4
+ Summary: Distill a CSV/JSON/TXT file-conversion capability from Claude Opus into a fine-tuned Gemma 2B (LoRA/QLoRA).
5
+ Project-URL: Homepage, https://github.com/LoveMig6334/data-morph
6
+ Project-URL: Repository, https://github.com/LoveMig6334/data-morph
7
+ Project-URL: Model (Hugging Face), https://huggingface.co/Bunnana/data-morph-gemma-2b
8
+ Author-email: Thatt Bunnag <tom.tom.thanet@gmail.com>
9
+ License-Expression: MIT
10
+ License-File: LICENSE
11
+ Keywords: csv,file-conversion,gemma,json,knowledge-distillation,llm,mlx
12
+ Classifier: Development Status :: 4 - Beta
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: Operating System :: MacOS
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3.12
17
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
18
+ Classifier: Topic :: Utilities
19
+ Requires-Python: >=3.12
20
+ Requires-Dist: huggingface-hub>=0.30
21
+ Requires-Dist: pandas>=3.0.2
22
+ Provides-Extra: mlx
23
+ Requires-Dist: mlx-lm>=0.31.3; extra == 'mlx'
24
+ Requires-Dist: mlx-vlm>=0.5.0; extra == 'mlx'
25
+ Description-Content-Type: text/markdown
26
+
27
+ # data morph
28
+
29
+ **Open Source File Data Migration with Fine-tuned Small Language Model**
30
+
31
+ Knowledge distillation from a large-model agent (Claude Opus + Agent Skill) into a fine-tuned Gemma 2B, so developers can convert between file formats locally for free instead of paying for frontier-LLM API calls.
32
+
33
+ AI Builders 2026 · Track: Agentic AI + NLP
34
+
35
+ ## Problem
36
+
37
+ Rule-based parsers can't handle messy, context-dependent file conversions. Frontier LLMs can, but they're expensive at scale. This project distills that capability into a 2B-parameter model that runs locally.
38
+
39
+ ## Approach
40
+
41
+ 1. **Teacher**: Claude Opus + Claude Code + Agent Skill generates 500–1000 verified training pairs.
42
+ 2. **Student**: Gemma 2B, fine-tuned with LoRA / QLoRA.
43
+ 3. **Target**: ≥80% of teacher accuracy across 4 metrics — Format Validity, Schema Compliance, Loadability, Content Accuracy.
44
+
45
+ ### Pipeline architecture
46
+
47
+ Conversion is a **five-stage pipeline**, not a single end-to-end model call.
48
+ The model only ever sees a small structured metadata envelope, never the
49
+ full source file:
50
+
51
+ ```
52
+ [source file]
53
+
54
+ ├─→ [1. Metadata extractor] deterministic — schema + samples + warnings
55
+ ├─→ [2. Context summarizer] Gemma 2B base — short NL summary
56
+
57
+ [3. Script generator] Claude Opus (training) → Gemma 2B fine-tuned (inference)
58
+ ↓ outputs an executable Python script
59
+ [4. Sandbox executor] deterministic — runs the script
60
+ ↓ converted output file
61
+ [5. Validator] the 4 W2 metrics — format, schema, load, content
62
+
63
+ [output file]
64
+ ```
65
+
66
+ **Why this shape**: distillation target narrows from "transform a whole
67
+ file" (impractical for a 2 B model) to "read metadata, write a script"
68
+ (realistic). The model never sees full file content, so the pipeline scales
69
+ to arbitrary file sizes. Failures are debuggable — the script is a readable
70
+ intermediate artefact.
71
+
72
+ ### Status
73
+
74
+ **W1–W6 complete; W7 model surgery done — a 2.0 GB single-file student is production-validated.**
75
+
76
+ - **Data (W3):** 800 verified teacher pairs (100% accept), split into
77
+ `data/processed/{train,val,test}.jsonl` (650 / 80 / 70, content-disjoint).
78
+ - **EDA (W4):** `notebook/w4_eda.ipynb` — training-readiness audit (balance,
79
+ leakage, sequence-length budget).
80
+ - **Fine-tune (W5):** Gemma-4 E2B distilled via LoRA (`mlx_vlm.lora`, SFT) on the
81
+ envelope→script task. Best checkpoint (iter-400) selected by held-out eval.
82
+ - **Eval (W6):** on the held-out 70-case test set, through the full pipeline
83
+ (envelope → script → sandbox → 4 metrics), the fine-tuned student reaches
84
+ **65/70 one-shot** and **68/70 (0.971) at production retry≤3** — already ≥80%-of-teacher.
85
+ - **Shrink (W7):** the multimodal base is mostly dead weight for this task. A
86
+ three-step surgery (`scripts/build_textonly_student.py` + `prune_vocab.py`) fuses the
87
+ adapter, strips the unused **vision + audio towers**, prunes the **262 k vocab → 16 k**
88
+ (the corpus uses ~4.5 k tokens; the vocab indexes the two biggest tensors), then
89
+ re-quantizes — all on a pure `gemma4_text` model loaded via `mlx_lm`:
90
+
91
+ | Artifact | params | size | retry≤3 | % teacher |
92
+ |---|---:|---:|---:|---:|
93
+ | fine-tuned bf16 (runtime adapter) | 5.12 B | 9.6 GB | — | — |
94
+ | *prior 8-bit (full model)* | 5.1 B | 5.5 GB | 68/70 | ~97% |
95
+ | fused + text-only + vocab-16k, bf16 | 2.05 B | 3.8 GB | **69/70 (0.986)** | ~99% |
96
+ | **+ 8-bit (final ship artifact)** | **2.05 B** | **2.0 GB** | **67/70 (0.957)** | **~96%** |
97
+
98
+ **9.6 GB → 2.0 GB (−79%)** with accuracy still well above the **≥80%-of-teacher**
99
+ target on every metric. Each cut is lossless-by-construction (strip/prune, guarded by
100
+ a tokenizer round-trip verification gate) or a small retry-recoverable numerical cost.
101
+
102
+ **Next (W7 deployment):** push the 2.0 GB model to Hugging Face Hub with a model card,
103
+ ship the `pip`-installable pipeline wrapper. See `docs/progression.md` for the live tracker.
104
+
105
+ ## Supported formats
106
+
107
+ CSV, JSON, TXT — in 5 use cases (CSV→JSON nested, JSON→CSV flattening, TXT log→CSV, CSV→TXT report, schema migration).
108
+
109
+ ## Setup
110
+
111
+ Requires **Python 3.12** (chosen for stronger MLX support). Project is
112
+ managed by [`uv`](https://docs.astral.sh/uv/).
113
+
114
+ ```bash
115
+ uv sync # creates .venv from pyproject.toml + uv.lock
116
+ source .venv/bin/activate # macOS / Linux
117
+ # .venv\Scripts\activate # Windows
118
+ ```
119
+
120
+ Add a new dependency: `uv add <pkg>` (or `uv add --dev <pkg>` for dev-only).
121
+
122
+ ## Hardware / framework
123
+
124
+ - **Primary target**: MacBook Pro M5 Max (40 GPU cores, 120 GB unified memory) with **MLX**.
125
+ - **Fallback**: Google Colab + PyTorch + Unsloth (used when MLX is unavailable, e.g. on Windows).
126
+
127
+ ## Repo structure
128
+
129
+ ```
130
+ data/
131
+ raw/ # synthetic corpus from seeded generators (regenerable, gitignored)
132
+ interim/ # verified teacher pairs (envelope + analysis + script + scores)
133
+ processed/ # train/val/test chat JSONL for fine-tuning
134
+ test_set/ # 15 hand-crafted W2 baseline cases
135
+ notebook/ # EDA (w4_eda), fine-tune scaffold (w5_finetune), experiments
136
+ src/
137
+ extractor/ # Stage 1: deterministic metadata extractor — CSV, JSON, TXT (done)
138
+ evaluation/ # Stage 5: the 4 W2 metrics + Opus-baseline runner (DO NOT EDIT)
139
+ data/ # generators (oracle), sandbox (Stage 4), teacher_script + collect (Stage 3)
140
+ features/ # format_pairs: verified pairs → chat JSONL + disjoint split
141
+ models/ # LoRA/QLoRA fine-tune + inference (W5)
142
+ scripts/ # generate_corpus, collect_pairs, collect_all_parallel, build_dataset, baseline, plotting
143
+ skills/ # Agent-Skill prompts read by `claude -p` (file conversion + script generation)
144
+ tests/ # unit tests (metrics, extractor, data, features) + fixtures
145
+ models/ # Gemma-4 E2B (local, gitignored) + fine-tuned checkpoints
146
+ results/ # baseline run artefacts (per-run summary.json + plots)
147
+ docs/ # specs, plans, weekly reports (gitignored)
148
+ ```
149
+
150
+ ## Timeline (8 weeks)
151
+
152
+ | Week | Focus | Points |
153
+ |------|-------|-------:|
154
+ | 1 | Problem statement + use cases | 15 |
155
+ | 2 | Metrics + Claude Opus baseline | 15 |
156
+ | 3 | Teacher-generated training pairs | 15 |
157
+ | 4 | EDA + data cleaning | 20 |
158
+ | 5 | Fine-tune Gemma 2B (LoRA) | — |
159
+ | 6 | Evaluation + error analysis | 20 |
160
+ | 7 | Deployment (pip + HF Hub) | 15 |
161
+ | 8 | Blog, slides, poster | — |
162
+ | | **Total** | **100** (≥70 to pass) |
163
+
164
+ ## Deliverables
165
+
166
+ - GitHub repo (this one)
167
+ - Hugging Face Hub model + model card
168
+ - `pip install`-able Python package
169
+ - Medium blog post
170
+ - Presentation slides + A1 poster
171
+ - Facebook post (100–200 words)
172
+
173
+ ## Ethics
174
+
175
+ - Converted files may contain personal data → no uploads of user input.
176
+ - Teacher bias propagates to student — documented in model card.
177
+ - Hallucination risk mitigated by automated format/schema validation at inference time.
@@ -0,0 +1,39 @@
1
+ datamorph/__init__.py,sha256=FiR_B8RTr22nnvGnkkoLpNMo7Vf8dI8yk1krtNE_Y1s,538
2
+ datamorph/cli.py,sha256=9Uza4tUeHKxoZR36JnBNYFC17kBV8roCledjkQE9Crs,3008
3
+ datamorph/convert.py,sha256=8EXURaOtyBaNv3HNMpCkT0oeqaYAjYMpixkQ16nkxE8,5757
4
+ datamorph/model.py,sha256=DeyFXK1EvKVlHbhH9dWbvO5pTp1Y8V9ti4YFgmFXsoo,2290
5
+ datamorph/data/__init__.py,sha256=93ZipyncNI2o9q6MlZrwlCznV18KJhPhHP-AA3nnv88,86
6
+ datamorph/data/collect.py,sha256=CXNBVsmjODsaXkpkFb6r4CtpgiF91QMeCzF_fViwusI,7918
7
+ datamorph/data/envelope.py,sha256=ahjwfmeFZIVzUuz6sKGRnRtpz8TNzZDwFdMV6NZBiQo,701
8
+ datamorph/data/sandbox.py,sha256=EZ3f_7-ftBlvQpwS034OH4zh3-hIjFZVcfmYOmTRIRo,3008
9
+ datamorph/data/teacher_script.py,sha256=ygzvTgauBr2CWZyBGBwr8fNaVefuEq74V9R0utgqNbI,3768
10
+ datamorph/data/generators/__init__.py,sha256=JnJlFLJLOiLEc4OnKqUF2OSBTJDmmy42T23J-8tHGxA,68
11
+ datamorph/data/generators/base.py,sha256=xB9nW6Pyy5YvHo-rgXzRgOXwWQy2Fwmn_jBHksqgHxI,1411
12
+ datamorph/data/generators/uc1_csv_to_json.py,sha256=VD_Xs5LaHW3Sr4FwBn3FOz3_ZiO8fpxMrGVqnncXSw4,2307
13
+ datamorph/data/generators/uc2_json_to_csv.py,sha256=gNsCdfKl7WrwgF4wrY09LpGqnx76kGKSif7KcdrEAg0,1905
14
+ datamorph/data/generators/uc3_txt_log_to_csv.py,sha256=veXTw9VYiAAzsM0Zz87fR2cdfLRI17kfL0MilfjjlDE,2069
15
+ datamorph/data/generators/uc4_csv_to_txt_report.py,sha256=nVJsZqEYXhp9VGijX3VwrxjWJPO7kTIMrOMSU7Dn0-A,2094
16
+ datamorph/data/generators/uc5_schema_migration.py,sha256=7N-6wE4CrNNvRxTtBUgpEgH2K1hXeBvmBDeSPNrk6l4,1517
17
+ datamorph/evaluation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
18
+ datamorph/evaluation/metrics.py,sha256=8LP0jLTsPEhZj2yAKgjF2Q8dgvoRlDIsM0JLGHARsfU,8848
19
+ datamorph/evaluation/output_cleanup.py,sha256=LSvakugnZ0ai7hm_cH_qh5lA3OWjvH_FkmBpH83f8FA,3118
20
+ datamorph/evaluation/runner.py,sha256=nwXcCqYFf3zxSW3QLVNZF-5UmIw5PSPh916e4M6kf5g,7094
21
+ datamorph/evaluation/teacher.py,sha256=oLGiDNt_aVyOy187tmmrID1NCWAG6m1oiGLDixniIuY,5817
22
+ datamorph/extractor/__init__.py,sha256=wQY0zZ9KhJe6awTr2t7mUyerh_czed1mmMeqcSSc9kE,381
23
+ datamorph/extractor/base.py,sha256=IG3VlYeCvk06qukLAfyrpwPz1baDIKgpV5RQ1jvPH5Y,799
24
+ datamorph/extractor/csv_extractor.py,sha256=9NlkP3fkqwO_LaU0ZcCPtPrNC0DVc1FJ7irB_1s3-b8,15746
25
+ datamorph/extractor/json_extractor.py,sha256=rC4COf4liDP9v6EUP3vq4exNLqRtb--9fHikDmeA6f4,14436
26
+ datamorph/extractor/json_walker.py,sha256=tqMKbznIQcNju7OxGyjKlWyXh1RJDHracfGO20xcEU8,8586
27
+ datamorph/extractor/sampler.py,sha256=4KRDyw1f9SYhzmT0PQrKEDnIU1Ow_Orn7SCJFNkL8bc,1818
28
+ datamorph/extractor/txt_extractor.py,sha256=j27vc9x0YFY5b2fpBwQNRmrsW2IA7fZ-9Nkn1-rdCqM,7177
29
+ datamorph/extractor/warning_rules.py,sha256=rXuBZcRhWz5gX6yW9ZGlZwV-liqyzw6Loi1JJSJ8Skk,16389
30
+ datamorph/features/__init__.py,sha256=a7vHQBqSGE6p_znP4RGk9fjU8ybajwXcJJzWsyNJevk,83
31
+ datamorph/features/format_pairs.py,sha256=ld7bVfAdGaytFpWMsqM9fMoc7-7AwtUgae2DIaFhDAE,1918
32
+ datamorph/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
33
+ datamorph/models/gemma_mlx.py,sha256=fLoIZRrSQJjmXa8slvXSAi2asujqn0e0RpYKcyCjumo,5526
34
+ datamorph/models/gemma_script_teacher.py,sha256=nUdbtL_RdoBkzoMd1jRW9oRvPTPcwm41ZtiISELAYW8,3783
35
+ data_morph_gemma-0.1.0.dist-info/METADATA,sha256=4Bj79gd_tMBq4lgpnENEIqMzVwXoN8TjkvElWyjtyC8,8011
36
+ data_morph_gemma-0.1.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
37
+ data_morph_gemma-0.1.0.dist-info/entry_points.txt,sha256=PU5Llylbez88gMUi7dbW2sz8lvsj_tJo7WAl_P5o3DI,49
38
+ data_morph_gemma-0.1.0.dist-info/licenses/LICENSE,sha256=G2sAIF-Pu_k-HeoCBWec9i6AWiufweabaY_1to6EkF0,1278
39
+ data_morph_gemma-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.30.1
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ datamorph = datamorph.cli:main
@@ -0,0 +1,25 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Thatt Bunnag
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
22
+
23
+ Note: this license covers the data-morph source code. The distilled model
24
+ weights are a derivative of Google's Gemma and are governed separately by the
25
+ Gemma Terms of Use (https://ai.google.dev/gemma/terms).
datamorph/__init__.py ADDED
@@ -0,0 +1,19 @@
1
+ """datamorph — distill file-format conversion into a small local model.
2
+
3
+ Public API:
4
+
5
+ from datamorph import convert_file, ConversionResult
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from importlib.metadata import PackageNotFoundError, version
11
+
12
+ from datamorph.convert import ConversionResult, convert_file
13
+
14
+ try:
15
+ __version__ = version("data-morph-gemma")
16
+ except PackageNotFoundError: # not installed (e.g. running from a source tree)
17
+ __version__ = "0.0.0+unknown"
18
+
19
+ __all__ = ["convert_file", "ConversionResult", "__version__"]
datamorph/cli.py ADDED
@@ -0,0 +1,84 @@
1
+ """Command-line interface for datamorph.
2
+
3
+ datamorph convert input.csv output.json
4
+ datamorph convert log.txt --output-format csv > out.csv
5
+ datamorph --version
6
+
7
+ Exit codes: 0 = converted and validated, 1 = ran but output failed validation,
8
+ 2 = usage / input error.
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ import argparse
14
+ import sys
15
+
16
+ from datamorph import __version__, convert_file
17
+
18
+ FORMATS = ("csv", "json", "txt")
19
+
20
+
21
+ def build_parser() -> argparse.ArgumentParser:
22
+ parser = argparse.ArgumentParser(
23
+ prog="datamorph",
24
+ description="Convert files between CSV, JSON, and TXT with the distilled student model.",
25
+ )
26
+ parser.add_argument("--version", action="version", version=f"datamorph {__version__}")
27
+ sub = parser.add_subparsers(dest="command")
28
+
29
+ conv = sub.add_parser("convert", help="convert an input file to another format")
30
+ conv.add_argument("input", help="path to the source file")
31
+ conv.add_argument(
32
+ "output",
33
+ nargs="?",
34
+ help="path to write (its extension sets the target format); "
35
+ "if omitted, the result is printed to stdout and --output-format is required",
36
+ )
37
+ conv.add_argument("--input-format", choices=FORMATS, help="override input format detection")
38
+ conv.add_argument("--output-format", choices=FORMATS, help="target format (required if no output path)")
39
+ conv.add_argument("--instruction", help="extra natural-language guidance for the conversion")
40
+ conv.add_argument("--max-retries", type=int, default=3, help="retries with error feedback (default 3)")
41
+ conv.add_argument("--model", help="local model path or HF repo id (default: the published model)")
42
+ conv.add_argument("-q", "--quiet", action="store_true", help="suppress the status line on stderr")
43
+ return parser
44
+
45
+
46
+ def main(argv: list[str] | None = None) -> int:
47
+ parser = build_parser()
48
+ args = parser.parse_args(argv)
49
+
50
+ if args.command != "convert":
51
+ parser.print_help(sys.stderr)
52
+ return 2
53
+
54
+ try:
55
+ result = convert_file(
56
+ args.input,
57
+ args.output,
58
+ input_format=args.input_format,
59
+ output_format=args.output_format,
60
+ instruction=args.instruction,
61
+ max_retries=args.max_retries,
62
+ model=args.model,
63
+ )
64
+ except (FileNotFoundError, ValueError) as exc:
65
+ print(f"error: {exc}", file=sys.stderr)
66
+ return 2
67
+
68
+ if args.output is None:
69
+ sys.stdout.write(result.output_text)
70
+
71
+ if not args.quiet:
72
+ where = str(result.output_path) if result.output_path else "stdout"
73
+ status = "ok" if result.accepted else f"NOT VALIDATED ({result.error or 'low score'})"
74
+ print(
75
+ f"datamorph: {result.input_format} -> {result.output_format} {status} "
76
+ f"(retries={result.retries}, scores={result.scores}) -> {where}",
77
+ file=sys.stderr,
78
+ )
79
+
80
+ return 0 if result.accepted else 1
81
+
82
+
83
+ if __name__ == "__main__": # pragma: no cover
84
+ sys.exit(main())
datamorph/convert.py ADDED
@@ -0,0 +1,146 @@
1
+ """Public API: convert a file between formats with the distilled student model.
2
+
3
+ ``convert_file`` runs the production pipeline — extract a metadata envelope, have
4
+ the student write a Python conversion script, run it in a sandbox, and validate the
5
+ output — retrying on failures up to ``max_retries``. The model never sees the full
6
+ source file, only its envelope.
7
+
8
+ from datamorph import convert_file
9
+ result = convert_file("contacts.csv", "contacts.json")
10
+ print(result.accepted, result.output_path)
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ from dataclasses import dataclass, field
16
+ from pathlib import Path
17
+ from typing import Any, Callable, Protocol
18
+
19
+ from datamorph.data.envelope import extract_envelope
20
+ from datamorph.data.sandbox import run_script
21
+ from datamorph.evaluation.metrics import format_validity, loadability
22
+ from datamorph.model import resolve_model
23
+
24
+ # Self-contained format <-> extension map (kept here so the inference path does
25
+ # not import the data-generation package, which pulls in faker).
26
+ EXT = {"csv": ".csv", "json": ".json", "txt": ".txt"}
27
+ _FMT_BY_EXT = {ext: fmt for fmt, ext in EXT.items()}
28
+ SUPPORTED_FORMATS = tuple(EXT)
29
+
30
+
31
+ class TeacherFn(Protocol):
32
+ """Signature of the script author (the student model, or a test stub)."""
33
+
34
+ def __call__(
35
+ self, envelope: dict[str, Any], instruction: str, output_format: str,
36
+ *, feedback: str | None = ...,
37
+ ) -> Any: ...
38
+
39
+
40
+ @dataclass
41
+ class ConversionResult:
42
+ """Outcome of a single ``convert_file`` call."""
43
+
44
+ output_text: str
45
+ input_format: str
46
+ output_format: str
47
+ script: str = ""
48
+ scores: dict[str, float] = field(default_factory=dict)
49
+ accepted: bool = False
50
+ retries: int = 0
51
+ error: str | None = None
52
+ output_path: Path | None = None
53
+
54
+
55
+ def _detect_format(path: Path, explicit: str | None, role: str) -> str:
56
+ fmt = explicit.lower() if explicit else _FMT_BY_EXT.get(path.suffix.lower())
57
+ if fmt not in EXT:
58
+ raise ValueError(
59
+ f"Unsupported or undetected {role} format for {path.name!r}; pass "
60
+ f"{role}_format=<one of {SUPPORTED_FORMATS}>."
61
+ )
62
+ return fmt
63
+
64
+
65
+ def _default_teacher_fn(model: str | None) -> Callable:
66
+ """Select the model and return the real student script author."""
67
+ from datamorph.models import gemma_mlx
68
+ from datamorph.models.gemma_script_teacher import call_gemma_script_teacher
69
+
70
+ gemma_mlx.use_model(resolve_model(model), text_only=True)
71
+ return call_gemma_script_teacher
72
+
73
+
74
+ def convert_file(
75
+ input_path: str | Path,
76
+ output_path: str | Path | None = None,
77
+ *,
78
+ input_format: str | None = None,
79
+ output_format: str | None = None,
80
+ instruction: str | None = None,
81
+ max_retries: int = 3,
82
+ model: str | None = None,
83
+ teacher_fn: TeacherFn | None = None,
84
+ ) -> ConversionResult:
85
+ """Convert ``input_path`` to the target format, optionally writing ``output_path``.
86
+
87
+ Formats are auto-detected from file extensions when not given explicitly. The
88
+ pipeline retries up to ``max_retries`` times with error feedback. ``teacher_fn``
89
+ can be injected to run the pipeline without the model (used in tests).
90
+ """
91
+ input_path = Path(input_path)
92
+ if not input_path.exists():
93
+ raise FileNotFoundError(f"input file not found: {input_path}")
94
+
95
+ in_fmt = _detect_format(input_path, input_format, "input")
96
+ if output_format:
97
+ out_fmt = output_format.lower()
98
+ if out_fmt not in EXT:
99
+ raise ValueError(
100
+ f"Unsupported output_format {output_format!r}; one of {SUPPORTED_FORMATS}."
101
+ )
102
+ elif output_path is not None:
103
+ out_fmt = _detect_format(Path(output_path), None, "output")
104
+ else:
105
+ raise ValueError("Provide output_format=, or an output_path with a known extension.")
106
+
107
+ if teacher_fn is None:
108
+ teacher_fn = _default_teacher_fn(model)
109
+
110
+ envelope = extract_envelope(input_path, in_fmt)
111
+ envelope.pop("file_path", None) # never leak local paths
112
+ instruction = instruction or f"Convert this {in_fmt.upper()} to {out_fmt.upper()}."
113
+ out_suffix = EXT[out_fmt]
114
+
115
+ feedback: str | None = None
116
+ result = ConversionResult("", in_fmt, out_fmt, error="not_run")
117
+ for attempt in range(max_retries + 1):
118
+ tr = teacher_fn(envelope, instruction, out_fmt, feedback=feedback)
119
+ if not tr.ok:
120
+ result = ConversionResult("", in_fmt, out_fmt, script=tr.script,
121
+ retries=attempt, error="no_script")
122
+ feedback = "Your previous response had no <script> block. Output one."
123
+ continue
124
+ sr = run_script(tr.script, input_path, output_suffix=out_suffix)
125
+ if not sr.ok:
126
+ result = ConversionResult(sr.output_text, in_fmt, out_fmt, script=tr.script,
127
+ retries=attempt, error=sr.error_kind)
128
+ feedback = f"The script failed ({sr.error_kind}): {sr.stderr[:300]}. Fix it."
129
+ continue
130
+ out = sr.output_text
131
+ scores = {
132
+ "format_validity": format_validity(out, out_fmt),
133
+ "loadability": loadability(out, out_fmt),
134
+ }
135
+ accepted = scores["format_validity"] == 1.0 and scores["loadability"] == 1.0
136
+ result = ConversionResult(out, in_fmt, out_fmt, script=tr.script, scores=scores,
137
+ accepted=accepted, retries=attempt, error=None)
138
+ if accepted:
139
+ break
140
+ feedback = f"Output was not valid {out_fmt.upper()} (scores={scores}). Fix the script."
141
+
142
+ if output_path is not None and result.output_text:
143
+ output_path = Path(output_path)
144
+ output_path.write_text(result.output_text, encoding="utf-8")
145
+ result.output_path = output_path
146
+ return result
@@ -0,0 +1 @@
1
+ """data-morph data-collection layer (synthetic generators, sandbox, orchestrator)."""