PyPI - data-morph-gemma - Versions diffs - 0.1.0__tar.gz - Mend

data-morph-gemma 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (85) hide show

data_morph_gemma-0.1.0/.gitignore ADDED Viewed

@@ -0,0 +1,61 @@
+# Python
+__pycache__/
+*.py[cod]
+*$py.class
+*.so
+.Python
+*.egg-info/
+# Build artifacts
+dist/
+build/
+.pytest_cache/
+.mypy_cache/
+.ruff_cache/
+# Virtual environments
+.venv/
+venv/
+env/
+# Jupyter
+.ipynb_checkpoints/
+*.ipynb_checkpoints
+# IDEs
+.vscode/
+.idea/
+*.swp
+docs/
+# OS
+.DS_Store
+Thumbs.db
+# Data (don't commit large/raw data)
+data/raw/*
+data/processed/*
+data/interim/*
+!data/raw/.gitkeep
+!data/processed/.gitkeep
+!data/interim/.gitkeep
+# Models (often too large for git)
+models/*
+!models/.gitkeep
+# Secrets / env
+.env
+.env.*
+!.env.example
+# Claude Code project instructions (local)
+CLAUDE.md
+# Logs
+*.log
+logs/
+# MLflow / experiment tracking
+mlruns/
+wandb/

data_morph_gemma-0.1.0/LICENSE ADDED Viewed

@@ -0,0 +1,25 @@
+MIT License
+Copyright (c) 2026 Thatt Bunnag
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+Note: this license covers the data-morph source code. The distilled model
+weights are a derivative of Google's Gemma and are governed separately by the
+Gemma Terms of Use (https://ai.google.dev/gemma/terms).

data_morph_gemma-0.1.0/PKG-INFO ADDED Viewed

@@ -0,0 +1,177 @@
+Metadata-Version: 2.4
+Name: data-morph-gemma
+Version: 0.1.0
+Summary: Distill a CSV/JSON/TXT file-conversion capability from Claude Opus into a fine-tuned Gemma 2B (LoRA/QLoRA).
+Project-URL: Homepage, https://github.com/LoveMig6334/data-morph
+Project-URL: Repository, https://github.com/LoveMig6334/data-morph
+Project-URL: Model (Hugging Face), https://huggingface.co/Bunnana/data-morph-gemma-2b
+Author-email: Thatt Bunnag <tom.tom.thanet@gmail.com>
+License-Expression: MIT
+License-File: LICENSE
+Keywords: csv,file-conversion,gemma,json,knowledge-distillation,llm,mlx
+Classifier: Development Status :: 4 - Beta
+Classifier: Intended Audience :: Developers
+Classifier: Operating System :: MacOS
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+Classifier: Topic :: Utilities
+Requires-Python: >=3.12
+Requires-Dist: huggingface-hub>=0.30
+Requires-Dist: pandas>=3.0.2
+Provides-Extra: mlx
+Requires-Dist: mlx-lm>=0.31.3; extra == 'mlx'
+Requires-Dist: mlx-vlm>=0.5.0; extra == 'mlx'
+Description-Content-Type: text/markdown
+# data morph
+**Open Source File Data Migration with Fine-tuned Small Language Model**
+Knowledge distillation from a large-model agent (Claude Opus + Agent Skill) into a fine-tuned Gemma 2B, so developers can convert between file formats locally for free instead of paying for frontier-LLM API calls.
+AI Builders 2026 · Track: Agentic AI + NLP
+## Problem
+Rule-based parsers can't handle messy, context-dependent file conversions. Frontier LLMs can, but they're expensive at scale. This project distills that capability into a 2B-parameter model that runs locally.
+## Approach
+1. **Teacher**: Claude Opus + Claude Code + Agent Skill generates 500–1000 verified training pairs.
+2. **Student**: Gemma 2B, fine-tuned with LoRA / QLoRA.
+3. **Target**: ≥80% of teacher accuracy across 4 metrics — Format Validity, Schema Compliance, Loadability, Content Accuracy.
+### Pipeline architecture
+Conversion is a **five-stage pipeline**, not a single end-to-end model call.
+The model only ever sees a small structured metadata envelope, never the
+full source file:
+```
+[source file]
+    │
+    ├─→ [1. Metadata extractor]  deterministic — schema + samples + warnings
+    ├─→ [2. Context summarizer]  Gemma 2B base — short NL summary
+    ↓
+[3. Script generator]   Claude Opus (training) → Gemma 2B fine-tuned (inference)
+    ↓ outputs an executable Python script
+[4. Sandbox executor]   deterministic — runs the script
+    ↓ converted output file
+[5. Validator]          the 4 W2 metrics — format, schema, load, content
+    ↓
+[output file]
+```
+**Why this shape**: distillation target narrows from "transform a whole
+file" (impractical for a 2 B model) to "read metadata, write a script"
+(realistic). The model never sees full file content, so the pipeline scales
+to arbitrary file sizes. Failures are debuggable — the script is a readable
+intermediate artefact.
+### Status
+**W1–W6 complete; W7 model surgery done — a 2.0 GB single-file student is production-validated.**
+- **Data (W3):** 800 verified teacher pairs (100% accept), split into
+  `data/processed/{train,val,test}.jsonl` (650 / 80 / 70, content-disjoint).
+- **EDA (W4):** `notebook/w4_eda.ipynb` — training-readiness audit (balance,
+  leakage, sequence-length budget).
+- **Fine-tune (W5):** Gemma-4 E2B distilled via LoRA (`mlx_vlm.lora`, SFT) on the
+  envelope→script task. Best checkpoint (iter-400) selected by held-out eval.
+- **Eval (W6):** on the held-out 70-case test set, through the full pipeline
+  (envelope → script → sandbox → 4 metrics), the fine-tuned student reaches
+  **65/70 one-shot** and **68/70 (0.971) at production retry≤3** — already meeting the ≥80%-of-teacher target.
+- **Shrink (W7):** the multimodal base is mostly dead weight for this task. A
+  three-step surgery (`scripts/build_textonly_student.py` + `prune_vocab.py`) fuses the
+  adapter, strips the unused **vision + audio towers**, prunes the **262 k vocab → 16 k**
+  (the corpus uses ~4.5 k tokens; the vocab indexes the two biggest tensors), then
+  re-quantizes — all on a pure `gemma4_text` model loaded via `mlx_lm`:
+  | Artifact | params | size | retry≤3 | % teacher |
+  |---|---:|---:|---:|---:|
+  | fine-tuned bf16 (runtime adapter) | 5.12 B | 9.6 GB | — | — |
+  | *prior 8-bit (full model)* | 5.1 B | 5.5 GB | 68/70 | ~97% |
+  | fused + text-only + vocab-16k, bf16 | 2.05 B | 3.8 GB | **69/70 (0.986)** | ~99% |
+  | **+ 8-bit (final ship artifact)** | **2.05 B** | **2.0 GB** | **67/70 (0.957)** | **~96%** |
+  **9.6 GB → 2.0 GB (−79%)** with accuracy still well above the **≥80%-of-teacher**
+  target on every metric. Each cut is either lossless by construction (strip/prune, guarded by
+  a tokenizer round-trip verification gate) or incurs a small, retry-recoverable numerical cost.
+**Next (W7 deployment):** push the 2.0 GB model to Hugging Face Hub with a model card, then
+ship the `pip`-installable pipeline wrapper. See `docs/progression.md` for the live tracker.
+## Supported formats
+CSV, JSON, TXT — in 5 use cases (CSV→JSON nested, JSON→CSV flattening, TXT log→CSV, CSV→TXT report, schema migration).
+## Setup
+Requires **Python 3.12** (chosen for stronger MLX support). Project is
+managed by [`uv`](https://docs.astral.sh/uv/).
+```bash
+uv sync                        # creates .venv from pyproject.toml + uv.lock
+source .venv/bin/activate      # macOS / Linux
+# .venv\Scripts\activate       # Windows
+```
+Add a new dependency: `uv add <pkg>` (or `uv add --dev <pkg>` for dev-only).
+## Hardware / framework
+- **Primary target**: MacBook Pro M5 Max (40 GPU cores, 120 GB unified memory) with **MLX**.
+- **Fallback**: Google Colab + PyTorch + Unsloth (used when MLX is unavailable, e.g. on Windows).
+## Repo structure
+```
+data/
+  raw/          # synthetic corpus from seeded generators (regenerable, gitignored)
+  interim/      # verified teacher pairs (envelope + analysis + script + scores)
+  processed/    # train/val/test chat JSONL for fine-tuning
+  test_set/     # 15 hand-crafted W2 baseline cases
+notebook/       # EDA (w4_eda), fine-tune scaffold (w5_finetune), experiments
+src/
+  extractor/    # Stage 1: deterministic metadata extractor — CSV, JSON, TXT (done)
+  evaluation/   # Stage 5: the 4 W2 metrics + Opus-baseline runner (DO NOT EDIT)
+  data/         # generators (oracle), sandbox (Stage 4), teacher_script + collect (Stage 3)
+  features/     # format_pairs: verified pairs → chat JSONL + disjoint split
+  models/       # LoRA/QLoRA fine-tune + inference (W5)
+scripts/        # generate_corpus, collect_pairs, collect_all_parallel, build_dataset, baseline, plotting
+skills/         # Agent-Skill prompts read by `claude -p` (file conversion + script generation)
+tests/          # unit tests (metrics, extractor, data, features) + fixtures
+models/         # Gemma-4 E2B (local, gitignored) + fine-tuned checkpoints
+results/        # baseline run artefacts (per-run summary.json + plots)
+docs/           # specs, plans, weekly reports (gitignored)
+```
+## Timeline (8 weeks)
+| Week | Focus | Points |
+|------|-------|-------:|
+| 1 | Problem statement + use cases | 15 |
+| 2 | Metrics + Claude Opus baseline | 15 |
+| 3 | Teacher-generated training pairs | 15 |
+| 4 | EDA + data cleaning | 20 |
+| 5 | Fine-tune Gemma 2B (LoRA) | — |
+| 6 | Evaluation + error analysis | 20 |
+| 7 | Deployment (pip + HF Hub) | 15 |
+| 8 | Blog, slides, poster | — |
+| | **Total** | **100** (≥70 to pass) |
+## Deliverables
+- GitHub repo (this one)
+- Hugging Face Hub model + model card
+- `pip install`-able Python package
+- Medium blog post
+- Presentation slides + A1 poster
+- Facebook post (100–200 words)
+## Ethics
+- Converted files may contain personal data → no uploads of user input.
+- Teacher bias propagates to student — documented in model card.
+- Hallucination risk mitigated by automated format/schema validation at inference time.

data_morph_gemma-0.1.0/README.md ADDED Viewed

@@ -0,0 +1,151 @@
+# data morph
+**Open Source File Data Migration with Fine-tuned Small Language Model**
+Knowledge distillation from a large-model agent (Claude Opus + Agent Skill) into a fine-tuned Gemma 2B, so developers can convert between file formats locally for free instead of paying for frontier-LLM API calls.
+AI Builders 2026 · Track: Agentic AI + NLP
+## Problem
+Rule-based parsers can't handle messy, context-dependent file conversions. Frontier LLMs can, but they're expensive at scale. This project distills that capability into a 2B-parameter model that runs locally.
+## Approach
+1. **Teacher**: Claude Opus + Claude Code + Agent Skill generates 500–1000 verified training pairs.
+2. **Student**: Gemma 2B, fine-tuned with LoRA / QLoRA.
+3. **Target**: ≥80% of teacher accuracy across 4 metrics — Format Validity, Schema Compliance, Loadability, Content Accuracy.
+### Pipeline architecture
+Conversion is a **five-stage pipeline**, not a single end-to-end model call.
+The model only ever sees a small structured metadata envelope, never the
+full source file:
+```
+[source file]
+    │
+    ├─→ [1. Metadata extractor]  deterministic — schema + samples + warnings
+    ├─→ [2. Context summarizer]  Gemma 2B base — short NL summary
+    ↓
+[3. Script generator]   Claude Opus (training) → Gemma 2B fine-tuned (inference)
+    ↓ outputs an executable Python script
+[4. Sandbox executor]   deterministic — runs the script
+    ↓ converted output file
+[5. Validator]          the 4 W2 metrics — format, schema, load, content
+    ↓
+[output file]
+```
+**Why this shape**: distillation target narrows from "transform a whole
+file" (impractical for a 2 B model) to "read metadata, write a script"
+(realistic). The model never sees full file content, so the pipeline scales
+to arbitrary file sizes. Failures are debuggable — the script is a readable
+intermediate artefact.
+### Status
+**W1–W6 complete; W7 model surgery done — a 2.0 GB single-file student is production-validated.**
+- **Data (W3):** 800 verified teacher pairs (100% accept), split into
+  `data/processed/{train,val,test}.jsonl` (650 / 80 / 70, content-disjoint).
+- **EDA (W4):** `notebook/w4_eda.ipynb` — training-readiness audit (balance,
+  leakage, sequence-length budget).
+- **Fine-tune (W5):** Gemma-4 E2B distilled via LoRA (`mlx_vlm.lora`, SFT) on the
+  envelope→script task. Best checkpoint (iter-400) selected by held-out eval.
+- **Eval (W6):** on the held-out 70-case test set, through the full pipeline
+  (envelope → script → sandbox → 4 metrics), the fine-tuned student reaches
+  **65/70 one-shot** and **68/70 (0.971) at production retry≤3** — already meeting the ≥80%-of-teacher target.
+- **Shrink (W7):** the multimodal base is mostly dead weight for this task. A
+  three-step surgery (`scripts/build_textonly_student.py` + `prune_vocab.py`) fuses the
+  adapter, strips the unused **vision + audio towers**, prunes the **262 k vocab → 16 k**
+  (the corpus uses ~4.5 k tokens; the vocab indexes the two biggest tensors), then
+  re-quantizes — all on a pure `gemma4_text` model loaded via `mlx_lm`:
+  | Artifact | params | size | retry≤3 | % teacher |
+  |---|---:|---:|---:|---:|
+  | fine-tuned bf16 (runtime adapter) | 5.12 B | 9.6 GB | — | — |
+  | *prior 8-bit (full model)* | 5.1 B | 5.5 GB | 68/70 | ~97% |
+  | fused + text-only + vocab-16k, bf16 | 2.05 B | 3.8 GB | **69/70 (0.986)** | ~99% |
+  | **+ 8-bit (final ship artifact)** | **2.05 B** | **2.0 GB** | **67/70 (0.957)** | **~96%** |
+  **9.6 GB → 2.0 GB (−79%)** with accuracy still well above the **≥80%-of-teacher**
+  target on every metric. Each cut is either lossless by construction (strip/prune, guarded by
+  a tokenizer round-trip verification gate) or incurs a small, retry-recoverable numerical cost.
+**Next (W7 deployment):** push the 2.0 GB model to Hugging Face Hub with a model card, then
+ship the `pip`-installable pipeline wrapper. See `docs/progression.md` for the live tracker.
+## Supported formats
+CSV, JSON, TXT — in 5 use cases (CSV→JSON nested, JSON→CSV flattening, TXT log→CSV, CSV→TXT report, schema migration).
+## Setup
+Requires **Python 3.12** (chosen for stronger MLX support). Project is
+managed by [`uv`](https://docs.astral.sh/uv/).
+```bash
+uv sync                        # creates .venv from pyproject.toml + uv.lock
+source .venv/bin/activate      # macOS / Linux
+# .venv\Scripts\activate       # Windows
+```
+Add a new dependency: `uv add <pkg>` (or `uv add --dev <pkg>` for dev-only).
+## Hardware / framework
+- **Primary target**: MacBook Pro M5 Max (40 GPU cores, 120 GB unified memory) with **MLX**.
+- **Fallback**: Google Colab + PyTorch + Unsloth (used when MLX is unavailable, e.g. on Windows).
+## Repo structure
+```
+data/
+  raw/          # synthetic corpus from seeded generators (regenerable, gitignored)
+  interim/      # verified teacher pairs (envelope + analysis + script + scores)
+  processed/    # train/val/test chat JSONL for fine-tuning
+  test_set/     # 15 hand-crafted W2 baseline cases
+notebook/       # EDA (w4_eda), fine-tune scaffold (w5_finetune), experiments
+src/
+  extractor/    # Stage 1: deterministic metadata extractor — CSV, JSON, TXT (done)
+  evaluation/   # Stage 5: the 4 W2 metrics + Opus-baseline runner (DO NOT EDIT)
+  data/         # generators (oracle), sandbox (Stage 4), teacher_script + collect (Stage 3)
+  features/     # format_pairs: verified pairs → chat JSONL + disjoint split
+  models/       # LoRA/QLoRA fine-tune + inference (W5)
+scripts/        # generate_corpus, collect_pairs, collect_all_parallel, build_dataset, baseline, plotting
+skills/         # Agent-Skill prompts read by `claude -p` (file conversion + script generation)
+tests/          # unit tests (metrics, extractor, data, features) + fixtures
+models/         # Gemma-4 E2B (local, gitignored) + fine-tuned checkpoints
+results/        # baseline run artefacts (per-run summary.json + plots)
+docs/           # specs, plans, weekly reports (gitignored)
+```
+## Timeline (8 weeks)
+| Week | Focus | Points |
+|------|-------|-------:|
+| 1 | Problem statement + use cases | 15 |
+| 2 | Metrics + Claude Opus baseline | 15 |
+| 3 | Teacher-generated training pairs | 15 |
+| 4 | EDA + data cleaning | 20 |
+| 5 | Fine-tune Gemma 2B (LoRA) | — |
+| 6 | Evaluation + error analysis | 20 |
+| 7 | Deployment (pip + HF Hub) | 15 |
+| 8 | Blog, slides, poster | — |
+| | **Total** | **100** (≥70 to pass) |
+## Deliverables
+- GitHub repo (this one)
+- Hugging Face Hub model + model card
+- `pip install`-able Python package
+- Medium blog post
+- Presentation slides + A1 poster
+- Facebook post (100–200 words)
+## Ethics
+- Converted files may contain personal data → no uploads of user input.
+- Teacher bias propagates to student — documented in model card.
+- Hallucination risk mitigated by automated format/schema validation at inference time.

data_morph_gemma-0.1.0/datamorph/__init__.py ADDED Viewed

@@ -0,0 +1,19 @@
+"""datamorph — distill file-format conversion into a small local model.
+Public API:
+    from datamorph import convert_file, ConversionResult
+"""
+from __future__ import annotations
+from importlib.metadata import PackageNotFoundError, version
+from datamorph.convert import ConversionResult, convert_file
+# Resolve the version of the installed "data-morph-gemma" distribution from its metadata.
+try:
+    __version__ = version("data-morph-gemma")
+except PackageNotFoundError:  # not installed (e.g. running from a source tree)
+    # Placeholder so ``__version__`` is always defined, even without package metadata.
+    __version__ = "0.0.0+unknown"
+# Names re-exported as the package's public API.
+__all__ = ["convert_file", "ConversionResult", "__version__"]

data_morph_gemma-0.1.0/datamorph/cli.py ADDED Viewed

@@ -0,0 +1,84 @@
+"""Command-line interface for datamorph.
+    datamorph convert input.csv output.json
+    datamorph convert log.txt --output-format csv > out.csv
+    datamorph --version
+Exit codes: 0 = converted and validated, 1 = ran but output failed validation,
+2 = usage / input error.
+"""
+from __future__ import annotations
+import argparse
+import sys
+from datamorph import __version__, convert_file
+FORMATS = ("csv", "json", "txt")
+def build_parser() -> argparse.ArgumentParser:
+    """Create the top-level ``datamorph`` parser and its ``convert`` subcommand."""
+    root = argparse.ArgumentParser(
+        description="Convert files between CSV, JSON, and TXT with the distilled student model.",
+        prog="datamorph",
+    )
+    root.add_argument("--version", action="version", version=f"datamorph {__version__}")
+    commands = root.add_subparsers(dest="command")
+    convert_cmd = commands.add_parser("convert", help="convert an input file to another format")
+    # Positional arguments: the source path, then an optional destination path.
+    convert_cmd.add_argument("input", help="path to the source file")
+    output_help = (
+        "path to write (its extension sets the target format); "
+        "if omitted, the result is printed to stdout and --output-format is required"
+    )
+    convert_cmd.add_argument("output", nargs="?", help=output_help)
+    # Optional flags, registered in the same order the help text lists them.
+    flag_specs = (
+        (("--input-format",), {"choices": FORMATS, "help": "override input format detection"}),
+        (("--output-format",), {"choices": FORMATS, "help": "target format (required if no output path)"}),
+        (("--instruction",), {"help": "extra natural-language guidance for the conversion"}),
+        (("--max-retries",), {"type": int, "default": 3, "help": "retries with error feedback (default 3)"}),
+        (("--model",), {"help": "local model path or HF repo id (default: the published model)"}),
+        (("-q", "--quiet"), {"action": "store_true", "help": "suppress the status line on stderr"}),
+    )
+    for flags, options in flag_specs:
+        convert_cmd.add_argument(*flags, **options)
+    return root
+def main(argv: list[str] | None = None) -> int:
+    """Run the ``datamorph`` CLI and return the process exit code.
+    Args:
+        argv: Arguments to parse; ``None`` lets argparse fall back to ``sys.argv[1:]``.
+    Returns:
+        0 if the conversion result was accepted, 1 if a result was produced but not
+        accepted, 2 if no ``convert`` subcommand was given or the conversion raised
+        an input / filesystem error.
+    """
+    parser = build_parser()
+    args = parser.parse_args(argv)
+    if args.command != "convert":
+        # No (or an unknown) subcommand: show usage on stderr and signal a usage error.
+        parser.print_help(sys.stderr)
+        return 2
+    try:
+        result = convert_file(
+            args.input,
+            args.output,
+            input_format=args.input_format,
+            output_format=args.output_format,
+            instruction=args.instruction,
+            max_retries=args.max_retries,
+            model=args.model,
+        )
+    except (OSError, ValueError) as exc:
+        # OSError is the parent of FileNotFoundError and also covers e.g. a directory
+        # passed as input, a permission error, or an unwritable output path, so those
+        # exit with the documented code 2 instead of an uncaught traceback.
+        print(f"error: {exc}", file=sys.stderr)
+        return 2
+    if args.output is None:
+        # No destination path: the converted text goes to stdout.
+        sys.stdout.write(result.output_text)
+    if not args.quiet:
+        # The status line goes to stderr so it never mixes with converted output on stdout.
+        where = str(result.output_path) if result.output_path else "stdout"
+        status = "ok" if result.accepted else f"NOT VALIDATED ({result.error or 'low score'})"
+        print(
+            f"datamorph: {result.input_format} -> {result.output_format} {status} "
+            f"(retries={result.retries}, scores={result.scores}) -> {where}",
+            file=sys.stderr,
+        )
+    return 0 if result.accepted else 1
+if __name__ == "__main__":  # pragma: no cover
+    sys.exit(main())

data_morph_gemma-0.1.0/datamorph/convert.py ADDED Viewed

@@ -0,0 +1,146 @@
+"""Public API: convert a file between formats with the distilled student model.
+``convert_file`` runs the production pipeline — extract a metadata envelope, have
+the student write a Python conversion script, run it in a sandbox, and validate the
+output — retrying on failures up to ``max_retries``. The model never sees the full
+source file, only its envelope.
+    from datamorph import convert_file
+    result = convert_file("contacts.csv", "contacts.json")
+    print(result.accepted, result.output_path)
+"""
+from __future__ import annotations
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Any, Callable, Protocol
+from datamorph.data.envelope import extract_envelope
+from datamorph.data.sandbox import run_script
+from datamorph.evaluation.metrics import format_validity, loadability
+from datamorph.model import resolve_model
+# Self-contained format <-> extension map (kept here so the inference path does
+# not import the data-generation package, which pulls in faker).
+EXT = {"csv": ".csv", "json": ".json", "txt": ".txt"}
+# Reverse lookup: file extension (with its leading dot) -> format name.
+_FMT_BY_EXT = {ext: fmt for fmt, ext in EXT.items()}
+# Format names in EXT's insertion order; quoted in error messages.
+SUPPORTED_FORMATS = tuple(EXT)
+class TeacherFn(Protocol):
+    """Signature of the script author (the student model, or a test stub).
+    Called positionally with the metadata envelope, the instruction text, and the
+    target format name, plus keyword-only ``feedback`` describing the previous
+    failed attempt (``None`` on the first attempt). ``convert_file`` reads the
+    ``ok`` and ``script`` attributes of the returned object.
+    """
+    def __call__(
+        self, envelope: dict[str, Any], instruction: str, output_format: str,
+        *, feedback: str | None = ...,
+    ) -> Any: ...
+@dataclass
+class ConversionResult:
+    """Outcome of a single ``convert_file`` call."""
+    # Text produced by the last sandbox run ("" when no script was run).
+    output_text: str
+    # Source format name, e.g. "csv".
+    input_format: str
+    # Target format name, e.g. "json".
+    output_format: str
+    # Script text returned by the script author on the last attempt.
+    script: str = ""
+    # Validation scores; ``convert_file`` fills "format_validity" and "loadability".
+    scores: dict[str, float] = field(default_factory=dict)
+    # True only when both scores equal 1.0.
+    accepted: bool = False
+    # Zero-based index of the attempt that produced this result.
+    retries: int = 0
+    # Failure tag ("not_run", "no_script", or the sandbox error kind); None once a script ran.
+    error: str | None = None
+    # Set by ``convert_file`` after it writes ``output_text`` to disk; otherwise None.
+    output_path: Path | None = None
+def _detect_format(path: Path, explicit: str | None, role: str) -> str:
+    """Return the format name for ``path``, preferring ``explicit`` when it is given.
+    Without an explicit value the format is looked up from the file extension.
+    ``role`` ("input" / "output") only shapes the error message. Raises
+    ``ValueError`` when the resulting format is missing or unsupported.
+    """
+    if explicit:
+        candidate = explicit.lower()
+    else:
+        candidate = _FMT_BY_EXT.get(path.suffix.lower())
+    if candidate in EXT:
+        return candidate
+    raise ValueError(
+        f"Unsupported or undetected {role} format for {path.name!r}; pass "
+        f"{role}_format=<one of {SUPPORTED_FORMATS}>."
+    )
+def _default_teacher_fn(model: str | None) -> Callable:
+    """Select the model and return the real student script author."""
+    # Imported inside the function so the model modules load only when no
+    # ``teacher_fn`` is injected.
+    from datamorph.models import gemma_mlx
+    from datamorph.models.gemma_script_teacher import call_gemma_script_teacher
+    # ``resolve_model`` maps the optional ``model`` argument to the value handed to
+    # ``use_model``. NOTE(review): presumably ``None`` selects the published default
+    # model — confirm in ``datamorph.model``.
+    gemma_mlx.use_model(resolve_model(model), text_only=True)
+    return call_gemma_script_teacher
+def convert_file(
+    input_path: str | Path,
+    output_path: str | Path | None = None,
+    *,
+    input_format: str | None = None,
+    output_format: str | None = None,
+    instruction: str | None = None,
+    max_retries: int = 3,
+    model: str | None = None,
+    teacher_fn: TeacherFn | None = None,
+) -> ConversionResult:
+    """Convert ``input_path`` to the target format, optionally writing ``output_path``.
+    Formats are auto-detected from file extensions when not given explicitly. The
+    pipeline retries up to ``max_retries`` times with error feedback. ``teacher_fn``
+    can be injected to run the pipeline without the model (used in tests).
+    Args:
+        input_path: Source file; must exist.
+        output_path: Destination file. When given and the last attempt produced
+            non-empty text, that text is written there even if it was not accepted.
+        input_format: Overrides detection from the input file extension.
+        output_format: Target format; required when ``output_path`` is omitted.
+        instruction: Extra guidance; defaults to a generic "Convert this X to Y." prompt.
+        max_retries: Extra attempts after the first one; must be >= 0.
+        model: Passed to ``resolve_model`` when no ``teacher_fn`` is injected.
+        teacher_fn: Script author to use instead of the default student model.
+    Returns:
+        The ``ConversionResult`` of the last attempt (the accepted one, if any).
+    Raises:
+        FileNotFoundError: ``input_path`` does not exist.
+        ValueError: ``max_retries`` is negative, or a format is unsupported or
+            cannot be determined.
+    """
+    if max_retries < 0:
+        # A negative budget would make the attempt loop empty and silently return the
+        # "not_run" placeholder without ever running the pipeline; reject it up front.
+        raise ValueError(f"max_retries must be >= 0, got {max_retries}.")
+    input_path = Path(input_path)
+    if not input_path.exists():
+        raise FileNotFoundError(f"input file not found: {input_path}")
+    in_fmt = _detect_format(input_path, input_format, "input")
+    # An explicit output_format wins over the output path's extension.
+    if output_format:
+        out_fmt = output_format.lower()
+        if out_fmt not in EXT:
+            raise ValueError(
+                f"Unsupported output_format {output_format!r}; one of {SUPPORTED_FORMATS}."
+            )
+    elif output_path is not None:
+        out_fmt = _detect_format(Path(output_path), None, "output")
+    else:
+        raise ValueError("Provide output_format=, or an output_path with a known extension.")
+    # Resolve the model only after the arguments are validated.
+    if teacher_fn is None:
+        teacher_fn = _default_teacher_fn(model)
+    envelope = extract_envelope(input_path, in_fmt)
+    envelope.pop("file_path", None)  # never leak local paths
+    instruction = instruction or f"Convert this {in_fmt.upper()} to {out_fmt.upper()}."
+    out_suffix = EXT[out_fmt]
+    feedback: str | None = None
+    result = ConversionResult("", in_fmt, out_fmt, error="not_run")
+    # One initial attempt plus up to ``max_retries`` retries; each failure sets
+    # ``feedback`` for the next call to the script author.
+    for attempt in range(max_retries + 1):
+        tr = teacher_fn(envelope, instruction, out_fmt, feedback=feedback)
+        if not tr.ok:
+            result = ConversionResult("", in_fmt, out_fmt, script=tr.script,
+                                      retries=attempt, error="no_script")
+            feedback = "Your previous response had no <script> block. Output one."
+            continue
+        sr = run_script(tr.script, input_path, output_suffix=out_suffix)
+        if not sr.ok:
+            result = ConversionResult(sr.output_text, in_fmt, out_fmt, script=tr.script,
+                                      retries=attempt, error=sr.error_kind)
+            # Only the first 300 characters of stderr are fed back.
+            feedback = f"The script failed ({sr.error_kind}): {sr.stderr[:300]}. Fix it."
+            continue
+        out = sr.output_text
+        scores = {
+            "format_validity": format_validity(out, out_fmt),
+            "loadability": loadability(out, out_fmt),
+        }
+        # Accept only when both checks score a full 1.0.
+        accepted = scores["format_validity"] == 1.0 and scores["loadability"] == 1.0
+        result = ConversionResult(out, in_fmt, out_fmt, script=tr.script, scores=scores,
+                                  accepted=accepted, retries=attempt, error=None)
+        if accepted:
+            break
+        feedback = f"Output was not valid {out_fmt.upper()} (scores={scores}). Fix the script."
+    # Write whatever the last attempt produced, accepted or not, as long as it is non-empty.
+    if output_path is not None and result.output_text:
+        output_path = Path(output_path)
+        output_path.write_text(result.output_text, encoding="utf-8")
+        result.output_path = output_path
+    return result

data_morph_gemma-0.1.0/datamorph/data/__init__.py ADDED Viewed

@@ -0,0 +1 @@
+"""data-morph data-collection layer (synthetic generators, sandbox, orchestrator)."""