data-morph-gemma 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39) hide show
  1. data_morph_gemma-0.1.0.dist-info/METADATA +177 -0
  2. data_morph_gemma-0.1.0.dist-info/RECORD +39 -0
  3. data_morph_gemma-0.1.0.dist-info/WHEEL +4 -0
  4. data_morph_gemma-0.1.0.dist-info/entry_points.txt +2 -0
  5. data_morph_gemma-0.1.0.dist-info/licenses/LICENSE +25 -0
  6. datamorph/__init__.py +19 -0
  7. datamorph/cli.py +84 -0
  8. datamorph/convert.py +146 -0
  9. datamorph/data/__init__.py +1 -0
  10. datamorph/data/collect.py +221 -0
  11. datamorph/data/envelope.py +20 -0
  12. datamorph/data/generators/__init__.py +1 -0
  13. datamorph/data/generators/base.py +48 -0
  14. datamorph/data/generators/uc1_csv_to_json.py +64 -0
  15. datamorph/data/generators/uc2_json_to_csv.py +59 -0
  16. datamorph/data/generators/uc3_txt_log_to_csv.py +64 -0
  17. datamorph/data/generators/uc4_csv_to_txt_report.py +62 -0
  18. datamorph/data/generators/uc5_schema_migration.py +49 -0
  19. datamorph/data/sandbox.py +95 -0
  20. datamorph/data/teacher_script.py +114 -0
  21. datamorph/evaluation/__init__.py +0 -0
  22. datamorph/evaluation/metrics.py +264 -0
  23. datamorph/evaluation/output_cleanup.py +116 -0
  24. datamorph/evaluation/runner.py +218 -0
  25. datamorph/evaluation/teacher.py +193 -0
  26. datamorph/extractor/__init__.py +15 -0
  27. datamorph/extractor/base.py +26 -0
  28. datamorph/extractor/csv_extractor.py +515 -0
  29. datamorph/extractor/json_extractor.py +447 -0
  30. datamorph/extractor/json_walker.py +217 -0
  31. datamorph/extractor/sampler.py +68 -0
  32. datamorph/extractor/txt_extractor.py +199 -0
  33. datamorph/extractor/warning_rules.py +473 -0
  34. datamorph/features/__init__.py +1 -0
  35. datamorph/features/format_pairs.py +57 -0
  36. datamorph/model.py +63 -0
  37. datamorph/models/__init__.py +0 -0
  38. datamorph/models/gemma_mlx.py +163 -0
  39. datamorph/models/gemma_script_teacher.py +100 -0
@@ -0,0 +1,218 @@
1
+ """Orchestrates the baseline evaluation across every test case."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ import time
7
+ from dataclasses import dataclass, field
8
+ from pathlib import Path
9
+ from typing import Any
10
+
11
+ from .metrics import score_all
12
+ from .teacher import call_teacher
13
+
14
# File extension (leading dot included) for each supported format name.
EXT_BY_FORMAT = {fmt: f".{fmt}" for fmt in ("csv", "json", "txt")}
15
+
16
+
17
@dataclass
class CaseSpec:
    """A single test case: its directory, its metadata, and its file contents."""

    case_dir: Path  # directory the case was loaded from
    meta: dict  # decoded metadata for the case
    input_text: str  # text of the case's input file
    expected_text: str  # text of the case's expected-output file

    @property
    def case_id(self) -> str:
        """Return ``<parent directory name>/<case directory name>``."""
        group_name = self.case_dir.parent.name
        leaf_name = self.case_dir.name
        return "/".join((group_name, leaf_name))
27
+
28
+
29
@dataclass
class CaseResult:
    """Outcome of running one case: identifying fields plus scores and status."""

    # Identifying fields (required).
    case_id: str
    use_case: str
    complexity: str
    input_format: str
    output_format: str

    # Filled in after the run; each instance gets its own scores dict.
    scores: dict[str, float] = field(default_factory=dict)
    output_preview: str = ""
    ok: bool = False
    error: str | None = None
    elapsed_sec: float = 0.0
41
+
42
+
43
# Rank of each complexity prefix; prefixes not listed here sort after these.
_COMPLEXITY_ORDER = {"simple": 0, "medium": 1, "complex": 2}


def _case_sort_key(case_dir: Path) -> tuple:
    """Return the sort key ``(use-case dir name, complexity rank, case name)``.

    The complexity is the part of the case directory name before the first
    underscore. Ranking it explicitly keeps "medium" cases from sorting ahead
    of "simple" ones, as plain alphabetical order would.
    """
    leaf = case_dir.name
    prefix, _, _ = leaf.partition("_")
    rank = _COMPLEXITY_ORDER.get(prefix, 99)
    return (case_dir.parent.name, rank, leaf)
56
+
57
+
58
def discover_cases(test_root: Path) -> list[CaseSpec]:
    """Load every complete case found two directory levels below ``test_root``.

    A case directory is skipped when it has no ``meta.json`` or when its
    ``input<ext>`` / ``expected<ext>`` file is missing. The extensions come
    from the ``input_format`` and ``output_format`` entries of ``meta.json``.
    Cases are returned in ``_case_sort_key`` order.
    """
    candidates = [entry for entry in test_root.glob("*/*/") if entry.is_dir()]
    candidates.sort(key=_case_sort_key)

    found: list[CaseSpec] = []
    for case_dir in candidates:
        if not case_dir.is_dir():
            continue

        meta_file = case_dir / "meta.json"
        if not meta_file.exists():
            continue
        meta = json.loads(meta_file.read_text(encoding="utf-8"))

        input_suffix = EXT_BY_FORMAT[meta["input_format"]]
        expected_suffix = EXT_BY_FORMAT[meta["output_format"]]
        source_file = case_dir / ("input" + input_suffix)
        target_file = case_dir / ("expected" + expected_suffix)
        if not (source_file.exists() and target_file.exists()):
            continue

        spec = CaseSpec(
            case_dir=case_dir,
            meta=meta,
            input_text=source_file.read_text(encoding="utf-8"),
            expected_text=target_file.read_text(encoding="utf-8"),
        )
        found.append(spec)
    return found
86
+
87
+
88
def _teacher_meta_payload(
    teacher_result: Any, model: str, elapsed_sec: float
) -> dict[str, Any]:
    """Assemble the backend-specific dict that is stored in teacher_meta.json."""
    if model == "opus":
        return {
            "returncode": teacher_result.returncode,
            "stderr": teacher_result.stderr[:1000],
            "usage": teacher_result.raw_payload.get("usage"),
            "session_id": teacher_result.raw_payload.get("session_id"),
            "elapsed_sec": elapsed_sec,
        }
    # Every non-"opus" backend is described with the Gemma fields.
    gm = teacher_result.gemma_meta or {}
    return {
        "model_id": gm.get("model_id"),
        "n_prompt_tokens": gm.get("n_prompt_tokens"),
        "n_generated_tokens": gm.get("n_generated_tokens"),
        "tokens_per_sec": gm.get("tokens_per_sec"),
        "elapsed_sec": gm.get("elapsed_sec", elapsed_sec),
        "truncated": gm.get("truncated", False),
        "cleanup_applied": teacher_result.cleanup_applied,
        "raw_size_bytes": len(teacher_result.raw_output.encode("utf-8")),
        "cleaned_size_bytes": len(teacher_result.output.encode("utf-8")),
        "stderr": teacher_result.stderr[:1000] or None,
    }


def run_case(case: CaseSpec, outputs_dir: Path, model: str = "opus") -> CaseResult:
    """Run one case through the selected backend, persist artefacts, score it.

    Args:
        case: The case to run.
        outputs_dir: Root for artefacts; files are written under
            ``<outputs_dir>/<use-case dir name>/<case dir name>/``.
        model: Backend name, forwarded to ``call_teacher``.

    Returns:
        A ``CaseResult``. When the backend result is not ``ok``, all four
        scores are 0.0 and ``error`` holds up to 500 characters of stderr
        (or a fixed message when stderr is empty).
    """
    meta = case.meta
    result = CaseResult(
        case_id=case.case_id,
        use_case=meta["use_case"],
        complexity=meta["complexity"],
        input_format=meta["input_format"],
        output_format=meta["output_format"],
    )

    # perf_counter is monotonic, so the measured duration cannot be skewed
    # (or go negative) when the system wall clock is adjusted mid-call.
    started = time.perf_counter()
    teacher_result = call_teacher(
        input_text=case.input_text,
        input_format=meta["input_format"],
        output_format=meta["output_format"],
        prompt_hint=meta.get("prompt_hint", ""),
        model=model,
    )
    result.elapsed_sec = round(time.perf_counter() - started, 2)

    # Persist the raw teacher output even on failure — useful for error analysis.
    case_out_dir = outputs_dir / case.case_dir.parent.name / case.case_dir.name
    case_out_dir.mkdir(parents=True, exist_ok=True)
    out_ext = EXT_BY_FORMAT[meta["output_format"]]

    # Cleaned (or for Opus, unchanged) output — what the metrics score.
    (case_out_dir / f"actual{out_ext}").write_text(
        teacher_result.output, encoding="utf-8"
    )
    # For Gemma: also persist the raw pre-cleanup output for audit.
    if model == "gemma":
        (case_out_dir / f"raw_actual{out_ext}").write_text(
            teacher_result.raw_output, encoding="utf-8"
        )

    # Per-backend metadata; filename kept as teacher_meta.json for artefact parity.
    meta_payload = _teacher_meta_payload(teacher_result, model, result.elapsed_sec)
    (case_out_dir / "teacher_meta.json").write_text(
        json.dumps(meta_payload, indent=2),
        encoding="utf-8",
    )

    if not teacher_result.ok:
        result.ok = False
        result.error = teacher_result.stderr[:500] or "teacher returned empty output"
        result.scores = {
            "format_validity": 0.0,
            "schema_compliance": 0.0,
            "loadability": 0.0,
            "content_accuracy": 0.0,
        }
        return result

    result.ok = True
    result.output_preview = teacher_result.output[:200]
    result.scores = score_all(
        actual=teacher_result.output,
        expected=case.expected_text,
        output_format=meta["output_format"],
        required_substrings=meta.get("required_substrings"),
    )
    return result
172
+
173
+
174
def _grouped_means(results, metric_keys, group_of) -> dict[str, dict[str, Any]]:
    """Return per-group case counts and 3-decimal metric means.

    ``group_of`` maps a result to its group label. Each bucket holds ``"n"``
    followed by one entry per metric key.
    """
    buckets: dict[str, dict[str, Any]] = {}
    for res in results:
        bucket = buckets.setdefault(
            group_of(res), {"n": 0, **dict.fromkeys(metric_keys, 0.0)}
        )
        bucket["n"] += 1
        for key in metric_keys:
            bucket[key] += res.scores.get(key, 0.0)
    # Second pass: turn the accumulated sums into rounded means.
    for bucket in buckets.values():
        count = bucket["n"]
        for key in metric_keys:
            bucket[key] = round(bucket[key] / count, 3)
    return buckets


def aggregate(results: list[CaseResult]) -> dict[str, Any]:
    """Summarise case results overall, per use case, and per complexity.

    Returns an empty dict for an empty input. Otherwise the summary has the
    keys ``overall``, ``by_use_case``, ``by_complexity``, ``n_cases`` and
    ``n_inference_errors`` (the number of results whose ``ok`` is falsy).
    A metric missing from a result's scores counts as 0.0.
    """
    if not results:
        return {}

    metric_keys = [
        "format_validity",
        "schema_compliance",
        "loadability",
        "content_accuracy",
    ]
    total = len(results)

    overall = {}
    for key in metric_keys:
        metric_sum = sum(res.scores.get(key, 0.0) for res in results)
        overall[key] = round(metric_sum / total, 3)

    by_uc = _grouped_means(results, metric_keys, lambda res: res.use_case)
    by_complexity = _grouped_means(results, metric_keys, lambda res: res.complexity)
    failed = [res for res in results if not res.ok]

    return {
        "overall": overall,
        "by_use_case": by_uc,
        "by_complexity": by_complexity,
        "n_cases": total,
        "n_inference_errors": len(failed),
    }
@@ -0,0 +1,193 @@
1
+ """Inference backends for the W2 baseline pipeline.
2
+
3
+ `model="opus"` runs the original `claude -p --model opus` subprocess (W2 teacher).
4
+ `model="gemma"` runs Gemma 2 2B IT via MLX in-process (student baseline, pre-fine-tune).
5
+
6
+ Filename `teacher.py` is kept as a misnomer to avoid breaking existing imports;
7
+ the module now hosts both teacher and student inference paths.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import json
13
+ import subprocess
14
+ from dataclasses import dataclass, field
15
+ from pathlib import Path
16
+
17
# Third ancestor directory of this file's resolved path; used as the
# subprocess working directory and as the base for the skill file.
# NOTE(review): this assumes a source-checkout layout — confirm the skill file
# is reachable when the package is installed from a wheel.
PROJECT_ROOT = Path(__file__).resolve().parents[2]

# Skill file location, relative to PROJECT_ROOT.
SKILL_REL_PATH = "skills/file_conversion_teacher.md"
19
+
20
+
21
@dataclass
class TeacherResult:
    """What one backend inference call produced."""

    output: str  # text that gets scored (after cleanup on the Gemma path)
    raw_payload: dict  # backend-specific payload
    returncode: int  # 0 on success
    stderr: str  # error text, possibly empty
    raw_output: str = ""  # pre-cleanup; equals `output` for Opus path
    gemma_meta: dict | None = None  # Gemma-only inference metadata
    cleanup_applied: list[str] = field(default_factory=list)

    @property
    def ok(self) -> bool:
        """True only when the return code is 0 and the output is non-empty."""
        if self.returncode != 0:
            return False
        return bool(self.output)
34
+
35
+
36
def build_user_prompt(
    input_text: str,
    input_format: str,
    output_format: str,
    prompt_hint: str,
    reference_skill: bool,
) -> str:
    """Build the user-role message body for a conversion request.

    With ``reference_skill=True`` (Opus) the body opens with an instruction
    to read the skill file. With ``reference_skill=False`` (Gemma) that
    opener is left out: the caller prepends the skill text itself, because
    Gemma 2's chat template has no `system` role.
    """
    pieces: list[str] = []
    if reference_skill:
        pieces.append(
            f"Read the instructions in {SKILL_REL_PATH}, then follow them to convert "
            f"the input below.\n\n"
        )
    pieces.append(f"Conversion: {input_format.upper()} -> {output_format.upper()}\n")
    pieces.append(f"Task-specific notes: {prompt_hint}\n\n")
    pieces.append("Input (between the === markers):\n")
    pieces.append(f"===\n{input_text}\n===\n\n")
    pieces.append(
        "Output the converted file content only. The first character of your "
        "response must be the first character of the converted file. No prose, "
        "no code fences, no markdown."
    )
    return "".join(pieces)
66
+
67
+
68
def call_teacher(
    input_text: str,
    input_format: str,
    output_format: str,
    prompt_hint: str,
    timeout: int = 180,
    model: str = "opus",
) -> TeacherResult:
    """Dispatch a conversion request to the backend named by ``model``.

    ``timeout`` is passed to the Opus backend only.

    Raises:
        ValueError: if ``model`` is neither ``"opus"`` nor ``"gemma"``.
    """
    if model not in ("opus", "gemma"):
        raise ValueError(f"Unknown model: {model!r} (expected 'opus' or 'gemma')")
    if model == "opus":
        return _call_opus(input_text, input_format, output_format, prompt_hint, timeout)
    return _call_gemma(input_text, input_format, output_format, prompt_hint)
81
+
82
+
83
def _call_opus(
    input_text: str,
    input_format: str,
    output_format: str,
    prompt_hint: str,
    timeout: int,
) -> TeacherResult:
    """Run `claude -p --model opus` as a subprocess and wrap its result.

    Failures are reported through the returned ``TeacherResult`` rather than
    raised, so one bad case cannot abort a whole evaluation run:

    * a non-zero exit status keeps that status as ``returncode``;
    * a timeout, a failure to launch the executable, undecodable JSON, or a
      JSON payload that is not an object all yield ``returncode=-1`` with an
      explanatory ``stderr``.

    On success ``output`` is the payload's ``"result"`` value (empty string
    when absent or falsy) and ``raw_output`` is the same text.
    """
    prompt = build_user_prompt(
        input_text, input_format, output_format, prompt_hint, reference_skill=True
    )
    cmd = [
        "claude",
        "-p",
        prompt,
        "--model",
        "opus",
        "--output-format",
        "json",
        "--allowedTools",
        "Read",
    ]
    try:
        proc = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            cwd=str(PROJECT_ROOT),
            timeout=timeout,
            encoding="utf-8",
            errors="replace",
        )
    except subprocess.TimeoutExpired:
        return TeacherResult(
            output="",
            raw_payload={},
            returncode=-1,
            stderr=f"claude -p timed out after {timeout}s",
        )
    except OSError as e:
        # Raised when the process cannot be started at all, e.g. the
        # `claude` executable is missing or not executable.
        return TeacherResult(
            output="",
            raw_payload={},
            returncode=-1,
            stderr=f"Could not launch claude -p: {e}",
        )

    if proc.returncode != 0:
        return TeacherResult(
            output="",
            raw_payload={},
            returncode=proc.returncode,
            stderr=proc.stderr or "",
        )
    try:
        payload = json.loads(proc.stdout)
    except json.JSONDecodeError as e:
        return TeacherResult(
            output="",
            raw_payload={"decode_error": str(e), "stdout_head": proc.stdout[:500]},
            returncode=-1,
            stderr=f"Could not decode claude -p JSON output: {e}",
        )
    if not isinstance(payload, dict):
        # Valid JSON but not an object: `.get` below would fail on it.
        kind = type(payload).__name__
        return TeacherResult(
            output="",
            raw_payload={
                "decode_error": f"expected a JSON object, got {kind}",
                "stdout_head": proc.stdout[:500],
            },
            returncode=-1,
            stderr=f"Unexpected claude -p JSON output: expected an object, got {kind}",
        )
    output = payload.get("result", "") or ""
    return TeacherResult(
        output=output,
        raw_output=output,  # Opus output is not cleaned
        raw_payload=payload,
        returncode=0,
        stderr=proc.stderr or "",
    )
137
+
138
+
139
# Single-entry memo for the skill file; its text is stored under "text".
_SKILL_CACHE: dict[str, str] = {}


def _load_skill_text() -> str:
    """Return the skill file's text, reading it from disk on first use only."""
    try:
        return _SKILL_CACHE["text"]
    except KeyError:
        pass
    loaded = (PROJECT_ROOT / SKILL_REL_PATH).read_text(encoding="utf-8")
    _SKILL_CACHE["text"] = loaded
    return loaded
147
+
148
+
149
def _call_gemma(
    input_text: str,
    input_format: str,
    output_format: str,
    prompt_hint: str,
) -> TeacherResult:
    """Generate with Gemma via MLX, clean the text, and wrap the result.

    An exception raised by the generate call is turned into a failed
    ``TeacherResult`` (``returncode=-1``, the exception's repr in
    ``stderr``). Otherwise ``output`` is the cleaned text, ``raw_output``
    is the text as generated, and ``gemma_meta`` carries the generation
    statistics.
    """
    from datamorph.evaluation.output_cleanup import clean_model_output
    from datamorph.models.gemma_mlx import generate as mlx_generate

    skill_text = _load_skill_text()
    task_body = build_user_prompt(
        input_text, input_format, output_format, prompt_hint, reference_skill=False
    )
    # Gemma 2's chat template does not support a `system` role, so the skill
    # text travels inside the single user message, set apart from the task
    # instructions by a clear delimiter.
    chat = [
        {"role": "user", "content": "\n\n---\n\n".join((skill_text, task_body))}
    ]

    try:
        gen = mlx_generate(chat)
    except Exception as e:
        return TeacherResult(
            output="",
            raw_payload={},
            returncode=-1,
            stderr=f"gemma_mlx.generate raised: {e!r}",
        )

    cleaned, applied = clean_model_output(gen.text, output_format)
    payload = {"model_id": gen.model_id}
    stat_names = (
        "model_id",
        "n_prompt_tokens",
        "n_generated_tokens",
        "tokens_per_sec",
        "elapsed_sec",
        "truncated",
    )
    stats = {name: getattr(gen, name) for name in stat_names}
    return TeacherResult(
        output=cleaned,
        raw_output=gen.text,
        raw_payload=payload,
        returncode=0,
        stderr="",
        cleanup_applied=applied,
        gemma_meta=stats,
    )
@@ -0,0 +1,15 @@
1
+ """data-morph metadata extractors (CSV, JSON, TXT)."""
2
+
3
+ from .base import MetadataExtractor
4
+ from .csv_extractor import CSVExtractor
5
+ from .json_extractor import JSONExtractor
6
+ from .txt_extractor import TXTExtractor
7
+ from .warning_rules import MetadataWarning
8
+
9
+ __all__ = [
10
+ "CSVExtractor",
11
+ "JSONExtractor",
12
+ "TXTExtractor",
13
+ "MetadataExtractor",
14
+ "MetadataWarning",
15
+ ]
@@ -0,0 +1,26 @@
1
+ """Abstract base class for format-specific metadata extractors."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from abc import ABC, abstractmethod
6
+ from pathlib import Path
7
+ from typing import Any, ClassVar
8
+
9
+
10
class MetadataExtractor(ABC):
    """Contract that every format-specific extractor implements.

    A subclass produces a metadata dict in the shared envelope schema
    (see docs/superpowers/specs/2026-05-06-csv-metadata-extractor-design.md
    section 5.1).
    """

    SCHEMA_VERSION: ClassVar[str] = "0.1"

    @abstractmethod
    def extract(self, file_path: Path) -> dict[str, Any]:
        """Produce the shared-envelope metadata dict for ``file_path``."""

    @abstractmethod
    def supports(self, file_path: Path) -> bool:
        """Report whether this extractor is able to handle ``file_path``."""