data-morph-gemma 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39) hide show
  1. data_morph_gemma-0.1.0.dist-info/METADATA +177 -0
  2. data_morph_gemma-0.1.0.dist-info/RECORD +39 -0
  3. data_morph_gemma-0.1.0.dist-info/WHEEL +4 -0
  4. data_morph_gemma-0.1.0.dist-info/entry_points.txt +2 -0
  5. data_morph_gemma-0.1.0.dist-info/licenses/LICENSE +25 -0
  6. datamorph/__init__.py +19 -0
  7. datamorph/cli.py +84 -0
  8. datamorph/convert.py +146 -0
  9. datamorph/data/__init__.py +1 -0
  10. datamorph/data/collect.py +221 -0
  11. datamorph/data/envelope.py +20 -0
  12. datamorph/data/generators/__init__.py +1 -0
  13. datamorph/data/generators/base.py +48 -0
  14. datamorph/data/generators/uc1_csv_to_json.py +64 -0
  15. datamorph/data/generators/uc2_json_to_csv.py +59 -0
  16. datamorph/data/generators/uc3_txt_log_to_csv.py +64 -0
  17. datamorph/data/generators/uc4_csv_to_txt_report.py +62 -0
  18. datamorph/data/generators/uc5_schema_migration.py +49 -0
  19. datamorph/data/sandbox.py +95 -0
  20. datamorph/data/teacher_script.py +114 -0
  21. datamorph/evaluation/__init__.py +0 -0
  22. datamorph/evaluation/metrics.py +264 -0
  23. datamorph/evaluation/output_cleanup.py +116 -0
  24. datamorph/evaluation/runner.py +218 -0
  25. datamorph/evaluation/teacher.py +193 -0
  26. datamorph/extractor/__init__.py +15 -0
  27. datamorph/extractor/base.py +26 -0
  28. datamorph/extractor/csv_extractor.py +515 -0
  29. datamorph/extractor/json_extractor.py +447 -0
  30. datamorph/extractor/json_walker.py +217 -0
  31. datamorph/extractor/sampler.py +68 -0
  32. datamorph/extractor/txt_extractor.py +199 -0
  33. datamorph/extractor/warning_rules.py +473 -0
  34. datamorph/features/__init__.py +1 -0
  35. datamorph/features/format_pairs.py +57 -0
  36. datamorph/model.py +63 -0
  37. datamorph/models/__init__.py +0 -0
  38. datamorph/models/gemma_mlx.py +163 -0
  39. datamorph/models/gemma_script_teacher.py +100 -0
@@ -0,0 +1,114 @@
1
+ """Stage 3 — Claude Opus writes a conversion script from a metadata envelope.
2
+
3
+ Mirrors datamorph/evaluation/teacher.py::_call_opus (same `claude -p` invocation), but
4
+ the model returns <analysis> + <script> rather than a converted file. The live
5
+ call is exercised only by opt-in tests; parsing/prompt building are pure.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import json
11
+ import re
12
+ import subprocess
13
+ from dataclasses import dataclass
14
+ from pathlib import Path
15
+ from typing import Any
16
+
17
+ PROJECT_ROOT = Path(__file__).resolve().parents[2]
18
+ SKILL_REL_PATH = "skills/script_generation_teacher.md"
19
+
20
+ _ANALYSIS_RE = re.compile(r"<analysis>(.*?)</analysis>", re.DOTALL)
21
+ _SCRIPT_RE = re.compile(r"<script>(.*?)</script>", re.DOTALL)
22
+ _FENCE_RE = re.compile(r"^```(?:python|py)?\s*\n(.*?)\n```$", re.DOTALL)
23
+
24
+
25
@dataclass
class ScriptResult:
    """Outcome of one script-teacher call.

    Fields, in positional order: the parsed analysis text, the parsed script
    text, the raw model output, the process return code, the captured stderr
    (or an error message), and the decoded JSON payload.
    """

    analysis: str
    script: str
    raw_output: str
    returncode: int
    stderr: str
    raw_payload: dict

    @property
    def ok(self) -> bool:
        """True only when the return code is 0 and the script text is non-empty."""
        if self.returncode != 0:
            return False
        return bool(self.script)
37
+
38
+
39
def _strip_fence(text: str) -> str:
    """Return ``text`` trimmed, with one wrapping ``` fence removed when present.

    If the trimmed text is not entirely a single fenced block (per
    ``_FENCE_RE``), the trimmed text is returned unchanged.
    """
    candidate = text.strip()
    fenced = _FENCE_RE.match(candidate)
    if fenced is None:
        return candidate
    return fenced.group(1).strip()
43
+
44
+
45
def parse_teacher_output(text: str) -> tuple[str, str]:
    """Extract ``(analysis, script)`` from the model's reply.

    The first ``<analysis>`` and ``<script>`` tag pairs are used. The analysis
    is whitespace-trimmed; the script is passed through ``_strip_fence`` so a
    wrapping code fence is dropped. A missing section yields ``""``.
    """
    analysis_match = _ANALYSIS_RE.search(text)
    script_match = _SCRIPT_RE.search(text)
    analysis = "" if analysis_match is None else analysis_match.group(1).strip()
    script = "" if script_match is None else _strip_fence(script_match.group(1))
    return analysis, script
52
+
53
+
54
def build_script_prompt(
    envelope: dict[str, Any],
    instruction: str,
    output_format: str,
    feedback: str | None = None,
) -> str:
    """Assemble the prompt sent to the script-writing model.

    Args:
        envelope: Metadata envelope; embedded as indented JSON (values that
            are not JSON-serializable are stringified via ``default=str``).
        instruction: Task description inserted after ``Task:``.
        output_format: Target format name; upper-cased in the prompt.
        feedback: Optional description of a previous failure. When truthy, a
            retry paragraph is appended to the prompt.

    Returns:
        The complete prompt text.
    """
    envelope_json = json.dumps(envelope, indent=2, default=str)
    sections = [
        f"Read the instructions in {SKILL_REL_PATH}, then write a Python conversion script.\n\n",
        "You are given the METADATA ENVELOPE of a source file (not the file itself):\n",
        f"```json\n{envelope_json}\n```\n\n",
        f"Task: {instruction}\n",
        f"Target output format: {output_format.upper()}.\n\n",
        "The script must read the input file path from sys.argv[1] and write the converted "
        "output to sys.argv[2], using only the Python standard library and pandas. Respond "
        "with exactly an <analysis>...</analysis> block followed by a <script>...</script> "
        "block. No prose, no code fences outside the script tags.",
    ]
    if feedback:
        sections.append(
            f"\n\nYour previous attempt failed: {feedback}\n"
            "Write a corrected <analysis> + <script>.\n"
        )
    return "".join(sections)
79
+
80
+
81
def call_script_teacher(
    envelope: dict[str, Any],
    instruction: str,
    output_format: str,
    *,
    timeout: int = 240,
    feedback: str | None = None,
) -> ScriptResult:
    """Run `claude -p --model opus` and parse <analysis> + <script> from the result.

    Args:
        envelope: Metadata envelope forwarded to ``build_script_prompt``.
        instruction: Conversion task forwarded to ``build_script_prompt``.
        output_format: Target format forwarded to ``build_script_prompt``.
        timeout: Seconds to wait for the CLI process.
        feedback: Optional failure description from a previous attempt.

    Returns:
        A ``ScriptResult``. On a non-zero exit code the CLI's return code and
        stderr are reported with empty analysis/script. When stdout is not
        the expected JSON object (undecodable, not an object, or a
        non-string ``result``), ``returncode`` is ``-1`` and ``stderr``
        carries a ``decode error: ...`` message.

    Raises:
        subprocess.TimeoutExpired: If the CLI does not finish within ``timeout``.
        OSError: If the ``claude`` executable cannot be launched.
    """
    prompt = build_script_prompt(envelope, instruction, output_format, feedback)
    cmd = [
        "claude", "-p", prompt,
        "--model", "opus",
        "--output-format", "json",
        "--allowedTools", "Read",
    ]
    proc = subprocess.run(
        cmd,
        capture_output=True,
        text=True,
        cwd=str(PROJECT_ROOT),
        timeout=timeout,
        encoding="utf-8",
        errors="replace",
    )
    if proc.returncode != 0:
        return ScriptResult("", "", "", proc.returncode, proc.stderr or "", {})
    try:
        payload = json.loads(proc.stdout)
    except json.JSONDecodeError as e:
        return ScriptResult("", "", "", -1, f"decode error: {e}", {"stdout_head": proc.stdout[:500]})
    # Valid JSON is not necessarily an object; `.get` below would raise
    # AttributeError on a list/scalar, so report it like any other decode failure.
    if not isinstance(payload, dict):
        return ScriptResult(
            "",
            "",
            "",
            -1,
            f"decode error: expected a JSON object, got {type(payload).__name__}",
            {"stdout_head": proc.stdout[:500]},
        )
    raw = payload.get("result", "") or ""
    # The tag regexes only accept text; a structured `result` cannot contain a script.
    if not isinstance(raw, str):
        return ScriptResult(
            "",
            "",
            "",
            -1,
            f"decode error: 'result' is {type(raw).__name__}, expected a string",
            payload,
        )
    analysis, script = parse_teacher_output(raw)
    return ScriptResult(analysis, script, raw, 0, proc.stderr or "", payload)
File without changes
@@ -0,0 +1,264 @@
1
+ from __future__ import annotations
2
+
3
+ import csv
4
+ import io
5
+ import json
6
+ from typing import Any, Iterable
7
+
8
+ # 1. Format validity
9
+
10
+
11
def format_validity(output: str, output_format: str) -> float:
    """Score whether ``output`` parses as ``output_format`` (case-insensitive).

    Returns 1.0 on success and 0.0 otherwise:
      * json - ``json.loads`` accepts the text.
      * csv  - at least one row, a non-empty first row, and every row has the
        same number of cells as the first.
      * txt  - the text is not blank.

    Raises:
        ValueError: For any other format name.
    """
    kind = output_format.lower()

    if kind == "txt":
        return float(bool(output.strip()))

    if kind == "json":
        try:
            json.loads(output)
        except (json.JSONDecodeError, ValueError):
            return 0.0
        return 1.0

    if kind == "csv":
        try:
            rows = list(csv.reader(io.StringIO(output)))
        except csv.Error:
            return 0.0
        if not rows or not rows[0]:
            return 0.0
        expected_width = len(rows[0])
        # A single ragged row makes the whole table invalid.
        ragged = any(len(row) != expected_width for row in rows)
        return 0.0 if ragged else 1.0

    raise ValueError(f"Unknown output_format: {output_format!r}")
38
+
39
+
40
+ # 2. Schema compliance
41
+
42
+
43
+ def _json_key_skeleton(obj: Any) -> Any:
44
+ """Recursively reduce a JSON value to its structural skeleton.
45
+
46
+ Dicts -> sorted tuple of (key, child_skeleton).
47
+ Lists -> ('list', child_skeleton_of_first) so we check the per-element shape
48
+ rather than length (caller decides whether length matters).
49
+ Scalars -> the type name.
50
+ """
51
+ if isinstance(obj, dict):
52
+ return tuple(sorted((k, _json_key_skeleton(v)) for k, v in obj.items()))
53
+ if isinstance(obj, list):
54
+ if not obj:
55
+ return ("list", "empty")
56
+ # Use first element's skeleton as representative; a well-formed output
57
+ # should have homogeneous elements in each array position.
58
+ return ("list", _json_key_skeleton(obj[0]))
59
+ return type(obj).__name__
60
+
61
+
62
def schema_compliance(actual: str, expected: str, output_format: str) -> float:
    """Score structural agreement between ``actual`` and ``expected`` (1.0 or 0.0).

    * json - both texts must parse and their ``_json_key_skeleton`` values
      must be equal.
    * csv  - both texts must have at least one row and their header rows must
      match after trimming and lower-casing each cell.
    * txt  - always 1.0; free-form text has no structure to compare.

    Raises:
        ValueError: For any other format name.
    """
    kind = output_format.lower()

    if kind == "txt":
        return 1.0

    if kind == "json":
        try:
            actual_obj = json.loads(actual)
            expected_obj = json.loads(expected)
        except (json.JSONDecodeError, ValueError):
            return 0.0
        same_shape = _json_key_skeleton(actual_obj) == _json_key_skeleton(expected_obj)
        return 1.0 if same_shape else 0.0

    if kind == "csv":
        try:
            actual_rows = list(csv.reader(io.StringIO(actual)))
            expected_rows = list(csv.reader(io.StringIO(expected)))
        except csv.Error:
            return 0.0
        if not (actual_rows and expected_rows):
            return 0.0
        # Only the header row is compared: case-insensitive, whitespace-trimmed.
        actual_header = [cell.strip().lower() for cell in actual_rows[0]]
        expected_header = [cell.strip().lower() for cell in expected_rows[0]]
        if actual_header == expected_header:
            return 1.0
        return 0.0

    raise ValueError(f"Unknown output_format: {output_format!r}")
88
+
89
+
90
+ # 3. Loadability
91
+
92
+
93
def loadability(output: str, output_format: str) -> float:
    """Score whether pandas can ingest ``output`` (1.0) or not (0.0).

    * json - the text is parsed, then ``pd.json_normalize`` is applied to the
      top-level list, or to the first list-valued field of a top-level dict,
      or to the dict wrapped in a list. Any other top-level value scores 0.0.
    * csv  - ``pd.read_csv`` must succeed and yield at least one column.
    * txt  - the text must not be blank.

    Any exception raised while loading (including a failed pandas import) is
    treated as "not loadable".

    Raises:
        ValueError: For any other format name.
    """
    kind = output_format.lower()

    if kind == "txt":
        return 1.0 if output.strip() else 0.0

    if kind == "csv":
        try:
            import pandas as pd

            frame = pd.read_csv(io.StringIO(output))
            has_columns = len(frame.columns) > 0
        except Exception:
            return 0.0
        return 1.0 if has_columns else 0.0

    if kind == "json":
        try:
            import pandas as pd

            parsed = json.loads(output)
            if isinstance(parsed, list):
                records = parsed
            elif isinstance(parsed, dict):
                # Prefer the first list-valued field; otherwise treat the dict as one record.
                nested_lists = [value for value in parsed.values() if isinstance(value, list)]
                records = nested_lists[0] if nested_lists else [parsed]
            else:
                return 0.0
            pd.json_normalize(records)
        except Exception:
            return 0.0
        return 1.0

    raise ValueError(f"Unknown output_format: {output_format!r}")
127
+
128
+
129
+ # 4. Content accuracy
130
+
131
+
132
+ def _values_equal(a: Any, b: Any) -> bool:
133
+ """Compare two scalar values with light coercion.
134
+
135
+ - Numeric strings compare equal to numbers: "9.99" == 9.99.
136
+ - None == "". ("null" is handled by JSON already being None.)
137
+ - Strings compare case-sensitive after .strip().
138
+ """
139
+ if a == b:
140
+ return True
141
+ # Both numeric (possibly as strings)?
142
+ try:
143
+ fa, fb = float(a), float(b)
144
+ if fa == fb:
145
+ return True
146
+ except (TypeError, ValueError):
147
+ pass
148
+ # Both empty-ish?
149
+ if (a is None or a == "") and (b is None or b == ""):
150
+ return True
151
+ # String comparison with whitespace strip
152
+ if isinstance(a, str) and isinstance(b, str):
153
+ return a.strip() == b.strip()
154
+ return False
155
+
156
+
157
+ def _walk_json_leaves(obj: Any, path: str = "") -> Iterable[tuple[str, Any]]:
158
+ """Yield (key_path, leaf_value) pairs from a JSON-decoded object."""
159
+ if isinstance(obj, dict):
160
+ for k, v in obj.items():
161
+ new_path = f"{path}.{k}" if path else k
162
+ yield from _walk_json_leaves(v, new_path)
163
+ elif isinstance(obj, list):
164
+ for i, v in enumerate(obj):
165
+ new_path = f"{path}[{i}]"
166
+ yield from _walk_json_leaves(v, new_path)
167
+ else:
168
+ yield path, obj
169
+
170
+
171
+ def _json_content_accuracy(actual_text: str, expected_text: str) -> float:
172
+ """Fraction of expected leaf paths that match actual."""
173
+ try:
174
+ actual = json.loads(actual_text)
175
+ expected = json.loads(expected_text)
176
+ except (json.JSONDecodeError, ValueError):
177
+ return 0.0
178
+ actual_map = dict(_walk_json_leaves(actual))
179
+ expected_map = dict(_walk_json_leaves(expected))
180
+ if not expected_map:
181
+ return 0.0
182
+ matches = sum(
183
+ 1
184
+ for path, ev in expected_map.items()
185
+ if path in actual_map and _values_equal(actual_map[path], ev)
186
+ )
187
+ return matches / len(expected_map)
188
+
189
+
190
+ def _csv_content_accuracy(actual_text: str, expected_text: str) -> float:
191
+ """Fraction of expected cells that match actual (by header-aware row alignment).
192
+
193
+ Rows are aligned positionally; cells are compared by shared column name.
194
+ If the header differs, score is 0.0 (that's a schema-compliance issue).
195
+ """
196
+ try:
197
+ a_rows = list(csv.reader(io.StringIO(actual_text)))
198
+ e_rows = list(csv.reader(io.StringIO(expected_text)))
199
+ except csv.Error:
200
+ return 0.0
201
+ if len(a_rows) < 1 or len(e_rows) < 1:
202
+ return 0.0
203
+ a_hdr = [c.strip() for c in a_rows[0]]
204
+ e_hdr = [c.strip() for c in e_rows[0]]
205
+ if [h.lower() for h in a_hdr] != [h.lower() for h in e_hdr]:
206
+ return 0.0
207
+ a_data, e_data = a_rows[1:], e_rows[1:]
208
+ total = len(e_data) * len(e_hdr)
209
+ if total == 0:
210
+ return 0.0
211
+ matches = 0
212
+ for i, e_row in enumerate(e_data):
213
+ a_row = a_data[i] if i < len(a_data) else [""] * len(e_hdr)
214
+ for j, e_cell in enumerate(e_row):
215
+ a_cell = a_row[j] if j < len(a_row) else ""
216
+ if _values_equal(a_cell.strip(), e_cell.strip()):
217
+ matches += 1
218
+ return matches / total
219
+
220
+
221
+ def _txt_content_accuracy(actual_text: str, required_substrings: list[str]) -> float:
222
+ """Fraction of required substrings present in actual (case-insensitive)."""
223
+ if not required_substrings:
224
+ return 0.0
225
+ hay = actual_text.lower()
226
+ hits = sum(1 for s in required_substrings if s.lower() in hay)
227
+ return hits / len(required_substrings)
228
+
229
+
230
def content_accuracy(
    actual: str,
    expected: str,
    output_format: str,
    required_substrings: list[str] | None = None,
) -> float:
    """Route to the content-accuracy scorer for ``output_format`` (case-insensitive).

    json and csv compare ``actual`` against ``expected``; txt ignores
    ``expected`` and checks ``required_substrings`` (``None`` is treated as
    an empty list).

    Raises:
        ValueError: For any other format name.
    """
    kind = output_format.lower()
    if kind == "txt":
        return _txt_content_accuracy(actual, required_substrings or [])
    if kind in ("json", "csv"):
        scorer = _json_content_accuracy if kind == "json" else _csv_content_accuracy
        return scorer(actual, expected)
    raise ValueError(f"Unknown output_format: {output_format!r}")
245
+
246
+
247
+ # Aggregate helper
248
+
249
+
250
def score_all(
    actual: str,
    expected: str,
    output_format: str,
    required_substrings: list[str] | None = None,
) -> dict[str, float]:
    """Compute all four metrics for one output and return them keyed by metric name.

    Keys, in order: ``format_validity``, ``schema_compliance``,
    ``loadability``, ``content_accuracy``.
    """
    scores: dict[str, float] = {}
    scores["format_validity"] = format_validity(actual, output_format)
    scores["schema_compliance"] = schema_compliance(actual, expected, output_format)
    scores["loadability"] = loadability(actual, output_format)
    scores["content_accuracy"] = content_accuracy(
        actual, expected, output_format, required_substrings
    )
    return scores
@@ -0,0 +1,116 @@
1
+ from __future__ import annotations
2
+
3
+ import re
4
+
5
+
6
def clean_model_output(raw: str, output_format: str) -> tuple[str, list[str]]:
    """Strip common wrapper noise from a model response.

    Steps, applied in order; each one that changes the text is recorded:
      1. ``strip_whitespace``     - trim surrounding whitespace.
      2. ``strip_code_fence``     - unwrap a leading ``` fenced block.
      3. ``strip_preamble``       - drop lines before the payload (json/csv).
      4. ``strip_trailing_prose`` - drop text after the JSON value (json only).
    A final trim is applied and recorded as ``strip_whitespace`` if it
    changes the text and that name is not already in the list.

    Args:
        raw: The unprocessed model output.
        output_format: Target format name. Matched case-insensitively, in
            line with the metric functions, so "JSON" is cleaned like "json".

    Returns:
        ``(cleaned_text, applied_step_names)``.
    """
    applied: list[str] = []
    # Normalize once so "JSON"/"CSV" do not silently skip format-specific steps.
    fmt = output_format.lower()
    text = raw

    stripped = text.strip()
    if stripped != text:
        applied.append("strip_whitespace")
        text = stripped

    fenced = _try_strip_code_fence(text)
    if fenced is not None:
        text = fenced
        applied.append("strip_code_fence")

    pre_stripped = _try_strip_preamble(text, fmt)
    if pre_stripped is not None:
        text = pre_stripped
        applied.append("strip_preamble")

    if fmt == "json":
        trailing_stripped = _try_strip_trailing_prose_json(text)
        if trailing_stripped is not None:
            text = trailing_stripped
            applied.append("strip_trailing_prose")

    final = text.strip()
    if final != text and "strip_whitespace" not in applied:
        applied.append("strip_whitespace")
    return final, applied
35
+
36
+
37
+ _FENCE_OPEN = re.compile(r"^```([A-Za-z0-9_+\-]*)\s*\n", re.MULTILINE)
38
+ _FENCE_CLOSE = "\n```"
39
+
40
+
41
+ def _try_strip_code_fence(text: str) -> str | None:
42
+ m = _FENCE_OPEN.match(text)
43
+ if not m:
44
+ return None
45
+ body_start = m.end()
46
+ close_idx = text.find(_FENCE_CLOSE, body_start)
47
+ if close_idx == -1:
48
+ return None # unclosed — skip
49
+ return text[body_start:close_idx]
50
+
51
+
52
+ def _try_strip_preamble(text: str, output_format: str) -> str | None:
53
+ if output_format == "txt":
54
+ return None
55
+ lines = text.split("\n")
56
+ if output_format == "json":
57
+ for i, line in enumerate(lines):
58
+ s = line.lstrip()
59
+ if s.startswith("{") or s.startswith("["):
60
+ if i == 0:
61
+ return None
62
+ return "\n".join(lines[i:])
63
+ return None
64
+ if output_format == "csv":
65
+ for i, line in enumerate(lines):
66
+ if "," in line:
67
+ if i == 0:
68
+ return None
69
+ return "\n".join(lines[i:])
70
+ return None
71
+ return None
72
+
73
+
74
+ def _try_strip_trailing_prose_json(text: str) -> str | None:
75
+ # Find first opening bracket
76
+ start = -1
77
+ open_ch = ""
78
+ for i, c in enumerate(text):
79
+ if c == "{" or c == "[":
80
+ start = i
81
+ open_ch = c
82
+ break
83
+ if start == -1:
84
+ return None
85
+ close_ch = "}" if open_ch == "{" else "]"
86
+ depth = 0
87
+ in_string = False
88
+ escape = False
89
+ end = -1
90
+ for i in range(start, len(text)):
91
+ c = text[i]
92
+ if escape:
93
+ escape = False
94
+ continue
95
+ if c == "\\":
96
+ escape = True
97
+ continue
98
+ if c == '"':
99
+ in_string = not in_string
100
+ continue
101
+ if in_string:
102
+ continue
103
+ if c == open_ch:
104
+ depth += 1
105
+ elif c == close_ch:
106
+ depth -= 1
107
+ if depth == 0:
108
+ end = i + 1
109
+ break
110
+ if end == -1:
111
+ return None
112
+ if end == len(text):
113
+ return None # nothing to strip
114
+ if text[end:].strip() == "":
115
+ return None
116
+ return text[:end]