data-morph-gemma 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39) hide show
  1. data_morph_gemma-0.1.0.dist-info/METADATA +177 -0
  2. data_morph_gemma-0.1.0.dist-info/RECORD +39 -0
  3. data_morph_gemma-0.1.0.dist-info/WHEEL +4 -0
  4. data_morph_gemma-0.1.0.dist-info/entry_points.txt +2 -0
  5. data_morph_gemma-0.1.0.dist-info/licenses/LICENSE +25 -0
  6. datamorph/__init__.py +19 -0
  7. datamorph/cli.py +84 -0
  8. datamorph/convert.py +146 -0
  9. datamorph/data/__init__.py +1 -0
  10. datamorph/data/collect.py +221 -0
  11. datamorph/data/envelope.py +20 -0
  12. datamorph/data/generators/__init__.py +1 -0
  13. datamorph/data/generators/base.py +48 -0
  14. datamorph/data/generators/uc1_csv_to_json.py +64 -0
  15. datamorph/data/generators/uc2_json_to_csv.py +59 -0
  16. datamorph/data/generators/uc3_txt_log_to_csv.py +64 -0
  17. datamorph/data/generators/uc4_csv_to_txt_report.py +62 -0
  18. datamorph/data/generators/uc5_schema_migration.py +49 -0
  19. datamorph/data/sandbox.py +95 -0
  20. datamorph/data/teacher_script.py +114 -0
  21. datamorph/evaluation/__init__.py +0 -0
  22. datamorph/evaluation/metrics.py +264 -0
  23. datamorph/evaluation/output_cleanup.py +116 -0
  24. datamorph/evaluation/runner.py +218 -0
  25. datamorph/evaluation/teacher.py +193 -0
  26. datamorph/extractor/__init__.py +15 -0
  27. datamorph/extractor/base.py +26 -0
  28. datamorph/extractor/csv_extractor.py +515 -0
  29. datamorph/extractor/json_extractor.py +447 -0
  30. datamorph/extractor/json_walker.py +217 -0
  31. datamorph/extractor/sampler.py +68 -0
  32. datamorph/extractor/txt_extractor.py +199 -0
  33. datamorph/extractor/warning_rules.py +473 -0
  34. datamorph/features/__init__.py +1 -0
  35. datamorph/features/format_pairs.py +57 -0
  36. datamorph/model.py +63 -0
  37. datamorph/models/__init__.py +0 -0
  38. datamorph/models/gemma_mlx.py +163 -0
  39. datamorph/models/gemma_script_teacher.py +100 -0
@@ -0,0 +1,221 @@
1
+ """Stage 3+4+5 orchestrator: envelope -> teacher script -> sandbox -> verify.
2
+
3
+ The teacher is injected as `teacher_fn` so the loop is fully testable without
4
+ any API calls. `collect_corpus` is the batch driver used by the CLI.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import json
10
+ from dataclasses import dataclass, field
11
+ from pathlib import Path
12
+ from typing import Any, Callable
13
+
14
+ from datamorph.data.envelope import extract_envelope
15
+ from datamorph.data.generators.base import EXT_BY_FORMAT
16
+ from datamorph.data.sandbox import run_script
17
+ from datamorph.data.teacher_script import ScriptResult, call_script_teacher
18
+ from datamorph.evaluation.metrics import score_all
19
+ from datamorph.evaluation.runner import CaseSpec, discover_cases
20
+
21
# Lowest content_accuracy score that still counts as a pass.
CA_MIN = 0.95

# Shape of the injectable teacher: any callable that returns a ScriptResult.
TeacherFn = Callable[..., ScriptResult]
23
+
24
+
25
@dataclass
class PairResult:
    """Outcome of pushing one corpus case through the teach -> run -> verify loop.

    Only the five identifying fields are required; every other field starts at
    a neutral default and is filled in as the loop progresses.
    """

    # --- which case this is ---
    case_id: str
    use_case: str
    complexity: str
    input_format: str
    output_format: str

    # --- verdict ---
    accepted: bool = False
    scores: dict[str, float] = field(default_factory=dict)  # metric name -> score
    retries: int = 0  # index of the last attempt that ran
    error_kind: str = ""
    reason: str = ""  # human-readable failure description, also used as feedback

    # --- material for the training record ---
    envelope: dict[str, Any] = field(default_factory=dict)
    instruction: str = ""
    analysis: str = ""
    script: str = ""
    teacher_usage: dict[str, Any] | None = None  # Opus token usage (cost + W6 analysis)
42
+
43
+
44
+ def _passes(scores: dict[str, float]) -> bool:
45
+ return (
46
+ scores.get("format_validity", 0.0) == 1.0
47
+ and scores.get("loadability", 0.0) == 1.0
48
+ and scores.get("schema_compliance", 0.0) == 1.0
49
+ and scores.get("content_accuracy", 0.0) >= CA_MIN
50
+ )
51
+
52
+
53
def _failing_metrics(scores: dict[str, float]) -> list[str]:
    """List the metrics that keep `scores` from being accepted.

    This is the exact complement of the acceptance rule in `_passes`: a
    structural metric fails unless it equals 1.0, and content_accuracy fails
    unless it is at least CA_MIN. Metrics missing from `scores` count as 0.0.

    Args:
        scores: Mapping of metric name to score.

    Returns:
        Failing metric names in a fixed order (format_validity, loadability,
        schema_compliance, content_accuracy). Empty when every gate is cleared.
    """
    # `!= 1.0` rather than `< 1.0`: a NaN or out-of-range score is rejected by
    # the acceptance check, so it must be named here too. Otherwise the retry
    # feedback would report a low score on an empty metric list.
    failing = [
        name
        for name in ("format_validity", "loadability", "schema_compliance")
        if scores.get(name, 0.0) != 1.0
    ]
    # `not (x >= CA_MIN)` instead of `x < CA_MIN` so that NaN counts as failing.
    if not scores.get("content_accuracy", 0.0) >= CA_MIN:
        failing.append("content_accuracy")
    return failing
61
+
62
+
63
def collect_case(
    case: CaseSpec,
    *,
    teacher_fn: TeacherFn = call_script_teacher,
    max_retries: int = 3,
) -> PairResult:
    """Run the full teach->run->verify loop for one case, retrying with feedback.

    Each attempt asks the teacher for a script, runs it in the sandbox and
    scores the output against the case's expected text. The failure reason of
    an attempt is passed to the teacher as `feedback` on the next one.

    Args:
        case: The case to process; its `meta`, `case_dir`, `case_id` and
            `expected_text` are read.
        teacher_fn: Callable invoked as
            `teacher_fn(envelope, instruction, output_format, feedback=...)`.
        max_retries: Extra attempts allowed after the first. Values below zero
            are treated as zero, so the teacher is always asked at least once.

    Returns:
        A PairResult describing the last attempt. `accepted` is True only when
        an attempt cleared every metric; otherwise `error_kind` and `reason`
        describe why the final attempt failed.

    Raises:
        KeyError: If the case metadata lacks a required key or names a format
            that has no known extension.
    """
    meta = case.meta
    in_ext = EXT_BY_FORMAT[meta["input_format"]]
    out_ext = EXT_BY_FORMAT[meta["output_format"]]
    input_path = case.case_dir / f"input{in_ext}"

    envelope = extract_envelope(input_path, meta["input_format"])
    envelope.pop("file_path", None)  # don't leak local paths into training data
    instruction = meta.get("prompt_hint") or (
        f"Convert this {meta['input_format'].upper()} to {meta['output_format'].upper()}."
    )

    result = PairResult(
        case_id=case.case_id,
        use_case=meta["use_case"],
        complexity=meta["complexity"],
        input_format=meta["input_format"],
        output_format=meta["output_format"],
        envelope=envelope,
        instruction=instruction,
    )

    feedback: str | None = None
    # Clamp so a negative max_retries cannot skip the teacher altogether and
    # hand back a result with no verdict at all.
    for attempt in range(max(max_retries, 0) + 1):
        result.retries = attempt
        # Start every attempt from a clean slate. Without this, scores or a
        # script from an earlier attempt would sit next to the error_kind and
        # reason of a later attempt that failed in a different way.
        result.scores = {}
        result.analysis, result.script = "", ""

        tr = teacher_fn(envelope, instruction, meta["output_format"], feedback=feedback)
        # Capture token usage from the latest teacher response (claude -p JSON
        # payload carries it). One-shot data — not recoverable after the run.
        usage = tr.raw_payload.get("usage") if tr.raw_payload else None
        if usage:
            result.teacher_usage = usage
        if not tr.ok:
            result.error_kind = "no_script"
            # `or ""` keeps the slice safe should the teacher report no stderr.
            result.reason = f"teacher produced no <script> (stderr: {(tr.stderr or '')[:200]})"
            feedback = result.reason
            continue

        result.analysis, result.script = tr.analysis, tr.script
        sr = run_script(tr.script, input_path, output_suffix=out_ext)
        if not sr.ok:
            result.error_kind = sr.error_kind
            result.reason = f"script {sr.error_kind}: {sr.stderr[:300]}"
            feedback = result.reason
            continue

        scores = score_all(
            actual=sr.output_text,
            expected=case.expected_text,
            output_format=meta["output_format"],
            required_substrings=meta.get("required_substrings"),
        )
        result.scores = scores
        if _passes(scores):
            result.accepted = True
            result.error_kind = "ok"
            result.reason = ""
            return result
        failing = _failing_metrics(scores)
        result.error_kind = "low_score"
        result.reason = f"output scored low on {failing}: {scores}"
        feedback = result.reason

    return result
133
+
134
+
135
def _write_json_atomic(dest: Path, payload: dict[str, Any]) -> None:
    """Serialize `payload` to `dest` through a sibling temp file and a rename."""
    tmp_path = dest.with_name(dest.name + ".tmp")
    tmp_path.write_text(json.dumps(payload, indent=2, default=str), encoding="utf-8")
    # The rename is the last step, so `dest` never holds a half-written file.
    tmp_path.replace(dest)


def collect_corpus(
    raw_root: Path,
    interim_root: Path,
    *,
    teacher_fn: TeacherFn = call_script_teacher,
    max_retries: int = 3,
    limit: int | None = None,
    resume: bool = False,
) -> dict[str, Any]:
    """Run collect_case over every corpus case; write accepted records + a manifest.

    When ``resume`` is True, any case that already has an accepted record file in
    ``interim_root`` is skipped without calling the teacher, so an interrupted run
    (e.g. one stopped by a teacher usage limit) can be continued without
    re-spending Opus calls on pairs already collected.

    Record files and the manifest are written to a temp file and renamed into
    place. An interruption therefore cannot leave behind a truncated record
    that a later ``resume`` run would mistake for an accepted pair.

    Args:
        raw_root: Directory handed to ``discover_cases``.
        interim_root: Output directory; created if missing. Receives one
            ``<use_case>__<case dir name>.json`` per accepted case plus
            ``collect_manifest.json``.
        teacher_fn: Teacher callable forwarded to ``collect_case``.
        max_retries: Retry budget forwarded to ``collect_case``.
        limit: If given, only the first ``limit`` discovered cases are used.
        resume: Skip cases whose record file already exists.

    Returns:
        The summary dict that is also written to ``collect_manifest.json``.
    """
    cases = discover_cases(raw_root)
    if limit is not None:
        cases = cases[:limit]
    interim_root.mkdir(parents=True, exist_ok=True)

    manifest: list[dict[str, Any]] = []
    n_accepted = 0
    n_skipped = 0
    for case in cases:
        out_path = interim_root / f"{case.meta['use_case']}__{case.case_dir.name}.json"
        if resume and out_path.exists():
            n_skipped += 1
            manifest.append(
                {
                    "case_id": case.case_id,
                    "use_case": case.meta["use_case"],
                    "accepted": True,
                    "error_kind": "skipped_existing",
                    "retries": 0,
                    "scores": {},
                    "reason": "",
                }
            )
            continue
        res = collect_case(case, teacher_fn=teacher_fn, max_retries=max_retries)
        if res.accepted:
            n_accepted += 1
            record = {
                "case_id": res.case_id,
                "use_case": res.use_case,
                "complexity": res.complexity,
                "input_format": res.input_format,
                "output_format": res.output_format,
                "envelope": res.envelope,
                "instruction": res.instruction,
                "analysis": res.analysis,
                "script": res.script,
                "scores": res.scores,
                "retries": res.retries,
                "teacher_usage": res.teacher_usage,
            }
            # Same path the resume check looks at (res.use_case is copied from
            # case.meta["use_case"]), so there is no need to rebuild it.
            _write_json_atomic(out_path, record)
        manifest.append(
            {
                "case_id": res.case_id,
                "use_case": res.use_case,
                "accepted": res.accepted,
                "error_kind": res.error_kind,
                "retries": res.retries,
                "scores": res.scores,
                "reason": res.reason if not res.accepted else "",
            }
        )

    n_attempted = len(cases) - n_skipped
    summary = {
        "n_cases": len(cases),
        "n_skipped": n_skipped,
        "n_attempted": n_attempted,
        "n_accepted": n_accepted,
        "n_records_total": n_accepted + n_skipped,
        # accept_rate is over cases actually attempted this run (skipped ones
        # were already accepted), so a resumed run's rate stays meaningful.
        "accept_rate": round(n_accepted / n_attempted, 3) if n_attempted else 0.0,
        "results": manifest,
    }
    _write_json_atomic(interim_root / "collect_manifest.json", summary)
    return summary
@@ -0,0 +1,20 @@
1
+ """Dispatch a source file to the right Stage-1 extractor and return its envelope."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pathlib import Path
6
+ from typing import Any
7
+
8
+ from datamorph.extractor import CSVExtractor, JSONExtractor, MetadataExtractor, TXTExtractor
9
+
10
# Format name -> extractor class used for files of that format.
_EXTRACTORS: dict[str, type[MetadataExtractor]] = dict(
    csv=CSVExtractor,
    json=JSONExtractor,
    txt=TXTExtractor,
)


def extract_envelope(input_path: Path, input_format: str) -> dict[str, Any]:
    """Build the metadata envelope for `input_path` with the extractor for `input_format`.

    A fresh extractor instance is created per call.

    Raises:
        KeyError: If `input_format` has no registered extractor.
    """
    # Plain indexing on purpose: an unknown format should surface as KeyError.
    extractor = _EXTRACTORS[input_format]()
    return extractor.extract(input_path)
@@ -0,0 +1 @@
1
+ """Synthetic source-file generators — the ground-truth oracle."""
@@ -0,0 +1,48 @@
1
+ """Shared primitives for the synthetic source-file generators (the oracle)."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ from dataclasses import dataclass
7
+ from pathlib import Path
8
+ from typing import Any
9
+
10
+ from faker import Faker
11
+
12
# File extension (leading dot included) for each supported format name.
EXT_BY_FORMAT = {
    "csv": ".csv",
    "json": ".json",
    "txt": ".txt",
}
13
+
14
+
15
@dataclass(frozen=True)
class GeneratedCase:
    """One synthetic conversion case; input and expected output share a single seed.

    Instances are frozen, so fields cannot be reassigned after construction.
    """

    # Use-case identifier.
    use_case: str
    # Complexity tier the case was generated for.
    complexity: str
    # Format names of the source text and of the expected conversion result.
    input_format: str
    output_format: str
    # The source file content and the ground-truth converted content.
    input_text: str
    expected_text: str
    # Free-form metadata dict that accompanies the case.
    meta: dict[str, Any]
26
+
27
+
28
def make_faker(seed: int) -> Faker:
    """Build a Faker instance seeded with `seed` so its output is reproducible."""
    seeded = Faker()
    # Seed this instance only, via Faker's per-instance seeding call.
    seeded.seed_instance(seed)
    return seeded
33
+
34
+
35
def write_case(case: GeneratedCase, dest_root: Path, case_name: str) -> Path:
    """Persist `case` as three files under dest_root/<use_case>/<case_name>/ (test_set layout).

    Writes `input<ext>`, `expected<ext>` and `meta.json` (all UTF-8), creating
    the directory if needed, and returns the case directory.
    """
    target_dir = dest_root / case.use_case / case_name
    target_dir.mkdir(parents=True, exist_ok=True)

    def _emit(filename: str, text: str) -> None:
        (target_dir / filename).write_text(text, encoding="utf-8")

    # Written one after another in this order: input, expected, meta.
    _emit(f"input{EXT_BY_FORMAT[case.input_format]}", case.input_text)
    _emit(f"expected{EXT_BY_FORMAT[case.output_format]}", case.expected_text)
    _emit("meta.json", json.dumps(case.meta, indent=2) + "\n")
    return target_dir
@@ -0,0 +1,64 @@
1
+ """Generator: flat CSV (user_*/order_*) -> nested JSON (user object + orders array)."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import csv
6
+ import io
7
+ import json
8
+ import random
9
+
10
+ from .base import GeneratedCase, make_faker
11
+
12
# Number of users generated per complexity tier.
_N_USERS = {"simple": 3, "medium": 8, "complex": 20}
# Upper bound on orders per user for each tier.
_MAX_ORDERS = {"simple": 1, "medium": 3, "complex": 5}
# Item names that orders are drawn from.
_ITEMS = ["Widget", "Gadget", "Thingamajig", "Doohickey", "Sprocket", "Cog"]


def generate(seed: int, complexity: str) -> GeneratedCase:
    """Build one flat-CSV -> nested-JSON case, deterministic for `seed`.

    The CSV has one row per order; the expected JSON has one record per user
    holding a `user` object and an `orders` array.

    Raises:
        KeyError: If `complexity` is not one of the known tiers.
    """
    rng = random.Random(seed)
    fake = make_faker(seed)
    user_count = _N_USERS[complexity]
    order_cap = _MAX_ORDERS[complexity]

    csv_rows: list[tuple[str, str, int, str, float]] = []
    users: list[dict] = []
    order_id = 1000
    for idx in range(user_count):
        name = f"{fake.first_name()}{idx}"  # index suffix guarantees uniqueness
        email = f"{name.lower()}@example.com"
        # The simple tier never consults the RNG for the order count.
        if complexity == "simple":
            order_count = 1
        else:
            order_count = rng.randint(1, order_cap)
        user_orders = []
        for _ in range(order_count):
            order_id += 1
            item = rng.choice(_ITEMS)
            price = round(rng.uniform(1, 100), 2)
            csv_rows.append((name, email, order_id, item, price))
            user_orders.append({"id": order_id, "item": item, "price": price})
        users.append({"user": {"name": name, "email": email}, "orders": user_orders})

    sink = io.StringIO()
    out = csv.writer(sink, lineterminator="\n")
    out.writerow(["user_name", "user_email", "order_id", "order_item", "order_price"])
    out.writerows(csv_rows)

    use_case = "uc1_csv_to_json_nested"
    meta = {
        "use_case": use_case,
        "complexity": complexity,
        "input_format": "csv",
        "output_format": "json",
        "description": (
            "Flat CSV with user_* and order_* columns -> nested JSON with user "
            "object and orders array"
        ),
        "prompt_hint": (
            "Group rows by user (user_name/user_email). user_* fields nest under "
            "'user'; order_* fields become objects in an 'orders' array. Keep "
            "order_id/order_price numeric (do not quote)."
        ),
        "seed": seed,
    }
    return GeneratedCase(
        use_case=use_case,
        complexity=complexity,
        input_format="csv",
        output_format="json",
        input_text=sink.getvalue(),
        expected_text=json.dumps(users, indent=2) + "\n",
        meta=meta,
    )
@@ -0,0 +1,59 @@
1
+ """Generator: nested JSON (name + address object) -> flattened CSV (dot-notation columns)."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import csv
6
+ import io
7
+ import json
8
+ import random
9
+
10
+ from .base import GeneratedCase, make_faker
11
+
12
# Number of records generated per complexity tier.
_N = {"simple": 3, "medium": 8, "complex": 20}


def generate(seed: int, complexity: str) -> GeneratedCase:
    """Build one nested-JSON -> flattened-CSV case, deterministic for `seed`.

    Every record has a name and an address object (city, zip); all tiers
    except "simple" also carry an age. The expected CSV uses dot-notation
    column names.

    Raises:
        KeyError: If `complexity` is not one of the known tiers.
    """
    rng = random.Random(seed)
    fake = make_faker(seed)
    with_age = complexity != "simple"

    people: list[dict] = []
    for idx in range(_N[complexity]):
        # Strip commas from city so the oracle CSV stays comma-clean.
        city = fake.city().replace(",", "")
        name = f"{fake.first_name()}{idx}"
        postcode = str(fake.postcode())
        person: dict = {"name": name, "address": {"city": city, "zip": postcode}}
        if with_age:
            person["age"] = rng.randint(18, 80)
        people.append(person)

    input_text = json.dumps(people, indent=2) + "\n"

    header = ["name", "address.city", "address.zip"]
    if with_age:
        header = header + ["age"]

    sink = io.StringIO()
    out = csv.writer(sink, lineterminator="\n")
    out.writerow(header)
    for person in people:
        address = person["address"]
        cells = [person["name"], address["city"], address["zip"]]
        if with_age:
            cells.append(person["age"])
        out.writerow(cells)
    expected_text = sink.getvalue()

    use_case = "uc2_json_to_csv_flatten"
    meta = {
        "use_case": use_case,
        "complexity": complexity,
        "input_format": "json",
        "output_format": "csv",
        "description": "Nested JSON records -> flattened CSV with dot-notation columns",
        "prompt_hint": (
            "Flatten nested keys with dot notation (address.city, address.zip). "
            "One CSV row per record. Header order: " + ",".join(header) + "."
        ),
        "seed": seed,
    }
    return GeneratedCase(
        use_case=use_case,
        complexity=complexity,
        input_format="json",
        output_format="csv",
        input_text=input_text,
        expected_text=expected_text,
        meta=meta,
    )
@@ -0,0 +1,64 @@
1
+ """Generator: bracketed-timestamp logs (TXT) -> CSV (timestamp,level,source,message)."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import csv
6
+ import io
7
+ import random
8
+ from datetime import datetime, timedelta
9
+
10
+ from .base import GeneratedCase
11
+
12
_LEVELS = ["INFO", "WARN", "ERROR", "DEBUG"]
_SOURCES = ["app", "db", "auth", "cache", "api"]
# Comma-free messages keep the oracle CSV unquoted and exactly verifiable.
_MESSAGES = [
    "Server started",
    "Connection lost",
    "Slow query detected",
    "Reconnecting",
    "User login succeeded",
    "Cache miss",
    "Request handled",
    "Disk space low",
]
# Number of log lines generated per complexity tier.
_N_LINES = {"simple": 6, "medium": 30, "complex": 120}


def generate(seed: int, complexity: str) -> GeneratedCase:
    """Build one bracketed-log TXT -> CSV case, deterministic for `seed`.

    Timestamps start from a fixed base time and advance by a random 1-90
    seconds per line. The log shows them as `YYYY-MM-DD HH:MM:SS`, the
    expected CSV as `YYYY-MM-DDTHH:MM:SS`.

    Raises:
        KeyError: If `complexity` is not one of the known tiers.
    """
    rng = random.Random(seed)
    line_count = _N_LINES[complexity]
    clock = datetime(2026, 4, 15, 10, 0, 0)

    log_lines: list[str] = []
    csv_rows: list[tuple[str, str, str, str]] = []
    for _ in range(line_count):
        clock += timedelta(seconds=rng.randint(1, 90))
        level = rng.choice(_LEVELS)
        source = rng.choice(_SOURCES)
        message = rng.choice(_MESSAGES)
        log_stamp = clock.strftime("%Y-%m-%d %H:%M:%S")
        iso_stamp = clock.strftime("%Y-%m-%dT%H:%M:%S")
        log_lines.append(f"[{log_stamp}] {level} {source}: {message}")
        csv_rows.append((iso_stamp, level, source, message))

    input_text = "\n".join(log_lines) + "\n"

    sink = io.StringIO()
    out = csv.writer(sink, lineterminator="\n")
    out.writerow(["timestamp", "level", "source", "message"])
    out.writerows(csv_rows)
    expected_text = sink.getvalue()

    use_case = "uc3_txt_log_to_csv"
    meta = {
        "use_case": use_case,
        "complexity": complexity,
        "input_format": "txt",
        "output_format": "csv",
        "description": "Bracketed-timestamp logs -> CSV timestamp,level,source,message",
        "prompt_hint": (
            "Each line: [YYYY-MM-DD HH:MM:SS] LEVEL source: message. Output CSV "
            "columns timestamp (ISO-8601 YYYY-MM-DDTHH:MM:SS), level, source, message."
        ),
        "seed": seed,
    }
    return GeneratedCase(
        use_case=use_case,
        complexity=complexity,
        input_format="txt",
        output_format="csv",
        input_text=input_text,
        expected_text=expected_text,
        meta=meta,
    )
@@ -0,0 +1,62 @@
1
+ """Generator: regional sales CSV -> human-readable TXT report with totals."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import csv
6
+ import io
7
+ import random
8
+
9
+ from .base import GeneratedCase
10
+
11
# Region names; a case uses the first N of them, in this order.
_POOL = [
    "North", "South", "East", "West", "Central",
    "Northeast", "Southwest", "Northwest", "Southeast",
]
# Number of regions used per complexity tier.
_N = {"simple": 4, "medium": 6, "complex": 9}


def generate(seed: int, complexity: str) -> GeneratedCase:
    """Build one sales-CSV -> plain-text report case, deterministic for `seed`.

    Besides the expected report, the metadata carries `required_substrings`:
    the title fragment, every region name and sales figure, and both total
    lines.

    Raises:
        KeyError: If `complexity` is not one of the known tiers.
    """
    rng = random.Random(seed)
    data: list[tuple[str, int, int]] = []
    for region in _POOL[: _N[complexity]]:
        # Sales first, then units: the draw order fixes the generated values.
        sales = rng.randint(1000, 20000)
        units = rng.randint(5, 50)
        data.append((region, sales, units))
    total_sales = sum(row[1] for row in data)
    total_units = sum(row[2] for row in data)
    sales_line = f"Total Sales: {total_sales}"
    units_line = f"Total Units: {total_units}"

    sink = io.StringIO()
    out = csv.writer(sink, lineterminator="\n")
    out.writerow(["region", "sales", "units"])
    out.writerows(data)
    input_text = sink.getvalue()

    report = ["Sales Report by Region", "======================", "Region Sales Units"]
    # Region padded to 10 columns, sales to 9, units left unpadded.
    report.extend(
        region.ljust(10) + str(sales).ljust(9) + str(units)
        for region, sales, units in data
    )
    report.extend(["", sales_line, units_line])
    expected_text = "\n".join(report) + "\n"

    required = ["Sales Report"]
    for region, sales, _ in data:
        required.append(region)
        required.append(str(sales))
    required.extend([sales_line, units_line])

    use_case = "uc4_csv_to_txt_report"
    meta = {
        "use_case": use_case,
        "complexity": complexity,
        "input_format": "csv",
        "output_format": "txt",
        "description": "Regional sales CSV -> human-readable TXT report with totals",
        "prompt_hint": (
            "Produce a plain-text report titled 'Sales Report by Region', an "
            "aligned table of the regions, then two summary lines "
            "'Total Sales: <sum>' and 'Total Units: <sum>'."
        ),
        "content_accuracy_mode": "txt_substring",
        "required_substrings": required,
        "seed": seed,
    }
    return GeneratedCase(
        use_case=use_case,
        complexity=complexity,
        input_format="csv",
        output_format="txt",
        input_text=input_text,
        expected_text=expected_text,
        meta=meta,
    )
@@ -0,0 +1,49 @@
1
+ """Generator: JSON schema migration — rename user_* keys to bare names."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ import random
7
+
8
+ from .base import GeneratedCase, make_faker
9
+
10
# Number of records generated per complexity tier.
_N = {"simple": 3, "medium": 8, "complex": 20}


def generate(seed: int, complexity: str) -> GeneratedCase:
    """Build one JSON key-rename case, deterministic for `seed`.

    Source records use `user_name` / `user_email` (plus `user_age` outside the
    "simple" tier); expected records carry the same values under `name`,
    `email` and `age`, in the same order.

    Raises:
        KeyError: If `complexity` is not one of the known tiers.
    """
    rng = random.Random(seed)
    fake = make_faker(seed)
    with_age = complexity != "simple"

    before: list[dict] = []
    after: list[dict] = []
    for idx in range(_N[complexity]):
        name = f"{fake.first_name()}{idx}"
        email = f"{name.lower()}@example.com"
        old: dict = {"user_name": name, "user_email": email}
        new: dict = {"name": name, "email": email}
        if with_age:
            age = rng.randint(18, 80)
            old["user_age"] = age
            new["age"] = age
        before.append(old)
        after.append(new)

    use_case = "uc5_schema_migration"
    meta = {
        "use_case": use_case,
        "complexity": complexity,
        "input_format": "json",
        "output_format": "json",
        "description": "JSON schema migration — rename user_* keys to bare names",
        "prompt_hint": (
            "Rename keys on every record: user_name->name, user_email->email, "
            "user_age->age. Preserve values and record order."
        ),
        "seed": seed,
    }
    return GeneratedCase(
        use_case=use_case,
        complexity=complexity,
        input_format="json",
        output_format="json",
        input_text=json.dumps(before, indent=2) + "\n",
        expected_text=json.dumps(after, indent=2) + "\n",
        meta=meta,
    )
@@ -0,0 +1,95 @@
1
+ """Stage 4 — run an Opus-authored conversion script in a subprocess.
2
+
3
+ Trusted-but-buggy execution: a timeout and (POSIX) CPU limit guard against
4
+ runaway scripts. We intentionally do not cap virtual memory (RLIMIT_AS),
5
+ because pandas/numpy reserve large virtual address space and would crash.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import os
11
+ import subprocess
12
+ import sys
13
+ import tempfile
14
+ import time
15
+ from dataclasses import dataclass
16
+ from pathlib import Path
17
+
18
# Default wall-clock budget, in seconds, used as the subprocess timeout.
DEFAULT_TIMEOUT_SEC = 15.0

# Default CPU-time cap, in seconds, applied through RLIMIT_CPU on POSIX.
DEFAULT_CPU_SECONDS = 15
20
+
21
+
22
@dataclass(frozen=True)
class SandboxResult:
    """Immutable record of one sandboxed script run."""

    # Text read back from the script's output file ("" when there is none).
    output_text: str
    # Exit status of the child process (-1 is used for a timeout).
    returncode: int
    # Captured stderr, or a short explanation produced by the sandbox itself.
    stderr: str
    # Wall-clock duration of the run in seconds.
    elapsed_sec: float
    error_kind: str  # "ok" | "syntax" | "runtime" | "timeout" | "empty_output"

    @property
    def ok(self) -> bool:
        """True only when `error_kind` is "ok"."""
        return "ok" == self.error_kind
33
+
34
+
35
+ def _posix_limits(cpu_seconds: int):
36
+ def _apply() -> None:
37
+ import resource
38
+
39
+ resource.setrlimit(resource.RLIMIT_CPU, (cpu_seconds, cpu_seconds))
40
+
41
+ return _apply
42
+
43
+
44
def run_script(
    script: str,
    input_path: Path,
    *,
    output_suffix: str,
    timeout_sec: float = DEFAULT_TIMEOUT_SEC,
    cpu_seconds: int = DEFAULT_CPU_SECONDS,
) -> SandboxResult:
    """Write `script` to a temp dir, run it on `input_path`, return the output.

    The script is invoked as `python convert.py <input_path> <output_path>`,
    with the temp dir as working directory. It must write its result to
    `<output_path>`, which is read back as UTF-8. The temp dir is removed
    before returning.

    Args:
        script: Python source to execute.
        input_path: File handed to the script; resolved to an absolute path.
        output_suffix: Extension (with dot) of the output file the script is
            asked to write.
        timeout_sec: Wall-clock limit for the child process.
        cpu_seconds: CPU-time limit applied to the child on POSIX only.

    Returns:
        A SandboxResult whose `error_kind` is one of:
        "ok"; "timeout" (wall-clock limit hit); "syntax" or "runtime"
        (non-zero exit, told apart by the error names in stderr);
        "empty_output" (no output file, or only whitespace); "runtime" is
        also used when the output file exists but cannot be read as UTF-8.
    """
    input_path = Path(input_path).resolve()
    with tempfile.TemporaryDirectory() as tmp:
        tmpdir = Path(tmp)
        script_path = tmpdir / "convert.py"
        out_path = tmpdir / f"output{output_suffix}"
        script_path.write_text(script, encoding="utf-8")

        preexec = _posix_limits(cpu_seconds) if os.name == "posix" else None
        start = time.perf_counter()
        try:
            proc = subprocess.run(
                [sys.executable, str(script_path), str(input_path), str(out_path)],
                capture_output=True,
                text=True,
                # A buggy script may print bytes that do not decode. Replace
                # them instead of letting the decode error escape and abort
                # the caller's whole batch.
                errors="replace",
                timeout=timeout_sec,
                cwd=str(tmpdir),
                preexec_fn=preexec,
            )
        except subprocess.TimeoutExpired:
            return SandboxResult(
                output_text="",
                returncode=-1,
                stderr=f"Timed out after {timeout_sec}s",
                elapsed_sec=time.perf_counter() - start,
                error_kind="timeout",
            )
        elapsed = time.perf_counter() - start

        if proc.returncode != 0:
            syntax_markers = ("SyntaxError", "IndentationError", "TabError")
            kind = (
                "syntax" if any(m in proc.stderr for m in syntax_markers) else "runtime"
            )
            return SandboxResult("", proc.returncode, proc.stderr, elapsed, kind)

        if not out_path.exists():
            return SandboxResult("", 0, proc.stderr, elapsed, "empty_output")
        try:
            output_text = out_path.read_text(encoding="utf-8")
        except (OSError, UnicodeDecodeError) as exc:
            # The script exited cleanly but left something unreadable (non
            # UTF-8 bytes, or a directory at the output path). Report it as a
            # script failure so the caller can feed it back rather than crash.
            return SandboxResult(
                "",
                0,
                f"could not read script output as UTF-8: {exc}",
                elapsed,
                "runtime",
            )
        if not output_text.strip():
            return SandboxResult(output_text, 0, proc.stderr, elapsed, "empty_output")

        return SandboxResult(output_text, 0, proc.stderr, elapsed, "ok")