data-morph-gemma 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data_morph_gemma-0.1.0.dist-info/METADATA +177 -0
- data_morph_gemma-0.1.0.dist-info/RECORD +39 -0
- data_morph_gemma-0.1.0.dist-info/WHEEL +4 -0
- data_morph_gemma-0.1.0.dist-info/entry_points.txt +2 -0
- data_morph_gemma-0.1.0.dist-info/licenses/LICENSE +25 -0
- datamorph/__init__.py +19 -0
- datamorph/cli.py +84 -0
- datamorph/convert.py +146 -0
- datamorph/data/__init__.py +1 -0
- datamorph/data/collect.py +221 -0
- datamorph/data/envelope.py +20 -0
- datamorph/data/generators/__init__.py +1 -0
- datamorph/data/generators/base.py +48 -0
- datamorph/data/generators/uc1_csv_to_json.py +64 -0
- datamorph/data/generators/uc2_json_to_csv.py +59 -0
- datamorph/data/generators/uc3_txt_log_to_csv.py +64 -0
- datamorph/data/generators/uc4_csv_to_txt_report.py +62 -0
- datamorph/data/generators/uc5_schema_migration.py +49 -0
- datamorph/data/sandbox.py +95 -0
- datamorph/data/teacher_script.py +114 -0
- datamorph/evaluation/__init__.py +0 -0
- datamorph/evaluation/metrics.py +264 -0
- datamorph/evaluation/output_cleanup.py +116 -0
- datamorph/evaluation/runner.py +218 -0
- datamorph/evaluation/teacher.py +193 -0
- datamorph/extractor/__init__.py +15 -0
- datamorph/extractor/base.py +26 -0
- datamorph/extractor/csv_extractor.py +515 -0
- datamorph/extractor/json_extractor.py +447 -0
- datamorph/extractor/json_walker.py +217 -0
- datamorph/extractor/sampler.py +68 -0
- datamorph/extractor/txt_extractor.py +199 -0
- datamorph/extractor/warning_rules.py +473 -0
- datamorph/features/__init__.py +1 -0
- datamorph/features/format_pairs.py +57 -0
- datamorph/model.py +63 -0
- datamorph/models/__init__.py +0 -0
- datamorph/models/gemma_mlx.py +163 -0
- datamorph/models/gemma_script_teacher.py +100 -0
|
@@ -0,0 +1,221 @@
|
|
|
1
|
+
"""Stage 3+4+5 orchestrator: envelope -> teacher script -> sandbox -> verify.
|
|
2
|
+
|
|
3
|
+
The teacher is injected as `teacher_fn` so the loop is fully testable without
|
|
4
|
+
any API calls. `collect_corpus` is the batch driver used by the CLI.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import json
|
|
10
|
+
from dataclasses import dataclass, field
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
from typing import Any, Callable
|
|
13
|
+
|
|
14
|
+
from datamorph.data.envelope import extract_envelope
|
|
15
|
+
from datamorph.data.generators.base import EXT_BY_FORMAT
|
|
16
|
+
from datamorph.data.sandbox import run_script
|
|
17
|
+
from datamorph.data.teacher_script import ScriptResult, call_script_teacher
|
|
18
|
+
from datamorph.evaluation.metrics import score_all
|
|
19
|
+
from datamorph.evaluation.runner import CaseSpec, discover_cases
|
|
20
|
+
|
|
21
|
+
CA_MIN = 0.95
|
|
22
|
+
TeacherFn = Callable[..., ScriptResult]
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
@dataclass
class PairResult:
    """Outcome of the teach -> run -> verify loop for a single corpus case."""

    # --- identity of the case (always supplied by the caller) ---
    case_id: str
    use_case: str
    complexity: str
    input_format: str
    output_format: str

    # --- verdict ---
    accepted: bool = False
    scores: dict[str, float] = field(default_factory=dict)
    retries: int = 0
    error_kind: str = ""
    reason: str = ""

    # --- material produced along the way ---
    envelope: dict[str, Any] = field(default_factory=dict)
    instruction: str = ""
    analysis: str = ""
    script: str = ""
    teacher_usage: dict[str, Any] | None = None  # Opus token usage (cost + W6 analysis)
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def _passes(scores: dict[str, float]) -> bool:
    """Tell whether ``scores`` clears every acceptance gate.

    The three structural metrics must be exactly 1.0 and ``content_accuracy``
    must reach ``CA_MIN``; a metric absent from ``scores`` counts as 0.0.
    """
    for metric in ("format_validity", "loadability", "schema_compliance"):
        if scores.get(metric, 0.0) != 1.0:
            return False
    return scores.get("content_accuracy", 0.0) >= CA_MIN
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def _failing_metrics(scores: dict[str, float]) -> list[str]:
    """Return the names of the metrics that keep ``scores`` from passing.

    Each test is the exact negation of the matching condition in ``_passes``
    (structural metrics must equal 1.0, ``content_accuracy`` must be at least
    ``CA_MIN``).  Negating the pass condition, rather than testing ``< 1.0``,
    means a NaN or greater-than-one score is reported as failing instead of
    producing an empty list for a result that was nevertheless rejected.

    Args:
        scores: Metric name -> score; a missing metric counts as 0.0.

    Returns:
        Failing metric names, structural metrics first, in a fixed order.
    """
    failing = [
        name
        for name in ("format_validity", "loadability", "schema_compliance")
        if not scores.get(name, 0.0) == 1.0
    ]
    if not scores.get("content_accuracy", 0.0) >= CA_MIN:
        failing.append("content_accuracy")
    return failing
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def collect_case(
    case: CaseSpec,
    *,
    teacher_fn: TeacherFn = call_script_teacher,
    max_retries: int = 3,
) -> PairResult:
    """Run the full teach->run->verify loop for one case, retrying with feedback.

    Every attempt asks ``teacher_fn`` for a script, executes it with
    ``run_script`` and scores the produced text with ``score_all``.  The
    failure reason of an attempt is handed to the next attempt as ``feedback``.

    Args:
        case: Corpus case.  ``case.meta`` must provide ``input_format``,
            ``output_format``, ``use_case`` and ``complexity``; ``prompt_hint``
            and ``required_substrings`` are optional.
        teacher_fn: Called as
            ``teacher_fn(envelope, instruction, output_format, feedback=...)``.
        max_retries: Extra attempts after the first one, i.e. at most
            ``max_retries + 1`` teacher calls.

    Returns:
        A ``PairResult`` for the accepted attempt, or for the last failed
        attempt when none was accepted.  ``scores``, ``analysis`` and
        ``script`` always belong to that same attempt.

    Raises:
        KeyError: If a required ``meta`` key or a format extension is missing.
    """
    meta = case.meta
    in_ext = EXT_BY_FORMAT[meta["input_format"]]
    out_ext = EXT_BY_FORMAT[meta["output_format"]]
    input_path = case.case_dir / f"input{in_ext}"

    envelope = extract_envelope(input_path, meta["input_format"])
    envelope.pop("file_path", None)  # don't leak local paths into training data
    instruction = meta.get("prompt_hint") or (
        f"Convert this {meta['input_format'].upper()} to {meta['output_format'].upper()}."
    )

    result = PairResult(
        case_id=case.case_id,
        use_case=meta["use_case"],
        complexity=meta["complexity"],
        input_format=meta["input_format"],
        output_format=meta["output_format"],
        envelope=envelope,
        instruction=instruction,
    )

    feedback: str | None = None
    for attempt in range(max_retries + 1):
        result.retries = attempt
        # Start every attempt from a clean slate: otherwise scores or a script
        # from an earlier attempt would be reported next to the error of a
        # later attempt that failed before reaching that stage.
        result.scores = {}
        result.analysis, result.script = "", ""

        tr = teacher_fn(envelope, instruction, meta["output_format"], feedback=feedback)
        # Capture token usage from the latest teacher response (claude -p JSON
        # payload carries it). One-shot data — not recoverable after the run.
        usage = tr.raw_payload.get("usage") if tr.raw_payload else None
        if usage:
            result.teacher_usage = usage
        if not tr.ok:
            result.error_kind = "no_script"
            # `or ""` keeps the slice safe should the teacher report no stderr.
            result.reason = (
                f"teacher produced no <script> (stderr: {(tr.stderr or '')[:200]})"
            )
            feedback = result.reason
            continue

        result.analysis, result.script = tr.analysis, tr.script
        sr = run_script(tr.script, input_path, output_suffix=out_ext)
        if not sr.ok:
            result.error_kind = sr.error_kind
            result.reason = f"script {sr.error_kind}: {sr.stderr[:300]}"
            feedback = result.reason
            continue

        scores = score_all(
            actual=sr.output_text,
            expected=case.expected_text,
            output_format=meta["output_format"],
            required_substrings=meta.get("required_substrings"),
        )
        result.scores = scores
        if _passes(scores):
            result.accepted = True
            result.error_kind = "ok"
            result.reason = ""
            return result
        failing = _failing_metrics(scores)
        result.error_kind = "low_score"
        result.reason = f"output scored low on {failing}: {scores}"
        feedback = result.reason

    return result
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
def _write_text_atomic(path: Path, text: str) -> None:
    """Write ``text`` to ``path`` through a sibling temp file and a rename."""
    tmp_path = path.with_name(path.name + ".tmp")
    tmp_path.write_text(text, encoding="utf-8")
    # Path.replace renames over the destination, so `path` is never observed
    # half-written even if the process is interrupted during the write.
    tmp_path.replace(path)


def collect_corpus(
    raw_root: Path,
    interim_root: Path,
    *,
    teacher_fn: TeacherFn = call_script_teacher,
    max_retries: int = 3,
    limit: int | None = None,
    resume: bool = False,
) -> dict[str, Any]:
    """Run collect_case over every corpus case; write accepted records + a manifest.

    When ``resume`` is True, any case that already has an accepted record file in
    ``interim_root`` is skipped without calling the teacher, so an interrupted run
    (e.g. one stopped by a teacher usage limit) can be continued without
    re-spending Opus calls on pairs already collected.

    Record files are written atomically: because ``resume`` trusts the mere
    existence of a record file, a truncated file left by an interrupted write
    would otherwise be treated as an accepted record forever.

    Args:
        raw_root: Directory handed to ``discover_cases``.
        interim_root: Output directory (created if missing) for the per-case
            records and ``collect_manifest.json``.
        teacher_fn: Forwarded to ``collect_case``.
        max_retries: Forwarded to ``collect_case``.
        limit: When not None, only the first ``limit`` discovered cases are used.
        resume: Skip cases whose record file already exists.

    Returns:
        The summary dict that is also written to ``collect_manifest.json``.
    """
    cases = discover_cases(raw_root)
    if limit is not None:
        cases = cases[:limit]
    interim_root.mkdir(parents=True, exist_ok=True)

    manifest: list[dict[str, Any]] = []
    n_accepted = 0
    n_skipped = 0
    for case in cases:
        out_path = interim_root / f"{case.meta['use_case']}__{case.case_dir.name}.json"
        if resume and out_path.exists():
            n_skipped += 1
            manifest.append(
                {
                    "case_id": case.case_id,
                    "use_case": case.meta["use_case"],
                    "accepted": True,
                    "error_kind": "skipped_existing",
                    "retries": 0,
                    "scores": {},
                    "reason": "",
                }
            )
            continue
        res = collect_case(case, teacher_fn=teacher_fn, max_retries=max_retries)
        if res.accepted:
            n_accepted += 1
            record = {
                "case_id": res.case_id,
                "use_case": res.use_case,
                "complexity": res.complexity,
                "input_format": res.input_format,
                "output_format": res.output_format,
                "envelope": res.envelope,
                "instruction": res.instruction,
                "analysis": res.analysis,
                "script": res.script,
                "scores": res.scores,
                "retries": res.retries,
                "teacher_usage": res.teacher_usage,
            }
            # res.use_case is meta["use_case"], so out_path computed above is
            # already the right destination.
            _write_text_atomic(out_path, json.dumps(record, indent=2, default=str))
        manifest.append(
            {
                "case_id": res.case_id,
                "use_case": res.use_case,
                "accepted": res.accepted,
                "error_kind": res.error_kind,
                "retries": res.retries,
                "scores": res.scores,
                "reason": res.reason if not res.accepted else "",
            }
        )

    n_attempted = len(cases) - n_skipped
    summary = {
        "n_cases": len(cases),
        "n_skipped": n_skipped,
        "n_attempted": n_attempted,
        "n_accepted": n_accepted,
        "n_records_total": n_accepted + n_skipped,
        # accept_rate is over cases actually attempted this run (skipped ones
        # were already accepted), so a resumed run's rate stays meaningful.
        "accept_rate": round(n_accepted / n_attempted, 3) if n_attempted else 0.0,
        "results": manifest,
    }
    _write_text_atomic(
        interim_root / "collect_manifest.json",
        json.dumps(summary, indent=2, default=str),
    )
    return summary
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
"""Dispatch a source file to the right Stage-1 extractor and return its envelope."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
from datamorph.extractor import CSVExtractor, JSONExtractor, MetadataExtractor, TXTExtractor
|
|
9
|
+
|
|
10
|
+
# Format name -> Stage-1 extractor class.
_EXTRACTORS: dict[str, type[MetadataExtractor]] = dict(
    csv=CSVExtractor,
    json=JSONExtractor,
    txt=TXTExtractor,
)


def extract_envelope(input_path: Path, input_format: str) -> dict[str, Any]:
    """Build the metadata envelope of `input_path` with the extractor for `input_format`.

    Raises:
        KeyError: If `input_format` has no registered extractor (intentional).
    """
    extractor = _EXTRACTORS[input_format]()
    return extractor.extract(input_path)
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Synthetic source-file generators — the ground-truth oracle."""
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
"""Shared primitives for the synthetic source-file generators (the oracle)."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
from dataclasses import dataclass
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import Any
|
|
9
|
+
|
|
10
|
+
from faker import Faker
|
|
11
|
+
|
|
12
|
+
EXT_BY_FORMAT = {"csv": ".csv", "json": ".json", "txt": ".txt"}
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
@dataclass(frozen=True)
class GeneratedCase:
    """One synthetic conversion case; input and expected output share a seed."""

    # Routing metadata.
    use_case: str
    complexity: str
    input_format: str
    output_format: str
    # The two sides of the conversion, as file contents.
    input_text: str
    expected_text: str
    # Free-form metadata; write_case serializes it to meta.json.
    meta: dict[str, Any]
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def make_faker(seed: int) -> Faker:
    """Build a fresh Faker and seed that instance with `seed`."""
    seeded = Faker()
    seeded.seed_instance(seed)
    return seeded
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def write_case(case: GeneratedCase, dest_root: Path, case_name: str) -> Path:
    """Materialize `case` under dest_root/<use_case>/<case_name>/ (test_set layout).

    Writes, in this order, ``input<ext>``, ``expected<ext>`` and ``meta.json``
    as UTF-8 text, creating the directory if needed, and returns the directory.
    """
    case_dir = dest_root / case.use_case / case_name
    case_dir.mkdir(parents=True, exist_ok=True)

    def _emit(filename: str, text: str) -> None:
        (case_dir / filename).write_text(text, encoding="utf-8")

    # Each file name is resolved right before its write, one file at a time.
    _emit(f"input{EXT_BY_FORMAT[case.input_format]}", case.input_text)
    _emit(f"expected{EXT_BY_FORMAT[case.output_format]}", case.expected_text)
    _emit("meta.json", json.dumps(case.meta, indent=2) + "\n")
    return case_dir
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
"""Generator: flat CSV (user_*/order_*) -> nested JSON (user object + orders array)."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import csv
|
|
6
|
+
import io
|
|
7
|
+
import json
|
|
8
|
+
import random
|
|
9
|
+
|
|
10
|
+
from .base import GeneratedCase, make_faker
|
|
11
|
+
|
|
12
|
+
# Per-complexity sizing of the generated data set.
_N_USERS = {"simple": 3, "medium": 8, "complex": 20}
_MAX_ORDERS = {"simple": 1, "medium": 3, "complex": 5}
_ITEMS = ["Widget", "Gadget", "Thingamajig", "Doohickey", "Sprocket", "Cog"]


def generate(seed: int, complexity: str) -> GeneratedCase:
    """Build one flat-CSV -> nested-JSON case from `seed`.

    The CSV has one row per order; the JSON groups the orders of each user
    under a ``user`` object.  Raises KeyError for an unknown `complexity`.
    """
    rng = random.Random(seed)
    fake = make_faker(seed)
    user_count = _N_USERS[complexity]
    order_cap = _MAX_ORDERS[complexity]
    single_order = complexity == "simple"

    csv_rows: list[tuple[str, str, int, str, float]] = []
    nested: list[dict] = []
    next_order_id = 1001
    for idx in range(user_count):
        name = fake.first_name() + str(idx)  # index suffix guarantees uniqueness
        email = name.lower() + "@example.com"
        # "simple" must not draw from rng here, so keep the explicit branch.
        order_count = 1 if single_order else rng.randint(1, order_cap)
        user_orders = []
        for order_id in range(next_order_id, next_order_id + order_count):
            item = rng.choice(_ITEMS)
            price = round(rng.uniform(1, 100), 2)
            csv_rows.append((name, email, order_id, item, price))
            user_orders.append({"id": order_id, "item": item, "price": price})
        next_order_id += order_count
        nested.append({"user": {"name": name, "email": email}, "orders": user_orders})

    header = ["user_name", "user_email", "order_id", "order_item", "order_price"]
    sink = io.StringIO()
    csv.writer(sink, lineterminator="\n").writerows([header, *csv_rows])

    use_case = "uc1_csv_to_json_nested"
    meta = {
        "use_case": use_case,
        "complexity": complexity,
        "input_format": "csv",
        "output_format": "json",
        "description": (
            "Flat CSV with user_* and order_* columns -> nested JSON with user "
            "object and orders array"
        ),
        "prompt_hint": (
            "Group rows by user (user_name/user_email). user_* fields nest under "
            "'user'; order_* fields become objects in an 'orders' array. Keep "
            "order_id/order_price numeric (do not quote)."
        ),
        "seed": seed,
    }
    return GeneratedCase(
        use_case=use_case,
        complexity=complexity,
        input_format="csv",
        output_format="json",
        input_text=sink.getvalue(),
        expected_text=json.dumps(nested, indent=2) + "\n",
        meta=meta,
    )
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
"""Generator: nested JSON (name + address object) -> flattened CSV (dot-notation columns)."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import csv
|
|
6
|
+
import io
|
|
7
|
+
import json
|
|
8
|
+
import random
|
|
9
|
+
|
|
10
|
+
from .base import GeneratedCase, make_faker
|
|
11
|
+
|
|
12
|
+
# Number of records per complexity level.
_N = {"simple": 3, "medium": 8, "complex": 20}


def generate(seed: int, complexity: str) -> GeneratedCase:
    """Build one nested-JSON -> flattened-CSV case from `seed`.

    Records carry ``name`` and an ``address`` object; every complexity other
    than "simple" also gets an ``age``.  Raises KeyError for an unknown
    `complexity`.
    """
    rng = random.Random(seed)
    fake = make_faker(seed)
    with_age = complexity != "simple"

    records: list[dict] = []
    for idx in range(_N[complexity]):
        # Strip commas from city so the oracle CSV stays comma-clean.
        city = fake.city().replace(",", "")
        first = fake.first_name()
        postcode = str(fake.postcode())
        entry: dict = {
            "name": f"{first}{idx}",
            "address": {"city": city, "zip": postcode},
        }
        if with_age:
            entry["age"] = rng.randint(18, 80)
        records.append(entry)

    cols = ["name", "address.city", "address.zip"]
    if with_age:
        cols = cols + ["age"]

    sink = io.StringIO()
    writer = csv.writer(sink, lineterminator="\n")
    writer.writerow(cols)
    for entry in records:
        address = entry["address"]
        cells = [entry["name"], address["city"], address["zip"]]
        if with_age:
            cells.append(entry["age"])
        writer.writerow(cells)

    header_order = ",".join(cols)
    use_case = "uc2_json_to_csv_flatten"
    meta = {
        "use_case": use_case,
        "complexity": complexity,
        "input_format": "json",
        "output_format": "csv",
        "description": "Nested JSON records -> flattened CSV with dot-notation columns",
        "prompt_hint": (
            "Flatten nested keys with dot notation (address.city, address.zip). "
            f"One CSV row per record. Header order: {header_order}."
        ),
        "seed": seed,
    }
    return GeneratedCase(
        use_case=use_case,
        complexity=complexity,
        input_format="json",
        output_format="csv",
        input_text=json.dumps(records, indent=2) + "\n",
        expected_text=sink.getvalue(),
        meta=meta,
    )
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
"""Generator: bracketed-timestamp logs (TXT) -> CSV (timestamp,level,source,message)."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import csv
|
|
6
|
+
import io
|
|
7
|
+
import random
|
|
8
|
+
from datetime import datetime, timedelta
|
|
9
|
+
|
|
10
|
+
from .base import GeneratedCase
|
|
11
|
+
|
|
12
|
+
_LEVELS = ["INFO", "WARN", "ERROR", "DEBUG"]
_SOURCES = ["app", "db", "auth", "cache", "api"]
# Comma-free messages keep the oracle CSV unquoted and exactly verifiable.
_MESSAGES = [
    "Server started",
    "Connection lost",
    "Slow query detected",
    "Reconnecting",
    "User login succeeded",
    "Cache miss",
    "Request handled",
    "Disk space low",
]
# Number of log lines per complexity level.
_N_LINES = {"simple": 6, "medium": 30, "complex": 120}


def generate(seed: int, complexity: str) -> GeneratedCase:
    """Build one bracketed-timestamp log -> CSV case from `seed`.

    Timestamps start after 2026-04-15 10:00:00 and advance by 1-90 seconds per
    line.  Raises KeyError for an unknown `complexity`.
    """
    rng = random.Random(seed)
    line_count = _N_LINES[complexity]
    clock = datetime(2026, 4, 15, 10, 0, 0)

    log_lines: list[str] = []
    csv_rows: list[tuple[str, str, str, str]] = []
    for _ in range(line_count):
        clock += timedelta(seconds=rng.randint(1, 90))
        # Draw order (level, source, message) fixes the output for a seed.
        level = rng.choice(_LEVELS)
        source = rng.choice(_SOURCES)
        msg = rng.choice(_MESSAGES)
        bracketed = clock.strftime("%Y-%m-%d %H:%M:%S")
        iso_stamp = clock.strftime("%Y-%m-%dT%H:%M:%S")
        log_lines.append(f"[{bracketed}] {level} {source}: {msg}")
        csv_rows.append((iso_stamp, level, source, msg))

    sink = io.StringIO()
    csv.writer(sink, lineterminator="\n").writerows(
        [["timestamp", "level", "source", "message"], *csv_rows]
    )

    use_case = "uc3_txt_log_to_csv"
    meta = {
        "use_case": use_case,
        "complexity": complexity,
        "input_format": "txt",
        "output_format": "csv",
        "description": "Bracketed-timestamp logs -> CSV timestamp,level,source,message",
        "prompt_hint": (
            "Each line: [YYYY-MM-DD HH:MM:SS] LEVEL source: message. Output CSV "
            "columns timestamp (ISO-8601 YYYY-MM-DDTHH:MM:SS), level, source, message."
        ),
        "seed": seed,
    }
    return GeneratedCase(
        use_case=use_case,
        complexity=complexity,
        input_format="txt",
        output_format="csv",
        input_text="\n".join(log_lines) + "\n",
        expected_text=sink.getvalue(),
        meta=meta,
    )
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
"""Generator: regional sales CSV -> human-readable TXT report with totals."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import csv
|
|
6
|
+
import io
|
|
7
|
+
import random
|
|
8
|
+
|
|
9
|
+
from .base import GeneratedCase
|
|
10
|
+
|
|
11
|
+
# Region names; a case uses the first _N[complexity] of them.
_POOL = [
    "North", "South", "East", "West", "Central",
    "Northeast", "Southwest", "Northwest", "Southeast",
]
_N = {"simple": 4, "medium": 6, "complex": 9}


def generate(seed: int, complexity: str) -> GeneratedCase:
    """Build one regional-sales CSV -> plain-text report case from `seed`.

    The expected report lists every region and ends with the sales and units
    totals; ``meta["required_substrings"]`` holds the strings built here from
    the title, each region with its sales figure, and the two total lines.
    Raises KeyError for an unknown `complexity`.
    """
    rng = random.Random(seed)
    regions = _POOL[: _N[complexity]]

    data: list[tuple[str, int, int]] = []
    for region in regions:
        # Sales is drawn before units for every region.
        sales = rng.randint(1000, 20000)
        units = rng.randint(5, 50)
        data.append((region, sales, units))

    total_sales = sum(entry[1] for entry in data)
    total_units = sum(entry[2] for entry in data)
    sales_line = f"Total Sales: {total_sales}"
    units_line = f"Total Units: {total_units}"

    sink = io.StringIO()
    csv.writer(sink, lineterminator="\n").writerows(
        [["region", "sales", "units"], *data]
    )
    input_text = sink.getvalue()

    report = ["Sales Report by Region", "======================", "Region Sales Units"]
    report.extend(f"{region:<10}{sales:<9}{units}" for region, sales, units in data)
    report.extend(["", sales_line, units_line])
    expected_text = "\n".join(report) + "\n"

    required = ["Sales Report"]
    for region, sales, _ in data:
        required.append(region)
        required.append(str(sales))
    required.extend([sales_line, units_line])

    use_case = "uc4_csv_to_txt_report"
    meta = {
        "use_case": use_case,
        "complexity": complexity,
        "input_format": "csv",
        "output_format": "txt",
        "description": "Regional sales CSV -> human-readable TXT report with totals",
        "prompt_hint": (
            "Produce a plain-text report titled 'Sales Report by Region', an "
            "aligned table of the regions, then two summary lines "
            "'Total Sales: <sum>' and 'Total Units: <sum>'."
        ),
        "content_accuracy_mode": "txt_substring",
        "required_substrings": required,
        "seed": seed,
    }
    return GeneratedCase(
        use_case=use_case,
        complexity=complexity,
        input_format="csv",
        output_format="txt",
        input_text=input_text,
        expected_text=expected_text,
        meta=meta,
    )
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
"""Generator: JSON schema migration — rename user_* keys to bare names."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
import random
|
|
7
|
+
|
|
8
|
+
from .base import GeneratedCase, make_faker
|
|
9
|
+
|
|
10
|
+
# Number of records per complexity level.
_N = {"simple": 3, "medium": 8, "complex": 20}


def generate(seed: int, complexity: str) -> GeneratedCase:
    """Build one JSON schema-migration case (``user_*`` keys -> bare names).

    Every complexity other than "simple" also carries ``user_age``.  Raises
    KeyError for an unknown `complexity`.
    """
    rng = random.Random(seed)
    fake = make_faker(seed)
    with_age = complexity != "simple"
    prefix_len = len("user_")

    src: list[dict] = []
    for idx in range(_N[complexity]):
        name = fake.first_name() + str(idx)
        old: dict = {"user_name": name, "user_email": name.lower() + "@example.com"}
        if with_age:
            old["user_age"] = rng.randint(18, 80)
        src.append(old)

    # Every source key starts with "user_"; dropping the prefix keeps key order.
    dst: list[dict] = [
        {key[prefix_len:]: value for key, value in old.items()} for old in src
    ]

    use_case = "uc5_schema_migration"
    meta = {
        "use_case": use_case,
        "complexity": complexity,
        "input_format": "json",
        "output_format": "json",
        "description": "JSON schema migration — rename user_* keys to bare names",
        "prompt_hint": (
            "Rename keys on every record: user_name->name, user_email->email, "
            "user_age->age. Preserve values and record order."
        ),
        "seed": seed,
    }
    return GeneratedCase(
        use_case=use_case,
        complexity=complexity,
        input_format="json",
        output_format="json",
        input_text=json.dumps(src, indent=2) + "\n",
        expected_text=json.dumps(dst, indent=2) + "\n",
        meta=meta,
    )
|
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
"""Stage 4 — run an Opus-authored conversion script in a subprocess.
|
|
2
|
+
|
|
3
|
+
Trusted-but-buggy execution: a timeout and (POSIX) CPU limit guard against
|
|
4
|
+
runaway scripts. We intentionally do not cap virtual memory (RLIMIT_AS),
|
|
5
|
+
because pandas/numpy reserve large virtual address space and would crash.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import os
|
|
11
|
+
import subprocess
|
|
12
|
+
import sys
|
|
13
|
+
import tempfile
|
|
14
|
+
import time
|
|
15
|
+
from dataclasses import dataclass
|
|
16
|
+
from pathlib import Path
|
|
17
|
+
|
|
18
|
+
DEFAULT_TIMEOUT_SEC = 15.0
|
|
19
|
+
DEFAULT_CPU_SECONDS = 15
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
@dataclass(frozen=True)
class SandboxResult:
    """Immutable record of one sandboxed script run."""

    output_text: str
    returncode: int
    stderr: str
    elapsed_sec: float
    # One of: "ok" | "syntax" | "runtime" | "timeout" | "empty_output"
    error_kind: str

    @property
    def ok(self) -> bool:
        """True only when ``error_kind`` is exactly "ok"."""
        succeeded = self.error_kind == "ok"
        return succeeded
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def _posix_limits(cpu_seconds: int):
|
|
36
|
+
def _apply() -> None:
|
|
37
|
+
import resource
|
|
38
|
+
|
|
39
|
+
resource.setrlimit(resource.RLIMIT_CPU, (cpu_seconds, cpu_seconds))
|
|
40
|
+
|
|
41
|
+
return _apply
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def run_script(
    script: str,
    input_path: Path,
    *,
    output_suffix: str,
    timeout_sec: float = DEFAULT_TIMEOUT_SEC,
    cpu_seconds: int = DEFAULT_CPU_SECONDS,
) -> SandboxResult:
    """Write `script` to a temp dir, run it on `input_path`, return the output.

    The script runs as ``<python> convert.py <input_path> <output_path>`` with
    the temporary directory as working directory; the output file is read back
    as UTF-8 before the directory is removed.

    Args:
        script: Python source to execute.
        input_path: File handed to the script; resolved to an absolute path.
        output_suffix: Extension of the expected output file (e.g. ".csv").
        timeout_sec: Wall-clock limit for the subprocess.
        cpu_seconds: CPU-time limit, applied on POSIX only.

    Returns:
        A ``SandboxResult`` whose ``error_kind`` is
        "timeout" when the wall-clock limit was hit,
        "syntax" / "runtime" for a non-zero exit status,
        "runtime" when the output file cannot be read as UTF-8 text,
        "empty_output" when the output file is missing or whitespace-only,
        "ok" otherwise.
    """
    input_path = Path(input_path).resolve()
    with tempfile.TemporaryDirectory() as tmp:
        tmpdir = Path(tmp)
        script_path = tmpdir / "convert.py"
        out_path = tmpdir / f"output{output_suffix}"
        script_path.write_text(script, encoding="utf-8")

        preexec = _posix_limits(cpu_seconds) if os.name == "posix" else None
        start = time.perf_counter()
        try:
            proc = subprocess.run(
                [sys.executable, str(script_path), str(input_path), str(out_path)],
                capture_output=True,
                text=True,
                # A script may print bytes that are not valid in the parent's
                # encoding; replace them instead of raising in the parent.
                errors="replace",
                timeout=timeout_sec,
                cwd=str(tmpdir),
                preexec_fn=preexec,
            )
        except subprocess.TimeoutExpired:
            return SandboxResult(
                output_text="",
                returncode=-1,
                stderr=f"Timed out after {timeout_sec}s",
                elapsed_sec=time.perf_counter() - start,
                error_kind="timeout",
            )
        elapsed = time.perf_counter() - start

        if proc.returncode != 0:
            syntax_markers = ("SyntaxError", "IndentationError", "TabError")
            kind = (
                "syntax" if any(m in proc.stderr for m in syntax_markers) else "runtime"
            )
            return SandboxResult("", proc.returncode, proc.stderr, elapsed, kind)

        if not out_path.exists():
            return SandboxResult("", 0, proc.stderr, elapsed, "empty_output")
        try:
            output_text = out_path.read_text(encoding="utf-8")
        except (OSError, UnicodeDecodeError) as exc:
            # The script produced something unusable (wrong encoding, a
            # directory, ...). Report it as a failure of the script so the
            # caller can handle it like any other failed run.
            detail = f"could not read output file as UTF-8 text: {exc}"
            stderr = f"{proc.stderr}\n{detail}" if proc.stderr else detail
            return SandboxResult("", 0, stderr, elapsed, "runtime")
        if not output_text.strip():
            return SandboxResult(output_text, 0, proc.stderr, elapsed, "empty_output")

        return SandboxResult(output_text, 0, proc.stderr, elapsed, "ok")
|