code2flow-toon 0.2.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. code2flow/__init__.py +47 -0
  2. code2flow/__main__.py +6 -0
  3. code2flow/analysis/__init__.py +17 -0
  4. code2flow/analysis/call_graph.py +210 -0
  5. code2flow/analysis/cfg.py +293 -0
  6. code2flow/analysis/coupling.py +77 -0
  7. code2flow/analysis/data_analysis.py +249 -0
  8. code2flow/analysis/dfg.py +224 -0
  9. code2flow/analysis/smells.py +192 -0
  10. code2flow/cli.py +464 -0
  11. code2flow/core/__init__.py +36 -0
  12. code2flow/core/analyzer.py +765 -0
  13. code2flow/core/config.py +177 -0
  14. code2flow/core/models.py +194 -0
  15. code2flow/core/streaming_analyzer.py +666 -0
  16. code2flow/exporters/__init__.py +17 -0
  17. code2flow/exporters/base.py +13 -0
  18. code2flow/exporters/json_exporter.py +17 -0
  19. code2flow/exporters/llm_exporter.py +199 -0
  20. code2flow/exporters/mermaid_exporter.py +67 -0
  21. code2flow/exporters/toon.py +401 -0
  22. code2flow/exporters/yaml_exporter.py +108 -0
  23. code2flow/llm_flow_generator.py +451 -0
  24. code2flow/llm_task_generator.py +263 -0
  25. code2flow/mermaid_generator.py +481 -0
  26. code2flow/nlp/__init__.py +23 -0
  27. code2flow/nlp/config.py +174 -0
  28. code2flow/nlp/entity_resolution.py +326 -0
  29. code2flow/nlp/intent_matching.py +297 -0
  30. code2flow/nlp/normalization.py +122 -0
  31. code2flow/nlp/pipeline.py +388 -0
  32. code2flow/patterns/__init__.py +0 -0
  33. code2flow/patterns/detector.py +168 -0
  34. code2flow/refactor/__init__.py +0 -0
  35. code2flow/refactor/prompt_engine.py +150 -0
  36. code2flow/visualizers/__init__.py +0 -0
  37. code2flow/visualizers/graph.py +196 -0
  38. code2flow_toon-0.2.4.dist-info/METADATA +599 -0
  39. code2flow_toon-0.2.4.dist-info/RECORD +43 -0
  40. code2flow_toon-0.2.4.dist-info/WHEEL +5 -0
  41. code2flow_toon-0.2.4.dist-info/entry_points.txt +2 -0
  42. code2flow_toon-0.2.4.dist-info/licenses/LICENSE +201 -0
  43. code2flow_toon-0.2.4.dist-info/top_level.txt +1 -0
@@ -0,0 +1,451 @@
1
+ import argparse
2
+ import re
3
+ import sys
4
+ from collections import Counter, defaultdict, deque
5
+ from dataclasses import dataclass
6
+ from pathlib import Path
7
+ from typing import Any, Dict, Iterable, List, Optional, Set, Tuple
8
+
9
+ import yaml
10
+
11
+
12
# Prefixes of the code2flow CFG node labels this module parses:
# "FUNC:<name>" marks a function-definition node, "CALL <callee>(...)"
# marks a call-site node.
_FUNC_LABEL_PREFIX = "FUNC:"
_CALL_LABEL_PREFIX = "CALL "
14
+
15
+
16
+ def _strip_bom(text: str) -> str:
17
+ return text[1:] if text.startswith("\ufeff") else text
18
+
19
+
20
def _safe_read_yaml(path: Path) -> Dict[str, Any]:
    """Read *path* as UTF-8 YAML and return the top-level mapping.

    An empty document yields an empty dict; a non-mapping top level
    raises ValueError.
    """
    text = _strip_bom(path.read_text(encoding="utf-8"))
    data = yaml.safe_load(text) or {}
    if isinstance(data, dict):
        return data
    raise ValueError("analysis.yaml must be a mapping at top-level")
26
+
27
+
28
+ def _as_dict(d: Any) -> Dict[str, Any]:
29
+ return d if isinstance(d, dict) else {}
30
+
31
+
32
+ def _as_list(v: Any) -> List[Any]:
33
+ return v if isinstance(v, list) else []
34
+
35
+
36
+ def _shorten(s: str, max_len: int) -> str:
37
+ s = (s or "").strip()
38
+ if len(s) <= max_len:
39
+ return s
40
+ return s[: max(0, max_len - 1)].rstrip() + "…"
41
+
42
+
43
def _parse_call_label(label: str) -> Optional[str]:
    """Extract the dotted callee name from a CFG label like ``CALL foo.bar(x)``.

    Returns the callee identifier, or ``None`` when the label is not a
    CALL label or no identifier can be recognized at its start.
    """
    label = (label or "").strip()
    if not label.startswith(_CALL_LABEL_PREFIX):
        return None
    rest = label[len(_CALL_LABEL_PREFIX) :].strip()
    # Labels sometimes wrap the callee in angle brackets; drop them.
    rest = rest.replace("<", "").replace(">", "")

    # BUG FIX: the previous patterns used `+` after the first identifier
    # character, which silently rejected one-character callee names such
    # as "f(" or "f". `*` accepts them; the alternation covers both the
    # "name(" call form and a bare trailing "name".
    m = re.match(r"([A-Za-z_][A-Za-z0-9_.]*)\s*(?:\(|$)", rest)
    if m:
        return m.group(1)

    return None
59
+
60
+
61
def _parse_func_label(label: str) -> Optional[str]:
    """Return the function name from a ``FUNC:`` node label, or ``None``."""
    text = (label or "").strip()
    if text.startswith(_FUNC_LABEL_PREFIX):
        name = text[len(_FUNC_LABEL_PREFIX) :].strip()
        if name:
            return name
    return None
66
+
67
+
68
@dataclass(frozen=True)
class FuncSummary:
    """Immutable per-function summary extracted from CFG nodes."""

    name: str  # dotted function name as reported by the analysis
    file: Optional[str]  # source file of the first node seen for this function
    line: Optional[int]  # line number of the first node seen for this function
    decisions: Tuple[str, ...]  # shortened labels of the function's IF nodes
    calls: Tuple[str, ...]  # callee names parsed from the function's CALL nodes
75
+
76
+
77
+ def _collect_nodes(analysis: Dict[str, Any]) -> Dict[int, Dict[str, Any]]:
78
+ nodes = analysis.get("nodes")
79
+ if not isinstance(nodes, dict):
80
+ return {}
81
+
82
+ parsed: Dict[int, Dict[str, Any]] = {}
83
+ for k, v in nodes.items():
84
+ try:
85
+ node_id = int(k)
86
+ except Exception:
87
+ continue
88
+ if isinstance(v, dict):
89
+ parsed[node_id] = v
90
+ return parsed
91
+
92
+
93
+ def _collect_entrypoints(nodes: Dict[int, Dict[str, Any]]) -> List[Dict[str, Any]]:
94
+ by_file: Dict[str, List[Dict[str, Any]]] = defaultdict(list)
95
+ for n in nodes.values():
96
+ f = n.get("file")
97
+ if isinstance(f, str):
98
+ by_file[f].append(n)
99
+
100
+ entrypoints: List[Dict[str, Any]] = []
101
+ for f, ns in by_file.items():
102
+ if not (f.endswith("__main__.py") or f.endswith("cli.py")):
103
+ continue
104
+
105
+ main_funcs = [n for n in ns if n.get("type") == "FUNC" and isinstance(n.get("function"), str)]
106
+ for n in main_funcs:
107
+ entrypoints.append(
108
+ {
109
+ "kind": "cli" if f.endswith("cli.py") else "module_main",
110
+ "file": f,
111
+ "function": n.get("function"),
112
+ "line": n.get("line"),
113
+ }
114
+ )
115
+
116
+ uniq: Dict[str, Dict[str, Any]] = {}
117
+ for ep in entrypoints:
118
+ key = str(ep.get("function") or "")
119
+ if key and key not in uniq:
120
+ uniq[key] = ep
121
+
122
+ return list(uniq.values())
123
+
124
+
125
+ def _collect_functions(nodes: Dict[int, Dict[str, Any]]) -> Set[str]:
126
+ out: Set[str] = set()
127
+ for n in nodes.values():
128
+ if n.get("type") != "FUNC":
129
+ continue
130
+ fn = n.get("function")
131
+ if isinstance(fn, str) and fn:
132
+ out.add(fn)
133
+ else:
134
+ parsed = _parse_func_label(str(n.get("label") or ""))
135
+ if parsed:
136
+ out.add(parsed)
137
+ return out
138
+
139
+
140
+ def _node_counts_by_function(nodes: Dict[int, Dict[str, Any]]) -> Counter[str]:
141
+ counts: Counter[str] = Counter()
142
+ for n in nodes.values():
143
+ fn = n.get("function")
144
+ if isinstance(fn, str) and fn:
145
+ counts[fn] += 1
146
+ return counts
147
+
148
+
149
def _pick_relevant_functions(
    *,
    entrypoints: List[Dict[str, Any]],
    known_functions: Set[str],
    func_summaries: Dict[str, FuncSummary],
    nodes: Dict[int, Dict[str, Any]],
    max_functions: int,
) -> List[str]:
    """Rank functions heuristically and keep the best *max_functions*.

    Pure call-graph reachability often selects almost nothing because CFG
    "CALL" labels frequently point at external callees (e.g. click.echo),
    so a scoring heuristic is used instead:
    - entrypoints are strongly boosted
    - functions with more nodes (more logic) score higher
    - domain keywords in the dotted name (extract, schema, openapi, dom,
      cli, ...) add further boosts
    """

    root_names = {
        name
        for name in (str(ep.get("function") or "") for ep in entrypoints)
        if name in known_functions
    }

    node_counts = _node_counts_by_function(nodes)

    keyword_boosts = [
        (".cli.", 50),
        (".extract.", 80),
        ("extract_schema", 120),
        ("extract_schema_to_file", 120),
        ("extract_appspec_to_file", 120),
        ("openapi", 60),
        ("dom", 40),
        ("makefile", 40),
        ("shell", 40),
        ("python", 40),
        ("validate", 20),
        ("discover", 20),
    ]

    def rank(fn: str) -> int:
        # Baseline from node count, capped so one huge function cannot
        # dominate the keyword/entrypoint boosts.
        total = min(500, node_counts.get(fn, 0))
        total += sum(boost for needle, boost in keyword_boosts if needle in fn)
        if fn in root_names:
            total += 1000
        summary = func_summaries.get(fn)
        if summary is not None and summary.decisions:
            total += min(200, 10 * len(summary.decisions))
        return total

    if max_functions <= 0:
        return []
    ranked = sorted(known_functions, key=rank, reverse=True)
    return ranked[:max_functions]
209
+
210
+
211
def _summarize_functions(nodes: Dict[int, Dict[str, Any]], limit_decisions: int, limit_calls: int) -> Dict[str, FuncSummary]:
    """Build one FuncSummary per function from its IF and CALL nodes.

    *limit_decisions* and *limit_calls* cap how many of the most frequent
    decision labels and callee names are retained per function.
    """
    decision_labels: Dict[str, List[str]] = defaultdict(list)
    callee_names: Dict[str, List[str]] = defaultdict(list)
    locations: Dict[str, Tuple[Optional[str], Optional[int]]] = {}

    for node in nodes.values():
        fn = node.get("function")
        if not isinstance(fn, str) or not fn:
            continue

        # Remember the first file/line seen for each function.
        if fn not in locations:
            file_val = node.get("file")
            line_val = node.get("line")
            locations[fn] = (
                file_val if isinstance(file_val, str) else None,
                line_val if isinstance(line_val, int) else None,
            )

        node_type = node.get("type")
        label = str(node.get("label") or "")

        if node_type == "IF":
            decision_labels[fn].append(_shorten(label, 120))
        elif node_type == "CALL":
            callee = _parse_call_label(label)
            if callee:
                callee_names[fn].append(callee)

    summaries: Dict[str, FuncSummary] = {}
    for fn in set(decision_labels) | set(callee_names) | set(locations):
        file, line = locations.get(fn, (None, None))

        top_decisions = Counter(decision_labels.get(fn, [])).most_common(limit_decisions)
        top_calls = Counter(callee_names.get(fn, [])).most_common(limit_calls)

        summaries[fn] = FuncSummary(
            name=fn,
            file=file,
            line=line,
            decisions=tuple(d for d, _ in top_decisions),
            calls=tuple(c for c, _ in top_calls),
        )

    return summaries
256
+
257
+
258
def _build_call_graph(func_summaries: Dict[str, FuncSummary], known_functions: Set[str]) -> Dict[str, Set[str]]:
    """Build caller -> callees adjacency, keeping only in-project callees."""
    graph: Dict[str, Set[str]] = defaultdict(set)
    for caller, summary in func_summaries.items():
        internal = set(summary.calls) & known_functions
        if internal:
            graph[caller].update(internal)
    return graph
265
+
266
+
267
+ def _reachable(g: Dict[str, Set[str]], roots: Iterable[str], max_nodes: int) -> List[str]:
268
+ seen: Set[str] = set()
269
+ q: deque[str] = deque([r for r in roots if r])
270
+
271
+ while q and len(seen) < max_nodes:
272
+ cur = q.popleft()
273
+ if cur in seen:
274
+ continue
275
+ seen.add(cur)
276
+ for nxt in sorted(g.get(cur, set())):
277
+ if nxt not in seen:
278
+ q.append(nxt)
279
+
280
+ return list(seen)
281
+
282
+
283
def generate_llm_flow(
    analysis: Dict[str, Any],
    max_functions: int,
    limit_decisions: int,
    limit_calls: int,
) -> Dict[str, Any]:
    """Assemble the ``llm_flow.v1`` summary document from an analysis mapping.

    Collects nodes, entrypoints, and per-function summaries, selects the
    most relevant functions, and returns a compact serializable mapping.
    """
    nodes = _collect_nodes(analysis)
    entrypoints = _collect_entrypoints(nodes)
    known = _collect_functions(nodes)
    summaries = _summarize_functions(nodes, limit_decisions=limit_decisions, limit_calls=limit_calls)

    selected = _pick_relevant_functions(
        entrypoints=entrypoints,
        known_functions=known,
        func_summaries=summaries,
        nodes=nodes,
        max_functions=max_functions,
    )

    functions_out: List[Dict[str, Any]] = []
    for fn in sorted(selected):
        summary = summaries.get(fn)
        if not summary:
            continue
        functions_out.append(
            {
                "name": summary.name,
                "file": summary.file,
                "line": summary.line,
                "decisions": list(summary.decisions),
                "calls": list(summary.calls),
            }
        )

    # Top-level package names inferred from dotted function names.
    packages = sorted({fn.split(".")[0] for fn in known if "." in fn})

    return {
        "format": "llm_flow.v1",
        "app": {
            "packages": packages,
            "entrypoints": entrypoints,
        },
        "flow": {
            "selected_functions": functions_out,
        },
    }
330
+
331
+
332
def render_llm_flow_md(flow: Dict[str, Any]) -> str:
    """Render the flow summary mapping as human-readable Markdown."""
    app = _as_dict(flow.get("app"))
    entrypoints = _as_list(app.get("entrypoints"))
    selected = _as_list(_as_dict(flow.get("flow")).get("selected_functions"))

    out: List[str] = ["# LLM Flow Summary", ""]

    packages = _as_list(app.get("packages"))
    if packages:
        out.append("## Packages")
        out.extend(f"- {p}" for p in packages)
        out.append("")

    if entrypoints:
        out.append("## Entrypoints")
        for ep in entrypoints:
            epd = _as_dict(ep)
            fn = epd.get("function")
            f = epd.get("file")
            ln = epd.get("line")
            out.append(f"- {fn} ({f}:{ln})")
        out.append("")

    out.append("## Selected functions")
    for item in selected:
        fd = _as_dict(item)
        out.append(f"### {fd.get('name')}")
        out.append(f"- Location: {fd.get('file')}:{fd.get('line')}")

        decisions = _as_list(fd.get("decisions"))
        if decisions:
            out.append("- Decisions:")
            out.extend(f" - {_shorten(str(d), 180)}" for d in decisions)

        calls = _as_list(fd.get("calls"))
        if calls:
            out.append("- Calls:")
            out.extend(f" - {c}" for c in calls)

        out.append("")

    return "\n".join(out).rstrip() + "\n"
382
+
383
+
384
def dump_yaml(data: Dict[str, Any]) -> str:
    """Serialize *data* as readable YAML, preserving key insertion order."""
    options = {
        "sort_keys": False,
        "allow_unicode": True,
        "width": 100,
        "default_flow_style": False,
    }
    return yaml.safe_dump(data, **options)
392
+
393
+
394
def create_parser() -> argparse.ArgumentParser:
    """Build the CLI argument parser for the flow generator."""
    parser = argparse.ArgumentParser(
        prog="llm-flow-generator",
        description="Generate compact LLM-friendly app flow summary from code2flow analysis.yaml",
    )
    parser.add_argument(
        "-i",
        "--input",
        default="./output/analysis.yaml",
        help="Path to analysis.yaml (default: ./output/analysis.yaml)",
    )
    parser.add_argument(
        "-o",
        "--output",
        default="./output/llm_flow.yaml",
        help="Output llm_flow.yaml path (default: ./output/llm_flow.yaml)",
    )
    parser.add_argument(
        "--md",
        default=None,
        help="Optional output Markdown summary path (e.g. ./output/llm_flow.md)",
    )
    # Tuning knobs for how much detail the summary keeps.
    parser.add_argument("--max-functions", type=int, default=40)
    parser.add_argument("--limit-decisions", type=int, default=8)
    parser.add_argument("--limit-calls", type=int, default=12)
    return parser
420
+
421
+
422
def main(argv: Optional[List[str]] = None) -> int:
    """CLI entry point: read analysis.yaml, write llm_flow.yaml (and optional .md).

    Returns 0 on success, 2 when the input file does not exist.
    """
    args = create_parser().parse_args(argv)

    input_path = Path(args.input)
    if not input_path.exists():
        print(f"Error: input file not found: {input_path}", file=sys.stderr)
        return 2

    flow = generate_llm_flow(
        _safe_read_yaml(input_path),
        # Clamp user-supplied limits to sane minimums.
        max_functions=max(1, args.max_functions),
        limit_decisions=max(0, args.limit_decisions),
        limit_calls=max(0, args.limit_calls),
    )

    output_path = Path(args.output)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    output_path.write_text(dump_yaml(flow), encoding="utf-8")

    if args.md:
        md_path = Path(args.md)
        md_path.parent.mkdir(parents=True, exist_ok=True)
        md_path.write_text(render_llm_flow_md(flow), encoding="utf-8")

    return 0
448
+
449
+
450
if __name__ == "__main__":
    # Allow running this module directly as a script; exit with main()'s code.
    raise SystemExit(main())
@@ -0,0 +1,263 @@
1
+ import argparse
2
+ import sys
3
+ from pathlib import Path
4
+ from typing import Any, Dict, List, Optional, Tuple
5
+
6
+ import yaml
7
+
8
+
9
+ def _strip_bom(text: str) -> str:
10
+ return text[1:] if text.startswith("\ufeff") else text
11
+
12
+
13
+ def _ensure_list(value: Any) -> List[Any]:
14
+ if value is None:
15
+ return []
16
+ if isinstance(value, list):
17
+ return value
18
+ return [value]
19
+
20
+
21
+ def _deep_get(d: Dict[str, Any], path: Tuple[str, ...]) -> Any:
22
+ cur: Any = d
23
+ for key in path:
24
+ if not isinstance(cur, dict) or key not in cur:
25
+ return None
26
+ cur = cur[key]
27
+ return cur
28
+
29
+
30
def normalize_llm_task(data: Dict[str, Any]) -> Dict[str, Any]:
    """Normalize a raw task mapping into the canonical llm_task schema.

    Every expected key is present in the result: missing scalars become
    empty strings, missing collections empty lists, and scalar values in
    list positions are wrapped via ``_ensure_list``.
    """

    def section(name: str) -> Dict[str, Any]:
        # A missing/None section behaves like an empty mapping.
        return data.get(name) or {}

    task = section("task")
    context = section("context")
    deliverables = section("deliverables")
    interfaces = section("interfaces")
    rules = section("rules")
    acceptance = section("acceptance")
    notes = section("notes_for_llm")

    return {
        "task": {
            "title": task.get("title") or "",
            "one_line_goal": task.get("one_line_goal") or "",
        },
        "context": {
            "product_area": context.get("product_area") or "",
            "current_behavior": context.get("current_behavior") or "",
            "desired_behavior": context.get("desired_behavior") or "",
        },
        "deliverables": {
            "language": deliverables.get("language") or "any",
            "must_generate": _ensure_list(deliverables.get("must_generate")),
            "files_to_create_or_edit": _ensure_list(deliverables.get("files_to_create_or_edit")),
        },
        "interfaces": {
            "inputs": _ensure_list(interfaces.get("inputs")),
            "outputs": _ensure_list(interfaces.get("outputs")),
        },
        "rules": {
            "must": _ensure_list(rules.get("must")),
            "must_not": _ensure_list(rules.get("must_not")),
            "assumptions": _ensure_list(rules.get("assumptions")),
            "edge_cases": _ensure_list(rules.get("edge_cases")),
            "performance": _ensure_list(rules.get("performance")),
        },
        "acceptance": {
            "tests": _ensure_list(acceptance.get("tests")),
            "done_definition": _ensure_list(acceptance.get("done_definition")),
        },
        "examples": _ensure_list(data.get("examples")),
        "notes_for_llm": {
            "constraints": _ensure_list(notes.get("constraints")),
            "style": _ensure_list(notes.get("style")),
            "language_specific_hints": _ensure_list(notes.get("language_specific_hints")),
        },
    }
79
+
80
+
81
# Maps simple-text section headers to their (parent, key) destination in
# the normalized task mapping. These are the scalar (multi-line string)
# sections; list-valued sections (INPUTS, OUTPUTS, RULES, ...) are
# handled explicitly in parse_llm_task_text.
_SECTION_KEYS = {
    "TITLE": ("task", "title"),
    "GOAL": ("task", "one_line_goal"),
    "PRODUCT_AREA": ("context", "product_area"),
    "CURRENT": ("context", "current_behavior"),
    "DESIRED": ("context", "desired_behavior"),
}
88
+
89
+
90
+ def _parse_bullets(lines: List[str]) -> List[str]:
91
+ items: List[str] = []
92
+ for raw in lines:
93
+ s = raw.strip()
94
+ if not s:
95
+ continue
96
+ if s.startswith("-"):
97
+ items.append(s[1:].strip())
98
+ else:
99
+ items.append(s)
100
+ return items
101
+
102
+
103
def parse_llm_task_text(text: str) -> Dict[str, Any]:
    """Parse the simplified ``HEADER:`` / bullet text format into a task dict.

    Section headers are case-insensitive lines such as ``TITLE:`` or
    ``INPUTS:``; everything until the next recognized header belongs to
    the current section. Text before any recognized header is ignored.
    """
    text = _strip_bom(text)
    lines = text.replace("\r\n", "\n").replace("\r", "\n").split("\n")

    # BUG FIX: "PRODUCT_AREA" is mapped in _SECTION_KEYS but was missing
    # from this recognized-header set, so a "PRODUCT_AREA:" section was
    # never captured (its lines leaked into the preceding section).
    known_headers = {
        "TITLE",
        "GOAL",
        "PRODUCT_AREA",
        "CURRENT",
        "DESIRED",
        "INPUTS",
        "OUTPUTS",
        "RULES (MUST)",
        "RULES (MUST NOT)",
        "EDGE CASES",
        "ACCEPTANCE TESTS",
        "DELIVERABLES",
    }

    sections: Dict[str, List[str]] = {}
    current: Optional[str] = None

    def start_section(name: str) -> None:
        nonlocal current
        current = name
        sections.setdefault(name, [])

    for line in lines:
        stripped = line.strip()
        if not stripped:
            # Preserve blank lines inside a section (they separate paragraphs).
            if current is not None:
                sections[current].append("")
            continue

        upper = stripped.upper()
        if upper.endswith(":"):
            key = upper[:-1].strip()
            if key in known_headers:
                start_section(key)
                continue

        if current is None:
            continue
        sections[current].append(line)

    # Start from a fully-populated skeleton so callers always see every key.
    data: Dict[str, Any] = {
        "task": {"title": "", "one_line_goal": ""},
        "context": {"product_area": "", "current_behavior": "", "desired_behavior": ""},
        "deliverables": {"language": "any", "must_generate": [], "files_to_create_or_edit": []},
        "interfaces": {"inputs": [], "outputs": []},
        "rules": {"must": [], "must_not": [], "assumptions": [], "edge_cases": [], "performance": []},
        "acceptance": {"tests": [], "done_definition": []},
        "examples": [],
        "notes_for_llm": {"constraints": [], "style": [], "language_specific_hints": []},
    }

    # Scalar (multi-line string) sections, routed via _SECTION_KEYS.
    for section_name, path in _SECTION_KEYS.items():
        content_lines = sections.get(section_name)
        if not content_lines:
            continue
        value = "\n".join(content_lines).strip()
        if value:
            parent = data
            for key in path[:-1]:
                parent = parent[key]
            parent[path[-1]] = value

    # List-valued sections.
    if sections.get("INPUTS"):
        data["interfaces"]["inputs"] = _parse_bullets(sections["INPUTS"])
    if sections.get("OUTPUTS"):
        data["interfaces"]["outputs"] = _parse_bullets(sections["OUTPUTS"])
    if sections.get("RULES (MUST)"):
        data["rules"]["must"] = _parse_bullets(sections["RULES (MUST)"])
    if sections.get("RULES (MUST NOT)"):
        data["rules"]["must_not"] = _parse_bullets(sections["RULES (MUST NOT)"])
    if sections.get("EDGE CASES"):
        data["rules"]["edge_cases"] = _parse_bullets(sections["EDGE CASES"])

    if sections.get("ACCEPTANCE TESTS"):
        # Bullets become minimal given/when/then stubs with generated names.
        tests: List[Dict[str, str]] = []
        for idx, item in enumerate(_parse_bullets(sections["ACCEPTANCE TESTS"]), 1):
            tests.append({"name": f"test_{idx}", "given": "", "when": "", "then": item})
        data["acceptance"]["tests"] = tests

    if sections.get("DELIVERABLES"):
        data["deliverables"]["must_generate"] = _parse_bullets(sections["DELIVERABLES"])

    return data
189
+
190
+
191
def load_input(path: Path) -> Dict[str, Any]:
    """Load a task spec from YAML, JSON, or simplified text, by file extension."""
    raw = _strip_bom(path.read_text(encoding="utf-8"))
    suffix = path.suffix.lower()

    if suffix in {".yaml", ".yml"}:
        loaded = yaml.safe_load(raw) or {}
        if not isinstance(loaded, dict):
            raise ValueError("YAML input must be a mapping/object at top level")
        return loaded

    if suffix == ".json":
        import json

        loaded = json.loads(raw)
        if not isinstance(loaded, dict):
            raise ValueError("JSON input must be an object at top level")
        return loaded

    # Anything else is treated as the simplified text format.
    return parse_llm_task_text(raw)
210
+
211
+
212
def dump_yaml(data: Dict[str, Any]) -> str:
    """Dump *data* as readable YAML, keeping key insertion order."""
    return yaml.safe_dump(
        data,
        default_flow_style=False,
        allow_unicode=True,
        sort_keys=False,
        width=100,
    )
220
+
221
+
222
def create_parser() -> argparse.ArgumentParser:
    """Build the CLI argument parser for the task generator."""
    parser = argparse.ArgumentParser(
        prog="llm-task-generator",
        description="Generate normalized llm_task.yaml from simplified task spec (text/YAML/JSON).",
    )
    parser.add_argument("-i", "--input", required=True, help="Input file: .txt/.md/.yaml/.yml/.json")
    parser.add_argument("-o", "--output", required=True, help="Output YAML file path")
    parser.add_argument(
        "--validate-only",
        action="store_true",
        help="Only validate/normalize input; do not write output file",
    )
    return parser
235
+
236
+
237
def main(argv: Optional[List[str]] = None) -> int:
    """CLI entry point: normalize a task spec and write (or print) llm_task YAML.

    Returns 0 on success, 2 when the input file does not exist.
    """
    args = create_parser().parse_args(argv)

    input_path = Path(args.input)
    if not input_path.exists():
        print(f"Error: input file not found: {input_path}", file=sys.stderr)
        return 2

    raw = load_input(input_path)
    # Accept bare task bodies by wrapping them under the "task" key.
    if "task" not in raw:
        raw = {"task": raw}

    normalized = normalize_llm_task(raw)

    if args.validate_only:
        # Validation mode: emit the normalized document to stdout only.
        sys.stdout.write(dump_yaml(normalized))
        return 0

    output_path = Path(args.output)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    output_path.write_text(dump_yaml(normalized), encoding="utf-8")
    return 0
260
+
261
+
262
if __name__ == "__main__":
    # Allow running this module directly as a script; exit with main()'s code.
    raise SystemExit(main())