clean-room-skill 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56) hide show
  1. package/.claude-plugin/marketplace.json +19 -0
  2. package/.claude-plugin/plugin.json +20 -0
  3. package/.codex-plugin/plugin.json +36 -0
  4. package/LICENSE +21 -0
  5. package/README.md +376 -0
  6. package/agents/clean-architect.md +27 -0
  7. package/agents/clean-qa-editor.md +27 -0
  8. package/agents/contaminated-manager-verifier.md +35 -0
  9. package/agents/contaminated-source-analyst.md +26 -0
  10. package/bin/install.js +535 -0
  11. package/examples/codex/.codex/agents/clean-architect.toml +17 -0
  12. package/examples/codex/.codex/agents/clean-qa-editor.toml +17 -0
  13. package/examples/codex/.codex/agents/contaminated-manager-verifier.toml +21 -0
  14. package/examples/codex/.codex/agents/contaminated-source-analyst.toml +17 -0
  15. package/hooks/check-artifact-leakage.py +317 -0
  16. package/hooks/clean-room-hook.py +88 -0
  17. package/hooks/clean_room_paths.py +130 -0
  18. package/hooks/deny-clean-room-shell.py +30 -0
  19. package/hooks/deny-clean-source-read.py +104 -0
  20. package/hooks/deny-contaminated-clean-write.py +134 -0
  21. package/hooks/hooks.json +44 -0
  22. package/hooks/require-clean-room-env.py +127 -0
  23. package/hooks/validate-handoff-package.py +140 -0
  24. package/hooks/validate-json-schema.py +283 -0
  25. package/lib/fs-utils.cjs +123 -0
  26. package/lib/hooks.cjs +214 -0
  27. package/package.json +49 -0
  28. package/plugin.json +20 -0
  29. package/skills/attended/SKILL.md +25 -0
  30. package/skills/clean-room/SKILL.md +134 -0
  31. package/skills/clean-room/assets/behavior-spec.schema.json +367 -0
  32. package/skills/clean-room/assets/contamination-incident.schema.json +60 -0
  33. package/skills/clean-room/assets/coverage-ledger.schema.json +139 -0
  34. package/skills/clean-room/assets/evidence-ledger.schema.json +80 -0
  35. package/skills/clean-room/assets/handoff-package.schema.json +114 -0
  36. package/skills/clean-room/assets/qc-report.schema.json +248 -0
  37. package/skills/clean-room/assets/skeleton-manifest.schema.json +239 -0
  38. package/skills/clean-room/assets/source-index.schema.json +622 -0
  39. package/skills/clean-room/assets/task-manifest.schema.json +593 -0
  40. package/skills/clean-room/examples/README.md +18 -0
  41. package/skills/clean-room/examples/minimal-spec-package/behavior-spec.json +61 -0
  42. package/skills/clean-room/examples/minimal-spec-package/coverage-ledger.json +27 -0
  43. package/skills/clean-room/examples/minimal-spec-package/evidence-ledger.json +17 -0
  44. package/skills/clean-room/examples/minimal-spec-package/handoff-package.json +26 -0
  45. package/skills/clean-room/examples/minimal-spec-package/qc-report.json +25 -0
  46. package/skills/clean-room/examples/minimal-spec-package/skeleton-manifest.json +45 -0
  47. package/skills/clean-room/examples/minimal-spec-package/source-index.json +156 -0
  48. package/skills/clean-room/examples/minimal-spec-package/task-manifest.json +220 -0
  49. package/skills/clean-room/references/LEAKAGE-RULES.md +92 -0
  50. package/skills/clean-room/references/PROCESS.md +185 -0
  51. package/skills/clean-room/references/SPEC-SCHEMA.md +185 -0
  52. package/skills/clean-room/references/TARGET-LANGUAGE-GUIDE.md +43 -0
  53. package/skills/clean-room/scripts/build_source_index.py +1253 -0
  54. package/skills/clean-room/scripts/clean_room_tool_manager.py +199 -0
  55. package/skills/clean-room/scripts/clean_room_tooling.py +370 -0
  56. package/skills/unattended/SKILL.md +26 -0
@@ -0,0 +1,1253 @@
1
+ #!/usr/bin/env python3
2
+ """Build a bounded contaminated-side source index for clean-room planning."""
3
+
4
+ from __future__ import annotations
5
+
6
+ import argparse
7
+ import ast
8
+ import json
9
+ import math
10
+ import os
11
+ import platform
12
+ import re
13
+ import sys
14
+ import tempfile
15
+ from collections import defaultdict, deque
16
+ from datetime import datetime, timezone
17
+ from pathlib import Path
18
+ from typing import Any
19
+
20
+ import clean_room_tooling
21
+
22
+
23
# Directory basenames that are skipped (and recorded as skipped) during the walk.
DEFAULT_IGNORE_DIRS = (
    ".git", "node_modules", ".venv", "venv", "dist",
    "build", "target", ".next", "coverage", "__pycache__",
)

# Defaults for the bounding command-line options.
DEFAULT_MAX_FILES = 2000
DEFAULT_MAX_FILE_BYTES = 1_000_000
DEFAULT_MAX_TOTAL_BYTES = 50_000_000
DEFAULT_MAX_BATCH_TOKENS = 20_000
DEFAULT_LARGE_FILE_WORDS = 5_000
DEFAULT_LARGE_GROUP_WORDS = 15_000
DEFAULT_MAX_FILE_SEGMENTS = 200

# Hard caps on list sizes written into the index.
MAX_SKIPPED_ENTRIES = 1000
MAX_IMPORTS_PER_FILE = 200
MAX_EXPORTS_PER_FILE = 200

# File-extension families used to pick a language label and scanner.
C_LIKE_EXTENSIONS = {".c", ".cc", ".cpp", ".cxx", ".h", ".hh", ".hpp", ".hxx"}
CSHARP_EXTENSIONS = {".cs"}
GO_EXTENSIONS = {".go"}
JVM_EXTENSIONS = {".java", ".kt", ".kts"}
JS_TS_EXTENSIONS = {".js", ".jsx", ".mjs", ".cjs", ".ts", ".tsx", ".mts", ".cts"}
PYTHON_EXTENSIONS = {".py", ".pyi"}
RUST_EXTENSIONS = {".rs"}
SWIFT_EXTENSIONS = {".swift"}

# Extensions tried, in this order, when an import specifier has no suffix.
RESOLVE_EXTENSIONS = (
    ".py", ".pyi",
    ".js", ".jsx", ".mjs", ".cjs", ".ts", ".tsx", ".mts", ".cts",
    ".go",
    ".rs",
    ".java", ".kt", ".kts",
    ".swift",
    ".cs",
    ".c", ".cc", ".cpp", ".cxx", ".h", ".hh", ".hpp", ".hxx",
)

# Shared regular-expression building blocks.
WORD_RE = re.compile(r"\b\w+\b", re.UNICODE)
JS_STRING_RE = r"['\"]([^'\"]+)['\"]"
IDENTIFIER_RE = r"[A-Za-z_$][\w$]*"
83
+
84
+
85
def parse_args() -> argparse.Namespace:
    """Define the command-line interface and parse the process arguments.

    Options are registered in a fixed order so the generated help text keeps
    the same layout: required roots/output/task id first, then the numeric
    bounds, then the ignore list and the tool-detection switches.
    """
    cli = argparse.ArgumentParser(
        description="Build a bounded contaminated-side source-index.json for clean-room controller preflight."
    )
    cli.add_argument("--source-root", action="append", required=True, help="Authorized source root to index.")
    cli.add_argument("--output", required=True, help="Path to write source-index.json.")
    cli.add_argument(
        "--contaminated-artifact-root",
        action="append",
        default=[],
        help="Approved contaminated artifact root. Defaults to CLEAN_ROOM_CONTAMINATED_ARTIFACT_ROOTS.",
    )
    cli.add_argument("--task-id", required=True, help="Clean-room task id associated with this index.")

    # Integer bounds share the same shape, so they are registered from a table.
    integer_options = (
        ("--max-files", DEFAULT_MAX_FILES),
        ("--max-file-bytes", DEFAULT_MAX_FILE_BYTES),
        ("--max-total-bytes", DEFAULT_MAX_TOTAL_BYTES),
        ("--max-batch-tokens", DEFAULT_MAX_BATCH_TOKENS),
        ("--large-file-words", DEFAULT_LARGE_FILE_WORDS),
        ("--large-group-words", DEFAULT_LARGE_GROUP_WORDS),
        ("--max-file-segments", DEFAULT_MAX_FILE_SEGMENTS),
    )
    for flag, fallback in integer_options:
        cli.add_argument(flag, type=int, default=fallback)

    cli.add_argument("--ignore-dir", action="append", default=[], help="Directory basename to skip.")
    cli.add_argument(
        "--skip-tool-detection",
        action="store_true",
        help="Do not record optional AST/indexing tool status in source-index.json.",
    )
    cli.add_argument(
        "--probe-tools",
        action="store_true",
        help="Execute optional helper tools with version commands in dependency_report. Default is stat-only.",
    )
    cli.add_argument(
        "--allow-working-project-tools",
        action="store_true",
        help="Allow dependency detection to consider .local/bin, .bin, node_modules/.bin, and npm prefix/global tools.",
    )
    return cli.parse_args()
122
+
123
+
124
def utc_now() -> str:
    """Return the current UTC time as an ISO-8601 string with second precision and a "Z" suffix."""
    moment = datetime.now(timezone.utc).replace(microsecond=0)
    rendered = moment.isoformat()
    return rendered.replace("+00:00", "Z")
126
+
127
+
128
def source_roots(values: list[str]) -> list[dict[str, str]]:
    """Resolve the given source-root arguments into unique root records.

    Each value is user-expanded and resolved. Repeated paths are dropped, and
    the ``root_id`` of a kept path is derived from its 1-based position in
    *values* (so skipped duplicates leave gaps in the numbering).

    Raises:
        SystemExit: if a path is not a directory, or no root remains.
    """
    assigned: dict[Path, str] = {}
    for position, raw in enumerate(values, start=1):
        location = Path(raw).expanduser().resolve()
        if location in assigned:
            continue
        if not location.is_dir():
            raise SystemExit(f"source root is not a directory: {location}")
        assigned[location] = f"root-{position:03d}"

    if not assigned:
        raise SystemExit("at least one unique --source-root is required")
    return [{"root_id": root_id, "path": str(location)} for location, root_id in assigned.items()]
142
+
143
+
144
def path_is_under(path: Path, root: Path) -> bool:
    """Return True when *path* equals *root* or has *root* among its ancestors."""
    if path == root:
        return True
    return any(ancestor == root for ancestor in path.parents)
146
+
147
+
148
def contaminated_artifact_roots(args: argparse.Namespace) -> list[Path]:
    """Collect the approved contaminated artifact roots.

    Command-line values come first, followed by the non-empty entries of the
    ``CLEAN_ROOM_CONTAMINATED_ARTIFACT_ROOTS`` environment variable (split on
    ``os.pathsep``). Every value is user-expanded and resolved; duplicates are
    removed while keeping first-seen order.
    """
    from_environment = os.environ.get("CLEAN_ROOM_CONTAMINATED_ARTIFACT_ROOTS", "").split(os.pathsep)
    raw_values = [*args.contaminated_artifact_root, *(entry for entry in from_environment if entry)]

    # A dict keeps insertion order, which gives ordered de-duplication.
    ordered: dict[Path, None] = {}
    for raw in raw_values:
        ordered.setdefault(Path(raw).expanduser().resolve(), None)
    return list(ordered)
160
+
161
+
162
def checked_output_path(args: argparse.Namespace) -> Path:
    """Resolve ``args.output`` and insist that it lies under an approved artifact root.

    Raises:
        SystemExit: if no contaminated artifact root is configured, or the
            resolved output path is not under any of them.
    """
    target = Path(args.output).expanduser().resolve()
    approved = contaminated_artifact_roots(args)
    if not approved:
        raise SystemExit(
            "--output must be under CLEAN_ROOM_CONTAMINATED_ARTIFACT_ROOTS or an explicit --contaminated-artifact-root"
        )
    for root in approved:
        if path_is_under(target, root):
            return target
    allowed = ", ".join(root.as_posix() for root in approved)
    raise SystemExit(f"--output must be under a contaminated artifact root ({allowed}): {target}")
173
+
174
+
175
def add_skipped(skipped_entries: list[dict[str, str]], counters: dict[str, int], path: str, reason: str, kind: str) -> None:
    """Count a skipped entry and record its details while the detail list has room.

    ``counters["skipped_count"]`` is always incremented; the entry itself is
    only appended until ``MAX_SKIPPED_ENTRIES`` records are stored.
    """
    counters["skipped_count"] += 1
    if len(skipped_entries) >= MAX_SKIPPED_ENTRIES:
        return
    record = {"path": path, "reason": reason, "kind": kind}
    skipped_entries.append(record)
179
+
180
+
181
def language_for_path(path: Path) -> str:
    """Map a file path to a language label based on its lower-cased suffix.

    Unknown suffixes are labelled ``"text"``.
    """
    ext = path.suffix.lower()

    if ext in PYTHON_EXTENSIONS:
        return "python"
    if ext in JS_TS_EXTENSIONS:
        # Every TypeScript suffix (.ts, .tsx, .mts, .cts) contains "ts".
        if "ts" in ext:
            return "typescript"
        return "javascript"

    single_label_families = (
        (GO_EXTENSIONS, "go"),
        (RUST_EXTENSIONS, "rust"),
        ({".java"}, "java"),
        ({".kt", ".kts"}, "kotlin"),
        (SWIFT_EXTENSIONS, "swift"),
        (CSHARP_EXTENSIONS, "csharp"),
    )
    for family, label in single_label_families:
        if ext in family:
            return label

    if ext in C_LIKE_EXTENSIONS:
        if ext in {".cc", ".cpp", ".cxx", ".hh", ".hpp", ".hxx"}:
            return "cpp"
        return "c"
    return "text"
202
+
203
+
204
def line_count(text: str) -> int:
    """Count lines by ``"\\n"`` separators; a trailing unterminated line counts as one."""
    if text == "":
        return 0
    newline_total = text.count("\n")
    if text.endswith("\n"):
        return newline_total
    return newline_total + 1
208
+
209
+
210
def metrics_for_text(data: bytes, text: str) -> dict[str, int]:
    """Compute size metrics for a file given its raw bytes and decoded text.

    The token estimate is the character count divided by four, rounded up.
    """
    char_total = len(text)
    word_total = len(WORD_RE.findall(text))
    token_estimate = math.ceil(char_total / 4)
    return {
        "bytes": len(data),
        "lines": line_count(text),
        "words": word_total,
        "characters": char_total,
        "estimated_tokens": token_estimate,
    }
219
+
220
+
221
def metrics_for_text_fragment(text: str) -> dict[str, int]:
    """Compute metrics for a text fragment, using its UTF-8 encoding as the byte payload."""
    encoded = text.encode("utf-8", errors="replace")
    return metrics_for_text(encoded, text)
223
+
224
+
225
def empty_metrics() -> dict[str, int]:
    """Return a fresh metrics mapping with every counter set to zero."""
    return dict.fromkeys(("bytes", "lines", "words", "characters", "estimated_tokens"), 0)
227
+
228
+
229
def add_metrics(left: dict[str, int], right: dict[str, int]) -> None:
    """Add each metric counter of *right* onto *left* in place."""
    metric_names = ("bytes", "lines", "words", "characters", "estimated_tokens")
    for metric in metric_names:
        left[metric] = left[metric] + right[metric]
232
+
233
+
234
def truncate_items(items: list[dict[str, Any]], limit: int) -> tuple[list[dict[str, Any]], bool]:
    """Return the first *limit* items plus a flag telling whether anything was cut off."""
    kept = items[:limit]
    was_cut = limit < len(items)
    return kept, was_cut
236
+
237
+
238
def literal_all(node: ast.AST) -> list[str]:
    """Extract the string entries of a literal ``__all__`` value.

    Args:
        node: AST node of the expression assigned to ``__all__``.

    Returns:
        The ``str`` items when the expression is a literal list or tuple;
        an empty list for anything else, including expressions that cannot
        be evaluated as a literal.
    """
    try:
        value = ast.literal_eval(node)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # literal_eval raises TypeError for e.g. ``{[1]: 2}`` (unhashable key).
        # The scanned source is arbitrary, so no malformed literal may abort the scan.
        return []
    if isinstance(value, (list, tuple)):
        return [item for item in value if isinstance(item, str)]
    return []
246
+
247
+
248
def scan_python(text: str) -> tuple[str, list[dict[str, Any]], list[dict[str, str]]]:
    """Scan Python source for module-level imports and exported names.

    Only statements directly in the module body are inspected. Imports are
    recorded from ``import`` and ``from ... import`` statements. Exports are
    top-level functions, classes, simple name assignments, and the string
    entries of a literal ``__all__``.

    Args:
        text: Decoded source text of the file.

    Returns:
        ``(scanner, imports, exports)``. The scanner label is ``"python-ast"``
        on success, with both lists truncated via ``truncate_scanned``. If the
        source cannot be parsed the label is ``"python-ast-error"`` and both
        lists are empty.
    """
    imports: list[dict[str, Any]] = []
    exports: list[dict[str, str]] = []
    try:
        tree = ast.parse(text)
    except (SyntaxError, ValueError, RecursionError):
        # ValueError: NUL bytes in the source on Python versions before 3.12.
        # RecursionError: pathologically nested or chained expressions.
        # The input is arbitrary source, so neither may abort the whole index.
        return "python-ast-error", imports, exports

    for node in tree.body:
        if isinstance(node, ast.Import):
            for alias in node.names:
                imports.append(
                    {
                        "specifier": alias.name,
                        "kind": "python-import",
                        "is_relative": False,
                        # Without an alias, ``import a.b`` binds the top-level name ``a``.
                        "names": [alias.asname or alias.name.split(".")[0]],
                    }
                )
        elif isinstance(node, ast.ImportFrom):
            module = node.module or ""
            # Leading dots encode the relative-import level.
            specifier = "." * node.level + module
            imports.append(
                {
                    "specifier": specifier,
                    "kind": "python-from-import",
                    "is_relative": node.level > 0,
                    "names": [alias.asname or alias.name for alias in node.names],
                }
            )
        elif isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
            exports.append({"name": node.name, "kind": "top-level-function"})
        elif isinstance(node, ast.ClassDef):
            exports.append({"name": node.name, "kind": "top-level-class"})
        elif isinstance(node, (ast.Assign, ast.AnnAssign)):
            targets: list[ast.AST] = []
            if isinstance(node, ast.Assign):
                targets = list(node.targets)
                value = node.value
            else:
                targets = [node.target]
                value = node.value
            for target in targets:
                if isinstance(target, ast.Name) and target.id == "__all__" and value is not None:
                    for name in literal_all(value):
                        exports.append({"name": name, "kind": "explicit-all"})
                elif isinstance(target, ast.Name):
                    exports.append({"name": target.id, "kind": "top-level-assignment"})

    return "python-ast", *truncate_scanned(imports, exports)
298
+
299
+
300
def strip_js_comments(text: str) -> str:
    """Replace JavaScript/TypeScript comments with a single space each.

    String and template literals are recognised and copied through unchanged,
    so a ``//`` or ``/*`` inside a literal (for example the URL in
    ``import x from "https://host/mod.js"``) is not mistaken for a comment.
    This is a lexical heuristic, not a full tokenizer: regex literals and
    nested template expressions are not specially handled.

    Args:
        text: Source text to clean.

    Returns:
        The text with ``/* ... */`` and ``// ...`` comments replaced by ``" "``.
    """
    token = (
        r'"(?:\\.|[^"\\\n])*"'  # double-quoted string, single line
        r"|'(?:\\.|[^'\\\n])*'"  # single-quoted string, single line
        r"|`(?:\\.|[^`\\])*`"  # template literal, may span lines
        r"|(/\*.*?\*/|//[^\n]*)"  # group 1: block or line comment
    )

    def replace(match: re.Match[str]) -> str:
        # Only comments populate group 1; literals are returned untouched.
        return " " if match.group(1) is not None else match.group(0)

    return re.sub(token, replace, text, flags=re.S)
304
+
305
+
306
def split_export_names(raw: str) -> list[str]:
    """Extract the exported names from the inside of an ``export { ... }`` clause.

    For ``local as exported`` the exported alias is used. A TypeScript inline
    ``type`` modifier (``export { type Foo }``) is skipped so the real name is
    reported instead of the keyword. Entries that are not plain identifiers
    are dropped.

    Args:
        raw: Text between the braces of the export clause.

    Returns:
        Exported identifier names in source order.
    """
    names: list[str] = []
    for part in raw.split(","):
        item = part.strip()
        if not item:
            continue
        alias = re.search(r"\bas\s+([A-Za-z_$][\w$]*)$", item)
        if alias:
            names.append(alias.group(1))
            continue
        tokens = item.split()
        # ``type Foo`` is a type-only export of ``Foo``; a lone ``type`` is an
        # ordinary binding that happens to be named "type".
        if tokens[0] == "type" and len(tokens) > 1:
            names.append(tokens[1])
        else:
            names.append(tokens[0])
    return [name for name in names if re.fullmatch(r"[A-Za-z_$][\w$]*", name)]
318
+
319
+
320
def scan_js_ts(text: str) -> tuple[str, list[dict[str, Any]], list[dict[str, str]]]:
    """Scan JavaScript/TypeScript source for imports and exports using regular expressions.

    Comments are stripped first. Imports are collected kind by kind (ESM
    imports, ESM re-exports, CommonJS ``require``, dynamic ``import()``), and
    exports likewise (declarations, default, named clauses, ``module.exports``,
    ``exports.<name>``). Both lists are truncated via ``truncate_scanned``.
    """
    source = strip_js_comments(text)

    # (pattern, kind) pairs; iteration order fixes the order of the records.
    import_patterns = (
        (rf"(?m)^\s*import(?:\s+type)?(?:[\s\w$*{{}},]+?\s+from\s*)?{JS_STRING_RE}", "esm-import"),
        (rf"(?m)^\s*export(?:\s+type)?\s+[^;]*?\s+from\s+{JS_STRING_RE}", "esm-re-export"),
        (rf"\brequire\(\s*{JS_STRING_RE}\s*\)", "commonjs-require"),
        (rf"\bimport\(\s*{JS_STRING_RE}\s*\)", "dynamic-import"),
    )
    imports: list[dict[str, Any]] = [
        {
            "specifier": found.group(1),
            "kind": kind,
            "is_relative": found.group(1).startswith("."),
            "names": [],
        }
        for pattern, kind in import_patterns
        for found in re.finditer(pattern, source)
    ]

    exports: list[dict[str, str]] = []
    exports.extend(
        {"name": found.group(1), "kind": "esm-declaration"}
        for found in re.finditer(
            r"(?m)^\s*export\s+(?:async\s+)?(?:class|function|const|let|var|interface|type|enum)\s+([A-Za-z_$][\w$]*)",
            source,
        )
    )
    exports.extend(
        {"name": "default", "kind": "esm-default"}
        for _ in re.finditer(r"(?m)^\s*export\s+default\b", source)
    )
    exports.extend(
        {"name": exported, "kind": "esm-named"}
        for found in re.finditer(r"(?m)^\s*export\s*{([^}]+)}", source)
        for exported in split_export_names(found.group(1))
    )
    exports.extend(
        {"name": "module.exports", "kind": "commonjs-module"}
        for _ in re.finditer(r"\bmodule\.exports\s*=", source)
    )
    exports.extend(
        {"name": found.group(1), "kind": "commonjs-named"}
        for found in re.finditer(r"\bexports\.([A-Za-z_$][\w$]*)\s*=", source)
    )

    return "javascript-typescript-scanner", *truncate_scanned(imports, exports)
378
+
379
+
380
def scan_go(text: str) -> tuple[str, list[dict[str, Any]], list[dict[str, str]]]:
    """Scan Go source for import paths and top-level func/type/const/var names.

    Import paths from parenthesised ``import ( ... )`` blocks are listed
    first, followed by single-line imports. Results are truncated via
    ``truncate_scanned``.
    """
    import_paths = [
        quoted.group(1)
        for block in re.finditer(r"(?ms)^\s*import\s*\((.*?)\)", text)
        for quoted in re.finditer(r'"([^"]+)"', block.group(1))
    ]
    import_paths += [
        single.group(1)
        for single in re.finditer(r'(?m)^\s*import\s+(?:[._A-Za-z]\w*\s+)?(?:"([^"]+)")', text)
    ]
    imports: list[dict[str, Any]] = [
        {
            "specifier": import_path,
            "kind": "go-import",
            "is_relative": import_path.startswith("."),
            "names": [],
        }
        for import_path in import_paths
    ]

    export_patterns = (
        (rf"(?m)^\s*func\s+(?:\([^)]*\)\s*)?({IDENTIFIER_RE})\s*\(", "go-function"),
        (rf"(?m)^\s*type\s+({IDENTIFIER_RE})\b", "go-type"),
        (rf"(?m)^\s*(?:const|var)\s+(?:\(\s*)?({IDENTIFIER_RE})\b", "go-binding"),
    )
    exports: list[dict[str, str]] = [
        {"name": found.group(1), "kind": kind}
        for pattern, kind in export_patterns
        for found in re.finditer(pattern, text)
    ]

    return "go-scanner", *truncate_scanned(imports, exports)
411
+
412
+
413
def scan_rust(text: str) -> tuple[str, list[dict[str, Any]], list[dict[str, str]]]:
    """Scan Rust source for ``use``/``mod`` statements and declared item names.

    ``use`` paths are recorded with all whitespace removed and are flagged
    relative when they start with ``self::``, ``super::`` or ``crate::``.
    ``mod name;`` declarations are always flagged relative. Results are
    truncated via ``truncate_scanned``.
    """
    imports: list[dict[str, Any]] = []
    for found in re.finditer(r"(?m)^\s*use\s+([^;]+);", text):
        compact = re.sub(r"\s+", "", found.group(1))
        imports.append(
            {
                "specifier": compact,
                "kind": "rust-use",
                "is_relative": compact.startswith(("self::", "super::", "crate::")),
                "names": [],
            }
        )
    imports.extend(
        {
            "specifier": found.group(1),
            "kind": "rust-mod",
            "is_relative": True,
            "names": [],
        }
        for found in re.finditer(rf"(?m)^\s*(?:pub\s+)?mod\s+({IDENTIFIER_RE})\s*;", text)
    )

    declaration_pattern = (
        rf"(?m)^\s*(?:pub(?:\([^)]*\))?\s+)?(?:async\s+)?(?:fn|struct|enum|trait|type|const|static)\s+({IDENTIFIER_RE})\b"
    )
    exports: list[dict[str, str]] = [
        {"name": found.group(1), "kind": "rust-declaration"}
        for found in re.finditer(declaration_pattern, text)
    ]

    return "rust-scanner", *truncate_scanned(imports, exports)
443
+
444
+
445
def scan_java_kotlin(language: str, text: str) -> tuple[str, list[dict[str, Any]], list[dict[str, str]]]:
    """Scan Java or Kotlin source for imports and declared type names.

    For Kotlin, ``fun`` declarations are recorded as well. The record kinds
    are prefixed with *language*. Results are truncated via ``truncate_scanned``.
    """
    imports: list[dict[str, Any]] = [
        {
            "specifier": found.group(1),
            "kind": f"{language}-import",
            "is_relative": False,
            "names": [],
        }
        for found in re.finditer(r"(?m)^\s*import\s+(?:static\s+)?([A-Za-z_][\w.]*\*?)\s*;?", text)
    ]

    type_pattern = (
        rf"(?m)^\s*(?:public|internal|private|protected|sealed|abstract|final|open|data|value|\s)*"
        rf"(?:class|interface|enum|object|record)\s+({IDENTIFIER_RE})\b"
    )
    exports: list[dict[str, str]] = [
        {"name": found.group(1), "kind": f"{language}-type"}
        for found in re.finditer(type_pattern, text)
    ]
    if language == "kotlin":
        function_pattern = rf"(?m)^\s*(?:public|internal|private|protected|suspend|\s)*fun\s+({IDENTIFIER_RE})\s*\("
        exports.extend(
            {"name": found.group(1), "kind": "kotlin-function"}
            for found in re.finditer(function_pattern, text)
        )

    scanner_label = "java-scanner" if language == "java" else "kotlin-scanner"
    return scanner_label, *truncate_scanned(imports, exports)
470
+
471
+
472
def scan_swift(text: str) -> tuple[str, list[dict[str, Any]], list[dict[str, str]]]:
    """Scan Swift source for ``import`` statements and declared names.

    Declarations cover class/struct/enum/protocol/actor/func/let/var lines,
    optionally preceded by access or ``final`` modifiers. Results are
    truncated via ``truncate_scanned``.
    """
    imports: list[dict[str, Any]] = [
        {
            "specifier": found.group(1),
            "kind": "swift-import",
            "is_relative": False,
            "names": [],
        }
        for found in re.finditer(r"(?m)^\s*import\s+(?:@\w+\s+)?([A-Za-z_][\w.]*)", text)
    ]

    declaration_pattern = (
        rf"(?m)^\s*(?:public|internal|private|fileprivate|open|final|\s)*"
        rf"(?:class|struct|enum|protocol|actor|func|let|var)\s+({IDENTIFIER_RE})\b"
    )
    exports: list[dict[str, str]] = [
        {"name": found.group(1), "kind": "swift-declaration"}
        for found in re.finditer(declaration_pattern, text)
    ]

    return "swift-scanner", *truncate_scanned(imports, exports)
493
+
494
+
495
def scan_c_like(language: str, text: str) -> tuple[str, list[dict[str, Any]], list[dict[str, str]]]:
    """Scan C or C++ source for ``#include`` directives, function-like names and type names.

    Quoted includes are flagged relative; angle-bracket includes are not.
    Function-like matches named after control keywords are discarded.
    Results are truncated via ``truncate_scanned``.
    """
    imports: list[dict[str, Any]] = [
        {
            "specifier": found.group(2),
            "kind": f"{language}-include",
            "is_relative": found.group(1) == '"',
            "names": [],
        }
        for found in re.finditer(r'(?m)^\s*#\s*include\s*([<"])([^>"]+)[>"]', text)
    ]

    # NOTE(review): the repeated group below contains \s inside its character
    # class and is followed by \s+, so a long run of words/commas/whitespace
    # with no "(" may be expensive to reject. Confirm on large enum bodies
    # before relying on this for untrusted input.
    function_pattern = (
        rf"(?m)^\s*(?:extern\s+)?(?:[A-Za-z_][\w:<>,\s\*&~]+\s+)+({IDENTIFIER_RE})\s*\([^;]*\)\s*(?:;|{{)"
    )
    control_keywords = {"if", "for", "while", "switch", "return"}
    exports: list[dict[str, str]] = [
        {"name": found.group(1), "kind": f"{language}-function"}
        for found in re.finditer(function_pattern, text)
        if found.group(1) not in control_keywords
    ]
    exports.extend(
        {"name": found.group(1), "kind": f"{language}-type"}
        for found in re.finditer(rf"(?m)^\s*(?:class|struct|enum)\s+({IDENTIFIER_RE})\b", text)
    )

    return f"{language}-scanner", *truncate_scanned(imports, exports)
519
+
520
+
521
def scan_csharp(text: str) -> tuple[str, list[dict[str, Any]], list[dict[str, str]]]:
    """Scan C# source for ``using`` directives, declared types and method-like names.

    Method-like matches named after control keywords are discarded. Types are
    listed before methods. Results are truncated via ``truncate_scanned``.
    """
    imports: list[dict[str, Any]] = [
        {
            "specifier": found.group(1),
            "kind": "csharp-using",
            "is_relative": False,
            "names": [],
        }
        for found in re.finditer(r"(?m)^\s*using\s+(?:static\s+)?([A-Za-z_][\w.]*)\s*;", text)
    ]

    type_pattern = (
        rf"(?m)^\s*(?:public|internal|private|protected|abstract|sealed|static|partial|\s)*"
        rf"(?:class|interface|struct|enum|record)\s+({IDENTIFIER_RE})\b"
    )
    method_pattern = (
        rf"(?m)^\s*(?:public|internal|private|protected|static|async|virtual|override|\s)*"
        rf"(?:[A-Za-z_][\w<>,\[\]?]+\s+)+({IDENTIFIER_RE})\s*\("
    )
    control_keywords = {"if", "for", "foreach", "while", "switch", "catch"}

    exports: list[dict[str, str]] = [
        {"name": found.group(1), "kind": "csharp-type"}
        for found in re.finditer(type_pattern, text)
    ]
    exports.extend(
        {"name": found.group(1), "kind": "csharp-method"}
        for found in re.finditer(method_pattern, text)
        if found.group(1) not in control_keywords
    )

    return "csharp-scanner", *truncate_scanned(imports, exports)
550
+
551
+
552
def truncate_scanned(
    imports: list[dict[str, Any]], exports: list[dict[str, str]]
) -> tuple[list[dict[str, Any]], list[dict[str, str]]]:
    """Cap the import and export lists at their per-file limits.

    A list that had to be shortened gets one extra ``truncation-marker``
    record appended after the kept items. The input lists are not modified.
    """
    kept_imports = imports[:MAX_IMPORTS_PER_FILE]
    kept_exports = exports[:MAX_EXPORTS_PER_FILE]

    if len(imports) > MAX_IMPORTS_PER_FILE:
        kept_imports.append(
            {
                "specifier": "__truncated__",
                "kind": "truncation-marker",
                "is_relative": False,
                "names": [],
            }
        )
    if len(exports) > MAX_EXPORTS_PER_FILE:
        kept_exports.append({"name": "__truncated__", "kind": "truncation-marker"})
    return kept_imports, kept_exports
569
+
570
+
571
def scan_file(language: str, text: str) -> tuple[str, list[dict[str, Any]], list[dict[str, str]]]:
    """Dispatch to the scanner for *language*.

    Languages without a scanner yield ``("text-metrics", [], [])``.
    """
    # Scanners that only need the source text.
    text_only = {
        "python": scan_python,
        "javascript": scan_js_ts,
        "typescript": scan_js_ts,
        "go": scan_go,
        "rust": scan_rust,
        "swift": scan_swift,
        "csharp": scan_csharp,
    }
    # Scanners shared by two languages, which also need the language label.
    language_aware = {
        "java": scan_java_kotlin,
        "kotlin": scan_java_kotlin,
        "c": scan_c_like,
        "cpp": scan_c_like,
    }

    if language in text_only:
        return text_only[language](text)
    if language in language_aware:
        return language_aware[language](language, text)
    return "text-metrics", [], []
589
+
590
+
591
def normalized_relative(path: Path) -> str:
    """Render *path* as a string with forward slashes as separators."""
    posix_form = path.as_posix()
    return posix_form
593
+
594
+
595
def add_path_aliases(path_map: dict[str, str], root_id: str, relative_path: Path, file_id: str) -> None:
    """Register the lookup keys under which an indexed file can be found.

    Keys have the form ``"<root_id>:<posix path>"``. The file is registered
    under its full relative path, under the path without its suffix (when it
    has one), and — for package/index entry files — under its parent
    directory. Later registrations overwrite earlier ones for the same key.
    """
    entry_file_names = {
        "__init__.py",
        "index.js",
        "index.jsx",
        "index.ts",
        "index.tsx",
        "index.mjs",
        "index.cjs",
        "mod.rs",
    }

    aliases = [relative_path]
    if relative_path.suffix:
        aliases.append(relative_path.with_suffix(""))
    if relative_path.name in entry_file_names:
        aliases.append(relative_path.parent)

    for alias in aliases:
        path_map[f"{root_id}:{normalized_relative(alias)}"] = file_id
613
+
614
+
615
def resolve_candidate(path_map: dict[str, str], root_id: str, candidate: Path) -> str | None:
    """Look up the file id for a candidate path, trying common file-name variants.

    The candidate itself is tried first. If it has no suffix, it is also
    tried with every extension in ``RESOLVE_EXTENSIONS``, then as a directory
    containing ``index<ext>`` (all extensions except ``.pyi``), then as a
    directory containing ``__init__.py``.

    Args:
        path_map: Mapping of ``"<root_id>:<posix path>"`` keys to file ids.
        root_id: Root whose keys are searched.
        candidate: Root-relative path to resolve.

    Returns:
        The file id of the first variant found in *path_map*, or ``None``.
    """
    candidates = [candidate]
    if not candidate.suffix:
        # ``with_suffix`` raises ValueError on a path with an empty name, such
        # as Path("."), which is what a root-level file produces for
        # ``from . import x``. Such a path can only match as a directory.
        if candidate.name:
            candidates.extend(candidate.with_suffix(ext) for ext in RESOLVE_EXTENSIONS)
        candidates.extend(candidate / f"index{ext}" for ext in RESOLVE_EXTENSIONS if ext != ".pyi")
        candidates.append(candidate / "__init__.py")
    for item in candidates:
        key = f"{root_id}:{normalized_relative(item)}"
        if key in path_map:
            return path_map[key]
    return None
626
+
627
+
628
def resolve_import(
    path_map: dict[str, str],
    file_record: dict[str, Any],
    import_record: dict[str, Any],
) -> str | None:
    """Resolve one import record of a file to the id of another indexed file.

    Resolution is path-based and stays within the importing file's root. The
    import ``kind`` decides how the specifier is turned into a candidate path;
    the candidate is then looked up with ``resolve_candidate``.

    Args:
        path_map: Mapping of ``"<root_id>:<posix path>"`` keys to file ids.
        file_record: Record of the importing file (reads ``root_id``, ``path``).
        import_record: Import record (reads ``specifier``, ``kind``, and
            optionally ``names`` and ``is_relative``).

    Returns:
        The file id of the import target, or ``None`` when nothing matches.
    """
    root_id = file_record["root_id"]
    path = Path(file_record["path"])
    specifier = import_record["specifier"]
    kind = import_record["kind"]
    if kind == "python-from-import" and specifier.startswith("."):
        # One leading dot means the current package; each extra dot goes one level up.
        dot_count = len(specifier) - len(specifier.lstrip("."))
        module = specifier[dot_count:]
        base = path.parent
        for _ in range(max(dot_count - 1, 0)):
            base = base.parent
        if not module:
            # ``from . import a, b``: the imported names may be sibling modules.
            for name in import_record.get("names", []):
                resolved = resolve_candidate(path_map, root_id, base / str(name))
                if resolved:
                    return resolved
        candidate = base / Path(module.replace(".", "/")) if module else base
        return resolve_candidate(path_map, root_id, candidate)
    if kind in {"python-import", "python-from-import"} and not specifier.startswith("."):
        # Absolute ``import pkg.mod`` and ``from pkg.mod import x`` both name
        # the module ``pkg.mod``, so both are looked up from the root.
        candidate = Path(specifier.replace(".", "/"))
        return resolve_candidate(path_map, root_id, candidate)
    if specifier.startswith("."):
        return resolve_candidate(path_map, root_id, path.parent / specifier)
    if kind == "rust-mod":
        return resolve_candidate(path_map, root_id, path.parent / specifier)
    if kind == "rust-use":
        # Drop braces and globs so only the leading path segments remain.
        cleaned = re.sub(r"[{}*]", "", specifier)
        first = cleaned.split("::", 1)[0]
        if first in {"crate", "self"}:
            remainder = cleaned.split("::", 1)[1] if "::" in cleaned else ""
            base = Path("") if first == "crate" else path.parent
            if remainder:
                return resolve_candidate(path_map, root_id, base / Path(remainder.replace("::", "/")))
        if first == "super":
            remainder = cleaned.split("::", 1)[1] if "::" in cleaned else ""
            base = path.parent.parent
            if remainder:
                return resolve_candidate(path_map, root_id, base / Path(remainder.replace("::", "/")))
    if kind.endswith("-include") and import_record.get("is_relative"):
        return resolve_candidate(path_map, root_id, path.parent / specifier)
    if kind in {"java-import", "kotlin-import", "csharp-using"}:
        # Trailing "." and "*" characters (wildcard imports) are stripped.
        return resolve_candidate(path_map, root_id, Path(specifier.rstrip(".*").replace(".", "/")))
    if kind in {"go-import", "swift-import"}:
        return resolve_candidate(path_map, root_id, Path(specifier.replace(".", "/")))
    return None
677
+
678
+
679
def build_file_segments(
    file_record: dict[str, Any],
    text: str,
    max_batch_tokens: int,
    large_file_words: int,
    max_file_segments: int,
) -> list[dict[str, Any]]:
    """Split a large file into consecutive line-range segments.

    A file is segmented only when its estimated tokens exceed
    *max_batch_tokens* or its word count exceeds *large_file_words*;
    otherwise an empty list is returned. Lines are accumulated until adding
    the next line would push the running token estimate past
    *max_batch_tokens*, at which point the accumulated lines become a segment.
    At most *max_file_segments* segments are produced; lines after the last
    produced segment are not represented.

    Args:
        file_record: File record providing ``metrics`` and ``file_id``.
        text: Decoded text of the file.
        max_batch_tokens: Token budget per segment.
        large_file_words: Word-count threshold that also triggers segmentation.
        max_file_segments: Upper bound on the number of segments.

    Returns:
        Segment records with id, ordinal, 1-based line range, summed per-line
        metrics and the triggering reason.
    """
    metrics = file_record["metrics"]
    if metrics["estimated_tokens"] <= max_batch_tokens and metrics["words"] <= large_file_words:
        return []

    # The word-count reason takes precedence when both thresholds are exceeded.
    reason = "large-file-word-count" if metrics["words"] > large_file_words else "large-file-token-count"
    lines = text.splitlines(keepends=True)
    if not lines and text:
        lines = [text]

    segments: list[dict[str, Any]] = []
    current_text: list[str] = []
    current_metrics = empty_metrics()
    start_line = 1
    current_line = 1

    def flush(end_line: int) -> None:
        # Emit the accumulated lines as one segment and reset the accumulator.
        # Does nothing when there is nothing buffered or the segment cap is reached.
        nonlocal current_text, current_metrics, start_line
        if not current_text or len(segments) >= max_file_segments:
            return
        ordinal = len(segments) + 1
        segments.append(
            {
                "segment_id": f"segment-{file_record['file_id']}-{ordinal:04d}",
                "file_id": file_record["file_id"],
                "ordinal": ordinal,
                "start_line": start_line,
                "end_line": max(start_line, end_line),
                "metrics": dict(current_metrics),
                "reason": reason,
            }
        )
        current_text = []
        current_metrics = empty_metrics()
        start_line = end_line + 1

    for line in lines:
        line_metrics = metrics_for_text_fragment(line)
        # Close the current segment before this line would overflow the budget.
        if current_text and current_metrics["estimated_tokens"] + line_metrics["estimated_tokens"] > max_batch_tokens:
            flush(current_line - 1)
        if not current_text:
            start_line = current_line
        current_text.append(line)
        add_metrics(current_metrics, line_metrics)
        # Each split line advances the line counter by at least one.
        current_line += max(line_metrics["lines"], 1)
        if len(segments) >= max_file_segments:
            break
    # Emit whatever is still buffered, unless the cap has already been hit.
    if current_text and len(segments) < max_file_segments:
        flush(current_line - 1)

    return segments
736
+
737
+
738
def collect_files(
    args: argparse.Namespace, roots: list[dict[str, str]]
) -> tuple[list[dict[str, Any]], list[dict[str, Any]], list[dict[str, str]], dict[str, int]]:
    """Walk the source roots and build file records within the configured bounds.

    Directories whose basename is in the ignore set are pruned and recorded as
    skipped. Files are skipped (with a reason) when they resolve outside the
    root, cannot be stat-ed or read, exceed the file-count, per-file-size or
    total-size limits, or contain a NUL byte. Every remaining file is decoded
    as UTF-8 (with replacement), scanned for imports/exports, measured and,
    if large, segmented.

    Args:
        args: Parsed options; reads ``ignore_dir``, ``max_files``,
            ``max_file_bytes``, ``max_total_bytes``, ``max_batch_tokens``,
            ``large_file_words`` and ``max_file_segments``.
        roots: Root records with ``root_id`` and ``path``.

    Returns:
        ``(files, file_segments, skipped_entries, counters)`` where
        ``counters`` holds ``skipped_count`` and ``total_bytes``.
    """
    ignore_dirs = set(DEFAULT_IGNORE_DIRS) | set(args.ignore_dir)
    files: list[dict[str, Any]] = []
    file_segments: list[dict[str, Any]] = []
    skipped_entries: list[dict[str, str]] = []
    counters = {"skipped_count": 0, "total_bytes": 0}
    next_file_id = 1

    for root in roots:
        root_path = Path(root["path"])
        for current_dir, dirnames, filenames in os.walk(root_path):
            current = Path(current_dir)
            kept_dirs: list[str] = []
            for dirname in sorted(dirnames):
                if dirname in ignore_dirs:
                    rel = normalized_relative((current / dirname).relative_to(root_path))
                    add_skipped(skipped_entries, counters, rel, "ignored-directory", "directory")
                    continue
                kept_dirs.append(dirname)
            # In-place assignment prunes the walk and fixes a sorted traversal order.
            dirnames[:] = kept_dirs

            for filename in sorted(filenames):
                source_path = current / filename
                try:
                    # Reject entries whose resolved target lies outside the root.
                    resolved = source_path.resolve()
                    if not (resolved == root_path or root_path in resolved.parents):
                        rel = normalized_relative(source_path.relative_to(root_path))
                        add_skipped(skipped_entries, counters, rel, "symlink-outside-root", "file")
                        continue
                    stat = source_path.stat()
                except OSError as exc:
                    rel = normalized_relative(source_path.relative_to(root_path))
                    add_skipped(skipped_entries, counters, rel, f"stat-error:{exc.__class__.__name__}", "file")
                    continue

                rel_path = source_path.relative_to(root_path)
                rel = normalized_relative(rel_path)
                # Limits are checked against the stat size before any content is read.
                if len(files) >= args.max_files:
                    add_skipped(skipped_entries, counters, rel, "file-count-limit", "file")
                    continue
                if stat.st_size > args.max_file_bytes:
                    add_skipped(skipped_entries, counters, rel, "file-byte-limit", "file")
                    continue
                if counters["total_bytes"] + stat.st_size > args.max_total_bytes:
                    add_skipped(skipped_entries, counters, rel, "total-byte-limit", "file")
                    continue

                # NOTE(review): nothing here checks that the entry is a regular
                # file; presumably FIFOs or device nodes inside a root would be
                # read as well — confirm whether they need an explicit skip.
                try:
                    data = source_path.read_bytes()
                except OSError as exc:
                    add_skipped(skipped_entries, counters, rel, f"read-error:{exc.__class__.__name__}", "file")
                    continue
                # A NUL byte is taken as the marker of a binary file.
                if b"\0" in data:
                    add_skipped(skipped_entries, counters, rel, "binary-file", "file")
                    continue

                text = data.decode("utf-8", errors="replace")
                language = language_for_path(source_path)
                scanner, imports, exports = scan_file(language, text)
                metrics = metrics_for_text(data, text)
                file_record = {
                    "file_id": f"file-{next_file_id:06d}",
                    "root_id": root["root_id"],
                    "path": rel,
                    "language": language,
                    "scanner": scanner,
                    "metrics": metrics,
                    "imports": imports,
                    "exports": exports,
                }
                files.append(file_record)
                file_segments.extend(
                    build_file_segments(
                        file_record,
                        text,
                        args.max_batch_tokens,
                        args.large_file_words,
                        args.max_file_segments,
                    )
                )
                next_file_id += 1
                # The running total uses the bytes actually read, not the stat size.
                counters["total_bytes"] += len(data)
    return files, file_segments, skipped_entries, counters
823
+
824
+
825
def resolve_relationships(files: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Resolve each file's imports and return one relationship per import.

    Every import record is updated in place with a ``resolved_file_id`` key
    holding whatever ``resolve_import`` returned for it. The returned list
    has one entry per import, in file order and then import order.
    """
    path_map: dict[str, str] = {}
    for record in files:
        add_path_aliases(path_map, record["root_id"], Path(record["path"]), record["file_id"])

    edges: list[dict[str, Any]] = []
    for record in files:
        for entry in record["imports"]:
            target = resolve_import(path_map, record, entry)
            entry["resolved_file_id"] = target
            edge = {
                "from_file_id": record["file_id"],
                "to_file_id": target,
                "specifier": entry["specifier"],
                "kind": entry["kind"],
            }
            edges.append(edge)
    return edges
844
+
845
+
846
def metric_sum(files: list[dict[str, Any]], file_ids: list[str]) -> dict[str, int]:
    """Accumulate the ``metrics`` of the files named by ``file_ids``.

    Starts from ``empty_metrics()`` and folds each file's metrics in with
    ``add_metrics``. An id that is not present in ``files`` raises ``KeyError``.
    """
    lookup = {record["file_id"]: record for record in files}
    accumulated = empty_metrics()
    for wanted in file_ids:
        add_metrics(accumulated, lookup[wanted]["metrics"])
    return accumulated
853
+
854
+
855
def segment_metric_sum(file_segments: list[dict[str, Any]], segment_ids: list[str]) -> dict[str, int]:
    """Accumulate the ``metrics`` of the segments named by ``segment_ids``.

    Starts from ``empty_metrics()`` and folds each segment's metrics in with
    ``add_metrics``. An unknown segment id raises ``KeyError``.
    """
    lookup = {entry["segment_id"]: entry for entry in file_segments}
    accumulated = empty_metrics()
    for wanted in segment_ids:
        segment_metrics = lookup[wanted]["metrics"]
        add_metrics(accumulated, segment_metrics)
    return accumulated
861
+
862
+
863
def language_counts(files: list[dict[str, Any]], file_ids: list[str]) -> dict[str, int]:
    """Count how many of the requested files use each language.

    The language value is converted with ``str`` before counting, and the
    returned plain dict has its keys in sorted order. An id missing from
    ``files`` raises ``KeyError``.
    """
    lookup = {entry["file_id"]: entry for entry in files}
    tally: dict[str, int] = defaultdict(int)
    for wanted in file_ids:
        language = str(lookup[wanted]["language"])
        tally[language] += 1
    return {language: tally[language] for language in sorted(tally)}
869
+
870
+
871
def build_groups(files: list[dict[str, Any]], relationships: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Partition files into dependency components and directory clusters.

    Files linked by a resolved relationship (in either direction) are grouped
    into connected components with reason ``dependency-component``. Every
    remaining file is grouped with the other leftover files that share its
    parent directory, with reason ``directory-cluster``.

    Args:
        files: File records providing ``file_id``, ``path``, ``language``
            and ``metrics``.
        relationships: Records with ``from_file_id`` and an optional
            ``to_file_id``; only string targets that name a known file
            create an edge.

    Returns:
        Group records numbered ``group-0001`` onward, components first.
        ``relationship_count`` is the number of relationships whose source
        and target both belong to that group.
    """
    file_by_id = {file_record["file_id"]: file_record for file_record in files}
    file_ids = [file_record["file_id"] for file_record in files]
    adjacency: dict[str, set[str]] = {file_id: set() for file_id in file_ids}
    for relationship in relationships:
        to_file_id = relationship.get("to_file_id")
        if isinstance(to_file_id, str) and to_file_id in adjacency:
            from_file_id = relationship["from_file_id"]
            adjacency[from_file_id].add(to_file_id)
            adjacency[to_file_id].add(from_file_id)

    visited: set[str] = set()
    raw_groups: list[tuple[str, list[str], str]] = []
    for file_id in file_ids:
        # A file takes part in a dependency component exactly when it has at
        # least one neighbor (a self-import makes the file its own neighbor).
        if file_id in visited or not adjacency[file_id]:
            continue
        queue: deque[str] = deque([file_id])
        visited.add(file_id)
        component: list[str] = []
        while queue:
            current = queue.popleft()
            component.append(current)
            for neighbor in sorted(adjacency[current]):
                if neighbor not in visited:
                    visited.add(neighbor)
                    queue.append(neighbor)
        raw_groups.append(("dependency-component", sorted(component), "Files connected by resolved local imports."))

    by_directory: dict[str, list[str]] = defaultdict(list)
    for file_id in file_ids:
        if file_id in visited:
            continue
        directory = str(Path(file_by_id[file_id]["path"]).parent)
        by_directory[directory].append(file_id)
    for directory in sorted(by_directory):
        raw_groups.append(
            (
                "directory-cluster",
                sorted(by_directory[directory]),
                f"Files grouped by directory fallback: {directory}.",
            )
        )

    # Count intra-group relationships in one pass over the relationships
    # instead of rescanning all of them once per group.
    group_of: dict[str, int] = {}
    for position, (_, member_ids, _) in enumerate(raw_groups):
        for member_id in member_ids:
            group_of[member_id] = position
    internal_counts = [0] * len(raw_groups)
    for relationship in relationships:
        owner = group_of.get(relationship["from_file_id"])
        if owner is not None and group_of.get(relationship.get("to_file_id")) == owner:
            internal_counts[owner] += 1

    groups: list[dict[str, Any]] = []
    for position, (reason, group_file_ids, note) in enumerate(raw_groups):
        groups.append(
            {
                "group_id": f"group-{position + 1:04d}",
                "reason": reason,
                "file_ids": group_file_ids,
                "metrics": metric_sum(files, group_file_ids),
                "language_counts": language_counts(files, group_file_ids),
                "relationship_count": internal_counts[position],
                "notes": note,
            }
        )
    return groups
937
+
938
+
939
def split_files_for_batch(
    files: list[dict[str, Any]], file_ids: list[str], max_batch_tokens: int
) -> list[list[str]]:
    """Pack the given files, ordered by path, into token-bounded chunks.

    Files are visited in ``(path, file_id)`` order. A new chunk is opened
    whenever adding the next file would push the running token estimate
    above ``max_batch_tokens``; a file that is too large on its own still
    gets a chunk to itself. No empty chunks are produced.
    """
    lookup = {record["file_id"]: record for record in files}
    ordered = sorted(file_ids, key=lambda fid: (lookup[fid]["path"], fid))

    chunks: list[list[str]] = []
    running_tokens = 0
    for fid in ordered:
        cost = lookup[fid]["metrics"]["estimated_tokens"]
        # The first file always opens a chunk; later files open one on overflow.
        if not chunks or running_tokens + cost > max_batch_tokens:
            chunks.append([])
            running_tokens = 0
        chunks[-1].append(fid)
        running_tokens += cost
    return chunks
957
+
958
+
959
def build_batches(
    files: list[dict[str, Any]],
    file_segments: list[dict[str, Any]],
    groups: list[dict[str, Any]],
    max_batch_tokens: int,
) -> list[dict[str, Any]]:
    """Turn groups into recommended batches bounded by ``max_batch_tokens``.

    Small groups with no segmented files are accumulated together until the
    next one would exceed the token budget. A group that is over budget, or
    that contains a file with segments, is emitted on its own: its files are
    chunked with ``split_files_for_batch``, and each segmented file becomes
    one batch per segment while the unsegmented files around it are emitted
    as regular batches.
    """
    batches: list[dict[str, Any]] = []
    segments_by_file: dict[str, list[dict[str, Any]]] = defaultdict(list)
    for segment in file_segments:
        segments_by_file[segment["file_id"]].append(segment)

    def emit(
        group_ids: list[str],
        file_ids: list[str],
        segment_ids: list[str],
        metrics: dict[str, int],
        note: str,
    ) -> None:
        # Batch ids are assigned sequentially from the number already emitted.
        batches.append(
            {
                "batch_id": f"batch-{len(batches) + 1:04d}",
                "group_ids": group_ids,
                "file_ids": file_ids,
                "segment_ids": segment_ids,
                "metrics": metrics,
                "language_counts": language_counts(files, file_ids),
                "notes": note,
            }
        )

    def emit_regular(group_id: str, chunk_file_ids: list[str], note: str) -> None:
        emit([group_id], chunk_file_ids, [], metric_sum(files, chunk_file_ids), note)

    def emit_segments(group_id: str, file_id: str) -> None:
        note = (
            f"Split large {file_id} from {group_id} by line spans "
            f"to respect max_batch_tokens {max_batch_tokens}."
        )
        for segment in segments_by_file[file_id]:
            segment_ids = [segment["segment_id"]]
            emit([group_id], [file_id], segment_ids, segment_metric_sum(file_segments, segment_ids), note)

    pending_group_ids: list[str] = []
    pending_file_ids: list[str] = []
    pending_tokens = 0

    def flush_pending() -> None:
        nonlocal pending_group_ids, pending_file_ids, pending_tokens
        if not pending_file_ids:
            return
        emit(
            pending_group_ids,
            pending_file_ids,
            [],
            metric_sum(files, pending_file_ids),
            f"Fits max_batch_tokens {max_batch_tokens}.",
        )
        # Rebind rather than clear: the emitted batch keeps the old lists.
        pending_group_ids = []
        pending_file_ids = []
        pending_tokens = 0

    for group in groups:
        group_tokens = group["metrics"]["estimated_tokens"]
        member_ids = group["file_ids"]
        over_budget = group_tokens > max_batch_tokens
        has_segments = any(member in segments_by_file for member in member_ids)

        if not (over_budget or has_segments):
            if pending_file_ids and pending_tokens + group_tokens > max_batch_tokens:
                flush_pending()
            pending_group_ids.append(group["group_id"])
            pending_file_ids.extend(member_ids)
            pending_tokens += group_tokens
            continue

        # Groups that must be split never share a batch with accumulated ones.
        flush_pending()
        group_id = group["group_id"]
        if over_budget:
            split_note = f"Split large {group_id} to respect max_batch_tokens {max_batch_tokens}."
        else:
            split_note = f"Split {group_id} to preserve large-file segment boundaries."

        for chunk in split_files_for_batch(files, member_ids, max_batch_tokens):
            unsegmented_run: list[str] = []
            for member in chunk:
                if member not in segments_by_file:
                    unsegmented_run.append(member)
                    continue
                # Close the run before the segmented file to keep path order.
                if unsegmented_run:
                    emit_regular(group_id, unsegmented_run, split_note)
                    unsegmented_run = []
                emit_segments(group_id, member)
            if unsegmented_run:
                emit_regular(group_id, unsegmented_run, split_note)

    flush_pending()
    return batches
1057
+
1058
+
1059
+ def build_large_items(
1060
+ files: list[dict[str, Any]],
1061
+ groups: list[dict[str, Any]],
1062
+ batches: list[dict[str, Any]],
1063
+ large_file_words: int,
1064
+ large_group_words: int,
1065
+ max_batch_tokens: int,
1066
+ ) -> list[dict[str, Any]]:
1067
+ large_items: list[dict[str, Any]] = []
1068
+
1069
+ def reason_for(metrics: dict[str, int], word_limit: int) -> str | None:
1070
+ if metrics["words"] > word_limit:
1071
+ return "word-count-threshold"
1072
+ if metrics["estimated_tokens"] > max_batch_tokens:
1073
+ return "token-threshold"
1074
+ return None
1075
+
1076
+ for file_record in files:
1077
+ reason = reason_for(file_record["metrics"], large_file_words)
1078
+ if reason:
1079
+ large_items.append(
1080
+ {
1081
+ "item_id": file_record["file_id"],
1082
+ "kind": "file",
1083
+ "metrics": file_record["metrics"],
1084
+ "reason": reason,
1085
+ "notes": "Large source file should be assigned through file_segments or a narrow unit.",
1086
+ }
1087
+ )
1088
+ for group in groups:
1089
+ reason = reason_for(group["metrics"], large_group_words)
1090
+ if reason:
1091
+ large_items.append(
1092
+ {
1093
+ "item_id": group["group_id"],
1094
+ "kind": "group",
1095
+ "metrics": group["metrics"],
1096
+ "reason": reason,
1097
+ "notes": "Large dependency group should be decomposed through recommended_batches.",
1098
+ }
1099
+ )
1100
+ for batch in batches:
1101
+ reason = reason_for(batch["metrics"], large_group_words)
1102
+ if reason:
1103
+ large_items.append(
1104
+ {
1105
+ "item_id": batch["batch_id"],
1106
+ "kind": "batch",
1107
+ "metrics": batch["metrics"],
1108
+ "reason": reason,
1109
+ "notes": "Batch is still large; controller should narrow the unit before source analysis.",
1110
+ }
1111
+ )
1112
+ return large_items
1113
+
1114
+
1115
def aggregate_metrics(
    files: list[dict[str, Any]],
    file_segments: list[dict[str, Any]],
    relationships: list[dict[str, Any]],
    groups: list[dict[str, Any]],
    batches: list[dict[str, Any]],
    large_items: list[dict[str, Any]],
    skipped_count: int,
    skipped_entries: list[dict[str, str]],
) -> dict[str, int | bool]:
    """Build the index-wide totals: summed file metrics plus item counts.

    ``skipped_entries_truncated`` is true when ``skipped_count`` is larger
    than the number of entries actually kept in ``skipped_entries``.
    ``resolved_relationship_count`` counts relationships with a truthy
    ``to_file_id``.
    """
    all_file_ids = [record["file_id"] for record in files]
    if all_file_ids:
        totals = metric_sum(files, all_file_ids)
    else:
        totals = empty_metrics()

    resolved = [entry for entry in relationships if entry.get("to_file_id")]

    totals["file_count"] = len(files)
    totals["file_segment_count"] = len(file_segments)
    totals["skipped_count"] = skipped_count
    totals["skipped_entries_truncated"] = skipped_count > len(skipped_entries)
    totals["relationship_count"] = len(relationships)
    totals["resolved_relationship_count"] = len(resolved)
    totals["group_count"] = len(groups)
    totals["batch_count"] = len(batches)
    totals["large_item_count"] = len(large_items)
    return totals
1141
+
1142
+
1143
def atomic_write_json(path: Path, data: dict[str, Any]) -> None:
    """Write ``data`` as indented JSON to ``path`` via a temp file and rename.

    The parent directory is created if needed. The document is serialized
    first, written to a temporary file in the destination directory, and
    then moved over ``path`` with ``os.replace``.

    Raises:
        OSError: If the temporary file cannot be written or moved into
            place. The temporary file is removed (best effort) before the
            original error is re-raised.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    # Serialize before creating the temp file so an encoding failure leaves
    # nothing behind on disk.
    encoded = json.dumps(data, indent=2, sort_keys=False) + "\n"
    tmp_path = None
    try:
        with tempfile.NamedTemporaryFile("w", encoding="utf-8", dir=path.parent, delete=False) as handle:
            tmp_path = Path(handle.name)
            handle.write(encoded)
        os.replace(tmp_path, path)
    except BaseException:
        # Covers failed writes as well as a failed replace, so a partial
        # temp file is never left next to the target.
        if tmp_path is not None:
            try:
                tmp_path.unlink()
            except OSError:
                # Cleanup is best effort; the original failure is what the
                # caller needs to see.
                pass
        raise
1156
+
1157
+
1158
def main() -> int:
    """Validate the CLI limits, build the source index, and write it out.

    Each numeric limit must be at least 1; the first one that is not ends
    the program through ``SystemExit`` with a message naming the flag.
    Returns 0 after the index document has been written.
    """
    args = parse_args()

    # Checked in this order; the first violation wins.
    minimum_one = (
        ("--max-files", args.max_files),
        ("--max-file-bytes", args.max_file_bytes),
        ("--max-total-bytes", args.max_total_bytes),
        ("--max-batch-tokens", args.max_batch_tokens),
        ("--large-file-words", args.large_file_words),
        ("--large-group-words", args.large_group_words),
        ("--max-file-segments", args.max_file_segments),
    )
    for flag, value in minimum_one:
        if value < 1:
            raise SystemExit(f"{flag} must be at least 1")

    output_path = checked_output_path(args)
    roots = source_roots(args.source_root)
    files, file_segments, skipped_entries, counters = collect_files(args, roots)
    relationships = resolve_relationships(files)
    groups = build_groups(files, relationships)
    batches = build_batches(files, file_segments, groups, args.max_batch_tokens)
    large_items = build_large_items(
        files,
        groups,
        batches,
        args.large_file_words,
        args.large_group_words,
        args.max_batch_tokens,
    )
    now = utc_now()

    generator = {
        "name": "build_source_index.py",
        "version": "1",
        "python_version": platform.python_version(),
        "scanner_modes": [
            "python-ast",
            "javascript-typescript-scanner",
            "go-scanner",
            "rust-scanner",
            "java-scanner",
            "kotlin-scanner",
            "swift-scanner",
            "c-scanner",
            "cpp-scanner",
            "csharp-scanner",
            "text-metrics",
        ],
    }
    limits = {
        "max_files": args.max_files,
        "max_file_bytes": args.max_file_bytes,
        "max_total_bytes": args.max_total_bytes,
        "max_batch_tokens": args.max_batch_tokens,
        "large_file_words": args.large_file_words,
        "large_group_words": args.large_group_words,
        "max_file_segments": args.max_file_segments,
        "ignore_dirs": sorted(set(DEFAULT_IGNORE_DIRS) | set(args.ignore_dir)),
    }
    if args.skip_tool_detection:
        dependency_report = None
    else:
        dependency_report = clean_room_tooling.dependency_report(
            args.allow_working_project_tools, args.probe_tools
        )
    totals = aggregate_metrics(
        files,
        file_segments,
        relationships,
        groups,
        batches,
        large_items,
        counters["skipped_count"],
        skipped_entries,
    )

    # Key order is the order of the written JSON document.
    output = {
        "index_id": f"source-index-{args.task_id}",
        "task_id": args.task_id,
        "created_at": now,
        "created_by_role": "controller-preflight",
        "domain": "contaminated",
        "generator": generator,
        "limits": limits,
        "dependency_report": dependency_report,
        "source_roots": roots,
        "files": files,
        "file_segments": file_segments,
        "relationships": relationships,
        "groups": groups,
        "recommended_batches": batches,
        "large_items": large_items,
        "skipped_entries": skipped_entries,
        "aggregate_metrics": totals,
    }
    atomic_write_json(output_path, output)
    return 0
1250
+
1251
+
1252
if __name__ == "__main__":
    # Run the indexer only when executed as a script, and use main()'s
    # return value as the process exit status.
    exit_status = main()
    raise SystemExit(exit_status)