clean-room-skill 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/marketplace.json +19 -0
- package/.claude-plugin/plugin.json +20 -0
- package/.codex-plugin/plugin.json +36 -0
- package/LICENSE +21 -0
- package/README.md +376 -0
- package/agents/clean-architect.md +27 -0
- package/agents/clean-qa-editor.md +27 -0
- package/agents/contaminated-manager-verifier.md +35 -0
- package/agents/contaminated-source-analyst.md +26 -0
- package/bin/install.js +535 -0
- package/examples/codex/.codex/agents/clean-architect.toml +17 -0
- package/examples/codex/.codex/agents/clean-qa-editor.toml +17 -0
- package/examples/codex/.codex/agents/contaminated-manager-verifier.toml +21 -0
- package/examples/codex/.codex/agents/contaminated-source-analyst.toml +17 -0
- package/hooks/check-artifact-leakage.py +317 -0
- package/hooks/clean-room-hook.py +88 -0
- package/hooks/clean_room_paths.py +130 -0
- package/hooks/deny-clean-room-shell.py +30 -0
- package/hooks/deny-clean-source-read.py +104 -0
- package/hooks/deny-contaminated-clean-write.py +134 -0
- package/hooks/hooks.json +44 -0
- package/hooks/require-clean-room-env.py +127 -0
- package/hooks/validate-handoff-package.py +140 -0
- package/hooks/validate-json-schema.py +283 -0
- package/lib/fs-utils.cjs +123 -0
- package/lib/hooks.cjs +214 -0
- package/package.json +49 -0
- package/plugin.json +20 -0
- package/skills/attended/SKILL.md +25 -0
- package/skills/clean-room/SKILL.md +134 -0
- package/skills/clean-room/assets/behavior-spec.schema.json +367 -0
- package/skills/clean-room/assets/contamination-incident.schema.json +60 -0
- package/skills/clean-room/assets/coverage-ledger.schema.json +139 -0
- package/skills/clean-room/assets/evidence-ledger.schema.json +80 -0
- package/skills/clean-room/assets/handoff-package.schema.json +114 -0
- package/skills/clean-room/assets/qc-report.schema.json +248 -0
- package/skills/clean-room/assets/skeleton-manifest.schema.json +239 -0
- package/skills/clean-room/assets/source-index.schema.json +622 -0
- package/skills/clean-room/assets/task-manifest.schema.json +593 -0
- package/skills/clean-room/examples/README.md +18 -0
- package/skills/clean-room/examples/minimal-spec-package/behavior-spec.json +61 -0
- package/skills/clean-room/examples/minimal-spec-package/coverage-ledger.json +27 -0
- package/skills/clean-room/examples/minimal-spec-package/evidence-ledger.json +17 -0
- package/skills/clean-room/examples/minimal-spec-package/handoff-package.json +26 -0
- package/skills/clean-room/examples/minimal-spec-package/qc-report.json +25 -0
- package/skills/clean-room/examples/minimal-spec-package/skeleton-manifest.json +45 -0
- package/skills/clean-room/examples/minimal-spec-package/source-index.json +156 -0
- package/skills/clean-room/examples/minimal-spec-package/task-manifest.json +220 -0
- package/skills/clean-room/references/LEAKAGE-RULES.md +92 -0
- package/skills/clean-room/references/PROCESS.md +185 -0
- package/skills/clean-room/references/SPEC-SCHEMA.md +185 -0
- package/skills/clean-room/references/TARGET-LANGUAGE-GUIDE.md +43 -0
- package/skills/clean-room/scripts/build_source_index.py +1253 -0
- package/skills/clean-room/scripts/clean_room_tool_manager.py +199 -0
- package/skills/clean-room/scripts/clean_room_tooling.py +370 -0
- package/skills/unattended/SKILL.md +26 -0
|
@@ -0,0 +1,1253 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Build a bounded contaminated-side source index for clean-room planning."""
|
|
3
|
+
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
6
|
+
import argparse
|
|
7
|
+
import ast
|
|
8
|
+
import json
|
|
9
|
+
import math
|
|
10
|
+
import os
|
|
11
|
+
import platform
|
|
12
|
+
import re
|
|
13
|
+
import sys
|
|
14
|
+
import tempfile
|
|
15
|
+
from collections import defaultdict, deque
|
|
16
|
+
from datetime import datetime, timezone
|
|
17
|
+
from pathlib import Path
|
|
18
|
+
from typing import Any
|
|
19
|
+
|
|
20
|
+
import clean_room_tooling
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
# Directory basenames treated as ignorable by default.
# NOTE(review): the directory walker is outside this view; presumably these
# are merged with any --ignore-dir values -- confirm against the caller.
DEFAULT_IGNORE_DIRS = (
    ".git",
    "node_modules",
    ".venv",
    "venv",
    "dist",
    "build",
    "target",
    ".next",
    "coverage",
    "__pycache__",
)
# Defaults for the integer limits exposed as command-line flags in parse_args().
DEFAULT_MAX_FILES = 2000
DEFAULT_MAX_FILE_BYTES = 1_000_000
DEFAULT_MAX_TOTAL_BYTES = 50_000_000
DEFAULT_MAX_BATCH_TOKENS = 20_000
DEFAULT_LARGE_FILE_WORDS = 5_000
DEFAULT_LARGE_GROUP_WORDS = 15_000
DEFAULT_MAX_FILE_SEGMENTS = 200
# Cap on the number of detailed skip records kept by add_skipped(); the
# skipped counter itself keeps increasing past this cap.
MAX_SKIPPED_ENTRIES = 1000
# Per-file caps applied by truncate_scanned() to scanner results.
MAX_IMPORTS_PER_FILE = 200
MAX_EXPORTS_PER_FILE = 200
# File-extension groups (lower-case, with leading dot) consulted by
# language_for_path() to pick a language label.
C_LIKE_EXTENSIONS = {".c", ".cc", ".cpp", ".cxx", ".h", ".hh", ".hpp", ".hxx"}
CSHARP_EXTENSIONS = {".cs"}
GO_EXTENSIONS = {".go"}
JVM_EXTENSIONS = {".java", ".kt", ".kts"}
JS_TS_EXTENSIONS = {".js", ".jsx", ".mjs", ".cjs", ".ts", ".tsx", ".mts", ".cts"}
PYTHON_EXTENSIONS = {".py", ".pyi"}
RUST_EXTENSIONS = {".rs"}
SWIFT_EXTENSIONS = {".swift"}
# Extensions tried, in this order, by resolve_candidate() when an import
# target has no extension of its own.
RESOLVE_EXTENSIONS = (
    ".py",
    ".pyi",
    ".js",
    ".jsx",
    ".mjs",
    ".cjs",
    ".ts",
    ".tsx",
    ".mts",
    ".cts",
    ".go",
    ".rs",
    ".java",
    ".kt",
    ".kts",
    ".swift",
    ".cs",
    ".c",
    ".cc",
    ".cpp",
    ".cxx",
    ".h",
    ".hh",
    ".hpp",
    ".hxx",
)
# Word tokenizer used by metrics_for_text() for the "words" metric.
WORD_RE = re.compile(r"\b\w+\b", re.UNICODE)
# Regex fragment capturing the contents of a single- or double-quoted string;
# interpolated into the JavaScript/TypeScript import patterns.
JS_STRING_RE = r"['\"]([^'\"]+)['\"]"
# Regex fragment for an identifier (letters, digits, "_" and "$"); interpolated
# into the per-language declaration patterns.
IDENTIFIER_RE = r"[A-Za-z_$][\w$]*"
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def parse_args() -> argparse.Namespace:
    """Define the indexer's command-line interface and parse ``sys.argv``.

    Returns:
        The populated ``argparse.Namespace``. ``--source-root``, ``--output``
        and ``--task-id`` are required; the integer limits fall back to the
        module-level ``DEFAULT_*`` constants.
    """
    cli = argparse.ArgumentParser(
        description="Build a bounded contaminated-side source-index.json for clean-room controller preflight."
    )
    cli.add_argument("--source-root", action="append", required=True, help="Authorized source root to index.")
    cli.add_argument("--output", required=True, help="Path to write source-index.json.")
    cli.add_argument(
        "--contaminated-artifact-root",
        action="append",
        default=[],
        help="Approved contaminated artifact root. Defaults to CLEAN_ROOM_CONTAMINATED_ARTIFACT_ROOTS.",
    )
    cli.add_argument("--task-id", required=True, help="Clean-room task id associated with this index.")

    # Integer limits: registered in a fixed order so --help output is stable.
    integer_limits = (
        ("--max-files", DEFAULT_MAX_FILES),
        ("--max-file-bytes", DEFAULT_MAX_FILE_BYTES),
        ("--max-total-bytes", DEFAULT_MAX_TOTAL_BYTES),
        ("--max-batch-tokens", DEFAULT_MAX_BATCH_TOKENS),
        ("--large-file-words", DEFAULT_LARGE_FILE_WORDS),
        ("--large-group-words", DEFAULT_LARGE_GROUP_WORDS),
        ("--max-file-segments", DEFAULT_MAX_FILE_SEGMENTS),
    )
    for flag, fallback in integer_limits:
        cli.add_argument(flag, type=int, default=fallback)

    cli.add_argument("--ignore-dir", action="append", default=[], help="Directory basename to skip.")

    # Boolean switches, all off unless given.
    switches = (
        (
            "--skip-tool-detection",
            "Do not record optional AST/indexing tool status in source-index.json.",
        ),
        (
            "--probe-tools",
            "Execute optional helper tools with version commands in dependency_report. Default is stat-only.",
        ),
        (
            "--allow-working-project-tools",
            "Allow dependency detection to consider .local/bin, .bin, node_modules/.bin, and npm prefix/global tools.",
        ),
    )
    for flag, description in switches:
        cli.add_argument(flag, action="store_true", help=description)

    return cli.parse_args()
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
def utc_now() -> str:
    """Return the current UTC time as an ISO-8601 string with a ``Z`` suffix.

    Sub-second precision is dropped, giving ``YYYY-MM-DDTHH:MM:SSZ``.
    """
    moment = datetime.now(timezone.utc).replace(microsecond=0)
    stamp = moment.isoformat()
    return stamp.replace("+00:00", "Z")
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def source_roots(values: list[str]) -> list[dict[str, str]]:
    """Resolve the ``--source-root`` values into unique root records.

    Args:
        values: Raw path strings in the order they were supplied.

    Returns:
        One ``{"root_id", "path"}`` dict per distinct resolved directory.
        ``root_id`` is derived from the 1-based position of the value in
        *values*, so a skipped duplicate leaves a gap in the numbering.

    Raises:
        SystemExit: If a value does not resolve to a directory, or if no
            root remains after de-duplication.
    """
    collected: list[dict[str, str]] = []
    visited: set[Path] = set()
    position = 0
    for raw in values:
        position += 1
        resolved = Path(raw).expanduser().resolve()
        if resolved in visited:
            continue
        if not resolved.is_dir():
            raise SystemExit(f"source root is not a directory: {resolved}")
        visited.add(resolved)
        collected.append({"root_id": f"root-{position:03d}", "path": str(resolved)})
    if not collected:
        raise SystemExit("at least one unique --source-root is required")
    return collected
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
def path_is_under(path: Path, root: Path) -> bool:
    """Return True when *path* equals *root* or has *root* as an ancestor.

    The comparison is purely lexical on the given path objects; nothing is
    resolved here.
    """
    if path == root:
        return True
    return any(ancestor == root for ancestor in path.parents)
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
def contaminated_artifact_roots(args: argparse.Namespace) -> list[Path]:
    """Collect the approved contaminated artifact roots.

    Explicit ``--contaminated-artifact-root`` values come first, followed by
    the non-empty entries of ``CLEAN_ROOM_CONTAMINATED_ARTIFACT_ROOTS`` (split
    on ``os.pathsep``). Each value is expanded and resolved; duplicates are
    dropped while keeping first-seen order.
    """
    raw_values = list(args.contaminated_artifact_root)
    env_value = os.environ.get("CLEAN_ROOM_CONTAMINATED_ARTIFACT_ROOTS", "")
    raw_values += [entry for entry in env_value.split(os.pathsep) if entry]

    # A dict keeps insertion order, so it doubles as an ordered set here.
    ordered: dict[Path, None] = {}
    for raw in raw_values:
        ordered.setdefault(Path(raw).expanduser().resolve(), None)
    return list(ordered)
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
def checked_output_path(args: argparse.Namespace) -> Path:
    """Resolve ``--output`` and verify it lies under an approved artifact root.

    Returns:
        The resolved output path.

    Raises:
        SystemExit: If no contaminated artifact root is configured, or if the
            output path is not inside (or equal to) any of them.
    """
    destination = Path(args.output).expanduser().resolve()
    approved = contaminated_artifact_roots(args)
    if not approved:
        raise SystemExit(
            "--output must be under CLEAN_ROOM_CONTAMINATED_ARTIFACT_ROOTS or an explicit --contaminated-artifact-root"
        )
    for root in approved:
        if path_is_under(destination, root):
            return destination
    allowed = ", ".join(root.as_posix() for root in approved)
    raise SystemExit(f"--output must be under a contaminated artifact root ({allowed}): {destination}")
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
def add_skipped(skipped_entries: list[dict[str, str]], counters: dict[str, int], path: str, reason: str, kind: str) -> None:
    """Count a skipped item and, while under the cap, record its details.

    ``counters["skipped_count"]`` is always incremented; the detail record is
    only appended while *skipped_entries* holds fewer than
    ``MAX_SKIPPED_ENTRIES`` items.
    """
    counters["skipped_count"] += 1
    if len(skipped_entries) >= MAX_SKIPPED_ENTRIES:
        return
    skipped_entries.append(dict(path=path, reason=reason, kind=kind))
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
def language_for_path(path: Path) -> str:
    """Map a file path to a language label based on its lower-cased suffix.

    Returns one of ``python``, ``typescript``, ``javascript``, ``go``,
    ``rust``, ``java``, ``kotlin``, ``swift``, ``csharp``, ``cpp``, ``c``, or
    ``text`` when the suffix is not recognized.
    """
    ext = path.suffix.lower()

    # Families that need a second decision after the membership test.
    if ext in JS_TS_EXTENSIONS:
        # Every TypeScript suffix (.ts, .tsx, .mts, .cts) contains "ts".
        return "typescript" if "ts" in ext else "javascript"
    if ext in C_LIKE_EXTENSIONS:
        return "cpp" if ext in {".cc", ".cpp", ".cxx", ".hh", ".hpp", ".hxx"} else "c"
    if ext == ".java":
        return "java"
    if ext in {".kt", ".kts"}:
        return "kotlin"

    # One-to-one families; the extension sets do not overlap.
    simple_families = (
        (PYTHON_EXTENSIONS, "python"),
        (GO_EXTENSIONS, "go"),
        (RUST_EXTENSIONS, "rust"),
        (SWIFT_EXTENSIONS, "swift"),
        (CSHARP_EXTENSIONS, "csharp"),
    )
    for extensions, label in simple_families:
        if ext in extensions:
            return label
    return "text"
|
|
202
|
+
|
|
203
|
+
|
|
204
|
+
def line_count(text: str) -> int:
    """Count lines in *text*, treating a trailing partial line as a line.

    Empty text has zero lines; a final line without a terminating newline
    still counts as one.
    """
    if not text:
        return 0
    newline_total = text.count("\n")
    if text.endswith("\n"):
        return newline_total
    return newline_total + 1
|
|
208
|
+
|
|
209
|
+
|
|
210
|
+
def metrics_for_text(data: bytes, text: str) -> dict[str, int]:
    """Compute size metrics for a file's raw bytes and decoded text.

    Returns a dict with ``bytes`` (length of *data*), ``lines``, ``words``
    (matches of ``WORD_RE``), ``characters`` and ``estimated_tokens``
    (characters divided by four, rounded up).
    """
    char_total = len(text)
    word_total = len(WORD_RE.findall(text))
    return dict(
        bytes=len(data),
        lines=line_count(text),
        words=word_total,
        characters=char_total,
        estimated_tokens=math.ceil(char_total / 4),
    )
|
|
219
|
+
|
|
220
|
+
|
|
221
|
+
def metrics_for_text_fragment(text: str) -> dict[str, int]:
    """Compute metrics for an in-memory text fragment.

    The byte count is taken from the fragment's UTF-8 encoding, with
    unencodable characters replaced rather than raising.
    """
    encoded = text.encode("utf-8", errors="replace")
    return metrics_for_text(encoded, text)
|
|
223
|
+
|
|
224
|
+
|
|
225
|
+
def empty_metrics() -> dict[str, int]:
    """Return a fresh metrics dict with every counter set to zero."""
    fields = ("bytes", "lines", "words", "characters", "estimated_tokens")
    return dict.fromkeys(fields, 0)
|
|
227
|
+
|
|
228
|
+
|
|
229
|
+
def add_metrics(left: dict[str, int], right: dict[str, int]) -> None:
    """Add each metric counter of *right* onto *left*, in place.

    Only the five known metric keys are touched; *right* is not modified.
    """
    metric_fields = ("bytes", "lines", "words", "characters", "estimated_tokens")
    for field in metric_fields:
        left[field] = left[field] + right[field]
|
|
232
|
+
|
|
233
|
+
|
|
234
|
+
def truncate_items(items: list[dict[str, Any]], limit: int) -> tuple[list[dict[str, Any]], bool]:
    """Keep at most *limit* leading items and report whether any were cut.

    Returns:
        ``(kept, truncated)`` where ``kept`` is a new list and ``truncated``
        is True only when *items* was longer than *limit*.
    """
    kept = items[:limit]
    overflowed = len(items) > limit
    return kept, overflowed
|
|
236
|
+
|
|
237
|
+
|
|
238
|
+
def literal_all(node: ast.AST) -> list[str]:
    """Extract the string entries of a literal ``__all__`` value.

    Args:
        node: The AST node assigned to ``__all__``.

    Returns:
        The string items when *node* is a literal list or tuple; an empty
        list when the value is not a literal, is another literal type, or
        cannot be evaluated.
    """
    try:
        value = ast.literal_eval(node)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # literal_eval is documented to raise any of these on malformed
        # input (e.g. TypeError for an unhashable dict key). The node comes
        # from arbitrary indexed source, so treat all of them as "no names"
        # instead of aborting the whole index run.
        return []
    if isinstance(value, (list, tuple)):
        return [item for item in value if isinstance(item, str)]
    return []
|
|
246
|
+
|
|
247
|
+
|
|
248
|
+
def scan_python(text: str) -> tuple[str, list[dict[str, Any]], list[dict[str, str]]]:
    """Scan Python source for top-level imports and exported names.

    Only statements directly in the module body are inspected.

    Args:
        text: Decoded source text of one Python file.

    Returns:
        ``(scanner, imports, exports)``. ``scanner`` is ``"python-ast"`` on
        success, or ``"python-ast-error"`` with empty lists when the source
        cannot be parsed. Both lists are capped by ``truncate_scanned``.
    """
    imports: list[dict[str, Any]] = []
    exports: list[dict[str, str]] = []
    try:
        tree = ast.parse(text)
    except (SyntaxError, ValueError, RecursionError):
        # Besides SyntaxError, parsing arbitrary files can raise ValueError
        # (source containing NUL bytes on older interpreters) or
        # RecursionError (pathologically nested expressions). None of these
        # should abort indexing of the remaining files.
        return "python-ast-error", imports, exports

    for node in tree.body:
        if isinstance(node, ast.Import):
            for alias in node.names:
                imports.append(
                    {
                        "specifier": alias.name,
                        "kind": "python-import",
                        "is_relative": False,
                        # "import a.b" binds "a" unless an alias is given.
                        "names": [alias.asname or alias.name.split(".")[0]],
                    }
                )
        elif isinstance(node, ast.ImportFrom):
            module = node.module or ""
            # Leading dots encode the relative-import level.
            specifier = "." * node.level + module
            imports.append(
                {
                    "specifier": specifier,
                    "kind": "python-from-import",
                    "is_relative": node.level > 0,
                    "names": [alias.asname or alias.name for alias in node.names],
                }
            )
        elif isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
            exports.append({"name": node.name, "kind": "top-level-function"})
        elif isinstance(node, ast.ClassDef):
            exports.append({"name": node.name, "kind": "top-level-class"})
        elif isinstance(node, (ast.Assign, ast.AnnAssign)):
            targets: list[ast.AST] = list(node.targets) if isinstance(node, ast.Assign) else [node.target]
            value = node.value  # None for a bare annotation such as "x: int"
            for target in targets:
                if not isinstance(target, ast.Name):
                    continue
                if target.id == "__all__" and value is not None:
                    for name in literal_all(value):
                        exports.append({"name": name, "kind": "explicit-all"})
                else:
                    exports.append({"name": target.id, "kind": "top-level-assignment"})

    return "python-ast", *truncate_scanned(imports, exports)
|
|
298
|
+
|
|
299
|
+
|
|
300
|
+
def strip_js_comments(text: str) -> str:
    """Replace JavaScript/TypeScript comments with a single space.

    String and template literals are matched first and copied through
    unchanged, so comment markers inside them (for example the ``//`` in an
    ``"https://..."`` import specifier) are not mistaken for comments.

    This is a lexical heuristic, not a parser: regex literals containing
    quote characters can still confuse it, in which case some comment text
    may be left in place rather than removed.

    Args:
        text: Source text of one file.

    Returns:
        The text with ``/* ... */`` and ``// ...`` comments replaced by a
        space each; line breaks after line comments are preserved.
    """
    double_quoted = r'"(?:\\.|[^"\\\n])*"'
    single_quoted = r"'(?:\\.|[^'\\\n])*'"
    template = r"`(?:\\.|[^`\\])*`"
    # Group 1 captures any literal; the remaining alternatives are comments.
    token_re = rf"({double_quoted}|{single_quoted}|{template})|/\*.*?\*/|//[^\n]*"

    def keep_literal_or_blank(match: re.Match[str]) -> str:
        literal = match.group(1)
        return literal if literal is not None else " "

    return re.sub(token_re, keep_literal_or_blank, text, flags=re.S)
|
|
304
|
+
|
|
305
|
+
|
|
306
|
+
def split_export_names(raw: str) -> list[str]:
    """Extract exported names from the inside of an ``export { ... }`` list.

    For ``a as b`` entries the alias ``b`` is used; otherwise the first
    whitespace-separated token is taken. Anything that is not a plain
    identifier is discarded.
    """
    collected: list[str] = []
    for chunk in raw.split(","):
        entry = chunk.strip()
        if not entry:
            continue
        alias = re.search(r"\bas\s+([A-Za-z_$][\w$]*)$", entry)
        collected.append(alias.group(1) if alias else entry.split()[0].strip())
    return [candidate for candidate in collected if re.fullmatch(r"[A-Za-z_$][\w$]*", candidate)]
|
|
318
|
+
|
|
319
|
+
|
|
320
|
+
def scan_js_ts(text: str) -> tuple[str, list[dict[str, Any]], list[dict[str, str]]]:
    """Scan JavaScript/TypeScript source for imports and exports with regexes.

    Comments are stripped first. Imports are collected in four passes (static
    ESM imports, ``export ... from`` re-exports, ``require(...)`` calls and
    dynamic ``import(...)`` calls); exports in five (declarations, default,
    named lists, ``module.exports`` and ``exports.<name>``).

    Args:
        text: Source text of one file.

    Returns:
        ``("javascript-typescript-scanner", imports, exports)`` with both
        lists capped by ``truncate_scanned``.
    """
    scanned = strip_js_comments(text)
    imports: list[dict[str, Any]] = []
    exports: list[dict[str, str]] = []

    import_passes = (
        # The binding clause ("x from", "{ a, b } from", ...) is optional so
        # that side-effect imports such as `import "./polyfill"` are found
        # too; whitespace before the specifier is allowed in both forms.
        (rf"(?m)^\s*import(?:\s+type)?(?:[\s\w$*{{}},]+?\s+from)?\s*{JS_STRING_RE}", "esm-import"),
        (rf"(?m)^\s*export(?:\s+type)?\s+[^;]*?\s+from\s+{JS_STRING_RE}", "esm-re-export"),
        (rf"\brequire\(\s*{JS_STRING_RE}\s*\)", "commonjs-require"),
        (rf"\bimport\(\s*{JS_STRING_RE}\s*\)", "dynamic-import"),
    )
    for pattern, kind in import_passes:
        for match in re.finditer(pattern, scanned):
            specifier = match.group(1)
            imports.append(
                {
                    "specifier": specifier,
                    "kind": kind,
                    "is_relative": specifier.startswith("."),
                    "names": [],
                }
            )

    for match in re.finditer(
        r"(?m)^\s*export\s+(?:async\s+)?(?:class|function|const|let|var|interface|type|enum)\s+([A-Za-z_$][\w$]*)",
        scanned,
    ):
        exports.append({"name": match.group(1), "kind": "esm-declaration"})
    # One record per `export default` occurrence.
    for _ in re.finditer(r"(?m)^\s*export\s+default\b", scanned):
        exports.append({"name": "default", "kind": "esm-default"})
    for match in re.finditer(r"(?m)^\s*export\s*{([^}]+)}", scanned):
        for name in split_export_names(match.group(1)):
            exports.append({"name": name, "kind": "esm-named"})
    # One record per `module.exports =` assignment.
    for _ in re.finditer(r"\bmodule\.exports\s*=", scanned):
        exports.append({"name": "module.exports", "kind": "commonjs-module"})
    for match in re.finditer(r"\bexports\.([A-Za-z_$][\w$]*)\s*=", scanned):
        exports.append({"name": match.group(1), "kind": "commonjs-named"})

    return "javascript-typescript-scanner", *truncate_scanned(imports, exports)
|
|
378
|
+
|
|
379
|
+
|
|
380
|
+
def scan_go(text: str) -> tuple[str, list[dict[str, Any]], list[dict[str, str]]]:
    """Scan Go source for imports and top-level declarations with regexes.

    Imports inside ``import ( ... )`` blocks are recorded first, then
    single-line imports. Exports cover functions (with or without receiver),
    ``type`` declarations and the first name after ``const``/``var``.

    Returns:
        ``("go-scanner", imports, exports)``, both capped by
        ``truncate_scanned``.
    """

    def go_import(spec: str) -> dict[str, Any]:
        return {
            "specifier": spec,
            "kind": "go-import",
            "is_relative": spec.startswith("."),
            "names": [],
        }

    found_imports: list[dict[str, Any]] = [
        go_import(quoted.group(1))
        for block in re.finditer(r"(?ms)^\s*import\s*\((.*?)\)", text)
        for quoted in re.finditer(r'"([^"]+)"', block.group(1))
    ]
    found_imports.extend(
        go_import(single.group(1))
        for single in re.finditer(r'(?m)^\s*import\s+(?:[._A-Za-z]\w*\s+)?(?:"([^"]+)")', text)
    )

    declaration_passes = (
        (rf"(?m)^\s*func\s+(?:\([^)]*\)\s*)?({IDENTIFIER_RE})\s*\(", "go-function"),
        (rf"(?m)^\s*type\s+({IDENTIFIER_RE})\b", "go-type"),
        (rf"(?m)^\s*(?:const|var)\s+(?:\(\s*)?({IDENTIFIER_RE})\b", "go-binding"),
    )
    found_exports: list[dict[str, str]] = []
    for pattern, kind in declaration_passes:
        found_exports.extend({"name": hit.group(1), "kind": kind} for hit in re.finditer(pattern, text))

    return "go-scanner", *truncate_scanned(found_imports, found_exports)
|
|
411
|
+
|
|
412
|
+
|
|
413
|
+
def scan_rust(text: str) -> tuple[str, list[dict[str, Any]], list[dict[str, str]]]:
    """Scan Rust source for ``use``/``mod`` statements and declarations.

    ``use`` paths are recorded with all whitespace removed and flagged as
    relative when they start with ``self::``, ``super::`` or ``crate::``.
    ``mod name;`` statements are always recorded as relative. Exports cover
    ``fn``, ``struct``, ``enum``, ``trait``, ``type``, ``const`` and
    ``static`` items regardless of visibility.

    Returns:
        ``("rust-scanner", imports, exports)``, both capped by
        ``truncate_scanned``.
    """
    found_imports: list[dict[str, Any]] = []
    for use_stmt in re.finditer(r"(?m)^\s*use\s+([^;]+);", text):
        compact = re.sub(r"\s+", "", use_stmt.group(1))
        found_imports.append(
            {
                "specifier": compact,
                "kind": "rust-use",
                "is_relative": compact.startswith(("self::", "super::", "crate::")),
                "names": [],
            }
        )
    found_imports.extend(
        {
            "specifier": mod_decl.group(1),
            "kind": "rust-mod",
            "is_relative": True,
            "names": [],
        }
        for mod_decl in re.finditer(rf"(?m)^\s*(?:pub\s+)?mod\s+({IDENTIFIER_RE})\s*;", text)
    )

    declaration_re = (
        rf"(?m)^\s*(?:pub(?:\([^)]*\))?\s+)?(?:async\s+)?(?:fn|struct|enum|trait|type|const|static)\s+({IDENTIFIER_RE})\b"
    )
    found_exports: list[dict[str, str]] = [
        {"name": item.group(1), "kind": "rust-declaration"} for item in re.finditer(declaration_re, text)
    ]

    return "rust-scanner", *truncate_scanned(found_imports, found_exports)
|
|
443
|
+
|
|
444
|
+
|
|
445
|
+
def scan_java_kotlin(language: str, text: str) -> tuple[str, list[dict[str, Any]], list[dict[str, str]]]:
    """Scan Java or Kotlin source for imports and type declarations.

    Args:
        language: ``"java"`` selects the Java scanner label; any other value
            is labelled as Kotlin. Kotlin additionally records ``fun``
            declarations.
        text: Source text of one file.

    Returns:
        ``(scanner, imports, exports)`` where ``scanner`` is
        ``"java-scanner"`` or ``"kotlin-scanner"``; both lists are capped by
        ``truncate_scanned``.
    """
    scanner_label = "java-scanner" if language == "java" else "kotlin-scanner"

    found_imports: list[dict[str, Any]] = [
        {
            "specifier": stmt.group(1),
            "kind": f"{language}-import",
            "is_relative": False,
            "names": [],
        }
        for stmt in re.finditer(r"(?m)^\s*import\s+(?:static\s+)?([A-Za-z_][\w.]*\*?)\s*;?", text)
    ]

    type_re = (
        rf"(?m)^\s*(?:public|internal|private|protected|sealed|abstract|final|open|data|value|\s)*"
        rf"(?:class|interface|enum|object|record)\s+({IDENTIFIER_RE})\b"
    )
    found_exports: list[dict[str, str]] = [
        {"name": decl.group(1), "kind": f"{language}-type"} for decl in re.finditer(type_re, text)
    ]
    if language == "kotlin":
        function_re = rf"(?m)^\s*(?:public|internal|private|protected|suspend|\s)*fun\s+({IDENTIFIER_RE})\s*\("
        found_exports.extend(
            {"name": decl.group(1), "kind": "kotlin-function"} for decl in re.finditer(function_re, text)
        )

    return scanner_label, *truncate_scanned(found_imports, found_exports)
|
|
470
|
+
|
|
471
|
+
|
|
472
|
+
def scan_swift(text: str) -> tuple[str, list[dict[str, Any]], list[dict[str, str]]]:
    """Scan Swift source for ``import`` statements and declarations.

    Imports are never marked relative. Exports cover ``class``, ``struct``,
    ``enum``, ``protocol``, ``actor``, ``func``, ``let`` and ``var``
    declarations that start a line, optionally preceded by access modifiers.

    Returns:
        ``("swift-scanner", imports, exports)``, both capped by
        ``truncate_scanned``.
    """
    found_imports: list[dict[str, Any]] = [
        {
            "specifier": stmt.group(1),
            "kind": "swift-import",
            "is_relative": False,
            "names": [],
        }
        for stmt in re.finditer(r"(?m)^\s*import\s+(?:@\w+\s+)?([A-Za-z_][\w.]*)", text)
    ]

    declaration_re = (
        rf"(?m)^\s*(?:public|internal|private|fileprivate|open|final|\s)*"
        rf"(?:class|struct|enum|protocol|actor|func|let|var)\s+({IDENTIFIER_RE})\b"
    )
    found_exports: list[dict[str, str]] = [
        {"name": decl.group(1), "kind": "swift-declaration"} for decl in re.finditer(declaration_re, text)
    ]

    return "swift-scanner", *truncate_scanned(found_imports, found_exports)
|
|
493
|
+
|
|
494
|
+
|
|
495
|
+
def scan_c_like(language: str, text: str) -> tuple[str, list[dict[str, Any]], list[dict[str, str]]]:
    """Scan C or C++ source for ``#include`` lines and declarations.

    Quoted includes (``#include "x.h"``) are marked relative; angle-bracket
    includes are not. Function-like matches whose name is a control keyword
    are dropped. Function records precede type records in the export list.

    Args:
        language: Label used as prefix for record kinds and the scanner name.
        text: Source text of one file.

    Returns:
        ``(f"{language}-scanner", imports, exports)``, both capped by
        ``truncate_scanned``.
    """
    control_keywords = {"if", "for", "while", "switch", "return"}

    found_imports: list[dict[str, Any]] = [
        {
            "specifier": directive.group(2),
            "kind": f"{language}-include",
            "is_relative": directive.group(1) == '"',
            "names": [],
        }
        for directive in re.finditer(r'(?m)^\s*#\s*include\s*([<"])([^>"]+)[>"]', text)
    ]

    function_re = (
        rf"(?m)^\s*(?:extern\s+)?(?:[A-Za-z_][\w:<>,\s\*&~]+\s+)+({IDENTIFIER_RE})\s*\([^;]*\)\s*(?:;|{{)"
    )
    found_exports: list[dict[str, str]] = [
        {"name": decl.group(1), "kind": f"{language}-function"}
        for decl in re.finditer(function_re, text)
        if decl.group(1) not in control_keywords
    ]
    found_exports.extend(
        {"name": decl.group(1), "kind": f"{language}-type"}
        for decl in re.finditer(rf"(?m)^\s*(?:class|struct|enum)\s+({IDENTIFIER_RE})\b", text)
    )

    return f"{language}-scanner", *truncate_scanned(found_imports, found_exports)
|
|
519
|
+
|
|
520
|
+
|
|
521
|
+
def scan_csharp(text: str) -> tuple[str, list[dict[str, Any]], list[dict[str, str]]]:
    """Scan C# source for ``using`` directives, types and methods.

    Type records are emitted before method records. Method-like matches whose
    name is a control keyword are dropped.

    Returns:
        ``("csharp-scanner", imports, exports)``, both capped by
        ``truncate_scanned``.
    """
    control_keywords = {"if", "for", "foreach", "while", "switch", "catch"}

    found_imports: list[dict[str, Any]] = [
        {
            "specifier": directive.group(1),
            "kind": "csharp-using",
            "is_relative": False,
            "names": [],
        }
        for directive in re.finditer(r"(?m)^\s*using\s+(?:static\s+)?([A-Za-z_][\w.]*)\s*;", text)
    ]

    type_re = (
        rf"(?m)^\s*(?:public|internal|private|protected|abstract|sealed|static|partial|\s)*"
        rf"(?:class|interface|struct|enum|record)\s+({IDENTIFIER_RE})\b"
    )
    method_re = (
        rf"(?m)^\s*(?:public|internal|private|protected|static|async|virtual|override|\s)*"
        rf"(?:[A-Za-z_][\w<>,\[\]?]+\s+)+({IDENTIFIER_RE})\s*\("
    )
    found_exports: list[dict[str, str]] = [
        {"name": decl.group(1), "kind": "csharp-type"} for decl in re.finditer(type_re, text)
    ]
    found_exports.extend(
        {"name": decl.group(1), "kind": "csharp-method"}
        for decl in re.finditer(method_re, text)
        if decl.group(1) not in control_keywords
    )

    return "csharp-scanner", *truncate_scanned(found_imports, found_exports)
|
|
550
|
+
|
|
551
|
+
|
|
552
|
+
def truncate_scanned(
    imports: list[dict[str, Any]], exports: list[dict[str, str]]
) -> tuple[list[dict[str, Any]], list[dict[str, str]]]:
    """Cap scanner results and flag any truncation with a marker record.

    Imports are limited to ``MAX_IMPORTS_PER_FILE`` and exports to
    ``MAX_EXPORTS_PER_FILE``. When a list was cut, one extra
    ``"truncation-marker"`` record named ``"__truncated__"`` is appended after
    the kept items. The input lists are not modified.
    """
    kept_imports, imports_cut = truncate_items(imports, MAX_IMPORTS_PER_FILE)
    kept_exports, exports_cut = truncate_items(exports, MAX_EXPORTS_PER_FILE)
    if imports_cut:
        import_marker = {
            "specifier": "__truncated__",
            "kind": "truncation-marker",
            "is_relative": False,
            "names": [],
        }
        kept_imports.append(import_marker)
    if exports_cut:
        export_marker = {"name": "__truncated__", "kind": "truncation-marker"}
        kept_exports.append(export_marker)
    return kept_imports, kept_exports
|
|
569
|
+
|
|
570
|
+
|
|
571
|
+
def scan_file(language: str, text: str) -> tuple[str, list[dict[str, Any]], list[dict[str, str]]]:
    """Dispatch *text* to the scanner registered for *language*.

    Returns:
        The chosen scanner's ``(scanner, imports, exports)`` result, or
        ``("text-metrics", [], [])`` for languages without a scanner.
    """
    # Lambdas defer both the lookup and the call until a language is chosen.
    dispatch = {
        "python": lambda: scan_python(text),
        "javascript": lambda: scan_js_ts(text),
        "typescript": lambda: scan_js_ts(text),
        "go": lambda: scan_go(text),
        "rust": lambda: scan_rust(text),
        "java": lambda: scan_java_kotlin(language, text),
        "kotlin": lambda: scan_java_kotlin(language, text),
        "swift": lambda: scan_swift(text),
        "c": lambda: scan_c_like(language, text),
        "cpp": lambda: scan_c_like(language, text),
        "csharp": lambda: scan_csharp(text),
    }
    runner = dispatch.get(language)
    if runner is None:
        return "text-metrics", [], []
    return runner()
|
|
589
|
+
|
|
590
|
+
|
|
591
|
+
def normalized_relative(path: Path) -> str:
    """Render *path* with forward slashes, independent of the host platform."""
    posix_form = path.as_posix()
    return posix_form
|
|
593
|
+
|
|
594
|
+
|
|
595
|
+
def add_path_aliases(path_map: dict[str, str], root_id: str, relative_path: Path, file_id: str) -> None:
    """Register every lookup key under which a file may be imported.

    Keys have the form ``"<root_id>:<posix path>"``. A file is always
    registered under its own path, additionally under the path without its
    suffix (when it has one), and under its parent directory when it is a
    package/entry file (``__init__.py``, ``index.*``, ``mod.rs``). Later
    registrations overwrite earlier ones that share a key.
    """

    def register(alias: Path) -> None:
        path_map[f"{root_id}:{normalized_relative(alias)}"] = file_id

    register(relative_path)
    if relative_path.suffix:
        register(relative_path.with_suffix(""))

    directory_entry_names = {
        "__init__.py",
        "index.js",
        "index.jsx",
        "index.ts",
        "index.tsx",
        "index.mjs",
        "index.cjs",
        "mod.rs",
    }
    if relative_path.name in directory_entry_names:
        register(relative_path.parent)
|
|
613
|
+
|
|
614
|
+
|
|
615
|
+
def resolve_candidate(path_map: dict[str, str], root_id: str, candidate: Path) -> str | None:
    """Look up the file id for an import target within one source root.

    The candidate is first normalized so that ``.`` and ``..`` segments are
    collapsed; index keys never contain such segments, so un-normalized
    relative targets like ``pkg/sub/../util`` could otherwise never match.

    Lookup order: the candidate itself; then, only when it has no suffix, the
    candidate with each of ``RESOLVE_EXTENSIONS``, ``index<ext>`` inside it
    (all extensions except ``.pyi``), and finally ``__init__.py`` inside it.

    Args:
        path_map: Mapping of ``"<root_id>:<posix path>"`` keys to file ids.
        root_id: Identifier of the root the importing file belongs to.
        candidate: Root-relative path the import is believed to refer to.

    Returns:
        The matching file id, or None when nothing matches or the normalized
        candidate points outside the root.
    """
    normalized = Path(os.path.normpath(candidate))
    if normalized.parts[:1] == ("..",):
        # Escapes the source root; no indexed file can live there.
        return None

    def lookup(item: Path) -> str | None:
        # Keys use POSIX separators, the same form the index is built with.
        key = f"{root_id}:{item.as_posix()}"
        return path_map[key] if key in path_map else None

    direct = lookup(normalized)
    if direct is not None or normalized.suffix:
        return direct

    expansions: list[Path] = []
    if normalized.name:
        # Path(".") has an empty name and with_suffix() would raise ValueError.
        expansions.extend(normalized.with_suffix(ext) for ext in RESOLVE_EXTENSIONS)
    expansions.extend(normalized / f"index{ext}" for ext in RESOLVE_EXTENSIONS if ext != ".pyi")
    expansions.append(normalized / "__init__.py")

    for item in expansions:
        found = lookup(item)
        if found is not None:
            return found
    return None
|
|
626
|
+
|
|
627
|
+
|
|
628
|
+
def resolve_import(
    path_map: dict[str, str],
    file_record: dict[str, Any],
    import_record: dict[str, Any],
) -> str | None:
    """Resolve one import record to the file id it refers to, if indexed.

    Resolution is heuristic and depends on the record kind:

    * relative ``from`` imports in Python walk up one directory per extra
      leading dot, then try the module path, the imported names, and finally
      the package directory itself;
    * absolute Python imports are mapped from dotted to slash-separated paths;
    * any other specifier starting with ``.`` and Rust ``mod`` statements are
      resolved next to the importing file;
    * Rust ``use`` paths rooted at ``crate``, ``self`` or ``super`` are mapped
      to the root, the file's directory, or that directory's parent;
    * relative C/C++ includes are resolved next to the importing file;
    * Java/Kotlin/C#/Go/Swift specifiers are mapped from dotted to
      slash-separated paths from the root.

    Returns:
        The resolved file id, or None when the import cannot be matched.
    """
    root_id = file_record["root_id"]
    importer = Path(file_record["path"])
    spec = import_record["specifier"]
    kind = import_record["kind"]
    here = importer.parent

    if kind == "python-from-import" and spec.startswith("."):
        module = spec.lstrip(".")
        levels = len(spec) - len(module)
        package_dir = here
        # One dot means "this package"; each further dot climbs one level.
        for _ in range(levels - 1):
            package_dir = package_dir.parent
        if module:
            return resolve_candidate(path_map, root_id, package_dir / Path(module.replace(".", "/")))
        # "from . import a, b": the names themselves may be sibling modules.
        for name in import_record.get("names", []):
            hit = resolve_candidate(path_map, root_id, package_dir / str(name))
            if hit:
                return hit
        return resolve_candidate(path_map, root_id, package_dir)

    if kind == "python-import" and not spec.startswith("."):
        return resolve_candidate(path_map, root_id, Path(spec.replace(".", "/")))

    if spec.startswith(".") or kind == "rust-mod":
        return resolve_candidate(path_map, root_id, here / spec)

    if kind == "rust-use":
        cleaned = re.sub(r"[{}*]", "", spec)
        head, _, tail = cleaned.partition("::")
        if head in {"crate", "self"}:
            anchor = Path("") if head == "crate" else here
            if tail:
                return resolve_candidate(path_map, root_id, anchor / Path(tail.replace("::", "/")))
        if head == "super":
            anchor = here.parent
            if tail:
                return resolve_candidate(path_map, root_id, anchor / Path(tail.replace("::", "/")))
        # Unresolvable `use` paths fall through; no later rule matches them.

    if kind.endswith("-include") and import_record.get("is_relative"):
        return resolve_candidate(path_map, root_id, here / spec)

    if kind in {"java-import", "kotlin-import", "csharp-using"}:
        # rstrip drops a trailing wildcard such as ".*" before mapping dots.
        return resolve_candidate(path_map, root_id, Path(spec.rstrip(".*").replace(".", "/")))

    if kind in {"go-import", "swift-import"}:
        return resolve_candidate(path_map, root_id, Path(spec.replace(".", "/")))

    return None
|
|
677
|
+
|
|
678
|
+
|
|
679
|
+
def build_file_segments(
    file_record: dict[str, Any],
    text: str,
    max_batch_tokens: int,
    large_file_words: int,
    max_file_segments: int,
) -> list[dict[str, Any]]:
    """Cut an oversized file into consecutive line-span segments.

    A file is oversized when its recorded ``estimated_tokens`` exceeds
    ``max_batch_tokens`` or its recorded ``words`` exceeds
    ``large_file_words``; any other file yields an empty list. Lines are
    packed greedily: a segment is closed just before the line that would push
    its estimated token total past ``max_batch_tokens``. Segmentation stops
    once ``max_file_segments`` segments exist, so later lines are not covered.

    Returns:
        Segment records with ``segment_id``, ``file_id``, ``ordinal``,
        ``start_line``, ``end_line``, ``metrics`` and ``reason``.
    """
    file_metrics = file_record["metrics"]
    over_tokens = file_metrics["estimated_tokens"] > max_batch_tokens
    over_words = file_metrics["words"] > large_file_words
    if not over_tokens and not over_words:
        return []

    # The word-count reason wins when both thresholds are exceeded.
    if over_words:
        reason = "large-file-word-count"
    else:
        reason = "large-file-token-count"

    pieces = text.splitlines(keepends=True)
    if text and not pieces:
        pieces = [text]

    segments: list[dict[str, Any]] = []
    pending: list[str] = []
    pending_metrics = empty_metrics()
    first_line = 1
    cursor = 1  # line number the next piece will start on

    def close_segment(last_line: int) -> None:
        """Record the pending lines as one segment and reset the accumulator."""
        nonlocal pending, pending_metrics, first_line
        if len(segments) >= max_file_segments or not pending:
            return
        position = len(segments) + 1
        record = {
            "segment_id": f"segment-{file_record['file_id']}-{position:04d}",
            "file_id": file_record["file_id"],
            "ordinal": position,
            "start_line": first_line,
            "end_line": max(first_line, last_line),
            "metrics": dict(pending_metrics),
            "reason": reason,
        }
        segments.append(record)
        pending = []
        pending_metrics = empty_metrics()
        first_line = last_line + 1

    for piece in pieces:
        piece_metrics = metrics_for_text_fragment(piece)
        # Only close when something is pending, so an over-budget single line
        # still ends up in a segment of its own.
        if pending and pending_metrics["estimated_tokens"] + piece_metrics["estimated_tokens"] > max_batch_tokens:
            close_segment(cursor - 1)
        if not pending:
            first_line = cursor
        pending.append(piece)
        add_metrics(pending_metrics, piece_metrics)
        cursor += max(piece_metrics["lines"], 1)
        if len(segments) >= max_file_segments:
            break

    if pending and len(segments) < max_file_segments:
        close_segment(cursor - 1)

    return segments
|
|
736
|
+
|
|
737
|
+
|
|
738
|
+
def collect_files(
    args: argparse.Namespace, roots: list[dict[str, str]]
) -> tuple[list[dict[str, Any]], list[dict[str, Any]], list[dict[str, str]], dict[str, int]]:
    """Walk every source root and index the readable text files found there.

    Directories named in ``DEFAULT_IGNORE_DIRS`` or ``args.ignore_dir`` are
    pruned from the walk. A file is skipped, and reported through
    ``add_skipped``, when it resolves outside its root, cannot be stat'ed or
    read, would exceed ``args.max_files``, ``args.max_file_bytes`` or
    ``args.max_total_bytes``, or contains a NUL byte. Remaining files are
    decoded as UTF-8 with replacement characters, scanned, measured and
    passed to ``build_file_segments``.

    Returns:
        ``(files, file_segments, skipped_entries, counters)``; ``counters``
        is created with the keys ``skipped_count`` and ``total_bytes``.
    """
    excluded_dirs = set(DEFAULT_IGNORE_DIRS) | set(args.ignore_dir)
    files: list[dict[str, Any]] = []
    file_segments: list[dict[str, Any]] = []
    skipped_entries: list[dict[str, str]] = []
    counters = {"skipped_count": 0, "total_bytes": 0}
    sequence = 1

    def skip(rel: str, why: str, entry_kind: str) -> None:
        """Report one skipped entry through the shared bookkeeping helper."""
        add_skipped(skipped_entries, counters, rel, why, entry_kind)

    for root in roots:
        root_path = Path(root["path"])
        for walk_dir, dirnames, filenames in os.walk(root_path):
            here = Path(walk_dir)

            # Rewrite dirnames in place so os.walk never descends into
            # ignored directories (and visits the rest in sorted order).
            survivors: list[str] = []
            for dirname in sorted(dirnames):
                if dirname in excluded_dirs:
                    skip(
                        normalized_relative((here / dirname).relative_to(root_path)),
                        "ignored-directory",
                        "directory",
                    )
                else:
                    survivors.append(dirname)
            dirnames[:] = survivors

            for filename in sorted(filenames):
                source_path = here / filename
                try:
                    resolved = source_path.resolve()
                    if resolved != root_path and root_path not in resolved.parents:
                        skip(
                            normalized_relative(source_path.relative_to(root_path)),
                            "symlink-outside-root",
                            "file",
                        )
                        continue
                    stat = source_path.stat()
                except OSError as exc:
                    skip(
                        normalized_relative(source_path.relative_to(root_path)),
                        f"stat-error:{exc.__class__.__name__}",
                        "file",
                    )
                    continue

                rel = normalized_relative(source_path.relative_to(root_path))

                # Limits are checked in a fixed order; the first one hit
                # decides the recorded skip reason.
                if len(files) >= args.max_files:
                    limit_hit = "file-count-limit"
                elif stat.st_size > args.max_file_bytes:
                    limit_hit = "file-byte-limit"
                elif counters["total_bytes"] + stat.st_size > args.max_total_bytes:
                    limit_hit = "total-byte-limit"
                else:
                    limit_hit = None
                if limit_hit is not None:
                    skip(rel, limit_hit, "file")
                    continue

                try:
                    data = source_path.read_bytes()
                except OSError as exc:
                    skip(rel, f"read-error:{exc.__class__.__name__}", "file")
                    continue
                if b"\0" in data:
                    # A NUL byte is treated as the marker of a binary file.
                    skip(rel, "binary-file", "file")
                    continue

                text = data.decode("utf-8", errors="replace")
                language = language_for_path(source_path)
                scanner, imports, exports = scan_file(language, text)
                file_record = {
                    "file_id": f"file-{sequence:06d}",
                    "root_id": root["root_id"],
                    "path": rel,
                    "language": language,
                    "scanner": scanner,
                    "metrics": metrics_for_text(data, text),
                    "imports": imports,
                    "exports": exports,
                }
                files.append(file_record)
                new_segments = build_file_segments(
                    file_record,
                    text,
                    args.max_batch_tokens,
                    args.large_file_words,
                    args.max_file_segments,
                )
                file_segments.extend(new_segments)
                sequence += 1
                counters["total_bytes"] += len(data)
    return files, file_segments, skipped_entries, counters
|
|
823
|
+
|
|
824
|
+
|
|
825
|
+
def resolve_relationships(files: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Resolve every recorded import and return one relationship per import.

    All files are first registered with ``add_path_aliases`` so that imports
    can be resolved against the complete set. Each import record is then
    updated in place with a ``resolved_file_id`` entry holding whatever
    ``resolve_import`` returned.

    Returns:
        Relationship records with ``from_file_id``, ``to_file_id``,
        ``specifier`` and ``kind``, in file order then import order.
    """
    alias_index: dict[str, str] = {}
    for entry in files:
        add_path_aliases(
            alias_index,
            entry["root_id"],
            Path(entry["path"]),
            entry["file_id"],
        )

    edges: list[dict[str, Any]] = []
    for entry in files:
        for import_record in entry["imports"]:
            target = resolve_import(alias_index, entry, import_record)
            import_record["resolved_file_id"] = target
            edge = {
                "from_file_id": entry["file_id"],
                "to_file_id": target,
                "specifier": import_record["specifier"],
                "kind": import_record["kind"],
            }
            edges.append(edge)
    return edges
|
|
844
|
+
|
|
845
|
+
|
|
846
|
+
def metric_sum(files: list[dict[str, Any]], file_ids: list[str]) -> dict[str, int]:
    """Accumulate the ``metrics`` of the listed files into a fresh metrics dict.

    Raises:
        KeyError: If an id in ``file_ids`` does not appear in ``files``.
    """
    lookup = {entry["file_id"]: entry for entry in files}
    running = empty_metrics()
    for wanted in file_ids:
        add_metrics(running, lookup[wanted]["metrics"])
    return running
|
|
853
|
+
|
|
854
|
+
|
|
855
|
+
def segment_metric_sum(file_segments: list[dict[str, Any]], segment_ids: list[str]) -> dict[str, int]:
    """Accumulate the ``metrics`` of the listed segments into a fresh metrics dict.

    Raises:
        KeyError: If an id in ``segment_ids`` does not appear in ``file_segments``.
    """
    lookup = {entry["segment_id"]: entry for entry in file_segments}
    running = empty_metrics()
    for wanted in segment_ids:
        segment_metrics = lookup[wanted]["metrics"]
        add_metrics(running, segment_metrics)
    return running
|
|
861
|
+
|
|
862
|
+
|
|
863
|
+
def language_counts(files: list[dict[str, Any]], file_ids: list[str]) -> dict[str, int]:
    """Count the listed files per language, with keys in sorted order.

    Each file's ``language`` value is converted with ``str`` before counting.

    Raises:
        KeyError: If an id in ``file_ids`` does not appear in ``files``.
    """
    lookup = {entry["file_id"]: entry for entry in files}
    tally: dict[str, int] = {}
    for wanted in file_ids:
        label = str(lookup[wanted]["language"])
        tally[label] = tally.get(label, 0) + 1
    return {label: tally[label] for label in sorted(tally)}
|
|
869
|
+
|
|
870
|
+
|
|
871
|
+
def build_groups(files: list[dict[str, Any]], relationships: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Partition the indexed files into dependency components and directory clusters.

    Files joined by at least one relationship whose ``to_file_id`` names an
    indexed file are grouped by connected component (edges are treated as
    undirected). Every remaining file is grouped with the other remaining
    files that share its parent directory.

    Returns:
        Group records with ``group_id``, ``reason``, ``file_ids``,
        ``metrics``, ``language_counts``, ``relationship_count`` and
        ``notes``. Dependency components come first, in file order, followed
        by directory clusters in sorted directory order.
    """
    file_ids = [file_record["file_id"] for file_record in files]
    adjacency: dict[str, set[str]] = {file_id: set() for file_id in file_ids}
    related: set[str] = set()
    for relationship in relationships:
        to_file_id = relationship.get("to_file_id")
        if isinstance(to_file_id, str) and to_file_id in adjacency:
            from_file_id = relationship["from_file_id"]
            adjacency[from_file_id].add(to_file_id)
            adjacency[to_file_id].add(from_file_id)
            related.add(from_file_id)
            related.add(to_file_id)

    # Breadth-first search over the undirected import graph, one component
    # per not-yet-visited file that takes part in a resolved relationship.
    visited: set[str] = set()
    raw_groups: list[tuple[str, list[str], str]] = []
    for file_id in file_ids:
        if file_id in visited or file_id not in related:
            continue
        queue: deque[str] = deque([file_id])
        visited.add(file_id)
        component: list[str] = []
        while queue:
            current = queue.popleft()
            component.append(current)
            for neighbor in sorted(adjacency[current]):
                if neighbor not in visited:
                    visited.add(neighbor)
                    queue.append(neighbor)
        raw_groups.append(("dependency-component", sorted(component), "Files connected by resolved local imports."))

    # Files without any resolved relationship fall back to directory clusters.
    by_directory: dict[str, list[str]] = defaultdict(list)
    file_by_id = {file_record["file_id"]: file_record for file_record in files}
    for file_id in file_ids:
        if file_id in visited:
            continue
        directory = str(Path(file_by_id[file_id]["path"]).parent)
        by_directory[directory].append(file_id)
    for directory in sorted(by_directory):
        raw_groups.append(
            (
                "directory-cluster",
                sorted(by_directory[directory]),
                f"Files grouped by directory fallback: {directory}.",
            )
        )

    # Each file belongs to exactly one raw group, so one pass over the
    # relationships attributes every intra-group edge. This replaces a full
    # rescan of the relationships for every group.
    group_position: dict[str, int] = {}
    for position, (_, group_file_ids, _) in enumerate(raw_groups):
        for file_id in group_file_ids:
            group_position[file_id] = position
    relationship_totals = [0] * len(raw_groups)
    for relationship in relationships:
        source_position = group_position.get(relationship["from_file_id"])
        if source_position is None:
            continue
        # Unresolved targets (None) and targets in another group do not count.
        if group_position.get(relationship.get("to_file_id")) == source_position:
            relationship_totals[source_position] += 1

    groups: list[dict[str, Any]] = []
    for index, (reason, group_file_ids, note) in enumerate(raw_groups, start=1):
        groups.append(
            {
                "group_id": f"group-{index:04d}",
                "reason": reason,
                "file_ids": group_file_ids,
                "metrics": metric_sum(files, group_file_ids),
                "language_counts": language_counts(files, group_file_ids),
                "relationship_count": relationship_totals[index - 1],
                "notes": note,
            }
        )
    return groups
|
|
937
|
+
|
|
938
|
+
|
|
939
|
+
def split_files_for_batch(
    files: list[dict[str, Any]], file_ids: list[str], max_batch_tokens: int
) -> list[list[str]]:
    """Greedily pack the listed files into chunks bounded by ``max_batch_tokens``.

    Files are taken in ``(path, file_id)`` order. A chunk is closed as soon
    as adding the next file would push its estimated token total past the
    budget; a file that is over budget on its own still gets a chunk.

    Returns:
        Chunks of file ids, each non-empty, in processing order.
    """
    lookup = {entry["file_id"]: entry for entry in files}
    ordered = sorted(file_ids, key=lambda fid: (lookup[fid]["path"], fid))

    chunks: list[list[str]] = []
    bucket: list[str] = []
    bucket_tokens = 0
    for fid in ordered:
        cost = lookup[fid]["metrics"]["estimated_tokens"]
        if bucket and bucket_tokens + cost > max_batch_tokens:
            chunks.append(bucket)
            bucket, bucket_tokens = [], 0
        bucket.append(fid)
        bucket_tokens += cost
    if bucket:
        chunks.append(bucket)
    return chunks
|
|
957
|
+
|
|
958
|
+
|
|
959
|
+
def build_batches(
    files: list[dict[str, Any]],
    file_segments: list[dict[str, Any]],
    groups: list[dict[str, Any]],
    max_batch_tokens: int,
) -> list[dict[str, Any]]:
    """Turn groups into recommended batches bounded by ``max_batch_tokens``.

    Groups that fit the budget and contain no segmented file are queued and
    emitted together until the next one would overflow the budget. A group
    that is over budget or contains a segmented file first flushes the
    queue. It is then split with ``split_files_for_batch``; within each
    chunk, every segmented file becomes one batch per segment and runs of
    unsegmented files become regular batches.

    Returns:
        Batch records with ``batch_id``, ``group_ids``, ``file_ids``,
        ``segment_ids``, ``metrics``, ``language_counts`` and ``notes``.
    """
    batches: list[dict[str, Any]] = []
    segments_by_file: dict[str, list[dict[str, Any]]] = defaultdict(list)
    for segment in file_segments:
        segments_by_file[segment["file_id"]].append(segment)

    queued_group_ids: list[str] = []
    queued_file_ids: list[str] = []
    queued_tokens = 0

    def emit_queued() -> None:
        """Emit the queued groups as one batch; do nothing without queued files."""
        nonlocal queued_group_ids, queued_file_ids, queued_tokens
        if not queued_file_ids:
            return
        batches.append(
            {
                "batch_id": f"batch-{len(batches) + 1:04d}",
                "group_ids": queued_group_ids,
                "file_ids": queued_file_ids,
                "segment_ids": [],
                "metrics": metric_sum(files, queued_file_ids),
                "language_counts": language_counts(files, queued_file_ids),
                "notes": f"Fits max_batch_tokens {max_batch_tokens}.",
            }
        )
        queued_group_ids, queued_file_ids, queued_tokens = [], [], 0

    def emit_files(group_id: str, chunk_file_ids: list[str], note: str) -> None:
        """Emit one batch made of whole, unsegmented files from a split group."""
        batches.append(
            {
                "batch_id": f"batch-{len(batches) + 1:04d}",
                "group_ids": [group_id],
                "file_ids": chunk_file_ids,
                "segment_ids": [],
                "metrics": metric_sum(files, chunk_file_ids),
                "language_counts": language_counts(files, chunk_file_ids),
                "notes": note,
            }
        )

    def emit_segments(group_id: str, file_id: str) -> None:
        """Emit one batch per segment of a segmented file."""
        for segment in segments_by_file[file_id]:
            segment_ids = [segment["segment_id"]]
            batches.append(
                {
                    "batch_id": f"batch-{len(batches) + 1:04d}",
                    "group_ids": [group_id],
                    "file_ids": [file_id],
                    "segment_ids": segment_ids,
                    "metrics": segment_metric_sum(file_segments, segment_ids),
                    "language_counts": language_counts(files, [file_id]),
                    "notes": (
                        f"Split large {file_id} from {group_id} by line spans "
                        f"to respect max_batch_tokens {max_batch_tokens}."
                    ),
                }
            )

    for group in groups:
        group_tokens = group["metrics"]["estimated_tokens"]
        segmented = any(fid in segments_by_file for fid in group["file_ids"])
        oversized = group_tokens > max_batch_tokens

        if not oversized and not segmented:
            # Small group: queue it, flushing first if it would overflow.
            if queued_file_ids and queued_tokens + group_tokens > max_batch_tokens:
                emit_queued()
            queued_group_ids.append(group["group_id"])
            queued_file_ids.extend(group["file_ids"])
            queued_tokens += group_tokens
            continue

        # Large or segmented group: keep it out of the shared queue.
        emit_queued()
        group_id = group["group_id"]
        if oversized:
            split_note = f"Split large {group_id} to respect max_batch_tokens {max_batch_tokens}."
        else:
            split_note = f"Split {group_id} to preserve large-file segment boundaries."

        for chunk in split_files_for_batch(files, group["file_ids"], max_batch_tokens):
            plain: list[str] = []
            for fid in chunk:
                if fid in segments_by_file:
                    # Flush the unsegmented run first so batch order follows
                    # the chunk's file order.
                    if plain:
                        emit_files(group_id, plain, split_note)
                        plain = []
                    emit_segments(group_id, fid)
                else:
                    plain.append(fid)
            if plain:
                emit_files(group_id, plain, split_note)

    emit_queued()
    return batches
|
|
1057
|
+
|
|
1058
|
+
|
|
1059
|
+
def build_large_items(
    files: list[dict[str, Any]],
    groups: list[dict[str, Any]],
    batches: list[dict[str, Any]],
    large_file_words: int,
    large_group_words: int,
    max_batch_tokens: int,
) -> list[dict[str, Any]]:
    """List every file, group and batch that exceeds a size threshold.

    An item is reported with reason ``word-count-threshold`` when its
    ``words`` metric exceeds the applicable word limit, otherwise with
    ``token-threshold`` when its ``estimated_tokens`` exceeds
    ``max_batch_tokens``. Files use ``large_file_words``; groups and batches
    both use ``large_group_words``.

    Returns:
        Records with ``item_id``, ``kind``, ``metrics``, ``reason`` and
        ``notes``: files first, then groups, then batches.
    """

    def classify(metrics, word_limit):
        """Return the threshold reason for ``metrics``, or None when within limits."""
        if metrics["words"] > word_limit:
            return "word-count-threshold"
        if metrics["estimated_tokens"] > max_batch_tokens:
            return "token-threshold"
        return None

    # (records, id key, kind label, word limit, advisory note) per category.
    categories = (
        (
            files,
            "file_id",
            "file",
            large_file_words,
            "Large source file should be assigned through file_segments or a narrow unit.",
        ),
        (
            groups,
            "group_id",
            "group",
            large_group_words,
            "Large dependency group should be decomposed through recommended_batches.",
        ),
        (
            batches,
            "batch_id",
            "batch",
            large_group_words,
            "Batch is still large; controller should narrow the unit before source analysis.",
        ),
    )

    large_items: list[dict[str, Any]] = []
    for records, id_key, kind, word_limit, note in categories:
        for record in records:
            reason = classify(record["metrics"], word_limit)
            if not reason:
                continue
            large_items.append(
                {
                    "item_id": record[id_key],
                    "kind": kind,
                    "metrics": record["metrics"],
                    "reason": reason,
                    "notes": note,
                }
            )
    return large_items
|
|
1113
|
+
|
|
1114
|
+
|
|
1115
|
+
def aggregate_metrics(
    files: list[dict[str, Any]],
    file_segments: list[dict[str, Any]],
    relationships: list[dict[str, Any]],
    groups: list[dict[str, Any]],
    batches: list[dict[str, Any]],
    large_items: list[dict[str, Any]],
    skipped_count: int,
    skipped_entries: list[dict[str, str]],
) -> dict[str, int | bool]:
    """Build the index-wide summary: summed file metrics plus collection counts.

    ``skipped_entries_truncated`` is true when ``skipped_count`` is larger
    than the number of skipped entries actually listed, and
    ``resolved_relationship_count`` counts relationships with a truthy
    ``to_file_id``.
    """
    file_ids = [entry["file_id"] for entry in files]
    if file_ids:
        totals = metric_sum(files, file_ids)
    else:
        totals = empty_metrics()

    resolved_total = 0
    for item in relationships:
        if item.get("to_file_id"):
            resolved_total += 1

    totals["file_count"] = len(files)
    totals["file_segment_count"] = len(file_segments)
    totals["skipped_count"] = skipped_count
    totals["skipped_entries_truncated"] = skipped_count > len(skipped_entries)
    totals["relationship_count"] = len(relationships)
    totals["resolved_relationship_count"] = resolved_total
    totals["group_count"] = len(groups)
    totals["batch_count"] = len(batches)
    totals["large_item_count"] = len(large_items)
    return totals
|
|
1141
|
+
|
|
1142
|
+
|
|
1143
|
+
def atomic_write_json(path: Path, data: dict[str, Any]) -> None:
    """Serialize ``data`` as indented JSON and move it into place at ``path``.

    The payload is written to a temporary file in the destination directory
    (created if missing) and then swapped in with a single ``os.replace``
    call, so ``path`` is never left holding a half-written payload.

    Args:
        path: Destination file; an existing file is replaced.
        data: JSON-serializable mapping to write.

    Raises:
        OSError: If the temporary file cannot be written or moved into
            place. The temporary file is removed before the error propagates.
        TypeError: If ``data`` is not JSON-serializable (raised before any
            file is created).
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    encoded = json.dumps(data, indent=2, sort_keys=False) + "\n"
    tmp_path = None
    try:
        with tempfile.NamedTemporaryFile("w", encoding="utf-8", dir=path.parent, delete=False) as handle:
            tmp_path = Path(handle.name)
            handle.write(encoded)
        os.replace(tmp_path, path)
    except BaseException:
        # Clean up on any failure, including a failed write, so no stray
        # temporary file is left next to the destination.
        if tmp_path is not None:
            try:
                tmp_path.unlink()
            except OSError:
                # Best effort: a cleanup failure must not mask the error
                # that actually caused the write to fail.
                pass
        raise
|
|
1156
|
+
|
|
1157
|
+
|
|
1158
|
+
def main() -> int:
    """Build the source index for the parsed command-line arguments and write it.

    Every numeric limit must be at least 1; the first one that is not aborts
    with ``SystemExit`` and a message naming its flag. Files are then
    collected, imports resolved, groups, batches and large items derived,
    and the result written through ``atomic_write_json``.

    Returns:
        ``0`` once the index has been written.
    """
    args = parse_args()

    # Checked in this order so the first offending flag is the one reported.
    minimums = (
        ("--max-files", args.max_files),
        ("--max-file-bytes", args.max_file_bytes),
        ("--max-total-bytes", args.max_total_bytes),
        ("--max-batch-tokens", args.max_batch_tokens),
        ("--large-file-words", args.large_file_words),
        ("--large-group-words", args.large_group_words),
        ("--max-file-segments", args.max_file_segments),
    )
    for flag, value in minimums:
        if value < 1:
            raise SystemExit(f"{flag} must be at least 1")

    output_path = checked_output_path(args)
    roots = source_roots(args.source_root)
    files, file_segments, skipped_entries, counters = collect_files(args, roots)
    relationships = resolve_relationships(files)
    groups = build_groups(files, relationships)
    batches = build_batches(files, file_segments, groups, args.max_batch_tokens)
    large_items = build_large_items(
        files,
        groups,
        batches,
        args.large_file_words,
        args.large_group_words,
        args.max_batch_tokens,
    )
    now = utc_now()

    generator = {
        "name": "build_source_index.py",
        "version": "1",
        "python_version": platform.python_version(),
        "scanner_modes": [
            "python-ast",
            "javascript-typescript-scanner",
            "go-scanner",
            "rust-scanner",
            "java-scanner",
            "kotlin-scanner",
            "swift-scanner",
            "c-scanner",
            "cpp-scanner",
            "csharp-scanner",
            "text-metrics",
        ],
    }
    limits = {
        "max_files": args.max_files,
        "max_file_bytes": args.max_file_bytes,
        "max_total_bytes": args.max_total_bytes,
        "max_batch_tokens": args.max_batch_tokens,
        "large_file_words": args.large_file_words,
        "large_group_words": args.large_group_words,
        "max_file_segments": args.max_file_segments,
        "ignore_dirs": sorted(set(DEFAULT_IGNORE_DIRS) | set(args.ignore_dir)),
    }
    if args.skip_tool_detection:
        dependency_report = None
    else:
        dependency_report = clean_room_tooling.dependency_report(
            args.allow_working_project_tools, args.probe_tools
        )
    summary = aggregate_metrics(
        files,
        file_segments,
        relationships,
        groups,
        batches,
        large_items,
        counters["skipped_count"],
        skipped_entries,
    )

    # Key order is the order the fields appear in the written JSON.
    output = {
        "index_id": f"source-index-{args.task_id}",
        "task_id": args.task_id,
        "created_at": now,
        "created_by_role": "controller-preflight",
        "domain": "contaminated",
        "generator": generator,
        "limits": limits,
        "dependency_report": dependency_report,
        "source_roots": roots,
        "files": files,
        "file_segments": file_segments,
        "relationships": relationships,
        "groups": groups,
        "recommended_batches": batches,
        "large_items": large_items,
        "skipped_entries": skipped_entries,
        "aggregate_metrics": summary,
    }
    atomic_write_json(output_path, output)
    return 0
|
|
1250
|
+
|
|
1251
|
+
|
|
1252
|
+
if __name__ == "__main__":
    # Run the indexer and hand main()'s return value to the interpreter as
    # the process exit status.
    exit_code = main()
    raise SystemExit(exit_code)
|