sourcecode 0.37.0__py3-none-any.whl → 0.38.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sourcecode/__init__.py +1 -1
- sourcecode/adaptive_scanner.py +258 -0
- sourcecode/cli.py +17 -10
- sourcecode/repo_classifier.py +570 -0
- sourcecode/schema.py +1 -0
- sourcecode/serializer.py +5 -0
- {sourcecode-0.37.0.dist-info → sourcecode-0.38.0.dist-info}/METADATA +1 -1
- {sourcecode-0.37.0.dist-info → sourcecode-0.38.0.dist-info}/RECORD +11 -9
- {sourcecode-0.37.0.dist-info → sourcecode-0.38.0.dist-info}/WHEEL +0 -0
- {sourcecode-0.37.0.dist-info → sourcecode-0.38.0.dist-info}/entry_points.txt +0 -0
- {sourcecode-0.37.0.dist-info → sourcecode-0.38.0.dist-info}/licenses/LICENSE +0 -0
sourcecode/__init__.py
CHANGED
|
@@ -0,0 +1,258 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
"""Adaptive file tree scanner with topology-aware depth budgets.
|
|
4
|
+
|
|
5
|
+
Replaces pure depth filtering with relevance-oriented traversal:
|
|
6
|
+
- Source roots (packages/*/src, apps/*/src) get deep scan budgets.
|
|
7
|
+
- Low-signal directories (docs/, benchmarks/) are limited to 2 levels.
|
|
8
|
+
- Generated/excluded directories (dist/, node_modules/) are skipped.
|
|
9
|
+
- Unclassified directories fall back to the base depth limit.
|
|
10
|
+
|
|
11
|
+
Drop-in replacement for FileScanner: same scan_tree() and find_manifests()
|
|
12
|
+
interface, same output format (None = file, dict = directory).
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
import os
|
|
16
|
+
from pathlib import Path
|
|
17
|
+
from typing import Any, Optional, cast
|
|
18
|
+
|
|
19
|
+
from pathspec import GitIgnoreSpec
|
|
20
|
+
|
|
21
|
+
from sourcecode.repo_classifier import RepoTopology
|
|
22
|
+
from sourcecode.scanner import DEFAULT_EXCLUDES, MANIFEST_NAMES
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class AdaptiveScanner:
|
|
26
|
+
"""File tree scanner driven by repository topology.
|
|
27
|
+
|
|
28
|
+
When *topology* is provided, traversal depth is controlled per-directory:
|
|
29
|
+
directories inside source roots receive a deep budget; low-signal dirs
|
|
30
|
+
are restricted; generated dirs are excluded entirely.
|
|
31
|
+
|
|
32
|
+
When *topology* is None, falls back to the base depth limit — identical
|
|
33
|
+
behaviour to FileScanner.
|
|
34
|
+
"""
|
|
35
|
+
|
|
36
|
+
def __init__(
    self,
    root: Path,
    topology: Optional[RepoTopology] = None,
    base_depth: int = 4,
    extra_excludes: Optional[frozenset[str]] = None,
) -> None:
    """Prepare a scanner rooted at *root*.

    Args:
        root: Directory to scan; stored resolved.
        topology: Optional classification used to derive per-directory
            depth caps. Without it every directory uses *base_depth*.
        base_depth: Depth cap for directories with no classification.
        extra_excludes: Directory names skipped in addition to
            DEFAULT_EXCLUDES.
    """
    self.root = root.resolve()
    self.topology = topology
    self.base_depth = base_depth
    self._excludes = DEFAULT_EXCLUDES | (extra_excludes or frozenset())
    self._gitignore_spec: Optional[GitIgnoreSpec] = None

    # Lookup tables of (path components, absolute depth cap). The cap is
    # counted from the scan root: len(components) plus the budget granted
    # to that kind of directory.
    self._source_prefixes: list[tuple[tuple[str, ...], int]] = []
    self._low_signal_prefixes: list[tuple[tuple[str, ...], int]] = []
    self._extra_exclude_names: frozenset[str] = frozenset()

    if topology is None:
        return

    limits = topology.scan_budget
    for table, roots, extra_levels in (
        (self._source_prefixes, topology.source_roots, limits.source_depth),
        (self._low_signal_prefixes, topology.low_signal_roots, limits.low_signal_depth),
    ):
        for entry in roots:
            components = tuple(filter(None, entry.path.split("/")))
            if components:
                table.append((components, len(components) + extra_levels))

    # Generated roots that sit directly under the scan root are excluded by
    # name so os.walk never descends into them.
    self._extra_exclude_names = frozenset(
        entry.path.split("/")[0]
        for entry in topology.generated_roots
        if "/" not in entry.path
    )
|
|
84
|
+
|
|
85
|
+
# ------------------------------------------------------------------
|
|
86
|
+
# Gitignore
|
|
87
|
+
# ------------------------------------------------------------------
|
|
88
|
+
|
|
89
|
+
def _load_gitignore_spec(self) -> GitIgnoreSpec:
    """Return the spec built from ``<root>/.gitignore``, building it once.

    A missing or unreadable file yields a spec with no rules.
    """
    cached = self._gitignore_spec
    if cached is not None:
        return cached

    rules: list[str] = []
    ignore_file = self.root / ".gitignore"
    if ignore_file.exists():
        try:
            text = ignore_file.read_text(encoding="utf-8", errors="replace")
        except OSError:
            text = ""  # unreadable file: behave as if it were empty
        rules = text.splitlines()

    self._gitignore_spec = GitIgnoreSpec.from_lines(rules)
    return self._gitignore_spec
|
|
100
|
+
|
|
101
|
+
def _is_excluded_by_gitignore(self, rel_path: str, is_dir: bool) -> bool:
|
|
102
|
+
spec = self._load_gitignore_spec()
|
|
103
|
+
path_to_match = rel_path + "/" if is_dir else rel_path
|
|
104
|
+
return spec.match_file(path_to_match)
|
|
105
|
+
|
|
106
|
+
# ------------------------------------------------------------------
|
|
107
|
+
# Depth budget computation — the core of adaptive traversal
|
|
108
|
+
# ------------------------------------------------------------------
|
|
109
|
+
|
|
110
|
+
def _compute_max_depth(self, rel_parts: tuple[str, ...]) -> int:
|
|
111
|
+
"""Return the maximum absolute depth allowed at *rel_parts*.
|
|
112
|
+
|
|
113
|
+
Depth is the number of path components from the repo root. Files
|
|
114
|
+
at depth D are included; the scan stops (dirnames cleared) when
|
|
115
|
+
depth >= returned value.
|
|
116
|
+
|
|
117
|
+
Priority order:
|
|
118
|
+
1. Inside a source root → deep budget (source_depth extra levels)
|
|
119
|
+
2. Ancestor of a source root → must allow traversal to reach it
|
|
120
|
+
3. Inside a low-signal root → restricted budget (low_signal_depth)
|
|
121
|
+
4. Default → base_depth
|
|
122
|
+
"""
|
|
123
|
+
if not self._source_prefixes and not self._low_signal_prefixes:
|
|
124
|
+
return self.base_depth
|
|
125
|
+
|
|
126
|
+
current_depth = len(rel_parts)
|
|
127
|
+
|
|
128
|
+
# Track the best depth found via ancestor matching (may have multiple
|
|
129
|
+
# source roots; return the maximum so all are reachable).
|
|
130
|
+
ancestor_best = self.base_depth
|
|
131
|
+
found_ancestor = False
|
|
132
|
+
|
|
133
|
+
for src_parts, src_max in self._source_prefixes:
|
|
134
|
+
n = len(src_parts)
|
|
135
|
+
if current_depth >= n:
|
|
136
|
+
# At or inside the source root
|
|
137
|
+
if rel_parts[:n] == src_parts:
|
|
138
|
+
return src_max # definite source territory — early exit
|
|
139
|
+
else:
|
|
140
|
+
# Ancestor check: src_parts starts with rel_parts?
|
|
141
|
+
if src_parts[:current_depth] == rel_parts:
|
|
142
|
+
found_ancestor = True
|
|
143
|
+
if src_max > ancestor_best:
|
|
144
|
+
ancestor_best = src_max
|
|
145
|
+
|
|
146
|
+
if found_ancestor:
|
|
147
|
+
return ancestor_best
|
|
148
|
+
|
|
149
|
+
# Low-signal roots (only if not already committed to a source path)
|
|
150
|
+
for ls_parts, ls_max in self._low_signal_prefixes:
|
|
151
|
+
n = len(ls_parts)
|
|
152
|
+
if current_depth >= n and rel_parts[:n] == ls_parts:
|
|
153
|
+
return ls_max
|
|
154
|
+
|
|
155
|
+
return self.base_depth
|
|
156
|
+
|
|
157
|
+
# ------------------------------------------------------------------
|
|
158
|
+
# Main traversal
|
|
159
|
+
# ------------------------------------------------------------------
|
|
160
|
+
|
|
161
|
+
def scan_tree(self) -> dict[str, Any]:
    """Walk the root and return the nested tree of accepted entries.

    In the result a file maps to ``None`` and a directory maps to a dict.
    Each directory is checked against the depth cap reported by
    ``_compute_max_depth``; a directory at or beyond its cap contributes
    no files and is not descended into.
    """
    self._load_gitignore_spec()
    tree: dict[str, Any] = {}
    skip_names = self._excludes | self._extra_exclude_names

    for dirpath, dirnames, filenames in os.walk(self.root, followlinks=False):
        here = Path(dirpath)
        try:
            rel = here.relative_to(self.root)
        except ValueError:
            continue

        parts = rel.parts
        at_root = not parts

        if len(parts) >= self._compute_max_depth(parts):
            dirnames.clear()
            continue

        # os.walk only honours pruning done in place, hence the slice
        # assignment below rather than rebinding the name.
        kept: list[str] = []
        for sub in dirnames:
            if sub in skip_names:
                continue
            if (here / sub).is_symlink():
                continue
            if self._is_excluded_by_gitignore(
                sub if at_root else str(rel / sub),
                is_dir=True,
            ):
                continue
            kept.append(sub)
        dirnames[:] = kept

        node = self._get_or_create_node(tree, parts)

        for fname in filenames:
            # Names that look like command-line flags are skipped
            # (shell redirect artifacts).
            if fname.startswith("-"):
                continue
            if (here / fname).is_symlink():
                continue
            if self._is_excluded_by_gitignore(
                fname if at_root else str(rel / fname),
                is_dir=False,
            ):
                continue
            node[fname] = None

        # Register accepted sub-directories even if they end up empty.
        for sub in dirnames:
            node.setdefault(sub, {})

    return tree
|
|
218
|
+
|
|
219
|
+
def _get_or_create_node(
|
|
220
|
+
self, tree: dict[str, Any], parts: tuple[str, ...]
|
|
221
|
+
) -> dict[str, Any]:
|
|
222
|
+
node = tree
|
|
223
|
+
for part in parts:
|
|
224
|
+
if part not in node or node[part] is None:
|
|
225
|
+
node[part] = {}
|
|
226
|
+
node = cast(dict[str, Any], node[part])
|
|
227
|
+
return node
|
|
228
|
+
|
|
229
|
+
# ------------------------------------------------------------------
|
|
230
|
+
# Manifest discovery — same interface as FileScanner
|
|
231
|
+
# ------------------------------------------------------------------
|
|
232
|
+
|
|
233
|
+
def find_manifests(self) -> list[str]:
    """Collect manifest paths in the root and its immediate sub-directories.

    Root-level manifests are listed first. Sub-directories that are
    symlinks, hidden (leading dot) or named in the exclude set are not
    inspected. Symlinked manifest files are ignored.
    """
    found: list[str] = []

    def _collect(directory: Path) -> None:
        # Append every known manifest that exists directly in *directory*.
        for manifest_name in MANIFEST_NAMES:
            path = directory / manifest_name
            if path.exists() and not path.is_symlink():
                found.append(str(path))

    _collect(self.root)

    try:
        for entry in self.root.iterdir():
            if not entry.is_dir() or entry.is_symlink():
                continue
            if entry.name in self._excludes or entry.name.startswith("."):
                continue
            _collect(entry)
    except PermissionError:
        # Best effort: keep whatever was gathered before access was denied.
        pass

    return found
|
sourcecode/cli.py
CHANGED
|
@@ -714,6 +714,13 @@ def main(
|
|
|
714
714
|
# 1. Scan directory (SCAN-01 to SCAN-05)
|
|
715
715
|
redactor = SecretRedactor(enabled=not no_redact)
|
|
716
716
|
|
|
717
|
+
# Classify repository topology before scanning. This is a shallow
|
|
718
|
+
# filesystem read (workspace configs, top-level and workspace-package directories).
|
|
719
|
+
# The topology drives per-directory depth budgets in AdaptiveScanner.
|
|
720
|
+
from sourcecode.adaptive_scanner import AdaptiveScanner
|
|
721
|
+
from sourcecode.repo_classifier import RepoClassifier
|
|
722
|
+
_topology = RepoClassifier().classify(target)
|
|
723
|
+
|
|
717
724
|
# Detect manifests before scan to adjust depth.
|
|
718
725
|
# find_manifests() only looks at depth 0-1, does not need the full tree.
|
|
719
726
|
_pre_scanner = FileScanner(target, max_depth=1)
|
|
@@ -735,7 +742,7 @@ def main(
|
|
|
735
742
|
no_tree = True # agents never need the raw file tree
|
|
736
743
|
typer.echo("[agent] dependencies env-map code-notes (no-tree)", err=True)
|
|
737
744
|
|
|
738
|
-
scanner =
|
|
745
|
+
scanner = AdaptiveScanner(target, topology=_topology, base_depth=effective_depth)
|
|
739
746
|
raw_tree = scanner.scan_tree()
|
|
740
747
|
|
|
741
748
|
# 2. Filter .env and *.secret entries from file tree (SEC-02, all levels)
|
|
@@ -775,16 +782,14 @@ def main(
|
|
|
775
782
|
detector = ProjectDetector(build_default_detectors())
|
|
776
783
|
workspace_analysis = WorkspaceAnalyzer().analyze(target, manifests)
|
|
777
784
|
|
|
778
|
-
#
|
|
779
|
-
#
|
|
780
|
-
# Only emit to TTY to avoid contaminating piped/CI output; agents read analysis_gaps.
|
|
785
|
+
# Adaptive traversal handles monorepo source root discovery automatically.
|
|
786
|
+
# Emit a diagnostic when topology confidence is low so users know why.
|
|
781
787
|
import sys as _sys
|
|
782
|
-
if
|
|
788
|
+
if _topology.workspace_type == "monorepo" and _topology.confidence < 0.5:
|
|
783
789
|
if _sys.stderr.isatty():
|
|
784
790
|
typer.echo(
|
|
785
|
-
|
|
786
|
-
"
|
|
787
|
-
"Use --depth 6 or higher for full coverage.",
|
|
791
|
+
"[traversal] monorepo detected but source root confidence is low "
|
|
792
|
+
f"({_topology.confidence:.0%}). Use --depth 8 or higher if files are missing.",
|
|
788
793
|
err=True,
|
|
789
794
|
)
|
|
790
795
|
|
|
@@ -896,7 +901,8 @@ def main(
|
|
|
896
901
|
workspace_root = target / workspace.path
|
|
897
902
|
if not workspace_root.exists() or not workspace_root.is_dir():
|
|
898
903
|
continue
|
|
899
|
-
|
|
904
|
+
_ws_topology = RepoClassifier().classify(workspace_root)
|
|
905
|
+
workspace_scanner = AdaptiveScanner(workspace_root, topology=_ws_topology, base_depth=depth)
|
|
900
906
|
workspace_tree = filter_sensitive_files(workspace_scanner.scan_tree())
|
|
901
907
|
workspace_manifests = workspace_scanner.find_manifests()
|
|
902
908
|
workspace_stacks, workspace_entry_points, _ = detector.detect(
|
|
@@ -1008,6 +1014,7 @@ def main(
|
|
|
1008
1014
|
metadata = AnalysisMetadata(
|
|
1009
1015
|
analyzed_path=str(target),
|
|
1010
1016
|
analyzer_fingerprints=_fingerprints,
|
|
1017
|
+
traversal_topology=_topology.as_dict(),
|
|
1011
1018
|
)
|
|
1012
1019
|
sm = SourceMap(
|
|
1013
1020
|
metadata=metadata,
|
|
@@ -1037,7 +1044,7 @@ def main(
|
|
|
1037
1044
|
target / ws.path,
|
|
1038
1045
|
(
|
|
1039
1046
|
filter_sensitive_files(
|
|
1040
|
-
|
|
1047
|
+
AdaptiveScanner(target / ws.path, base_depth=depth).scan_tree()
|
|
1041
1048
|
)
|
|
1042
1049
|
),
|
|
1043
1050
|
workspace=ws.path,
|
|
@@ -0,0 +1,570 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
"""Repository topology classifier for adaptive traversal.
|
|
4
|
+
|
|
5
|
+
Detects monorepo vs single-package structure, identifies source roots,
|
|
6
|
+
low-signal directories, and generated content. Feeds AdaptiveScanner
|
|
7
|
+
with per-path depth budgets so traversal is relevance-oriented, not
|
|
8
|
+
purely structural.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
import json
|
|
12
|
+
from dataclasses import dataclass, field
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
from typing import Any, Optional
|
|
15
|
+
|
|
16
|
+
# ---------------------------------------------------------------------------
|
|
17
|
+
# Signal tables
|
|
18
|
+
# ---------------------------------------------------------------------------
|
|
19
|
+
|
|
20
|
+
# Top-level dirs that almost always contain actual source code
|
|
21
|
+
_SOURCE_DIRS: frozenset[str] = frozenset({
|
|
22
|
+
"src", "lib", "source", "sources", "core",
|
|
23
|
+
"app", "server", "client", "backend", "frontend",
|
|
24
|
+
"cmd", "pkg", # Go conventions
|
|
25
|
+
"main", # Java src/main
|
|
26
|
+
"kotlin", "java", "scala", # JVM source dirs
|
|
27
|
+
})
|
|
28
|
+
|
|
29
|
+
# First-level dirs that act as workspace containers in monorepos
|
|
30
|
+
_WORKSPACE_CONTAINERS: frozenset[str] = frozenset({
|
|
31
|
+
"packages", "apps", "libs", "services", "internal",
|
|
32
|
+
"plugins", "modules", "components", "crates",
|
|
33
|
+
"workspaces", "projects",
|
|
34
|
+
})
|
|
35
|
+
|
|
36
|
+
# Directories with low signal value for AI code understanding
|
|
37
|
+
_LOW_SIGNAL_DIRS: frozenset[str] = frozenset({
|
|
38
|
+
"docs", "doc", "documentation", "docsrc", "website", "site",
|
|
39
|
+
"benchmark", "benchmarks", "bench", "perf", "perfs",
|
|
40
|
+
"examples", "example", "demo", "demos", "sample", "samples",
|
|
41
|
+
"fixtures", "fixture", "__fixtures__",
|
|
42
|
+
"scripts", "script", "tools", "tool",
|
|
43
|
+
"ci", ".ci",
|
|
44
|
+
"storybook", "stories", "__stories__",
|
|
45
|
+
"sandbox", "playground", "playgrounds",
|
|
46
|
+
"migrations", "migration",
|
|
47
|
+
".github", ".vscode", ".claude", ".cursor", ".idea",
|
|
48
|
+
"themes", "theme",
|
|
49
|
+
"static", "public", "assets",
|
|
50
|
+
})
|
|
51
|
+
|
|
52
|
+
# Directories to skip entirely — generated content and dependency stores
|
|
53
|
+
_GENERATED_DIRS: frozenset[str] = frozenset({
|
|
54
|
+
"dist", "build", "out", "output", "release", "releases",
|
|
55
|
+
"target", "coverage", ".next", ".nuxt", ".svelte-kit",
|
|
56
|
+
".turbo", "node_modules", "__pycache__",
|
|
57
|
+
".venv", "venv", "env",
|
|
58
|
+
".mypy_cache", ".pytest_cache", ".ruff_cache",
|
|
59
|
+
".nyc_output", ".tox",
|
|
60
|
+
"generated", ".generated", "gen", "_gen",
|
|
61
|
+
".cache", "cache",
|
|
62
|
+
"vendor",
|
|
63
|
+
".git",
|
|
64
|
+
})
|
|
65
|
+
|
|
66
|
+
# Manifest file names that mark a directory as a source package
|
|
67
|
+
_PACKAGE_MANIFESTS: frozenset[str] = frozenset({
|
|
68
|
+
"package.json", "pyproject.toml", "setup.py", "setup.cfg",
|
|
69
|
+
"go.mod", "Cargo.toml", "pom.xml", "build.gradle",
|
|
70
|
+
"build.gradle.kts", "composer.json", "Gemfile", "pubspec.yaml",
|
|
71
|
+
})
|
|
72
|
+
|
|
73
|
+
# Source file extensions — presence signals a directory has real code
|
|
74
|
+
_SOURCE_EXTENSIONS: frozenset[str] = frozenset({
|
|
75
|
+
".ts", ".tsx", ".js", ".jsx", ".mjs", ".cjs",
|
|
76
|
+
".py", ".go", ".rs", ".java", ".kt", ".rb",
|
|
77
|
+
".cs", ".swift", ".scala", ".cpp", ".c", ".h",
|
|
78
|
+
})
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
# ---------------------------------------------------------------------------
|
|
82
|
+
# Data structures
|
|
83
|
+
# ---------------------------------------------------------------------------
|
|
84
|
+
|
|
85
|
+
@dataclass
class SourceRoot:
    """One classified directory.

    Fields:
        path: Repo-relative path written with forward slashes.
        signal: One of "high", "medium", "low" or "excluded".
        reason: Human-readable explanation of the classification.
        priority: Traversal priority between 0.0 and 1.0.
    """

    path: str
    signal: str
    reason: str
    priority: float
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
@dataclass
class ScanBudget:
    """Traversal limits attached to a topology.

    Fields:
        max_files: File-count budget.
        base_depth: Depth cap for paths with no classification.
        source_depth: Extra levels allowed below a source root.
        low_signal_depth: Extra levels allowed below a low-signal root.
    """

    max_files: int = 2000
    base_depth: int = 4
    source_depth: int = 8
    low_signal_depth: int = 2
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
@dataclass
class RepoTopology:
    """Result of classifying a repository for adaptive traversal.

    Built by RepoClassifier.classify() and read by AdaptiveScanner. It
    carries three lists of classified directories (source, low-signal,
    generated) together with the detected workspace type, package manager,
    a confidence score and the scan budget.
    """

    # One of "monorepo", "single-package" or "unknown".
    workspace_type: str = "unknown"
    source_roots: list[SourceRoot] = field(default_factory=list)
    low_signal_roots: list[SourceRoot] = field(default_factory=list)
    generated_roots: list[SourceRoot] = field(default_factory=list)
    package_manager: str = "unknown"
    confidence: float = 0.0
    scan_budget: ScanBudget = field(default_factory=ScanBudget)

    def as_dict(self) -> dict[str, Any]:
        """Return a plain-dict summary of the topology.

        Source roots keep their path, reason and priority (rounded to two
        decimals); the other root lists are reduced to paths. The budget
        section reports the three depth values.
        """
        budget = self.scan_budget
        source_entries = []
        for root in self.source_roots:
            source_entries.append(
                {"path": root.path, "reason": root.reason, "priority": round(root.priority, 2)}
            )
        low_signal_paths = [root.path for root in self.low_signal_roots]
        generated_paths = [root.path for root in self.generated_roots]
        budget_section = {
            "base_depth": budget.base_depth,
            "source_depth": budget.source_depth,
            "low_signal_depth": budget.low_signal_depth,
        }
        return {
            "workspace_type": self.workspace_type,
            "source_roots": source_entries,
            "low_signal_roots": low_signal_paths,
            "generated_roots": generated_paths,
            "package_manager": self.package_manager,
            "confidence": round(self.confidence, 2),
            "scan_budget": budget_section,
        }
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
# ---------------------------------------------------------------------------
|
|
139
|
+
# RepoClassifier
|
|
140
|
+
# ---------------------------------------------------------------------------
|
|
141
|
+
|
|
142
|
+
class RepoClassifier:
|
|
143
|
+
"""Detects repository topology and classifies directories for adaptive traversal.
|
|
144
|
+
|
|
145
|
+
Reads workspace config files (pnpm-workspace.yaml, package.json workspaces,
|
|
146
|
+
turbo.json, nx.json, lerna.json, go.work, Cargo.toml), resolves package
|
|
147
|
+
glob patterns, and identifies which directories contain real source code
|
|
148
|
+
vs. docs, benchmarks, or generated content.
|
|
149
|
+
|
|
150
|
+
Classification is meant to be cheap: it reads root-level config files, lists the root directory, and probes the package directories matched by workspace patterns.
|
|
151
|
+
"""
|
|
152
|
+
|
|
153
|
+
def classify(self, root: Path) -> RepoTopology:
    """Inspect *root* and return the topology describing it.

    Gathers the package manager, workspace markers and workspace patterns,
    classifies directories into source, low-signal and generated roots,
    decides whether the repository is a monorepo, then fills in the
    confidence score and scan budget.
    """
    result = RepoTopology()
    result.package_manager = self._detect_package_manager(root)

    found_markers = self._detect_markers(root)
    patterns = self._read_workspace_patterns(root, found_markers)

    # Real (non-symlink) directories directly under the root, sorted.
    top_dirs: list[Path] = []
    try:
        for entry in sorted(root.iterdir()):
            if entry.is_dir() and not entry.is_symlink():
                top_dirs.append(entry)
    except PermissionError:
        top_dirs = []

    workspace_declared = bool(found_markers) or bool(patterns)

    sources = self._find_source_roots(root, top_dirs, patterns, workspace_declared)
    low_signal = self._find_low_signal_roots(root, top_dirs, sources)
    generated = self._find_generated_roots(root, top_dirs)

    # Without explicit markers or patterns, treat the repo as a monorepo
    # when at least two source roots came from workspace containers or
    # patterns and none is a top-level source directory.
    packaged_count = sum(
        1 for r in sources
        if "container:" in r.reason or "workspace:" in r.reason
    )
    top_level_src_present = any(r.reason == "top_level_source" for r in sources)
    is_monorepo = workspace_declared or (
        packaged_count >= 2 and not top_level_src_present
    )

    if is_monorepo:
        result.workspace_type = "monorepo"
    else:
        result.workspace_type = "single-package"

    result.source_roots = sorted(sources, key=lambda r: -r.priority)
    result.low_signal_roots = low_signal
    result.generated_roots = generated
    result.confidence = self._compute_confidence(result, is_monorepo)
    result.scan_budget = self._compute_budget(result)
    return result
|
|
198
|
+
|
|
199
|
+
# ------------------------------------------------------------------
|
|
200
|
+
# Package manager detection
|
|
201
|
+
# ------------------------------------------------------------------
|
|
202
|
+
|
|
203
|
+
def _detect_package_manager(self, root: Path) -> str:
|
|
204
|
+
if (root / "pnpm-lock.yaml").exists() or (root / "pnpm-workspace.yaml").exists():
|
|
205
|
+
return "pnpm"
|
|
206
|
+
if (root / "yarn.lock").exists():
|
|
207
|
+
return "yarn"
|
|
208
|
+
if (root / "bun.lockb").exists() or (root / "bun.lock").exists():
|
|
209
|
+
return "bun"
|
|
210
|
+
if (root / "package-lock.json").exists():
|
|
211
|
+
return "npm"
|
|
212
|
+
if (root / "go.work").exists():
|
|
213
|
+
return "go-workspace"
|
|
214
|
+
if (root / "go.mod").exists():
|
|
215
|
+
return "go-modules"
|
|
216
|
+
if (root / "Cargo.toml").exists():
|
|
217
|
+
return "cargo"
|
|
218
|
+
if (root / "uv.lock").exists():
|
|
219
|
+
return "uv"
|
|
220
|
+
if (root / "Pipfile").exists():
|
|
221
|
+
return "pipenv"
|
|
222
|
+
if (root / "pyproject.toml").exists() or (root / "setup.py").exists():
|
|
223
|
+
return "python"
|
|
224
|
+
return "unknown"
|
|
225
|
+
|
|
226
|
+
# ------------------------------------------------------------------
|
|
227
|
+
# Workspace marker detection
|
|
228
|
+
# ------------------------------------------------------------------
|
|
229
|
+
|
|
230
|
+
def _detect_markers(self, root: Path) -> list[str]:
|
|
231
|
+
"""Return list of workspace marker file names present at root."""
|
|
232
|
+
markers: list[str] = []
|
|
233
|
+
for name in ("pnpm-workspace.yaml", "go.work", "turbo.json", "lerna.json", "nx.json"):
|
|
234
|
+
if (root / name).exists():
|
|
235
|
+
markers.append(name)
|
|
236
|
+
|
|
237
|
+
cargo = root / "Cargo.toml"
|
|
238
|
+
if cargo.exists():
|
|
239
|
+
try:
|
|
240
|
+
content = cargo.read_text(encoding="utf-8", errors="replace")
|
|
241
|
+
if "[workspace]" in content:
|
|
242
|
+
markers.append("Cargo.toml[workspace]")
|
|
243
|
+
except OSError:
|
|
244
|
+
pass
|
|
245
|
+
|
|
246
|
+
pkg = root / "package.json"
|
|
247
|
+
if pkg.exists():
|
|
248
|
+
try:
|
|
249
|
+
data = json.loads(pkg.read_text(encoding="utf-8", errors="replace"))
|
|
250
|
+
if "workspaces" in data:
|
|
251
|
+
markers.append("package.json[workspaces]")
|
|
252
|
+
except (json.JSONDecodeError, OSError, ValueError):
|
|
253
|
+
pass
|
|
254
|
+
|
|
255
|
+
return markers
|
|
256
|
+
|
|
257
|
+
# ------------------------------------------------------------------
|
|
258
|
+
# Workspace pattern extraction from config files
|
|
259
|
+
# ------------------------------------------------------------------
|
|
260
|
+
|
|
261
|
+
def _read_workspace_patterns(self, root: Path, markers: list[str]) -> list[str]:
|
|
262
|
+
"""Extract glob patterns from workspace config files."""
|
|
263
|
+
patterns: list[str] = []
|
|
264
|
+
|
|
265
|
+
if "pnpm-workspace.yaml" in markers:
|
|
266
|
+
patterns.extend(self._patterns_from_pnpm(root))
|
|
267
|
+
|
|
268
|
+
if "package.json[workspaces]" in markers:
|
|
269
|
+
patterns.extend(self._patterns_from_npm_workspaces(root))
|
|
270
|
+
|
|
271
|
+
if "nx.json" in markers:
|
|
272
|
+
patterns.extend(self._patterns_from_nx(root))
|
|
273
|
+
|
|
274
|
+
if "lerna.json" in markers:
|
|
275
|
+
patterns.extend(self._patterns_from_lerna(root))
|
|
276
|
+
|
|
277
|
+
if "Cargo.toml[workspace]" in markers:
|
|
278
|
+
patterns.extend(self._patterns_from_cargo_workspace(root))
|
|
279
|
+
|
|
280
|
+
if "go.work" in markers:
|
|
281
|
+
patterns.extend(self._patterns_from_go_work(root))
|
|
282
|
+
|
|
283
|
+
return list(dict.fromkeys(patterns)) # deduplicate, preserve order
|
|
284
|
+
|
|
285
|
+
def _patterns_from_pnpm(self, root: Path) -> list[str]:
|
|
286
|
+
try:
|
|
287
|
+
content = (root / "pnpm-workspace.yaml").read_text(encoding="utf-8", errors="replace")
|
|
288
|
+
result = []
|
|
289
|
+
for line in content.splitlines():
|
|
290
|
+
stripped = line.strip().lstrip("- ").strip("'\"")
|
|
291
|
+
if stripped and not stripped.startswith("#"):
|
|
292
|
+
result.append(stripped)
|
|
293
|
+
return result
|
|
294
|
+
except OSError:
|
|
295
|
+
return []
|
|
296
|
+
|
|
297
|
+
def _patterns_from_npm_workspaces(self, root: Path) -> list[str]:
|
|
298
|
+
try:
|
|
299
|
+
data = json.loads((root / "package.json").read_text(encoding="utf-8", errors="replace"))
|
|
300
|
+
ws = data.get("workspaces", [])
|
|
301
|
+
if isinstance(ws, list):
|
|
302
|
+
return [str(p) for p in ws]
|
|
303
|
+
if isinstance(ws, dict):
|
|
304
|
+
return [str(p) for p in ws.get("packages", [])]
|
|
305
|
+
except (json.JSONDecodeError, OSError, ValueError):
|
|
306
|
+
pass
|
|
307
|
+
return []
|
|
308
|
+
|
|
309
|
+
def _patterns_from_nx(self, root: Path) -> list[str]:
|
|
310
|
+
try:
|
|
311
|
+
data = json.loads((root / "nx.json").read_text(encoding="utf-8", errors="replace"))
|
|
312
|
+
patterns = []
|
|
313
|
+
wl = data.get("workspaceLayout", {})
|
|
314
|
+
if "appsDir" in wl:
|
|
315
|
+
patterns.append(f"{wl['appsDir']}/*")
|
|
316
|
+
if "libsDir" in wl:
|
|
317
|
+
patterns.append(f"{wl['libsDir']}/*")
|
|
318
|
+
return patterns
|
|
319
|
+
except (json.JSONDecodeError, OSError, ValueError):
|
|
320
|
+
return []
|
|
321
|
+
|
|
322
|
+
def _patterns_from_lerna(self, root: Path) -> list[str]:
|
|
323
|
+
try:
|
|
324
|
+
data = json.loads((root / "lerna.json").read_text(encoding="utf-8", errors="replace"))
|
|
325
|
+
pkgs = data.get("packages", ["packages/*"])
|
|
326
|
+
return [str(p) for p in pkgs] if isinstance(pkgs, list) else []
|
|
327
|
+
except (json.JSONDecodeError, OSError, ValueError):
|
|
328
|
+
return []
|
|
329
|
+
|
|
330
|
+
def _patterns_from_cargo_workspace(self, root: Path) -> list[str]:
|
|
331
|
+
try:
|
|
332
|
+
content = (root / "Cargo.toml").read_text(encoding="utf-8", errors="replace")
|
|
333
|
+
in_members = False
|
|
334
|
+
patterns = []
|
|
335
|
+
for line in content.splitlines():
|
|
336
|
+
stripped = line.strip()
|
|
337
|
+
if "members" in stripped and "=" in stripped:
|
|
338
|
+
in_members = True
|
|
339
|
+
if in_members:
|
|
340
|
+
for quote in ('"', "'"):
|
|
341
|
+
if quote in stripped:
|
|
342
|
+
for segment in stripped.split(quote):
|
|
343
|
+
segment = segment.strip(" [],")
|
|
344
|
+
if segment and "/" in segment:
|
|
345
|
+
patterns.append(segment)
|
|
346
|
+
if "]" in stripped:
|
|
347
|
+
in_members = False
|
|
348
|
+
return patterns
|
|
349
|
+
except OSError:
|
|
350
|
+
return []
|
|
351
|
+
|
|
352
|
+
def _patterns_from_go_work(self, root: Path) -> list[str]:
|
|
353
|
+
try:
|
|
354
|
+
content = (root / "go.work").read_text(encoding="utf-8", errors="replace")
|
|
355
|
+
patterns = []
|
|
356
|
+
for line in content.splitlines():
|
|
357
|
+
stripped = line.strip()
|
|
358
|
+
if stripped.startswith("use "):
|
|
359
|
+
target = stripped[4:].strip().strip("()")
|
|
360
|
+
if target and target != ".":
|
|
361
|
+
patterns.append(target.removeprefix("./").rstrip("/"))
|
|
362
|
+
elif stripped.startswith("./") and not stripped.startswith("//"):
|
|
363
|
+
patterns.append(stripped.removeprefix("./").rstrip())
|
|
364
|
+
return patterns
|
|
365
|
+
except OSError:
|
|
366
|
+
return []
|
|
367
|
+
|
|
368
|
+
# ------------------------------------------------------------------
|
|
369
|
+
# Source root discovery
|
|
370
|
+
# ------------------------------------------------------------------
|
|
371
|
+
|
|
372
|
+
def _find_source_roots(
    self,
    root: Path,
    root_children: list[Path],
    workspace_patterns: list[str],
    is_monorepo: bool,
) -> list[SourceRoot]:
    """Identify directories that contain actual source code.

    Candidates are collected in four passes, and a path is recorded only
    the first time it is seen:

    1. Packages matched by ``workspace_patterns`` (globbed under ``root``).
    2. Packages directly inside known workspace container directories.
    3. Top-level directories whose name is a known source directory.
    4. Workspace containers that themselves hold a manifest or source files.

    Args:
        root: Repository root that all returned paths are relative to.
        root_children: Entries located directly under ``root``.
        workspace_patterns: Glob patterns taken from workspace manifests.
        is_monorepo: Accepted for interface compatibility; not consulted here.

    Returns:
        ``SourceRoot`` entries with ``signal="high"``, in discovery order,
        using forward-slash relative paths.
    """
    result: list[SourceRoot] = []
    seen: set[str] = set()

    def _add(path_str: str, reason: str, priority: float) -> None:
        # First registration wins; later passes cannot overwrite it.
        if path_str in seen:
            return
        seen.add(path_str)
        result.append(SourceRoot(
            path=path_str, signal="high", reason=reason, priority=priority
        ))

    def _add_package(
        pkg_dir: Path,
        kind: str,
        origin: str,
        nested_priority: float,
        flat_priority: float,
    ) -> None:
        # Register pkg_dir's src/lib/source subdirectories, or pkg_dir itself
        # when it has none of those but still shows a source signal.
        if not pkg_dir.is_dir() or pkg_dir.is_symlink():
            return
        try:
            rel = pkg_dir.relative_to(root)
        except ValueError:
            return
        rel_str = str(rel).replace("\\", "/")
        if not self._is_allowed_path(rel_str):
            return

        found_src = False
        for src_name in ("src", "lib", "source"):
            src_dir = pkg_dir / src_name
            if src_dir.is_dir() and not src_dir.is_symlink():
                _add(f"{rel_str}/{src_name}", f"{kind}:{origin}", nested_priority)
                found_src = True

        if not found_src and self._has_source_signal(pkg_dir):
            _add(rel_str, f"{kind}_flat:{origin}", flat_priority)

    # 1. Resolve workspace glob patterns -> packages -> src/
    for pattern in workspace_patterns:
        try:
            for pkg_dir in sorted(root.glob(pattern)):
                _add_package(pkg_dir, "workspace", pattern, 0.92, 0.72)
        except Exception:
            # Best effort: patterns come from manifest files, and a malformed
            # one must not abort discovery for the remaining patterns.
            continue

    # 2. Check known workspace container dirs even without explicit patterns
    for child in root_children:
        name = child.name
        if name not in _WORKSPACE_CONTAINERS:
            continue
        try:
            for pkg_dir in sorted(child.iterdir()):
                _add_package(pkg_dir, "container", name, 0.88, 0.68)
        except OSError:
            # Unreadable, vanished, or non-directory container: skip it rather
            # than failing the whole scan.
            continue

    # 3. Top-level source dirs (single-package repos or workspace containers)
    for child in root_children:
        name = child.name
        if name in _SOURCE_DIRS and name not in _GENERATED_DIRS:
            try:
                rel_str = str(child.relative_to(root)).replace("\\", "/")
            except ValueError:
                continue
            _add(rel_str, "top_level_source", 0.95)

    # 4. Workspace containers themselves if they contain source files at root
    for child in root_children:
        name = child.name
        if name in _WORKSPACE_CONTAINERS and name not in _GENERATED_DIRS:
            try:
                rel_str = str(child.relative_to(root)).replace("\\", "/")
            except ValueError:
                continue
            # Checking `seen` first avoids a directory probe for known paths.
            if rel_str not in seen and self._has_source_signal(child):
                _add(rel_str, f"workspace_container_source:{name}", 0.55)

    return result
|
|
467
|
+
|
|
468
|
+
def _has_source_signal(self, directory: Path) -> bool:
    """Return True if *directory* directly holds a manifest or a source file.

    Only the immediate entries of *directory* are inspected; nothing is
    scanned recursively.

    Args:
        directory: Directory to probe.

    Returns:
        True when a name from ``_PACKAGE_MANIFESTS`` is present, or when a
        regular file has a suffix (compared case-insensitively) listed in
        ``_SOURCE_EXTENSIONS``. False otherwise, including when the
        directory cannot be inspected.
    """
    try:
        # Direct probe first: it can succeed even when listing is not permitted.
        # It sits inside the try because Path.exists() may re-raise
        # permission errors on some Python versions.
        for name in _PACKAGE_MANIFESTS:
            if (directory / name).exists():
                return True
        for entry in directory.iterdir():
            if entry.is_file() and entry.suffix.lower() in _SOURCE_EXTENSIONS:
                return True
            # Name match also covers entries exists() reports False for
            # (for example a dangling symlink).
            if entry.name in _PACKAGE_MANIFESTS:
                return True
    except OSError:
        # Unreadable or vanished directory: treat as "no signal".
        return False
    return False
|
|
482
|
+
|
|
483
|
+
def _is_allowed_path(self, rel_str: str) -> bool:
    """Return True when no ``/``-separated component of *rel_str* is a generated dir."""
    for component in rel_str.split("/"):
        if component in _GENERATED_DIRS:
            return False
    return True
|
|
486
|
+
|
|
487
|
+
# ------------------------------------------------------------------
|
|
488
|
+
# Low-signal root discovery
|
|
489
|
+
# ------------------------------------------------------------------
|
|
490
|
+
|
|
491
|
+
def _find_low_signal_roots(
    self,
    root: Path,
    root_children: list[Path],
    source_roots: list[SourceRoot],
) -> list[SourceRoot]:
    """Collect root-level directories that carry little signal.

    An entry is skipped when its name is the first path component of any
    source root or is a generated directory. Of the rest, names listed in
    ``_LOW_SIGNAL_DIRS`` get priority 0.15 and names starting with ``.``
    get priority 0.05; everything else is left unclassified.

    Args:
        root: Repository root that returned paths are relative to.
        root_children: Entries located directly under ``root``.
        source_roots: Source roots already discovered.

    Returns:
        ``SourceRoot`` entries with ``signal="low"``, in input order.
    """
    claimed_top_levels = {source.path.split("/")[0] for source in source_roots}
    collected: list[SourceRoot] = []

    for entry in root_children:
        dir_name = entry.name
        if dir_name in claimed_top_levels or dir_name in _GENERATED_DIRS:
            continue

        # Decide the classification first; unclassified names are dropped.
        if dir_name in _LOW_SIGNAL_DIRS:
            why, weight = f"low_signal:{dir_name}", 0.15
        elif dir_name.startswith("."):
            why, weight = "hidden_dir", 0.05
        else:
            continue

        try:
            relative = entry.relative_to(root)
        except ValueError:
            continue

        collected.append(SourceRoot(
            path=str(relative).replace("\\", "/"),
            signal="low",
            reason=why,
            priority=weight,
        ))

    return collected
|
|
522
|
+
|
|
523
|
+
# ------------------------------------------------------------------
|
|
524
|
+
# Generated root discovery
|
|
525
|
+
# ------------------------------------------------------------------
|
|
526
|
+
|
|
527
|
+
def _find_generated_roots(
    self,
    root: Path,
    root_children: list[Path],
) -> list[SourceRoot]:
    """List root-level entries whose name is a generated/excluded directory.

    Args:
        root: Repository root; not consulted by this method.
        root_children: Entries located directly under the repository root.

    Returns:
        One ``SourceRoot`` per matching entry, in input order, with
        ``signal="excluded"`` and priority 0.0. The path is the bare name.
    """
    return [
        SourceRoot(
            path=entry.name,
            signal="excluded",
            reason=f"generated:{entry.name}",
            priority=0.0,
        )
        for entry in root_children
        if entry.name in _GENERATED_DIRS
    ]
|
|
542
|
+
|
|
543
|
+
# ------------------------------------------------------------------
|
|
544
|
+
# Budget and confidence
|
|
545
|
+
# ------------------------------------------------------------------
|
|
546
|
+
|
|
547
|
+
def _compute_confidence(self, topology: RepoTopology, is_monorepo: bool) -> float:
    """Map the number of discovered source roots to a confidence score.

    Args:
        topology: Topology whose ``source_roots`` are counted.
        is_monorepo: Only matters when exactly one source root was found.

    Returns:
        0.30 for no source roots; 0.75 (monorepo) or 0.80 (otherwise) for
        one; 0.85 for two to four; 0.95 for five or more.
    """
    root_count = len(topology.source_roots)
    if root_count < 1:
        return 0.30
    if root_count < 2:
        if is_monorepo:
            return 0.75
        return 0.80
    return 0.95 if root_count >= 5 else 0.85
|
|
556
|
+
|
|
557
|
+
def _compute_budget(self, topology: RepoTopology) -> ScanBudget:
    """Build the scan budget for *topology*.

    Only the base depth depends on the workspace type: 4 when
    ``topology.workspace_type`` is ``"monorepo"``, 6 otherwise. The file
    cap (2000), source depth (8) and low-signal depth (2) are the same in
    both cases.
    """
    shallow_base = topology.workspace_type == "monorepo"
    return ScanBudget(
        max_files=2000,
        base_depth=4 if shallow_base else 6,
        source_depth=8,
        low_signal_depth=2,
    )
|
sourcecode/schema.py
CHANGED
|
@@ -34,6 +34,7 @@ class AnalysisMetadata:
|
|
|
34
34
|
sourcecode_version: str = field(default_factory=_sourcecode_version)
|
|
35
35
|
analyzed_path: str = ""
|
|
36
36
|
analyzer_fingerprints: dict[str, str] = field(default_factory=dict)
|
|
37
|
+
traversal_topology: Optional[dict[str, Any]] = None
|
|
37
38
|
|
|
38
39
|
|
|
39
40
|
@dataclass
|
sourcecode/serializer.py
CHANGED
|
@@ -927,6 +927,9 @@ def _contract_view_minimal(
|
|
|
927
927
|
"project": project,
|
|
928
928
|
}
|
|
929
929
|
|
|
930
|
+
if sm.metadata.traversal_topology:
|
|
931
|
+
result["traversal"] = sm.metadata.traversal_topology
|
|
932
|
+
|
|
930
933
|
# Per-file contracts
|
|
931
934
|
if contracts:
|
|
932
935
|
serialized: list[dict[str, Any]] = []
|
|
@@ -1170,6 +1173,8 @@ def _contract_view_standard(
|
|
|
1170
1173
|
],
|
|
1171
1174
|
"entry_points": ep_groups["production"],
|
|
1172
1175
|
}
|
|
1176
|
+
if sm.metadata.traversal_topology:
|
|
1177
|
+
result["traversal"] = sm.metadata.traversal_topology
|
|
1173
1178
|
if ep_groups["development"]:
|
|
1174
1179
|
result["development_entry_points"] = ep_groups["development"]
|
|
1175
1180
|
|
|
@@ -1,9 +1,10 @@
|
|
|
1
|
-
sourcecode/__init__.py,sha256=
|
|
1
|
+
sourcecode/__init__.py,sha256=RjrfBH06OIJiq-xk4Hadj8Zl3Soer5r1Ct1ogF0xqaU,103
|
|
2
|
+
sourcecode/adaptive_scanner.py,sha256=6dh34C2qZXyRbw-8xBhbEwDdXanM6CRFRWayVoYITnA,10190
|
|
2
3
|
sourcecode/architecture_analyzer.py,sha256=H6noGgVArUJ25z1qC0fFA0KvJJeHZYyhKvKSkOyWHUk,23096
|
|
3
4
|
sourcecode/architecture_summary.py,sha256=rSY5MRiaz4N1YdG0pqDTDuFjSN7PO_Zplx-dtNzv2Yo,19985
|
|
4
5
|
sourcecode/ast_extractor.py,sha256=0OHQwTUBBc9lmqPLryVeB1z8dGIC6NhLlar800CD9oI,41129
|
|
5
6
|
sourcecode/classifier.py,sha256=GKTMN8qKZX7ponSwDJfN08RrasI4CVpq1_gFBgEopps,7093
|
|
6
|
-
sourcecode/cli.py,sha256=
|
|
7
|
+
sourcecode/cli.py,sha256=dJ0kkwC0pQ4LJyhjlbtHKSpD-TvRQQyhdhvjRCHPA8o,65280
|
|
7
8
|
sourcecode/code_notes_analyzer.py,sha256=rRd8bFYV0krjlxxQV0wenwE9K7pVpUQSR7KvSvUQKw4,9226
|
|
8
9
|
sourcecode/confidence_analyzer.py,sha256=HxJMPLI5ulqtkncnv98W4iVO6yMbpQo87VuxiuNbDmY,12167
|
|
9
10
|
sourcecode/context_summarizer.py,sha256=CiQrfBEzun949bWvmLabWoj2HhPn6Lw62ofqnsy0FlQ,6503
|
|
@@ -21,11 +22,12 @@ sourcecode/metrics_analyzer.py,sha256=e2cFwB9XubFq_dIVsP2PLjpr4wX0N6ulb3ol3sGDUe
|
|
|
21
22
|
sourcecode/prepare_context.py,sha256=vxEzr8czS3MFbdTx4hBJQlJLrl9cuvbHdL3ZokxFkvo,31384
|
|
22
23
|
sourcecode/redactor.py,sha256=xuGcadGEHaPw4qZXlMDvzMCsr4VOkdp3oBQptHyJk8c,2884
|
|
23
24
|
sourcecode/relevance_scorer.py,sha256=ea7_7AHVgahVEWK3ebKOpG67agzG_pGICu5f2KgzrIA,8133
|
|
25
|
+
sourcecode/repo_classifier.py,sha256=FG1vaWKdWXsWdl-S8hjVMiTqcwgaRXkDyvK4rPcOGtQ,22681
|
|
24
26
|
sourcecode/runtime_classifier.py,sha256=zWX3r3HCKHc-qtIobErOa8aKMmaoPYREtJKvPcBGPjQ,14792
|
|
25
27
|
sourcecode/scanner.py,sha256=aM3h9-DCQ3xKpeHpHYdo2vX6T5P95HA_YwZbkAVNwmo,8288
|
|
26
|
-
sourcecode/schema.py,sha256=
|
|
28
|
+
sourcecode/schema.py,sha256=dVA-3EbHBakHLkgeZF-LfjKClEFRgPZkzblXpDTshFA,20796
|
|
27
29
|
sourcecode/semantic_analyzer.py,sha256=asQfJf-EhzYaOTA-iMuZsrVXtbW7SV2WEKCxgsxa88Y,79413
|
|
28
|
-
sourcecode/serializer.py,sha256=
|
|
30
|
+
sourcecode/serializer.py,sha256=qJRJV_z-T_wU615KMA1ez5IIeV3wcexh29lY4-fcgjs,51329
|
|
29
31
|
sourcecode/summarizer.py,sha256=ZuzIdm3t8A-d5MuQL0TSNLrd-L0IQIuguIxeNXMNJf8,16070
|
|
30
32
|
sourcecode/tree_utils.py,sha256=Fj9OIuUksBvgibNd3feog0sMDjVypJzPexp5lvMoYWI,1424
|
|
31
33
|
sourcecode/workspace.py,sha256=fQlVoNx8S-fSHpKoJ0JBvEHCFkxszH0KZVJed1i3TRk,6845
|
|
@@ -56,8 +58,8 @@ sourcecode/telemetry/consent.py,sha256=wLMvGNJeSSyZoNkQXpoUioY6mMv4Qdvuw7S9jAEWn
|
|
|
56
58
|
sourcecode/telemetry/events.py,sha256=oEvvulfsv5GIDWG2174gSS6tNB95w38AIYiYeifGKlE,2294
|
|
57
59
|
sourcecode/telemetry/filters.py,sha256=Asa71oRl7q3Wt_FMwuufIZJFzSYdgRNKS8LHCIyFeYE,4805
|
|
58
60
|
sourcecode/telemetry/transport.py,sha256=KJeIPCPWMdmbCP3ySGs2iUlia34U6vWne2dZsUezesw,1560
|
|
59
|
-
sourcecode-0.
|
|
60
|
-
sourcecode-0.
|
|
61
|
-
sourcecode-0.
|
|
62
|
-
sourcecode-0.
|
|
63
|
-
sourcecode-0.
|
|
61
|
+
sourcecode-0.38.0.dist-info/METADATA,sha256=-RJ8bdDTHeuGmWN-iNo4eYkjPTuSnfriYYD1O59Gmwc,25209
|
|
62
|
+
sourcecode-0.38.0.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
|
|
63
|
+
sourcecode-0.38.0.dist-info/entry_points.txt,sha256=ex3F9rmbXeyDIoFQHtkEqTsKSaJow8F0LrVu8XfIktQ,57
|
|
64
|
+
sourcecode-0.38.0.dist-info/licenses/LICENSE,sha256=7DdHrU9Z_3e7dSvq4ISijZNjnuHo5NIHNiHDouMQ9JU,10491
|
|
65
|
+
sourcecode-0.38.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|