sourcecode 0.37.0__py3-none-any.whl → 0.38.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sourcecode/__init__.py CHANGED
@@ -1,3 +1,3 @@
1
1
  """sourcecode — Deterministic codebase context maps for AI coding agents."""
2
2
 
3
- __version__ = "0.37.0"
3
+ __version__ = "0.38.0"
@@ -0,0 +1,258 @@
1
+ from __future__ import annotations
2
+
3
+ """Adaptive file tree scanner with topology-aware depth budgets.
4
+
5
+ Replaces pure depth filtering with relevance-oriented traversal:
6
+ - Source roots (packages/*/src, apps/*/src) get deep scan budgets.
7
+ - Low-signal directories (docs/, benchmarks/) are limited to 2 levels.
8
+ - Generated/excluded directories (dist/, node_modules/) are skipped.
9
+ - Unclassified directories fall back to the base depth limit.
10
+
11
+ Drop-in replacement for FileScanner: same scan_tree() and find_manifests()
12
+ interface, same output format (None = file, dict = directory).
13
+ """
14
+
15
+ import os
16
+ from pathlib import Path
17
+ from typing import Any, Optional, cast
18
+
19
+ from pathspec import GitIgnoreSpec
20
+
21
+ from sourcecode.repo_classifier import RepoTopology
22
+ from sourcecode.scanner import DEFAULT_EXCLUDES, MANIFEST_NAMES
23
+
24
+
25
class AdaptiveScanner:
    """File tree scanner whose depth limit varies per directory.

    With a *topology*, the limit applied to a directory depends on how the
    directory was classified:

    - inside a source root: ``len(root parts) + scan_budget.source_depth``;
    - an ancestor of a source root: deep enough to reach that root;
    - inside a low-signal root:
      ``len(root parts) + scan_budget.low_signal_depth``, never more than
      ``base_depth``;
    - anything else: ``base_depth``.

    Top-level generated roots are excluded by name.  Without a topology
    every directory uses ``base_depth``.

    ``scan_tree()`` returns a nested dict in which ``None`` marks a file and
    a dict marks a directory.
    """

    def __init__(
        self,
        root: Path,
        topology: Optional["RepoTopology"] = None,
        base_depth: int = 4,
        extra_excludes: Optional[frozenset[str]] = None,
    ) -> None:
        """Prepare the scanner and pre-compute the per-root depth tables.

        Args:
            root: Directory to scan; stored resolved.
            topology: Classification used to derive depth budgets, or None
                to apply ``base_depth`` everywhere.
            base_depth: Depth limit for unclassified directories.
            extra_excludes: Directory names to skip in addition to
                ``DEFAULT_EXCLUDES``.
        """
        self.root = root.resolve()
        self.topology = topology
        self.base_depth = base_depth
        self._excludes = DEFAULT_EXCLUDES | (extra_excludes or frozenset())
        self._gitignore_spec: Optional["GitIgnoreSpec"] = None

        # Each table entry is (path_parts, max_absolute_depth).  The depth is
        # measured from the repo root, not from the classified directory.
        # Lookup is a linear scan over these lists for every visited
        # directory.
        self._source_prefixes: list[tuple[tuple[str, ...], int]] = []
        self._low_signal_prefixes: list[tuple[tuple[str, ...], int]] = []
        self._extra_exclude_names: frozenset[str] = frozenset()

        if topology is not None:
            budget = topology.scan_budget
            self._source_prefixes = self._build_prefixes(
                topology.source_roots, budget.source_depth
            )
            self._low_signal_prefixes = self._build_prefixes(
                topology.low_signal_roots, budget.low_signal_depth
            )
            # Only single-component generated roots can be excluded by bare
            # name; os.walk then never descends into them.
            self._extra_exclude_names = frozenset(
                gr.path for gr in topology.generated_roots if "/" not in gr.path
            )

    @staticmethod
    def _build_prefixes(
        roots: Any, extra_depth: int
    ) -> list[tuple[tuple[str, ...], int]]:
        """Turn classified roots into ``(parts, max_absolute_depth)`` pairs.

        Roots whose path has no non-empty component are skipped.
        """
        prefixes: list[tuple[tuple[str, ...], int]] = []
        for entry in roots:
            parts = tuple(p for p in entry.path.split("/") if p)
            if parts:
                prefixes.append((parts, len(parts) + extra_depth))
        return prefixes

    # ------------------------------------------------------------------
    # Gitignore
    # ------------------------------------------------------------------

    def _load_gitignore_spec(self) -> "GitIgnoreSpec":
        """Return the spec built from ``<root>/.gitignore``, loading it once.

        A missing or unreadable file yields an empty spec.  Only the root
        ``.gitignore`` is read.
        """
        if self._gitignore_spec is None:
            gitignore = self.root / ".gitignore"
            lines: list[str] = []
            if gitignore.exists():
                try:
                    lines = gitignore.read_text(
                        encoding="utf-8", errors="replace"
                    ).splitlines()
                except OSError:
                    # Best effort: an unreadable .gitignore means no rules.
                    pass
            self._gitignore_spec = GitIgnoreSpec.from_lines(lines)
        return self._gitignore_spec

    def _is_excluded_by_gitignore(self, rel_path: str, is_dir: bool) -> bool:
        """Return True when *rel_path* matches the root ``.gitignore``.

        Directories are matched with a trailing ``/`` so directory-only
        patterns apply to them.
        """
        spec = self._load_gitignore_spec()
        path_to_match = rel_path + "/" if is_dir else rel_path
        return spec.match_file(path_to_match)

    # ------------------------------------------------------------------
    # Depth budget computation
    # ------------------------------------------------------------------

    def _compute_max_depth(self, rel_parts: tuple[str, ...]) -> int:
        """Return the maximum absolute depth allowed at *rel_parts*.

        Depth is the number of path components from the repo root.  The
        caller stops descending (and skips files) once a directory's depth
        is >= the returned value.

        Priority order:
            1. At or inside a source root -> that root's deep budget.
            2. Ancestor of a source root -> the largest budget among the
               roots below it (at least ``base_depth``), so every root stays
               reachable.
            3. At or inside a low-signal root -> that root's budget, capped
               at ``base_depth`` so a "restricted" directory is never
               scanned deeper than an unclassified one.
            4. Otherwise -> ``base_depth``.
        """
        if not self._source_prefixes and not self._low_signal_prefixes:
            return self.base_depth

        current_depth = len(rel_parts)
        ancestor_best = self.base_depth
        found_ancestor = False

        for src_parts, src_max in self._source_prefixes:
            n = len(src_parts)
            if current_depth >= n:
                if rel_parts[:n] == src_parts:
                    return src_max  # definite source territory
            elif src_parts[:current_depth] == rel_parts:
                found_ancestor = True
                ancestor_best = max(ancestor_best, src_max)

        if found_ancestor:
            return ancestor_best

        for ls_parts, ls_max in self._low_signal_prefixes:
            n = len(ls_parts)
            if current_depth >= n and rel_parts[:n] == ls_parts:
                return min(ls_max, self.base_depth)

        return self.base_depth

    # ------------------------------------------------------------------
    # Main traversal
    # ------------------------------------------------------------------

    def scan_tree(self) -> dict[str, Any]:
        """Build the nested file tree dictionary.

        Returns:
            A dict in which ``None`` marks a file and a nested dict marks a
            directory.  A directory that sits at its depth limit appears as
            an empty dict: it is listed by its parent but not read.

        Symlinks, excluded directory names, gitignored paths and files whose
        name starts with ``-`` are left out.
        """
        self._load_gitignore_spec()
        root_tree: dict[str, Any] = {}
        all_excludes = self._excludes | self._extra_exclude_names

        for dirpath, dirnames, filenames in os.walk(self.root, followlinks=False):
            current = Path(dirpath)
            try:
                rel = current.relative_to(self.root)
            except ValueError:
                continue

            rel_parts = rel.parts
            if len(rel_parts) >= self._compute_max_depth(rel_parts):
                dirnames.clear()
                continue

            # Slice assignment is required: os.walk only honours in-place
            # changes to the list it handed out.
            dirnames[:] = [
                d for d in dirnames
                if d not in all_excludes
                and not (current / d).is_symlink()
                and not self._is_excluded_by_gitignore(
                    str(rel / d) if rel_parts else d,
                    is_dir=True,
                )
            ]

            node = self._get_or_create_node(root_tree, rel_parts)

            for fname in filenames:
                # Flag-shaped names are shell redirect artifacts.
                if fname.startswith("-"):
                    continue
                if (current / fname).is_symlink():
                    continue
                rel_file = str(rel / fname) if rel_parts else fname
                if self._is_excluded_by_gitignore(rel_file, is_dir=False):
                    continue
                node[fname] = None

            # Accepted subdirectories are listed even if never descended into.
            for d in dirnames:
                node.setdefault(d, {})

        return root_tree

    def _get_or_create_node(
        self, tree: dict[str, Any], parts: tuple[str, ...]
    ) -> dict[str, Any]:
        """Return the dict for the directory at *parts*, creating it if needed.

        A missing component, or one currently recorded as a file (``None``),
        is replaced by an empty dict.
        """
        node = tree
        for part in parts:
            if part not in node or node[part] is None:
                node[part] = {}
            node = cast(dict[str, Any], node[part])
        return node

    # ------------------------------------------------------------------
    # Manifest discovery
    # ------------------------------------------------------------------

    def find_manifests(self) -> list[str]:
        """Find manifest files at depth 0 and 1.

        Returns:
            Paths (as strings) of ``MANIFEST_NAMES`` entries found directly
            in the root, followed by those found in first-level
            subdirectories.

        Symlinked manifests, symlinked or hidden subdirectories, and
        subdirectories excluded from ``scan_tree()`` (default, extra and
        topology-generated names) are skipped, so every reported manifest
        lies inside a directory the tree scan also visits.
        """
        manifests: list[str] = []
        for name in MANIFEST_NAMES:
            candidate = self.root / name
            if candidate.exists() and not candidate.is_symlink():
                manifests.append(str(candidate))

        excluded = self._excludes | self._extra_exclude_names
        try:
            for child in self.root.iterdir():
                if (
                    child.is_dir()
                    and not child.is_symlink()
                    and child.name not in excluded
                    and not child.name.startswith(".")
                ):
                    for name in MANIFEST_NAMES:
                        candidate = child / name
                        if candidate.exists() and not candidate.is_symlink():
                            manifests.append(str(candidate))
        except PermissionError:
            # Keep whatever was collected before the listing failed.
            pass
        return manifests
sourcecode/cli.py CHANGED
@@ -714,6 +714,13 @@ def main(
714
714
  # 1. Scan directory (SCAN-01 to SCAN-05)
715
715
  redactor = SecretRedactor(enabled=not no_redact)
716
716
 
717
+ # Classify repository topology before scanning. This is a shallow
718
+ # filesystem read (depth 0-1 only) and completes in milliseconds.
719
+ # The topology drives per-directory depth budgets in AdaptiveScanner.
720
+ from sourcecode.adaptive_scanner import AdaptiveScanner
721
+ from sourcecode.repo_classifier import RepoClassifier
722
+ _topology = RepoClassifier().classify(target)
723
+
717
724
  # Detect manifests before scan to adjust depth.
718
725
  # find_manifests() only looks at depth 0-1, does not need the full tree.
719
726
  _pre_scanner = FileScanner(target, max_depth=1)
@@ -735,7 +742,7 @@ def main(
735
742
  no_tree = True # agents never need the raw file tree
736
743
  typer.echo("[agent] dependencies env-map code-notes (no-tree)", err=True)
737
744
 
738
- scanner = FileScanner(target, max_depth=effective_depth)
745
+ scanner = AdaptiveScanner(target, topology=_topology, base_depth=effective_depth)
739
746
  raw_tree = scanner.scan_tree()
740
747
 
741
748
  # 2. Filter .env and *.secret entries from file tree (SEC-02, all levels)
@@ -775,16 +782,14 @@ def main(
775
782
  detector = ProjectDetector(build_default_detectors())
776
783
  workspace_analysis = WorkspaceAnalyzer().analyze(target, manifests)
777
784
 
778
- # Warn when scanning a monorepo at default depth — typical package sources
779
- # (packages/*/src/) live at depth 5+, so default depth=4 silently misses them.
780
- # Only emit to TTY to avoid contaminating piped/CI output; agents read analysis_gaps.
785
+ # Adaptive traversal handles monorepo source root discovery automatically.
786
+ # Emit a diagnostic when topology confidence is low so users know why.
781
787
  import sys as _sys
782
- if workspace_analysis.is_monorepo and depth <= 4 and effective_depth <= 4:
788
+ if _topology.workspace_type == "monorepo" and _topology.confidence < 0.5:
783
789
  if _sys.stderr.isatty():
784
790
  typer.echo(
785
- f"[warning] monorepo detected with --depth {depth}. "
786
- "Source files in packages/*/src/ (depth 5+) may be invisible. "
787
- "Use --depth 6 or higher for full coverage.",
791
+ "[traversal] monorepo detected but source root confidence is low "
792
+ f"({_topology.confidence:.0%}). Use --depth 8 or higher if files are missing.",
788
793
  err=True,
789
794
  )
790
795
 
@@ -896,7 +901,8 @@ def main(
896
901
  workspace_root = target / workspace.path
897
902
  if not workspace_root.exists() or not workspace_root.is_dir():
898
903
  continue
899
- workspace_scanner = FileScanner(workspace_root, max_depth=depth)
904
+ _ws_topology = RepoClassifier().classify(workspace_root)
905
+ workspace_scanner = AdaptiveScanner(workspace_root, topology=_ws_topology, base_depth=depth)
900
906
  workspace_tree = filter_sensitive_files(workspace_scanner.scan_tree())
901
907
  workspace_manifests = workspace_scanner.find_manifests()
902
908
  workspace_stacks, workspace_entry_points, _ = detector.detect(
@@ -1008,6 +1014,7 @@ def main(
1008
1014
  metadata = AnalysisMetadata(
1009
1015
  analyzed_path=str(target),
1010
1016
  analyzer_fingerprints=_fingerprints,
1017
+ traversal_topology=_topology.as_dict(),
1011
1018
  )
1012
1019
  sm = SourceMap(
1013
1020
  metadata=metadata,
@@ -1037,7 +1044,7 @@ def main(
1037
1044
  target / ws.path,
1038
1045
  (
1039
1046
  filter_sensitive_files(
1040
- FileScanner(target / ws.path, max_depth=depth).scan_tree()
1047
+ AdaptiveScanner(target / ws.path, base_depth=depth).scan_tree()
1041
1048
  )
1042
1049
  ),
1043
1050
  workspace=ws.path,
@@ -0,0 +1,570 @@
1
+ from __future__ import annotations
2
+
3
+ """Repository topology classifier for adaptive traversal.
4
+
5
+ Detects monorepo vs single-package structure, identifies source roots,
6
+ low-signal directories, and generated content. Feeds AdaptiveScanner
7
+ with per-path depth budgets so traversal is relevance-oriented, not
8
+ purely structural.
9
+ """
10
+
11
+ import json
12
+ from dataclasses import dataclass, field
13
+ from pathlib import Path
14
+ from typing import Any, Optional
15
+
16
+ # ---------------------------------------------------------------------------
17
+ # Signal tables
18
+ # ---------------------------------------------------------------------------
19
+
20
# Directory names that, at the top level, are treated as source code.
_SOURCE_DIRS: frozenset[str] = frozenset((
    # generic
    "src", "lib", "source", "sources", "core",
    # application layers
    "app", "server", "client", "backend", "frontend",
    # Go conventions
    "cmd", "pkg",
    # Java src/main
    "main",
    # JVM source dirs
    "kotlin", "java", "scala",
))

# First-level directory names treated as monorepo workspace containers.
_WORKSPACE_CONTAINERS: frozenset[str] = frozenset((
    "packages", "apps", "libs", "services", "internal",
    "plugins", "modules", "components", "crates",
    "workspaces", "projects",
))

# Directory names considered low-signal for AI code understanding.
_LOW_SIGNAL_DIRS: frozenset[str] = frozenset((
    # documentation and sites
    "docs", "doc", "documentation", "docsrc", "website", "site",
    # benchmarks
    "benchmark", "benchmarks", "bench", "perf", "perfs",
    # examples and demos
    "examples", "example", "demo", "demos", "sample", "samples",
    # test fixtures
    "fixtures", "fixture", "__fixtures__",
    # tooling
    "scripts", "script", "tools", "tool",
    "ci", ".ci",
    # UI showcases
    "storybook", "stories", "__stories__",
    "sandbox", "playground", "playgrounds",
    # database migrations
    "migrations", "migration",
    # editor / platform configuration
    ".github", ".vscode", ".claude", ".cursor", ".idea",
    # presentation assets
    "themes", "theme",
    "static", "public", "assets",
))

# Directory names skipped entirely: generated content and dependency stores.
_GENERATED_DIRS: frozenset[str] = frozenset((
    # build output
    "dist", "build", "out", "output", "release", "releases",
    "target", "coverage", ".next", ".nuxt", ".svelte-kit",
    ".turbo", "node_modules", "__pycache__",
    # virtual environments
    ".venv", "venv", "env",
    # tool caches
    ".mypy_cache", ".pytest_cache", ".ruff_cache",
    ".nyc_output", ".tox",
    # generated sources
    "generated", ".generated", "gen", "_gen",
    ".cache", "cache",
    # vendored dependencies and VCS data
    "vendor",
    ".git",
))

# Manifest file names whose presence marks a directory as a source package.
_PACKAGE_MANIFESTS: frozenset[str] = frozenset((
    "package.json", "pyproject.toml", "setup.py", "setup.cfg",
    "go.mod", "Cargo.toml", "pom.xml", "build.gradle",
    "build.gradle.kts", "composer.json", "Gemfile", "pubspec.yaml",
))

# File extensions (lower-case, with dot) that count as source files.
_SOURCE_EXTENSIONS: frozenset[str] = frozenset((
    ".ts", ".tsx", ".js", ".jsx", ".mjs", ".cjs",
    ".py", ".go", ".rs", ".java", ".kt", ".rb",
    ".cs", ".swift", ".scala", ".cpp", ".c", ".h",
))
79
+
80
+
81
+ # ---------------------------------------------------------------------------
82
+ # Data structures
83
+ # ---------------------------------------------------------------------------
84
+
85
@dataclass
class SourceRoot:
    """One classified directory together with its traversal priority.

    Attributes:
        path: Repo-relative path using forward slashes.
        signal: One of "high", "medium", "low" or "excluded".
        reason: Human-readable explanation of the classification.
        priority: Traversal priority between 0.0 and 1.0.
    """

    path: str
    signal: str
    reason: str
    priority: float
92
+
93
+
94
@dataclass
class ScanBudget:
    """Traversal limits attached to a classified topology.

    Attributes:
        max_files: File-count budget (default 2000).
        base_depth: Depth cap for unclassified paths.
        source_depth: Additional levels allowed inside source roots.
        low_signal_depth: Additional levels allowed inside low-signal roots.
    """

    max_files: int = 2000
    base_depth: int = 4
    source_depth: int = 8
    low_signal_depth: int = 2
101
+
102
+
103
@dataclass
class RepoTopology:
    """Classification result that drives adaptive traversal.

    Built by RepoClassifier.classify() and read by AdaptiveScanner.  The
    three root lists hold the directories classified as source code,
    low-value content, and generated/excluded content respectively.
    """

    # "monorepo" | "single-package" | "unknown"
    workspace_type: str = "unknown"
    source_roots: list[SourceRoot] = field(default_factory=list)
    low_signal_roots: list[SourceRoot] = field(default_factory=list)
    generated_roots: list[SourceRoot] = field(default_factory=list)
    package_manager: str = "unknown"
    confidence: float = 0.0
    scan_budget: ScanBudget = field(default_factory=ScanBudget)

    def as_dict(self) -> dict[str, Any]:
        """Return a plain-dict summary of the topology.

        Source roots keep path, reason and priority (rounded to 2 places);
        low-signal and generated roots are reduced to their paths; the
        budget reports its three depth values.
        """
        budget = self.scan_budget

        source_entries = []
        for root in self.source_roots:
            source_entries.append({
                "path": root.path,
                "reason": root.reason,
                "priority": round(root.priority, 2),
            })

        low_signal_paths = [root.path for root in self.low_signal_roots]
        generated_paths = [root.path for root in self.generated_roots]

        return {
            "workspace_type": self.workspace_type,
            "source_roots": source_entries,
            "low_signal_roots": low_signal_paths,
            "generated_roots": generated_paths,
            "package_manager": self.package_manager,
            "confidence": round(self.confidence, 2),
            "scan_budget": {
                "base_depth": budget.base_depth,
                "source_depth": budget.source_depth,
                "low_signal_depth": budget.low_signal_depth,
            },
        }
136
+
137
+
138
+ # ---------------------------------------------------------------------------
139
+ # RepoClassifier
140
+ # ---------------------------------------------------------------------------
141
+
142
class RepoClassifier:
    """Detect repository topology and classify directories for traversal.

    Looks for workspace markers at the repository root
    (pnpm-workspace.yaml, go.work, turbo.json, lerna.json, nx.json, a
    ``[workspace]`` Cargo.toml, a package.json with ``workspaces``), extracts
    package glob patterns from the config files that carry them, and sorts
    directories into source roots, low-signal roots and generated roots.

    Filesystem access is limited to the root, the directories matched by
    workspace patterns or found inside known container directories, and a
    probe for ``src``/``lib``/``source`` below each of those.
    """

    # (marker file names, package manager) pairs checked in order; the first
    # pair with an existing file wins.
    _MANAGER_SIGNALS: tuple[tuple[tuple[str, ...], str], ...] = (
        (("pnpm-lock.yaml", "pnpm-workspace.yaml"), "pnpm"),
        (("yarn.lock",), "yarn"),
        (("bun.lockb", "bun.lock"), "bun"),
        (("package-lock.json",), "npm"),
        (("go.work",), "go-workspace"),
        (("go.mod",), "go-modules"),
        (("Cargo.toml",), "cargo"),
        (("uv.lock",), "uv"),
        (("Pipfile",), "pipenv"),
        (("pyproject.toml", "setup.py"), "python"),
    )

    def classify(self, root: Path) -> "RepoTopology":
        """Classify the repository at *root* and return its topology.

        A root that cannot be listed (missing, not a directory, no
        permission) is treated as having no subdirectories.
        """
        topology = RepoTopology()
        topology.package_manager = self._detect_package_manager(root)

        markers = self._detect_markers(root)
        workspace_patterns = self._read_workspace_patterns(root, markers)

        try:
            root_children = [
                d for d in sorted(root.iterdir())
                if d.is_dir() and not d.is_symlink()
            ]
        except OSError:
            root_children = []

        source_roots = self._find_source_roots(
            root, root_children, workspace_patterns,
            bool(markers) or bool(workspace_patterns),
        )
        low_signal_roots = self._find_low_signal_roots(root, root_children, source_roots)
        generated_roots = self._find_generated_roots(root, root_children)

        # Monorepo heuristic: explicit markers or patterns, OR at least two
        # package source dirs under containers with no top-level source dir.
        container_sourced = [
            r for r in source_roots
            if "container:" in r.reason or "workspace:" in r.reason
        ]
        has_top_level_src = any(
            r.reason == "top_level_source" for r in source_roots
        )
        is_monorepo = (
            bool(markers)
            or bool(workspace_patterns)
            or (len(container_sourced) >= 2 and not has_top_level_src)
        )
        topology.workspace_type = "monorepo" if is_monorepo else "single-package"

        topology.source_roots = sorted(source_roots, key=lambda r: -r.priority)
        topology.low_signal_roots = low_signal_roots
        topology.generated_roots = generated_roots
        topology.confidence = self._compute_confidence(topology, is_monorepo)
        topology.scan_budget = self._compute_budget(topology)

        return topology

    # ------------------------------------------------------------------
    # Package manager detection
    # ------------------------------------------------------------------

    def _detect_package_manager(self, root: Path) -> str:
        """Return the package manager implied by files at *root*.

        Returns "unknown" when none of the known marker files exists.
        """
        for file_names, manager in self._MANAGER_SIGNALS:
            if any((root / name).exists() for name in file_names):
                return manager
        return "unknown"

    # ------------------------------------------------------------------
    # Small parsing helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _load_json_object(path: Path) -> Optional[dict[str, Any]]:
        """Return the JSON object stored at *path*.

        Returns None when the file cannot be read, is not valid JSON, or
        its top-level value is not an object.
        """
        try:
            data = json.loads(path.read_text(encoding="utf-8", errors="replace"))
        except (OSError, ValueError):  # JSONDecodeError is a ValueError
            return None
        return data if isinstance(data, dict) else None

    @staticmethod
    def _clean_yaml_scalar(raw: str) -> str:
        """Strip quotes and a trailing ``# comment`` from a YAML scalar."""
        value = raw.strip()
        if value[:1] in ("'", '"'):
            closing = value.find(value[0], 1)
            return value[1:closing] if closing != -1 else value[1:]
        return value.split(" #", 1)[0].strip()

    @staticmethod
    def _split_quoted(text: str) -> tuple[list[str], str]:
        """Return ``(quoted string contents, text outside the quotes)``.

        Both single and double quotes delimit strings.  An unterminated
        quote ends the scan; the remainder of *text* is ignored.
        """
        quoted: list[str] = []
        outside: list[str] = []
        pos = 0
        while pos < len(text):
            ch = text[pos]
            if ch in "\"'":
                closing = text.find(ch, pos + 1)
                if closing == -1:
                    break
                quoted.append(text[pos + 1:closing])
                pos = closing + 1
            else:
                outside.append(ch)
                pos += 1
        return quoted, "".join(outside)

    # ------------------------------------------------------------------
    # Workspace marker detection
    # ------------------------------------------------------------------

    def _detect_markers(self, root: Path) -> list[str]:
        """Return the workspace markers present at *root*.

        Plain marker files are reported by name.  ``Cargo.toml`` counts only
        when it contains ``[workspace]`` and ``package.json`` only when it
        is a JSON object with a ``workspaces`` key.
        """
        markers: list[str] = [
            name
            for name in ("pnpm-workspace.yaml", "go.work", "turbo.json", "lerna.json", "nx.json")
            if (root / name).exists()
        ]

        cargo = root / "Cargo.toml"
        if cargo.exists():
            try:
                content = cargo.read_text(encoding="utf-8", errors="replace")
            except OSError:
                content = ""
            if "[workspace]" in content:
                markers.append("Cargo.toml[workspace]")

        pkg = root / "package.json"
        if pkg.exists():
            data = self._load_json_object(pkg)
            if data is not None and "workspaces" in data:
                markers.append("package.json[workspaces]")

        return markers

    # ------------------------------------------------------------------
    # Workspace pattern extraction from config files
    # ------------------------------------------------------------------

    def _read_workspace_patterns(self, root: Path, markers: list[str]) -> list[str]:
        """Collect package glob patterns from every detected config file.

        The result is de-duplicated and keeps first-seen order.
        """
        readers = (
            ("pnpm-workspace.yaml", self._patterns_from_pnpm),
            ("package.json[workspaces]", self._patterns_from_npm_workspaces),
            ("nx.json", self._patterns_from_nx),
            ("lerna.json", self._patterns_from_lerna),
            ("Cargo.toml[workspace]", self._patterns_from_cargo_workspace),
            ("go.work", self._patterns_from_go_work),
        )
        patterns: list[str] = []
        for marker, reader in readers:
            if marker in markers:
                patterns.extend(reader(root))
        return list(dict.fromkeys(patterns))

    def _patterns_from_pnpm(self, root: Path) -> list[str]:
        """Return the include patterns listed under ``packages:``.

        This is a line-based reader, not a YAML parser: it understands a
        top-level ``packages:`` key followed by ``- item`` lines or by an
        inline ``[a, b]`` list.  Other sections, comments and ``!`` negation
        entries are ignored.
        """
        try:
            content = (root / "pnpm-workspace.yaml").read_text(
                encoding="utf-8", errors="replace"
            )
        except OSError:
            return []

        patterns: list[str] = []
        in_packages = False
        for line in content.splitlines():
            stripped = line.strip()
            if not stripped or stripped.startswith("#"):
                continue

            is_item = stripped.startswith("-")
            if not is_item and not line[0].isspace():
                # A new top-level key starts (or ends) the packages section.
                key, _, rest = stripped.partition(":")
                in_packages = key.strip().strip("'\"") == "packages"
                flow = rest.strip()
                if in_packages and flow.startswith("["):
                    raw_items = flow[1:].split("]", 1)[0].split(",")
                else:
                    continue
            elif in_packages and is_item:
                raw_items = [stripped[1:]]
            else:
                continue

            for raw in raw_items:
                value = self._clean_yaml_scalar(raw)
                if value and not value.startswith("!"):
                    patterns.append(value)
        return patterns

    def _patterns_from_npm_workspaces(self, root: Path) -> list[str]:
        """Return ``workspaces`` patterns from ``package.json``.

        Accepts both the list form and the ``{"packages": [...]}`` form.
        """
        data = self._load_json_object(root / "package.json")
        if data is None:
            return []
        ws = data.get("workspaces", [])
        if isinstance(ws, dict):
            ws = ws.get("packages", [])
        return [str(p) for p in ws] if isinstance(ws, list) else []

    def _patterns_from_nx(self, root: Path) -> list[str]:
        """Return ``<appsDir>/*`` and ``<libsDir>/*`` from ``nx.json``."""
        data = self._load_json_object(root / "nx.json")
        if data is None:
            return []
        layout = data.get("workspaceLayout", {})
        if not isinstance(layout, dict):
            return []
        return [f"{layout[key]}/*" for key in ("appsDir", "libsDir") if key in layout]

    def _patterns_from_lerna(self, root: Path) -> list[str]:
        """Return ``packages`` from ``lerna.json`` (default ``packages/*``)."""
        data = self._load_json_object(root / "lerna.json")
        if data is None:
            return []
        pkgs = data.get("packages", ["packages/*"])
        return [str(p) for p in pkgs] if isinstance(pkgs, list) else []

    def _patterns_from_cargo_workspace(self, root: Path) -> list[str]:
        """Return the entries of ``members`` / ``default-members`` arrays.

        Line-based reader: every quoted string between the key's ``=`` and
        the closing ``]`` is a member, whether or not it contains a ``/``.
        The root crate (``.``) is skipped.
        """
        try:
            content = (root / "Cargo.toml").read_text(encoding="utf-8", errors="replace")
        except OSError:
            return []

        patterns: list[str] = []
        in_members = False
        for line in content.splitlines():
            text = line.strip()
            if not text or text.startswith("#"):
                continue
            if not in_members:
                key, eq, value = text.partition("=")
                if not eq or key.strip().split(".")[-1] not in ("members", "default-members"):
                    continue
                in_members = True
                text = value

            quoted, outside = self._split_quoted(text)
            patterns.extend(
                member.strip() for member in quoted
                if member.strip() and member.strip() != "."
            )
            # Only a bracket outside quotes and comments closes the array.
            if "]" in outside.split("#", 1)[0]:
                in_members = False
        return patterns

    def _patterns_from_go_work(self, root: Path) -> list[str]:
        """Return module directories named by ``use`` directives in go.work.

        Handles ``use ./dir`` and ``./dir`` lines inside a ``use ( ... )``
        block.  ``//`` comments, a leading ``./`` and trailing ``/`` are
        removed; the root module (``.``) and empty targets are skipped.
        """
        try:
            content = (root / "go.work").read_text(encoding="utf-8", errors="replace")
        except OSError:
            return []

        patterns: list[str] = []
        for line in content.splitlines():
            stripped = line.split("//", 1)[0].strip()
            if stripped.startswith("use "):
                target = stripped[4:].strip().strip("()").strip()
            elif stripped.startswith("./"):
                target = stripped
            else:
                continue
            target = target.strip('"').removeprefix("./").rstrip("/")
            if target and target != ".":
                patterns.append(target)
        return patterns

    # ------------------------------------------------------------------
    # Source root discovery
    # ------------------------------------------------------------------

    def _find_source_roots(
        self,
        root: Path,
        root_children: list[Path],
        workspace_patterns: list[str],
        is_monorepo: bool,
    ) -> "list[SourceRoot]":
        """Identify directories that contain actual source code.

        Roots are collected in four passes; a path is recorded only the
        first time it is seen:
            1. packages matched by *workspace_patterns*;
            2. packages inside known container directories;
            3. top-level directories with a conventional source name;
            4. container directories that hold source files themselves.

        *is_monorepo* is accepted for interface compatibility and is not
        used by the current logic.
        """
        result: list[SourceRoot] = []
        seen: set[str] = set()

        def _add(path_str: str, reason: str, priority: float) -> None:
            if path_str not in seen:
                seen.add(path_str)
                result.append(SourceRoot(
                    path=path_str, signal="high", reason=reason, priority=priority
                ))

        def _rel_posix(path: Path) -> Optional[str]:
            """Return *path* relative to root with forward slashes, or None."""
            try:
                return str(path.relative_to(root)).replace("\\", "/")
            except ValueError:
                return None

        def _add_package(
            pkg_dir: Path,
            nested_reason: str,
            nested_priority: float,
            flat_reason: str,
            flat_priority: float,
        ) -> None:
            """Record pkg_dir's src/lib/source dirs, or pkg_dir itself."""
            if not pkg_dir.is_dir() or pkg_dir.is_symlink():
                return
            rel_str = _rel_posix(pkg_dir)
            if rel_str is None or not self._is_allowed_path(rel_str):
                return

            found_src = False
            for src_name in ("src", "lib", "source"):
                src_dir = pkg_dir / src_name
                if src_dir.is_dir() and not src_dir.is_symlink():
                    _add(f"{rel_str}/{src_name}", nested_reason, nested_priority)
                    found_src = True

            if not found_src and self._has_source_signal(pkg_dir):
                _add(rel_str, flat_reason, flat_priority)

        # 1. Resolve workspace glob patterns -> packages -> src/
        for pattern in workspace_patterns:
            try:
                for pkg_dir in sorted(root.glob(pattern)):
                    _add_package(
                        pkg_dir,
                        f"workspace:{pattern}", 0.92,
                        f"workspace_flat:{pattern}", 0.72,
                    )
            except Exception:
                # Deliberately broad: patterns come from user config files
                # and a pattern that glob rejects must not abort
                # classification.
                continue

        # 2. Known workspace container dirs, even without explicit patterns
        for child in root_children:
            name = child.name
            if name not in _WORKSPACE_CONTAINERS:
                continue
            try:
                for pkg_dir in sorted(child.iterdir()):
                    _add_package(
                        pkg_dir,
                        f"container:{name}", 0.88,
                        f"container_flat:{name}", 0.68,
                    )
            except OSError:
                continue

        # 3. Top-level source dirs
        for child in root_children:
            name = child.name
            if name in _SOURCE_DIRS and name not in _GENERATED_DIRS:
                rel_str = _rel_posix(child)
                if rel_str is not None:
                    _add(rel_str, "top_level_source", 0.95)

        # 4. Workspace containers that hold source files at their own root
        for child in root_children:
            name = child.name
            if name in _WORKSPACE_CONTAINERS and name not in _GENERATED_DIRS:
                rel_str = _rel_posix(child)
                if rel_str is None:
                    continue
                if rel_str not in seen and self._has_source_signal(child):
                    _add(rel_str, f"workspace_container_source:{name}", 0.55)

        return result

    def _has_source_signal(self, directory: Path) -> bool:
        """Return True if *directory* directly holds a manifest or source file.

        Only the directory's own entries are inspected.  A directory that
        cannot be listed counts as having no signal unless a manifest was
        already found by name.
        """
        if any((directory / name).exists() for name in _PACKAGE_MANIFESTS):
            return True
        try:
            for entry in directory.iterdir():
                if entry.is_file() and entry.suffix.lower() in _SOURCE_EXTENSIONS:
                    return True
                if entry.name in _PACKAGE_MANIFESTS:
                    return True
        except PermissionError:
            pass
        return False

    def _is_allowed_path(self, rel_str: str) -> bool:
        """Return True when no component of *rel_str* is a generated dir."""
        return all(part not in _GENERATED_DIRS for part in rel_str.split("/"))

    # ------------------------------------------------------------------
    # Low-signal root discovery
    # ------------------------------------------------------------------

    def _find_low_signal_roots(
        self,
        root: Path,
        root_children: list[Path],
        source_roots: "list[SourceRoot]",
    ) -> "list[SourceRoot]":
        """Identify root-level directories with low signal value.

        A child is low-signal when its name is in ``_LOW_SIGNAL_DIRS`` or
        starts with a dot.  Children that are generated dirs, or whose name
        is the first component of any source root, are skipped.
        """
        top_source_names = {sr.path.split("/")[0] for sr in source_roots}
        low_signal: list[SourceRoot] = []

        for child in root_children:
            name = child.name
            if name in top_source_names or name in _GENERATED_DIRS:
                continue
            try:
                rel_str = str(child.relative_to(root)).replace("\\", "/")
            except ValueError:
                continue

            if name in _LOW_SIGNAL_DIRS:
                low_signal.append(SourceRoot(
                    path=rel_str, signal="low",
                    reason=f"low_signal:{name}", priority=0.15,
                ))
            elif name.startswith("."):
                low_signal.append(SourceRoot(
                    path=rel_str, signal="low",
                    reason="hidden_dir", priority=0.05,
                ))

        return low_signal

    # ------------------------------------------------------------------
    # Generated root discovery
    # ------------------------------------------------------------------

    def _find_generated_roots(
        self,
        root: Path,
        root_children: list[Path],
    ) -> "list[SourceRoot]":
        """Identify root-level directories named in ``_GENERATED_DIRS``."""
        return [
            SourceRoot(
                path=child.name, signal="excluded",
                reason=f"generated:{child.name}", priority=0.0,
            )
            for child in root_children
            if child.name in _GENERATED_DIRS
        ]

    # ------------------------------------------------------------------
    # Budget and confidence
    # ------------------------------------------------------------------

    def _compute_confidence(self, topology: "RepoTopology", is_monorepo: bool) -> float:
        """Return a confidence score based on how many source roots exist.

        5+ roots -> 0.95, 2-4 -> 0.85, exactly 1 -> 0.75 (monorepo) or
        0.80 (otherwise), none -> 0.30.
        """
        source_count = len(topology.source_roots)
        if source_count >= 5:
            return 0.95
        if source_count >= 2:
            return 0.85
        if source_count >= 1:
            return 0.75 if is_monorepo else 0.80
        return 0.30

    def _compute_budget(self, topology: "RepoTopology") -> "ScanBudget":
        """Return the scan budget for the topology's workspace type.

        Monorepos get ``base_depth`` 4; everything else gets 6.  The other
        limits are the same for both.
        """
        base_depth = 4 if topology.workspace_type == "monorepo" else 6
        return ScanBudget(
            max_files=2000,
            base_depth=base_depth,
            source_depth=8,
            low_signal_depth=2,
        )
sourcecode/schema.py CHANGED
@@ -34,6 +34,7 @@ class AnalysisMetadata:
34
34
  sourcecode_version: str = field(default_factory=_sourcecode_version)
35
35
  analyzed_path: str = ""
36
36
  analyzer_fingerprints: dict[str, str] = field(default_factory=dict)
37
+ traversal_topology: Optional[dict[str, Any]] = None
37
38
 
38
39
 
39
40
  @dataclass
sourcecode/serializer.py CHANGED
@@ -927,6 +927,9 @@ def _contract_view_minimal(
927
927
  "project": project,
928
928
  }
929
929
 
930
+ if sm.metadata.traversal_topology:
931
+ result["traversal"] = sm.metadata.traversal_topology
932
+
930
933
  # Per-file contracts
931
934
  if contracts:
932
935
  serialized: list[dict[str, Any]] = []
@@ -1170,6 +1173,8 @@ def _contract_view_standard(
1170
1173
  ],
1171
1174
  "entry_points": ep_groups["production"],
1172
1175
  }
1176
+ if sm.metadata.traversal_topology:
1177
+ result["traversal"] = sm.metadata.traversal_topology
1173
1178
  if ep_groups["development"]:
1174
1179
  result["development_entry_points"] = ep_groups["development"]
1175
1180
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: sourcecode
3
- Version: 0.37.0
3
+ Version: 0.38.0
4
4
  Summary: Deterministic codebase context for AI coding agents
5
5
  License: Apache License
6
6
  Version 2.0, January 2004
@@ -1,9 +1,10 @@
1
- sourcecode/__init__.py,sha256=Xha8jq1XWD5Ze_B5mEne-d5fOfBVVwnX-Ieg7spvalk,103
1
+ sourcecode/__init__.py,sha256=RjrfBH06OIJiq-xk4Hadj8Zl3Soer5r1Ct1ogF0xqaU,103
2
+ sourcecode/adaptive_scanner.py,sha256=6dh34C2qZXyRbw-8xBhbEwDdXanM6CRFRWayVoYITnA,10190
2
3
  sourcecode/architecture_analyzer.py,sha256=H6noGgVArUJ25z1qC0fFA0KvJJeHZYyhKvKSkOyWHUk,23096
3
4
  sourcecode/architecture_summary.py,sha256=rSY5MRiaz4N1YdG0pqDTDuFjSN7PO_Zplx-dtNzv2Yo,19985
4
5
  sourcecode/ast_extractor.py,sha256=0OHQwTUBBc9lmqPLryVeB1z8dGIC6NhLlar800CD9oI,41129
5
6
  sourcecode/classifier.py,sha256=GKTMN8qKZX7ponSwDJfN08RrasI4CVpq1_gFBgEopps,7093
6
- sourcecode/cli.py,sha256=LSGytpRlyMFdmHugrP3USDhPb0hiigHn0PL9Ppac3R4,64852
7
+ sourcecode/cli.py,sha256=dJ0kkwC0pQ4LJyhjlbtHKSpD-TvRQQyhdhvjRCHPA8o,65280
7
8
  sourcecode/code_notes_analyzer.py,sha256=rRd8bFYV0krjlxxQV0wenwE9K7pVpUQSR7KvSvUQKw4,9226
8
9
  sourcecode/confidence_analyzer.py,sha256=HxJMPLI5ulqtkncnv98W4iVO6yMbpQo87VuxiuNbDmY,12167
9
10
  sourcecode/context_summarizer.py,sha256=CiQrfBEzun949bWvmLabWoj2HhPn6Lw62ofqnsy0FlQ,6503
@@ -21,11 +22,12 @@ sourcecode/metrics_analyzer.py,sha256=e2cFwB9XubFq_dIVsP2PLjpr4wX0N6ulb3ol3sGDUe
21
22
  sourcecode/prepare_context.py,sha256=vxEzr8czS3MFbdTx4hBJQlJLrl9cuvbHdL3ZokxFkvo,31384
22
23
  sourcecode/redactor.py,sha256=xuGcadGEHaPw4qZXlMDvzMCsr4VOkdp3oBQptHyJk8c,2884
23
24
  sourcecode/relevance_scorer.py,sha256=ea7_7AHVgahVEWK3ebKOpG67agzG_pGICu5f2KgzrIA,8133
25
+ sourcecode/repo_classifier.py,sha256=FG1vaWKdWXsWdl-S8hjVMiTqcwgaRXkDyvK4rPcOGtQ,22681
24
26
  sourcecode/runtime_classifier.py,sha256=zWX3r3HCKHc-qtIobErOa8aKMmaoPYREtJKvPcBGPjQ,14792
25
27
  sourcecode/scanner.py,sha256=aM3h9-DCQ3xKpeHpHYdo2vX6T5P95HA_YwZbkAVNwmo,8288
26
- sourcecode/schema.py,sha256=AShu_bcP30TYaw4Dl1nYy8aFnBCKxrUli3LhU3MZTjs,20739
28
+ sourcecode/schema.py,sha256=dVA-3EbHBakHLkgeZF-LfjKClEFRgPZkzblXpDTshFA,20796
27
29
  sourcecode/semantic_analyzer.py,sha256=asQfJf-EhzYaOTA-iMuZsrVXtbW7SV2WEKCxgsxa88Y,79413
28
- sourcecode/serializer.py,sha256=uQGcytdaaM3qzxXcZ2NMjXYvzdvT9PP45960t-Thgqk,51128
30
+ sourcecode/serializer.py,sha256=qJRJV_z-T_wU615KMA1ez5IIeV3wcexh29lY4-fcgjs,51329
29
31
  sourcecode/summarizer.py,sha256=ZuzIdm3t8A-d5MuQL0TSNLrd-L0IQIuguIxeNXMNJf8,16070
30
32
  sourcecode/tree_utils.py,sha256=Fj9OIuUksBvgibNd3feog0sMDjVypJzPexp5lvMoYWI,1424
31
33
  sourcecode/workspace.py,sha256=fQlVoNx8S-fSHpKoJ0JBvEHCFkxszH0KZVJed1i3TRk,6845
@@ -56,8 +58,8 @@ sourcecode/telemetry/consent.py,sha256=wLMvGNJeSSyZoNkQXpoUioY6mMv4Qdvuw7S9jAEWn
56
58
  sourcecode/telemetry/events.py,sha256=oEvvulfsv5GIDWG2174gSS6tNB95w38AIYiYeifGKlE,2294
57
59
  sourcecode/telemetry/filters.py,sha256=Asa71oRl7q3Wt_FMwuufIZJFzSYdgRNKS8LHCIyFeYE,4805
58
60
  sourcecode/telemetry/transport.py,sha256=KJeIPCPWMdmbCP3ySGs2iUlia34U6vWne2dZsUezesw,1560
59
- sourcecode-0.37.0.dist-info/METADATA,sha256=75XE0yybH_O7U8rxcP6ZY2MdvibRaxALg4io5V9RsU4,25209
60
- sourcecode-0.37.0.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
61
- sourcecode-0.37.0.dist-info/entry_points.txt,sha256=ex3F9rmbXeyDIoFQHtkEqTsKSaJow8F0LrVu8XfIktQ,57
62
- sourcecode-0.37.0.dist-info/licenses/LICENSE,sha256=7DdHrU9Z_3e7dSvq4ISijZNjnuHo5NIHNiHDouMQ9JU,10491
63
- sourcecode-0.37.0.dist-info/RECORD,,
61
+ sourcecode-0.38.0.dist-info/METADATA,sha256=-RJ8bdDTHeuGmWN-iNo4eYkjPTuSnfriYYD1O59Gmwc,25209
62
+ sourcecode-0.38.0.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
63
+ sourcecode-0.38.0.dist-info/entry_points.txt,sha256=ex3F9rmbXeyDIoFQHtkEqTsKSaJow8F0LrVu8XfIktQ,57
64
+ sourcecode-0.38.0.dist-info/licenses/LICENSE,sha256=7DdHrU9Z_3e7dSvq4ISijZNjnuHo5NIHNiHDouMQ9JU,10491
65
+ sourcecode-0.38.0.dist-info/RECORD,,