sourcecode 0.37.0__tar.gz → 0.38.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sourcecode-0.37.0 → sourcecode-0.38.0}/PKG-INFO +1 -1
- {sourcecode-0.37.0 → sourcecode-0.38.0}/pyproject.toml +1 -1
- {sourcecode-0.37.0 → sourcecode-0.38.0}/src/sourcecode/__init__.py +1 -1
- sourcecode-0.38.0/src/sourcecode/adaptive_scanner.py +258 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/src/sourcecode/cli.py +17 -10
- sourcecode-0.38.0/src/sourcecode/repo_classifier.py +570 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/src/sourcecode/schema.py +1 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/src/sourcecode/serializer.py +5 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/.agents/skills/source-command-gsd-join-discord/SKILL.md +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/.agents/skills/source-command-gsd-review-backlog/SKILL.md +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/.agents/skills/source-command-gsd-workstreams/SKILL.md +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/.gitignore +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/.ruff.toml +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/CONTRIBUTING.md +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/LICENSE +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/README.md +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/SECURITY.md +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/docs/privacy.md +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/docs/schema.md +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/raw +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/src/sourcecode/architecture_analyzer.py +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/src/sourcecode/architecture_summary.py +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/src/sourcecode/ast_extractor.py +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/src/sourcecode/classifier.py +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/src/sourcecode/code_notes_analyzer.py +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/src/sourcecode/confidence_analyzer.py +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/src/sourcecode/context_summarizer.py +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/src/sourcecode/contract_model.py +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/src/sourcecode/contract_pipeline.py +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/src/sourcecode/coverage_parser.py +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/src/sourcecode/dependency_analyzer.py +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/src/sourcecode/detectors/__init__.py +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/src/sourcecode/detectors/base.py +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/src/sourcecode/detectors/csproj_parser.py +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/src/sourcecode/detectors/dart.py +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/src/sourcecode/detectors/dotnet.py +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/src/sourcecode/detectors/elixir.py +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/src/sourcecode/detectors/go.py +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/src/sourcecode/detectors/heuristic.py +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/src/sourcecode/detectors/hybrid.py +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/src/sourcecode/detectors/java.py +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/src/sourcecode/detectors/jvm_ext.py +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/src/sourcecode/detectors/nodejs.py +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/src/sourcecode/detectors/parsers.py +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/src/sourcecode/detectors/php.py +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/src/sourcecode/detectors/project.py +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/src/sourcecode/detectors/python.py +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/src/sourcecode/detectors/ruby.py +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/src/sourcecode/detectors/rust.py +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/src/sourcecode/detectors/systems.py +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/src/sourcecode/detectors/terraform.py +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/src/sourcecode/detectors/tooling.py +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/src/sourcecode/doc_analyzer.py +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/src/sourcecode/entrypoint_classifier.py +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/src/sourcecode/env_analyzer.py +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/src/sourcecode/file_classifier.py +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/src/sourcecode/git_analyzer.py +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/src/sourcecode/graph_analyzer.py +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/src/sourcecode/metrics_analyzer.py +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/src/sourcecode/prepare_context.py +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/src/sourcecode/redactor.py +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/src/sourcecode/relevance_scorer.py +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/src/sourcecode/runtime_classifier.py +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/src/sourcecode/scanner.py +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/src/sourcecode/semantic_analyzer.py +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/src/sourcecode/summarizer.py +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/src/sourcecode/telemetry/__init__.py +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/src/sourcecode/telemetry/config.py +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/src/sourcecode/telemetry/consent.py +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/src/sourcecode/telemetry/events.py +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/src/sourcecode/telemetry/filters.py +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/src/sourcecode/telemetry/transport.py +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/src/sourcecode/tree_utils.py +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/src/sourcecode/workspace.py +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/tests/__init__.py +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/tests/conftest.py +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/tests/fixtures/coverage.xml +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/tests/fixtures/fastapi_app/pyproject.toml +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/tests/fixtures/fastapi_app/src/main.py +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/tests/fixtures/go_service/cmd/api/main.go +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/tests/fixtures/go_service/go.mod +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/tests/fixtures/jacoco.xml +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/tests/fixtures/lcov.info +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/tests/fixtures/nextjs_app/app/page.tsx +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/tests/fixtures/nextjs_app/package.json +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/tests/fixtures/nextjs_app/pnpm-lock.yaml +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/tests/fixtures/pnpm_monorepo/apps/web/app/page.tsx +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/tests/fixtures/pnpm_monorepo/apps/web/package.json +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/tests/fixtures/pnpm_monorepo/packages/api/main.py +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/tests/fixtures/pnpm_monorepo/packages/api/pyproject.toml +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/tests/fixtures/pnpm_monorepo/pnpm-workspace.yaml +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/tests/test_architecture_analyzer.py +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/tests/test_architecture_summary.py +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/tests/test_ast_extractor.py +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/tests/test_classifier.py +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/tests/test_cli.py +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/tests/test_code_notes_analyzer.py +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/tests/test_contract_pipeline.py +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/tests/test_coverage_parser.py +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/tests/test_cross_consistency.py +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/tests/test_dependency_analyzer_node_python.py +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/tests/test_dependency_analyzer_polyglot.py +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/tests/test_dependency_schema.py +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/tests/test_detector_dotnet.py +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/tests/test_detector_go_rust_java.py +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/tests/test_detector_nodejs.py +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/tests/test_detector_php_ruby_dart.py +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/tests/test_detector_python.py +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/tests/test_detector_universal_managed.py +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/tests/test_detector_universal_systems.py +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/tests/test_detectors_base.py +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/tests/test_doc_analyzer_jsdom.py +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/tests/test_doc_analyzer_python.py +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/tests/test_graph_analyzer_polyglot.py +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/tests/test_graph_analyzer_python_node.py +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/tests/test_graph_schema.py +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/tests/test_hybrid_inference.py +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/tests/test_integration.py +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/tests/test_integration_dependencies.py +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/tests/test_integration_detection.py +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/tests/test_integration_docs.py +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/tests/test_integration_graph_modules.py +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/tests/test_integration_lqn.py +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/tests/test_integration_metrics.py +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/tests/test_integration_multistack.py +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/tests/test_integration_semantics.py +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/tests/test_integration_universal.py +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/tests/test_metrics_analyzer.py +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/tests/test_packaging.py +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/tests/test_phase1_improvements.py +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/tests/test_pipeline_integrity.py +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/tests/test_real_projects.py +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/tests/test_redactor.py +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/tests/test_scanner.py +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/tests/test_schema.py +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/tests/test_schema_normalization.py +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/tests/test_semantic_analyzer_node.py +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/tests/test_semantic_analyzer_python.py +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/tests/test_semantic_import_resolution.py +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/tests/test_semantic_schema.py +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/tests/test_signal_hierarchy.py +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/tests/test_summarizer.py +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/tests/test_telemetry.py +0 -0
- {sourcecode-0.37.0 → sourcecode-0.38.0}/tests/test_workspace_analyzer.py +0 -0
|
@@ -0,0 +1,258 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
"""Adaptive file tree scanner with topology-aware depth budgets.
|
|
4
|
+
|
|
5
|
+
Replaces pure depth filtering with relevance-oriented traversal:
|
|
6
|
+
- Source roots (packages/*/src, apps/*/src) get deep scan budgets.
|
|
7
|
+
- Low-signal directories (docs/, benchmarks/) are limited to 2 levels.
|
|
8
|
+
- Generated/excluded directories (dist/, node_modules/) are skipped.
|
|
9
|
+
- Unclassified directories fall back to the base depth limit.
|
|
10
|
+
|
|
11
|
+
Drop-in replacement for FileScanner: same scan_tree() and find_manifests()
|
|
12
|
+
interface, same output format (None = file, dict = directory).
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
import os
|
|
16
|
+
from pathlib import Path
|
|
17
|
+
from typing import Any, Optional, cast
|
|
18
|
+
|
|
19
|
+
from pathspec import GitIgnoreSpec
|
|
20
|
+
|
|
21
|
+
from sourcecode.repo_classifier import RepoTopology
|
|
22
|
+
from sourcecode.scanner import DEFAULT_EXCLUDES, MANIFEST_NAMES
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class AdaptiveScanner:
    """File tree scanner driven by repository topology.

    When *topology* is provided, traversal depth is controlled per-directory:
    directories inside source roots receive a deep budget; low-signal dirs
    are restricted; generated dirs are excluded entirely.

    When *topology* is None, every directory uses *base_depth* (intended to
    mirror FileScanner).

    The tree returned by :meth:`scan_tree` maps names to ``None`` for files
    and to a nested ``dict`` for directories.
    """

    def __init__(
        self,
        root: Path,
        topology: Optional[RepoTopology] = None,
        base_depth: int = 4,
        extra_excludes: Optional[frozenset[str]] = None,
    ) -> None:
        """Prepare the scanner and pre-compute topology lookup tables.

        Args:
            root: Directory to scan; stored resolved.
            topology: Optional repository classification. Its
                ``source_roots``, ``low_signal_roots`` and ``generated_roots``
                entries are read through their ``path`` attribute
                ("/"-separated, relative to *root*), and its ``scan_budget``
                supplies ``source_depth`` / ``low_signal_depth``.
            base_depth: Depth limit for directories the topology does not
                classify.
            extra_excludes: Additional directory names to skip at any depth.
        """
        self.root = root.resolve()
        self.topology = topology
        self.base_depth = base_depth
        self._excludes = DEFAULT_EXCLUDES | (extra_excludes or frozenset())
        self._gitignore_spec: Optional[GitIgnoreSpec] = None

        # Lookup tables consulted on every directory visited.
        #
        # Each prefix entry is (path_parts, max_absolute_depth) where
        # max_absolute_depth = len(path_parts) + budget. Depth is counted
        # from the repo root, not from the classified directory: a directory
        # whose depth is >= max_absolute_depth is neither listed nor entered.
        self._source_prefixes: list[tuple[tuple[str, ...], int]] = []
        self._low_signal_prefixes: list[tuple[tuple[str, ...], int]] = []
        # Single-component generated roots: excluded by *name* at any depth.
        self._extra_exclude_names: frozenset[str] = frozenset()
        # Multi-component generated roots: excluded by exact relative path.
        self._generated_paths: frozenset[tuple[str, ...]] = frozenset()

        if topology is None:
            return

        budget = topology.scan_budget

        for sr in topology.source_roots:
            parts = self._split_path(sr.path)
            if parts:
                self._source_prefixes.append(
                    (parts, len(parts) + budget.source_depth)
                )

        for lr in topology.low_signal_roots:
            parts = self._split_path(lr.path)
            if parts:
                self._low_signal_prefixes.append(
                    (parts, len(parts) + budget.low_signal_depth)
                )

        # Generated roots are parsed the same way as the other roots so that
        # a trailing slash ("dist/") still counts as a top-level directory.
        top_level_names: set[str] = set()
        nested_paths: set[tuple[str, ...]] = set()
        for gr in topology.generated_roots:
            parts = self._split_path(gr.path)
            if len(parts) == 1:
                top_level_names.add(parts[0])
            elif parts:
                nested_paths.add(parts)
        self._extra_exclude_names = frozenset(top_level_names)
        self._generated_paths = frozenset(nested_paths)

    @staticmethod
    def _split_path(path: str) -> tuple[str, ...]:
        """Split a "/"-separated relative path into its non-empty components."""
        return tuple(p for p in path.split("/") if p)

    # ------------------------------------------------------------------
    # Gitignore
    # ------------------------------------------------------------------

    def _load_gitignore_spec(self) -> GitIgnoreSpec:
        """Return the spec built from ``<root>/.gitignore``, loading it once.

        A missing or unreadable ``.gitignore`` yields a spec with no patterns.
        """
        if self._gitignore_spec is None:
            lines: list[str] = []
            try:
                lines = (
                    (self.root / ".gitignore")
                    .read_text(encoding="utf-8", errors="replace")
                    .splitlines()
                )
            except OSError:
                # Best effort: absent/unreadable file means "ignore nothing".
                pass
            self._gitignore_spec = GitIgnoreSpec.from_lines(lines)
        return self._gitignore_spec

    def _is_excluded_by_gitignore(self, rel_path: str, is_dir: bool) -> bool:
        """Tell whether *rel_path* (relative to root) matches the gitignore spec.

        Directories are matched with a trailing "/" so directory-only
        patterns apply to them.
        """
        spec = self._load_gitignore_spec()
        path_to_match = rel_path + "/" if is_dir else rel_path
        return spec.match_file(path_to_match)

    # ------------------------------------------------------------------
    # Depth budget computation — the core of adaptive traversal
    # ------------------------------------------------------------------

    def _compute_max_depth(self, rel_parts: tuple[str, ...]) -> int:
        """Return the maximum absolute depth allowed at *rel_parts*.

        Depth is the number of path components from the repo root. The
        scan stops (dirnames cleared, files skipped) at a directory whose
        depth is >= the returned value.

        Priority order:
          1. Inside a source root, or an ancestor of one → the largest
             budget among all matching source roots. Ancestors never get
             less than *base_depth*. Taking the maximum keeps the result
             independent of the order of the source roots and guarantees
             that nested source roots stay reachable.
          2. Inside a low-signal root → that root's restricted budget
             (first matching entry).
          3. Default → *base_depth*.
        """
        if not self._source_prefixes and not self._low_signal_prefixes:
            return self.base_depth

        current_depth = len(rel_parts)

        best: Optional[int] = None
        for src_parts, src_max in self._source_prefixes:
            n = len(src_parts)
            if current_depth >= n:
                # At or inside this source root?
                if rel_parts[:n] != src_parts:
                    continue
                candidate = src_max
            elif src_parts[:current_depth] == rel_parts:
                # Ancestor of this source root: must stay traversable.
                candidate = max(src_max, self.base_depth)
            else:
                continue
            if best is None or candidate > best:
                best = candidate

        if best is not None:
            return best

        # Low-signal roots (only when no source root claims this path).
        for ls_parts, ls_max in self._low_signal_prefixes:
            n = len(ls_parts)
            if current_depth >= n and rel_parts[:n] == ls_parts:
                return ls_max

        return self.base_depth

    # ------------------------------------------------------------------
    # Main traversal
    # ------------------------------------------------------------------

    def scan_tree(self) -> dict[str, Any]:
        """Build the nested file tree dictionary.

        Returns a dict where None = file (D-02) and dict = directory (D-01).
        Depth limits are applied per-directory using topology-derived
        budgets. Symlinks, excluded names, generated paths, gitignored
        entries and file names starting with "-" are left out.
        """
        self._load_gitignore_spec()
        root_tree: dict[str, Any] = {}
        all_excludes = self._excludes | self._extra_exclude_names
        generated_paths = self._generated_paths

        for dirpath, dirnames, filenames in os.walk(self.root, followlinks=False):
            current = Path(dirpath)
            try:
                rel = current.relative_to(self.root)
            except ValueError:
                continue

            rel_parts = rel.parts
            depth = len(rel_parts)

            if depth >= self._compute_max_depth(rel_parts):
                # Past the budget: do not list files and do not descend.
                dirnames.clear()
                continue

            # Filter dirnames in-place (critical: slice assignment is what
            # makes os.walk skip the removed directories).
            dirnames[:] = [
                d
                for d in dirnames
                if d not in all_excludes
                and (*rel_parts, d) not in generated_paths
                and not (current / d).is_symlink()
                and not self._is_excluded_by_gitignore(
                    (rel / d).as_posix(), is_dir=True
                )
            ]

            node = self._get_or_create_node(root_tree, rel_parts)

            for fname in filenames:
                # Skip flag-shaped names (shell redirect artifacts)
                if fname.startswith("-"):
                    continue
                if (current / fname).is_symlink():
                    continue
                if self._is_excluded_by_gitignore(
                    (rel / fname).as_posix(), is_dir=False
                ):
                    continue
                node[fname] = None  # D-02: None = file

            # Ensure accepted subdirs exist as dict nodes, even if the walk
            # later stops at them because of the depth budget.
            for d in dirnames:
                if d not in node:
                    node[d] = {}

        return root_tree

    def _get_or_create_node(
        self, tree: dict[str, Any], parts: tuple[str, ...]
    ) -> dict[str, Any]:
        """Return the directory node at *parts*, creating missing levels.

        An entry currently recorded as a file (``None``) is replaced by an
        empty directory node.
        """
        node = tree
        for part in parts:
            if part not in node or node[part] is None:
                node[part] = {}
            node = cast(dict[str, Any], node[part])
        return node

    # ------------------------------------------------------------------
    # Manifest discovery — same interface as FileScanner
    # ------------------------------------------------------------------

    def find_manifests(self) -> list[str]:
        """Find manifest files at depth 0-1.

        Returns the paths (as strings) of every MANIFEST_NAMES entry found
        directly in the root, followed by those found in each immediate
        child directory that is not a symlink, not in the exclude set and
        not hidden (name starting with "."). Symlinked manifests are
        skipped. A PermissionError while listing children ends the search
        and returns what was collected so far.
        """
        manifests: list[str] = []
        for name in MANIFEST_NAMES:
            candidate = self.root / name
            if candidate.exists() and not candidate.is_symlink():
                manifests.append(str(candidate))
        try:
            for child in self.root.iterdir():
                if (
                    child.is_dir()
                    and not child.is_symlink()
                    and child.name not in self._excludes
                    and not child.name.startswith(".")
                ):
                    for name in MANIFEST_NAMES:
                        candidate = child / name
                        if candidate.exists() and not candidate.is_symlink():
                            manifests.append(str(candidate))
        except PermissionError:
            pass
        return manifests
|
|
@@ -714,6 +714,13 @@ def main(
|
|
|
714
714
|
# 1. Scan directory (SCAN-01 to SCAN-05)
|
|
715
715
|
redactor = SecretRedactor(enabled=not no_redact)
|
|
716
716
|
|
|
717
|
+
# Classify repository topology before scanning. This is a shallow
|
|
718
|
+
# filesystem read (depth 0-1 only) and completes in milliseconds.
|
|
719
|
+
# The topology drives per-directory depth budgets in AdaptiveScanner.
|
|
720
|
+
from sourcecode.adaptive_scanner import AdaptiveScanner
|
|
721
|
+
from sourcecode.repo_classifier import RepoClassifier
|
|
722
|
+
_topology = RepoClassifier().classify(target)
|
|
723
|
+
|
|
717
724
|
# Detect manifests before scan to adjust depth.
|
|
718
725
|
# find_manifests() only looks at depth 0-1, does not need the full tree.
|
|
719
726
|
_pre_scanner = FileScanner(target, max_depth=1)
|
|
@@ -735,7 +742,7 @@ def main(
|
|
|
735
742
|
no_tree = True # agents never need the raw file tree
|
|
736
743
|
typer.echo("[agent] dependencies env-map code-notes (no-tree)", err=True)
|
|
737
744
|
|
|
738
|
-
scanner =
|
|
745
|
+
scanner = AdaptiveScanner(target, topology=_topology, base_depth=effective_depth)
|
|
739
746
|
raw_tree = scanner.scan_tree()
|
|
740
747
|
|
|
741
748
|
# 2. Filter .env and *.secret entries from file tree (SEC-02, all levels)
|
|
@@ -775,16 +782,14 @@ def main(
|
|
|
775
782
|
detector = ProjectDetector(build_default_detectors())
|
|
776
783
|
workspace_analysis = WorkspaceAnalyzer().analyze(target, manifests)
|
|
777
784
|
|
|
778
|
-
#
|
|
779
|
-
#
|
|
780
|
-
# Only emit to TTY to avoid contaminating piped/CI output; agents read analysis_gaps.
|
|
785
|
+
# Adaptive traversal handles monorepo source root discovery automatically.
|
|
786
|
+
# Emit a diagnostic when topology confidence is low so users know why.
|
|
781
787
|
import sys as _sys
|
|
782
|
-
if
|
|
788
|
+
if _topology.workspace_type == "monorepo" and _topology.confidence < 0.5:
|
|
783
789
|
if _sys.stderr.isatty():
|
|
784
790
|
typer.echo(
|
|
785
|
-
|
|
786
|
-
"
|
|
787
|
-
"Use --depth 6 or higher for full coverage.",
|
|
791
|
+
"[traversal] monorepo detected but source root confidence is low "
|
|
792
|
+
f"({_topology.confidence:.0%}). Use --depth 8 or higher if files are missing.",
|
|
788
793
|
err=True,
|
|
789
794
|
)
|
|
790
795
|
|
|
@@ -896,7 +901,8 @@ def main(
|
|
|
896
901
|
workspace_root = target / workspace.path
|
|
897
902
|
if not workspace_root.exists() or not workspace_root.is_dir():
|
|
898
903
|
continue
|
|
899
|
-
|
|
904
|
+
_ws_topology = RepoClassifier().classify(workspace_root)
|
|
905
|
+
workspace_scanner = AdaptiveScanner(workspace_root, topology=_ws_topology, base_depth=depth)
|
|
900
906
|
workspace_tree = filter_sensitive_files(workspace_scanner.scan_tree())
|
|
901
907
|
workspace_manifests = workspace_scanner.find_manifests()
|
|
902
908
|
workspace_stacks, workspace_entry_points, _ = detector.detect(
|
|
@@ -1008,6 +1014,7 @@ def main(
|
|
|
1008
1014
|
metadata = AnalysisMetadata(
|
|
1009
1015
|
analyzed_path=str(target),
|
|
1010
1016
|
analyzer_fingerprints=_fingerprints,
|
|
1017
|
+
traversal_topology=_topology.as_dict(),
|
|
1011
1018
|
)
|
|
1012
1019
|
sm = SourceMap(
|
|
1013
1020
|
metadata=metadata,
|
|
@@ -1037,7 +1044,7 @@ def main(
|
|
|
1037
1044
|
target / ws.path,
|
|
1038
1045
|
(
|
|
1039
1046
|
filter_sensitive_files(
|
|
1040
|
-
|
|
1047
|
+
AdaptiveScanner(target / ws.path, base_depth=depth).scan_tree()
|
|
1041
1048
|
)
|
|
1042
1049
|
),
|
|
1043
1050
|
workspace=ws.path,
|