PyPI - sourcecode - Versions diffs - 0.42.0__py3-none-any.whl → 0.44.0__py3-none-any.whl - Mend

sourcecode 0.42.0py3-none-any.whl → 0.44.0py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13) hide show

sourcecode/__init__.py +1 -1
sourcecode/cli.py +30 -0
sourcecode/context_scorer.py +404 -0
sourcecode/contract_model.py +1 -0
sourcecode/contract_pipeline.py +59 -25
sourcecode/prepare_context.py +27 -1
sourcecode/ranking_engine.py +29 -7
sourcecode/serializer.py +49 -5
{sourcecode-0.42.0.dist-info → sourcecode-0.44.0.dist-info}/METADATA +1 -1
{sourcecode-0.42.0.dist-info → sourcecode-0.44.0.dist-info}/RECORD +13 -12
{sourcecode-0.42.0.dist-info → sourcecode-0.44.0.dist-info}/WHEEL +0 -0
{sourcecode-0.42.0.dist-info → sourcecode-0.44.0.dist-info}/entry_points.txt +0 -0
{sourcecode-0.42.0.dist-info → sourcecode-0.44.0.dist-info}/licenses/LICENSE +0 -0

sourcecode/__init__.py CHANGED Viewed

@@ -1,3 +1,3 @@
 """sourcecode — Deterministic codebase context maps for AI coding agents."""
-__version__ = "0.42.0"
+__version__ = "0.44.0"

sourcecode/cli.py CHANGED Viewed

@@ -181,6 +181,7 @@ _OPTIONS_WITH_VALUE: frozenset[str] = frozenset({
     "--dependency-depth",
     "--rank-by",
     "--symbol",
+    "--max-importers",
 })
@@ -594,6 +595,17 @@ def main(
         "--symbol",
         help="Contract mode: extract localized context for a specific symbol name. Returns defining file + all importers.",
     ),
+    max_importers: int = typer.Option(
+        50,
+        "--max-importers",
+        help=(
+            "Maximum importer files returned by --symbol (default: 50). "
+            "Popular symbols can have hundreds of importers — this prevents output explosion. "
+            "Defining files are never truncated. Override: --symbol Foo --max-importers 200."
+        ),
+        min=1,
+        max=10000,
+    ),
     copy: bool = typer.Option(
         False,
         "--copy",
@@ -770,6 +782,21 @@ def main(
         code_notes = True
         no_tree = True  # agents never need the raw file tree
         typer.echo("[agent] dependencies env-map code-notes (no-tree)", err=True)
+        # Warn about flags that are computed but excluded from agent_view output
+        _agent_suppressed: list[str] = []
+        if full_metrics:
+            _agent_suppressed.append("--full-metrics")
+        if graph_modules:
+            _agent_suppressed.append("--graph-modules")
+        if docs:
+            _agent_suppressed.append("--docs")
+        if _agent_suppressed:
+            typer.echo(
+                f"[agent] warning: {', '.join(_agent_suppressed)} computed but excluded "
+                "from --agent output — agent_view does not include these sections. "
+                "Remove these flags to skip unnecessary computation.",
+                err=True,
+            )
     scanner = AdaptiveScanner(target, topology=_topology, base_depth=effective_depth)
     raw_tree = scanner.scan_tree()
@@ -1343,6 +1370,9 @@ def main(
             changed_only=changed_only,
             symbol=symbol,
             compress_types=compress_types,
+            max_importers=max_importers,
+            semantic_calls=sm.semantic_calls or None,
+            code_notes=sm.code_notes or None,
         )
         sm = _replace(sm, file_contracts=_contracts, contract_summary=_contract_summary)
         if symbol is not None and len(_contracts) == 0:

sourcecode/context_scorer.py ADDED Viewed

@@ -0,0 +1,404 @@
+"""context_scorer.py — Unified node scoring and minimum-sufficient subgraph selection.
+Aggregates all available signals (structural, semantic, git, annotations, proximity)
+into a NodeScore per file, then uses greedy selection to produce the minimum-sufficient
+subgraph that maximises explanatory value within a context budget.
+Design invariants:
+  - Deterministic: sort key is always (-score, path). Path breaks all ties.
+  - No LLMs, no randomness, no external I/O.
+  - All signals optional: degrades gracefully when data is absent.
+  - SCORER_VERSION: bump on any formula change so callers can detect drift.
+"""
+from __future__ import annotations
+from collections import Counter, deque
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any, Optional
+# Version tag of the scoring formula. Per the module docstring it must be bumped
+# on any formula change so callers can detect drift.
+SCORER_VERSION = "1"
+# ---------------------------------------------------------------------------
+# Edge weight tables
+# ---------------------------------------------------------------------------
+# Base weight per edge kind. ContextScorer.edge_weight() looks kinds up here and
+# falls back to 0.50 for any kind that is not listed.
+_EDGE_BASE_WEIGHTS: dict[str, float] = {
+    "imports":  1.00,  # structural dependency — strongest signal
+    "extends":  0.90,  # inheritance / implementation — tight coupling
+    "calls":    0.80,  # behavioral dependency
+    "contains": 0.30,  # membership — low marginal information
+}
+# Multiplier per confidence label. Used by ContextScorer.edge_weight() (fallback
+# 0.50 for unknown labels) and by _semantic_centrality() (fallback 0.7, i.e. the
+# "medium" value, for missing or unknown labels).
+_CONFIDENCE_MULT: dict[str, float] = {
+    "high":   1.0,
+    "medium": 0.7,
+    "low":    0.3,
+}
+# Annotation kinds weighted at 2× (actionable defects vs informational notes).
+# _annotation_density() upper-cases the note kind before testing membership, so
+# the entries here must stay upper-case.
+_HIGH_SEVERITY_NOTES: frozenset[str] = frozenset({"BUG", "FIXME", "HACK", "XXX"})
+# ---------------------------------------------------------------------------
+# Data model
+# ---------------------------------------------------------------------------
+@dataclass
+class NodeScore:
+    """Unified scoring breakdown for a single file node.
+    One instance is produced per contract path by ContextScorer.score_nodes().
+    score / display_score drive all ranking and selection decisions:
+    select_subgraph() reads `score`, while callers that need a bounded value
+    for output read `display_score`.
+    The component fields (structural, semantic, annotation, proximity) allow
+    callers to inspect which signals dominated the final score. They are
+    informational: `score` is not recomputed from them after construction.
+    """
+    path: str             # file path, also the key in the score_nodes() result
+    score: float          # final weighted score (higher = more relevant); not clamped
+    display_score: float  # clamped [0.0, 1.0] for output fields
+    structural: float     # contribution from RankingEngine (its score before proximity is added)
+    semantic: float       # call graph centrality [0.0, 1.0]
+    annotation: float     # code note density [0.0, 1.0]
+    proximity: float      # BFS closeness to focus [0.0, 1.0]; 0.0 when no focus or unreachable
+    reasons: list[str]    # human-readable explanations collected while scoring
+# ---------------------------------------------------------------------------
+# Core scorer
+# ---------------------------------------------------------------------------
+class ContextScorer:
+    """Unified file scoring and minimum-sufficient subgraph selection.
+    Wraps a RankingEngine and layers call-graph centrality, code-note counts
+    and focus proximity on top of the engine's structural score.
+    Stateless once constructed: the only attribute is the engine created in
+    __init__, and no method assigns to self afterwards.
+    """
+    def __init__(
+        self,
+        monorepo_packages: Optional[list] = None,
+    ) -> None:
+        """Create the underlying RankingEngine.
+        monorepo_packages is forwarded to RankingEngine; None is treated as [].
+        """
+        # Imported lazily so importing this module does not pull in the engine.
+        from sourcecode.ranking_engine import RankingEngine
+        self._engine = RankingEngine(monorepo_packages or [])
+    def score_nodes(
+        self,
+        contracts: list[Any],
+        *,
+        semantic_calls: Optional[list] = None,
+        git_hotspots: Optional[dict[str, int]] = None,
+        code_notes: Optional[list] = None,
+        focus_path: Optional[str] = None,
+        task: str = "default",
+    ) -> dict[str, NodeScore]:
+        """Compute a NodeScore for every contract.
+        Parameters
+        ----------
+        contracts       FileContract list. fan_in, fan_out, is_entrypoint,
+                        is_changed, and exports must be set before calling.
+        semantic_calls  list[CallRecord] from --semantics (optional).
+        git_hotspots    {path: commit_count} from git analysis (optional).
+        code_notes      list[CodeNote] from --code-notes (optional).
+        focus_path      Anchor file for proximity BFS (optional).
+        task            Task profile: fix-bug | refactor | explain | …
+                        Unknown names fall back to the "default" profile.
+        Returns
+        -------
+        dict mapping path → NodeScore for every contract path.
+        """
+        from sourcecode.ranking_engine import TASK_WEIGHTS
+        w = TASK_WEIGHTS.get(task, TASK_WEIGHTS["default"])
+        _hotspots = git_hotspots or {}
+        # `or 1` keeps both normalisers non-zero when every observed value is 0,
+        # the same guard max_semantic already has below.
+        max_fan_in = max((c.fan_in for c in contracts), default=1) or 1
+        max_churn = max(_hotspots.values(), default=1) or 1
+        # Pre-compute optional signal maps
+        sem_centrality: dict[str, float] = {}
+        if semantic_calls:
+            sem_centrality = _semantic_centrality(semantic_calls, contracts)
+        max_semantic = max(sem_centrality.values(), default=1.0) or 1.0
+        ann_density: dict[str, float] = {}
+        note_counts: Counter[str] = Counter()
+        if code_notes:
+            ann_density = _annotation_density(code_notes, contracts)
+            # Raw per-file note counts for the engine's code_note_count input.
+            # Previously the notes only filled NodeScore.annotation and never
+            # influenced the score.
+            # NOTE(review): confirm the engine expects a plain per-file count.
+            note_counts = Counter(getattr(n, "path", None) for n in code_notes)
+        prox_scores: dict[str, float] = {}
+        if focus_path:
+            prox_scores = _proximity_bfs(focus_path, contracts, semantic_calls or [])
+        result: dict[str, NodeScore] = {}
+        for c in contracts:
+            sem = sem_centrality.get(c.path, 0.0)
+            ann = ann_density.get(c.path, 0.0)
+            prox = prox_scores.get(c.path, 0.0)
+            # Structural + git + annotation + semantic centrality via unified engine
+            fs = self._engine.score(
+                c.path,
+                fan_in=c.fan_in,
+                fan_out=c.fan_out,
+                max_fan_in=max_fan_in,
+                git_churn=_hotspots.get(c.path, 0),
+                max_churn=max_churn,
+                is_entrypoint=c.is_entrypoint,
+                is_changed=c.is_changed,
+                code_note_count=note_counts.get(c.path, 0),
+                export_count=len(c.exports),
+                task=task,
+                semantic_centrality=sem,
+                max_semantic=max_semantic,
+            )
+            # Proximity is a graph operation, computed here and added on top
+            prox_contrib = prox * 0.50 * w.proximity
+            final = fs.score + prox_contrib
+            reasons = list(fs.reasons)
+            # Only explain proximity when it actually moved the score.
+            if prox >= 0.80 and prox_contrib > 0:
+                reasons.append("close to focus")
+            elif prox >= 0.50 and prox_contrib > 0:
+                reasons.append("near focus")
+            result[c.path] = NodeScore(
+                path=c.path,
+                score=final,
+                display_score=max(0.0, min(1.0, final)),
+                structural=fs.score,
+                semantic=sem,
+                annotation=ann,
+                proximity=prox,
+                reasons=reasons,
+            )
+        return result
+    def select_subgraph(
+        self,
+        node_scores: dict[str, NodeScore],
+        contracts: list[Any],
+        *,
+        budget: int = 30,
+        min_score: float = 0.05,
+    ) -> list[str]:
+        """Greedy minimum-sufficient subgraph selection with diversity re-ranking.
+        At each round, recomputes effective scores for all remaining candidates
+        (raw_score × (1 - redundancy_penalty)), then picks the highest. This
+        allows a file from a new directory to beat a clustered sibling even if
+        the sibling has a higher raw score — the selection actively prefers
+        coverage over concentration.
+        Stops when the budget is exhausted or no remaining candidate has an
+        effective score of at least min_score.
+        Cost: every round scans all remaining candidates, and the penalty helper
+        scans the already-selected set, so the worst case is O(n × budget²) —
+        negligible for typical budgets (15-30) and file counts.
+        Deterministic: tie-break by path on every round.
+        Parameters
+        ----------
+        node_scores  output of score_nodes()
+        contracts    same FileContract list passed to score_nodes()
+                     (used for directory-based redundancy; may be empty)
+        budget       maximum number of nodes to select; <= 0 selects nothing
+        min_score    discard candidates whose raw or effective score is below this
+        Returns
+        -------
+        Selected paths in pick order (best effective score first per round).
+        """
+        contract_map = {c.path: c for c in contracts}
+        # A candidate whose raw score is below min_score is skipped in every
+        # round, so drop it once instead of re-testing it each time.
+        remaining: dict[str, NodeScore] = {
+            path: ns for path, ns in node_scores.items() if ns.score >= min_score
+        }
+        selected: list[str] = []
+        selected_set: set[str] = set()
+        while len(selected) < budget and remaining:
+            best_path: str | None = None
+            # -inf rather than -1.0: a fixed sentinel would hide candidates whose
+            # effective score is <= -1 when callers pass a negative min_score.
+            best_effective: float = float("-inf")
+            for path, ns in remaining.items():
+                penalty = _redundancy_penalty(path, selected_set, contract_map)
+                effective = ns.score * (1.0 - penalty)
+                # Strict tie-break by path ensures determinism
+                if (
+                    best_path is None
+                    or effective > best_effective
+                    or (effective == best_effective and path < best_path)
+                ):
+                    best_effective = effective
+                    best_path = path
+            if best_path is None or best_effective < min_score:
+                break
+            selected.append(best_path)
+            selected_set.add(best_path)
+            del remaining[best_path]
+        return selected
+    @staticmethod
+    def edge_weight(kind: str, confidence: str) -> float:
+        """Scalar weight for a graph edge based on relationship type and confidence.
+        Higher weight = stronger information dependency between the connected nodes.
+        Unknown kinds and unknown confidence labels each contribute a factor of 0.50.
+        """
+        base = _EDGE_BASE_WEIGHTS.get(kind, 0.50)
+        mult = _CONFIDENCE_MULT.get(confidence, 0.50)
+        return base * mult
+# ---------------------------------------------------------------------------
+# Signal computers (module-level, pure functions)
+# ---------------------------------------------------------------------------
+def _semantic_centrality(
+    semantic_calls: list,
+    contracts: list,
+) -> dict[str, float]:
+    """Score each contract path by its position in the call graph.
+    Every call record adds its confidence multiplier (high=1.0, medium=0.7,
+    low=0.3; missing or unknown labels count as 0.7) to the callee's inbound
+    total and to the caller's outbound total. Endpoints that are not contract
+    paths are ignored.
+    raw(path) = inbound × 2 + outbound, then every value is divided by the
+    largest raw value so the result lies in [0.0, 1.0].
+    Returns a dict with one entry per contract path; all zeros when no call
+    touches a known path.
+    """
+    known = {c.path for c in contracts}
+    inbound: dict[str, float] = {}
+    outbound: dict[str, float] = {}
+    for record in semantic_calls:
+        label = getattr(record, "confidence", "medium")
+        weight = _CONFIDENCE_MULT.get(label, 0.7)
+        target = getattr(record, "callee_path", None)
+        origin = getattr(record, "caller_path", None)
+        if target and target in known:
+            inbound[target] = inbound.get(target, 0) + weight
+        if origin and origin in known:
+            outbound[origin] = outbound.get(origin, 0) + weight
+    # Being called counts double compared with calling out.
+    raw = {p: inbound.get(p, 0) * 2.0 + outbound.get(p, 0) for p in known}
+    peak = max(raw.values(), default=0.0)
+    if peak <= 0.0:
+        return dict.fromkeys(known, 0.0)
+    return {p: value / peak for p, value in raw.items()}
+def _proximity_bfs(
+    focus_path: str,
+    contracts: list,
+    semantic_calls: list,
+) -> dict[str, float]:
+    """Breadth-first closeness of every reachable contract to focus_path.
+    The graph is undirected: a relative import or a call between two contract
+    paths links them in both directions. Imports whose source does not start
+    with "." are ignored.
+    proximity(path) = 1.0 / (2 ** distance)
+      distance=0 → 1.00 (the focus itself)
+      distance=1 → 0.50
+      distance=2 → 0.25
+      distance=3 → 0.125
+      distance=4 → 0.0625  (max depth)
+    Paths further than 4 hops away, or not connected at all, are absent from
+    the result. Returns {} when focus_path is not a contract path.
+    Neighbours are visited in sorted order so the result is deterministic.
+    """
+    known = {c.path for c in contracts}
+    graph: dict[str, set[str]] = {p: set() for p in known}
+    def _link(a: str, b: str) -> None:
+        # Record an undirected edge.
+        graph[a].add(b)
+        graph[b].add(a)
+    # Edges from relative imports
+    for contract in contracts:
+        here = str(Path(contract.path).parent).replace("\\", "/")
+        for entry in contract.imports:
+            spec = getattr(entry, "source", "")
+            if spec.startswith("."):
+                for target in _resolve_import(here, spec, known):
+                    _link(contract.path, target)
+    # Edges from the call graph (both endpoints must be contract paths)
+    for record in semantic_calls:
+        caller = getattr(record, "caller_path", None)
+        callee = getattr(record, "callee_path", None)
+        if caller in graph and callee in graph:
+            _link(caller, callee)
+    if focus_path not in graph:
+        return {}
+    # Expand one ring at a time; ring k holds the nodes at distance k.
+    hops: dict[str, int] = {focus_path: 0}
+    ring = [focus_path]
+    for depth in range(1, 5):
+        next_ring: list[str] = []
+        for node in ring:
+            for peer in sorted(graph[node]):
+                if peer not in hops:
+                    hops[peer] = depth
+                    next_ring.append(peer)
+        if not next_ring:
+            break
+        ring = next_ring
+    return {p: 1.0 / (2 ** d) for p, d in hops.items()}
+def _annotation_density(
+    code_notes: list,
+    contracts: list,
+) -> dict[str, float]:
+    """Relative code-note load per contract path, in [0.0, 1.0].
+    Each note adds to its file's total: 2.0 for BUG / FIXME / HACK / XXX
+    (kind compared upper-cased), 1.0 for any other kind. Notes whose path is
+    not a contract path are ignored. Totals are divided by the largest total,
+    so the most annotated file scores 1.0 and files without notes score 0.0.
+    """
+    known = {c.path for c in contracts}
+    totals: dict[str, float] = {}
+    for note in code_notes:
+        where = getattr(note, "path", None)
+        if where in known:
+            label = getattr(note, "kind", "").upper()
+            bump = 2.0 if label in _HIGH_SEVERITY_NOTES else 1.0
+            totals[where] = totals.get(where, 0) + bump
+    # With no matching notes the divisor is 1.0, which leaves every file at 0.0.
+    ceiling = max(totals.values(), default=1.0)
+    return {p: min(totals.get(p, 0.0) / ceiling, 1.0) for p in known}
+def _redundancy_penalty(
+    path: str,
+    selected_set: set[str],
+    contract_map: dict,
+) -> float:
+    """Discount factor for picking yet another file from an already-covered directory.
+    Files that share a directory tend to address the same concern, so each
+    already-selected file in path's directory adds 0.10 to the penalty. The
+    penalty is capped at 0.40, so a candidate always keeps at least 60% of its
+    raw score. Returns 0.0 when nothing has been selected yet.
+    contract_map is accepted for interface compatibility and is not consulted.
+    """
+    if not selected_set:
+        return 0.0
+    own_dir = str(Path(path).parent)
+    siblings = [s for s in selected_set if str(Path(s).parent) == own_dir]
+    return min(len(siblings) * 0.10, 0.40)
+def _resolve_import(base_dir: str, src: str, path_set: set[str]) -> list[str]:
+    """Approximate resolution of a relative import specifier to known paths.
+    Written to mirror contract_pipeline._resolve_relative without importing
+    from that module (avoids circular import).
+    Two specifier shapes are understood:
+      - path style ("./x", "../lib/x"): "." and ".." segments are collapsed
+        against base_dir, so "../x" resolves in the parent directory;
+      - dotted style (".x", "..x"): N leading dots climb N-1 directories,
+        i.e. one dot is the importing file's own directory.
+    The resolved stem is tried with each known suffix, then as-is.
+    Parameters
+    ----------
+    base_dir  directory of the importing file, "/"-separated ("." for root).
+    src       import specifier as written in the source file.
+    path_set  all known file paths; only members of this set are returned.
+    Returns
+    -------
+    A one-element list with the first matching path, or [] when nothing
+    matches or the specifier has no name part (e.g. "." or "../").
+    """
+    # Function-scope import keeps this helper self-contained.
+    import posixpath
+    if not src.lstrip("./"):
+        return []
+    if src.startswith(("./", "../")):
+        rel = src
+    else:
+        # Dotted form: translate the extra leading dots into "../" hops.
+        name = src.lstrip(".")
+        climb = max(len(src) - len(name) - 1, 0)
+        rel = "../" * climb + name
+    # normpath collapses "./" and "../" and also drops a leading "./" that a
+    # root-level base_dir of "." would otherwise leave in every candidate.
+    stem = posixpath.normpath(posixpath.join(base_dir, rel))
+    exts = (".ts", ".tsx", ".js", ".jsx", ".py", "/index.ts", "/index.js", "/index.tsx")
+    for ext in exts:
+        candidate = f"{stem}{ext}"
+        if candidate in path_set:
+            return [candidate]
+    # Specifier may already carry its extension (e.g. "./data.json").
+    if stem in path_set:
+        return [stem]
+    return []

sourcecode/contract_model.py CHANGED Viewed

@@ -109,3 +109,4 @@ class ContractSummary:
     method_breakdown: dict[str, int] = field(default_factory=dict)
     ranked_by: str = "relevance"
     limitations: list[str] = field(default_factory=list)
+    symbol_truncation: Optional[dict] = None  # set when --symbol truncates importers

sourcecode/contract_pipeline.py CHANGED Viewed

@@ -175,6 +175,9 @@ class ContractPipeline:
         changed_only: bool = False,
         symbol: Optional[str] = None,
         compress_types: bool = False,
+        max_importers: int = 50,
+        semantic_calls: Optional[list] = None,
+        code_notes: Optional[list] = None,
     ) -> tuple[list[FileContract], ContractSummary]:
         """Run the full extraction pipeline.
@@ -256,40 +259,42 @@ class ContractPipeline:
         if rank_by == "git-churn":
             churn = _get_git_churn(root, [c.path for c in contracts])
-        # 6. Compute relevance scores via unified ranking engine
-        max_fan_in = max((c.fan_in for c in contracts), default=1) if contracts else 1
-        max_churn_val = max(churn.values(), default=1) if churn else 1
+        # 6. Compute relevance scores via unified scoring engine.
+        # ContextScorer wraps RankingEngine and enriches scores with semantic
+        # centrality (when semantic_calls available) and annotation density
+        # (when code_notes available). Falls back to structural signals only
+        # when neither is present — identical to the old behaviour.
+        from sourcecode.context_scorer import ContextScorer
+        _ctx_scorer = ContextScorer(monorepo_packages)
+        _node_scores = _ctx_scorer.score_nodes(
+            contracts,
+            semantic_calls=semantic_calls,
+            code_notes=code_notes,
+            git_hotspots=churn,
+            task="default",
+        )
         for c in contracts:
-            fs = engine.score(
-                c.path,
-                fan_in=c.fan_in,
-                fan_out=c.fan_out,
-                max_fan_in=max_fan_in,
-                git_churn=churn.get(c.path, 0),
-                max_churn=max_churn_val,
-                is_entrypoint=c.is_entrypoint,
-                is_changed=c.is_changed,
-                export_count=len(c.exports),
-                task="default",
-            )
-            c.relevance_score = fs.display_score
-            c.ranking_reasons = fs.reasons
+            ns = _node_scores[c.path]
+            c.relevance_score = ns.display_score
+            c.ranking_reasons = ns.reasons
         # 7. Rank
         contracts = self._rank(contracts, rank_by)
         # 8. Symbol filter — keep files that define or import the symbol
+        _symbol_truncation: Optional[dict] = None
         if symbol:
-            contracts = _filter_by_symbol(contracts, symbol)
+            contracts, _symbol_truncation = _filter_by_symbol(contracts, symbol, max_importers=max_importers)
             # When shallow scan missed the defining file (deep monorepo), fall back
             # to a grep-based filesystem search over the full directory tree.
             if not contracts:
-                contracts = self._symbol_deep_scan(
+                contracts, _symbol_truncation = self._symbol_deep_scan(
                     root, symbol,
                     known_paths=set(src_paths),
                     entry_paths=entry_paths,
                     changed_files=changed_files,
                     engine=engine,
+                    max_importers=max_importers,
                 )
         # 9. Entrypoints-only filter
@@ -313,6 +318,7 @@ class ContractPipeline:
             method_breakdown=dict(method_counts),
             ranked_by=rank_by,
             limitations=limitations,
+            symbol_truncation=_symbol_truncation,
         )
         return contracts, summary
@@ -332,7 +338,8 @@ class ContractPipeline:
         entry_paths: set[str],
         changed_files: set[str],
         engine: RankingEngine,
-    ) -> list[FileContract]:
+        max_importers: int = 50,
+    ) -> tuple[list[FileContract], dict]:
         """Grep-based fallback when the shallow scan missed the defining files.
         Searches the full directory tree for source files containing *symbol*,
@@ -356,7 +363,7 @@ class ContractPipeline:
             contract.ranking_reasons = fs.reasons
             extra.append(contract)
-        return _filter_by_symbol(extra, symbol)
+        return _filter_by_symbol(extra, symbol, max_importers=max_importers)
 # ---------------------------------------------------------------------------
@@ -412,7 +419,11 @@ def _limit_symbols(contracts: list[FileContract], max_symbols: int) -> list[File
 # Symbol-aware filter
 # ---------------------------------------------------------------------------
-def _filter_by_symbol(contracts: list[FileContract], symbol: str) -> list[FileContract]:
+def _filter_by_symbol(
+    contracts: list[FileContract],
+    symbol: str,
+    max_importers: int = 50,
+) -> tuple[list[FileContract], dict]:
     """Return contracts that define, import, or structurally reference *symbol*.
     Four tiers applied in order:
@@ -423,6 +434,8 @@ def _filter_by_symbol(contracts: list[FileContract], symbol: str) -> list[FileCo
        function signatures (word-boundary). Only used when tiers 1-3 fail.
     Defining contracts are ranked first; importers and references follow.
+    max_importers caps tier 3 results to prevent output explosion on popular symbols.
+    Returns (contracts, truncation_metadata).
     """
     sym_l = symbol.lower()
     word_re = re.compile(
@@ -466,8 +479,14 @@ def _filter_by_symbol(contracts: list[FileContract], symbol: str) -> list[FileCo
     # Tier 3: import matching (case-insensitive when no definers found)
     ci_imports = len(defining) == 0
-    importer_paths = {c.path for c in contracts if _imports_sym(c, case=ci_imports)}
-    importers = [c for c in contracts if c.path in importer_paths and c.path not in defining_paths]
+    all_importer_paths = {c.path for c in contracts if _imports_sym(c, case=ci_imports)}
+    all_importers = [c for c in contracts if c.path in all_importer_paths and c.path not in defining_paths]
+    # Apply importer cap — definers are never truncated
+    total_importers = len(all_importers)
+    truncated = total_importers > max_importers
+    importers = all_importers[:max_importers] if truncated else all_importers
+    importer_paths = {c.path for c in importers}
     # Tier 4: type-reference matching (only when tiers 1-3 yield nothing)
     references: list[FileContract] = []
@@ -483,12 +502,27 @@ def _filter_by_symbol(contracts: list[FileContract], symbol: str) -> list[FileCo
             seen.add(c.path)
             merged.append(c)
-    return sorted(merged, key=lambda c: (
+    result = sorted(merged, key=lambda c: (
         c.path not in defining_paths,
         c.path not in importer_paths,
         -c.relevance_score,
     ))
+    truncation: dict = {
+        "symbol": symbol,
+        "definers_found": len(defining),
+        "importers_found": total_importers,
+        "importers_returned": len(importers),
+        "references_found": len(references),
+        "total_returned": len(result),
+        "truncated": truncated,
+    }
+    if truncated:
+        truncation["truncation_reason"] = "max_importers_limit"
+        truncation["override_hint"] = f"--symbol {symbol} --max-importers {total_importers}"
+    return result, truncation
 # ---------------------------------------------------------------------------
 # Deep symbol scan — grep-based fallback for shallow-scanned repos

sourcecode/prepare_context.py CHANGED Viewed

@@ -701,7 +701,33 @@ class TaskContextBuilder:
         # Deterministic: score desc, then path asc as tiebreaker
         scored.sort(key=lambda x: (-x[0], x[1]))
-        return [f for _, _, f in scored[:15]]
+        # Apply directory-diversity selection via ContextScorer.
+        # Files from the same directory share the same concern; the scorer
+        # applies a small redundancy penalty so the final set spans more of
+        # the codebase rather than clustering inside a single directory.
+        # Falls back to top-15 slice when scorer is unavailable.
+        try:
+            from sourcecode.context_scorer import ContextScorer, NodeScore
+            _ctx = ContextScorer()
+            _ns: dict[str, NodeScore] = {
+                path: NodeScore(
+                    path=path,
+                    score=total,
+                    display_score=min(total / 3.0, 1.0),
+                    structural=total,
+                    semantic=0.0,
+                    annotation=0.0,
+                    proximity=0.0,
+                    reasons=[rf.reason] if rf.reason else ["source file"],
+                )
+                for total, path, rf in scored
+            }
+            _selected = _ctx.select_subgraph(_ns, contracts=[], budget=15, min_score=0.05)
+            _rf_map = {path: rf for _, path, rf in scored}
+            return [_rf_map[p] for p in _selected if p in _rf_map]
+        except Exception:
+            return [f for _, _, f in scored[:15]]
     def _is_test(self, path: str) -> bool:
         name = Path(path).name.lower()

sourcecode/ranking_engine.py CHANGED Viewed

@@ -45,60 +45,71 @@ class TaskWeights:
     code_notes: float = 0.5
     exports: float = 0.3
     is_changed: float = 0.8
+    # Call graph centrality from --semantics (ContextScorer feeds this in)
+    semantic_centrality: float = 0.5
+    # BFS proximity to a focus symbol/file (added by ContextScorer on top)
+    proximity: float = 1.0
 # Task profiles: each emphasizes different signals for different agent goals.
 # The contrast between profiles is intentional — fix-bug and explain must
 # produce meaningfully different ranked sets from the same codebase.
 TASK_WEIGHTS: dict[str, TaskWeights] = {
-    # fix-bug: files with bug annotations, recent churn, actively changed logic
+    # fix-bug: bug annotations, recent churn, changed files, proximity to focus
     "fix-bug": TaskWeights(
         path_relevance=0.5, entrypoint=0.5,
         fan_in=0.8, fan_out=0.3,
         git_churn=1.5, code_notes=3.0,
         exports=0.2, is_changed=2.0,
+        semantic_centrality=0.5, proximity=2.0,
     ),
-    # refactor: highly-coupled files, technical debt, complex hubs
+    # refactor: hub modules, coupling, technical debt, call graph hubs
     "refactor": TaskWeights(
         path_relevance=0.8, entrypoint=0.3,
         fan_in=2.0, fan_out=2.0,
         git_churn=0.3, code_notes=2.0,
         exports=1.0, is_changed=0.3,
+        semantic_centrality=1.5, proximity=0.5,
     ),
-    # explain: stable core, entrypoints, central modules — ignore churn noise
+    # explain: stable core, entrypoints, call graph backbone — ignore churn
     "explain": TaskWeights(
         path_relevance=2.0, entrypoint=3.0,
         fan_in=0.8, fan_out=0.3,
         git_churn=0.0, code_notes=0.0,
         exports=0.5, is_changed=0.0,
+        semantic_centrality=1.0, proximity=0.3,
     ),
-    # onboard: same as explain but also values hub modules
+    # onboard: entrypoints + hub modules + call graph backbone
     "onboard": TaskWeights(
         path_relevance=2.0, entrypoint=3.0,
         fan_in=1.2, fan_out=0.5,
         git_churn=0.0, code_notes=0.0,
         exports=1.0, is_changed=0.0,
+        semantic_centrality=1.2, proximity=0.3,
     ),
-    # generate-tests: source files with large public API, not yet covered
+    # generate-tests: large public API, call graph reachability
     "generate-tests": TaskWeights(
         path_relevance=0.8, entrypoint=0.3,
         fan_in=1.5, fan_out=0.8,
         git_churn=0.5, code_notes=0.5,
         exports=2.5, is_changed=0.5,
+        semantic_centrality=0.8, proximity=0.5,
     ),
-    # review-pr: changed files and their importers
+    # review-pr: changed files, their importers, impact radius
     "review-pr": TaskWeights(
         path_relevance=0.5, entrypoint=0.5,
         fan_in=1.5, fan_out=0.5,
         git_churn=0.5, code_notes=0.8,
         exports=0.3, is_changed=3.0,
+        semantic_centrality=1.0, proximity=1.5,
     ),
-    # delta: changed files and dependency impact
+    # delta: changed files, dependency impact, call graph proximity
     "delta": TaskWeights(
         path_relevance=0.5, entrypoint=0.5,
         fan_in=1.5, fan_out=0.5,
         git_churn=0.5, code_notes=0.5,
         exports=0.3, is_changed=3.0,
+        semantic_centrality=1.0, proximity=1.0,
     ),
     # default: balanced, no task bias
     "default": TaskWeights(),
@@ -139,6 +150,8 @@ class RankingEngine:
         code_note_count: int = 0,
         export_count: int = 0,
         task: str = "default",
+        semantic_centrality: float = 0.0,
+        max_semantic: float = 1.0,
     ) -> FileScore:
         """Compute a scored, explained ranking for a single file.
@@ -203,6 +216,15 @@ class RankingEngine:
             raw += 0.2 * w.is_changed
             reasons.append("uncommitted changes")
+        # 9. Semantic call-graph centrality (fed by ContextScorer from --semantics)
+        if semantic_centrality > 0 and w.semantic_centrality > 0:
+            sc_norm = min(semantic_centrality / max(max_semantic, 1e-9), 1.0)
+            raw += sc_norm * 0.25 * w.semantic_centrality
+            if sc_norm >= 0.60:
+                reasons.append("call graph hub")
+            elif sc_norm >= 0.25:
+                reasons.append("call graph contributor")
         # Monorepo package role
         pkg_role = self._scorer.package_role(norm)
         if pkg_role in _WORKSPACE_CORE_ROLES:

sourcecode/serializer.py CHANGED Viewed

@@ -186,6 +186,21 @@ def _file_relevance(sm: SourceMap, *, limit: int = 15) -> list[dict[str, Any]]:
         git_churn = {h.file: h.commit_count for h in gc.change_hotspots}
     max_churn = max(git_churn.values(), default=1)
+    # Incorporate semantic hotspots from --semantics when available.
+    # Hotspots rank files by call-graph centrality (fan_in×2 + fan_out),
+    # normalised across the analysed files.
+    semantic_hub_scores: dict[str, float] = {}
+    ss = sm.semantic_summary
+    if ss and getattr(ss, "requested", False) and ss.hotspots:
+        max_importance = max(
+            (h.get("importance_score", 0.0) for h in ss.hotspots),
+            default=1.0,
+        ) or 1.0
+        for h in ss.hotspots:
+            p = h.get("path", "")
+            if p:
+                semantic_hub_scores[p] = h.get("importance_score", 0.0) / max_importance
     entry_paths = {ep.path for ep in sm.entry_points}
     scored: list[tuple[float, dict[str, Any]]] = []
@@ -202,7 +217,9 @@ def _file_relevance(sm: SourceMap, *, limit: int = 15) -> list[dict[str, Any]]:
             continue
         content_rel = file_class.relevance if file_class else 0.0
-        combined = fs.score + content_rel
+        # Semantic hub bonus: normalised call-graph centrality adds up to +0.30
+        sem_hub = semantic_hub_scores.get(path, 0.0) * 0.30
+        combined = fs.score + content_rel + sem_hub
         if combined <= 0 and not (file_class and file_class.relevance > 0.3):
             continue
@@ -217,6 +234,8 @@ def _file_relevance(sm: SourceMap, *, limit: int = 15) -> list[dict[str, Any]]:
         }
         ranking_reasons = [r for r in fs.reasons if r != "source file"]
+        if sem_hub >= 0.15:
+            ranking_reasons.append("call graph hub")
         if ranking_reasons:
             item["ranking_reasons"] = ranking_reasons
@@ -722,8 +741,10 @@ def agent_view(sm: SourceMap) -> dict[str, Any]:
     # production runtime is represented as entry_points=[], never by fallback.
     ep_groups = _entry_point_groups(sm.entry_points)
     result["entry_points"] = ep_groups["production"]
-    result["development_entry_points"] = ep_groups["development"]
-    result["auxiliary_entry_points"] = ep_groups["auxiliary"]
+    if ep_groups["development"]:
+        result["development_entry_points"] = ep_groups["development"]
+    if ep_groups["auxiliary"]:
+        result["auxiliary_entry_points"] = ep_groups["auxiliary"]
     # ── 3. Architecture ───────────────────────────────────────────────────────
     result["architecture"] = _architecture_context(sm)
@@ -888,6 +909,23 @@ def agent_view(sm: SourceMap) -> dict[str, Any]:
     if analysis_gaps:
         result["analysis_gaps"] = analysis_gaps
+    # ── 8. Agent mode metadata — explicit transparency about auto-enabled/suppressed flags ──
+    _auto_enabled: list[str] = ["--dependencies", "--env-map", "--code-notes"]
+    _suppressed: list[str] = []
+    if sm.metrics_summary is not None and sm.metrics_summary.requested:
+        _suppressed.append("--full-metrics")
+    if sm.module_graph is not None and sm.module_graph.summary.requested:
+        _suppressed.append("--graph-modules")
+    if sm.doc_summary is not None and sm.doc_summary.requested:
+        _suppressed.append("--docs")
+    agent_mode_meta: dict[str, Any] = {
+        "auto_enabled": _auto_enabled,
+    }
+    if _suppressed:
+        agent_mode_meta["suppressed_flags"] = _suppressed
+        agent_mode_meta["suppressed_note"] = "computed but excluded from agent_view"
+    result["agent_mode"] = agent_mode_meta
     return result
@@ -918,9 +956,11 @@ def standard_view(sm: SourceMap, *, include_tree: bool = False) -> dict[str, Any
         "architecture_summary": sm.architecture_summary,
         "stacks": [asdict(s) for s in sm.stacks],
         "entry_points": ep_groups["production"],
-        "development_entry_points": ep_groups["development"],
-        "auxiliary_entry_points": ep_groups["auxiliary"],
     }
+    if ep_groups["development"]:
+        result["development_entry_points"] = ep_groups["development"]
+    if ep_groups["auxiliary"]:
+        result["auxiliary_entry_points"] = ep_groups["auxiliary"]
     # Layer B — signals (only when the corresponding analyzer ran)
     if sm.dependency_summary is not None and sm.dependency_summary.requested:
@@ -1125,6 +1165,8 @@ def _contract_view_minimal(
             summary["degraded"] = True
             summary["degraded_hint"] = "install sourcecode[ast] for full TS/JS extraction"
         result["summary"] = summary
+        if cs.symbol_truncation:
+            result["symbol_query"] = cs.symbol_truncation
     return result
@@ -1404,6 +1446,8 @@ def _contract_view_standard(
         }
         if cs.limitations:
             result["contract_summary"]["limitations"] = cs.limitations
+        if cs.symbol_truncation:
+            result["symbol_query"] = cs.symbol_truncation
     return result

{sourcecode-0.42.0.dist-info → sourcecode-0.44.0.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sourcecode
-Version: 0.42.0
+Version: 0.44.0
 Summary: Deterministic codebase context for AI coding agents
 License:                                  Apache License
                                    Version 2.0, January 2004

{sourcecode-0.42.0.dist-info → sourcecode-0.44.0.dist-info}/RECORD RENAMED Viewed

@@ -1,15 +1,16 @@
-sourcecode/__init__.py,sha256=K7shxEMemP2ulUio4YBuziIbKkDcIuDkcsLEFth5CwM,103
+sourcecode/__init__.py,sha256=2Lz--0KUPsKNbk6_eT1ko8NRsFtoDHnAHBo9uNtCIDo,103
 sourcecode/adaptive_scanner.py,sha256=6dh34C2qZXyRbw-8xBhbEwDdXanM6CRFRWayVoYITnA,10190
 sourcecode/architecture_analyzer.py,sha256=O4AXc7l_WTzIXrcAzstqZy-TGKNaFa6p3MzpgVjaO8g,27749
 sourcecode/architecture_summary.py,sha256=rSY5MRiaz4N1YdG0pqDTDuFjSN7PO_Zplx-dtNzv2Yo,19985
 sourcecode/ast_extractor.py,sha256=0OHQwTUBBc9lmqPLryVeB1z8dGIC6NhLlar800CD9oI,41129
 sourcecode/classifier.py,sha256=GKTMN8qKZX7ponSwDJfN08RrasI4CVpq1_gFBgEopps,7093
-sourcecode/cli.py,sha256=BBAS66tCeNt48iZrykJZ-H0TpI3zmgrAs6P3H3NSIws,67589
+sourcecode/cli.py,sha256=ON1YG8WzJ0Nb-zmlf65AzAgyCqkL1iiprJo2jwz6Pqk,68885
 sourcecode/code_notes_analyzer.py,sha256=rRd8bFYV0krjlxxQV0wenwE9K7pVpUQSR7KvSvUQKw4,9226
 sourcecode/confidence_analyzer.py,sha256=HxJMPLI5ulqtkncnv98W4iVO6yMbpQo87VuxiuNbDmY,12167
+sourcecode/context_scorer.py,sha256=nhppAo80fblAqcB9Ns0iQd21TZUrl2mQMo_xzPgavRE,14679
 sourcecode/context_summarizer.py,sha256=CiQrfBEzun949bWvmLabWoj2HhPn6Lw62ofqnsy0FlQ,6503
-sourcecode/contract_model.py,sha256=wpYNWGzHAVnyGxniGqNMk96TCmWbVVOqNSc3Kauajrg,3348
-sourcecode/contract_pipeline.py,sha256=af30z1l4LiSOngawYkrpzQC-8huIJOgbQ8EJrq_PDSc,22967
+sourcecode/contract_model.py,sha256=gCf9-Kj0G7l0lvRTAcRfFAfMgs1Rpizv4mKovQLYUkw,3434
+sourcecode/contract_pipeline.py,sha256=C3TJycL7pMRku7HQ5YbNFXxEZywtPQm8YaASRbYjs2g,24454
 sourcecode/coverage_parser.py,sha256=q0LeZJaX1bnntLu-ImksdBsMlpsVmk_iUfSaB4eaJGo,19702
 sourcecode/dependency_analyzer.py,sha256=Exq0BfInvfS5iAg9xAr6WI2uPNuotkIudTKcYJcRhB8,52757
 sourcecode/doc_analyzer.py,sha256=TttdS7mndKQhyJCfJnnAsyGCJrf-TIL7oXxDlTLUFKE,21248
@@ -19,8 +20,8 @@ sourcecode/file_classifier.py,sha256=_KfFIIolharaIxbSTrCkaWauQIqNHCyor_n47RGyDh8
 sourcecode/git_analyzer.py,sha256=PD3eNWydznQ6KLNpxGzBqizIHoPIKevfwz9Xyf_pDt4,11600
 sourcecode/graph_analyzer.py,sha256=hMOsLLz9B0UnQ4xwbHdgr3bFvqpw0bQ8kN-xmEn3Krk,64156
 sourcecode/metrics_analyzer.py,sha256=e2cFwB9XubFq_dIVsP2PLjpr4wX0N6ulb3ol3sGDUeo,20777
-sourcecode/prepare_context.py,sha256=a0_ThVNJ8v98UTrgnrnjacovvCd-2HWJug1scenUtEU,31044
-sourcecode/ranking_engine.py,sha256=XdhzahKGleYNW3N0GqGW9salPOXx2BNp8KqXpaeHHmw,8247
+sourcecode/prepare_context.py,sha256=qmxMvTlteeEGwDaNDRoRj0iGY1D7goVD_yV1MVOeQkM,32261
+sourcecode/ranking_engine.py,sha256=virVglafZufioHpZpwktjMvUiL0TZELWQCQnQNV8dFo,9360
 sourcecode/redactor.py,sha256=xuGcadGEHaPw4qZXlMDvzMCsr4VOkdp3oBQptHyJk8c,2884
 sourcecode/relevance_scorer.py,sha256=E74w7nlsNVobO3LqKHiMtBd84ONwGp8uDpwXJEjRtLA,8330
 sourcecode/repo_classifier.py,sha256=FG1vaWKdWXsWdl-S8hjVMiTqcwgaRXkDyvK4rPcOGtQ,22681
@@ -28,7 +29,7 @@ sourcecode/runtime_classifier.py,sha256=zWX3r3HCKHc-qtIobErOa8aKMmaoPYREtJKvPcBG
 sourcecode/scanner.py,sha256=aM3h9-DCQ3xKpeHpHYdo2vX6T5P95HA_YwZbkAVNwmo,8288
 sourcecode/schema.py,sha256=ofEge9hTWHOTjeWt7ceCDQWzP-uhhenrYX2usjW2KVU,22759
 sourcecode/semantic_analyzer.py,sha256=16EFTgM7ooW0m5gNUKOlTSn7IEMLSzKmzQn-cWaSqjs,82604
-sourcecode/serializer.py,sha256=VUiBxA2w9CqlblXqhHQMXEUvysxTaNljgiATbw6MJ4A,56927
+sourcecode/serializer.py,sha256=uDYSGjNjyrI2Qqvq23dl0owfi7zUVo8bwHBJ2RlGdz8,58975
 sourcecode/summarizer.py,sha256=ZuzIdm3t8A-d5MuQL0TSNLrd-L0IQIuguIxeNXMNJf8,16070
 sourcecode/tree_utils.py,sha256=Fj9OIuUksBvgibNd3feog0sMDjVypJzPexp5lvMoYWI,1424
 sourcecode/workspace.py,sha256=fQlVoNx8S-fSHpKoJ0JBvEHCFkxszH0KZVJed1i3TRk,6845
@@ -59,8 +60,8 @@ sourcecode/telemetry/consent.py,sha256=wLMvGNJeSSyZoNkQXpoUioY6mMv4Qdvuw7S9jAEWn
 sourcecode/telemetry/events.py,sha256=oEvvulfsv5GIDWG2174gSS6tNB95w38AIYiYeifGKlE,2294
 sourcecode/telemetry/filters.py,sha256=Asa71oRl7q3Wt_FMwuufIZJFzSYdgRNKS8LHCIyFeYE,4805
 sourcecode/telemetry/transport.py,sha256=KJeIPCPWMdmbCP3ySGs2iUlia34U6vWne2dZsUezesw,1560
-sourcecode-0.42.0.dist-info/METADATA,sha256=-H--yzWSnQ5wpiUOXDmKirFowuaAGWb-LhUMSLYiTQ8,25209
-sourcecode-0.42.0.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
-sourcecode-0.42.0.dist-info/entry_points.txt,sha256=ex3F9rmbXeyDIoFQHtkEqTsKSaJow8F0LrVu8XfIktQ,57
-sourcecode-0.42.0.dist-info/licenses/LICENSE,sha256=7DdHrU9Z_3e7dSvq4ISijZNjnuHo5NIHNiHDouMQ9JU,10491
-sourcecode-0.42.0.dist-info/RECORD,,
+sourcecode-0.44.0.dist-info/METADATA,sha256=pxp0MxePWfJw419_NjcL3P8C2Rk4yjH9JqIKMyDuVqo,25209
+sourcecode-0.44.0.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
+sourcecode-0.44.0.dist-info/entry_points.txt,sha256=ex3F9rmbXeyDIoFQHtkEqTsKSaJow8F0LrVu8XfIktQ,57
+sourcecode-0.44.0.dist-info/licenses/LICENSE,sha256=7DdHrU9Z_3e7dSvq4ISijZNjnuHo5NIHNiHDouMQ9JU,10491
+sourcecode-0.44.0.dist-info/RECORD,,

{sourcecode-0.42.0.dist-info → sourcecode-0.44.0.dist-info}/WHEEL RENAMED Viewed

File without changes

{sourcecode-0.42.0.dist-info → sourcecode-0.44.0.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{sourcecode-0.42.0.dist-info → sourcecode-0.44.0.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

sourcecode 0.42.0__py3-none-any.whl → 0.44.0__py3-none-any.whl

sourcecode 0.42.0__py3-none-any.whl → 0.44.0__py3-none-any.whl