PyPI - sourcecode - Versions diffs - 0.39.0__tar.gz → 0.42.0__tar.gz - Mend

sourcecode 0.39.0__tar.gz → 0.42.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (147) hide show

{sourcecode-0.39.0 → sourcecode-0.42.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sourcecode
-Version: 0.39.0
+Version: 0.42.0
 Summary: Deterministic codebase context for AI coding agents
 License:                                  Apache License
                                    Version 2.0, January 2004

{sourcecode-0.39.0 → sourcecode-0.42.0}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 [project]
 name = "sourcecode"
-version = "0.39.0"
+version = "0.42.0"
 description = "Deterministic codebase context for AI coding agents"
 readme = "README.md"
 requires-python = ">=3.9"

{sourcecode-0.39.0 → sourcecode-0.42.0}/src/sourcecode/__init__.py RENAMED Viewed

@@ -1,3 +1,3 @@
 """sourcecode — Deterministic codebase context maps for AI coding agents."""
-__version__ = "0.39.0"
+__version__ = "0.42.0"

{sourcecode-0.39.0 → sourcecode-0.42.0}/src/sourcecode/architecture_analyzer.py RENAMED Viewed

@@ -172,6 +172,7 @@ class ArchitectureAnalyzer:
         graph: Optional[ModuleGraph] = None,
     ) -> ArchitectureAnalysis:
         limitations: list[str] = []
+        evidence: list[dict] = []
         # Step 1: filter paths
         filtered = self._filter_paths(sm.file_paths)
@@ -180,6 +181,8 @@ class ArchitectureAnalyzer:
                 requested=True,
                 pattern="unknown",
                 limitations=["Arquitectura no inferida: proyecto sin archivos de codigo suficientes"],
+                evidence=[{"type": "none", "paths": [], "reason": "insufficient source files", "confidence": "high"}],
+                tentative=False,
             )
         # Step 2: domain clustering
@@ -193,17 +196,32 @@ class ArchitectureAnalyzer:
             elif pattern == "unknown":
                 limitations.append("Patron de capas no reconocido: estructura de directorios sin senales claras")
-        # Step 3b: monorepo override — workspace config is hard evidence
-        if self._has_workspace_config(sm.file_paths) and pattern not in (
+        # Step 3b: monorepo override — workspace config is hard evidence.
+        # Overrides all weak inferred patterns; only truly specialised patterns
+        # (cqrs, clean, onion, hexagonal) take precedence over workspace config.
+        has_workspace = self._has_workspace_config(sm.file_paths)
+        if has_workspace and pattern not in (
             "monorepo", "cqrs", "clean", "onion", "hexagonal"
         ):
             mono_layers = self._detect_monorepo_packages(filtered)
-            if mono_layers or pattern in (None, "unknown", "flat", "modular", "layered"):
+            # Override whenever: monorepo packages detected, OR pattern is any weak/generic type.
+            # "fullstack", "layered", "mvc", "microservices", "modular", "flat", "unknown", None
+            # all yield to workspace config evidence.
+            _WEAK_PATTERNS = {None, "unknown", "flat", "modular", "layered",
+                              "fullstack", "mvc", "microservices"}
+            if mono_layers or pattern in _WEAK_PATTERNS:
                 pattern = "monorepo"
                 layers = mono_layers
                 limitations.append(
                     "Workspace config detectado — arquitectura refleja topologia de paquetes"
                 )
+                ws_files = [p for p in sm.file_paths if p.split("/")[-1] in _WORKSPACE_CONFIG_FILES]
+                evidence.append({
+                    "type": "workspace_config",
+                    "paths": ws_files[:4],
+                    "reason": "Monorepo workspace config file(s) detected — hard evidence for monorepo topology",
+                    "confidence": "high",
+                })
         # Step 4: bounded context inference
         bounded_contexts = self._infer_bounded_contexts(domains, graph)
@@ -212,25 +230,91 @@ class ArchitectureAnalyzer:
         confidence: Literal["high", "medium", "low"]
         strong_domains = [d for d in domains if d.confidence in ("high", "medium")]
         all_layers_weak = layers and all(l.confidence == "low" for l in layers)
+        method = "graph+structure" if graph is not None else "filesystem_inference"
+        # High-confidence evidence (workspace config) makes pattern non-tentative.
+        tentative = not any(e.get("confidence") == "high" for e in evidence)
+        # _hard_evidence: high-confidence evidence was already set (e.g. workspace_config).
+        # When True, tentative must stay False and confidence must stay at least "medium".
+        _hard_evidence = not tentative  # tentative=False iff high-conf evidence present
         if pattern not in (None, "unknown", "flat"):
-            if all_layers_weak:
+            if graph is not None:
+                # Import graph provided — structural validation available
+                confidence = "medium" if len(strong_domains) >= 3 else "low"
+                evidence.append({
+                    "type": "import_graph",
+                    "paths": [n.id for n in graph.nodes[:6]],
+                    "reason": f"Module import graph with {len(graph.nodes)} nodes used for pattern validation",
+                    "confidence": "medium",
+                })
+            elif all_layers_weak:
                 # Layers came from file-naming heuristic only, not directory structure
                 confidence = "low"
+                if not _hard_evidence:
+                    tentative = True
                 limitations.append(
                     "Low confidence inference: pattern inferred from filenames only, without import graph confirmation"
                 )
+                evidence.append({
+                    "type": "filesystem_naming",
+                    "paths": [l.files[0] for l in layers if l.files][:6],
+                    "reason": (
+                        f"Pattern '{pattern}' inferred from file stem naming conventions only "
+                        "(e.g. *_controller.py, *_service.py). "
+                        "No directory structure or import graph confirmation."
+                    ),
+                    "confidence": "low",
+                })
             else:
-                confidence = "medium" if len(strong_domains) >= 3 else "low"
-                if graph is None:
+                # Directory structure match (or monorepo/workspace override with no layers)
+                confidence = "medium" if (_hard_evidence or len(strong_domains) >= 3) else "low"
+                if confidence == "low" and not _hard_evidence:
+                    tentative = True
+                if not _hard_evidence:
                     limitations.append(
                         "Pattern not confirmed by module import graph; run with --graph-modules for structural validation"
                     )
+                if not _hard_evidence:
+                    matched_dirs = sorted({
+                        p.replace("\\", "/").split("/")[0]
+                        for layer in layers for p in layer.files
+                    })
+                    evidence.append({
+                        "type": "filesystem_naming",
+                        "paths": matched_dirs[:8],
+                        "reason": (
+                            f"Pattern '{pattern}' inferred from directory names matching layer keywords. "
+                            "Import graph not available — structural direction of dependencies unverified."
+                        ),
+                        "confidence": "low" if confidence == "low" else "medium",
+                    })
         elif len(strong_domains) >= 1:
             confidence = "medium"
+            if not _hard_evidence:
+                tentative = True
+            evidence.append({
+                "type": "filesystem_naming",
+                "paths": [d.name for d in strong_domains[:6]],
+                "reason": "Domain clustering from directory names; no layer pattern confirmed",
+                "confidence": "low",
+            })
         else:
             confidence = "low"
-        method = "graph+structure" if graph is not None else "filesystem_inference"
+            if not _hard_evidence:
+                tentative = True
+            if not evidence:
+                limitations.append(
+                    "insufficient_evidence: no recognizable architectural signals found; "
+                    "filesystem structure does not match known patterns"
+                )
+                evidence.append({
+                    "type": "filesystem_naming",
+                    "paths": filtered[:6],
+                    "reason": "Only filesystem paths available; no pattern matched",
+                    "confidence": "low",
+                })
         return ArchitectureAnalysis(
             requested=True,
@@ -241,6 +325,8 @@ class ArchitectureAnalyzer:
             confidence=confidence,
             method=method,
             limitations=limitations,
+            evidence=evidence,
+            tentative=tentative,
         )
     # ------------------------------------------------------------------

{sourcecode-0.39.0 → sourcecode-0.42.0}/src/sourcecode/cli.py RENAMED Viewed

@@ -566,7 +566,7 @@ def main(
     entrypoints_only: bool = typer.Option(
         False,
         "--entrypoints-only",
-        help="Contract mode: include only files that are entrypoints or have exported symbols.",
+        help="Contract mode: include only files that are runtime entrypoints or have exported symbols (public API surface). Note: 'entrypoints' here includes all files with exports, not strictly detected runtime entry points.",
     ),
     changed_only: bool = typer.Option(
         False,
@@ -1156,11 +1156,15 @@ def main(
         _all_call_files = set(_fan_in) | set(_fan_out)
         _hotspots: list[dict] = []
-        # Filter test paths from hotspots — they dominate fan-in by calling many modules
+        # Filter test, noise, and auxiliary paths — they dominate fan-in but carry no signal
         _TEST_MARKERS = {"/test", "/tests", "/spec", "/specs", "_test.", ".test.", ".spec."}
+        from sourcecode.ranking_engine import RankingEngine as _RankingEngine
+        _sem_engine = _RankingEngine(sm.monorepo_packages)
         for _p in _all_call_files:
             if any(_m in _p for _m in _TEST_MARKERS) or _p.startswith("test"):
                 continue
+            if _sem_engine.is_noise(_p) or _sem_engine.is_auxiliary(_p):
+                continue
             _in = _fan_in[_p]
             _out = _fan_out[_p]
             _score = _in * 2.0 + _out * 1.0

{sourcecode-0.39.0 → sourcecode-0.42.0}/src/sourcecode/contract_model.py RENAMED Viewed

@@ -91,6 +91,7 @@ class FileContract:
     fan_out: int = 0  # how many files this imports
     is_entrypoint: bool = False
     is_changed: bool = False
+    ranking_reasons: list[str] = field(default_factory=list)
     # Extraction quality
     extraction_method: str = "heuristic"  # ast | tree_sitter | heuristic

{sourcecode-0.39.0 → sourcecode-0.42.0}/src/sourcecode/contract_pipeline.py RENAMED Viewed

@@ -17,6 +17,7 @@ from typing import Any, Literal, Optional
 from sourcecode.ast_extractor import AstExtractor, _LANGUAGE_MAP
 from sourcecode.contract_model import ContractSummary, FileContract
+from sourcecode.ranking_engine import RankingEngine
 from sourcecode.relevance_scorer import RelevanceScorer
 from sourcecode.schema import EntryPoint, MonorepoPackageInfo
@@ -27,22 +28,6 @@ from sourcecode.schema import EntryPoint, MonorepoPackageInfo
 _MAX_FILES = 500      # hard cap on files extracted per run
 _SRC_EXTENSIONS: frozenset[str] = frozenset(_LANGUAGE_MAP.keys())
-# Role-based score adjustments applied after contract extraction.
-# Runtime roles get a boost; config/util are neutral or penalized.
-_ROLE_SCORE: dict[str, float] = {
-    "entrypoint": 0.15,
-    "service":    0.10,
-    "route":      0.10,
-    "api":        0.08,
-    "middleware": 0.06,
-    "store":      0.05,
-    "model":      0.05,
-    "hook":       0.05,
-    "component":  0.03,
-    "util":       0.00,
-    "config":    -0.10,
-    "unknown":    0.00,
-}
 RankStrategy = Literal["relevance", "centrality", "git-churn"]
@@ -60,9 +45,10 @@ def _get_changed_files(root: Path) -> set[str]:
     ]:
         try:
             result = subprocess.run(
-                cmd, cwd=root, capture_output=True, text=True, timeout=10
+                cmd, cwd=root, capture_output=True, text=True,
+                encoding="utf-8", errors="replace", timeout=10,
             )
-            for line in result.stdout.splitlines():
+            for line in (result.stdout or "").splitlines():
                 line = line.strip()
                 if line:
                     changed.add(line.replace("\\", "/"))
@@ -71,9 +57,10 @@ def _get_changed_files(root: Path) -> set[str]:
     try:
         result = subprocess.run(
             ["git", "status", "--porcelain"],
-            cwd=root, capture_output=True, text=True, timeout=10
+            cwd=root, capture_output=True, text=True,
+            encoding="utf-8", errors="replace", timeout=10,
         )
-        for line in result.stdout.splitlines():
+        for line in (result.stdout or "").splitlines():
             if len(line) > 3:
                 changed.add(line[3:].strip().replace("\\", "/"))
     except Exception:
@@ -144,11 +131,12 @@ def _get_git_churn(root: Path, file_paths: list[str]) -> dict[str, int]:
     try:
         result = subprocess.run(
             ["git", "log", "--name-only", "--format=", "--since=90.days.ago"],
-            cwd=root, capture_output=True, text=True, timeout=15,
+            cwd=root, capture_output=True, text=True,
+            encoding="utf-8", errors="replace", timeout=15,
         )
         path_set = set(file_paths)
         counter: Counter[str] = Counter()
-        for line in result.stdout.splitlines():
+        for line in (result.stdout or "").splitlines():
             line = line.strip().replace("\\", "/")
             if line in path_set:
                 counter[line] += 1
@@ -194,6 +182,7 @@ class ContractPipeline:
         """
         entry_paths = {ep.path.replace("\\", "/") for ep in (entry_points or [])}
         scorer = RelevanceScorer(monorepo_packages)
+        engine = RankingEngine(monorepo_packages)
         # 1. Changed files (for --changed-only and ranking)
         changed_files: set[str] = set()
@@ -267,9 +256,24 @@ class ContractPipeline:
         if rank_by == "git-churn":
             churn = _get_git_churn(root, [c.path for c in contracts])
-        # 6. Compute relevance scores
+        # 6. Compute relevance scores via unified ranking engine
+        max_fan_in = max((c.fan_in for c in contracts), default=1) if contracts else 1
+        max_churn_val = max(churn.values(), default=1) if churn else 1
         for c in contracts:
-            c.relevance_score = self._score(c, scorer, churn)
+            fs = engine.score(
+                c.path,
+                fan_in=c.fan_in,
+                fan_out=c.fan_out,
+                max_fan_in=max_fan_in,
+                git_churn=churn.get(c.path, 0),
+                max_churn=max_churn_val,
+                is_entrypoint=c.is_entrypoint,
+                is_changed=c.is_changed,
+                export_count=len(c.exports),
+                task="default",
+            )
+            c.relevance_score = fs.display_score
+            c.ranking_reasons = fs.reasons
         # 7. Rank
         contracts = self._rank(contracts, rank_by)
@@ -285,7 +289,7 @@ class ContractPipeline:
                     known_paths=set(src_paths),
                     entry_paths=entry_paths,
                     changed_files=changed_files,
-                    scorer=scorer,
+                    engine=engine,
                 )
         # 9. Entrypoints-only filter
@@ -312,45 +316,13 @@ class ContractPipeline:
         )
         return contracts, summary
-    def _score(
-        self,
-        c: FileContract,
-        scorer: RelevanceScorer,
-        churn: dict[str, int],
-    ) -> float:
-        base = scorer.score(c.path)
-        if c.is_entrypoint:
-            base += 0.3
-        if c.is_changed:
-            base += 0.2
-        # Fan-in is the strongest signal: many callers = critical contract
-        fi_score = min(c.fan_in / 10.0, 0.3)
-        fo_score = min(c.fan_out / 15.0, 0.15)
-        base += fi_score + fo_score
-        # Exported API value
-        export_count = len(c.exports)
-        base += min(export_count / 20.0, 0.1)
-        # Churn
-        churn_score = min(churn.get(c.path, 0) / 20.0, 0.1)
-        base += churn_score
-        # Role-based boost: runtime roles score higher than auxiliary
-        base += _ROLE_SCORE.get(c.role, 0.0)
-        return min(1.0, base)
     def _rank(self, contracts: list[FileContract], rank_by: RankStrategy) -> list[FileContract]:
         if rank_by == "centrality":
-            # Approximate centrality: fan_in + fan_out
-            return sorted(contracts, key=lambda c: -(c.fan_in + c.fan_out))
+            return sorted(contracts, key=lambda c: (-(c.fan_in + c.fan_out), c.path))
         if rank_by == "git-churn":
-            return sorted(contracts, key=lambda c: (-c.is_changed, -c.relevance_score))
-        # Default: relevance
-        return sorted(contracts, key=lambda c: (-c.is_entrypoint, -c.relevance_score))
+            return sorted(contracts, key=lambda c: (-c.is_changed, -c.relevance_score, c.path))
+        # Default: relevance — path breaks ties deterministically
+        return sorted(contracts, key=lambda c: (-c.is_entrypoint, -c.relevance_score, c.path))
     def _symbol_deep_scan(
         self,
@@ -359,7 +331,7 @@ class ContractPipeline:
         known_paths: set[str],
         entry_paths: set[str],
         changed_files: set[str],
-        scorer: RelevanceScorer,
+        engine: RankingEngine,
     ) -> list[FileContract]:
         """Grep-based fallback when the shallow scan missed the defining files.
@@ -367,7 +339,7 @@ class ContractPipeline:
         extracts contracts for candidates not already processed, then re-applies
         the symbol filter. Fan-in/fan-out are not computed for these contracts.
         """
-        candidates = _find_symbol_files(root, symbol, known_paths, scorer)
+        candidates = _find_symbol_files(root, symbol, known_paths, engine)
         if not candidates:
             return []
@@ -379,7 +351,9 @@ class ContractPipeline:
                 continue
             contract.is_entrypoint = rel_path in entry_paths
             contract.is_changed = rel_path in changed_files
-            contract.relevance_score = scorer.score(rel_path)
+            fs = engine.score(rel_path, is_entrypoint=contract.is_entrypoint, is_changed=contract.is_changed)
+            contract.relevance_score = fs.display_score
+            contract.ranking_reasons = fs.reasons
             extra.append(contract)
         return _filter_by_symbol(extra, symbol)
@@ -531,7 +505,7 @@ def _find_symbol_files(
     root: Path,
     symbol: str,
     known_paths: set[str],
-    scorer: RelevanceScorer,
+    engine: RankingEngine,
 ) -> list[str]:
     """Find source files outside *known_paths* that contain *symbol* as text.
@@ -560,7 +534,7 @@ def _find_symbol_files(
             if line.startswith("./"):
                 line = line[2:]
             line = line.replace("\\", "/")
-            if line and line not in known_paths and not scorer.is_noise(line):
+            if line and line not in known_paths and not engine.is_noise(line):
                 found.append(line)
         return found
     except Exception:
@@ -578,7 +552,7 @@ def _find_symbol_files(
                 rel_str = str(rel).replace("\\", "/")
             except ValueError:
                 continue
-            if rel_str in known_paths or scorer.is_noise(rel_str):
+            if rel_str in known_paths or engine.is_noise(rel_str):
                 continue
             try:
                 content = Path(full).read_text(encoding="utf-8", errors="replace")

{sourcecode-0.39.0 → sourcecode-0.42.0}/src/sourcecode/doc_analyzer.py RENAMED Viewed

@@ -132,6 +132,8 @@ class DocAnalyzer:
         records: list[DocRecord] = []
         limitations: list[str] = list(limitations_pre)
         languages: set[str] = set()
+        # Track per-language support status for honest reporting
+        unsupported_langs: set[str] = set()
         for relative_path in file_paths:
             abs_path = root / relative_path
@@ -176,8 +178,18 @@ class DocAnalyzer:
                 # Unsupported language — D-04: no emitir DocRecord, solo registrar limitation
                 limitations.append(f"docs_unavailable:{norm_path}:language={lang}")
                 languages.add(lang)
+                unsupported_langs.add(lang)
                 # NO records.append() here
+        # Build language_coverage: explicit per-language support status
+        _SUPPORTED_LANGS = {"python", "javascript", "typescript"}
+        lang_coverage: dict[str, str] = {}
+        for lang in languages:
+            if lang in _SUPPORTED_LANGS:
+                lang_coverage[lang] = "supported"
+            else:
+                lang_coverage[lang] = "unsupported"
         # Build summary
         symbol_count = sum(1 for r in records if r.kind != "module")
         total_count = len(records)
@@ -185,6 +197,22 @@ class DocAnalyzer:
         if any(r.doc_text and r.doc_text.endswith(self._TRUNCATION_SUFFIX) for r in records):
             truncated = True
+        # Explicit absence signal: scanned files but found nothing
+        if total_count == 0 and file_paths:
+            limitations.append(
+                f"no_docs_found: {len(file_paths)} file(s) scanned, "
+                "no docstrings or JSDoc comments found"
+            )
+        # Warn explicitly when unsupported languages are present — agents must not
+        # assume full coverage when Java/Go/Rust files are in scope but not analyzed.
+        if unsupported_langs:
+            sorted_unsupported = sorted(unsupported_langs)
+            limitations.append(
+                f"docs_not_extracted: language(s) {sorted_unsupported} present but not supported; "
+                "only Python and JS/TS docstrings are extracted"
+            )
         summary = DocSummary(
             requested=True,
             total_count=total_count,
@@ -193,6 +221,7 @@ class DocAnalyzer:
             depth=depth,
             truncated=truncated,
             limitations=limitations,
+            language_coverage=lang_coverage,
         )
         return records, summary

sourcecode 0.39.0__tar.gz → 0.42.0__tar.gz

sourcecode 0.39.0__tar.gz → 0.42.0__tar.gz