PyPI - sourcecode - Versions diffs - 1.33.13__tar.gz → 1.33.15__tar.gz - Mend

sourcecode 1.33.13.tar.gz → 1.33.15.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (95) hide show

{sourcecode-1.33.13 → sourcecode-1.33.15}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sourcecode
-Version: 1.33.13
+Version: 1.33.15
 Summary: Persistent structural context and ultra-fast repeated analysis for AI coding agents
 License-File: LICENSE
 Keywords: agents,ai,codebase,context,developer-tools,llm

{sourcecode-1.33.13 → sourcecode-1.33.15}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 [project]
 name = "sourcecode"
-version = "1.33.13"
+version = "1.33.15"
 description = "Persistent structural context and ultra-fast repeated analysis for AI coding agents"
 readme = "README.md"
 requires-python = ">=3.9"

{sourcecode-1.33.13 → sourcecode-1.33.15}/src/sourcecode/__init__.py RENAMED Viewed

@@ -1,3 +1,3 @@
 """sourcecode — Deterministic codebase context maps for AI coding agents."""
-__version__ = "1.33.13"
+__version__ = "1.33.15"

{sourcecode-1.33.13 → sourcecode-1.33.15}/src/sourcecode/file_classifier.py RENAMED Viewed

@@ -187,16 +187,19 @@ class FileClassifier:
             if java_class is not None:
                 return java_class
-        if self._has_any_import(imports, _API_IMPORTS):
-            evidence = self._matched_imports(imports, _API_IMPORTS)
+        # Fix 4: call _matched_imports once per category instead of twice
+        # (_has_any_import was calling _matched_imports and discarding the result,
+        # then the caller invoked it again to get the evidence — halving throughput).
+        evidence = self._matched_imports(imports, _API_IMPORTS)
+        if evidence:
             return FileClassification(norm, "api_layer", "high", 0.82, "imports API/server framework", evidence)
-        if self._has_any_import(imports, _DB_IMPORTS):
-            evidence = self._matched_imports(imports, _DB_IMPORTS)
+        evidence = self._matched_imports(imports, _DB_IMPORTS)
+        if evidence:
             return FileClassification(norm, "database_layer", "high", 0.78, "imports database/persistence dependency", evidence)
-        if self._has_any_import(imports, _INFRA_IMPORTS):
-            evidence = self._matched_imports(imports, _INFRA_IMPORTS)
+        evidence = self._matched_imports(imports, _INFRA_IMPORTS)
+        if evidence:
             return FileClassification(norm, "infrastructure", "high", 0.72, "imports infrastructure dependency", evidence)
         role = self._package_role(norm)

{sourcecode-1.33.13 → sourcecode-1.33.15}/src/sourcecode/prepare_context.py RENAMED Viewed

@@ -1781,7 +1781,12 @@ class TaskContextBuilder:
                 # Pass 4: inject files whose path matches symptom keywords.
                 # CamelCase-expand the filename stem so "OfflineSessionLoader" matches
                 # the keyword "offline" even without an explicit directory separator.
+                # Large repos: cap per-keyword injections so a common term like
+                # "authentication" (50+ path matches in an IAM repo) cannot flood the
+                # candidate list and push specific terms like "ldap" out of the budget.
                 _p4_dirs_of_injected: set[str] = set()  # directories of high-score injects
+                _P4_KW_CAP = 15  # max path-injections per keyword in large repos
+                _p4_kw_counts: dict[str, int] = {}
                 for _p in all_paths:
                     if _p in _existing_paths:
                         continue
@@ -1798,6 +1803,16 @@ class TaskContextBuilder:
                     _matching_kws = [kw for kw in symptom_keywords if kw in _p_search]
                     if not _matching_kws:
                         continue
+                    # In large repos, skip keywords already at cap; keep file only if at
+                    # least one keyword still has quota (multi-kw matches exhaust each
+                    # keyword's quota independently so specific terms survive longer).
+                    if _is_large_repo:
+                        _matching_kws = [
+                            kw for kw in _matching_kws
+                            if _p4_kw_counts.get(kw, 0) < _P4_KW_CAP
+                        ]
+                        if not _matching_kws:
+                            continue
                     _boost = 0.2 * len(_matching_kws)
                     _injected_score = round(min(0.5 + _boost, 1.0), 2)
                     _first_kw = _matching_kws[0]
@@ -1810,6 +1825,9 @@ class TaskContextBuilder:
                     ))
                     _existing_paths.add(_p)
                     _sx_direct_path.append(_p)
+                    if _is_large_repo:
+                        for _kw in _matching_kws:
+                            _p4_kw_counts[_kw] = _p4_kw_counts.get(_kw, 0) + 1
                     if _injected_score >= 0.7:
                         _p4_dirs_of_injected.add(str(Path(_p).parent))
@@ -1818,6 +1836,15 @@ class TaskContextBuilder:
                 # the candidate pool (e.g. AkitaBaseService containing setLoading).
                 _src_exts = frozenset({".java", ".py", ".ts", ".js", ".kt", ".go"})
                 _frontend_kws = [kw for kw in symptom_keywords if kw in _FRONTEND_SYMPTOM_MAP]
+                # Fix 5: In large repos, skip frontend→backend synonym grep for keywords
+                # that already have direct path matches — those are backend terms (e.g.
+                # "login" in an IAM repo) that don't need UI→service-layer translation.
+                # Prevents "authentication" grep flooding keycloak with SAML adapter files.
+                if _is_large_repo and _frontend_kws:
+                    _frontend_kws = [
+                        kw for kw in _frontend_kws
+                        if not any(kw in p.lower() for p in _sx_direct_path)
+                    ]
                 _backend_terms_set: list[str] = []
                 if _frontend_kws:
                     _bt: list[str] = []
@@ -1864,9 +1891,15 @@ class TaskContextBuilder:
                 # architecturally adjacent classes that don't mention symptom keywords
                 # in their own name (e.g. InfinispanOfflineSessionCacheEntryLifespan…
                 # siblings in the same infinispan/ package).
+                # Large repos: cap total co-location injections so that a keyword
+                # matching many directories doesn't flood the candidate list.
                 if _is_large_repo and _p4_dirs_of_injected:
                     _coloc_existing = {rf.path for rf in relevant_files}
+                    _P4C_CAP = 30
+                    _coloc_count = 0
                     for _cp in all_paths:
+                        if _coloc_count >= _P4C_CAP:
+                            break
                         if _cp in _coloc_existing:
                             continue
                         if Path(_cp).suffix.lower() not in _src_exts:
@@ -1880,6 +1913,7 @@ class TaskContextBuilder:
                                 why="directory proximity injection",
                             ))
                             _coloc_existing.add(_cp)
+                            _coloc_count += 1
                 # Sort before content scan so top candidates get read first.
                 # In large repos: prioritise symptom_match files within each score band
@@ -1898,6 +1932,7 @@ class TaskContextBuilder:
                 _no_scan_candidates = relevant_files[_CONTENT_SCAN_LIMIT:]
                 _boosted: list[RelevantFile] = []
+                _raw_signals: dict[str, float] = {}  # uncapped accumulated signal per file
                 _scanned_body: dict[str, str] = {}  # cache for graph expansion (Pass 5)
                 for _rf in _scan_candidates:
                     _extra = 0.0
@@ -1971,7 +2006,9 @@ class TaskContextBuilder:
                     elif _extra_syn > 0:
                         _new_reason = _rf.reason + f", synonym-match backend (+{_extra_syn:.2f})"
-                    _final_score = round(min(_rf.score + _total_extra, 1.0), 2)
+                    _raw_signal = _rf.score + _total_extra  # uncapped for ranking
+                    _raw_signals[_rf.path] = _raw_signal
+                    _final_score = round(min(_raw_signal, 1.0), 2)
                     _boosted.append(RelevantFile(
                         path=_rf.path,
                         role=_rf.role,
@@ -1980,21 +2017,14 @@ class TaskContextBuilder:
                         why=_rf.why,
                     ))
-                # Use total boost as a secondary sort key so symptom-matched files
-                # that were boosted from a lower base score rank above structural
-                # files that coincidentally reach the same capped score of 1.0.
-                # This prevents budget-trimming from discarding the most relevant files.
-                _boost_totals: dict[str, float] = {}
-                for _rf in _scan_candidates:
-                    pass  # populated below
-                _boost_totals = {}
-                for _idx, _rf in enumerate(_scan_candidates):
-                    _b_rf = _boosted[_idx]
-                    _boost_totals[_b_rf.path] = round(_b_rf.score - _rf.score, 4)
+                # Sort by uncapped raw signal so files with more accumulated evidence
+                # (path matches + content hits + commit matches) rank above files that
+                # merely cap at the same display score of 1.0.
+                # _raw_signals holds each file's full sum before the display cap.
+                # Files not content-scanned (_no_scan_candidates) use their base score.
                 relevant_files = sorted(
                     _boosted + _no_scan_candidates,
-                    key=lambda rf: (-rf.score, -_boost_totals.get(rf.path, 0)),
+                    key=lambda rf: -_raw_signals.get(rf.path, rf.score),
                 )
                 # Pass 5: reverse graph expansion from high-score seed nodes.
@@ -2093,9 +2123,14 @@ class TaskContextBuilder:
                         if _gx_new:
                             relevant_files = sorted(
                                 relevant_files + _gx_new,
-                                key=lambda rf: (-rf.score, -_boost_totals.get(rf.path, 0)),
+                                key=lambda rf: -_raw_signals.get(rf.path, rf.score),
                             )
+                # Fix 2: Cap output for large repos to stay within agent context budgets.
+                # Raw signal sort above ensures highest-signal files survive the cut.
+                if _is_large_repo and len(relevant_files) > 40:
+                    relevant_files = relevant_files[:40]
                 # Synonym note (only when synonyms actually fired)
                 if _frontend_kws and _sx_synonyms:
                     symptom_note = (

{sourcecode-1.33.13 → sourcecode-1.33.15}/src/sourcecode/ris.py RENAMED Viewed

@@ -437,6 +437,18 @@ def get_cold_start_context(repo_root: Path) -> dict:
             "endpoints": endpoints,
             "hotspots": ris.git_context_snapshot.get("hotspots", []),
             "validation": _validation,
+            # Fix 3: _cache wrapper for backward compat with CLI schema consumers.
+            # CLI outputs inject _cache via _inject_cache_meta; MCP cold-start path
+            # skips that step, leaving agents that read _cache.cache_source with None.
+            "_cache": {
+                "cache_source": "RIS",
+                "git_head_at_generation": ris.git_head or "",
+                "current_git_head": current_head or "",
+                "is_stale": stale,
+                "has_uncommitted_changes": uncommitted,
+                "generated_at": ris.last_updated_at,
+                "data_scope": "RIS_BOOTSTRAP",
+            },
         }
         if not endpoints and _is_java:
             result["endpoints_hint"] = (