PyPI - sourcecode - Versions diffs - 1.33.11__tar.gz → 1.33.12__tar.gz - Mend

sourcecode 1.33.11.tar.gz → 1.33.12.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (95) hide show

{sourcecode-1.33.11 → sourcecode-1.33.12}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sourcecode
-Version: 1.33.11
+Version: 1.33.12
 Summary: Persistent structural context and ultra-fast repeated analysis for AI coding agents
 License-File: LICENSE
 Keywords: agents,ai,codebase,context,developer-tools,llm
@@ -39,7 +39,7 @@ Description-Content-Type: text/markdown
 **Persistent structural context and ultra-fast repeated analysis for AI coding agents.**
-![Version](https://img.shields.io/badge/version-1.33.11-blue)
+![Version](https://img.shields.io/badge/version-1.33.12-blue)
 ![Python](https://img.shields.io/badge/python-3.10%2B-green)
 ---

{sourcecode-1.33.11 → sourcecode-1.33.12}/README.md RENAMED Viewed

@@ -2,7 +2,7 @@
 **Persistent structural context and ultra-fast repeated analysis for AI coding agents.**
-![Version](https://img.shields.io/badge/version-1.33.11-blue)
+![Version](https://img.shields.io/badge/version-1.33.12-blue)
 ![Python](https://img.shields.io/badge/python-3.10%2B-green)
 ---

{sourcecode-1.33.11 → sourcecode-1.33.12}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 [project]
 name = "sourcecode"
-version = "1.33.11"
+version = "1.33.12"
 description = "Persistent structural context and ultra-fast repeated analysis for AI coding agents"
 readme = "README.md"
 requires-python = ">=3.9"

{sourcecode-1.33.11 → sourcecode-1.33.12}/src/sourcecode/__init__.py RENAMED Viewed

@@ -1,3 +1,3 @@
 """sourcecode — Deterministic codebase context maps for AI coding agents."""
-__version__ = "1.33.11"
+__version__ = "1.33.12"

{sourcecode-1.33.11 → sourcecode-1.33.12}/src/sourcecode/prepare_context.py RENAMED Viewed

@@ -627,6 +627,21 @@ _FRONTEND_SYMPTOM_MAP: dict[str, list[str]] = {
     "trabajador": ["trabajador", "empleado", "worker", "asignacion", "trabajadordao", "trabajadorservice"],
 }
+# Generic words that add noise when used as symptom keywords in large repos.
+# "token" and "user" are too ubiquitous in auth systems to be useful alone.
+_SYMPTOM_STOP_WORDS: frozenset[str] = frozenset({
+    "fails", "fail", "failed", "failure",
+    "not", "for", "with", "when", "that", "the", "and", "but",
+    "are", "has", "had", "have", "was", "were",
+    "get", "set", "can", "does", "did", "should", "would", "could",
+    "null", "none", "empty", "invalid", "incorrect", "wrong", "missing",
+    "error", "issue", "problem", "bug",
+    "from", "into", "via", "due", "also", "after", "before",
+    "slow", "fast", "new", "old",
+})
+# Repo-scale threshold: above this file count, use stricter injection logic.
+_LARGE_REPO_THRESHOLD = 500
 MAX_FILES_FAST = 2000  # above this threshold --fast uses git-index-only mode
@@ -1695,7 +1710,7 @@ class TaskContextBuilder:
             _camel_expanded = _re.sub(r'([A-Z]+)([A-Z][a-z])', r'\1 \2', _camel_expanded)
             symptom_keywords = [
                 w.lower() for w in _re.split(r"[\s\W]+", _camel_expanded)
-                if len(w) > 2
+                if len(w) > 2 and w.lower() not in _SYMPTOM_STOP_WORDS
             ]
             if symptom_keywords:
                 # Pre-compile combined keyword pattern for fast content scanning
@@ -1759,14 +1774,27 @@ class TaskContextBuilder:
                     ))
                     _existing_paths.add(_cp)
-                # Pass 4: inject files whose path matches symptom keywords
+                # Scale-awareness: large repos need wider scan and stricter injection.
+                _is_large_repo = len(all_paths) > _LARGE_REPO_THRESHOLD
+                # Pass 4: inject files whose path matches symptom keywords.
+                # CamelCase-expand the filename stem so "OfflineSessionLoader" matches
+                # the keyword "offline" even without an explicit directory separator.
+                _p4_dirs_of_injected: set[str] = set()  # directories of high-score injects
                 for _p in all_paths:
                     if _p in _existing_paths:
                         continue
                     if Path(_p).suffix.lower() not in _ALL_EXTENSIONS:
                         continue
                     _p_lower = _p.lower()
-                    _matching_kws = [kw for kw in symptom_keywords if kw in _p_lower]
+                    # CamelCase-expand the stem and append to the search string so
+                    # "OfflineSessionLoader" → "offline session loader" can match
+                    # individual keyword tokens beyond what substring search finds.
+                    _stem_raw = Path(_p).stem
+                    _stem_exp = _re.sub(r'([a-z])([A-Z])', r'\1 \2', _stem_raw)
+                    _stem_exp = _re.sub(r'([A-Z]+)([A-Z][a-z])', r'\1 \2', _stem_exp).lower()
+                    _p_search = _p_lower + " " + _stem_exp
+                    _matching_kws = [kw for kw in symptom_keywords if kw in _p_search]
                     if not _matching_kws:
                         continue
                     _boost = 0.2 * len(_matching_kws)
@@ -1781,6 +1809,8 @@ class TaskContextBuilder:
                     ))
                     _existing_paths.add(_p)
                     _sx_direct_path.append(_p)
+                    if _injected_score >= 0.7:
+                        _p4_dirs_of_injected.add(str(Path(_p).parent))
                 # Pass 4b: grep-based injection for frontend→backend synonym terms.
                 # Runs parallel grep for each backend term to find files not yet in
@@ -1828,9 +1858,41 @@ class TaskContextBuilder:
                         ))
                         _existing_paths_now.add(_gf)
-                # Sort before content scan so top candidates get read first
-                relevant_files = sorted(relevant_files, key=lambda rf: -rf.score)
-                _CONTENT_SCAN_LIMIT = 80
+                # Pass 4c: subsystem co-location — inject sibling files from the same
+                # directories as high-score (≥0.7) path-matched files. This catches
+                # architecturally adjacent classes that don't mention symptom keywords
+                # in their own name (e.g. InfinispanOfflineSessionCacheEntryLifespan…
+                # siblings in the same infinispan/ package).
+                if _is_large_repo and _p4_dirs_of_injected:
+                    _coloc_existing = {rf.path for rf in relevant_files}
+                    for _cp in all_paths:
+                        if _cp in _coloc_existing:
+                            continue
+                        if Path(_cp).suffix.lower() not in _src_exts:
+                            continue
+                        if str(Path(_cp).parent) in _p4_dirs_of_injected:
+                            relevant_files.append(RelevantFile(
+                                path=_cp,
+                                role="symptom_match",
+                                score=0.55,
+                                reason="subsystem co-location: same directory as symptom-matched file",
+                                why="directory proximity injection",
+                            ))
+                            _coloc_existing.add(_cp)
+                # Sort before content scan so top candidates get read first.
+                # In large repos: prioritise symptom_match files within each score band
+                # so that subsystem-relevant files are content-scanned before generic
+                # structural files at the same score.
+                if _is_large_repo:
+                    relevant_files = sorted(
+                        relevant_files,
+                        key=lambda rf: (-rf.score, 0 if rf.role == "symptom_match" else 1),
+                    )
+                    _CONTENT_SCAN_LIMIT = 150
+                else:
+                    relevant_files = sorted(relevant_files, key=lambda rf: -rf.score)
+                    _CONTENT_SCAN_LIMIT = 80
                 _scan_candidates = relevant_files[:_CONTENT_SCAN_LIMIT]
                 _no_scan_candidates = relevant_files[_CONTENT_SCAN_LIMIT:]
@@ -1905,15 +1967,31 @@ class TaskContextBuilder:
                     elif _extra_syn > 0:
                         _new_reason = _rf.reason + f", synonym-match backend (+{_extra_syn:.2f})"
+                    _final_score = round(min(_rf.score + _total_extra, 1.0), 2)
                     _boosted.append(RelevantFile(
                         path=_rf.path,
                         role=_rf.role,
-                        score=round(min(_rf.score + _total_extra, 1.0), 2),
+                        score=_final_score,
                         reason=_new_reason,
                         why=_rf.why,
                     ))
-                relevant_files = sorted(_boosted + _no_scan_candidates, key=lambda rf: -rf.score)
+                # Use total boost as a secondary sort key so symptom-matched files
+                # that were boosted from a lower base score rank above structural
+                # files that coincidentally reach the same capped score of 1.0.
+                # This prevents budget-trimming from discarding the most relevant files.
+                _boost_totals: dict[str, float] = {}
+                for _rf in _scan_candidates:
+                    pass  # populated below
+                _boost_totals = {}
+                for _idx, _rf in enumerate(_scan_candidates):
+                    _b_rf = _boosted[_idx]
+                    _boost_totals[_b_rf.path] = round(_b_rf.score - _rf.score, 4)
+                relevant_files = sorted(
+                    _boosted + _no_scan_candidates,
+                    key=lambda rf: (-rf.score, -_boost_totals.get(rf.path, 0)),
+                )
                 # Synonym note (only when synonyms actually fired)
                 if _frontend_kws and _sx_synonyms:
@@ -2390,7 +2468,8 @@ class TaskContextBuilder:
                     else:
                         _symptom_class_names.add(_tok)
                 _symptom_tokens = {
-                    w.lower() for w in _re_bug.split(r'[\s\W]+', symptom) if len(w) > 2
+                    w.lower() for w in _re_bug.split(r'[\s\W]+', symptom)
+                    if len(w) > 2 and w.lower() not in _SYMPTOM_STOP_WORDS
                 }
         scored: list[tuple[float, str, RelevantFile]] = []
@@ -2487,9 +2566,16 @@ class TaskContextBuilder:
                             content_boost += 0.8
                             _why_parts.append("exception type in path (+0.8)")
-                # AND-weighted token intersection — multiple matching tokens >> single
+                # AND-weighted token intersection — multiple matching tokens >> single.
+                # CamelCase-expand the filename stem so "OfflineSessionLoader" contributes
+                # "offline", "session", "loader" as individual tokens beyond what the raw
+                # path splitting yields. This lets multi-word symptoms match class names.
                 if _symptom_tokens:
                     _path_parts = set(path_lower.replace("/", " ").replace(".", " ").replace("_", " ").split())
+                    _stem_cc = Path(path).stem
+                    _stem_cc_exp = _re_bug.sub(r'([a-z])([A-Z])', r'\1 \2', _stem_cc)
+                    _stem_cc_exp = _re_bug.sub(r'([A-Z]+)([A-Z][a-z])', r'\1 \2', _stem_cc_exp).lower()
+                    _path_parts.update(_stem_cc_exp.split())
                     _intersection = _symptom_tokens & _path_parts
                     _n_match = len(_intersection)
                     if _n_match >= 3:

{sourcecode-1.33.11 → sourcecode-1.33.12}/src/sourcecode/repository_ir.py RENAMED Viewed

@@ -888,15 +888,40 @@ def _extract_mapped_paths(source: str, class_fqn: str) -> dict[str, str]:
 # Phase 3 — Symbol relation graph
 # ---------------------------------------------------------------------------
+def _build_same_package_map(symbols: list[SymbolRecord]) -> dict[str, dict[str, str]]:
+    """Build {package: {simple_name: FQN}} map from all class/interface symbols.
+    Used by build_repo_ir to resolve same-package types that need no explicit import.
+    In Java, classes in the same package reference each other without import statements,
+    so import_map is empty for them — this map provides the fallback resolution.
+    """
+    result: dict[str, dict[str, str]] = {}
+    for sym in symbols:
+        if sym.type not in ("class", "interface") or "#" in sym.symbol:
+            continue
+        pkg = sym.symbol.rsplit(".", 1)[0] if "." in sym.symbol else ""
+        simple = sym.symbol.split(".")[-1]
+        result.setdefault(pkg, {})[simple] = sym.symbol
+    return result
 def _build_relations(
     symbols: list[SymbolRecord],
     raw_imports: list[str],
     source: str,
     package: str,
     rel_path: str,
+    same_pkg_types: dict[str, str] | None = None,
 ) -> list[RelationEdge]:
-    """Phase 3: Build directed relation graph for symbols in one file."""
+    """Phase 3: Build directed relation graph for symbols in one file.
+    same_pkg_types: {simple_name → FQN} for classes in the same package.
+    Passed by build_repo_ir after a first pass that collects all symbols.
+    Enables resolving injection targets that share a package with the caller
+    and therefore need no explicit Java import statement.
+    """
     edges: list[RelationEdge] = []
+    _same_pkg: dict[str, str] = same_pkg_types or {}
     import_map: dict[str, str] = {}
     for fqn in raw_imports:
@@ -929,15 +954,27 @@ def _build_relations(
                 ))
         if sym.type == "field":
-            for imp_fqn in sym.imports_used:
+            _inject_ann = next(
+                (a for a in sym.annotations if a in _INJECT_ANNOTATIONS), "@Autowired"
+            )
+            _field_targets: set[str] = set(sym.imports_used)
+            # Same-package field injection: imports_used is empty when the field type
+            # shares a package with the declaring class (no import needed in Java).
+            # Extract type from signature ("Type name") and resolve via same_pkg_types.
+            if not _field_targets and _same_pkg:
+                _sig_type = (sym.signature or "").split()[0] if sym.signature else ""
+                _sig_base = re.sub(r'<.*', '', _sig_type).strip()
+                if _sig_base and _sig_base[0].isupper():
+                    _same_fqn = _same_pkg.get(_sig_base)
+                    if _same_fqn and _same_fqn != _enclosing_class(sym_fqn):
+                        _field_targets.add(_same_fqn)
+            for imp_fqn in _field_targets:
                 edges.append(RelationEdge(
                     from_symbol=sym_fqn,
                     to_symbol=imp_fqn,
                     type="injects",
                     confidence="high",
-                    evidence={"type": "annotation", "value": next(
-                        (a for a in sym.annotations if a in _INJECT_ANNOTATIONS), "@Autowired"
-                    )},
+                    evidence={"type": "annotation", "value": _inject_ann},
                 ))
     # ── Constructor injection ─────────────────────────────────────────────────
@@ -949,7 +986,7 @@ def _build_relations(
             continue
         for simple_type in sym.param_types:
             base = re.sub(r'<.*', '', simple_type).strip()
-            fqn = import_map.get(base)
+            fqn = import_map.get(base) or _same_pkg.get(base)
             if fqn:
                 edges.append(RelationEdge(
                     from_symbol=sym.symbol,
@@ -982,7 +1019,7 @@ def _build_relations(
                 continue
             _ftype = fld.group("type").strip()
             _base = re.sub(r'<.*', '', _ftype).strip()
-            _fqn = import_map.get(_base)
+            _fqn = import_map.get(_base) or _same_pkg.get(_base)
             if _fqn:
                 edges.append(RelationEdge(
                     from_symbol=sym.symbol,
@@ -2632,24 +2669,38 @@ def build_repo_ir(
     if since:
         _since_changed = _get_git_changed_files(root, since)
+    # Pass 1: extract symbols from all files so we can build the same-package
+    # type map before building relations.  Java classes in the same package
+    # reference each other without import statements, so import_map alone cannot
+    # resolve them — _build_same_package_map provides the cross-file fallback.
+    _per_file: list[tuple[str, str, str, list[str], list[SymbolRecord]]] = []
     for rel_path in sorted(file_paths):
         abs_path = root / rel_path
         try:
             source = abs_path.read_text(encoding="utf-8", errors="replace")
         except OSError:
             continue
+        package, symbols, raw_imports = _extract_symbols(source, rel_path)
+        all_symbols.extend(symbols)
+        _per_file.append((rel_path, source, package, raw_imports, symbols))
+    # Build {package: {simple_name: FQN}} from every class/interface found.
+    _same_pkg_map: dict[str, dict[str, str]] = _build_same_package_map(all_symbols)
+    # Pass 2: build relations with same-package type resolution available.
+    for rel_path, source, package, raw_imports, symbols in _per_file:
+        same_pkg_types = _same_pkg_map.get(package, {})
+        relations = _build_relations(
+            symbols, raw_imports, source, package, rel_path,
+            same_pkg_types=same_pkg_types,
+        )
         old_source: Optional[str] = None
         if since:
-            # Only fetch old content for files known to have changed.
-            # Unchanged files have no diff entries — skip git show entirely.
             _file_changed = _since_changed is None or rel_path in _since_changed
             if _file_changed:
                 old_source = _get_git_old_content(root, rel_path, since)
-        package, symbols, raw_imports = _extract_symbols(source, rel_path)
-        relations = _build_relations(symbols, raw_imports, source, package, rel_path)
         if old_source is not None:
             _, old_symbols, _ = _extract_symbols(old_source, rel_path)
             all_changed.extend(_diff_symbols(old_symbols, symbols))
@@ -2664,7 +2715,6 @@ def build_repo_ir(
                     confidence="high",
                 ))
-        all_symbols.extend(symbols)
         all_relations.extend(relations)
     spring_summary = _build_spring_summary(all_symbols)