sourcecode 1.33.11__tar.gz → 1.33.13__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (95) hide show
  1. {sourcecode-1.33.11 → sourcecode-1.33.13}/PKG-INFO +2 -2
  2. {sourcecode-1.33.11 → sourcecode-1.33.13}/README.md +1 -1
  3. {sourcecode-1.33.11 → sourcecode-1.33.13}/pyproject.toml +1 -1
  4. {sourcecode-1.33.11 → sourcecode-1.33.13}/src/sourcecode/__init__.py +1 -1
  5. {sourcecode-1.33.11 → sourcecode-1.33.13}/src/sourcecode/cli.py +6 -0
  6. {sourcecode-1.33.11 → sourcecode-1.33.13}/src/sourcecode/output_budget.py +2 -2
  7. {sourcecode-1.33.11 → sourcecode-1.33.13}/src/sourcecode/prepare_context.py +202 -12
  8. {sourcecode-1.33.11 → sourcecode-1.33.13}/src/sourcecode/repository_ir.py +63 -13
  9. {sourcecode-1.33.11 → sourcecode-1.33.13}/.github/workflows/build-windows.yml +0 -0
  10. {sourcecode-1.33.11 → sourcecode-1.33.13}/.gitignore +0 -0
  11. {sourcecode-1.33.11 → sourcecode-1.33.13}/.ruff.toml +0 -0
  12. {sourcecode-1.33.11 → sourcecode-1.33.13}/CHANGELOG.md +0 -0
  13. {sourcecode-1.33.11 → sourcecode-1.33.13}/CONTRIBUTING.md +0 -0
  14. {sourcecode-1.33.11 → sourcecode-1.33.13}/LICENSE +0 -0
  15. {sourcecode-1.33.11 → sourcecode-1.33.13}/SECURITY.md +0 -0
  16. {sourcecode-1.33.11 → sourcecode-1.33.13}/raw +0 -0
  17. {sourcecode-1.33.11 → sourcecode-1.33.13}/src/sourcecode/adaptive_scanner.py +0 -0
  18. {sourcecode-1.33.11 → sourcecode-1.33.13}/src/sourcecode/architecture_analyzer.py +0 -0
  19. {sourcecode-1.33.11 → sourcecode-1.33.13}/src/sourcecode/architecture_summary.py +0 -0
  20. {sourcecode-1.33.11 → sourcecode-1.33.13}/src/sourcecode/ast_extractor.py +0 -0
  21. {sourcecode-1.33.11 → sourcecode-1.33.13}/src/sourcecode/cache.py +0 -0
  22. {sourcecode-1.33.11 → sourcecode-1.33.13}/src/sourcecode/canonical_ir.py +0 -0
  23. {sourcecode-1.33.11 → sourcecode-1.33.13}/src/sourcecode/classifier.py +0 -0
  24. {sourcecode-1.33.11 → sourcecode-1.33.13}/src/sourcecode/code_notes_analyzer.py +0 -0
  25. {sourcecode-1.33.11 → sourcecode-1.33.13}/src/sourcecode/confidence_analyzer.py +0 -0
  26. {sourcecode-1.33.11 → sourcecode-1.33.13}/src/sourcecode/context_scorer.py +0 -0
  27. {sourcecode-1.33.11 → sourcecode-1.33.13}/src/sourcecode/context_summarizer.py +0 -0
  28. {sourcecode-1.33.11 → sourcecode-1.33.13}/src/sourcecode/contract_model.py +0 -0
  29. {sourcecode-1.33.11 → sourcecode-1.33.13}/src/sourcecode/contract_pipeline.py +0 -0
  30. {sourcecode-1.33.11 → sourcecode-1.33.13}/src/sourcecode/coverage_parser.py +0 -0
  31. {sourcecode-1.33.11 → sourcecode-1.33.13}/src/sourcecode/dependency_analyzer.py +0 -0
  32. {sourcecode-1.33.11 → sourcecode-1.33.13}/src/sourcecode/detectors/__init__.py +0 -0
  33. {sourcecode-1.33.11 → sourcecode-1.33.13}/src/sourcecode/detectors/base.py +0 -0
  34. {sourcecode-1.33.11 → sourcecode-1.33.13}/src/sourcecode/detectors/csproj_parser.py +0 -0
  35. {sourcecode-1.33.11 → sourcecode-1.33.13}/src/sourcecode/detectors/dart.py +0 -0
  36. {sourcecode-1.33.11 → sourcecode-1.33.13}/src/sourcecode/detectors/dotnet.py +0 -0
  37. {sourcecode-1.33.11 → sourcecode-1.33.13}/src/sourcecode/detectors/elixir.py +0 -0
  38. {sourcecode-1.33.11 → sourcecode-1.33.13}/src/sourcecode/detectors/go.py +0 -0
  39. {sourcecode-1.33.11 → sourcecode-1.33.13}/src/sourcecode/detectors/heuristic.py +0 -0
  40. {sourcecode-1.33.11 → sourcecode-1.33.13}/src/sourcecode/detectors/hybrid.py +0 -0
  41. {sourcecode-1.33.11 → sourcecode-1.33.13}/src/sourcecode/detectors/java.py +0 -0
  42. {sourcecode-1.33.11 → sourcecode-1.33.13}/src/sourcecode/detectors/jvm_ext.py +0 -0
  43. {sourcecode-1.33.11 → sourcecode-1.33.13}/src/sourcecode/detectors/nodejs.py +0 -0
  44. {sourcecode-1.33.11 → sourcecode-1.33.13}/src/sourcecode/detectors/parsers.py +0 -0
  45. {sourcecode-1.33.11 → sourcecode-1.33.13}/src/sourcecode/detectors/php.py +0 -0
  46. {sourcecode-1.33.11 → sourcecode-1.33.13}/src/sourcecode/detectors/project.py +0 -0
  47. {sourcecode-1.33.11 → sourcecode-1.33.13}/src/sourcecode/detectors/python.py +0 -0
  48. {sourcecode-1.33.11 → sourcecode-1.33.13}/src/sourcecode/detectors/ruby.py +0 -0
  49. {sourcecode-1.33.11 → sourcecode-1.33.13}/src/sourcecode/detectors/rust.py +0 -0
  50. {sourcecode-1.33.11 → sourcecode-1.33.13}/src/sourcecode/detectors/systems.py +0 -0
  51. {sourcecode-1.33.11 → sourcecode-1.33.13}/src/sourcecode/detectors/terraform.py +0 -0
  52. {sourcecode-1.33.11 → sourcecode-1.33.13}/src/sourcecode/detectors/tooling.py +0 -0
  53. {sourcecode-1.33.11 → sourcecode-1.33.13}/src/sourcecode/doc_analyzer.py +0 -0
  54. {sourcecode-1.33.11 → sourcecode-1.33.13}/src/sourcecode/entrypoint_classifier.py +0 -0
  55. {sourcecode-1.33.11 → sourcecode-1.33.13}/src/sourcecode/env_analyzer.py +0 -0
  56. {sourcecode-1.33.11 → sourcecode-1.33.13}/src/sourcecode/error_schema.py +0 -0
  57. {sourcecode-1.33.11 → sourcecode-1.33.13}/src/sourcecode/file_classifier.py +0 -0
  58. {sourcecode-1.33.11 → sourcecode-1.33.13}/src/sourcecode/flow_analyzer.py +0 -0
  59. {sourcecode-1.33.11 → sourcecode-1.33.13}/src/sourcecode/git_analyzer.py +0 -0
  60. {sourcecode-1.33.11 → sourcecode-1.33.13}/src/sourcecode/graph_analyzer.py +0 -0
  61. {sourcecode-1.33.11 → sourcecode-1.33.13}/src/sourcecode/license.py +0 -0
  62. {sourcecode-1.33.11 → sourcecode-1.33.13}/src/sourcecode/mcp/__init__.py +0 -0
  63. {sourcecode-1.33.11 → sourcecode-1.33.13}/src/sourcecode/mcp/onboarding/__init__.py +0 -0
  64. {sourcecode-1.33.11 → sourcecode-1.33.13}/src/sourcecode/mcp/onboarding/applier.py +0 -0
  65. {sourcecode-1.33.11 → sourcecode-1.33.13}/src/sourcecode/mcp/onboarding/backup.py +0 -0
  66. {sourcecode-1.33.11 → sourcecode-1.33.13}/src/sourcecode/mcp/onboarding/detector.py +0 -0
  67. {sourcecode-1.33.11 → sourcecode-1.33.13}/src/sourcecode/mcp/onboarding/planner.py +0 -0
  68. {sourcecode-1.33.11 → sourcecode-1.33.13}/src/sourcecode/mcp/orchestrator.py +0 -0
  69. {sourcecode-1.33.11 → sourcecode-1.33.13}/src/sourcecode/mcp/registry.py +0 -0
  70. {sourcecode-1.33.11 → sourcecode-1.33.13}/src/sourcecode/mcp/runner.py +0 -0
  71. {sourcecode-1.33.11 → sourcecode-1.33.13}/src/sourcecode/mcp/server.py +0 -0
  72. {sourcecode-1.33.11 → sourcecode-1.33.13}/src/sourcecode/mcp_nudge.py +0 -0
  73. {sourcecode-1.33.11 → sourcecode-1.33.13}/src/sourcecode/metrics_analyzer.py +0 -0
  74. {sourcecode-1.33.11 → sourcecode-1.33.13}/src/sourcecode/path_filters.py +0 -0
  75. {sourcecode-1.33.11 → sourcecode-1.33.13}/src/sourcecode/pr_comment_renderer.py +0 -0
  76. {sourcecode-1.33.11 → sourcecode-1.33.13}/src/sourcecode/progress.py +0 -0
  77. {sourcecode-1.33.11 → sourcecode-1.33.13}/src/sourcecode/ranking_engine.py +0 -0
  78. {sourcecode-1.33.11 → sourcecode-1.33.13}/src/sourcecode/redactor.py +0 -0
  79. {sourcecode-1.33.11 → sourcecode-1.33.13}/src/sourcecode/relevance_scorer.py +0 -0
  80. {sourcecode-1.33.11 → sourcecode-1.33.13}/src/sourcecode/repo_classifier.py +0 -0
  81. {sourcecode-1.33.11 → sourcecode-1.33.13}/src/sourcecode/ris.py +0 -0
  82. {sourcecode-1.33.11 → sourcecode-1.33.13}/src/sourcecode/runtime_classifier.py +0 -0
  83. {sourcecode-1.33.11 → sourcecode-1.33.13}/src/sourcecode/scanner.py +0 -0
  84. {sourcecode-1.33.11 → sourcecode-1.33.13}/src/sourcecode/schema.py +0 -0
  85. {sourcecode-1.33.11 → sourcecode-1.33.13}/src/sourcecode/semantic_analyzer.py +0 -0
  86. {sourcecode-1.33.11 → sourcecode-1.33.13}/src/sourcecode/serializer.py +0 -0
  87. {sourcecode-1.33.11 → sourcecode-1.33.13}/src/sourcecode/summarizer.py +0 -0
  88. {sourcecode-1.33.11 → sourcecode-1.33.13}/src/sourcecode/telemetry/__init__.py +0 -0
  89. {sourcecode-1.33.11 → sourcecode-1.33.13}/src/sourcecode/telemetry/config.py +0 -0
  90. {sourcecode-1.33.11 → sourcecode-1.33.13}/src/sourcecode/telemetry/consent.py +0 -0
  91. {sourcecode-1.33.11 → sourcecode-1.33.13}/src/sourcecode/telemetry/events.py +0 -0
  92. {sourcecode-1.33.11 → sourcecode-1.33.13}/src/sourcecode/telemetry/filters.py +0 -0
  93. {sourcecode-1.33.11 → sourcecode-1.33.13}/src/sourcecode/telemetry/transport.py +0 -0
  94. {sourcecode-1.33.11 → sourcecode-1.33.13}/src/sourcecode/tree_utils.py +0 -0
  95. {sourcecode-1.33.11 → sourcecode-1.33.13}/src/sourcecode/workspace.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: sourcecode
3
- Version: 1.33.11
3
+ Version: 1.33.13
4
4
  Summary: Persistent structural context and ultra-fast repeated analysis for AI coding agents
5
5
  License-File: LICENSE
6
6
  Keywords: agents,ai,codebase,context,developer-tools,llm
@@ -39,7 +39,7 @@ Description-Content-Type: text/markdown
39
39
 
40
40
  **Persistent structural context and ultra-fast repeated analysis for AI coding agents.**
41
41
 
42
- ![Version](https://img.shields.io/badge/version-1.33.11-blue)
42
+ ![Version](https://img.shields.io/badge/version-1.33.13-blue)
43
43
  ![Python](https://img.shields.io/badge/python-3.10%2B-green)
44
44
 
45
45
  ---
@@ -2,7 +2,7 @@
2
2
 
3
3
  **Persistent structural context and ultra-fast repeated analysis for AI coding agents.**
4
4
 
5
- ![Version](https://img.shields.io/badge/version-1.33.11-blue)
5
+ ![Version](https://img.shields.io/badge/version-1.33.13-blue)
6
6
  ![Python](https://img.shields.io/badge/python-3.10%2B-green)
7
7
 
8
8
  ---
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "sourcecode"
7
- version = "1.33.11"
7
+ version = "1.33.13"
8
8
  description = "Persistent structural context and ultra-fast repeated analysis for AI coding agents"
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.9"
@@ -1,3 +1,3 @@
1
1
  """sourcecode — Deterministic codebase context maps for AI coding agents."""
2
2
 
3
- __version__ = "1.33.11"
3
+ __version__ = "1.33.13"
@@ -1102,6 +1102,9 @@ def main(
1102
1102
  obj = _jm.loads(raw)
1103
1103
  if isinstance(obj, dict):
1104
1104
  obj["_cache"] = meta
1105
+ # Top-level cache_source for one release — backward compat alias
1106
+ if "cache_source" in meta:
1107
+ obj["cache_source"] = meta["cache_source"]
1105
1108
  return _jm.dumps(obj, indent=2, ensure_ascii=False)
1106
1109
  except Exception:
1107
1110
  pass
@@ -2273,6 +2276,9 @@ def _make_explanation(reason: str, why: str) -> str:
2273
2276
  def _serialize_relevant_file(f: Any) -> dict:
2274
2277
  from dataclasses import asdict as _asdict
2275
2278
  d = {k: v for k, v in _asdict(f).items() if v != "" and v is not None}
2279
+ # Emit 'file' as backward-compat alias for 'path' for one release
2280
+ if "path" in d:
2281
+ d["file"] = d["path"]
2276
2282
  reason = d.pop("reason", "") or ""
2277
2283
  why = d.pop("why", "") or ""
2278
2284
  # Expose score as a rounded float so agents can rank/filter files deterministically.
@@ -67,7 +67,7 @@ _TRIM_SCHEDULE: list[tuple[str, str | None, int]] = [
67
67
  ("execution_paths", None, 0),
68
68
  ("dependency_graph_summary", None, 0),
69
69
  # Step 6 — last resort
70
- ("relevant_files", None, 3),
70
+ ("relevant_files", None, 10),
71
71
  ("suspected_areas", None, 0),
72
72
  ("key_dependencies", None, 0),
73
73
  ]
@@ -148,7 +148,7 @@ def trim_to_budget(data: dict, budget_bytes: int, *, label: str = "") -> dict:
148
148
  # Budget constants (bytes) — used by CLI callers
149
149
  BUDGET_COMPACT = 30_000 # compact/agent main cmd
150
150
  BUDGET_AGENT = 40_000 # agent main cmd (slightly more headroom)
151
- BUDGET_FIX_BUG = 100_000 # fix-bug (with or without --symptom)
151
+ BUDGET_FIX_BUG = 200_000 # fix-bug (with or without --symptom)
152
152
  BUDGET_REVIEW_PR = 100_000 # review-pr
153
153
  BUDGET_ONBOARD = 30_000 # onboard
154
154
  BUDGET_EXPLAIN = 30_000 # explain
@@ -627,6 +627,21 @@ _FRONTEND_SYMPTOM_MAP: dict[str, list[str]] = {
627
627
  "trabajador": ["trabajador", "empleado", "worker", "asignacion", "trabajadordao", "trabajadorservice"],
628
628
  }
629
629
 
630
+ # Generic words that add noise when used as symptom keywords in large repos.
631
+ # NOTE(review): "token" and "user" are considered too ubiquitous in auth systems to be useful alone, but neither is in the set below — confirm whether they should be added.
632
+ _SYMPTOM_STOP_WORDS: frozenset[str] = frozenset({
633
+ "fails", "fail", "failed", "failure",
634
+ "not", "for", "with", "when", "that", "the", "and", "but",
635
+ "are", "has", "had", "have", "was", "were",
636
+ "get", "set", "can", "does", "did", "should", "would", "could",
637
+ "null", "none", "empty", "invalid", "incorrect", "wrong", "missing",
638
+ "error", "issue", "problem", "bug",
639
+ "from", "into", "via", "due", "also", "after", "before",
640
+ "slow", "fast", "new", "old",
641
+ })
642
+
643
+ # Repo-scale threshold: above this file count, use stricter injection logic.
644
+ _LARGE_REPO_THRESHOLD = 500
630
645
 
631
646
  MAX_FILES_FAST = 2000 # above this threshold --fast uses git-index-only mode
632
647
 
@@ -1695,7 +1710,7 @@ class TaskContextBuilder:
1695
1710
  _camel_expanded = _re.sub(r'([A-Z]+)([A-Z][a-z])', r'\1 \2', _camel_expanded)
1696
1711
  symptom_keywords = [
1697
1712
  w.lower() for w in _re.split(r"[\s\W]+", _camel_expanded)
1698
- if len(w) > 2
1713
+ if len(w) > 2 and w.lower() not in _SYMPTOM_STOP_WORDS
1699
1714
  ]
1700
1715
  if symptom_keywords:
1701
1716
  # Pre-compile combined keyword pattern for fast content scanning
@@ -1710,6 +1725,7 @@ class TaskContextBuilder:
1710
1725
  _sx_commits: list[dict] = []
1711
1726
  _sx_synonyms: list[str] = []
1712
1727
  _sx_boosts: list[dict] = []
1728
+ _sx_graph_expanded: list[str] = []
1713
1729
 
1714
1730
  # Pass 1: surface code notes whose text contains any keyword
1715
1731
  _note_matched_paths: dict[str, int] = {} # path → count of matching notes
@@ -1759,14 +1775,27 @@ class TaskContextBuilder:
1759
1775
  ))
1760
1776
  _existing_paths.add(_cp)
1761
1777
 
1762
- # Pass 4: inject files whose path matches symptom keywords
1778
+ # Scale-awareness: large repos need wider scan and stricter injection.
1779
+ _is_large_repo = len(all_paths) > _LARGE_REPO_THRESHOLD
1780
+
1781
+ # Pass 4: inject files whose path matches symptom keywords.
1782
+ # CamelCase-expand the filename stem so "OfflineSessionLoader" matches
1783
+ # the keyword "offline" even without an explicit directory separator.
1784
+ _p4_dirs_of_injected: set[str] = set() # directories of high-score injects
1763
1785
  for _p in all_paths:
1764
1786
  if _p in _existing_paths:
1765
1787
  continue
1766
1788
  if Path(_p).suffix.lower() not in _ALL_EXTENSIONS:
1767
1789
  continue
1768
1790
  _p_lower = _p.lower()
1769
- _matching_kws = [kw for kw in symptom_keywords if kw in _p_lower]
1791
+ # CamelCase-expand the stem and append to the search string so
1792
+ # "OfflineSessionLoader" → "offline session loader" can match
1793
+ # individual keyword tokens beyond what substring search finds.
1794
+ _stem_raw = Path(_p).stem
1795
+ _stem_exp = _re.sub(r'([a-z])([A-Z])', r'\1 \2', _stem_raw)
1796
+ _stem_exp = _re.sub(r'([A-Z]+)([A-Z][a-z])', r'\1 \2', _stem_exp).lower()
1797
+ _p_search = _p_lower + " " + _stem_exp
1798
+ _matching_kws = [kw for kw in symptom_keywords if kw in _p_search]
1770
1799
  if not _matching_kws:
1771
1800
  continue
1772
1801
  _boost = 0.2 * len(_matching_kws)
@@ -1781,6 +1810,8 @@ class TaskContextBuilder:
1781
1810
  ))
1782
1811
  _existing_paths.add(_p)
1783
1812
  _sx_direct_path.append(_p)
1813
+ if _injected_score >= 0.7:
1814
+ _p4_dirs_of_injected.add(str(Path(_p).parent))
1784
1815
 
1785
1816
  # Pass 4b: grep-based injection for frontend→backend synonym terms.
1786
1817
  # Runs parallel grep for each backend term to find files not yet in
@@ -1828,13 +1859,46 @@ class TaskContextBuilder:
1828
1859
  ))
1829
1860
  _existing_paths_now.add(_gf)
1830
1861
 
1831
- # Sort before content scan so top candidates get read first
1832
- relevant_files = sorted(relevant_files, key=lambda rf: -rf.score)
1833
- _CONTENT_SCAN_LIMIT = 80
1862
+ # Pass 4c: subsystem co-location — inject sibling files from the same
1863
+ # directories as high-score (≥0.7) path-matched files. This catches
1864
+ # architecturally adjacent classes that don't mention symptom keywords
1865
+ # in their own name (e.g. InfinispanOfflineSessionCacheEntryLifespan…
1866
+ # siblings in the same infinispan/ package).
1867
+ if _is_large_repo and _p4_dirs_of_injected:
1868
+ _coloc_existing = {rf.path for rf in relevant_files}
1869
+ for _cp in all_paths:
1870
+ if _cp in _coloc_existing:
1871
+ continue
1872
+ if Path(_cp).suffix.lower() not in _src_exts:
1873
+ continue
1874
+ if str(Path(_cp).parent) in _p4_dirs_of_injected:
1875
+ relevant_files.append(RelevantFile(
1876
+ path=_cp,
1877
+ role="symptom_match",
1878
+ score=0.55,
1879
+ reason="subsystem co-location: same directory as symptom-matched file",
1880
+ why="directory proximity injection",
1881
+ ))
1882
+ _coloc_existing.add(_cp)
1883
+
1884
+ # Sort before content scan so top candidates get read first.
1885
+ # In large repos: prioritise symptom_match files within each score band
1886
+ # so that subsystem-relevant files are content-scanned before generic
1887
+ # structural files at the same score.
1888
+ if _is_large_repo:
1889
+ relevant_files = sorted(
1890
+ relevant_files,
1891
+ key=lambda rf: (-rf.score, 0 if rf.role == "symptom_match" else 1),
1892
+ )
1893
+ _CONTENT_SCAN_LIMIT = 150
1894
+ else:
1895
+ relevant_files = sorted(relevant_files, key=lambda rf: -rf.score)
1896
+ _CONTENT_SCAN_LIMIT = 80
1834
1897
  _scan_candidates = relevant_files[:_CONTENT_SCAN_LIMIT]
1835
1898
  _no_scan_candidates = relevant_files[_CONTENT_SCAN_LIMIT:]
1836
1899
 
1837
1900
  _boosted: list[RelevantFile] = []
1901
+ _scanned_body: dict[str, str] = {} # cache for graph expansion (Pass 5)
1838
1902
  for _rf in _scan_candidates:
1839
1903
  _extra = 0.0
1840
1904
  _extra_syn = 0.0
@@ -1869,9 +1933,11 @@ class TaskContextBuilder:
1869
1933
  _body_lower = ""
1870
1934
  if Path(_rf.path).suffix.lower() in _src_exts:
1871
1935
  try:
1872
- _body_lower = (self.root / _rf.path).read_text(
1936
+ _raw_body = (self.root / _rf.path).read_text(
1873
1937
  encoding="utf-8", errors="replace"
1874
- )[:12000].lower() # ~300 lines avg
1938
+ )[:12000] # ~300 lines avg
1939
+ _scanned_body[_rf.path] = _raw_body # cache for Pass 5
1940
+ _body_lower = _raw_body.lower()
1875
1941
  except OSError:
1876
1942
  pass
1877
1943
 
@@ -1905,15 +1971,130 @@ class TaskContextBuilder:
1905
1971
  elif _extra_syn > 0:
1906
1972
  _new_reason = _rf.reason + f", synonym-match backend (+{_extra_syn:.2f})"
1907
1973
 
1974
+ _final_score = round(min(_rf.score + _total_extra, 1.0), 2)
1908
1975
  _boosted.append(RelevantFile(
1909
1976
  path=_rf.path,
1910
1977
  role=_rf.role,
1911
- score=round(min(_rf.score + _total_extra, 1.0), 2),
1978
+ score=_final_score,
1912
1979
  reason=_new_reason,
1913
1980
  why=_rf.why,
1914
1981
  ))
1915
1982
 
1916
- relevant_files = sorted(_boosted + _no_scan_candidates, key=lambda rf: -rf.score)
1983
+ # Use total boost as a secondary sort key so symptom-matched files
1984
+ # that were boosted from a lower base score rank above structural
1985
+ # files that coincidentally reach the same capped score of 1.0.
1986
+ # This prevents budget-trimming from discarding the most relevant files.
1987
+ _boost_totals: dict[str, float] = {}
1988
+ for _rf in _scan_candidates:
1989
+ pass # populated below
1990
+ _boost_totals = {}
1991
+ for _idx, _rf in enumerate(_scan_candidates):
1992
+ _b_rf = _boosted[_idx]
1993
+ _boost_totals[_b_rf.path] = round(_b_rf.score - _rf.score, 4)
1994
+
1995
+ relevant_files = sorted(
1996
+ _boosted + _no_scan_candidates,
1997
+ key=lambda rf: (-rf.score, -_boost_totals.get(rf.path, 0)),
1998
+ )
1999
+
2000
+ # Pass 5: reverse graph expansion from high-score seed nodes.
2001
+ # Identifies which source files in the repo REFERENCE the seed
2002
+ # classes (imports, implements, extends, field declarations).
2003
+ # This is a reverse-import lookup: for seed class "UserProvider",
2004
+ # it finds JpaUserProvider / DefaultUserSessionProvider which import
2005
+ # UserProvider — even though those files don't contain symptom
2006
+ # keywords in their own path.
2007
+ # Seeds include any high-score file (not just symptom_match role)
2008
+ # so that files found by _rank_files class-name matching also expand.
2009
+ if not fast:
2010
+ import re as _re_gx
2011
+ _GX_SEED_THRESH = 0.5
2012
+ _GX_EXPAND_CAP = 30
2013
+ _GX_HOP_DECAY = 0.6
2014
+
2015
+ # Collect seed class names from high-score results
2016
+ _gx_seed_stems: dict[str, float] = {} # stem → score
2017
+ for _gx_rf in relevant_files:
2018
+ if _gx_rf.score < _GX_SEED_THRESH:
2019
+ continue
2020
+ if Path(_gx_rf.path).suffix.lower() not in _src_exts:
2021
+ continue
2022
+ _gx_stem = Path(_gx_rf.path).stem
2023
+ _gx_seed_stems[_gx_stem] = max(
2024
+ _gx_seed_stems.get(_gx_stem, 0.0), _gx_rf.score
2025
+ )
2026
+
2027
+ if _gx_seed_stems:
2028
+ # Compile per-stem word-boundary patterns for fast matching
2029
+ import re as _re_gx2
2030
+ _gx_patterns: dict[str, Any] = {
2031
+ stem: _re_gx2.compile(rf'\b{_re_gx2.escape(stem)}\b')
2032
+ for stem in _gx_seed_stems
2033
+ }
2034
+
2035
+ _gx_existing = {rf.path for rf in relevant_files}
2036
+ _gx_new: list[RelevantFile] = []
2037
+ _gx_added: set[str] = set()
2038
+
2039
+ # Candidates: non-test source files not yet in results.
2040
+ # Small repos: scan all; large repos: use pre-scanned content only.
2041
+ # Test files are excluded (fix-bug focuses on production code).
2042
+ if _is_large_repo:
2043
+ _gx_candidates = [
2044
+ p for p in _scanned_body
2045
+ if p not in _gx_existing and not self._is_test(p)
2046
+ ]
2047
+ else:
2048
+ _gx_candidates = [
2049
+ p for p in all_paths
2050
+ if p not in _gx_existing
2051
+ and Path(p).suffix.lower() in _src_exts
2052
+ and not self._is_test(p)
2053
+ ]
2054
+
2055
+ for _gx_cand in _gx_candidates:
2056
+ if len(_gx_new) >= _GX_EXPAND_CAP:
2057
+ break
2058
+ if _gx_cand in _gx_added:
2059
+ continue
2060
+
2061
+ # Use cached content or read fresh (small repos only)
2062
+ _gx_body = _scanned_body.get(_gx_cand)
2063
+ if _gx_body is None:
2064
+ if _is_large_repo:
2065
+ continue # never do fresh reads on large repos in Pass 5
2066
+ try:
2067
+ _gx_body = (self.root / _gx_cand).read_text(
2068
+ encoding="utf-8", errors="replace"
2069
+ )[:8000]
2070
+ except OSError:
2071
+ continue
2072
+
2073
+ # Reverse lookup: does this file reference any seed class?
2074
+ for _gx_stem, _gx_seed_score in _gx_seed_stems.items():
2075
+ if _gx_patterns[_gx_stem].search(_gx_body):
2076
+ _hop1_score = round(
2077
+ min(_gx_seed_score * _GX_HOP_DECAY, 0.85), 2
2078
+ )
2079
+ _gx_new.append(RelevantFile(
2080
+ path=_gx_cand,
2081
+ role="symptom_match",
2082
+ score=_hop1_score,
2083
+ reason=(
2084
+ f"graph_expansion: references {_gx_stem} "
2085
+ f"(1-hop reverse import)"
2086
+ ),
2087
+ why=f"graph_expansion: 1 hop from {_gx_stem}",
2088
+ ))
2089
+ _gx_added.add(_gx_cand)
2090
+ _sx_graph_expanded.append(_gx_cand)
2091
+ break # one match per candidate is enough
2092
+
2093
+ if _gx_new:
2094
+ relevant_files = sorted(
2095
+ relevant_files + _gx_new,
2096
+ key=lambda rf: (-rf.score, -_boost_totals.get(rf.path, 0)),
2097
+ )
1917
2098
 
1918
2099
  # Synonym note (only when synonyms actually fired)
1919
2100
  if _frontend_kws and _sx_synonyms:
@@ -1938,6 +2119,7 @@ class TaskContextBuilder:
1938
2119
  "content_matches": _sx_content[:10],
1939
2120
  "commit_matches": _sx_commits[:10],
1940
2121
  "synonym_matches": _sx_synonyms[:10],
2122
+ "graph_expansion": _sx_graph_expanded[:10],
1941
2123
  "boosts": _sx_boosts[:30],
1942
2124
  "final_boost": round(
1943
2125
  sum(b["value"] for b in _sx_boosts), 3
@@ -2390,7 +2572,8 @@ class TaskContextBuilder:
2390
2572
  else:
2391
2573
  _symptom_class_names.add(_tok)
2392
2574
  _symptom_tokens = {
2393
- w.lower() for w in _re_bug.split(r'[\s\W]+', symptom) if len(w) > 2
2575
+ w.lower() for w in _re_bug.split(r'[\s\W]+', symptom)
2576
+ if len(w) > 2 and w.lower() not in _SYMPTOM_STOP_WORDS
2394
2577
  }
2395
2578
 
2396
2579
  scored: list[tuple[float, str, RelevantFile]] = []
@@ -2487,9 +2670,16 @@ class TaskContextBuilder:
2487
2670
  content_boost += 0.8
2488
2671
  _why_parts.append("exception type in path (+0.8)")
2489
2672
 
2490
- # AND-weighted token intersection — multiple matching tokens >> single
2673
+ # AND-weighted token intersection — multiple matching tokens >> single.
2674
+ # CamelCase-expand the filename stem so "OfflineSessionLoader" contributes
2675
+ # "offline", "session", "loader" as individual tokens beyond what the raw
2676
+ # path splitting yields. This lets multi-word symptoms match class names.
2491
2677
  if _symptom_tokens:
2492
2678
  _path_parts = set(path_lower.replace("/", " ").replace(".", " ").replace("_", " ").split())
2679
+ _stem_cc = Path(path).stem
2680
+ _stem_cc_exp = _re_bug.sub(r'([a-z])([A-Z])', r'\1 \2', _stem_cc)
2681
+ _stem_cc_exp = _re_bug.sub(r'([A-Z]+)([A-Z][a-z])', r'\1 \2', _stem_cc_exp).lower()
2682
+ _path_parts.update(_stem_cc_exp.split())
2493
2683
  _intersection = _symptom_tokens & _path_parts
2494
2684
  _n_match = len(_intersection)
2495
2685
  if _n_match >= 3:
@@ -888,15 +888,40 @@ def _extract_mapped_paths(source: str, class_fqn: str) -> dict[str, str]:
888
888
  # Phase 3 — Symbol relation graph
889
889
  # ---------------------------------------------------------------------------
890
890
 
891
+ def _build_same_package_map(symbols: list[SymbolRecord]) -> dict[str, dict[str, str]]:
892
+ """Build {package: {simple_name: FQN}} map from all class/interface symbols.
893
+
894
+ Used by build_repo_ir to resolve same-package types that need no explicit import.
895
+ In Java, classes in the same package reference each other without import statements,
896
+ so import_map is empty for them — this map provides the fallback resolution.
897
+ """
898
+ result: dict[str, dict[str, str]] = {}
899
+ for sym in symbols:
900
+ if sym.type not in ("class", "interface") or "#" in sym.symbol:
901
+ continue
902
+ pkg = sym.symbol.rsplit(".", 1)[0] if "." in sym.symbol else ""
903
+ simple = sym.symbol.split(".")[-1]
904
+ result.setdefault(pkg, {})[simple] = sym.symbol
905
+ return result
906
+
907
+
891
908
  def _build_relations(
892
909
  symbols: list[SymbolRecord],
893
910
  raw_imports: list[str],
894
911
  source: str,
895
912
  package: str,
896
913
  rel_path: str,
914
+ same_pkg_types: dict[str, str] | None = None,
897
915
  ) -> list[RelationEdge]:
898
- """Phase 3: Build directed relation graph for symbols in one file."""
916
+ """Phase 3: Build directed relation graph for symbols in one file.
917
+
918
+ same_pkg_types: {simple_name → FQN} for classes in the same package.
919
+ Passed by build_repo_ir after a first pass that collects all symbols.
920
+ Enables resolving injection targets that share a package with the caller
921
+ and therefore need no explicit Java import statement.
922
+ """
899
923
  edges: list[RelationEdge] = []
924
+ _same_pkg: dict[str, str] = same_pkg_types or {}
900
925
 
901
926
  import_map: dict[str, str] = {}
902
927
  for fqn in raw_imports:
@@ -929,15 +954,27 @@ def _build_relations(
929
954
  ))
930
955
 
931
956
  if sym.type == "field":
932
- for imp_fqn in sym.imports_used:
957
+ _inject_ann = next(
958
+ (a for a in sym.annotations if a in _INJECT_ANNOTATIONS), "@Autowired"
959
+ )
960
+ _field_targets: set[str] = set(sym.imports_used)
961
+ # Same-package field injection: imports_used is empty when the field type
962
+ # shares a package with the declaring class (no import needed in Java).
963
+ # Extract type from signature ("Type name") and resolve via same_pkg_types.
964
+ if not _field_targets and _same_pkg:
965
+ _sig_type = (sym.signature or "").split()[0] if sym.signature else ""
966
+ _sig_base = re.sub(r'<.*', '', _sig_type).strip()
967
+ if _sig_base and _sig_base[0].isupper():
968
+ _same_fqn = _same_pkg.get(_sig_base)
969
+ if _same_fqn and _same_fqn != _enclosing_class(sym_fqn):
970
+ _field_targets.add(_same_fqn)
971
+ for imp_fqn in _field_targets:
933
972
  edges.append(RelationEdge(
934
973
  from_symbol=sym_fqn,
935
974
  to_symbol=imp_fqn,
936
975
  type="injects",
937
976
  confidence="high",
938
- evidence={"type": "annotation", "value": next(
939
- (a for a in sym.annotations if a in _INJECT_ANNOTATIONS), "@Autowired"
940
- )},
977
+ evidence={"type": "annotation", "value": _inject_ann},
941
978
  ))
942
979
 
943
980
  # ── Constructor injection ─────────────────────────────────────────────────
@@ -949,7 +986,7 @@ def _build_relations(
949
986
  continue
950
987
  for simple_type in sym.param_types:
951
988
  base = re.sub(r'<.*', '', simple_type).strip()
952
- fqn = import_map.get(base)
989
+ fqn = import_map.get(base) or _same_pkg.get(base)
953
990
  if fqn:
954
991
  edges.append(RelationEdge(
955
992
  from_symbol=sym.symbol,
@@ -982,7 +1019,7 @@ def _build_relations(
982
1019
  continue
983
1020
  _ftype = fld.group("type").strip()
984
1021
  _base = re.sub(r'<.*', '', _ftype).strip()
985
- _fqn = import_map.get(_base)
1022
+ _fqn = import_map.get(_base) or _same_pkg.get(_base)
986
1023
  if _fqn:
987
1024
  edges.append(RelationEdge(
988
1025
  from_symbol=sym.symbol,
@@ -2632,24 +2669,38 @@ def build_repo_ir(
2632
2669
  if since:
2633
2670
  _since_changed = _get_git_changed_files(root, since)
2634
2671
 
2672
+ # Pass 1: extract symbols from all files so we can build the same-package
2673
+ # type map before building relations. Java classes in the same package
2674
+ # reference each other without import statements, so import_map alone cannot
2675
+ # resolve them — _build_same_package_map provides the cross-file fallback.
2676
+ _per_file: list[tuple[str, str, str, list[str], list[SymbolRecord]]] = []
2635
2677
  for rel_path in sorted(file_paths):
2636
2678
  abs_path = root / rel_path
2637
2679
  try:
2638
2680
  source = abs_path.read_text(encoding="utf-8", errors="replace")
2639
2681
  except OSError:
2640
2682
  continue
2683
+ package, symbols, raw_imports = _extract_symbols(source, rel_path)
2684
+ all_symbols.extend(symbols)
2685
+ _per_file.append((rel_path, source, package, raw_imports, symbols))
2686
+
2687
+ # Build {package: {simple_name: FQN}} from every class/interface found.
2688
+ _same_pkg_map: dict[str, dict[str, str]] = _build_same_package_map(all_symbols)
2689
+
2690
+ # Pass 2: build relations with same-package type resolution available.
2691
+ for rel_path, source, package, raw_imports, symbols in _per_file:
2692
+ same_pkg_types = _same_pkg_map.get(package, {})
2693
+ relations = _build_relations(
2694
+ symbols, raw_imports, source, package, rel_path,
2695
+ same_pkg_types=same_pkg_types,
2696
+ )
2641
2697
 
2642
2698
  old_source: Optional[str] = None
2643
2699
  if since:
2644
- # Only fetch old content for files known to have changed.
2645
- # Unchanged files have no diff entries — skip git show entirely.
2646
2700
  _file_changed = _since_changed is None or rel_path in _since_changed
2647
2701
  if _file_changed:
2648
2702
  old_source = _get_git_old_content(root, rel_path, since)
2649
2703
 
2650
- package, symbols, raw_imports = _extract_symbols(source, rel_path)
2651
- relations = _build_relations(symbols, raw_imports, source, package, rel_path)
2652
-
2653
2704
  if old_source is not None:
2654
2705
  _, old_symbols, _ = _extract_symbols(old_source, rel_path)
2655
2706
  all_changed.extend(_diff_symbols(old_symbols, symbols))
@@ -2664,7 +2715,6 @@ def build_repo_ir(
2664
2715
  confidence="high",
2665
2716
  ))
2666
2717
 
2667
- all_symbols.extend(symbols)
2668
2718
  all_relations.extend(relations)
2669
2719
 
2670
2720
  spring_summary = _build_spring_summary(all_symbols)
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes