sourcecode 1.31.16__py3-none-any.whl → 1.31.18__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sourcecode/__init__.py +1 -1
- sourcecode/cache.py +470 -0
- sourcecode/cli.py +17 -13
- sourcecode/repository_ir.py +141 -1
- {sourcecode-1.31.16.dist-info → sourcecode-1.31.18.dist-info}/METADATA +111 -40
- {sourcecode-1.31.16.dist-info → sourcecode-1.31.18.dist-info}/RECORD +9 -8
- {sourcecode-1.31.16.dist-info → sourcecode-1.31.18.dist-info}/WHEEL +0 -0
- {sourcecode-1.31.16.dist-info → sourcecode-1.31.18.dist-info}/entry_points.txt +0 -0
- {sourcecode-1.31.16.dist-info → sourcecode-1.31.18.dist-info}/licenses/LICENSE +0 -0
sourcecode/__init__.py
CHANGED
sourcecode/cache.py
ADDED
|
@@ -0,0 +1,470 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Snapshot cache manager for sourcecode — v2.
|
|
3
|
+
|
|
4
|
+
Cache layout
|
|
5
|
+
------------
|
|
6
|
+
~/.sourcecode/cache/<repo_id>/
|
|
7
|
+
snapshot-<git_sha>-<flags_hash>.json.gz ← versioned envelope
|
|
8
|
+
cas/
|
|
9
|
+
<blob_hash16>.gz ← content-addressed blobs
|
|
10
|
+
|
|
11
|
+
Schema
|
|
12
|
+
------
|
|
13
|
+
Every snapshot file is a gzip-compressed JSON *envelope*:
|
|
14
|
+
|
|
15
|
+
{
|
|
16
|
+
"sv": "2", // schema version — bump to invalidate all
|
|
17
|
+
"key": "abc1234-aabbccdd", // cache key (git_sha + flags_hash)
|
|
18
|
+
"ts": "2026-05-24T22:00:00Z", // write timestamp (ISO-8601 UTC)
|
|
19
|
+
"fmt": "json", // output format: "json" | "yaml"
|
|
20
|
+
"layers": {"heuristic": "...", ...}, // analyzer fingerprints at write time
|
|
21
|
+
// ── content (one of two forms) ──────────────────────────────────────
|
|
22
|
+
"snap": {...}, // inline fields (small) — JSON mode
|
|
23
|
+
"cas": {"file_paths": "<h16>",…} // large fields deduped into CAS store
|
|
24
|
+
// — OR —
|
|
25
|
+
"raw": "<content string>" // YAML or unparseable JSON stored as-is
|
|
26
|
+
}
|
|
27
|
+
|
|
28
|
+
Content-addressed store (CAS)
|
|
29
|
+
-----------------------------
|
|
30
|
+
Large top-level JSON fields (> _CAS_THRESHOLD bytes) are extracted into the
|
|
31
|
+
``cas/`` directory as individual gzip-compressed blobs identified by a 16-char
|
|
32
|
+
SHA-256 hash of their uncompressed bytes. Two snapshots that share an
|
|
33
|
+
identical ``file_paths`` array reference the *same* blob — zero duplication.
|
|
34
|
+
|
|
35
|
+
Eviction / GC
|
|
36
|
+
-------------
|
|
37
|
+
After each write, ``_gc()`` keeps snapshots from the last
|
|
38
|
+
``SOURCECODE_CACHE_KEEP_COMMITS`` distinct git commits (default 5, override via
|
|
39
|
+
env var).  A CAS sweep runs in the same pass, right after eviction: blobs unreferenced by any surviving
|
|
40
|
+
snapshot are deleted.
|
|
41
|
+
|
|
42
|
+
Backward compatibility
|
|
43
|
+
----------------------
|
|
44
|
+
v1 files (raw gzip'd content, no envelope) are detected by the absence of an
|
|
45
|
+
``sv`` key in the decompressed JSON, and served transparently. Legacy files
|
|
46
|
+
in ``<repo>/.sourcecode-cache/`` are also checked as a final fallback.
|
|
47
|
+
|
|
48
|
+
Env vars
|
|
49
|
+
--------
|
|
50
|
+
SOURCECODE_CACHE_DIR Override global cache base (default: ~/.sourcecode/cache)
|
|
51
|
+
SOURCECODE_CACHE_KEEP_COMMITS How many git commits to retain (default: 5; 0 = unlimited)
|
|
52
|
+
"""
|
|
53
|
+
from __future__ import annotations
|
|
54
|
+
|
|
55
|
+
import gzip
|
|
56
|
+
import hashlib
|
|
57
|
+
import json
|
|
58
|
+
import os
|
|
59
|
+
import re
|
|
60
|
+
from datetime import datetime, timezone
|
|
61
|
+
from pathlib import Path
|
|
62
|
+
from typing import Any, Optional
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
# ---------------------------------------------------------------------------
|
|
66
|
+
# Version / constants
|
|
67
|
+
# ---------------------------------------------------------------------------
|
|
68
|
+
|
|
69
|
+
#: Envelope schema version; changing this string invalidates *all* cached snapshots.
SCHEMA_VERSION: str = "2"

#: Top-level JSON keys that may be moved out of the envelope into the CAS store.
_CAS_FIELDS: frozenset[str] = frozenset({
    "file_paths",
    "entry_points",
    "docs",
    "dependencies",
    "graph",
    "semantic_calls",
    "semantic_symbols",
    "architecture",
    "metrics",
    "git_history",
    "env_map",
    "code_notes",
})

#: A CAS-eligible field is externalised once its serialised form exceeds this many bytes.
_CAS_THRESHOLD: int = 4096

#: Number of distinct commits retained when SOURCECODE_CACHE_KEEP_COMMITS is unset.
_DEFAULT_KEEP_COMMITS: int = 5

# Snapshot file names look like "snapshot-<hex_commit>-<hex_flags>.json.gz";
# group 1 captures the commit part.
_SNAPSHOT_RE = re.compile(r"^snapshot-([0-9a-f]+)-[0-9a-f]+\.json\.gz$")
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
# ---------------------------------------------------------------------------
|
|
98
|
+
# Public API — location helpers
|
|
99
|
+
# ---------------------------------------------------------------------------
|
|
100
|
+
|
|
101
|
+
def repo_id(repo_root: Path) -> str:
    """Return a 16-hex-character identifier for *repo_root*.

    The identifier is the first 16 characters of the SHA-256 hex digest of
    the resolved path string, so the same directory always maps to the same id.
    """
    canonical = str(repo_root.resolve())
    digest = hashlib.sha256(canonical.encode())
    return digest.hexdigest()[:16]
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def cache_dir(repo_root: Path) -> Path:
    """
    Return the cache directory dedicated to *repo_root*.

    The base directory is ``SOURCECODE_CACHE_DIR`` when that environment
    variable is set to a non-empty value, otherwise ``~/.sourcecode/cache``.
    The per-repo subdirectory is named after ``repo_id(repo_root)``.
    The directory is not created here.
    """
    override = os.environ.get("SOURCECODE_CACHE_DIR", "")
    if override:
        root = Path(override)
    else:
        root = Path.home() / ".sourcecode" / "cache"
    return root / repo_id(repo_root)
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
# ---------------------------------------------------------------------------
|
|
118
|
+
# Public API — read / write
|
|
119
|
+
# ---------------------------------------------------------------------------
|
|
120
|
+
|
|
121
|
+
def read(repo_root: Path, cache_key: str) -> Optional[str]:
    """
    Look up the cached snapshot for *cache_key*; return its content or ``None``.

    Two locations are consulted, in this order:
      1. ``<cache_dir>/snapshot-<cache_key>.json.gz`` — the global cache file.
      2. ``<repo_root>/.sourcecode-cache/snapshot-<cache_key>.json`` — legacy.

    When the global file exists but cannot be turned into content (parse
    error, schema mismatch, CAS miss), it is deleted and the call is a miss;
    the legacy location is not consulted in that case.
    """
    store = cache_dir(repo_root)

    # ── 1. Global cache file ────────────────────────────────────────────────
    envelope_file = store / f"snapshot-{cache_key}.json.gz"
    if envelope_file.exists():
        content: Optional[str]
        try:
            content = _parse_envelope(envelope_file.read_bytes(), store)
        except Exception:
            content = None
        if content is None:
            # Unusable entry — evict it so the next write starts clean.
            _safe_unlink(envelope_file)
        return content

    # ── 2. Legacy per-repo file ─────────────────────────────────────────────
    legacy_file = repo_root / ".sourcecode-cache" / f"snapshot-{cache_key}.json"
    if not legacy_file.exists():
        return None
    try:
        return legacy_file.read_text(encoding="utf-8")
    except Exception:
        return None
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
def write(
    repo_root: Path,
    cache_key: str,
    content: str,
    *,
    fmt: str = "json",
    layers: Optional[dict[str, str]] = None,
) -> None:
    """
    Persist *content* as a versioned, optionally CAS-deduped snapshot.

    Parameters
    ----------
    repo_root : Path
        Root directory of the analysed repository.
    cache_key : str
        ``"{git_sha}-{flags_hash}"`` identifying this analysis.
    content : str
        Final rendered output (JSON or YAML string).
    fmt : str
        ``"json"`` or ``"yaml"`` — determines whether CAS extraction applies.
    layers : dict[str, str], optional
        Analyzer fingerprints (from ``_compute_analyzer_fingerprints()``).
        Stored in the envelope.

    The snapshot is written to a temporary sibling file and then moved into
    place with ``os.replace`` so that a concurrent ``read()`` never observes
    (and evicts) a half-written file.

    Writes are always best-effort: any failure, including a failure of the
    garbage-collection pass that follows a successful write, is swallowed.
    """
    cache_d = cache_dir(repo_root)
    dest = cache_d / f"snapshot-{cache_key}.json.gz"
    # The ".tmp" suffix keeps the temp file out of the "snapshot-*.json.gz" glob used by GC.
    tmp = dest.with_name(f"{dest.name}.{os.getpid()}.tmp")
    try:
        cache_d.mkdir(parents=True, exist_ok=True)
        payload = _build_envelope(cache_key, content, fmt, layers or {}, cache_d)
        tmp.write_bytes(payload)
        os.replace(tmp, dest)
    except Exception:
        _safe_unlink(tmp)  # do not leave a partial temp file behind
        return  # non-fatal

    try:
        _gc(cache_d)
    except Exception:
        pass  # GC is housekeeping only; it must never fail the caller
|
|
191
|
+
|
|
192
|
+
|
|
193
|
+
# ---------------------------------------------------------------------------
|
|
194
|
+
# Envelope (de)serialisation
|
|
195
|
+
# ---------------------------------------------------------------------------
|
|
196
|
+
|
|
197
|
+
def _now_iso() -> str:
    """Return the current UTC time formatted as ``YYYY-MM-DDTHH:MM:SSZ``."""
    utc_now = datetime.now(timezone.utc)
    return utc_now.strftime("%Y-%m-%dT%H:%M:%SZ")
|
|
199
|
+
|
|
200
|
+
|
|
201
|
+
def _build_envelope(
    cache_key: str,
    content: str,
    fmt: str,
    layers: dict[str, str],
    cache_d: Path,
) -> bytes:
    """
    Wrap *content* in a versioned envelope and return it gzip-compressed.

    For ``fmt == "json"`` content that parses to a JSON object, the object is
    passed through ``_cas_extract`` and stored under ``"snap"`` (plus ``"cas"``
    when any field was externalised).  In every other case — non-JSON format,
    JSON that is not an object, or any error while parsing/extracting — the
    original string is stored verbatim under ``"raw"``.
    """
    envelope: dict[str, Any] = {
        "sv": SCHEMA_VERSION,
        "key": cache_key,
        "ts": _now_iso(),
        "fmt": fmt,
        "layers": layers,
    }

    store_raw = True
    if fmt == "json":
        try:
            parsed = json.loads(content)
            if isinstance(parsed, dict):
                inline, cas_refs = _cas_extract(parsed, cache_d)
                envelope["snap"] = inline
                if cas_refs:
                    envelope["cas"] = cas_refs
                store_raw = False
        except Exception:
            # Unparseable JSON or a CAS storage failure: keep the string as-is.
            store_raw = True

    if store_raw:
        envelope["raw"] = content

    encoded = json.dumps(envelope, ensure_ascii=False).encode("utf-8")
    return gzip.compress(encoded, compresslevel=6)
|
|
239
|
+
|
|
240
|
+
|
|
241
|
+
def _parse_envelope(data: bytes, cache_d: Path) -> Optional[str]:
    """
    Decompress *data*, parse the envelope, resolve CAS refs, return the content.

    Returns ``None`` when the bytes are not valid gzip/UTF-8, when the
    envelope's ``"sv"`` differs from ``SCHEMA_VERSION``, when a referenced CAS
    blob cannot be restored, or when the envelope is malformed (``"raw"`` is
    not a string, ``"snap"`` is missing or not an object, ``"cas"`` is not an
    object).

    v1 files — anything that is not a JSON object carrying an ``"sv"`` key —
    are returned verbatim as the decompressed text.
    """
    try:
        text = gzip.decompress(data).decode("utf-8")
    except Exception:
        return None

    # ── v1 detection ────────────────────────────────────────────────────────
    try:
        envelope = json.loads(text)
    except Exception:
        # Not JSON at all (e.g. a v1 YAML snapshot) — serve as-is.
        return text

    if not isinstance(envelope, dict) or "sv" not in envelope:
        # JSON without an envelope marker → v1 snapshot, raw content.
        return text
    if envelope["sv"] != SCHEMA_VERSION:
        return None  # envelope written under a different schema version

    # ── v2 envelope ─────────────────────────────────────────────────────────
    if "raw" in envelope:
        raw = envelope["raw"]
        return raw if isinstance(raw, str) else None

    inline = envelope.get("snap")
    if not isinstance(inline, dict):
        return None  # malformed envelope

    cas_refs = envelope.get("cas") or {}
    if not isinstance(cas_refs, dict):
        return None  # malformed envelope

    if cas_refs:
        restored = _cas_restore(inline, cas_refs, cache_d)
        if restored is None:
            return None  # CAS miss (blob evicted or corrupted)
    else:
        restored = dict(inline)

    # NOTE(review): assumes the pipeline renders JSON with indent=2 and
    # ensure_ascii=False, so this re-serialisation reproduces its output —
    # confirm against the serializer.  Key order follows the restored dict;
    # a CAS field whose key is absent from "snap" ends up after the inline keys.
    return json.dumps(restored, indent=2, ensure_ascii=False)
|
|
293
|
+
|
|
294
|
+
|
|
295
|
+
# ---------------------------------------------------------------------------
|
|
296
|
+
# CAS store
|
|
297
|
+
# ---------------------------------------------------------------------------
|
|
298
|
+
|
|
299
|
+
def _cas_dir(cache_d: Path) -> Path:
    """Return the ``cas`` subdirectory of *cache_d* (the directory is not created)."""
    return cache_d.joinpath("cas")
|
|
301
|
+
|
|
302
|
+
|
|
303
|
+
def _cas_path(cache_d: Path, blob_hash: str) -> Path:
    """Return the path of the blob file ``<blob_hash>.gz`` inside the CAS directory."""
    filename = f"{blob_hash}.gz"
    return _cas_dir(cache_d) / filename
|
|
305
|
+
|
|
306
|
+
|
|
307
|
+
def _cas_store_blob(cache_d: Path, serialised: str) -> str:
    """
    Store *serialised* (a JSON string) in the CAS.  Idempotent.

    The blob is named after the first 16 hex characters of the SHA-256 digest
    of its UTF-8 bytes and is stored gzip-compressed.  If a blob with that
    name already exists nothing is written.

    The blob is written to a temporary file and moved into place with
    ``os.replace``.  Because an existing blob is never rewritten, a direct
    write interrupted midway would leave a truncated file that every later
    snapshot keeps referencing; the atomic move rules that out.

    Returns the 16-char hash that identifies the blob.  Filesystem errors
    propagate to the caller.
    """
    raw = serialised.encode("utf-8")
    blob_hash = hashlib.sha256(raw).hexdigest()[:16]
    path = _cas_path(cache_d, blob_hash)
    if not path.exists():
        path.parent.mkdir(parents=True, exist_ok=True)
        tmp = path.with_name(f"{path.name}.{os.getpid()}.tmp")
        try:
            tmp.write_bytes(gzip.compress(raw, compresslevel=6))
            os.replace(tmp, path)
        except Exception:
            _safe_unlink(tmp)  # drop the partial temp file, then report the failure
            raise
    return blob_hash
|
|
320
|
+
|
|
321
|
+
|
|
322
|
+
def _cas_load_blob(cache_d: Path, blob_hash: str) -> Optional[str]:
    """
    Load the blob named *blob_hash* and return its decompressed text.

    Returns ``None`` when the blob file does not exist or cannot be read,
    decompressed, or decoded as UTF-8.
    """
    blob_file = _cas_path(cache_d, blob_hash)
    if blob_file.exists():
        try:
            compressed = blob_file.read_bytes()
            return gzip.decompress(compressed).decode("utf-8")
        except Exception:
            pass
    return None
|
|
331
|
+
|
|
332
|
+
|
|
333
|
+
def _cas_extract(
    snap_dict: dict[str, Any],
    cache_d: Path,
) -> tuple[dict[str, Any], dict[str, str]]:
    """
    Split *snap_dict* into an inline part and a set of CAS references.

    A top-level field is moved to the CAS when it:
      - is named in ``_CAS_FIELDS``,
      - is not ``None``, and
      - serialises (``ensure_ascii=False``) to more than ``_CAS_THRESHOLD``
        UTF-8 bytes.

    Returns ``(inline, cas_refs)``.  ``cas_refs`` maps each externalised key
    to its blob hash.  ``inline`` holds every key of *snap_dict* in its
    original order; externalised keys carry a ``None`` placeholder.  The
    placeholder lets ``_cas_restore`` put the value back in its original
    position, so the restored dict keeps the source key order instead of
    moving externalised fields to the end.  Since ``None`` values are never
    externalised, the placeholder cannot be confused with real data: the
    reader is driven by ``cas_refs``, not by the placeholder.
    """
    inline: dict[str, Any] = {}
    cas_refs: dict[str, str] = {}

    for key, value in snap_dict.items():
        inline[key] = value
        if key not in _CAS_FIELDS or value is None:
            continue
        serialised = json.dumps(value, ensure_ascii=False)
        if len(serialised.encode("utf-8")) > _CAS_THRESHOLD:
            cas_refs[key] = _cas_store_blob(cache_d, serialised)
            inline[key] = None  # position-preserving placeholder

    return inline, cas_refs
|
|
358
|
+
|
|
359
|
+
|
|
360
|
+
def _cas_restore(
    inline: dict[str, Any],
    cas_refs: dict[str, str],
    cache_d: Path,
) -> Optional[dict[str, Any]]:
    """
    Rebuild the full snapshot dict from *inline* plus the blobs in *cas_refs*.

    Works on a shallow copy of *inline*; each referenced blob is loaded,
    parsed as JSON and assigned under its field name.  Returns ``None`` as
    soon as one blob is missing or is not valid JSON, so the caller can treat
    the snapshot as a cache miss.
    """
    rebuilt: dict[str, Any] = dict(inline)
    for field, blob_hash in cas_refs.items():
        payload = _cas_load_blob(cache_d, blob_hash)
        if payload is None:
            return None  # blob evicted or unreadable → full miss
        try:
            value = json.loads(payload)
        except Exception:
            return None
        rebuilt[field] = value
    return rebuilt
|
|
380
|
+
|
|
381
|
+
|
|
382
|
+
# ---------------------------------------------------------------------------
|
|
383
|
+
# Eviction / GC
|
|
384
|
+
# ---------------------------------------------------------------------------
|
|
385
|
+
|
|
386
|
+
def _gc(cache_d: Path) -> None:
    """
    Evict old snapshots and sweep orphaned CAS blobs.

    Snapshots are grouped by the commit part of their file name.  The
    ``SOURCECODE_CACHE_KEEP_COMMITS`` most recently written groups (ranked by
    the newest mtime in each group) are kept and the rest are deleted.  A
    value ``<= 0`` disables eviction; an unset, empty or non-integer value
    falls back to ``_DEFAULT_KEEP_COMMITS``.

    Snapshot files whose name does not match ``_SNAPSHOT_RE`` are never
    evicted, so they always count as surviving for the CAS sweep — otherwise
    the blobs they reference would be deleted from under them.

    Never raises: every failure, including a malformed environment value, is
    swallowed because GC is best-effort housekeeping.
    """
    try:
        raw_keep = os.environ.get("SOURCECODE_CACHE_KEEP_COMMITS", "")
        try:
            keep = int(raw_keep) if raw_keep.strip() else _DEFAULT_KEEP_COMMITS
        except ValueError:
            keep = _DEFAULT_KEEP_COMMITS

        all_snapshots = list(cache_d.glob("snapshot-*.json.gz"))
        if not all_snapshots:
            return

        # Group snapshot files by commit SHA; remember files we cannot classify.
        groups: dict[str, list[Path]] = {}
        unmatched: list[Path] = []
        for f in all_snapshots:
            m = _SNAPSHOT_RE.match(f.name)
            if m:
                groups.setdefault(m.group(1), []).append(f)
            else:
                unmatched.append(f)

        surviving: list[Path]
        if keep <= 0 or len(groups) <= keep:
            # No eviction needed — but still sweep CAS.
            surviving = all_snapshots
        else:
            newest_mtime = {
                commit: max(p.stat().st_mtime for p in paths)
                for commit, paths in groups.items()
            }
            ranked = sorted(groups, key=newest_mtime.__getitem__, reverse=True)
            surviving = list(unmatched)
            for commit in ranked[:keep]:
                surviving.extend(groups[commit])
            for commit in ranked[keep:]:
                for f in groups[commit]:
                    _safe_unlink(f)

        _gc_cas(cache_d, surviving)

    except Exception:
        pass  # GC failure is non-fatal
|
|
429
|
+
|
|
430
|
+
|
|
431
|
+
def _gc_cas(cache_d: Path, surviving_snapshots: list[Path]) -> None:
    """
    Remove every CAS blob that no snapshot in *surviving_snapshots* references.

    Each surviving snapshot is decompressed and parsed; the values of its
    ``"cas"`` mapping are the live blob hashes.  Any ``*.gz`` file in the CAS
    directory whose stem is not a live hash is deleted.  Does nothing when the
    CAS directory does not exist.  Never raises.
    """
    blob_dir = _cas_dir(cache_d)
    if not blob_dir.exists():
        return

    try:
        live: set[str] = set()
        for snapshot in surviving_snapshots:
            try:
                decoded = gzip.decompress(snapshot.read_bytes()).decode("utf-8")
                envelope = json.loads(decoded)
                if isinstance(envelope, dict) and "cas" in envelope:
                    live.update(envelope["cas"].values())
            except Exception:
                # An unreadable snapshot contributes no references, so blobs
                # only it pointed at are treated as orphans below.
                continue

        for blob in blob_dir.glob("*.gz"):
            if blob.stem in live:
                continue
            _safe_unlink(blob)

    except Exception:
        pass  # CAS sweep failure is non-fatal
|
|
460
|
+
|
|
461
|
+
|
|
462
|
+
# ---------------------------------------------------------------------------
|
|
463
|
+
# Utilities
|
|
464
|
+
# ---------------------------------------------------------------------------
|
|
465
|
+
|
|
466
|
+
def _safe_unlink(path: Path) -> None:
    """Delete *path* if it exists; ignore a missing file and any other error."""
    try:
        path.unlink(missing_ok=True)
    except Exception:
        return
|
sourcecode/cli.py
CHANGED
|
@@ -876,14 +876,16 @@ def main(
|
|
|
876
876
|
architecture = True # agents need full architectural signal (M4)
|
|
877
877
|
graph_modules = True # IC-003: import graph needed for architecture confidence
|
|
878
878
|
|
|
879
|
-
# ── GAP-9: Cache check — serve from
|
|
879
|
+
# ── GAP-9: Cache check — serve from global cache when git SHA unchanged ──
|
|
880
|
+
# Cache is stored in ~/.sourcecode/cache/<repo_id>/ (outside the repo).
|
|
881
|
+
# Snapshots are gzip-compressed (.json.gz) — ~85 % smaller than plain JSON.
|
|
882
|
+
# Eviction keeps the last SOURCECODE_CACHE_KEEP_COMMITS commits (default 5).
|
|
880
883
|
import hashlib as _hashlib
|
|
881
884
|
import subprocess as _sub
|
|
882
|
-
|
|
885
|
+
from sourcecode import cache as _cache_mod
|
|
883
886
|
_cache_hit_content: Optional[str] = None
|
|
884
887
|
_git_sha = ""
|
|
885
888
|
_cache_key = ""
|
|
886
|
-
_cache_file: Optional[Path] = None
|
|
887
889
|
if not no_cache:
|
|
888
890
|
try:
|
|
889
891
|
_sha_r = _sub.run(
|
|
@@ -921,13 +923,10 @@ def main(
|
|
|
921
923
|
)
|
|
922
924
|
_flags_h = _hashlib.md5(_flags_str.encode()).hexdigest()[:8]
|
|
923
925
|
_cache_key = f"{_git_sha}-{_flags_h}"
|
|
924
|
-
|
|
925
|
-
if _cache_file.exists():
|
|
926
|
-
_cache_hit_content = _cache_file.read_text(encoding="utf-8")
|
|
926
|
+
_cache_hit_content = _cache_mod.read(target, _cache_key)
|
|
927
927
|
except Exception:
|
|
928
928
|
_git_sha = ""
|
|
929
929
|
_cache_key = ""
|
|
930
|
-
_cache_file = None
|
|
931
930
|
|
|
932
931
|
if _cache_hit_content is not None:
|
|
933
932
|
from sourcecode.serializer import write_output
|
|
@@ -1762,12 +1761,17 @@ def main(
|
|
|
1762
1761
|
write_output(content, output=output)
|
|
1763
1762
|
|
|
1764
1763
|
# GAP-9: Persist to cache for future identical runs (git SHA unchanged)
|
|
1765
|
-
|
|
1766
|
-
|
|
1767
|
-
|
|
1768
|
-
|
|
1769
|
-
|
|
1770
|
-
|
|
1764
|
+
# Writes versioned envelope to ~/.sourcecode/cache/<repo_id>/snapshot-<key>.json.gz.
|
|
1765
|
+
# Large JSON fields are extracted into shared CAS blobs (deduplication).
|
|
1766
|
+
# GC runs inline after each write (keep last N commits + CAS sweep).
|
|
1767
|
+
if not no_cache and _cache_key and not _pipeline_error:
|
|
1768
|
+
_cache_mod.write(
|
|
1769
|
+
target,
|
|
1770
|
+
_cache_key,
|
|
1771
|
+
content,
|
|
1772
|
+
fmt=format,
|
|
1773
|
+
layers=_compute_analyzer_fingerprints(),
|
|
1774
|
+
)
|
|
1771
1775
|
|
|
1772
1776
|
if _pipeline_error:
|
|
1773
1777
|
raise typer.Exit(code=2)
|
sourcecode/repository_ir.py
CHANGED
|
@@ -2691,6 +2691,60 @@ def apply_ir_size_limits(
|
|
|
2691
2691
|
"global_score": (ir.get("impact") or {}).get("global_score", 0),
|
|
2692
2692
|
"ranked_nodes": ranked,
|
|
2693
2693
|
}
|
|
2694
|
+
|
|
2695
|
+
# ── Trim reverse_graph to match node/edge limits ──────────────────────────
|
|
2696
|
+
# BUG-P0-02: reverse_graph was never bounded by --max-nodes/--max-edges.
|
|
2697
|
+
# A 26K-node repo (Broadleaf) emits ~3MB of reverse_graph even when
|
|
2698
|
+
# --max-nodes 200 --max-edges 500 is requested.
|
|
2699
|
+
full_rg: dict = ir.get("reverse_graph") or {}
|
|
2700
|
+
if full_rg:
|
|
2701
|
+
# Inner caller-list cap: prevents individual entries from dominating budget.
|
|
2702
|
+
# Formula: max(20, max_nodes // 4) when max_nodes given; 50 otherwise.
|
|
2703
|
+
def _cap_rg_lists(entry: dict, cap: int) -> dict:
    """Return a copy of *entry* in which every list longer than *cap* is cut to its first *cap* items.

    Non-list values and lists of at most *cap* items are carried over unchanged
    (the same objects, not copies).
    """
    capped: dict = {}
    for edge_type, callers in entry.items():
        if isinstance(callers, list) and len(callers) > cap:
            callers = callers[:cap]
        capped[edge_type] = callers
    return capped
|
|
2706
|
+
|
|
2707
|
+
if kept_fqns is not None:
|
|
2708
|
+
# max_nodes was applied — restrict reverse_graph to kept nodes only.
|
|
2709
|
+
# Cap inner caller lists proportionally: large max_nodes → more callers shown.
|
|
2710
|
+
_inner_cap = max(20, max_nodes // 4) if max_nodes else 50
|
|
2711
|
+
trimmed_rg: dict = {
|
|
2712
|
+
k: _cap_rg_lists(v, _inner_cap)
|
|
2713
|
+
for k, v in full_rg.items()
|
|
2714
|
+
if k in kept_fqns
|
|
2715
|
+
}
|
|
2716
|
+
out["reverse_graph"] = trimmed_rg
|
|
2717
|
+
_rg_trimmed_count = len(full_rg) - len(trimmed_rg)
|
|
2718
|
+
if _rg_trimmed_count:
|
|
2719
|
+
out["reverse_graph_note"] = (
|
|
2720
|
+
f"reverse_graph trimmed: {len(trimmed_rg)}/{len(full_rg)} entries "
|
|
2721
|
+
f"kept (matching --max-nodes {max_nodes} kept nodes), "
|
|
2722
|
+
f"caller lists capped at {_inner_cap}. "
|
|
2723
|
+
"Use --output for full reverse_graph."
|
|
2724
|
+
)
|
|
2725
|
+
elif max_edges is not None:
|
|
2726
|
+
# Only max_edges given (no max_nodes): cap reverse_graph keys
|
|
2727
|
+
# proportionally. Target: at most max_edges keys, sorted by in-degree
|
|
2728
|
+
# (most-connected hubs first) so the most useful entries survive.
|
|
2729
|
+
_rg_limit = max(1, min(max_edges, len(full_rg)))
|
|
2730
|
+
_rg_sorted_keys = sorted(
|
|
2731
|
+
full_rg.keys(),
|
|
2732
|
+
key=lambda k: sum(len(v) for v in full_rg[k].values() if isinstance(v, list)),
|
|
2733
|
+
reverse=True,
|
|
2734
|
+
)
|
|
2735
|
+
_inner_cap = 50
|
|
2736
|
+
out["reverse_graph"] = {
|
|
2737
|
+
k: _cap_rg_lists(full_rg[k], _inner_cap)
|
|
2738
|
+
for k in _rg_sorted_keys[:_rg_limit]
|
|
2739
|
+
}
|
|
2740
|
+
if len(full_rg) > _rg_limit:
|
|
2741
|
+
out["reverse_graph_note"] = (
|
|
2742
|
+
f"reverse_graph trimmed: {_rg_limit}/{len(full_rg)} entries "
|
|
2743
|
+
f"kept (top by in-degree, bounded by --max-edges {max_edges}), "
|
|
2744
|
+
f"caller lists capped at {_inner_cap}. "
|
|
2745
|
+
"Use --output for full reverse_graph."
|
|
2746
|
+
)
|
|
2747
|
+
|
|
2694
2748
|
return out
|
|
2695
2749
|
|
|
2696
2750
|
|
|
@@ -2849,7 +2903,9 @@ def compute_blast_radius(
|
|
|
2849
2903
|
"""
|
|
2850
2904
|
reverse_graph: dict[str, dict[str, list[str]]] = ir.get("reverse_graph") or {}
|
|
2851
2905
|
route_surface: list[dict] = ir.get("route_surface") or []
|
|
2852
|
-
|
|
2906
|
+
_graph: dict = ir.get("graph") or {}
|
|
2907
|
+
graph_nodes: list[dict] = _graph.get("nodes") or []
|
|
2908
|
+
graph_edges: list[dict] = _graph.get("edges") or []
|
|
2853
2909
|
subsystems: list[dict] = ir.get("subsystems") or []
|
|
2854
2910
|
|
|
2855
2911
|
# ── 1. Resolve target → one or more FQNs ─────────────────────────────────
|
|
@@ -2914,6 +2970,76 @@ def compute_blast_radius(
|
|
|
2914
2970
|
if _effective_depth > 1:
|
|
2915
2971
|
queue.append((c, 1))
|
|
2916
2972
|
|
|
2973
|
+
# ── 2a. Interface bridging: Spring DI / CDI / IoC pattern ────────────────
|
|
2974
|
+
# In DI frameworks (Spring, CDI, Guice), callers inject the INTERFACE, not
|
|
2975
|
+
# the Impl. e.g. `impact OrderServiceImpl` → 0 direct callers, because every
|
|
2976
|
+
# caller wires against OrderService.
|
|
2977
|
+
#
|
|
2978
|
+
# Root cause: implements edges in graph.edges often carry unresolved short-name
|
|
2979
|
+
# `to` values (e.g. "OrderService" not FQN), so _build_reverse_adjacency drops
|
|
2980
|
+
# them (to_symbol ∉ all_fqns). The reverse_graph["...OrderService"] therefore
|
|
2981
|
+
# has no "implements" key — we cannot scan it from the reverse side.
|
|
2982
|
+
#
|
|
2983
|
+
# Fix: scan FORWARD graph edges for type=implements FROM our matched classes.
|
|
2984
|
+
# Resolve the `to` value (short or FQN) against reverse_graph keys via suffix
|
|
2985
|
+
# matching. Gather non-structural callers of those interface keys and merge
|
|
2986
|
+
# them into direct_callers.
|
|
2987
|
+
_iface_bridging: list[dict] = [] # [{interface, caller_count}] for output metadata
|
|
2988
|
+
|
|
2989
|
+
_target_is_interface = any(
|
|
2990
|
+
n.get("symbol_kind") == "interface" or n.get("type") == "interface"
|
|
2991
|
+
for n in graph_nodes
|
|
2992
|
+
if n.get("fqn") in matched_fqns
|
|
2993
|
+
)
|
|
2994
|
+
|
|
2995
|
+
if not _target_is_interface and graph_edges:
|
|
2996
|
+
# Build suffix→FQN lookup for reverse_graph keys (one-time, O(n))
|
|
2997
|
+
_rg_suffix_map: dict[str, list[str]] = {}
|
|
2998
|
+
for _rg_key in reverse_graph:
|
|
2999
|
+
_sfx = _simple_name(_rg_key)
|
|
3000
|
+
_rg_suffix_map.setdefault(_sfx, []).append(_rg_key)
|
|
3001
|
+
|
|
3002
|
+
_BRIDGE_SKIP = frozenset({
|
|
3003
|
+
"implements", "extends", "contained_in", "annotated_with"
|
|
3004
|
+
})
|
|
3005
|
+
|
|
3006
|
+
for _edge in graph_edges:
|
|
3007
|
+
if _edge.get("type") != "implements":
|
|
3008
|
+
continue
|
|
3009
|
+
_from = _edge.get("from") or ""
|
|
3010
|
+
if _from not in matched_fqns:
|
|
3011
|
+
continue
|
|
3012
|
+
# Resolve `to` (may be short name like "OrderService" or full FQN)
|
|
3013
|
+
_to_raw = _edge.get("to") or ""
|
|
3014
|
+
_to_simple = _simple_name(_to_raw)
|
|
3015
|
+
_candidate_iface_keys: list[str] = []
|
|
3016
|
+
if _to_raw in reverse_graph:
|
|
3017
|
+
_candidate_iface_keys = [_to_raw]
|
|
3018
|
+
else:
|
|
3019
|
+
_candidate_iface_keys = _rg_suffix_map.get(_to_simple, [])
|
|
3020
|
+
|
|
3021
|
+
for _iface_fqn in _candidate_iface_keys:
|
|
3022
|
+
_rg_entry = reverse_graph[_iface_fqn]
|
|
3023
|
+
_iface_callers = [
|
|
3024
|
+
c
|
|
3025
|
+
for _etype, _clist in _rg_entry.items()
|
|
3026
|
+
if _etype not in _BRIDGE_SKIP
|
|
3027
|
+
for c in _clist
|
|
3028
|
+
if c not in matched_fqns
|
|
3029
|
+
]
|
|
3030
|
+
if not _iface_callers:
|
|
3031
|
+
continue
|
|
3032
|
+
_iface_bridging.append({
|
|
3033
|
+
"interface": _iface_fqn,
|
|
3034
|
+
"caller_count": len(_iface_callers),
|
|
3035
|
+
})
|
|
3036
|
+
for c in _iface_callers:
|
|
3037
|
+
if c not in all_affected:
|
|
3038
|
+
all_affected[c] = 1
|
|
3039
|
+
direct_callers.append(c)
|
|
3040
|
+
if _effective_depth > 1:
|
|
3041
|
+
queue.append((c, 1))
|
|
3042
|
+
|
|
2917
3043
|
# BFS for indirect callers
|
|
2918
3044
|
indirect_callers: list[str] = []
|
|
2919
3045
|
visited: set[str] = set(matched_fqns) | set(direct_callers)
|
|
@@ -3142,6 +3268,13 @@ def compute_blast_radius(
|
|
|
3142
3268
|
if n_modules > 1:
|
|
3143
3269
|
_parts.append(f"impact crosses {n_modules} modules")
|
|
3144
3270
|
|
|
3271
|
+
if _iface_bridging:
|
|
3272
|
+
_iface_names = [b["interface"].split(".")[-1] for b in _iface_bridging]
|
|
3273
|
+
_parts.append(
|
|
3274
|
+
f"callers resolved via interface{'s' if len(_iface_names) > 1 else ''} "
|
|
3275
|
+
f"({', '.join(_iface_names)}) — Spring/CDI DI pattern"
|
|
3276
|
+
)
|
|
3277
|
+
|
|
3145
3278
|
if not _parts:
|
|
3146
3279
|
explanation = f"No callers or dependents found for {target!r}. Low-risk isolated change."
|
|
3147
3280
|
else:
|
|
@@ -3181,6 +3314,13 @@ def compute_blast_radius(
|
|
|
3181
3314
|
}
|
|
3182
3315
|
if _candidates_out:
|
|
3183
3316
|
out["candidates"] = _candidates_out
|
|
3317
|
+
if _iface_bridging:
|
|
3318
|
+
out["via_interface_resolution"] = _iface_bridging
|
|
3319
|
+
out["via_interface_note"] = (
|
|
3320
|
+
"Target is a concrete class injected via interface(s) in DI frameworks "
|
|
3321
|
+
"(Spring/CDI/Guice). direct_callers includes callers of the implemented "
|
|
3322
|
+
"interface(s) — these are the real production dependents."
|
|
3323
|
+
)
|
|
3184
3324
|
if len(direct_callers) > 30:
|
|
3185
3325
|
out["direct_callers_note"] = (
|
|
3186
3326
|
f"Showing 30/{n_direct} direct callers. Use --output to inspect full IR."
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: sourcecode
|
|
3
|
-
Version: 1.31.
|
|
3
|
+
Version: 1.31.18
|
|
4
4
|
Summary: Deterministic codebase context for AI coding agents
|
|
5
5
|
License: Apache License
|
|
6
6
|
Version 2.0, January 2004
|
|
@@ -225,7 +225,7 @@ Description-Content-Type: text/markdown
|
|
|
225
225
|
|
|
226
226
|
**AI-ready change intelligence for Java/Spring enterprise monoliths.**
|
|
227
227
|
|
|
228
|
-

|
|
229
229
|

|
|
230
230
|
|
|
231
231
|
---
|
|
@@ -263,7 +263,7 @@ pipx install sourcecode
|
|
|
263
263
|
|
|
264
264
|
```bash
|
|
265
265
|
sourcecode version
|
|
266
|
-
# sourcecode 1.31.
|
|
266
|
+
# sourcecode 1.31.18
|
|
267
267
|
```
|
|
268
268
|
|
|
269
269
|
---
|
|
@@ -302,15 +302,24 @@ sourcecode fix-bug /path/to/repo --symptom "NullPointerException in checkout"
|
|
|
302
302
|
|
|
303
303
|
Measured against open-source enterprise Java repos:
|
|
304
304
|
|
|
305
|
-
| Repo |
|
|
306
|
-
|
|
307
|
-
| BroadleafCommerce |
|
|
308
|
-
| Keycloak |
|
|
305
|
+
| Repo | Java files | Cold scan (`--compact`) | Cache hit | Cache speedup | Endpoints found |
|
|
306
|
+
|------|-----------|------------------------|-----------|---------------|----------------|
|
|
307
|
+
| BroadleafCommerce | 2,985 | 2.9s | 0.20s | ~13x | 130 |
|
|
308
|
+
| Keycloak | 7,885 | 9.0s | 0.27s | ~33x | 693 |
|
|
309
309
|
|
|
310
|
-
|
|
310
|
+
The cache is keyed on file content hashes — invalidated only when source changes. Speedup varies by repo size and OS I/O.
|
|
311
311
|
|
|
312
|
-
|
|
313
|
-
|
|
312
|
+
**Token sizes (measured):**
|
|
313
|
+
|
|
314
|
+
| Mode | BroadleafCommerce | Keycloak |
|
|
315
|
+
|------|------------------|---------|
|
|
316
|
+
| `--compact` | ~2,900 | ~4,000 |
|
|
317
|
+
| `--agent` | ~4,800 | ~5,500 |
|
|
318
|
+
| `onboard` | ~2,600 | n/a |
|
|
319
|
+
| `fix-bug` (trimmed) | ~27,000 | ~4,600 |
|
|
320
|
+
|
|
321
|
+
**`impact` on high-fan-in classes:**
|
|
322
|
+
For hub interfaces (1000+ direct dependents), use `--depth 1` — direct endpoints are already the most actionable signal. Running with `--depth 4` on very large repos may take 90+ seconds.
|
|
314
323
|
|
|
315
324
|
---
|
|
316
325
|
|
|
@@ -318,7 +327,7 @@ For hub interfaces (2000+ direct dependents), use `--depth 1` — it gives you t
|
|
|
318
327
|
|
|
319
328
|
| Flag | Alias | Default | Description |
|
|
320
329
|
|------|-------|---------|-------------|
|
|
321
|
-
| `--compact` | | off | High-signal summary (typically
|
|
330
|
+
| `--compact` | | off | High-signal summary (typically 2,500–4,000 tokens for mid-to-large Java repos): stacks, entry points, dependencies, confidence, gaps. Includes `transactional_boundaries` for Spring projects. |
|
|
322
331
|
| `--agent` | | off | Structured JSON for AI agents: project identity, entry points, architecture, dependencies, confidence. More detail than `--compact`. ~4,500–5,500 tokens. |
|
|
323
332
|
| `--full` | | off | Remove truncation limits on `transactional_boundaries`, `mybatis.dto_mappers`, and other capped lists. |
|
|
324
333
|
| `--git-context` | `-g` | off | Include git activity: recent commits, change hotspots, and uncommitted file count. |
|
|
@@ -360,9 +369,10 @@ sourcecode impact OrderService . --depth 2 # limit BFS depth
|
|
|
360
369
|
| `candidates` | On partial match: up to 10 FQNs ranked by relevance |
|
|
361
370
|
|
|
362
371
|
**Best practices:**
|
|
363
|
-
- Target **interfaces**, not implementations: `impact OrderService` > `impact OrderServiceImpl`.
|
|
372
|
+
- Target **interfaces**, not implementations: `impact OrderService` > `impact OrderServiceImpl`. In Spring projects, callers inject the interface via `@Autowired` — the impl has zero direct callers in the graph even though it runs all the code. Querying the impl returns `direct_callers: []` with no error; querying the interface returns the real blast radius.
|
|
364
373
|
- Use `--depth 1` when the target has 200+ callers — direct endpoints are already the most actionable signal.
|
|
365
374
|
- The cache applies to the underlying IR scan — second `impact` run on the same repo is significantly faster.
|
|
375
|
+
- When you get `direct_callers: []` for a `@Service` or `@Repository` class, that is almost certainly the interface-injection pattern. Re-run with the interface name.
|
|
366
376
|
|
|
367
377
|
**Supported targets:**
|
|
368
378
|
- Simple class name: `OrderService`
|
|
@@ -389,14 +399,16 @@ Extracts all Spring MVC (`@GetMapping`, `@PostMapping`, `@RequestMapping`, etc.)
|
|
|
389
399
|
## `repo-ir` — Symbol-level IR
|
|
390
400
|
|
|
391
401
|
```bash
|
|
392
|
-
sourcecode repo-ir /path/to/repo
|
|
393
|
-
sourcecode repo-ir /path/to/repo --summary-only # compact: analysis + impact, no full graph
|
|
402
|
+
sourcecode repo-ir /path/to/repo --summary-only # recommended: analysis + impact, no full graph (~20K tokens)
|
|
394
403
|
sourcecode repo-ir /path/to/repo --since HEAD~1 # symbol-level diff
|
|
395
|
-
sourcecode repo-ir /path/to/repo --
|
|
404
|
+
sourcecode repo-ir /path/to/repo --files src/.../OrderService.java # single-file IR
|
|
405
|
+
sourcecode repo-ir /path/to/repo --max-nodes 200 --max-edges 500 # limits forward graph only — see note below
|
|
396
406
|
```
|
|
397
407
|
|
|
398
408
|
Builds a deterministic symbol graph: classes, methods, import/injection edges, Spring roles, subsystems. Output is JSON with `graph`, `reverse_graph`, `impact`, `subsystems`, and `route_surface`.
|
|
399
409
|
|
|
410
|
+
**Size warning:** Without `--summary-only`, output can exceed 1MB for mid-size repos. `--max-nodes`/`--max-edges` limit the forward `graph` section only — the `reverse_graph` section is not bounded by these flags and is the largest component. Always use `--summary-only` unless you need the full graph for downstream tooling.
|
|
411
|
+
|
|
400
412
|
---
|
|
401
413
|
|
|
402
414
|
## `onboard` — [OSS Core] Codebase orientation
|
|
@@ -475,47 +487,103 @@ Note: `sourcecode onboard`, `sourcecode fix-bug`, `sourcecode review-pr`, and `s
|
|
|
475
487
|
|
|
476
488
|
## How to use sourcecode effectively
|
|
477
489
|
|
|
478
|
-
###
|
|
490
|
+
### Onboarding — new repo, new agent session
|
|
479
491
|
|
|
480
492
|
```bash
|
|
481
|
-
#
|
|
482
|
-
sourcecode /repo --
|
|
493
|
+
# Bounded context at session start (~2,500–5,500 tokens)
|
|
494
|
+
sourcecode /repo --compact # fast overview
|
|
495
|
+
sourcecode /repo --agent # more detail: file relevance, architecture, event flows
|
|
496
|
+
sourcecode onboard /repo # task-structured: entry points, key files, gaps
|
|
497
|
+
```
|
|
483
498
|
|
|
484
|
-
|
|
485
|
-
sourcecode impact PaymentService /repo --depth 1 | ask-agent "What are the risks?"
|
|
499
|
+
Use `--compact` or `--agent` as first-prompt injection for AI coding agents. Both are bounded and deterministic.
|
|
486
500
|
|
|
487
|
-
|
|
488
|
-
|
|
501
|
+
### Impact analysis — before touching a class
|
|
502
|
+
|
|
503
|
+
```bash
|
|
504
|
+
# Always target the INTERFACE in Spring projects:
|
|
505
|
+
sourcecode impact OrderService /repo # ✓ correct: 30 callers, 11 endpoints
|
|
506
|
+
sourcecode impact OrderServiceImpl /repo # ✗ wrong: 0 callers (Spring DI blindness)
|
|
507
|
+
|
|
508
|
+
# Large hub interfaces — depth=1 is faster and still actionable:
|
|
509
|
+
sourcecode impact KeycloakSession /repo --depth 1
|
|
510
|
+
|
|
511
|
+
# If you get `direct_callers: []` for a @Service class, re-query the interface.
|
|
489
512
|
```
|
|
490
513
|
|
|
491
|
-
###
|
|
514
|
+
### Bug triage — symptom-driven
|
|
492
515
|
|
|
493
516
|
```bash
|
|
494
|
-
#
|
|
495
|
-
sourcecode /repo --
|
|
517
|
+
# Specific symptoms produce the best signal:
|
|
518
|
+
sourcecode fix-bug /repo --symptom "OIDC token refresh fails after realm update"
|
|
519
|
+
sourcecode fix-bug /repo --symptom "NullPointerException in OrderService during checkout"
|
|
496
520
|
|
|
497
|
-
#
|
|
498
|
-
|
|
499
|
-
|
|
521
|
+
# Generic symptoms produce noisy output (100s of files) — be specific.
|
|
522
|
+
# Use --output to capture full output without budget truncation.
|
|
523
|
+
sourcecode fix-bug /repo --symptom "payment timeout" --output triage.json
|
|
500
524
|
```
|
|
501
525
|
|
|
502
|
-
###
|
|
526
|
+
### PR review
|
|
503
527
|
|
|
504
528
|
```bash
|
|
505
|
-
#
|
|
506
|
-
sourcecode
|
|
507
|
-
|
|
529
|
+
# JSON for programmatic use:
|
|
530
|
+
sourcecode review-pr /repo --since main --output review.json
|
|
531
|
+
jq '.ci_decision' review.json # "analysis_success" | "git_ref_error"
|
|
532
|
+
|
|
533
|
+
# Markdown for GitHub comment:
|
|
534
|
+
sourcecode review-pr /repo --since main --format github-comment
|
|
535
|
+
|
|
536
|
+
# CI/CD gate — parse risk and test coverage fields:
|
|
537
|
+
jq '{ci_decision, test_coverage_risk, impact_summary}' review.json
|
|
508
538
|
```
|
|
509
539
|
|
|
510
|
-
###
|
|
540
|
+
### Modernization planning
|
|
511
541
|
|
|
512
542
|
```bash
|
|
513
|
-
|
|
514
|
-
|
|
515
|
-
#
|
|
543
|
+
sourcecode modernize /repo
|
|
544
|
+
# high_coupling_nodes: classes most risky to change (by fan-in degree)
|
|
545
|
+
# dead_zone_candidates: classes with zero callers in the graph — candidates to remove or refactor (verify first: DI-injected impls can show zero callers)
|
|
546
|
+
# Note: hotspot_candidates may be empty in annotation-heavy codebases —
|
|
547
|
+
# check high_coupling_nodes directly for coupling signal.
|
|
548
|
+
```
|
|
516
549
|
|
|
517
|
-
|
|
518
|
-
|
|
550
|
+
### Symbol IR for downstream tooling
|
|
551
|
+
|
|
552
|
+
```bash
|
|
553
|
+
# Always use --summary-only unless you need the full graph:
|
|
554
|
+
sourcecode repo-ir /repo --summary-only --output ir.json # ~20K tokens
|
|
555
|
+
sourcecode repo-ir /repo --since HEAD~3 --summary-only # changed symbols only
|
|
556
|
+
|
|
557
|
+
# Full graph warning: output can exceed 1MB for mid-size repos.
|
|
558
|
+
# --max-nodes/--max-edges only limit the forward graph, not reverse_graph.
|
|
559
|
+
```
|
|
560
|
+
|
|
561
|
+
### With AI agents (Claude, GPT-4, etc.)
|
|
562
|
+
|
|
563
|
+
```bash
|
|
564
|
+
# Start agent session with bounded context:
|
|
565
|
+
sourcecode /repo --agent --output context.json && cat context.json | agent-cli
|
|
566
|
+
|
|
567
|
+
# For a specific change task, combine context + impact:
|
|
568
|
+
sourcecode /repo --compact > context.json
|
|
569
|
+
sourcecode impact PaymentService /repo --depth 1 > impact.json
|
|
570
|
+
# Feed both to agent: "Given this context and impact, what are the risks of changing PaymentService?"
|
|
571
|
+
|
|
572
|
+
# For PR review:
|
|
573
|
+
sourcecode review-pr /repo --since main --format github-comment
|
|
574
|
+
# Paste directly into GitHub PR description or feed to agent
|
|
575
|
+
```
|
|
576
|
+
|
|
577
|
+
### In CI/CD pipelines
|
|
578
|
+
|
|
579
|
+
```bash
|
|
580
|
+
# Deterministic output — safe to run on every commit (--no-cache forces a fresh scan instead of reusing the content-hash cache)
|
|
581
|
+
sourcecode /repo --compact --no-cache --output context.json
|
|
582
|
+
|
|
583
|
+
# PR gate
|
|
584
|
+
sourcecode review-pr /repo --since $BASE_REF --output review.json
|
|
585
|
+
DECISION=$(jq -r '.ci_decision' review.json)
|
|
586
|
+
if [ "$DECISION" != "analysis_success" ]; then echo "Review failed: $DECISION"; exit 1; fi
|
|
519
587
|
```
|
|
520
588
|
|
|
521
589
|
---
|
|
@@ -525,8 +593,11 @@ sourcecode impact KeycloakSession /repo --depth 1
|
|
|
525
593
|
- No runtime analysis — all signals are static (annotation, import graph, file structure)
|
|
526
594
|
- No semantic code understanding — it reads structure, not logic
|
|
527
595
|
- Architecture pattern detection works best for Spring MVC layered apps; SPI/plugin architectures (e.g. Quarkus extension model) are classified as "layered" which may be inaccurate
|
|
528
|
-
- Endpoint recall for JAX-RS subresource locator pattern is ~65% — endpoints mounted dynamically via factory methods are not individually counted
|
|
529
|
-
- `impact` on implementation classes (e.g. `OrderServiceImpl`) reflects callers of the implementation specifically
|
|
596
|
+
- Endpoint recall for the JAX-RS sub-resource locator pattern is ~65% — endpoints mounted dynamically via factory methods are not individually counted. JAX-RS sub-resource paths (method-level `@Path` inside a `@Path`-annotated class) are extracted as relative paths, not the fully composed URL.
|
|
597
|
+
- `impact` on implementation classes (e.g. `OrderServiceImpl`) reflects callers of the implementation specifically — **in Spring Boot projects this is almost always zero**, because callers inject the interface via `@Autowired`. Always target the interface (`OrderService`) to get the real blast radius. The tool does not auto-resolve impl → interface. When `direct_callers: []` is returned with `confidence_level: high` for a `@Service` class, treat it as a prompt to re-query the interface.
|
|
598
|
+
- `no_security_signal` on endpoints means no method-level security annotations (`@PreAuthorize`, `@Secured`) were found — it does **not** mean the endpoint is unsecured. Projects using Spring Security filter chains, XML security config, or custom filters will show 100% `no_security_signal` even when fully secured.
|
|
599
|
+
- `hotspot_candidates` in `modernize` output reflects graph coupling, not git churn — in annotation-heavy codebases it is often empty even though real hotspots exist. Check `high_coupling_nodes` directly for the coupling picture.
|
|
600
|
+
- `project_summary` is extracted from the repository README — it may reflect marketing language rather than architectural description
|
|
530
601
|
|
|
531
602
|
---
|
|
532
603
|
|
|
@@ -1,11 +1,12 @@
|
|
|
1
|
-
sourcecode/__init__.py,sha256=
|
|
1
|
+
sourcecode/__init__.py,sha256=RKBkTCXd0nPibD6uZj_CLNSWfxJYQOS-gsplP4C8K_g,104
|
|
2
2
|
sourcecode/adaptive_scanner.py,sha256=XffluXKzJUXrMtjEiAOnSNPZnztdIcts17T9ouHeID0,10521
|
|
3
3
|
sourcecode/architecture_analyzer.py,sha256=4R13Yb02OrPeB4IH3z6V_g7HWhmGcRHbI8CobCVnRrc,39111
|
|
4
4
|
sourcecode/architecture_summary.py,sha256=z34_6v7cSwy98cof2UVciGho7SCrZ93tiqMmq5WNzRQ,20405
|
|
5
5
|
sourcecode/ast_extractor.py,sha256=XgrZg2DcWcUm9r87cRG3KGO7IK2TIL_N-CvhSbUmmh4,49901
|
|
6
|
+
sourcecode/cache.py,sha256=HDkUZqXOovBc1PjTg-JpOQlyKhUMmEhiG789R7L4Wms,16348
|
|
6
7
|
sourcecode/canonical_ir.py,sha256=NZu0XICv__hkQGKzW2LNQLRqb1L28K2p_WQCQKS5Zlk,23141
|
|
7
8
|
sourcecode/classifier.py,sha256=yWeq6agTjkFa3zuNa-gdVIHtjoBoPoVlJnX-b7tdVJs,7851
|
|
8
|
-
sourcecode/cli.py,sha256=
|
|
9
|
+
sourcecode/cli.py,sha256=zBJZqoOntf3m4UWqvixrNdSDdytevuYJF4rDvxXTM8k,139621
|
|
9
10
|
sourcecode/code_notes_analyzer.py,sha256=EJemNCNc9Dn-1RZYu-aNbK0ELzmsyC4s6FdHi3XyNEI,9392
|
|
10
11
|
sourcecode/confidence_analyzer.py,sha256=_jckZSxksV-OU38vbkxfVNBnWCtlCq8Vwfg23x1uspA,19054
|
|
11
12
|
sourcecode/context_scorer.py,sha256=QpChSpsmaAYz91rXA4Ue5xzQmNz_ZboZN09YOHScq1U,14679
|
|
@@ -31,7 +32,7 @@ sourcecode/ranking_engine.py,sha256=ZAucq_YX2KkWUuAZf4P0lhtQ_38vEFnUhuGtSZd1S0E,
|
|
|
31
32
|
sourcecode/redactor.py,sha256=xuGcadGEHaPw4qZXlMDvzMCsr4VOkdp3oBQptHyJk8c,2884
|
|
32
33
|
sourcecode/relevance_scorer.py,sha256=MYF4FFkveAQps9SmTeTlh6ODiBz2F--_hWNeHMLtUHQ,8405
|
|
33
34
|
sourcecode/repo_classifier.py,sha256=FG1vaWKdWXsWdl-S8hjVMiTqcwgaRXkDyvK4rPcOGtQ,22681
|
|
34
|
-
sourcecode/repository_ir.py,sha256=
|
|
35
|
+
sourcecode/repository_ir.py,sha256=NooCrMJYqycKSYTEroVWTYR8X83hHaAYKTsgYxvlz-I,140221
|
|
35
36
|
sourcecode/runtime_classifier.py,sha256=uTAD6BDCiBLUZEDRfqk718kM4RTT_vAbfkcOI2_Xx58,18432
|
|
36
37
|
sourcecode/scanner.py,sha256=WdOQ78mMzjR1NjmKTlbxdgwinnCTfAhxCVLBEFQiFHU,8899
|
|
37
38
|
sourcecode/schema.py,sha256=aHNXDf8LGyUC8ZDE_VS9kiskC2-Oswhi_WnpdGy6HDw,24897
|
|
@@ -75,8 +76,8 @@ sourcecode/telemetry/consent.py,sha256=wLMvGNJeSSyZoNkQXpoUioY6mMv4Qdvuw7S9jAEWn
|
|
|
75
76
|
sourcecode/telemetry/events.py,sha256=oEvvulfsv5GIDWG2174gSS6tNB95w38AIYiYeifGKlE,2294
|
|
76
77
|
sourcecode/telemetry/filters.py,sha256=Asa71oRl7q3Wt_FMwuufIZJFzSYdgRNKS8LHCIyFeYE,4805
|
|
77
78
|
sourcecode/telemetry/transport.py,sha256=KJeIPCPWMdmbCP3ySGs2iUlia34U6vWne2dZsUezesw,1560
|
|
78
|
-
sourcecode-1.31.
|
|
79
|
-
sourcecode-1.31.
|
|
80
|
-
sourcecode-1.31.
|
|
81
|
-
sourcecode-1.31.
|
|
82
|
-
sourcecode-1.31.
|
|
79
|
+
sourcecode-1.31.18.dist-info/METADATA,sha256=paObgQ32RFOKlwHD7oyNK6tRtbEBRStsmeXXSg4RaPw,31103
|
|
80
|
+
sourcecode-1.31.18.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
|
|
81
|
+
sourcecode-1.31.18.dist-info/entry_points.txt,sha256=ex3F9rmbXeyDIoFQHtkEqTsKSaJow8F0LrVu8XfIktQ,57
|
|
82
|
+
sourcecode-1.31.18.dist-info/licenses/LICENSE,sha256=7DdHrU9Z_3e7dSvq4ISijZNjnuHo5NIHNiHDouMQ9JU,10491
|
|
83
|
+
sourcecode-1.31.18.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|