sourcecode 1.32.3__py3-none-any.whl → 1.32.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sourcecode/__init__.py +1 -1
- sourcecode/cache.py +119 -10
- sourcecode/cache.tmp_new +772 -0
- sourcecode/cli.py +129 -37
- sourcecode/ris.py +7 -2
- {sourcecode-1.32.3.dist-info → sourcecode-1.32.5.dist-info}/METADATA +1 -187
- {sourcecode-1.32.3.dist-info → sourcecode-1.32.5.dist-info}/RECORD +10 -9
- {sourcecode-1.32.3.dist-info → sourcecode-1.32.5.dist-info}/WHEEL +0 -0
- {sourcecode-1.32.3.dist-info → sourcecode-1.32.5.dist-info}/entry_points.txt +0 -0
- {sourcecode-1.32.3.dist-info → sourcecode-1.32.5.dist-info}/licenses/LICENSE +0 -0
sourcecode/__init__.py
CHANGED
sourcecode/cache.py
CHANGED
|
@@ -72,6 +72,11 @@ SCHEMA_VERSION: str = "2"
|
|
|
72
72
|
#: Bump to invalidate all L1 core caches (independent of snapshot version).
|
|
73
73
|
CORE_SCHEMA_VERSION: str = "1"
|
|
74
74
|
|
|
75
|
+
#: Bump when analysis logic or output schema changes — NOT on every package release.
|
|
76
|
+
#: This is the stable part of the L1 core cache key. Package version bumps (patch,
|
|
77
|
+
#: minor) must NOT bump this value unless the cached data format actually changed.
|
|
78
|
+
ANALYZER_CACHE_VERSION: str = "1"
|
|
79
|
+
|
|
75
80
|
#: Fields eligible for CAS deduplication (applied to top-level JSON dict keys).
|
|
76
81
|
_CAS_FIELDS: frozenset[str] = frozenset([
|
|
77
82
|
"file_paths",
|
|
@@ -92,6 +97,8 @@ _CAS_FIELDS: frozenset[str] = frozenset([
|
|
|
92
97
|
_CAS_THRESHOLD: int = 4096
|
|
93
98
|
|
|
94
99
|
_DEFAULT_KEEP_COMMITS: int = 5
|
|
100
|
+
_DEFAULT_MAX_CORES: int = 20
|
|
101
|
+
_DEFAULT_MAX_SIZE_MB: int = 50
|
|
95
102
|
|
|
96
103
|
# Matches "snapshot-<hex_commit>-<hex_flags>.json.gz"
|
|
97
104
|
_SNAPSHOT_RE = re.compile(r"^snapshot-([0-9a-f]+)-[0-9a-f]+\.json\.gz$")
|
|
@@ -124,6 +131,58 @@ def cache_dir(repo_root: Path) -> Path:
|
|
|
124
131
|
return base / repo_id(repo_root)
|
|
125
132
|
|
|
126
133
|
|
|
134
|
+
# ---------------------------------------------------------------------------
|
|
135
|
+
# Public API — observability
|
|
136
|
+
# ---------------------------------------------------------------------------
|
|
137
|
+
|
|
138
|
+
def status(repo_root: Path) -> dict[str, Any]:
    """Return a stats dict describing the current cache state for *repo_root*.

    Keys: ``cache_dir``, ``cores``, ``snapshots``, ``views``, ``cas_blobs``,
    ``total_size_bytes``, ``total_size_mb``.

    A missing cache directory yields all-zero counts.  Files that disappear
    while being measured (e.g. evicted by a concurrent GC) are skipped rather
    than raising, so both the counts and the size total only reflect files
    that could actually be stat'ed.
    """
    cache_d = cache_dir(repo_root)
    counts = {"cores": 0, "snapshots": 0, "views": 0, "cas_blobs": 0}
    total_bytes = 0

    if cache_d.exists():
        cas_d = _cas_dir(cache_d)
        groups = {
            "cores": cache_d.glob("core-*.json.gz"),
            "snapshots": cache_d.glob("snapshot-*.json.gz"),
            "views": cache_d.glob("view-*.json.gz"),
            "cas_blobs": cas_d.glob("*.gz") if cas_d.exists() else (),
        }
        for kind, files in groups.items():
            for f in files:
                try:
                    # stat() directly instead of exists()+stat(): the file may
                    # be unlinked between the two calls.
                    size = f.stat().st_size
                except OSError:
                    continue
                counts[kind] += 1
                total_bytes += size

    return {
        "cache_dir": str(cache_d),
        "cores": counts["cores"],
        "snapshots": counts["snapshots"],
        "views": counts["views"],
        "cas_blobs": counts["cas_blobs"],
        "total_size_bytes": total_bytes,
        "total_size_mb": round(total_bytes / (1024 * 1024), 2),
    }
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
def clear(repo_root: Path) -> int:
    """Delete all cache files for *repo_root*. Returns the number of files removed.

    Removes core, snapshot and view files from the cache directory plus the
    ``*.gz`` blobs in the CAS directory.  Only files that are actually gone
    after the unlink attempt are counted, so the return value is not inflated
    when a deletion fails.
    """
    cache_d = cache_dir(repo_root)
    if not cache_d.exists():
        return 0

    # Collect targets up front so deletion does not race the directory scan.
    targets: list[Path] = [
        f
        for pattern in ("core-*.json.gz", "snapshot-*.json.gz", "view-*.json.gz")
        for f in cache_d.glob(pattern)
    ]
    cas_d = _cas_dir(cache_d)
    if cas_d.exists():
        targets.extend(cas_d.glob("*.gz"))

    removed = 0
    for f in targets:
        # NOTE(review): _safe_unlink appears to be best-effort and reports
        # nothing back, so success is verified by checking the path afterwards.
        _safe_unlink(f)
        if not f.exists():
            removed += 1
    return removed
|
|
184
|
+
|
|
185
|
+
|
|
127
186
|
# ---------------------------------------------------------------------------
|
|
128
187
|
# Public API — read / write
|
|
129
188
|
# ---------------------------------------------------------------------------
|
|
@@ -193,7 +252,7 @@ def write(
|
|
|
193
252
|
try:
|
|
194
253
|
cache_d.mkdir(parents=True, exist_ok=True)
|
|
195
254
|
payload = _build_envelope(cache_key, content, fmt, layers or {}, cache_d)
|
|
196
|
-
dest
|
|
255
|
+
_atomic_write(dest, payload)
|
|
197
256
|
except Exception:
|
|
198
257
|
return # non-fatal
|
|
199
258
|
|
|
@@ -275,7 +334,7 @@ def write_core(repo_root: Path, core_key: str, core_data: dict[str, Any]) -> str
|
|
|
275
334
|
json.dumps(envelope, ensure_ascii=False).encode("utf-8"),
|
|
276
335
|
compresslevel=6,
|
|
277
336
|
)
|
|
278
|
-
dest
|
|
337
|
+
_atomic_write(dest, payload)
|
|
279
338
|
except Exception:
|
|
280
339
|
pass
|
|
281
340
|
|
|
@@ -327,7 +386,7 @@ def write_view(
|
|
|
327
386
|
try:
|
|
328
387
|
cache_d.mkdir(parents=True, exist_ok=True)
|
|
329
388
|
payload = _build_envelope(view_key, content, fmt, layers or {}, cache_d)
|
|
330
|
-
dest
|
|
389
|
+
_atomic_write(dest, payload)
|
|
331
390
|
except Exception:
|
|
332
391
|
pass
|
|
333
392
|
|
|
@@ -457,7 +516,7 @@ def _cas_store_blob(cache_d: Path, serialised: str) -> str:
|
|
|
457
516
|
path = _cas_path(cache_d, blob_hash)
|
|
458
517
|
if not path.exists():
|
|
459
518
|
path.parent.mkdir(parents=True, exist_ok=True)
|
|
460
|
-
path
|
|
519
|
+
_atomic_write(path, gzip.compress(raw, compresslevel=6))
|
|
461
520
|
return blob_hash
|
|
462
521
|
|
|
463
522
|
|
|
@@ -529,12 +588,16 @@ def _cas_restore(
|
|
|
529
588
|
def _gc(cache_d: Path) -> None:
|
|
530
589
|
"""Evict old snapshots/cores/views and sweep orphaned CAS blobs.
|
|
531
590
|
|
|
532
|
-
|
|
533
|
-
|
|
534
|
-
|
|
535
|
-
|
|
591
|
+
Three eviction passes (all non-fatal):
|
|
592
|
+
1. Commit-based: keep only last SOURCECODE_CACHE_KEEP_COMMITS distinct SHAs.
|
|
593
|
+
2. Core-count: keep at most SOURCECODE_CACHE_MAX_CORES core files (LRU).
|
|
594
|
+
3. Size-based: if total cache exceeds SOURCECODE_CACHE_MAX_SIZE_MB, evict
|
|
595
|
+
oldest core+snapshot files until under budget.
|
|
596
|
+
Views and CAS blobs are swept after each pass.
|
|
536
597
|
"""
|
|
537
598
|
keep = int(os.environ.get("SOURCECODE_CACHE_KEEP_COMMITS", _DEFAULT_KEEP_COMMITS))
|
|
599
|
+
max_cores = int(os.environ.get("SOURCECODE_CACHE_MAX_CORES", _DEFAULT_MAX_CORES))
|
|
600
|
+
max_size_bytes = int(os.environ.get("SOURCECODE_CACHE_MAX_SIZE_MB", _DEFAULT_MAX_SIZE_MB)) * 1024 * 1024
|
|
538
601
|
|
|
539
602
|
try:
|
|
540
603
|
all_snapshots = list(cache_d.glob("snapshot-*.json.gz"))
|
|
@@ -544,7 +607,7 @@ def _gc(cache_d: Path) -> None:
|
|
|
544
607
|
if not all_snapshots and not all_cores and not all_views:
|
|
545
608
|
return
|
|
546
609
|
|
|
547
|
-
#
|
|
610
|
+
# ── Pass 1: commit-based eviction ──────────────────────────────────
|
|
548
611
|
groups: dict[str, list[Path]] = {}
|
|
549
612
|
for f in all_snapshots:
|
|
550
613
|
m = _SNAPSHOT_RE.match(f.name)
|
|
@@ -558,7 +621,6 @@ def _gc(cache_d: Path) -> None:
|
|
|
558
621
|
surviving: list[Path]
|
|
559
622
|
|
|
560
623
|
if keep <= 0 or len(groups) <= keep:
|
|
561
|
-
# No eviction needed — but still sweep views + CAS
|
|
562
624
|
surviving = all_snapshots + all_cores
|
|
563
625
|
else:
|
|
564
626
|
def _newest_mtime(commit: str) -> float:
|
|
@@ -573,7 +635,37 @@ def _gc(cache_d: Path) -> None:
|
|
|
573
635
|
for f in groups[commit]:
|
|
574
636
|
_safe_unlink(f)
|
|
575
637
|
|
|
638
|
+
# ── Pass 2: per-repo core count cap ────────────────────────────────
|
|
639
|
+
if max_cores > 0:
|
|
640
|
+
surviving_cores = [p for p in surviving if p.name.startswith("core-") and p.exists()]
|
|
641
|
+
if len(surviving_cores) > max_cores:
|
|
642
|
+
surviving_cores.sort(key=lambda p: p.stat().st_mtime, reverse=True)
|
|
643
|
+
for evict in surviving_cores[max_cores:]:
|
|
644
|
+
_safe_unlink(evict)
|
|
645
|
+
surviving = [p for p in surviving if p != evict]
|
|
646
|
+
|
|
647
|
+
# ── Pass 3: total size cap ──────────────────────────────────────────
|
|
648
|
+
if max_size_bytes > 0:
|
|
649
|
+
size_candidates = [p for p in surviving if p.exists()]
|
|
650
|
+
# Include CAS blobs in the size budget calculation
|
|
651
|
+
cas_d_sz = _cas_dir(cache_d)
|
|
652
|
+
cas_files = list(cas_d_sz.glob("*.gz")) if cas_d_sz.exists() else []
|
|
653
|
+
total = sum(p.stat().st_size for p in size_candidates if not p.name.startswith("view-"))
|
|
654
|
+
total += sum(p.stat().st_size for p in cas_files if p.exists())
|
|
655
|
+
if total > max_size_bytes:
|
|
656
|
+
# Sort oldest-first; evict core+snapshot files until under budget
|
|
657
|
+
size_candidates.sort(key=lambda p: p.stat().st_mtime)
|
|
658
|
+
for evict in size_candidates:
|
|
659
|
+
if evict.name.startswith("view-"):
|
|
660
|
+
continue
|
|
661
|
+
total -= evict.stat().st_size if evict.exists() else 0
|
|
662
|
+
_safe_unlink(evict)
|
|
663
|
+
surviving = [p for p in surviving if p != evict]
|
|
664
|
+
if total <= max_size_bytes:
|
|
665
|
+
break
|
|
666
|
+
|
|
576
667
|
# Prune view files whose core hash is no longer in the surviving set
|
|
668
|
+
all_views = list(cache_d.glob("view-*.json.gz"))
|
|
577
669
|
_gc_views(cache_d, surviving, all_views)
|
|
578
670
|
|
|
579
671
|
# Sweep orphaned CAS blobs (surviving snapshots + view files may ref them)
|
|
@@ -648,6 +740,23 @@ def _gc_cas(cache_d: Path, surviving_snapshots: list[Path]) -> None:
|
|
|
648
740
|
# Utilities
|
|
649
741
|
# ---------------------------------------------------------------------------
|
|
650
742
|
|
|
743
|
+
def _atomic_write(dest: Path, data: bytes) -> None:
    """Write *data* to *dest* atomically via a sibling temp file + rename.

    The data is written to a uniquely named ``<dest.name>.<pid>.<random>.tmp``
    file in the same directory and then moved over *dest* with
    ``Path.replace()``.  On POSIX that is a single ``rename(2)`` syscall — the
    destination either has the old content or the new content, never a partial
    write.

    The temp name is unique per call so that concurrent writers of the same
    *dest* never share (and truncate) each other's temp file.  It still ends
    in ``.tmp``, which keeps the partial file out of the ``*.gz`` glob
    patterns used by the cache reader and GC.

    Raises:
        Whatever ``write_bytes`` / ``replace`` raise; the temp file is removed
        before the exception propagates.
    """
    import os
    import uuid

    tmp = dest.with_name(f"{dest.name}.{os.getpid()}.{uuid.uuid4().hex[:8]}.tmp")
    try:
        tmp.write_bytes(data)
        tmp.replace(dest)
    except BaseException:
        # BaseException so an interrupt (Ctrl-C) does not leave a stray temp file.
        _safe_unlink(tmp)
        raise
|
|
758
|
+
|
|
759
|
+
|
|
651
760
|
def _safe_unlink(path: Path) -> None:
|
|
652
761
|
try:
|
|
653
762
|
path.unlink(missing_ok=True)
|