sourcecode 1.32.3__py3-none-any.whl → 1.32.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sourcecode/__init__.py CHANGED
@@ -1,3 +1,3 @@
1
1
  """sourcecode — Deterministic codebase context maps for AI coding agents."""
2
2
 
3
- __version__ = "1.32.3"
3
+ __version__ = "1.32.5"
sourcecode/cache.py CHANGED
@@ -72,6 +72,11 @@ SCHEMA_VERSION: str = "2"
72
72
  #: Bump to invalidate all L1 core caches (independent of snapshot version).
73
73
  CORE_SCHEMA_VERSION: str = "1"
74
74
 
75
+ #: Bump when analysis logic or output schema changes — NOT on every package release.
76
+ #: This is the stable part of the L1 core cache key. Package version bumps (patch,
77
+ #: minor) must NOT bump this value unless the cached data format actually changed.
78
+ ANALYZER_CACHE_VERSION: str = "1"
79
+
75
80
  #: Fields eligible for CAS deduplication (applied to top-level JSON dict keys).
76
81
  _CAS_FIELDS: frozenset[str] = frozenset([
77
82
  "file_paths",
@@ -92,6 +97,8 @@ _CAS_FIELDS: frozenset[str] = frozenset([
92
97
  _CAS_THRESHOLD: int = 4096
93
98
 
94
99
  _DEFAULT_KEEP_COMMITS: int = 5
100
+ _DEFAULT_MAX_CORES: int = 20
101
+ _DEFAULT_MAX_SIZE_MB: int = 50
95
102
 
96
103
  # Matches "snapshot-<hex_commit>-<hex_flags>.json.gz"
97
104
  _SNAPSHOT_RE = re.compile(r"^snapshot-([0-9a-f]+)-[0-9a-f]+\.json\.gz$")
@@ -124,6 +131,58 @@ def cache_dir(repo_root: Path) -> Path:
124
131
  return base / repo_id(repo_root)
125
132
 
126
133
 
134
+ # ---------------------------------------------------------------------------
135
+ # Public API — observability
136
+ # ---------------------------------------------------------------------------
137
+
138
def status(repo_root: Path) -> dict[str, Any]:
    """Return a stats dict describing the current cache state for *repo_root*.

    Args:
        repo_root: Repository root whose cache directory is inspected.

    Returns:
        A dict with the keys ``cache_dir`` (path as ``str``), ``cores``,
        ``snapshots``, ``views``, ``cas_blobs`` (file counts),
        ``total_size_bytes`` and ``total_size_mb`` (rounded to 2 decimals).
        Every counter is zero when the cache directory does not exist.
    """
    cache_d = cache_dir(repo_root)
    stats: dict[str, Any] = {
        "cache_dir": str(cache_d),
        "cores": 0,
        "snapshots": 0,
        "views": 0,
        "cas_blobs": 0,
        "total_size_bytes": 0,
        "total_size_mb": 0.0,
    }
    if not cache_d.exists():
        return stats

    cores = list(cache_d.glob("core-*.json.gz"))
    snapshots = list(cache_d.glob("snapshot-*.json.gz"))
    views = list(cache_d.glob("view-*.json.gz"))
    cas_d = _cas_dir(cache_d)
    cas_blobs = list(cas_d.glob("*.gz")) if cas_d.exists() else []

    total_bytes = 0
    for f in (*cores, *snapshots, *views, *cas_blobs):
        try:
            total_bytes += f.stat().st_size
        except OSError:
            # The file vanished (or became unreadable) between the glob and
            # the stat, e.g. because another process evicted it. Skip it
            # instead of failing the whole report.
            continue

    stats.update(
        cores=len(cores),
        snapshots=len(snapshots),
        views=len(views),
        cas_blobs=len(cas_blobs),
        total_size_bytes=total_bytes,
        total_size_mb=round(total_bytes / (1024 * 1024), 2),
    )
    return stats
166
+
167
+
168
def clear(repo_root: Path) -> int:
    """Delete all cache files for *repo_root*.

    Removes core, snapshot and view files from the cache directory, then the
    ``*.gz`` blobs in the CAS directory when it exists.

    Args:
        repo_root: Repository root whose cache is cleared.

    Returns:
        The number of files that are actually gone after the call. A file
        that could not be deleted is not counted.
    """
    cache_d = cache_dir(repo_root)
    if not cache_d.exists():
        return 0

    targets: list[Path] = []
    for pattern in ("core-*.json.gz", "snapshot-*.json.gz", "view-*.json.gz"):
        targets.extend(cache_d.glob(pattern))
    cas_d = _cas_dir(cache_d)
    if cas_d.exists():
        targets.extend(cas_d.glob("*.gz"))

    removed = 0
    for f in targets:
        _safe_unlink(f)
        # Count only confirmed deletions: the unlink helper is best-effort,
        # so a call that returns does not prove the file was removed.
        if not f.exists():
            removed += 1
    return removed
184
+
185
+
127
186
  # ---------------------------------------------------------------------------
128
187
  # Public API — read / write
129
188
  # ---------------------------------------------------------------------------
@@ -193,7 +252,7 @@ def write(
193
252
  try:
194
253
  cache_d.mkdir(parents=True, exist_ok=True)
195
254
  payload = _build_envelope(cache_key, content, fmt, layers or {}, cache_d)
196
- dest.write_bytes(payload)
255
+ _atomic_write(dest, payload)
197
256
  except Exception:
198
257
  return # non-fatal
199
258
 
@@ -275,7 +334,7 @@ def write_core(repo_root: Path, core_key: str, core_data: dict[str, Any]) -> str
275
334
  json.dumps(envelope, ensure_ascii=False).encode("utf-8"),
276
335
  compresslevel=6,
277
336
  )
278
- dest.write_bytes(payload)
337
+ _atomic_write(dest, payload)
279
338
  except Exception:
280
339
  pass
281
340
 
@@ -327,7 +386,7 @@ def write_view(
327
386
  try:
328
387
  cache_d.mkdir(parents=True, exist_ok=True)
329
388
  payload = _build_envelope(view_key, content, fmt, layers or {}, cache_d)
330
- dest.write_bytes(payload)
389
+ _atomic_write(dest, payload)
331
390
  except Exception:
332
391
  pass
333
392
 
@@ -457,7 +516,7 @@ def _cas_store_blob(cache_d: Path, serialised: str) -> str:
457
516
  path = _cas_path(cache_d, blob_hash)
458
517
  if not path.exists():
459
518
  path.parent.mkdir(parents=True, exist_ok=True)
460
- path.write_bytes(gzip.compress(raw, compresslevel=6))
519
+ _atomic_write(path, gzip.compress(raw, compresslevel=6))
461
520
  return blob_hash
462
521
 
463
522
 
@@ -529,12 +588,16 @@ def _cas_restore(
529
588
  def _gc(cache_d: Path) -> None:
530
589
  """Evict old snapshots/cores/views and sweep orphaned CAS blobs.
531
590
 
532
- Keeps snapshots and cores from the last ``SOURCECODE_CACHE_KEEP_COMMITS``
533
- distinct git commits (determined by newest mtime within each commit group).
534
- Views are then pruned: a view survives only when its core-hash prefix
535
- matches a core file in the surviving set.
591
+ Three eviction passes (all non-fatal):
592
+ 1. Commit-based: keep only last SOURCECODE_CACHE_KEEP_COMMITS distinct SHAs.
593
+ 2. Core-count: keep at most SOURCECODE_CACHE_MAX_CORES core files (LRU).
594
+ 3. Size-based: if total cache exceeds SOURCECODE_CACHE_MAX_SIZE_MB, evict
595
+ oldest core+snapshot files until under budget.
596
+ Views and CAS blobs are swept after each pass.
536
597
  """
537
598
  keep = int(os.environ.get("SOURCECODE_CACHE_KEEP_COMMITS", _DEFAULT_KEEP_COMMITS))
599
+ max_cores = int(os.environ.get("SOURCECODE_CACHE_MAX_CORES", _DEFAULT_MAX_CORES))
600
+ max_size_bytes = int(os.environ.get("SOURCECODE_CACHE_MAX_SIZE_MB", _DEFAULT_MAX_SIZE_MB)) * 1024 * 1024
538
601
 
539
602
  try:
540
603
  all_snapshots = list(cache_d.glob("snapshot-*.json.gz"))
@@ -544,7 +607,7 @@ def _gc(cache_d: Path) -> None:
544
607
  if not all_snapshots and not all_cores and not all_views:
545
608
  return
546
609
 
547
- # Group snapshot + core files by commit SHA
610
+ # ── Pass 1: commit-based eviction ──────────────────────────────────
548
611
  groups: dict[str, list[Path]] = {}
549
612
  for f in all_snapshots:
550
613
  m = _SNAPSHOT_RE.match(f.name)
@@ -558,7 +621,6 @@ def _gc(cache_d: Path) -> None:
558
621
  surviving: list[Path]
559
622
 
560
623
  if keep <= 0 or len(groups) <= keep:
561
- # No eviction needed — but still sweep views + CAS
562
624
  surviving = all_snapshots + all_cores
563
625
  else:
564
626
  def _newest_mtime(commit: str) -> float:
@@ -573,7 +635,37 @@ def _gc(cache_d: Path) -> None:
573
635
  for f in groups[commit]:
574
636
  _safe_unlink(f)
575
637
 
638
+ # ── Pass 2: per-repo core count cap ────────────────────────────────
639
+ if max_cores > 0:
640
+ surviving_cores = [p for p in surviving if p.name.startswith("core-") and p.exists()]
641
+ if len(surviving_cores) > max_cores:
642
+ surviving_cores.sort(key=lambda p: p.stat().st_mtime, reverse=True)
643
+ for evict in surviving_cores[max_cores:]:
644
+ _safe_unlink(evict)
645
+ surviving = [p for p in surviving if p != evict]
646
+
647
+ # ── Pass 3: total size cap ──────────────────────────────────────────
648
+ if max_size_bytes > 0:
649
+ size_candidates = [p for p in surviving if p.exists()]
650
+ # Include CAS blobs in the size budget calculation
651
+ cas_d_sz = _cas_dir(cache_d)
652
+ cas_files = list(cas_d_sz.glob("*.gz")) if cas_d_sz.exists() else []
653
+ total = sum(p.stat().st_size for p in size_candidates if not p.name.startswith("view-"))
654
+ total += sum(p.stat().st_size for p in cas_files if p.exists())
655
+ if total > max_size_bytes:
656
+ # Sort oldest-first; evict core+snapshot files until under budget
657
+ size_candidates.sort(key=lambda p: p.stat().st_mtime)
658
+ for evict in size_candidates:
659
+ if evict.name.startswith("view-"):
660
+ continue
661
+ total -= evict.stat().st_size if evict.exists() else 0
662
+ _safe_unlink(evict)
663
+ surviving = [p for p in surviving if p != evict]
664
+ if total <= max_size_bytes:
665
+ break
666
+
576
667
  # Prune view files whose core hash is no longer in the surviving set
668
+ all_views = list(cache_d.glob("view-*.json.gz"))
577
669
  _gc_views(cache_d, surviving, all_views)
578
670
 
579
671
  # Sweep orphaned CAS blobs (surviving snapshots + view files may ref them)
@@ -648,6 +740,23 @@ def _gc_cas(cache_d: Path, surviving_snapshots: list[Path]) -> None:
648
740
  # Utilities
649
741
  # ---------------------------------------------------------------------------
650
742
 
743
def _atomic_write(dest: Path, data: bytes) -> None:
    """Write *data* to *dest* atomically via a sibling temp file + rename.

    The payload is written to a uniquely named ``*.tmp`` file next to *dest*
    and then moved over *dest* with ``Path.replace()``. On POSIX that is a
    single ``rename(2)`` syscall — the destination either has the old content
    or the new content, never a partial write. The ``.tmp`` suffix keeps the
    partial file out of the ``*.gz`` glob patterns used by the cache reader
    and GC.

    Args:
        dest: Final path of the file.
        data: Bytes to store.

    Raises:
        Any exception raised while writing or renaming is re-raised after the
        temp file has been cleaned up.
    """
    # The temp name must be unique per call. With a fixed name, two writers
    # targeting the same destination would share one temp file, and one could
    # rename the other's half-written data into place.
    tmp = dest.with_name(f"{dest.name}.{os.getpid()}.{os.urandom(4).hex()}.tmp")
    try:
        tmp.write_bytes(data)
        tmp.replace(dest)
    except BaseException:
        # Also covers KeyboardInterrupt/SystemExit so an interrupted write
        # does not leave a stray temp file behind; the error still propagates.
        _safe_unlink(tmp)
        raise
758
+
759
+
651
760
  def _safe_unlink(path: Path) -> None:
652
761
  try:
653
762
  path.unlink(missing_ok=True)