multi-forge 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (311) hide show
  1. forge/__init__.py +3 -0
  2. forge/_extensions/agents/.gitkeep +0 -0
  3. forge/_extensions/commands/.gitkeep +0 -0
  4. forge/_extensions/skills/analyze/SKILL.md +87 -0
  5. forge/_extensions/skills/challenge/SKILL.md +91 -0
  6. forge/_extensions/skills/consensus/SKILL.md +120 -0
  7. forge/_extensions/skills/consensus/resources/code_consensus_evaluation.md +94 -0
  8. forge/_extensions/skills/consensus/resources/consensus_evaluation.md +70 -0
  9. forge/_extensions/skills/consensus/resources/synthesis.md +101 -0
  10. forge/_extensions/skills/debate/SKILL.md +116 -0
  11. forge/_extensions/skills/debate/resources/code_debate_evaluation.md +101 -0
  12. forge/_extensions/skills/debate/resources/debate_evaluation.md +90 -0
  13. forge/_extensions/skills/panel/SKILL.md +141 -0
  14. forge/_extensions/skills/panel/resources/synthesis.md +103 -0
  15. forge/_extensions/skills/qa/SKILL.md +704 -0
  16. forge/_extensions/skills/qa/resources/checklist/0-enable.md +78 -0
  17. forge/_extensions/skills/qa/resources/checklist/1-preflight.md +24 -0
  18. forge/_extensions/skills/qa/resources/checklist/10-resume.md +143 -0
  19. forge/_extensions/skills/qa/resources/checklist/11-config.md +150 -0
  20. forge/_extensions/skills/qa/resources/checklist/12-search.md +58 -0
  21. forge/_extensions/skills/qa/resources/checklist/13-guard.md +237 -0
  22. forge/_extensions/skills/qa/resources/checklist/14-workflow.md +305 -0
  23. forge/_extensions/skills/qa/resources/checklist/15-skills.md +155 -0
  24. forge/_extensions/skills/qa/resources/checklist/16-handoff.md +224 -0
  25. forge/_extensions/skills/qa/resources/checklist/17-info.md +50 -0
  26. forge/_extensions/skills/qa/resources/checklist/18-disable.md +84 -0
  27. forge/_extensions/skills/qa/resources/checklist/19-uninstall.md +146 -0
  28. forge/_extensions/skills/qa/resources/checklist/2-extensions.md +188 -0
  29. forge/_extensions/skills/qa/resources/checklist/20-cleanup.md +36 -0
  30. forge/_extensions/skills/qa/resources/checklist/3-auth.md +234 -0
  31. forge/_extensions/skills/qa/resources/checklist/4-proxy.md +481 -0
  32. forge/_extensions/skills/qa/resources/checklist/5-session.md +541 -0
  33. forge/_extensions/skills/qa/resources/checklist/6-hooks.md +275 -0
  34. forge/_extensions/skills/qa/resources/checklist/7-costs.md +309 -0
  35. forge/_extensions/skills/qa/resources/checklist/8-status-line.md +174 -0
  36. forge/_extensions/skills/qa/resources/checklist/9-direct-commands.md +146 -0
  37. forge/_extensions/skills/qa/resources/checklist.md +103 -0
  38. forge/_extensions/skills/qa/resources/report-template.md +62 -0
  39. forge/_extensions/skills/qa/scripts/start-container.sh +529 -0
  40. forge/_extensions/skills/qa/scripts/walkthrough-state.py +1137 -0
  41. forge/_extensions/skills/review/SKILL.md +125 -0
  42. forge/_extensions/skills/review/references/claude-4.6.md +474 -0
  43. forge/_extensions/skills/review/references/claude-4.7.md +710 -0
  44. forge/_extensions/skills/review/references/gemini-3.1.md +546 -0
  45. forge/_extensions/skills/review/references/gpt-5.5.md +490 -0
  46. forge/_extensions/skills/review/references/skills-writing-guide.md +1588 -0
  47. forge/_extensions/skills/review/resources/code-anthropic.md +160 -0
  48. forge/_extensions/skills/review/resources/code-gemini.md +184 -0
  49. forge/_extensions/skills/review/resources/code-openai.md +203 -0
  50. forge/_extensions/skills/review/resources/code.md +160 -0
  51. forge/_extensions/skills/review-docs/SKILL.md +121 -0
  52. forge/_extensions/skills/review-docs/resources/docs-anthropic.md +170 -0
  53. forge/_extensions/skills/review-docs/resources/docs-gemini.md +204 -0
  54. forge/_extensions/skills/review-docs/resources/docs-openai.md +231 -0
  55. forge/_extensions/skills/review-docs/resources/docs.md +170 -0
  56. forge/_extensions/skills/smoke-test/SKILL.md +27 -0
  57. forge/_extensions/skills/smoke-test/scripts/smoke-test.sh +118 -0
  58. forge/_extensions/skills/understand/SKILL.md +148 -0
  59. forge/_extensions/skills/understand/resources/code-anthropic.md +163 -0
  60. forge/_extensions/skills/understand/resources/code-gemini.md +194 -0
  61. forge/_extensions/skills/understand/resources/code-openai.md +181 -0
  62. forge/_extensions/skills/understand/resources/code.md +163 -0
  63. forge/_extensions/skills/understand/resources/docs-anthropic.md +177 -0
  64. forge/_extensions/skills/understand/resources/docs-gemini.md +202 -0
  65. forge/_extensions/skills/understand/resources/docs-openai.md +191 -0
  66. forge/_extensions/skills/understand/resources/docs.md +177 -0
  67. forge/_extensions/skills/walkthrough/SKILL.md +599 -0
  68. forge/_extensions/skills/walkthrough/resources/checklist.md +765 -0
  69. forge/_extensions/skills/walkthrough/scripts/run-in-repo.sh +118 -0
  70. forge/_extensions/skills/walkthrough/scripts/setup-test-repo.sh +198 -0
  71. forge/_extensions/skills/walkthrough/scripts/walkthrough-state.py +1137 -0
  72. forge/backend/__init__.py +174 -0
  73. forge/backend/adapters/__init__.py +38 -0
  74. forge/backend/adapters/litellm.py +158 -0
  75. forge/backend/creation.py +89 -0
  76. forge/backend/registry.py +178 -0
  77. forge/cli/__init__.py +16 -0
  78. forge/cli/auth.py +483 -0
  79. forge/cli/backend.py +298 -0
  80. forge/cli/claude.py +411 -0
  81. forge/cli/config_cmd.py +303 -0
  82. forge/cli/extensions.py +1001 -0
  83. forge/cli/gc.py +165 -0
  84. forge/cli/guard.py +1018 -0
  85. forge/cli/guards.py +106 -0
  86. forge/cli/handoff.py +110 -0
  87. forge/cli/hooks/__init__.py +36 -0
  88. forge/cli/hooks/_group.py +20 -0
  89. forge/cli/hooks/_helpers.py +149 -0
  90. forge/cli/hooks/commands.py +1677 -0
  91. forge/cli/hooks/direct_commands.py +1304 -0
  92. forge/cli/hooks/install.py +232 -0
  93. forge/cli/hooks/policy.py +151 -0
  94. forge/cli/hooks/read_hygiene.py +74 -0
  95. forge/cli/hooks/verification.py +370 -0
  96. forge/cli/logs.py +406 -0
  97. forge/cli/main.py +292 -0
  98. forge/cli/proxy.py +1821 -0
  99. forge/cli/proxy_costs.py +313 -0
  100. forge/cli/search.py +416 -0
  101. forge/cli/session.py +892 -0
  102. forge/cli/session_addendum.py +81 -0
  103. forge/cli/session_fork.py +750 -0
  104. forge/cli/session_handoff.py +141 -0
  105. forge/cli/session_lifecycle.py +2053 -0
  106. forge/cli/session_manage.py +1336 -0
  107. forge/cli/session_memory.py +201 -0
  108. forge/cli/status_line.py +1398 -0
  109. forge/cli/workflow.py +1964 -0
  110. forge/config/__init__.py +110 -0
  111. forge/config/dataclass_utils.py +88 -0
  112. forge/config/defaults/__init__.py +0 -0
  113. forge/config/defaults/backends/__init__.py +0 -0
  114. forge/config/defaults/backends/litellm.yaml +196 -0
  115. forge/config/defaults/templates/__init__.py +0 -0
  116. forge/config/defaults/templates/litellm-anthropic-local.yaml +33 -0
  117. forge/config/defaults/templates/litellm-anthropic.yaml +24 -0
  118. forge/config/defaults/templates/litellm-gemini-flash-local.yaml +37 -0
  119. forge/config/defaults/templates/litellm-gemini-local.yaml +32 -0
  120. forge/config/defaults/templates/litellm-gemini-test.yaml +34 -0
  121. forge/config/defaults/templates/litellm-gemini.yaml +21 -0
  122. forge/config/defaults/templates/litellm-openai-codex-local.yaml +36 -0
  123. forge/config/defaults/templates/litellm-openai-local.yaml +38 -0
  124. forge/config/defaults/templates/litellm-openai.yaml +28 -0
  125. forge/config/defaults/templates/openrouter-anthropic.yaml +23 -0
  126. forge/config/defaults/templates/openrouter-deepseek.yaml +26 -0
  127. forge/config/defaults/templates/openrouter-gemini-flash.yaml +26 -0
  128. forge/config/defaults/templates/openrouter-gemini.yaml +23 -0
  129. forge/config/defaults/templates/openrouter-glm.yaml +23 -0
  130. forge/config/defaults/templates/openrouter-kimi.yaml +30 -0
  131. forge/config/defaults/templates/openrouter-minimax.yaml +26 -0
  132. forge/config/defaults/templates/openrouter-openai-codex.yaml +23 -0
  133. forge/config/defaults/templates/openrouter-openai.yaml +28 -0
  134. forge/config/defaults/templates/openrouter-qwen.yaml +25 -0
  135. forge/config/loader.py +675 -0
  136. forge/config/schema.py +448 -0
  137. forge/core/__init__.py +5 -0
  138. forge/core/auth/__init__.py +67 -0
  139. forge/core/auth/capabilities.py +219 -0
  140. forge/core/auth/credentials_file.py +244 -0
  141. forge/core/auth/protocols.py +18 -0
  142. forge/core/auth/secrets.py +243 -0
  143. forge/core/auth/template_secrets.py +112 -0
  144. forge/core/data/__init__.py +5 -0
  145. forge/core/data/model_catalog.yaml +1522 -0
  146. forge/core/data/pricing.yaml +140 -0
  147. forge/core/data/system_prompt_addendums/__init__.py +0 -0
  148. forge/core/data/system_prompt_addendums/gemini.md +330 -0
  149. forge/core/data/system_prompt_addendums/openai.md +328 -0
  150. forge/core/llm/__init__.py +231 -0
  151. forge/core/llm/clients/__init__.py +14 -0
  152. forge/core/llm/clients/base.py +115 -0
  153. forge/core/llm/clients/litellm.py +619 -0
  154. forge/core/llm/clients/openai_compat.py +244 -0
  155. forge/core/llm/clients/openrouter.py +234 -0
  156. forge/core/llm/credentials.py +439 -0
  157. forge/core/llm/detection.py +86 -0
  158. forge/core/llm/errors.py +44 -0
  159. forge/core/llm/protocols.py +80 -0
  160. forge/core/llm/types.py +176 -0
  161. forge/core/logging.py +146 -0
  162. forge/core/models/__init__.py +91 -0
  163. forge/core/models/catalog.py +467 -0
  164. forge/core/models/pricing.py +165 -0
  165. forge/core/models/types.py +167 -0
  166. forge/core/naming.py +212 -0
  167. forge/core/ops/__init__.py +73 -0
  168. forge/core/ops/context.py +141 -0
  169. forge/core/ops/gc.py +802 -0
  170. forge/core/ops/proxy.py +146 -0
  171. forge/core/ops/resolution.py +135 -0
  172. forge/core/ops/session.py +344 -0
  173. forge/core/ops/session_context.py +548 -0
  174. forge/core/paths.py +38 -0
  175. forge/core/process.py +54 -0
  176. forge/core/reactive/__init__.py +38 -0
  177. forge/core/reactive/cost_tracking.py +300 -0
  178. forge/core/reactive/env.py +180 -0
  179. forge/core/reactive/proxy.py +78 -0
  180. forge/core/reactive/routing.py +622 -0
  181. forge/core/reactive/session_runner.py +185 -0
  182. forge/core/reactive/structured_output.py +62 -0
  183. forge/core/reactive/tagger.py +94 -0
  184. forge/core/reactive/throttle.py +132 -0
  185. forge/core/state/__init__.py +59 -0
  186. forge/core/state/exceptions.py +59 -0
  187. forge/core/state/io.py +140 -0
  188. forge/core/state/lock.py +99 -0
  189. forge/core/state/timestamps.py +60 -0
  190. forge/core/transcript.py +78 -0
  191. forge/core/typing_helpers.py +24 -0
  192. forge/core/workqueue/__init__.py +67 -0
  193. forge/core/workqueue/queue.py +552 -0
  194. forge/core/workqueue/types.py +63 -0
  195. forge/guard/__init__.py +26 -0
  196. forge/guard/deterministic/__init__.py +26 -0
  197. forge/guard/deterministic/base.py +158 -0
  198. forge/guard/deterministic/coding_standards.py +256 -0
  199. forge/guard/deterministic/registry.py +148 -0
  200. forge/guard/deterministic/tdd.py +171 -0
  201. forge/guard/engine.py +216 -0
  202. forge/guard/protocols.py +91 -0
  203. forge/guard/queries.py +96 -0
  204. forge/guard/semantic/__init__.py +34 -0
  205. forge/guard/semantic/promotion.py +18 -0
  206. forge/guard/semantic/supervisor.py +813 -0
  207. forge/guard/semantic/verdict.py +183 -0
  208. forge/guard/store.py +124 -0
  209. forge/guard/team/__init__.py +6 -0
  210. forge/guard/team/config.py +24 -0
  211. forge/guard/team/handlers.py +209 -0
  212. forge/guard/team/prompts.py +41 -0
  213. forge/guard/types.py +125 -0
  214. forge/guard/workflow/__init__.py +17 -0
  215. forge/guard/workflow/branches.py +67 -0
  216. forge/guard/workflow/config.py +63 -0
  217. forge/guard/workflow/divergence.py +113 -0
  218. forge/guard/workflow/policy.py +87 -0
  219. forge/guard/workflow/stages.py +205 -0
  220. forge/install/__init__.py +55 -0
  221. forge/install/cli.py +281 -0
  222. forge/install/exceptions.py +163 -0
  223. forge/install/hooks.py +109 -0
  224. forge/install/installer.py +1037 -0
  225. forge/install/models.py +321 -0
  226. forge/install/preset.py +272 -0
  227. forge/install/settings_merge.py +831 -0
  228. forge/install/tracking.py +238 -0
  229. forge/install/version.py +141 -0
  230. forge/proxy/__init__.py +0 -0
  231. forge/proxy/base_client.py +181 -0
  232. forge/proxy/client_adapter.py +476 -0
  233. forge/proxy/client_factory.py +531 -0
  234. forge/proxy/converters.py +1206 -0
  235. forge/proxy/cost_logger.py +132 -0
  236. forge/proxy/cost_tracker.py +242 -0
  237. forge/proxy/data_models.py +338 -0
  238. forge/proxy/error_hints.py +92 -0
  239. forge/proxy/metrics.py +222 -0
  240. forge/proxy/model_spec.py +158 -0
  241. forge/proxy/proxies.py +333 -0
  242. forge/proxy/proxy_identity.py +134 -0
  243. forge/proxy/proxy_orchestrator.py +1018 -0
  244. forge/proxy/proxy_startup.py +54 -0
  245. forge/proxy/server.py +1561 -0
  246. forge/proxy/utils.py +537 -0
  247. forge/review/__init__.py +6 -0
  248. forge/review/adversarial.py +111 -0
  249. forge/review/consensus.py +236 -0
  250. forge/review/engine.py +356 -0
  251. forge/review/models.py +437 -0
  252. forge/review/resources/__init__.py +5 -0
  253. forge/review/resources/codereview-performance.md +85 -0
  254. forge/review/resources/codereview-quick.md +75 -0
  255. forge/review/resources/codereview-security.md +92 -0
  256. forge/review/resources/codereview.md +85 -0
  257. forge/review/resources/docreview-quick.md +75 -0
  258. forge/review/resources/docreview.md +86 -0
  259. forge/review/resources/thinkdeep.md +89 -0
  260. forge/review/routing.py +368 -0
  261. forge/review/synthesis.py +73 -0
  262. forge/runtime_config.py +438 -0
  263. forge/search/__init__.py +55 -0
  264. forge/search/bm25_store.py +264 -0
  265. forge/search/content_store.py +197 -0
  266. forge/search/engine.py +352 -0
  267. forge/search/exceptions.py +51 -0
  268. forge/search/extractor.py +234 -0
  269. forge/search/index_state.py +295 -0
  270. forge/search/store.py +215 -0
  271. forge/search/tokenizer.py +24 -0
  272. forge/session/__init__.py +130 -0
  273. forge/session/active.py +339 -0
  274. forge/session/artifacts.py +202 -0
  275. forge/session/claude/__init__.py +50 -0
  276. forge/session/claude/cleanup.py +105 -0
  277. forge/session/claude/invoke.py +236 -0
  278. forge/session/claude/paths.py +200 -0
  279. forge/session/cleanup.py +216 -0
  280. forge/session/config.py +34 -0
  281. forge/session/direct_model.py +107 -0
  282. forge/session/effective.py +169 -0
  283. forge/session/exceptions.py +255 -0
  284. forge/session/handoff.py +881 -0
  285. forge/session/handoff_agent.py +544 -0
  286. forge/session/hooks/__init__.py +35 -0
  287. forge/session/hooks/models.py +73 -0
  288. forge/session/hooks/session_start.py +507 -0
  289. forge/session/identity.py +84 -0
  290. forge/session/index.py +553 -0
  291. forge/session/manager.py +1506 -0
  292. forge/session/models.py +572 -0
  293. forge/session/overrides.py +344 -0
  294. forge/session/plan_resolution.py +286 -0
  295. forge/session/prev_sessions.py +128 -0
  296. forge/session/store.py +431 -0
  297. forge/session/validation.py +47 -0
  298. forge/session/worktree/__init__.py +65 -0
  299. forge/session/worktree/cleanup.py +262 -0
  300. forge/session/worktree/config_copy.py +203 -0
  301. forge/session/worktree/create.py +332 -0
  302. forge/sidecar/__init__.py +29 -0
  303. forge/sidecar/container.py +161 -0
  304. forge/sidecar/docker.py +86 -0
  305. forge/sidecar/secrets.py +19 -0
  306. multi_forge-0.2.0.dist-info/METADATA +242 -0
  307. multi_forge-0.2.0.dist-info/RECORD +311 -0
  308. multi_forge-0.2.0.dist-info/WHEEL +4 -0
  309. multi_forge-0.2.0.dist-info/entry_points.txt +2 -0
  310. multi_forge-0.2.0.dist-info/licenses/LICENSE +203 -0
  311. multi_forge-0.2.0.dist-info/licenses/NOTICE +14 -0
@@ -0,0 +1,264 @@
1
+ """Persistent BM25 index store.
2
+
3
+ Persists precomputed BM25 data structures (term frequencies, document
4
+ frequencies, corpus stats) so queries only run scoring, not index
5
+ construction.
6
+
7
+ Store location: <project_root>/.forge/search-index/bm25_index.json
8
+
9
+ Follows the same patterns as SearchDocumentStore/IndexStateStore:
10
+ atomic writes, file locking, schema versioning, self-healing on missing file.
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ import json
16
+ import logging
17
+ from dataclasses import asdict, dataclass, field
18
+ from pathlib import Path
19
+ from typing import Any
20
+
21
+ from forge.core.state import (
22
+ SchemaVersionError,
23
+ atomic_write_json,
24
+ file_lock_for_target,
25
+ now_iso,
26
+ )
27
+
28
+ from .exceptions import BM25IndexCorruptedError
29
+ from .index_state import SEARCH_INDEX_DIR
30
+
31
logger = logging.getLogger(__name__)

# File name of the persisted index inside the search-index directory.
BM25_INDEX_FILENAME = "bm25_index.json"
# Written to / checked against the "schema_version" field of the stored file.
BM25_INDEX_VERSION = 1

# Bump when TOKEN_RE or tokenize() logic changes — mismatch forces rebuild
# to prevent silently wrong scores.
TOKENIZER_ID = "v1"

# Default lock timeout (seconds) for BM25IndexStore.replace_all.
STORE_LOCK_TIMEOUT_S = 5.0
# Default lock timeout (seconds) for upsert_document / remove_document.
HANDLER_LOCK_TIMEOUT_S = 1.0
42
+
43
+
44
@dataclass
class BM25IndexData:
    """In-memory form of the persisted BM25 index.

    The three per-document lists are parallel: for every position ``i``,
    ``doc_keys[i]``, ``doc_lens[i]`` and ``term_freqs[i]`` describe the same
    document, so they must always be mutated together.
    """

    # Per-document data (parallel lists, one entry per indexed document).
    doc_keys: list[str] = field(default_factory=list)
    doc_lens: list[int] = field(default_factory=list)
    term_freqs: list[dict[str, int]] = field(default_factory=list)
    # Corpus-wide data: term -> number of documents containing it, and the
    # mean of doc_lens.
    doc_freqs: dict[str, int] = field(default_factory=dict)
    avgdl: float = 0.0
    # Scoring parameters stored alongside the index.
    k1: float = 1.5
    b: float = 0.75
    # Tokenizer revision the index was built with.
    tokenizer_id: str = TOKENIZER_ID

    def to_dict(self) -> dict[str, Any]:
        """Return a plain-dict copy of every field for JSON storage."""
        as_plain: dict[str, Any] = asdict(self)
        return as_plain
64
+
65
+
66
def _get_bm25_index_path(forge_root: Path) -> Path:
    """Return the BM25 index file path under ``<forge_root>/.forge``."""
    index_dir = forge_root.joinpath(".forge", SEARCH_INDEX_DIR)
    return index_dir.joinpath(BM25_INDEX_FILENAME)
68
+
69
+
70
class BM25IndexStore:
    """Manage per-project persistent BM25 index.

    Store location: <forge_root>/.forge/search-index/bm25_index.json

    Error handling:
    - Missing file: returns None (no index built yet)
    - Corrupted file (unreadable, invalid JSON/encoding, wrong field types,
      misaligned positional arrays): raises BM25IndexCorruptedError
    - Wrong schema version: raises SchemaVersionError
    - Tokenizer ID mismatch: raises BM25IndexCorruptedError (forces rebuild)
    """

    def __init__(
        self,
        forge_root: Path | None = None,
        *,
        store_path: Path | None = None,
    ) -> None:
        """Bind the store to a file.

        Args:
            forge_root: Project root; the index path is derived from it.
            store_path: Explicit index file path. Takes precedence over
                ``forge_root`` when both are given.

        Raises:
            ValueError: If neither argument is provided.
        """
        if store_path:
            self._store_path = store_path
        elif forge_root:
            self._store_path = _get_bm25_index_path(forge_root)
        else:
            raise ValueError("Either forge_root or store_path required")

    @property
    def store_path(self) -> Path:
        """Path of the index file managed by this store."""
        return self._store_path

    def exists(self) -> bool:
        """Return True if the index file is present as a regular file."""
        return self._store_path.is_file()

    def read(self) -> BM25IndexData | None:
        """Read BM25 index from disk.

        Returns None if the file does not exist (no index built yet).

        Raises:
            BM25IndexCorruptedError: If the file cannot be read or decoded,
                contains invalid JSON, has fields of the wrong type, a
                tokenizer ID mismatch, or misaligned positional arrays.
            SchemaVersionError: If schema version doesn't match.
        """
        if not self.exists():
            return None

        path_str = str(self._store_path)

        try:
            with open(self._store_path, encoding="utf-8") as f:
                data = json.load(f)
        except FileNotFoundError:
            # Removed between exists() and open(): same as "no index yet".
            return None
        except json.JSONDecodeError as e:
            raise BM25IndexCorruptedError(path_str, f"invalid JSON: {e}") from e
        except UnicodeDecodeError as e:
            # Non-UTF-8 bytes surface here, not as JSONDecodeError/OSError.
            raise BM25IndexCorruptedError(path_str, f"invalid encoding: {e}") from e
        except OSError as e:
            raise BM25IndexCorruptedError(path_str, f"read error: {e}") from e

        if not isinstance(data, dict):
            raise BM25IndexCorruptedError(
                path_str,
                f"expected JSON object, got {type(data).__name__}",
            )

        version = data.get("schema_version")
        if version is None:
            raise BM25IndexCorruptedError(path_str, "missing schema_version")
        if version != BM25_INDEX_VERSION:
            raise SchemaVersionError(path_str, BM25_INDEX_VERSION, version)

        stored_tokenizer = data.get("tokenizer_id", "")
        if stored_tokenizer != TOKENIZER_ID:
            raise BM25IndexCorruptedError(
                path_str,
                f"tokenizer mismatch: index has '{stored_tokenizer}', "
                f"current is '{TOKENIZER_ID}'. Run 'forge search rebuild-index' to fix.",
            )

        # Validate container types up front so a malformed file is reported
        # as corruption instead of failing later with a bare TypeError (or
        # being silently accepted, e.g. a string in place of a list).
        collections: dict[str, Any] = {}
        for name, expected in (
            ("doc_keys", list),
            ("doc_lens", list),
            ("term_freqs", list),
            ("doc_freqs", dict),
        ):
            value = data.get(name, expected())
            if not isinstance(value, expected):
                raise BM25IndexCorruptedError(
                    path_str,
                    f"invalid data: '{name}' must be a {expected.__name__}, "
                    f"got {type(value).__name__}",
                )
            collections[name] = value

        try:
            index_data = BM25IndexData(
                doc_keys=collections["doc_keys"],
                doc_lens=collections["doc_lens"],
                term_freqs=collections["term_freqs"],
                doc_freqs=collections["doc_freqs"],
                avgdl=float(data.get("avgdl", 0.0)),
                k1=float(data.get("k1", 1.5)),
                b=float(data.get("b", 0.75)),
                tokenizer_id=stored_tokenizer,
            )
        except (TypeError, ValueError) as e:
            raise BM25IndexCorruptedError(path_str, f"invalid data: {e}") from e

        n_keys = len(index_data.doc_keys)
        n_lens = len(index_data.doc_lens)
        n_freqs = len(index_data.term_freqs)
        if n_keys != n_lens or n_keys != n_freqs:
            raise BM25IndexCorruptedError(
                path_str,
                f"positional array length mismatch: doc_keys={n_keys}, "
                f"doc_lens={n_lens}, term_freqs={n_freqs}. "
                "Run 'forge search rebuild-index' to fix.",
            )

        return index_data

    def write(self, data: BM25IndexData) -> None:
        """Write BM25 index atomically. Creates parent directories if needed.

        Does not take the file lock itself; callers that need mutual
        exclusion use replace_all / upsert_document / remove_document.
        """
        payload: dict[str, Any] = {
            "schema_version": BM25_INDEX_VERSION,
            "updated_at": now_iso(),
            # Also present in to_dict(); listed here so it stays with the
            # header fields in the written object.
            "tokenizer_id": data.tokenizer_id,
            **data.to_dict(),
        }
        atomic_write_json(self._store_path, payload)

    def replace_all(
        self,
        data: BM25IndexData,
        *,
        timeout_s: float = STORE_LOCK_TIMEOUT_S,
    ) -> None:
        """Replace entire index under lock (for rebuild-index)."""
        with file_lock_for_target(target_path=self._store_path, timeout_s=timeout_s):
            self.write(data)

    def upsert_document(
        self,
        doc_key: str,
        term_freq: dict[str, int],
        doc_len: int,
        *,
        timeout_s: float = HANDLER_LOCK_TIMEOUT_S,
    ) -> None:
        """Add or replace a document in the index (locked, idempotent).

        If doc_key already exists, its old contribution is removed first
        (doc_freqs decremented) before adding the new entry. This ensures
        work queue retries don't create duplicates or double-increment.

        Raises:
            BM25IndexCorruptedError, SchemaVersionError: Propagated from
                read() when the existing file is unusable.
        """
        with file_lock_for_target(target_path=self._store_path, timeout_s=timeout_s):
            data = self.read()
            if data is None:
                data = BM25IndexData()

            _remove_doc_from_data(data, doc_key)

            data.doc_keys.append(doc_key)
            data.doc_lens.append(doc_len)
            data.term_freqs.append(term_freq)
            for term, count in term_freq.items():
                # Only terms that actually occur count toward document frequency.
                if count > 0:
                    data.doc_freqs[term] = data.doc_freqs.get(term, 0) + 1

            data.avgdl = sum(data.doc_lens) / max(len(data.doc_lens), 1)

            self.write(data)

    def remove_document(
        self,
        doc_key: str,
        *,
        timeout_s: float = HANDLER_LOCK_TIMEOUT_S,
    ) -> bool:
        """Remove a document from the index (locked). Returns True if found.

        Nothing is written when the index file is missing or the key is
        not present.
        """
        with file_lock_for_target(target_path=self._store_path, timeout_s=timeout_s):
            data = self.read()
            if data is None:
                return False

            removed = _remove_doc_from_data(data, doc_key)
            if not removed:
                return False

            # max(..., 1) keeps avgdl at 0.0 once the last document is gone.
            data.avgdl = sum(data.doc_lens) / max(len(data.doc_lens), 1)

            self.write(data)
            return True
244
+
245
+
246
def _remove_doc_from_data(data: BM25IndexData, doc_key: str) -> bool:
    """Remove a document from BM25IndexData in-place. Returns True if found.

    Drops the document's entries from the three positional lists and
    subtracts its contribution from ``doc_freqs``; terms whose document
    frequency reaches zero are deleted. ``avgdl`` is NOT recomputed here;
    callers do that after all mutations.

    Args:
        data: Index data to mutate.
        doc_key: Key of the document to remove.

    Returns:
        True if the document was present and removed, False otherwise.
    """
    try:
        idx = data.doc_keys.index(doc_key)
    except ValueError:
        return False

    doc_freqs = data.doc_freqs
    for term, count in data.term_freqs[idx].items():
        # Mirror BM25IndexStore.upsert_document, which only counts terms with
        # a positive frequency. Decrementing for a zero-count entry would
        # steal a document-frequency count that belongs to another document.
        if count <= 0 or term not in doc_freqs:
            continue
        remaining = doc_freqs[term] - 1
        if remaining > 0:
            doc_freqs[term] = remaining
        else:
            del doc_freqs[term]

    # Keep the positional lists aligned by removing the same index from each.
    del data.doc_keys[idx]
    del data.doc_lens[idx]
    del data.term_freqs[idx]

    return True
@@ -0,0 +1,197 @@
1
+ """Content store for lazy snippet loading.
2
+
3
+ Stores document content strings keyed by transcript_path. Content is loaded
4
+ at query time only for top-K results (snippet extraction), not for scoring.
5
+
6
+ Store location: <project_root>/.forge/search-index/content.json
7
+
8
+ Follows the same patterns as other search stores: atomic writes, file
9
+ locking, schema versioning, self-healing on missing file.
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import json
15
+ import logging
16
+ from pathlib import Path
17
+ from typing import Any
18
+
19
+ from forge.core.state import (
20
+ SchemaVersionError,
21
+ atomic_write_json,
22
+ file_lock_for_target,
23
+ now_iso,
24
+ )
25
+
26
+ from .exceptions import ContentStoreCorruptedError
27
+ from .index_state import SEARCH_INDEX_DIR
28
+
29
logger = logging.getLogger(__name__)

# File and schema constants
# CONTENT_STORE_VERSION is written to / checked against "schema_version".
CONTENT_FILENAME = "content.json"
CONTENT_STORE_VERSION = 1

# Lock timeouts (seconds)
# STORE_* is the default for replace_all / prune_keys,
# HANDLER_* the default for add / remove.
STORE_LOCK_TIMEOUT_S = 5.0
HANDLER_LOCK_TIMEOUT_S = 1.0
38
+
39
+
40
def _get_content_store_path(forge_root: Path) -> Path:
    """Return the content store file path under ``<forge_root>/.forge``."""
    index_dir = forge_root.joinpath(".forge", SEARCH_INDEX_DIR)
    return index_dir.joinpath(CONTENT_FILENAME)
42
+
43
+
44
class ContentStore:
    """Manage per-project content store for lazy snippet loading.

    Store location: <forge_root>/.forge/search-index/content.json
    Maps transcript_path -> extracted content string.

    Error handling:
    - Missing file: returns empty dict (self-healing)
    - Corrupted file (unreadable, invalid JSON/encoding, not a JSON object,
      missing schema_version): raises ContentStoreCorruptedError
    - Wrong schema version: raises SchemaVersionError
    """

    def __init__(
        self,
        forge_root: Path | None = None,
        *,
        store_path: Path | None = None,
    ) -> None:
        """Bind the store to a file.

        Args:
            forge_root: Project root; the store path is derived from it.
            store_path: Explicit store file path. Takes precedence over
                ``forge_root`` when both are given.

        Raises:
            ValueError: If neither argument is provided.
        """
        if store_path:
            self._store_path = store_path
        elif forge_root:
            self._store_path = _get_content_store_path(forge_root)
        else:
            raise ValueError("Either forge_root or store_path required")

    @property
    def store_path(self) -> Path:
        """Path of the content file managed by this store."""
        return self._store_path

    def exists(self) -> bool:
        """Return True if the content file is present as a regular file."""
        return self._store_path.is_file()

    def read_all(self) -> dict[str, str]:
        """Read all content from disk.

        Returns empty dict if the file does not exist (self-healing). A
        non-dict ``content`` field is logged and also treated as empty.

        Raises:
            ContentStoreCorruptedError: If the file cannot be read or
                decoded, contains invalid JSON, is not a JSON object, or
                lacks ``schema_version``.
            SchemaVersionError: If the schema version doesn't match.
        """
        if not self.exists():
            return {}

        path_str = str(self._store_path)

        try:
            with open(self._store_path, encoding="utf-8") as f:
                data = json.load(f)
        except FileNotFoundError:
            # Removed between exists() and open(): same as a missing file.
            return {}
        except json.JSONDecodeError as e:
            raise ContentStoreCorruptedError(path_str, f"invalid JSON: {e}") from e
        except UnicodeDecodeError as e:
            # Non-UTF-8 bytes surface here, not as JSONDecodeError/OSError.
            raise ContentStoreCorruptedError(path_str, f"invalid encoding: {e}") from e
        except OSError as e:
            raise ContentStoreCorruptedError(path_str, f"read error: {e}") from e

        if not isinstance(data, dict):
            raise ContentStoreCorruptedError(
                path_str,
                f"expected JSON object, got {type(data).__name__}",
            )

        version = data.get("schema_version")
        if version is None:
            raise ContentStoreCorruptedError(path_str, "missing schema_version")
        if version != CONTENT_STORE_VERSION:
            raise SchemaVersionError(path_str, CONTENT_STORE_VERSION, version)

        content = data.get("content", {})
        if not isinstance(content, dict):
            logger.warning(
                "Content store %s has non-dict 'content' field (got %s), treating as empty",
                path_str,
                type(content).__name__,
            )
            return {}

        return content

    def read_keys(self, keys: list[str]) -> dict[str, str]:
        """Read content for specific document keys only.

        Loads the full JSON file (unavoidable with JSON format) but returns
        only the requested keys. This is the method used at query time for
        snippet extraction of top-K results.

        Returns:
            Dict mapping requested keys to their content strings.
            Keys not found in the store are omitted from the result.
        """
        all_content = self.read_all()
        return {k: all_content[k] for k in keys if k in all_content}

    def write(self, content_map: dict[str, str]) -> None:
        """Write content store atomically. Creates parent directories if needed.

        Does not take the file lock itself; the locked entry points are
        replace_all / add / remove / prune_keys.
        """
        payload: dict[str, Any] = {
            "schema_version": CONTENT_STORE_VERSION,
            "updated_at": now_iso(),
            "content": content_map,
        }
        atomic_write_json(self._store_path, payload)

    def replace_all(
        self,
        content_map: dict[str, str],
        *,
        timeout_s: float = STORE_LOCK_TIMEOUT_S,
    ) -> None:
        """Replace all content under lock (for rebuild-index)."""
        with file_lock_for_target(target_path=self._store_path, timeout_s=timeout_s):
            self.write(content_map)

    def add(
        self,
        doc_key: str,
        content: str,
        *,
        timeout_s: float = HANDLER_LOCK_TIMEOUT_S,
    ) -> None:
        """Add or replace content for a document (locked, idempotent)."""
        with file_lock_for_target(target_path=self._store_path, timeout_s=timeout_s):
            content_map = self.read_all()
            content_map[doc_key] = content
            self.write(content_map)

    def remove(
        self,
        doc_key: str,
        *,
        timeout_s: float = HANDLER_LOCK_TIMEOUT_S,
    ) -> bool:
        """Remove content for a document (locked). Returns True if found.

        Nothing is written when the key is not present.
        """
        with file_lock_for_target(target_path=self._store_path, timeout_s=timeout_s):
            content_map = self.read_all()
            if doc_key not in content_map:
                return False
            del content_map[doc_key]
            self.write(content_map)
            return True

    def prune_keys(
        self,
        valid_keys: set[str],
        *,
        timeout_s: float = STORE_LOCK_TIMEOUT_S,
    ) -> list[str]:
        """Remove entries not in valid_keys (locked). Returns removed keys.

        Nothing is written when every stored key is still valid.
        """
        with file_lock_for_target(target_path=self._store_path, timeout_s=timeout_s):
            content_map = self.read_all()
            # Collect first: the dict must not change size while iterated.
            removed = [k for k in content_map if k not in valid_keys]
            if not removed:
                return []
            for k in removed:
                del content_map[k]
            self.write(content_map)
            return removed
+ return removed