multi-forge 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (311) hide show
  1. forge/__init__.py +3 -0
  2. forge/_extensions/agents/.gitkeep +0 -0
  3. forge/_extensions/commands/.gitkeep +0 -0
  4. forge/_extensions/skills/analyze/SKILL.md +87 -0
  5. forge/_extensions/skills/challenge/SKILL.md +91 -0
  6. forge/_extensions/skills/consensus/SKILL.md +120 -0
  7. forge/_extensions/skills/consensus/resources/code_consensus_evaluation.md +94 -0
  8. forge/_extensions/skills/consensus/resources/consensus_evaluation.md +70 -0
  9. forge/_extensions/skills/consensus/resources/synthesis.md +101 -0
  10. forge/_extensions/skills/debate/SKILL.md +116 -0
  11. forge/_extensions/skills/debate/resources/code_debate_evaluation.md +101 -0
  12. forge/_extensions/skills/debate/resources/debate_evaluation.md +90 -0
  13. forge/_extensions/skills/panel/SKILL.md +141 -0
  14. forge/_extensions/skills/panel/resources/synthesis.md +103 -0
  15. forge/_extensions/skills/qa/SKILL.md +704 -0
  16. forge/_extensions/skills/qa/resources/checklist/0-enable.md +78 -0
  17. forge/_extensions/skills/qa/resources/checklist/1-preflight.md +24 -0
  18. forge/_extensions/skills/qa/resources/checklist/10-resume.md +143 -0
  19. forge/_extensions/skills/qa/resources/checklist/11-config.md +150 -0
  20. forge/_extensions/skills/qa/resources/checklist/12-search.md +58 -0
  21. forge/_extensions/skills/qa/resources/checklist/13-guard.md +237 -0
  22. forge/_extensions/skills/qa/resources/checklist/14-workflow.md +305 -0
  23. forge/_extensions/skills/qa/resources/checklist/15-skills.md +155 -0
  24. forge/_extensions/skills/qa/resources/checklist/16-handoff.md +224 -0
  25. forge/_extensions/skills/qa/resources/checklist/17-info.md +50 -0
  26. forge/_extensions/skills/qa/resources/checklist/18-disable.md +84 -0
  27. forge/_extensions/skills/qa/resources/checklist/19-uninstall.md +146 -0
  28. forge/_extensions/skills/qa/resources/checklist/2-extensions.md +188 -0
  29. forge/_extensions/skills/qa/resources/checklist/20-cleanup.md +36 -0
  30. forge/_extensions/skills/qa/resources/checklist/3-auth.md +234 -0
  31. forge/_extensions/skills/qa/resources/checklist/4-proxy.md +481 -0
  32. forge/_extensions/skills/qa/resources/checklist/5-session.md +541 -0
  33. forge/_extensions/skills/qa/resources/checklist/6-hooks.md +275 -0
  34. forge/_extensions/skills/qa/resources/checklist/7-costs.md +309 -0
  35. forge/_extensions/skills/qa/resources/checklist/8-status-line.md +174 -0
  36. forge/_extensions/skills/qa/resources/checklist/9-direct-commands.md +146 -0
  37. forge/_extensions/skills/qa/resources/checklist.md +103 -0
  38. forge/_extensions/skills/qa/resources/report-template.md +62 -0
  39. forge/_extensions/skills/qa/scripts/start-container.sh +529 -0
  40. forge/_extensions/skills/qa/scripts/walkthrough-state.py +1137 -0
  41. forge/_extensions/skills/review/SKILL.md +125 -0
  42. forge/_extensions/skills/review/references/claude-4.6.md +474 -0
  43. forge/_extensions/skills/review/references/claude-4.7.md +710 -0
  44. forge/_extensions/skills/review/references/gemini-3.1.md +546 -0
  45. forge/_extensions/skills/review/references/gpt-5.5.md +490 -0
  46. forge/_extensions/skills/review/references/skills-writing-guide.md +1588 -0
  47. forge/_extensions/skills/review/resources/code-anthropic.md +160 -0
  48. forge/_extensions/skills/review/resources/code-gemini.md +184 -0
  49. forge/_extensions/skills/review/resources/code-openai.md +203 -0
  50. forge/_extensions/skills/review/resources/code.md +160 -0
  51. forge/_extensions/skills/review-docs/SKILL.md +121 -0
  52. forge/_extensions/skills/review-docs/resources/docs-anthropic.md +170 -0
  53. forge/_extensions/skills/review-docs/resources/docs-gemini.md +204 -0
  54. forge/_extensions/skills/review-docs/resources/docs-openai.md +231 -0
  55. forge/_extensions/skills/review-docs/resources/docs.md +170 -0
  56. forge/_extensions/skills/smoke-test/SKILL.md +27 -0
  57. forge/_extensions/skills/smoke-test/scripts/smoke-test.sh +118 -0
  58. forge/_extensions/skills/understand/SKILL.md +148 -0
  59. forge/_extensions/skills/understand/resources/code-anthropic.md +163 -0
  60. forge/_extensions/skills/understand/resources/code-gemini.md +194 -0
  61. forge/_extensions/skills/understand/resources/code-openai.md +181 -0
  62. forge/_extensions/skills/understand/resources/code.md +163 -0
  63. forge/_extensions/skills/understand/resources/docs-anthropic.md +177 -0
  64. forge/_extensions/skills/understand/resources/docs-gemini.md +202 -0
  65. forge/_extensions/skills/understand/resources/docs-openai.md +191 -0
  66. forge/_extensions/skills/understand/resources/docs.md +177 -0
  67. forge/_extensions/skills/walkthrough/SKILL.md +599 -0
  68. forge/_extensions/skills/walkthrough/resources/checklist.md +765 -0
  69. forge/_extensions/skills/walkthrough/scripts/run-in-repo.sh +118 -0
  70. forge/_extensions/skills/walkthrough/scripts/setup-test-repo.sh +198 -0
  71. forge/_extensions/skills/walkthrough/scripts/walkthrough-state.py +1137 -0
  72. forge/backend/__init__.py +174 -0
  73. forge/backend/adapters/__init__.py +38 -0
  74. forge/backend/adapters/litellm.py +158 -0
  75. forge/backend/creation.py +89 -0
  76. forge/backend/registry.py +178 -0
  77. forge/cli/__init__.py +16 -0
  78. forge/cli/auth.py +483 -0
  79. forge/cli/backend.py +298 -0
  80. forge/cli/claude.py +411 -0
  81. forge/cli/config_cmd.py +303 -0
  82. forge/cli/extensions.py +1001 -0
  83. forge/cli/gc.py +165 -0
  84. forge/cli/guard.py +1018 -0
  85. forge/cli/guards.py +106 -0
  86. forge/cli/handoff.py +110 -0
  87. forge/cli/hooks/__init__.py +36 -0
  88. forge/cli/hooks/_group.py +20 -0
  89. forge/cli/hooks/_helpers.py +149 -0
  90. forge/cli/hooks/commands.py +1677 -0
  91. forge/cli/hooks/direct_commands.py +1304 -0
  92. forge/cli/hooks/install.py +232 -0
  93. forge/cli/hooks/policy.py +151 -0
  94. forge/cli/hooks/read_hygiene.py +74 -0
  95. forge/cli/hooks/verification.py +370 -0
  96. forge/cli/logs.py +406 -0
  97. forge/cli/main.py +292 -0
  98. forge/cli/proxy.py +1821 -0
  99. forge/cli/proxy_costs.py +313 -0
  100. forge/cli/search.py +416 -0
  101. forge/cli/session.py +892 -0
  102. forge/cli/session_addendum.py +81 -0
  103. forge/cli/session_fork.py +750 -0
  104. forge/cli/session_handoff.py +141 -0
  105. forge/cli/session_lifecycle.py +2053 -0
  106. forge/cli/session_manage.py +1336 -0
  107. forge/cli/session_memory.py +201 -0
  108. forge/cli/status_line.py +1398 -0
  109. forge/cli/workflow.py +1964 -0
  110. forge/config/__init__.py +110 -0
  111. forge/config/dataclass_utils.py +88 -0
  112. forge/config/defaults/__init__.py +0 -0
  113. forge/config/defaults/backends/__init__.py +0 -0
  114. forge/config/defaults/backends/litellm.yaml +196 -0
  115. forge/config/defaults/templates/__init__.py +0 -0
  116. forge/config/defaults/templates/litellm-anthropic-local.yaml +33 -0
  117. forge/config/defaults/templates/litellm-anthropic.yaml +24 -0
  118. forge/config/defaults/templates/litellm-gemini-flash-local.yaml +37 -0
  119. forge/config/defaults/templates/litellm-gemini-local.yaml +32 -0
  120. forge/config/defaults/templates/litellm-gemini-test.yaml +34 -0
  121. forge/config/defaults/templates/litellm-gemini.yaml +21 -0
  122. forge/config/defaults/templates/litellm-openai-codex-local.yaml +36 -0
  123. forge/config/defaults/templates/litellm-openai-local.yaml +38 -0
  124. forge/config/defaults/templates/litellm-openai.yaml +28 -0
  125. forge/config/defaults/templates/openrouter-anthropic.yaml +23 -0
  126. forge/config/defaults/templates/openrouter-deepseek.yaml +26 -0
  127. forge/config/defaults/templates/openrouter-gemini-flash.yaml +26 -0
  128. forge/config/defaults/templates/openrouter-gemini.yaml +23 -0
  129. forge/config/defaults/templates/openrouter-glm.yaml +23 -0
  130. forge/config/defaults/templates/openrouter-kimi.yaml +30 -0
  131. forge/config/defaults/templates/openrouter-minimax.yaml +26 -0
  132. forge/config/defaults/templates/openrouter-openai-codex.yaml +23 -0
  133. forge/config/defaults/templates/openrouter-openai.yaml +28 -0
  134. forge/config/defaults/templates/openrouter-qwen.yaml +25 -0
  135. forge/config/loader.py +675 -0
  136. forge/config/schema.py +448 -0
  137. forge/core/__init__.py +5 -0
  138. forge/core/auth/__init__.py +67 -0
  139. forge/core/auth/capabilities.py +219 -0
  140. forge/core/auth/credentials_file.py +244 -0
  141. forge/core/auth/protocols.py +18 -0
  142. forge/core/auth/secrets.py +243 -0
  143. forge/core/auth/template_secrets.py +112 -0
  144. forge/core/data/__init__.py +5 -0
  145. forge/core/data/model_catalog.yaml +1522 -0
  146. forge/core/data/pricing.yaml +140 -0
  147. forge/core/data/system_prompt_addendums/__init__.py +0 -0
  148. forge/core/data/system_prompt_addendums/gemini.md +330 -0
  149. forge/core/data/system_prompt_addendums/openai.md +328 -0
  150. forge/core/llm/__init__.py +231 -0
  151. forge/core/llm/clients/__init__.py +14 -0
  152. forge/core/llm/clients/base.py +115 -0
  153. forge/core/llm/clients/litellm.py +619 -0
  154. forge/core/llm/clients/openai_compat.py +244 -0
  155. forge/core/llm/clients/openrouter.py +234 -0
  156. forge/core/llm/credentials.py +439 -0
  157. forge/core/llm/detection.py +86 -0
  158. forge/core/llm/errors.py +44 -0
  159. forge/core/llm/protocols.py +80 -0
  160. forge/core/llm/types.py +176 -0
  161. forge/core/logging.py +146 -0
  162. forge/core/models/__init__.py +91 -0
  163. forge/core/models/catalog.py +467 -0
  164. forge/core/models/pricing.py +165 -0
  165. forge/core/models/types.py +167 -0
  166. forge/core/naming.py +212 -0
  167. forge/core/ops/__init__.py +73 -0
  168. forge/core/ops/context.py +141 -0
  169. forge/core/ops/gc.py +802 -0
  170. forge/core/ops/proxy.py +146 -0
  171. forge/core/ops/resolution.py +135 -0
  172. forge/core/ops/session.py +344 -0
  173. forge/core/ops/session_context.py +548 -0
  174. forge/core/paths.py +38 -0
  175. forge/core/process.py +54 -0
  176. forge/core/reactive/__init__.py +38 -0
  177. forge/core/reactive/cost_tracking.py +300 -0
  178. forge/core/reactive/env.py +180 -0
  179. forge/core/reactive/proxy.py +78 -0
  180. forge/core/reactive/routing.py +622 -0
  181. forge/core/reactive/session_runner.py +185 -0
  182. forge/core/reactive/structured_output.py +62 -0
  183. forge/core/reactive/tagger.py +94 -0
  184. forge/core/reactive/throttle.py +132 -0
  185. forge/core/state/__init__.py +59 -0
  186. forge/core/state/exceptions.py +59 -0
  187. forge/core/state/io.py +140 -0
  188. forge/core/state/lock.py +99 -0
  189. forge/core/state/timestamps.py +60 -0
  190. forge/core/transcript.py +78 -0
  191. forge/core/typing_helpers.py +24 -0
  192. forge/core/workqueue/__init__.py +67 -0
  193. forge/core/workqueue/queue.py +552 -0
  194. forge/core/workqueue/types.py +63 -0
  195. forge/guard/__init__.py +26 -0
  196. forge/guard/deterministic/__init__.py +26 -0
  197. forge/guard/deterministic/base.py +158 -0
  198. forge/guard/deterministic/coding_standards.py +256 -0
  199. forge/guard/deterministic/registry.py +148 -0
  200. forge/guard/deterministic/tdd.py +171 -0
  201. forge/guard/engine.py +216 -0
  202. forge/guard/protocols.py +91 -0
  203. forge/guard/queries.py +96 -0
  204. forge/guard/semantic/__init__.py +34 -0
  205. forge/guard/semantic/promotion.py +18 -0
  206. forge/guard/semantic/supervisor.py +813 -0
  207. forge/guard/semantic/verdict.py +183 -0
  208. forge/guard/store.py +124 -0
  209. forge/guard/team/__init__.py +6 -0
  210. forge/guard/team/config.py +24 -0
  211. forge/guard/team/handlers.py +209 -0
  212. forge/guard/team/prompts.py +41 -0
  213. forge/guard/types.py +125 -0
  214. forge/guard/workflow/__init__.py +17 -0
  215. forge/guard/workflow/branches.py +67 -0
  216. forge/guard/workflow/config.py +63 -0
  217. forge/guard/workflow/divergence.py +113 -0
  218. forge/guard/workflow/policy.py +87 -0
  219. forge/guard/workflow/stages.py +205 -0
  220. forge/install/__init__.py +55 -0
  221. forge/install/cli.py +281 -0
  222. forge/install/exceptions.py +163 -0
  223. forge/install/hooks.py +109 -0
  224. forge/install/installer.py +1037 -0
  225. forge/install/models.py +321 -0
  226. forge/install/preset.py +272 -0
  227. forge/install/settings_merge.py +831 -0
  228. forge/install/tracking.py +238 -0
  229. forge/install/version.py +141 -0
  230. forge/proxy/__init__.py +0 -0
  231. forge/proxy/base_client.py +181 -0
  232. forge/proxy/client_adapter.py +476 -0
  233. forge/proxy/client_factory.py +531 -0
  234. forge/proxy/converters.py +1206 -0
  235. forge/proxy/cost_logger.py +132 -0
  236. forge/proxy/cost_tracker.py +242 -0
  237. forge/proxy/data_models.py +338 -0
  238. forge/proxy/error_hints.py +92 -0
  239. forge/proxy/metrics.py +222 -0
  240. forge/proxy/model_spec.py +158 -0
  241. forge/proxy/proxies.py +333 -0
  242. forge/proxy/proxy_identity.py +134 -0
  243. forge/proxy/proxy_orchestrator.py +1018 -0
  244. forge/proxy/proxy_startup.py +54 -0
  245. forge/proxy/server.py +1561 -0
  246. forge/proxy/utils.py +537 -0
  247. forge/review/__init__.py +6 -0
  248. forge/review/adversarial.py +111 -0
  249. forge/review/consensus.py +236 -0
  250. forge/review/engine.py +356 -0
  251. forge/review/models.py +437 -0
  252. forge/review/resources/__init__.py +5 -0
  253. forge/review/resources/codereview-performance.md +85 -0
  254. forge/review/resources/codereview-quick.md +75 -0
  255. forge/review/resources/codereview-security.md +92 -0
  256. forge/review/resources/codereview.md +85 -0
  257. forge/review/resources/docreview-quick.md +75 -0
  258. forge/review/resources/docreview.md +86 -0
  259. forge/review/resources/thinkdeep.md +89 -0
  260. forge/review/routing.py +368 -0
  261. forge/review/synthesis.py +73 -0
  262. forge/runtime_config.py +438 -0
  263. forge/search/__init__.py +55 -0
  264. forge/search/bm25_store.py +264 -0
  265. forge/search/content_store.py +197 -0
  266. forge/search/engine.py +352 -0
  267. forge/search/exceptions.py +51 -0
  268. forge/search/extractor.py +234 -0
  269. forge/search/index_state.py +295 -0
  270. forge/search/store.py +215 -0
  271. forge/search/tokenizer.py +24 -0
  272. forge/session/__init__.py +130 -0
  273. forge/session/active.py +339 -0
  274. forge/session/artifacts.py +202 -0
  275. forge/session/claude/__init__.py +50 -0
  276. forge/session/claude/cleanup.py +105 -0
  277. forge/session/claude/invoke.py +236 -0
  278. forge/session/claude/paths.py +200 -0
  279. forge/session/cleanup.py +216 -0
  280. forge/session/config.py +34 -0
  281. forge/session/direct_model.py +107 -0
  282. forge/session/effective.py +169 -0
  283. forge/session/exceptions.py +255 -0
  284. forge/session/handoff.py +881 -0
  285. forge/session/handoff_agent.py +544 -0
  286. forge/session/hooks/__init__.py +35 -0
  287. forge/session/hooks/models.py +73 -0
  288. forge/session/hooks/session_start.py +507 -0
  289. forge/session/identity.py +84 -0
  290. forge/session/index.py +553 -0
  291. forge/session/manager.py +1506 -0
  292. forge/session/models.py +572 -0
  293. forge/session/overrides.py +344 -0
  294. forge/session/plan_resolution.py +286 -0
  295. forge/session/prev_sessions.py +128 -0
  296. forge/session/store.py +431 -0
  297. forge/session/validation.py +47 -0
  298. forge/session/worktree/__init__.py +65 -0
  299. forge/session/worktree/cleanup.py +262 -0
  300. forge/session/worktree/config_copy.py +203 -0
  301. forge/session/worktree/create.py +332 -0
  302. forge/sidecar/__init__.py +29 -0
  303. forge/sidecar/container.py +161 -0
  304. forge/sidecar/docker.py +86 -0
  305. forge/sidecar/secrets.py +19 -0
  306. multi_forge-0.2.0.dist-info/METADATA +242 -0
  307. multi_forge-0.2.0.dist-info/RECORD +311 -0
  308. multi_forge-0.2.0.dist-info/WHEEL +4 -0
  309. multi_forge-0.2.0.dist-info/entry_points.txt +2 -0
  310. multi_forge-0.2.0.dist-info/licenses/LICENSE +203 -0
  311. multi_forge-0.2.0.dist-info/licenses/NOTICE +14 -0
forge/search/engine.py ADDED
@@ -0,0 +1,352 @@
1
+ """BM25 search engine for transcript documents.
2
+
3
+ Provides BM25Okapi ranking for keyword search over extracted transcript content.
4
+ No external dependencies — hand-rolled BM25 implementation (~30 lines of math).
5
+
6
+ Two search entry points:
7
+ - search(): Legacy path — builds BM25 from in-memory documents (query-time construction)
8
+ - search_from_index(): Persistent index path — loads precomputed BM25 data (scoring only)
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ import math
14
+ from collections.abc import Callable
15
+ from dataclasses import dataclass
16
+ from typing import Any
17
+
18
+ from .exceptions import BM25IndexCorruptedError, ContentStoreCorruptedError
19
+ from .extractor import SearchDocument, SearchDocumentMeta
20
+ from .tokenizer import TOKEN_RE, tokenize
21
+
22
# Search defaults
# Default snippet window size, in characters of document content; this is the
# default ``length`` argument of _best_snippet().
SNIPPET_LENGTH = 300
# Default cap on the number of results returned by search() and search_from_index().
DEFAULT_LIMIT = 10
25
+
26
+
27
class BM25:
    """BM25Okapi ranking of tokenized documents against a query.

    Standard BM25 with term-frequency saturation (``k1``) and document-length
    normalization (``b``). Pure Python; the only dependency is ``math``.

    Documents are addressed positionally: index ``i`` in every internal list
    (term frequencies, lengths, returned scores) refers to the ``i``-th
    document handed to the constructor.

    Args:
        documents: List of tokenized documents (each is a list of terms).
        k1: Term frequency saturation parameter.
        b: Document length normalization parameter.
    """

    def __init__(
        self,
        documents: list[list[str]],
        *,
        k1: float = 1.5,
        b: float = 0.75,
    ) -> None:
        self._k1 = k1
        self._b = b
        self._doc_count = len(documents)
        self._doc_lens = [len(d) for d in documents]
        # max(..., 1) keeps an empty corpus from dividing by zero.
        self._avgdl = sum(self._doc_lens) / max(self._doc_count, 1)

        # Per-doc term frequencies (positional, one dict per document).
        self._term_freqs: list[dict[str, int]] = []
        # Number of docs containing each term.
        self._doc_freqs: dict[str, int] = {}

        for doc in documents:
            tf: dict[str, int] = {}
            for term in doc:
                tf[term] = tf.get(term, 0) + 1
            self._term_freqs.append(tf)
            # Iterating the dict visits each distinct term once, so a term
            # repeated inside one document still adds 1 to its doc frequency.
            for term in tf:
                self._doc_freqs[term] = self._doc_freqs.get(term, 0) + 1

    @property
    def doc_freqs(self) -> dict[str, int]:
        """Number of documents containing each term (live mapping, not a copy)."""
        return self._doc_freqs

    @classmethod
    def from_precomputed(
        cls,
        *,
        term_freqs: list[dict[str, int]],
        doc_freqs: dict[str, int],
        doc_lens: list[int],
        avgdl: float,
        k1: float = 1.5,
        b: float = 0.75,
    ) -> "BM25":
        """Construct BM25 from pre-computed index data (no tokenization).

        This is the fast path for persistent indices: it skips the
        O(total_tokens) work done by ``__init__`` and installs the supplied
        structures directly. The arguments are stored by reference (not
        copied), and the document count is taken from ``len(doc_lens)``.

        Args:
            term_freqs: Per-document term frequency dicts (positional).
            doc_freqs: Number of documents containing each term.
            doc_lens: Per-document token counts (positional).
            avgdl: Average document length across the corpus.
            k1: Term frequency saturation parameter.
            b: Document length normalization parameter.

        Returns:
            A BM25 instance ready for ``score()``.
        """
        # __new__ bypasses __init__, which would otherwise re-count tokens.
        instance = cls.__new__(cls)
        instance._k1 = k1
        instance._b = b
        instance._doc_count = len(doc_lens)
        instance._doc_lens = doc_lens
        instance._avgdl = avgdl
        instance._term_freqs = term_freqs
        instance._doc_freqs = doc_freqs
        return instance

    def to_precomputed(self) -> dict:
        """Export pre-computed data for persistence.

        Every container in the result is a copy, so the caller may mutate or
        serialize the snapshot without affecting this instance.

        Returns dict with keys: term_freqs, doc_freqs, doc_lens, avgdl, k1, b.
        The keys match the keyword arguments of ``from_precomputed``.
        """
        return {
            # Copy each per-document dict too; a shallow list copy would still
            # share the inner dicts with this instance.
            "term_freqs": [dict(tf) for tf in self._term_freqs],
            "doc_freqs": dict(self._doc_freqs),
            "doc_lens": list(self._doc_lens),
            "avgdl": self._avgdl,
            "k1": self._k1,
            "b": self._b,
        }

    def score(self, query: list[str]) -> list[float]:
        """Score all documents against the given query terms.

        Query terms absent from the corpus contribute nothing. A term listed
        more than once in ``query`` is counted once per occurrence.

        Returns list of scores in the same order as documents passed to __init__.
        """
        # Hoist attributes that are invariant across both loops.
        k1 = self._k1
        b = self._b
        avgdl = self._avgdl
        doc_count = self._doc_count
        doc_lens = self._doc_lens
        term_freqs = self._term_freqs
        # A precomputed index can carry avgdl == 0 (for example when every
        # stored length is 0). Dividing by it would raise ZeroDivisionError,
        # so in that case length normalization is skipped altogether.
        normalize_length = avgdl > 0

        scores = [0.0] * doc_count
        for term in query:
            df = self._doc_freqs.get(term)
            if df is None:
                continue
            # The "+ 1.0" inside the log keeps idf positive even for terms
            # that occur in more than half of the documents.
            idf = math.log((doc_count - df + 0.5) / (df + 0.5) + 1.0)
            for i, dl in enumerate(doc_lens):
                tf = term_freqs[i].get(term, 0)
                if tf == 0:
                    continue
                length_norm = 1 - b + b * dl / avgdl if normalize_length else 1.0
                scores[i] += idf * ((tf * (k1 + 1)) / (tf + k1 * length_norm))
        return scores
129
+
130
+
131
def _best_snippet(
    content: str,
    query_tokens: list[str],
    length: int = SNIPPET_LENGTH,
    *,
    doc_freqs: dict[str, int] | None = None,
) -> str:
    """Return a snippet of ``content`` anchored on its most distinctive query term.

    Content no longer than ``length`` is returned unchanged. Otherwise the
    text is scanned once with TOKEN_RE; every token whose lowercase form is in
    ``query_tokens`` is a candidate anchor, and the candidate with the lowest
    ``doc_freqs`` count wins (the earliest occurrence on ties). Without
    ``doc_freqs`` every candidate counts as 0, so the first match is used.

    Matching runs over the original text and lowercases one token at a time,
    so match offsets always index into ``content`` itself even where
    lowercasing would change a string's length.

    Args:
        content: Full document text.
        query_tokens: Tokenized query terms.
        length: Snippet window size in characters.
        doc_freqs: Optional term -> document-frequency map used to rank terms.

    Returns:
        A window around the chosen anchor, or the first ``length`` characters
        when no query term occurs in ``content``.
    """
    if len(content) <= length:
        return content

    wanted = set(query_tokens)
    frequencies = doc_freqs or {}

    anchor: int | None = None
    lowest_df = float("inf")

    for hit in TOKEN_RE.finditer(content):
        term = hit.group().lower()
        if term in wanted:
            df = frequencies.get(term, 0)
            if df < lowest_df:
                anchor, lowest_df = hit.start(), df
                if df <= 1:
                    # Found in at most one document: nothing later can beat it.
                    break

    if anchor is None:
        # None of the query terms appear in the text; show its opening instead.
        return content[:length]
    return _extract_window(content, anchor, length)
175
+
176
+
177
+ def _extract_window(content: str, center: int, length: int) -> str:
178
+ """Extract a snippet window centered on a character position."""
179
+ start = max(0, center - length // 2)
180
+ end = start + length
181
+ if end > len(content):
182
+ end = len(content)
183
+ start = max(0, end - length)
184
+ snippet = content[start:end]
185
+ prefix = "..." if start > 0 else ""
186
+ suffix = "..." if end < len(content) else ""
187
+ return prefix + snippet + suffix
188
+
189
+
190
@dataclass
class SearchResult:
    """A single search result.

    Built by search() and search_from_index(), one instance per returned
    document, in descending score order.
    """

    # Name of the session the matching document belongs to.
    session_name: str
    # Identifier of that session.
    session_id: str
    # BM25 score; both search entry points round it to 4 decimal places.
    score: float
    # Excerpt of the document content produced by _best_snippet().
    snippet: str
    # Path of the transcript the document was extracted from.
    transcript_path: str
    # Metadata dict copied through from the source document / metadata record.
    metadata: dict[str, Any]
200
+
201
+
202
def search(
    query: str,
    documents: list[SearchDocument],
    *,
    limit: int = DEFAULT_LIMIT,
) -> list[SearchResult]:
    """Search documents using BM25.

    Builds the BM25 index at query time from the provided documents, so the
    cost grows with the total token count of the corpus on every call.
    Documents with a zero score are never returned.

    Args:
        query: Search query string.
        documents: List of SearchDocument to search over. Pre-tokenized
            documents (``doc.tokens`` not None) are used as-is; the rest are
            tokenized from ``doc.content``.
        limit: Maximum number of results to return. Zero or a negative value
            yields no results.

    Returns:
        List of SearchResult sorted by score descending. Empty when the query
        is blank, tokenizes to nothing, there are no documents, or ``limit``
        is not positive.
    """
    # A negative limit must not reach the slice below: scored[:-n] would
    # silently drop the n lowest-ranked hits instead of returning nothing.
    if not query.strip() or not documents or limit <= 0:
        return []

    query_tokens = tokenize(query)
    if not query_tokens:
        return []

    doc_tokens = [doc.tokens if doc.tokens is not None else tokenize(doc.content) for doc in documents]
    bm25 = BM25(doc_tokens)
    scores = bm25.score(query_tokens)

    # Keep only matching documents. sorted() is stable, so equally scored
    # documents stay in their original corpus order.
    ranked = sorted(
        ((s, doc) for s, doc in zip(scores, documents) if s > 0),
        key=lambda pair: pair[0],
        reverse=True,
    )

    # Snippets are built only for the documents that are actually returned.
    return [
        SearchResult(
            session_name=doc.session_name,
            session_id=doc.session_id,
            score=round(s, 4),
            snippet=_best_snippet(doc.content, query_tokens, doc_freqs=bm25.doc_freqs),
            transcript_path=doc.transcript_path,
            metadata=doc.metadata,
        )
        for s, doc in ranked[:limit]
    ]
249
+
250
+
251
def search_from_index(
    query: str,
    *,
    doc_keys: list[str],
    term_freqs: list[dict[str, int]],
    doc_freqs: dict[str, int],
    doc_lens: list[int],
    avgdl: float,
    k1: float = 1.5,
    b: float = 0.75,
    content_loader: Callable[[list[str]], dict[str, str]],
    doc_metadata: dict[str, SearchDocumentMeta],
    limit: int = DEFAULT_LIMIT,
) -> list[SearchResult]:
    """Search using a pre-computed persistent BM25 index.

    This is the fast path: it scores against precomputed data structures and
    only afterwards loads content, and only for the top-K documents, to build
    their snippets.

    Args:
        query: Search query string.
        doc_keys: Positional document keys (transcript_paths) matching term_freqs/doc_lens.
        term_freqs: Per-document term frequency dicts (positional).
        doc_freqs: Global document frequency dict.
        doc_lens: Per-document token counts (positional).
        avgdl: Average document length across corpus.
        k1: BM25 term saturation parameter.
        b: BM25 length normalization parameter.
        content_loader: Callable that takes a list of doc keys and returns {key: content}.
        doc_metadata: Mapping of transcript_path -> SearchDocumentMeta.
        limit: Maximum number of results. Zero or a negative value yields no
            results (index validation still runs first).

    Returns:
        List of SearchResult sorted by score descending.

    Raises:
        BM25IndexCorruptedError: If doc_keys, term_freqs and doc_lens differ in
            length, or doc_keys has entries not in doc_metadata.
        ContentStoreCorruptedError: If content_loader is missing a top-K key.
    """
    if not query.strip() or not doc_keys:
        return []

    query_tokens = tokenize(query)
    if not query_tokens:
        return []

    # Validate invariant: the three positional arrays describe the same
    # documents. A mismatch would otherwise surface as an IndexError inside
    # scoring, or as documents silently dropped by zip() below.
    if not (len(doc_keys) == len(term_freqs) == len(doc_lens)):
        raise BM25IndexCorruptedError(
            "bm25_index",
            f"index arrays disagree on document count ({len(doc_keys)} doc keys, "
            f"{len(term_freqs)} term-frequency entries, {len(doc_lens)} document lengths). "
            "Run 'forge search rebuild-index' to fix.",
        )

    # Validate invariant: every indexed doc must have metadata
    missing_meta = [k for k in doc_keys if k not in doc_metadata]
    if missing_meta:
        raise BM25IndexCorruptedError(
            "bm25_index",
            f"{len(missing_meta)} indexed documents missing from metadata store. "
            "Run 'forge search rebuild-index' to fix.",
        )

    # A negative limit must not reach the slice below: scored[:-n] would drop
    # the n lowest-ranked hits instead of returning nothing.
    if limit <= 0:
        return []

    # Score using precomputed data (no token iteration)
    bm25 = BM25.from_precomputed(
        term_freqs=term_freqs,
        doc_freqs=doc_freqs,
        doc_lens=doc_lens,
        avgdl=avgdl,
        k1=k1,
        b=b,
    )
    scores = bm25.score(query_tokens)

    # Pair scores with doc keys, filter zero scores, sort descending.
    # The sort is stable, so ties keep their index order.
    scored = [(s, key) for s, key in zip(scores, doc_keys) if s > 0]
    scored.sort(key=lambda x: x[0], reverse=True)
    top_k = scored[:limit]

    if not top_k:
        return []

    # Lazy content loading: only fetch content for top-K results
    top_keys = [key for _, key in top_k]
    content_map = content_loader(top_keys)

    # Validate content availability
    missing_content = [k for k in top_keys if k not in content_map]
    if missing_content:
        raise ContentStoreCorruptedError(
            "content",
            f"{len(missing_content)} top-K documents missing from content store. "
            "Run 'forge search rebuild-index' to fix.",
        )

    results: list[SearchResult] = []
    for s, key in top_k:
        meta = doc_metadata[key]
        content = content_map[key]
        results.append(
            SearchResult(
                session_name=meta.session_name,
                session_id=meta.session_id,
                score=round(s, 4),
                snippet=_best_snippet(content, query_tokens, doc_freqs=doc_freqs),
                transcript_path=meta.transcript_path,
                metadata=meta.metadata,
            )
        )
    return results
forge/search/exceptions.py ADDED
@@ -0,0 +1,51 @@
1
+ """Exceptions for the Forge search module.
2
+
3
+ Follows the forge.core.state exception hierarchy:
4
+ - SearchError is the module-level base
5
+ - IndexStateCorruptedError inherits StateCorruptedError for consistency
6
+ with BackendRegistryCorruptedError and other state corruption errors
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ from forge.core.state import StateCorruptedError
12
+
13
+
14
class SearchError(Exception):
    """Base exception for search module operations.

    Note that the *CorruptedError classes defined in this module derive from
    StateCorruptedError rather than from this class, so ``except SearchError``
    does not catch them.
    """
16
+
17
+
18
class IndexStateCorruptedError(StateCorruptedError):
    """Signals that the index state file could not be parsed.

    Adds nothing to StateCorruptedError; the (path, reason) constructor
    signature is inherited from it.
    """
25
+
26
+
27
class SearchDocumentStoreCorruptedError(StateCorruptedError):
    """Signals that the document store file could not be parsed.

    Adds nothing to StateCorruptedError; the (path, reason) constructor
    signature is inherited from it.
    """
34
+
35
+
36
class BM25IndexCorruptedError(StateCorruptedError):
    """Signals that the BM25 index file could not be parsed or is inconsistent.

    Adds nothing to StateCorruptedError; the (path, reason) constructor
    signature is inherited from it.
    """
43
+
44
+
45
class ContentStoreCorruptedError(StateCorruptedError):
    """Signals that the content store file could not be parsed or is inconsistent.

    Adds nothing to StateCorruptedError; the (path, reason) constructor
    signature is inherited from it.
    """
@@ -0,0 +1,234 @@
1
+ """Content extraction from JSONL transcripts for search indexing.
2
+
3
+ Extracts searchable text from Forge transcript artifacts, producing one
4
+ SearchDocument per transcript file. Content extraction rules (design.md §5.5):
5
+ - User/assistant text messages: fully indexed
6
+ - Tool inputs (file paths, commands): truncated to 100 chars
7
+ - Tool results: truncated to 500 chars
8
+
9
+ Uses shared parsing primitives from forge.core.transcript.
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import json
15
+ import logging
16
+ from dataclasses import asdict, dataclass, field
17
+ from pathlib import Path
18
+ from typing import Any
19
+
20
+ from forge.core.state import now_iso
21
+ from forge.core.transcript import parse_jsonl_transcript, truncate
22
+
23
+ from .tokenizer import tokenize
24
+
25
+ # --- Data classes ---
26
+
27
+ logger = logging.getLogger(__name__)
28
+
29
+ # Truncation limits
30
+ TOOL_RESULT_TRUNCATE_CHARS = 500
31
+ TOOL_ARG_TRUNCATE_CHARS = 100
32
+
33
+
34
@dataclass
class SearchDocumentMeta:
    """Lightweight description of an indexed transcript.

    Holds identification and bookkeeping fields only; the extracted text
    and its tokens are deliberately absent. Intended for the v2 document
    store and for building results in search_from_index().
    """

    # Transcript location, kept as a plain string so it can be stored in
    # JSON and used as a key.
    transcript_path: str
    session_name: str
    session_id: str
    # Moment of extraction, as an ISO8601 string.
    extracted_at: str
    # Free-form extra information; each instance gets its own dict.
    metadata: dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> dict[str, Any]:
        """Return every field as a plain dict ready for JSON storage."""
        as_mapping = asdict(self)
        return as_mapping
51
+
52
+
53
@dataclass
class SearchDocument:
    """Complete extraction result for one transcript file.

    Carries the extracted text and (optionally) its tokens alongside the
    identifying fields. Produced at extraction time; decompose_document()
    splits it into the metadata, term-frequency and content pieces used by
    the three-store architecture.
    """

    # Transcript location, kept as a plain string so it can be stored in
    # JSON and used as a key.
    transcript_path: str
    session_name: str
    session_id: str
    # Entire extracted text that BM25 indexing operates on.
    content: str
    # Moment of extraction, as an ISO8601 string.
    extracted_at: str
    # Free-form extra information; each instance gets its own dict.
    metadata: dict[str, Any] = field(default_factory=dict)
    # Tokenization cached at extraction time; None when not yet computed.
    tokens: list[str] | None = None

    def to_dict(self) -> dict[str, Any]:
        """Return every field as a plain dict ready for JSON storage."""
        as_mapping = asdict(self)
        return as_mapping
73
+
74
+
75
+ def extract_document(
76
+ transcript_path: Path,
77
+ session_name: str,
78
+ session_id: str,
79
+ worktree_path: str,
80
+ ) -> SearchDocument:
81
+ """Extract searchable content from a JSONL transcript file.
82
+
83
+ Parses each JSONL line and extracts:
84
+ - User/assistant text messages (full)
85
+ - Tool use summaries (name + key args, truncated)
86
+ - Tool results (truncated to 500 chars)
87
+
88
+ Args:
89
+ transcript_path: Absolute path to the .jsonl transcript file.
90
+ session_name: Forge session name.
91
+ session_id: Claude session UUID.
92
+ worktree_path: Worktree path where session ran.
93
+
94
+ Returns:
95
+ SearchDocument with extracted content and metadata.
96
+
97
+ Raises:
98
+ FileNotFoundError: If transcript_path does not exist.
99
+ """
100
+ if not transcript_path.is_file():
101
+ raise FileNotFoundError(str(transcript_path))
102
+
103
+ entries = parse_jsonl_transcript(transcript_path)
104
+ parts: list[str] = []
105
+ message_count = 0
106
+ first_ts = ""
107
+ last_ts = ""
108
+
109
+ for entry in entries:
110
+ extracted = _extract_entry_text(entry)
111
+ if extracted is None:
112
+ continue
113
+
114
+ role, text, timestamp = extracted
115
+ parts.append(f"[{role}] {text}")
116
+ message_count += 1
117
+
118
+ if timestamp:
119
+ if not first_ts:
120
+ first_ts = timestamp
121
+ last_ts = timestamp
122
+
123
+ content = "\n".join(parts)
124
+
125
+ return SearchDocument(
126
+ transcript_path=str(transcript_path),
127
+ session_name=session_name,
128
+ session_id=session_id,
129
+ content=content,
130
+ extracted_at=now_iso(),
131
+ metadata={
132
+ "message_count": message_count,
133
+ "first_timestamp": first_ts,
134
+ "last_timestamp": last_ts,
135
+ "worktree_path": worktree_path,
136
+ },
137
+ tokens=tokenize(content),
138
+ )
139
+
140
+
141
+ def _extract_entry_text(entry: dict[str, Any]) -> tuple[str, str, str] | None:
142
+ """Extract text content from a single transcript entry.
143
+
144
+ Returns:
145
+ (role, text, timestamp) tuple, or None if entry is not a valid message.
146
+ """
147
+ message = entry.get("message")
148
+ if not isinstance(message, dict):
149
+ return None
150
+
151
+ role = message.get("role")
152
+ if role not in ("user", "assistant"):
153
+ return None
154
+
155
+ content = message.get("content")
156
+ if not isinstance(content, list):
157
+ return None
158
+
159
+ text_parts: list[str] = []
160
+
161
+ for block in content:
162
+ if not isinstance(block, dict):
163
+ continue
164
+
165
+ block_type = block.get("type")
166
+
167
+ if block_type == "text":
168
+ t = block.get("text")
169
+ if isinstance(t, str) and t:
170
+ text_parts.append(t)
171
+
172
+ elif block_type == "tool_use":
173
+ name = block.get("name", "unknown")
174
+ inp = block.get("input", {})
175
+ if isinstance(inp, dict):
176
+ path = inp.get("file_path") or inp.get("path")
177
+ cmd = inp.get("command")
178
+ if path:
179
+ text_parts.append(f"{name}(path={truncate(str(path), TOOL_ARG_TRUNCATE_CHARS)})")
180
+ elif cmd:
181
+ text_parts.append(f"{name}(command={truncate(str(cmd), TOOL_ARG_TRUNCATE_CHARS)})")
182
+ else:
183
+ text_parts.append(f"{name}(...)")
184
+ else:
185
+ text_parts.append(f"{name}(...)")
186
+
187
+ elif block_type == "tool_result":
188
+ result = block.get("content", "")
189
+ # Handle non-string tool results (dict/list in some Claude versions)
190
+ if not isinstance(result, str):
191
+ try:
192
+ result = json.dumps(result, ensure_ascii=False)
193
+ except (TypeError, ValueError):
194
+ result = str(result)
195
+ if result:
196
+ text_parts.append(f"[result: {truncate(result, TOOL_RESULT_TRUNCATE_CHARS)}]")
197
+
198
+ if not text_parts:
199
+ return None
200
+
201
+ timestamp = entry.get("timestamp", "")
202
+ if not isinstance(timestamp, str):
203
+ timestamp = ""
204
+
205
+ return role, " ".join(text_parts), timestamp
206
+
207
+
208
+ # --- Decomposition (full document → three-store components) ---
209
+
210
+
211
def decompose_document(
    doc: SearchDocument,
) -> tuple[SearchDocumentMeta, dict[str, int], int, str]:
    """Split a SearchDocument into the pieces the three stores keep.

    Cached tokens on ``doc`` are reused when present; otherwise the content
    is tokenized here.

    Returns:
        (metadata, term_freq, doc_len, content) where:
        - metadata: SearchDocumentMeta for the document store
        - term_freq: occurrences of each token, for the BM25 index store
        - doc_len: total token count, for BM25 length normalization
        - content: the document's content string, for the content store
    """
    token_list = tokenize(doc.content) if doc.tokens is None else doc.tokens

    counts: dict[str, int] = {}
    for term in token_list:
        if term in counts:
            counts[term] += 1
        else:
            counts[term] = 1

    document_meta = SearchDocumentMeta(
        transcript_path=doc.transcript_path,
        session_name=doc.session_name,
        session_id=doc.session_id,
        extracted_at=doc.extracted_at,
        metadata=doc.metadata,
    )
    doc_len = len(token_list)
    return document_meta, counts, doc_len, doc.content