multi-forge 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- forge/__init__.py +3 -0
- forge/_extensions/agents/.gitkeep +0 -0
- forge/_extensions/commands/.gitkeep +0 -0
- forge/_extensions/skills/analyze/SKILL.md +87 -0
- forge/_extensions/skills/challenge/SKILL.md +91 -0
- forge/_extensions/skills/consensus/SKILL.md +120 -0
- forge/_extensions/skills/consensus/resources/code_consensus_evaluation.md +94 -0
- forge/_extensions/skills/consensus/resources/consensus_evaluation.md +70 -0
- forge/_extensions/skills/consensus/resources/synthesis.md +101 -0
- forge/_extensions/skills/debate/SKILL.md +116 -0
- forge/_extensions/skills/debate/resources/code_debate_evaluation.md +101 -0
- forge/_extensions/skills/debate/resources/debate_evaluation.md +90 -0
- forge/_extensions/skills/panel/SKILL.md +141 -0
- forge/_extensions/skills/panel/resources/synthesis.md +103 -0
- forge/_extensions/skills/qa/SKILL.md +704 -0
- forge/_extensions/skills/qa/resources/checklist/0-enable.md +78 -0
- forge/_extensions/skills/qa/resources/checklist/1-preflight.md +24 -0
- forge/_extensions/skills/qa/resources/checklist/10-resume.md +143 -0
- forge/_extensions/skills/qa/resources/checklist/11-config.md +150 -0
- forge/_extensions/skills/qa/resources/checklist/12-search.md +58 -0
- forge/_extensions/skills/qa/resources/checklist/13-guard.md +237 -0
- forge/_extensions/skills/qa/resources/checklist/14-workflow.md +305 -0
- forge/_extensions/skills/qa/resources/checklist/15-skills.md +155 -0
- forge/_extensions/skills/qa/resources/checklist/16-handoff.md +224 -0
- forge/_extensions/skills/qa/resources/checklist/17-info.md +50 -0
- forge/_extensions/skills/qa/resources/checklist/18-disable.md +84 -0
- forge/_extensions/skills/qa/resources/checklist/19-uninstall.md +146 -0
- forge/_extensions/skills/qa/resources/checklist/2-extensions.md +188 -0
- forge/_extensions/skills/qa/resources/checklist/20-cleanup.md +36 -0
- forge/_extensions/skills/qa/resources/checklist/3-auth.md +234 -0
- forge/_extensions/skills/qa/resources/checklist/4-proxy.md +481 -0
- forge/_extensions/skills/qa/resources/checklist/5-session.md +541 -0
- forge/_extensions/skills/qa/resources/checklist/6-hooks.md +275 -0
- forge/_extensions/skills/qa/resources/checklist/7-costs.md +309 -0
- forge/_extensions/skills/qa/resources/checklist/8-status-line.md +174 -0
- forge/_extensions/skills/qa/resources/checklist/9-direct-commands.md +146 -0
- forge/_extensions/skills/qa/resources/checklist.md +103 -0
- forge/_extensions/skills/qa/resources/report-template.md +62 -0
- forge/_extensions/skills/qa/scripts/start-container.sh +529 -0
- forge/_extensions/skills/qa/scripts/walkthrough-state.py +1137 -0
- forge/_extensions/skills/review/SKILL.md +125 -0
- forge/_extensions/skills/review/references/claude-4.6.md +474 -0
- forge/_extensions/skills/review/references/claude-4.7.md +710 -0
- forge/_extensions/skills/review/references/gemini-3.1.md +546 -0
- forge/_extensions/skills/review/references/gpt-5.5.md +490 -0
- forge/_extensions/skills/review/references/skills-writing-guide.md +1588 -0
- forge/_extensions/skills/review/resources/code-anthropic.md +160 -0
- forge/_extensions/skills/review/resources/code-gemini.md +184 -0
- forge/_extensions/skills/review/resources/code-openai.md +203 -0
- forge/_extensions/skills/review/resources/code.md +160 -0
- forge/_extensions/skills/review-docs/SKILL.md +121 -0
- forge/_extensions/skills/review-docs/resources/docs-anthropic.md +170 -0
- forge/_extensions/skills/review-docs/resources/docs-gemini.md +204 -0
- forge/_extensions/skills/review-docs/resources/docs-openai.md +231 -0
- forge/_extensions/skills/review-docs/resources/docs.md +170 -0
- forge/_extensions/skills/smoke-test/SKILL.md +27 -0
- forge/_extensions/skills/smoke-test/scripts/smoke-test.sh +118 -0
- forge/_extensions/skills/understand/SKILL.md +148 -0
- forge/_extensions/skills/understand/resources/code-anthropic.md +163 -0
- forge/_extensions/skills/understand/resources/code-gemini.md +194 -0
- forge/_extensions/skills/understand/resources/code-openai.md +181 -0
- forge/_extensions/skills/understand/resources/code.md +163 -0
- forge/_extensions/skills/understand/resources/docs-anthropic.md +177 -0
- forge/_extensions/skills/understand/resources/docs-gemini.md +202 -0
- forge/_extensions/skills/understand/resources/docs-openai.md +191 -0
- forge/_extensions/skills/understand/resources/docs.md +177 -0
- forge/_extensions/skills/walkthrough/SKILL.md +599 -0
- forge/_extensions/skills/walkthrough/resources/checklist.md +765 -0
- forge/_extensions/skills/walkthrough/scripts/run-in-repo.sh +118 -0
- forge/_extensions/skills/walkthrough/scripts/setup-test-repo.sh +198 -0
- forge/_extensions/skills/walkthrough/scripts/walkthrough-state.py +1137 -0
- forge/backend/__init__.py +174 -0
- forge/backend/adapters/__init__.py +38 -0
- forge/backend/adapters/litellm.py +158 -0
- forge/backend/creation.py +89 -0
- forge/backend/registry.py +178 -0
- forge/cli/__init__.py +16 -0
- forge/cli/auth.py +483 -0
- forge/cli/backend.py +298 -0
- forge/cli/claude.py +411 -0
- forge/cli/config_cmd.py +303 -0
- forge/cli/extensions.py +1001 -0
- forge/cli/gc.py +165 -0
- forge/cli/guard.py +1018 -0
- forge/cli/guards.py +106 -0
- forge/cli/handoff.py +110 -0
- forge/cli/hooks/__init__.py +36 -0
- forge/cli/hooks/_group.py +20 -0
- forge/cli/hooks/_helpers.py +149 -0
- forge/cli/hooks/commands.py +1677 -0
- forge/cli/hooks/direct_commands.py +1304 -0
- forge/cli/hooks/install.py +232 -0
- forge/cli/hooks/policy.py +151 -0
- forge/cli/hooks/read_hygiene.py +74 -0
- forge/cli/hooks/verification.py +370 -0
- forge/cli/logs.py +406 -0
- forge/cli/main.py +292 -0
- forge/cli/proxy.py +1821 -0
- forge/cli/proxy_costs.py +313 -0
- forge/cli/search.py +416 -0
- forge/cli/session.py +892 -0
- forge/cli/session_addendum.py +81 -0
- forge/cli/session_fork.py +750 -0
- forge/cli/session_handoff.py +141 -0
- forge/cli/session_lifecycle.py +2053 -0
- forge/cli/session_manage.py +1336 -0
- forge/cli/session_memory.py +201 -0
- forge/cli/status_line.py +1398 -0
- forge/cli/workflow.py +1964 -0
- forge/config/__init__.py +110 -0
- forge/config/dataclass_utils.py +88 -0
- forge/config/defaults/__init__.py +0 -0
- forge/config/defaults/backends/__init__.py +0 -0
- forge/config/defaults/backends/litellm.yaml +196 -0
- forge/config/defaults/templates/__init__.py +0 -0
- forge/config/defaults/templates/litellm-anthropic-local.yaml +33 -0
- forge/config/defaults/templates/litellm-anthropic.yaml +24 -0
- forge/config/defaults/templates/litellm-gemini-flash-local.yaml +37 -0
- forge/config/defaults/templates/litellm-gemini-local.yaml +32 -0
- forge/config/defaults/templates/litellm-gemini-test.yaml +34 -0
- forge/config/defaults/templates/litellm-gemini.yaml +21 -0
- forge/config/defaults/templates/litellm-openai-codex-local.yaml +36 -0
- forge/config/defaults/templates/litellm-openai-local.yaml +38 -0
- forge/config/defaults/templates/litellm-openai.yaml +28 -0
- forge/config/defaults/templates/openrouter-anthropic.yaml +23 -0
- forge/config/defaults/templates/openrouter-deepseek.yaml +26 -0
- forge/config/defaults/templates/openrouter-gemini-flash.yaml +26 -0
- forge/config/defaults/templates/openrouter-gemini.yaml +23 -0
- forge/config/defaults/templates/openrouter-glm.yaml +23 -0
- forge/config/defaults/templates/openrouter-kimi.yaml +30 -0
- forge/config/defaults/templates/openrouter-minimax.yaml +26 -0
- forge/config/defaults/templates/openrouter-openai-codex.yaml +23 -0
- forge/config/defaults/templates/openrouter-openai.yaml +28 -0
- forge/config/defaults/templates/openrouter-qwen.yaml +25 -0
- forge/config/loader.py +675 -0
- forge/config/schema.py +448 -0
- forge/core/__init__.py +5 -0
- forge/core/auth/__init__.py +67 -0
- forge/core/auth/capabilities.py +219 -0
- forge/core/auth/credentials_file.py +244 -0
- forge/core/auth/protocols.py +18 -0
- forge/core/auth/secrets.py +243 -0
- forge/core/auth/template_secrets.py +112 -0
- forge/core/data/__init__.py +5 -0
- forge/core/data/model_catalog.yaml +1522 -0
- forge/core/data/pricing.yaml +140 -0
- forge/core/data/system_prompt_addendums/__init__.py +0 -0
- forge/core/data/system_prompt_addendums/gemini.md +330 -0
- forge/core/data/system_prompt_addendums/openai.md +328 -0
- forge/core/llm/__init__.py +231 -0
- forge/core/llm/clients/__init__.py +14 -0
- forge/core/llm/clients/base.py +115 -0
- forge/core/llm/clients/litellm.py +619 -0
- forge/core/llm/clients/openai_compat.py +244 -0
- forge/core/llm/clients/openrouter.py +234 -0
- forge/core/llm/credentials.py +439 -0
- forge/core/llm/detection.py +86 -0
- forge/core/llm/errors.py +44 -0
- forge/core/llm/protocols.py +80 -0
- forge/core/llm/types.py +176 -0
- forge/core/logging.py +146 -0
- forge/core/models/__init__.py +91 -0
- forge/core/models/catalog.py +467 -0
- forge/core/models/pricing.py +165 -0
- forge/core/models/types.py +167 -0
- forge/core/naming.py +212 -0
- forge/core/ops/__init__.py +73 -0
- forge/core/ops/context.py +141 -0
- forge/core/ops/gc.py +802 -0
- forge/core/ops/proxy.py +146 -0
- forge/core/ops/resolution.py +135 -0
- forge/core/ops/session.py +344 -0
- forge/core/ops/session_context.py +548 -0
- forge/core/paths.py +38 -0
- forge/core/process.py +54 -0
- forge/core/reactive/__init__.py +38 -0
- forge/core/reactive/cost_tracking.py +300 -0
- forge/core/reactive/env.py +180 -0
- forge/core/reactive/proxy.py +78 -0
- forge/core/reactive/routing.py +622 -0
- forge/core/reactive/session_runner.py +185 -0
- forge/core/reactive/structured_output.py +62 -0
- forge/core/reactive/tagger.py +94 -0
- forge/core/reactive/throttle.py +132 -0
- forge/core/state/__init__.py +59 -0
- forge/core/state/exceptions.py +59 -0
- forge/core/state/io.py +140 -0
- forge/core/state/lock.py +99 -0
- forge/core/state/timestamps.py +60 -0
- forge/core/transcript.py +78 -0
- forge/core/typing_helpers.py +24 -0
- forge/core/workqueue/__init__.py +67 -0
- forge/core/workqueue/queue.py +552 -0
- forge/core/workqueue/types.py +63 -0
- forge/guard/__init__.py +26 -0
- forge/guard/deterministic/__init__.py +26 -0
- forge/guard/deterministic/base.py +158 -0
- forge/guard/deterministic/coding_standards.py +256 -0
- forge/guard/deterministic/registry.py +148 -0
- forge/guard/deterministic/tdd.py +171 -0
- forge/guard/engine.py +216 -0
- forge/guard/protocols.py +91 -0
- forge/guard/queries.py +96 -0
- forge/guard/semantic/__init__.py +34 -0
- forge/guard/semantic/promotion.py +18 -0
- forge/guard/semantic/supervisor.py +813 -0
- forge/guard/semantic/verdict.py +183 -0
- forge/guard/store.py +124 -0
- forge/guard/team/__init__.py +6 -0
- forge/guard/team/config.py +24 -0
- forge/guard/team/handlers.py +209 -0
- forge/guard/team/prompts.py +41 -0
- forge/guard/types.py +125 -0
- forge/guard/workflow/__init__.py +17 -0
- forge/guard/workflow/branches.py +67 -0
- forge/guard/workflow/config.py +63 -0
- forge/guard/workflow/divergence.py +113 -0
- forge/guard/workflow/policy.py +87 -0
- forge/guard/workflow/stages.py +205 -0
- forge/install/__init__.py +55 -0
- forge/install/cli.py +281 -0
- forge/install/exceptions.py +163 -0
- forge/install/hooks.py +109 -0
- forge/install/installer.py +1037 -0
- forge/install/models.py +321 -0
- forge/install/preset.py +272 -0
- forge/install/settings_merge.py +831 -0
- forge/install/tracking.py +238 -0
- forge/install/version.py +141 -0
- forge/proxy/__init__.py +0 -0
- forge/proxy/base_client.py +181 -0
- forge/proxy/client_adapter.py +476 -0
- forge/proxy/client_factory.py +531 -0
- forge/proxy/converters.py +1206 -0
- forge/proxy/cost_logger.py +132 -0
- forge/proxy/cost_tracker.py +242 -0
- forge/proxy/data_models.py +338 -0
- forge/proxy/error_hints.py +92 -0
- forge/proxy/metrics.py +222 -0
- forge/proxy/model_spec.py +158 -0
- forge/proxy/proxies.py +333 -0
- forge/proxy/proxy_identity.py +134 -0
- forge/proxy/proxy_orchestrator.py +1018 -0
- forge/proxy/proxy_startup.py +54 -0
- forge/proxy/server.py +1561 -0
- forge/proxy/utils.py +537 -0
- forge/review/__init__.py +6 -0
- forge/review/adversarial.py +111 -0
- forge/review/consensus.py +236 -0
- forge/review/engine.py +356 -0
- forge/review/models.py +437 -0
- forge/review/resources/__init__.py +5 -0
- forge/review/resources/codereview-performance.md +85 -0
- forge/review/resources/codereview-quick.md +75 -0
- forge/review/resources/codereview-security.md +92 -0
- forge/review/resources/codereview.md +85 -0
- forge/review/resources/docreview-quick.md +75 -0
- forge/review/resources/docreview.md +86 -0
- forge/review/resources/thinkdeep.md +89 -0
- forge/review/routing.py +368 -0
- forge/review/synthesis.py +73 -0
- forge/runtime_config.py +438 -0
- forge/search/__init__.py +55 -0
- forge/search/bm25_store.py +264 -0
- forge/search/content_store.py +197 -0
- forge/search/engine.py +352 -0
- forge/search/exceptions.py +51 -0
- forge/search/extractor.py +234 -0
- forge/search/index_state.py +295 -0
- forge/search/store.py +215 -0
- forge/search/tokenizer.py +24 -0
- forge/session/__init__.py +130 -0
- forge/session/active.py +339 -0
- forge/session/artifacts.py +202 -0
- forge/session/claude/__init__.py +50 -0
- forge/session/claude/cleanup.py +105 -0
- forge/session/claude/invoke.py +236 -0
- forge/session/claude/paths.py +200 -0
- forge/session/cleanup.py +216 -0
- forge/session/config.py +34 -0
- forge/session/direct_model.py +107 -0
- forge/session/effective.py +169 -0
- forge/session/exceptions.py +255 -0
- forge/session/handoff.py +881 -0
- forge/session/handoff_agent.py +544 -0
- forge/session/hooks/__init__.py +35 -0
- forge/session/hooks/models.py +73 -0
- forge/session/hooks/session_start.py +507 -0
- forge/session/identity.py +84 -0
- forge/session/index.py +553 -0
- forge/session/manager.py +1506 -0
- forge/session/models.py +572 -0
- forge/session/overrides.py +344 -0
- forge/session/plan_resolution.py +286 -0
- forge/session/prev_sessions.py +128 -0
- forge/session/store.py +431 -0
- forge/session/validation.py +47 -0
- forge/session/worktree/__init__.py +65 -0
- forge/session/worktree/cleanup.py +262 -0
- forge/session/worktree/config_copy.py +203 -0
- forge/session/worktree/create.py +332 -0
- forge/sidecar/__init__.py +29 -0
- forge/sidecar/container.py +161 -0
- forge/sidecar/docker.py +86 -0
- forge/sidecar/secrets.py +19 -0
- multi_forge-0.2.0.dist-info/METADATA +242 -0
- multi_forge-0.2.0.dist-info/RECORD +311 -0
- multi_forge-0.2.0.dist-info/WHEEL +4 -0
- multi_forge-0.2.0.dist-info/entry_points.txt +2 -0
- multi_forge-0.2.0.dist-info/licenses/LICENSE +203 -0
- multi_forge-0.2.0.dist-info/licenses/NOTICE +14 -0
|
@@ -0,0 +1,264 @@
|
|
|
1
|
+
"""Persistent BM25 index store.
|
|
2
|
+
|
|
3
|
+
Persists precomputed BM25 data structures (term frequencies, document
|
|
4
|
+
frequencies, corpus stats) so queries only run scoring, not index
|
|
5
|
+
construction.
|
|
6
|
+
|
|
7
|
+
Store location: <project_root>/.forge/search-index/bm25_index.json
|
|
8
|
+
|
|
9
|
+
Follows the same patterns as SearchDocumentStore/IndexStateStore:
|
|
10
|
+
atomic writes, file locking, schema versioning, self-healing on missing file.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
import json
|
|
16
|
+
import logging
|
|
17
|
+
from dataclasses import asdict, dataclass, field
|
|
18
|
+
from pathlib import Path
|
|
19
|
+
from typing import Any
|
|
20
|
+
|
|
21
|
+
from forge.core.state import (
|
|
22
|
+
SchemaVersionError,
|
|
23
|
+
atomic_write_json,
|
|
24
|
+
file_lock_for_target,
|
|
25
|
+
now_iso,
|
|
26
|
+
)
|
|
27
|
+
|
|
28
|
+
from .exceptions import BM25IndexCorruptedError
|
|
29
|
+
from .index_state import SEARCH_INDEX_DIR
|
|
30
|
+
|
|
31
|
+
logger = logging.getLogger(__name__)
|
|
32
|
+
|
|
33
|
+
BM25_INDEX_FILENAME = "bm25_index.json"
|
|
34
|
+
BM25_INDEX_VERSION = 1
|
|
35
|
+
|
|
36
|
+
# Bump when TOKEN_RE or tokenize() logic changes — mismatch forces rebuild
|
|
37
|
+
# to prevent silently wrong scores.
|
|
38
|
+
TOKENIZER_ID = "v1"
|
|
39
|
+
|
|
40
|
+
STORE_LOCK_TIMEOUT_S = 5.0
|
|
41
|
+
HANDLER_LOCK_TIMEOUT_S = 1.0
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
@dataclass
class BM25IndexData:
    """In-memory form of the persisted BM25 index.

    The three per-document lists are parallel: position ``i`` in
    ``doc_keys``, ``doc_lens`` and ``term_freqs`` always describes the
    same document.
    """

    # Per-document, index-aligned lists.
    doc_keys: list[str] = field(default_factory=list)
    doc_lens: list[int] = field(default_factory=list)
    term_freqs: list[dict[str, int]] = field(default_factory=list)
    # term -> number of documents in which the term occurs.
    doc_freqs: dict[str, int] = field(default_factory=dict)
    # Mean of doc_lens (0.0 while the index is empty).
    avgdl: float = 0.0
    # NOTE(review): presumably the usual BM25 k1 / b tuning parameters;
    # they are only stored here, the scoring code lives elsewhere.
    k1: float = 1.5
    b: float = 0.75
    # Tokenizer revision the index was built with.
    tokenizer_id: str = TOKENIZER_ID

    def to_dict(self) -> dict[str, Any]:
        """Return every field as a plain dict suitable for JSON storage."""
        serialized: dict[str, Any] = asdict(self)
        return serialized
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def _get_bm25_index_path(forge_root: Path) -> Path:
    """Return the BM25 index file location under ``forge_root``.

    The result is ``<forge_root>/.forge/<SEARCH_INDEX_DIR>/<BM25_INDEX_FILENAME>``.
    """
    return forge_root.joinpath(".forge", SEARCH_INDEX_DIR, BM25_INDEX_FILENAME)
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
class BM25IndexStore:
    """Manage per-project persistent BM25 index.

    Store location: <forge_root>/.forge/search-index/bm25_index.json

    Error handling:
    - Missing file: returns None (no index built yet)
    - Corrupted file: raises BM25IndexCorruptedError
    - Wrong schema version: raises SchemaVersionError
    - Tokenizer ID mismatch: raises BM25IndexCorruptedError (forces rebuild)
    """

    def __init__(
        self,
        forge_root: Path | None = None,
        *,
        store_path: Path | None = None,
    ) -> None:
        """Bind the store to an index file.

        Args:
            forge_root: Project root; the index path is derived from it.
            store_path: Explicit index file path. Takes precedence over
                ``forge_root`` when both are given.

        Raises:
            ValueError: If neither argument is provided.
        """
        if store_path:
            self._store_path = store_path
        elif forge_root:
            self._store_path = _get_bm25_index_path(forge_root)
        else:
            raise ValueError("Either forge_root or store_path required")

    @property
    def store_path(self) -> Path:
        """Path of the index file this store reads and writes."""
        return self._store_path

    def exists(self) -> bool:
        """Return True if the index path currently is a regular file."""
        return self._store_path.is_file()

    def read(self) -> BM25IndexData | None:
        """Read BM25 index from disk.

        Returns None if the file does not exist (no index built yet),
        including when it disappears between the existence check and the
        open call.

        Raises:
            BM25IndexCorruptedError: If the file cannot be read or decoded,
                contains invalid JSON, has a tokenizer ID mismatch, has
                fields of the wrong container type, or has misaligned
                positional arrays.
            SchemaVersionError: If schema version doesn't match.
        """
        if not self.exists():
            return None

        path_str = str(self._store_path)

        try:
            with open(self._store_path, encoding="utf-8") as f:
                data = json.load(f)
        except FileNotFoundError:
            # The file vanished after exists() returned True (e.g. a
            # concurrent cleanup). That is "no index", not corruption.
            return None
        except json.JSONDecodeError as e:
            raise BM25IndexCorruptedError(path_str, f"invalid JSON: {e}") from e
        except UnicodeDecodeError as e:
            # Not a JSONDecodeError or OSError, so it would otherwise escape
            # as a raw ValueError subclass.
            raise BM25IndexCorruptedError(path_str, f"invalid UTF-8: {e}") from e
        except OSError as e:
            raise BM25IndexCorruptedError(path_str, f"read error: {e}") from e

        if not isinstance(data, dict):
            raise BM25IndexCorruptedError(
                path_str,
                f"expected JSON object, got {type(data).__name__}",
            )

        version = data.get("schema_version")
        if version is None:
            raise BM25IndexCorruptedError(path_str, "missing schema_version")
        if version != BM25_INDEX_VERSION:
            raise SchemaVersionError(path_str, BM25_INDEX_VERSION, version)

        stored_tokenizer = data.get("tokenizer_id", "")
        if stored_tokenizer != TOKENIZER_ID:
            raise BM25IndexCorruptedError(
                path_str,
                f"tokenizer mismatch: index has '{stored_tokenizer}', "
                f"current is '{TOKENIZER_ID}'. Run 'forge search rebuild-index' to fix.",
            )

        # Check container types up front. Without this, a field such as
        # "doc_keys": null would make the len() calls below raise a bare
        # TypeError instead of the documented corruption error.
        expected_types: tuple[tuple[str, Any, type], ...] = (
            ("doc_keys", data.get("doc_keys", []), list),
            ("doc_lens", data.get("doc_lens", []), list),
            ("term_freqs", data.get("term_freqs", []), list),
            ("doc_freqs", data.get("doc_freqs", {}), dict),
        )
        for field_name, value, expected in expected_types:
            if not isinstance(value, expected):
                raise BM25IndexCorruptedError(
                    path_str,
                    f"field '{field_name}' must be a {expected.__name__}, "
                    f"got {type(value).__name__}. "
                    "Run 'forge search rebuild-index' to fix.",
                )

        try:
            index_data = BM25IndexData(
                doc_keys=data.get("doc_keys", []),
                doc_lens=data.get("doc_lens", []),
                term_freqs=data.get("term_freqs", []),
                doc_freqs=data.get("doc_freqs", {}),
                avgdl=float(data.get("avgdl", 0.0)),
                k1=float(data.get("k1", 1.5)),
                b=float(data.get("b", 0.75)),
                tokenizer_id=stored_tokenizer,
            )
        except (TypeError, ValueError) as e:
            raise BM25IndexCorruptedError(path_str, f"invalid data: {e}") from e

        n_keys = len(index_data.doc_keys)
        n_lens = len(index_data.doc_lens)
        n_freqs = len(index_data.term_freqs)
        if n_keys != n_lens or n_keys != n_freqs:
            raise BM25IndexCorruptedError(
                path_str,
                f"positional array length mismatch: doc_keys={n_keys}, "
                f"doc_lens={n_lens}, term_freqs={n_freqs}. "
                "Run 'forge search rebuild-index' to fix.",
            )

        return index_data

    def write(self, data: BM25IndexData) -> None:
        """Write BM25 index atomically. Creates parent directories if needed.

        Does not take the file lock itself; the locked entry points
        (replace_all, upsert_document, remove_document) call this while
        holding it.
        """
        payload: dict[str, Any] = {
            "schema_version": BM25_INDEX_VERSION,
            "updated_at": now_iso(),
            # to_dict() also carries tokenizer_id; listing it here only
            # fixes its position next to the other header keys.
            "tokenizer_id": data.tokenizer_id,
            **data.to_dict(),
        }
        atomic_write_json(self._store_path, payload)

    def replace_all(
        self,
        data: BM25IndexData,
        *,
        timeout_s: float = STORE_LOCK_TIMEOUT_S,
    ) -> None:
        """Replace entire index under lock (for rebuild-index).

        Args:
            data: Complete index to persist.
            timeout_s: Timeout passed to the file lock.
        """
        with file_lock_for_target(target_path=self._store_path, timeout_s=timeout_s):
            self.write(data)

    def upsert_document(
        self,
        doc_key: str,
        term_freq: dict[str, int],
        doc_len: int,
        *,
        timeout_s: float = HANDLER_LOCK_TIMEOUT_S,
    ) -> None:
        """Add or replace a document in the index (locked, idempotent).

        If doc_key already exists, its old contribution is removed first
        (doc_freqs decremented) before adding the new entry. This ensures
        work queue retries don't create duplicates or double-increment.

        Args:
            doc_key: Identifier of the document.
            term_freq: Term -> occurrence count for the document. Entries
                with a non-positive count are not persisted.
            doc_len: Document length stored alongside the key.
            timeout_s: Timeout passed to the file lock.
        """
        with file_lock_for_target(target_path=self._store_path, timeout_s=timeout_s):
            data = self.read()
            if data is None:
                data = BM25IndexData()

            _remove_doc_from_data(data, doc_key)

            # Persist exactly the terms that are counted in doc_freqs.
            # Storing zero-count terms would let a later removal decrement
            # a document frequency that was never incremented. Building a
            # new dict also avoids aliasing the caller's mapping.
            positive_tf = {term: count for term, count in term_freq.items() if count > 0}

            data.doc_keys.append(doc_key)
            data.doc_lens.append(doc_len)
            data.term_freqs.append(positive_tf)
            for term in positive_tf:
                data.doc_freqs[term] = data.doc_freqs.get(term, 0) + 1

            # max(..., 1) keeps the division defined for an empty index.
            data.avgdl = sum(data.doc_lens) / max(len(data.doc_lens), 1)

            self.write(data)

    def remove_document(
        self,
        doc_key: str,
        *,
        timeout_s: float = HANDLER_LOCK_TIMEOUT_S,
    ) -> bool:
        """Remove a document from the index (locked). Returns True if found.

        Nothing is written when the index file is missing or the key is
        not present.
        """
        with file_lock_for_target(target_path=self._store_path, timeout_s=timeout_s):
            data = self.read()
            if data is None:
                return False

            removed = _remove_doc_from_data(data, doc_key)
            if not removed:
                return False

            data.avgdl = sum(data.doc_lens) / max(len(data.doc_lens), 1)

            self.write(data)
            return True
|
|
244
|
+
|
|
245
|
+
|
|
246
|
+
def _remove_doc_from_data(data: BM25IndexData, doc_key: str) -> bool:
    """Remove a document from BM25IndexData in-place. Returns True if found.

    Drops the document's entries from doc_keys, doc_lens and term_freqs and
    decrements doc_freqs for each term the document contributed, deleting
    terms whose document frequency reaches zero. ``avgdl`` is not
    recomputed here; callers do that after the removal.

    Args:
        data: Index to mutate.
        doc_key: Key of the document to remove.

    Returns:
        True if the document was present and removed, False otherwise.
    """
    try:
        idx = data.doc_keys.index(doc_key)
    except ValueError:
        return False

    for term, count in data.term_freqs[idx].items():
        # upsert_document only increments doc_freqs for terms with a
        # positive count, so only those may be decremented. Decrementing
        # zero-count terms would undercount other documents' terms.
        if count <= 0 or term not in data.doc_freqs:
            continue
        remaining = data.doc_freqs[term] - 1
        if remaining > 0:
            data.doc_freqs[term] = remaining
        else:
            del data.doc_freqs[term]

    # Remove the same position from all three lists to keep them aligned.
    data.doc_keys.pop(idx)
    data.doc_lens.pop(idx)
    data.term_freqs.pop(idx)

    return True
|
|
@@ -0,0 +1,197 @@
|
|
|
1
|
+
"""Content store for lazy snippet loading.
|
|
2
|
+
|
|
3
|
+
Stores document content strings keyed by transcript_path. Content is loaded
|
|
4
|
+
at query time only for top-K results (snippet extraction), not for scoring.
|
|
5
|
+
|
|
6
|
+
Store location: <project_root>/.forge/search-index/content.json
|
|
7
|
+
|
|
8
|
+
Follows the same patterns as other search stores: atomic writes, file
|
|
9
|
+
locking, schema versioning, self-healing on missing file.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
import json
|
|
15
|
+
import logging
|
|
16
|
+
from pathlib import Path
|
|
17
|
+
from typing import Any
|
|
18
|
+
|
|
19
|
+
from forge.core.state import (
|
|
20
|
+
SchemaVersionError,
|
|
21
|
+
atomic_write_json,
|
|
22
|
+
file_lock_for_target,
|
|
23
|
+
now_iso,
|
|
24
|
+
)
|
|
25
|
+
|
|
26
|
+
from .exceptions import ContentStoreCorruptedError
|
|
27
|
+
from .index_state import SEARCH_INDEX_DIR
|
|
28
|
+
|
|
29
|
+
logger = logging.getLogger(__name__)
|
|
30
|
+
|
|
31
|
+
# File and schema constants
|
|
32
|
+
CONTENT_FILENAME = "content.json"
|
|
33
|
+
CONTENT_STORE_VERSION = 1
|
|
34
|
+
|
|
35
|
+
# Lock timeouts
|
|
36
|
+
STORE_LOCK_TIMEOUT_S = 5.0
|
|
37
|
+
HANDLER_LOCK_TIMEOUT_S = 1.0
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def _get_content_store_path(forge_root: Path) -> Path:
    """Return the content store file location under ``forge_root``.

    The result is ``<forge_root>/.forge/<SEARCH_INDEX_DIR>/<CONTENT_FILENAME>``.
    """
    return forge_root.joinpath(".forge", SEARCH_INDEX_DIR, CONTENT_FILENAME)
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
class ContentStore:
    """Manage per-project content store for lazy snippet loading.

    Store location: <forge_root>/.forge/search-index/content.json
    Maps transcript_path -> extracted content string.

    Error handling:
    - Missing file: returns empty dict (self-healing)
    - Corrupted file: raises ContentStoreCorruptedError
    - Wrong schema version: raises SchemaVersionError
    """

    def __init__(
        self,
        forge_root: Path | None = None,
        *,
        store_path: Path | None = None,
    ) -> None:
        """Bind the store to a content file.

        Args:
            forge_root: Project root; the store path is derived from it.
            store_path: Explicit store file path. Takes precedence over
                ``forge_root`` when both are given.

        Raises:
            ValueError: If neither argument is provided.
        """
        if store_path:
            self._store_path = store_path
        elif forge_root:
            self._store_path = _get_content_store_path(forge_root)
        else:
            raise ValueError("Either forge_root or store_path required")

    @property
    def store_path(self) -> Path:
        """Path of the content file this store reads and writes."""
        return self._store_path

    def exists(self) -> bool:
        """Return True if the store path currently is a regular file."""
        return self._store_path.is_file()

    def read_all(self) -> dict[str, str]:
        """Read all content from disk.

        Returns empty dict if the file does not exist (self-healing),
        including when it disappears between the existence check and the
        open call. A non-dict ``content`` field is logged and treated as
        empty.

        Raises:
            ContentStoreCorruptedError: If the file cannot be read or
                decoded, contains invalid JSON, is not a JSON object, or
                lacks ``schema_version``.
            SchemaVersionError: If the schema version doesn't match.
        """
        if not self.exists():
            return {}

        path_str = str(self._store_path)

        try:
            with open(self._store_path, encoding="utf-8") as f:
                data = json.load(f)
        except FileNotFoundError:
            # The file vanished after exists() returned True. Treat it like
            # any other missing store rather than reporting corruption.
            return {}
        except json.JSONDecodeError as e:
            raise ContentStoreCorruptedError(path_str, f"invalid JSON: {e}") from e
        except UnicodeDecodeError as e:
            # Not a JSONDecodeError or OSError, so it would otherwise escape
            # as a raw ValueError subclass.
            raise ContentStoreCorruptedError(path_str, f"invalid UTF-8: {e}") from e
        except OSError as e:
            raise ContentStoreCorruptedError(path_str, f"read error: {e}") from e

        if not isinstance(data, dict):
            raise ContentStoreCorruptedError(
                path_str,
                f"expected JSON object, got {type(data).__name__}",
            )

        version = data.get("schema_version")
        if version is None:
            raise ContentStoreCorruptedError(path_str, "missing schema_version")
        if version != CONTENT_STORE_VERSION:
            raise SchemaVersionError(path_str, CONTENT_STORE_VERSION, version)

        content = data.get("content", {})
        if not isinstance(content, dict):
            logger.warning(
                "Content store %s has non-dict 'content' field (got %s), treating as empty",
                path_str,
                type(content).__name__,
            )
            return {}

        return content

    def read_keys(self, keys: list[str]) -> dict[str, str]:
        """Read content for specific document keys only.

        Loads the full JSON file (unavoidable with JSON format) but returns
        only the requested keys. This is the method used at query time for
        snippet extraction of top-K results.

        Returns:
            Dict mapping requested keys to their content strings.
            Keys not found in the store are omitted from the result.
        """
        all_content = self.read_all()
        return {k: all_content[k] for k in keys if k in all_content}

    def write(self, content_map: dict[str, str]) -> None:
        """Write content store atomically. Creates parent directories if needed.

        Does not take the file lock itself; the locked entry points
        (replace_all, add, remove, prune_keys) call this while holding it.
        """
        payload: dict[str, Any] = {
            "schema_version": CONTENT_STORE_VERSION,
            "updated_at": now_iso(),
            "content": content_map,
        }
        atomic_write_json(self._store_path, payload)

    def replace_all(
        self,
        content_map: dict[str, str],
        *,
        timeout_s: float = STORE_LOCK_TIMEOUT_S,
    ) -> None:
        """Replace all content under lock (for rebuild-index).

        Args:
            content_map: Complete key -> content mapping to persist.
            timeout_s: Timeout passed to the file lock.
        """
        with file_lock_for_target(target_path=self._store_path, timeout_s=timeout_s):
            self.write(content_map)

    def add(
        self,
        doc_key: str,
        content: str,
        *,
        timeout_s: float = HANDLER_LOCK_TIMEOUT_S,
    ) -> None:
        """Add or replace content for a document (locked, idempotent)."""
        with file_lock_for_target(target_path=self._store_path, timeout_s=timeout_s):
            content_map = self.read_all()
            content_map[doc_key] = content
            self.write(content_map)

    def remove(
        self,
        doc_key: str,
        *,
        timeout_s: float = HANDLER_LOCK_TIMEOUT_S,
    ) -> bool:
        """Remove content for a document (locked). Returns True if found.

        Nothing is written when the key is not present.
        """
        with file_lock_for_target(target_path=self._store_path, timeout_s=timeout_s):
            content_map = self.read_all()
            if doc_key not in content_map:
                return False
            del content_map[doc_key]
            self.write(content_map)
            return True

    def prune_keys(
        self,
        valid_keys: set[str],
        *,
        timeout_s: float = STORE_LOCK_TIMEOUT_S,
    ) -> list[str]:
        """Remove entries not in valid_keys (locked). Returns removed keys.

        Nothing is written when every stored key is still valid.
        """
        with file_lock_for_target(target_path=self._store_path, timeout_s=timeout_s):
            content_map = self.read_all()
            removed = [k for k in content_map if k not in valid_keys]
            if not removed:
                return []
            for k in removed:
                del content_map[k]
            self.write(content_map)
            return removed
|