multi-forge 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- forge/__init__.py +3 -0
- forge/_extensions/agents/.gitkeep +0 -0
- forge/_extensions/commands/.gitkeep +0 -0
- forge/_extensions/skills/analyze/SKILL.md +87 -0
- forge/_extensions/skills/challenge/SKILL.md +91 -0
- forge/_extensions/skills/consensus/SKILL.md +120 -0
- forge/_extensions/skills/consensus/resources/code_consensus_evaluation.md +94 -0
- forge/_extensions/skills/consensus/resources/consensus_evaluation.md +70 -0
- forge/_extensions/skills/consensus/resources/synthesis.md +101 -0
- forge/_extensions/skills/debate/SKILL.md +116 -0
- forge/_extensions/skills/debate/resources/code_debate_evaluation.md +101 -0
- forge/_extensions/skills/debate/resources/debate_evaluation.md +90 -0
- forge/_extensions/skills/panel/SKILL.md +141 -0
- forge/_extensions/skills/panel/resources/synthesis.md +103 -0
- forge/_extensions/skills/qa/SKILL.md +704 -0
- forge/_extensions/skills/qa/resources/checklist/0-enable.md +78 -0
- forge/_extensions/skills/qa/resources/checklist/1-preflight.md +24 -0
- forge/_extensions/skills/qa/resources/checklist/10-resume.md +143 -0
- forge/_extensions/skills/qa/resources/checklist/11-config.md +150 -0
- forge/_extensions/skills/qa/resources/checklist/12-search.md +58 -0
- forge/_extensions/skills/qa/resources/checklist/13-guard.md +237 -0
- forge/_extensions/skills/qa/resources/checklist/14-workflow.md +305 -0
- forge/_extensions/skills/qa/resources/checklist/15-skills.md +155 -0
- forge/_extensions/skills/qa/resources/checklist/16-handoff.md +224 -0
- forge/_extensions/skills/qa/resources/checklist/17-info.md +50 -0
- forge/_extensions/skills/qa/resources/checklist/18-disable.md +84 -0
- forge/_extensions/skills/qa/resources/checklist/19-uninstall.md +146 -0
- forge/_extensions/skills/qa/resources/checklist/2-extensions.md +188 -0
- forge/_extensions/skills/qa/resources/checklist/20-cleanup.md +36 -0
- forge/_extensions/skills/qa/resources/checklist/3-auth.md +234 -0
- forge/_extensions/skills/qa/resources/checklist/4-proxy.md +481 -0
- forge/_extensions/skills/qa/resources/checklist/5-session.md +541 -0
- forge/_extensions/skills/qa/resources/checklist/6-hooks.md +275 -0
- forge/_extensions/skills/qa/resources/checklist/7-costs.md +309 -0
- forge/_extensions/skills/qa/resources/checklist/8-status-line.md +174 -0
- forge/_extensions/skills/qa/resources/checklist/9-direct-commands.md +146 -0
- forge/_extensions/skills/qa/resources/checklist.md +103 -0
- forge/_extensions/skills/qa/resources/report-template.md +62 -0
- forge/_extensions/skills/qa/scripts/start-container.sh +529 -0
- forge/_extensions/skills/qa/scripts/walkthrough-state.py +1137 -0
- forge/_extensions/skills/review/SKILL.md +125 -0
- forge/_extensions/skills/review/references/claude-4.6.md +474 -0
- forge/_extensions/skills/review/references/claude-4.7.md +710 -0
- forge/_extensions/skills/review/references/gemini-3.1.md +546 -0
- forge/_extensions/skills/review/references/gpt-5.5.md +490 -0
- forge/_extensions/skills/review/references/skills-writing-guide.md +1588 -0
- forge/_extensions/skills/review/resources/code-anthropic.md +160 -0
- forge/_extensions/skills/review/resources/code-gemini.md +184 -0
- forge/_extensions/skills/review/resources/code-openai.md +203 -0
- forge/_extensions/skills/review/resources/code.md +160 -0
- forge/_extensions/skills/review-docs/SKILL.md +121 -0
- forge/_extensions/skills/review-docs/resources/docs-anthropic.md +170 -0
- forge/_extensions/skills/review-docs/resources/docs-gemini.md +204 -0
- forge/_extensions/skills/review-docs/resources/docs-openai.md +231 -0
- forge/_extensions/skills/review-docs/resources/docs.md +170 -0
- forge/_extensions/skills/smoke-test/SKILL.md +27 -0
- forge/_extensions/skills/smoke-test/scripts/smoke-test.sh +118 -0
- forge/_extensions/skills/understand/SKILL.md +148 -0
- forge/_extensions/skills/understand/resources/code-anthropic.md +163 -0
- forge/_extensions/skills/understand/resources/code-gemini.md +194 -0
- forge/_extensions/skills/understand/resources/code-openai.md +181 -0
- forge/_extensions/skills/understand/resources/code.md +163 -0
- forge/_extensions/skills/understand/resources/docs-anthropic.md +177 -0
- forge/_extensions/skills/understand/resources/docs-gemini.md +202 -0
- forge/_extensions/skills/understand/resources/docs-openai.md +191 -0
- forge/_extensions/skills/understand/resources/docs.md +177 -0
- forge/_extensions/skills/walkthrough/SKILL.md +599 -0
- forge/_extensions/skills/walkthrough/resources/checklist.md +765 -0
- forge/_extensions/skills/walkthrough/scripts/run-in-repo.sh +118 -0
- forge/_extensions/skills/walkthrough/scripts/setup-test-repo.sh +198 -0
- forge/_extensions/skills/walkthrough/scripts/walkthrough-state.py +1137 -0
- forge/backend/__init__.py +174 -0
- forge/backend/adapters/__init__.py +38 -0
- forge/backend/adapters/litellm.py +158 -0
- forge/backend/creation.py +89 -0
- forge/backend/registry.py +178 -0
- forge/cli/__init__.py +16 -0
- forge/cli/auth.py +483 -0
- forge/cli/backend.py +298 -0
- forge/cli/claude.py +411 -0
- forge/cli/config_cmd.py +303 -0
- forge/cli/extensions.py +1001 -0
- forge/cli/gc.py +165 -0
- forge/cli/guard.py +1018 -0
- forge/cli/guards.py +106 -0
- forge/cli/handoff.py +110 -0
- forge/cli/hooks/__init__.py +36 -0
- forge/cli/hooks/_group.py +20 -0
- forge/cli/hooks/_helpers.py +149 -0
- forge/cli/hooks/commands.py +1677 -0
- forge/cli/hooks/direct_commands.py +1304 -0
- forge/cli/hooks/install.py +232 -0
- forge/cli/hooks/policy.py +151 -0
- forge/cli/hooks/read_hygiene.py +74 -0
- forge/cli/hooks/verification.py +370 -0
- forge/cli/logs.py +406 -0
- forge/cli/main.py +292 -0
- forge/cli/proxy.py +1821 -0
- forge/cli/proxy_costs.py +313 -0
- forge/cli/search.py +416 -0
- forge/cli/session.py +892 -0
- forge/cli/session_addendum.py +81 -0
- forge/cli/session_fork.py +750 -0
- forge/cli/session_handoff.py +141 -0
- forge/cli/session_lifecycle.py +2053 -0
- forge/cli/session_manage.py +1336 -0
- forge/cli/session_memory.py +201 -0
- forge/cli/status_line.py +1398 -0
- forge/cli/workflow.py +1964 -0
- forge/config/__init__.py +110 -0
- forge/config/dataclass_utils.py +88 -0
- forge/config/defaults/__init__.py +0 -0
- forge/config/defaults/backends/__init__.py +0 -0
- forge/config/defaults/backends/litellm.yaml +196 -0
- forge/config/defaults/templates/__init__.py +0 -0
- forge/config/defaults/templates/litellm-anthropic-local.yaml +33 -0
- forge/config/defaults/templates/litellm-anthropic.yaml +24 -0
- forge/config/defaults/templates/litellm-gemini-flash-local.yaml +37 -0
- forge/config/defaults/templates/litellm-gemini-local.yaml +32 -0
- forge/config/defaults/templates/litellm-gemini-test.yaml +34 -0
- forge/config/defaults/templates/litellm-gemini.yaml +21 -0
- forge/config/defaults/templates/litellm-openai-codex-local.yaml +36 -0
- forge/config/defaults/templates/litellm-openai-local.yaml +38 -0
- forge/config/defaults/templates/litellm-openai.yaml +28 -0
- forge/config/defaults/templates/openrouter-anthropic.yaml +23 -0
- forge/config/defaults/templates/openrouter-deepseek.yaml +26 -0
- forge/config/defaults/templates/openrouter-gemini-flash.yaml +26 -0
- forge/config/defaults/templates/openrouter-gemini.yaml +23 -0
- forge/config/defaults/templates/openrouter-glm.yaml +23 -0
- forge/config/defaults/templates/openrouter-kimi.yaml +30 -0
- forge/config/defaults/templates/openrouter-minimax.yaml +26 -0
- forge/config/defaults/templates/openrouter-openai-codex.yaml +23 -0
- forge/config/defaults/templates/openrouter-openai.yaml +28 -0
- forge/config/defaults/templates/openrouter-qwen.yaml +25 -0
- forge/config/loader.py +675 -0
- forge/config/schema.py +448 -0
- forge/core/__init__.py +5 -0
- forge/core/auth/__init__.py +67 -0
- forge/core/auth/capabilities.py +219 -0
- forge/core/auth/credentials_file.py +244 -0
- forge/core/auth/protocols.py +18 -0
- forge/core/auth/secrets.py +243 -0
- forge/core/auth/template_secrets.py +112 -0
- forge/core/data/__init__.py +5 -0
- forge/core/data/model_catalog.yaml +1522 -0
- forge/core/data/pricing.yaml +140 -0
- forge/core/data/system_prompt_addendums/__init__.py +0 -0
- forge/core/data/system_prompt_addendums/gemini.md +330 -0
- forge/core/data/system_prompt_addendums/openai.md +328 -0
- forge/core/llm/__init__.py +231 -0
- forge/core/llm/clients/__init__.py +14 -0
- forge/core/llm/clients/base.py +115 -0
- forge/core/llm/clients/litellm.py +619 -0
- forge/core/llm/clients/openai_compat.py +244 -0
- forge/core/llm/clients/openrouter.py +234 -0
- forge/core/llm/credentials.py +439 -0
- forge/core/llm/detection.py +86 -0
- forge/core/llm/errors.py +44 -0
- forge/core/llm/protocols.py +80 -0
- forge/core/llm/types.py +176 -0
- forge/core/logging.py +146 -0
- forge/core/models/__init__.py +91 -0
- forge/core/models/catalog.py +467 -0
- forge/core/models/pricing.py +165 -0
- forge/core/models/types.py +167 -0
- forge/core/naming.py +212 -0
- forge/core/ops/__init__.py +73 -0
- forge/core/ops/context.py +141 -0
- forge/core/ops/gc.py +802 -0
- forge/core/ops/proxy.py +146 -0
- forge/core/ops/resolution.py +135 -0
- forge/core/ops/session.py +344 -0
- forge/core/ops/session_context.py +548 -0
- forge/core/paths.py +38 -0
- forge/core/process.py +54 -0
- forge/core/reactive/__init__.py +38 -0
- forge/core/reactive/cost_tracking.py +300 -0
- forge/core/reactive/env.py +180 -0
- forge/core/reactive/proxy.py +78 -0
- forge/core/reactive/routing.py +622 -0
- forge/core/reactive/session_runner.py +185 -0
- forge/core/reactive/structured_output.py +62 -0
- forge/core/reactive/tagger.py +94 -0
- forge/core/reactive/throttle.py +132 -0
- forge/core/state/__init__.py +59 -0
- forge/core/state/exceptions.py +59 -0
- forge/core/state/io.py +140 -0
- forge/core/state/lock.py +99 -0
- forge/core/state/timestamps.py +60 -0
- forge/core/transcript.py +78 -0
- forge/core/typing_helpers.py +24 -0
- forge/core/workqueue/__init__.py +67 -0
- forge/core/workqueue/queue.py +552 -0
- forge/core/workqueue/types.py +63 -0
- forge/guard/__init__.py +26 -0
- forge/guard/deterministic/__init__.py +26 -0
- forge/guard/deterministic/base.py +158 -0
- forge/guard/deterministic/coding_standards.py +256 -0
- forge/guard/deterministic/registry.py +148 -0
- forge/guard/deterministic/tdd.py +171 -0
- forge/guard/engine.py +216 -0
- forge/guard/protocols.py +91 -0
- forge/guard/queries.py +96 -0
- forge/guard/semantic/__init__.py +34 -0
- forge/guard/semantic/promotion.py +18 -0
- forge/guard/semantic/supervisor.py +813 -0
- forge/guard/semantic/verdict.py +183 -0
- forge/guard/store.py +124 -0
- forge/guard/team/__init__.py +6 -0
- forge/guard/team/config.py +24 -0
- forge/guard/team/handlers.py +209 -0
- forge/guard/team/prompts.py +41 -0
- forge/guard/types.py +125 -0
- forge/guard/workflow/__init__.py +17 -0
- forge/guard/workflow/branches.py +67 -0
- forge/guard/workflow/config.py +63 -0
- forge/guard/workflow/divergence.py +113 -0
- forge/guard/workflow/policy.py +87 -0
- forge/guard/workflow/stages.py +205 -0
- forge/install/__init__.py +55 -0
- forge/install/cli.py +281 -0
- forge/install/exceptions.py +163 -0
- forge/install/hooks.py +109 -0
- forge/install/installer.py +1037 -0
- forge/install/models.py +321 -0
- forge/install/preset.py +272 -0
- forge/install/settings_merge.py +831 -0
- forge/install/tracking.py +238 -0
- forge/install/version.py +141 -0
- forge/proxy/__init__.py +0 -0
- forge/proxy/base_client.py +181 -0
- forge/proxy/client_adapter.py +476 -0
- forge/proxy/client_factory.py +531 -0
- forge/proxy/converters.py +1206 -0
- forge/proxy/cost_logger.py +132 -0
- forge/proxy/cost_tracker.py +242 -0
- forge/proxy/data_models.py +338 -0
- forge/proxy/error_hints.py +92 -0
- forge/proxy/metrics.py +222 -0
- forge/proxy/model_spec.py +158 -0
- forge/proxy/proxies.py +333 -0
- forge/proxy/proxy_identity.py +134 -0
- forge/proxy/proxy_orchestrator.py +1018 -0
- forge/proxy/proxy_startup.py +54 -0
- forge/proxy/server.py +1561 -0
- forge/proxy/utils.py +537 -0
- forge/review/__init__.py +6 -0
- forge/review/adversarial.py +111 -0
- forge/review/consensus.py +236 -0
- forge/review/engine.py +356 -0
- forge/review/models.py +437 -0
- forge/review/resources/__init__.py +5 -0
- forge/review/resources/codereview-performance.md +85 -0
- forge/review/resources/codereview-quick.md +75 -0
- forge/review/resources/codereview-security.md +92 -0
- forge/review/resources/codereview.md +85 -0
- forge/review/resources/docreview-quick.md +75 -0
- forge/review/resources/docreview.md +86 -0
- forge/review/resources/thinkdeep.md +89 -0
- forge/review/routing.py +368 -0
- forge/review/synthesis.py +73 -0
- forge/runtime_config.py +438 -0
- forge/search/__init__.py +55 -0
- forge/search/bm25_store.py +264 -0
- forge/search/content_store.py +197 -0
- forge/search/engine.py +352 -0
- forge/search/exceptions.py +51 -0
- forge/search/extractor.py +234 -0
- forge/search/index_state.py +295 -0
- forge/search/store.py +215 -0
- forge/search/tokenizer.py +24 -0
- forge/session/__init__.py +130 -0
- forge/session/active.py +339 -0
- forge/session/artifacts.py +202 -0
- forge/session/claude/__init__.py +50 -0
- forge/session/claude/cleanup.py +105 -0
- forge/session/claude/invoke.py +236 -0
- forge/session/claude/paths.py +200 -0
- forge/session/cleanup.py +216 -0
- forge/session/config.py +34 -0
- forge/session/direct_model.py +107 -0
- forge/session/effective.py +169 -0
- forge/session/exceptions.py +255 -0
- forge/session/handoff.py +881 -0
- forge/session/handoff_agent.py +544 -0
- forge/session/hooks/__init__.py +35 -0
- forge/session/hooks/models.py +73 -0
- forge/session/hooks/session_start.py +507 -0
- forge/session/identity.py +84 -0
- forge/session/index.py +553 -0
- forge/session/manager.py +1506 -0
- forge/session/models.py +572 -0
- forge/session/overrides.py +344 -0
- forge/session/plan_resolution.py +286 -0
- forge/session/prev_sessions.py +128 -0
- forge/session/store.py +431 -0
- forge/session/validation.py +47 -0
- forge/session/worktree/__init__.py +65 -0
- forge/session/worktree/cleanup.py +262 -0
- forge/session/worktree/config_copy.py +203 -0
- forge/session/worktree/create.py +332 -0
- forge/sidecar/__init__.py +29 -0
- forge/sidecar/container.py +161 -0
- forge/sidecar/docker.py +86 -0
- forge/sidecar/secrets.py +19 -0
- multi_forge-0.2.0.dist-info/METADATA +242 -0
- multi_forge-0.2.0.dist-info/RECORD +311 -0
- multi_forge-0.2.0.dist-info/WHEEL +4 -0
- multi_forge-0.2.0.dist-info/entry_points.txt +2 -0
- multi_forge-0.2.0.dist-info/licenses/LICENSE +203 -0
- multi_forge-0.2.0.dist-info/licenses/NOTICE +14 -0
forge/search/engine.py
ADDED
|
@@ -0,0 +1,352 @@
|
|
|
1
|
+
"""BM25 search engine for transcript documents.
|
|
2
|
+
|
|
3
|
+
Provides BM25Okapi ranking for keyword search over extracted transcript content.
|
|
4
|
+
No external dependencies — hand-rolled BM25 implementation (~30 lines of math).
|
|
5
|
+
|
|
6
|
+
Two search entry points:
|
|
7
|
+
- search(): Legacy path — builds BM25 from in-memory documents (query-time construction)
|
|
8
|
+
- search_from_index(): Persistent index path — loads precomputed BM25 data (scoring only)
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
import math
|
|
14
|
+
from collections.abc import Callable
|
|
15
|
+
from dataclasses import dataclass
|
|
16
|
+
from typing import Any
|
|
17
|
+
|
|
18
|
+
from .exceptions import BM25IndexCorruptedError, ContentStoreCorruptedError
|
|
19
|
+
from .extractor import SearchDocument, SearchDocumentMeta
|
|
20
|
+
from .tokenizer import TOKEN_RE, tokenize
|
|
21
|
+
|
|
22
|
+
# Search defaults
|
|
23
|
+
SNIPPET_LENGTH = 300
|
|
24
|
+
DEFAULT_LIMIT = 10
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class BM25:
    """Okapi BM25 ranker over a corpus of pre-tokenized documents.

    Combines a saturating term-frequency component with document-length
    normalization. Uses only the standard library.

    Args:
        documents: Tokenized corpus; each entry is the list of terms of one document.
        k1: Term frequency saturation parameter.
        b: Document length normalization parameter.
    """

    def __init__(
        self,
        documents: list[list[str]],
        *,
        k1: float = 1.5,
        b: float = 0.75,
    ) -> None:
        self._k1 = k1
        self._b = b
        self._doc_count = len(documents)
        self._doc_lens = [len(tokens) for tokens in documents]
        # Dividing by max(count, 1) keeps the average defined for an empty corpus.
        self._avgdl = sum(self._doc_lens) / max(self._doc_count, 1)

        # One {term: occurrences} mapping per document, in corpus order.
        self._term_freqs: list[dict[str, int]] = []
        # {term: number of documents that contain it at least once}.
        self._doc_freqs: dict[str, int] = {}

        for tokens in documents:
            counts: dict[str, int] = {}
            for token in tokens:
                if token in counts:
                    counts[token] += 1
                else:
                    counts[token] = 1
            self._term_freqs.append(counts)
            # Each distinct term counts once per document towards doc frequency.
            for token in counts:
                self._doc_freqs[token] = self._doc_freqs.get(token, 0) + 1

    @property
    def doc_freqs(self) -> dict[str, int]:
        """Mapping of term -> number of documents containing that term."""
        return self._doc_freqs

    @classmethod
    def from_precomputed(
        cls,
        *,
        term_freqs: list[dict[str, int]],
        doc_freqs: dict[str, int],
        doc_lens: list[int],
        avgdl: float,
        k1: float = 1.5,
        b: float = 0.75,
    ) -> BM25:
        """Build a ranker directly from previously exported index data.

        Bypasses __init__ (and therefore the pass over every token) by
        allocating a bare instance and assigning its internal state. The
        document count is taken from len(doc_lens). The supplied containers
        are stored as-is, not copied.
        """
        ranker = cls.__new__(cls)
        ranker._k1 = k1
        ranker._b = b
        ranker._doc_count = len(doc_lens)
        ranker._doc_lens = doc_lens
        ranker._avgdl = avgdl
        ranker._term_freqs = term_freqs
        ranker._doc_freqs = doc_freqs
        return ranker

    def to_precomputed(self) -> dict:
        """Export the internal index state for persistence.

        Returns:
            Dict with keys term_freqs, doc_freqs, doc_lens, avgdl, k1, b.
            doc_freqs and doc_lens are shallow copies; term_freqs is the
            internal list itself.
        """
        exported = {
            "term_freqs": self._term_freqs,
            "doc_freqs": dict(self._doc_freqs),
            "doc_lens": list(self._doc_lens),
            "avgdl": self._avgdl,
            "k1": self._k1,
            "b": self._b,
        }
        return exported

    def score(self, query: list[str]) -> list[float]:
        """Compute a BM25 score for every document against the query terms.

        Query terms absent from the corpus contribute nothing. A term that
        appears several times in ``query`` is added once per occurrence.

        Returns:
            One score per document, in corpus order.
        """
        k1 = self._k1
        b = self._b
        avgdl = self._avgdl
        total_docs = self._doc_count

        totals = [0.0] * total_docs
        for term in query:
            if term in self._doc_freqs:
                df = self._doc_freqs[term]
                idf = math.log((total_docs - df + 0.5) / (df + 0.5) + 1.0)
                for idx in range(total_docs):
                    tf = self._term_freqs[idx].get(term, 0)
                    if tf:
                        dl = self._doc_lens[idx]
                        # Length normalization: longer-than-average docs are damped.
                        length_norm = 1 - b + b * dl / avgdl
                        totals[idx] += idf * ((tf * (k1 + 1)) / (tf + k1 * length_norm))
        return totals
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
def _best_snippet(
    content: str,
    query_tokens: list[str],
    length: int = SNIPPET_LENGTH,
    *,
    doc_freqs: dict[str, int] | None = None,
) -> str:
    """Return a snippet anchored on the rarest query term found in ``content``.

    Content no longer than ``length`` is returned unchanged. Otherwise the
    text is scanned once with TOKEN_RE; each match is lowercased and compared
    against the query tokens. The anchor is the first occurrence of the
    matching token with the lowest ``doc_freqs`` count (tokens missing from
    ``doc_freqs``, or all tokens when ``doc_freqs`` is empty/None, count as 0).
    The scan stops early once a token with count <= 1 is found.

    Matching runs on the original string rather than a lowercased copy, so
    match offsets always index into ``content`` itself.

    If no query token occurs, the first ``length`` characters are returned.
    """
    if len(content) <= length:
        return content

    wanted = frozenset(query_tokens)
    freqs = doc_freqs or {}

    anchor: int | None = None
    lowest = float("inf")

    for hit in TOKEN_RE.finditer(content):
        word = hit.group().lower()
        if word in wanted:
            count = freqs.get(word, 0)
            if count < lowest:
                anchor, lowest = hit.start(), count
                if count <= 1:
                    # Present in at most one document: nothing rarer can follow.
                    break

    if anchor is None:
        # None of the query terms occur in the content.
        return content[:length]
    return _extract_window(content, anchor, length)
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
def _extract_window(content: str, center: int, length: int) -> str:
|
|
178
|
+
"""Extract a snippet window centered on a character position."""
|
|
179
|
+
start = max(0, center - length // 2)
|
|
180
|
+
end = start + length
|
|
181
|
+
if end > len(content):
|
|
182
|
+
end = len(content)
|
|
183
|
+
start = max(0, end - length)
|
|
184
|
+
snippet = content[start:end]
|
|
185
|
+
prefix = "..." if start > 0 else ""
|
|
186
|
+
suffix = "..." if end < len(content) else ""
|
|
187
|
+
return prefix + snippet + suffix
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
@dataclass
class SearchResult:
    """One ranked hit returned by search() / search_from_index()."""

    # Name and id of the session the matching transcript belongs to.
    session_name: str
    session_id: str
    # BM25 score; the search functions round it to 4 decimal places.
    score: float
    # Excerpt of the document content chosen around a query term.
    snippet: str
    # Path of the transcript that matched.
    transcript_path: str
    # Metadata dict passed through unchanged from the source document.
    metadata: dict[str, Any]
|
|
200
|
+
|
|
201
|
+
|
|
202
|
+
def search(
    query: str,
    documents: list[SearchDocument],
    *,
    limit: int = DEFAULT_LIMIT,
) -> list[SearchResult]:
    """Rank in-memory documents against a query with BM25.

    The BM25 statistics are built on every call from ``documents``. A
    document's cached ``tokens`` are used when present; otherwise its
    ``content`` is tokenized here. Documents scoring zero are dropped.

    Args:
        query: Search query string.
        documents: Documents to search over.
        limit: Maximum number of results to return.

    Returns:
        Up to ``limit`` SearchResult objects, highest score first. Empty when
        the query is blank, yields no tokens, or there are no documents.
    """
    if not (query.strip() and documents):
        return []

    terms = tokenize(query)
    if not terms:
        return []

    corpus: list[list[str]] = []
    for document in documents:
        cached = document.tokens
        corpus.append(tokenize(document.content) if cached is None else cached)

    ranker = BM25(corpus)

    # Keep only positive scores, then order best-first.
    ranked = [
        (value, document)
        for value, document in zip(ranker.score(terms), documents)
        if value > 0
    ]
    ranked.sort(key=lambda pair: pair[0], reverse=True)

    return [
        SearchResult(
            session_name=document.session_name,
            session_id=document.session_id,
            score=round(value, 4),
            snippet=_best_snippet(document.content, terms, doc_freqs=ranker.doc_freqs),
            transcript_path=document.transcript_path,
            metadata=document.metadata,
        )
        for value, document in ranked[:limit]
    ]
|
|
249
|
+
|
|
250
|
+
|
|
251
|
+
def search_from_index(
    query: str,
    *,
    doc_keys: list[str],
    term_freqs: list[dict[str, int]],
    doc_freqs: dict[str, int],
    doc_lens: list[int],
    avgdl: float,
    k1: float = 1.5,
    b: float = 0.75,
    content_loader: Callable[[list[str]], dict[str, str]],
    doc_metadata: dict[str, SearchDocumentMeta],
    limit: int = DEFAULT_LIMIT,
) -> list[SearchResult]:
    """Search using a pre-computed persistent BM25 index.

    This is the fast path: the precomputed structures are handed straight to
    BM25.from_precomputed(), only scoring runs, and content is loaded solely
    for the top-K results in order to build their snippets.

    Args:
        query: Search query string.
        doc_keys: Positional document keys (transcript_paths) matching term_freqs/doc_lens.
        term_freqs: Per-document term frequency dicts (positional).
        doc_freqs: Global document frequency dict.
        doc_lens: Per-document token counts (positional).
        avgdl: Average document length across corpus.
        k1: BM25 term saturation parameter.
        b: BM25 length normalization parameter.
        content_loader: Callable that takes a list of doc keys and returns {key: content}.
        doc_metadata: Mapping of transcript_path -> SearchDocumentMeta.
        limit: Maximum number of results.

    Returns:
        List of SearchResult sorted by score descending. Empty when the query
        is blank, yields no tokens, there are no doc keys, or nothing scores
        above zero.

    Raises:
        BM25IndexCorruptedError: If doc_keys has entries not in doc_metadata,
            or if doc_keys, term_freqs and doc_lens differ in length.
        ContentStoreCorruptedError: If content_loader is missing a top-K key.
    """
    if not query.strip() or not doc_keys:
        return []

    query_tokens = tokenize(query)
    if not query_tokens:
        return []

    # Validate invariant: every indexed doc must have metadata
    missing_meta = [k for k in doc_keys if k not in doc_metadata]
    if missing_meta:
        raise BM25IndexCorruptedError(
            "bm25_index",
            f"{len(missing_meta)} indexed documents missing from metadata store. "
            "Run 'forge search rebuild-index' to fix.",
        )

    # Validate invariant: the three positional arrays describe the same
    # documents. Without this check a mismatch is either silently truncated by
    # zip() below (scores paired with the wrong or missing keys) or surfaces as
    # a bare IndexError from inside the scorer.
    if not (len(doc_keys) == len(term_freqs) == len(doc_lens)):
        raise BM25IndexCorruptedError(
            "bm25_index",
            f"positional index arrays disagree: {len(doc_keys)} doc keys, "
            f"{len(term_freqs)} term-frequency entries, {len(doc_lens)} document lengths. "
            "Run 'forge search rebuild-index' to fix.",
        )

    # Score using precomputed data (no token iteration)
    bm25 = BM25.from_precomputed(
        term_freqs=term_freqs,
        doc_freqs=doc_freqs,
        doc_lens=doc_lens,
        avgdl=avgdl,
        k1=k1,
        b=b,
    )
    scores = bm25.score(query_tokens)

    # Pair scores with doc keys, filter zero scores, sort descending
    scored = [(s, key) for s, key in zip(scores, doc_keys) if s > 0]
    scored.sort(key=lambda x: x[0], reverse=True)
    top_k = scored[:limit]

    if not top_k:
        return []

    # Lazy content loading: only fetch content for top-K results
    top_keys = [key for _, key in top_k]
    content_map = content_loader(top_keys)

    # Validate content availability
    missing_content = [k for k in top_keys if k not in content_map]
    if missing_content:
        raise ContentStoreCorruptedError(
            "content",
            f"{len(missing_content)} top-K documents missing from content store. "
            "Run 'forge search rebuild-index' to fix.",
        )

    results: list[SearchResult] = []
    for s, key in top_k:
        meta = doc_metadata[key]
        results.append(
            SearchResult(
                session_name=meta.session_name,
                session_id=meta.session_id,
                score=round(s, 4),
                snippet=_best_snippet(content_map[key], query_tokens, doc_freqs=doc_freqs),
                transcript_path=meta.transcript_path,
                metadata=meta.metadata,
            )
        )
    return results
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
"""Exceptions for the Forge search module.
|
|
2
|
+
|
|
3
|
+
Follows the forge.core.state exception hierarchy:
|
|
4
|
+
- SearchError is the module-level base
|
|
5
|
+
- IndexStateCorruptedError inherits StateCorruptedError for consistency
|
|
6
|
+
with BackendRegistryCorruptedError and other state corruption errors
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
from forge.core.state import StateCorruptedError
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class SearchError(Exception):
    """Module-level base class for errors raised by search operations."""
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class IndexStateCorruptedError(StateCorruptedError):
    """Signals that the index state file could not be parsed.

    Adds nothing to StateCorruptedError; it is constructed with that base
    class's (path, reason) arguments.
    """
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class SearchDocumentStoreCorruptedError(StateCorruptedError):
    """Signals that the document store file could not be parsed.

    Adds nothing to StateCorruptedError; it is constructed with that base
    class's (path, reason) arguments.
    """
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class BM25IndexCorruptedError(StateCorruptedError):
    """Signals that the BM25 index file could not be parsed or is inconsistent.

    Adds nothing to StateCorruptedError; it is constructed with that base
    class's (path, reason) arguments.
    """
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
class ContentStoreCorruptedError(StateCorruptedError):
    """Signals that the content store file could not be parsed or is inconsistent.

    Adds nothing to StateCorruptedError; it is constructed with that base
    class's (path, reason) arguments.
    """
|
|
@@ -0,0 +1,234 @@
|
|
|
1
|
+
"""Content extraction from JSONL transcripts for search indexing.
|
|
2
|
+
|
|
3
|
+
Extracts searchable text from Forge transcript artifacts, producing one
|
|
4
|
+
SearchDocument per transcript file. Content extraction rules (design.md §5.5):
|
|
5
|
+
- User/assistant text messages: fully indexed
|
|
6
|
+
- Tool inputs (file paths, commands): truncated to 100 chars
|
|
7
|
+
- Tool results: truncated to 500 chars
|
|
8
|
+
|
|
9
|
+
Uses shared parsing primitives from forge.core.transcript.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
import json
|
|
15
|
+
import logging
|
|
16
|
+
from dataclasses import asdict, dataclass, field
|
|
17
|
+
from pathlib import Path
|
|
18
|
+
from typing import Any
|
|
19
|
+
|
|
20
|
+
from forge.core.state import now_iso
|
|
21
|
+
from forge.core.transcript import parse_jsonl_transcript, truncate
|
|
22
|
+
|
|
23
|
+
from .tokenizer import tokenize
|
|
24
|
+
|
|
25
|
+
# --- Data classes ---
|
|
26
|
+
|
|
27
|
+
logger = logging.getLogger(__name__)
|
|
28
|
+
|
|
29
|
+
# Truncation limits
|
|
30
|
+
TOOL_RESULT_TRUNCATE_CHARS = 500
|
|
31
|
+
TOOL_ARG_TRUNCATE_CHARS = 100
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
@dataclass
class SearchDocumentMeta:
    """Lightweight description of a search document, without content or tokens.

    Persisted by the v2 document store, and read by search_from_index() to
    fill in the fields of each result.
    """

    # Absolute transcript path; a plain string so it can serve as a JSON key.
    transcript_path: str
    session_name: str
    session_id: str
    # ISO8601 timestamp of the extraction.
    extracted_at: str
    # Free-form extra data; every instance gets its own dict by default.
    metadata: dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> dict[str, Any]:
        """Return the fields as a plain dict suitable for JSON storage."""
        return asdict(self)
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
@dataclass
class SearchDocument:
    """Complete extraction result for one transcript file.

    Carries the extracted text and its tokens alongside the identifying
    metadata. This is the form produced at extraction time; callers use
    decompose_document() to split it into metadata, term frequencies, and
    content for the three-store architecture.
    """

    # Absolute transcript path, stored as str so it is usable as a JSON key.
    transcript_path: str
    session_name: str
    session_id: str
    # Full extracted text that BM25 indexing operates on.
    content: str
    # ISO8601 timestamp recording when extraction ran.
    extracted_at: str
    # Extra per-document fields; every instance gets its own fresh dict.
    metadata: dict[str, Any] = field(default_factory=dict)
    # Tokenization cached at extraction time; None when not yet computed.
    tokens: list[str] | None = None

    def to_dict(self) -> dict[str, Any]:
        """Return all fields as a plain dict suitable for JSON storage."""
        payload: dict[str, Any] = asdict(self)
        return payload
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def extract_document(
    transcript_path: Path,
    session_name: str,
    session_id: str,
    worktree_path: str,
) -> SearchDocument:
    """Build a SearchDocument from one JSONL transcript file.

    Each parsed transcript entry is passed to _extract_entry_text(); entries
    that yield text become one "[role] text" line, and the lines are joined
    with newlines to form the indexed content. The content is tokenized once
    here and the tokens are cached on the returned document.

    Args:
        transcript_path: Absolute path to the .jsonl transcript file.
        session_name: Forge session name.
        session_id: Claude session UUID.
        worktree_path: Worktree path where the session ran.

    Returns:
        SearchDocument whose metadata records the number of entries that
        produced text ("message_count"), the first and last non-empty entry
        timestamps ("first_timestamp" / "last_timestamp", "" when none were
        seen), and "worktree_path".

    Raises:
        FileNotFoundError: If transcript_path is not an existing file.
    """
    if not transcript_path.is_file():
        raise FileNotFoundError(str(transcript_path))

    # One lazy pass over the parsed entries, keeping only those with text.
    messages = [
        item
        for item in map(_extract_entry_text, parse_jsonl_transcript(transcript_path))
        if item is not None
    ]
    lines = [f"[{role}] {text}" for role, text, _ in messages]
    # Only non-empty timestamps take part in the first/last bookkeeping.
    stamps = [stamp for _, _, stamp in messages if stamp]
    content = "\n".join(lines)

    return SearchDocument(
        transcript_path=str(transcript_path),
        session_name=session_name,
        session_id=session_id,
        content=content,
        extracted_at=now_iso(),
        metadata={
            "message_count": len(messages),
            "first_timestamp": stamps[0] if stamps else "",
            "last_timestamp": stamps[-1] if stamps else "",
            "worktree_path": worktree_path,
        },
        tokens=tokenize(content),
    )
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def _render_content_block(block: dict[str, Any]) -> str | None:
    """Render one message content block as searchable text, or None to skip it."""
    block_type = block.get("type")

    if block_type == "text":
        text = block.get("text")
        return text if isinstance(text, str) and text else None

    if block_type == "tool_use":
        name = block.get("name", "unknown")
        inp = block.get("input", {})
        if isinstance(inp, dict):
            # Index only the most identifying argument: a path, else a command.
            path = inp.get("file_path") or inp.get("path")
            if path:
                return f"{name}(path={truncate(str(path), TOOL_ARG_TRUNCATE_CHARS)})"
            cmd = inp.get("command")
            if cmd:
                return f"{name}(command={truncate(str(cmd), TOOL_ARG_TRUNCATE_CHARS)})"
        return f"{name}(...)"

    if block_type == "tool_result":
        result = block.get("content", "")
        # Handle non-string tool results (dict/list in some Claude versions)
        if not isinstance(result, str):
            try:
                result = json.dumps(result, ensure_ascii=False)
            except (TypeError, ValueError):
                result = str(result)
        if result:
            return f"[result: {truncate(result, TOOL_RESULT_TRUNCATE_CHARS)}]"
        return None

    # Any other block type contributes nothing to the index.
    return None


def _extract_entry_text(entry: dict[str, Any]) -> tuple[str, str, str] | None:
    """Extract text content from a single transcript entry.

    The entry must hold a dict under "message" whose "role" is "user" or
    "assistant". Its "content" may be a list of content blocks or a bare
    string; a bare string is treated as a single text block so plain-text
    messages are indexed rather than silently dropped.

    Per block:
    - "text": the text itself, when it is a non-empty string.
    - "tool_use": "name(path=...)" or "name(command=...)" with the argument
      truncated to TOOL_ARG_TRUNCATE_CHARS, else "name(...)".
    - "tool_result": "[result: ...]" truncated to TOOL_RESULT_TRUNCATE_CHARS;
      non-string results are JSON-encoded first.

    Args:
        entry: One parsed transcript entry.

    Returns:
        (role, text, timestamp) tuple, or None if entry is not a valid message
        or yields no text. Block texts are joined with single spaces, and
        timestamp is "" when the entry has no string "timestamp".
    """
    message = entry.get("message")
    if not isinstance(message, dict):
        return None

    role = message.get("role")
    if role not in ("user", "assistant"):
        return None

    content = message.get("content")
    if isinstance(content, str):
        # A bare string is shorthand for one text block in the Messages format.
        content = [{"type": "text", "text": content}]
    if not isinstance(content, list):
        return None

    text_parts: list[str] = []
    for block in content:
        if not isinstance(block, dict):
            continue
        rendered = _render_content_block(block)
        if rendered is not None:
            text_parts.append(rendered)

    if not text_parts:
        return None

    timestamp = entry.get("timestamp", "")
    if not isinstance(timestamp, str):
        timestamp = ""

    return role, " ".join(text_parts), timestamp
|
|
206
|
+
|
|
207
|
+
|
|
208
|
+
# --- Decomposition (full document → three-store components) ---
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
def decompose_document(
    doc: SearchDocument,
) -> tuple[SearchDocumentMeta, dict[str, int], int, str]:
    """Split a full SearchDocument into its three-store components.

    Args:
        doc: Document produced at extraction time.

    Returns:
        (metadata, term_freq, doc_len, content) where:
        - metadata: SearchDocumentMeta for the document store
        - term_freq: mapping of each token to its occurrence count, for the
          BM25 index store
        - doc_len: total token count, for BM25 length normalization
        - content: the document's raw content string, for the content store
    """
    # Reuse the tokens cached on the document; tokenize only when absent.
    if doc.tokens is None:
        token_list = tokenize(doc.content)
    else:
        token_list = doc.tokens

    counts: dict[str, int] = {}
    for term in token_list:
        if term in counts:
            counts[term] += 1
        else:
            counts[term] = 1

    store_meta = SearchDocumentMeta(
        transcript_path=doc.transcript_path,
        session_name=doc.session_name,
        session_id=doc.session_id,
        extracted_at=doc.extracted_at,
        # Same dict object as on doc, not a copy.
        metadata=doc.metadata,
    )
    return store_meta, counts, len(token_list), doc.content
|