code-context-control 2.28.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cli/__init__.py +1 -0
- cli/_hook_utils.py +99 -0
- cli/c3.py +6152 -0
- cli/commands/__init__.py +1 -0
- cli/commands/common.py +312 -0
- cli/commands/parser.py +286 -0
- cli/docs.html +3178 -0
- cli/edits.html +878 -0
- cli/hook_auto_snapshot.py +142 -0
- cli/hook_c3_signal.py +61 -0
- cli/hook_c3read.py +116 -0
- cli/hook_edit_ledger.py +213 -0
- cli/hook_edit_unlock.py +170 -0
- cli/hook_filter.py +130 -0
- cli/hook_ghost_files.py +238 -0
- cli/hook_pretool_enforce.py +334 -0
- cli/hook_read.py +200 -0
- cli/hook_session_stats.py +62 -0
- cli/hook_terse_advisor.py +190 -0
- cli/hub.html +3764 -0
- cli/hub_server.py +1619 -0
- cli/mcp_proxy.py +428 -0
- cli/mcp_server.py +660 -0
- cli/server.py +2985 -0
- cli/tools/__init__.py +4 -0
- cli/tools/_helpers.py +65 -0
- cli/tools/agent.py +1165 -0
- cli/tools/compress.py +215 -0
- cli/tools/delegate.py +1184 -0
- cli/tools/edit.py +313 -0
- cli/tools/edits.py +118 -0
- cli/tools/filter.py +285 -0
- cli/tools/impact.py +163 -0
- cli/tools/memory.py +469 -0
- cli/tools/read.py +224 -0
- cli/tools/search.py +337 -0
- cli/tools/session.py +95 -0
- cli/tools/shell.py +193 -0
- cli/tools/status.py +306 -0
- cli/tools/validate.py +310 -0
- cli/ui/api.js +36 -0
- cli/ui/app.js +207 -0
- cli/ui/components/chat.js +758 -0
- cli/ui/components/dashboard.js +689 -0
- cli/ui/components/edits.js +220 -0
- cli/ui/components/instructions.js +481 -0
- cli/ui/components/memory.js +626 -0
- cli/ui/components/sessions.js +606 -0
- cli/ui/components/settings.js +1404 -0
- cli/ui/components/sidebar.js +156 -0
- cli/ui/icons.js +51 -0
- cli/ui/shared.js +119 -0
- cli/ui/theme.js +22 -0
- cli/ui.html +168 -0
- cli/ui_legacy.html +6797 -0
- cli/ui_nano.html +503 -0
- code_context_control-2.28.0.dist-info/METADATA +248 -0
- code_context_control-2.28.0.dist-info/RECORD +150 -0
- code_context_control-2.28.0.dist-info/WHEEL +5 -0
- code_context_control-2.28.0.dist-info/entry_points.txt +4 -0
- code_context_control-2.28.0.dist-info/licenses/LICENSE +201 -0
- code_context_control-2.28.0.dist-info/top_level.txt +5 -0
- core/__init__.py +75 -0
- core/config.py +269 -0
- core/ide.py +188 -0
- oracle/__init__.py +1 -0
- oracle/config.py +75 -0
- oracle/oracle.html +3900 -0
- oracle/oracle_server.py +663 -0
- oracle/services/__init__.py +1 -0
- oracle/services/c3_bridge.py +210 -0
- oracle/services/chat_engine.py +1103 -0
- oracle/services/chat_store.py +155 -0
- oracle/services/cross_memory.py +154 -0
- oracle/services/federated_graph.py +463 -0
- oracle/services/health_checker.py +117 -0
- oracle/services/insight_engine.py +307 -0
- oracle/services/memory_reader.py +106 -0
- oracle/services/memory_writer.py +182 -0
- oracle/services/ollama_bridge.py +332 -0
- oracle/services/project_scanner.py +87 -0
- oracle/services/review_agent.py +206 -0
- services/__init__.py +1 -0
- services/activity_log.py +93 -0
- services/agent_base.py +124 -0
- services/agents.py +1529 -0
- services/auto_memory.py +407 -0
- services/bench/__init__.py +6 -0
- services/bench/external/__init__.py +29 -0
- services/bench/external/aider_polyglot.py +405 -0
- services/bench/external/swe_bench.py +485 -0
- services/benchmark_dashboard.py +596 -0
- services/claude_md.py +785 -0
- services/compressor.py +592 -0
- services/context_snapshot.py +356 -0
- services/conversation_store.py +870 -0
- services/doc_index.py +537 -0
- services/e2e_benchmark.py +2884 -0
- services/e2e_evaluator.py +396 -0
- services/e2e_tasks.py +743 -0
- services/edit_ledger.py +459 -0
- services/embedding_index.py +341 -0
- services/error_reporting.py +123 -0
- services/file_memory.py +734 -0
- services/hub_service.py +585 -0
- services/indexer.py +712 -0
- services/memory.py +318 -0
- services/memory_consolidator.py +538 -0
- services/memory_graph.py +382 -0
- services/memory_grounder.py +304 -0
- services/memory_scorer.py +246 -0
- services/metrics.py +86 -0
- services/notifications.py +209 -0
- services/ollama_client.py +201 -0
- services/output_filter.py +488 -0
- services/parser.py +1238 -0
- services/project_manager.py +579 -0
- services/protocol.py +306 -0
- services/proxy_state.py +152 -0
- services/retrieval_broker.py +129 -0
- services/router.py +414 -0
- services/runtime.py +326 -0
- services/session_benchmark.py +1945 -0
- services/session_manager.py +1026 -0
- services/session_preloader.py +251 -0
- services/text_index.py +90 -0
- services/tool_classifier.py +176 -0
- services/transcript_index.py +340 -0
- services/validation_cache.py +155 -0
- services/vector_store.py +299 -0
- services/version_tracker.py +271 -0
- services/watcher.py +192 -0
- tui/__init__.py +0 -0
- tui/backend.py +59 -0
- tui/main.py +145 -0
- tui/screens/__init__.py +1 -0
- tui/screens/benchmark_view.py +109 -0
- tui/screens/claudemd_view.py +46 -0
- tui/screens/compress_view.py +52 -0
- tui/screens/index_view.py +74 -0
- tui/screens/init_view.py +82 -0
- tui/screens/mcp_view.py +73 -0
- tui/screens/optimize_view.py +41 -0
- tui/screens/pipe_view.py +46 -0
- tui/screens/projects_view.py +355 -0
- tui/screens/search_view.py +55 -0
- tui/screens/session_view.py +143 -0
- tui/screens/stats.py +158 -0
- tui/screens/ui_view.py +54 -0
- tui/theme.tcss +335 -0
|
@@ -0,0 +1,870 @@
|
|
|
1
|
+
"""
|
|
2
|
+
ConversationStore — records and searches full conversation turns.
|
|
3
|
+
|
|
4
|
+
Storage layout under .c3/conversations/:
|
|
5
|
+
sessions.json — session metadata index
|
|
6
|
+
{session_id}.jsonl — full turn records (one JSON object per line)
|
|
7
|
+
{session_id}.jsonl.gz — gzip-compressed archive for old sessions
|
|
8
|
+
|
|
9
|
+
Sources:
|
|
10
|
+
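- Claude Code: auto-synced from ~/.claude/projects/<slug>/*.jsonl
- Gemini CLI: auto-synced from ~/.gemini/tmp/<sha256 of project path>/chats/*.json
- Imports: generic JSONL files under .c3/conversations/imports/<source>/*.jsonl
- All IDEs: manual logging via add_turn() (called by c3_convo_log MCP tool)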
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
import gzip
|
|
15
|
+
import json
|
|
16
|
+
import math
|
|
17
|
+
import re
|
|
18
|
+
import time
|
|
19
|
+
from collections import Counter
|
|
20
|
+
from datetime import datetime
|
|
21
|
+
from pathlib import Path
|
|
22
|
+
|
|
23
|
+
from core import count_tokens
|
|
24
|
+
from services.text_index import TextIndex
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class ConversationStore:
    """Persist, index, and search complete conversation turns for one project."""

    # Sessions whose last turn is older than this many days get gzip-archived.
    COMPRESS_AFTER_DAYS = 30
    # Parsing a transcript stops once this many turns have been collected.
    MAX_TURNS_PER_SESSION = 1000
    MAX_TEXT_LEN = 50000  # characters kept per turn (preserve full assistant outputs)
    MAX_SEARCH_TEXT = 1200  # characters used per chunk for TF-IDF scoring
    # Only the newest N transcript files of a provider are examined per sync.
    MAX_TRANSCRIPT_FILES = 100
    # Chunk size / overlap for search indexing — presumably consumed by the
    # index builder, which is outside this excerpt; verify there.
    SEARCH_CHUNK_CHARS = 1200
    SEARCH_CHUNK_OVERLAP = 200

    def __init__(self, project_path: str):
        """Bind the store to *project_path* and create its storage directory.

        Turn files and ``sessions.json`` live under
        ``<project_path>/.c3/conversations``; the directory is created if
        it does not exist yet.
        """
        root = Path(project_path).resolve()
        conversations = root / ".c3" / "conversations"
        conversations.mkdir(parents=True, exist_ok=True)

        self.project_path = root
        self.store_dir = conversations
        self._sessions_file = conversations / "sessions.json"
        self._sessions: list = []  # in-memory cache, cleared on write

        # The search index starts empty and is flagged dirty.
        self._search_index = TextIndex()
        self._search_meta: dict[str, dict] = {}
        self._search_dirty = True
|
|
47
|
+
|
|
48
|
+
# ── Public API ──────────────────────────────────────────────────────────
|
|
49
|
+
|
|
50
|
+
def sync(self, source: str = "all", force: bool = False) -> dict:
    """Pull conversations from the requested transcript sources into the store.

    ``source`` selects what to sync:
      - "all" (default): every supported source that is available
      - "claude": Claude Code transcripts only
      - "gemini": Gemini CLI transcripts only
      - "imports": external files under .c3/conversations/imports only

    ``force`` is passed through to each provider.

    Returns a dict with ``synced``, ``total``, ``by_source``,
    ``requested_source``, ``forced`` and ``available_sources``; the
    ``warnings`` and ``errors`` lists are included only when non-empty.
    """
    selected = self._normalize_source(source)

    imports_root = self.store_dir / "imports"
    availability = {
        "claude": bool(self._find_transcript_dir()),
        "gemini": bool(self._find_gemini_transcript_dir()),
        "imports": bool(imports_root.exists() and any(imports_root.rglob("*.jsonl"))),
    }

    runners = {
        "claude": self._sync_claude,
        "gemini": self._sync_gemini,
        "imports": self._sync_imports,
    }
    # Warning emitted when a source was requested by name but is not there.
    missing_when_requested = {
        "claude": "Claude transcript directory not found for this project",
        "gemini": "Gemini transcript directory not found for this project",
        "imports": "No import transcripts found under .c3/conversations/imports",
    }

    providers = []
    warnings = []
    for name in ("claude", "gemini", "imports"):
        if selected not in ("all", name):
            continue
        if availability[name]:
            providers.append((name, runners[name]))
        elif selected == name:
            warnings.append(missing_when_requested[name])
        elif name == "claude" and not availability["gemini"]:
            # "all" was requested and neither Claude nor Gemini is present:
            # say what happened to the Claude source.
            if availability["imports"]:
                warnings.append("Claude transcript directory not found; synced imports source instead")
            else:
                warnings.append("Claude transcript directory not found; skipped claude source")

    total_synced = 0
    by_source = {}
    errors = []
    for name, run in providers:
        try:
            count = int(run(force=force) or 0)
        except Exception as exc:
            # A failing provider is reported but does not stop the others.
            by_source[name] = 0
            errors.append(f"{name}: {str(exc)[:120]}")
        else:
            by_source[name] = count
            total_synced += count

    self._compress_old()

    result = {
        "synced": total_synced,
        "total": len(self._load_sessions()),
        "by_source": by_source,
        "requested_source": selected,
        "forced": bool(force),
        "available_sources": availability,
    }
    if warnings:
        result["warnings"] = warnings
    if errors:
        result["errors"] = errors
    self._search_dirty = True
    return result
|
|
125
|
+
|
|
126
|
+
def list_sessions(self, limit: int = 100) -> list:
    """Return session metadata sorted by most recent first.

    Args:
        limit: Maximum number of sessions to return (applied as a slice).

    Returns:
        A new list of session dicts ordered by ``started``, newest first.
        Sessions whose ``started`` is missing or ``None`` sort as 0.
    """
    # sorted() builds a new list, so whatever list _load_sessions() hands
    # back (possibly a shared cache) keeps its own order. ``or 0`` keeps a
    # None timestamp from raising TypeError during comparison.
    ordered = sorted(
        self._load_sessions(),
        key=lambda s: s.get("started") or 0,
        reverse=True,
    )
    return ordered[:limit]
|
|
131
|
+
|
|
132
|
+
def get_session(self, session_id: str, offset: int = 0, limit: int = None) -> list:
    """Return the turns of one session, optionally as a page.

    A negative ``offset`` is treated as 0. When ``limit`` is ``None`` or
    not positive, everything from ``offset`` onward is returned.
    """
    all_turns = self._read_turns(session_id)
    start = max(offset, 0)
    unbounded = limit is None or limit <= 0
    stop = None if unbounded else start + limit
    return all_turns[start:stop]
|
|
140
|
+
|
|
141
|
+
def add_turn(self, session_id: str, role: str, text: str,
             tool_calls: list = None, ts: float = None, source: str = "manual") -> dict:
    """Manually append a single turn to a session (for non-Claude-Code IDEs).

    The turn is appended to ``<session_id>.jsonl`` and the session index is
    updated; the session's index entry is created if it does not exist yet.

    Args:
        session_id: Session to append to.
        role: Role of the turn; only "user" and "assistant" contribute to
            the per-role token totals.
        text: Turn text. ``None`` is treated as an empty string. The stored
            text is truncated to ``MAX_TEXT_LEN`` characters; the token
            count is taken from the untruncated text.
        tool_calls: Optional list stored under ``tool_calls`` when non-empty.
        ts: Timestamp of the turn; a falsy value means "now".
        source: Origin label, passed through ``_normalize_source``.

    Returns:
        The turn dict that was written.
    """
    ts = ts or time.time()
    text = text or ""
    tokens = count_tokens(text)
    normalized_source = self._normalize_source(source)
    headline = text[:100].replace("\n", " ")
    placeholder_title = session_id[:24]

    turn = {
        "id": f"t{int(ts * 1000)}",
        "ts": ts,
        "role": role,
        "text": text[:self.MAX_TEXT_LEN],
        "tokens": tokens,
        "source": normalized_source,
    }
    if tool_calls:
        turn["tool_calls"] = tool_calls

    # Append to JSONL.
    # NOTE(review): if a "<session_id>.jsonl.gz" archive exists, readers
    # that prefer the archive will not see this appended turn — confirm how
    # archived sessions are meant to be reopened.
    session_file = self.store_dir / f"{session_id}.jsonl"
    with open(session_file, "a", encoding="utf-8") as f:
        f.write(json.dumps(turn, ensure_ascii=False) + "\n")

    # Update session index
    user_tokens = tokens if role == "user" else 0
    assistant_tokens = tokens if role == "assistant" else 0
    sessions = self._load_sessions()
    existing = next((s for s in sessions if s["session_id"] == session_id), None)
    if existing:
        existing["turns"] = existing.get("turns", 0) + 1
        existing["ended"] = ts
        existing["user_tokens"] = existing.get("user_tokens", 0) + user_tokens
        existing["assistant_tokens"] = existing.get("assistant_tokens", 0) + assistant_tokens
        if source and existing.get("source") in (None, "", "manual"):
            existing["source"] = normalized_source
        # The original check (turn count == 1 after the increment) only
        # fires for a session recorded with zero turns, so a session that
        # began with an assistant turn kept its placeholder title forever.
        # Also accept "still untitled / still the session-id placeholder".
        is_first_turn = existing["turns"] == 1
        untitled = existing.get("title") in (None, "", placeholder_title)
        if role == "user" and (is_first_turn or (untitled and headline)):
            existing["title"] = headline
    else:
        sessions.append({
            "session_id": session_id,
            "title": headline if role == "user" else placeholder_title,
            "source": normalized_source,
            "source_file": None,
            "source_mtime": 0,
            "started": ts,
            "ended": ts,
            "turns": 1,
            "user_tokens": user_tokens,
            "assistant_tokens": assistant_tokens,
            "compressed": False,
        })
    self._save_sessions(sessions)
    self._search_dirty = True
    return turn
|
|
194
|
+
|
|
195
|
+
def search(self, query: str, limit: int = 30, session_id: str = None) -> list:
    """TF-IDF search over chunked conversation turns.

    Args:
        query: Search text handed to the text index.
        limit: Maximum number of results; a value <= 0 yields ``[]``.
        session_id: When given, only hits from that session are returned.

    Returns:
        A list of result dicts in the index's ranking order, at most one
        per turn (the first-ranked chunk of a turn wins).
    """
    if limit <= 0:
        # The loop below appends before it checks the limit, so without
        # this guard a zero/negative limit would still return one hit.
        return []

    self._ensure_search_index()
    if not self._search_meta:
        return []

    # Over-fetch because several chunks can collapse into one turn.
    top_k = max(limit * 4, 20)
    if session_id:
        # Filtering happens after ranking: ask for every indexed chunk so
        # matches in the requested session are not crowded out by
        # higher-ranked chunks from other sessions.
        top_k = max(top_k, len(self._search_meta))

    copied_fields = (
        "session_id", "session_title", "source", "ts", "role",
        "text", "snippet", "tokens", "turn_source",
    )
    results = []
    seen_turns = set()
    for key, score in self._search_index.search(query, top_k=top_k):
        meta = self._search_meta.get(key)
        if not meta:
            continue
        if session_id and meta["session_id"] != session_id:
            continue
        turn_key = meta["turn_key"]
        if turn_key in seen_turns:
            continue
        seen_turns.add(turn_key)

        hit = {name: meta[name] for name in copied_fields}
        hit["turn_key"] = turn_key
        hit["chunk_key"] = key
        hit["chunk_index"] = meta["chunk_index"]
        hit["score"] = round(score, 4)
        results.append(hit)
        if len(results) >= limit:
            break
    return results
|
|
232
|
+
|
|
233
|
+
def get_stats(self) -> dict:
    """Summarise the session index: counts, token totals and per-source tally."""
    sessions = self._load_sessions()

    # Tally sources first (missing source counts as "manual").
    source_counts = Counter()
    for meta in sessions:
        source_counts[self._normalize_source(meta.get("source", "manual"))] += 1

    turn_total = 0
    user_total = 0
    assistant_total = 0
    compressed_total = 0
    for meta in sessions:
        turn_total += meta.get("turns", 0)
        user_total += meta.get("user_tokens", 0)
        assistant_total += meta.get("assistant_tokens", 0)
        if meta.get("compressed"):
            compressed_total += 1

    return {
        "sessions": len(sessions),
        "turns": turn_total,
        "user_tokens": user_total,
        "assistant_tokens": assistant_total,
        "compressed_sessions": compressed_total,
        "sources": dict(source_counts),
    }
|
|
245
|
+
|
|
246
|
+
# ── Transcript Parsing ─────────────────────────────────────────────────
|
|
247
|
+
|
|
248
|
+
def _find_transcript_dir(self):
|
|
249
|
+
"""Locate Claude Code's project transcript directory."""
|
|
250
|
+
import re as _re
|
|
251
|
+
home = Path.home()
|
|
252
|
+
projects_dir = home / ".claude" / "projects"
|
|
253
|
+
if not projects_dir.exists():
|
|
254
|
+
return None
|
|
255
|
+
|
|
256
|
+
project_str = str(self.project_path)
|
|
257
|
+
|
|
258
|
+
# Claude Code slugifies the absolute path by replacing every
|
|
259
|
+
# non-alphanumeric character with '-' and stripping leading dashes.
|
|
260
|
+
slug = _re.sub(r"[^a-zA-Z0-9]", "-", project_str).lstrip("-")
|
|
261
|
+
direct = projects_dir / slug
|
|
262
|
+
if direct.exists():
|
|
263
|
+
return direct
|
|
264
|
+
|
|
265
|
+
# Fallback 1: old slug algorithm (kept for backwards compat with
|
|
266
|
+
# directories created by earlier versions of C3 or other tools).
|
|
267
|
+
old_slug = project_str.replace("\\", "--").replace("/", "--").replace(":", "").lstrip("-")
|
|
268
|
+
if old_slug != slug:
|
|
269
|
+
old_direct = projects_dir / old_slug
|
|
270
|
+
if old_direct.exists():
|
|
271
|
+
return old_direct
|
|
272
|
+
|
|
273
|
+
# Fallback 2: normalize both sides to bare alphanumerics and find the
|
|
274
|
+
# best-matching directory (handles edge-cases in slugification variants).
|
|
275
|
+
def _bare(s):
|
|
276
|
+
return _re.sub(r"[^a-z0-9]", "", s.lower())
|
|
277
|
+
|
|
278
|
+
target_bare = _bare(project_str)
|
|
279
|
+
best = None
|
|
280
|
+
best_len = 0
|
|
281
|
+
for d in projects_dir.iterdir():
|
|
282
|
+
if not d.is_dir():
|
|
283
|
+
continue
|
|
284
|
+
d_bare = _bare(d.name)
|
|
285
|
+
# Must share all alphanumeric chars of the project path
|
|
286
|
+
if d_bare == target_bare:
|
|
287
|
+
n = len(list(d.glob("*.jsonl")))
|
|
288
|
+
if n > best_len:
|
|
289
|
+
best = d
|
|
290
|
+
best_len = n
|
|
291
|
+
return best
|
|
292
|
+
|
|
293
|
+
def _sync_claude(self, force: bool = False) -> int:
    """Sync transcripts from Claude Code.

    Only the ``MAX_TRANSCRIPT_FILES`` most recently modified ``*.jsonl``
    files are considered. A file is skipped when a Claude session already
    recorded for it has a ``source_mtime`` within one second of the file's
    mtime, unless ``force`` is set.

    Returns:
        The number of sessions written during this call.
    """
    transcript_dir = self._find_transcript_dir()
    if not transcript_dir:
        return 0

    jsonl_files = sorted(
        transcript_dir.glob("*.jsonl"),
        key=lambda f: f.stat().st_mtime,
        reverse=True,
    )[:self.MAX_TRANSCRIPT_FILES]

    # Recorded mtimes of sessions that came from Claude. Sessions from
    # other sources are excluded so an import file that happens to share a
    # name cannot shadow (or be mistaken for) the Claude entry; this
    # matches how _sync_imports keys its lookup by source. Entries with no
    # source at all are still accepted.
    known_mtimes = {}
    for meta in self._load_sessions():
        recorded = meta.get("source_file")
        if not recorded:
            continue
        origin = meta.get("source")
        if origin and self._normalize_source(origin) != "claude":
            continue
        # ``or 0`` keeps a null mtime from raising in the comparison below,
        # which would otherwise make the file permanently unsyncable.
        known_mtimes[recorded] = meta.get("source_mtime") or 0

    synced = 0
    for jf in jsonl_files:
        try:
            mtime = jf.stat().st_mtime
            source_file = jf.name
            if (not force) and source_file in known_mtimes and abs(known_mtimes[source_file] - mtime) < 1:
                continue

            turns = self._extract_turns(jf)
            if not turns:
                continue
            for t in turns:
                t["source"] = "claude"

            self._upsert_synced_session(
                session_id=jf.stem,
                turns=turns,
                source="claude",
                source_file=source_file,
                source_mtime=mtime,
            )
            synced += 1
        except Exception:
            # Deliberately best-effort: one unreadable transcript must not
            # abort the remaining files.
            continue
    return synced
|
|
337
|
+
|
|
338
|
+
def _find_gemini_transcript_dir(self):
|
|
339
|
+
"""Locate Gemini CLI's project transcript directory."""
|
|
340
|
+
import hashlib
|
|
341
|
+
home = Path.home()
|
|
342
|
+
project_str = str(self.project_path)
|
|
343
|
+
slug = hashlib.sha256(project_str.encode('utf-8')).hexdigest()
|
|
344
|
+
chats_dir = home / ".gemini" / "tmp" / slug / "chats"
|
|
345
|
+
if chats_dir.exists() and chats_dir.is_dir():
|
|
346
|
+
return chats_dir
|
|
347
|
+
return None
|
|
348
|
+
|
|
349
|
+
def _sync_gemini(self, force: bool = False) -> int:
    """Sync transcripts from Gemini CLI.

    Only the ``MAX_TRANSCRIPT_FILES`` most recently modified ``*.json``
    files are considered. A file is skipped when a Gemini session already
    recorded for it has a ``source_mtime`` within one second of the file's
    mtime, unless ``force`` is set. The session id is the file's
    ``sessionId`` value, falling back to the file stem.

    Returns:
        The number of sessions written during this call.
    """
    transcript_dir = self._find_gemini_transcript_dir()
    if not transcript_dir:
        return 0

    json_files = sorted(
        transcript_dir.glob("*.json"),
        key=lambda f: f.stat().st_mtime,
        reverse=True,
    )[:self.MAX_TRANSCRIPT_FILES]

    # Recorded mtimes of sessions that came from Gemini only, mirroring the
    # source-qualified lookup used by _sync_imports. Entries with no source
    # at all are still accepted.
    known_mtimes = {}
    for meta in self._load_sessions():
        recorded = meta.get("source_file")
        if not recorded:
            continue
        origin = meta.get("source")
        if origin and self._normalize_source(origin) != "gemini":
            continue
        # ``or 0`` keeps a null mtime from raising in the comparison below.
        known_mtimes[recorded] = meta.get("source_mtime") or 0

    synced = 0
    for jf in json_files:
        try:
            mtime = jf.stat().st_mtime
            source_file = jf.name
            if (not force) and source_file in known_mtimes and abs(known_mtimes[source_file] - mtime) < 1:
                continue

            turns = self._extract_turns_gemini(jf)
            if not turns:
                continue

            # The file is read a second time only to pick up its session id.
            with open(jf, "r", encoding="utf-8") as f:
                data = json.load(f)
            session_id = jf.stem
            if isinstance(data, dict) and data.get("sessionId"):
                # A null/empty sessionId must not become the id (and thus
                # the file name) of the stored session.
                session_id = str(data["sessionId"])

            self._upsert_synced_session(
                session_id=session_id,
                turns=turns,
                source="gemini",
                source_file=source_file,
                source_mtime=mtime,
            )
            synced += 1
        except Exception:
            # Deliberately best-effort: one unreadable transcript must not
            # abort the remaining files.
            continue
    return synced
|
|
394
|
+
|
|
395
|
+
def _extract_turns_gemini(self, json_path: Path) -> list:
|
|
396
|
+
turns = []
|
|
397
|
+
try:
|
|
398
|
+
with open(json_path, "r", encoding="utf-8") as f:
|
|
399
|
+
data = json.load(f)
|
|
400
|
+
|
|
401
|
+
messages = data.get("messages", [])
|
|
402
|
+
for i, msg in enumerate(messages):
|
|
403
|
+
role = "assistant" if msg.get("type") == "gemini" else msg.get("type", "user")
|
|
404
|
+
text = msg.get("content", "")
|
|
405
|
+
ts_raw = msg.get("timestamp")
|
|
406
|
+
|
|
407
|
+
ts = json_path.stat().st_mtime
|
|
408
|
+
if ts_raw:
|
|
409
|
+
try:
|
|
410
|
+
ts_raw = ts_raw.replace("Z", "+00:00")
|
|
411
|
+
dt = datetime.fromisoformat(ts_raw)
|
|
412
|
+
ts = dt.timestamp()
|
|
413
|
+
except:
|
|
414
|
+
pass
|
|
415
|
+
|
|
416
|
+
tokens = 0
|
|
417
|
+
tok_data = msg.get("tokens", {})
|
|
418
|
+
if isinstance(tok_data, dict):
|
|
419
|
+
tokens = tok_data.get("output", 0) if role == "assistant" else tok_data.get("input", 0)
|
|
420
|
+
if not tokens:
|
|
421
|
+
tokens = count_tokens(text)
|
|
422
|
+
|
|
423
|
+
turns.append({
|
|
424
|
+
"id": msg.get("id", f"t{int(ts * 1000)}_{i}"),
|
|
425
|
+
"ts": ts,
|
|
426
|
+
"role": role,
|
|
427
|
+
"text": text[:self.MAX_TEXT_LEN],
|
|
428
|
+
"tokens": tokens,
|
|
429
|
+
"source": "gemini",
|
|
430
|
+
"tool_calls": []
|
|
431
|
+
})
|
|
432
|
+
except Exception:
|
|
433
|
+
pass
|
|
434
|
+
return turns
|
|
435
|
+
|
|
436
|
+
def _sync_imports(self, force: bool = False) -> int:
    """Sync generic JSONL imports found under .c3/conversations/imports.

    Files are discovered recursively and the ``MAX_TRANSCRIPT_FILES`` most
    recently modified ones are processed. The first path component below
    the imports root names the source; a file placed directly in the root
    gets the source "imports". Returns the number of sessions written.
    """
    imports_root = self.store_dir / "imports"
    if not imports_root.exists():
        return 0

    candidates = list(imports_root.rglob("*.jsonl"))
    candidates.sort(key=lambda p: p.stat().st_mtime, reverse=True)

    # Index already-recorded sessions by "<source>::<relative file>".
    recorded = {}
    for meta in self._load_sessions():
        origin_file = meta.get("source_file")
        if origin_file:
            label = self._normalize_source(meta.get('source', 'manual'))
            recorded[f"{label}::{origin_file}"] = meta

    synced = 0
    for path in candidates[:self.MAX_TRANSCRIPT_FILES]:
        try:
            rel = path.relative_to(imports_root)
            top_level = rel.parts[0] if len(rel.parts) > 1 else "imports"
            source = self._normalize_source(top_level)
            mtime = path.stat().st_mtime
            source_file = str(rel).replace("\\", "/")

            previous = recorded.get(f"{source}::{source_file}")
            # Unchanged since the last sync (mtime within one second).
            if not force and previous is not None and abs(previous.get("source_mtime", 0) - mtime) < 1:
                continue

            turns = self._extract_turns_generic(path, source=source)
            if turns:
                # The source prefix keeps ids from colliding with Claude stem ids.
                self._upsert_synced_session(
                    session_id=f"{source}_{path.stem}",
                    turns=turns,
                    source=source,
                    source_file=source_file,
                    source_mtime=mtime,
                )
                synced += 1
        except Exception:
            continue
    return synced
|
|
477
|
+
|
|
478
|
+
def _upsert_synced_session(self, session_id: str, turns: list, source: str, source_file: str, source_mtime: float):
    """Store the turns of a synced session and upsert its index entry.

    The title is the first 100 characters of the first user turn (falling
    back to the first 24 characters of the session id); token totals are
    summed per role from the turns' ``tokens`` values.
    """
    # First user message, if any, supplies the title.
    first_user = ""
    for turn in turns:
        if turn.get("role") == "user":
            first_user = turn.get("text", "")
            break
    title = (first_user[:100].strip() or session_id[:24]).replace("\n", " ")

    user_tok = 0
    for turn in turns:
        if turn.get("role") == "user":
            user_tok += turn.get("tokens", 0)
    asst_tok = 0
    for turn in turns:
        if turn.get("role") == "assistant":
            asst_tok += turn.get("tokens", 0)

    normalized_source = self._normalize_source(source)
    if turns:
        started = turns[0]["ts"]
        ended = turns[-1]["ts"]
    else:
        started = time.time()
        ended = time.time()

    meta = {
        "session_id": session_id,
        "title": title,
        "source": normalized_source,
        "source_file": source_file,
        "source_mtime": source_mtime,
        "started": started,
        "ended": ended,
        "turns": len(turns),
        "user_tokens": user_tok,
        "assistant_tokens": asst_tok,
        "compressed": False,
    }
    self._write_turns(session_id, turns)
    self._upsert_session(meta)
|
|
500
|
+
|
|
501
|
+
def _extract_turns(self, jsonl_path: Path) -> list:
|
|
502
|
+
"""Parse a Claude Code JSONL file into a list of turn dicts."""
|
|
503
|
+
entries = []
|
|
504
|
+
try:
|
|
505
|
+
with open(jsonl_path, encoding="utf-8", errors="replace") as f:
|
|
506
|
+
for line in f:
|
|
507
|
+
line = line.strip()
|
|
508
|
+
if not line:
|
|
509
|
+
continue
|
|
510
|
+
try:
|
|
511
|
+
entries.append(json.loads(line))
|
|
512
|
+
except json.JSONDecodeError:
|
|
513
|
+
continue
|
|
514
|
+
except Exception:
|
|
515
|
+
return []
|
|
516
|
+
|
|
517
|
+
turns = []
|
|
518
|
+
turn_num = 0
|
|
519
|
+
for entry in entries:
|
|
520
|
+
etype = entry.get("type", "")
|
|
521
|
+
if etype in ("progress", "file-history-snapshot", "system"):
|
|
522
|
+
continue
|
|
523
|
+
|
|
524
|
+
role = entry.get("role", "")
|
|
525
|
+
msg = entry.get("message", {})
|
|
526
|
+
if isinstance(msg, dict):
|
|
527
|
+
role = role or msg.get("role", "")
|
|
528
|
+
|
|
529
|
+
if role not in ("user", "assistant"):
|
|
530
|
+
continue
|
|
531
|
+
|
|
532
|
+
text, tool_calls = self._extract_content(entry)
|
|
533
|
+
if not text and not tool_calls:
|
|
534
|
+
continue
|
|
535
|
+
|
|
536
|
+
turn_num += 1
|
|
537
|
+
ts_raw = entry.get("timestamp", "")
|
|
538
|
+
t = {
|
|
539
|
+
"id": f"t{turn_num:04d}",
|
|
540
|
+
"ts": self._parse_ts(ts_raw),
|
|
541
|
+
"role": role,
|
|
542
|
+
"text": text[:self.MAX_TEXT_LEN],
|
|
543
|
+
"tokens": count_tokens(text),
|
|
544
|
+
}
|
|
545
|
+
if tool_calls:
|
|
546
|
+
t["tool_calls"] = tool_calls
|
|
547
|
+
turns.append(t)
|
|
548
|
+
|
|
549
|
+
if len(turns) >= self.MAX_TURNS_PER_SESSION:
|
|
550
|
+
break
|
|
551
|
+
|
|
552
|
+
return turns
|
|
553
|
+
|
|
554
|
+
def _extract_turns_generic(self, jsonl_path: Path, source: str = "imports") -> list:
|
|
555
|
+
"""Parse generic JSONL transcripts from non-Claude systems.
|
|
556
|
+
|
|
557
|
+
Supported line shapes:
|
|
558
|
+
- {"role":"user|assistant","text":"...","ts":...}
|
|
559
|
+
- {"role":"...","content":"..."}
|
|
560
|
+
- {"message":{"role":"...","content":"..."},"timestamp":"..."}
|
|
561
|
+
- {"content":[{"type":"text","text":"..."}], "role":"..."}
|
|
562
|
+
"""
|
|
563
|
+
entries = []
|
|
564
|
+
try:
|
|
565
|
+
with open(jsonl_path, encoding="utf-8", errors="replace") as f:
|
|
566
|
+
for line in f:
|
|
567
|
+
line = line.strip()
|
|
568
|
+
if not line:
|
|
569
|
+
continue
|
|
570
|
+
try:
|
|
571
|
+
entries.append(json.loads(line))
|
|
572
|
+
except json.JSONDecodeError:
|
|
573
|
+
continue
|
|
574
|
+
except Exception:
|
|
575
|
+
return []
|
|
576
|
+
|
|
577
|
+
turns = []
|
|
578
|
+
turn_num = 0
|
|
579
|
+
for entry in entries:
|
|
580
|
+
if not isinstance(entry, dict):
|
|
581
|
+
continue
|
|
582
|
+
role = (entry.get("role") or "").strip().lower()
|
|
583
|
+
msg = entry.get("message", {})
|
|
584
|
+
if isinstance(msg, dict):
|
|
585
|
+
role = role or (msg.get("role") or "").strip().lower()
|
|
586
|
+
if role not in ("user", "assistant"):
|
|
587
|
+
continue
|
|
588
|
+
|
|
589
|
+
text, tool_calls = self._extract_content(entry)
|
|
590
|
+
if not text and isinstance(entry.get("text"), str):
|
|
591
|
+
text = entry.get("text", "")
|
|
592
|
+
if not text and isinstance(msg, dict):
|
|
593
|
+
mcontent = msg.get("content")
|
|
594
|
+
if isinstance(mcontent, str):
|
|
595
|
+
text = mcontent
|
|
596
|
+
|
|
597
|
+
if not text and not tool_calls:
|
|
598
|
+
continue
|
|
599
|
+
|
|
600
|
+
turn_num += 1
|
|
601
|
+
ts_raw = entry.get("ts", entry.get("timestamp", ""))
|
|
602
|
+
t = {
|
|
603
|
+
"id": f"t{turn_num:04d}",
|
|
604
|
+
"ts": self._parse_ts(ts_raw),
|
|
605
|
+
"role": role,
|
|
606
|
+
"text": (text or "")[:self.MAX_TEXT_LEN],
|
|
607
|
+
"tokens": count_tokens(text or ""),
|
|
608
|
+
"source": self._normalize_source(source),
|
|
609
|
+
}
|
|
610
|
+
if tool_calls:
|
|
611
|
+
t["tool_calls"] = tool_calls
|
|
612
|
+
turns.append(t)
|
|
613
|
+
if len(turns) >= self.MAX_TURNS_PER_SESSION:
|
|
614
|
+
break
|
|
615
|
+
return turns
|
|
616
|
+
|
|
617
|
+
def _extract_content(self, entry: dict):
|
|
618
|
+
"""Return (text_str, tool_calls_list) from a transcript entry."""
|
|
619
|
+
parts = []
|
|
620
|
+
tool_calls = []
|
|
621
|
+
|
|
622
|
+
content = entry.get("content", "")
|
|
623
|
+
msg = entry.get("message", {})
|
|
624
|
+
if isinstance(msg, dict):
|
|
625
|
+
content = content or msg.get("content", "")
|
|
626
|
+
|
|
627
|
+
if isinstance(content, str) and content:
|
|
628
|
+
parts.append(content)
|
|
629
|
+
elif isinstance(content, list):
|
|
630
|
+
for block in content:
|
|
631
|
+
if not isinstance(block, dict):
|
|
632
|
+
if isinstance(block, str):
|
|
633
|
+
parts.append(block)
|
|
634
|
+
continue
|
|
635
|
+
btype = block.get("type", "")
|
|
636
|
+
if btype == "text":
|
|
637
|
+
t = block.get("text", "")
|
|
638
|
+
if t:
|
|
639
|
+
parts.append(t)
|
|
640
|
+
elif btype == "tool_use":
|
|
641
|
+
name = block.get("name", "")
|
|
642
|
+
inp = block.get("input", {})
|
|
643
|
+
args_str = ""
|
|
644
|
+
if isinstance(inp, dict):
|
|
645
|
+
args_str = " ".join(
|
|
646
|
+
f"{k}={str(v)[:80]}" for k, v in list(inp.items())[:4]
|
|
647
|
+
)[:160]
|
|
648
|
+
tool_calls.append({"tool": name, "args": args_str})
|
|
649
|
+
# Skip tool_result and thinking blocks
|
|
650
|
+
|
|
651
|
+
# Preserve line boundaries so markdown lists/checkboxes remain parseable in UI.
|
|
652
|
+
return "\n".join(p for p in parts if p).strip(), tool_calls
|
|
653
|
+
|
|
654
|
+
@staticmethod
|
|
655
|
+
def _parse_ts(ts_raw) -> float:
|
|
656
|
+
if isinstance(ts_raw, (int, float)):
|
|
657
|
+
return float(ts_raw)
|
|
658
|
+
if isinstance(ts_raw, str) and ts_raw:
|
|
659
|
+
try:
|
|
660
|
+
dt = datetime.fromisoformat(ts_raw.replace("Z", "+00:00"))
|
|
661
|
+
return dt.timestamp()
|
|
662
|
+
except Exception:
|
|
663
|
+
pass
|
|
664
|
+
return time.time()
|
|
665
|
+
|
|
666
|
+
# ── Storage ────────────────────────────────────────────────────────────
|
|
667
|
+
|
|
668
|
+
def _write_turns(self, session_id: str, turns: list):
    """Overwrite the session's plain ``<session_id>.jsonl`` file.

    Each turn is written as one JSON document per line (non-ASCII characters
    are kept as-is). The search index is flagged dirty afterwards.
    """
    target = self.store_dir / f"{session_id}.jsonl"
    with open(target, "w", encoding="utf-8") as out:
        out.writelines(
            json.dumps(turn, ensure_ascii=False) + "\n" for turn in turns
        )
    self._search_dirty = True
|
|
674
|
+
|
|
675
|
+
def _read_turns(self, session_id: str) -> list:
    """Load the stored turns for a session.

    The gzip archive ``<session_id>.jsonl.gz`` takes precedence over the
    plain ``<session_id>.jsonl`` file. Blank lines and lines that are not
    valid JSON are skipped. Any error while opening or reading the file ends
    the read, and whatever was parsed up to that point is returned. When
    neither file exists the result is an empty list.
    """
    compressed = self.store_dir / f"{session_id}.jsonl.gz"
    uncompressed = self.store_dir / f"{session_id}.jsonl"
    records = []

    if compressed.exists():
        target, zipped = compressed, True
    elif uncompressed.exists():
        target, zipped = uncompressed, False
    else:
        return records

    try:
        if zipped:
            stream = gzip.open(target, "rt", encoding="utf-8")
        else:
            stream = open(target, encoding="utf-8")
        with stream:
            for raw in stream:
                raw = raw.strip()
                if not raw:
                    continue
                try:
                    records.append(json.loads(raw))
                except Exception:
                    # Best-effort: one corrupt line must not hide the rest.
                    pass
    except Exception:
        # Best-effort: an unreadable file yields what was parsed so far.
        pass

    return records
|
|
702
|
+
|
|
703
|
+
def _compress_old(self):
    """gzip-compress session files older than COMPRESS_AFTER_DAYS.

    A session is considered when it is not already flagged ``compressed`` and
    its ``ended`` timestamp (default 0) is not newer than the cutoff. Its
    plain ``<id>.jsonl`` file is compressed to ``<id>.jsonl.gz`` and the plain
    file is removed. If at least one session was compressed, the session list
    is saved and the search index is flagged dirty.

    The archive is written to a temporary sibling file and renamed into place
    only once complete. A half-written ``.gz`` would otherwise shadow the
    intact plain file, because readers prefer the archive and later passes
    skip sessions whose archive already exists. Failures on one session are
    swallowed so the remaining sessions are still processed.
    """
    import shutil  # local import: only this method needs it

    cutoff = time.time() - self.COMPRESS_AFTER_DAYS * 86400
    sessions = self._load_sessions()
    changed = False
    for s in sessions:
        if s.get("compressed") or s.get("ended", 0) > cutoff:
            continue
        # Entries without an id cannot be mapped to a file; skip them rather
        # than aborting the whole pass with a KeyError.
        sid = s.get("session_id")
        if not sid:
            continue
        plain = self.store_dir / f"{sid}.jsonl"
        gz = self.store_dir / f"{sid}.jsonl.gz"
        if not plain.exists() or gz.exists():
            continue

        tmp = gz.with_name(gz.name + ".tmp")
        try:
            with open(plain, "rb") as fin, gzip.open(tmp, "wb") as fout:
                # Stream in chunks instead of loading the whole file.
                shutil.copyfileobj(fin, fout)
            tmp.replace(gz)
        except Exception:
            # Best-effort: drop the partial archive and keep the plain file.
            try:
                tmp.unlink()
            except OSError:
                pass
            continue

        # The complete archive is in place, so the session is compressed
        # even if removing the now-redundant plain file fails.
        s["compressed"] = True
        changed = True
        try:
            plain.unlink()
        except OSError:
            pass

    if changed:
        self._save_sessions(sessions)
        self._search_dirty = True
|
|
727
|
+
|
|
728
|
+
def _upsert_session(self, meta: dict):
    """Insert or replace a session record, then persist the session list.

    The first stored record whose ``session_id`` equals ``meta["session_id"]``
    is replaced in place; when there is no such record, ``meta`` is appended.
    """
    records = self._load_sessions()
    match_at = next(
        (
            pos
            for pos, existing in enumerate(records)
            if existing["session_id"] == meta["session_id"]
        ),
        None,
    )
    if match_at is None:
        records.append(meta)
    else:
        records[match_at] = meta
    self._save_sessions(records)
|
|
737
|
+
|
|
738
|
+
def _load_sessions(self) -> list:
    """Return the session list, loading it from the sessions file if needed.

    A non-empty in-memory list is returned as-is. Otherwise the sessions file
    is read and each record's ``source`` is filled in (``"claude"`` when the
    record has a truthy ``source_file``, else ``"manual"``) and passed through
    ``self._normalize_source``. If any record changed, the list is written
    back via ``self._save_sessions``.

    A missing, unreadable or malformed file yields an empty list. A failure
    of the write-back alone does NOT discard the data: the loaded, normalized
    sessions are still kept in memory and returned.
    """
    if self._sessions:
        return self._sessions

    loaded = None
    changed = False
    if self._sessions_file.exists():
        try:
            with open(self._sessions_file, encoding="utf-8") as f:
                loaded = json.load(f)
            for s in loaded:
                src = s.get("source")
                if not src:
                    src = "claude" if s.get("source_file") else "manual"
                    s["source"] = src
                    changed = True
                norm = self._normalize_source(src)
                if norm != src:
                    s["source"] = norm
                    changed = True
        except Exception:
            # Best-effort: treat an unreadable or malformed file as empty.
            loaded = None

    if loaded is None:
        self._sessions = []
        return self._sessions

    self._sessions = loaded
    if changed:
        try:
            self._save_sessions(loaded)
        except Exception:
            # Persisting the migration is optional (e.g. read-only store);
            # keep the successfully loaded sessions instead of dropping them.
            self._sessions = loaded
    return self._sessions
|
|
764
|
+
|
|
765
|
+
def _ensure_search_index(self):
    """Rebuild the search index from stored turns if it is flagged dirty.

    Each turn's text is split with ``self._chunk_text``; every chunk becomes
    one document keyed ``"<session_id>:<turn id>:<chunk index>"``, truncated to
    ``MAX_SEARCH_TEXT`` characters, with a metadata record alongside it.
    Sessions without a ``session_id``, or whose turns cannot be read, are
    skipped. The documents are handed to ``self._search_index.rebuild``, the
    metadata is stored on ``self._search_meta`` and the dirty flag is cleared.
    """
    if not self._search_dirty:
        return

    documents = {}
    details = {}
    for sess in self._load_sessions():
        session_id = sess.get("session_id", "")
        if not session_id:
            continue
        try:
            session_turns = self._read_turns(session_id)
        except Exception:
            continue

        title = sess.get("title", "")
        origin = sess.get("source", "manual")
        for entry in session_turns:
            parent_key = f"{session_id}:{entry.get('id', '')}"
            full_text = entry.get("text", "")
            for idx, piece in enumerate(self._chunk_text(full_text)):
                key = f"{parent_key}:{idx}"
                documents[key] = piece[:self.MAX_SEARCH_TEXT]
                # Metadata keeps the untruncated chunk and the whole turn text.
                details[key] = {
                    "turn_key": parent_key,
                    "session_id": session_id,
                    "session_title": title,
                    "source": origin,
                    "ts": entry.get("ts", 0),
                    "role": entry.get("role", ""),
                    "text": full_text,
                    "snippet": piece,
                    "tokens": entry.get("tokens", 0),
                    "turn_source": entry.get("source", origin),
                    "chunk_index": idx,
                }

    self._search_index.rebuild(documents)
    self._search_meta = details
    self._search_dirty = False
|
|
800
|
+
|
|
801
|
+
def _save_sessions(self, sessions: list):
    """Cache *sessions* in memory and persist them to the sessions file.

    The list is written as indented JSON with non-ASCII characters kept
    as-is, and the search index is flagged dirty once the write succeeds.

    Raises:
        TypeError / ValueError: if *sessions* is not JSON-serializable. The
            existing sessions file is left untouched in that case.
        OSError: if the file cannot be written.
    """
    self._sessions = sessions
    # Serialize BEFORE opening the file: opening with "w" truncates it, so a
    # serialization error after that point would destroy the existing data.
    payload = json.dumps(sessions, ensure_ascii=False, indent=2)
    with open(self._sessions_file, "w", encoding="utf-8") as f:
        f.write(payload)
    self._search_dirty = True
|
|
806
|
+
|
|
807
|
+
def _chunk_text(self, text: str) -> list[str]:
    """Split *text* into overlapping windows for search indexing.

    The text is stripped first; empty or ``None`` input yields ``[]``. Text no
    longer than ``SEARCH_CHUNK_CHARS`` is returned as a single chunk. Longer
    text is cut into windows of ``SEARCH_CHUNK_CHARS`` characters whose start
    positions advance by ``SEARCH_CHUNK_CHARS - SEARCH_CHUNK_OVERLAP`` (at
    least 1), stopping once a window reaches the end of the text.
    """
    cleaned = (text or "").strip()
    if not cleaned:
        return []

    size = self.SEARCH_CHUNK_CHARS
    total = len(cleaned)
    if total <= size:
        return [cleaned]

    stride = max(1, size - self.SEARCH_CHUNK_OVERLAP)
    pieces = []
    for offset in range(0, total, stride):
        pieces.append(cleaned[offset:offset + size])
        if offset + size >= total:
            # This window already covers the tail of the text.
            break
    return pieces
|
|
824
|
+
|
|
825
|
+
# ── TF-IDF Search ──────────────────────────────────────────────────────
|
|
826
|
+
|
|
827
|
+
def _tfidf_search(self, query: str, docs: dict, top_k: int) -> list:
    """Rank *docs* against *query* with a TF-IDF score.

    Args:
        query: Free-text query; tokenized with ``self._tokenize``.
        docs: Mapping of document key to document text.
        top_k: Maximum number of results to return.

    Returns:
        Up to ``top_k`` ``(key, score)`` pairs sorted by descending score;
        documents scoring 0 are left out. Empty ``docs`` or a query with no
        tokens yields ``[]``.
    """
    query_counts = Counter(self._tokenize(query))
    doc_total = len(docs)
    if not query_counts or doc_total == 0:
        return []

    # Per-document term counts, plus how many documents contain each term.
    per_doc = {key: Counter(self._tokenize(body)) for key, body in docs.items()}
    doc_freq: Counter = Counter()
    for counts in per_doc.values():
        doc_freq.update(counts.keys())

    # Smoothed inverse document frequency, needed only for the query terms.
    idf_by_term = {
        term: math.log((doc_total + 1) / (doc_freq.get(term, 0) + 1)) + 1
        for term in query_counts
    }

    ranked: dict = {}
    for key, counts in per_doc.items():
        length = sum(counts.values()) or 1
        relevance = 0.0
        for term, weight in query_counts.items():
            relevance += counts.get(term, 0) / length * idf_by_term[term] * weight
        if relevance > 0:
            ranked[key] = relevance

    ordered = sorted(ranked.items(), key=lambda pair: pair[1], reverse=True)
    return ordered[:top_k]
|
|
854
|
+
|
|
855
|
+
@staticmethod
def _tokenize(text: str) -> list:
    """Split *text* into lowercase tokens.

    A token is a maximal run of ASCII letters, digits and underscores in the
    lowercased text; every other character acts as a separator.
    """
    lowered = text.lower()
    tokens = re.findall(r"[a-z0-9_]+", lowered)
    return tokens
|
|
858
|
+
|
|
859
|
+
@staticmethod
def _normalize_source(source: str) -> str:
    """Map a session source label to its canonical lowercase form.

    Falsy or whitespace-only input becomes ``"manual"``. The label is
    stripped and lowercased; ``transcript``, ``claude-code`` and
    ``claude_code`` map to ``"claude"``, ``mcp`` maps to ``"manual"``, and
    any other label is returned in its cleaned form.
    """
    cleaned = (source or "manual").strip().lower()
    if cleaned == "":
        return "manual"
    if cleaned in ("transcript", "claude-code", "claude_code"):
        return "claude"
    if cleaned == "mcp":
        return "manual"
    return cleaned
|