threadkeeper 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- threadkeeper/__init__.py +8 -0
- threadkeeper/_mcp.py +6 -0
- threadkeeper/_setup.py +299 -0
- threadkeeper/adapters/__init__.py +40 -0
- threadkeeper/adapters/_hook_helpers.py +72 -0
- threadkeeper/adapters/base.py +152 -0
- threadkeeper/adapters/claude_code.py +178 -0
- threadkeeper/adapters/claude_desktop.py +128 -0
- threadkeeper/adapters/codex.py +259 -0
- threadkeeper/adapters/copilot.py +195 -0
- threadkeeper/adapters/gemini.py +169 -0
- threadkeeper/adapters/vscode.py +144 -0
- threadkeeper/brief.py +735 -0
- threadkeeper/config.py +216 -0
- threadkeeper/curator.py +390 -0
- threadkeeper/db.py +474 -0
- threadkeeper/embeddings.py +232 -0
- threadkeeper/extract_daemon.py +125 -0
- threadkeeper/helpers.py +101 -0
- threadkeeper/i18n.py +342 -0
- threadkeeper/identity.py +237 -0
- threadkeeper/ingest.py +507 -0
- threadkeeper/lessons.py +170 -0
- threadkeeper/nudges.py +257 -0
- threadkeeper/process_health.py +202 -0
- threadkeeper/review_prompts.py +207 -0
- threadkeeper/search_proxy.py +160 -0
- threadkeeper/server.py +55 -0
- threadkeeper/shadow_review.py +358 -0
- threadkeeper/skill_watcher.py +96 -0
- threadkeeper/spawn_budget.py +246 -0
- threadkeeper/tools/__init__.py +2 -0
- threadkeeper/tools/concepts.py +111 -0
- threadkeeper/tools/consolidate.py +222 -0
- threadkeeper/tools/core_memory.py +109 -0
- threadkeeper/tools/correlation.py +116 -0
- threadkeeper/tools/curator.py +121 -0
- threadkeeper/tools/dialectic.py +359 -0
- threadkeeper/tools/dialog.py +131 -0
- threadkeeper/tools/distill.py +184 -0
- threadkeeper/tools/extract.py +411 -0
- threadkeeper/tools/graph.py +183 -0
- threadkeeper/tools/invariants.py +177 -0
- threadkeeper/tools/lessons.py +110 -0
- threadkeeper/tools/missed_spawns.py +142 -0
- threadkeeper/tools/peers.py +579 -0
- threadkeeper/tools/pickup.py +148 -0
- threadkeeper/tools/probes.py +251 -0
- threadkeeper/tools/process_health.py +90 -0
- threadkeeper/tools/session.py +34 -0
- threadkeeper/tools/shadow_review.py +106 -0
- threadkeeper/tools/skills.py +856 -0
- threadkeeper/tools/spawn.py +871 -0
- threadkeeper/tools/style.py +44 -0
- threadkeeper/tools/threads.py +299 -0
- threadkeeper-0.4.0.dist-info/METADATA +351 -0
- threadkeeper-0.4.0.dist-info/RECORD +61 -0
- threadkeeper-0.4.0.dist-info/WHEEL +5 -0
- threadkeeper-0.4.0.dist-info/entry_points.txt +2 -0
- threadkeeper-0.4.0.dist-info/licenses/LICENSE +21 -0
- threadkeeper-0.4.0.dist-info/top_level.txt +1 -0
threadkeeper/ingest.py
ADDED
|
@@ -0,0 +1,507 @@
|
|
|
1
|
+
"""Live ingestion of Claude Code jsonl transcripts into dialog_messages/_fts.
|
|
2
|
+
Background daemon ticks every INGEST_INTERVAL_S; brief() can also call _ingest_recent_only directly."""
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json as _json
|
|
6
|
+
import os
|
|
7
|
+
import sqlite3
|
|
8
|
+
import threading
|
|
9
|
+
import time
|
|
10
|
+
from datetime import datetime as _dt
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
from typing import Optional
|
|
13
|
+
|
|
14
|
+
from .config import (
|
|
15
|
+
INGEST_CAP_PER_CALL,
|
|
16
|
+
INGEST_INTERVAL_S,
|
|
17
|
+
INGEST_RECENT_WINDOW_S,
|
|
18
|
+
SEMANTIC_AVAILABLE,
|
|
19
|
+
)
|
|
20
|
+
from .db import get_db
|
|
21
|
+
from .embeddings import _embed
|
|
22
|
+
|
|
23
|
+
_ingest_thread: Optional[threading.Thread] = None
|
|
24
|
+
_ingest_lock = threading.Lock()
|
|
25
|
+
_ingest_interval_s = INGEST_INTERVAL_S
|
|
26
|
+
_ingest_recent_window_s = INGEST_RECENT_WINDOW_S
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def _backfill_dialog_fts_if_empty(conn: sqlite3.Connection) -> None:
|
|
30
|
+
"""Populate dialog_fts from dialog_messages on first start (or after a
|
|
31
|
+
schema add when most rows are already in dialog_messages but not in FTS).
|
|
32
|
+
|
|
33
|
+
Compares row counts: if dialog_fts is meaningfully behind dialog_messages,
|
|
34
|
+
backfill the gap (uuids missing from fts). Idempotent — only inserts
|
|
35
|
+
rows whose uuid isn't already in dialog_fts. Roughly 5-10s for 100k
|
|
36
|
+
records on a laptop — one-time cost."""
|
|
37
|
+
try:
|
|
38
|
+
msg_cnt = conn.execute(
|
|
39
|
+
"SELECT COUNT(*) c FROM dialog_messages"
|
|
40
|
+
).fetchone()["c"]
|
|
41
|
+
fts_cnt = conn.execute(
|
|
42
|
+
"SELECT COUNT(*) c FROM dialog_fts"
|
|
43
|
+
).fetchone()["c"]
|
|
44
|
+
except sqlite3.OperationalError:
|
|
45
|
+
return
|
|
46
|
+
if fts_cnt >= msg_cnt - 5:
|
|
47
|
+
# close enough — newly-arrived rows fill via INSERT trigger in _ingest_file
|
|
48
|
+
conn.execute(
|
|
49
|
+
"INSERT INTO style (key, value, updated_at) VALUES (?,?,?) "
|
|
50
|
+
"ON CONFLICT(key) DO UPDATE SET value=excluded.value, "
|
|
51
|
+
"updated_at=excluded.updated_at",
|
|
52
|
+
("fts_backfilled", str(fts_cnt), int(time.time())),
|
|
53
|
+
)
|
|
54
|
+
conn.commit()
|
|
55
|
+
return
|
|
56
|
+
# backfill rows present in dialog_messages but missing from dialog_fts
|
|
57
|
+
missing = conn.execute(
|
|
58
|
+
"SELECT d.uuid, d.content FROM dialog_messages d "
|
|
59
|
+
"LEFT JOIN dialog_fts f ON f.uuid = d.uuid "
|
|
60
|
+
"WHERE f.uuid IS NULL"
|
|
61
|
+
).fetchall()
|
|
62
|
+
batch: list[tuple[str, str]] = []
|
|
63
|
+
added = 0
|
|
64
|
+
for r in missing:
|
|
65
|
+
batch.append((r["uuid"], r["content"]))
|
|
66
|
+
if len(batch) >= 5000:
|
|
67
|
+
conn.executemany(
|
|
68
|
+
"INSERT INTO dialog_fts (uuid, content) VALUES (?, ?)",
|
|
69
|
+
batch,
|
|
70
|
+
)
|
|
71
|
+
conn.commit()
|
|
72
|
+
added += len(batch)
|
|
73
|
+
batch = []
|
|
74
|
+
if batch:
|
|
75
|
+
conn.executemany(
|
|
76
|
+
"INSERT INTO dialog_fts (uuid, content) VALUES (?, ?)",
|
|
77
|
+
batch,
|
|
78
|
+
)
|
|
79
|
+
added += len(batch)
|
|
80
|
+
final_cnt = conn.execute(
|
|
81
|
+
"SELECT COUNT(*) c FROM dialog_fts"
|
|
82
|
+
).fetchone()["c"]
|
|
83
|
+
conn.execute(
|
|
84
|
+
"INSERT INTO style (key, value, updated_at) VALUES (?,?,?) "
|
|
85
|
+
"ON CONFLICT(key) DO UPDATE SET value=excluded.value, "
|
|
86
|
+
"updated_at=excluded.updated_at",
|
|
87
|
+
("fts_backfilled", f"{final_cnt}+{added}", int(time.time())),
|
|
88
|
+
)
|
|
89
|
+
conn.commit()
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def _parse_ts(ts: str) -> int:
|
|
93
|
+
try:
|
|
94
|
+
return int(_dt.fromisoformat(ts.replace("Z", "+00:00")).timestamp())
|
|
95
|
+
except Exception:
|
|
96
|
+
return int(time.time())
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def _scan_message_for_skill_use(msg: dict) -> list[str]:
|
|
100
|
+
"""Return Skill tool_use invocations found in a single message dict.
|
|
101
|
+
Handles both flat and nested content arrays; accepts either 'skill' or
|
|
102
|
+
'name' key inside the tool_use input payload. Returns [] for non-
|
|
103
|
+
matching messages.
|
|
104
|
+
"""
|
|
105
|
+
found: list[str] = []
|
|
106
|
+
|
|
107
|
+
def _walk(node) -> None:
|
|
108
|
+
if isinstance(node, list):
|
|
109
|
+
for item in node:
|
|
110
|
+
_walk(item)
|
|
111
|
+
return
|
|
112
|
+
if not isinstance(node, dict):
|
|
113
|
+
return
|
|
114
|
+
if node.get("type") == "tool_use" and node.get("name") == "Skill":
|
|
115
|
+
inp = node.get("input") or {}
|
|
116
|
+
if isinstance(inp, dict):
|
|
117
|
+
val = inp.get("skill") or inp.get("name")
|
|
118
|
+
if isinstance(val, str) and val:
|
|
119
|
+
found.append(val)
|
|
120
|
+
# Recurse into anything that might wrap further content blocks.
|
|
121
|
+
for v in node.values():
|
|
122
|
+
if isinstance(v, (list, dict)):
|
|
123
|
+
_walk(v)
|
|
124
|
+
|
|
125
|
+
_walk(msg)
|
|
126
|
+
return found
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def _extract_text(msg: dict) -> str:
|
|
130
|
+
"""Pull searchable text from a message; skip tool_use args, cap tool_results."""
|
|
131
|
+
content = msg.get("content", "")
|
|
132
|
+
if isinstance(content, str):
|
|
133
|
+
return content
|
|
134
|
+
if not isinstance(content, list):
|
|
135
|
+
return ""
|
|
136
|
+
parts: list[str] = []
|
|
137
|
+
for block in content:
|
|
138
|
+
if not isinstance(block, dict):
|
|
139
|
+
continue
|
|
140
|
+
t = block.get("type")
|
|
141
|
+
if t == "text":
|
|
142
|
+
parts.append(block.get("text", ""))
|
|
143
|
+
elif t == "thinking":
|
|
144
|
+
parts.append(f"[thinking] {block.get('thinking', '')}")
|
|
145
|
+
elif t == "tool_result":
|
|
146
|
+
tr = block.get("content", "")
|
|
147
|
+
if isinstance(tr, list):
|
|
148
|
+
tr = " ".join(b.get("text", "") for b in tr if isinstance(b, dict))
|
|
149
|
+
if isinstance(tr, str) and tr:
|
|
150
|
+
parts.append(f"[tool_result] {tr[:800]}")
|
|
151
|
+
# tool_use blocks deliberately skipped (noisy for semantic search)
|
|
152
|
+
return "\n".join(p for p in parts if p)
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
def _ingest_file(conn: sqlite3.Connection, fp: Path, max_msgs: int,
                 adapter=None) -> int:
    """Incrementally ingest one transcript file from the given adapter.
    Returns number of new messages added.

    When `adapter` is None (legacy callers), the Claude Code adapter is
    used so the function's old contract still holds.

    Strategy: skip the file entirely if mtime hasn't advanced past
    ingest_state.last_mtime. Otherwise use `adapter.iter_messages(fp)`
    to enumerate normalized messages and dedup via dialog_messages.uuid.

    NOTE: the caller owns the transaction — this function executes but
    does not commit.
    """
    if adapter is None:
        # Lazy import avoids a module-load cycle with the adapters package.
        from .adapters import _CLAUDE_CODE as _claude_default  # type: ignore
        adapter = _claude_default
    if not fp.exists():
        return 0
    stat = fp.stat()
    mtime = int(stat.st_mtime)
    state = conn.execute(
        "SELECT last_mtime FROM ingest_state WHERE file_path=?", (str(fp),)
    ).fetchone()
    last_mtime = state["last_mtime"] if state else 0
    if mtime <= last_mtime:
        # File unchanged since last ingest — cheap early exit.
        return 0
    added = 0
    try:
        for nm in adapter.iter_messages(fp):
            if added >= max_msgs:
                break
            if not nm.uuid:
                # No stable identity → cannot dedup, skip.
                continue
            if conn.execute(
                "SELECT 1 FROM dialog_messages WHERE uuid=?", (nm.uuid,)
            ).fetchone():
                # Already ingested on a previous pass.
                continue
            # Skill scan first — runs even for tool-only assistant turns
            # whose text body would fail the >=10 char filter below.
            if nm.role == "assistant":
                for skill_name in _scan_message_for_skill_use(nm.raw):
                    try:
                        # First-seen insert; no-op if the row exists.
                        conn.execute(
                            "INSERT INTO skill_usage "
                            "(name, created_at, created_by_origin) "
                            "VALUES (?, ?, 'foreground') "
                            "ON CONFLICT(name) DO NOTHING",
                            (skill_name, nm.created_at),
                        )
                        # Bump counters, guarded so replaying an older
                        # message never double-counts.
                        conn.execute(
                            "UPDATE skill_usage "
                            "SET last_used_at=?, use_count=use_count+1 "
                            "WHERE name=? AND (last_used_at IS NULL "
                            "OR last_used_at < ?)",
                            (nm.created_at, skill_name, nm.created_at),
                        )
                    except sqlite3.OperationalError:
                        pass  # skill_usage missing on this conn
            text = nm.content
            if not text or len(text) < 10:
                # Too short to be useful for search — drop.
                continue
            # Truncate to 2000 chars for embedding; full text still stored.
            emb = _embed(text[:2000]) if SEMANTIC_AVAILABLE else None
            conn.execute(
                "INSERT INTO dialog_messages (uuid, source, project, session_id, "
                "role, content, model, created_at, embedding) "
                "VALUES (?,?,?,?,?,?,?,?,?)",
                (nm.uuid, adapter.name, adapter.project_label(fp),
                 nm.session_id, nm.role, text,
                 nm.model, nm.created_at, emb)
            )
            try:
                # Mirror into the FTS table; tolerate it being absent.
                conn.execute(
                    "INSERT INTO dialog_fts (uuid, content) VALUES (?, ?)",
                    (nm.uuid, text),
                )
            except sqlite3.OperationalError:
                pass
            if emb is not None:
                try:
                    # Best-effort mirror into the vec0 index; lazy import
                    # because the vec extension may not be loadable here.
                    from .embeddings import _vec_upsert_dialog
                    _vec_upsert_dialog(conn, nm.uuid, emb)
                except Exception:
                    pass
            added += 1
    except OSError:
        # File vanished/unreadable mid-iteration — keep what we got;
        # ingest_state is NOT advanced so the next tick retries.
        return added
    # Record progress so subsequent calls can skip via the mtime check.
    conn.execute(
        "INSERT INTO ingest_state (file_path, last_size, last_mtime, ingested_at, msg_count) "
        "VALUES (?,?,?,?,?) "
        "ON CONFLICT(file_path) DO UPDATE SET "
        " last_size=excluded.last_size, last_mtime=excluded.last_mtime, "
        " ingested_at=excluded.ingested_at, msg_count=ingest_state.msg_count+excluded.msg_count",
        (str(fp), stat.st_size, mtime, int(time.time()), added)
    )
    return added
|
|
249
|
+
|
|
250
|
+
|
|
251
|
+
def _ingest_all(conn: sqlite3.Connection, max_msgs: int = 1_000_000) -> tuple[int, int]:
    """Iterate every installed CLI adapter, incrementally ingest each
    transcript file. Returns (new_msgs, files_seen) across ALL adapters.

    Files are processed newest-first so the per-call cap (`max_msgs`)
    lands on the most recently active sessions. Commits once at the end.
    """
    from .adapters import installed_adapters

    def _mtime_or_zero(p: Path) -> float:
        # FIX: the previous `p.stat().st_mtime if p.exists() else 0` key
        # was racy — a file deleted between exists() and stat() raised
        # FileNotFoundError and aborted the whole pass. Treat a vanished
        # file as oldest instead.
        try:
            return p.stat().st_mtime
        except OSError:
            return 0.0

    total = 0
    files_seen = 0
    for adapter in installed_adapters():
        files = adapter.transcript_files()
        files_seen += len(files)
        files = sorted(files, key=_mtime_or_zero, reverse=True)
        for fp in files:
            if total >= max_msgs:
                break
            total += _ingest_file(conn, fp, max_msgs - total, adapter=adapter)
    conn.commit()
    return (total, files_seen)
|
|
271
|
+
|
|
272
|
+
|
|
273
|
+
def _ingest_recent_only(conn: sqlite3.Connection,
                        max_msgs: int = 200,
                        max_age_s: int = 600) -> tuple[int, int]:
    """Live-mode ingest: only transcript files modified within `max_age_s`,
    across ALL installed CLI adapters.

    Returns (messages_added, fresh_files_considered).

    Commits after EACH file so the background tick doesn't hold a long
    write lock — multi-writer contention (parent + children + ingester)
    deadlocks fast otherwise."""
    from .adapters import installed_adapters
    cutoff = time.time() - max_age_s
    # (mtime, path, adapter) triples for every recently-touched transcript.
    fresh: list[tuple[float, Path, object]] = []
    for adapter in installed_adapters():
        for p in adapter.transcript_files():
            try:
                m = p.stat().st_mtime
            except OSError:
                # File vanished between listing and stat — skip it.
                continue
            if m > cutoff:
                fresh.append((m, p, adapter))
    # Newest first so the per-call cap favors the liveliest sessions.
    fresh.sort(key=lambda x: x[0], reverse=True)
    total = 0
    for _, fp, adapter in fresh:
        if total >= max_msgs:
            break
        added = _ingest_file(conn, fp, max_msgs - total, adapter=adapter)
        total += added
        if added:
            try:
                # Per-file commit keeps the write lock short; a busy/locked
                # error here is tolerable — rows land on the next commit.
                conn.commit()
            except sqlite3.OperationalError:
                pass
    return (total, len(fresh))
|
|
306
|
+
|
|
307
|
+
|
|
308
|
+
def _backfill_skill_usage_from_jsonls(conn: sqlite3.Connection) -> int:
    """One-shot historical scan across every installed adapter. Finds
    assistant messages with tool_use(name='Skill') blocks and bumps
    skill_usage counters. Idempotent — the UPDATE guard on last_used_at
    prevents double-counting.

    Skill-tool semantics are Claude-specific in practice (other CLIs
    don't emit `tool_use name='Skill'` blocks), but the scanner is
    defensive and silently returns [] on unmatched payload shapes —
    so iterating all adapters is safe.

    Returns the number of (skill_name, message) pairs processed.
    """
    from .adapters import installed_adapters
    processed = 0
    for adapter in installed_adapters():
        for fp in adapter.transcript_files():
            try:
                for nm in adapter.iter_messages(fp):
                    if nm.role != "assistant":
                        continue
                    skills = _scan_message_for_skill_use(nm.raw)
                    if not skills:
                        continue
                    for skill_name in skills:
                        try:
                            # First-seen insert; no-op when the row exists.
                            conn.execute(
                                "INSERT INTO skill_usage "
                                "(name, created_at, created_by_origin) "
                                "VALUES (?, ?, 'foreground') "
                                "ON CONFLICT(name) DO NOTHING",
                                (skill_name, nm.created_at),
                            )
                            # Counter bump guarded on last_used_at so
                            # replaying the same history never inflates
                            # use_count (the idempotence claim above).
                            conn.execute(
                                "UPDATE skill_usage "
                                "SET last_used_at=?, use_count=use_count+1 "
                                "WHERE name=? AND (last_used_at IS NULL "
                                "OR last_used_at < ?)",
                                (nm.created_at, skill_name, nm.created_at),
                            )
                            processed += 1
                        except sqlite3.OperationalError:
                            # skill_usage table missing on this conn.
                            pass
            except OSError:
                # Unreadable/vanished transcript — move to the next file.
                continue
    try:
        conn.commit()
    except sqlite3.OperationalError:
        pass
    return processed
|
|
358
|
+
|
|
359
|
+
|
|
360
|
+
def _backfill_note_embeddings(conn: sqlite3.Connection, max_n: int = 20) -> int:
    """Embed up to `max_n` notes whose embedding column is NULL, and mirror
    every newly-embedded blob into notes_vec.

    Light spawned children (NO_EMBEDDINGS=1) write notes with embedding=NULL
    because they don't carry the model. A parent process with embeddings
    available catches them up here so semantic search isn't permanently
    blind to those notes. No-op when this process doesn't have embeddings.
    Returns the number of rows updated.
    """
    # Re-imported locally: config state may differ per process.
    from .config import SEMANTIC_AVAILABLE
    if not SEMANTIC_AVAILABLE:
        return 0
    try:
        # Newest notes first — they're the ones searches most likely want.
        rows = conn.execute(
            "SELECT id, content FROM notes "
            "WHERE embedding IS NULL "
            "ORDER BY id DESC LIMIT ?",
            (max_n,),
        ).fetchall()
    except sqlite3.OperationalError:
        # notes table absent on this connection.
        return 0
    if not rows:
        return 0
    from .embeddings import _embed, _vec_upsert_note
    updated = 0
    for r in rows:
        try:
            emb = _embed(r["content"])
        except Exception:
            # Model hiccup on one note shouldn't stop the batch.
            continue
        if emb is None:
            continue
        try:
            conn.execute(
                "UPDATE notes SET embedding=? WHERE id=?",
                (emb, r["id"]),
            )
            # Keep the vec0 index in lockstep with the BLOB column.
            _vec_upsert_note(conn, r["id"], emb)
            updated += 1
        except sqlite3.OperationalError:
            continue
    if updated:
        try:
            conn.commit()
        except sqlite3.OperationalError:
            pass
    return updated
|
|
408
|
+
|
|
409
|
+
|
|
410
|
+
def _backfill_vec_tables(conn: sqlite3.Connection, batch: int = 500) -> tuple[int, int]:
    """One-shot migration: mirror existing notes.embedding and
    dialog_messages.embedding BLOBs into notes_vec / dialog_vec.

    Idempotent — `INSERT OR REPLACE` won't duplicate. Returns
    (notes_inserted, dialog_inserted). Called from the background ingester
    tick; bails fast when there's nothing to do.
    """
    # Both gates checked locally: config and vec availability are
    # per-process facts, not import-time constants here.
    from .config import SEMANTIC_AVAILABLE
    from .db import vec_available
    if not SEMANTIC_AVAILABLE or not vec_available():
        return (0, 0)
    from .embeddings import _vec_upsert_note, _vec_upsert_dialog
    n_notes = 0
    n_dialog = 0
    try:
        # Notes that have embedding but aren't yet in notes_vec.
        rows = conn.execute(
            "SELECT n.id, n.embedding FROM notes n "
            "LEFT JOIN notes_vec v ON v.id = n.id "
            "WHERE n.embedding IS NOT NULL AND v.id IS NULL "
            "LIMIT ?",
            (batch,),
        ).fetchall()
        for r in rows:
            _vec_upsert_note(conn, r["id"], r["embedding"])
            n_notes += 1
    except sqlite3.OperationalError:
        # notes/notes_vec missing — skip this half.
        pass
    try:
        # Dialog messages with embedding but no dialog_vec_map row → need
        # mirroring. (We check via the map because dialog_vec is keyed
        # by rowid, not uuid.)
        rows = conn.execute(
            "SELECT d.uuid, d.embedding FROM dialog_messages d "
            "LEFT JOIN dialog_vec_map m ON m.uuid = d.uuid "
            "WHERE d.embedding IS NOT NULL AND m.uuid IS NULL "
            "LIMIT ?",
            (batch,),
        ).fetchall()
        for r in rows:
            _vec_upsert_dialog(conn, r["uuid"], r["embedding"])
            n_dialog += 1
    except sqlite3.OperationalError:
        pass
    if n_notes or n_dialog:
        try:
            conn.commit()
        except sqlite3.OperationalError:
            pass
    return (n_notes, n_dialog)
|
|
461
|
+
|
|
462
|
+
|
|
463
|
+
def _start_background_ingester() -> None:
    """Start a daemon thread that incrementally ingests recently-modified jsonl
    files. Idempotent: subsequent calls are no-ops. Daemon=True so it dies with
    the process; no shutdown handshake needed."""
    global _ingest_thread
    if _ingest_thread is not None and _ingest_thread.is_alive():
        # Already running — idempotence guarantee.
        return
    if _ingest_interval_s <= 0:
        return  # disabled via env

    def _loop() -> None:
        # Runs forever; every failure mode below is swallowed so one bad
        # tick never kills the daemon.
        while True:
            time.sleep(_ingest_interval_s)
            try:
                # Non-blocking acquire: overlapping ticks skip instead of
                # queueing (a slow tick would otherwise pile up waiters).
                if not _ingest_lock.acquire(blocking=False):
                    continue  # another tick still running, skip
                try:
                    # Fresh connection per tick — never shared across
                    # threads; closed in the finally below.
                    bg_conn = get_db()
                    try:
                        _ingest_recent_only(
                            bg_conn,
                            max_msgs=200,
                            max_age_s=_ingest_recent_window_s,
                        )
                        # Embedding backfill: light children write notes
                        # with embedding=NULL (NO_EMBEDDINGS=1). Parent
                        # processes with SEMANTIC_AVAILABLE catch them up
                        # asynchronously so semantic search recovers
                        # without blocking the child.
                        _backfill_note_embeddings(bg_conn, max_n=20)
                        # vec0 backfill: mirror legacy BLOB embeddings
                        # into the vec0 virtual tables in batches so the
                        # sub-linear index gradually warms up.
                        _backfill_vec_tables(bg_conn, batch=500)
                    finally:
                        bg_conn.close()
                finally:
                    _ingest_lock.release()
            except Exception:
                pass  # never crash the daemon

    _ingest_thread = threading.Thread(
        target=_loop, name="thread-keeper-live-ingest", daemon=True
    )
    _ingest_thread.start()
|
threadkeeper/lessons.py
ADDED
|
@@ -0,0 +1,170 @@
|
|
|
1
|
+
"""CLI-agnostic procedural-knowledge store at ~/.threadkeeper/lessons.md.
|
|
2
|
+
|
|
3
|
+
The learning loop (auto-review on close_thread + shadow_review daemon)
|
|
4
|
+
materializes lessons here. Every supported CLI's per-user instructions
|
|
5
|
+
file references this path so the lessons take effect in any of them.
|
|
6
|
+
|
|
7
|
+
Format on disk:
|
|
8
|
+
|
|
9
|
+
# thread-keeper lessons
|
|
10
|
+
|
|
11
|
+
Procedural knowledge accumulated across sessions. Auto-managed by
|
|
12
|
+
the learning loop — do not edit by hand; new entries are appended.
|
|
13
|
+
|
|
14
|
+
<!-- LESSON:BEGIN slug=<slug> ts=<unix> source=<thread_id|shadow> -->
|
|
15
|
+
## <slug>
|
|
16
|
+
> <one-line summary>
|
|
17
|
+
|
|
18
|
+
<body of the lesson>
|
|
19
|
+
<!-- LESSON:END slug=<slug> -->
|
|
20
|
+
|
|
21
|
+
<!-- LESSON:BEGIN ... -->
|
|
22
|
+
...
|
|
23
|
+
|
|
24
|
+
The sentinel-bracketed sections make per-entry diffs cheap and let us
|
|
25
|
+
update or de-duplicate without rewriting the whole file. New entries
|
|
26
|
+
land at the bottom (chronological).
|
|
27
|
+
"""
|
|
28
|
+
from __future__ import annotations
|
|
29
|
+
|
|
30
|
+
import os
|
|
31
|
+
import re
|
|
32
|
+
import time
|
|
33
|
+
from pathlib import Path
|
|
34
|
+
from typing import Iterator, Optional
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
_LESSONS_PATH = Path(
|
|
38
|
+
os.environ.get("THREADKEEPER_LESSONS", "~/.threadkeeper/lessons.md")
|
|
39
|
+
).expanduser()
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
_HEADER = """\
|
|
43
|
+
# thread-keeper lessons
|
|
44
|
+
|
|
45
|
+
Procedural knowledge accumulated across sessions. Auto-managed by the
|
|
46
|
+
learning loop — do not edit by hand; new entries are appended.
|
|
47
|
+
|
|
48
|
+
"""
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
_SLUG_RE = re.compile(r"[^a-z0-9-]+")
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def _slugify(title: str) -> str:
|
|
55
|
+
"""Produce a safe filesystem/url slug from a lesson title."""
|
|
56
|
+
s = title.strip().lower().replace(" ", "-")
|
|
57
|
+
s = _SLUG_RE.sub("-", s)
|
|
58
|
+
s = re.sub(r"-{2,}", "-", s).strip("-")
|
|
59
|
+
return s or "untitled"
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def _ensure_file(path: Path) -> None:
    """Create the lessons file (plus parent dirs) with the standard
    header when it does not exist yet; leave an existing file untouched."""
    if not path.exists():
        path.parent.mkdir(parents=True, exist_ok=True)
        path.write_text(_HEADER)
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def _format_section(slug: str, summary: str, body: str,
|
|
71
|
+
source: str, ts: int) -> str:
|
|
72
|
+
"""One LESSON-BEGIN…LESSON-END block with the sentinel markers."""
|
|
73
|
+
summary_line = f"> {summary.strip()}" if summary.strip() else ""
|
|
74
|
+
body_text = body.strip()
|
|
75
|
+
return (
|
|
76
|
+
f"<!-- LESSON:BEGIN slug={slug} ts={ts} source={source} -->\n"
|
|
77
|
+
f"## {slug}\n"
|
|
78
|
+
+ (f"{summary_line}\n\n" if summary_line else "\n")
|
|
79
|
+
+ body_text + "\n"
|
|
80
|
+
f"<!-- LESSON:END slug={slug} -->\n\n"
|
|
81
|
+
)
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
_BLOCK_RE = re.compile(
|
|
85
|
+
r"<!-- LESSON:BEGIN slug=(?P<slug>[^\s]+)[^>]*-->"
|
|
86
|
+
r"(?P<body>.*?)"
|
|
87
|
+
r"<!-- LESSON:END slug=(?P=slug) -->",
|
|
88
|
+
re.DOTALL,
|
|
89
|
+
)
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def append_lesson(
    title: str,
    body: str,
    summary: str = "",
    source: str = "",
    path: Optional[Path] = None,
) -> str:
    """Append a new lesson section, or replace an existing one with the
    same slug. Returns the slug.

    `title` becomes the section header (sluggified for the sentinel).
    `body` is markdown; `summary` is a one-liner shown right after the
    header. `source` is a free-text provenance tag — typically a thread
    id ("Tabc123") or "shadow" for shadow_review writes.
    `path` overrides the default lessons file (used by tests).
    """
    fp = path or _LESSONS_PATH
    _ensure_file(fp)
    slug = _slugify(title)
    ts = int(time.time())
    new_section = _format_section(slug, summary, body, source or "", ts)

    body_existing = fp.read_text()
    # If a section with this slug already exists, replace it in-place
    # (idempotent re-materialization of the same lesson).
    # NOTE: the trailing space in target_begin matters — it prevents
    # slug "foo" from matching a BEGIN marker for slug "foo-bar".
    target_begin = f"<!-- LESSON:BEGIN slug={slug} "
    target_end = f"<!-- LESSON:END slug={slug} -->"
    if target_begin in body_existing and target_end in body_existing:
        head, _, rest = body_existing.partition(target_begin)
        # Find the matching END after the BEGIN.
        end_marker = target_end
        end_idx = rest.find(end_marker)
        if end_idx >= 0:
            tail = rest[end_idx + len(end_marker):]
            # Splice: everything before the old BEGIN + fresh section +
            # everything after the old END (leading newlines collapsed).
            body_existing = head + new_section.rstrip() + "\n" + tail.lstrip("\n")
        else:
            # Malformed file (BEGIN without END) — just append at end.
            body_existing = body_existing.rstrip() + "\n\n" + new_section
    else:
        # New slug — chronological append at the bottom.
        body_existing = body_existing.rstrip() + "\n\n" + new_section
    fp.write_text(body_existing)
    return slug
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
def iter_lessons(path: Optional[Path] = None) -> Iterator[dict]:
    """Yield every lesson section as a dict with keys:
    slug, body (raw markdown between BEGIN/END), ts, source.

    Missing/unparsable ts yields 0; missing source yields "".
    Yields nothing when the lessons file does not exist.

    Order is file-order (chronological if writes are append-only)."""
    fp = path or _LESSONS_PATH
    if not fp.exists():
        return
    body = fp.read_text()
    for m in _BLOCK_RE.finditer(body):
        slug = m.group("slug")
        block_body = m.group("body").strip()
        # Parse ts and source out of the BEGIN line we already matched.
        # 200 chars is a safe upper bound for one sentinel line.
        begin_line = body[m.start():m.start() + 200].split("\n", 1)[0]
        ts_match = re.search(r"ts=(\d+)", begin_line)
        source_match = re.search(r"source=([^\s>]+)", begin_line)
        yield {
            "slug": slug,
            "body": block_body,
            "ts": int(ts_match.group(1)) if ts_match else 0,
            "source": source_match.group(1) if source_match else "",
        }
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
def count_lessons(path: Optional[Path] = None) -> int:
    """Cheap count for diagnostic surfaces (brief, shadow_review_status).

    Returns 0 when the lessons file is absent.
    """
    fp = path or _LESSONS_PATH
    if not fp.exists():
        return 0
    return sum(1 for _ in _BLOCK_RE.finditer(fp.read_text()))
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
def get_path() -> Path:
    """Public accessor — used by _setup to reference the file in the
    managed-instructions block.

    Returns the module-level lessons path (honours THREADKEEPER_LESSONS)."""
    return _LESSONS_PATH
|