threadkeeper 0.4.0__py3-none-any.whl

Files changed (61)
  1. threadkeeper/__init__.py +8 -0
  2. threadkeeper/_mcp.py +6 -0
  3. threadkeeper/_setup.py +299 -0
  4. threadkeeper/adapters/__init__.py +40 -0
  5. threadkeeper/adapters/_hook_helpers.py +72 -0
  6. threadkeeper/adapters/base.py +152 -0
  7. threadkeeper/adapters/claude_code.py +178 -0
  8. threadkeeper/adapters/claude_desktop.py +128 -0
  9. threadkeeper/adapters/codex.py +259 -0
  10. threadkeeper/adapters/copilot.py +195 -0
  11. threadkeeper/adapters/gemini.py +169 -0
  12. threadkeeper/adapters/vscode.py +144 -0
  13. threadkeeper/brief.py +735 -0
  14. threadkeeper/config.py +216 -0
  15. threadkeeper/curator.py +390 -0
  16. threadkeeper/db.py +474 -0
  17. threadkeeper/embeddings.py +232 -0
  18. threadkeeper/extract_daemon.py +125 -0
  19. threadkeeper/helpers.py +101 -0
  20. threadkeeper/i18n.py +342 -0
  21. threadkeeper/identity.py +237 -0
  22. threadkeeper/ingest.py +507 -0
  23. threadkeeper/lessons.py +170 -0
  24. threadkeeper/nudges.py +257 -0
  25. threadkeeper/process_health.py +202 -0
  26. threadkeeper/review_prompts.py +207 -0
  27. threadkeeper/search_proxy.py +160 -0
  28. threadkeeper/server.py +55 -0
  29. threadkeeper/shadow_review.py +358 -0
  30. threadkeeper/skill_watcher.py +96 -0
  31. threadkeeper/spawn_budget.py +246 -0
  32. threadkeeper/tools/__init__.py +2 -0
  33. threadkeeper/tools/concepts.py +111 -0
  34. threadkeeper/tools/consolidate.py +222 -0
  35. threadkeeper/tools/core_memory.py +109 -0
  36. threadkeeper/tools/correlation.py +116 -0
  37. threadkeeper/tools/curator.py +121 -0
  38. threadkeeper/tools/dialectic.py +359 -0
  39. threadkeeper/tools/dialog.py +131 -0
  40. threadkeeper/tools/distill.py +184 -0
  41. threadkeeper/tools/extract.py +411 -0
  42. threadkeeper/tools/graph.py +183 -0
  43. threadkeeper/tools/invariants.py +177 -0
  44. threadkeeper/tools/lessons.py +110 -0
  45. threadkeeper/tools/missed_spawns.py +142 -0
  46. threadkeeper/tools/peers.py +579 -0
  47. threadkeeper/tools/pickup.py +148 -0
  48. threadkeeper/tools/probes.py +251 -0
  49. threadkeeper/tools/process_health.py +90 -0
  50. threadkeeper/tools/session.py +34 -0
  51. threadkeeper/tools/shadow_review.py +106 -0
  52. threadkeeper/tools/skills.py +856 -0
  53. threadkeeper/tools/spawn.py +871 -0
  54. threadkeeper/tools/style.py +44 -0
  55. threadkeeper/tools/threads.py +299 -0
  56. threadkeeper-0.4.0.dist-info/METADATA +351 -0
  57. threadkeeper-0.4.0.dist-info/RECORD +61 -0
  58. threadkeeper-0.4.0.dist-info/WHEEL +5 -0
  59. threadkeeper-0.4.0.dist-info/entry_points.txt +2 -0
  60. threadkeeper-0.4.0.dist-info/licenses/LICENSE +21 -0
  61. threadkeeper-0.4.0.dist-info/top_level.txt +1 -0
threadkeeper/ingest.py ADDED
@@ -0,0 +1,507 @@
+ """Live ingestion of Claude Code jsonl transcripts into dialog_messages/_fts.
+ Background daemon ticks every INGEST_INTERVAL_S; brief() can also call _ingest_recent_only directly."""
+ from __future__ import annotations
+
+ import json as _json
+ import os
+ import sqlite3
+ import threading
+ import time
+ from datetime import datetime as _dt
+ from pathlib import Path
+ from typing import Optional
+
+ from .config import (
+     INGEST_CAP_PER_CALL,
+     INGEST_INTERVAL_S,
+     INGEST_RECENT_WINDOW_S,
+     SEMANTIC_AVAILABLE,
+ )
+ from .db import get_db
+ from .embeddings import _embed
+
+ _ingest_thread: Optional[threading.Thread] = None
+ _ingest_lock = threading.Lock()
+ _ingest_interval_s = INGEST_INTERVAL_S
+ _ingest_recent_window_s = INGEST_RECENT_WINDOW_S
+
+
+ def _backfill_dialog_fts_if_empty(conn: sqlite3.Connection) -> None:
+     """Populate dialog_fts from dialog_messages on first start (or after a
+     schema add when most rows are already in dialog_messages but not in FTS).
+
+     Compares row counts: if dialog_fts is meaningfully behind dialog_messages,
+     backfill the gap (uuids missing from fts). Idempotent — only inserts
+     rows whose uuid isn't already in dialog_fts. Roughly 5-10s for 100k
+     records on a laptop — one-time cost."""
+     try:
+         msg_cnt = conn.execute(
+             "SELECT COUNT(*) c FROM dialog_messages"
+         ).fetchone()["c"]
+         fts_cnt = conn.execute(
+             "SELECT COUNT(*) c FROM dialog_fts"
+         ).fetchone()["c"]
+     except sqlite3.OperationalError:
+         return
+     if fts_cnt >= msg_cnt - 5:
+         # close enough — newly-arrived rows fill via INSERT trigger in _ingest_file
+         conn.execute(
+             "INSERT INTO style (key, value, updated_at) VALUES (?,?,?) "
+             "ON CONFLICT(key) DO UPDATE SET value=excluded.value, "
+             "updated_at=excluded.updated_at",
+             ("fts_backfilled", str(fts_cnt), int(time.time())),
+         )
+         conn.commit()
+         return
+     # backfill rows present in dialog_messages but missing from dialog_fts
+     missing = conn.execute(
+         "SELECT d.uuid, d.content FROM dialog_messages d "
+         "LEFT JOIN dialog_fts f ON f.uuid = d.uuid "
+         "WHERE f.uuid IS NULL"
+     ).fetchall()
+     batch: list[tuple[str, str]] = []
+     added = 0
+     for r in missing:
+         batch.append((r["uuid"], r["content"]))
+         if len(batch) >= 5000:
+             conn.executemany(
+                 "INSERT INTO dialog_fts (uuid, content) VALUES (?, ?)",
+                 batch,
+             )
+             conn.commit()
+             added += len(batch)
+             batch = []
+     if batch:
+         conn.executemany(
+             "INSERT INTO dialog_fts (uuid, content) VALUES (?, ?)",
+             batch,
+         )
+         added += len(batch)
+     final_cnt = conn.execute(
+         "SELECT COUNT(*) c FROM dialog_fts"
+     ).fetchone()["c"]
+     conn.execute(
+         "INSERT INTO style (key, value, updated_at) VALUES (?,?,?) "
+         "ON CONFLICT(key) DO UPDATE SET value=excluded.value, "
+         "updated_at=excluded.updated_at",
+         ("fts_backfilled", f"{final_cnt}+{added}", int(time.time())),
+     )
+     conn.commit()
+
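The count comparison above only decides whether to backfill; the actual gap is found with a LEFT JOIN anti-join. A minimal self-contained sketch of that pattern against a throwaway in-memory schema (the table and column names mirror the ones used above; everything else is illustrative, and it assumes an SQLite build with FTS5, which CPython's bundled SQLite has by default):

import sqlite3

conn = sqlite3.connect(":memory:")
conn.row_factory = sqlite3.Row
conn.execute("CREATE TABLE dialog_messages (uuid TEXT PRIMARY KEY, content TEXT)")
conn.execute("CREATE VIRTUAL TABLE dialog_fts USING fts5(uuid, content)")
conn.executemany(
    "INSERT INTO dialog_messages VALUES (?, ?)",
    [("u1", "alpha"), ("u2", "beta"), ("u3", "gamma")],
)
conn.execute("INSERT INTO dialog_fts VALUES ('u1', 'alpha')")  # u2/u3 not indexed yet

# Anti-join: rows in dialog_messages with no dialog_fts counterpart.
missing = conn.execute(
    "SELECT d.uuid, d.content FROM dialog_messages d "
    "LEFT JOIN dialog_fts f ON f.uuid = d.uuid "
    "WHERE f.uuid IS NULL"
).fetchall()
conn.executemany(
    "INSERT INTO dialog_fts (uuid, content) VALUES (?, ?)",
    [(r["uuid"], r["content"]) for r in missing],
)
print([r["uuid"] for r in missing])  # ['u2', 'u3']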
+
+ def _parse_ts(ts: str) -> int:
+     try:
+         return int(_dt.fromisoformat(ts.replace("Z", "+00:00")).timestamp())
+     except Exception:
+         return int(time.time())
+
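For instance (illustrative values):

# ISO-8601 with a trailing Z normalizes to UTC before epoch conversion:
assert _parse_ts("2024-01-02T03:04:05Z") == 1704164645
# Unparseable input falls back to "now" rather than raising.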
+
+ def _scan_message_for_skill_use(msg: dict) -> list[str]:
+     """Return Skill tool_use invocations found in a single message dict.
+     Handles both flat and nested content arrays; accepts either 'skill' or
+     'name' key inside the tool_use input payload. Returns [] for non-
+     matching messages.
+     """
+     found: list[str] = []
+
+     def _walk(node) -> None:
+         if isinstance(node, list):
+             for item in node:
+                 _walk(item)
+             return
+         if not isinstance(node, dict):
+             return
+         if node.get("type") == "tool_use" and node.get("name") == "Skill":
+             inp = node.get("input") or {}
+             if isinstance(inp, dict):
+                 val = inp.get("skill") or inp.get("name")
+                 if isinstance(val, str) and val:
+                     found.append(val)
+         # Recurse into anything that might wrap further content blocks.
+         for v in node.values():
+             if isinstance(v, (list, dict)):
+                 _walk(v)
+
+     _walk(msg)
+     return found
+
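A hypothetical assistant message showing both shapes the walker handles (a top-level content array and a nested one; the skill names are made up):

_msg = {  # illustrative payload only
    "content": [
        {"type": "text", "text": "Running the skill now."},
        {"type": "tool_use", "name": "Skill", "input": {"skill": "commit-helper"}},
    ],
    "message": {
        "content": [
            {"type": "tool_use", "name": "Skill", "input": {"name": "review-pass"}},
        ],
    },
}
assert _scan_message_for_skill_use(_msg) == ["commit-helper", "review-pass"]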
+
+ def _extract_text(msg: dict) -> str:
+     """Pull searchable text from a message; skip tool_use args, cap tool_results."""
+     content = msg.get("content", "")
+     if isinstance(content, str):
+         return content
+     if not isinstance(content, list):
+         return ""
+     parts: list[str] = []
+     for block in content:
+         if not isinstance(block, dict):
+             continue
+         t = block.get("type")
+         if t == "text":
+             parts.append(block.get("text", ""))
+         elif t == "thinking":
+             parts.append(f"[thinking] {block.get('thinking', '')}")
+         elif t == "tool_result":
+             tr = block.get("content", "")
+             if isinstance(tr, list):
+                 tr = " ".join(b.get("text", "") for b in tr if isinstance(b, dict))
+             if isinstance(tr, str) and tr:
+                 parts.append(f"[tool_result] {tr[:800]}")
+         # tool_use blocks deliberately skipped (noisy for semantic search)
+     return "\n".join(p for p in parts if p)
+
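Concretely, a mixed content array flattens like this (hypothetical message):

_msg = {  # illustrative payload only
    "content": [
        {"type": "text", "text": "Done."},
        {"type": "thinking", "thinking": "check the tests first"},
        {"type": "tool_use", "name": "Bash", "input": {"command": "ls"}},
        {"type": "tool_result", "content": [{"type": "text", "text": "ok"}]},
    ]
}
assert _extract_text(_msg) == "Done.\n[thinking] check the tests first\n[tool_result] ok"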
+
+ def _ingest_file(conn: sqlite3.Connection, fp: Path, max_msgs: int,
+                  adapter=None) -> int:
+     """Incrementally ingest one transcript file from the given adapter.
+     Returns number of new messages added.
+
+     When `adapter` is None (legacy callers), the Claude Code adapter is
+     used so the function's old contract still holds.
+
+     Strategy: skip the file entirely if mtime hasn't advanced past
+     ingest_state.last_mtime. Otherwise use `adapter.iter_messages(fp)`
+     to enumerate normalized messages and dedup via dialog_messages.uuid.
+     """
+     if adapter is None:
+         from .adapters import _CLAUDE_CODE as _claude_default  # type: ignore
+         adapter = _claude_default
+     if not fp.exists():
+         return 0
+     stat = fp.stat()
+     mtime = int(stat.st_mtime)
+     state = conn.execute(
+         "SELECT last_mtime FROM ingest_state WHERE file_path=?", (str(fp),)
+     ).fetchone()
+     last_mtime = state["last_mtime"] if state else 0
+     if mtime <= last_mtime:
+         return 0
+     added = 0
+     try:
+         for nm in adapter.iter_messages(fp):
+             if added >= max_msgs:
+                 break
+             if not nm.uuid:
+                 continue
+             if conn.execute(
+                 "SELECT 1 FROM dialog_messages WHERE uuid=?", (nm.uuid,)
+             ).fetchone():
+                 continue
+             # Skill scan first — runs even for tool-only assistant turns
+             # whose text body would fail the >=10 char filter below.
+             if nm.role == "assistant":
+                 for skill_name in _scan_message_for_skill_use(nm.raw):
+                     try:
+                         conn.execute(
+                             "INSERT INTO skill_usage "
+                             "(name, created_at, created_by_origin) "
+                             "VALUES (?, ?, 'foreground') "
+                             "ON CONFLICT(name) DO NOTHING",
+                             (skill_name, nm.created_at),
+                         )
+                         conn.execute(
+                             "UPDATE skill_usage "
+                             "SET last_used_at=?, use_count=use_count+1 "
+                             "WHERE name=? AND (last_used_at IS NULL "
+                             "OR last_used_at < ?)",
+                             (nm.created_at, skill_name, nm.created_at),
+                         )
+                     except sqlite3.OperationalError:
+                         pass  # skill_usage missing on this conn
+             text = nm.content
+             if not text or len(text) < 10:
+                 continue
+             emb = _embed(text[:2000]) if SEMANTIC_AVAILABLE else None
+             conn.execute(
+                 "INSERT INTO dialog_messages (uuid, source, project, session_id, "
+                 "role, content, model, created_at, embedding) "
+                 "VALUES (?,?,?,?,?,?,?,?,?)",
+                 (nm.uuid, adapter.name, adapter.project_label(fp),
+                  nm.session_id, nm.role, text,
+                  nm.model, nm.created_at, emb)
+             )
+             try:
+                 conn.execute(
+                     "INSERT INTO dialog_fts (uuid, content) VALUES (?, ?)",
+                     (nm.uuid, text),
+                 )
+             except sqlite3.OperationalError:
+                 pass
+             if emb is not None:
+                 try:
+                     from .embeddings import _vec_upsert_dialog
+                     _vec_upsert_dialog(conn, nm.uuid, emb)
+                 except Exception:
+                     pass
+             added += 1
+     except OSError:
+         return added
+     conn.execute(
+         "INSERT INTO ingest_state (file_path, last_size, last_mtime, ingested_at, msg_count) "
+         "VALUES (?,?,?,?,?) "
+         "ON CONFLICT(file_path) DO UPDATE SET "
+         "  last_size=excluded.last_size, last_mtime=excluded.last_mtime, "
+         "  ingested_at=excluded.ingested_at, msg_count=ingest_state.msg_count+excluded.msg_count",
+         (str(fp), stat.st_size, mtime, int(time.time()), added)
+     )
+     return added
+
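The ingest_state upsert is what makes re-runs cheap: msg_count accumulates while last_mtime ratchets forward. A minimal sketch of that ratchet in isolation, assuming only the columns the statement touches (the helper name is hypothetical):

import sqlite3
import time

conn = sqlite3.connect(":memory:")
conn.execute(
    "CREATE TABLE ingest_state (file_path TEXT PRIMARY KEY, "
    "last_size INTEGER, last_mtime INTEGER, ingested_at INTEGER, msg_count INTEGER)"
)

def record_pass(fp: str, size: int, mtime: int, added: int) -> None:
    # Same ON CONFLICT shape as _ingest_file: counters add, watermarks replace.
    conn.execute(
        "INSERT INTO ingest_state VALUES (?,?,?,?,?) "
        "ON CONFLICT(file_path) DO UPDATE SET "
        "last_size=excluded.last_size, last_mtime=excluded.last_mtime, "
        "ingested_at=excluded.ingested_at, "
        "msg_count=ingest_state.msg_count+excluded.msg_count",
        (fp, size, mtime, int(time.time()), added),
    )

record_pass("a.jsonl", 100, 1000, 7)
record_pass("a.jsonl", 160, 1060, 3)
row = conn.execute("SELECT last_mtime, msg_count FROM ingest_state").fetchone()
assert row == (1060, 10)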
+
+ def _ingest_all(conn: sqlite3.Connection, max_msgs: int = 1_000_000) -> tuple[int, int]:
+     """Iterate every installed CLI adapter, incrementally ingest each
+     transcript file. Returns (new_msgs, files_seen) across ALL adapters."""
+     from .adapters import installed_adapters
+     total = 0
+     files_seen = 0
+     for adapter in installed_adapters():
+         files = adapter.transcript_files()
+         files_seen += len(files)
+         files = sorted(
+             files,
+             key=lambda p: p.stat().st_mtime if p.exists() else 0,
+             reverse=True,
+         )
+         for fp in files:
+             if total >= max_msgs:
+                 break
+             total += _ingest_file(conn, fp, max_msgs - total, adapter=adapter)
+     conn.commit()
+     return (total, files_seen)
+
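A plausible one-shot invocation, e.g. from a maintenance script (the entry-point function itself is hypothetical; get_db is the same accessor imported at the top of this module):

def backfill_everything() -> None:
    # Hypothetical maintenance entry point — not part of this module.
    conn = get_db()
    try:
        new_msgs, files_seen = _ingest_all(conn)
        print(f"ingested {new_msgs} new messages from {files_seen} files")
    finally:
        conn.close()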
+
+ def _ingest_recent_only(conn: sqlite3.Connection,
+                         max_msgs: int = 200,
+                         max_age_s: int = 600) -> tuple[int, int]:
+     """Live-mode ingest: only transcript files modified within `max_age_s`,
+     across ALL installed CLI adapters.
+
+     Commits after EACH file so the background tick doesn't hold a long
+     write lock — multi-writer contention (parent + children + ingester)
+     deadlocks fast otherwise."""
+     from .adapters import installed_adapters
+     cutoff = time.time() - max_age_s
+     fresh: list[tuple[float, Path, object]] = []
+     for adapter in installed_adapters():
+         for p in adapter.transcript_files():
+             try:
+                 m = p.stat().st_mtime
+             except OSError:
+                 continue
+             if m > cutoff:
+                 fresh.append((m, p, adapter))
+     fresh.sort(key=lambda x: x[0], reverse=True)
+     total = 0
+     for _, fp, adapter in fresh:
+         if total >= max_msgs:
+             break
+         added = _ingest_file(conn, fp, max_msgs - total, adapter=adapter)
+         total += added
+         if added:
+             try:
+                 conn.commit()
+             except sqlite3.OperationalError:
+                 pass
+     return (total, len(fresh))
+
+
+ def _backfill_skill_usage_from_jsonls(conn: sqlite3.Connection) -> int:
+     """One-shot historical scan across every installed adapter. Finds
+     assistant messages with tool_use(name='Skill') blocks and bumps
+     skill_usage counters. Idempotent — the UPDATE guard on last_used_at
+     prevents double-counting.
+
+     Skill-tool semantics are Claude-specific in practice (other CLIs
+     don't emit `tool_use name='Skill'` blocks), but the scanner is
+     defensive and silently returns [] on unmatched payload shapes —
+     so iterating all adapters is safe.
+
+     Returns the number of (skill_name, message) pairs processed.
+     """
+     from .adapters import installed_adapters
+     processed = 0
+     for adapter in installed_adapters():
+         for fp in adapter.transcript_files():
+             try:
+                 for nm in adapter.iter_messages(fp):
+                     if nm.role != "assistant":
+                         continue
+                     skills = _scan_message_for_skill_use(nm.raw)
+                     if not skills:
+                         continue
+                     for skill_name in skills:
+                         try:
+                             conn.execute(
+                                 "INSERT INTO skill_usage "
+                                 "(name, created_at, created_by_origin) "
+                                 "VALUES (?, ?, 'foreground') "
+                                 "ON CONFLICT(name) DO NOTHING",
+                                 (skill_name, nm.created_at),
+                             )
+                             conn.execute(
+                                 "UPDATE skill_usage "
+                                 "SET last_used_at=?, use_count=use_count+1 "
+                                 "WHERE name=? AND (last_used_at IS NULL "
+                                 "OR last_used_at < ?)",
+                                 (nm.created_at, skill_name, nm.created_at),
+                             )
+                             processed += 1
+                         except sqlite3.OperationalError:
+                             pass
+             except OSError:
+                 continue
+     try:
+         conn.commit()
+     except sqlite3.OperationalError:
+         pass
+     return processed
+
+
+ def _backfill_note_embeddings(conn: sqlite3.Connection, max_n: int = 20) -> int:
+     """Embed up to `max_n` notes whose embedding column is NULL, and mirror
+     every newly-embedded blob into notes_vec.
+
+     Light spawned children (NO_EMBEDDINGS=1) write notes with embedding=NULL
+     because they don't carry the model. A parent process with embeddings
+     available catches them up here so semantic search isn't permanently
+     blind to those notes. No-op when this process doesn't have embeddings.
+     Returns the number of rows updated.
+     """
+     from .config import SEMANTIC_AVAILABLE
+     if not SEMANTIC_AVAILABLE:
+         return 0
+     try:
+         rows = conn.execute(
+             "SELECT id, content FROM notes "
+             "WHERE embedding IS NULL "
+             "ORDER BY id DESC LIMIT ?",
+             (max_n,),
+         ).fetchall()
+     except sqlite3.OperationalError:
+         return 0
+     if not rows:
+         return 0
+     from .embeddings import _embed, _vec_upsert_note
+     updated = 0
+     for r in rows:
+         try:
+             emb = _embed(r["content"])
+         except Exception:
+             continue
+         if emb is None:
+             continue
+         try:
+             conn.execute(
+                 "UPDATE notes SET embedding=? WHERE id=?",
+                 (emb, r["id"]),
+             )
+             _vec_upsert_note(conn, r["id"], emb)
+             updated += 1
+         except sqlite3.OperationalError:
+             continue
+     if updated:
+         try:
+             conn.commit()
+         except sqlite3.OperationalError:
+             pass
+     return updated
+
+
+ def _backfill_vec_tables(conn: sqlite3.Connection, batch: int = 500) -> tuple[int, int]:
+     """One-shot migration: mirror existing notes.embedding and
+     dialog_messages.embedding BLOBs into notes_vec / dialog_vec.
+
+     Idempotent — `INSERT OR REPLACE` won't duplicate. Returns
+     (notes_inserted, dialog_inserted). Called from the background ingester
+     tick; bails fast when there's nothing to do.
+     """
+     from .config import SEMANTIC_AVAILABLE
+     from .db import vec_available
+     if not SEMANTIC_AVAILABLE or not vec_available():
+         return (0, 0)
+     from .embeddings import _vec_upsert_note, _vec_upsert_dialog
+     n_notes = 0
+     n_dialog = 0
+     try:
+         # Notes that have embedding but aren't yet in notes_vec.
+         rows = conn.execute(
+             "SELECT n.id, n.embedding FROM notes n "
+             "LEFT JOIN notes_vec v ON v.id = n.id "
+             "WHERE n.embedding IS NOT NULL AND v.id IS NULL "
+             "LIMIT ?",
+             (batch,),
+         ).fetchall()
+         for r in rows:
+             _vec_upsert_note(conn, r["id"], r["embedding"])
+             n_notes += 1
+     except sqlite3.OperationalError:
+         pass
+     try:
+         # Dialog messages with embedding but no dialog_vec_map row → need
+         # mirroring. (We check via the map because dialog_vec is keyed
+         # by rowid, not uuid.)
+         rows = conn.execute(
+             "SELECT d.uuid, d.embedding FROM dialog_messages d "
+             "LEFT JOIN dialog_vec_map m ON m.uuid = d.uuid "
+             "WHERE d.embedding IS NOT NULL AND m.uuid IS NULL "
+             "LIMIT ?",
+             (batch,),
+         ).fetchall()
+         for r in rows:
+             _vec_upsert_dialog(conn, r["uuid"], r["embedding"])
+             n_dialog += 1
+     except sqlite3.OperationalError:
+         pass
+     if n_notes or n_dialog:
+         try:
+             conn.commit()
+         except sqlite3.OperationalError:
+             pass
+     return (n_notes, n_dialog)
+
+
+ def _start_background_ingester() -> None:
+     """Start a daemon thread that incrementally ingests recently-modified jsonl
+     files. Idempotent: subsequent calls are no-ops. Daemon=True so it dies with
+     the process; no shutdown handshake needed."""
+     global _ingest_thread
+     if _ingest_thread is not None and _ingest_thread.is_alive():
+         return
+     if _ingest_interval_s <= 0:
+         return  # disabled via env
+
+     def _loop() -> None:
+         while True:
+             time.sleep(_ingest_interval_s)
+             try:
+                 if not _ingest_lock.acquire(blocking=False):
+                     continue  # another tick still running, skip
+                 try:
+                     bg_conn = get_db()
+                     try:
+                         _ingest_recent_only(
+                             bg_conn,
+                             max_msgs=200,
+                             max_age_s=_ingest_recent_window_s,
+                         )
+                         # Embedding backfill: light children write notes
+                         # with embedding=NULL (NO_EMBEDDINGS=1). Parent
+                         # processes with SEMANTIC_AVAILABLE catch them up
+                         # asynchronously so semantic search recovers
+                         # without blocking the child.
+                         _backfill_note_embeddings(bg_conn, max_n=20)
+                         # vec0 backfill: mirror legacy BLOB embeddings
+                         # into the vec0 virtual tables in batches so the
+                         # sub-linear index gradually warms up.
+                         _backfill_vec_tables(bg_conn, batch=500)
+                     finally:
+                         bg_conn.close()
+                 finally:
+                     _ingest_lock.release()
+             except Exception:
+                 pass  # never crash the daemon
+
+     _ingest_thread = threading.Thread(
+         target=_loop, name="thread-keeper-live-ingest", daemon=True
+     )
+     _ingest_thread.start()
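Typical wiring, assuming the server module calls this once at startup (the call site is an assumption; the idempotence guard above makes repeated calls harmless):

_start_background_ingester()
_start_background_ingester()  # no-op: the ingest thread is already alive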
threadkeeper/lessons.py ADDED
@@ -0,0 +1,170 @@
+ """CLI-agnostic procedural-knowledge store at ~/.threadkeeper/lessons.md.
+
+ The learning loop (auto-review on close_thread + shadow_review daemon)
+ materializes lessons here. Every supported CLI's per-user instructions
+ file references this path so the lessons take effect in any of them.
+
+ Format on disk:
+
+     # thread-keeper lessons
+
+     Procedural knowledge accumulated across sessions. Auto-managed by
+     the learning loop — do not edit by hand; new entries are appended.
+
+     <!-- LESSON:BEGIN slug=<slug> ts=<unix> source=<thread_id|shadow> -->
+     ## <slug>
+     > <one-line summary>
+
+     <body of the lesson>
+     <!-- LESSON:END slug=<slug> -->
+
+     <!-- LESSON:BEGIN ... -->
+     ...
+
+ The sentinel-bracketed sections make per-entry diffs cheap and let us
+ update or de-duplicate without rewriting the whole file. New entries
+ land at the bottom (chronological).
+ """
+ from __future__ import annotations
+
+ import os
+ import re
+ import time
+ from pathlib import Path
+ from typing import Iterator, Optional
+
+
+ _LESSONS_PATH = Path(
+     os.environ.get("THREADKEEPER_LESSONS", "~/.threadkeeper/lessons.md")
+ ).expanduser()
+
+
+ _HEADER = """\
+ # thread-keeper lessons
+
+ Procedural knowledge accumulated across sessions. Auto-managed by the
+ learning loop — do not edit by hand; new entries are appended.
+
+ """
+
+
+ _SLUG_RE = re.compile(r"[^a-z0-9-]+")
+
+
+ def _slugify(title: str) -> str:
+     """Produce a safe filesystem/url slug from a lesson title."""
+     s = title.strip().lower().replace(" ", "-")
+     s = _SLUG_RE.sub("-", s)
+     s = re.sub(r"-{2,}", "-", s).strip("-")
+     return s or "untitled"
+
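For example:

assert _slugify("Prefer rebase over merge!") == "prefer-rebase-over-merge"
assert _slugify("  ---  ") == "untitled"  # collapses to nothing -> fallback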
+
+ def _ensure_file(path: Path) -> None:
+     """Create the lessons file with the standard header if absent."""
+     if path.exists():
+         return
+     path.parent.mkdir(parents=True, exist_ok=True)
+     path.write_text(_HEADER)
+
+
+ def _format_section(slug: str, summary: str, body: str,
+                     source: str, ts: int) -> str:
+     """One LESSON:BEGIN…LESSON:END block with the sentinel markers."""
+     summary_line = f"> {summary.strip()}" if summary.strip() else ""
+     body_text = body.strip()
+     return (
+         f"<!-- LESSON:BEGIN slug={slug} ts={ts} source={source} -->\n"
+         f"## {slug}\n"
+         + (f"{summary_line}\n\n" if summary_line else "\n")
+         + body_text + "\n"
+         f"<!-- LESSON:END slug={slug} -->\n\n"
+     )
+
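With illustrative arguments, the rendered block looks like this:

print(_format_section(
    "prefer-rebase-over-merge", "Rebase keeps history linear.",
    "Long-lived branches drift; rebase before review.", "Tabc123", 1700000000,
))
# <!-- LESSON:BEGIN slug=prefer-rebase-over-merge ts=1700000000 source=Tabc123 -->
# ## prefer-rebase-over-merge
# > Rebase keeps history linear.
#
# Long-lived branches drift; rebase before review.
# <!-- LESSON:END slug=prefer-rebase-over-merge -->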
+
+ _BLOCK_RE = re.compile(
+     r"<!-- LESSON:BEGIN slug=(?P<slug>[^\s]+)[^>]*-->"
+     r"(?P<body>.*?)"
+     r"<!-- LESSON:END slug=(?P=slug) -->",
+     re.DOTALL,
+ )
+
+
+ def append_lesson(
+     title: str,
+     body: str,
+     summary: str = "",
+     source: str = "",
+     path: Optional[Path] = None,
+ ) -> str:
+     """Append a new lesson section, or replace an existing one with the
+     same slug. Returns the slug.
+
+     `title` becomes the section header (slugified for the sentinel).
+     `body` is markdown; `summary` is a one-liner shown right after the
+     header. `source` is a free-text provenance tag — typically a thread
+     id ("Tabc123") or "shadow" for shadow_review writes.
+     """
+     fp = path or _LESSONS_PATH
+     _ensure_file(fp)
+     slug = _slugify(title)
+     ts = int(time.time())
+     new_section = _format_section(slug, summary, body, source or "", ts)
+
+     body_existing = fp.read_text()
+     # If a section with this slug already exists, replace it in-place
+     # (idempotent re-materialization of the same lesson).
+     target_begin = f"<!-- LESSON:BEGIN slug={slug} "
+     target_end = f"<!-- LESSON:END slug={slug} -->"
+     if target_begin in body_existing and target_end in body_existing:
+         head, _, rest = body_existing.partition(target_begin)
+         # Find the matching END after the BEGIN.
+         end_marker = target_end
+         end_idx = rest.find(end_marker)
+         if end_idx >= 0:
+             tail = rest[end_idx + len(end_marker):]
+             body_existing = head + new_section.rstrip() + "\n" + tail.lstrip("\n")
+         else:
+             # Malformed file (BEGIN without END) — just append at end.
+             body_existing = body_existing.rstrip() + "\n\n" + new_section
+     else:
+         body_existing = body_existing.rstrip() + "\n\n" + new_section
+     fp.write_text(body_existing)
+     return slug
+
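Usage, writing to a temp path so the real store is untouched (the lesson text is illustrative):

from pathlib import Path
import tempfile

tmp = Path(tempfile.mkdtemp()) / "lessons.md"
slug = append_lesson(
    "Prefer rebase over merge",
    "Long-lived branches drift; rebase before review.",
    summary="Rebase keeps history linear.",
    source="Tabc123",
    path=tmp,
)
assert slug == "prefer-rebase-over-merge"
# Same title again: the existing block is replaced, not duplicated.
append_lesson("Prefer rebase over merge", "Updated body.", path=tmp)
assert tmp.read_text().count("LESSON:BEGIN") == 1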
+
+ def iter_lessons(path: Optional[Path] = None) -> Iterator[dict]:
+     """Yield every lesson section as a dict with keys:
+     slug, body (raw markdown between BEGIN/END), ts, source.
+
+     Order is file-order (chronological if writes are append-only)."""
+     fp = path or _LESSONS_PATH
+     if not fp.exists():
+         return
+     body = fp.read_text()
+     for m in _BLOCK_RE.finditer(body):
+         slug = m.group("slug")
+         block_body = m.group("body").strip()
+         # Parse ts and source out of the BEGIN line we already matched.
+         begin_line = body[m.start():m.start() + 200].split("\n", 1)[0]
+         ts_match = re.search(r"ts=(\d+)", begin_line)
+         source_match = re.search(r"source=([^\s>]+)", begin_line)
+         yield {
+             "slug": slug,
+             "body": block_body,
+             "ts": int(ts_match.group(1)) if ts_match else 0,
+             "source": source_match.group(1) if source_match else "",
+         }
+
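Continuing the tmp store from the sketch above, entries read back as plain dicts, so filtering by provenance is a one-liner:

for lesson in iter_lessons(path=tmp):
    print(lesson["slug"], lesson["source"], lesson["ts"])
shadow_lessons = [l for l in iter_lessons(path=tmp) if l["source"] == "shadow"]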
+
+ def count_lessons(path: Optional[Path] = None) -> int:
+     """Cheap count for diagnostic surfaces (brief, shadow_review_status)."""
+     fp = path or _LESSONS_PATH
+     if not fp.exists():
+         return 0
+     return len(_BLOCK_RE.findall(fp.read_text()))
+
+
+ def get_path() -> Path:
+     """Public accessor — used by _setup to reference the file in the
+     managed-instructions block."""
+     return _LESSONS_PATH