claude_memory 0.10.0 → 0.12.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72) hide show
  1. checksums.yaml +4 -4
  2. data/.claude/memory.sqlite3 +0 -0
  3. data/.claude/rules/claude_memory.generated.md +42 -64
  4. data/.claude/skills/release/SKILL.md +44 -6
  5. data/.claude/skills/study-repo/SKILL.md +15 -0
  6. data/.claude-plugin/commands/audit-memory.md +68 -0
  7. data/.claude-plugin/marketplace.json +1 -1
  8. data/.claude-plugin/plugin.json +1 -1
  9. data/CHANGELOG.md +70 -0
  10. data/CLAUDE.md +20 -5
  11. data/README.md +64 -2
  12. data/db/migrations/018_add_otel_telemetry.rb +81 -0
  13. data/docs/1_0_punchlist.md +522 -89
  14. data/docs/GETTING_STARTED.md +3 -1
  15. data/docs/api_stability.md +341 -0
  16. data/docs/architecture.md +3 -3
  17. data/docs/audit_runbook.md +209 -0
  18. data/docs/claude_monitoring.md +956 -0
  19. data/docs/dashboard.md +23 -3
  20. data/docs/improvements.md +329 -5
  21. data/docs/influence/ai-memory-systems-2026.md +403 -0
  22. data/docs/memory_audit_2026-05-21.md +303 -0
  23. data/docs/plugin.md +1 -1
  24. data/docs/quality_review.md +35 -0
  25. data/lib/claude_memory/audit/checks.rb +239 -0
  26. data/lib/claude_memory/audit/finding.rb +33 -0
  27. data/lib/claude_memory/audit/runner.rb +73 -0
  28. data/lib/claude_memory/commands/audit_command.rb +117 -0
  29. data/lib/claude_memory/commands/dashboard_command.rb +2 -1
  30. data/lib/claude_memory/commands/digest_command.rb +95 -3
  31. data/lib/claude_memory/commands/hook_command.rb +27 -2
  32. data/lib/claude_memory/commands/import_auto_memory_command.rb +180 -0
  33. data/lib/claude_memory/commands/initializers/hooks_configurator.rb +7 -4
  34. data/lib/claude_memory/commands/otel_command.rb +240 -0
  35. data/lib/claude_memory/commands/registry.rb +5 -1
  36. data/lib/claude_memory/commands/show_command.rb +90 -0
  37. data/lib/claude_memory/commands/stats_command.rb +94 -2
  38. data/lib/claude_memory/configuration.rb +60 -0
  39. data/lib/claude_memory/core/fact_query_builder.rb +1 -0
  40. data/lib/claude_memory/dashboard/api.rb +8 -0
  41. data/lib/claude_memory/dashboard/index.html +140 -1
  42. data/lib/claude_memory/dashboard/prompt_journey.rb +48 -0
  43. data/lib/claude_memory/dashboard/server.rb +86 -0
  44. data/lib/claude_memory/dashboard/telemetry.rb +156 -0
  45. data/lib/claude_memory/dashboard/trust.rb +180 -11
  46. data/lib/claude_memory/deprecations.rb +106 -0
  47. data/lib/claude_memory/distill/bare_conclusion_detector.rb +71 -0
  48. data/lib/claude_memory/distill/reference_material_detector.rb +37 -4
  49. data/lib/claude_memory/hook/auto_memory_mirror.rb +7 -3
  50. data/lib/claude_memory/hook/context_injector.rb +11 -2
  51. data/lib/claude_memory/hook/handler.rb +142 -1
  52. data/lib/claude_memory/mcp/tool_definitions.rb +3 -3
  53. data/lib/claude_memory/otel/attributes.rb +118 -0
  54. data/lib/claude_memory/otel/constants.rb +32 -0
  55. data/lib/claude_memory/otel/ingestor.rb +54 -0
  56. data/lib/claude_memory/otel/otlp_json_envelope.rb +254 -0
  57. data/lib/claude_memory/otel/prompt_scope.rb +108 -0
  58. data/lib/claude_memory/otel/settings_writer.rb +122 -0
  59. data/lib/claude_memory/otel/status.rb +58 -0
  60. data/lib/claude_memory/recall/staleness_annotator.rb +73 -0
  61. data/lib/claude_memory/resolve/predicate_policy.rb +17 -1
  62. data/lib/claude_memory/resolve/resolver.rb +30 -3
  63. data/lib/claude_memory/shortcuts.rb +61 -18
  64. data/lib/claude_memory/store/prompt_journey_query.rb +87 -0
  65. data/lib/claude_memory/store/schema_manager.rb +1 -1
  66. data/lib/claude_memory/store/sqlite_store.rb +136 -0
  67. data/lib/claude_memory/sweep/maintenance.rb +31 -1
  68. data/lib/claude_memory/sweep/sweeper.rb +6 -0
  69. data/lib/claude_memory/templates/hooks.example.json +5 -0
  70. data/lib/claude_memory/version.rb +1 -1
  71. data/lib/claude_memory.rb +20 -0
  72. metadata +28 -1
data/README.md CHANGED
@@ -140,7 +140,51 @@ File-searchable questions ("what version is this?") and one-shot code generation
140
140
  - **Claude-Powered**: Uses Claude's intelligence to extract facts (no API key needed)
141
141
  - **Token Efficient**: 10x reduction in memory queries with progressive disclosure
142
142
  - **Database Maintenance**: Compact, export, and backup commands
143
- - **Built-in Observability** (0.10.0+): `claude-memory dashboard` opens a local web UI with a moments feed, trust panel, conflicts dedup, knowledge index, 👍/👎 feedback, and a 30-day utilization ratio. See **[Dashboard guide →](docs/dashboard.md)**. `claude-memory digest` writes a weekly markdown report; `claude-memory census` audits the predicate vocabulary across projects.
143
+ - **Built-in Observability** (0.10.0+): `claude-memory dashboard` opens a local web UI with a moments feed, trust panel (token budget, quality score, utilization, feedback), conflicts dedup, knowledge index, and 👍/👎 feedback. See **[Dashboard guide →](docs/dashboard.md)**. `claude-memory digest` writes a weekly markdown report (Activity, Context cost, Quality, New knowledge, Utilization, Conflicts, Feedback); `claude-memory show` prints what would be injected next SessionStart; `claude-memory census` audits the predicate vocabulary across projects.
144
+ - **OpenTelemetry ingestion** (Unreleased): point Claude Code's OTLP exporter at the dashboard and the new "Telemetry" tab shows per-API-call cost in USD, token usage by model, top tools by latency, and a per-prompt event waterfall. One-line setup:
145
+
146
+ ```bash
147
+ claude-memory dashboard --port 3377 & # start the receiver
148
+ claude-memory otel --enable # writes telemetry env into .claude/settings.json
149
+ claude-memory otel --enable-traces # optional: include OpenTelemetry spans
150
+ claude-memory otel --status # confirm metrics are flowing
151
+ ```
152
+
153
+ Only metrics and event names are captured by default — verbatim prompts and bodies stay off until you explicitly opt in via `claude-memory otel --capture-prompts`. The receiver binds to `127.0.0.1` only.
154
+
155
+ ## What's New in 0.11.0
156
+
157
+ Five user-visible signals so you can answer "is memory still worth it?" with
158
+ numbers, not vibes:
159
+
160
+ - **Token budget telemetry** — every SessionStart context injection now
161
+ records its estimated `context_tokens`. `claude-memory stats --tokens
162
+ [--since DAYS]` reports p50/p95/avg/min/max plus a histogram across
163
+ <500 / 500-1k / 1-2k / 2-5k / 5k+ buckets so you can see the per-session
164
+ cost at a glance. The dashboard's Trust panel and `claude-memory digest`
165
+ surface the same numbers.
166
+ - **Hallucination-rate metric** — the dashboard now scores how *clean* the
167
+ fact base is, not just how full it is. `Distill::BareConclusionDetector`
168
+ flags `decision` / `convention` facts that skipped the reason-clause
169
+ requirement. Trust panel shows `quality_score` (live 30-day window with
170
+ historical baseline beneath). `claude-memory digest` adds a Quality
171
+ section with rejection rate.
172
+ - **`claude-memory show`** — new command prints what memory *would* inject
173
+ at the next SessionStart in plain Markdown. Footer reports fact count,
174
+ ~token estimate, and char count so you see the cost at a glance. Default
175
+ hides the raw-transcript "Pending Knowledge" dump for readability;
176
+ `--pending` opts in. `--source startup|resume|clear` simulates each
177
+ fresh-session entrypoint.
178
+ - **First-week ROI nudge** — at SessionEnd, memory now prints
179
+ `memory contributed N facts this session, %used = X` for the first 10
180
+ sessions, then quiets. Cold-start trust signal — you don't have to know
181
+ about the dashboard. Opt out with `CLAUDE_MEMORY_NO_NUDGE=1`.
182
+ - **Harm benchmark prototype** — first ClaudeMemory benchmark that
183
+ measures whether memory can make Claude *wrong*. Three hand-written
184
+ cases (stale-tech, mismatched-scope, superseded-but-undetected) under
185
+ `spec/benchmarks/e2e/harm_bench_spec.rb`. Real-mode run on the 0.11
186
+ release reported 0/3 harm; the full 10-15-case corpus + release gate
187
+ lands in 0.12.
144
188
 
145
189
  ## What's New in 0.10.0
146
190
 
@@ -273,11 +317,26 @@ The uninstall command removes:
273
317
  - 📊 [Dashboard](docs/dashboard.md) - Local web UI for inspection and trust signals (0.10.0+)
274
318
  - 🔧 [Plugin Setup](docs/plugin.md) - Claude Code integration
275
319
  - 🏗️ [Architecture](docs/architecture.md) - Technical deep dive
320
+ - 🔒 [API Stability](docs/api_stability.md) - What's stable / experimental / internal across releases (0.12.0+)
276
321
  - 📝 [Changelog](CHANGELOG.md) - Release notes
277
322
 
278
323
  ## Benchmarks
279
324
 
280
- ClaudeMemory includes **DevMemBench**, a developer-domain benchmark suite that measures retrieval quality and truth maintenance accuracy. All offline benchmarks run locally at zero cost.
325
+ ClaudeMemory includes **DevMemBench**, a developer-domain benchmark suite that measures retrieval quality, truth maintenance accuracy, **negative-fact harm**, and **uplift over a hand-written CLAUDE.md baseline**. All offline benchmarks run locally at zero cost; end-to-end and comparative runs use real Claude (~$5-15 per full run).
326
+
327
+ ### Does memory ever make Claude *wrong*?
328
+
329
+ Every other benchmark measures whether memory helps. The negative-fact harm benchmark measures whether memory can hurt — injecting a stale, mis-scoped, superseded, or reference-material fact and watching Claude follow it. 13 scenarios across 4 harm classes, each with a realistic project scaffold whose actual state contradicts the wrong fact, scored best-of-3 by majority vote. The run fails the build if any scenario reliably produces a harm (>1%).
330
+
331
+ ```bash
332
+ EVAL_MODE=real HARM_BENCH_RUNS=3 EVAL_MAX_BUDGET_USD=0.50 bundle exec rspec spec/benchmarks/e2e/harm_bench_spec.rb
333
+ ```
334
+
335
+ **0.12 baseline (2026-05-28): 0/13 harm.** See [`spec/benchmarks/README.md`](spec/benchmarks/README.md#harm_scenariosyml-13-scenarios-full-corpus-0120) for the full corpus and methodology.
336
+
337
+ ### Is this better than a hand-written CLAUDE.md?
338
+
339
+ The single most important question for adoption is whether dynamic retrieval beats static context injection. ClaudeMemory ships a `CLAUDE.md baseline` adapter and a comparative E2E harness for exactly this. **The numbers aren't published yet (as of 0.12):** the current harness compares static CLAUDE.md (auto-loaded into every prompt) against ClaudeMemory's MCP-tool retrieval, but in headless `claude -p` mode Claude doesn't proactively call the recall tools, so the comparison doesn't yet exercise ClaudeMemory's retrieval path fairly. Publishing that gap as a headline number would mislead. The harness fix is tracked for 0.13 — see [`docs/1_0_punchlist.md`](docs/1_0_punchlist.md) #4.
281
340
 
282
341
  ### Latest Results
283
342
 
@@ -290,6 +349,9 @@ ClaudeMemory includes **DevMemBench**, a developer-domain benchmark suite that m
290
349
  | **Hybrid Retrieval** | Recall@5 (100 queries aggregate) | **72.7%** |
291
350
  | **Hybrid Retrieval** | Recall@10 (20 hard queries) | **62.8%** |
292
351
  | **Scope Ranking** | Queries returning expected facts | **5/5** |
352
+ | **Negative-Fact Harm (prototype)** | 0.11 baseline (3 scenarios, real Claude) | **0/3** |
353
+ | **Negative-Fact Harm (full corpus)** | 0.12 baseline (13 scenarios, best-of-3, real Claude) | **0/13 (0.0%)** |
354
+ | **E2E vs CLAUDE.md baseline** | 0.12 acceptance-rate delta (10 scenarios) | *deferred to 0.13 — harness doesn't exercise headless retrieval (#4)* |
293
355
 
294
356
  Semantic and hybrid retrieval use [fastembed-rb](https://github.com/khasinski/fastembed-rb) with the BAAI/bge-small-en-v1.5 model (384-dim, runs locally, no API key needed).
295
357
 
@@ -0,0 +1,81 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Migration v18: OpenTelemetry ingestion tables.
4
+ #
5
+ # ClaudeMemory's dashboard accepts OTLP/HTTP/JSON exports from Claude Code so
6
+ # users can see cost-per-API-call, token usage by model, latency, and per-prompt
7
+ # event journeys without leaving the dashboard.
8
+ #
9
+ # Three storage tables:
10
+ # - otel_metrics: numeric data points (token counts, USD cost, durations).
11
+ # Two value columns (value_int + value_float) preserve int64 precision for
12
+ # counters (e.g. token counts) that can exceed 2^53, beyond which a Float no longer represents every integer exactly.
13
+ # - otel_events: log-style records (user_prompt, tool_result, api_request,
14
+ # skill_activated, ...). Indexed on prompt_id for the journey UNION.
15
+ # - otel_traces: spans. Table ships now so the schema is forward-ready, but
16
+ # the dashboard's POST /v1/traces returns 501 until the user opts in via
17
+ # `claude-memory otel --enable-traces`.
18
+ #
19
+ # Plus an additive prompt_id column on activity_events so existing hook
20
+ # events (recall, hook_ingest, hook_context) can be UNION-joined into the
21
+ # Prompt Journey panel.
22
Sequel.migration do
  # Creates the otel_metrics, otel_events and otel_traces tables with their
  # lookup indexes, then adds a prompt_id column (plus index) to
  # activity_events. Every step is guarded so that re-running the migration
  # against a partially migrated database does not raise.
  up do
    create_table?(:otel_metrics) do
      primary_key :id
      String :name, null: false
      String :value_type, null: false
      # Integer and float values live in separate columns; only one is
      # expected to be populated per row (see value_type).
      Bignum :value_int
      Float :value_float
      String :unit
      String :attributes_json, text: true
      String :resource_json, text: true
      String :recorded_at, null: false
    end
    run "CREATE INDEX IF NOT EXISTS idx_otel_metrics_name_time ON otel_metrics(name, recorded_at)"
    run "CREATE INDEX IF NOT EXISTS idx_otel_metrics_recorded_at ON otel_metrics(recorded_at)"

    create_table?(:otel_events) do
      primary_key :id
      String :event_name, null: false
      String :session_id
      String :prompt_id
      String :attributes_json, text: true
      String :resource_json, text: true
      String :occurred_at, null: false
    end
    run "CREATE INDEX IF NOT EXISTS idx_otel_events_name_time ON otel_events(event_name, occurred_at)"
    run "CREATE INDEX IF NOT EXISTS idx_otel_events_session ON otel_events(session_id)"
    run "CREATE INDEX IF NOT EXISTS idx_otel_events_prompt ON otel_events(prompt_id)"

    create_table?(:otel_traces) do
      primary_key :id
      String :trace_id, null: false
      String :span_id, null: false
      String :parent_span_id
      String :name, null: false
      String :session_id
      String :prompt_id
      Bignum :start_unix_nano
      Bignum :end_unix_nano
      Integer :duration_ms
      String :status_code
      String :attributes_json, text: true
      String :resource_json, text: true
      String :recorded_at, null: false
    end
    run "CREATE INDEX IF NOT EXISTS idx_otel_traces_trace ON otel_traces(trace_id)"
    run "CREATE INDEX IF NOT EXISTS idx_otel_traces_time ON otel_traces(recorded_at)"

    # add_column has no "if not exists" form here, so consult the live schema
    # first; otherwise a second run would fail on a duplicate column while
    # every other statement in this block is safely repeatable.
    unless schema(:activity_events).any? { |column, _info| column == :prompt_id }
      alter_table(:activity_events) { add_column :prompt_id, String }
    end
    run "CREATE INDEX IF NOT EXISTS idx_activity_events_prompt ON activity_events(prompt_id)"
  end

  # Reverses `up`: removes the activity_events.prompt_id index and column,
  # then drops the three otel_* tables. Each step tolerates already-missing
  # objects.
  down do
    run "DROP INDEX IF EXISTS idx_activity_events_prompt"
    # Mirror the guard in `up` so rolling back a partially applied migration
    # does not raise on a column that was never added.
    if schema(:activity_events).any? { |column, _info| column == :prompt_id }
      alter_table(:activity_events) { drop_column :prompt_id }
    end
    drop_table?(:otel_traces)
    drop_table?(:otel_events)
    drop_table?(:otel_metrics)
  end
end