claude_memory 0.10.0 → 0.12.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.claude/memory.sqlite3 +0 -0
- data/.claude/rules/claude_memory.generated.md +42 -64
- data/.claude/skills/release/SKILL.md +44 -6
- data/.claude/skills/study-repo/SKILL.md +15 -0
- data/.claude-plugin/commands/audit-memory.md +68 -0
- data/.claude-plugin/marketplace.json +1 -1
- data/.claude-plugin/plugin.json +1 -1
- data/CHANGELOG.md +70 -0
- data/CLAUDE.md +20 -5
- data/README.md +64 -2
- data/db/migrations/018_add_otel_telemetry.rb +81 -0
- data/docs/1_0_punchlist.md +522 -89
- data/docs/GETTING_STARTED.md +3 -1
- data/docs/api_stability.md +341 -0
- data/docs/architecture.md +3 -3
- data/docs/audit_runbook.md +209 -0
- data/docs/claude_monitoring.md +956 -0
- data/docs/dashboard.md +23 -3
- data/docs/improvements.md +329 -5
- data/docs/influence/ai-memory-systems-2026.md +403 -0
- data/docs/memory_audit_2026-05-21.md +303 -0
- data/docs/plugin.md +1 -1
- data/docs/quality_review.md +35 -0
- data/lib/claude_memory/audit/checks.rb +239 -0
- data/lib/claude_memory/audit/finding.rb +33 -0
- data/lib/claude_memory/audit/runner.rb +73 -0
- data/lib/claude_memory/commands/audit_command.rb +117 -0
- data/lib/claude_memory/commands/dashboard_command.rb +2 -1
- data/lib/claude_memory/commands/digest_command.rb +95 -3
- data/lib/claude_memory/commands/hook_command.rb +27 -2
- data/lib/claude_memory/commands/import_auto_memory_command.rb +180 -0
- data/lib/claude_memory/commands/initializers/hooks_configurator.rb +7 -4
- data/lib/claude_memory/commands/otel_command.rb +240 -0
- data/lib/claude_memory/commands/registry.rb +5 -1
- data/lib/claude_memory/commands/show_command.rb +90 -0
- data/lib/claude_memory/commands/stats_command.rb +94 -2
- data/lib/claude_memory/configuration.rb +60 -0
- data/lib/claude_memory/core/fact_query_builder.rb +1 -0
- data/lib/claude_memory/dashboard/api.rb +8 -0
- data/lib/claude_memory/dashboard/index.html +140 -1
- data/lib/claude_memory/dashboard/prompt_journey.rb +48 -0
- data/lib/claude_memory/dashboard/server.rb +86 -0
- data/lib/claude_memory/dashboard/telemetry.rb +156 -0
- data/lib/claude_memory/dashboard/trust.rb +180 -11
- data/lib/claude_memory/deprecations.rb +106 -0
- data/lib/claude_memory/distill/bare_conclusion_detector.rb +71 -0
- data/lib/claude_memory/distill/reference_material_detector.rb +37 -4
- data/lib/claude_memory/hook/auto_memory_mirror.rb +7 -3
- data/lib/claude_memory/hook/context_injector.rb +11 -2
- data/lib/claude_memory/hook/handler.rb +142 -1
- data/lib/claude_memory/mcp/tool_definitions.rb +3 -3
- data/lib/claude_memory/otel/attributes.rb +118 -0
- data/lib/claude_memory/otel/constants.rb +32 -0
- data/lib/claude_memory/otel/ingestor.rb +54 -0
- data/lib/claude_memory/otel/otlp_json_envelope.rb +254 -0
- data/lib/claude_memory/otel/prompt_scope.rb +108 -0
- data/lib/claude_memory/otel/settings_writer.rb +122 -0
- data/lib/claude_memory/otel/status.rb +58 -0
- data/lib/claude_memory/recall/staleness_annotator.rb +73 -0
- data/lib/claude_memory/resolve/predicate_policy.rb +17 -1
- data/lib/claude_memory/resolve/resolver.rb +30 -3
- data/lib/claude_memory/shortcuts.rb +61 -18
- data/lib/claude_memory/store/prompt_journey_query.rb +87 -0
- data/lib/claude_memory/store/schema_manager.rb +1 -1
- data/lib/claude_memory/store/sqlite_store.rb +136 -0
- data/lib/claude_memory/sweep/maintenance.rb +31 -1
- data/lib/claude_memory/sweep/sweeper.rb +6 -0
- data/lib/claude_memory/templates/hooks.example.json +5 -0
- data/lib/claude_memory/version.rb +1 -1
- data/lib/claude_memory.rb +20 -0
- metadata +28 -1
data/README.md
CHANGED
|
@@ -140,7 +140,51 @@ File-searchable questions ("what version is this?") and one-shot code generation
|
|
|
140
140
|
- **Claude-Powered**: Uses Claude's intelligence to extract facts (no API key needed)
|
|
141
141
|
- **Token Efficient**: 10x reduction in memory queries with progressive disclosure
|
|
142
142
|
- **Database Maintenance**: Compact, export, and backup commands
|
|
143
|
-
- **Built-in Observability** (0.10.0+): `claude-memory dashboard` opens a local web UI with a moments feed, trust panel, conflicts dedup, knowledge index, 👍/👎 feedback
|
|
143
|
+
- **Built-in Observability** (0.10.0+): `claude-memory dashboard` opens a local web UI with a moments feed, trust panel (token budget, quality score, utilization, feedback), conflicts dedup, knowledge index, and 👍/👎 feedback. See **[Dashboard guide →](docs/dashboard.md)**. `claude-memory digest` writes a weekly markdown report (Activity, Context cost, Quality, New knowledge, Utilization, Conflicts, Feedback); `claude-memory show` prints what would be injected next SessionStart; `claude-memory census` audits the predicate vocabulary across projects.
|
|
144
|
+
- **OpenTelemetry ingestion** (0.12.0+): point Claude Code's OTLP exporter at the dashboard and the new "Telemetry" tab shows per-API-call cost in USD, token usage by model, top tools by latency, and a per-prompt event waterfall. Quick setup:
|
|
145
|
+
|
|
146
|
+
```bash
|
|
147
|
+
claude-memory dashboard --port 3377 & # start the receiver
|
|
148
|
+
claude-memory otel --enable # writes telemetry env into .claude/settings.json
|
|
149
|
+
claude-memory otel --enable-traces # optional: include OpenTelemetry spans
|
|
150
|
+
claude-memory otel --status # confirm metrics are flowing
|
|
151
|
+
```
|
|
152
|
+
|
|
153
|
+
Only metrics and event names are captured by default — verbatim prompts and bodies stay off until you explicitly opt in via `claude-memory otel --capture-prompts`. The receiver binds to `127.0.0.1` only.
|
|
154
|
+
|
|
155
|
+
## What's New in 0.11.0
|
|
156
|
+
|
|
157
|
+
Five user-visible signals so you can answer "is memory still worth it?" with
|
|
158
|
+
numbers, not vibes:
|
|
159
|
+
|
|
160
|
+
- **Token budget telemetry** — every SessionStart context injection now
|
|
161
|
+
records its estimated `context_tokens`. `claude-memory stats --tokens
|
|
162
|
+
[--since DAYS]` reports p50/p95/avg/min/max plus a histogram across
|
|
163
|
+
<500 / 500-1k / 1-2k / 2-5k / 5k+ buckets so you can see the per-session
|
|
164
|
+
cost at a glance. The dashboard's Trust panel and `claude-memory digest`
|
|
165
|
+
surface the same numbers.
|
|
166
|
+
- **Hallucination-rate metric** — the dashboard now scores how *clean* the
|
|
167
|
+
fact base is, not just how full it is. `Distill::BareConclusionDetector`
|
|
168
|
+
flags `decision` / `convention` facts that skipped the reason-clause
|
|
169
|
+
requirement. Trust panel shows `quality_score` (live 30-day window with
|
|
170
|
+
historical baseline beneath). `claude-memory digest` adds a Quality
|
|
171
|
+
section with rejection rate.
|
|
172
|
+
- **`claude-memory show`** — new command prints what memory *would* inject
|
|
173
|
+
at the next SessionStart in plain Markdown. Footer reports fact count,
|
|
174
|
+
~token estimate, and char count so you see the cost at a glance. Default
|
|
175
|
+
hides the raw-transcript "Pending Knowledge" dump for readability;
|
|
176
|
+
`--pending` opts in. `--source startup|resume|clear` simulates each
|
|
177
|
+
fresh-session entrypoint.
|
|
178
|
+
- **First-week ROI nudge** — at SessionEnd, memory now prints
|
|
179
|
+
`memory contributed N facts this session, %used = X` for the first 10
|
|
180
|
+
sessions, then quiets. Cold-start trust signal — you don't have to know
|
|
181
|
+
about the dashboard. Opt out with `CLAUDE_MEMORY_NO_NUDGE=1`.
|
|
182
|
+
- **Harm benchmark prototype** — first ClaudeMemory benchmark that
|
|
183
|
+
measures whether memory can make Claude *wrong*. Three hand-written
|
|
184
|
+
cases (stale-tech, mismatched-scope, superseded-but-undetected) under
|
|
185
|
+
`spec/benchmarks/e2e/harm_bench_spec.rb`. Real-mode run on the 0.11
|
|
186
|
+
release reported 0/3 harm; the full 10-15-case corpus + release gate
|
|
187
|
+
lands in 0.12.
|
|
144
188
|
|
|
145
189
|
## What's New in 0.10.0
|
|
146
190
|
|
|
@@ -273,11 +317,26 @@ The uninstall command removes:
|
|
|
273
317
|
- 📊 [Dashboard](docs/dashboard.md) - Local web UI for inspection and trust signals (0.10.0+)
|
|
274
318
|
- 🔧 [Plugin Setup](docs/plugin.md) - Claude Code integration
|
|
275
319
|
- 🏗️ [Architecture](docs/architecture.md) - Technical deep dive
|
|
320
|
+
- 🔒 [API Stability](docs/api_stability.md) - What's stable / experimental / internal across releases (0.12.0+)
|
|
276
321
|
- 📝 [Changelog](CHANGELOG.md) - Release notes
|
|
277
322
|
|
|
278
323
|
## Benchmarks
|
|
279
324
|
|
|
280
|
-
ClaudeMemory includes **DevMemBench**, a developer-domain benchmark suite that measures retrieval quality
|
|
325
|
+
ClaudeMemory includes **DevMemBench**, a developer-domain benchmark suite that measures retrieval quality, truth maintenance accuracy, **negative-fact harm**, and **uplift over a hand-written CLAUDE.md baseline**. All offline benchmarks run locally at zero cost; end-to-end and comparative runs use real Claude (~$5-15 per full run).
|
|
326
|
+
|
|
327
|
+
### Does memory ever make Claude *wrong*?
|
|
328
|
+
|
|
329
|
+
Every other benchmark measures whether memory helps. The negative-fact harm benchmark measures whether memory can hurt — injecting a stale, mis-scoped, superseded, or reference-material fact and watching Claude follow it. 13 scenarios across 4 harm classes, each with a realistic project scaffold whose actual state contradicts the wrong fact, scored best-of-3 by majority vote. The run fails the build if any scenario reliably produces a harm (>1%).
|
|
330
|
+
|
|
331
|
+
```bash
|
|
332
|
+
EVAL_MODE=real HARM_BENCH_RUNS=3 EVAL_MAX_BUDGET_USD=0.50 bundle exec rspec spec/benchmarks/e2e/harm_bench_spec.rb
|
|
333
|
+
```
|
|
334
|
+
|
|
335
|
+
**0.12 baseline (2026-05-28): 0/13 harm.** See [`spec/benchmarks/README.md`](spec/benchmarks/README.md#harm_scenariosyml-13-scenarios-full-corpus-0120) for the full corpus and methodology.
|
|
336
|
+
|
|
337
|
+
### Is this better than a hand-written CLAUDE.md?
|
|
338
|
+
|
|
339
|
+
The single most important question for adoption is whether dynamic retrieval beats static context injection. ClaudeMemory ships a `CLAUDE.md baseline` adapter and a comparative E2E harness for exactly this. **The numbers aren't published yet (as of 0.12):** the current harness compares static CLAUDE.md (auto-loaded into every prompt) against ClaudeMemory's MCP-tool retrieval, but in headless `claude -p` mode Claude doesn't proactively call the recall tools, so the comparison doesn't yet exercise ClaudeMemory's retrieval path fairly. Publishing that gap as a headline number would mislead. The harness fix is tracked for 0.13 — see [`docs/1_0_punchlist.md`](docs/1_0_punchlist.md) #4.
|
|
281
340
|
|
|
282
341
|
### Latest Results
|
|
283
342
|
|
|
@@ -290,6 +349,9 @@ ClaudeMemory includes **DevMemBench**, a developer-domain benchmark suite that m
|
|
|
290
349
|
| **Hybrid Retrieval** | Recall@5 (100 queries aggregate) | **72.7%** |
|
|
291
350
|
| **Hybrid Retrieval** | Recall@10 (20 hard queries) | **62.8%** |
|
|
292
351
|
| **Scope Ranking** | Queries returning expected facts | **5/5** |
|
|
352
|
+
| **Negative-Fact Harm (prototype)** | 0.11 baseline (3 scenarios, real Claude) | **0/3** |
|
|
353
|
+
| **Negative-Fact Harm (full corpus)** | 0.12 baseline (13 scenarios, best-of-3, real Claude) | **0/13 (0.0%)** |
|
|
354
|
+
| **E2E vs CLAUDE.md baseline** | 0.12 acceptance-rate delta (10 scenarios) | *deferred to 0.13 — harness doesn't exercise headless retrieval (#4)* |
|
|
293
355
|
|
|
294
356
|
Semantic and hybrid retrieval use [fastembed-rb](https://github.com/khasinski/fastembed-rb) with the BAAI/bge-small-en-v1.5 model (384-dim, runs locally, no API key needed).
|
|
295
357
|
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
# Migration v18: OpenTelemetry ingestion tables.
|
|
4
|
+
#
|
|
5
|
+
# ClaudeMemory's dashboard accepts OTLP/HTTP/JSON exports from Claude Code so
|
|
6
|
+
# users can see cost-per-API-call, token usage by model, latency, and per-prompt
|
|
7
|
+
# event journeys without leaving the dashboard.
|
|
8
|
+
#
|
|
9
|
+
# Three storage tables:
|
|
10
|
+
# - otel_metrics: numeric data points (token counts, USD cost, durations).
|
|
11
|
+
# Two value columns (value_int + value_float) preserve int64 precision for
|
|
12
|
+
# counters like token counts that can exceed 2^53, above which a Float can no longer represent every integer exactly.
|
|
13
|
+
# - otel_events: log-style records (user_prompt, tool_result, api_request,
|
|
14
|
+
# skill_activated, ...). Indexed on prompt_id for the journey UNION.
|
|
15
|
+
# - otel_traces: spans. Table ships now so the schema is forward-ready, but
|
|
16
|
+
# the dashboard's POST /v1/traces returns 501 until the user opts in via
|
|
17
|
+
# `claude-memory otel --enable-traces`.
|
|
18
|
+
#
|
|
19
|
+
# Plus an additive prompt_id column on activity_events so existing hook
|
|
20
|
+
# events (recall, hook_ingest, hook_context) can be UNION-joined into the
|
|
21
|
+
# Prompt Journey panel.
|
|
22
|
+
Sequel.migration do
  # Creates the three OTel storage tables plus their indexes, and adds a
  # prompt_id column to activity_events. Every statement is written to be
  # safely re-runnable: tables use create_table?, indexes use IF NOT EXISTS,
  # and the column add is guarded by a schema check.
  up do
    # Metric data points. Integer and float values live in separate columns
    # (value_int is a 64-bit Bignum) so large integer counters keep full
    # precision.
    create_table?(:otel_metrics) do
      primary_key :id
      String :name, null: false
      String :value_type, null: false
      Bignum :value_int
      Float :value_float
      String :unit
      String :attributes_json, text: true
      String :resource_json, text: true
      String :recorded_at, null: false
    end
    run "CREATE INDEX IF NOT EXISTS idx_otel_metrics_name_time ON otel_metrics(name, recorded_at)"
    run "CREATE INDEX IF NOT EXISTS idx_otel_metrics_recorded_at ON otel_metrics(recorded_at)"

    # Log-style event records, indexed by name/time, session, and prompt.
    create_table?(:otel_events) do
      primary_key :id
      String :event_name, null: false
      String :session_id
      String :prompt_id
      String :attributes_json, text: true
      String :resource_json, text: true
      String :occurred_at, null: false
    end
    run "CREATE INDEX IF NOT EXISTS idx_otel_events_name_time ON otel_events(event_name, occurred_at)"
    run "CREATE INDEX IF NOT EXISTS idx_otel_events_session ON otel_events(session_id)"
    run "CREATE INDEX IF NOT EXISTS idx_otel_events_prompt ON otel_events(prompt_id)"

    # Spans. Nanosecond timestamps need 64-bit storage, hence Bignum.
    create_table?(:otel_traces) do
      primary_key :id
      String :trace_id, null: false
      String :span_id, null: false
      String :parent_span_id
      String :name, null: false
      String :session_id
      String :prompt_id
      Bignum :start_unix_nano
      Bignum :end_unix_nano
      Integer :duration_ms
      String :status_code
      String :attributes_json, text: true
      String :resource_json, text: true
      String :recorded_at, null: false
    end
    run "CREATE INDEX IF NOT EXISTS idx_otel_traces_trace ON otel_traces(trace_id)"
    run "CREATE INDEX IF NOT EXISTS idx_otel_traces_time ON otel_traces(recorded_at)"

    # A bare add_column raises when the column already exists, which would
    # make this the only non-re-runnable step in the migration. Check the
    # live schema (reload: true bypasses Sequel's schema cache) first.
    unless schema(:activity_events, reload: true).any? { |column, _| column == :prompt_id }
      alter_table(:activity_events) { add_column :prompt_id, String }
    end
    run "CREATE INDEX IF NOT EXISTS idx_activity_events_prompt ON activity_events(prompt_id)"
  end

  # Reverses `up`: drops the activity_events index and column, then the OTel
  # tables in reverse creation order.
  down do
    run "DROP INDEX IF EXISTS idx_activity_events_prompt"

    # Mirror the guard in `up` so rolling back a partially applied migration
    # does not fail on a column that was never added.
    if schema(:activity_events, reload: true).any? { |column, _| column == :prompt_id }
      alter_table(:activity_events) { drop_column :prompt_id }
    end

    drop_table?(:otel_traces)
    drop_table?(:otel_events)
    drop_table?(:otel_metrics)
  end
end
|