RubyGems - claude_memory - Versions diffs - 0.10.0 → 0.12.0 - Mend

claude_memory 0.10.0 → 0.12.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (72) hide show

checksums.yaml +4 -4
data/.claude/memory.sqlite3 +0 -0
data/.claude/rules/claude_memory.generated.md +42 -64
data/.claude/skills/release/SKILL.md +44 -6
data/.claude/skills/study-repo/SKILL.md +15 -0
data/.claude-plugin/commands/audit-memory.md +68 -0
data/.claude-plugin/marketplace.json +1 -1
data/.claude-plugin/plugin.json +1 -1
data/CHANGELOG.md +70 -0
data/CLAUDE.md +20 -5
data/README.md +64 -2
data/db/migrations/018_add_otel_telemetry.rb +81 -0
data/docs/1_0_punchlist.md +522 -89
data/docs/GETTING_STARTED.md +3 -1
data/docs/api_stability.md +341 -0
data/docs/architecture.md +3 -3
data/docs/audit_runbook.md +209 -0
data/docs/claude_monitoring.md +956 -0
data/docs/dashboard.md +23 -3
data/docs/improvements.md +329 -5
data/docs/influence/ai-memory-systems-2026.md +403 -0
data/docs/memory_audit_2026-05-21.md +303 -0
data/docs/plugin.md +1 -1
data/docs/quality_review.md +35 -0
data/lib/claude_memory/audit/checks.rb +239 -0
data/lib/claude_memory/audit/finding.rb +33 -0
data/lib/claude_memory/audit/runner.rb +73 -0
data/lib/claude_memory/commands/audit_command.rb +117 -0
data/lib/claude_memory/commands/dashboard_command.rb +2 -1
data/lib/claude_memory/commands/digest_command.rb +95 -3
data/lib/claude_memory/commands/hook_command.rb +27 -2
data/lib/claude_memory/commands/import_auto_memory_command.rb +180 -0
data/lib/claude_memory/commands/initializers/hooks_configurator.rb +7 -4
data/lib/claude_memory/commands/otel_command.rb +240 -0
data/lib/claude_memory/commands/registry.rb +5 -1
data/lib/claude_memory/commands/show_command.rb +90 -0
data/lib/claude_memory/commands/stats_command.rb +94 -2
data/lib/claude_memory/configuration.rb +60 -0
data/lib/claude_memory/core/fact_query_builder.rb +1 -0
data/lib/claude_memory/dashboard/api.rb +8 -0
data/lib/claude_memory/dashboard/index.html +140 -1
data/lib/claude_memory/dashboard/prompt_journey.rb +48 -0
data/lib/claude_memory/dashboard/server.rb +86 -0
data/lib/claude_memory/dashboard/telemetry.rb +156 -0
data/lib/claude_memory/dashboard/trust.rb +180 -11
data/lib/claude_memory/deprecations.rb +106 -0
data/lib/claude_memory/distill/bare_conclusion_detector.rb +71 -0
data/lib/claude_memory/distill/reference_material_detector.rb +37 -4
data/lib/claude_memory/hook/auto_memory_mirror.rb +7 -3
data/lib/claude_memory/hook/context_injector.rb +11 -2
data/lib/claude_memory/hook/handler.rb +142 -1
data/lib/claude_memory/mcp/tool_definitions.rb +3 -3
data/lib/claude_memory/otel/attributes.rb +118 -0
data/lib/claude_memory/otel/constants.rb +32 -0
data/lib/claude_memory/otel/ingestor.rb +54 -0
data/lib/claude_memory/otel/otlp_json_envelope.rb +254 -0
data/lib/claude_memory/otel/prompt_scope.rb +108 -0
data/lib/claude_memory/otel/settings_writer.rb +122 -0
data/lib/claude_memory/otel/status.rb +58 -0
data/lib/claude_memory/recall/staleness_annotator.rb +73 -0
data/lib/claude_memory/resolve/predicate_policy.rb +17 -1
data/lib/claude_memory/resolve/resolver.rb +30 -3
data/lib/claude_memory/shortcuts.rb +61 -18
data/lib/claude_memory/store/prompt_journey_query.rb +87 -0
data/lib/claude_memory/store/schema_manager.rb +1 -1
data/lib/claude_memory/store/sqlite_store.rb +136 -0
data/lib/claude_memory/sweep/maintenance.rb +31 -1
data/lib/claude_memory/sweep/sweeper.rb +6 -0
data/lib/claude_memory/templates/hooks.example.json +5 -0
data/lib/claude_memory/version.rb +1 -1
data/lib/claude_memory.rb +20 -0
metadata +28 -1

data/README.md CHANGED Viewed

@@ -140,7 +140,51 @@ File-searchable questions ("what version is this?") and one-shot code generation
 - **Claude-Powered**: Uses Claude's intelligence to extract facts (no API key needed)
 - **Token Efficient**: 10x reduction in memory queries with progressive disclosure
 - **Database Maintenance**: Compact, export, and backup commands
-- **Built-in Observability** (0.10.0+): `claude-memory dashboard` opens a local web UI with a moments feed, trust panel, conflicts dedup, knowledge index, 👍/👎 feedback, and a 30-day utilization ratio. See **[Dashboard guide →](docs/dashboard.md)**. `claude-memory digest` writes a weekly markdown report; `claude-memory census` audits the predicate vocabulary across projects.
+- **Built-in Observability** (0.10.0+): `claude-memory dashboard` opens a local web UI with a moments feed, trust panel (token budget, quality score, utilization, feedback), conflicts dedup, knowledge index, and 👍/👎 feedback. See **[Dashboard guide →](docs/dashboard.md)**. `claude-memory digest` writes a weekly markdown report (Activity, Context cost, Quality, New knowledge, Utilization, Conflicts, Feedback); `claude-memory show` prints what would be injected next SessionStart; `claude-memory census` audits the predicate vocabulary across projects.
+- **OpenTelemetry ingestion** (0.12.0+): point Claude Code's OTLP exporter at the dashboard and the new "Telemetry" tab shows per-API-call cost in USD, token usage by model, top tools by latency, and a per-prompt event waterfall. Quick setup:
+  ```bash
+  claude-memory dashboard --port 3377 &   # start the receiver
+  claude-memory otel --enable              # writes telemetry env into .claude/settings.json
+  claude-memory otel --enable-traces       # optional: include OpenTelemetry spans
+  claude-memory otel --status              # confirm metrics are flowing
+  ```
+  Only metrics and event names are captured by default — verbatim prompts and bodies stay off until you explicitly opt in via `claude-memory otel --capture-prompts`. The receiver binds to `127.0.0.1` only.
+## What's New in 0.11.0
+Five user-visible signals so you can answer "is memory still worth it?" with
+numbers, not vibes:
+- **Token budget telemetry** — every SessionStart context injection now
+  records its estimated `context_tokens`. `claude-memory stats --tokens
+  [--since DAYS]` reports p50/p95/avg/min/max plus a histogram across
+  <500 / 500-1k / 1-2k / 2-5k / 5k+ buckets so you can see the per-session
+  cost at a glance. The dashboard's Trust panel and `claude-memory digest`
+  surface the same numbers.
+- **Hallucination-rate metric** — the dashboard now scores how *clean* the
+  fact base is, not just how full it is. `Distill::BareConclusionDetector`
+  flags `decision` / `convention` facts that skipped the reason-clause
+  requirement. Trust panel shows `quality_score` (live 30-day window with
+  historical baseline beneath). `claude-memory digest` adds a Quality
+  section with rejection rate.
+- **`claude-memory show`** — new command prints what memory *would* inject
+  at the next SessionStart in plain Markdown. Footer reports fact count,
+  ~token estimate, and char count so you see the cost at a glance. Default
+  hides the raw-transcript "Pending Knowledge" dump for readability;
+  `--pending` opts in. `--source startup|resume|clear` simulates each
+  fresh-session entrypoint.
+- **First-week ROI nudge** — at SessionEnd, memory now prints
+  `memory contributed N facts this session, %used = X` for the first 10
+  sessions, then quiets. Cold-start trust signal — you don't have to know
+  about the dashboard. Opt out with `CLAUDE_MEMORY_NO_NUDGE=1`.
+- **Harm benchmark prototype** — first ClaudeMemory benchmark that
+  measures whether memory can make Claude *wrong*. Three hand-written
+  cases (stale-tech, mismatched-scope, superseded-but-undetected) under
+  `spec/benchmarks/e2e/harm_bench_spec.rb`. Real-mode run on the 0.11
+  release reported 0/3 harm; the full 10-15-case corpus + release gate
+  lands in 0.12.
 ## What's New in 0.10.0
@@ -273,11 +317,26 @@ The uninstall command removes:
 - 📊 [Dashboard](docs/dashboard.md) - Local web UI for inspection and trust signals (0.10.0+)
 - 🔧 [Plugin Setup](docs/plugin.md) - Claude Code integration
 - 🏗️ [Architecture](docs/architecture.md) - Technical deep dive
+- 🔒 [API Stability](docs/api_stability.md) - What's stable / experimental / internal across releases (0.12.0+)
 - 📝 [Changelog](CHANGELOG.md) - Release notes
 ## Benchmarks
-ClaudeMemory includes **DevMemBench**, a developer-domain benchmark suite that measures retrieval quality and truth maintenance accuracy. All offline benchmarks run locally at zero cost.
+ClaudeMemory includes **DevMemBench**, a developer-domain benchmark suite that measures retrieval quality, truth maintenance accuracy, **negative-fact harm**, and **uplift over a hand-written CLAUDE.md baseline**. All offline benchmarks run locally at zero cost; end-to-end and comparative runs use real Claude (~$5-15 per full run).
+### Does memory ever make Claude *wrong*?
+Every other benchmark measures whether memory helps. The negative-fact harm benchmark measures whether memory can hurt — injecting a stale, mis-scoped, superseded, or reference-material fact and watching Claude follow it. 13 scenarios across 4 harm classes, each with a realistic project scaffold whose actual state contradicts the wrong fact, scored best-of-3 by majority vote. The run fails the build if any scenario reliably produces a harm (>1%).
+```bash
+EVAL_MODE=real HARM_BENCH_RUNS=3 EVAL_MAX_BUDGET_USD=0.50 bundle exec rspec spec/benchmarks/e2e/harm_bench_spec.rb
+```
+**0.12 baseline (2026-05-28): 0/13 harm.** See [`spec/benchmarks/README.md`](spec/benchmarks/README.md#harm_scenariosyml-13-scenarios-full-corpus-0120) for the full corpus and methodology.
+### Is this better than a hand-written CLAUDE.md?
+The single most important question for adoption is whether dynamic retrieval beats static context injection. ClaudeMemory ships a `CLAUDE.md baseline` adapter and a comparative E2E harness for exactly this. **The numbers aren't published yet (as of 0.12):** the current harness compares static CLAUDE.md (auto-loaded into every prompt) against ClaudeMemory's MCP-tool retrieval, but in headless `claude -p` mode Claude doesn't proactively call the recall tools, so the comparison doesn't yet exercise ClaudeMemory's retrieval path fairly. Publishing that gap as a headline number would mislead. The harness fix is tracked for 0.13 — see [`docs/1_0_punchlist.md`](docs/1_0_punchlist.md) #4.
 ### Latest Results
@@ -290,6 +349,9 @@ ClaudeMemory includes **DevMemBench**, a developer-domain benchmark suite that m
 | **Hybrid Retrieval** | Recall@5 (100 queries aggregate) | **72.7%** |
 | **Hybrid Retrieval** | Recall@10 (20 hard queries) | **62.8%** |
 | **Scope Ranking** | Queries returning expected facts | **5/5** |
+| **Negative-Fact Harm (prototype)** | 0.11 baseline (3 scenarios, real Claude) | **0/3** |
+| **Negative-Fact Harm (full corpus)** | 0.12 baseline (13 scenarios, best-of-3, real Claude) | **0/13 (0.0%)** |
+| **E2E vs CLAUDE.md baseline** | 0.12 acceptance-rate delta (10 scenarios) | *deferred to 0.13 — harness doesn't exercise headless retrieval (#4)* |
 Semantic and hybrid retrieval use [fastembed-rb](https://github.com/khasinski/fastembed-rb) with the BAAI/bge-small-en-v1.5 model (384-dim, runs locally, no API key needed).

data/db/migrations/018_add_otel_telemetry.rb ADDED Viewed

@@ -0,0 +1,81 @@
+# frozen_string_literal: true
+# Migration v18: OpenTelemetry ingestion tables.
+#
+# ClaudeMemory's dashboard accepts OTLP/HTTP/JSON exports from Claude Code so
+# users can see cost-per-API-call, token usage by model, latency, and per-prompt
+# event journeys without leaving the dashboard.
+#
+# Three storage tables:
+#   - otel_metrics: numeric data points (token counts, USD cost, durations).
+#     Two value columns (value_int + value_float) preserve int64 precision for
+#     counters like token counts that can exceed 2^53, beyond which a Float
+#     (53-bit mantissa) can no longer represent every integer exactly.
+#   - otel_events: log-style records (user_prompt, tool_result, api_request,
+#     skill_activated, ...). Indexed on prompt_id for the journey UNION.
+#   - otel_traces: spans. Table ships now so the schema is forward-ready, but
+#     the dashboard's POST /v1/traces returns 501 until the user opts in via
+#     `claude-memory otel --enable-traces`.
+#
+# Plus an additive prompt_id column on activity_events so existing hook
+# events (recall, hook_ingest, hook_context) can be UNION-joined into the
+# Prompt Journey panel.
+Sequel.migration do
+  up do
+    create_table?(:otel_metrics) do
+      primary_key :id
+      String :name, null: false
+      String :value_type, null: false
+      Bignum :value_int
+      Float :value_float
+      String :unit
+      String :attributes_json, text: true
+      String :resource_json, text: true
+      String :recorded_at, null: false
+    end
+    run "CREATE INDEX IF NOT EXISTS idx_otel_metrics_name_time ON otel_metrics(name, recorded_at)"
+    run "CREATE INDEX IF NOT EXISTS idx_otel_metrics_recorded_at ON otel_metrics(recorded_at)"
+    create_table?(:otel_events) do
+      primary_key :id
+      String :event_name, null: false
+      String :session_id
+      String :prompt_id
+      String :attributes_json, text: true
+      String :resource_json, text: true
+      String :occurred_at, null: false
+    end
+    run "CREATE INDEX IF NOT EXISTS idx_otel_events_name_time ON otel_events(event_name, occurred_at)"
+    run "CREATE INDEX IF NOT EXISTS idx_otel_events_session ON otel_events(session_id)"
+    run "CREATE INDEX IF NOT EXISTS idx_otel_events_prompt ON otel_events(prompt_id)"
+    create_table?(:otel_traces) do
+      primary_key :id
+      String :trace_id, null: false
+      String :span_id, null: false
+      String :parent_span_id
+      String :name, null: false
+      String :session_id
+      String :prompt_id
+      Bignum :start_unix_nano
+      Bignum :end_unix_nano
+      Integer :duration_ms
+      String :status_code
+      String :attributes_json, text: true
+      String :resource_json, text: true
+      String :recorded_at, null: false
+    end
+    run "CREATE INDEX IF NOT EXISTS idx_otel_traces_trace ON otel_traces(trace_id)"
+    run "CREATE INDEX IF NOT EXISTS idx_otel_traces_time ON otel_traces(recorded_at)"
+    alter_table(:activity_events) { add_column :prompt_id, String }
+    run "CREATE INDEX IF NOT EXISTS idx_activity_events_prompt ON activity_events(prompt_id)"
+  end
+  down do
+    run "DROP INDEX IF EXISTS idx_activity_events_prompt"
+    alter_table(:activity_events) { drop_column :prompt_id }
+    drop_table?(:otel_traces)
+    drop_table?(:otel_events)
+    drop_table?(:otel_metrics)
+  end
+end