npm - gitmem-mcp - Versions diffs - 1.0.13 → 1.0.15 - Mend

gitmem-mcp 1.0.13 → 1.0.15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

package/CHANGELOG.md +5 -0
package/bin/init-wizard.js +15 -0
package/dist/services/thread-dedup.d.ts +27 -7
package/dist/services/thread-dedup.js +112 -10
package/dist/tools/create-thread.d.ts +1 -1
package/hooks/commands/gitmem-status.md +48 -0
package/hooks/scripts/display-hook.sh +141 -0
package/package.json +1 -1

package/CHANGELOG.md CHANGED Viewed

@@ -7,6 +7,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ## [Unreleased]
+## [1.0.15] - 2026-02-16
+### Fixed
+- **Thread dedup without API key**: Dedup silently fell back to exact text match when no embedding API key (OpenAI/OpenRouter/Ollama) was set — which is the default for free tier users. Near-duplicate threads with the same topic but different wording slipped through. Added zero-dependency token overlap coefficient as a middle tier (threshold 0.6, lowered to 0.4 when threads share an issue prefix like `OD-692:`). Also upgraded `deduplicateThreadList` with the same logic. +18 unit tests.
 ## [1.0.12] - 2026-02-16
 ### Fixed

package/bin/init-wizard.js CHANGED Viewed

@@ -104,6 +104,11 @@ function buildHooks() {
       {
         matcher: "Bash",
         hooks: [
+          {
+            type: "command",
+            command: `bash ${relScripts}/credential-guard.sh`,
+            timeout: 3000,
+          },
           {
             type: "command",
             command: `bash ${relScripts}/recall-check.sh`,
@@ -111,6 +116,16 @@ function buildHooks() {
           },
         ],
       },
+      {
+        matcher: "Read",
+        hooks: [
+          {
+            type: "command",
+            command: `bash ${relScripts}/credential-guard.sh`,
+            timeout: 3000,
+          },
+        ],
+      },
       {
         matcher: "Write",
         hooks: [

package/dist/services/thread-dedup.d.ts CHANGED Viewed

@@ -1,14 +1,16 @@
 /**
  * Thread Deduplication Service (Phase 3)
  *
- * Pure functions for detecting duplicate threads by embedding similarity
- * or normalized text equality. Zero I/O — all Supabase and embedding
- * calls live in the caller (create-thread.ts).
+ * Pure functions for detecting duplicate threads by embedding similarity,
+ * token overlap, or normalized text equality. Zero I/O — all Supabase
+ * and embedding calls live in the caller (create-thread.ts).
  *
  * Strategy:
  *   1. If embedding available: cosine similarity > 0.85 → duplicate
- *   2. If embedding unavailable: normalized text equality → duplicate
- *   3. If no existing threads: skip check
+ *   2. Token overlap coefficient > 0.6 → duplicate (no API key needed)
+ *      - Lowered to 0.4 when both threads share an issue prefix (e.g., OD-692:)
+ *   3. Normalized text equality → duplicate
+ *   4. If no existing threads: skip check
  */
 import type { ThreadObject } from "../types/index.js";
 export interface ThreadWithEmbedding {
@@ -21,9 +23,11 @@ export interface DedupResult {
     matched_thread_id: string | null;
     matched_text: string | null;
     similarity: number | null;
-    method: "embedding" | "text_normalization" | "skipped";
+    method: "embedding" | "token_overlap" | "text_normalization" | "skipped";
 }
 export declare const DEDUP_SIMILARITY_THRESHOLD = 0.85;
+export declare const TOKEN_OVERLAP_THRESHOLD = 0.6;
+export declare const TOKEN_OVERLAP_ISSUE_PREFIX_THRESHOLD = 0.4;
 /**
  * Check if new thread text is a semantic duplicate of any existing open thread.
  *
@@ -43,7 +47,23 @@ export declare function cosineSimilarity(a: number[], b: number[]): number;
  */
 export declare function normalizeText(text: string): string;
 /**
- * Deduplicate a thread list by both ID and normalized text.
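+ * Tokenize text into content words for overlap comparison.
+ * Lowercase, split on runs of characters other than a-z, 0-9 and "-"
+ * (so issue keys such as "od-692" stay one token), then drop
+ * single-character tokens and stop words.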
+ */
+export declare function tokenize(text: string): Set<string>;
+/**
+ * Overlap coefficient: |intersection| / min(|A|, |B|).
+ * Handles the common case where one thread is a shorter variant of another.
+ * Returns 0 if either set is empty.
+ */
+export declare function tokenOverlap(a: Set<string>, b: Set<string>): number;
+/**
+ * Extract issue prefix like "OD-692" or "PROJ-123" from thread text.
+ * Returns null if no prefix found.
+ */
+export declare function extractIssuePrefix(text: string): string | null;
+/**
+ * Deduplicate a thread list by ID, normalized text, and token overlap.
  * First-seen wins. Skips empty-text threads. Does not mutate input.
  *
  * Applied at every thread loading/merging exit point to guarantee

package/dist/services/thread-dedup.js CHANGED Viewed

@@ -1,17 +1,28 @@
 /**
  * Thread Deduplication Service (Phase 3)
  *
- * Pure functions for detecting duplicate threads by embedding similarity
- * or normalized text equality. Zero I/O — all Supabase and embedding
- * calls live in the caller (create-thread.ts).
+ * Pure functions for detecting duplicate threads by embedding similarity,
+ * token overlap, or normalized text equality. Zero I/O — all Supabase
+ * and embedding calls live in the caller (create-thread.ts).
  *
  * Strategy:
  *   1. If embedding available: cosine similarity > 0.85 → duplicate
- *   2. If embedding unavailable: normalized text equality → duplicate
- *   3. If no existing threads: skip check
+ *   2. Token overlap coefficient > 0.6 → duplicate (no API key needed)
+ *      - Lowered to 0.4 when both threads share an issue prefix (e.g., OD-692:)
+ *   3. Normalized text equality → duplicate
+ *   4. If no existing threads: skip check
  */
 // ---------- Constants ----------
 export const DEDUP_SIMILARITY_THRESHOLD = 0.85;
+export const TOKEN_OVERLAP_THRESHOLD = 0.6;
+export const TOKEN_OVERLAP_ISSUE_PREFIX_THRESHOLD = 0.4;
+const STOP_WORDS = new Set([
+    "a", "an", "the", "and", "or", "but", "in", "on", "at", "to", "for",
+    "of", "with", "by", "from", "is", "it", "be", "as", "was", "are",
+    "been", "being", "have", "has", "had", "do", "does", "did", "will",
+    "that", "this", "not", "no", "so", "if", "its", "also", "into",
+    "than", "then", "can", "just", "about", "up", "out", "still",
+]);
 // ---------- Core ----------
 /**
  * Check if new thread text is a semantic duplicate of any existing open thread.
@@ -60,6 +71,36 @@ export function checkDuplicate(newText, newEmbedding, existingThreads) {
             method: "embedding",
         };
     }
+    // Token overlap check (works without any API key)
+    const newTokens = tokenize(newText);
+    const newPrefix = extractIssuePrefix(newText);
+    if (newTokens.size > 0) {
+        let bestOverlap = -1;
+        let bestThread = null;
+        for (const thread of existingThreads) {
+            const existingTokens = tokenize(thread.text);
+            if (existingTokens.size === 0)
+                continue;
+            const overlap = tokenOverlap(newTokens, existingTokens);
+            const existingPrefix = extractIssuePrefix(thread.text);
+            const threshold = newPrefix && existingPrefix && newPrefix === existingPrefix
+                ? TOKEN_OVERLAP_ISSUE_PREFIX_THRESHOLD
+                : TOKEN_OVERLAP_THRESHOLD;
+            if (overlap > threshold && overlap > bestOverlap) {
+                bestOverlap = overlap;
+                bestThread = thread;
+            }
+        }
+        if (bestThread && bestOverlap > 0) {
+            return {
+                is_duplicate: true,
+                matched_thread_id: bestThread.thread_id,
+                matched_text: bestThread.text,
+                similarity: round(bestOverlap, 4),
+                method: "token_overlap",
+            };
+        }
+    }
     // Text normalization fallback (conservative: exact match only)
     const normalizedNew = normalizeText(newText);
     for (const thread of existingThreads) {
@@ -106,13 +147,49 @@ export function normalizeText(text) {
         .trim()
         .replace(/[.!?;:]+$/, "");
 }
+/**
+ * Tokenize text into content words for overlap comparison.
+ * Lowercase, split on non-alphanumeric boundaries, remove stop words.
+ */
+export function tokenize(text) {
+    const words = text
+        .toLowerCase()
+        .split(/[^a-z0-9-]+/)
+        .filter((w) => w.length > 1 && !STOP_WORDS.has(w));
+    return new Set(words);
+}
+/**
+ * Overlap coefficient: |intersection| / min(|A|, |B|).
+ * Handles the common case where one thread is a shorter variant of another.
+ * Returns 0 if either set is empty.
+ */
+export function tokenOverlap(a, b) {
+    if (a.size === 0 || b.size === 0)
+        return 0;
+    let intersection = 0;
+    const smaller = a.size <= b.size ? a : b;
+    const larger = a.size <= b.size ? b : a;
+    for (const word of smaller) {
+        if (larger.has(word))
+            intersection++;
+    }
+    return intersection / Math.min(a.size, b.size);
+}
+/**
+ * Extract issue prefix like "OD-692" or "PROJ-123" from thread text.
+ * Returns null if no prefix found.
+ */
+export function extractIssuePrefix(text) {
+    const match = text.match(/^([A-Z]+-\d+)/i);
+    return match ? match[1].toUpperCase() : null;
+}
 /** Round `value` to `decimals` fractional digits using Math.round on the scaled value. */
 function round(value, decimals) {
     const scale = 10 ** decimals;
     const scaled = Math.round(value * scale);
     return scaled / scale;
 }
 // ---------- List Deduplication ----------
 /**
- * Deduplicate a thread list by both ID and normalized text.
+ * Deduplicate a thread list by ID, normalized text, and token overlap.
  * First-seen wins. Skips empty-text threads. Does not mutate input.
  *
  * Applied at every thread loading/merging exit point to guarantee
@@ -120,19 +197,44 @@ function round(value, decimals) {
  */
 export function deduplicateThreadList(threads) {
     const seenIds = new Set();
-    const seenText = new Set();
     const result = [];
+    // Track accepted threads with their tokens for overlap comparison
+    const accepted = [];
     for (const thread of threads) {
         const text = thread.text || "";
         const key = normalizeText(text);
         // Skip empty-text threads
         if (!key)
             continue;
-        // Skip if we've seen this ID or this normalized text
-        if (seenIds.has(thread.id) || seenText.has(key))
+        // Skip if we've seen this ID
+        if (seenIds.has(thread.id))
+            continue;
+        // Check exact text match against accepted threads
+        const tokens = tokenize(text);
+        const prefix = extractIssuePrefix(text);
+        let isDuplicate = false;
+        for (const prev of accepted) {
+            // Exact normalized text match
+            if (normalizeText(prev.text) === key) {
+                isDuplicate = true;
+                break;
+            }
+            // Token overlap match
+            if (tokens.size > 0 && prev.tokens.size > 0) {
+                const overlap = tokenOverlap(tokens, prev.tokens);
+                const threshold = prefix && prev.prefix && prefix === prev.prefix
+                    ? TOKEN_OVERLAP_ISSUE_PREFIX_THRESHOLD
+                    : TOKEN_OVERLAP_THRESHOLD;
+                if (overlap > threshold) {
+                    isDuplicate = true;
+                    break;
+                }
+            }
+        }
+        if (isDuplicate)
             continue;
         seenIds.add(thread.id);
-        seenText.add(key);
+        accepted.push({ text, tokens, prefix });
         result.push(thread);
     }
     return result;

package/dist/tools/create-thread.d.ts CHANGED Viewed

@@ -33,7 +33,7 @@ export interface CreateThreadResult {
     deduplicated?: boolean;
     /** Phase 3: dedup gate details */
     dedup?: {
-        method: "embedding" | "text_normalization" | "skipped";
+        method: "embedding" | "token_overlap" | "text_normalization" | "skipped";
         similarity: number | null;
         matched_thread_id: string | null;
     };

package/hooks/commands/gitmem-status.md ADDED Viewed

@@ -0,0 +1,48 @@
+---
+description: Show GitMem session status, hook activity, and recall state
+allowed-tools: ["Bash", "Read", "mcp__gitmem__gitmem-cache-status"]
+---
+# GitMem Status
+Check the current state of the GitMem hooks plugin and active session.
+## Instructions
+1. **Check if gitmem MCP is connected:**
+   Call the `mcp__gitmem__gitmem-cache-status` tool. If it returns a response, the server is connected. Report scar count and cache age from the response.
+   Do NOT use `claude mcp list` — its health check returns false negatives.
+2. **Check active session:**
+   Read `.gitmem/active-session.json` if it exists. Report:
+   - Session ID
+   - Agent identity
+   - Start time
+   - Whether scars have been surfaced (surfaced_scars timestamp)
+3. **Check hook state:**
+   Look for `/tmp/gitmem-hooks-*` directories. Report:
+   - Session start time
+   - Tool call count
+   - Last nag time
+   - Whether stop_hook_active guard is set
+4. **Check audit trail:**
+   Read `/tmp/gitmem-hooks-*/audit.jsonl` if it exists. Report:
+   - Total LOOKED events (recall/search calls)
+   - Total ACTION events (consequential actions)
+   - Whether any ACTION was taken without a prior LOOKED (potential gap)
+   - Last 5 audit entries
+5. **Summary:**
+   Present a concise status block:
+   ```
+   GitMem Status
+   ├── MCP Server: connected/disconnected
+   ├── Active Session: <id> (started <time>)
+   ├── Recall: last called <time> / never called
+   ├── Tool Calls: <count>
+   ├── Audit Trail: <looked> LOOKED / <action> ACTION events
+   │   └── Look-before-act: <yes/gap detected>
+   └── Hooks: session-start ✓ | recall-check ✓ | post-tool-use ✓ | close-check ✓
+   ```

package/hooks/scripts/display-hook.sh ADDED Viewed

@@ -0,0 +1,141 @@
+#!/bin/bash
+# GitMem Hooks Plugin — PostToolUse Display Hook
+#
+# PURPOSE: Deterministic MCP tool output display.
+# Routes formatted display directly to the terminal (bypassing the LLM)
+# and replaces the LLM's view with machine-readable data only.
+#
+# Architecture:
+#   Channel 1 (stdout, exit 0) → User sees in terminal, LLM does NOT
+#   Channel 2 (updatedMCPToolOutput) → LLM sees, user does NOT directly
+#
+# The gitmem MCP server returns responses with an optional separator:
+#   [formatted display]
+#   ═══ GITMEM_DATA ═══
+#   {"machine": "data"}
+#
+# This hook splits on that separator. If no separator exists, the entire
+# response is treated as display-only.
+#
+# Input: JSON via stdin with tool_name, tool_input, tool_response
+# Output: JSON with hookSpecificOutput.updatedMCPToolOutput (or exit 0)
+set -e
+# Read hook input from stdin
+HOOK_INPUT=$(cat -)
+# Extract tool name
+TOOL_NAME=""
+if command -v jq &>/dev/null; then
+    TOOL_NAME=$(echo "$HOOK_INPUT" | jq -r '.tool_name // empty' 2>/dev/null)
+else
+    TOOL_NAME=$(echo "$HOOK_INPUT" | node -e "
+        let d='';
+        process.stdin.on('data',c=>d+=c);
+        process.stdin.on('end',()=>{
+            try { process.stdout.write(JSON.parse(d).tool_name||''); }
+            catch(e) { process.stdout.write(''); }
+        });
+    " 2>/dev/null)
+fi
+# Only process gitmem MCP tools
+case "$TOOL_NAME" in
+    mcp__gitmem__*) ;;
+    *) exit 0 ;;
+esac
+# Extract tool response — try both field names Claude Code might use
+TOOL_RESPONSE=""
+if command -v jq &>/dev/null; then
+    TOOL_RESPONSE=$(echo "$HOOK_INPUT" | jq -r '(.tool_response // .tool_output // empty)' 2>/dev/null)
+else
+    TOOL_RESPONSE=$(echo "$HOOK_INPUT" | node -e "
+        let d='';
+        process.stdin.on('data',c=>d+=c);
+        process.stdin.on('end',()=>{
+            try {
+                const j=JSON.parse(d);
+                process.stdout.write(String(j.tool_response||j.tool_output||''));
+            } catch(e) { process.stdout.write(''); }
+        });
+    " 2>/dev/null)
+fi
+# No response to process
+if [ -z "$TOOL_RESPONSE" ] || [ "$TOOL_RESPONSE" = "null" ]; then
+    exit 0
+fi
+# ============================================================================
+# Split response into DISPLAY and MACHINE DATA
+# ============================================================================
+SEPARATOR="═══ GITMEM_DATA ═══"
+# grep looks for the separator anywhere in the text, while the awk programs
+# only react to a line that equals it exactly. If the separator appears only
+# inside a longer line, the display part is the whole response and the
+# machine part stays empty — the same result as the no-separator branch.
+if ! echo "$TOOL_RESPONSE" | grep -qF "$SEPARATOR"; then
+    # No separator — the entire response is display-only
+    DISPLAY_PART="$TOOL_RESPONSE"
+    MACHINE_PART=""
+else
+    # Display: every line above the first separator line
+    DISPLAY_PART=$(echo "$TOOL_RESPONSE" | awk -v sep="$SEPARATOR" '$0 == sep { exit } { print }')
+    # Machine data: every line below the first separator line
+    MACHINE_PART=$(echo "$TOOL_RESPONSE" | awk -v sep="$SEPARATOR" 'seen { print } $0 == sep { seen = 1 }')
+fi
+# ============================================================================
+# Strip DISPLAY PROTOCOL suffix from display portion
+# ============================================================================
+# Keep only the lines above the first horizontal-rule line; the rule and
+# everything after it (the DISPLAY PROTOCOL instructions) are dropped.
+DISPLAY_CLEAN=$(echo "$DISPLAY_PART" | awk '/^───────────────────────────────────────────────────$/ { exit } { print }')
+# If nothing non-blank preceded the rule, the result is empty (shouldn't
+# happen); show the unstripped display rather than nothing.
+[ -n "$DISPLAY_CLEAN" ] || DISPLAY_CLEAN="$DISPLAY_PART"
+# ============================================================================
+# Channel 1: Print display to stdout (user sees, LLM does not)
+# ============================================================================
+echo "$DISPLAY_CLEAN"
+# ============================================================================
+# Channel 2: Return updatedMCPToolOutput (LLM sees, user does not directly)
+# ============================================================================
+# Build the LLM-facing replacement: a one-line banner, a line break, then
+# either the machine data or (when there is none) the cleaned display text so
+# the LLM can still reference the content.
+SHORT_NAME="${TOOL_NAME#mcp__gitmem__}"
+# Inside double quotes bash keeps "\n" as a literal backslash followed by "n",
+# which would reach the LLM as those two characters. $'\n' is a real newline.
+NEWLINE=$'\n'
+if [ -n "$MACHINE_PART" ]; then
+    # Has machine data — give LLM the structured data
+    LLM_BODY="$MACHINE_PART"
+else
+    # Display-only — give LLM the display text
+    LLM_BODY="$DISPLAY_CLEAN"
+fi
+LLM_TEXT="gitmem ${SHORT_NAME} · output displayed to user${NEWLINE}${LLM_BODY}"
+# Encode as a JSON string literal (handles newlines, quotes, backslashes)
+if command -v jq &>/dev/null; then
+    LLM_JSON=$(printf '%s' "$LLM_TEXT" | jq -Rs '.')
+else
+    LLM_JSON=$(printf '%s' "$LLM_TEXT" | node -e "
+        let d='';
+        process.stdin.on('data',c=>d+=c);
+        process.stdin.on('end',()=>process.stdout.write(JSON.stringify(d)));
+    " 2>/dev/null)
+fi
+# Return the hook response with updatedMCPToolOutput.
+# NOTE(review): the display text printed earlier shares stdout with this JSON
+# document — confirm the hook runner accepts JSON preceded by plain text.
+cat <<HOOKJSON
+{
+  "hookSpecificOutput": {
+    "hookEventName": "PostToolUse",
+    "updatedMCPToolOutput": ${LLM_JSON}
+  }
+}
+HOOKJSON

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "gitmem-mcp",
-  "version": "1.0.13",
+  "version": "1.0.15",
   "description": "Institutional memory for AI coding agents. Memory that compounds.",
   "type": "module",
   "main": "dist/index.js",