gitmem-mcp 1.0.13 → 1.0.15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -7,6 +7,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
7
7
 
8
8
  ## [Unreleased]
9
9
 
10
+ ## [1.0.15] - 2026-02-16
11
+
12
+ ### Fixed
13
+ - **Thread dedup without API key**: Dedup silently fell back to exact text match when no embedding API key (OpenAI/OpenRouter/Ollama) was set — which is the default for free tier users. Near-duplicate threads with the same topic but different wording slipped through. Added zero-dependency token overlap coefficient as a middle tier (threshold 0.6, lowered to 0.4 when threads share an issue prefix like `OD-692:`). Also upgraded `deduplicateThreadList` with the same logic. +18 unit tests.
14
+
10
15
  ## [1.0.12] - 2026-02-16
11
16
 
12
17
  ### Fixed
@@ -104,6 +104,11 @@ function buildHooks() {
104
104
  {
105
105
  matcher: "Bash",
106
106
  hooks: [
107
+ {
108
+ type: "command",
109
+ command: `bash ${relScripts}/credential-guard.sh`,
110
+ timeout: 3000,
111
+ },
107
112
  {
108
113
  type: "command",
109
114
  command: `bash ${relScripts}/recall-check.sh`,
@@ -111,6 +116,16 @@ function buildHooks() {
111
116
  },
112
117
  ],
113
118
  },
119
+ {
120
+ matcher: "Read",
121
+ hooks: [
122
+ {
123
+ type: "command",
124
+ command: `bash ${relScripts}/credential-guard.sh`,
125
+ timeout: 3000,
126
+ },
127
+ ],
128
+ },
114
129
  {
115
130
  matcher: "Write",
116
131
  hooks: [
@@ -1,14 +1,16 @@
1
1
  /**
2
2
  * Thread Deduplication Service (Phase 3)
3
3
  *
4
- * Pure functions for detecting duplicate threads by embedding similarity
5
- * or normalized text equality. Zero I/O — all Supabase and embedding
6
- * calls live in the caller (create-thread.ts).
4
+ * Pure functions for detecting duplicate threads by embedding similarity,
5
+ * token overlap, or normalized text equality. Zero I/O — all Supabase
6
+ * and embedding calls live in the caller (create-thread.ts).
7
7
  *
8
8
  * Strategy:
9
9
  * 1. If embedding available: cosine similarity > 0.85 → duplicate
10
- * 2. If embedding unavailable: normalized text equality → duplicate
11
- * 3. If no existing threads: skip check
10
+ * 2. Token overlap coefficient > 0.6 → duplicate (no API key needed)
11
+ * - Lowered to 0.4 when both threads share an issue prefix (e.g., OD-692:)
12
+ * 3. Normalized text equality → duplicate
13
+ * 4. If no existing threads: skip check
12
14
  */
13
15
  import type { ThreadObject } from "../types/index.js";
14
16
  export interface ThreadWithEmbedding {
@@ -21,9 +23,11 @@ export interface DedupResult {
21
23
  matched_thread_id: string | null;
22
24
  matched_text: string | null;
23
25
  similarity: number | null;
24
- method: "embedding" | "text_normalization" | "skipped";
26
+ method: "embedding" | "token_overlap" | "text_normalization" | "skipped";
25
27
  }
26
28
  export declare const DEDUP_SIMILARITY_THRESHOLD = 0.85;
29
+ export declare const TOKEN_OVERLAP_THRESHOLD = 0.6;
30
+ export declare const TOKEN_OVERLAP_ISSUE_PREFIX_THRESHOLD = 0.4;
27
31
  /**
28
32
  * Check if new thread text is a semantic duplicate of any existing open thread.
29
33
  *
@@ -43,7 +47,23 @@ export declare function cosineSimilarity(a: number[], b: number[]): number;
43
47
  */
44
48
  export declare function normalizeText(text: string): string;
45
49
  /**
46
- * Deduplicate a thread list by both ID and normalized text.
50
+ * Tokenize text into content words for overlap comparison.
51
+ * Lowercase, split on characters other than letters, digits, and hyphens; drop single-character tokens and stop words.
52
+ */
53
+ export declare function tokenize(text: string): Set<string>;
54
+ /**
55
+ * Overlap coefficient: |intersection| / min(|A|, |B|).
56
+ * Handles the common case where one thread is a shorter variant of another.
57
+ * Returns 0 if either set is empty.
58
+ */
59
+ export declare function tokenOverlap(a: Set<string>, b: Set<string>): number;
60
+ /**
61
+ * Extract issue prefix like "OD-692" or "PROJ-123" from thread text.
62
+ * Returns null if no prefix found.
63
+ */
64
+ export declare function extractIssuePrefix(text: string): string | null;
65
+ /**
66
+ * Deduplicate a thread list by ID, normalized text, and token overlap.
47
67
  * First-seen wins. Skips empty-text threads. Does not mutate input.
48
68
  *
49
69
  * Applied at every thread loading/merging exit point to guarantee
@@ -1,17 +1,28 @@
1
1
  /**
2
2
  * Thread Deduplication Service (Phase 3)
3
3
  *
4
- * Pure functions for detecting duplicate threads by embedding similarity
5
- * or normalized text equality. Zero I/O — all Supabase and embedding
6
- * calls live in the caller (create-thread.ts).
4
+ * Pure functions for detecting duplicate threads by embedding similarity,
5
+ * token overlap, or normalized text equality. Zero I/O — all Supabase
6
+ * and embedding calls live in the caller (create-thread.ts).
7
7
  *
8
8
  * Strategy:
9
9
  * 1. If embedding available: cosine similarity > 0.85 → duplicate
10
- * 2. If embedding unavailable: normalized text equality → duplicate
11
- * 3. If no existing threads: skip check
10
+ * 2. Token overlap coefficient > 0.6 → duplicate (no API key needed)
11
+ * - Lowered to 0.4 when both threads share an issue prefix (e.g., OD-692:)
12
+ * 3. Normalized text equality → duplicate
13
+ * 4. If no existing threads: skip check
12
14
  */
13
15
  // ---------- Constants ----------
14
16
  export const DEDUP_SIMILARITY_THRESHOLD = 0.85;
17
+ export const TOKEN_OVERLAP_THRESHOLD = 0.6;
18
+ export const TOKEN_OVERLAP_ISSUE_PREFIX_THRESHOLD = 0.4;
19
+ const STOP_WORDS = new Set([
20
+ "a", "an", "the", "and", "or", "but", "in", "on", "at", "to", "for",
21
+ "of", "with", "by", "from", "is", "it", "be", "as", "was", "are",
22
+ "been", "being", "have", "has", "had", "do", "does", "did", "will",
23
+ "that", "this", "not", "no", "so", "if", "its", "also", "into",
24
+ "than", "then", "can", "just", "about", "up", "out", "still",
25
+ ]);
15
26
  // ---------- Core ----------
16
27
  /**
17
28
  * Check if new thread text is a semantic duplicate of any existing open thread.
@@ -60,6 +71,36 @@ export function checkDuplicate(newText, newEmbedding, existingThreads) {
60
71
  method: "embedding",
61
72
  };
62
73
  }
74
+ // Token overlap check (works without any API key)
75
+ const newTokens = tokenize(newText);
76
+ const newPrefix = extractIssuePrefix(newText);
77
+ if (newTokens.size > 0) {
78
+ let bestOverlap = -1;
79
+ let bestThread = null;
80
+ for (const thread of existingThreads) {
81
+ const existingTokens = tokenize(thread.text);
82
+ if (existingTokens.size === 0)
83
+ continue;
84
+ const overlap = tokenOverlap(newTokens, existingTokens);
85
+ const existingPrefix = extractIssuePrefix(thread.text);
86
+ const threshold = newPrefix && existingPrefix && newPrefix === existingPrefix
87
+ ? TOKEN_OVERLAP_ISSUE_PREFIX_THRESHOLD
88
+ : TOKEN_OVERLAP_THRESHOLD;
89
+ if (overlap > threshold && overlap > bestOverlap) {
90
+ bestOverlap = overlap;
91
+ bestThread = thread;
92
+ }
93
+ }
94
+ if (bestThread && bestOverlap > 0) {
95
+ return {
96
+ is_duplicate: true,
97
+ matched_thread_id: bestThread.thread_id,
98
+ matched_text: bestThread.text,
99
+ similarity: round(bestOverlap, 4),
100
+ method: "token_overlap",
101
+ };
102
+ }
103
+ }
63
104
  // Text normalization fallback (conservative: exact match only)
64
105
  const normalizedNew = normalizeText(newText);
65
106
  for (const thread of existingThreads) {
@@ -106,13 +147,49 @@ export function normalizeText(text) {
106
147
  .trim()
107
148
  .replace(/[.!?;:]+$/, "");
108
149
  }
150
/**
 * Tokenize text into content words for overlap comparison.
 *
 * The text is lowercased and split on every run of characters other than
 * ASCII letters, digits, and hyphens, so hyphenated identifiers such as
 * issue keys ("od-692") survive as a single token. A token is dropped when
 * it is a single character, contains no letter or digit (e.g. a bare "--"
 * separator, which would otherwise count as a shared "word"), or is listed
 * in the module-level STOP_WORDS set.
 *
 * @param {string} text - Raw thread text. Non-string values (for example a
 *   thread whose `text` field is missing) are treated as empty instead of
 *   throwing.
 * @returns {Set<string>} Unique content tokens; empty when nothing remains.
 */
export function tokenize(text) {
    if (typeof text !== "string")
        return new Set();
    // Hoisted so the pattern is built once per call, not once per word.
    const hasAlphanumeric = /[a-z0-9]/;
    const tokens = new Set();
    for (const word of text.toLowerCase().split(/[^a-z0-9-]+/)) {
        if (word.length > 1 && hasAlphanumeric.test(word) && !STOP_WORDS.has(word)) {
            tokens.add(word);
        }
    }
    return tokens;
}
161
/**
 * Overlap coefficient of two token sets: |A ∩ B| / min(|A|, |B|).
 *
 * Dividing by the smaller set's size means a short thread that is fully
 * contained in a longer one scores 1, which is the common "shorter variant
 * of the same thread" case.
 *
 * @param {Set<string>} a - First token set.
 * @param {Set<string>} b - Second token set.
 * @returns {number} A value in [0, 1]; 0 when either set is empty.
 */
export function tokenOverlap(a, b) {
    if (a.size === 0 || b.size === 0) {
        return 0;
    }
    // Walk the smaller set and probe the larger one.
    const [small, large] = a.size <= b.size ? [a, b] : [b, a];
    const shared = [...small].filter((token) => large.has(token)).length;
    return shared / small.size;
}
178
/**
 * Extract a leading issue key such as "OD-692" or "PROJ-123" from thread text.
 *
 * The key must be the first thing in the text: one or more ASCII letters, a
 * hyphen, then one or more digits. Leading whitespace is ignored so that
 * " OD-692: ..." and "OD-692: ..." yield the same prefix. Matching is
 * case-insensitive and the result is upper-cased, so "od-692" and "OD-692"
 * compare equal.
 *
 * @param {string} text - Raw thread text. Non-string values yield null
 *   instead of throwing.
 * @returns {string | null} The upper-cased issue key, or null when the text
 *   does not start with one.
 */
export function extractIssuePrefix(text) {
    if (typeof text !== "string")
        return null;
    const match = /^\s*([A-Z]+-\d+)/i.exec(text);
    return match ? match[1].toUpperCase() : null;
}
109
186
/**
 * Round a number to a fixed count of decimal places.
 *
 * @param {number} value - Number to round.
 * @param {number} decimals - Decimal places to keep.
 * @returns {number} `value` scaled by 10^decimals, rounded with Math.round,
 *   then scaled back.
 */
function round(value, decimals) {
    const scale = 10 ** decimals;
    const scaled = Math.round(value * scale);
    return scaled / scale;
}
113
190
  // ---------- List Deduplication ----------
114
191
  /**
115
- * Deduplicate a thread list by both ID and normalized text.
192
+ * Deduplicate a thread list by ID, normalized text, and token overlap.
116
193
  * First-seen wins. Skips empty-text threads. Does not mutate input.
117
194
  *
118
195
  * Applied at every thread loading/merging exit point to guarantee
@@ -120,19 +197,44 @@ function round(value, decimals) {
120
197
  */
121
198
  export function deduplicateThreadList(threads) {
122
199
  const seenIds = new Set();
123
- const seenText = new Set();
124
200
  const result = [];
201
+ // Track accepted threads with their tokens for overlap comparison
202
+ const accepted = [];
125
203
  for (const thread of threads) {
126
204
  const text = thread.text || "";
127
205
  const key = normalizeText(text);
128
206
  // Skip empty-text threads
129
207
  if (!key)
130
208
  continue;
131
- // Skip if we've seen this ID or this normalized text
132
- if (seenIds.has(thread.id) || seenText.has(key))
209
+ // Skip if we've seen this ID
210
+ if (seenIds.has(thread.id))
211
+ continue;
212
+ // Check for duplicates (exact normalized text or token overlap) against accepted threads
213
+ const tokens = tokenize(text);
214
+ const prefix = extractIssuePrefix(text);
215
+ let isDuplicate = false;
216
+ for (const prev of accepted) {
217
+ // Exact normalized text match
218
+ if (normalizeText(prev.text) === key) {
219
+ isDuplicate = true;
220
+ break;
221
+ }
222
+ // Token overlap match
223
+ if (tokens.size > 0 && prev.tokens.size > 0) {
224
+ const overlap = tokenOverlap(tokens, prev.tokens);
225
+ const threshold = prefix && prev.prefix && prefix === prev.prefix
226
+ ? TOKEN_OVERLAP_ISSUE_PREFIX_THRESHOLD
227
+ : TOKEN_OVERLAP_THRESHOLD;
228
+ if (overlap > threshold) {
229
+ isDuplicate = true;
230
+ break;
231
+ }
232
+ }
233
+ }
234
+ if (isDuplicate)
133
235
  continue;
134
236
  seenIds.add(thread.id);
135
- seenText.add(key);
237
+ accepted.push({ text, tokens, prefix });
136
238
  result.push(thread);
137
239
  }
138
240
  return result;
@@ -33,7 +33,7 @@ export interface CreateThreadResult {
33
33
  deduplicated?: boolean;
34
34
  /** Phase 3: dedup gate details */
35
35
  dedup?: {
36
- method: "embedding" | "text_normalization" | "skipped";
36
+ method: "embedding" | "token_overlap" | "text_normalization" | "skipped";
37
37
  similarity: number | null;
38
38
  matched_thread_id: string | null;
39
39
  };
@@ -0,0 +1,48 @@
1
+ ---
2
+ description: Show GitMem session status, hook activity, and recall state
3
+ allowed-tools: ["Bash", "Read", "mcp__gitmem__gitmem-cache-status"]
4
+ ---
5
+
6
+ # GitMem Status
7
+
8
+ Check the current state of the GitMem hooks plugin and active session.
9
+
10
+ ## Instructions
11
+
12
+ 1. **Check if gitmem MCP is connected:**
13
+ Call the `mcp__gitmem__gitmem-cache-status` tool. If it returns a response, the server is connected. Report scar count and cache age from the response.
14
+ Do NOT use `claude mcp list` — its health check returns false negatives.
15
+
16
+ 2. **Check active session:**
17
+ Read `.gitmem/active-session.json` if it exists. Report:
18
+ - Session ID
19
+ - Agent identity
20
+ - Start time
21
+ - Whether scars have been surfaced (surfaced_scars timestamp)
22
+
23
+ 3. **Check hook state:**
24
+ Look for `/tmp/gitmem-hooks-*` directories. Report:
25
+ - Session start time
26
+ - Tool call count
27
+ - Last nag time
28
+ - Whether stop_hook_active guard is set
29
+
30
+ 4. **Check audit trail:**
31
+ Read `/tmp/gitmem-hooks-*/audit.jsonl` if it exists. Report:
32
+ - Total LOOKED events (recall/search calls)
33
+ - Total ACTION events (consequential actions)
34
+ - Whether any ACTION was taken without a prior LOOKED (potential gap)
35
+ - Last 5 audit entries
36
+
37
+ 5. **Summary:**
38
+ Present a concise status block:
39
+ ```
40
+ GitMem Status
41
+ ├── MCP Server: connected/disconnected
42
+ ├── Active Session: <id> (started <time>)
43
+ ├── Recall: last called <time> / never called
44
+ ├── Tool Calls: <count>
45
+ ├── Audit Trail: <looked> LOOKED / <action> ACTION events
46
+ │ └── Look-before-act: <yes/gap detected>
47
+ └── Hooks: session-start ✓ | recall-check ✓ | post-tool-use ✓ | close-check ✓
48
+ ```
@@ -0,0 +1,141 @@
1
#!/bin/bash
# GitMem Hooks Plugin — PostToolUse Display Hook
#
# PURPOSE: Deterministic MCP tool output display.
# Routes formatted display directly to the terminal (bypassing the LLM)
# and replaces the LLM's view with machine-readable data only.
#
# Architecture:
#   Channel 1 (stdout, exit 0)       → User sees in terminal, LLM does NOT
#   Channel 2 (updatedMCPToolOutput) → LLM sees, user does NOT directly
#
# The gitmem MCP server returns responses with an optional separator:
#   [formatted display]
#   ═══ GITMEM_DATA ═══
#   {"machine": "data"}
#
# This hook splits on that separator. If no separator exists, the entire
# response is treated as display-only.
#
# Input:  JSON via stdin with tool_name, tool_input, tool_response
# Output: JSON with hookSpecificOutput.updatedMCPToolOutput (or exit 0)
#
# Failure policy: every parsing step fails open. If the input cannot be
# parsed, or the replacement text cannot be JSON-encoded, the hook exits 0
# without emitting a replacement instead of aborting with the parser's
# exit status.

set -e

# Read hook input from stdin
HOOK_INPUT=$(cat -)

# Extract tool name.
# `|| true` keeps a jq/node failure (malformed input) from tripping `set -e`;
# an empty TOOL_NAME then falls through to the "not a gitmem tool" exit below.
TOOL_NAME=""
if command -v jq &>/dev/null; then
  TOOL_NAME=$(printf '%s' "$HOOK_INPUT" | jq -r '.tool_name // empty' 2>/dev/null) || true
else
  TOOL_NAME=$(printf '%s' "$HOOK_INPUT" | node -e "
    let d='';
    process.stdin.on('data',c=>d+=c);
    process.stdin.on('end',()=>{
      try { process.stdout.write(JSON.parse(d).tool_name||''); }
      catch(e) { process.stdout.write(''); }
    });
  " 2>/dev/null) || true
fi

# Only process gitmem MCP tools
case "$TOOL_NAME" in
  mcp__gitmem__*) ;;
  *) exit 0 ;;
esac

# Extract tool response — try both field names Claude Code might use.
# A non-string response (object/array) is serialized as JSON in both the jq
# and the node path, so the two paths produce the same text.
TOOL_RESPONSE=""
if command -v jq &>/dev/null; then
  TOOL_RESPONSE=$(printf '%s' "$HOOK_INPUT" | jq -r '(.tool_response // .tool_output // empty)' 2>/dev/null) || true
else
  TOOL_RESPONSE=$(printf '%s' "$HOOK_INPUT" | node -e "
    let d='';
    process.stdin.on('data',c=>d+=c);
    process.stdin.on('end',()=>{
      try {
        const j=JSON.parse(d);
        const r=j.tool_response||j.tool_output||'';
        process.stdout.write(typeof r==='string'?r:JSON.stringify(r));
      } catch(e) { process.stdout.write(''); }
    });
  " 2>/dev/null) || true
fi

# No response to process
if [ -z "$TOOL_RESPONSE" ] || [ "$TOOL_RESPONSE" = "null" ]; then
  exit 0
fi

# ============================================================================
# Split response into DISPLAY and MACHINE DATA
# ============================================================================

SEPARATOR="═══ GITMEM_DATA ═══"

# Check if the separator exists as a line of its own (-x), which is the same
# condition the awk split below tests with `$0 == sep`.
if printf '%s\n' "$TOOL_RESPONSE" | grep -qxF -- "$SEPARATOR"; then
  # Split: display = everything before separator, data = everything after
  DISPLAY_PART=$(printf '%s\n' "$TOOL_RESPONSE" | awk -v sep="$SEPARATOR" '{if ($0 == sep) exit; print}')
  MACHINE_PART=$(printf '%s\n' "$TOOL_RESPONSE" | awk -v sep="$SEPARATOR" 'found{print} $0==sep{found=1}')
else
  # No separator — entire response is display-only
  DISPLAY_PART="$TOOL_RESPONSE"
  MACHINE_PART=""
fi

# ============================================================================
# Strip DISPLAY PROTOCOL suffix from display portion
# ============================================================================

# Remove the separator line and everything after it (DISPLAY PROTOCOL instructions)
DISPLAY_CLEAN=$(printf '%s\n' "$DISPLAY_PART" | awk '/^───────────────────────────────────────────────────$/{exit} {print}')

# If stripping removed everything (shouldn't happen), fall back to full display
if [ -z "$DISPLAY_CLEAN" ]; then
  DISPLAY_CLEAN="$DISPLAY_PART"
fi

# ============================================================================
# Channel 1: Print display to stdout (user sees, LLM does not)
# ============================================================================

# printf instead of echo: the text is arbitrary tool output and must not be
# interpreted as echo options (e.g. a response that starts with "-n").
printf '%s\n' "$DISPLAY_CLEAN"

# ============================================================================
# Channel 2: Return updatedMCPToolOutput (LLM sees, user does not directly)
# ============================================================================

# Build the LLM-facing replacement
SHORT_NAME=$(printf '%s\n' "$TOOL_NAME" | sed 's/^mcp__gitmem__//')
LLM_HEADER="gitmem ${SHORT_NAME} · output displayed to user"

# $'\n' is a real newline. A "\n" inside double quotes stays a literal
# backslash-n in bash and would be JSON-encoded as the two characters "\n",
# gluing the header and the payload onto one line.
if [ -n "$MACHINE_PART" ]; then
  # Has machine data — give LLM the structured data
  LLM_TEXT="${LLM_HEADER}"$'\n'"${MACHINE_PART}"
else
  # Display-only — give LLM a brief summary
  # Include the display text so LLM can still reference the content
  LLM_TEXT="${LLM_HEADER}"$'\n'"${DISPLAY_CLEAN}"
fi

# Escape for JSON embedding (handle newlines, quotes, backslashes)
LLM_JSON=""
if command -v jq &>/dev/null; then
  LLM_JSON=$(printf '%s' "$LLM_TEXT" | jq -Rs '.' 2>/dev/null) || true
else
  LLM_JSON=$(printf '%s' "$LLM_TEXT" | node -e "
    let d='';
    process.stdin.on('data',c=>d+=c);
    process.stdin.on('end',()=>process.stdout.write(JSON.stringify(d)));
  " 2>/dev/null) || true
fi

# Encoding failed (neither jq nor node produced output): emitting the
# template below would yield invalid JSON, so skip the replacement.
if [ -z "$LLM_JSON" ]; then
  exit 0
fi

# Return the hook response with updatedMCPToolOutput
cat <<HOOKJSON
{
  "hookSpecificOutput": {
    "hookEventName": "PostToolUse",
    "updatedMCPToolOutput": ${LLM_JSON}
  }
}
HOOKJSON
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "gitmem-mcp",
3
- "version": "1.0.13",
3
+ "version": "1.0.15",
4
4
  "description": "Institutional memory for AI coding agents. Memory that compounds.",
5
5
  "type": "module",
6
6
  "main": "dist/index.js",