gitmem-mcp 1.0.14 → 1.0.15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -7,6 +7,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
7
7
 
8
8
  ## [Unreleased]
9
9
 
10
+ ## [1.0.15] - 2026-02-16
11
+
12
+ ### Fixed
13
+ - **Thread dedup without API key**: Dedup silently fell back to exact text match when no embedding API key (OpenAI/OpenRouter/Ollama) was set — which is the default for free tier users. Near-duplicate threads with the same topic but different wording slipped through. Added zero-dependency token overlap coefficient as a middle tier (threshold 0.6, lowered to 0.4 when threads share an issue prefix like `OD-692:`). Also upgraded `deduplicateThreadList` with the same logic. +18 unit tests.
14
+
10
15
  ## [1.0.12] - 2026-02-16
11
16
 
12
17
  ### Fixed
@@ -1,14 +1,16 @@
1
1
  /**
2
2
  * Thread Deduplication Service (Phase 3)
3
3
  *
4
- * Pure functions for detecting duplicate threads by embedding similarity
5
- * or normalized text equality. Zero I/O — all Supabase and embedding
6
- * calls live in the caller (create-thread.ts).
4
+ * Pure functions for detecting duplicate threads by embedding similarity,
5
+ * token overlap, or normalized text equality. Zero I/O — all Supabase
6
+ * and embedding calls live in the caller (create-thread.ts).
7
7
  *
8
8
  * Strategy:
9
9
  * 1. If embedding available: cosine similarity > 0.85 → duplicate
10
- * 2. If embedding unavailable: normalized text equality → duplicate
11
- * 3. If no existing threads: skip check
10
+ * 2. Token overlap coefficient > 0.6 → duplicate (no API key needed)
11
+ * - Lowered to 0.4 when both threads share an issue prefix (e.g., OD-692:)
12
+ * 3. Normalized text equality → duplicate
13
+ * 4. If no existing threads: skip check
12
14
  */
13
15
  import type { ThreadObject } from "../types/index.js";
14
16
  export interface ThreadWithEmbedding {
@@ -21,9 +23,11 @@ export interface DedupResult {
21
23
  matched_thread_id: string | null;
22
24
  matched_text: string | null;
23
25
  similarity: number | null;
24
- method: "embedding" | "text_normalization" | "skipped";
26
+ method: "embedding" | "token_overlap" | "text_normalization" | "skipped";
25
27
  }
26
28
  export declare const DEDUP_SIMILARITY_THRESHOLD = 0.85;
29
+ export declare const TOKEN_OVERLAP_THRESHOLD = 0.6;
30
+ export declare const TOKEN_OVERLAP_ISSUE_PREFIX_THRESHOLD = 0.4;
27
31
  /**
28
32
  * Check if new thread text is a semantic duplicate of any existing open thread.
29
33
  *
@@ -43,7 +47,23 @@ export declare function cosineSimilarity(a: number[], b: number[]): number;
43
47
  */
44
48
  export declare function normalizeText(text: string): string;
45
49
  /**
46
- * Deduplicate a thread list by both ID and normalized text.
50
+ * Tokenize text into content words for overlap comparison.
51
+ * Lowercase, split on any character other than a-z, 0-9, or hyphen, then drop single-character tokens and stop words.
52
+ */
53
+ export declare function tokenize(text: string): Set<string>;
54
+ /**
55
+ * Overlap coefficient: |intersection| / min(|A|, |B|).
56
+ * Handles the common case where one thread is a shorter variant of another.
57
+ * Returns 0 if either set is empty.
58
+ */
59
+ export declare function tokenOverlap(a: Set<string>, b: Set<string>): number;
60
+ /**
61
+ * Extract a leading issue prefix like "OD-692" or "PROJ-123" from the start of thread text (matched case-insensitively, returned uppercased).
62
+ * Returns null if no prefix found.
63
+ */
64
+ export declare function extractIssuePrefix(text: string): string | null;
65
+ /**
66
+ * Deduplicate a thread list by ID, normalized text, and token overlap.
47
67
  * First-seen wins. Skips empty-text threads. Does not mutate input.
48
68
  *
49
69
  * Applied at every thread loading/merging exit point to guarantee
@@ -1,17 +1,28 @@
1
1
  /**
2
2
  * Thread Deduplication Service (Phase 3)
3
3
  *
4
- * Pure functions for detecting duplicate threads by embedding similarity
5
- * or normalized text equality. Zero I/O — all Supabase and embedding
6
- * calls live in the caller (create-thread.ts).
4
+ * Pure functions for detecting duplicate threads by embedding similarity,
5
+ * token overlap, or normalized text equality. Zero I/O — all Supabase
6
+ * and embedding calls live in the caller (create-thread.ts).
7
7
  *
8
8
  * Strategy:
9
9
  * 1. If embedding available: cosine similarity > 0.85 → duplicate
10
- * 2. If embedding unavailable: normalized text equality → duplicate
11
- * 3. If no existing threads: skip check
10
+ * 2. Token overlap coefficient > 0.6 → duplicate (no API key needed)
11
+ * - Lowered to 0.4 when both threads share an issue prefix (e.g., OD-692:)
12
+ * 3. Normalized text equality → duplicate
13
+ * 4. If no existing threads: skip check
12
14
  */
13
15
  // ---------- Constants ----------
14
16
  export const DEDUP_SIMILARITY_THRESHOLD = 0.85;
17
+ export const TOKEN_OVERLAP_THRESHOLD = 0.6;
18
+ export const TOKEN_OVERLAP_ISSUE_PREFIX_THRESHOLD = 0.4;
19
+ const STOP_WORDS = new Set([
20
+ "a", "an", "the", "and", "or", "but", "in", "on", "at", "to", "for",
21
+ "of", "with", "by", "from", "is", "it", "be", "as", "was", "are",
22
+ "been", "being", "have", "has", "had", "do", "does", "did", "will",
23
+ "that", "this", "not", "no", "so", "if", "its", "also", "into",
24
+ "than", "then", "can", "just", "about", "up", "out", "still",
25
+ ]);
15
26
  // ---------- Core ----------
16
27
  /**
17
28
  * Check if new thread text is a semantic duplicate of any existing open thread.
@@ -60,6 +71,36 @@ export function checkDuplicate(newText, newEmbedding, existingThreads) {
60
71
  method: "embedding",
61
72
  };
62
73
  }
74
+ // Token overlap check (works without any API key)
75
+ const newTokens = tokenize(newText);
76
+ const newPrefix = extractIssuePrefix(newText);
77
+ if (newTokens.size > 0) {
78
+ let bestOverlap = -1;
79
+ let bestThread = null;
80
+ for (const thread of existingThreads) {
81
+ const existingTokens = tokenize(thread.text);
82
+ if (existingTokens.size === 0)
83
+ continue;
84
+ const overlap = tokenOverlap(newTokens, existingTokens);
85
+ const existingPrefix = extractIssuePrefix(thread.text);
86
+ const threshold = newPrefix && existingPrefix && newPrefix === existingPrefix
87
+ ? TOKEN_OVERLAP_ISSUE_PREFIX_THRESHOLD
88
+ : TOKEN_OVERLAP_THRESHOLD;
89
+ if (overlap > threshold && overlap > bestOverlap) {
90
+ bestOverlap = overlap;
91
+ bestThread = thread;
92
+ }
93
+ }
94
+ if (bestThread && bestOverlap > 0) {
95
+ return {
96
+ is_duplicate: true,
97
+ matched_thread_id: bestThread.thread_id,
98
+ matched_text: bestThread.text,
99
+ similarity: round(bestOverlap, 4),
100
+ method: "token_overlap",
101
+ };
102
+ }
103
+ }
63
104
  // Text normalization fallback (conservative: exact match only)
64
105
  const normalizedNew = normalizeText(newText);
65
106
  for (const thread of existingThreads) {
@@ -106,13 +147,49 @@ export function normalizeText(text) {
106
147
  .trim()
107
148
  .replace(/[.!?;:]+$/, "");
108
149
  }
150
+ /**
151
+ * Tokenize text into content words for overlap comparison.
152
+ * Lowercase, split on any character other than a-z, 0-9, or hyphen, then drop single-character tokens and stop words.
153
+ */
154
+ export function tokenize(text) {
155
+ const words = text
156
+ .toLowerCase()
157
+ .split(/[^a-z0-9-]+/)
158
+ .filter((w) => w.length > 1 && !STOP_WORDS.has(w));
159
+ return new Set(words);
160
+ }
161
+ /**
162
+ * Overlap coefficient: |intersection| / min(|A|, |B|).
163
+ * Handles the common case where one thread is a shorter variant of another.
164
+ * Returns 0 if either set is empty.
165
+ */
166
+ export function tokenOverlap(a, b) {
167
+ if (a.size === 0 || b.size === 0)
168
+ return 0;
169
+ let intersection = 0;
170
+ const smaller = a.size <= b.size ? a : b;
171
+ const larger = a.size <= b.size ? b : a;
172
+ for (const word of smaller) {
173
+ if (larger.has(word))
174
+ intersection++;
175
+ }
176
+ return intersection / Math.min(a.size, b.size);
177
+ }
178
+ /**
179
+ * Extract a leading issue prefix like "OD-692" or "PROJ-123" from the start of thread text (matched case-insensitively, returned uppercased).
180
+ * Returns null if no prefix found.
181
+ */
182
+ export function extractIssuePrefix(text) {
183
+ const match = text.match(/^([A-Z]+-\d+)/i);
184
+ return match ? match[1].toUpperCase() : null;
185
+ }
109
186
  function round(value, decimals) {
110
187
  const factor = 10 ** decimals;
111
188
  return Math.round(value * factor) / factor;
112
189
  }
113
190
  // ---------- List Deduplication ----------
114
191
  /**
115
- * Deduplicate a thread list by both ID and normalized text.
192
+ * Deduplicate a thread list by ID, normalized text, and token overlap.
116
193
  * First-seen wins. Skips empty-text threads. Does not mutate input.
117
194
  *
118
195
  * Applied at every thread loading/merging exit point to guarantee
@@ -120,19 +197,44 @@ function round(value, decimals) {
120
197
  */
121
198
  export function deduplicateThreadList(threads) {
122
199
  const seenIds = new Set();
123
- const seenText = new Set();
124
200
  const result = [];
201
+ // Track accepted threads with their tokens for overlap comparison
202
+ const accepted = [];
125
203
  for (const thread of threads) {
126
204
  const text = thread.text || "";
127
205
  const key = normalizeText(text);
128
206
  // Skip empty-text threads
129
207
  if (!key)
130
208
  continue;
131
- // Skip if we've seen this ID or this normalized text
132
- if (seenIds.has(thread.id) || seenText.has(key))
209
+ // Skip if we've seen this ID
210
+ if (seenIds.has(thread.id))
211
+ continue;
212
+ // Compare against accepted threads: exact normalized text first, then token overlap
213
+ const tokens = tokenize(text);
214
+ const prefix = extractIssuePrefix(text);
215
+ let isDuplicate = false;
216
+ for (const prev of accepted) {
217
+ // Exact normalized text match
218
+ if (normalizeText(prev.text) === key) {
219
+ isDuplicate = true;
220
+ break;
221
+ }
222
+ // Token overlap match
223
+ if (tokens.size > 0 && prev.tokens.size > 0) {
224
+ const overlap = tokenOverlap(tokens, prev.tokens);
225
+ const threshold = prefix && prev.prefix && prefix === prev.prefix
226
+ ? TOKEN_OVERLAP_ISSUE_PREFIX_THRESHOLD
227
+ : TOKEN_OVERLAP_THRESHOLD;
228
+ if (overlap > threshold) {
229
+ isDuplicate = true;
230
+ break;
231
+ }
232
+ }
233
+ }
234
+ if (isDuplicate)
133
235
  continue;
134
236
  seenIds.add(thread.id);
135
- seenText.add(key);
237
+ accepted.push({ text, tokens, prefix });
136
238
  result.push(thread);
137
239
  }
138
240
  return result;
@@ -33,7 +33,7 @@ export interface CreateThreadResult {
33
33
  deduplicated?: boolean;
34
34
  /** Phase 3: dedup gate details */
35
35
  dedup?: {
36
- method: "embedding" | "text_normalization" | "skipped";
36
+ method: "embedding" | "token_overlap" | "text_normalization" | "skipped";
37
37
  similarity: number | null;
38
38
  matched_thread_id: string | null;
39
39
  };
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "gitmem-mcp",
3
- "version": "1.0.14",
3
+ "version": "1.0.15",
4
4
  "description": "Institutional memory for AI coding agents. Memory that compounds.",
5
5
  "type": "module",
6
6
  "main": "dist/index.js",