npm - gitmem-mcp - Versions diffs - 1.0.14 → 1.0.15 - Mend

gitmem-mcp 1.0.14 → 1.0.15

This diff shows the changes between two publicly available versions of a package, each of which has been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Files changed (5) hide show

package/CHANGELOG.md +5 -0
package/dist/services/thread-dedup.d.ts +27 -7
package/dist/services/thread-dedup.js +112 -10
package/dist/tools/create-thread.d.ts +1 -1
package/package.json +1 -1

package/CHANGELOG.md CHANGED Viewed

@@ -7,6 +7,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ## [Unreleased]
+## [1.0.15] - 2026-02-16
+### Fixed
+- **Thread dedup without API key**: Dedup silently fell back to exact text match when no embedding API key (OpenAI/OpenRouter/Ollama) was set — which is the default for free tier users. Near-duplicate threads with the same topic but different wording slipped through. Added zero-dependency token overlap coefficient as a middle tier (threshold 0.6, lowered to 0.4 when threads share an issue prefix like `OD-692:`). Also upgraded `deduplicateThreadList` with the same logic. +18 unit tests.
 ## [1.0.12] - 2026-02-16
 ### Fixed

package/dist/services/thread-dedup.d.ts CHANGED Viewed

@@ -1,14 +1,16 @@
 /**
  * Thread Deduplication Service (Phase 3)
  *
- * Pure functions for detecting duplicate threads by embedding similarity
- * or normalized text equality. Zero I/O — all Supabase and embedding
- * calls live in the caller (create-thread.ts).
+ * Pure functions for detecting duplicate threads by embedding similarity,
+ * token overlap, or normalized text equality. Zero I/O — all Supabase
+ * and embedding calls live in the caller (create-thread.ts).
  *
  * Strategy:
  *   1. If embedding available: cosine similarity > 0.85 → duplicate
- *   2. If embedding unavailable: normalized text equality → duplicate
- *   3. If no existing threads: skip check
+ *   2. Token overlap coefficient > 0.6 → duplicate (no API key needed)
+ *      - Lowered to 0.4 when both threads share an issue prefix (e.g., OD-692:)
+ *   3. Normalized text equality → duplicate
+ *   4. If no existing threads: skip check
  */
 import type { ThreadObject } from "../types/index.js";
 export interface ThreadWithEmbedding {
@@ -21,9 +23,11 @@ export interface DedupResult {
     matched_thread_id: string | null;
     matched_text: string | null;
     similarity: number | null;
-    method: "embedding" | "text_normalization" | "skipped";
+    method: "embedding" | "token_overlap" | "text_normalization" | "skipped";
 }
 export declare const DEDUP_SIMILARITY_THRESHOLD = 0.85; // Embedding cosine similarity above this value marks a duplicate.
+export declare const TOKEN_OVERLAP_THRESHOLD = 0.6; // Token overlap coefficient above this value marks a duplicate.
+export declare const TOKEN_OVERLAP_ISSUE_PREFIX_THRESHOLD = 0.4; // Lower overlap cutoff used when both texts share the same issue prefix.
 /**
  * Check if new thread text is a semantic duplicate of any existing open thread.
  *
@@ -43,7 +47,23 @@ export declare function cosineSimilarity(a: number[], b: number[]): number;
  */
 export declare function normalizeText(text: string): string;
 /**
- * Deduplicate a thread list by both ID and normalized text.
+ * Tokenize text into a set of distinct content words for overlap comparison.
+ * Lowercases, splits on runs of characters other than a-z, 0-9 and "-", then drops one-character tokens and stop words.
+ */
+export declare function tokenize(text: string): Set<string>;
+/**
+ * Overlap coefficient: |intersection| / min(|A|, |B|), a value between 0 and 1.
+ * Dividing by the smaller set lets a shorter variant of a longer thread still score high.
+ * Returns 0 if either set is empty.
+ */
+export declare function tokenOverlap(a: Set<string>, b: Set<string>): number;
+/**
+ * Extract a leading issue prefix such as "OD-692" or "PROJ-123" (letters, hyphen, digits) from the very start of the text.
+ * Matching is case-insensitive and the result is upper-cased; returns null if the text does not start with one.
+ */
+export declare function extractIssuePrefix(text: string): string | null;
+/**
+ * Deduplicate a thread list by ID, normalized text, and token overlap.
  * First-seen wins. Skips empty-text threads. Does not mutate input.
  *
  * Applied at every thread loading/merging exit point to guarantee

package/dist/services/thread-dedup.js CHANGED Viewed

@@ -1,17 +1,28 @@
 /**
  * Thread Deduplication Service (Phase 3)
  *
- * Pure functions for detecting duplicate threads by embedding similarity
- * or normalized text equality. Zero I/O — all Supabase and embedding
- * calls live in the caller (create-thread.ts).
+ * Pure functions for detecting duplicate threads by embedding similarity,
+ * token overlap, or normalized text equality. Zero I/O — all Supabase
+ * and embedding calls live in the caller (create-thread.ts).
  *
  * Strategy:
  *   1. If embedding available: cosine similarity > 0.85 → duplicate
- *   2. If embedding unavailable: normalized text equality → duplicate
- *   3. If no existing threads: skip check
+ *   2. Token overlap coefficient > 0.6 → duplicate (no API key needed)
+ *      - Lowered to 0.4 when both threads share an issue prefix (e.g., OD-692:)
+ *   3. Normalized text equality → duplicate
+ *   4. If no existing threads: skip check
  */
 // ---------- Constants ----------
 export const DEDUP_SIMILARITY_THRESHOLD = 0.85; // embedding cosine similarity above this value marks a duplicate
+export const TOKEN_OVERLAP_THRESHOLD = 0.6; // token overlap coefficient above this value marks a duplicate
+export const TOKEN_OVERLAP_ISSUE_PREFIX_THRESHOLD = 0.4; // lower overlap cutoff used when both texts share the same issue prefix
+const STOP_WORDS = new Set([ // common English function words that tokenize() discards before comparing
+    "a", "an", "the", "and", "or", "but", "in", "on", "at", "to", "for",
+    "of", "with", "by", "from", "is", "it", "be", "as", "was", "are",
+    "been", "being", "have", "has", "had", "do", "does", "did", "will",
+    "that", "this", "not", "no", "so", "if", "its", "also", "into",
+    "than", "then", "can", "just", "about", "up", "out", "still",
+]);
 // ---------- Core ----------
 /**
  * Check if new thread text is a semantic duplicate of any existing open thread.
@@ -60,6 +71,36 @@ export function checkDuplicate(newText, newEmbedding, existingThreads) {
             method: "embedding",
         };
     }
+    // Token overlap check (works without any API key)
+    const newTokens = tokenize(newText);
+    const newPrefix = extractIssuePrefix(newText);
+    if (newTokens.size > 0) {
+        let bestOverlap = -1;
+        let bestThread = null;
+        for (const thread of existingThreads) {
+            const existingTokens = tokenize(thread.text);
+            if (existingTokens.size === 0)
+                continue;
+            const overlap = tokenOverlap(newTokens, existingTokens);
+            const existingPrefix = extractIssuePrefix(thread.text);
+            const threshold = newPrefix && existingPrefix && newPrefix === existingPrefix
+                ? TOKEN_OVERLAP_ISSUE_PREFIX_THRESHOLD
+                : TOKEN_OVERLAP_THRESHOLD;
+            if (overlap > threshold && overlap > bestOverlap) {
+                bestOverlap = overlap;
+                bestThread = thread;
+            }
+        }
+        if (bestThread && bestOverlap > 0) {
+            return {
+                is_duplicate: true,
+                matched_thread_id: bestThread.thread_id,
+                matched_text: bestThread.text,
+                similarity: round(bestOverlap, 4),
+                method: "token_overlap",
+            };
+        }
+    }
     // Text normalization fallback (conservative: exact match only)
     const normalizedNew = normalizeText(newText);
     for (const thread of existingThreads) {
@@ -106,13 +147,49 @@ export function normalizeText(text) {
         .trim()
         .replace(/[.!?;:]+$/, "");
 }
+/**
+ * Tokenize text into a set of distinct content words for overlap comparison.
+ * Lowercases, splits on runs of characters other than a-z, 0-9 and "-", then drops one-character tokens and stop words.
+ */
+export function tokenize(text) {
+    const words = text
+        .toLowerCase()
+        .split(/[^a-z0-9-]+/) // hyphens stay inside tokens, so an issue ID such as od-692 remains a single token
+        .filter((w) => w.length > 1 && !STOP_WORDS.has(w)); // the length check also removes empty strings that split() yields at the edges
+    return new Set(words);
+}
+/**
+ * Overlap coefficient: |intersection| / min(|A|, |B|), a value between 0 and 1.
+ * Dividing by the smaller set lets a shorter variant of a longer thread still score high.
+ * Returns 0 if either set is empty.
+ */
+export function tokenOverlap(a, b) {
+    if (a.size === 0 || b.size === 0)
+        return 0; // empty input: this guard also avoids dividing by zero below
+    let intersection = 0;
+    const smaller = a.size <= b.size ? a : b; // iterate the smaller set so the loop does the fewest membership checks
+    const larger = a.size <= b.size ? b : a;
+    for (const word of smaller) {
+        if (larger.has(word))
+            intersection++;
+    }
+    return intersection / Math.min(a.size, b.size); // the divisor equals smaller.size
+}
+/**
+ * Extract a leading issue prefix such as "OD-692" or "PROJ-123" (letters, hyphen, digits) from the very start of the text.
+ * Matching is case-insensitive and the result is upper-cased; returns null if the text does not start with one.
+ */
+export function extractIssuePrefix(text) {
+    const match = text.match(/^([A-Z]+-\d+)/i); // anchored at index 0: whitespace or any other text before the ID means no match
+    return match ? match[1].toUpperCase() : null;
+}
 function round(value, decimals) { // rounds value to the given number of decimal places
     const factor = 10 ** decimals;
     return Math.round(value * factor) / factor; // scale up, round to the nearest integer, scale back down
 }
 // ---------- List Deduplication ----------
 /**
- * Deduplicate a thread list by both ID and normalized text.
+ * Deduplicate a thread list by ID, normalized text, and token overlap.
  * First-seen wins. Skips empty-text threads. Does not mutate input.
  *
  * Applied at every thread loading/merging exit point to guarantee
@@ -120,19 +197,44 @@ function round(value, decimals) {
  */
 export function deduplicateThreadList(threads) {
     const seenIds = new Set();
-    const seenText = new Set();
     const result = [];
+    // Track accepted threads with their tokens for overlap comparison
+    const accepted = [];
     for (const thread of threads) {
         const text = thread.text || "";
         const key = normalizeText(text);
         // Skip empty-text threads
         if (!key)
             continue;
-        // Skip if we've seen this ID or this normalized text
-        if (seenIds.has(thread.id) || seenText.has(key))
+        // Skip if we've seen this ID
+        if (seenIds.has(thread.id))
+            continue;
+        // Check exact text match against accepted threads
+        const tokens = tokenize(text);
+        const prefix = extractIssuePrefix(text);
+        let isDuplicate = false;
+        for (const prev of accepted) {
+            // Exact normalized text match
+            if (normalizeText(prev.text) === key) {
+                isDuplicate = true;
+                break;
+            }
+            // Token overlap match
+            if (tokens.size > 0 && prev.tokens.size > 0) {
+                const overlap = tokenOverlap(tokens, prev.tokens);
+                const threshold = prefix && prev.prefix && prefix === prev.prefix
+                    ? TOKEN_OVERLAP_ISSUE_PREFIX_THRESHOLD
+                    : TOKEN_OVERLAP_THRESHOLD;
+                if (overlap > threshold) {
+                    isDuplicate = true;
+                    break;
+                }
+            }
+        }
+        if (isDuplicate)
             continue;
         seenIds.add(thread.id);
-        seenText.add(key);
+        accepted.push({ text, tokens, prefix });
         result.push(thread);
     }
     return result;

package/dist/tools/create-thread.d.ts CHANGED Viewed

@@ -33,7 +33,7 @@ export interface CreateThreadResult {
     deduplicated?: boolean;
     /** Phase 3: dedup gate details */
     dedup?: {
-        method: "embedding" | "text_normalization" | "skipped";
+        method: "embedding" | "token_overlap" | "text_normalization" | "skipped";
         similarity: number | null;
         matched_thread_id: string | null;
     };

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "gitmem-mcp",
-  "version": "1.0.14",
+  "version": "1.0.15",
   "description": "Institutional memory for AI coding agents. Memory that compounds.",
   "type": "module",
   "main": "dist/index.js",