npm - specmem-hardwicksoftware - Versions diffs - 3.7.35 → 3.7.38 - Mend

specmem-hardwicksoftware 3.7.35 → 3.7.38

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Files changed (71)

package/CHANGELOG.md +34 -0
package/README.md +11 -15
package/bin/specmem-autoclaude.cjs +12 -1
package/bin/specmem-cli.cjs +1077 -11
package/bin/specmem-console.cjs +890 -63
package/bootstrap.cjs +10 -2
package/claude-hooks/agent-loading-hook.cjs +16 -16
package/claude-hooks/agent-loading-hook.js +28 -21
package/claude-hooks/agent-type-matcher.js +1 -1
package/claude-hooks/background-completion-silencer.js +1 -1
package/claude-hooks/file-claim-enforcer.cjs +37 -36
package/claude-hooks/output-cleaner.cjs +1 -1
package/claude-hooks/refusal-detector-hook.cjs +53 -0
package/claude-hooks/settings.json +64 -4
package/claude-hooks/smart-search-interceptor.js +1 -1
package/claude-hooks/specmem-search-enforcer.cjs +2 -11
package/claude-hooks/specmem-team-member-inject.js +1 -1
package/claude-hooks/specmem-unified-hook.py +1 -1
package/claude-hooks/subagent-loading-hook.cjs +1 -1
package/claude-hooks/task-progress-hook.cjs +7 -7
package/claude-hooks/task-progress-hook.js +3 -3
package/claude-hooks/team-comms-enforcer.cjs +113 -47
package/claude-hooks/use-code-pointers.cjs +1 -1
package/dist/claude-sessions/sessionParser.js +5 -0
package/dist/cli/deploy-to-claude.js +9 -2
package/dist/codebase/codebaseIndexer.js +48 -17
package/dist/codebase/exclusions.js +3 -4
package/dist/codebase/index.js +4 -0
package/dist/codebase/pdfExtractor.js +298 -0
package/dist/dashboard/api/taskTeamMembers.js +2 -2
package/dist/db/bigBrainMigrations.js +29 -0
package/dist/hooks/hookManager.js +4 -4
package/dist/hooks/teamFramingCli.js +1 -1
package/dist/hooks/teamMemberPrepromptHook.js +5 -5
package/dist/index.js +49 -12
package/dist/init/claudeConfigInjector.js +27 -8
package/dist/installer/autoInstall.js +7 -1
package/dist/mcp/compactionProxy.js +1052 -192
package/dist/mcp/compactionProxyDaemon.js +112 -37
package/dist/mcp/contextVault.js +439 -0
package/dist/mcp/embeddingServerManager.js +151 -17
package/dist/mcp/mcpProtocolHandler.js +6 -1
package/dist/mcp/miniCOTServerManager.js +82 -8
package/dist/mcp/specMemServer.js +45 -10
package/dist/mcp/toolRegistry.js +6 -0
package/dist/startup/startupIndexing.js +14 -0
package/dist/team-members/taskOrchestrator.js +3 -3
package/dist/team-members/taskTeamMemberLogger.js +2 -2
package/dist/tools/goofy/deployTeamMember.js +3 -3
package/dist/tools/goofy/digInTheVault.js +81 -0
package/dist/tools/goofy/findCodePointers.js +17 -0
package/dist/tools/goofy/findWhatISaid.js +19 -0
package/dist/tools/goofy/stashTheGoods.js +56 -0
package/dist/tools/teamMemberDeployer.js +2 -2
package/dist/watcher/changeHandler.js +65 -8
package/dist/watcher/changeQueue.js +20 -1
package/embedding-sandbox/frankenstein-embeddings.py +4 -3
package/embedding-sandbox/mini-cot-service.py +11 -13
package/embedding-sandbox/pdf-text-extract.py +208 -0
package/package.json +1 -1
package/scripts/deploy-hooks.cjs +12 -4
package/scripts/fast-batch-embedder.cjs +2 -2
package/scripts/force-retry.cjs +34 -0
package/scripts/global-postinstall.cjs +97 -4
package/scripts/poetic-abliteration.cjs +379 -0
package/scripts/refusal-enforcer.cjs +88 -0
package/scripts/specmem-init.cjs +222 -41
package/specmem/model-config.json +6 -6
package/specmem/supervisord.conf +1 -1
package/svg-sections/readme-token-compaction.svg +246 -0
package/claude-hooks/agent-chooser-hook.js +0 -179

package/dist/team-members/taskTeamMemberLogger.js CHANGED Viewed

@@ -1,7 +1,7 @@
 /**
- * taskTeamMemberLogger.ts - Logs  Code Task team member activity to SpecMem database
+ * taskTeamMemberLogger.ts - Logs  Code Agent team member activity to SpecMem database
  *
- * yo fr fr this bridges the gap between  Code's Task tool and SpecMem tracking
+ * yo fr fr this bridges the gap between  Code's Agent tool and SpecMem tracking
  *
  * Problem: Task-deployed team members are invisible to SpecMem dashboard
  * Solution: Log team member activity before/after Task deployment

package/dist/tools/goofy/deployTeamMember.js CHANGED Viewed

@@ -1,8 +1,8 @@
 /**
- * deployTeamMember - The Task tool but actually works with MCP
+ * deployTeamMember - The Agent tool but actually works with MCP
  *
  * Spawns team members in screen sessions with full SpecMem MCP access.
- * This is basically skidding  Code's Task tool but making it not suck.
+ * This is basically skidding  Code's Agent tool but making it not suck.
  */
 import { deployTeamMember as deployTeamMemberImpl } from '../teamMemberDeployer.js';
 import { logger } from '../../utils/logger.js';
@@ -10,7 +10,7 @@ export class DeployTeamMember {
     name = 'deployTeamMember';
     description = `Deploy a team member with full SpecMem MCP access.
-This is like the Task tool but actually works - spawned team members get full access
+This is like the Agent tool but actually works - spawned team members get full access
 to all SpecMem MCP tools including team member communication (sayToTeamMember, listenForMessages,
 sendHeartbeat, getActiveTeamMembers).

package/dist/tools/goofy/digInTheVault.js ADDED Viewed

@@ -0,0 +1,81 @@
+/**
+ * digInTheVault - search the context vault for stashed content
+ *
+ * when you need to find something in all that stashed content
+ * without loading it all into context. BM25-ranked via tsvector.
+ *
+ * supports:
+ *   - query search across all vaults (project-scoped)
+ *   - targeted search within a specific vault_id
+ *   - full dump of a vault_id (get_all mode)
+ *   - stats mode (no query = return vault statistics)
+ */
+import { digInTheVault as doTheDig, getFullStash, getVaultStats } from '../../mcp/contextVault.js';
+export class DigInTheVault {
+  name = 'dig_in_the_vault';
+  description = 'Search the context vault for previously stashed content. Returns BM25-ranked chunks matching your query. Use vault_id to search a specific stash, or omit to search all. Use get_all:true to retrieve everything from a vault.';
+  inputSchema = {
+    type: 'object',
+    properties: {
+      query: {
+        type: 'string',
+        description: 'What to search for in the vault'
+      },
+      vault_id: {
+        type: 'string',
+        description: 'Search within a specific stash (the id from vault_receipt)'
+      },
+      limit: {
+        type: 'number',
+        description: 'Max results to return (default: 10)',
+        default: 10
+      },
+      get_all: {
+        type: 'boolean',
+        description: 'Retrieve ALL chunks for a vault_id in order (ignores query)',
+        default: false
+      }
+    }
+  };
+  async execute(params) {
+    const { query, vault_id, limit = 10, get_all = false } = params;
+    try {
+      // get_all mode: dump full stash content
+      if (get_all && vault_id) {
+        const full = await getFullStash(vault_id);
+        if (!full) return `<vault_dig ok="false" error="vault ${vault_id} not found or expired"/>`;
+        return full;
+      }
+      // No query = return vault stats
+      if (!query) {
+        const stats = await getVaultStats();
+        return JSON.stringify(stats, null, 2);
+      }
+      // BM25-ranked search
+      const results = await doTheDig(query, vault_id || null, limit);
+      if (results.length === 0) {
+        return vault_id
+          ? `<vault_dig ok="false" vault_id="${vault_id}" query="${query}" error="no matches — try different terms or get_all:true"/>`
+          : `<vault_dig ok="false" query="${query}" error="no matches in any vault"/>`;
+      }
+      // Format results compactly
+      const lines = results.map((r, i) => {
+        const header = `── [${i + 1}] vault:${r.vault_id} chunk:${r.chunk_idx} rank:${parseFloat(r.rank).toFixed(3)} ──`;
+        return `${header}\n${r.content}`;
+      });
+      return lines.join('\n\n');
+    } catch (err) {
+      return `<vault_dig ok="false" error="${err.message}"/>`;
+    }
+  }
+}

package/dist/tools/goofy/findCodePointers.js CHANGED Viewed

@@ -410,6 +410,23 @@ export class FindCodePointers {
                 attribution: SPECMEM_ATTRIBUTION
             };
         }
+        // FAST FAIL: Invalid query detection - return immediately with format hint
+        const queryTrimmed = params.query.trim();
+        // Check for natural language questions (not code terms)
+        const isQuestion = /^(how|what|why|when|where|who|can|does|is|should|would|could)\s/i.test(queryTrimmed);
+        if (isQuestion && queryTrimmed.length > 50) {
+            // Likely a natural language question instead of code terms - fail fast with hint
+            logger.warn({ query: params.query }, '[CodePointers] Invalid query format detected - failing fast with hint');
+            return {
+                results: [],
+                query: params.query,
+                total_found: 0,
+                search_type: 'semantic',
+                attribution: SPECMEM_ATTRIBUTION,
+                error: 'INVALID_QUERY_FORMAT',
+                error_hint: 'Query appears to be a natural language question. For code search, use CODE TERMS like "admin login auth" NOT "how does admin login work". See /specmem/HOW_TO_USE.md for examples.'
+            };
+        }
         // MODE SELECTION: Return options if user wants to choose
         if (params.galleryMode === 'ask') {
             return this.returnModeOptions(params.query);

package/dist/tools/goofy/findWhatISaid.js CHANGED Viewed

@@ -690,6 +690,25 @@ export class FindWhatISaid {
                     highlights: []
                 }];
         }
+        // FAST FAIL: Whitespace-only or clearly invalid query
+        const queryTrimmed = params.query.trim();
+        if (queryTrimmed.length < 2) {
+            logger.warn({ query: params.query }, '[find_memory] Query too short - failing fast');
+            return [{
+                memory: {
+                    id: 'error',
+                    content: 'Query too short. Provide at least 2 characters for meaningful search.',
+                    createdAt: new Date(),
+                    updatedAt: new Date(),
+                    tags: ['error'],
+                    importance: 'low',
+                    memoryType: 'semantic',
+                    metadata: { _isError: true }
+                },
+                similarity: 0,
+                highlights: []
+            }];
+        }
         logger.debug({ query: params.query, limit: params.limit }, 'searching memories fr');
         // Broadcast COT to dashboard
         cotStart('find_memory', params.query || 'browsing');

package/dist/tools/goofy/stashTheGoods.js ADDED Viewed

@@ -0,0 +1,56 @@
+/**
+ * stashTheGoods - manually vault content for later retrieval
+ *
+ * when you got data that's too thicc for the context window
+ * but you might need it later, stash it in the vault fr
+ *
+ * auto-stash handles most cases, but this tool lets claude
+ * manually stash anything it wants to keep searchable
+ */
+import { stashTheGoods as doTheStash, formatVaultReceipt } from '../../mcp/contextVault.js';
+export class StashTheGoods {
+  name = 'stash_the_goods';
+  description = 'Stash large content in the context vault for token-efficient retrieval later. Content is chunked, indexed with full-text search, and retrievable via dig_in_the_vault. Auto-expires after 24h.';
+  inputSchema = {
+    type: 'object',
+    properties: {
+      content: {
+        type: 'string',
+        description: 'The content to stash in the vault'
+      },
+      tool_name: {
+        type: 'string',
+        description: 'Which tool produced this content (for tracking)'
+      },
+      tags: {
+        type: 'array',
+        items: { type: 'string' },
+        description: 'Optional tags for categorization'
+      }
+    },
+    required: ['content']
+  };
+  async execute(params) {
+    const { content, tool_name, tags } = params;
+    if (!content || content.length === 0) {
+      return '<vault_receipt ok="false" error="nothing to stash fr"/>';
+    }
+    try {
+      const result = await doTheStash(content, {
+        tool: tool_name || 'manual_stash',
+        tags: tags || [],
+        manual: true,
+      });
+      return formatVaultReceipt(result);
+    } catch (err) {
+      return `<vault_receipt ok="false" error="${err.message}"/>`;
+    }
+  }
+}

package/dist/tools/teamMemberDeployer.js CHANGED Viewed

@@ -1,7 +1,7 @@
 /**
  * Team Member Deployer Tool
  *
- * "Skidded" version of  Code's Task tool that actually works with MCP
+ * "Skidded" version of  Code's Agent tool that actually works with MCP
  * Spawns team members with full SpecMem MCP access
  *
  * Now integrates with TeamCommsService for team-based team member coordination:
@@ -23,7 +23,7 @@ import { getAgentsJson, isValidAgentType } from './agentDefinitions.js';
 /**
  * Deploy a team member with full SpecMem MCP access
  *
- * This is basically the Task tool but it actually fucking works
+ * This is basically the Agent tool but it actually fucking works
  * because we spawn team members with MCP configured
  *
  * Now with team communication support for coordinated multi-team-member work

package/dist/watcher/changeHandler.js CHANGED Viewed

@@ -21,6 +21,7 @@ import { getCoordinator } from '../coordination/integration.js';
 import { isMinifiedOrBundled, isBinaryFile, EXCLUSION_CONFIG } from '../codebase/exclusions.js';
 import { getProjectPathForInsert } from '../services/ProjectContext.js';
 import { getEmbeddingTimeout } from '../config/embeddingTimeouts.js';
+import { extractPdfText, isPdfFile } from '../codebase/pdfExtractor.js';
 // Retry helper for transient embedding failures (timeout, socket reset, etc.)
 const WATCHER_MAX_RETRIES = parseInt(process.env['SPECMEM_WATCHER_RETRIES'] || '2');
 async function withWatcherRetry(operation, filePath) {
@@ -132,6 +133,51 @@ export class AutoUpdateTheMemories {
                 this.stats.filesSkipped++;
                 return;
             }
+            // PDF files: extract text via PyMuPDF instead of reading as UTF-8
+            if (isPdfFile(event.path)) {
+                const pdfResult = await extractPdfText(event.path);
+                if (!pdfResult || !pdfResult.text) {
+                    logger.debug({ path: event.relativePath }, 'PDF extraction failed or empty — skipping');
+                    this.stats.filesSkipped++;
+                    return;
+                }
+                // Use extracted PDF text as content for the standard metadata flow
+                const metadata = await this.extractFileMetadata(event.path, event.relativePath, pdfResult.text);
+                if (metadata.size > this.config.maxFileSizeBytes) {
+                    this.stats.filesSkipped++;
+                    return;
+                }
+                const content = pdfResult.text;
+                const existingMemory = await this.findMemoryByContentHash(metadata.contentHash);
+                if (existingMemory) {
+                    this.stats.filesSkipped++;
+                    return;
+                }
+                let embedding;
+                const WATCHER_EMBEDDING_TIMEOUT = getEmbeddingTimeout('fileWatcher');
+                try {
+                    embedding = await withWatcherRetry(async () => {
+                        return new Promise((resolve, reject) => {
+                            const timeoutId = setTimeout(() => reject(new Error('Embedding timeout for PDF')), WATCHER_EMBEDDING_TIMEOUT);
+                            this.config.embeddingProvider.generateEmbedding(content)
+                                .then(result => { clearTimeout(timeoutId); resolve(result); })
+                                .catch(error => { clearTimeout(timeoutId); reject(error); });
+                        });
+                    }, event.path);
+                } catch (embErr) {
+                    logger.warn({ path: event.relativePath, error: embErr.message }, 'PDF embedding failed — storing without embedding');
+                }
+                await this.storeMemory({
+                    content,
+                    metadata,
+                    embedding,
+                    tags: ['codebase', 'auto-ingested', 'pdf'],
+                });
+                this.stats.filesIngested++;
+                logger.info({ path: event.relativePath, pages: pdfResult.pages, chars: pdfResult.chars }, 'PDF file indexed');
+                this.coordinator.emitFileAdded(event.path, event.relativePath, metadata.size);
+                return;
+            }
             // check if binary
             if (await isBinaryFile(event.path)) {
                 logger.debug({ path: event.relativePath }, 'skipping binary file');
@@ -250,14 +296,25 @@ export class AutoUpdateTheMemories {
                 this.stats.filesSkipped++;
                 return;
             }
-            // FIX MED-13: Check binary before extractFileMetadata (same as handleFileAdded)
-            if (await isBinaryFile(event.path)) {
-                logger.debug({ path: event.relativePath }, 'skipping binary file update');
-                this.stats.filesSkipped++;
-                return;
+            // PDF files: extract text via PyMuPDF
+            let pdfContent = null;
+            if (isPdfFile(event.path)) {
+                const pdfResult = await extractPdfText(event.path);
+                if (!pdfResult || !pdfResult.text) {
+                    this.stats.filesSkipped++;
+                    return;
+                }
+                pdfContent = pdfResult.text;
+            } else {
+                // FIX MED-13: Check binary before extractFileMetadata (same as handleFileAdded)
+                if (await isBinaryFile(event.path)) {
+                    logger.debug({ path: event.relativePath }, 'skipping binary file update');
+                    this.stats.filesSkipped++;
+                    return;
+                }
             }
             // extract new metadata (FIX 7.04: content included to avoid double read)
-            const metadata = await this.extractFileMetadata(event.path, event.relativePath);
+            const metadata = await this.extractFileMetadata(event.path, event.relativePath, pdfContent);
             // check file size
             if (metadata.size > this.config.maxFileSizeBytes) {
                 logger.warn({
@@ -419,9 +476,9 @@ export class AutoUpdateTheMemories {
     /**
      * extractFileMetadata - reads file and generates metadata
      */
-    async extractFileMetadata(path, relativePath) {
+    async extractFileMetadata(path, relativePath, preExtractedContent) {
         const stats = await fs.stat(path);
-        const content = await fs.readFile(path, 'utf-8');
+        const content = preExtractedContent || await fs.readFile(path, 'utf-8');
         const contentHash = this.hashContent(content);
         return {
             path,

package/dist/watcher/changeQueue.js CHANGED Viewed

@@ -22,6 +22,7 @@ export class QueueTheChangesUp {
     config;
     queue = [];
     processing = false;
+    paused = false; // pause queue processing without stopping (e.g. during background indexing)
     processingInterval = null;
     changeHandler;
     // deduplication map: path -> latest queued change
@@ -162,13 +163,31 @@ export class QueueTheChangesUp {
             logger.debug({ cancelledTimeouts: cancelledCount }, 'cancelled pending retry timeouts');
         }
     }
+    /**
+     * pause - temporarily halt batch processing without stopping the queue.
+     * Changes still enqueue but won't be processed until resume().
+     * Used during background indexing to avoid resource contention.
+     */
+    pause(reason = '') {
+        if (this.paused) return;
+        this.paused = true;
+        logger.info({ reason, pendingCount: this.queue.length }, 'queue PAUSED');
+    }
+    /**
+     * resume - resume batch processing after pause
+     */
+    resume() {
+        if (!this.paused) return;
+        this.paused = false;
+        logger.info({ pendingCount: this.queue.length }, 'queue RESUMED');
+    }
     /**
      * processBatch - processes a batch of changes
      *
      * nah bruh processing this whole batch at once
      */
     async processBatch() {
-        if (this.queue.length === 0) {
+        if (this.paused || this.queue.length === 0) {
             return;
         }
         logger.debug({

package/embedding-sandbox/frankenstein-embeddings.py CHANGED Viewed

@@ -3916,20 +3916,21 @@ class EmbeddingServer:
             # Extract requestId for persistent socket multiplexing
             request_id = request.get('requestId')
-            # Send "processing" heartbeat ONLY for embedding requests (not health/kys/ready)
+            # Send "working" status ONLY for embedding requests (not health/kys/ready)
+            # "working" means actually processing your query (vs "processing" which was ambiguous)
             # Meta requests expect a single response - sending a heartbeat first breaks the protocol
             # and causes clients to read the heartbeat as the actual response
             if not is_meta_request:
                 text = request.get('text') or request.get('texts')
                 text_length = len(text) if isinstance(text, str) else (len(text) if text else 0)
                 heartbeat = {
-                    'status': 'processing',
+                    'status': 'working',
                     'text_length': text_length
                 }
                 if request_id:
                     heartbeat['requestId'] = request_id
                 hb_ok = self._safe_sendall(conn, json.dumps(heartbeat).encode('utf-8') + b'\n')
-                print(f"[WORKER {thread_name}] Heartbeat sent ok={hb_ok}", file=sys.stderr, flush=True)
+                print(f"[WORKER {thread_name}] Working status sent ok={hb_ok}", file=sys.stderr, flush=True)
             # Process - each thread gets its own call stack
             print(f"[WORKER {thread_name}] Calling handle_request(type={req_type})...", file=sys.stderr, flush=True)

package/embedding-sandbox/mini-cot-service.py CHANGED Viewed

@@ -391,20 +391,18 @@ class ModelManager:
                     file_name="model_quantized.onnx"
                 )
             else:
-                # Fallback: download PyTorch model from HuggingFace
-                print(f"🧠 No local model, downloading {self.model_name}...", file=sys.stderr)
-                from transformers import AutoModelForCausalLM
-                cache_dir = '/tmp/mini-cot-models'
-                os.makedirs(cache_dir, exist_ok=True)
-                self.torch_model = AutoModelForCausalLM.from_pretrained(
-                    self.model_name,
-                    cache_dir=cache_dir,
-                    torch_dtype=torch.float32,
-                    low_cpu_mem_usage=True,
-                    trust_remote_code=True
+                # No local model found — refuse to download from the internet
+                search_paths = [
+                    os.environ.get('SPECMEM_MODEL_CACHE', '(not set)'),
+                    os.path.join(os.path.dirname(__file__), '..', 'models', 'pythia-410m-onnx-quant'),
+                    '/app/models/pythia-onnx-quant',
+                ]
+                raise RuntimeError(
+                    f"Local ONNX model not found. Searched:\n"
+                    + "\n".join(f"  - {p}" for p in search_paths)
+                    + "\n\nRun `specmem init` to download models via Git LFS release tarball."
+                    + "\nSpecMem will NOT download models from the internet at runtime."
                 )
-                self.torch_model.eval()
-                torch.set_grad_enabled(False)
             self.torch_loaded = True
             print(f"🧠 Generation model loaded for crawl analysis", file=sys.stderr)

package/embedding-sandbox/pdf-text-extract.py ADDED Viewed

@@ -0,0 +1,208 @@
+#!/usr/bin/env python3
+"""
+pdf-text-extract.py — PDF text extraction for SpecMem codebase indexing
+Uses PyMuPDF (fitz) for instant digital PDF text extraction (0.003s/page).
+Falls back to Tesseract OCR via PyMuPDF's built-in integration for scanned pages.
+Usage:
+    # Single file mode
+    python3 pdf-text-extract.py <pdf_path> [--max-pages N] [--language LANG]
+    # Batch mode (JSONL — one result per line, one Python startup for N PDFs)
+    python3 pdf-text-extract.py --batch file1.pdf file2.pdf ... [--max-pages N]
+Output (JSON/JSONL to stdout):
+    {"path": "/abs/path.pdf", "text": "...", "pages": 5, "scanned_pages": [3], "chars": 12345}
+    {"path": "/abs/path2.pdf", "error": "..."}
+"""
+import sys
+import json
+import os
+import argparse
+# ---------------------------------------------------------------------------
+# Auto-install pymupdf if missing (matches frankenstein-embeddings pattern)
+# ---------------------------------------------------------------------------
+def _ensure_pymupdf():
+    try:
+        import pymupdf
+        return pymupdf
+    except ImportError:
+        pass
+    # Try legacy import name
+    try:
+        import fitz
+        return fitz
+    except ImportError:
+        pass
+    # Auto-install
+    try:
+        import subprocess
+        sys.stderr.write('[pdf-text-extract] pymupdf not found, installing...\n')
+        subprocess.check_call(
+            [sys.executable, '-m', 'pip', 'install', '--quiet', 'pymupdf'],
+            stdout=subprocess.DEVNULL
+        )
+        try:
+            import pymupdf
+            return pymupdf
+        except ImportError:
+            import fitz
+            return fitz
+    except Exception as e:
+        _error_exit(f'Failed to install pymupdf: {e}')
+def _error_exit(msg):
+    """Print error JSON and exit."""
+    print(json.dumps({'error': str(msg)}, ensure_ascii=False))
+    sys.exit(1)
+def _is_scanned_page(page, text):
+    """
+    Heuristic: page is likely scanned if:
+    1. Extracted text is very short (< 50 chars after stripping)
+    2. Page has images covering >60% of page area
+    """
+    stripped = text.strip()
+    if len(stripped) > 50:
+        return False
+    try:
+        images = page.get_image_info()
+        if not images:
+            return False
+        page_area = abs(page.rect)
+        if page_area == 0:
+            return False
+        image_area = 0
+        for img in images:
+            if 'bbox' in img:
+                try:
+                    import pymupdf
+                    r = pymupdf.Rect(img['bbox'])
+                except (ImportError, Exception):
+                    import fitz
+                    r = fitz.Rect(img['bbox'])
+                image_area += abs(r)
+        return (image_area / page_area) >= 0.6
+    except Exception:
+        return False
+def _ocr_page(page, language='eng'):
+    """
+    Attempt Tesseract OCR on a scanned page via PyMuPDF's built-in integration.
+    Returns extracted text or empty string if tesseract unavailable.
+    """
+    try:
+        tp = page.get_textpage_ocr(language=language, dpi=300)
+        return page.get_text(textpage=tp).strip()
+    except Exception as e:
+        msg = str(e).lower()
+        if 'tesseract' in msg or 'not installed' in msg or 'not found' in msg:
+            # Tesseract not installed — skip OCR, return what we have
+            sys.stderr.write(f'[pdf-text-extract] Tesseract not available, skipping OCR for scanned page\n')
+            return ''
+        # Other error — still don't crash
+        sys.stderr.write(f'[pdf-text-extract] OCR failed: {e}\n')
+        return ''
+def extract_pdf(pdf_path, max_pages=100, language='eng'):
+    """
+    Extract text from PDF using PyMuPDF.
+    Digital pages: instant text extraction.
+    Scanned pages: Tesseract OCR fallback.
+    """
+    pymupdf = _ensure_pymupdf()
+    if not os.path.isfile(pdf_path):
+        return {'error': f'File not found: {pdf_path}'}
+    try:
+        doc = pymupdf.open(pdf_path)
+    except Exception as e:
+        msg = str(e).lower()
+        if 'password' in msg or 'encrypt' in msg:
+            return {'error': f'PDF is password-protected: {pdf_path}'}
+        return {'error': f'Failed to open PDF: {e}'}
+    total_pages = len(doc)
+    process_count = min(total_pages, max_pages)
+    truncated = total_pages > max_pages
+    texts = []
+    scanned_pages = []
+    for i in range(process_count):
+        page = doc[i]
+        text = page.get_text().strip()
+        if _is_scanned_page(page, text):
+            # Try OCR
+            ocr_text = _ocr_page(page, language)
+            if ocr_text:
+                text = ocr_text
+                scanned_pages.append(i + 1)  # 1-indexed
+            # If OCR also empty, keep whatever minimal text we got
+        if text:
+            if process_count > 1:
+                texts.append(f'--- Page {i + 1} ---\n{text}')
+            else:
+                texts.append(text)
+    doc.close()
+    full_text = '\n\n'.join(texts)
+    result = {
+        'text': full_text,
+        'pages': process_count,
+        'chars': len(full_text),
+    }
+    if scanned_pages:
+        result['scanned_pages'] = scanned_pages
+    if truncated:
+        result['truncated'] = True
+        result['total_pages'] = total_pages
+    return result
+def main():
+    parser = argparse.ArgumentParser(description='Extract text from PDF files')
+    parser.add_argument('pdf_path', nargs='?', help='Path to the PDF file (single mode)')
+    parser.add_argument('--batch', nargs='+', metavar='PDF',
+                        help='Batch mode: extract multiple PDFs (JSONL output, one line per PDF)')
+    parser.add_argument('--max-pages', type=int, default=100,
+                        help='Maximum pages to process per PDF (default: 100)')
+    parser.add_argument('--language', default='eng',
+                        help='Tesseract language for OCR fallback (default: eng)')
+    args = parser.parse_args()
+    if args.batch:
+        # Batch mode — JSONL output, one result per line
+        # Single Python startup for N PDFs (avoids repeated interpreter overhead)
+        for pdf_path in args.batch:
+            result = extract_pdf(pdf_path, args.max_pages, args.language)
+            result['path'] = pdf_path
+            print(json.dumps(result, ensure_ascii=False), flush=True)
+    elif args.pdf_path:
+        # Single file mode
+        result = extract_pdf(args.pdf_path, args.max_pages, args.language)
+        result['path'] = args.pdf_path
+        print(json.dumps(result, ensure_ascii=False))
+    else:
+        parser.print_help()
+        sys.exit(1)
+if __name__ == '__main__':
+    main()

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "specmem-hardwicksoftware",
-  "version": "3.7.35",
+  "version": "3.7.38",
   "type": "module",
   "description": "Your Claude Code sessions don't have to start from scratch anymore — SpecMem gives your AI real memory. It won't forget your conversations, your code, or your architecture decisions between sessions. That's the whole point. Semantic code indexing that actually works: TypeScript, JavaScript, Python, Go, Rust, Java, Kotlin, C, C++, HTML and more. It doesn't just track functions — it gets classes, methods, fields, constants, enums, macros, imports, structs, the whole codebase graph. There's chat memory too, powered by pgvector embeddings. You've also got token compression, team coordination, multi-agent comms, and file watching built in. 74+ MCP tools. Runs on PostgreSQL + Docker. It's kind of a big deal. justcalljon.pro",
   "main": "dist/index.js",