npm - specmem-hardwicksoftware - Versions diffs - 3.7.35 → 3.7.38 - Mend

specmem-hardwicksoftware 3.7.35 → 3.7.38

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (71)

package/CHANGELOG.md +34 -0
package/README.md +11 -15
package/bin/specmem-autoclaude.cjs +12 -1
package/bin/specmem-cli.cjs +1077 -11
package/bin/specmem-console.cjs +890 -63
package/bootstrap.cjs +10 -2
package/claude-hooks/agent-loading-hook.cjs +16 -16
package/claude-hooks/agent-loading-hook.js +28 -21
package/claude-hooks/agent-type-matcher.js +1 -1
package/claude-hooks/background-completion-silencer.js +1 -1
package/claude-hooks/file-claim-enforcer.cjs +37 -36
package/claude-hooks/output-cleaner.cjs +1 -1
package/claude-hooks/refusal-detector-hook.cjs +53 -0
package/claude-hooks/settings.json +64 -4
package/claude-hooks/smart-search-interceptor.js +1 -1
package/claude-hooks/specmem-search-enforcer.cjs +2 -11
package/claude-hooks/specmem-team-member-inject.js +1 -1
package/claude-hooks/specmem-unified-hook.py +1 -1
package/claude-hooks/subagent-loading-hook.cjs +1 -1
package/claude-hooks/task-progress-hook.cjs +7 -7
package/claude-hooks/task-progress-hook.js +3 -3
package/claude-hooks/team-comms-enforcer.cjs +113 -47
package/claude-hooks/use-code-pointers.cjs +1 -1
package/dist/claude-sessions/sessionParser.js +5 -0
package/dist/cli/deploy-to-claude.js +9 -2
package/dist/codebase/codebaseIndexer.js +48 -17
package/dist/codebase/exclusions.js +3 -4
package/dist/codebase/index.js +4 -0
package/dist/codebase/pdfExtractor.js +298 -0
package/dist/dashboard/api/taskTeamMembers.js +2 -2
package/dist/db/bigBrainMigrations.js +29 -0
package/dist/hooks/hookManager.js +4 -4
package/dist/hooks/teamFramingCli.js +1 -1
package/dist/hooks/teamMemberPrepromptHook.js +5 -5
package/dist/index.js +49 -12
package/dist/init/claudeConfigInjector.js +27 -8
package/dist/installer/autoInstall.js +7 -1
package/dist/mcp/compactionProxy.js +1052 -192
package/dist/mcp/compactionProxyDaemon.js +112 -37
package/dist/mcp/contextVault.js +439 -0
package/dist/mcp/embeddingServerManager.js +151 -17
package/dist/mcp/mcpProtocolHandler.js +6 -1
package/dist/mcp/miniCOTServerManager.js +82 -8
package/dist/mcp/specMemServer.js +45 -10
package/dist/mcp/toolRegistry.js +6 -0
package/dist/startup/startupIndexing.js +14 -0
package/dist/team-members/taskOrchestrator.js +3 -3
package/dist/team-members/taskTeamMemberLogger.js +2 -2
package/dist/tools/goofy/deployTeamMember.js +3 -3
package/dist/tools/goofy/digInTheVault.js +81 -0
package/dist/tools/goofy/findCodePointers.js +17 -0
package/dist/tools/goofy/findWhatISaid.js +19 -0
package/dist/tools/goofy/stashTheGoods.js +56 -0
package/dist/tools/teamMemberDeployer.js +2 -2
package/dist/watcher/changeHandler.js +65 -8
package/dist/watcher/changeQueue.js +20 -1
package/embedding-sandbox/frankenstein-embeddings.py +4 -3
package/embedding-sandbox/mini-cot-service.py +11 -13
package/embedding-sandbox/pdf-text-extract.py +208 -0
package/package.json +1 -1
package/scripts/deploy-hooks.cjs +12 -4
package/scripts/fast-batch-embedder.cjs +2 -2
package/scripts/force-retry.cjs +34 -0
package/scripts/global-postinstall.cjs +97 -4
package/scripts/poetic-abliteration.cjs +379 -0
package/scripts/refusal-enforcer.cjs +88 -0
package/scripts/specmem-init.cjs +222 -41
package/specmem/model-config.json +6 -6
package/specmem/supervisord.conf +1 -1
package/svg-sections/readme-token-compaction.svg +246 -0
package/claude-hooks/agent-chooser-hook.js +0 -179

package/claude-hooks/task-progress-hook.cjs CHANGED Viewed

@@ -1,6 +1,6 @@
 #!/usr/bin/env node
 /**
- * TASK PROGRESS HOOK - Real loading bars for Task tool agents
+ * AGENT PROGRESS HOOK - Real loading bars for Agent tool agents
  *
  * Writes DIRECTLY to /dev/tty to bypass Claude's stdout capture
  * This actually shows content in the terminal!
@@ -92,8 +92,8 @@ process.stdin.on('end', async () => {
     const data = JSON.parse(input);
     const { hookEventName, toolName } = data;
-    // Only handle Task tool
-    if (toolName !== 'Task') {
+    // Only handle Agent tool
+    if (toolName !== 'Agent') {
       console.log(JSON.stringify({ continue: true }));
       return;
     }
@@ -114,7 +114,7 @@ process.stdin.on('end', async () => {
 function handlePreTask(data) {
   const { toolInput } = data;
-  const description = toolInput?.description || 'Task';
+  const description = toolInput?.description || 'Agent';
   const runInBackground = toolInput?.run_in_background !== false;
   // Track task
@@ -148,11 +148,11 @@ function handlePreTask(data) {
       hookEventName: 'PreToolUse',
       additionalContext: `
 [AGENT #${taskNum} DEPLOYED]
-Task: ${description}
+Agent: ${description}
 Status: Running in background
 OUTPUT PROGRESS using send_team_message():
-- When starting: send_team_message({message: "🔄 Starting: [task]"})
+- When starting: send_team_message({message: "🔄 Starting: [agent task]"})
 - During work: send_team_message({message: "📝 Progress: [update]"})
 - When done: send_team_message({message: "✅ Completed: [summary]"})
 `
@@ -164,7 +164,7 @@ OUTPUT PROGRESS using send_team_message():
 function handlePostTask(data) {
   const { toolInput, toolOutput } = data;
-  const description = toolInput?.description || 'Task';
+  const description = toolInput?.description || 'Agent';
   killSpinner();

package/claude-hooks/task-progress-hook.js CHANGED Viewed

@@ -93,7 +93,7 @@ process.stdin.on('end', async () => {
     const { hookEventName, toolName } = data;
     // Only handle Task tool
-    if (toolName !== 'Task') {
+    if (toolName !== 'Agent') {
       console.log(JSON.stringify({ continue: true }));
       return;
     }
@@ -114,7 +114,7 @@ process.stdin.on('end', async () => {
 function handlePreTask(data) {
   const { toolInput } = data;
-  const description = toolInput?.description || 'Task';
+  const description = toolInput?.description || 'Agent';
   const runInBackground = toolInput?.run_in_background !== false;
   // Track task
@@ -164,7 +164,7 @@ OUTPUT PROGRESS using send_team_message():
 function handlePostTask(data) {
   const { toolInput, toolOutput } = data;
-  const description = toolInput?.description || 'Task';
+  const description = toolInput?.description || 'Agent';
   killSpinner();

package/claude-hooks/team-comms-enforcer.cjs CHANGED Viewed

@@ -77,7 +77,7 @@ try {
 // CONFIGURATION
 // ============================================================================
 const MAX_SEARCHES_BEFORE_BLOCK = 2;  // Every other search must use find_code_pointers/find_memory
-const TEAM_COMMS_CHECK_INTERVAL = 4;  // MUST read_team_messages every 4 tool usages
+const TEAM_COMMS_CHECK_INTERVAL = 3;  // MUST send_team_message every 3 tool usages
 const BROADCAST_CHECK_INTERVAL = 5;   // MUST read_team_messages w/ include_broadcasts every 5 tool usages
 const HELP_CHECK_INTERVAL = 8;        // Check help requests every 8 tool usages
@@ -118,14 +118,18 @@ const HELP_CHECK_TOOLS = [
 // NOTE: Read is NOT included — agents abuse Read to reset search counters
 const BASIC_SEARCH_TOOLS = ['Grep', 'Glob'];
+// READ + SEARCH combined — forces team msg every 3 reads OR searches
+const READ_SEARCH_TOOLS = ['Read', 'Grep', 'Glob'];
+const READ_SEARCH_COMMS_INTERVAL = 3;  // Must send_team_message every 3 reads/searches
 // Dangerous tools that require full compliance
 const WRITE_TOOLS = ['Edit', 'Write', 'NotebookEdit'];
 // FULL COMPLIANCE TOOLS - agents use these to bypass everything
 // Requires: announced + claimed + usedMemoryTools
 // - Bash: can run grep/cat/sed/echo to bypass all limits
-// - Task: can spawn sub-agents to bypass limits
-const FULL_COMPLIANCE_TOOLS = ['Bash', 'Task'];
+// - Agent: can spawn sub-agents to bypass limits
+const FULL_COMPLIANCE_TOOLS = ['Bash', 'Agent'];
 // Tools that are always allowed (reading team state + cross-swarm help + research)
 const ALWAYS_ALLOWED = [
@@ -149,7 +153,6 @@ const ALWAYS_ALLOWED = [
   'WebFetch',
   'WebSearch',
   'ToolSearch',
-  'Read',
 ];
 // ============================================================================
@@ -187,6 +190,9 @@ function getAgentState(tracking, sessionId) {
       needsCommsCheck: false,      // HARD BLOCK until they read team messages
       needsBroadcastCheck: false,  // HARD BLOCK until they read broadcasts
       needsHelpCheck: false,       // Flag when they hit the limit
+      readSearchCount: 0,          // Read/Grep/Glob count since last team msg
+      preClaimMsgSent: false,      // Must send team msg BEFORE claim_task
+      postReleasePending: false,   // Must send team msg AFTER release_task
       lastActivity: Date.now()
     };
   }
@@ -222,33 +228,11 @@ function isRunningAsAgent() {
   // Deployed team members — always enforce
   if (isTeamMemberFn()) return true;
-  // Method 2: General-purpose subagents (CLAUDE_SUBAGENT=1)
-  // These DO have MCP tools and SHOULD be enforced.
-  // Exclude Explore/Plan agents — they don't have MCP tools and can't comply.
-  // We check agents.json to see if the active subagent has MCP tools.
+  // Method 2: CLAUDE_SUBAGENT=1 — env var is proof enough, no agents.json check needed
   if (process.env.CLAUDE_SUBAGENT === '1' || process.env.CLAUDE_AGENT_ID) {
-    try {
-      const agentsFile = `${PROJECT_TMP_DIR}/agents.json`;
-      if (fs.existsSync(agentsFile)) {
-        const data = JSON.parse(fs.readFileSync(agentsFile, 'utf8'));
-        const now = Date.now();
-        for (const agent of Object.values(data.agents || {})) {
-          // Active agent (started within 10 min, no endTime)
-          if (!agent.endTime && agent.startTime && (now - agent.startTime < 600000)) {
-            // Check if this agent has MCP tools (general-purpose agents do)
-            const tools = agent.tools || [];
-            const hasMcpTools = tools.some(t => t.startsWith('mcp__specmem__'));
-            if (hasMcpTools) return true;
-          }
-        }
-      }
-    } catch {}
-    // No agents.json or no MCP tools found — this is likely Explore/Plan, skip enforcement
-    return false;
+    return true;
   }
-  // Method 3: Check subagent tracking as fallback (parent context seeing active agents)
-  // This does NOT enforce on the parent — only on processes with CLAUDE_SUBAGENT=1
   return false;
 }
@@ -343,6 +327,16 @@ process.stdin.on('end', () => {
     // ========================================================================
     if (ANNOUNCE_TOOLS.includes(toolName)) {
       state.announced = true;
+      // Reset comms counter on SEND (agents must send updates, not just read)
+      state.commsToolCount = 0;
+      state.lastCommsCheck = Date.now();
+      state.needsCommsCheck = false;
+      // Reset read/search counter — team msg obligation fulfilled
+      state.readSearchCount = 0;
+      // Cleared to claim files (must msg BEFORE claiming)
+      state.preClaimMsgSent = true;
+      // Release obligation fulfilled (must msg AFTER releasing)
+      state.postReleasePending = false;
     }
     if (CLAIM_TOOLS.includes(toolName)) {
       state.claimed = true;
@@ -370,6 +364,8 @@ process.stdin.on('end', () => {
         fs.writeFileSync(GLOBAL_CLAIMS_FILE, JSON.stringify(globalClaims, null, 2));
       } catch (e) {}
       state.currentClaimId = claimId;
+      // Consumed — next claim needs a fresh team msg
+      state.preClaimMsgSent = false;
     }
     if (toolName === 'mcp__specmem__release_task') {
       // Remove this session's claims from GLOBAL file
@@ -384,18 +380,23 @@ process.stdin.on('end', () => {
       } catch (e) {}
       state.claimed = false;
       state.editedFiles = [];
+      // Must send team msg AFTER releasing — announce the release
+      state.postReleasePending = true;
     }
     if (MEMORY_TOOLS.includes(toolName)) {
       state.usedMemoryTools = true;
       state.searchCount = 0;  // Reset search counter — allows next 2 searches
       // usedMemoryTools resets to false after 2 more searches (see BASIC_SEARCH_TOOLS block)
     }
-    // Track team comms reads - resets comms counter
+    // Track Read/Search count for team comms cadence
+    if (READ_SEARCH_TOOLS.includes(toolName)) {
+      state.readSearchCount = (state.readSearchCount || 0) + 1;
+    }
+    // Track team comms reads - resets BROADCAST counter only
+    // Comms counter now resets on SEND via ANNOUNCE_TOOLS, not on READ
     if (BROADCAST_CHECK_TOOLS.includes(toolName)) {
-      state.commsToolCount = 0;
-      state.lastCommsCheck = Date.now();
-      state.needsCommsCheck = false;
-      // Also reset broadcast counter IF they included broadcasts
+      // Broadcast counter reset IF they included broadcasts
       const params = data.tool_input || {};
       if (params.include_broadcasts !== false) {
         state.broadcastToolCount = 0;
@@ -431,16 +432,16 @@ process.stdin.on('end', () => {
     state.helpToolUsageCount = (state.helpToolUsageCount || 0) + 1;
     // ========================================================================
-    // HARD BLOCK: Must read team messages every 4 tool usages
-    // read_team_messages() satisfies this - any mode
+    // HARD BLOCK: Must send team message every 3 tool usages
+    // send_team_message() or broadcast_to_team() satisfies this
     // ========================================================================
-    if (state.commsToolCount >= TEAM_COMMS_CHECK_INTERVAL && !BROADCAST_CHECK_TOOLS.includes(toolName)) {
+    if (state.commsToolCount >= TEAM_COMMS_CHECK_INTERVAL && !ANNOUNCE_TOOLS.includes(toolName)) {
       state.needsCommsCheck = true;
       state.blockedCount++;
       saveTracking(tracking);
       console.log(blockResponse(
-        'mcp__specmem__read_team_messages',
-        `Quick check-in — other team members may have updates that affect your work. Call: read_team_messages({include_swarms: true, limit: 5})`
+        'mcp__specmem__send_team_message',
+        `Time to update the team on your progress. Call: send_team_message({type:"status", message:"[what you're doing / what you found]"})`
       ));
       return;
     }
@@ -474,6 +475,48 @@ process.stdin.on('end', () => {
       return;
     }
+    // ========================================================================
+    // HARD BLOCK: Read/Search cadence — must send team msg every 3 reads/searches
+    // Tracks Read, Grep, Glob separately from general comms counter
+    // ========================================================================
+    if ((state.readSearchCount || 0) >= READ_SEARCH_COMMS_INTERVAL && !ANNOUNCE_TOOLS.includes(toolName)) {
+      state.blockedCount++;
+      saveTracking(tracking);
+      console.log(blockResponse(
+        'mcp__specmem__send_team_message',
+        `You've done ${state.readSearchCount} reads/searches without updating the team. Share what you found! Call: send_team_message({type:"update", message:"[share findings from your recent reads/searches]"})`
+      ));
+      return;
+    }
+    // ========================================================================
+    // HARD BLOCK: Must send team msg BEFORE claiming a file
+    // Announce what you're about to claim so teammates know
+    // ========================================================================
+    if (toolName === 'mcp__specmem__claim_task' && !state.preClaimMsgSent) {
+      state.blockedCount++;
+      saveTracking(tracking);
+      console.log(blockResponse(
+        'mcp__specmem__send_team_message',
+        `Announce your claim FIRST! Tell the team what files/area you're about to work on. Call: send_team_message({type:"status", message:"Claiming [files/area] — about to work on [description]"})`
+      ));
+      return;
+    }
+    // ========================================================================
+    // HARD BLOCK: Must send team msg AFTER releasing a claim
+    // Let teammates know files are available again
+    // ========================================================================
+    if (state.postReleasePending && !ANNOUNCE_TOOLS.includes(toolName)) {
+      state.blockedCount++;
+      saveTracking(tracking);
+      console.log(blockResponse(
+        'mcp__specmem__send_team_message',
+        `You released a claim but didn't tell the team! Announce the release. Call: send_team_message({type:"update", message:"Released claim on [files] — files are free for others"})`
+      ));
+      return;
+    }
     // ========================================================================
     // ALWAYS ALLOWED TOOLS - pass through after counter checks
     // ========================================================================
@@ -488,7 +531,7 @@ process.stdin.on('end', () => {
     // ========================================================================
     if (state.commsToolCount === TEAM_COMMS_CHECK_INTERVAL - 1) {
       console.log(allowWithReminder(
-        `Heads up — good time to check in with the team: read_team_messages({include_swarms: true, limit: 5})`
+        `Heads up — good time to update the team: send_team_message({type:"status", message:"[progress update]"})`
       ));
       // Don't return - continue to other checks
     }
@@ -609,18 +652,41 @@ process.stdin.on('end', () => {
     }
     // ========================================================================
-    // CLAIM RELEASE ENFORCEMENT — After ANY edit, BLOCK until release
-    // Flow: claim_task → Edit/Write → release_task → claim_task → Edit/Write → release_task
+    // CLAIM RELEASE + NOTIFICATION ENFORCEMENT — After edit, BLOCK until release AND notify
+    // Flow: claim_task → Edit/Write → release_task + send_team_message → next task
     // ========================================================================
     if (state.editedFiles && state.editedFiles.length > 0 && state.claimed && !WRITE_TOOLS.includes(toolName)) {
-      // Allow: release_task, always-allowed tools, and write tools (handled in WRITE_TOOLS block)
-      if (!ALWAYS_ALLOWED.includes(toolName) && toolName !== 'mcp__specmem__release_task') {
+      const isReleaseTool = toolName === 'mcp__specmem__release_task';
+      const isNotifyTool = ANNOUNCE_TOOLS.includes(toolName);
+      // Track completion of release/notify obligations
+      if (isReleaseTool) state.releasedClaim = true;
+      if (isNotifyTool) state.releaseNotified = true;
+      // Both obligations met — clear state and continue
+      if (state.releasedClaim && state.releaseNotified) {
+        state.editedFiles = [];
+        state.releasedClaim = false;
+        state.releaseNotified = false;
+        state.claimed = false;
+        state.currentClaimId = null;
+      }
+      // Allow release/notify tools and always-allowed tools through
+      else if (!isReleaseTool && !isNotifyTool && !ALWAYS_ALLOWED.includes(toolName)) {
         state.blockedCount++;
         saveTracking(tracking);
-        console.log(blockResponse(
-          'mcp__specmem__release_task',
-          `You're done editing ${state.editedFiles[state.editedFiles.length - 1]} — release the claim so other team members can work on it. Call: release_task({claimId:"${state.currentClaimId || 'your-claim-id'}"})`
-        ));
+        if (!state.releasedClaim) {
+          console.log(blockResponse(
+            'mcp__specmem__release_task',
+            `Done editing ${state.editedFiles[state.editedFiles.length - 1]} — release the claim so others can work on it. Call: release_task({claimId:"${state.currentClaimId || 'your-claim-id'}"})`
+          ));
+        } else {
+          console.log(blockResponse(
+            'mcp__specmem__send_team_message',
+            `Claim released — now notify the team about your changes. Call: send_team_message({type:"update", message:"Finished editing ${state.editedFiles[state.editedFiles.length - 1]}: [describe what you changed]"})`
+          ));
+        }
         return;
       }
     }

package/claude-hooks/use-code-pointers.cjs CHANGED Viewed

@@ -73,7 +73,7 @@ async function generateEmbedding(text, socketPath) {
       for (const line of lines) {
         try {
           const resp = JSON.parse(line);
-          if (resp.status === 'processing') continue;
+          if (resp.status === 'working') continue;
           if (resp.embedding) { socket.end(); resolve(resp.embedding); return; }
           if (resp.error) { socket.end(); reject(new Error(resp.error)); return; }
         } catch (e) {}

package/dist/claude-sessions/sessionParser.js CHANGED Viewed

@@ -996,6 +996,11 @@ export function isToolOrThinkingContent(content) {
         return true;
     if (trimmed.startsWith('[Tool:'))
         return true;
+    // Skip task/agent notification XML blocks — system noise, not conversation
+    if (trimmed.startsWith('<task-notification>'))
+        return true;
+    if (trimmed.includes('<task-id>') && trimmed.includes('</task-id>'))
+        return true;
     // Check for [CLAUDE] prefixed tool versions
     if (trimmed.startsWith('[CLAUDE] [Tools:'))
         return true;

package/dist/cli/deploy-to-claude.js CHANGED Viewed

@@ -218,7 +218,7 @@ function updateSettings() {
     const settingsPath = path.join(CLAUDE_HOME, 'settings.json');
     try {
         let settings = {};
-        // Load existing settings
+        // Load existing settings - PRESERVE all non-specmem keys (env, model, etc.)
         if (fs.existsSync(settingsPath)) {
             try {
                 settings = JSON.parse(fs.readFileSync(settingsPath, 'utf-8'));
@@ -227,6 +227,9 @@ function updateSettings() {
                 log('Could not parse existing settings.json, creating new one');
             }
         }
+        // Capture user's custom env BEFORE any modifications.
+        // These include ANTHROPIC_BASE_URL, ANTHROPIC_AUTH_TOKEN, model overrides, etc.
+        const _userCustomEnv = settings.env;
         // IMPORTANT: Do NOT write hooks to main settings.json
         // All hook config lives in ~/.claude/hooks/settings.json (deployed as a file)
         // Writing hooks here would cause DOUBLE-FIRING of every hook
@@ -275,9 +278,13 @@ function updateSettings() {
                 settings.permissions.allow.push(perm);
             }
         }
+        // Restore user's custom env - NEVER clobber ANTHROPIC_BASE_URL, model overrides, etc.
+        if (_userCustomEnv !== undefined) {
+            settings.env = _userCustomEnv;
+        }
         // Write updated settings
         fs.writeFileSync(settingsPath, JSON.stringify(settings, null, 2));
-        log('Updated settings.json (permissions only — hooks in hooks/settings.json)');
+        log('Updated settings.json (permissions only — hooks in hooks/settings.json — custom env preserved)');
         return true;
     }
     catch (error) {

package/dist/codebase/codebaseIndexer.js CHANGED Viewed

@@ -28,6 +28,7 @@ import * as os from 'os';
 import { v4 as uuidv4 } from 'uuid';
 import chokidar from 'chokidar';
 import { logger } from '../utils/logger.js';
+import { extractPdfText, extractPdfBatch, isPdfFile } from './pdfExtractor.js';
 import { getProjectPath } from '../config.js';
 import { getCoordinator } from '../coordination/integration.js';
 /**
@@ -36,15 +37,15 @@ import { getCoordinator } from '../coordination/integration.js';
  */
 function loadResourceLimits() {
     const limits = {
-        cpuMax: 40,          // max CPU % target (back-pressure threshold)
+        cpuMax: 35,          // max CPU % target (back-pressure threshold)
         cpuMin: 10,          // min CPU % (crawl mode)
-        ramMaxMb: 6000,      // max RAM MB
+        ramMaxMb: 4000,      // max RAM MB (safe for 8GB laptops)
         ramMinMb: 2000,      // min RAM MB
         batchSize: 25,       // files per batch (was 200!)
-        maxConcurrency: 8,   // max parallel file reads within a batch
+        maxConcurrency: 4,   // max parallel file reads (safe for dual-core i3s)
         batchDelayMs: 50,    // delay between batches (ms)
         batchDelayMaxMs: 2000, // max delay under heavy load
-        cpuCoreMax: 0,       // 0 = auto (use all cores)
+        cpuCoreMax: 2,       // max CPU cores (safe for dual-core i3s)
     };
     // 1. Read from model-config.json
     try {
@@ -177,7 +178,8 @@ const DEFAULT_CONFIG = {
         '.c', '.cpp', '.h', '.hpp',
         '.swift',
         '.dockerfile', 'Dockerfile',
-        '.env.example', '.env.template'
+        '.env.example', '.env.template',
+        '.pdf'
     ],
     maxFileSizeBytes: 1024 * 1024, // 1MB
     generateEmbeddings: true,
@@ -444,16 +446,25 @@ export class CodebaseIndexer {
                     const stats = await fs.stat(filePath);
                     if (stats.size > this.config.maxFileSizeBytes)
                         return;
-                    if (await this.isBinaryFile(filePath))
-                        return;
-                    const content = await fs.readFile(filePath, 'utf-8');
+                    // PDF files: extract text via PyMuPDF instead of reading as UTF-8
+                    let content;
+                    if (isPdfFile(filePath)) {
+                        const pdfResult = await extractPdfText(filePath);
+                        if (!pdfResult || !pdfResult.text) return;
+                        content = pdfResult.text;
+                        logger.debug({ filePath: relativePath, pages: pdfResult.pages, chars: pdfResult.chars }, 'PDF text extracted');
+                    } else {
+                        if (await this.isBinaryFile(filePath))
+                            return;
+                        content = await fs.readFile(filePath, 'utf-8');
+                    }
                     const contentHash = this.hashContent(content);
                     const existingHash = existingHashes.get(relativePath);
                     if (existingHash === contentHash) {
                         skipped++;
                         return;
                     }
-                    const indexedFile = await this.indexFile(filePath);
+                    const indexedFile = await this.indexFile(filePath, isPdfFile(filePath) ? content : undefined);
                     if (indexedFile) {
                         this.index.set(indexedFile.filePath, indexedFile);
                         changedFiles.push(indexedFile);
@@ -616,9 +627,17 @@ export class CodebaseIndexer {
                     if (existing && existing.mtime && stats.mtime.getTime() <= existing.mtime) {
                         return { skipped: true, relativePath, mtimeSkip: true };
                     }
-                    if (await this.isBinaryFile(filePath))
-                        return null;
-                    const content = await fs.readFile(filePath, 'utf-8');
+                    // PDF files: extract text via PyMuPDF instead of reading as UTF-8
+                    let content;
+                    if (isPdfFile(filePath)) {
+                        const pdfResult = await extractPdfText(filePath);
+                        if (!pdfResult || !pdfResult.text) return null;
+                        content = pdfResult.text;
+                    } else {
+                        if (await this.isBinaryFile(filePath))
+                            return null;
+                        content = await fs.readFile(filePath, 'utf-8');
+                    }
                     const contentHash = this.hashContent(content);
                     if (existing && existing.hash === contentHash) {
                         return { skipped: true, relativePath, hashSkip: true };
@@ -1178,7 +1197,7 @@ export class CodebaseIndexer {
     /**
      * indexFile - reads and indexes a single file with enhanced analysis
      */
-    async indexFile(absolutePath) {
+    async indexFile(absolutePath, preExtractedContent) {
         try {
             const stats = await fs.stat(absolutePath);
             // skip if too large
@@ -1186,11 +1205,23 @@ export class CodebaseIndexer {
                 logger.debug({ path: absolutePath, size: stats.size }, 'skipping large file');
                 return null;
             }
-            // skip if binary
-            if (await this.isBinaryFile(absolutePath)) {
-                return null;
+            // PDF files: use pre-extracted content or extract on demand
+            let content;
+            if (isPdfFile(absolutePath)) {
+                if (preExtractedContent) {
+                    content = preExtractedContent;
+                } else {
+                    const pdfResult = await extractPdfText(absolutePath);
+                    if (!pdfResult || !pdfResult.text) return null;
+                    content = pdfResult.text;
+                }
+            } else {
+                // skip if binary
+                if (await this.isBinaryFile(absolutePath)) {
+                    return null;
+                }
+                content = await fs.readFile(absolutePath, 'utf-8');
             }
-            const content = await fs.readFile(absolutePath, 'utf-8');
             const relativePath = path.relative(this.config.codebasePath, absolutePath);
             const fileName = path.basename(absolutePath);
             const extension = path.extname(absolutePath).toLowerCase();

package/dist/codebase/exclusions.js CHANGED Viewed

@@ -47,7 +47,7 @@ export const EXCLUSION_CONFIG = {
         '*.db',
         // Binary assets
         '*.png', '*.jpg', '*.jpeg', '*.gif', '*.ico', '*.webp',
-        '*.pdf', '*.zip', '*.tar', '*.gz', '*.rar', '*.7z',
+        '*.zip', '*.tar', '*.gz', '*.rar', '*.7z',
         '*.mp3', '*.mp4', '*.avi', '*.mov', '*.mkv',
         '*.ttf', '*.woff', '*.woff2', '*.eot', '*.otf',
         '*.exe', '*.dll', '*.so', '*.dylib', '*.bin',
@@ -145,7 +145,6 @@ const DEFAULT_EXCLUSIONS = [
     '*.mp4',
     '*.avi',
     '*.mov',
-    '*.pdf',
     '*.zip',
     '*.tar',
     '*.gz',
@@ -547,8 +546,8 @@ const BINARY_EXTENSIONS = new Set([
     '.zip', '.tar', '.gz', '.bz2', '.7z', '.rar', '.xz', '.lz', '.lzma',
     // executables and libraries
     '.exe', '.dll', '.so', '.dylib', '.bin', '.out', '.app', '.msi', '.deb', '.rpm',
-    // documents (binary formats)
-    '.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx', '.odt', '.ods', '.odp',
+    // documents (binary formats — PDF handled by pdfExtractor.js)
+    '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx', '.odt', '.ods', '.odp',
     // fonts
     '.ttf', '.otf', '.woff', '.woff2', '.eot',
     // databases

package/dist/codebase/index.js CHANGED Viewed

@@ -6,6 +6,10 @@
 // ========================================
 export { SkipTheBoringShit, isBinaryFile, getFileSizeBytes, getExclusionHandler, resetExclusionHandler, DEFAULT_EXCLUSIONS } from './exclusions.js';
 // ========================================
+// PDF EXTRACTION - pdfExtractor
+// ========================================
+export { extractPdfText, extractPdfBatch, isPdfFile, isPdfExtractionAvailable } from './pdfExtractor.js';
+// ========================================
 // LANGUAGE DETECTION - whatLanguageIsThis
 // ========================================
 export { WhatLanguageIsThis, getLanguageDetector, resetLanguageDetector, LANGUAGE_REGISTRY, EXTENSION_INDEX, FILENAME_MAPPINGS } from './languageDetection.js';