specmem-hardwicksoftware 3.7.35 → 3.7.36

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55) hide show
  1. package/CHANGELOG.md +34 -0
  2. package/README.md +11 -15
  3. package/bin/specmem-console.cjs +839 -51
  4. package/claude-hooks/agent-chooser-hook.js +6 -6
  5. package/claude-hooks/agent-loading-hook.cjs +16 -16
  6. package/claude-hooks/agent-loading-hook.js +18 -18
  7. package/claude-hooks/agent-type-matcher.js +1 -1
  8. package/claude-hooks/background-completion-silencer.js +1 -1
  9. package/claude-hooks/file-claim-enforcer.cjs +37 -36
  10. package/claude-hooks/output-cleaner.cjs +1 -1
  11. package/claude-hooks/settings.json +27 -3
  12. package/claude-hooks/specmem-search-enforcer.cjs +2 -11
  13. package/claude-hooks/specmem-team-member-inject.js +1 -1
  14. package/claude-hooks/specmem-unified-hook.py +1 -1
  15. package/claude-hooks/subagent-loading-hook.cjs +1 -1
  16. package/claude-hooks/task-progress-hook.cjs +7 -7
  17. package/claude-hooks/task-progress-hook.js +3 -3
  18. package/claude-hooks/team-comms-enforcer.cjs +49 -47
  19. package/dist/claude-sessions/sessionParser.js +5 -0
  20. package/dist/codebase/codebaseIndexer.js +48 -17
  21. package/dist/codebase/exclusions.js +3 -4
  22. package/dist/codebase/index.js +4 -0
  23. package/dist/codebase/pdfExtractor.js +298 -0
  24. package/dist/dashboard/api/taskTeamMembers.js +2 -2
  25. package/dist/db/bigBrainMigrations.js +29 -0
  26. package/dist/hooks/hookManager.js +4 -4
  27. package/dist/hooks/teamFramingCli.js +1 -1
  28. package/dist/hooks/teamMemberPrepromptHook.js +5 -5
  29. package/dist/init/claudeConfigInjector.js +2 -2
  30. package/dist/mcp/compactionProxy.js +834 -186
  31. package/dist/mcp/compactionProxyDaemon.js +112 -37
  32. package/dist/mcp/contextVault.js +439 -0
  33. package/dist/mcp/embeddingServerManager.js +61 -1
  34. package/dist/mcp/mcpProtocolHandler.js +6 -1
  35. package/dist/mcp/miniCOTServerManager.js +82 -8
  36. package/dist/mcp/specMemServer.js +45 -10
  37. package/dist/mcp/toolRegistry.js +6 -0
  38. package/dist/startup/startupIndexing.js +14 -0
  39. package/dist/team-members/taskOrchestrator.js +3 -3
  40. package/dist/team-members/taskTeamMemberLogger.js +2 -2
  41. package/dist/tools/goofy/deployTeamMember.js +3 -3
  42. package/dist/tools/goofy/digInTheVault.js +81 -0
  43. package/dist/tools/goofy/stashTheGoods.js +56 -0
  44. package/dist/tools/teamMemberDeployer.js +2 -2
  45. package/dist/watcher/changeHandler.js +65 -8
  46. package/dist/watcher/changeQueue.js +20 -1
  47. package/embedding-sandbox/mini-cot-service.py +11 -13
  48. package/embedding-sandbox/pdf-text-extract.py +208 -0
  49. package/package.json +1 -1
  50. package/scripts/deploy-hooks.cjs +2 -2
  51. package/scripts/global-postinstall.cjs +2 -2
  52. package/scripts/specmem-init.cjs +130 -36
  53. package/specmem/model-config.json +6 -6
  54. package/specmem/supervisord.conf +1 -1
  55. package/svg-sections/readme-token-compaction.svg +246 -0
@@ -77,7 +77,7 @@ try {
77
77
  // CONFIGURATION
78
78
  // ============================================================================
79
79
  const MAX_SEARCHES_BEFORE_BLOCK = 2; // Every other search must use find_code_pointers/find_memory
80
- const TEAM_COMMS_CHECK_INTERVAL = 4; // MUST read_team_messages every 4 tool usages
80
+ const TEAM_COMMS_CHECK_INTERVAL = 3; // MUST send_team_message every 3 tool usages
81
81
  const BROADCAST_CHECK_INTERVAL = 5; // MUST read_team_messages w/ include_broadcasts every 5 tool usages
82
82
  const HELP_CHECK_INTERVAL = 8; // Check help requests every 8 tool usages
83
83
 
@@ -124,8 +124,8 @@ const WRITE_TOOLS = ['Edit', 'Write', 'NotebookEdit'];
124
124
  // FULL COMPLIANCE TOOLS - agents use these to bypass everything
125
125
  // Requires: announced + claimed + usedMemoryTools
126
126
  // - Bash: can run grep/cat/sed/echo to bypass all limits
127
- // - Task: can spawn sub-agents to bypass limits
128
- const FULL_COMPLIANCE_TOOLS = ['Bash', 'Task'];
127
+ // - Agent: can spawn sub-agents to bypass limits
128
+ const FULL_COMPLIANCE_TOOLS = ['Bash', 'Agent'];
129
129
 
130
130
  // Tools that are always allowed (reading team state + cross-swarm help + research)
131
131
  const ALWAYS_ALLOWED = [
@@ -149,7 +149,6 @@ const ALWAYS_ALLOWED = [
149
149
  'WebFetch',
150
150
  'WebSearch',
151
151
  'ToolSearch',
152
- 'Read',
153
152
  ];
154
153
 
155
154
  // ============================================================================
@@ -222,33 +221,11 @@ function isRunningAsAgent() {
222
221
  // Deployed team members — always enforce
223
222
  if (isTeamMemberFn()) return true;
224
223
 
225
- // Method 2: General-purpose subagents (CLAUDE_SUBAGENT=1)
226
- // These DO have MCP tools and SHOULD be enforced.
227
- // Exclude Explore/Plan agents — they don't have MCP tools and can't comply.
228
- // We check agents.json to see if the active subagent has MCP tools.
224
+ // Method 2: CLAUDE_SUBAGENT=1 — env var is proof enough, no agents.json check needed
229
225
  if (process.env.CLAUDE_SUBAGENT === '1' || process.env.CLAUDE_AGENT_ID) {
230
- try {
231
- const agentsFile = `${PROJECT_TMP_DIR}/agents.json`;
232
- if (fs.existsSync(agentsFile)) {
233
- const data = JSON.parse(fs.readFileSync(agentsFile, 'utf8'));
234
- const now = Date.now();
235
- for (const agent of Object.values(data.agents || {})) {
236
- // Active agent (started within 10 min, no endTime)
237
- if (!agent.endTime && agent.startTime && (now - agent.startTime < 600000)) {
238
- // Check if this agent has MCP tools (general-purpose agents do)
239
- const tools = agent.tools || [];
240
- const hasMcpTools = tools.some(t => t.startsWith('mcp__specmem__'));
241
- if (hasMcpTools) return true;
242
- }
243
- }
244
- }
245
- } catch {}
246
- // No agents.json or no MCP tools found — this is likely Explore/Plan, skip enforcement
247
- return false;
226
+ return true;
248
227
  }
249
228
 
250
- // Method 3: Check subagent tracking as fallback (parent context seeing active agents)
251
- // This does NOT enforce on the parent — only on processes with CLAUDE_SUBAGENT=1
252
229
  return false;
253
230
  }
254
231
 
@@ -343,6 +320,10 @@ process.stdin.on('end', () => {
343
320
  // ========================================================================
344
321
  if (ANNOUNCE_TOOLS.includes(toolName)) {
345
322
  state.announced = true;
323
+ // Reset comms counter on SEND (agents must send updates, not just read)
324
+ state.commsToolCount = 0;
325
+ state.lastCommsCheck = Date.now();
326
+ state.needsCommsCheck = false;
346
327
  }
347
328
  if (CLAIM_TOOLS.includes(toolName)) {
348
329
  state.claimed = true;
@@ -390,12 +371,10 @@ process.stdin.on('end', () => {
390
371
  state.searchCount = 0; // Reset search counter — allows next 2 searches
391
372
  // usedMemoryTools resets to false after 2 more searches (see BASIC_SEARCH_TOOLS block)
392
373
  }
393
- // Track team comms reads - resets comms counter
374
+ // Track team comms reads - resets BROADCAST counter only
375
+ // Comms counter now resets on SEND via ANNOUNCE_TOOLS, not on READ
394
376
  if (BROADCAST_CHECK_TOOLS.includes(toolName)) {
395
- state.commsToolCount = 0;
396
- state.lastCommsCheck = Date.now();
397
- state.needsCommsCheck = false;
398
- // Also reset broadcast counter IF they included broadcasts
377
+ // Broadcast counter reset IF they included broadcasts
399
378
  const params = data.tool_input || {};
400
379
  if (params.include_broadcasts !== false) {
401
380
  state.broadcastToolCount = 0;
@@ -431,16 +410,16 @@ process.stdin.on('end', () => {
431
410
  state.helpToolUsageCount = (state.helpToolUsageCount || 0) + 1;
432
411
 
433
412
  // ========================================================================
434
- // HARD BLOCK: Must read team messages every 4 tool usages
435
- // read_team_messages() satisfies this - any mode
413
+ // HARD BLOCK: Must send team message every 3 tool usages
414
+ // send_team_message() or broadcast_to_team() satisfies this
436
415
  // ========================================================================
437
- if (state.commsToolCount >= TEAM_COMMS_CHECK_INTERVAL && !BROADCAST_CHECK_TOOLS.includes(toolName)) {
416
+ if (state.commsToolCount >= TEAM_COMMS_CHECK_INTERVAL && !ANNOUNCE_TOOLS.includes(toolName)) {
438
417
  state.needsCommsCheck = true;
439
418
  state.blockedCount++;
440
419
  saveTracking(tracking);
441
420
  console.log(blockResponse(
442
- 'mcp__specmem__read_team_messages',
443
- `Quick check-in other team members may have updates that affect your work. Call: read_team_messages({include_swarms: true, limit: 5})`
421
+ 'mcp__specmem__send_team_message',
422
+ `Time to update the team on your progress. Call: send_team_message({type:"status", message:"[what you're doing / what you found]"})`
444
423
  ));
445
424
  return;
446
425
  }
@@ -488,7 +467,7 @@ process.stdin.on('end', () => {
488
467
  // ========================================================================
489
468
  if (state.commsToolCount === TEAM_COMMS_CHECK_INTERVAL - 1) {
490
469
  console.log(allowWithReminder(
491
- `Heads up — good time to check in with the team: read_team_messages({include_swarms: true, limit: 5})`
470
+ `Heads up — good time to update the team: send_team_message({type:"status", message:"[progress update]"})`
492
471
  ));
493
472
  // Don't return - continue to other checks
494
473
  }
@@ -609,18 +588,41 @@ process.stdin.on('end', () => {
609
588
  }
610
589
 
611
590
  // ========================================================================
612
- // CLAIM RELEASE ENFORCEMENT — After ANY edit, BLOCK until release
613
- // Flow: claim_task → Edit/Write → release_task → claim_task → Edit/Write → release_task
591
+ // CLAIM RELEASE + NOTIFICATION ENFORCEMENT — After edit, BLOCK until release AND notify
592
+ // Flow: claim_task → Edit/Write → release_task + send_team_message → next task
614
593
  // ========================================================================
615
594
  if (state.editedFiles && state.editedFiles.length > 0 && state.claimed && !WRITE_TOOLS.includes(toolName)) {
616
- // Allow: release_task, always-allowed tools, and write tools (handled in WRITE_TOOLS block)
617
- if (!ALWAYS_ALLOWED.includes(toolName) && toolName !== 'mcp__specmem__release_task') {
595
+ const isReleaseTool = toolName === 'mcp__specmem__release_task';
596
+ const isNotifyTool = ANNOUNCE_TOOLS.includes(toolName);
597
+
598
+ // Track completion of release/notify obligations
599
+ if (isReleaseTool) state.releasedClaim = true;
600
+ if (isNotifyTool) state.releaseNotified = true;
601
+
602
+ // Both obligations met — clear state and continue
603
+ if (state.releasedClaim && state.releaseNotified) {
604
+ state.editedFiles = [];
605
+ state.releasedClaim = false;
606
+ state.releaseNotified = false;
607
+ state.claimed = false;
608
+ state.currentClaimId = null;
609
+ }
610
+ // Allow release/notify tools and always-allowed tools through
611
+ else if (!isReleaseTool && !isNotifyTool && !ALWAYS_ALLOWED.includes(toolName)) {
618
612
  state.blockedCount++;
619
613
  saveTracking(tracking);
620
- console.log(blockResponse(
621
- 'mcp__specmem__release_task',
622
- `You're done editing ${state.editedFiles[state.editedFiles.length - 1]} — release the claim so other team members can work on it. Call: release_task({claimId:"${state.currentClaimId || 'your-claim-id'}"})`
623
- ));
614
+
615
+ if (!state.releasedClaim) {
616
+ console.log(blockResponse(
617
+ 'mcp__specmem__release_task',
618
+ `Done editing ${state.editedFiles[state.editedFiles.length - 1]} — release the claim so others can work on it. Call: release_task({claimId:"${state.currentClaimId || 'your-claim-id'}"})`
619
+ ));
620
+ } else {
621
+ console.log(blockResponse(
622
+ 'mcp__specmem__send_team_message',
623
+ `Claim released — now notify the team about your changes. Call: send_team_message({type:"update", message:"Finished editing ${state.editedFiles[state.editedFiles.length - 1]}: [describe what you changed]"})`
624
+ ));
625
+ }
624
626
  return;
625
627
  }
626
628
  }
@@ -996,6 +996,11 @@ export function isToolOrThinkingContent(content) {
996
996
  return true;
997
997
  if (trimmed.startsWith('[Tool:'))
998
998
  return true;
999
+ // Skip task/agent notification XML blocks — system noise, not conversation
1000
+ if (trimmed.startsWith('<task-notification>'))
1001
+ return true;
1002
+ if (trimmed.includes('<task-id>') && trimmed.includes('</task-id>'))
1003
+ return true;
999
1004
  // Check for [CLAUDE] prefixed tool versions
1000
1005
  if (trimmed.startsWith('[CLAUDE] [Tools:'))
1001
1006
  return true;
@@ -28,6 +28,7 @@ import * as os from 'os';
28
28
  import { v4 as uuidv4 } from 'uuid';
29
29
  import chokidar from 'chokidar';
30
30
  import { logger } from '../utils/logger.js';
31
+ import { extractPdfText, extractPdfBatch, isPdfFile } from './pdfExtractor.js';
31
32
  import { getProjectPath } from '../config.js';
32
33
  import { getCoordinator } from '../coordination/integration.js';
33
34
  /**
@@ -36,15 +37,15 @@ import { getCoordinator } from '../coordination/integration.js';
36
37
  */
37
38
  function loadResourceLimits() {
38
39
  const limits = {
39
- cpuMax: 40, // max CPU % target (back-pressure threshold)
40
+ cpuMax: 35, // max CPU % target (back-pressure threshold)
40
41
  cpuMin: 10, // min CPU % (crawl mode)
41
- ramMaxMb: 6000, // max RAM MB
42
+ ramMaxMb: 4000, // max RAM MB (safe for 8GB laptops)
42
43
  ramMinMb: 2000, // min RAM MB
43
44
  batchSize: 25, // files per batch (was 200!)
44
- maxConcurrency: 8, // max parallel file reads within a batch
45
+ maxConcurrency: 4, // max parallel file reads (safe for dual-core i3s)
45
46
  batchDelayMs: 50, // delay between batches (ms)
46
47
  batchDelayMaxMs: 2000, // max delay under heavy load
47
- cpuCoreMax: 0, // 0 = auto (use all cores)
48
+ cpuCoreMax: 2, // max CPU cores (safe for dual-core i3s)
48
49
  };
49
50
  // 1. Read from model-config.json
50
51
  try {
@@ -177,7 +178,8 @@ const DEFAULT_CONFIG = {
177
178
  '.c', '.cpp', '.h', '.hpp',
178
179
  '.swift',
179
180
  '.dockerfile', 'Dockerfile',
180
- '.env.example', '.env.template'
181
+ '.env.example', '.env.template',
182
+ '.pdf'
181
183
  ],
182
184
  maxFileSizeBytes: 1024 * 1024, // 1MB
183
185
  generateEmbeddings: true,
@@ -444,16 +446,25 @@ export class CodebaseIndexer {
444
446
  const stats = await fs.stat(filePath);
445
447
  if (stats.size > this.config.maxFileSizeBytes)
446
448
  return;
447
- if (await this.isBinaryFile(filePath))
448
- return;
449
- const content = await fs.readFile(filePath, 'utf-8');
449
+ // PDF files: extract text via PyMuPDF instead of reading as UTF-8
450
+ let content;
451
+ if (isPdfFile(filePath)) {
452
+ const pdfResult = await extractPdfText(filePath);
453
+ if (!pdfResult || !pdfResult.text) return;
454
+ content = pdfResult.text;
455
+ logger.debug({ filePath: relativePath, pages: pdfResult.pages, chars: pdfResult.chars }, 'PDF text extracted');
456
+ } else {
457
+ if (await this.isBinaryFile(filePath))
458
+ return;
459
+ content = await fs.readFile(filePath, 'utf-8');
460
+ }
450
461
  const contentHash = this.hashContent(content);
451
462
  const existingHash = existingHashes.get(relativePath);
452
463
  if (existingHash === contentHash) {
453
464
  skipped++;
454
465
  return;
455
466
  }
456
- const indexedFile = await this.indexFile(filePath);
467
+ const indexedFile = await this.indexFile(filePath, isPdfFile(filePath) ? content : undefined);
457
468
  if (indexedFile) {
458
469
  this.index.set(indexedFile.filePath, indexedFile);
459
470
  changedFiles.push(indexedFile);
@@ -616,9 +627,17 @@ export class CodebaseIndexer {
616
627
  if (existing && existing.mtime && stats.mtime.getTime() <= existing.mtime) {
617
628
  return { skipped: true, relativePath, mtimeSkip: true };
618
629
  }
619
- if (await this.isBinaryFile(filePath))
620
- return null;
621
- const content = await fs.readFile(filePath, 'utf-8');
630
+ // PDF files: extract text via PyMuPDF instead of reading as UTF-8
631
+ let content;
632
+ if (isPdfFile(filePath)) {
633
+ const pdfResult = await extractPdfText(filePath);
634
+ if (!pdfResult || !pdfResult.text) return null;
635
+ content = pdfResult.text;
636
+ } else {
637
+ if (await this.isBinaryFile(filePath))
638
+ return null;
639
+ content = await fs.readFile(filePath, 'utf-8');
640
+ }
622
641
  const contentHash = this.hashContent(content);
623
642
  if (existing && existing.hash === contentHash) {
624
643
  return { skipped: true, relativePath, hashSkip: true };
@@ -1178,7 +1197,7 @@ export class CodebaseIndexer {
1178
1197
  /**
1179
1198
  * indexFile - reads and indexes a single file with enhanced analysis
1180
1199
  */
1181
- async indexFile(absolutePath) {
1200
+ async indexFile(absolutePath, preExtractedContent) {
1182
1201
  try {
1183
1202
  const stats = await fs.stat(absolutePath);
1184
1203
  // skip if too large
@@ -1186,11 +1205,23 @@ export class CodebaseIndexer {
1186
1205
  logger.debug({ path: absolutePath, size: stats.size }, 'skipping large file');
1187
1206
  return null;
1188
1207
  }
1189
- // skip if binary
1190
- if (await this.isBinaryFile(absolutePath)) {
1191
- return null;
1208
+ // PDF files: use pre-extracted content or extract on demand
1209
+ let content;
1210
+ if (isPdfFile(absolutePath)) {
1211
+ if (preExtractedContent) {
1212
+ content = preExtractedContent;
1213
+ } else {
1214
+ const pdfResult = await extractPdfText(absolutePath);
1215
+ if (!pdfResult || !pdfResult.text) return null;
1216
+ content = pdfResult.text;
1217
+ }
1218
+ } else {
1219
+ // skip if binary
1220
+ if (await this.isBinaryFile(absolutePath)) {
1221
+ return null;
1222
+ }
1223
+ content = await fs.readFile(absolutePath, 'utf-8');
1192
1224
  }
1193
- const content = await fs.readFile(absolutePath, 'utf-8');
1194
1225
  const relativePath = path.relative(this.config.codebasePath, absolutePath);
1195
1226
  const fileName = path.basename(absolutePath);
1196
1227
  const extension = path.extname(absolutePath).toLowerCase();
@@ -47,7 +47,7 @@ export const EXCLUSION_CONFIG = {
47
47
  '*.db',
48
48
  // Binary assets
49
49
  '*.png', '*.jpg', '*.jpeg', '*.gif', '*.ico', '*.webp',
50
- '*.pdf', '*.zip', '*.tar', '*.gz', '*.rar', '*.7z',
50
+ '*.zip', '*.tar', '*.gz', '*.rar', '*.7z',
51
51
  '*.mp3', '*.mp4', '*.avi', '*.mov', '*.mkv',
52
52
  '*.ttf', '*.woff', '*.woff2', '*.eot', '*.otf',
53
53
  '*.exe', '*.dll', '*.so', '*.dylib', '*.bin',
@@ -145,7 +145,6 @@ const DEFAULT_EXCLUSIONS = [
145
145
  '*.mp4',
146
146
  '*.avi',
147
147
  '*.mov',
148
- '*.pdf',
149
148
  '*.zip',
150
149
  '*.tar',
151
150
  '*.gz',
@@ -547,8 +546,8 @@ const BINARY_EXTENSIONS = new Set([
547
546
  '.zip', '.tar', '.gz', '.bz2', '.7z', '.rar', '.xz', '.lz', '.lzma',
548
547
  // executables and libraries
549
548
  '.exe', '.dll', '.so', '.dylib', '.bin', '.out', '.app', '.msi', '.deb', '.rpm',
550
- // documents (binary formats)
551
- '.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx', '.odt', '.ods', '.odp',
549
+ // documents (binary formats — PDF handled by pdfExtractor.js)
550
+ '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx', '.odt', '.ods', '.odp',
552
551
  // fonts
553
552
  '.ttf', '.otf', '.woff', '.woff2', '.eot',
554
553
  // databases
@@ -6,6 +6,10 @@
6
6
  // ========================================
7
7
  export { SkipTheBoringShit, isBinaryFile, getFileSizeBytes, getExclusionHandler, resetExclusionHandler, DEFAULT_EXCLUSIONS } from './exclusions.js';
8
8
  // ========================================
9
+ // PDF EXTRACTION - pdfExtractor
10
+ // ========================================
11
+ export { extractPdfText, extractPdfBatch, isPdfFile, isPdfExtractionAvailable } from './pdfExtractor.js';
12
+ // ========================================
9
13
  // LANGUAGE DETECTION - whatLanguageIsThis
10
14
  // ========================================
11
15
  export { WhatLanguageIsThis, getLanguageDetector, resetLanguageDetector, LANGUAGE_REGISTRY, EXTENSION_INDEX, FILENAME_MAPPINGS } from './languageDetection.js';
@@ -0,0 +1,298 @@
1
+ /**
2
+ * pdfExtractor.js — PDF text extraction for codebase indexing
3
+ *
4
+ * Spawns pdf-text-extract.py (PyMuPDF) as a child process.
5
+ * Digital PDFs: instant text extraction (0.003s/page).
6
+ * Scanned PDFs: Tesseract OCR fallback (1-3s/page).
7
+ *
8
+ * Used by codebaseIndexer.js and changeHandler.js to index PDF files
9
+ * alongside regular source code.
10
+ */
import { spawn } from 'child_process';
import { existsSync } from 'fs';
import { join, dirname } from 'path';
import { fileURLToPath } from 'url';
import { logger } from '../utils/logger.js';
import { getPythonPath } from '../utils/projectEnv.js';
16
+
// Upper bound for a single-file extraction run (60 seconds, in milliseconds).
// Kept generous because large scanned PDFs are slow to process.
const PDF_EXTRACT_TIMEOUT_MS = 60 * 1000;
// Default page cap forwarded to the extraction script via --max-pages.
const MAX_PDF_PAGES = 100;

// Location of pdf-text-extract.py, remembered once it has been found.
let _cachedScriptPath = null;
// Tri-state availability flag: null while unknown, then true or false once
// an extraction run has shown whether pymupdf is usable.
let _pymupdfAvailable = null;
23
+
/**
 * Locate the pdf-text-extract.py helper script on disk.
 *
 * Probes a fixed list of candidate locations in order and returns the first
 * one that exists. A hit is remembered in `_cachedScriptPath`; the cached
 * value is re-checked with `existsSync` on every call, so a script that has
 * since disappeared triggers a fresh search.
 *
 * The search order is meant to mirror miniCOTServerManager.findMiniCOTScript()
 * (that function is not visible from this file — verify if the lists drift).
 *
 * @returns {string | null} Path to the script, or null (after logging a
 *   warning with the searched paths) when no candidate exists.
 */
function findPdfExtractScript() {
  if (_cachedScriptPath && existsSync(_cachedScriptPath)) {
    return _cachedScriptPath;
  }

  // This file is an ES module (import/export syntax), where the CommonJS
  // `__dirname` global is not defined. Derive the directory of this module
  // from import.meta.url instead of throwing a ReferenceError.
  const moduleDir = dirname(fileURLToPath(import.meta.url));
  const specmemRoot = dirname(dirname(moduleDir));
  const possiblePaths = [
    // SpecMem package root (works for all installs)
    join(specmemRoot, 'embedding-sandbox', 'pdf-text-extract.py'),
    // Direct package root (flat layout)
    join(specmemRoot, 'pdf-text-extract.py'),
    // Global npm install fallback
    join(dirname(dirname(process.execPath)), 'lib', 'node_modules', 'specmem-hardwicksoftware', 'embedding-sandbox', 'pdf-text-extract.py'),
  ];

  const found = possiblePaths.find((candidate) => existsSync(candidate));
  if (found === undefined) {
    logger.warn({ searchedPaths: possiblePaths }, '[pdfExtractor] pdf-text-extract.py not found');
    return null;
  }

  _cachedScriptPath = found;
  logger.debug({ path: found }, '[pdfExtractor] Found pdf-text-extract.py');
  return found;
}
54
+
/**
 * Extract text from a single PDF file by running pdf-text-extract.py.
 *
 * The script is started with the interpreter returned by `getPythonPath()`
 * and is expected to print one JSON object on stdout. Every failure mode
 * (script missing, spawn error, non-zero exit, timeout, unparsable output,
 * an `error` field in the output) is logged and reported as null — this
 * function never rejects once the child has been spawned.
 *
 * @param {string} filePath - Path to the PDF file, passed to the script as-is.
 * @param {object} [options]
 * @param {number} [options.maxPages=MAX_PDF_PAGES] - Forwarded as --max-pages.
 * @param {string} [options.language='eng'] - Forwarded as --language.
 * @returns {Promise<{text: string, pages: number, chars: number, scannedPages: number[], truncated: boolean, totalPages: number} | null>}
 *   The extraction result, or null if extraction failed or pymupdf is
 *   known to be unavailable.
 */
export async function extractPdfText(filePath, options = {}) {
  const { maxPages = MAX_PDF_PAGES, language = 'eng' } = options;

  // Fast bail if we already know pymupdf is unavailable
  if (_pymupdfAvailable === false) {
    return null;
  }

  const scriptPath = findPdfExtractScript();
  if (!scriptPath) {
    logger.warn('[pdfExtractor] Cannot extract PDF — script not found');
    return null;
  }

  const pythonPath = getPythonPath();

  return new Promise((resolve) => {
    const args = [scriptPath, filePath, '--max-pages', String(maxPages), '--language', language];
    let stdout = '';
    let stderr = '';
    let resolved = false;

    // The timer below is the only timeout. spawn's own `timeout` option is
    // deliberately not set: it would fire at the same instant with SIGTERM and
    // could turn a timeout into a generic "extraction failed" log entry.
    const proc = spawn(pythonPath, args, {
      stdio: ['ignore', 'pipe', 'pipe'],
      env: { ...process.env },
    });

    // Decode at the stream level so a multi-byte UTF-8 character that is split
    // across two pipe chunks is reassembled instead of becoming U+FFFD.
    proc.stdout.setEncoding('utf8');
    proc.stderr.setEncoding('utf8');

    const timeoutId = setTimeout(() => {
      if (!resolved) {
        resolved = true;
        proc.kill('SIGKILL');
        logger.warn({ filePath, timeoutMs: PDF_EXTRACT_TIMEOUT_MS }, '[pdfExtractor] PDF extraction timed out');
        resolve(null);
      }
    }, PDF_EXTRACT_TIMEOUT_MS);

    proc.stdout.on('data', (chunk) => { stdout += chunk; });
    proc.stderr.on('data', (chunk) => { stderr += chunk; });

    proc.on('close', (code) => {
      clearTimeout(timeoutId);
      if (resolved) return;
      resolved = true;

      // The script signals a missing pymupdf install on stderr; remember it so
      // later calls bail out without spawning anything.
      if (stderr && stderr.includes('pymupdf not found')) {
        _pymupdfAvailable = false;
        logger.warn('[pdfExtractor] pymupdf not installed — PDF indexing disabled');
        resolve(null);
        return;
      }

      if (code !== 0) {
        logger.warn({ filePath, code, stderr: stderr.slice(0, 200) }, '[pdfExtractor] PDF extraction failed');
        resolve(null);
        return;
      }

      try {
        const result = JSON.parse(stdout.trim());
        if (result.error) {
          logger.warn({ filePath, error: result.error }, '[pdfExtractor] PDF extraction error');
          resolve(null);
          return;
        }

        // Mark pymupdf as available on first success
        if (_pymupdfAvailable === null) {
          _pymupdfAvailable = true;
        }

        resolve({
          text: result.text,
          pages: result.pages,
          chars: result.chars,
          scannedPages: result.scanned_pages || [],
          truncated: result.truncated || false,
          totalPages: result.total_pages || result.pages,
        });
      } catch (parseErr) {
        logger.warn({ filePath, stdout: stdout.slice(0, 200) }, '[pdfExtractor] Failed to parse extraction result');
        resolve(null);
      }
    });

    proc.on('error', (err) => {
      clearTimeout(timeoutId);
      if (!resolved) {
        resolved = true;
        logger.warn({ filePath, error: err.message }, '[pdfExtractor] Failed to spawn Python');
        resolve(null);
      }
    });
  });
}
158
+
/**
 * Extract text from multiple PDFs with a single run of pdf-text-extract.py
 * (`--batch` mode), so the interpreter is started once for N files.
 *
 * The script is expected to print JSONL: one JSON object per line, each
 * carrying a `path` field that identifies the input file. Every input path
 * is present in the returned Map; paths with no usable output map to null.
 * On timeout, results from complete lines that were already received are
 * kept and the remaining paths map to null.
 *
 * @param {string[]} filePaths - Paths to PDF files, passed to the script as-is.
 * @param {object} [options]
 * @param {number} [options.maxPages=MAX_PDF_PAGES] - Forwarded as --max-pages.
 * @param {string} [options.language='eng'] - Forwarded as --language.
 * @returns {Promise<Map<string, {text: string, pages: number, chars: number, scannedPages: number[], truncated: boolean, totalPages: number} | null>>}
 */
export async function extractPdfBatch(filePaths, options = {}) {
  const { maxPages = MAX_PDF_PAGES, language = 'eng' } = options;
  const results = new Map();

  if (!filePaths.length) return results;

  // Marks every requested path that has no entry yet as failed.
  const fillMissingWithNull = () => {
    for (const fp of filePaths) {
      if (!results.has(fp)) results.set(fp, null);
    }
  };

  // Fast bail if pymupdf is known unavailable
  if (_pymupdfAvailable === false) {
    fillMissingWithNull();
    return results;
  }

  const scriptPath = findPdfExtractScript();
  if (!scriptPath) {
    fillMissingWithNull();
    return results;
  }

  const pythonPath = getPythonPath();
  // Batch timeout: 60s base + 10s per PDF (scanned pages take 1-3s each)
  const batchTimeout = PDF_EXTRACT_TIMEOUT_MS + (filePaths.length * 10000);

  return new Promise((resolve) => {
    const args = [scriptPath, '--batch', ...filePaths, '--max-pages', String(maxPages), '--language', language];
    let stdout = '';
    let stderr = '';
    let resolved = false;

    const proc = spawn(pythonPath, args, {
      stdio: ['ignore', 'pipe', 'pipe'],
      env: { ...process.env },
    });

    // Decode at the stream level so a multi-byte UTF-8 character that is split
    // across two pipe chunks is reassembled instead of becoming U+FFFD.
    proc.stdout.setEncoding('utf8');
    proc.stderr.setEncoding('utf8');

    const timeoutId = setTimeout(() => {
      if (!resolved) {
        resolved = true;
        proc.kill('SIGKILL');
        logger.warn({ count: filePaths.length, timeoutMs: batchTimeout }, '[pdfExtractor] Batch extraction timed out');
        // Keep whatever complete lines arrived before the kill; a trailing
        // partial line fails JSON.parse and is skipped by the parser.
        parsePdfBatchJsonl(stdout, results);
        fillMissingWithNull();
        resolve(results);
      }
    }, batchTimeout);

    proc.stdout.on('data', (chunk) => { stdout += chunk; });
    proc.stderr.on('data', (chunk) => { stderr += chunk; });

    proc.on('close', () => {
      clearTimeout(timeoutId);
      if (resolved) return;
      resolved = true;

      if (stderr && stderr.includes('pymupdf not found')) {
        _pymupdfAvailable = false;
        logger.warn('[pdfExtractor] pymupdf not installed — PDF indexing disabled');
        for (const fp of filePaths) results.set(fp, null);
        resolve(results);
        return;
      }

      // The exit code is intentionally not checked: any lines that parsed are
      // kept, and everything else is reported as null.
      parsePdfBatchJsonl(stdout, results);
      fillMissingWithNull();

      const outcomes = [...results.values()];
      logger.info({
        total: filePaths.length,
        extracted: outcomes.filter((v) => v !== null).length,
        failed: outcomes.filter((v) => v === null).length,
      }, '[pdfExtractor] Batch extraction complete');

      resolve(results);
    });

    proc.on('error', (err) => {
      clearTimeout(timeoutId);
      if (!resolved) {
        resolved = true;
        logger.warn({ error: err.message }, '[pdfExtractor] Failed to spawn Python for batch');
        for (const fp of filePaths) results.set(fp, null);
        resolve(results);
      }
    });
  });
}

/**
 * Parse JSONL extractor output into `results`, keyed by each line's `path`.
 * Lines that are blank, unparsable, or lack a `path` are skipped; lines with
 * an `error` field are recorded as null.
 *
 * @param {string} stdout - Raw JSONL text collected from the script.
 * @param {Map<string, object | null>} results - Map to fill in place.
 */
function parsePdfBatchJsonl(stdout, results) {
  for (const line of stdout.split('\n')) {
    if (!line.trim()) continue;
    try {
      const result = JSON.parse(line);
      const path = result.path;
      if (!path) continue;

      if (result.error) {
        logger.warn({ path, error: result.error }, '[pdfExtractor] PDF extraction error');
        results.set(path, null);
        continue;
      }

      // Any successful line proves pymupdf works.
      if (_pymupdfAvailable === null) _pymupdfAvailable = true;

      results.set(path, {
        text: result.text,
        pages: result.pages,
        chars: result.chars,
        scannedPages: result.scanned_pages || [],
        truncated: result.truncated || false,
        totalPages: result.total_pages || result.pages,
      });
    } catch (e) {
      logger.debug({ line: line.slice(0, 100) }, '[pdfExtractor] Failed to parse JSONL line');
    }
  }
}
284
+
/**
 * Report whether PDF extraction can be attempted.
 *
 * Returns false once pymupdf has been recorded as missing. Otherwise — this
 * includes the state where pymupdf has not been probed yet — the answer is
 * whether the extraction script can be located.
 *
 * @returns {boolean}
 */
export function isPdfExtractionAvailable() {
  const knownMissing = _pymupdfAvailable === false;
  return knownMissing ? false : findPdfExtractScript() !== null;
}
292
+
/**
 * Decide whether a path names a PDF, judged solely by a case-insensitive
 * ".pdf" suffix. The file itself is never opened.
 *
 * @param {string} filePath - Path or file name to test.
 * @returns {boolean} True when the path ends in ".pdf" in any letter case.
 */
export function isPdfFile(filePath) {
  const lowered = filePath.toLowerCase();
  return lowered.slice(-4) === '.pdf';
}
@@ -1,7 +1,7 @@
1
1
  /**
2
- * taskTeamMembers.ts - API endpoints for Task team member tracking
2
+ * taskTeamMembers.ts - API endpoints for Agent team member tracking
3
3
  *
4
- * yo fr fr this lets you view and manually log Task team members
4
+ * yo fr fr this lets you view and manually log Agent team members
5
5
  */
6
6
  import { Router } from 'express';
7
7
  import { z } from 'zod';