specmem-hardwicksoftware 3.7.35 → 3.7.36
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +34 -0
- package/README.md +11 -15
- package/bin/specmem-console.cjs +839 -51
- package/claude-hooks/agent-chooser-hook.js +6 -6
- package/claude-hooks/agent-loading-hook.cjs +16 -16
- package/claude-hooks/agent-loading-hook.js +18 -18
- package/claude-hooks/agent-type-matcher.js +1 -1
- package/claude-hooks/background-completion-silencer.js +1 -1
- package/claude-hooks/file-claim-enforcer.cjs +37 -36
- package/claude-hooks/output-cleaner.cjs +1 -1
- package/claude-hooks/settings.json +27 -3
- package/claude-hooks/specmem-search-enforcer.cjs +2 -11
- package/claude-hooks/specmem-team-member-inject.js +1 -1
- package/claude-hooks/specmem-unified-hook.py +1 -1
- package/claude-hooks/subagent-loading-hook.cjs +1 -1
- package/claude-hooks/task-progress-hook.cjs +7 -7
- package/claude-hooks/task-progress-hook.js +3 -3
- package/claude-hooks/team-comms-enforcer.cjs +49 -47
- package/dist/claude-sessions/sessionParser.js +5 -0
- package/dist/codebase/codebaseIndexer.js +48 -17
- package/dist/codebase/exclusions.js +3 -4
- package/dist/codebase/index.js +4 -0
- package/dist/codebase/pdfExtractor.js +298 -0
- package/dist/dashboard/api/taskTeamMembers.js +2 -2
- package/dist/db/bigBrainMigrations.js +29 -0
- package/dist/hooks/hookManager.js +4 -4
- package/dist/hooks/teamFramingCli.js +1 -1
- package/dist/hooks/teamMemberPrepromptHook.js +5 -5
- package/dist/init/claudeConfigInjector.js +2 -2
- package/dist/mcp/compactionProxy.js +834 -186
- package/dist/mcp/compactionProxyDaemon.js +112 -37
- package/dist/mcp/contextVault.js +439 -0
- package/dist/mcp/embeddingServerManager.js +61 -1
- package/dist/mcp/mcpProtocolHandler.js +6 -1
- package/dist/mcp/miniCOTServerManager.js +82 -8
- package/dist/mcp/specMemServer.js +45 -10
- package/dist/mcp/toolRegistry.js +6 -0
- package/dist/startup/startupIndexing.js +14 -0
- package/dist/team-members/taskOrchestrator.js +3 -3
- package/dist/team-members/taskTeamMemberLogger.js +2 -2
- package/dist/tools/goofy/deployTeamMember.js +3 -3
- package/dist/tools/goofy/digInTheVault.js +81 -0
- package/dist/tools/goofy/stashTheGoods.js +56 -0
- package/dist/tools/teamMemberDeployer.js +2 -2
- package/dist/watcher/changeHandler.js +65 -8
- package/dist/watcher/changeQueue.js +20 -1
- package/embedding-sandbox/mini-cot-service.py +11 -13
- package/embedding-sandbox/pdf-text-extract.py +208 -0
- package/package.json +1 -1
- package/scripts/deploy-hooks.cjs +2 -2
- package/scripts/global-postinstall.cjs +2 -2
- package/scripts/specmem-init.cjs +130 -36
- package/specmem/model-config.json +6 -6
- package/specmem/supervisord.conf +1 -1
- package/svg-sections/readme-token-compaction.svg +246 -0
|
@@ -77,7 +77,7 @@ try {
|
|
|
77
77
|
// CONFIGURATION
|
|
78
78
|
// ============================================================================
|
|
79
79
|
const MAX_SEARCHES_BEFORE_BLOCK = 2; // Every other search must use find_code_pointers/find_memory
|
|
80
|
-
const TEAM_COMMS_CHECK_INTERVAL =
|
|
80
|
+
const TEAM_COMMS_CHECK_INTERVAL = 3; // MUST send_team_message every 3 tool usages
|
|
81
81
|
const BROADCAST_CHECK_INTERVAL = 5; // MUST read_team_messages w/ include_broadcasts every 5 tool usages
|
|
82
82
|
const HELP_CHECK_INTERVAL = 8; // Check help requests every 8 tool usages
|
|
83
83
|
|
|
@@ -124,8 +124,8 @@ const WRITE_TOOLS = ['Edit', 'Write', 'NotebookEdit'];
|
|
|
124
124
|
// FULL COMPLIANCE TOOLS - agents use these to bypass everything
|
|
125
125
|
// Requires: announced + claimed + usedMemoryTools
|
|
126
126
|
// - Bash: can run grep/cat/sed/echo to bypass all limits
|
|
127
|
-
// -
|
|
128
|
-
const FULL_COMPLIANCE_TOOLS = ['Bash', '
|
|
127
|
+
// - Agent: can spawn sub-agents to bypass limits
|
|
128
|
+
const FULL_COMPLIANCE_TOOLS = ['Bash', 'Agent'];
|
|
129
129
|
|
|
130
130
|
// Tools that are always allowed (reading team state + cross-swarm help + research)
|
|
131
131
|
const ALWAYS_ALLOWED = [
|
|
@@ -149,7 +149,6 @@ const ALWAYS_ALLOWED = [
|
|
|
149
149
|
'WebFetch',
|
|
150
150
|
'WebSearch',
|
|
151
151
|
'ToolSearch',
|
|
152
|
-
'Read',
|
|
153
152
|
];
|
|
154
153
|
|
|
155
154
|
// ============================================================================
|
|
@@ -222,33 +221,11 @@ function isRunningAsAgent() {
|
|
|
222
221
|
// Deployed team members — always enforce
|
|
223
222
|
if (isTeamMemberFn()) return true;
|
|
224
223
|
|
|
225
|
-
// Method 2:
|
|
226
|
-
// These DO have MCP tools and SHOULD be enforced.
|
|
227
|
-
// Exclude Explore/Plan agents — they don't have MCP tools and can't comply.
|
|
228
|
-
// We check agents.json to see if the active subagent has MCP tools.
|
|
224
|
+
// Method 2: CLAUDE_SUBAGENT=1 — env var is proof enough, no agents.json check needed
|
|
229
225
|
if (process.env.CLAUDE_SUBAGENT === '1' || process.env.CLAUDE_AGENT_ID) {
|
|
230
|
-
|
|
231
|
-
const agentsFile = `${PROJECT_TMP_DIR}/agents.json`;
|
|
232
|
-
if (fs.existsSync(agentsFile)) {
|
|
233
|
-
const data = JSON.parse(fs.readFileSync(agentsFile, 'utf8'));
|
|
234
|
-
const now = Date.now();
|
|
235
|
-
for (const agent of Object.values(data.agents || {})) {
|
|
236
|
-
// Active agent (started within 10 min, no endTime)
|
|
237
|
-
if (!agent.endTime && agent.startTime && (now - agent.startTime < 600000)) {
|
|
238
|
-
// Check if this agent has MCP tools (general-purpose agents do)
|
|
239
|
-
const tools = agent.tools || [];
|
|
240
|
-
const hasMcpTools = tools.some(t => t.startsWith('mcp__specmem__'));
|
|
241
|
-
if (hasMcpTools) return true;
|
|
242
|
-
}
|
|
243
|
-
}
|
|
244
|
-
}
|
|
245
|
-
} catch {}
|
|
246
|
-
// No agents.json or no MCP tools found — this is likely Explore/Plan, skip enforcement
|
|
247
|
-
return false;
|
|
226
|
+
return true;
|
|
248
227
|
}
|
|
249
228
|
|
|
250
|
-
// Method 3: Check subagent tracking as fallback (parent context seeing active agents)
|
|
251
|
-
// This does NOT enforce on the parent — only on processes with CLAUDE_SUBAGENT=1
|
|
252
229
|
return false;
|
|
253
230
|
}
|
|
254
231
|
|
|
@@ -343,6 +320,10 @@ process.stdin.on('end', () => {
|
|
|
343
320
|
// ========================================================================
|
|
344
321
|
if (ANNOUNCE_TOOLS.includes(toolName)) {
|
|
345
322
|
state.announced = true;
|
|
323
|
+
// Reset comms counter on SEND (agents must send updates, not just read)
|
|
324
|
+
state.commsToolCount = 0;
|
|
325
|
+
state.lastCommsCheck = Date.now();
|
|
326
|
+
state.needsCommsCheck = false;
|
|
346
327
|
}
|
|
347
328
|
if (CLAIM_TOOLS.includes(toolName)) {
|
|
348
329
|
state.claimed = true;
|
|
@@ -390,12 +371,10 @@ process.stdin.on('end', () => {
|
|
|
390
371
|
state.searchCount = 0; // Reset search counter — allows next 2 searches
|
|
391
372
|
// usedMemoryTools resets to false after 2 more searches (see BASIC_SEARCH_TOOLS block)
|
|
392
373
|
}
|
|
393
|
-
// Track team comms reads - resets
|
|
374
|
+
// Track team comms reads - resets BROADCAST counter only
|
|
375
|
+
// Comms counter now resets on SEND via ANNOUNCE_TOOLS, not on READ
|
|
394
376
|
if (BROADCAST_CHECK_TOOLS.includes(toolName)) {
|
|
395
|
-
|
|
396
|
-
state.lastCommsCheck = Date.now();
|
|
397
|
-
state.needsCommsCheck = false;
|
|
398
|
-
// Also reset broadcast counter IF they included broadcasts
|
|
377
|
+
// Broadcast counter reset IF they included broadcasts
|
|
399
378
|
const params = data.tool_input || {};
|
|
400
379
|
if (params.include_broadcasts !== false) {
|
|
401
380
|
state.broadcastToolCount = 0;
|
|
@@ -431,16 +410,16 @@ process.stdin.on('end', () => {
|
|
|
431
410
|
state.helpToolUsageCount = (state.helpToolUsageCount || 0) + 1;
|
|
432
411
|
|
|
433
412
|
// ========================================================================
|
|
434
|
-
// HARD BLOCK: Must
|
|
435
|
-
//
|
|
413
|
+
// HARD BLOCK: Must send team message every 3 tool usages
|
|
414
|
+
// send_team_message() or broadcast_to_team() satisfies this
|
|
436
415
|
// ========================================================================
|
|
437
|
-
if (state.commsToolCount >= TEAM_COMMS_CHECK_INTERVAL && !
|
|
416
|
+
if (state.commsToolCount >= TEAM_COMMS_CHECK_INTERVAL && !ANNOUNCE_TOOLS.includes(toolName)) {
|
|
438
417
|
state.needsCommsCheck = true;
|
|
439
418
|
state.blockedCount++;
|
|
440
419
|
saveTracking(tracking);
|
|
441
420
|
console.log(blockResponse(
|
|
442
|
-
'
|
|
443
|
-
`
|
|
421
|
+
'mcp__specmem__send_team_message',
|
|
422
|
+
`Time to update the team on your progress. Call: send_team_message({type:"status", message:"[what you're doing / what you found]"})`
|
|
444
423
|
));
|
|
445
424
|
return;
|
|
446
425
|
}
|
|
@@ -488,7 +467,7 @@ process.stdin.on('end', () => {
|
|
|
488
467
|
// ========================================================================
|
|
489
468
|
if (state.commsToolCount === TEAM_COMMS_CHECK_INTERVAL - 1) {
|
|
490
469
|
console.log(allowWithReminder(
|
|
491
|
-
`Heads up — good time to
|
|
470
|
+
`Heads up — good time to update the team: send_team_message({type:"status", message:"[progress update]"})`
|
|
492
471
|
));
|
|
493
472
|
// Don't return - continue to other checks
|
|
494
473
|
}
|
|
@@ -609,18 +588,41 @@ process.stdin.on('end', () => {
|
|
|
609
588
|
}
|
|
610
589
|
|
|
611
590
|
// ========================================================================
|
|
612
|
-
// CLAIM RELEASE ENFORCEMENT — After
|
|
613
|
-
// Flow: claim_task → Edit/Write → release_task
|
|
591
|
+
// CLAIM RELEASE + NOTIFICATION ENFORCEMENT — After edit, BLOCK until release AND notify
|
|
592
|
+
// Flow: claim_task → Edit/Write → release_task + send_team_message → next task
|
|
614
593
|
// ========================================================================
|
|
615
594
|
if (state.editedFiles && state.editedFiles.length > 0 && state.claimed && !WRITE_TOOLS.includes(toolName)) {
|
|
616
|
-
|
|
617
|
-
|
|
595
|
+
const isReleaseTool = toolName === 'mcp__specmem__release_task';
|
|
596
|
+
const isNotifyTool = ANNOUNCE_TOOLS.includes(toolName);
|
|
597
|
+
|
|
598
|
+
// Track completion of release/notify obligations
|
|
599
|
+
if (isReleaseTool) state.releasedClaim = true;
|
|
600
|
+
if (isNotifyTool) state.releaseNotified = true;
|
|
601
|
+
|
|
602
|
+
// Both obligations met — clear state and continue
|
|
603
|
+
if (state.releasedClaim && state.releaseNotified) {
|
|
604
|
+
state.editedFiles = [];
|
|
605
|
+
state.releasedClaim = false;
|
|
606
|
+
state.releaseNotified = false;
|
|
607
|
+
state.claimed = false;
|
|
608
|
+
state.currentClaimId = null;
|
|
609
|
+
}
|
|
610
|
+
// Allow release/notify tools and always-allowed tools through
|
|
611
|
+
else if (!isReleaseTool && !isNotifyTool && !ALWAYS_ALLOWED.includes(toolName)) {
|
|
618
612
|
state.blockedCount++;
|
|
619
613
|
saveTracking(tracking);
|
|
620
|
-
|
|
621
|
-
|
|
622
|
-
|
|
623
|
-
|
|
614
|
+
|
|
615
|
+
if (!state.releasedClaim) {
|
|
616
|
+
console.log(blockResponse(
|
|
617
|
+
'mcp__specmem__release_task',
|
|
618
|
+
`Done editing ${state.editedFiles[state.editedFiles.length - 1]} — release the claim so others can work on it. Call: release_task({claimId:"${state.currentClaimId || 'your-claim-id'}"})`
|
|
619
|
+
));
|
|
620
|
+
} else {
|
|
621
|
+
console.log(blockResponse(
|
|
622
|
+
'mcp__specmem__send_team_message',
|
|
623
|
+
`Claim released — now notify the team about your changes. Call: send_team_message({type:"update", message:"Finished editing ${state.editedFiles[state.editedFiles.length - 1]}: [describe what you changed]"})`
|
|
624
|
+
));
|
|
625
|
+
}
|
|
624
626
|
return;
|
|
625
627
|
}
|
|
626
628
|
}
|
|
@@ -996,6 +996,11 @@ export function isToolOrThinkingContent(content) {
|
|
|
996
996
|
return true;
|
|
997
997
|
if (trimmed.startsWith('[Tool:'))
|
|
998
998
|
return true;
|
|
999
|
+
// Skip task/agent notification XML blocks — system noise, not conversation
|
|
1000
|
+
if (trimmed.startsWith('<task-notification>'))
|
|
1001
|
+
return true;
|
|
1002
|
+
if (trimmed.includes('<task-id>') && trimmed.includes('</task-id>'))
|
|
1003
|
+
return true;
|
|
999
1004
|
// Check for [CLAUDE] prefixed tool versions
|
|
1000
1005
|
if (trimmed.startsWith('[CLAUDE] [Tools:'))
|
|
1001
1006
|
return true;
|
|
@@ -28,6 +28,7 @@ import * as os from 'os';
|
|
|
28
28
|
import { v4 as uuidv4 } from 'uuid';
|
|
29
29
|
import chokidar from 'chokidar';
|
|
30
30
|
import { logger } from '../utils/logger.js';
|
|
31
|
+
import { extractPdfText, extractPdfBatch, isPdfFile } from './pdfExtractor.js';
|
|
31
32
|
import { getProjectPath } from '../config.js';
|
|
32
33
|
import { getCoordinator } from '../coordination/integration.js';
|
|
33
34
|
/**
|
|
@@ -36,15 +37,15 @@ import { getCoordinator } from '../coordination/integration.js';
|
|
|
36
37
|
*/
|
|
37
38
|
function loadResourceLimits() {
|
|
38
39
|
const limits = {
|
|
39
|
-
cpuMax:
|
|
40
|
+
cpuMax: 35, // max CPU % target (back-pressure threshold)
|
|
40
41
|
cpuMin: 10, // min CPU % (crawl mode)
|
|
41
|
-
ramMaxMb:
|
|
42
|
+
ramMaxMb: 4000, // max RAM MB (safe for 8GB laptops)
|
|
42
43
|
ramMinMb: 2000, // min RAM MB
|
|
43
44
|
batchSize: 25, // files per batch (was 200!)
|
|
44
|
-
maxConcurrency:
|
|
45
|
+
maxConcurrency: 4, // max parallel file reads (safe for dual-core i3s)
|
|
45
46
|
batchDelayMs: 50, // delay between batches (ms)
|
|
46
47
|
batchDelayMaxMs: 2000, // max delay under heavy load
|
|
47
|
-
cpuCoreMax:
|
|
48
|
+
cpuCoreMax: 2, // max CPU cores (safe for dual-core i3s)
|
|
48
49
|
};
|
|
49
50
|
// 1. Read from model-config.json
|
|
50
51
|
try {
|
|
@@ -177,7 +178,8 @@ const DEFAULT_CONFIG = {
|
|
|
177
178
|
'.c', '.cpp', '.h', '.hpp',
|
|
178
179
|
'.swift',
|
|
179
180
|
'.dockerfile', 'Dockerfile',
|
|
180
|
-
'.env.example', '.env.template'
|
|
181
|
+
'.env.example', '.env.template',
|
|
182
|
+
'.pdf'
|
|
181
183
|
],
|
|
182
184
|
maxFileSizeBytes: 1024 * 1024, // 1MB
|
|
183
185
|
generateEmbeddings: true,
|
|
@@ -444,16 +446,25 @@ export class CodebaseIndexer {
|
|
|
444
446
|
const stats = await fs.stat(filePath);
|
|
445
447
|
if (stats.size > this.config.maxFileSizeBytes)
|
|
446
448
|
return;
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
|
|
449
|
+
// PDF files: extract text via PyMuPDF instead of reading as UTF-8
|
|
450
|
+
let content;
|
|
451
|
+
if (isPdfFile(filePath)) {
|
|
452
|
+
const pdfResult = await extractPdfText(filePath);
|
|
453
|
+
if (!pdfResult || !pdfResult.text) return;
|
|
454
|
+
content = pdfResult.text;
|
|
455
|
+
logger.debug({ filePath: relativePath, pages: pdfResult.pages, chars: pdfResult.chars }, 'PDF text extracted');
|
|
456
|
+
} else {
|
|
457
|
+
if (await this.isBinaryFile(filePath))
|
|
458
|
+
return;
|
|
459
|
+
content = await fs.readFile(filePath, 'utf-8');
|
|
460
|
+
}
|
|
450
461
|
const contentHash = this.hashContent(content);
|
|
451
462
|
const existingHash = existingHashes.get(relativePath);
|
|
452
463
|
if (existingHash === contentHash) {
|
|
453
464
|
skipped++;
|
|
454
465
|
return;
|
|
455
466
|
}
|
|
456
|
-
const indexedFile = await this.indexFile(filePath);
|
|
467
|
+
const indexedFile = await this.indexFile(filePath, isPdfFile(filePath) ? content : undefined);
|
|
457
468
|
if (indexedFile) {
|
|
458
469
|
this.index.set(indexedFile.filePath, indexedFile);
|
|
459
470
|
changedFiles.push(indexedFile);
|
|
@@ -616,9 +627,17 @@ export class CodebaseIndexer {
|
|
|
616
627
|
if (existing && existing.mtime && stats.mtime.getTime() <= existing.mtime) {
|
|
617
628
|
return { skipped: true, relativePath, mtimeSkip: true };
|
|
618
629
|
}
|
|
619
|
-
|
|
620
|
-
|
|
621
|
-
|
|
630
|
+
// PDF files: extract text via PyMuPDF instead of reading as UTF-8
|
|
631
|
+
let content;
|
|
632
|
+
if (isPdfFile(filePath)) {
|
|
633
|
+
const pdfResult = await extractPdfText(filePath);
|
|
634
|
+
if (!pdfResult || !pdfResult.text) return null;
|
|
635
|
+
content = pdfResult.text;
|
|
636
|
+
} else {
|
|
637
|
+
if (await this.isBinaryFile(filePath))
|
|
638
|
+
return null;
|
|
639
|
+
content = await fs.readFile(filePath, 'utf-8');
|
|
640
|
+
}
|
|
622
641
|
const contentHash = this.hashContent(content);
|
|
623
642
|
if (existing && existing.hash === contentHash) {
|
|
624
643
|
return { skipped: true, relativePath, hashSkip: true };
|
|
@@ -1178,7 +1197,7 @@ export class CodebaseIndexer {
|
|
|
1178
1197
|
/**
|
|
1179
1198
|
* indexFile - reads and indexes a single file with enhanced analysis
|
|
1180
1199
|
*/
|
|
1181
|
-
async indexFile(absolutePath) {
|
|
1200
|
+
async indexFile(absolutePath, preExtractedContent) {
|
|
1182
1201
|
try {
|
|
1183
1202
|
const stats = await fs.stat(absolutePath);
|
|
1184
1203
|
// skip if too large
|
|
@@ -1186,11 +1205,23 @@ export class CodebaseIndexer {
|
|
|
1186
1205
|
logger.debug({ path: absolutePath, size: stats.size }, 'skipping large file');
|
|
1187
1206
|
return null;
|
|
1188
1207
|
}
|
|
1189
|
-
//
|
|
1190
|
-
|
|
1191
|
-
|
|
1208
|
+
// PDF files: use pre-extracted content or extract on demand
|
|
1209
|
+
let content;
|
|
1210
|
+
if (isPdfFile(absolutePath)) {
|
|
1211
|
+
if (preExtractedContent) {
|
|
1212
|
+
content = preExtractedContent;
|
|
1213
|
+
} else {
|
|
1214
|
+
const pdfResult = await extractPdfText(absolutePath);
|
|
1215
|
+
if (!pdfResult || !pdfResult.text) return null;
|
|
1216
|
+
content = pdfResult.text;
|
|
1217
|
+
}
|
|
1218
|
+
} else {
|
|
1219
|
+
// skip if binary
|
|
1220
|
+
if (await this.isBinaryFile(absolutePath)) {
|
|
1221
|
+
return null;
|
|
1222
|
+
}
|
|
1223
|
+
content = await fs.readFile(absolutePath, 'utf-8');
|
|
1192
1224
|
}
|
|
1193
|
-
const content = await fs.readFile(absolutePath, 'utf-8');
|
|
1194
1225
|
const relativePath = path.relative(this.config.codebasePath, absolutePath);
|
|
1195
1226
|
const fileName = path.basename(absolutePath);
|
|
1196
1227
|
const extension = path.extname(absolutePath).toLowerCase();
|
|
@@ -47,7 +47,7 @@ export const EXCLUSION_CONFIG = {
|
|
|
47
47
|
'*.db',
|
|
48
48
|
// Binary assets
|
|
49
49
|
'*.png', '*.jpg', '*.jpeg', '*.gif', '*.ico', '*.webp',
|
|
50
|
-
'*.
|
|
50
|
+
'*.zip', '*.tar', '*.gz', '*.rar', '*.7z',
|
|
51
51
|
'*.mp3', '*.mp4', '*.avi', '*.mov', '*.mkv',
|
|
52
52
|
'*.ttf', '*.woff', '*.woff2', '*.eot', '*.otf',
|
|
53
53
|
'*.exe', '*.dll', '*.so', '*.dylib', '*.bin',
|
|
@@ -145,7 +145,6 @@ const DEFAULT_EXCLUSIONS = [
|
|
|
145
145
|
'*.mp4',
|
|
146
146
|
'*.avi',
|
|
147
147
|
'*.mov',
|
|
148
|
-
'*.pdf',
|
|
149
148
|
'*.zip',
|
|
150
149
|
'*.tar',
|
|
151
150
|
'*.gz',
|
|
@@ -547,8 +546,8 @@ const BINARY_EXTENSIONS = new Set([
|
|
|
547
546
|
'.zip', '.tar', '.gz', '.bz2', '.7z', '.rar', '.xz', '.lz', '.lzma',
|
|
548
547
|
// executables and libraries
|
|
549
548
|
'.exe', '.dll', '.so', '.dylib', '.bin', '.out', '.app', '.msi', '.deb', '.rpm',
|
|
550
|
-
// documents (binary formats)
|
|
551
|
-
'.
|
|
549
|
+
// documents (binary formats — PDF handled by pdfExtractor.js)
|
|
550
|
+
'.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx', '.odt', '.ods', '.odp',
|
|
552
551
|
// fonts
|
|
553
552
|
'.ttf', '.otf', '.woff', '.woff2', '.eot',
|
|
554
553
|
// databases
|
package/dist/codebase/index.js
CHANGED
|
@@ -6,6 +6,10 @@
|
|
|
6
6
|
// ========================================
|
|
7
7
|
export { SkipTheBoringShit, isBinaryFile, getFileSizeBytes, getExclusionHandler, resetExclusionHandler, DEFAULT_EXCLUSIONS } from './exclusions.js';
|
|
8
8
|
// ========================================
|
|
9
|
+
// PDF EXTRACTION - pdfExtractor
|
|
10
|
+
// ========================================
|
|
11
|
+
export { extractPdfText, extractPdfBatch, isPdfFile, isPdfExtractionAvailable } from './pdfExtractor.js';
|
|
12
|
+
// ========================================
|
|
9
13
|
// LANGUAGE DETECTION - whatLanguageIsThis
|
|
10
14
|
// ========================================
|
|
11
15
|
export { WhatLanguageIsThis, getLanguageDetector, resetLanguageDetector, LANGUAGE_REGISTRY, EXTENSION_INDEX, FILENAME_MAPPINGS } from './languageDetection.js';
|
|
@@ -0,0 +1,298 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* pdfExtractor.js — PDF text extraction for codebase indexing
|
|
3
|
+
*
|
|
4
|
+
* Spawns pdf-text-extract.py (PyMuPDF) as a child process.
|
|
5
|
+
* Digital PDFs: instant text extraction (0.003s/page).
|
|
6
|
+
* Scanned PDFs: Tesseract OCR fallback (1-3s/page).
|
|
7
|
+
*
|
|
8
|
+
* Used by codebaseIndexer.js and changeHandler.js to index PDF files
|
|
9
|
+
* alongside regular source code.
|
|
10
|
+
*/
|
|
11
|
+
import { spawn } from 'child_process';
import { existsSync } from 'fs';
import { join, dirname } from 'path';
import { fileURLToPath } from 'url';
import { logger } from '../utils/logger.js';
import { getPythonPath } from '../utils/projectEnv.js';
|
|
16
|
+
|
|
17
|
+
const PDF_EXTRACT_TIMEOUT_MS = 60000; // 60s per single extraction (generous for large scanned PDFs); also the base of the batch timeout
|
|
18
|
+
const MAX_PDF_PAGES = 100; // default page cap per PDF, forwarded to the script as --max-pages
|
|
19
|
+
|
|
20
|
+
// Script path found by findPdfExtractScript(); re-checked with existsSync() on each call
|
|
|
21
|
+
let _cachedScriptPath = null;
|
|
22
|
+
let _pymupdfAvailable = null; // null = unknown; true after the first successful extraction; false once stderr reports "pymupdf not found"
|
|
23
|
+
|
|
24
|
+
/**
 * Locate the bundled pdf-text-extract.py helper script.
 *
 * Candidate locations, checked in order:
 *   1. <package root>/embedding-sandbox/pdf-text-extract.py
 *   2. <package root>/pdf-text-extract.py (flat layout)
 *   3. the global npm install directory derived from the Node binary path
 *
 * The first existing candidate is stored in `_cachedScriptPath`. The cached
 * value is re-validated with existsSync() on every call, so a script that
 * disappears is searched for again.
 *
 * @returns {string | null} Path to the script, or null (after logging a
 *   warning listing the searched paths) when no candidate exists.
 */
function findPdfExtractScript() {
    if (_cachedScriptPath && existsSync(_cachedScriptPath)) {
        return _cachedScriptPath;
    }

    // ES modules have no `__dirname` binding; referencing it throws a
    // ReferenceError. Derive this module's directory from import.meta.url.
    // Two levels above that directory is treated as the package root.
    const moduleDir = dirname(fileURLToPath(import.meta.url));
    const specmemRoot = dirname(dirname(moduleDir));

    const candidates = [
        // SpecMem package root (works for all installs)
        join(specmemRoot, 'embedding-sandbox', 'pdf-text-extract.py'),
        // Direct package root (flat layout)
        join(specmemRoot, 'pdf-text-extract.py'),
        // Global npm install fallback
        join(dirname(dirname(process.execPath)), 'lib', 'node_modules', 'specmem-hardwicksoftware', 'embedding-sandbox', 'pdf-text-extract.py'),
    ];

    const found = candidates.find((candidate) => existsSync(candidate));
    if (found) {
        _cachedScriptPath = found;
        logger.debug({ path: found }, '[pdfExtractor] Found pdf-text-extract.py');
        return found;
    }

    logger.warn({ searchedPaths: candidates }, '[pdfExtractor] pdf-text-extract.py not found');
    return null;
}
|
|
54
|
+
|
|
55
|
+
/**
 * Extract text from a single PDF by running pdf-text-extract.py in a child process.
 *
 * The script is invoked as
 *   <python> <script> <filePath> --max-pages <n> --language <lang>
 * and is expected to print one JSON object on stdout.
 *
 * Failure modes handled here are logged and resolve to null: script not
 * found, Python could not be spawned, timeout (child is SIGKILLed after
 * PDF_EXTRACT_TIMEOUT_MS), non-zero exit code, unparsable stdout, or an
 * `error` field in the JSON. If stderr contains "pymupdf not found",
 * `_pymupdfAvailable` is set to false and later calls return null immediately.
 *
 * @param {string} filePath - Absolute path to the PDF file
 * @param {object} [options]
 * @param {number} [options.maxPages=MAX_PDF_PAGES] - forwarded as --max-pages
 * @param {string} [options.language='eng'] - forwarded as --language
 *   (presumably the OCR language; confirm against pdf-text-extract.py)
 * @returns {Promise<{text: string, pages: number, chars: number, scannedPages: number[], truncated: boolean, totalPages: number} | null>}
 *   Extraction result, or null if extraction failed or pymupdf is unavailable.
 */
export async function extractPdfText(filePath, options = {}) {
    const { maxPages = MAX_PDF_PAGES, language = 'eng' } = options;

    // Fast bail if we already know pymupdf is unavailable
    if (_pymupdfAvailable === false) {
        return null;
    }

    const scriptPath = findPdfExtractScript();
    if (!scriptPath) {
        logger.warn('[pdfExtractor] Cannot extract PDF — script not found');
        return null;
    }

    const pythonPath = getPythonPath();

    return new Promise((resolve) => {
        const args = [scriptPath, filePath, '--max-pages', String(maxPages), '--language', language];
        let stdout = '';
        let stderr = '';
        let settled = false;
        let timeoutId = null;

        // Resolve at most once; 'error', 'close' and the timer can all fire.
        const settle = (value) => {
            if (settled) {
                return;
            }
            settled = true;
            clearTimeout(timeoutId);
            resolve(value);
        };

        // No spawn-level `timeout` option: the timer below is the single
        // source of truth, so a timeout is always logged as a timeout.
        const proc = spawn(pythonPath, args, {
            stdio: ['ignore', 'pipe', 'pipe'],
            env: { ...process.env },
        });

        // Decode as a stream so a multi-byte UTF-8 character split across two
        // pipe chunks is reassembled instead of becoming replacement characters.
        proc.stdout.setEncoding('utf8');
        proc.stderr.setEncoding('utf8');

        timeoutId = setTimeout(() => {
            if (settled) {
                return;
            }
            proc.kill('SIGKILL');
            logger.warn({ filePath, timeoutMs: PDF_EXTRACT_TIMEOUT_MS }, '[pdfExtractor] PDF extraction timed out');
            settle(null);
        }, PDF_EXTRACT_TIMEOUT_MS);

        proc.stdout.on('data', (chunk) => { stdout += chunk; });
        proc.stderr.on('data', (chunk) => { stderr += chunk; });

        proc.on('close', (code) => {
            if (settled) {
                return;
            }

            if (stderr.includes('pymupdf not found')) {
                _pymupdfAvailable = false;
                logger.warn('[pdfExtractor] pymupdf not installed — PDF indexing disabled');
                settle(null);
                return;
            }

            if (code !== 0) {
                logger.warn({ filePath, code, stderr: stderr.slice(0, 200) }, '[pdfExtractor] PDF extraction failed');
                settle(null);
                return;
            }

            try {
                const result = JSON.parse(stdout.trim());
                if (result.error) {
                    logger.warn({ filePath, error: result.error }, '[pdfExtractor] PDF extraction error');
                    settle(null);
                    return;
                }

                // Mark pymupdf as available on first success
                if (_pymupdfAvailable === null) {
                    _pymupdfAvailable = true;
                }

                settle({
                    text: result.text,
                    pages: result.pages,
                    chars: result.chars,
                    scannedPages: result.scanned_pages || [],
                    truncated: result.truncated || false,
                    totalPages: result.total_pages || result.pages,
                });
            } catch (parseErr) {
                logger.warn({ filePath, stdout: stdout.slice(0, 200), error: parseErr.message }, '[pdfExtractor] Failed to parse extraction result');
                settle(null);
            }
        });

        proc.on('error', (err) => {
            if (settled) {
                return;
            }
            logger.warn({ filePath, error: err.message }, '[pdfExtractor] Failed to spawn Python');
            settle(null);
        });
    });
}
|
|
158
|
+
|
|
159
|
+
/**
 * Extract text from multiple PDFs in a single Python process (batch mode).
 *
 * One interpreter startup for N PDFs. The script is invoked as
 *   <python> <script> --batch <path>... --max-pages <n> --language <lang>
 * and is expected to print JSONL on stdout: one object per PDF carrying a
 * `path` field plus either the extraction fields or an `error` field.
 *
 * Every requested path is present in the returned Map. A path maps to null
 * when its record is missing, unparsable, or carries an error, when Python
 * could not be spawned, or when pymupdf is unavailable. On timeout the child
 * is SIGKILLed and any complete JSONL records already received are kept; only
 * the remaining paths are reported as null.
 *
 * @param {string[]} filePaths - Absolute paths to PDF files
 * @param {object} [options]
 * @param {number} [options.maxPages=MAX_PDF_PAGES] - forwarded as --max-pages
 * @param {string} [options.language='eng'] - forwarded as --language
 * @returns {Promise<Map<string, {text: string, pages: number, chars: number, scannedPages: number[], truncated: boolean, totalPages: number} | null>>}
 */
export async function extractPdfBatch(filePaths, options = {}) {
    const { maxPages = MAX_PDF_PAGES, language = 'eng' } = options;
    const results = new Map();

    if (!filePaths.length) return results;

    // Give every requested path an entry; paths without a result become null.
    const markMissingAsFailed = () => {
        for (const fp of filePaths) {
            if (!results.has(fp)) results.set(fp, null);
        }
    };

    // Fast bail if pymupdf is known unavailable
    if (_pymupdfAvailable === false) {
        markMissingAsFailed();
        return results;
    }

    const scriptPath = findPdfExtractScript();
    if (!scriptPath) {
        markMissingAsFailed();
        return results;
    }

    const pythonPath = getPythonPath();
    // Batch timeout: 60s base + 10s per PDF (scanned pages take 1-3s each)
    const batchTimeout = PDF_EXTRACT_TIMEOUT_MS + (filePaths.length * 10000);

    return new Promise((resolve) => {
        const args = [scriptPath, '--batch', ...filePaths, '--max-pages', String(maxPages), '--language', language];
        let stdout = '';
        let stderr = '';
        let settled = false;
        let timeoutId = null;

        // Resolve at most once; 'error', 'close' and the timer can all fire.
        const settle = () => {
            if (settled) {
                return;
            }
            settled = true;
            clearTimeout(timeoutId);
            markMissingAsFailed();
            resolve(results);
        };

        // Fold every complete JSONL record in `text` into `results`.
        // Unparsable lines (e.g. a record cut short by a kill) are skipped.
        const absorbJsonl = (text) => {
            for (const line of text.split('\n')) {
                if (!line.trim()) continue;
                try {
                    const record = JSON.parse(line);
                    const pdfPath = record.path;
                    if (!pdfPath) continue;

                    if (record.error) {
                        logger.warn({ path: pdfPath, error: record.error }, '[pdfExtractor] PDF extraction error');
                        results.set(pdfPath, null);
                        continue;
                    }

                    if (_pymupdfAvailable === null) _pymupdfAvailable = true;

                    results.set(pdfPath, {
                        text: record.text,
                        pages: record.pages,
                        chars: record.chars,
                        scannedPages: record.scanned_pages || [],
                        truncated: record.truncated || false,
                        totalPages: record.total_pages || record.pages,
                    });
                } catch (parseErr) {
                    logger.debug({ line: line.slice(0, 100), error: parseErr.message }, '[pdfExtractor] Failed to parse JSONL line');
                }
            }
        };

        const proc = spawn(pythonPath, args, {
            stdio: ['ignore', 'pipe', 'pipe'],
            env: { ...process.env },
        });

        // Decode as a stream so a multi-byte UTF-8 character split across two
        // pipe chunks is reassembled instead of becoming replacement characters.
        proc.stdout.setEncoding('utf8');
        proc.stderr.setEncoding('utf8');

        timeoutId = setTimeout(() => {
            if (settled) {
                return;
            }
            proc.kill('SIGKILL');
            logger.warn({ count: filePaths.length, timeoutMs: batchTimeout }, '[pdfExtractor] Batch extraction timed out');
            // Keep whatever finished before the deadline; the rest become null.
            absorbJsonl(stdout);
            settle();
        }, batchTimeout);

        proc.stdout.on('data', (chunk) => { stdout += chunk; });
        proc.stderr.on('data', (chunk) => { stderr += chunk; });

        proc.on('close', (code) => {
            if (settled) {
                return;
            }

            if (stderr.includes('pymupdf not found')) {
                _pymupdfAvailable = false;
                logger.warn('[pdfExtractor] pymupdf not installed — PDF indexing disabled');
                settle();
                return;
            }

            if (code !== 0) {
                // Still parse below: records printed before the failure are valid.
                logger.warn({ code, stderr: stderr.slice(0, 200) }, '[pdfExtractor] Batch extraction exited abnormally');
            }

            // Parse JSONL — one JSON object per line
            absorbJsonl(stdout);
            markMissingAsFailed();

            let extracted = 0;
            for (const value of results.values()) {
                if (value !== null) extracted += 1;
            }
            logger.info({
                total: filePaths.length,
                extracted,
                failed: results.size - extracted,
            }, '[pdfExtractor] Batch extraction complete');

            settle();
        });

        proc.on('error', (err) => {
            if (settled) {
                return;
            }
            logger.warn({ error: err.message }, '[pdfExtractor] Failed to spawn Python for batch');
            settle();
        });
    });
}
|
|
284
|
+
|
|
285
|
+
/**
 * Report whether PDF extraction can be attempted.
 *
 * This is optimistic: it is true whenever the extraction script can be
 * located and no earlier run has set `_pymupdfAvailable` to false. While
 * pymupdf's status is still unknown (null), it counts as available.
 *
 * @returns {boolean} false if pymupdf is known to be missing or the script
 *   cannot be found; true otherwise.
 */
export function isPdfExtractionAvailable() {
    const pymupdfKnownMissing = _pymupdfAvailable === false;
    // Short-circuit keeps the script lookup from running once pymupdf is ruled out.
    return !pymupdfKnownMissing && findPdfExtractScript() !== null;
}
|
|
292
|
+
|
|
293
|
+
/**
 * Check whether a path names a PDF, judged only by its extension.
 *
 * The comparison is case-insensitive and looks solely at the trailing
 * ".pdf"; file contents are never inspected.
 *
 * @param {unknown} filePath - Path to test. Non-string values are not PDFs.
 * @returns {boolean} true when `filePath` is a string ending in ".pdf".
 */
export function isPdfFile(filePath) {
    // A predicate should answer false, not throw, when handed null/undefined.
    return typeof filePath === 'string' && filePath.toLowerCase().endsWith('.pdf');
}
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
/**
|
|
2
|
-
* taskTeamMembers.ts - API endpoints for
|
|
2
|
+
* taskTeamMembers.ts - API endpoints for Agent team member tracking
|
|
3
3
|
*
|
|
4
|
-
* yo fr fr this lets you view and manually log
|
|
4
|
+
* yo fr fr this lets you view and manually log Agent team members
|
|
5
5
|
*/
|
|
6
6
|
import { Router } from 'express';
|
|
7
7
|
import { z } from 'zod';
|