specmem-hardwicksoftware 3.7.35 → 3.7.38
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +34 -0
- package/README.md +11 -15
- package/bin/specmem-autoclaude.cjs +12 -1
- package/bin/specmem-cli.cjs +1077 -11
- package/bin/specmem-console.cjs +890 -63
- package/bootstrap.cjs +10 -2
- package/claude-hooks/agent-loading-hook.cjs +16 -16
- package/claude-hooks/agent-loading-hook.js +28 -21
- package/claude-hooks/agent-type-matcher.js +1 -1
- package/claude-hooks/background-completion-silencer.js +1 -1
- package/claude-hooks/file-claim-enforcer.cjs +37 -36
- package/claude-hooks/output-cleaner.cjs +1 -1
- package/claude-hooks/refusal-detector-hook.cjs +53 -0
- package/claude-hooks/settings.json +64 -4
- package/claude-hooks/smart-search-interceptor.js +1 -1
- package/claude-hooks/specmem-search-enforcer.cjs +2 -11
- package/claude-hooks/specmem-team-member-inject.js +1 -1
- package/claude-hooks/specmem-unified-hook.py +1 -1
- package/claude-hooks/subagent-loading-hook.cjs +1 -1
- package/claude-hooks/task-progress-hook.cjs +7 -7
- package/claude-hooks/task-progress-hook.js +3 -3
- package/claude-hooks/team-comms-enforcer.cjs +113 -47
- package/claude-hooks/use-code-pointers.cjs +1 -1
- package/dist/claude-sessions/sessionParser.js +5 -0
- package/dist/cli/deploy-to-claude.js +9 -2
- package/dist/codebase/codebaseIndexer.js +48 -17
- package/dist/codebase/exclusions.js +3 -4
- package/dist/codebase/index.js +4 -0
- package/dist/codebase/pdfExtractor.js +298 -0
- package/dist/dashboard/api/taskTeamMembers.js +2 -2
- package/dist/db/bigBrainMigrations.js +29 -0
- package/dist/hooks/hookManager.js +4 -4
- package/dist/hooks/teamFramingCli.js +1 -1
- package/dist/hooks/teamMemberPrepromptHook.js +5 -5
- package/dist/index.js +49 -12
- package/dist/init/claudeConfigInjector.js +27 -8
- package/dist/installer/autoInstall.js +7 -1
- package/dist/mcp/compactionProxy.js +1052 -192
- package/dist/mcp/compactionProxyDaemon.js +112 -37
- package/dist/mcp/contextVault.js +439 -0
- package/dist/mcp/embeddingServerManager.js +151 -17
- package/dist/mcp/mcpProtocolHandler.js +6 -1
- package/dist/mcp/miniCOTServerManager.js +82 -8
- package/dist/mcp/specMemServer.js +45 -10
- package/dist/mcp/toolRegistry.js +6 -0
- package/dist/startup/startupIndexing.js +14 -0
- package/dist/team-members/taskOrchestrator.js +3 -3
- package/dist/team-members/taskTeamMemberLogger.js +2 -2
- package/dist/tools/goofy/deployTeamMember.js +3 -3
- package/dist/tools/goofy/digInTheVault.js +81 -0
- package/dist/tools/goofy/findCodePointers.js +17 -0
- package/dist/tools/goofy/findWhatISaid.js +19 -0
- package/dist/tools/goofy/stashTheGoods.js +56 -0
- package/dist/tools/teamMemberDeployer.js +2 -2
- package/dist/watcher/changeHandler.js +65 -8
- package/dist/watcher/changeQueue.js +20 -1
- package/embedding-sandbox/frankenstein-embeddings.py +4 -3
- package/embedding-sandbox/mini-cot-service.py +11 -13
- package/embedding-sandbox/pdf-text-extract.py +208 -0
- package/package.json +1 -1
- package/scripts/deploy-hooks.cjs +12 -4
- package/scripts/fast-batch-embedder.cjs +2 -2
- package/scripts/force-retry.cjs +34 -0
- package/scripts/global-postinstall.cjs +97 -4
- package/scripts/poetic-abliteration.cjs +379 -0
- package/scripts/refusal-enforcer.cjs +88 -0
- package/scripts/specmem-init.cjs +222 -41
- package/specmem/model-config.json +6 -6
- package/specmem/supervisord.conf +1 -1
- package/svg-sections/readme-token-compaction.svg +246 -0
- package/claude-hooks/agent-chooser-hook.js +0 -179
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
/**
|
|
2
|
-
* taskTeamMemberLogger.ts - Logs Code
|
|
2
|
+
* taskTeamMemberLogger.ts - Logs Code Agent team member activity to SpecMem database
|
|
3
3
|
*
|
|
4
|
-
* yo fr fr this bridges the gap between Code's
|
|
4
|
+
* yo fr fr this bridges the gap between Code's Agent tool and SpecMem tracking
|
|
5
5
|
*
|
|
6
6
|
* Problem: Task-deployed team members are invisible to SpecMem dashboard
|
|
7
7
|
* Solution: Log team member activity before/after Task deployment
|
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
/**
|
|
2
|
-
* deployTeamMember - The
|
|
2
|
+
* deployTeamMember - The Agent tool but actually works with MCP
|
|
3
3
|
*
|
|
4
4
|
* Spawns team members in screen sessions with full SpecMem MCP access.
|
|
5
|
-
* This is basically skidding Code's
|
|
5
|
+
* This is basically skidding Code's Agent tool but making it not suck.
|
|
6
6
|
*/
|
|
7
7
|
import { deployTeamMember as deployTeamMemberImpl } from '../teamMemberDeployer.js';
|
|
8
8
|
import { logger } from '../../utils/logger.js';
|
|
@@ -10,7 +10,7 @@ export class DeployTeamMember {
|
|
|
10
10
|
name = 'deployTeamMember';
|
|
11
11
|
description = `Deploy a team member with full SpecMem MCP access.
|
|
12
12
|
|
|
13
|
-
This is like the
|
|
13
|
+
This is like the Agent tool but actually works - spawned team members get full access
|
|
14
14
|
to all SpecMem MCP tools including team member communication (sayToTeamMember, listenForMessages,
|
|
15
15
|
sendHeartbeat, getActiveTeamMembers).
|
|
16
16
|
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* digInTheVault - search the context vault for stashed content
|
|
3
|
+
*
|
|
4
|
+
* when you need to find something in all that stashed content
|
|
5
|
+
* without loading it all into context. BM25-ranked via tsvector.
|
|
6
|
+
*
|
|
7
|
+
* supports:
|
|
8
|
+
* - query search across all vaults (project-scoped)
|
|
9
|
+
* - targeted search within a specific vault_id
|
|
10
|
+
* - full dump of a vault_id (get_all mode)
|
|
11
|
+
* - stats mode (no query = return vault statistics)
|
|
12
|
+
*/
|
|
13
|
+
|
|
14
|
+
import { digInTheVault as doTheDig, getFullStash, getVaultStats } from '../../mcp/contextVault.js';
|
|
15
|
+
|
|
16
|
+
/**
 * Escape a value so it can be embedded inside a double-quoted XML attribute.
 *
 * The vault_dig status strings below are XML-ish self-closing tags; caller
 * supplied text (query, vault_id, error messages) may contain quotes or angle
 * brackets that would otherwise break the markup.
 *
 * @param {unknown} value - Any value; it is stringified first.
 * @returns {string} The attribute-safe text.
 */
function escapeVaultDigAttr(value) {
    return String(value)
        .replace(/&/g, '&amp;')
        .replace(/"/g, '&quot;')
        .replace(/</g, '&lt;')
        .replace(/>/g, '&gt;');
}

/**
 * MCP tool wrapper around the context-vault search helpers.
 *
 * Modes, in the order they are checked by execute():
 *   1. get_all + vault_id -> return whatever getFullStash(vault_id) yields
 *   2. no query           -> return getVaultStats() as pretty-printed JSON
 *   3. otherwise          -> ranked search via doTheDig(query, vault_id, limit)
 */
export class DigInTheVault {
    name = 'dig_in_the_vault';
    description = 'Search the context vault for previously stashed content. Returns BM25-ranked chunks matching your query. Use vault_id to search a specific stash, or omit to search all. Use get_all:true to retrieve everything from a vault.';

    inputSchema = {
        type: 'object',
        properties: {
            query: {
                type: 'string',
                description: 'What to search for in the vault'
            },
            vault_id: {
                type: 'string',
                description: 'Search within a specific stash (the id from vault_receipt)'
            },
            limit: {
                type: 'number',
                description: 'Max results to return (default: 10)',
                default: 10
            },
            get_all: {
                type: 'boolean',
                description: 'Retrieve ALL chunks for a vault_id in order (ignores query)',
                default: false
            }
        }
    };

    /**
     * Run the tool.
     *
     * @param {{query?: string, vault_id?: string, limit?: number, get_all?: boolean}} params
     * @returns {Promise<string>} Stash content, stats JSON, formatted search hits,
     *   or a `<vault_dig ok="false" .../>` status tag. Errors never propagate;
     *   they are reported in the status tag instead.
     */
    async execute(params) {
        const { query, vault_id, limit = 10, get_all = false } = params;

        try {
            // get_all mode: dump full stash content
            if (get_all && vault_id) {
                const full = await getFullStash(vault_id);
                if (!full) {
                    return `<vault_dig ok="false" error="vault ${escapeVaultDigAttr(vault_id)} not found or expired"/>`;
                }
                return full;
            }

            // No query = return vault stats
            if (!query) {
                const stats = await getVaultStats();
                return JSON.stringify(stats, null, 2);
            }

            // BM25-ranked search
            const results = await doTheDig(query, vault_id || null, limit);

            if (results.length === 0) {
                // Escape caller-supplied text so quotes in a query cannot break the tag.
                const safeQuery = escapeVaultDigAttr(query);
                return vault_id
                    ? `<vault_dig ok="false" vault_id="${escapeVaultDigAttr(vault_id)}" query="${safeQuery}" error="no matches — try different terms or get_all:true"/>`
                    : `<vault_dig ok="false" query="${safeQuery}" error="no matches in any vault"/>`;
            }

            // Format results compactly: one header line per hit, then its content
            const lines = results.map((r, i) => {
                const header = `── [${i + 1}] vault:${r.vault_id} chunk:${r.chunk_idx} rank:${parseFloat(r.rank).toFixed(3)} ──`;
                return `${header}\n${r.content}`;
            });

            return lines.join('\n\n');
        } catch (err) {
            // Non-Error throwables have no .message; fall back to their string form.
            const message = err instanceof Error ? err.message : String(err);
            return `<vault_dig ok="false" error="${escapeVaultDigAttr(message)}"/>`;
        }
    }
}
|
|
@@ -410,6 +410,23 @@ export class FindCodePointers {
|
|
|
410
410
|
attribution: SPECMEM_ATTRIBUTION
|
|
411
411
|
};
|
|
412
412
|
}
|
|
413
|
+
// FAST FAIL: Invalid query detection - return immediately with format hint
|
|
414
|
+
const queryTrimmed = params.query.trim();
|
|
415
|
+
// Check for natural language questions (not code terms)
|
|
416
|
+
const isQuestion = /^(how|what|why|when|where|who|can|does|is|should|would|could)\s/i.test(queryTrimmed);
|
|
417
|
+
if (isQuestion && queryTrimmed.length > 50) {
|
|
418
|
+
// Likely a natural language question instead of code terms - fail fast with hint
|
|
419
|
+
logger.warn({ query: params.query }, '[CodePointers] Invalid query format detected - failing fast with hint');
|
|
420
|
+
return {
|
|
421
|
+
results: [],
|
|
422
|
+
query: params.query,
|
|
423
|
+
total_found: 0,
|
|
424
|
+
search_type: 'semantic',
|
|
425
|
+
attribution: SPECMEM_ATTRIBUTION,
|
|
426
|
+
error: 'INVALID_QUERY_FORMAT',
|
|
427
|
+
error_hint: 'Query appears to be a natural language question. For code search, use CODE TERMS like "admin login auth" NOT "how does admin login work". See /specmem/HOW_TO_USE.md for examples.'
|
|
428
|
+
};
|
|
429
|
+
}
|
|
413
430
|
// MODE SELECTION: Return options if user wants to choose
|
|
414
431
|
if (params.galleryMode === 'ask') {
|
|
415
432
|
return this.returnModeOptions(params.query);
|
|
@@ -690,6 +690,25 @@ export class FindWhatISaid {
|
|
|
690
690
|
highlights: []
|
|
691
691
|
}];
|
|
692
692
|
}
|
|
693
|
+
// FAST FAIL: Whitespace-only or clearly invalid query
|
|
694
|
+
const queryTrimmed = params.query.trim();
|
|
695
|
+
if (queryTrimmed.length < 2) {
|
|
696
|
+
logger.warn({ query: params.query }, '[find_memory] Query too short - failing fast');
|
|
697
|
+
return [{
|
|
698
|
+
memory: {
|
|
699
|
+
id: 'error',
|
|
700
|
+
content: 'Query too short. Provide at least 2 characters for meaningful search.',
|
|
701
|
+
createdAt: new Date(),
|
|
702
|
+
updatedAt: new Date(),
|
|
703
|
+
tags: ['error'],
|
|
704
|
+
importance: 'low',
|
|
705
|
+
memoryType: 'semantic',
|
|
706
|
+
metadata: { _isError: true }
|
|
707
|
+
},
|
|
708
|
+
similarity: 0,
|
|
709
|
+
highlights: []
|
|
710
|
+
}];
|
|
711
|
+
}
|
|
693
712
|
logger.debug({ query: params.query, limit: params.limit }, 'searching memories fr');
|
|
694
713
|
// Broadcast COT to dashboard
|
|
695
714
|
cotStart('find_memory', params.query || 'browsing');
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* stashTheGoods - manually vault content for later retrieval
|
|
3
|
+
*
|
|
4
|
+
* when you got data that's too thicc for the context window
|
|
5
|
+
* but you might need it later, stash it in the vault fr
|
|
6
|
+
*
|
|
7
|
+
* auto-stash handles most cases, but this tool lets claude
|
|
8
|
+
* manually stash anything it wants to keep searchable
|
|
9
|
+
*/
|
|
10
|
+
|
|
11
|
+
import { stashTheGoods as doTheStash, formatVaultReceipt } from '../../mcp/contextVault.js';
|
|
12
|
+
|
|
13
|
+
/**
 * Escape a value so it can be embedded inside a double-quoted XML attribute
 * of a vault_receipt status tag.
 *
 * @param {unknown} value - Any value; it is stringified first.
 * @returns {string} The attribute-safe text.
 */
function escapeVaultReceiptAttr(value) {
    return String(value)
        .replace(/&/g, '&amp;')
        .replace(/"/g, '&quot;')
        .replace(/</g, '&lt;')
        .replace(/>/g, '&gt;');
}

/**
 * MCP tool that manually stashes content in the context vault.
 *
 * Delegates storage to doTheStash() and renders the outcome with
 * formatVaultReceipt(); failures are reported as a
 * `<vault_receipt ok="false" .../>` tag rather than thrown.
 */
export class StashTheGoods {
    name = 'stash_the_goods';
    description = 'Stash large content in the context vault for token-efficient retrieval later. Content is chunked, indexed with full-text search, and retrievable via dig_in_the_vault. Auto-expires after 24h.';

    inputSchema = {
        type: 'object',
        properties: {
            content: {
                type: 'string',
                description: 'The content to stash in the vault'
            },
            tool_name: {
                type: 'string',
                description: 'Which tool produced this content (for tracking)'
            },
            tags: {
                type: 'array',
                items: { type: 'string' },
                description: 'Optional tags for categorization'
            }
        },
        required: ['content']
    };

    /**
     * Stash `content` and return a receipt string.
     *
     * @param {{content: string, tool_name?: string, tags?: string[]}} params
     * @returns {Promise<string>} formatVaultReceipt() output on success, or a
     *   `<vault_receipt ok="false" .../>` tag on empty input or failure.
     */
    async execute(params) {
        const { content, tool_name, tags } = params;

        // Covers undefined, null and '' (an empty string is falsy, so no separate length check).
        if (!content) {
            return '<vault_receipt ok="false" error="nothing to stash fr"/>';
        }

        try {
            const result = await doTheStash(content, {
                tool: tool_name || 'manual_stash',
                tags: tags || [],
                manual: true,
            });

            return formatVaultReceipt(result);
        } catch (err) {
            // Escape the message: storage errors often quote identifiers, which
            // would otherwise terminate the attribute early.
            const message = err instanceof Error ? err.message : String(err);
            return `<vault_receipt ok="false" error="${escapeVaultReceiptAttr(message)}"/>`;
        }
    }
}
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* Team Member Deployer Tool
|
|
3
3
|
*
|
|
4
|
-
* "Skidded" version of Code's
|
|
4
|
+
* "Skidded" version of Code's Agent tool that actually works with MCP
|
|
5
5
|
* Spawns team members with full SpecMem MCP access
|
|
6
6
|
*
|
|
7
7
|
* Now integrates with TeamCommsService for team-based team member coordination:
|
|
@@ -23,7 +23,7 @@ import { getAgentsJson, isValidAgentType } from './agentDefinitions.js';
|
|
|
23
23
|
/**
|
|
24
24
|
* Deploy a team member with full SpecMem MCP access
|
|
25
25
|
*
|
|
26
|
-
* This is basically the
|
|
26
|
+
* This is basically the Agent tool but it actually fucking works
|
|
27
27
|
* because we spawn team members with MCP configured
|
|
28
28
|
*
|
|
29
29
|
* Now with team communication support for coordinated multi-team-member work
|
|
@@ -21,6 +21,7 @@ import { getCoordinator } from '../coordination/integration.js';
|
|
|
21
21
|
import { isMinifiedOrBundled, isBinaryFile, EXCLUSION_CONFIG } from '../codebase/exclusions.js';
|
|
22
22
|
import { getProjectPathForInsert } from '../services/ProjectContext.js';
|
|
23
23
|
import { getEmbeddingTimeout } from '../config/embeddingTimeouts.js';
|
|
24
|
+
import { extractPdfText, isPdfFile } from '../codebase/pdfExtractor.js';
|
|
24
25
|
// Retry helper for transient embedding failures (timeout, socket reset, etc.)
|
|
25
26
|
const WATCHER_MAX_RETRIES = parseInt(process.env['SPECMEM_WATCHER_RETRIES'] || '2');
|
|
26
27
|
async function withWatcherRetry(operation, filePath) {
|
|
@@ -132,6 +133,51 @@ export class AutoUpdateTheMemories {
|
|
|
132
133
|
this.stats.filesSkipped++;
|
|
133
134
|
return;
|
|
134
135
|
}
|
|
136
|
+
// PDF files: extract text via PyMuPDF instead of reading as UTF-8
|
|
137
|
+
if (isPdfFile(event.path)) {
|
|
138
|
+
const pdfResult = await extractPdfText(event.path);
|
|
139
|
+
if (!pdfResult || !pdfResult.text) {
|
|
140
|
+
logger.debug({ path: event.relativePath }, 'PDF extraction failed or empty — skipping');
|
|
141
|
+
this.stats.filesSkipped++;
|
|
142
|
+
return;
|
|
143
|
+
}
|
|
144
|
+
// Use extracted PDF text as content for the standard metadata flow
|
|
145
|
+
const metadata = await this.extractFileMetadata(event.path, event.relativePath, pdfResult.text);
|
|
146
|
+
if (metadata.size > this.config.maxFileSizeBytes) {
|
|
147
|
+
this.stats.filesSkipped++;
|
|
148
|
+
return;
|
|
149
|
+
}
|
|
150
|
+
const content = pdfResult.text;
|
|
151
|
+
const existingMemory = await this.findMemoryByContentHash(metadata.contentHash);
|
|
152
|
+
if (existingMemory) {
|
|
153
|
+
this.stats.filesSkipped++;
|
|
154
|
+
return;
|
|
155
|
+
}
|
|
156
|
+
let embedding;
|
|
157
|
+
const WATCHER_EMBEDDING_TIMEOUT = getEmbeddingTimeout('fileWatcher');
|
|
158
|
+
try {
|
|
159
|
+
embedding = await withWatcherRetry(async () => {
|
|
160
|
+
return new Promise((resolve, reject) => {
|
|
161
|
+
const timeoutId = setTimeout(() => reject(new Error('Embedding timeout for PDF')), WATCHER_EMBEDDING_TIMEOUT);
|
|
162
|
+
this.config.embeddingProvider.generateEmbedding(content)
|
|
163
|
+
.then(result => { clearTimeout(timeoutId); resolve(result); })
|
|
164
|
+
.catch(error => { clearTimeout(timeoutId); reject(error); });
|
|
165
|
+
});
|
|
166
|
+
}, event.path);
|
|
167
|
+
} catch (embErr) {
|
|
168
|
+
logger.warn({ path: event.relativePath, error: embErr.message }, 'PDF embedding failed — storing without embedding');
|
|
169
|
+
}
|
|
170
|
+
await this.storeMemory({
|
|
171
|
+
content,
|
|
172
|
+
metadata,
|
|
173
|
+
embedding,
|
|
174
|
+
tags: ['codebase', 'auto-ingested', 'pdf'],
|
|
175
|
+
});
|
|
176
|
+
this.stats.filesIngested++;
|
|
177
|
+
logger.info({ path: event.relativePath, pages: pdfResult.pages, chars: pdfResult.chars }, 'PDF file indexed');
|
|
178
|
+
this.coordinator.emitFileAdded(event.path, event.relativePath, metadata.size);
|
|
179
|
+
return;
|
|
180
|
+
}
|
|
135
181
|
// check if binary
|
|
136
182
|
if (await isBinaryFile(event.path)) {
|
|
137
183
|
logger.debug({ path: event.relativePath }, 'skipping binary file');
|
|
@@ -250,14 +296,25 @@ export class AutoUpdateTheMemories {
|
|
|
250
296
|
this.stats.filesSkipped++;
|
|
251
297
|
return;
|
|
252
298
|
}
|
|
253
|
-
//
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
299
|
+
// PDF files: extract text via PyMuPDF
|
|
300
|
+
let pdfContent = null;
|
|
301
|
+
if (isPdfFile(event.path)) {
|
|
302
|
+
const pdfResult = await extractPdfText(event.path);
|
|
303
|
+
if (!pdfResult || !pdfResult.text) {
|
|
304
|
+
this.stats.filesSkipped++;
|
|
305
|
+
return;
|
|
306
|
+
}
|
|
307
|
+
pdfContent = pdfResult.text;
|
|
308
|
+
} else {
|
|
309
|
+
// FIX MED-13: Check binary before extractFileMetadata (same as handleFileAdded)
|
|
310
|
+
if (await isBinaryFile(event.path)) {
|
|
311
|
+
logger.debug({ path: event.relativePath }, 'skipping binary file update');
|
|
312
|
+
this.stats.filesSkipped++;
|
|
313
|
+
return;
|
|
314
|
+
}
|
|
258
315
|
}
|
|
259
316
|
// extract new metadata (FIX 7.04: content included to avoid double read)
|
|
260
|
-
const metadata = await this.extractFileMetadata(event.path, event.relativePath);
|
|
317
|
+
const metadata = await this.extractFileMetadata(event.path, event.relativePath, pdfContent);
|
|
261
318
|
// check file size
|
|
262
319
|
if (metadata.size > this.config.maxFileSizeBytes) {
|
|
263
320
|
logger.warn({
|
|
@@ -419,9 +476,9 @@ export class AutoUpdateTheMemories {
|
|
|
419
476
|
/**
|
|
420
477
|
* extractFileMetadata - reads file and generates metadata
|
|
421
478
|
*/
|
|
422
|
-
async extractFileMetadata(path, relativePath) {
|
|
479
|
+
async extractFileMetadata(path, relativePath, preExtractedContent) {
|
|
423
480
|
const stats = await fs.stat(path);
|
|
424
|
-
const content = await fs.readFile(path, 'utf-8');
|
|
481
|
+
const content = preExtractedContent || await fs.readFile(path, 'utf-8');
|
|
425
482
|
const contentHash = this.hashContent(content);
|
|
426
483
|
return {
|
|
427
484
|
path,
|
|
@@ -22,6 +22,7 @@ export class QueueTheChangesUp {
|
|
|
22
22
|
config;
|
|
23
23
|
queue = [];
|
|
24
24
|
processing = false;
|
|
25
|
+
paused = false; // pause queue processing without stopping (e.g. during background indexing)
|
|
25
26
|
processingInterval = null;
|
|
26
27
|
changeHandler;
|
|
27
28
|
// deduplication map: path -> latest queued change
|
|
@@ -162,13 +163,31 @@ export class QueueTheChangesUp {
|
|
|
162
163
|
logger.debug({ cancelledTimeouts: cancelledCount }, 'cancelled pending retry timeouts');
|
|
163
164
|
}
|
|
164
165
|
}
|
|
166
|
+
/**
|
|
167
|
+
* pause - temporarily halt batch processing without stopping the queue.
|
|
168
|
+
* Changes still enqueue but won't be processed until resume().
|
|
169
|
+
* Used during background indexing to avoid resource contention.
|
|
170
|
+
*/
|
|
171
|
+
pause(reason = '') {
|
|
172
|
+
if (this.paused) return;
|
|
173
|
+
this.paused = true;
|
|
174
|
+
logger.info({ reason, pendingCount: this.queue.length }, 'queue PAUSED');
|
|
175
|
+
}
|
|
176
|
+
/**
|
|
177
|
+
* resume - resume batch processing after pause
|
|
178
|
+
*/
|
|
179
|
+
resume() {
|
|
180
|
+
if (!this.paused) return;
|
|
181
|
+
this.paused = false;
|
|
182
|
+
logger.info({ pendingCount: this.queue.length }, 'queue RESUMED');
|
|
183
|
+
}
|
|
165
184
|
/**
|
|
166
185
|
* processBatch - processes a batch of changes
|
|
167
186
|
*
|
|
168
187
|
* nah bruh processing this whole batch at once
|
|
169
188
|
*/
|
|
170
189
|
async processBatch() {
|
|
171
|
-
if (this.queue.length === 0) {
|
|
190
|
+
if (this.paused || this.queue.length === 0) {
|
|
172
191
|
return;
|
|
173
192
|
}
|
|
174
193
|
logger.debug({
|
|
@@ -3916,20 +3916,21 @@ class EmbeddingServer:
|
|
|
3916
3916
|
# Extract requestId for persistent socket multiplexing
|
|
3917
3917
|
request_id = request.get('requestId')
|
|
3918
3918
|
|
|
3919
|
-
# Send "
|
|
3919
|
+
# Send "working" status ONLY for embedding requests (not health/kys/ready)
|
|
3920
|
+
# "working" means actually processing your query (vs "processing" which was ambiguous)
|
|
3920
3921
|
# Meta requests expect a single response - sending a heartbeat first breaks the protocol
|
|
3921
3922
|
# and causes clients to read the heartbeat as the actual response
|
|
3922
3923
|
if not is_meta_request:
|
|
3923
3924
|
text = request.get('text') or request.get('texts')
|
|
3924
3925
|
text_length = len(text) if isinstance(text, str) else (len(text) if text else 0)
|
|
3925
3926
|
heartbeat = {
|
|
3926
|
-
'status': '
|
|
3927
|
+
'status': 'working',
|
|
3927
3928
|
'text_length': text_length
|
|
3928
3929
|
}
|
|
3929
3930
|
if request_id:
|
|
3930
3931
|
heartbeat['requestId'] = request_id
|
|
3931
3932
|
hb_ok = self._safe_sendall(conn, json.dumps(heartbeat).encode('utf-8') + b'\n')
|
|
3932
|
-
print(f"[WORKER {thread_name}]
|
|
3933
|
+
print(f"[WORKER {thread_name}] Working status sent ok={hb_ok}", file=sys.stderr, flush=True)
|
|
3933
3934
|
|
|
3934
3935
|
# Process - each thread gets its own call stack
|
|
3935
3936
|
print(f"[WORKER {thread_name}] Calling handle_request(type={req_type})...", file=sys.stderr, flush=True)
|
|
@@ -391,20 +391,18 @@ class ModelManager:
|
|
|
391
391
|
file_name="model_quantized.onnx"
|
|
392
392
|
)
|
|
393
393
|
else:
|
|
394
|
-
#
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
|
|
400
|
-
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
|
|
404
|
-
|
|
394
|
+
# No local model found — refuse to download from the internet
|
|
395
|
+
search_paths = [
|
|
396
|
+
os.environ.get('SPECMEM_MODEL_CACHE', '(not set)'),
|
|
397
|
+
os.path.join(os.path.dirname(__file__), '..', 'models', 'pythia-410m-onnx-quant'),
|
|
398
|
+
'/app/models/pythia-onnx-quant',
|
|
399
|
+
]
|
|
400
|
+
raise RuntimeError(
|
|
401
|
+
f"Local ONNX model not found. Searched:\n"
|
|
402
|
+
+ "\n".join(f" - {p}" for p in search_paths)
|
|
403
|
+
+ "\n\nRun `specmem init` to download models via Git LFS release tarball."
|
|
404
|
+
+ "\nSpecMem will NOT download models from the internet at runtime."
|
|
405
405
|
)
|
|
406
|
-
self.torch_model.eval()
|
|
407
|
-
torch.set_grad_enabled(False)
|
|
408
406
|
|
|
409
407
|
self.torch_loaded = True
|
|
410
408
|
print(f"🧠 Generation model loaded for crawl analysis", file=sys.stderr)
|
|
@@ -0,0 +1,208 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
pdf-text-extract.py — PDF text extraction for SpecMem codebase indexing
|
|
4
|
+
|
|
5
|
+
Uses PyMuPDF (fitz) for instant digital PDF text extraction (0.003s/page).
|
|
6
|
+
Falls back to Tesseract OCR via PyMuPDF's built-in integration for scanned pages.
|
|
7
|
+
|
|
8
|
+
Usage:
|
|
9
|
+
# Single file mode
|
|
10
|
+
python3 pdf-text-extract.py <pdf_path> [--max-pages N] [--language LANG]
|
|
11
|
+
|
|
12
|
+
# Batch mode (JSONL — one result per line, one Python startup for N PDFs)
|
|
13
|
+
python3 pdf-text-extract.py --batch file1.pdf file2.pdf ... [--max-pages N]
|
|
14
|
+
|
|
15
|
+
Output (JSON/JSONL to stdout):
|
|
16
|
+
{"path": "/abs/path.pdf", "text": "...", "pages": 5, "scanned_pages": [3], "chars": 12345}
|
|
17
|
+
{"path": "/abs/path2.pdf", "error": "..."}
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
import sys
|
|
21
|
+
import json
|
|
22
|
+
import os
|
|
23
|
+
import argparse
|
|
24
|
+
|
|
25
|
+
# ---------------------------------------------------------------------------
|
|
26
|
+
# Auto-install pymupdf if missing (matches frankenstein-embeddings pattern)
|
|
27
|
+
# ---------------------------------------------------------------------------
|
|
28
|
+
def _ensure_pymupdf():
|
|
29
|
+
try:
|
|
30
|
+
import pymupdf
|
|
31
|
+
return pymupdf
|
|
32
|
+
except ImportError:
|
|
33
|
+
pass
|
|
34
|
+
# Try legacy import name
|
|
35
|
+
try:
|
|
36
|
+
import fitz
|
|
37
|
+
return fitz
|
|
38
|
+
except ImportError:
|
|
39
|
+
pass
|
|
40
|
+
# Auto-install
|
|
41
|
+
try:
|
|
42
|
+
import subprocess
|
|
43
|
+
sys.stderr.write('[pdf-text-extract] pymupdf not found, installing...\n')
|
|
44
|
+
subprocess.check_call(
|
|
45
|
+
[sys.executable, '-m', 'pip', 'install', '--quiet', 'pymupdf'],
|
|
46
|
+
stdout=subprocess.DEVNULL
|
|
47
|
+
)
|
|
48
|
+
try:
|
|
49
|
+
import pymupdf
|
|
50
|
+
return pymupdf
|
|
51
|
+
except ImportError:
|
|
52
|
+
import fitz
|
|
53
|
+
return fitz
|
|
54
|
+
except Exception as e:
|
|
55
|
+
_error_exit(f'Failed to install pymupdf: {e}')
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def _error_exit(msg):
|
|
59
|
+
"""Print error JSON and exit."""
|
|
60
|
+
print(json.dumps({'error': str(msg)}, ensure_ascii=False))
|
|
61
|
+
sys.exit(1)
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def _is_scanned_page(page, text):
|
|
65
|
+
"""
|
|
66
|
+
Heuristic: page is likely scanned if:
|
|
67
|
+
1. Extracted text is very short (< 50 chars after stripping)
|
|
68
|
+
2. Page has images covering >60% of page area
|
|
69
|
+
"""
|
|
70
|
+
stripped = text.strip()
|
|
71
|
+
if len(stripped) > 50:
|
|
72
|
+
return False
|
|
73
|
+
|
|
74
|
+
try:
|
|
75
|
+
images = page.get_image_info()
|
|
76
|
+
if not images:
|
|
77
|
+
return False
|
|
78
|
+
page_area = abs(page.rect)
|
|
79
|
+
if page_area == 0:
|
|
80
|
+
return False
|
|
81
|
+
image_area = 0
|
|
82
|
+
for img in images:
|
|
83
|
+
if 'bbox' in img:
|
|
84
|
+
try:
|
|
85
|
+
import pymupdf
|
|
86
|
+
r = pymupdf.Rect(img['bbox'])
|
|
87
|
+
except (ImportError, Exception):
|
|
88
|
+
import fitz
|
|
89
|
+
r = fitz.Rect(img['bbox'])
|
|
90
|
+
image_area += abs(r)
|
|
91
|
+
return (image_area / page_area) >= 0.6
|
|
92
|
+
except Exception:
|
|
93
|
+
return False
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def _ocr_page(page, language='eng'):
|
|
97
|
+
"""
|
|
98
|
+
Attempt Tesseract OCR on a scanned page via PyMuPDF's built-in integration.
|
|
99
|
+
Returns extracted text or empty string if tesseract unavailable.
|
|
100
|
+
"""
|
|
101
|
+
try:
|
|
102
|
+
tp = page.get_textpage_ocr(language=language, dpi=300)
|
|
103
|
+
return page.get_text(textpage=tp).strip()
|
|
104
|
+
except Exception as e:
|
|
105
|
+
msg = str(e).lower()
|
|
106
|
+
if 'tesseract' in msg or 'not installed' in msg or 'not found' in msg:
|
|
107
|
+
# Tesseract not installed — skip OCR, return what we have
|
|
108
|
+
sys.stderr.write(f'[pdf-text-extract] Tesseract not available, skipping OCR for scanned page\n')
|
|
109
|
+
return ''
|
|
110
|
+
# Other error — still don't crash
|
|
111
|
+
sys.stderr.write(f'[pdf-text-extract] OCR failed: {e}\n')
|
|
112
|
+
return ''
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def extract_pdf(pdf_path, max_pages=100, language='eng'):
    """
    Extract text from a PDF using PyMuPDF.

    Digital pages use direct text extraction; pages flagged by
    ``_is_scanned_page`` are retried through ``_ocr_page``.

    Args:
        pdf_path: Path to the PDF file.
        max_pages: Maximum number of pages to process (negative is treated as 0).
        language: Tesseract language code passed to the OCR fallback.

    Returns:
        On success, a dict with ``text``, ``pages`` (pages processed) and
        ``chars``; plus ``scanned_pages`` (1-indexed pages where OCR text was
        used) when any, and ``truncated``/``total_pages`` when the document
        has more than ``max_pages`` pages.
        On failure, a dict with a single ``error`` key. This function does not
        raise for unreadable or damaged documents.
    """
    # Check the path first so a missing file never triggers the pymupdf
    # auto-install performed by _ensure_pymupdf().
    if not os.path.isfile(pdf_path):
        return {'error': f'File not found: {pdf_path}'}

    pymupdf = _ensure_pymupdf()

    try:
        doc = pymupdf.open(pdf_path)
    except Exception as e:
        msg = str(e).lower()
        if 'password' in msg or 'encrypt' in msg:
            return {'error': f'PDF is password-protected: {pdf_path}'}
        return {'error': f'Failed to open PDF: {e}'}

    texts = []
    scanned_pages = []

    try:
        # Opening an encrypted document may succeed and only flag it as
        # needing a password, so check the flag when the document exposes it.
        if getattr(doc, 'needs_pass', False):
            return {'error': f'PDF is password-protected: {pdf_path}'}

        total_pages = len(doc)
        process_count = max(0, min(total_pages, max_pages))
        truncated = total_pages > max_pages

        for i in range(process_count):
            page = doc[i]
            text = page.get_text().strip()

            if _is_scanned_page(page, text):
                # Try OCR
                ocr_text = _ocr_page(page, language)
                if ocr_text:
                    text = ocr_text
                    scanned_pages.append(i + 1)  # 1-indexed
                # If OCR also empty, keep whatever minimal text we got

            if text:
                # Page markers are only added for multi-page output.
                if process_count > 1:
                    texts.append(f'--- Page {i + 1} ---\n{text}')
                else:
                    texts.append(text)
    except Exception as e:
        # Report damaged pages in the documented error shape instead of
        # letting the exception escape to the caller.
        return {'error': f'Failed to extract text: {e}'}
    finally:
        # Always release the document handle, including on the error paths.
        doc.close()

    full_text = '\n\n'.join(texts)

    result = {
        'text': full_text,
        'pages': process_count,
        'chars': len(full_text),
    }

    if scanned_pages:
        result['scanned_pages'] = scanned_pages
    if truncated:
        result['truncated'] = True
        result['total_pages'] = total_pages

    return result
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
def main():
    """
    Command-line entry point.

    Single mode (``pdf_path``): prints one JSON object.
    Batch mode (``--batch``): prints one JSON object per input PDF (JSONL),
    flushed after each line. Every result carries the input ``path``.
    With no input, prints usage and exits with status 1.
    """
    parser = argparse.ArgumentParser(description='Extract text from PDF files')
    parser.add_argument('pdf_path', nargs='?', help='Path to the PDF file (single mode)')
    parser.add_argument('--batch', nargs='+', metavar='PDF',
                        help='Batch mode: extract multiple PDFs (JSONL output, one line per PDF)')
    parser.add_argument('--max-pages', type=int, default=100,
                        help='Maximum pages to process per PDF (default: 100)')
    parser.add_argument('--language', default='eng',
                        help='Tesseract language for OCR fallback (default: eng)')

    args = parser.parse_args()

    if args.batch:
        # Batch mode — JSONL output, one result per line
        # Single Python startup for N PDFs (avoids repeated interpreter overhead)
        for pdf_path in args.batch:
            try:
                result = extract_pdf(pdf_path, args.max_pages, args.language)
            except Exception as e:
                # Keep the one-line-per-input contract: a failure on one PDF
                # must not drop the result lines for the PDFs after it.
                result = {'error': f'Unexpected extraction failure: {e}'}
            result['path'] = pdf_path
            print(json.dumps(result, ensure_ascii=False), flush=True)
    elif args.pdf_path:
        # Single file mode
        result = extract_pdf(args.pdf_path, args.max_pages, args.language)
        result['path'] = args.pdf_path
        print(json.dumps(result, ensure_ascii=False))
    else:
        parser.print_help()
        sys.exit(1)
|
|
205
|
+
|
|
206
|
+
|
|
207
|
+
if __name__ == '__main__':
|
|
208
|
+
main()
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "specmem-hardwicksoftware",
|
|
3
|
-
"version": "3.7.35",
|
|
3
|
+
"version": "3.7.38",
|
|
4
4
|
"type": "module",
|
|
5
5
|
"description": "Your Claude Code sessions don't have to start from scratch anymore — SpecMem gives your AI real memory. It won't forget your conversations, your code, or your architecture decisions between sessions. That's the whole point. Semantic code indexing that actually works: TypeScript, JavaScript, Python, Go, Rust, Java, Kotlin, C, C++, HTML and more. It doesn't just track functions — it gets classes, methods, fields, constants, enums, macros, imports, structs, the whole codebase graph. There's chat memory too, powered by pgvector embeddings. You've also got token compression, team coordination, multi-agent comms, and file watching built in. 74+ MCP tools. Runs on PostgreSQL + Docker. It's kind of a big deal. justcalljon.pro",
|
|
6
6
|
"main": "dist/index.js",
|