specmem-hardwicksoftware 3.7.34 → 3.7.36
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +34 -0
- package/README.md +11 -15
- package/bin/specmem-console.cjs +839 -51
- package/claude-hooks/agent-chooser-hook.js +6 -6
- package/claude-hooks/agent-loading-hook.cjs +16 -16
- package/claude-hooks/agent-loading-hook.js +18 -18
- package/claude-hooks/agent-type-matcher.js +1 -1
- package/claude-hooks/background-completion-silencer.js +1 -1
- package/claude-hooks/file-claim-enforcer.cjs +37 -36
- package/claude-hooks/output-cleaner.cjs +1 -1
- package/claude-hooks/settings.json +27 -3
- package/claude-hooks/specmem-search-enforcer.cjs +2 -11
- package/claude-hooks/specmem-team-member-inject.js +1 -1
- package/claude-hooks/specmem-unified-hook.py +1 -1
- package/claude-hooks/subagent-loading-hook.cjs +1 -1
- package/claude-hooks/task-progress-hook.cjs +7 -7
- package/claude-hooks/task-progress-hook.js +3 -3
- package/claude-hooks/team-comms-enforcer.cjs +49 -47
- package/dist/claude-sessions/sessionParser.js +5 -0
- package/dist/codebase/codebaseIndexer.js +48 -17
- package/dist/codebase/exclusions.js +3 -4
- package/dist/codebase/index.js +4 -0
- package/dist/codebase/pdfExtractor.js +298 -0
- package/dist/dashboard/api/taskTeamMembers.js +2 -2
- package/dist/db/bigBrainMigrations.js +29 -0
- package/dist/hooks/hookManager.js +4 -4
- package/dist/hooks/teamFramingCli.js +1 -1
- package/dist/hooks/teamMemberPrepromptHook.js +5 -5
- package/dist/init/claudeConfigInjector.js +2 -2
- package/dist/mcp/compactionProxy.js +834 -186
- package/dist/mcp/compactionProxyDaemon.js +112 -37
- package/dist/mcp/contextVault.js +439 -0
- package/dist/mcp/embeddingServerManager.js +61 -1
- package/dist/mcp/mcpProtocolHandler.js +6 -1
- package/dist/mcp/miniCOTServerManager.js +82 -8
- package/dist/mcp/specMemServer.js +45 -10
- package/dist/mcp/toolRegistry.js +6 -0
- package/dist/startup/startupIndexing.js +14 -0
- package/dist/team-members/taskOrchestrator.js +3 -3
- package/dist/team-members/taskTeamMemberLogger.js +2 -2
- package/dist/tools/goofy/deployTeamMember.js +3 -3
- package/dist/tools/goofy/digInTheVault.js +81 -0
- package/dist/tools/goofy/stashTheGoods.js +56 -0
- package/dist/tools/teamMemberDeployer.js +2 -2
- package/dist/watcher/changeHandler.js +65 -8
- package/dist/watcher/changeQueue.js +20 -1
- package/dist/watcher/index.js +37 -2
- package/embedding-sandbox/mini-cot-service.py +11 -13
- package/embedding-sandbox/pdf-text-extract.py +208 -0
- package/package.json +1 -1
- package/scripts/deploy-hooks.cjs +2 -2
- package/scripts/global-postinstall.cjs +2 -2
- package/scripts/specmem-init.cjs +130 -36
- package/specmem/model-config.json +6 -6
- package/specmem/supervisord.conf +1 -1
- package/svg-sections/readme-token-compaction.svg +246 -0
package/dist/watcher/changeHandler.js
CHANGED
|
@@ -21,6 +21,7 @@ import { getCoordinator } from '../coordination/integration.js';
|
|
|
21
21
|
import { isMinifiedOrBundled, isBinaryFile, EXCLUSION_CONFIG } from '../codebase/exclusions.js';
|
|
22
22
|
import { getProjectPathForInsert } from '../services/ProjectContext.js';
|
|
23
23
|
import { getEmbeddingTimeout } from '../config/embeddingTimeouts.js';
|
|
24
|
+
import { extractPdfText, isPdfFile } from '../codebase/pdfExtractor.js';
|
|
24
25
|
// Retry helper for transient embedding failures (timeout, socket reset, etc.)
|
|
25
26
|
const WATCHER_MAX_RETRIES = parseInt(process.env['SPECMEM_WATCHER_RETRIES'] || '2');
|
|
26
27
|
async function withWatcherRetry(operation, filePath) {
|
|
@@ -132,6 +133,51 @@ export class AutoUpdateTheMemories {
|
|
|
132
133
|
this.stats.filesSkipped++;
|
|
133
134
|
return;
|
|
134
135
|
}
|
|
136
|
+
// PDF files: extract text via PyMuPDF instead of reading as UTF-8
|
|
137
|
+
if (isPdfFile(event.path)) {
|
|
138
|
+
const pdfResult = await extractPdfText(event.path);
|
|
139
|
+
if (!pdfResult || !pdfResult.text) {
|
|
140
|
+
logger.debug({ path: event.relativePath }, 'PDF extraction failed or empty — skipping');
|
|
141
|
+
this.stats.filesSkipped++;
|
|
142
|
+
return;
|
|
143
|
+
}
|
|
144
|
+
// Use extracted PDF text as content for the standard metadata flow
|
|
145
|
+
const metadata = await this.extractFileMetadata(event.path, event.relativePath, pdfResult.text);
|
|
146
|
+
if (metadata.size > this.config.maxFileSizeBytes) {
|
|
147
|
+
this.stats.filesSkipped++;
|
|
148
|
+
return;
|
|
149
|
+
}
|
|
150
|
+
const content = pdfResult.text;
|
|
151
|
+
const existingMemory = await this.findMemoryByContentHash(metadata.contentHash);
|
|
152
|
+
if (existingMemory) {
|
|
153
|
+
this.stats.filesSkipped++;
|
|
154
|
+
return;
|
|
155
|
+
}
|
|
156
|
+
let embedding;
|
|
157
|
+
const WATCHER_EMBEDDING_TIMEOUT = getEmbeddingTimeout('fileWatcher');
|
|
158
|
+
try {
|
|
159
|
+
embedding = await withWatcherRetry(async () => {
|
|
160
|
+
return new Promise((resolve, reject) => {
|
|
161
|
+
const timeoutId = setTimeout(() => reject(new Error('Embedding timeout for PDF')), WATCHER_EMBEDDING_TIMEOUT);
|
|
162
|
+
this.config.embeddingProvider.generateEmbedding(content)
|
|
163
|
+
.then(result => { clearTimeout(timeoutId); resolve(result); })
|
|
164
|
+
.catch(error => { clearTimeout(timeoutId); reject(error); });
|
|
165
|
+
});
|
|
166
|
+
}, event.path);
|
|
167
|
+
} catch (embErr) {
|
|
168
|
+
logger.warn({ path: event.relativePath, error: embErr.message }, 'PDF embedding failed — storing without embedding');
|
|
169
|
+
}
|
|
170
|
+
await this.storeMemory({
|
|
171
|
+
content,
|
|
172
|
+
metadata,
|
|
173
|
+
embedding,
|
|
174
|
+
tags: ['codebase', 'auto-ingested', 'pdf'],
|
|
175
|
+
});
|
|
176
|
+
this.stats.filesIngested++;
|
|
177
|
+
logger.info({ path: event.relativePath, pages: pdfResult.pages, chars: pdfResult.chars }, 'PDF file indexed');
|
|
178
|
+
this.coordinator.emitFileAdded(event.path, event.relativePath, metadata.size);
|
|
179
|
+
return;
|
|
180
|
+
}
|
|
135
181
|
// check if binary
|
|
136
182
|
if (await isBinaryFile(event.path)) {
|
|
137
183
|
logger.debug({ path: event.relativePath }, 'skipping binary file');
|
|
@@ -250,14 +296,25 @@ export class AutoUpdateTheMemories {
|
|
|
250
296
|
this.stats.filesSkipped++;
|
|
251
297
|
return;
|
|
252
298
|
}
|
|
253
|
-
//
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
299
|
+
// PDF files: extract text via PyMuPDF
|
|
300
|
+
let pdfContent = null;
|
|
301
|
+
if (isPdfFile(event.path)) {
|
|
302
|
+
const pdfResult = await extractPdfText(event.path);
|
|
303
|
+
if (!pdfResult || !pdfResult.text) {
|
|
304
|
+
this.stats.filesSkipped++;
|
|
305
|
+
return;
|
|
306
|
+
}
|
|
307
|
+
pdfContent = pdfResult.text;
|
|
308
|
+
} else {
|
|
309
|
+
// FIX MED-13: Check binary before extractFileMetadata (same as handleFileAdded)
|
|
310
|
+
if (await isBinaryFile(event.path)) {
|
|
311
|
+
logger.debug({ path: event.relativePath }, 'skipping binary file update');
|
|
312
|
+
this.stats.filesSkipped++;
|
|
313
|
+
return;
|
|
314
|
+
}
|
|
258
315
|
}
|
|
259
316
|
// extract new metadata (FIX 7.04: content included to avoid double read)
|
|
260
|
-
const metadata = await this.extractFileMetadata(event.path, event.relativePath);
|
|
317
|
+
const metadata = await this.extractFileMetadata(event.path, event.relativePath, pdfContent);
|
|
261
318
|
// check file size
|
|
262
319
|
if (metadata.size > this.config.maxFileSizeBytes) {
|
|
263
320
|
logger.warn({
|
|
@@ -419,9 +476,9 @@ export class AutoUpdateTheMemories {
|
|
|
419
476
|
/**
|
|
420
477
|
* extractFileMetadata - reads file and generates metadata
|
|
421
478
|
*/
|
|
422
|
-
async extractFileMetadata(path, relativePath) {
|
|
479
|
+
async extractFileMetadata(path, relativePath, preExtractedContent) {
|
|
423
480
|
const stats = await fs.stat(path);
|
|
424
|
-
const content = await fs.readFile(path, 'utf-8');
|
|
481
|
+
const content = preExtractedContent || await fs.readFile(path, 'utf-8');
|
|
425
482
|
const contentHash = this.hashContent(content);
|
|
426
483
|
return {
|
|
427
484
|
path,
|
package/dist/watcher/changeQueue.js
CHANGED
|
@@ -22,6 +22,7 @@ export class QueueTheChangesUp {
|
|
|
22
22
|
config;
|
|
23
23
|
queue = [];
|
|
24
24
|
processing = false;
|
|
25
|
+
paused = false; // pause queue processing without stopping (e.g. during background indexing)
|
|
25
26
|
processingInterval = null;
|
|
26
27
|
changeHandler;
|
|
27
28
|
// deduplication map: path -> latest queued change
|
|
@@ -162,13 +163,31 @@ export class QueueTheChangesUp {
|
|
|
162
163
|
logger.debug({ cancelledTimeouts: cancelledCount }, 'cancelled pending retry timeouts');
|
|
163
164
|
}
|
|
164
165
|
}
|
|
166
|
+
/**
|
|
167
|
+
* pause - temporarily halt batch processing without stopping the queue.
|
|
168
|
+
* Changes still enqueue but won't be processed until resume().
|
|
169
|
+
* Used during background indexing to avoid resource contention.
|
|
170
|
+
*/
|
|
171
|
+
pause(reason = '') {
|
|
172
|
+
if (this.paused) return;
|
|
173
|
+
this.paused = true;
|
|
174
|
+
logger.info({ reason, pendingCount: this.queue.length }, 'queue PAUSED');
|
|
175
|
+
}
|
|
176
|
+
/**
|
|
177
|
+
* resume - resume batch processing after pause
|
|
178
|
+
*/
|
|
179
|
+
resume() {
|
|
180
|
+
if (!this.paused) return;
|
|
181
|
+
this.paused = false;
|
|
182
|
+
logger.info({ pendingCount: this.queue.length }, 'queue RESUMED');
|
|
183
|
+
}
|
|
165
184
|
/**
|
|
166
185
|
* processBatch - processes a batch of changes
|
|
167
186
|
*
|
|
168
187
|
* nah bruh processing this whole batch at once
|
|
169
188
|
*/
|
|
170
189
|
async processBatch() {
|
|
171
|
-
if (this.queue.length === 0) {
|
|
190
|
+
if (this.paused || this.queue.length === 0) {
|
|
172
191
|
return;
|
|
173
192
|
}
|
|
174
193
|
logger.debug({
|
package/dist/watcher/index.js
CHANGED
|
@@ -31,6 +31,12 @@ export class WatcherManager {
|
|
|
31
31
|
syncTimeout = null;
|
|
32
32
|
lastLowScoreResyncAt = 0;
|
|
33
33
|
lastLowScoreResyncScore = null; // track score at last resync to detect drops
|
|
34
|
+
// Drift-resync plateau detection: stop resyncing if score isn't improving
|
|
35
|
+
lastDriftResyncAt = 0;
|
|
36
|
+
lastDriftResyncScore = null;
|
|
37
|
+
driftResyncNoImprovementCount = 0; // consecutive resyncs that didn't improve score
|
|
38
|
+
static DRIFT_RESYNC_MAX_NO_IMPROVEMENT = 2; // after 2 consecutive no-improvement resyncs, accept plateau
|
|
39
|
+
static DRIFT_RESYNC_COOLDOWN_MS = 15 * 60 * 1000; // 15 min cooldown between drift resyncs
|
|
34
40
|
constructor(config) {
|
|
35
41
|
// Create handler first - it's the core component
|
|
36
42
|
this.handler = new AutoUpdateTheMemories(config.handler);
|
|
@@ -126,7 +132,7 @@ export class WatcherManager {
|
|
|
126
132
|
const LOW_SCORE_THRESHOLD = parseFloat(process.env['SPECMEM_LOW_SCORE_THRESHOLD'] || '0.85');
|
|
127
133
|
const LOW_SCORE_DROP_THRESHOLD = parseFloat(process.env['SPECMEM_LOW_SCORE_DROP_THRESHOLD'] || '0.10');
|
|
128
134
|
const LOW_SCORE_DEBOUNCE_MS = parseInt(process.env['SPECMEM_LOW_SCORE_DEBOUNCE_MS'] || String(15 * 60 * 1000), 10);
|
|
129
|
-
if (report.syncScore < LOW_SCORE_THRESHOLD) {
|
|
135
|
+
if (report.syncScore <= LOW_SCORE_THRESHOLD) {
|
|
130
136
|
// First time seeing low score — always resync
|
|
131
137
|
// After that, only resync if score dropped by >=10% from the post-resync score
|
|
132
138
|
const scoreDrop = this.lastLowScoreResyncScore !== null
|
|
@@ -163,8 +169,21 @@ export class WatcherManager {
|
|
|
163
169
|
missingFromMcp: report.missingFromMcp.length,
|
|
164
170
|
contentMismatch: report.contentMismatch.length
|
|
165
171
|
}, 'drift detected during periodic check');
|
|
166
|
-
// Auto-resync when drift is detected
|
|
172
|
+
// Auto-resync when drift is detected — with plateau detection + cooldown
|
|
167
173
|
if (report.missingFromMcp.length > 0 || report.contentMismatch.length > 0) {
|
|
174
|
+
// Plateau guard: if we've resynced N times without improvement, accept the score
|
|
175
|
+
if (this.driftResyncNoImprovementCount >= WatcherManager.DRIFT_RESYNC_MAX_NO_IMPROVEMENT) {
|
|
176
|
+
logger.info({ syncScore: report.syncScore, noImprovementCount: this.driftResyncNoImprovementCount }, 'drift-resync plateau reached — score is stable, accepting current sync level');
|
|
177
|
+
return;
|
|
178
|
+
}
|
|
179
|
+
// Cooldown guard: don't resync more than once per 15 min via drift path
|
|
180
|
+
const now = Date.now();
|
|
181
|
+
const driftCooldownRemaining = WatcherManager.DRIFT_RESYNC_COOLDOWN_MS - (now - this.lastDriftResyncAt);
|
|
182
|
+
if (driftCooldownRemaining > 0) {
|
|
183
|
+
logger.debug({ syncScore: report.syncScore, cooldownRemainingSec: Math.round(driftCooldownRemaining / 1000) }, 'drift-resync on cooldown — skipping');
|
|
184
|
+
return;
|
|
185
|
+
}
|
|
186
|
+
this.lastDriftResyncAt = now;
|
|
168
187
|
logger.info('periodic check triggering auto-resync...');
|
|
169
188
|
const resyncResult = await this.syncChecker.resyncEverythingFrFr();
|
|
170
189
|
logger.info({
|
|
@@ -175,6 +194,16 @@ export class WatcherManager {
|
|
|
175
194
|
// Update score after resync
|
|
176
195
|
const postReport = await this.syncChecker.checkSync();
|
|
177
196
|
await this.writeSyncScore(postReport.syncScore);
|
|
197
|
+
// Plateau detection: did this resync actually improve the score?
|
|
198
|
+
const improvement = postReport.syncScore - (this.lastDriftResyncScore ?? 0);
|
|
199
|
+
if (improvement < 0.01) { // less than 1% improvement = no meaningful change
|
|
200
|
+
this.driftResyncNoImprovementCount++;
|
|
201
|
+
logger.warn({ syncScore: postReport.syncScore, previousScore: this.lastDriftResyncScore, noImprovementCount: this.driftResyncNoImprovementCount, maxAllowed: WatcherManager.DRIFT_RESYNC_MAX_NO_IMPROVEMENT }, 'drift-resync did not improve score — tracking plateau');
|
|
202
|
+
} else {
|
|
203
|
+
// Score improved — reset plateau counter
|
|
204
|
+
this.driftResyncNoImprovementCount = 0;
|
|
205
|
+
}
|
|
206
|
+
this.lastDriftResyncScore = postReport.syncScore;
|
|
178
207
|
}
|
|
179
208
|
}
|
|
180
209
|
}
|
|
@@ -307,6 +336,12 @@ export class WatcherManager {
|
|
|
307
336
|
* resync - manually trigger full resync
|
|
308
337
|
*/
|
|
309
338
|
async resync() {
|
|
339
|
+
// Manual resync resets all plateau/cooldown state so it always runs fresh
|
|
340
|
+
this.driftResyncNoImprovementCount = 0;
|
|
341
|
+
this.lastDriftResyncAt = 0;
|
|
342
|
+
this.lastDriftResyncScore = null;
|
|
343
|
+
this.lastLowScoreResyncScore = null;
|
|
344
|
+
this.lastLowScoreResyncAt = 0;
|
|
310
345
|
return await this.syncChecker.resyncEverythingFrFr();
|
|
311
346
|
}
|
|
312
347
|
/**
|
package/embedding-sandbox/mini-cot-service.py
CHANGED
|
@@ -391,20 +391,18 @@ class ModelManager:
|
|
|
391
391
|
file_name="model_quantized.onnx"
|
|
392
392
|
)
|
|
393
393
|
else:
|
|
394
|
-
#
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
|
|
400
|
-
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
|
|
404
|
-
|
|
394
|
+
# No local model found — refuse to download from the internet
|
|
395
|
+
search_paths = [
|
|
396
|
+
os.environ.get('SPECMEM_MODEL_CACHE', '(not set)'),
|
|
397
|
+
os.path.join(os.path.dirname(__file__), '..', 'models', 'pythia-410m-onnx-quant'),
|
|
398
|
+
'/app/models/pythia-onnx-quant',
|
|
399
|
+
]
|
|
400
|
+
raise RuntimeError(
|
|
401
|
+
f"Local ONNX model not found. Searched:\n"
|
|
402
|
+
+ "\n".join(f" - {p}" for p in search_paths)
|
|
403
|
+
+ "\n\nRun `specmem init` to download models via Git LFS release tarball."
|
|
404
|
+
+ "\nSpecMem will NOT download models from the internet at runtime."
|
|
405
405
|
)
|
|
406
|
-
self.torch_model.eval()
|
|
407
|
-
torch.set_grad_enabled(False)
|
|
408
406
|
|
|
409
407
|
self.torch_loaded = True
|
|
410
408
|
print(f"🧠 Generation model loaded for crawl analysis", file=sys.stderr)
|
package/embedding-sandbox/pdf-text-extract.py
ADDED
|
@@ -0,0 +1,208 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
pdf-text-extract.py — PDF text extraction for SpecMem codebase indexing
|
|
4
|
+
|
|
5
|
+
Uses PyMuPDF (fitz) for instant digital PDF text extraction (0.003s/page).
|
|
6
|
+
Falls back to Tesseract OCR via PyMuPDF's built-in integration for scanned pages.
|
|
7
|
+
|
|
8
|
+
Usage:
|
|
9
|
+
# Single file mode
|
|
10
|
+
python3 pdf-text-extract.py <pdf_path> [--max-pages N] [--language LANG]
|
|
11
|
+
|
|
12
|
+
# Batch mode (JSONL — one result per line, one Python startup for N PDFs)
|
|
13
|
+
python3 pdf-text-extract.py --batch file1.pdf file2.pdf ... [--max-pages N]
|
|
14
|
+
|
|
15
|
+
Output (JSON/JSONL to stdout):
|
|
16
|
+
{"path": "/abs/path.pdf", "text": "...", "pages": 5, "scanned_pages": [3], "chars": 12345}
|
|
17
|
+
{"path": "/abs/path2.pdf", "error": "..."}
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
import sys
|
|
21
|
+
import json
|
|
22
|
+
import os
|
|
23
|
+
import argparse
|
|
24
|
+
|
|
25
|
+
# ---------------------------------------------------------------------------
|
|
26
|
+
# Auto-install pymupdf if missing (matches frankenstein-embeddings pattern)
|
|
27
|
+
# ---------------------------------------------------------------------------
|
|
28
|
+
def _ensure_pymupdf():
|
|
29
|
+
try:
|
|
30
|
+
import pymupdf
|
|
31
|
+
return pymupdf
|
|
32
|
+
except ImportError:
|
|
33
|
+
pass
|
|
34
|
+
# Try legacy import name
|
|
35
|
+
try:
|
|
36
|
+
import fitz
|
|
37
|
+
return fitz
|
|
38
|
+
except ImportError:
|
|
39
|
+
pass
|
|
40
|
+
# Auto-install
|
|
41
|
+
try:
|
|
42
|
+
import subprocess
|
|
43
|
+
sys.stderr.write('[pdf-text-extract] pymupdf not found, installing...\n')
|
|
44
|
+
subprocess.check_call(
|
|
45
|
+
[sys.executable, '-m', 'pip', 'install', '--quiet', 'pymupdf'],
|
|
46
|
+
stdout=subprocess.DEVNULL
|
|
47
|
+
)
|
|
48
|
+
try:
|
|
49
|
+
import pymupdf
|
|
50
|
+
return pymupdf
|
|
51
|
+
except ImportError:
|
|
52
|
+
import fitz
|
|
53
|
+
return fitz
|
|
54
|
+
except Exception as e:
|
|
55
|
+
_error_exit(f'Failed to install pymupdf: {e}')
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def _error_exit(msg):
|
|
59
|
+
"""Print error JSON and exit."""
|
|
60
|
+
print(json.dumps({'error': str(msg)}, ensure_ascii=False))
|
|
61
|
+
sys.exit(1)
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def _is_scanned_page(page, text):
|
|
65
|
+
"""
|
|
66
|
+
Heuristic: page is likely scanned if:
|
|
67
|
+
1. Extracted text is very short (< 50 chars after stripping)
|
|
68
|
+
2. Page has images covering >60% of page area
|
|
69
|
+
"""
|
|
70
|
+
stripped = text.strip()
|
|
71
|
+
if len(stripped) > 50:
|
|
72
|
+
return False
|
|
73
|
+
|
|
74
|
+
try:
|
|
75
|
+
images = page.get_image_info()
|
|
76
|
+
if not images:
|
|
77
|
+
return False
|
|
78
|
+
page_area = abs(page.rect)
|
|
79
|
+
if page_area == 0:
|
|
80
|
+
return False
|
|
81
|
+
image_area = 0
|
|
82
|
+
for img in images:
|
|
83
|
+
if 'bbox' in img:
|
|
84
|
+
try:
|
|
85
|
+
import pymupdf
|
|
86
|
+
r = pymupdf.Rect(img['bbox'])
|
|
87
|
+
except (ImportError, Exception):
|
|
88
|
+
import fitz
|
|
89
|
+
r = fitz.Rect(img['bbox'])
|
|
90
|
+
image_area += abs(r)
|
|
91
|
+
return (image_area / page_area) >= 0.6
|
|
92
|
+
except Exception:
|
|
93
|
+
return False
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def _ocr_page(page, language='eng'):
|
|
97
|
+
"""
|
|
98
|
+
Attempt Tesseract OCR on a scanned page via PyMuPDF's built-in integration.
|
|
99
|
+
Returns extracted text or empty string if tesseract unavailable.
|
|
100
|
+
"""
|
|
101
|
+
try:
|
|
102
|
+
tp = page.get_textpage_ocr(language=language, dpi=300)
|
|
103
|
+
return page.get_text(textpage=tp).strip()
|
|
104
|
+
except Exception as e:
|
|
105
|
+
msg = str(e).lower()
|
|
106
|
+
if 'tesseract' in msg or 'not installed' in msg or 'not found' in msg:
|
|
107
|
+
# Tesseract not installed — skip OCR, return what we have
|
|
108
|
+
sys.stderr.write(f'[pdf-text-extract] Tesseract not available, skipping OCR for scanned page\n')
|
|
109
|
+
return ''
|
|
110
|
+
# Other error — still don't crash
|
|
111
|
+
sys.stderr.write(f'[pdf-text-extract] OCR failed: {e}\n')
|
|
112
|
+
return ''
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def extract_pdf(pdf_path, max_pages=100, language='eng'):
|
|
116
|
+
"""
|
|
117
|
+
Extract text from PDF using PyMuPDF.
|
|
118
|
+
Digital pages: instant text extraction.
|
|
119
|
+
Scanned pages: Tesseract OCR fallback.
|
|
120
|
+
"""
|
|
121
|
+
pymupdf = _ensure_pymupdf()
|
|
122
|
+
|
|
123
|
+
if not os.path.isfile(pdf_path):
|
|
124
|
+
return {'error': f'File not found: {pdf_path}'}
|
|
125
|
+
|
|
126
|
+
try:
|
|
127
|
+
doc = pymupdf.open(pdf_path)
|
|
128
|
+
except Exception as e:
|
|
129
|
+
msg = str(e).lower()
|
|
130
|
+
if 'password' in msg or 'encrypt' in msg:
|
|
131
|
+
return {'error': f'PDF is password-protected: {pdf_path}'}
|
|
132
|
+
return {'error': f'Failed to open PDF: {e}'}
|
|
133
|
+
|
|
134
|
+
total_pages = len(doc)
|
|
135
|
+
process_count = min(total_pages, max_pages)
|
|
136
|
+
truncated = total_pages > max_pages
|
|
137
|
+
|
|
138
|
+
texts = []
|
|
139
|
+
scanned_pages = []
|
|
140
|
+
|
|
141
|
+
for i in range(process_count):
|
|
142
|
+
page = doc[i]
|
|
143
|
+
text = page.get_text().strip()
|
|
144
|
+
|
|
145
|
+
if _is_scanned_page(page, text):
|
|
146
|
+
# Try OCR
|
|
147
|
+
ocr_text = _ocr_page(page, language)
|
|
148
|
+
if ocr_text:
|
|
149
|
+
text = ocr_text
|
|
150
|
+
scanned_pages.append(i + 1) # 1-indexed
|
|
151
|
+
# If OCR also empty, keep whatever minimal text we got
|
|
152
|
+
|
|
153
|
+
if text:
|
|
154
|
+
if process_count > 1:
|
|
155
|
+
texts.append(f'--- Page {i + 1} ---\n{text}')
|
|
156
|
+
else:
|
|
157
|
+
texts.append(text)
|
|
158
|
+
|
|
159
|
+
doc.close()
|
|
160
|
+
|
|
161
|
+
full_text = '\n\n'.join(texts)
|
|
162
|
+
|
|
163
|
+
result = {
|
|
164
|
+
'text': full_text,
|
|
165
|
+
'pages': process_count,
|
|
166
|
+
'chars': len(full_text),
|
|
167
|
+
}
|
|
168
|
+
|
|
169
|
+
if scanned_pages:
|
|
170
|
+
result['scanned_pages'] = scanned_pages
|
|
171
|
+
if truncated:
|
|
172
|
+
result['truncated'] = True
|
|
173
|
+
result['total_pages'] = total_pages
|
|
174
|
+
|
|
175
|
+
return result
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
def main():
|
|
179
|
+
parser = argparse.ArgumentParser(description='Extract text from PDF files')
|
|
180
|
+
parser.add_argument('pdf_path', nargs='?', help='Path to the PDF file (single mode)')
|
|
181
|
+
parser.add_argument('--batch', nargs='+', metavar='PDF',
|
|
182
|
+
help='Batch mode: extract multiple PDFs (JSONL output, one line per PDF)')
|
|
183
|
+
parser.add_argument('--max-pages', type=int, default=100,
|
|
184
|
+
help='Maximum pages to process per PDF (default: 100)')
|
|
185
|
+
parser.add_argument('--language', default='eng',
|
|
186
|
+
help='Tesseract language for OCR fallback (default: eng)')
|
|
187
|
+
|
|
188
|
+
args = parser.parse_args()
|
|
189
|
+
|
|
190
|
+
if args.batch:
|
|
191
|
+
# Batch mode — JSONL output, one result per line
|
|
192
|
+
# Single Python startup for N PDFs (avoids repeated interpreter overhead)
|
|
193
|
+
for pdf_path in args.batch:
|
|
194
|
+
result = extract_pdf(pdf_path, args.max_pages, args.language)
|
|
195
|
+
result['path'] = pdf_path
|
|
196
|
+
print(json.dumps(result, ensure_ascii=False), flush=True)
|
|
197
|
+
elif args.pdf_path:
|
|
198
|
+
# Single file mode
|
|
199
|
+
result = extract_pdf(args.pdf_path, args.max_pages, args.language)
|
|
200
|
+
result['path'] = args.pdf_path
|
|
201
|
+
print(json.dumps(result, ensure_ascii=False))
|
|
202
|
+
else:
|
|
203
|
+
parser.print_help()
|
|
204
|
+
sys.exit(1)
|
|
205
|
+
|
|
206
|
+
|
|
207
|
+
if __name__ == '__main__':
|
|
208
|
+
main()
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "specmem-hardwicksoftware",
|
|
3
|
-
"version": "3.7.34",
|
|
3
|
+
"version": "3.7.36",
|
|
4
4
|
"type": "module",
|
|
5
5
|
"description": "Your Claude Code sessions don't have to start from scratch anymore — SpecMem gives your AI real memory. It won't forget your conversations, your code, or your architecture decisions between sessions. That's the whole point. Semantic code indexing that actually works: TypeScript, JavaScript, Python, Go, Rust, Java, Kotlin, C, C++, HTML and more. It doesn't just track functions — it gets classes, methods, fields, constants, enums, macros, imports, structs, the whole codebase graph. There's chat memory too, powered by pgvector embeddings. You've also got token compression, team coordination, multi-agent comms, and file watching built in. 74+ MCP tools. Runs on PostgreSQL + Docker. It's kind of a big deal. justcalljon.pro",
|
|
6
6
|
"main": "dist/index.js",
|
package/scripts/deploy-hooks.cjs
CHANGED
|
@@ -172,7 +172,7 @@ function getHookConfig() {
|
|
|
172
172
|
],
|
|
173
173
|
"PreToolUse": [
|
|
174
174
|
{
|
|
175
|
-
"matcher": "
|
|
175
|
+
"matcher": "Agent",
|
|
176
176
|
"hooks": [
|
|
177
177
|
{
|
|
178
178
|
"type": "command",
|
|
@@ -421,7 +421,7 @@ function getHookConfig() {
|
|
|
421
421
|
],
|
|
422
422
|
"PostToolUse": [
|
|
423
423
|
{
|
|
424
|
-
"matcher": "
|
|
424
|
+
"matcher": "Agent",
|
|
425
425
|
"hooks": [
|
|
426
426
|
{
|
|
427
427
|
"type": "command",
|
package/scripts/global-postinstall.cjs
CHANGED
|
@@ -1756,7 +1756,7 @@ function configureSettings() {
|
|
|
1756
1756
|
// PreToolUse hooks - Agent loading with chooser - MERGE
|
|
1757
1757
|
const specmemPreToolUseHooks = [
|
|
1758
1758
|
{
|
|
1759
|
-
matcher: '
|
|
1759
|
+
matcher: 'Agent',
|
|
1760
1760
|
hooks: [{
|
|
1761
1761
|
type: 'command',
|
|
1762
1762
|
command: `node ${path.join(CLAUDE_HOOKS_DIR, 'agent-loading-hook.js')}`,
|
|
@@ -1843,7 +1843,7 @@ function configureSettings() {
|
|
|
1843
1843
|
// PostToolUse hooks - agent completion tracking - MERGE
|
|
1844
1844
|
const specmemPostToolUseHooks = [
|
|
1845
1845
|
{
|
|
1846
|
-
matcher: '
|
|
1846
|
+
matcher: 'Agent',
|
|
1847
1847
|
hooks: [{
|
|
1848
1848
|
type: 'command',
|
|
1849
1849
|
command: `node ${path.join(CLAUDE_HOOKS_DIR, 'task-progress-hook.js')}`,
|