specmem-hardwicksoftware 3.7.35 → 3.7.38

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71) hide show
  1. package/CHANGELOG.md +34 -0
  2. package/README.md +11 -15
  3. package/bin/specmem-autoclaude.cjs +12 -1
  4. package/bin/specmem-cli.cjs +1077 -11
  5. package/bin/specmem-console.cjs +890 -63
  6. package/bootstrap.cjs +10 -2
  7. package/claude-hooks/agent-loading-hook.cjs +16 -16
  8. package/claude-hooks/agent-loading-hook.js +28 -21
  9. package/claude-hooks/agent-type-matcher.js +1 -1
  10. package/claude-hooks/background-completion-silencer.js +1 -1
  11. package/claude-hooks/file-claim-enforcer.cjs +37 -36
  12. package/claude-hooks/output-cleaner.cjs +1 -1
  13. package/claude-hooks/refusal-detector-hook.cjs +53 -0
  14. package/claude-hooks/settings.json +64 -4
  15. package/claude-hooks/smart-search-interceptor.js +1 -1
  16. package/claude-hooks/specmem-search-enforcer.cjs +2 -11
  17. package/claude-hooks/specmem-team-member-inject.js +1 -1
  18. package/claude-hooks/specmem-unified-hook.py +1 -1
  19. package/claude-hooks/subagent-loading-hook.cjs +1 -1
  20. package/claude-hooks/task-progress-hook.cjs +7 -7
  21. package/claude-hooks/task-progress-hook.js +3 -3
  22. package/claude-hooks/team-comms-enforcer.cjs +113 -47
  23. package/claude-hooks/use-code-pointers.cjs +1 -1
  24. package/dist/claude-sessions/sessionParser.js +5 -0
  25. package/dist/cli/deploy-to-claude.js +9 -2
  26. package/dist/codebase/codebaseIndexer.js +48 -17
  27. package/dist/codebase/exclusions.js +3 -4
  28. package/dist/codebase/index.js +4 -0
  29. package/dist/codebase/pdfExtractor.js +298 -0
  30. package/dist/dashboard/api/taskTeamMembers.js +2 -2
  31. package/dist/db/bigBrainMigrations.js +29 -0
  32. package/dist/hooks/hookManager.js +4 -4
  33. package/dist/hooks/teamFramingCli.js +1 -1
  34. package/dist/hooks/teamMemberPrepromptHook.js +5 -5
  35. package/dist/index.js +49 -12
  36. package/dist/init/claudeConfigInjector.js +27 -8
  37. package/dist/installer/autoInstall.js +7 -1
  38. package/dist/mcp/compactionProxy.js +1052 -192
  39. package/dist/mcp/compactionProxyDaemon.js +112 -37
  40. package/dist/mcp/contextVault.js +439 -0
  41. package/dist/mcp/embeddingServerManager.js +151 -17
  42. package/dist/mcp/mcpProtocolHandler.js +6 -1
  43. package/dist/mcp/miniCOTServerManager.js +82 -8
  44. package/dist/mcp/specMemServer.js +45 -10
  45. package/dist/mcp/toolRegistry.js +6 -0
  46. package/dist/startup/startupIndexing.js +14 -0
  47. package/dist/team-members/taskOrchestrator.js +3 -3
  48. package/dist/team-members/taskTeamMemberLogger.js +2 -2
  49. package/dist/tools/goofy/deployTeamMember.js +3 -3
  50. package/dist/tools/goofy/digInTheVault.js +81 -0
  51. package/dist/tools/goofy/findCodePointers.js +17 -0
  52. package/dist/tools/goofy/findWhatISaid.js +19 -0
  53. package/dist/tools/goofy/stashTheGoods.js +56 -0
  54. package/dist/tools/teamMemberDeployer.js +2 -2
  55. package/dist/watcher/changeHandler.js +65 -8
  56. package/dist/watcher/changeQueue.js +20 -1
  57. package/embedding-sandbox/frankenstein-embeddings.py +4 -3
  58. package/embedding-sandbox/mini-cot-service.py +11 -13
  59. package/embedding-sandbox/pdf-text-extract.py +208 -0
  60. package/package.json +1 -1
  61. package/scripts/deploy-hooks.cjs +12 -4
  62. package/scripts/fast-batch-embedder.cjs +2 -2
  63. package/scripts/force-retry.cjs +34 -0
  64. package/scripts/global-postinstall.cjs +97 -4
  65. package/scripts/poetic-abliteration.cjs +379 -0
  66. package/scripts/refusal-enforcer.cjs +88 -0
  67. package/scripts/specmem-init.cjs +222 -41
  68. package/specmem/model-config.json +6 -6
  69. package/specmem/supervisord.conf +1 -1
  70. package/svg-sections/readme-token-compaction.svg +246 -0
  71. package/claude-hooks/agent-chooser-hook.js +0 -179
@@ -1,7 +1,7 @@
1
1
  /**
2
- * taskTeamMemberLogger.ts - Logs Code Task team member activity to SpecMem database
2
+ * taskTeamMemberLogger.ts - Logs Code Agent team member activity to SpecMem database
3
3
  *
4
- * yo fr fr this bridges the gap between Code's Task tool and SpecMem tracking
4
+ * yo fr fr this bridges the gap between Code's Agent tool and SpecMem tracking
5
5
  *
6
6
  * Problem: Task-deployed team members are invisible to SpecMem dashboard
7
7
  * Solution: Log team member activity before/after Task deployment
@@ -1,8 +1,8 @@
1
1
  /**
2
- * deployTeamMember - The Task tool but actually works with MCP
2
+ * deployTeamMember - The Agent tool but actually works with MCP
3
3
  *
4
4
  * Spawns team members in screen sessions with full SpecMem MCP access.
5
- * This is basically skidding Code's Task tool but making it not suck.
5
+ * This is basically skidding Code's Agent tool but making it not suck.
6
6
  */
7
7
  import { deployTeamMember as deployTeamMemberImpl } from '../teamMemberDeployer.js';
8
8
  import { logger } from '../../utils/logger.js';
@@ -10,7 +10,7 @@ export class DeployTeamMember {
10
10
  name = 'deployTeamMember';
11
11
  description = `Deploy a team member with full SpecMem MCP access.
12
12
 
13
- This is like the Task tool but actually works - spawned team members get full access
13
+ This is like the Agent tool but actually works - spawned team members get full access
14
14
  to all SpecMem MCP tools including team member communication (sayToTeamMember, listenForMessages,
15
15
  sendHeartbeat, getActiveTeamMembers).
16
16
 
@@ -0,0 +1,81 @@
1
+ /**
2
+ * digInTheVault - search the context vault for stashed content
3
+ *
4
+ * when you need to find something in all that stashed content
5
+ * without loading it all into context. BM25-ranked via tsvector.
6
+ *
7
+ * supports:
8
+ * - query search across all vaults (project-scoped)
9
+ * - targeted search within a specific vault_id
10
+ * - full dump of a vault_id (get_all mode)
11
+ * - stats mode (no query = return vault statistics)
12
+ */
13
+
14
+ import { digInTheVault as doTheDig, getFullStash, getVaultStats } from '../../mcp/contextVault.js';
15
+
16
/**
 * MCP tool wrapper that searches the context vault.
 *
 * Modes handled by execute(), in priority order:
 *  1. get_all + vault_id -> full dump of that stash via getFullStash()
 *  2. no query           -> vault statistics via getVaultStats(), as pretty JSON
 *  3. query              -> ranked chunk search via doTheDig()
 *
 * All failures are reported as a `<vault_dig ok="false" .../>` string rather
 * than thrown, so the caller always receives a string.
 */
export class DigInTheVault {
    name = 'dig_in_the_vault';
    description = 'Search the context vault for previously stashed content. Returns BM25-ranked chunks matching your query. Use vault_id to search a specific stash, or omit to search all. Use get_all:true to retrieve everything from a vault.';

    inputSchema = {
        type: 'object',
        properties: {
            query: {
                type: 'string',
                description: 'What to search for in the vault'
            },
            vault_id: {
                type: 'string',
                description: 'Search within a specific stash (the id from vault_receipt)'
            },
            limit: {
                type: 'number',
                description: 'Max results to return (default: 10)',
                default: 10
            },
            get_all: {
                type: 'boolean',
                description: 'Retrieve ALL chunks for a vault_id in order (ignores query)',
                default: false
            }
        }
    };

    /**
     * Escape a value for interpolation into a double-quoted XML attribute.
     * Without this, a query or error message containing `"` or `<` produces
     * a malformed status tag.
     */
    static #escapeAttr(value) {
        return String(value)
            .replace(/&/g, '&amp;')
            .replace(/"/g, '&quot;')
            .replace(/</g, '&lt;')
            .replace(/>/g, '&gt;');
    }

    /**
     * Run the tool.
     *
     * @param {object} [params]
     * @param {string} [params.query] - search terms; omitted means "return stats"
     * @param {string} [params.vault_id] - restrict the search/dump to one stash
     * @param {number} [params.limit=10] - max results; invalid values fall back to 10
     * @param {boolean} [params.get_all=false] - dump the whole stash (requires vault_id)
     * @returns {Promise<string>} formatted results, stats JSON, or an error tag
     */
    async execute(params = {}) {
        // `?? {}` also covers an explicit null, which a default parameter does not.
        const { query, vault_id, limit = 10, get_all = false } = params ?? {};
        const attr = DigInTheVault.#escapeAttr;

        // Fall back to the documented default for anything that is not a positive number.
        const requested = Number(limit);
        const maxResults = Number.isFinite(requested) && requested >= 1 ? Math.floor(requested) : 10;

        try {
            // get_all mode: dump full stash content
            if (get_all) {
                // Report the missing id instead of silently answering a different question.
                if (!vault_id) {
                    return '<vault_dig ok="false" error="get_all requires vault_id"/>';
                }
                const full = await getFullStash(vault_id);
                if (!full) return `<vault_dig ok="false" error="vault ${attr(vault_id)} not found or expired"/>`;
                return full;
            }

            // No query = return vault stats
            if (!query) {
                const stats = await getVaultStats();
                return JSON.stringify(stats, null, 2);
            }

            // Ranked search
            const results = await doTheDig(query, vault_id || null, maxResults);

            if (results.length === 0) {
                return vault_id
                    ? `<vault_dig ok="false" vault_id="${attr(vault_id)}" query="${attr(query)}" error="no matches — try different terms or get_all:true"/>`
                    : `<vault_dig ok="false" query="${attr(query)}" error="no matches in any vault"/>`;
            }

            // Format results compactly: one header line per chunk, then its content
            const lines = results.map((r, i) => {
                const header = `── [${i + 1}] vault:${r.vault_id} chunk:${r.chunk_idx} rank:${parseFloat(r.rank).toFixed(3)} ──`;
                return `${header}\n${r.content}`;
            });

            return lines.join('\n\n');
        } catch (err) {
            return `<vault_dig ok="false" error="${attr(err?.message ?? err)}"/>`;
        }
    }
}
@@ -410,6 +410,23 @@ export class FindCodePointers {
410
410
  attribution: SPECMEM_ATTRIBUTION
411
411
  };
412
412
  }
413
+ // FAST FAIL: Invalid query detection - return immediately with format hint
414
+ const queryTrimmed = params.query.trim();
415
+ // Check for natural language questions (not code terms)
416
+ const isQuestion = /^(how|what|why|when|where|who|can|does|is|should|would|could)\s/i.test(queryTrimmed);
417
+ if (isQuestion && queryTrimmed.length > 50) {
418
+ // Likely a natural language question instead of code terms - fail fast with hint
419
+ logger.warn({ query: params.query }, '[CodePointers] Invalid query format detected - failing fast with hint');
420
+ return {
421
+ results: [],
422
+ query: params.query,
423
+ total_found: 0,
424
+ search_type: 'semantic',
425
+ attribution: SPECMEM_ATTRIBUTION,
426
+ error: 'INVALID_QUERY_FORMAT',
427
+ error_hint: 'Query appears to be a natural language question. For code search, use CODE TERMS like "admin login auth" NOT "how does admin login work". See /specmem/HOW_TO_USE.md for examples.'
428
+ };
429
+ }
413
430
  // MODE SELECTION: Return options if user wants to choose
414
431
  if (params.galleryMode === 'ask') {
415
432
  return this.returnModeOptions(params.query);
@@ -690,6 +690,25 @@ export class FindWhatISaid {
690
690
  highlights: []
691
691
  }];
692
692
  }
693
+ // FAST FAIL: Whitespace-only or clearly invalid query
694
+ const queryTrimmed = params.query.trim();
695
+ if (queryTrimmed.length < 2) {
696
+ logger.warn({ query: params.query }, '[find_memory] Query too short - failing fast');
697
+ return [{
698
+ memory: {
699
+ id: 'error',
700
+ content: 'Query too short. Provide at least 2 characters for meaningful search.',
701
+ createdAt: new Date(),
702
+ updatedAt: new Date(),
703
+ tags: ['error'],
704
+ importance: 'low',
705
+ memoryType: 'semantic',
706
+ metadata: { _isError: true }
707
+ },
708
+ similarity: 0,
709
+ highlights: []
710
+ }];
711
+ }
693
712
  logger.debug({ query: params.query, limit: params.limit }, 'searching memories fr');
694
713
  // Broadcast COT to dashboard
695
714
  cotStart('find_memory', params.query || 'browsing');
@@ -0,0 +1,56 @@
1
+ /**
2
+ * stashTheGoods - manually vault content for later retrieval
3
+ *
4
+ * when you got data that's too thicc for the context window
5
+ * but you might need it later, stash it in the vault fr
6
+ *
7
+ * auto-stash handles most cases, but this tool lets claude
8
+ * manually stash anything it wants to keep searchable
9
+ */
10
+
11
+ import { stashTheGoods as doTheStash, formatVaultReceipt } from '../../mcp/contextVault.js';
12
+
13
/**
 * MCP tool wrapper that manually stashes content in the context vault.
 *
 * execute() never throws: every outcome is returned as a string, either the
 * receipt produced by formatVaultReceipt() or a
 * `<vault_receipt ok="false" .../>` error tag.
 */
export class StashTheGoods {
    name = 'stash_the_goods';
    description = 'Stash large content in the context vault for token-efficient retrieval later. Content is chunked, indexed with full-text search, and retrievable via dig_in_the_vault. Auto-expires after 24h.';

    inputSchema = {
        type: 'object',
        properties: {
            content: {
                type: 'string',
                description: 'The content to stash in the vault'
            },
            tool_name: {
                type: 'string',
                description: 'Which tool produced this content (for tracking)'
            },
            tags: {
                type: 'array',
                items: { type: 'string' },
                description: 'Optional tags for categorization'
            }
        },
        required: ['content']
    };

    /**
     * Stash `content` in the vault.
     *
     * @param {object} [params]
     * @param {string} params.content - text to stash (required, non-empty)
     * @param {string} [params.tool_name] - originating tool; defaults to 'manual_stash'
     * @param {string[]} [params.tags] - optional tags; anything that is not an array is ignored
     * @returns {Promise<string>} vault receipt or an error tag
     */
    async execute(params = {}) {
        // `?? {}` also covers an explicit null, which a default parameter does not.
        const { content, tool_name, tags } = params ?? {};

        if (content == null || content === '') {
            return '<vault_receipt ok="false" error="nothing to stash fr"/>';
        }
        // The schema declares content as a string; reject anything else here
        // instead of handing an arbitrary value to the vault.
        if (typeof content !== 'string') {
            return '<vault_receipt ok="false" error="content must be a string"/>';
        }

        try {
            const result = await doTheStash(content, {
                tool: tool_name || 'manual_stash',
                tags: Array.isArray(tags) ? tags : [],
                manual: true,
            });

            return formatVaultReceipt(result);
        } catch (err) {
            // Escape the message so quotes/angle brackets cannot break the attribute.
            const detail = String(err?.message ?? err)
                .replace(/&/g, '&amp;')
                .replace(/"/g, '&quot;')
                .replace(/</g, '&lt;')
                .replace(/>/g, '&gt;');
            return `<vault_receipt ok="false" error="${detail}"/>`;
        }
    }
}
@@ -1,7 +1,7 @@
1
1
  /**
2
2
  * Team Member Deployer Tool
3
3
  *
4
- * "Skidded" version of Code's Task tool that actually works with MCP
4
+ * "Skidded" version of Code's Agent tool that actually works with MCP
5
5
  * Spawns team members with full SpecMem MCP access
6
6
  *
7
7
  * Now integrates with TeamCommsService for team-based team member coordination:
@@ -23,7 +23,7 @@ import { getAgentsJson, isValidAgentType } from './agentDefinitions.js';
23
23
  /**
24
24
  * Deploy a team member with full SpecMem MCP access
25
25
  *
26
- * This is basically the Task tool but it actually fucking works
26
+ * This is basically the Agent tool but it actually fucking works
27
27
  * because we spawn team members with MCP configured
28
28
  *
29
29
  * Now with team communication support for coordinated multi-team-member work
@@ -21,6 +21,7 @@ import { getCoordinator } from '../coordination/integration.js';
21
21
  import { isMinifiedOrBundled, isBinaryFile, EXCLUSION_CONFIG } from '../codebase/exclusions.js';
22
22
  import { getProjectPathForInsert } from '../services/ProjectContext.js';
23
23
  import { getEmbeddingTimeout } from '../config/embeddingTimeouts.js';
24
+ import { extractPdfText, isPdfFile } from '../codebase/pdfExtractor.js';
24
25
  // Retry helper for transient embedding failures (timeout, socket reset, etc.)
25
26
  const WATCHER_MAX_RETRIES = parseInt(process.env['SPECMEM_WATCHER_RETRIES'] || '2');
26
27
  async function withWatcherRetry(operation, filePath) {
@@ -132,6 +133,51 @@ export class AutoUpdateTheMemories {
132
133
  this.stats.filesSkipped++;
133
134
  return;
134
135
  }
136
+ // PDF files: extract text via PyMuPDF instead of reading as UTF-8
137
+ if (isPdfFile(event.path)) {
138
+ const pdfResult = await extractPdfText(event.path);
139
+ if (!pdfResult || !pdfResult.text) {
140
+ logger.debug({ path: event.relativePath }, 'PDF extraction failed or empty — skipping');
141
+ this.stats.filesSkipped++;
142
+ return;
143
+ }
144
+ // Use extracted PDF text as content for the standard metadata flow
145
+ const metadata = await this.extractFileMetadata(event.path, event.relativePath, pdfResult.text);
146
+ if (metadata.size > this.config.maxFileSizeBytes) {
147
+ this.stats.filesSkipped++;
148
+ return;
149
+ }
150
+ const content = pdfResult.text;
151
+ const existingMemory = await this.findMemoryByContentHash(metadata.contentHash);
152
+ if (existingMemory) {
153
+ this.stats.filesSkipped++;
154
+ return;
155
+ }
156
+ let embedding;
157
+ const WATCHER_EMBEDDING_TIMEOUT = getEmbeddingTimeout('fileWatcher');
158
+ try {
159
+ embedding = await withWatcherRetry(async () => {
160
+ return new Promise((resolve, reject) => {
161
+ const timeoutId = setTimeout(() => reject(new Error('Embedding timeout for PDF')), WATCHER_EMBEDDING_TIMEOUT);
162
+ this.config.embeddingProvider.generateEmbedding(content)
163
+ .then(result => { clearTimeout(timeoutId); resolve(result); })
164
+ .catch(error => { clearTimeout(timeoutId); reject(error); });
165
+ });
166
+ }, event.path);
167
+ } catch (embErr) {
168
+ logger.warn({ path: event.relativePath, error: embErr.message }, 'PDF embedding failed — storing without embedding');
169
+ }
170
+ await this.storeMemory({
171
+ content,
172
+ metadata,
173
+ embedding,
174
+ tags: ['codebase', 'auto-ingested', 'pdf'],
175
+ });
176
+ this.stats.filesIngested++;
177
+ logger.info({ path: event.relativePath, pages: pdfResult.pages, chars: pdfResult.chars }, 'PDF file indexed');
178
+ this.coordinator.emitFileAdded(event.path, event.relativePath, metadata.size);
179
+ return;
180
+ }
135
181
  // check if binary
136
182
  if (await isBinaryFile(event.path)) {
137
183
  logger.debug({ path: event.relativePath }, 'skipping binary file');
@@ -250,14 +296,25 @@ export class AutoUpdateTheMemories {
250
296
  this.stats.filesSkipped++;
251
297
  return;
252
298
  }
253
- // FIX MED-13: Check binary before extractFileMetadata (same as handleFileAdded)
254
- if (await isBinaryFile(event.path)) {
255
- logger.debug({ path: event.relativePath }, 'skipping binary file update');
256
- this.stats.filesSkipped++;
257
- return;
299
+ // PDF files: extract text via PyMuPDF
300
+ let pdfContent = null;
301
+ if (isPdfFile(event.path)) {
302
+ const pdfResult = await extractPdfText(event.path);
303
+ if (!pdfResult || !pdfResult.text) {
304
+ this.stats.filesSkipped++;
305
+ return;
306
+ }
307
+ pdfContent = pdfResult.text;
308
+ } else {
309
+ // FIX MED-13: Check binary before extractFileMetadata (same as handleFileAdded)
310
+ if (await isBinaryFile(event.path)) {
311
+ logger.debug({ path: event.relativePath }, 'skipping binary file update');
312
+ this.stats.filesSkipped++;
313
+ return;
314
+ }
258
315
  }
259
316
  // extract new metadata (FIX 7.04: content included to avoid double read)
260
- const metadata = await this.extractFileMetadata(event.path, event.relativePath);
317
+ const metadata = await this.extractFileMetadata(event.path, event.relativePath, pdfContent);
261
318
  // check file size
262
319
  if (metadata.size > this.config.maxFileSizeBytes) {
263
320
  logger.warn({
@@ -419,9 +476,9 @@ export class AutoUpdateTheMemories {
419
476
  /**
420
477
  * extractFileMetadata - reads file and generates metadata
421
478
  */
422
- async extractFileMetadata(path, relativePath) {
479
+ async extractFileMetadata(path, relativePath, preExtractedContent) {
423
480
  const stats = await fs.stat(path);
424
- const content = await fs.readFile(path, 'utf-8');
481
+ const content = preExtractedContent || await fs.readFile(path, 'utf-8');
425
482
  const contentHash = this.hashContent(content);
426
483
  return {
427
484
  path,
@@ -22,6 +22,7 @@ export class QueueTheChangesUp {
22
22
  config;
23
23
  queue = [];
24
24
  processing = false;
25
+ paused = false; // pause queue processing without stopping (e.g. during background indexing)
25
26
  processingInterval = null;
26
27
  changeHandler;
27
28
  // deduplication map: path -> latest queued change
@@ -162,13 +163,31 @@ export class QueueTheChangesUp {
162
163
  logger.debug({ cancelledTimeouts: cancelledCount }, 'cancelled pending retry timeouts');
163
164
  }
164
165
  }
166
+ /**
167
+ * pause - temporarily halt batch processing without stopping the queue.
168
+ * Changes still enqueue but won't be processed until resume().
169
+ * Used during background indexing to avoid resource contention.
170
+ */
171
+ pause(reason = '') {
172
+ if (this.paused) return;
173
+ this.paused = true;
174
+ logger.info({ reason, pendingCount: this.queue.length }, 'queue PAUSED');
175
+ }
176
+ /**
177
+ * resume - resume batch processing after pause
178
+ */
179
+ resume() {
180
+ if (!this.paused) return;
181
+ this.paused = false;
182
+ logger.info({ pendingCount: this.queue.length }, 'queue RESUMED');
183
+ }
165
184
  /**
166
185
  * processBatch - processes a batch of changes
167
186
  *
168
187
  * nah bruh processing this whole batch at once
169
188
  */
170
189
  async processBatch() {
171
- if (this.queue.length === 0) {
190
+ if (this.paused || this.queue.length === 0) {
172
191
  return;
173
192
  }
174
193
  logger.debug({
@@ -3916,20 +3916,21 @@ class EmbeddingServer:
3916
3916
  # Extract requestId for persistent socket multiplexing
3917
3917
  request_id = request.get('requestId')
3918
3918
 
3919
- # Send "processing" heartbeat ONLY for embedding requests (not health/kys/ready)
3919
+ # Send "working" status ONLY for embedding requests (not health/kys/ready)
3920
+ # "working" means actually processing your query (vs "processing" which was ambiguous)
3920
3921
  # Meta requests expect a single response - sending a heartbeat first breaks the protocol
3921
3922
  # and causes clients to read the heartbeat as the actual response
3922
3923
  if not is_meta_request:
3923
3924
  text = request.get('text') or request.get('texts')
3924
3925
  text_length = len(text) if isinstance(text, str) else (len(text) if text else 0)
3925
3926
  heartbeat = {
3926
- 'status': 'processing',
3927
+ 'status': 'working',
3927
3928
  'text_length': text_length
3928
3929
  }
3929
3930
  if request_id:
3930
3931
  heartbeat['requestId'] = request_id
3931
3932
  hb_ok = self._safe_sendall(conn, json.dumps(heartbeat).encode('utf-8') + b'\n')
3932
- print(f"[WORKER {thread_name}] Heartbeat sent ok={hb_ok}", file=sys.stderr, flush=True)
3933
+ print(f"[WORKER {thread_name}] Working status sent ok={hb_ok}", file=sys.stderr, flush=True)
3933
3934
 
3934
3935
  # Process - each thread gets its own call stack
3935
3936
  print(f"[WORKER {thread_name}] Calling handle_request(type={req_type})...", file=sys.stderr, flush=True)
@@ -391,20 +391,18 @@ class ModelManager:
391
391
  file_name="model_quantized.onnx"
392
392
  )
393
393
  else:
394
- # Fallback: download PyTorch model from HuggingFace
395
- print(f"🧠 No local model, downloading {self.model_name}...", file=sys.stderr)
396
- from transformers import AutoModelForCausalLM
397
- cache_dir = '/tmp/mini-cot-models'
398
- os.makedirs(cache_dir, exist_ok=True)
399
- self.torch_model = AutoModelForCausalLM.from_pretrained(
400
- self.model_name,
401
- cache_dir=cache_dir,
402
- torch_dtype=torch.float32,
403
- low_cpu_mem_usage=True,
404
- trust_remote_code=True
394
+ # No local model found — refuse to download from the internet
395
+ search_paths = [
396
+ os.environ.get('SPECMEM_MODEL_CACHE', '(not set)'),
397
+ os.path.join(os.path.dirname(__file__), '..', 'models', 'pythia-410m-onnx-quant'),
398
+ '/app/models/pythia-onnx-quant',
399
+ ]
400
+ raise RuntimeError(
401
+ f"Local ONNX model not found. Searched:\n"
402
+ + "\n".join(f" - {p}" for p in search_paths)
403
+ + "\n\nRun `specmem init` to download models via Git LFS release tarball."
404
+ + "\nSpecMem will NOT download models from the internet at runtime."
405
405
  )
406
- self.torch_model.eval()
407
- torch.set_grad_enabled(False)
408
406
 
409
407
  self.torch_loaded = True
410
408
  print(f"🧠 Generation model loaded for crawl analysis", file=sys.stderr)
@@ -0,0 +1,208 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ pdf-text-extract.py — PDF text extraction for SpecMem codebase indexing
4
+
5
+ Uses PyMuPDF (fitz) for instant digital PDF text extraction (0.003s/page).
6
+ Falls back to Tesseract OCR via PyMuPDF's built-in integration for scanned pages.
7
+
8
+ Usage:
9
+ # Single file mode
10
+ python3 pdf-text-extract.py <pdf_path> [--max-pages N] [--language LANG]
11
+
12
+ # Batch mode (JSONL — one result per line, one Python startup for N PDFs)
13
+ python3 pdf-text-extract.py --batch file1.pdf file2.pdf ... [--max-pages N]
14
+
15
+ Output (JSON/JSONL to stdout):
16
+ {"path": "/abs/path.pdf", "text": "...", "pages": 5, "scanned_pages": [3], "chars": 12345}
17
+ {"path": "/abs/path2.pdf", "error": "..."}
18
+ """
19
+
20
+ import sys
21
+ import json
22
+ import os
23
+ import argparse
24
+
25
+ # ---------------------------------------------------------------------------
26
+ # Auto-install pymupdf if missing (matches frankenstein-embeddings pattern)
27
+ # ---------------------------------------------------------------------------
28
def _ensure_pymupdf():
    """
    Return the PyMuPDF module, installing it with pip if it cannot be imported.

    Import order: the ``pymupdf`` name first, then the legacy ``fitz`` name.
    If neither imports, runs ``<python> -m pip install --quiet pymupdf`` and
    retries both names. Any failure during install or the retry is reported
    through ``_error_exit`` (JSON error on stdout, exit status 1).
    """
    import importlib

    def _import_first_available():
        # Prefer the current package name; fall back to the legacy alias.
        for module_name in ('pymupdf', 'fitz'):
            try:
                return importlib.import_module(module_name)
            except ImportError:
                continue
        return None

    module = _import_first_available()
    if module is not None:
        return module

    # Auto-install
    try:
        import subprocess
        sys.stderr.write('[pdf-text-extract] pymupdf not found, installing...\n')
        subprocess.check_call(
            [sys.executable, '-m', 'pip', 'install', '--quiet', 'pymupdf'],
            stdout=subprocess.DEVNULL
        )
        # The import system caches directory listings; a package installed
        # while this process is running may stay invisible until the caches
        # are invalidated.
        importlib.invalidate_caches()
        module = _import_first_available()
        if module is None:
            raise ImportError('pymupdf is still not importable after installation')
        return module
    except Exception as e:
        _error_exit(f'Failed to install pymupdf: {e}')
56
+
57
+
58
def _error_exit(msg):
    """Write ``{"error": <msg>}`` as one JSON line to stdout, then exit with status 1."""
    payload = json.dumps({'error': str(msg)}, ensure_ascii=False)
    sys.stdout.write(payload + '\n')
    sys.exit(1)
62
+
63
+
64
def _is_scanned_page(page, text):
    """
    Heuristic: treat the page as scanned when BOTH hold:
      1. Extracted text is very short (at most 50 chars after stripping)
      2. The summed bounding-box area of the page's images is at least 60%
         of the page area (overlapping images are counted more than once)

    Args:
        page: object exposing ``get_image_info()`` and ``rect``; ``abs(page.rect)``
            is used as the page area.
        text: text already extracted from the page.

    Returns:
        True if the page looks scanned, otherwise False. Any error while
        inspecting the page is treated as "not scanned".
    """
    if len(text.strip()) > 50:
        return False

    try:
        images = page.get_image_info()
        if not images:
            return False
        page_area = abs(page.rect)
        if page_area == 0:
            return False

        image_area = 0.0
        for img in images:
            bbox = img.get('bbox')
            if not bbox:
                continue
            # bbox is (x0, y0, x1, y1). Compute width * height directly rather
            # than importing the PDF library per image to build a Rect; an
            # inverted or empty box contributes no area.
            x0, y0, x1, y1 = bbox
            image_area += max(0.0, x1 - x0) * max(0.0, y1 - y0)

        return (image_area / page_area) >= 0.6
    except Exception:
        return False
94
+
95
+
96
def _ocr_page(page, language='eng'):
    """
    Run OCR on a page through ``page.get_textpage_ocr`` (300 dpi) and return
    the stripped text.

    Never raises: any exception is logged to stderr and an empty string is
    returned. Errors whose message mentions "tesseract", "not installed" or
    "not found" are logged as "Tesseract not available"; everything else is
    logged as a generic OCR failure.
    """
    try:
        ocr_textpage = page.get_textpage_ocr(language=language, dpi=300)
        recognized = page.get_text(textpage=ocr_textpage)
        return recognized.strip()
    except Exception as e:
        lowered = str(e).lower()
        tesseract_missing = any(
            marker in lowered for marker in ('tesseract', 'not installed', 'not found')
        )
        if tesseract_missing:
            # OCR engine unavailable: skip OCR, caller keeps whatever text it had
            sys.stderr.write(f'[pdf-text-extract] Tesseract not available, skipping OCR for scanned page\n')
        else:
            # Unknown failure: report it but still do not crash
            sys.stderr.write(f'[pdf-text-extract] OCR failed: {e}\n')
        return ''
113
+
114
+
115
def extract_pdf(pdf_path, max_pages=100, language='eng'):
    """
    Extract text from a PDF using PyMuPDF.

    Each of the first ``max_pages`` pages is read with ``page.get_text()``.
    Pages flagged by ``_is_scanned_page`` are retried with ``_ocr_page``; if
    OCR yields text it replaces the page text and the 1-indexed page number is
    recorded. When more than one page is processed, each page's text is
    prefixed with a ``--- Page N ---`` header.

    Args:
        pdf_path: path to the PDF file.
        max_pages: maximum number of pages to process; values below zero are
            treated as zero.
        language: OCR language passed to ``_ocr_page``.

    Returns:
        dict. On success: ``text``, ``pages`` (pages processed), ``chars``,
        plus ``scanned_pages`` when OCR was used and ``truncated`` /
        ``total_pages`` when the document has more than ``max_pages`` pages.
        On failure: ``{'error': <message>}``. This function does not raise for
        missing, unreadable, password-protected or corrupt files.
    """
    pymupdf = _ensure_pymupdf()

    if not os.path.isfile(pdf_path):
        return {'error': f'File not found: {pdf_path}'}

    try:
        doc = pymupdf.open(pdf_path)
    except Exception as e:
        msg = str(e).lower()
        if 'password' in msg or 'encrypt' in msg:
            return {'error': f'PDF is password-protected: {pdf_path}'}
        return {'error': f'Failed to open PDF: {e}'}

    try:
        # A protected document can open without raising; it only fails once
        # pages are accessed. Check the document's own flag up front.
        if getattr(doc, 'needs_pass', False):
            return {'error': f'PDF is password-protected: {pdf_path}'}

        total_pages = len(doc)
        process_count = max(0, min(total_pages, max_pages))
        truncated = total_pages > max_pages

        texts = []
        scanned_pages = []

        for i in range(process_count):
            page = doc[i]
            text = page.get_text().strip()

            if _is_scanned_page(page, text):
                # Try OCR
                ocr_text = _ocr_page(page, language)
                if ocr_text:
                    text = ocr_text
                    scanned_pages.append(i + 1)  # 1-indexed
                # If OCR also empty, keep whatever minimal text we got

            if text:
                if process_count > 1:
                    texts.append(f'--- Page {i + 1} ---\n{text}')
                else:
                    texts.append(text)

        full_text = '\n\n'.join(texts)

        result = {
            'text': full_text,
            'pages': process_count,
            'chars': len(full_text),
        }

        if scanned_pages:
            result['scanned_pages'] = scanned_pages
        if truncated:
            result['truncated'] = True
            result['total_pages'] = total_pages

        return result
    except Exception as e:
        # Report page-level failures in the same shape as open failures so a
        # single bad file yields an error record instead of a traceback.
        return {'error': f'Failed to extract text: {e}'}
    finally:
        # Always release the document handle, including on early return/error.
        doc.close()
176
+
177
+
178
def main():
    """
    Command-line entry point.

    With ``--batch`` every listed PDF is extracted and printed as one JSON
    line each (flushed per line). Otherwise a single positional path is
    extracted and printed as one JSON object. With neither, the help text is
    printed and the process exits with status 1. Each printed record carries
    the input path under ``path``.
    """
    parser = argparse.ArgumentParser(description='Extract text from PDF files')
    parser.add_argument('pdf_path', nargs='?', help='Path to the PDF file (single mode)')
    parser.add_argument('--batch', nargs='+', metavar='PDF',
                        help='Batch mode: extract multiple PDFs (JSONL output, one line per PDF)')
    parser.add_argument('--max-pages', type=int, default=100,
                        help='Maximum pages to process per PDF (default: 100)')
    parser.add_argument('--language', default='eng',
                        help='Tesseract language for OCR fallback (default: eng)')
    args = parser.parse_args()

    def _emit(path, **print_options):
        # Extract one PDF and print its record, tagged with the source path.
        record = extract_pdf(path, args.max_pages, args.language)
        record['path'] = path
        print(json.dumps(record, ensure_ascii=False), **print_options)

    if args.batch:
        # Batch mode takes precedence: one interpreter start-up, N PDFs, JSONL out.
        for path in args.batch:
            _emit(path, flush=True)
        return

    if args.pdf_path:
        _emit(args.pdf_path)
        return

    parser.print_help()
    sys.exit(1)
205
+
206
+
207
+ if __name__ == '__main__':
208
+ main()
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "specmem-hardwicksoftware",
3
- "version": "3.7.35",
3
+ "version": "3.7.38",
4
4
  "type": "module",
5
5
  "description": "Your Claude Code sessions don't have to start from scratch anymore — SpecMem gives your AI real memory. It won't forget your conversations, your code, or your architecture decisions between sessions. That's the whole point. Semantic code indexing that actually works: TypeScript, JavaScript, Python, Go, Rust, Java, Kotlin, C, C++, HTML and more. It doesn't just track functions — it gets classes, methods, fields, constants, enums, macros, imports, structs, the whole codebase graph. There's chat memory too, powered by pgvector embeddings. You've also got token compression, team coordination, multi-agent comms, and file watching built in. 74+ MCP tools. Runs on PostgreSQL + Docker. It's kind of a big deal. justcalljon.pro",
6
6
  "main": "dist/index.js",