bluera-knowledge 0.9.43 → 0.10.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/plugin.json +1 -1
- package/CHANGELOG.md +75 -0
- package/README.md +114 -42
- package/commands/sync.md +96 -0
- package/dist/{chunk-MQE32YY6.js → chunk-6U45VP5Z.js} +42 -6
- package/dist/chunk-6U45VP5Z.js.map +1 -0
- package/dist/{chunk-CUHYSPRV.js → chunk-DP5XBPQV.js} +372 -2
- package/dist/chunk-DP5XBPQV.js.map +1 -0
- package/dist/{chunk-DWAIT2OD.js → chunk-UE4ZIJYA.js} +74 -5
- package/dist/{chunk-DWAIT2OD.js.map → chunk-UE4ZIJYA.js.map} +1 -1
- package/dist/index.js +216 -7
- package/dist/index.js.map +1 -1
- package/dist/mcp/server.js +2 -2
- package/dist/workers/background-worker-cli.js +4 -3
- package/dist/workers/background-worker-cli.js.map +1 -1
- package/hooks/check-dependencies.sh +29 -0
- package/package.json +1 -1
- package/python/crawl_worker.py +6 -1
- package/src/cli/commands/crawl.test.ts +43 -3
- package/src/cli/commands/crawl.ts +3 -3
- package/src/cli/commands/sync.test.ts +54 -0
- package/src/cli/commands/sync.ts +264 -0
- package/src/cli/index.ts +1 -0
- package/src/crawl/claude-client.test.ts +195 -24
- package/src/crawl/claude-client.ts +38 -3
- package/src/crawl/intelligent-crawler.test.ts +65 -0
- package/src/crawl/intelligent-crawler.ts +14 -2
- package/src/index.ts +2 -0
- package/src/mcp/commands/index.ts +2 -0
- package/src/mcp/commands/sync.commands.test.ts +283 -0
- package/src/mcp/commands/sync.commands.ts +233 -0
- package/src/services/gitignore.service.test.ts +157 -0
- package/src/services/gitignore.service.ts +132 -0
- package/src/services/store-definition.service.test.ts +440 -0
- package/src/services/store-definition.service.ts +198 -0
- package/src/services/store.service.test.ts +279 -1
- package/src/services/store.service.ts +101 -4
- package/src/types/index.ts +18 -0
- package/src/types/store-definition.test.ts +492 -0
- package/src/types/store-definition.ts +129 -0
- package/src/workers/background-worker.ts +1 -1
- package/dist/chunk-CUHYSPRV.js.map +0 -1
- package/dist/chunk-MQE32YY6.js.map +0 -1
package/src/crawl/claude-client.test.ts
CHANGED

@@ -81,7 +81,11 @@ describe('ClaudeClient', () => {
 
   describe('determineCrawlUrls', () => {
     it('should successfully parse valid crawl strategy response', async () => {
-      const promise = client.determineCrawlUrls(
+      const promise = client.determineCrawlUrls(
+        'https://example.com',
+        '<html>test</html>',
+        'Find all docs'
+      );
 
       // Simulate successful response
       setTimeout(() => {
@@ -102,8 +106,68 @@ describe('ClaudeClient', () => {
       expect(result.reasoning).toBe('Found documentation pages');
     });
 
+    it('should extract structured_output from Claude CLI wrapper format', async () => {
+      const promise = client.determineCrawlUrls(
+        'https://example.com',
+        '<html>test</html>',
+        'Find all docs'
+      );
+
+      // Claude CLI with --json-schema returns this wrapper format
+      setTimeout(() => {
+        mockProcess.stdout.emit(
+          'data',
+          Buffer.from(
+            JSON.stringify({
+              type: 'result',
+              subtype: 'success',
+              result: '',
+              structured_output: {
+                urls: ['https://example.com/page1', 'https://example.com/page2'],
+                reasoning: 'Found documentation pages',
+              },
+            })
+          )
+        );
+        mockProcess.emit('close', 0);
+      }, 10);
+
+      const result = await promise;
+      expect(result.urls).toEqual(['https://example.com/page1', 'https://example.com/page2']);
+      expect(result.reasoning).toBe('Found documentation pages');
+    });
+
+    it('should fall back to raw response when structured_output is not an object', async () => {
+      const promise = client.determineCrawlUrls(
+        'https://example.com',
+        '<html>test</html>',
+        'Find all docs'
+      );
+
+      // When structured_output is not an object, use the raw response
+      // (which will fail validation if it doesn't have urls/reasoning)
+      setTimeout(() => {
+        mockProcess.stdout.emit(
+          'data',
+          Buffer.from(
+            JSON.stringify({
+              type: 'result',
+              structured_output: 'not an object',
+            })
+          )
+        );
+        mockProcess.emit('close', 0);
+      }, 10);
+
+      await expect(promise).rejects.toThrow('invalid crawl strategy');
+    });
+
     it('should call spawn with correct arguments for determineCrawlUrls', async () => {
-      const promise = client.determineCrawlUrls(
+      const promise = client.determineCrawlUrls(
+        'https://example.com',
+        '<html>test</html>',
+        'Find all docs'
+      );
 
       setTimeout(() => {
         mockProcess.stdout.emit(
@@ -136,7 +200,11 @@ describe('ClaudeClient', () => {
     });
 
     it('should write prompt to stdin', async () => {
-      const promise = client.determineCrawlUrls(
+      const promise = client.determineCrawlUrls(
+        'https://example.com',
+        '<html><body>Test</body></html>',
+        'Find tutorials'
+      );
 
       setTimeout(() => {
         mockProcess.stdout.emit(
@@ -160,7 +228,11 @@ describe('ClaudeClient', () => {
     });
 
     it('should reject when response has no urls array', async () => {
-      const promise = client.determineCrawlUrls(
+      const promise = client.determineCrawlUrls(
+        'https://example.com',
+        '<html>test</html>',
+        'Find all'
+      );
 
       setTimeout(() => {
         mockProcess.stdout.emit(
@@ -178,7 +250,11 @@ describe('ClaudeClient', () => {
     });
 
     it('should reject when response has empty urls array', async () => {
-      const promise = client.determineCrawlUrls(
+      const promise = client.determineCrawlUrls(
+        'https://example.com',
+        '<html>test</html>',
+        'Find all'
+      );
 
       setTimeout(() => {
         mockProcess.stdout.emit(
@@ -197,7 +273,11 @@ describe('ClaudeClient', () => {
     });
 
     it('should reject when response has no reasoning', async () => {
-      const promise = client.determineCrawlUrls(
+      const promise = client.determineCrawlUrls(
+        'https://example.com',
+        '<html>test</html>',
+        'Find all'
+      );
 
       setTimeout(() => {
         mockProcess.stdout.emit(
@@ -215,7 +295,11 @@ describe('ClaudeClient', () => {
     });
 
     it('should reject when urls contains non-string values', async () => {
-      const promise = client.determineCrawlUrls(
+      const promise = client.determineCrawlUrls(
+        'https://example.com',
+        '<html>test</html>',
+        'Find all'
+      );
 
       setTimeout(() => {
         mockProcess.stdout.emit(
@@ -234,7 +318,11 @@ describe('ClaudeClient', () => {
     });
 
     it('should reject when response is not valid JSON', async () => {
-      const promise = client.determineCrawlUrls(
+      const promise = client.determineCrawlUrls(
+        'https://example.com',
+        '<html>test</html>',
+        'Find all'
+      );
 
       setTimeout(() => {
         mockProcess.stdout.emit('data', Buffer.from('Not valid JSON'));
@@ -245,7 +333,11 @@ describe('ClaudeClient', () => {
     });
 
     it('should reject when response is null', async () => {
-      const promise = client.determineCrawlUrls(
+      const promise = client.determineCrawlUrls(
+        'https://example.com',
+        '<html>test</html>',
+        'Find all'
+      );
 
       setTimeout(() => {
         mockProcess.stdout.emit('data', Buffer.from('null'));
@@ -257,7 +349,7 @@ describe('ClaudeClient', () => {
 
     it('should truncate HTML longer than 50000 characters', async () => {
       const longHtml = '<html>' + 'a'.repeat(60000) + '</html>';
-      const promise = client.determineCrawlUrls(longHtml, 'Find all');
+      const promise = client.determineCrawlUrls('https://example.com', longHtml, 'Find all');
 
       setTimeout(() => {
         mockProcess.stdout.emit(
@@ -281,7 +373,7 @@ describe('ClaudeClient', () => {
 
     it('should not truncate HTML shorter than 50000 characters', async () => {
       const shortHtml = '<html><body>Short content</body></html>';
-      const promise = client.determineCrawlUrls(shortHtml, 'Find all');
+      const promise = client.determineCrawlUrls('https://example.com', shortHtml, 'Find all');
 
       setTimeout(() => {
         mockProcess.stdout.emit(
@@ -302,6 +394,33 @@ describe('ClaudeClient', () => {
       expect(writtenPrompt).toContain(shortHtml);
       expect(writtenPrompt).not.toContain('[... HTML truncated ...]');
     });
+
+    it('should include seedUrl in prompt for relative URL resolution', async () => {
+      const promise = client.determineCrawlUrls(
+        'https://code.claude.com/docs',
+        '<html><a href="/docs/en/hooks">Hooks</a></html>',
+        'Find all docs'
+      );
+
+      setTimeout(() => {
+        mockProcess.stdout.emit(
+          'data',
+          Buffer.from(
+            JSON.stringify({
+              urls: ['https://code.claude.com/docs/en/hooks'],
+              reasoning: 'Found hooks documentation',
+            })
+          )
+        );
+        mockProcess.emit('close', 0);
+      }, 10);
+
+      await promise;
+
+      const writtenPrompt = vi.mocked(mockProcess.stdin.write).mock.calls[0]?.[0] as string;
+      expect(writtenPrompt).toContain('Base URL: https://code.claude.com/docs');
+      expect(writtenPrompt).toContain('resolve them against the Base URL');
+    });
   });
 
   describe('extractContent', () => {
@@ -386,7 +505,11 @@ describe('ClaudeClient', () => {
 
   describe('Subprocess Management', () => {
     it('should handle process spawn errors', async () => {
-      const promise = client.determineCrawlUrls(
+      const promise = client.determineCrawlUrls(
+        'https://example.com',
+        '<html>test</html>',
+        'Find all'
+      );
 
       setTimeout(() => {
         mockProcess.emit('error', new Error('spawn ENOENT'));
@@ -410,7 +533,11 @@ describe('ClaudeClient', () => {
     });
 
     it('should collect stderr data', async () => {
-      const promise = client.determineCrawlUrls(
+      const promise = client.determineCrawlUrls(
+        'https://example.com',
+        '<html>test</html>',
+        'Find all'
+      );
 
       setTimeout(() => {
         mockProcess.stderr.emit('data', Buffer.from('Error message 1\n'));
@@ -466,14 +593,22 @@ describe('ClaudeClient', () => {
 
   describe('Timeout Handling', () => {
     it('should timeout after configured timeout period', async () => {
-      const promise = client.determineCrawlUrls(
+      const promise = client.determineCrawlUrls(
+        'https://example.com',
+        '<html>test</html>',
+        'Find all'
+      );
 
       // Don't emit close event - let it timeout
       await expect(promise).rejects.toThrow('timed out after 100ms');
     });
 
     it('should kill process on timeout', async () => {
-      const promise = client.determineCrawlUrls(
+      const promise = client.determineCrawlUrls(
+        'https://example.com',
+        '<html>test</html>',
+        'Find all'
+      );
 
       await expect(promise).rejects.toThrow('timed out');
       expect(mockProcess.kill).toHaveBeenCalledWith('SIGTERM');
@@ -540,7 +675,11 @@ describe('ClaudeClient', () => {
 
   describe('JSON Parsing', () => {
     it('should handle malformed JSON in response', async () => {
-      const promise = client.determineCrawlUrls(
+      const promise = client.determineCrawlUrls(
+        'https://example.com',
+        '<html>test</html>',
+        'Find all'
+      );
 
       setTimeout(() => {
         mockProcess.stdout.emit('data', Buffer.from('{ invalid json }'));
@@ -551,7 +690,11 @@ describe('ClaudeClient', () => {
     });
 
     it('should handle incomplete JSON in response', async () => {
-      const promise = client.determineCrawlUrls(
+      const promise = client.determineCrawlUrls(
+        'https://example.com',
+        '<html>test</html>',
+        'Find all'
+      );
 
       setTimeout(() => {
         mockProcess.stdout.emit('data', Buffer.from('{"urls": ["https://example.com"'));
@@ -562,7 +705,11 @@ describe('ClaudeClient', () => {
     });
 
     it('should handle JSON with extra whitespace', async () => {
-      const promise = client.determineCrawlUrls(
+      const promise = client.determineCrawlUrls(
+        'https://example.com',
+        '<html>test</html>',
+        'Find all'
+      );
 
       setTimeout(() => {
         mockProcess.stdout.emit(
@@ -582,7 +729,11 @@ describe('ClaudeClient', () => {
     });
 
     it('should handle JSON arrays as invalid for determineCrawlUrls', async () => {
-      const promise = client.determineCrawlUrls(
+      const promise = client.determineCrawlUrls(
+        'https://example.com',
+        '<html>test</html>',
+        'Find all'
+      );
 
       setTimeout(() => {
         mockProcess.stdout.emit('data', Buffer.from('[]'));
@@ -593,7 +744,11 @@ describe('ClaudeClient', () => {
     });
 
     it('should handle JSON primitives as invalid for determineCrawlUrls', async () => {
-      const promise = client.determineCrawlUrls(
+      const promise = client.determineCrawlUrls(
+        'https://example.com',
+        '<html>test</html>',
+        'Find all'
+      );
 
       setTimeout(() => {
         mockProcess.stdout.emit('data', Buffer.from('"string response"'));
@@ -606,7 +761,11 @@ describe('ClaudeClient', () => {
 
   describe('Response Validation', () => {
     it('should validate urls is an array', async () => {
-      const promise = client.determineCrawlUrls(
+      const promise = client.determineCrawlUrls(
+        'https://example.com',
+        '<html>test</html>',
+        'Find all'
+      );
 
       setTimeout(() => {
         mockProcess.stdout.emit(
@@ -625,7 +784,11 @@ describe('ClaudeClient', () => {
     });
 
     it('should validate reasoning is a string', async () => {
-      const promise = client.determineCrawlUrls(
+      const promise = client.determineCrawlUrls(
+        'https://example.com',
+        '<html>test</html>',
+        'Find all'
+      );
 
       setTimeout(() => {
         mockProcess.stdout.emit(
@@ -644,7 +807,11 @@ describe('ClaudeClient', () => {
     });
 
     it('should accept valid response with multiple URLs', async () => {
-      const promise = client.determineCrawlUrls(
+      const promise = client.determineCrawlUrls(
+        'https://example.com',
+        '<html>test</html>',
+        'Find all'
+      );
 
       setTimeout(() => {
         mockProcess.stdout.emit(
@@ -671,7 +838,11 @@ describe('ClaudeClient', () => {
 
   describe('Error Messages', () => {
     it('should wrap errors with context for determineCrawlUrls', async () => {
-      const promise = client.determineCrawlUrls(
+      const promise = client.determineCrawlUrls(
+        'https://example.com',
+        '<html>test</html>',
+        'Find all'
+      );
 
       setTimeout(() => {
         mockProcess.emit('close', 1);
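The last new test above pins down why determineCrawlUrls gained a seedUrl parameter: relative hrefs in the seed page only become crawlable absolute URLs if the model knows the base. The resolution the prompt asks Claude to perform is the standard WHATWG URL behavior; a minimal TypeScript sketch (resolveAgainstBase is a hypothetical helper for illustration, not part of this package, which delegates the resolution to Claude via the prompt):

// Resolve a relative href against a base URL, as the updated prompt instructs.
function resolveAgainstBase(href: string, baseUrl: string): string {
  return new URL(href, baseUrl).href;
}

console.log(resolveAgainstBase('/docs/hooks', 'https://example.com/docs'));
// -> https://example.com/docs/hooks (matches the example in the prompt text)
console.log(resolveAgainstBase('/docs/en/hooks', 'https://code.claude.com/docs'));
// -> https://code.claude.com/docs/en/hooks (matches the new test fixture)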
package/src/crawl/claude-client.ts
CHANGED

@@ -69,25 +69,38 @@ export class ClaudeClient {
   /**
    * Determine which URLs to crawl based on natural language instruction
    *
+   * @param seedUrl - The URL of the seed page (for resolving relative URLs)
    * @param seedHtml - HTML content of the seed page
    * @param instruction - Natural language crawl instruction (e.g., "scrape all Getting Started pages")
    * @returns List of URLs to crawl with reasoning
    */
-  async determineCrawlUrls(
+  async determineCrawlUrls(
+    seedUrl: string,
+    seedHtml: string,
+    instruction: string
+  ): Promise<CrawlStrategy> {
     const prompt = `You are analyzing a webpage to determine which pages to crawl based on the user's instruction.
 
+Base URL: ${seedUrl}
+
 Instruction: ${instruction}
 
 Webpage HTML (analyze the navigation structure, links, and content):
 ${this.truncateHtml(seedHtml, 50000)}
 
-Based on the instruction, extract and return a list of absolute URLs that should be crawled.
+Based on the instruction, extract and return a list of absolute URLs that should be crawled. When you encounter relative URLs (starting with "/" or without a protocol), resolve them against the Base URL. For example, if Base URL is "https://example.com/docs" and you see href="/docs/hooks", return "https://example.com/docs/hooks".
+
+Look for navigation menus, sidebars, headers, and link structures that match the instruction.
 
 Return only URLs that are relevant to the instruction. If the instruction mentions specific sections (e.g., "Getting Started"), find links in those sections.`;
 
     try {
       const result = await this.callClaude(prompt, CRAWL_STRATEGY_SCHEMA);
-      const
+      const rawParsed: unknown = JSON.parse(result);
+
+      // Claude CLI with --json-schema returns wrapper: {type, result, structured_output: {...}}
+      // Extract structured_output if present, otherwise use raw response
+      const parsed = this.extractStructuredOutput(rawParsed);
 
       // Validate and narrow type
       if (
@@ -223,4 +236,26 @@ ${this.truncateMarkdown(markdown, 100000)}`;
 
     return `${markdown.substring(0, maxLength)}\n\n[... content truncated ...]`;
   }
+
+  /**
+   * Type guard to check if value is a record (plain object)
+   */
+  private isRecord(value: unknown): value is Record<string, unknown> {
+    return typeof value === 'object' && value !== null && !Array.isArray(value);
+  }
+
+  /**
+   * Extract structured_output from Claude CLI wrapper format if present.
+   * Claude CLI with --json-schema returns: {type, result, structured_output: {...}}
+   * This method extracts the inner structured_output, or returns the raw value if not wrapped.
+   */
+  private extractStructuredOutput(rawParsed: unknown): unknown {
+    if (this.isRecord(rawParsed) && 'structured_output' in rawParsed) {
+      const structuredOutput = rawParsed['structured_output'];
+      if (typeof structuredOutput === 'object') {
+        return structuredOutput;
+      }
+    }
+    return rawParsed;
+  }
 }
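To see what extractStructuredOutput buys, compare the two response shapes the tests exercise. A self-contained sketch of the same unwrapping logic (re-implemented here for illustration; the package keeps it as a private method on ClaudeClient):

// Unwrap the Claude CLI --json-schema envelope, mirroring extractStructuredOutput.
function unwrap(raw: unknown): unknown {
  const isRecord = (v: unknown): v is Record<string, unknown> =>
    typeof v === 'object' && v !== null && !Array.isArray(v);
  if (isRecord(raw) && 'structured_output' in raw) {
    const inner = raw['structured_output'];
    if (typeof inner === 'object') return inner; // wrapped: return the payload
  }
  return raw; // bare or malformed: fall through to schema validation
}

unwrap({ type: 'result', result: '', structured_output: { urls: ['https://example.com/a'], reasoning: 'ok' } });
// -> { urls: [...], reasoning: 'ok' }
unwrap({ type: 'result', structured_output: 'not an object' });
// -> the whole wrapper, which then fails urls/reasoning validation ("invalid crawl strategy")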
package/src/crawl/intelligent-crawler.test.ts
CHANGED

@@ -971,4 +971,69 @@ describe('IntelligentCrawler', () => {
       expect(mockClaudeClient.extractContent).not.toHaveBeenCalled();
     });
   });
+
+  describe('Single-Page Crawl Warning', () => {
+    it('should emit warning when only 1 page crawled with maxPages > 1', async () => {
+      // Mock no links found - single page crawl
+      mockPythonBridge.crawl.mockResolvedValue({ pages: [{ links: [] }] });
+
+      const results = [];
+      for await (const result of crawler.crawl('https://example.com', {
+        simple: true,
+        maxPages: 50,
+      })) {
+        results.push(result);
+      }
+
+      expect(results).toHaveLength(1);
+      const warningEvents = progressEvents.filter(
+        (e) => e.type === 'error' && e.message?.includes('Only crawled 1 page')
+      );
+      expect(warningEvents).toHaveLength(1);
+      expect(warningEvents[0]?.message).toContain('--fast');
+      expect(warningEvents[0]?.message).toContain('maxPages=50');
+    });
+
+    it('should NOT emit warning when maxPages is 1', async () => {
+      mockPythonBridge.crawl.mockResolvedValue({ pages: [{ links: [] }] });
+
+      const results = [];
+      for await (const result of crawler.crawl('https://example.com', {
+        simple: true,
+        maxPages: 1,
+      })) {
+        results.push(result);
+      }
+
+      expect(results).toHaveLength(1);
+      const warningEvents = progressEvents.filter(
+        (e) => e.type === 'error' && e.message?.includes('Only crawled 1 page')
+      );
+      expect(warningEvents).toHaveLength(0);
+    });
+
+    it('should NOT emit warning when multiple pages crawled', async () => {
+      mockPythonBridge.crawl
+        .mockResolvedValueOnce({ pages: [{ links: ['https://example.com/page2'] }] })
+        .mockResolvedValueOnce({ pages: [{ links: [] }] });
+
+      vi.mocked(axios.get)
+        .mockResolvedValueOnce({ data: '<html><body>Page1</body></html>' })
+        .mockResolvedValueOnce({ data: '<html><body>Page2</body></html>' });
+
+      const results = [];
+      for await (const result of crawler.crawl('https://example.com', {
+        simple: true,
+        maxPages: 50,
+      })) {
+        results.push(result);
+      }
+
+      expect(results).toHaveLength(2);
+      const warningEvents = progressEvents.filter(
+        (e) => e.type === 'error' && e.message?.includes('Only crawled 1 page')
+      );
+      expect(warningEvents).toHaveLength(0);
+    });
+  });
 });
package/src/crawl/intelligent-crawler.ts
CHANGED

@@ -109,6 +109,18 @@ export class IntelligentCrawler extends EventEmitter {
       'Crawl complete'
     );
 
+    // Warn if crawl discovered far fewer pages than requested
+    if (this.visited.size === 1 && maxPages > 1) {
+      const warningProgress: CrawlProgress = {
+        type: 'error',
+        pagesVisited: this.visited.size,
+        totalPages: maxPages,
+        message: `Warning: Only crawled 1 page despite maxPages=${String(maxPages)}. Link discovery may have failed. If using --fast mode, try without it for JavaScript-heavy sites.`,
+        error: new Error('Low page discovery'),
+      };
+      this.emit('progress', warningProgress);
+    }
+
     const completeProgress: CrawlProgress = {
       type: 'complete',
       pagesVisited: this.visited.size,
@@ -157,8 +169,8 @@ export class IntelligentCrawler extends EventEmitter {
 
     const seedHtml = await this.fetchHtml(seedUrl, useHeadless);
 
-    // Step 2: Ask Claude which URLs to crawl
-    strategy = await this.claudeClient.determineCrawlUrls(seedHtml, crawlInstruction);
+    // Step 2: Ask Claude which URLs to crawl (pass seedUrl for relative URL resolution)
+    strategy = await this.claudeClient.determineCrawlUrls(seedUrl, seedHtml, crawlInstruction);
 
     const strategyCompleteProgress: CrawlProgress = {
       type: 'strategy',
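The warning rides the existing 'progress' event channel with type 'error', so downstream consumers need no new API. A runnable sketch of a listener (the CrawlProgress shape mirrors the fields used in the diff; the bare EventEmitter stands in for an IntelligentCrawler instance):

import { EventEmitter } from 'node:events';

// Assumed shape, reconstructed from the fields set in warningProgress above.
interface CrawlProgress {
  type: string;
  pagesVisited: number;
  totalPages: number;
  message?: string;
  error?: Error;
}

const crawler = new EventEmitter(); // placeholder for an IntelligentCrawler

crawler.on('progress', (p: CrawlProgress) => {
  if (p.type === 'error' && p.message?.includes('Only crawled 1 page')) {
    console.warn(p.message); // e.g. suggest retrying without --fast on JS-heavy sites
  }
});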
package/src/index.ts
CHANGED
@@ -16,6 +16,7 @@ import { createSearchCommand } from './cli/commands/search.js';
 import { createServeCommand } from './cli/commands/serve.js';
 import { createSetupCommand } from './cli/commands/setup.js';
 import { createStoreCommand } from './cli/commands/store.js';
+import { createSyncCommand } from './cli/commands/sync.js';
 import { createProgram, getGlobalOptions } from './cli/program.js';
 
 // Default paths
@@ -105,6 +106,7 @@ program.addCommand(createIndexCommand(() => getGlobalOptions(program)));
 program.addCommand(createServeCommand(() => getGlobalOptions(program)));
 program.addCommand(createCrawlCommand(() => getGlobalOptions(program)));
 program.addCommand(createSetupCommand(() => getGlobalOptions(program)));
+program.addCommand(createSyncCommand(() => getGlobalOptions(program)));
 program.addCommand(createMCPCommand(() => getGlobalOptions(program)));
 
 // Show comprehensive help when no arguments provided
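createSyncCommand follows the same factory pattern as the other subcommands: it receives a thunk for the program's global options and returns a commander Command. A hedged sketch of that shape (the option field is illustrative, not one of the package's actual flags; the real global options live in cli/program.ts):

import { Command } from 'commander';

interface GlobalOptions {
  verbose?: boolean; // illustrative only
}

function createExampleCommand(getGlobals: () => GlobalOptions): Command {
  return new Command('example')
    .description('sketch of the create*Command factory shape')
    .action(() => {
      const globals = getGlobals(); // resolved at action time, after parsing
      if (globals.verbose) console.log('verbose mode');
    });
}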
package/src/mcp/commands/index.ts
CHANGED

@@ -9,11 +9,13 @@ import { jobCommands } from './job.commands.js';
 import { metaCommands } from './meta.commands.js';
 import { commandRegistry } from './registry.js';
 import { storeCommands } from './store.commands.js';
+import { syncCommands } from './sync.commands.js';
 
 // Register all commands
 commandRegistry.registerAll(storeCommands);
 commandRegistry.registerAll(jobCommands);
 commandRegistry.registerAll(metaCommands);
+commandRegistry.registerAll(syncCommands);
 
 // Re-export for convenience
 export { commandRegistry, executeCommand, generateHelp } from './registry.js';