bluera-knowledge 0.9.26 → 0.9.30

This diff covers publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between versions as they appear in their respective public registries.
Files changed (55)
  1. package/.claude/commands/commit.md +4 -7
  2. package/.claude/hooks/post-edit-check.sh +21 -24
  3. package/.claude/skills/atomic-commits/SKILL.md +6 -0
  4. package/.claude-plugin/plugin.json +1 -1
  5. package/.env.example +4 -0
  6. package/.husky/pre-push +12 -2
  7. package/.versionrc.json +0 -4
  8. package/CHANGELOG.md +69 -0
  9. package/README.md +55 -20
  10. package/bun.lock +35 -1
  11. package/commands/crawl.md +2 -0
  12. package/dist/{chunk-BICFAWMN.js → chunk-DNOIM7BO.js} +73 -8
  13. package/dist/chunk-DNOIM7BO.js.map +1 -0
  14. package/dist/{chunk-5QMHZUC4.js → chunk-NJUMU4X2.js} +462 -105
  15. package/dist/chunk-NJUMU4X2.js.map +1 -0
  16. package/dist/{chunk-J7J6LXOJ.js → chunk-SZNTYLYT.js} +106 -41
  17. package/dist/chunk-SZNTYLYT.js.map +1 -0
  18. package/dist/index.js +65 -25
  19. package/dist/index.js.map +1 -1
  20. package/dist/mcp/server.js +2 -2
  21. package/dist/workers/background-worker-cli.js +2 -2
  22. package/eslint.config.js +1 -1
  23. package/package.json +3 -1
  24. package/src/analysis/ast-parser.test.ts +46 -0
  25. package/src/cli/commands/crawl.test.ts +99 -12
  26. package/src/cli/commands/crawl.ts +76 -24
  27. package/src/crawl/article-converter.ts +36 -1
  28. package/src/crawl/bridge.ts +18 -7
  29. package/src/crawl/intelligent-crawler.ts +45 -4
  30. package/src/db/embeddings.test.ts +16 -0
  31. package/src/logging/index.ts +29 -0
  32. package/src/logging/logger.test.ts +75 -0
  33. package/src/logging/logger.ts +147 -0
  34. package/src/logging/payload.test.ts +152 -0
  35. package/src/logging/payload.ts +121 -0
  36. package/src/mcp/handlers/search.handler.test.ts +28 -9
  37. package/src/mcp/handlers/search.handler.ts +69 -29
  38. package/src/mcp/handlers/store.handler.test.ts +1 -0
  39. package/src/mcp/server.ts +44 -16
  40. package/src/services/chunking.service.ts +23 -0
  41. package/src/services/index.service.test.ts +921 -1
  42. package/src/services/index.service.ts +76 -1
  43. package/src/services/index.ts +10 -1
  44. package/src/services/search.service.test.ts +573 -21
  45. package/src/services/search.service.ts +257 -105
  46. package/src/services/snippet.service.ts +28 -3
  47. package/src/services/token.service.test.ts +45 -0
  48. package/src/services/token.service.ts +33 -0
  49. package/src/types/result.test.ts +10 -0
  50. package/tests/integration/cli-consistency.test.ts +1 -4
  51. package/vitest.config.ts +4 -0
  52. package/dist/chunk-5QMHZUC4.js.map +0 -1
  53. package/dist/chunk-BICFAWMN.js.map +0 -1
  54. package/dist/chunk-J7J6LXOJ.js.map +0 -1
  55. package/scripts/readme-version-updater.cjs +0 -18
package/package.json CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "bluera-knowledge",
-  "version": "0.9.26",
+  "version": "0.9.30",
   "description": "CLI tool for managing knowledge stores with semantic search",
   "type": "module",
   "bin": {
@@ -92,6 +92,8 @@
     "hono": "^4.11.1",
     "node-addon-api": "^8.5.0",
     "ora": "^9.0.0",
+    "pino": "^9.6.0",
+    "pino-roll": "^1.3.0",
     "slurp-ai": "^1.0.6",
     "tree-sitter": "^0.25.0",
     "tree-sitter-go": "^0.23.1",
package/src/analysis/ast-parser.test.ts CHANGED
@@ -356,6 +356,52 @@ import type { Props } from "./types";
   });

   describe('Edge cases and special scenarios', () => {
+    it('handles default exported class', () => {
+      const code = 'export default class DefaultClass { method() {} }';
+      const nodes = parser.parse(code, 'typescript');
+
+      expect(nodes).toHaveLength(1);
+      expect(nodes[0]?.exported).toBe(true);
+      expect(nodes[0]?.name).toBe('DefaultClass');
+    });
+
+    it('handles anonymous default exported class (no id)', () => {
+      const code = 'export default class { method() {} }';
+      const nodes = parser.parse(code, 'javascript');
+
+      // Anonymous classes don't have an id, should be skipped
+      expect(nodes.filter(n => n.type === 'class')).toHaveLength(0);
+    });
+
+    it('handles class with computed property method (non-identifier key)', () => {
+      const code = `class MyClass {
+        ['computed']() { return 42; }
+        normalMethod() { return 1; }
+      }`;
+      const nodes = parser.parse(code, 'javascript');
+
+      // Computed properties have StringLiteral keys, not Identifier
+      // Only normalMethod should be captured
+      expect(nodes[0]?.methods).toHaveLength(1);
+      expect(nodes[0]?.methods?.[0]?.name).toBe('normalMethod');
+    });
+
+    it('handles class with rest parameters in method', () => {
+      const code = `class MyClass {
+        method(...args) { return args; }
+      }`;
+      const nodes = parser.parse(code, 'javascript');
+
+      expect(nodes[0]?.methods?.[0]?.signature).toBe('method(param)');
+    });
+
+    it('handles function with rest parameters', () => {
+      const code = 'function spread(...items) { return items; }';
+      const nodes = parser.parse(code, 'javascript');
+
+      expect(nodes[0]?.signature).toBe('spread(param)');
+    });
+
     it('handles functions with complex destructured parameters', () => {
       const code = 'function complex({ a, b }, [c, d]) { return a + b + c + d; }';
       const nodes = parser.parse(code, 'javascript');
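
A detail worth noting in the rest-parameter tests: both `(...args)` and `(...items)` serialize as `(param)`. A plausible reading (not the actual ast-parser code) is that any parameter that is not a plain identifier, such as a rest element or destructuring pattern, falls back to a generic placeholder when the signature is built:

    // Hypothetical helper illustrating the normalization the tests assert.
    function paramLabel(node: { type: string; name?: string }): string {
      return node.type === 'Identifier' && node.name !== undefined ? node.name : 'param';
    }

    paramLabel({ type: 'Identifier', name: 'x' }); // 'x'
    paramLabel({ type: 'RestElement' });           // 'param'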
package/src/cli/commands/crawl.test.ts CHANGED
@@ -38,6 +38,7 @@ describe('crawl command execution', () => {
     store: {
       getByIdOrName: vi.fn(),
       list: vi.fn(),
+      create: vi.fn(),
     },
     lance: {
       initialize: vi.fn(),
@@ -291,23 +292,110 @@ describe('crawl command execution', () => {
     });
   });

-  describe('error handling', () => {
-    it('exits with code 3 when store not found', async () => {
+  describe('store auto-creation', () => {
+    it('auto-creates web store when store does not exist', async () => {
+      const createdStore: WebStore = {
+        id: createStoreId('new-store-id'),
+        name: 'new-store',
+        type: 'web',
+        url: 'https://example.com',
+        depth: 2,
+        createdAt: new Date(),
+        updatedAt: new Date(),
+      };
+
       mockServices.store.getByIdOrName.mockResolvedValue(undefined);
+      mockServices.store.create.mockResolvedValue({ success: true, data: createdStore });
+      mockServices.lance.initialize.mockResolvedValue(undefined);
+      mockServices.embeddings.embed.mockResolvedValue([0.1, 0.2, 0.3]);
+      mockServices.lance.addDocuments.mockResolvedValue(undefined);
+
+      mockCrawler.crawl.mockReturnValue(
+        (async function* () {
+          yield {
+            url: 'https://example.com/page1',
+            title: 'Page 1',
+            markdown: '# Content',
+            depth: 0,
+          };
+        })()
+      );

       const command = createCrawlCommand(getOptions);
       const actionHandler = command._actionHandler;

-      await expect(actionHandler(['https://example.com', 'nonexistent-store'])).rejects.toThrow('process.exit: 3');
+      await actionHandler(['https://example.com', 'new-store']);

-      expect(consoleErrorSpy).toHaveBeenCalledWith(
-        'Error: Web store not found: nonexistent-store'
+      expect(mockServices.store.create).toHaveBeenCalledWith({
+        name: 'new-store',
+        type: 'web',
+        url: 'https://example.com',
+      });
+      expect(consoleLogSpy).toHaveBeenCalledWith('Created web store: new-store');
+      expect(mockCrawler.crawl).toHaveBeenCalled();
+    });
+
+    it('throws error when store creation fails', async () => {
+      mockServices.store.getByIdOrName.mockResolvedValue(undefined);
+      mockServices.store.create.mockResolvedValue({ success: false, error: new Error('Name already exists') });
+
+      const command = createCrawlCommand(getOptions);
+      const actionHandler = command._actionHandler;
+
+      await expect(actionHandler(['https://example.com', 'bad-store'])).rejects.toThrow(
+        'Failed to create store: Name already exists'
       );
-      expect(processExitSpy).toHaveBeenCalledWith(3);
       expect(mockCrawler.crawl).not.toHaveBeenCalled();
     });

-    it('exits with code 3 when store is not a web store', async () => {
+    it('includes storeCreated in JSON output when store was created', async () => {
+      const createdStore: WebStore = {
+        id: createStoreId('new-store-id'),
+        name: 'new-store',
+        type: 'web',
+        url: 'https://example.com',
+        depth: 2,
+        createdAt: new Date(),
+        updatedAt: new Date(),
+      };
+
+      mockServices.store.getByIdOrName.mockResolvedValue(undefined);
+      mockServices.store.create.mockResolvedValue({ success: true, data: createdStore });
+      mockServices.lance.initialize.mockResolvedValue(undefined);
+      mockServices.embeddings.embed.mockResolvedValue([0.1, 0.2, 0.3]);
+      mockServices.lance.addDocuments.mockResolvedValue(undefined);
+
+      mockCrawler.crawl.mockReturnValue(
+        (async function* () {
+          yield {
+            url: 'https://example.com/page1',
+            title: 'Page 1',
+            markdown: '# Content',
+            depth: 0,
+          };
+        })()
+      );
+
+      getOptions = () => ({
+        config: undefined,
+        dataDir: '/tmp/test',
+        quiet: false,
+        format: 'json',
+      });
+
+      const command = createCrawlCommand(getOptions);
+      const actionHandler = command._actionHandler;
+
+      await actionHandler(['https://example.com', 'new-store']);
+
+      expect(consoleLogSpy).toHaveBeenCalledWith(
+        expect.stringContaining('"storeCreated": true')
+      );
+    });
+  });
+
+  describe('error handling', () => {
+    it('throws error when store is not a web store', async () => {
       const mockFileStore = {
         id: createStoreId('store-1'),
         name: 'file-store',
@@ -322,10 +410,9 @@ describe('crawl command execution', () => {
       const command = createCrawlCommand(getOptions);
       const actionHandler = command._actionHandler;

-      await expect(actionHandler(['https://example.com', 'file-store'])).rejects.toThrow('process.exit: 3');
-
-      expect(consoleErrorSpy).toHaveBeenCalledWith('Error: Web store not found: file-store');
-      expect(processExitSpy).toHaveBeenCalledWith(3);
+      await expect(actionHandler(['https://example.com', 'file-store'])).rejects.toThrow(
+        'Store "file-store" exists but is not a web store (type: file)'
+      );
     });

     it('exits with code 6 when crawling fails', async () => {
@@ -933,7 +1020,7 @@ describe('crawl command execution', () => {
       await actionHandler(['https://example.com', 'test-store']);

       expect(consoleLogSpy).toHaveBeenCalledWith('Crawling https://example.com');
-      expect(consoleLogSpy).toHaveBeenCalledWith('Crawled and indexed 1 pages');
+      expect(consoleLogSpy).toHaveBeenCalledWith('Crawled 1 pages, indexed 1 chunks');
     } finally {
       Object.defineProperty(process.stdout, 'isTTY', { value: originalIsTTY, configurable: true });
     }
package/src/cli/commands/crawl.ts CHANGED
@@ -6,6 +6,9 @@ import { IntelligentCrawler, type CrawlProgress } from '../../crawl/intelligent-
 import { createDocumentId } from '../../types/brands.js';
 import type { GlobalOptions } from '../program.js';
 import type { Document } from '../../types/document.js';
+import type { WebStore } from '../../types/store.js';
+import { ChunkingService } from '../../services/chunking.service.js';
+import { classifyWebContentType } from '../../services/index.service.js';

 export function createCrawlCommand(getOptions: () => GlobalOptions): Command {
   return new Command('crawl')
@@ -27,10 +30,38 @@ export function createCrawlCommand(getOptions: () => GlobalOptions): Command {
       const globalOpts = getOptions();
       const services = await createServices(globalOpts.config, globalOpts.dataDir);

-      const store = await services.store.getByIdOrName(storeIdOrName);
-      if (!store || store.type !== 'web') {
-        console.error(`Error: Web store not found: ${storeIdOrName}`);
-        process.exit(3);
+      // Look up or auto-create web store
+      let store: WebStore;
+      let storeCreated = false;
+      const existingStore = await services.store.getByIdOrName(storeIdOrName);
+
+      if (!existingStore) {
+        // Auto-create web store
+        const result = await services.store.create({
+          name: storeIdOrName,
+          type: 'web',
+          url,
+        });
+        if (!result.success) {
+          await destroyServices(services);
+          throw new Error(`Failed to create store: ${result.error.message}`);
+        }
+        // Type narrowing: success check above ensures result.data is Store
+        // We know it's a WebStore because we created it with type: 'web'
+        const createdStore = result.data;
+        if (createdStore.type !== 'web') {
+          throw new Error('Unexpected store type after creation');
+        }
+        store = createdStore;
+        storeCreated = true;
+        if (globalOpts.quiet !== true && globalOpts.format !== 'json') {
+          console.log(`Created web store: ${store.name}`);
+        }
+      } else if (existingStore.type !== 'web') {
+        await destroyServices(services);
+        throw new Error(`Store "${storeIdOrName}" exists but is not a web store (type: ${existingStore.type})`);
+      } else {
+        store = existingStore;
       }

       const maxPages = cmdOptions.maxPages !== undefined ? parseInt(cmdOptions.maxPages) : 50;
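
The rewrite above replaces `process.exit(3)` with thrown errors and drives the new auto-create path off a discriminated-union result, visible in the `result.success` check. The shape is inferred from the call sites and tests; it is not shown in this diff:

    // Assumed shape of the service result type; checking `success` narrows the
    // union, so `error` is only reachable on the failure branch.
    type Result<T> = { success: true; data: T } | { success: false; error: Error };

    function unwrap<T>(result: Result<T>): T {
      if (!result.success) throw new Error(`Failed: ${result.error.message}`);
      return result.data; // narrowed to the success branch here
    }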
@@ -47,7 +78,10 @@ export function createCrawlCommand(getOptions: () => GlobalOptions): Command {
       }

       const crawler = new IntelligentCrawler();
+      // Use web preset for larger prose-friendly chunks
+      const webChunker = ChunkingService.forContentType('web');
       let pagesIndexed = 0;
+      let chunksCreated = 0;

       // Listen for progress events
       crawler.on('progress', (progress: CrawlProgress) => {
@@ -78,24 +112,40 @@ export function createCrawlCommand(getOptions: () => GlobalOptions): Command {
         ...(cmdOptions.simple !== undefined && { simple: cmdOptions.simple }),
         useHeadless: cmdOptions.headless ?? false,
       })) {
-        // Embed and index the content (use extracted if available, otherwise markdown)
-        const contentToEmbed = result.extracted !== undefined ? result.extracted : result.markdown;
-        const vector = await services.embeddings.embed(contentToEmbed);
-
-        docs.push({
-          id: createDocumentId(`${store.id}-${createHash('md5').update(result.url).digest('hex')}`),
-          content: contentToEmbed,
-          vector,
-          metadata: {
-            type: 'web',
-            storeId: store.id,
-            url: result.url,
-            title: result.title,
-            extracted: result.extracted !== undefined,
-            depth: result.depth,
-            indexedAt: new Date(),
-          },
-        });
+        // Use extracted content if available, otherwise markdown
+        const contentToProcess = result.extracted !== undefined ? result.extracted : result.markdown;
+
+        // Chunk the content using markdown-aware chunking (web content is converted to markdown)
+        const chunks = webChunker.chunk(contentToProcess, `${result.url}.md`);
+        const fileType = classifyWebContentType(result.url, result.title);
+        const urlHash = createHash('md5').update(result.url).digest('hex');
+
+        for (const chunk of chunks) {
+          const chunkId = chunks.length > 1
+            ? `${store.id}-${urlHash}-${String(chunk.chunkIndex)}`
+            : `${store.id}-${urlHash}`;
+          const vector = await services.embeddings.embed(chunk.content);
+
+          docs.push({
+            id: createDocumentId(chunkId),
+            content: chunk.content,
+            vector,
+            metadata: {
+              type: chunks.length > 1 ? 'chunk' : 'web',
+              storeId: store.id,
+              url: result.url,
+              title: result.title,
+              extracted: result.extracted !== undefined,
+              depth: result.depth,
+              indexedAt: new Date(),
+              fileType,
+              chunkIndex: chunk.chunkIndex,
+              totalChunks: chunk.totalChunks,
+              sectionHeader: chunk.sectionHeader,
+            },
+          });
+          chunksCreated++;
+        }

         pagesIndexed++;
       }
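
Document IDs stay deterministic across re-crawls: a single-chunk page keeps the old one-ID-per-URL scheme, while multi-chunk pages append the chunk index. With assumed example values:

    import { createHash } from 'node:crypto';

    const storeId = 'store-abc'; // hypothetical example value
    const urlHash = createHash('md5').update('https://example.com/guide').digest('hex');
    // Single-chunk page:  `${storeId}-${urlHash}`
    // Multi-chunk page:   `${storeId}-${urlHash}-0`, `${storeId}-${urlHash}-1`, ...

This keeps re-crawls of the same URL addressable under stable IDs.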
@@ -111,8 +161,10 @@ export function createCrawlCommand(getOptions: () => GlobalOptions): Command {
       const crawlResult = {
         success: true,
         store: store.name,
+        storeCreated,
         url,
         pagesCrawled: pagesIndexed,
+        chunksCreated,
         mode: cmdOptions.simple === true ? 'simple' : 'intelligent',
         hadCrawlInstruction: cmdOptions.crawl !== undefined,
         hadExtractInstruction: cmdOptions.extract !== undefined,
@@ -121,9 +173,9 @@ export function createCrawlCommand(getOptions: () => GlobalOptions): Command {
       if (globalOpts.format === 'json') {
         console.log(JSON.stringify(crawlResult, null, 2));
       } else if (spinner !== undefined) {
-        spinner.succeed(`Crawled and indexed ${String(pagesIndexed)} pages`);
+        spinner.succeed(`Crawled ${String(pagesIndexed)} pages, indexed ${String(chunksCreated)} chunks`);
       } else if (globalOpts.quiet !== true) {
-        console.log(`Crawled and indexed ${String(pagesIndexed)} pages`);
+        console.log(`Crawled ${String(pagesIndexed)} pages, indexed ${String(chunksCreated)} chunks`);
       }
     } catch (error) {
       const message = `Crawl failed: ${error instanceof Error ? error.message : String(error)}`;
package/src/crawl/article-converter.ts CHANGED
@@ -7,6 +7,9 @@ import { extractFromHtml } from '@extractus/article-extractor';
 import TurndownService from 'turndown';
 import { gfm } from 'turndown-plugin-gfm';
 import { preprocessHtmlForCodeBlocks, cleanupMarkdown } from './markdown-utils.js';
+import { createLogger, truncateForLog } from '../logging/index.js';
+
+const logger = createLogger('article-converter');

 export interface ConversionResult {
   markdown: string;
@@ -28,6 +31,8 @@ export async function convertHtmlToMarkdown(
   html: string,
   url: string,
 ): Promise<ConversionResult> {
+  logger.debug({ url, htmlLength: html.length }, 'Starting HTML conversion');
+
   try {
     // Step 1: Extract main article content
     let articleHtml: string;
@@ -38,13 +43,25 @@ export async function convertHtmlToMarkdown(
       if (article !== null && article.content !== undefined && article.content !== '') {
         articleHtml = article.content;
         title = article.title !== undefined && article.title !== '' ? article.title : undefined;
+        logger.debug({
+          url,
+          title,
+          extractedLength: articleHtml.length,
+          usedFullHtml: false,
+        }, 'Article content extracted');
       } else {
         // Fallback to full HTML if extraction fails
         articleHtml = html;
+        logger.debug({ url, usedFullHtml: true }, 'Article extraction returned empty, using full HTML');
       }
-    } catch {
+    } catch (extractError) {
       // Fallback to full HTML if extraction fails
       articleHtml = html;
+      logger.debug({
+        url,
+        usedFullHtml: true,
+        error: extractError instanceof Error ? extractError.message : String(extractError),
+      }, 'Article extraction failed, using full HTML');
     }

     // Step 2: Preprocess HTML for code blocks
@@ -83,12 +100,30 @@ export async function convertHtmlToMarkdown(
     // Step 4: Cleanup markdown with comprehensive regex patterns
     const markdown = cleanupMarkdown(rawMarkdown);

+    logger.debug({
+      url,
+      title,
+      rawMarkdownLength: rawMarkdown.length,
+      finalMarkdownLength: markdown.length,
+    }, 'HTML to markdown conversion complete');
+
+    // Log markdown preview at trace level
+    logger.trace({
+      url,
+      markdownPreview: truncateForLog(markdown, 1000),
+    }, 'Markdown content preview');
+
     return {
       markdown,
       ...(title !== undefined && { title }),
       success: true,
     };
   } catch (error) {
+    logger.error({
+      url,
+      error: error instanceof Error ? error.message : String(error),
+    }, 'HTML to markdown conversion failed');
+
     return {
       markdown: '',
       success: false,
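
truncateForLog keeps the trace-level markdown preview bounded (1000 characters at this call site). Its signature is visible above; the body below is only the obvious guess, not the code in src/logging:

    // Hypothetical implementation matching the (text, maxLength) call site.
    export function truncateForLog(text: string, maxLength: number): string {
      if (text.length <= maxLength) return text;
      return `${text.slice(0, maxLength)}... [truncated ${String(text.length - maxLength)} chars]`;
    }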
package/src/crawl/bridge.ts CHANGED
@@ -11,6 +11,9 @@ import {
   validateHeadlessResult,
   validateParsePythonResult,
 } from './schemas.js';
+import { createLogger } from '../logging/index.js';
+
+const logger = createLogger('python-bridge');

 // Re-export for backwards compatibility
 export type { CrawledLink, ParsePythonResult };
@@ -32,24 +35,26 @@ export class PythonBridge {
   start(): Promise<void> {
     if (this.process) return Promise.resolve();

+    logger.debug('Starting Python bridge process');
+
     this.process = spawn('python3', ['python/crawl_worker.py'], {
       stdio: ['pipe', 'pipe', 'pipe'],
     });

     // Add error handler for process spawn errors
     this.process.on('error', (err) => {
-      console.error('Python bridge process error:', err);
+      logger.error({ error: err.message, stack: err.stack }, 'Python bridge process error');
       this.rejectAllPending(new Error(`Process error: ${err.message}`));
     });

     // Add exit handler to detect non-zero exits
     this.process.on('exit', (code, signal) => {
       if (code !== 0 && code !== null) {
-        console.error(`Python bridge process exited with code ${String(code)}`);
+        logger.error({ code }, 'Python bridge process exited with non-zero code');
        this.rejectAllPending(new Error(`Process exited with code ${String(code)}`));
       } else if (signal && !this.stoppingIntentionally) {
         // Only log if we didn't intentionally stop the process
-        console.error(`Python bridge process killed with signal ${signal}`);
+        logger.error({ signal }, 'Python bridge process killed with signal');
         this.rejectAllPending(new Error(`Process killed with signal ${signal}`));
       }
       this.process = null;
@@ -60,7 +65,7 @@ export class PythonBridge {
     if (this.process.stderr) {
       const stderrRl = createInterface({ input: this.process.stderr });
       stderrRl.on('line', (line) => {
-        console.error('Python bridge stderr:', line);
+        logger.warn({ stderr: line }, 'Python bridge stderr output');
       });
     }
@@ -106,11 +111,14 @@ export class PythonBridge {
         } catch (error: unknown) {
           // Log validation failure with original response for debugging
           if (error instanceof ZodError) {
-            console.error('Python bridge response validation failed:', error.issues);
-            console.error('Original response:', JSON.stringify(response.result));
+            logger.error({
+              issues: error.issues,
+              response: JSON.stringify(response.result),
+            }, 'Python bridge response validation failed');
             pending.reject(new Error(`Invalid response format from Python bridge: ${error.message}`));
           } else {
             const errorMessage = error instanceof Error ? error.message : String(error);
+            logger.error({ error: errorMessage }, 'Response validation error');
             pending.reject(new Error(`Response validation error: ${errorMessage}`));
           }
         }
@@ -118,7 +126,10 @@ export class PythonBridge {
         // If neither result nor error, leave pending (will timeout)
       }
     } catch (err) {
-      console.error('Failed to parse JSON response from Python bridge:', err, 'Line:', line);
+      logger.error({
+        error: err instanceof Error ? err.message : String(err),
+        line,
+      }, 'Failed to parse JSON response from Python bridge');
    }
   });
package/src/crawl/intelligent-crawler.ts CHANGED
@@ -8,6 +8,9 @@ import axios from 'axios';
 import { ClaudeClient, type CrawlStrategy } from './claude-client.js';
 import { convertHtmlToMarkdown } from './article-converter.js';
 import { PythonBridge, type CrawledLink } from './bridge.js';
+import { createLogger, summarizePayload } from '../logging/index.js';
+
+const logger = createLogger('crawler');

 export interface CrawlOptions {
   crawlInstruction?: string; // Natural language: what to crawl
@@ -69,6 +72,13 @@ export class IntelligentCrawler extends EventEmitter {
     this.visited.clear();
     this.stopped = false;

+    logger.info({
+      seedUrl,
+      maxPages,
+      mode: simple ? 'simple' : (crawlInstruction !== undefined && crawlInstruction !== '' ? 'intelligent' : 'simple'),
+      hasExtractInstruction: extractInstruction !== undefined,
+    }, 'Starting crawl');
+
     const startProgress: CrawlProgress = {
       type: 'start',
       pagesVisited: 0,
@@ -86,6 +96,11 @@ export class IntelligentCrawler extends EventEmitter {
       yield* this.crawlSimple(seedUrl, extractInstruction, maxPages, options.useHeadless ?? false);
     }

+    logger.info({
+      seedUrl,
+      pagesVisited: this.visited.size,
+    }, 'Crawl complete');
+
     const completeProgress: CrawlProgress = {
       type: 'complete',
       pagesVisited: this.visited.size,
@@ -220,9 +235,9 @@ export class IntelligentCrawler extends EventEmitter {
       const links = await this.extractLinks(current.url, useHeadless);

       if (links.length === 0) {
-        console.warn(`No links found on ${current.url} - page may be a leaf node`);
+        logger.debug({ url: current.url }, 'No links found - page may be a leaf node');
       } else {
-        console.log(`Found ${String(links.length)} links on ${current.url}`);
+        logger.debug({ url: current.url, linkCount: links.length }, 'Links extracted from page');
       }

       for (const link of links) {
@@ -283,9 +298,16 @@ export class IntelligentCrawler extends EventEmitter {
     const conversion = await convertHtmlToMarkdown(html, url);

     if (!conversion.success) {
+      logger.error({ url, error: conversion.error }, 'HTML to markdown conversion failed');
       throw new Error(`Failed to convert HTML: ${conversion.error ?? 'Unknown error'}`);
     }

+    logger.debug({
+      url,
+      title: conversion.title,
+      markdownLength: conversion.markdown.length,
+    }, 'Article converted to markdown');
+
     let extracted: string | undefined;

     // Optional: Extract specific information using Claude
@@ -342,13 +364,23 @@
    * Fetch HTML content from a URL
    */
   private async fetchHtml(url: string, useHeadless: boolean = false): Promise<string> {
+    const startTime = Date.now();
+    logger.debug({ url, useHeadless }, 'Fetching HTML');
+
     if (useHeadless) {
       try {
         const result = await this.pythonBridge.fetchHeadless(url);
+        const durationMs = Date.now() - startTime;
+        logger.info({
+          url,
+          useHeadless: true,
+          durationMs,
+          ...summarizePayload(result.html, 'raw-html', url),
+        }, 'Raw HTML fetched');
         return result.html;
       } catch (error) {
         // Fallback to axios if headless fails
-        console.warn(`Headless fetch failed for ${url}, falling back to axios:`, error);
+        logger.warn({ url, error: error instanceof Error ? error.message : String(error) }, 'Headless fetch failed, falling back to axios');
       }
     }
@@ -362,8 +394,17 @@
         },
       });

+      const durationMs = Date.now() - startTime;
+      logger.info({
+        url,
+        useHeadless: false,
+        durationMs,
+        ...summarizePayload(response.data, 'raw-html', url),
+      }, 'Raw HTML fetched');
+
       return response.data;
     } catch (error) {
+      logger.error({ url, error: error instanceof Error ? error.message : String(error) }, 'Failed to fetch HTML');
       throw new Error(
         `Failed to fetch ${url}: ${error instanceof Error ? error.message : String(error)}`,
       );
@@ -398,7 +439,7 @@
     } catch (error: unknown) {
       // Log the error for debugging
       const errorMessage = error instanceof Error ? error.message : String(error);
-      console.error(`Failed to extract links from ${url}:`, errorMessage);
+      logger.error({ url, error: errorMessage }, 'Failed to extract links');

       // Re-throw the error instead of silently swallowing it
       throw new Error(`Link extraction failed for ${url}: ${errorMessage}`);
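
summarizePayload spreads payload statistics into the log object instead of logging raw HTML. Its return shape is not shown in this diff; below is a guess consistent with the `summarizePayload(html, 'raw-html', url)` call sites, with every field name assumed:

    // Hypothetical sketch; the real helper presumably lives in src/logging/payload.ts.
    export function summarizePayload(payload: string, kind: string, source: string) {
      return {
        payloadKind: kind,
        payloadSource: source,
        payloadChars: payload.length, // size stats stand in for the payload itself
      };
    }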
package/src/db/embeddings.test.ts CHANGED
@@ -48,6 +48,22 @@ describe('EmbeddingEngine', () => {
     const customEngine = new EmbeddingEngine('Xenova/all-MiniLM-L6-v2', 512);
     expect(customEngine.getDimensions()).toBe(512);
   });
+
+  it('skips initialization when already initialized', async () => {
+    // Engine is already initialized from beforeAll
+    // Calling initialize again should be a no-op
+    await engine.initialize();
+    const embedding = await engine.embed('test');
+    expect(embedding).toHaveLength(384);
+  });
+
+  it('handles large batch with multiple chunks', async () => {
+    // Create enough texts to trigger multiple batch iterations (BATCH_SIZE = 32)
+    const texts = Array.from({ length: 40 }, (_, i) => `Text number ${String(i)}`);
+    const embeddings = await engine.embedBatch(texts);
+    expect(embeddings).toHaveLength(40);
+    expect(embeddings.every((e) => e.length === 384)).toBe(true);
+  }, 60000);
 });

 function cosineSimilarity(a: number[], b: number[]): number {
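
The 40-text batch test targets the loop boundary: with BATCH_SIZE = 32 (per the test comment), 40 inputs take Math.ceil(40 / 32) = 2 iterations, a full batch of 32 followed by a remainder of 8. The iteration shape being exercised is presumably:

    // Sketch of the batching loop the test drives; BATCH_SIZE comes from the
    // test comment above, and the loop body is an assumption.
    const texts = Array.from({ length: 40 }, (_, i) => `Text ${String(i)}`);
    const BATCH_SIZE = 32;
    for (let i = 0; i < texts.length; i += BATCH_SIZE) {
      const batch = texts.slice(i, i + BATCH_SIZE); // first pass: 32 items, second: 8
      // embeddings would be computed per batch here
    }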