bluera-knowledge 0.9.26 → 0.9.30
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/commands/commit.md +4 -7
- package/.claude/hooks/post-edit-check.sh +21 -24
- package/.claude/skills/atomic-commits/SKILL.md +6 -0
- package/.claude-plugin/plugin.json +1 -1
- package/.env.example +4 -0
- package/.husky/pre-push +12 -2
- package/.versionrc.json +0 -4
- package/CHANGELOG.md +69 -0
- package/README.md +55 -20
- package/bun.lock +35 -1
- package/commands/crawl.md +2 -0
- package/dist/{chunk-BICFAWMN.js → chunk-DNOIM7BO.js} +73 -8
- package/dist/chunk-DNOIM7BO.js.map +1 -0
- package/dist/{chunk-5QMHZUC4.js → chunk-NJUMU4X2.js} +462 -105
- package/dist/chunk-NJUMU4X2.js.map +1 -0
- package/dist/{chunk-J7J6LXOJ.js → chunk-SZNTYLYT.js} +106 -41
- package/dist/chunk-SZNTYLYT.js.map +1 -0
- package/dist/index.js +65 -25
- package/dist/index.js.map +1 -1
- package/dist/mcp/server.js +2 -2
- package/dist/workers/background-worker-cli.js +2 -2
- package/eslint.config.js +1 -1
- package/package.json +3 -1
- package/src/analysis/ast-parser.test.ts +46 -0
- package/src/cli/commands/crawl.test.ts +99 -12
- package/src/cli/commands/crawl.ts +76 -24
- package/src/crawl/article-converter.ts +36 -1
- package/src/crawl/bridge.ts +18 -7
- package/src/crawl/intelligent-crawler.ts +45 -4
- package/src/db/embeddings.test.ts +16 -0
- package/src/logging/index.ts +29 -0
- package/src/logging/logger.test.ts +75 -0
- package/src/logging/logger.ts +147 -0
- package/src/logging/payload.test.ts +152 -0
- package/src/logging/payload.ts +121 -0
- package/src/mcp/handlers/search.handler.test.ts +28 -9
- package/src/mcp/handlers/search.handler.ts +69 -29
- package/src/mcp/handlers/store.handler.test.ts +1 -0
- package/src/mcp/server.ts +44 -16
- package/src/services/chunking.service.ts +23 -0
- package/src/services/index.service.test.ts +921 -1
- package/src/services/index.service.ts +76 -1
- package/src/services/index.ts +10 -1
- package/src/services/search.service.test.ts +573 -21
- package/src/services/search.service.ts +257 -105
- package/src/services/snippet.service.ts +28 -3
- package/src/services/token.service.test.ts +45 -0
- package/src/services/token.service.ts +33 -0
- package/src/types/result.test.ts +10 -0
- package/tests/integration/cli-consistency.test.ts +1 -4
- package/vitest.config.ts +4 -0
- package/dist/chunk-5QMHZUC4.js.map +0 -1
- package/dist/chunk-BICFAWMN.js.map +0 -1
- package/dist/chunk-J7J6LXOJ.js.map +0 -1
- package/scripts/readme-version-updater.cjs +0 -18
package/package.json
CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "bluera-knowledge",
-  "version": "0.9.26",
+  "version": "0.9.30",
   "description": "CLI tool for managing knowledge stores with semantic search",
   "type": "module",
   "bin": {
@@ -92,6 +92,8 @@
   "hono": "^4.11.1",
   "node-addon-api": "^8.5.0",
   "ora": "^9.0.0",
+  "pino": "^9.6.0",
+  "pino-roll": "^1.3.0",
   "slurp-ai": "^1.0.6",
   "tree-sitter": "^0.25.0",
   "tree-sitter-go": "^0.23.1",
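The two pino packages are the only new dependencies; they back the new `src/logging/` module (`logger.ts`, `payload.ts`) that the rest of this diff wires into the crawler. The actual `src/logging/logger.ts` is not shown in this diff, so the following is only a plausible sketch of a `createLogger` factory over pino with a pino-roll rotating-file transport; the log path, rotation frequency, and level handling are assumptions.

```ts
// Hypothetical sketch of src/logging/logger.ts (not shown in this diff).
// Log path, rotation frequency, and level source are assumptions.
import pino from 'pino';

const transport = pino.transport({
  target: 'pino-roll',
  options: { file: './logs/bluera-knowledge', frequency: 'daily', mkdir: true },
});

const root = pino({ level: process.env['LOG_LEVEL'] ?? 'info' }, transport);

// Child loggers carry a module tag, matching calls like
// createLogger('article-converter') seen later in this diff.
export function createLogger(name: string): pino.Logger {
  return root.child({ module: name });
}

// truncateForLog is also exported from src/logging; a plausible shape:
export function truncateForLog(text: string, maxLength: number): string {
  return text.length > maxLength ? `${text.slice(0, maxLength)}…` : text;
}
```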
package/src/analysis/ast-parser.test.ts
CHANGED
@@ -356,6 +356,52 @@ import type { Props } from "./types";
   });

   describe('Edge cases and special scenarios', () => {
+    it('handles default exported class', () => {
+      const code = 'export default class DefaultClass { method() {} }';
+      const nodes = parser.parse(code, 'typescript');
+
+      expect(nodes).toHaveLength(1);
+      expect(nodes[0]?.exported).toBe(true);
+      expect(nodes[0]?.name).toBe('DefaultClass');
+    });
+
+    it('handles anonymous default exported class (no id)', () => {
+      const code = 'export default class { method() {} }';
+      const nodes = parser.parse(code, 'javascript');
+
+      // Anonymous classes don't have an id, should be skipped
+      expect(nodes.filter(n => n.type === 'class')).toHaveLength(0);
+    });
+
+    it('handles class with computed property method (non-identifier key)', () => {
+      const code = `class MyClass {
+        ['computed']() { return 42; }
+        normalMethod() { return 1; }
+      }`;
+      const nodes = parser.parse(code, 'javascript');
+
+      // Computed properties have StringLiteral keys, not Identifier
+      // Only normalMethod should be captured
+      expect(nodes[0]?.methods).toHaveLength(1);
+      expect(nodes[0]?.methods?.[0]?.name).toBe('normalMethod');
+    });
+
+    it('handles class with rest parameters in method', () => {
+      const code = `class MyClass {
+        method(...args) { return args; }
+      }`;
+      const nodes = parser.parse(code, 'javascript');
+
+      expect(nodes[0]?.methods?.[0]?.signature).toBe('method(param)');
+    });
+
+    it('handles function with rest parameters', () => {
+      const code = 'function spread(...items) { return items; }';
+      const nodes = parser.parse(code, 'javascript');
+
+      expect(nodes[0]?.signature).toBe('spread(param)');
+    });
+
     it('handles functions with complex destructured parameters', () => {
       const code = 'function complex({ a, b }, [c, d]) { return a + b + c + d; }';
       const nodes = parser.parse(code, 'javascript');
package/src/cli/commands/crawl.test.ts
CHANGED
@@ -38,6 +38,7 @@ describe('crawl command execution', () => {
     store: {
       getByIdOrName: vi.fn(),
       list: vi.fn(),
+      create: vi.fn(),
     },
     lance: {
       initialize: vi.fn(),
@@ -291,23 +292,110 @@ describe('crawl command execution', () => {
     });
   });

-  describe('
-    it('
+  describe('store auto-creation', () => {
+    it('auto-creates web store when store does not exist', async () => {
+      const createdStore: WebStore = {
+        id: createStoreId('new-store-id'),
+        name: 'new-store',
+        type: 'web',
+        url: 'https://example.com',
+        depth: 2,
+        createdAt: new Date(),
+        updatedAt: new Date(),
+      };
+
       mockServices.store.getByIdOrName.mockResolvedValue(undefined);
+      mockServices.store.create.mockResolvedValue({ success: true, data: createdStore });
+      mockServices.lance.initialize.mockResolvedValue(undefined);
+      mockServices.embeddings.embed.mockResolvedValue([0.1, 0.2, 0.3]);
+      mockServices.lance.addDocuments.mockResolvedValue(undefined);
+
+      mockCrawler.crawl.mockReturnValue(
+        (async function* () {
+          yield {
+            url: 'https://example.com/page1',
+            title: 'Page 1',
+            markdown: '# Content',
+            depth: 0,
+          };
+        })()
+      );

       const command = createCrawlCommand(getOptions);
       const actionHandler = command._actionHandler;

-      await
+      await actionHandler(['https://example.com', 'new-store']);

-      expect(
-
+      expect(mockServices.store.create).toHaveBeenCalledWith({
+        name: 'new-store',
+        type: 'web',
+        url: 'https://example.com',
+      });
+      expect(consoleLogSpy).toHaveBeenCalledWith('Created web store: new-store');
+      expect(mockCrawler.crawl).toHaveBeenCalled();
+    });
+
+    it('throws error when store creation fails', async () => {
+      mockServices.store.getByIdOrName.mockResolvedValue(undefined);
+      mockServices.store.create.mockResolvedValue({ success: false, error: new Error('Name already exists') });
+
+      const command = createCrawlCommand(getOptions);
+      const actionHandler = command._actionHandler;
+
+      await expect(actionHandler(['https://example.com', 'bad-store'])).rejects.toThrow(
+        'Failed to create store: Name already exists'
       );
-      expect(processExitSpy).toHaveBeenCalledWith(3);
       expect(mockCrawler.crawl).not.toHaveBeenCalled();
     });

-  it('
+    it('includes storeCreated in JSON output when store was created', async () => {
+      const createdStore: WebStore = {
+        id: createStoreId('new-store-id'),
+        name: 'new-store',
+        type: 'web',
+        url: 'https://example.com',
+        depth: 2,
+        createdAt: new Date(),
+        updatedAt: new Date(),
+      };
+
+      mockServices.store.getByIdOrName.mockResolvedValue(undefined);
+      mockServices.store.create.mockResolvedValue({ success: true, data: createdStore });
+      mockServices.lance.initialize.mockResolvedValue(undefined);
+      mockServices.embeddings.embed.mockResolvedValue([0.1, 0.2, 0.3]);
+      mockServices.lance.addDocuments.mockResolvedValue(undefined);
+
+      mockCrawler.crawl.mockReturnValue(
+        (async function* () {
+          yield {
+            url: 'https://example.com/page1',
+            title: 'Page 1',
+            markdown: '# Content',
+            depth: 0,
+          };
+        })()
+      );
+
+      getOptions = () => ({
+        config: undefined,
+        dataDir: '/tmp/test',
+        quiet: false,
+        format: 'json',
+      });
+
+      const command = createCrawlCommand(getOptions);
+      const actionHandler = command._actionHandler;
+
+      await actionHandler(['https://example.com', 'new-store']);
+
+      expect(consoleLogSpy).toHaveBeenCalledWith(
+        expect.stringContaining('"storeCreated": true')
+      );
+    });
+  });
+
+  describe('error handling', () => {
+    it('throws error when store is not a web store', async () => {
       const mockFileStore = {
         id: createStoreId('store-1'),
         name: 'file-store',
@@ -322,10 +410,9 @@ describe('crawl command execution', () => {
       const command = createCrawlCommand(getOptions);
       const actionHandler = command._actionHandler;

-      await expect(actionHandler(['https://example.com', 'file-store'])).rejects.toThrow(
-
-
-      expect(processExitSpy).toHaveBeenCalledWith(3);
+      await expect(actionHandler(['https://example.com', 'file-store'])).rejects.toThrow(
+        'Store "file-store" exists but is not a web store (type: file)'
+      );
     });

     it('exits with code 6 when crawling fails', async () => {
@@ -933,7 +1020,7 @@ describe('crawl command execution', () => {
         await actionHandler(['https://example.com', 'test-store']);

         expect(consoleLogSpy).toHaveBeenCalledWith('Crawling https://example.com');
-        expect(consoleLogSpy).toHaveBeenCalledWith('Crawled
+        expect(consoleLogSpy).toHaveBeenCalledWith('Crawled 1 pages, indexed 1 chunks');
       } finally {
         Object.defineProperty(process.stdout, 'isTTY', { value: originalIsTTY, configurable: true });
       }
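The mocks above resolve `store.create` to either `{ success: true, data }` or `{ success: false, error }`, i.e. the discriminated-union result type this package uses instead of throwing (note `src/types/result.test.ts` in the file list). A minimal sketch of that shape, with names inferred from how the tests and the crawl command consume it; the actual definition may differ:

```ts
// Minimal sketch of the result union implied by the mocks above; the
// real definition lives in src/types and may differ in naming.
type Result<T, E = Error> =
  | { success: true; data: T }
  | { success: false; error: E };

// Checking `success` narrows the union, which is why the crawl command
// can read result.error.message only on the failure branch.
function unwrapStore<T>(result: Result<T>): T {
  if (!result.success) {
    throw new Error(`Failed to create store: ${result.error.message}`);
  }
  return result.data;
}
```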
package/src/cli/commands/crawl.ts
CHANGED
@@ -6,6 +6,9 @@ import { IntelligentCrawler, type CrawlProgress } from '../../crawl/intelligent-
 import { createDocumentId } from '../../types/brands.js';
 import type { GlobalOptions } from '../program.js';
 import type { Document } from '../../types/document.js';
+import type { WebStore } from '../../types/store.js';
+import { ChunkingService } from '../../services/chunking.service.js';
+import { classifyWebContentType } from '../../services/index.service.js';

 export function createCrawlCommand(getOptions: () => GlobalOptions): Command {
   return new Command('crawl')
@@ -27,10 +30,38 @@ export function createCrawlCommand(getOptions: () => GlobalOptions): Command {
       const globalOpts = getOptions();
       const services = await createServices(globalOpts.config, globalOpts.dataDir);

-
-
-
-
+      // Look up or auto-create web store
+      let store: WebStore;
+      let storeCreated = false;
+      const existingStore = await services.store.getByIdOrName(storeIdOrName);
+
+      if (!existingStore) {
+        // Auto-create web store
+        const result = await services.store.create({
+          name: storeIdOrName,
+          type: 'web',
+          url,
+        });
+        if (!result.success) {
+          await destroyServices(services);
+          throw new Error(`Failed to create store: ${result.error.message}`);
+        }
+        // Type narrowing: success check above ensures result.data is Store
+        // We know it's a WebStore because we created it with type: 'web'
+        const createdStore = result.data;
+        if (createdStore.type !== 'web') {
+          throw new Error('Unexpected store type after creation');
+        }
+        store = createdStore;
+        storeCreated = true;
+        if (globalOpts.quiet !== true && globalOpts.format !== 'json') {
+          console.log(`Created web store: ${store.name}`);
+        }
+      } else if (existingStore.type !== 'web') {
+        await destroyServices(services);
+        throw new Error(`Store "${storeIdOrName}" exists but is not a web store (type: ${existingStore.type})`);
+      } else {
+        store = existingStore;
       }

       const maxPages = cmdOptions.maxPages !== undefined ? parseInt(cmdOptions.maxPages) : 50;
@@ -47,7 +78,10 @@ export function createCrawlCommand(getOptions: () => GlobalOptions): Command {
       }

       const crawler = new IntelligentCrawler();
+      // Use web preset for larger prose-friendly chunks
+      const webChunker = ChunkingService.forContentType('web');
       let pagesIndexed = 0;
+      let chunksCreated = 0;

       // Listen for progress events
       crawler.on('progress', (progress: CrawlProgress) => {
@@ -78,24 +112,40 @@ export function createCrawlCommand(getOptions: () => GlobalOptions): Command {
         ...(cmdOptions.simple !== undefined && { simple: cmdOptions.simple }),
         useHeadless: cmdOptions.headless ?? false,
       })) {
-        //
-        const
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        // Use extracted content if available, otherwise markdown
+        const contentToProcess = result.extracted !== undefined ? result.extracted : result.markdown;
+
+        // Chunk the content using markdown-aware chunking (web content is converted to markdown)
+        const chunks = webChunker.chunk(contentToProcess, `${result.url}.md`);
+        const fileType = classifyWebContentType(result.url, result.title);
+        const urlHash = createHash('md5').update(result.url).digest('hex');
+
+        for (const chunk of chunks) {
+          const chunkId = chunks.length > 1
+            ? `${store.id}-${urlHash}-${String(chunk.chunkIndex)}`
+            : `${store.id}-${urlHash}`;
+          const vector = await services.embeddings.embed(chunk.content);
+
+          docs.push({
+            id: createDocumentId(chunkId),
+            content: chunk.content,
+            vector,
+            metadata: {
+              type: chunks.length > 1 ? 'chunk' : 'web',
+              storeId: store.id,
+              url: result.url,
+              title: result.title,
+              extracted: result.extracted !== undefined,
+              depth: result.depth,
+              indexedAt: new Date(),
+              fileType,
+              chunkIndex: chunk.chunkIndex,
+              totalChunks: chunk.totalChunks,
+              sectionHeader: chunk.sectionHeader,
+            },
+          });
+          chunksCreated++;
+        }

         pagesIndexed++;
       }
@@ -111,8 +161,10 @@ export function createCrawlCommand(getOptions: () => GlobalOptions): Command {
       const crawlResult = {
         success: true,
         store: store.name,
+        storeCreated,
         url,
         pagesCrawled: pagesIndexed,
+        chunksCreated,
         mode: cmdOptions.simple === true ? 'simple' : 'intelligent',
         hadCrawlInstruction: cmdOptions.crawl !== undefined,
         hadExtractInstruction: cmdOptions.extract !== undefined,
@@ -121,9 +173,9 @@ export function createCrawlCommand(getOptions: () => GlobalOptions): Command {
       if (globalOpts.format === 'json') {
         console.log(JSON.stringify(crawlResult, null, 2));
       } else if (spinner !== undefined) {
-        spinner.succeed(`Crawled
+        spinner.succeed(`Crawled ${String(pagesIndexed)} pages, indexed ${String(chunksCreated)} chunks`);
       } else if (globalOpts.quiet !== true) {
-        console.log(`Crawled
+        console.log(`Crawled ${String(pagesIndexed)} pages, indexed ${String(chunksCreated)} chunks`);
       }
     } catch (error) {
       const message = `Crawl failed: ${error instanceof Error ? error.message : String(error)}`;
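The chunk IDs built in the loop above combine the store ID with an MD5 hash of the page URL, appending a chunk index only when a page splits into more than one chunk. A small standalone illustration with a placeholder store ID:

```ts
import { createHash } from 'node:crypto';

// Reproduces the ID scheme from the loop above for a hypothetical store.
const storeId = 'store-123'; // placeholder; real IDs come from createStoreId
const urlHash = createHash('md5').update('https://example.com/page1').digest('hex');

const singlePageId = `${storeId}-${urlHash}`; // page fits in one chunk
const chunkedId = `${storeId}-${urlHash}-0`;  // first of several chunks
```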
package/src/crawl/article-converter.ts
CHANGED
@@ -7,6 +7,9 @@ import { extractFromHtml } from '@extractus/article-extractor';
 import TurndownService from 'turndown';
 import { gfm } from 'turndown-plugin-gfm';
 import { preprocessHtmlForCodeBlocks, cleanupMarkdown } from './markdown-utils.js';
+import { createLogger, truncateForLog } from '../logging/index.js';
+
+const logger = createLogger('article-converter');

 export interface ConversionResult {
   markdown: string;
@@ -28,6 +31,8 @@ export async function convertHtmlToMarkdown(
   html: string,
   url: string,
 ): Promise<ConversionResult> {
+  logger.debug({ url, htmlLength: html.length }, 'Starting HTML conversion');
+
   try {
     // Step 1: Extract main article content
     let articleHtml: string;
@@ -38,13 +43,25 @@ export async function convertHtmlToMarkdown(
       if (article !== null && article.content !== undefined && article.content !== '') {
         articleHtml = article.content;
         title = article.title !== undefined && article.title !== '' ? article.title : undefined;
+        logger.debug({
+          url,
+          title,
+          extractedLength: articleHtml.length,
+          usedFullHtml: false,
+        }, 'Article content extracted');
       } else {
         // Fallback to full HTML if extraction fails
         articleHtml = html;
+        logger.debug({ url, usedFullHtml: true }, 'Article extraction returned empty, using full HTML');
       }
-    } catch {
+    } catch (extractError) {
       // Fallback to full HTML if extraction fails
       articleHtml = html;
+      logger.debug({
+        url,
+        usedFullHtml: true,
+        error: extractError instanceof Error ? extractError.message : String(extractError),
+      }, 'Article extraction failed, using full HTML');
     }

     // Step 2: Preprocess HTML for code blocks
@@ -83,12 +100,30 @@ export async function convertHtmlToMarkdown(
     // Step 4: Cleanup markdown with comprehensive regex patterns
     const markdown = cleanupMarkdown(rawMarkdown);

+    logger.debug({
+      url,
+      title,
+      rawMarkdownLength: rawMarkdown.length,
+      finalMarkdownLength: markdown.length,
+    }, 'HTML to markdown conversion complete');
+
+    // Log markdown preview at trace level
+    logger.trace({
+      url,
+      markdownPreview: truncateForLog(markdown, 1000),
+    }, 'Markdown content preview');
+
     return {
       markdown,
       ...(title !== undefined && { title }),
       success: true,
     };
   } catch (error) {
+    logger.error({
+      url,
+      error: error instanceof Error ? error.message : String(error),
+    }, 'HTML to markdown conversion failed');
+
     return {
       markdown: '',
       success: false,
package/src/crawl/bridge.ts
CHANGED
@@ -11,6 +11,9 @@ import {
   validateHeadlessResult,
   validateParsePythonResult,
 } from './schemas.js';
+import { createLogger } from '../logging/index.js';
+
+const logger = createLogger('python-bridge');

 // Re-export for backwards compatibility
 export type { CrawledLink, ParsePythonResult };
@@ -32,24 +35,26 @@ export class PythonBridge {
   start(): Promise<void> {
     if (this.process) return Promise.resolve();

+    logger.debug('Starting Python bridge process');
+
     this.process = spawn('python3', ['python/crawl_worker.py'], {
       stdio: ['pipe', 'pipe', 'pipe'],
     });

     // Add error handler for process spawn errors
     this.process.on('error', (err) => {
-
+      logger.error({ error: err.message, stack: err.stack }, 'Python bridge process error');
       this.rejectAllPending(new Error(`Process error: ${err.message}`));
     });

     // Add exit handler to detect non-zero exits
     this.process.on('exit', (code, signal) => {
       if (code !== 0 && code !== null) {
-
+        logger.error({ code }, 'Python bridge process exited with non-zero code');
         this.rejectAllPending(new Error(`Process exited with code ${String(code)}`));
       } else if (signal && !this.stoppingIntentionally) {
         // Only log if we didn't intentionally stop the process
-
+        logger.error({ signal }, 'Python bridge process killed with signal');
         this.rejectAllPending(new Error(`Process killed with signal ${signal}`));
       }
       this.process = null;
@@ -60,7 +65,7 @@ export class PythonBridge {
     if (this.process.stderr) {
       const stderrRl = createInterface({ input: this.process.stderr });
       stderrRl.on('line', (line) => {
-
+        logger.warn({ stderr: line }, 'Python bridge stderr output');
       });
     }

@@ -106,11 +111,14 @@ export class PythonBridge {
     } catch (error: unknown) {
       // Log validation failure with original response for debugging
       if (error instanceof ZodError) {
-
-
+        logger.error({
+          issues: error.issues,
+          response: JSON.stringify(response.result),
+        }, 'Python bridge response validation failed');
         pending.reject(new Error(`Invalid response format from Python bridge: ${error.message}`));
       } else {
         const errorMessage = error instanceof Error ? error.message : String(error);
+        logger.error({ error: errorMessage }, 'Response validation error');
         pending.reject(new Error(`Response validation error: ${errorMessage}`));
       }
     }
@@ -118,7 +126,10 @@ export class PythonBridge {
       // If neither result nor error, leave pending (will timeout)
     }
   } catch (err) {
-
+    logger.error({
+      error: err instanceof Error ? err.message : String(err),
+      line,
+    }, 'Failed to parse JSON response from Python bridge');
   }
 });
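The bridge spawns `python3 python/crawl_worker.py` and reads newline-delimited JSON from its stdout, settling per-request promises from a pending map (hence `pending.reject` and `rejectAllPending` above). A stripped-down sketch of that request/response pattern; the `id`/`result`/`error` field names and the `request` helper are assumptions, not the package's actual API:

```ts
// Stripped-down sketch of the JSON-lines stdio pattern used by the bridge.
// Message field names (id/method/params/result/error) are assumptions.
import { spawn, type ChildProcess } from 'node:child_process';
import { createInterface } from 'node:readline';

type Pending = { resolve: (v: unknown) => void; reject: (e: Error) => void };

class MiniBridge {
  private process: ChildProcess | null = null;
  private pending = new Map<number, Pending>();
  private nextId = 0;

  start(): void {
    this.process = spawn('python3', ['python/crawl_worker.py'], {
      stdio: ['pipe', 'pipe', 'pipe'],
    });
    const rl = createInterface({ input: this.process.stdout! });
    rl.on('line', (line) => {
      try {
        const msg = JSON.parse(line) as { id: number; result?: unknown; error?: string };
        const entry = this.pending.get(msg.id);
        if (!entry) return;
        this.pending.delete(msg.id);
        if (msg.error !== undefined) entry.reject(new Error(msg.error));
        else entry.resolve(msg.result);
      } catch {
        // Malformed line: the real bridge logs this and leaves the request pending.
      }
    });
  }

  request(method: string, params: unknown): Promise<unknown> {
    const id = this.nextId++;
    return new Promise((resolve, reject) => {
      this.pending.set(id, { resolve, reject });
      this.process?.stdin?.write(JSON.stringify({ id, method, params }) + '\n');
    });
  }
}
```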
package/src/crawl/intelligent-crawler.ts
CHANGED
@@ -8,6 +8,9 @@ import axios from 'axios';
 import { ClaudeClient, type CrawlStrategy } from './claude-client.js';
 import { convertHtmlToMarkdown } from './article-converter.js';
 import { PythonBridge, type CrawledLink } from './bridge.js';
+import { createLogger, summarizePayload } from '../logging/index.js';
+
+const logger = createLogger('crawler');

 export interface CrawlOptions {
   crawlInstruction?: string; // Natural language: what to crawl
@@ -69,6 +72,13 @@ export class IntelligentCrawler extends EventEmitter {
     this.visited.clear();
     this.stopped = false;

+    logger.info({
+      seedUrl,
+      maxPages,
+      mode: simple ? 'simple' : (crawlInstruction !== undefined && crawlInstruction !== '' ? 'intelligent' : 'simple'),
+      hasExtractInstruction: extractInstruction !== undefined,
+    }, 'Starting crawl');
+
     const startProgress: CrawlProgress = {
       type: 'start',
       pagesVisited: 0,
@@ -86,6 +96,11 @@ export class IntelligentCrawler extends EventEmitter {
       yield* this.crawlSimple(seedUrl, extractInstruction, maxPages, options.useHeadless ?? false);
     }

+    logger.info({
+      seedUrl,
+      pagesVisited: this.visited.size,
+    }, 'Crawl complete');
+
     const completeProgress: CrawlProgress = {
       type: 'complete',
       pagesVisited: this.visited.size,
@@ -220,9 +235,9 @@ export class IntelligentCrawler extends EventEmitter {
     const links = await this.extractLinks(current.url, useHeadless);

     if (links.length === 0) {
-
+      logger.debug({ url: current.url }, 'No links found - page may be a leaf node');
     } else {
-
+      logger.debug({ url: current.url, linkCount: links.length }, 'Links extracted from page');
     }

     for (const link of links) {
@@ -283,9 +298,16 @@ export class IntelligentCrawler extends EventEmitter {
     const conversion = await convertHtmlToMarkdown(html, url);

     if (!conversion.success) {
+      logger.error({ url, error: conversion.error }, 'HTML to markdown conversion failed');
       throw new Error(`Failed to convert HTML: ${conversion.error ?? 'Unknown error'}`);
     }

+    logger.debug({
+      url,
+      title: conversion.title,
+      markdownLength: conversion.markdown.length,
+    }, 'Article converted to markdown');
+
     let extracted: string | undefined;

     // Optional: Extract specific information using Claude
@@ -342,13 +364,23 @@ export class IntelligentCrawler extends EventEmitter {
    * Fetch HTML content from a URL
    */
   private async fetchHtml(url: string, useHeadless: boolean = false): Promise<string> {
+    const startTime = Date.now();
+    logger.debug({ url, useHeadless }, 'Fetching HTML');
+
     if (useHeadless) {
       try {
         const result = await this.pythonBridge.fetchHeadless(url);
+        const durationMs = Date.now() - startTime;
+        logger.info({
+          url,
+          useHeadless: true,
+          durationMs,
+          ...summarizePayload(result.html, 'raw-html', url),
+        }, 'Raw HTML fetched');
         return result.html;
       } catch (error) {
         // Fallback to axios if headless fails
-
+        logger.warn({ url, error: error instanceof Error ? error.message : String(error) }, 'Headless fetch failed, falling back to axios');
       }
     }

@@ -362,8 +394,17 @@ export class IntelligentCrawler extends EventEmitter {
       },
     });

+    const durationMs = Date.now() - startTime;
+    logger.info({
+      url,
+      useHeadless: false,
+      durationMs,
+      ...summarizePayload(response.data, 'raw-html', url),
+    }, 'Raw HTML fetched');
+
     return response.data;
   } catch (error) {
+    logger.error({ url, error: error instanceof Error ? error.message : String(error) }, 'Failed to fetch HTML');
     throw new Error(
       `Failed to fetch ${url}: ${error instanceof Error ? error.message : String(error)}`,
     );
@@ -398,7 +439,7 @@ export class IntelligentCrawler extends EventEmitter {
   } catch (error: unknown) {
     // Log the error for debugging
     const errorMessage = error instanceof Error ? error.message : String(error);
-
+    logger.error({ url, error: errorMessage }, 'Failed to extract links');

     // Re-throw the error instead of silently swallowing it
     throw new Error(`Link extraction failed for ${url}: ${errorMessage}`);
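`summarizePayload` (from the new `src/logging/payload.ts`) is spread directly into the log object wherever raw HTML is fetched above. Its implementation is not part of this diff; the following is a hypothetical shape consistent with that call site, with invented field names:

```ts
// Hypothetical sketch only: the real summarizePayload lives in
// src/logging/payload.ts, which this diff does not show. Field names
// here are invented to match how the result is spread into log objects.
import { Buffer } from 'node:buffer';

export function summarizePayload(
  payload: string,
  kind: string,
  source: string,
): Record<string, unknown> {
  return {
    payloadKind: kind,
    payloadSource: source,
    payloadBytes: Buffer.byteLength(payload, 'utf8'),
    payloadPreview: payload.slice(0, 200),
  };
}
```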
package/src/db/embeddings.test.ts
CHANGED
@@ -48,6 +48,22 @@
     const customEngine = new EmbeddingEngine('Xenova/all-MiniLM-L6-v2', 512);
     expect(customEngine.getDimensions()).toBe(512);
   });
+
+  it('skips initialization when already initialized', async () => {
+    // Engine is already initialized from beforeAll
+    // Calling initialize again should be a no-op
+    await engine.initialize();
+    const embedding = await engine.embed('test');
+    expect(embedding).toHaveLength(384);
+  });
+
+  it('handles large batch with multiple chunks', async () => {
+    // Create enough texts to trigger multiple batch iterations (BATCH_SIZE = 32)
+    const texts = Array.from({ length: 40 }, (_, i) => `Text number ${String(i)}`);
+    const embeddings = await engine.embedBatch(texts);
+    expect(embeddings).toHaveLength(40);
+    expect(embeddings.every((e) => e.length === 384)).toBe(true);
+  }, 60000);
 });

 function cosineSimilarity(a: number[], b: number[]): number {