bluera-knowledge 0.9.26 → 0.9.31
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/commands/commit.md +4 -7
- package/.claude/hooks/post-edit-check.sh +21 -24
- package/.claude/skills/atomic-commits/SKILL.md +6 -0
- package/.claude-plugin/plugin.json +1 -1
- package/.env.example +4 -0
- package/.husky/pre-push +12 -2
- package/.versionrc.json +0 -4
- package/BUGS-FOUND.md +71 -0
- package/CHANGELOG.md +76 -0
- package/README.md +55 -20
- package/bun.lock +35 -1
- package/commands/crawl.md +2 -0
- package/dist/{chunk-BICFAWMN.js → chunk-2SJHNRXD.js} +73 -8
- package/dist/chunk-2SJHNRXD.js.map +1 -0
- package/dist/{chunk-J7J6LXOJ.js → chunk-OGEY66FZ.js} +106 -41
- package/dist/chunk-OGEY66FZ.js.map +1 -0
- package/dist/{chunk-5QMHZUC4.js → chunk-RWSXP3PQ.js} +482 -106
- package/dist/chunk-RWSXP3PQ.js.map +1 -0
- package/dist/index.js +73 -28
- package/dist/index.js.map +1 -1
- package/dist/mcp/server.js +2 -2
- package/dist/workers/background-worker-cli.js +2 -2
- package/eslint.config.js +1 -1
- package/package.json +3 -1
- package/src/analysis/ast-parser.test.ts +46 -0
- package/src/cli/commands/crawl.test.ts +99 -12
- package/src/cli/commands/crawl.ts +76 -24
- package/src/cli/commands/store.test.ts +68 -1
- package/src/cli/commands/store.ts +9 -3
- package/src/crawl/article-converter.ts +36 -1
- package/src/crawl/bridge.ts +18 -7
- package/src/crawl/intelligent-crawler.ts +45 -4
- package/src/db/embeddings.test.ts +16 -0
- package/src/db/lance.test.ts +31 -0
- package/src/db/lance.ts +8 -0
- package/src/logging/index.ts +29 -0
- package/src/logging/logger.test.ts +75 -0
- package/src/logging/logger.ts +147 -0
- package/src/logging/payload.test.ts +152 -0
- package/src/logging/payload.ts +121 -0
- package/src/mcp/handlers/search.handler.test.ts +28 -9
- package/src/mcp/handlers/search.handler.ts +69 -29
- package/src/mcp/handlers/store.handler.test.ts +1 -0
- package/src/mcp/server.ts +44 -16
- package/src/services/chunking.service.ts +23 -0
- package/src/services/index.service.test.ts +921 -1
- package/src/services/index.service.ts +76 -1
- package/src/services/index.ts +20 -2
- package/src/services/search.service.test.ts +573 -21
- package/src/services/search.service.ts +257 -105
- package/src/services/services.test.ts +2 -2
- package/src/services/snippet.service.ts +28 -3
- package/src/services/store.service.test.ts +28 -0
- package/src/services/store.service.ts +4 -0
- package/src/services/token.service.test.ts +45 -0
- package/src/services/token.service.ts +33 -0
- package/src/types/result.test.ts +10 -0
- package/tests/integration/cli-consistency.test.ts +1 -4
- package/vitest.config.ts +4 -0
- package/dist/chunk-5QMHZUC4.js.map +0 -1
- package/dist/chunk-BICFAWMN.js.map +0 -1
- package/dist/chunk-J7J6LXOJ.js.map +0 -1
- package/scripts/readme-version-updater.cjs +0 -18
package/eslint.config.js
CHANGED
|
@@ -42,7 +42,7 @@ export default tseslint.config(
|
|
|
42
42
|
},
|
|
43
43
|
},
|
|
44
44
|
{
|
|
45
|
-
ignores: ['dist/**', 'node_modules/**', '*.config.js', '**/*.test.ts', 'tests/**/*.ts'],
|
|
45
|
+
ignores: ['dist/**', 'node_modules/**', '*.config.js', '*.config.ts', '**/*.test.ts', 'tests/**/*.ts'],
|
|
46
46
|
},
|
|
47
47
|
// Test files: Apply custom skip-comment rule only
|
|
48
48
|
{
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "bluera-knowledge",
|
|
3
|
-
"version": "0.9.26",
|
|
3
|
+
"version": "0.9.31",
|
|
4
4
|
"description": "CLI tool for managing knowledge stores with semantic search",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"bin": {
|
|
@@ -92,6 +92,8 @@
|
|
|
92
92
|
"hono": "^4.11.1",
|
|
93
93
|
"node-addon-api": "^8.5.0",
|
|
94
94
|
"ora": "^9.0.0",
|
|
95
|
+
"pino": "^9.6.0",
|
|
96
|
+
"pino-roll": "^1.3.0",
|
|
95
97
|
"slurp-ai": "^1.0.6",
|
|
96
98
|
"tree-sitter": "^0.25.0",
|
|
97
99
|
"tree-sitter-go": "^0.23.1",
|
|
package/src/analysis/ast-parser.test.ts
CHANGED
|
@@ -356,6 +356,52 @@ import type { Props } from "./types";
|
|
|
356
356
|
});
|
|
357
357
|
|
|
358
358
|
describe('Edge cases and special scenarios', () => {
|
|
359
|
+
it('handles default exported class', () => {
|
|
360
|
+
const code = 'export default class DefaultClass { method() {} }';
|
|
361
|
+
const nodes = parser.parse(code, 'typescript');
|
|
362
|
+
|
|
363
|
+
expect(nodes).toHaveLength(1);
|
|
364
|
+
expect(nodes[0]?.exported).toBe(true);
|
|
365
|
+
expect(nodes[0]?.name).toBe('DefaultClass');
|
|
366
|
+
});
|
|
367
|
+
|
|
368
|
+
it('handles anonymous default exported class (no id)', () => {
|
|
369
|
+
const code = 'export default class { method() {} }';
|
|
370
|
+
const nodes = parser.parse(code, 'javascript');
|
|
371
|
+
|
|
372
|
+
// Anonymous classes don't have an id, should be skipped
|
|
373
|
+
expect(nodes.filter(n => n.type === 'class')).toHaveLength(0);
|
|
374
|
+
});
|
|
375
|
+
|
|
376
|
+
it('handles class with computed property method (non-identifier key)', () => {
|
|
377
|
+
const code = `class MyClass {
|
|
378
|
+
['computed']() { return 42; }
|
|
379
|
+
normalMethod() { return 1; }
|
|
380
|
+
}`;
|
|
381
|
+
const nodes = parser.parse(code, 'javascript');
|
|
382
|
+
|
|
383
|
+
// Computed properties have StringLiteral keys, not Identifier
|
|
384
|
+
// Only normalMethod should be captured
|
|
385
|
+
expect(nodes[0]?.methods).toHaveLength(1);
|
|
386
|
+
expect(nodes[0]?.methods?.[0]?.name).toBe('normalMethod');
|
|
387
|
+
});
|
|
388
|
+
|
|
389
|
+
it('handles class with rest parameters in method', () => {
|
|
390
|
+
const code = `class MyClass {
|
|
391
|
+
method(...args) { return args; }
|
|
392
|
+
}`;
|
|
393
|
+
const nodes = parser.parse(code, 'javascript');
|
|
394
|
+
|
|
395
|
+
expect(nodes[0]?.methods?.[0]?.signature).toBe('method(param)');
|
|
396
|
+
});
|
|
397
|
+
|
|
398
|
+
it('handles function with rest parameters', () => {
|
|
399
|
+
const code = 'function spread(...items) { return items; }';
|
|
400
|
+
const nodes = parser.parse(code, 'javascript');
|
|
401
|
+
|
|
402
|
+
expect(nodes[0]?.signature).toBe('spread(param)');
|
|
403
|
+
});
|
|
404
|
+
|
|
359
405
|
it('handles functions with complex destructured parameters', () => {
|
|
360
406
|
const code = 'function complex({ a, b }, [c, d]) { return a + b + c + d; }';
|
|
361
407
|
const nodes = parser.parse(code, 'javascript');
|
|
package/src/cli/commands/crawl.test.ts
CHANGED
|
@@ -38,6 +38,7 @@ describe('crawl command execution', () => {
|
|
|
38
38
|
store: {
|
|
39
39
|
getByIdOrName: vi.fn(),
|
|
40
40
|
list: vi.fn(),
|
|
41
|
+
create: vi.fn(),
|
|
41
42
|
},
|
|
42
43
|
lance: {
|
|
43
44
|
initialize: vi.fn(),
|
|
@@ -291,23 +292,110 @@ describe('crawl command execution', () => {
|
|
|
291
292
|
});
|
|
292
293
|
});
|
|
293
294
|
|
|
294
|
-
describe('
|
|
295
|
-
it('
|
|
295
|
+
describe('store auto-creation', () => {
|
|
296
|
+
it('auto-creates web store when store does not exist', async () => {
|
|
297
|
+
const createdStore: WebStore = {
|
|
298
|
+
id: createStoreId('new-store-id'),
|
|
299
|
+
name: 'new-store',
|
|
300
|
+
type: 'web',
|
|
301
|
+
url: 'https://example.com',
|
|
302
|
+
depth: 2,
|
|
303
|
+
createdAt: new Date(),
|
|
304
|
+
updatedAt: new Date(),
|
|
305
|
+
};
|
|
306
|
+
|
|
296
307
|
mockServices.store.getByIdOrName.mockResolvedValue(undefined);
|
|
308
|
+
mockServices.store.create.mockResolvedValue({ success: true, data: createdStore });
|
|
309
|
+
mockServices.lance.initialize.mockResolvedValue(undefined);
|
|
310
|
+
mockServices.embeddings.embed.mockResolvedValue([0.1, 0.2, 0.3]);
|
|
311
|
+
mockServices.lance.addDocuments.mockResolvedValue(undefined);
|
|
312
|
+
|
|
313
|
+
mockCrawler.crawl.mockReturnValue(
|
|
314
|
+
(async function* () {
|
|
315
|
+
yield {
|
|
316
|
+
url: 'https://example.com/page1',
|
|
317
|
+
title: 'Page 1',
|
|
318
|
+
markdown: '# Content',
|
|
319
|
+
depth: 0,
|
|
320
|
+
};
|
|
321
|
+
})()
|
|
322
|
+
);
|
|
297
323
|
|
|
298
324
|
const command = createCrawlCommand(getOptions);
|
|
299
325
|
const actionHandler = command._actionHandler;
|
|
300
326
|
|
|
301
|
-
await
|
|
327
|
+
await actionHandler(['https://example.com', 'new-store']);
|
|
302
328
|
|
|
303
|
-
expect(
|
|
304
|
-
|
|
329
|
+
expect(mockServices.store.create).toHaveBeenCalledWith({
|
|
330
|
+
name: 'new-store',
|
|
331
|
+
type: 'web',
|
|
332
|
+
url: 'https://example.com',
|
|
333
|
+
});
|
|
334
|
+
expect(consoleLogSpy).toHaveBeenCalledWith('Created web store: new-store');
|
|
335
|
+
expect(mockCrawler.crawl).toHaveBeenCalled();
|
|
336
|
+
});
|
|
337
|
+
|
|
338
|
+
it('throws error when store creation fails', async () => {
|
|
339
|
+
mockServices.store.getByIdOrName.mockResolvedValue(undefined);
|
|
340
|
+
mockServices.store.create.mockResolvedValue({ success: false, error: new Error('Name already exists') });
|
|
341
|
+
|
|
342
|
+
const command = createCrawlCommand(getOptions);
|
|
343
|
+
const actionHandler = command._actionHandler;
|
|
344
|
+
|
|
345
|
+
await expect(actionHandler(['https://example.com', 'bad-store'])).rejects.toThrow(
|
|
346
|
+
'Failed to create store: Name already exists'
|
|
305
347
|
);
|
|
306
|
-
expect(processExitSpy).toHaveBeenCalledWith(3);
|
|
307
348
|
expect(mockCrawler.crawl).not.toHaveBeenCalled();
|
|
308
349
|
});
|
|
309
350
|
|
|
310
|
-
it('
|
|
351
|
+
it('includes storeCreated in JSON output when store was created', async () => {
|
|
352
|
+
const createdStore: WebStore = {
|
|
353
|
+
id: createStoreId('new-store-id'),
|
|
354
|
+
name: 'new-store',
|
|
355
|
+
type: 'web',
|
|
356
|
+
url: 'https://example.com',
|
|
357
|
+
depth: 2,
|
|
358
|
+
createdAt: new Date(),
|
|
359
|
+
updatedAt: new Date(),
|
|
360
|
+
};
|
|
361
|
+
|
|
362
|
+
mockServices.store.getByIdOrName.mockResolvedValue(undefined);
|
|
363
|
+
mockServices.store.create.mockResolvedValue({ success: true, data: createdStore });
|
|
364
|
+
mockServices.lance.initialize.mockResolvedValue(undefined);
|
|
365
|
+
mockServices.embeddings.embed.mockResolvedValue([0.1, 0.2, 0.3]);
|
|
366
|
+
mockServices.lance.addDocuments.mockResolvedValue(undefined);
|
|
367
|
+
|
|
368
|
+
mockCrawler.crawl.mockReturnValue(
|
|
369
|
+
(async function* () {
|
|
370
|
+
yield {
|
|
371
|
+
url: 'https://example.com/page1',
|
|
372
|
+
title: 'Page 1',
|
|
373
|
+
markdown: '# Content',
|
|
374
|
+
depth: 0,
|
|
375
|
+
};
|
|
376
|
+
})()
|
|
377
|
+
);
|
|
378
|
+
|
|
379
|
+
getOptions = () => ({
|
|
380
|
+
config: undefined,
|
|
381
|
+
dataDir: '/tmp/test',
|
|
382
|
+
quiet: false,
|
|
383
|
+
format: 'json',
|
|
384
|
+
});
|
|
385
|
+
|
|
386
|
+
const command = createCrawlCommand(getOptions);
|
|
387
|
+
const actionHandler = command._actionHandler;
|
|
388
|
+
|
|
389
|
+
await actionHandler(['https://example.com', 'new-store']);
|
|
390
|
+
|
|
391
|
+
expect(consoleLogSpy).toHaveBeenCalledWith(
|
|
392
|
+
expect.stringContaining('"storeCreated": true')
|
|
393
|
+
);
|
|
394
|
+
});
|
|
395
|
+
});
|
|
396
|
+
|
|
397
|
+
describe('error handling', () => {
|
|
398
|
+
it('throws error when store is not a web store', async () => {
|
|
311
399
|
const mockFileStore = {
|
|
312
400
|
id: createStoreId('store-1'),
|
|
313
401
|
name: 'file-store',
|
|
@@ -322,10 +410,9 @@ describe('crawl command execution', () => {
|
|
|
322
410
|
const command = createCrawlCommand(getOptions);
|
|
323
411
|
const actionHandler = command._actionHandler;
|
|
324
412
|
|
|
325
|
-
await expect(actionHandler(['https://example.com', 'file-store'])).rejects.toThrow(
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
expect(processExitSpy).toHaveBeenCalledWith(3);
|
|
413
|
+
await expect(actionHandler(['https://example.com', 'file-store'])).rejects.toThrow(
|
|
414
|
+
'Store "file-store" exists but is not a web store (type: file)'
|
|
415
|
+
);
|
|
329
416
|
});
|
|
330
417
|
|
|
331
418
|
it('exits with code 6 when crawling fails', async () => {
|
|
@@ -933,7 +1020,7 @@ describe('crawl command execution', () => {
|
|
|
933
1020
|
await actionHandler(['https://example.com', 'test-store']);
|
|
934
1021
|
|
|
935
1022
|
expect(consoleLogSpy).toHaveBeenCalledWith('Crawling https://example.com');
|
|
936
|
-
expect(consoleLogSpy).toHaveBeenCalledWith('Crawled
|
|
1023
|
+
expect(consoleLogSpy).toHaveBeenCalledWith('Crawled 1 pages, indexed 1 chunks');
|
|
937
1024
|
} finally {
|
|
938
1025
|
Object.defineProperty(process.stdout, 'isTTY', { value: originalIsTTY, configurable: true });
|
|
939
1026
|
}
|
|
package/src/cli/commands/crawl.ts
CHANGED
|
@@ -6,6 +6,9 @@ import { IntelligentCrawler, type CrawlProgress } from '../../crawl/intelligent-
|
|
|
6
6
|
import { createDocumentId } from '../../types/brands.js';
|
|
7
7
|
import type { GlobalOptions } from '../program.js';
|
|
8
8
|
import type { Document } from '../../types/document.js';
|
|
9
|
+
import type { WebStore } from '../../types/store.js';
|
|
10
|
+
import { ChunkingService } from '../../services/chunking.service.js';
|
|
11
|
+
import { classifyWebContentType } from '../../services/index.service.js';
|
|
9
12
|
|
|
10
13
|
export function createCrawlCommand(getOptions: () => GlobalOptions): Command {
|
|
11
14
|
return new Command('crawl')
|
|
@@ -27,10 +30,38 @@ export function createCrawlCommand(getOptions: () => GlobalOptions): Command {
|
|
|
27
30
|
const globalOpts = getOptions();
|
|
28
31
|
const services = await createServices(globalOpts.config, globalOpts.dataDir);
|
|
29
32
|
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
33
|
+
// Look up or auto-create web store
|
|
34
|
+
let store: WebStore;
|
|
35
|
+
let storeCreated = false;
|
|
36
|
+
const existingStore = await services.store.getByIdOrName(storeIdOrName);
|
|
37
|
+
|
|
38
|
+
if (!existingStore) {
|
|
39
|
+
// Auto-create web store
|
|
40
|
+
const result = await services.store.create({
|
|
41
|
+
name: storeIdOrName,
|
|
42
|
+
type: 'web',
|
|
43
|
+
url,
|
|
44
|
+
});
|
|
45
|
+
if (!result.success) {
|
|
46
|
+
await destroyServices(services);
|
|
47
|
+
throw new Error(`Failed to create store: ${result.error.message}`);
|
|
48
|
+
}
|
|
49
|
+
// Type narrowing: success check above ensures result.data is Store
|
|
50
|
+
// We know it's a WebStore because we created it with type: 'web'
|
|
51
|
+
const createdStore = result.data;
|
|
52
|
+
if (createdStore.type !== 'web') {
|
|
53
|
+
throw new Error('Unexpected store type after creation');
|
|
54
|
+
}
|
|
55
|
+
store = createdStore;
|
|
56
|
+
storeCreated = true;
|
|
57
|
+
if (globalOpts.quiet !== true && globalOpts.format !== 'json') {
|
|
58
|
+
console.log(`Created web store: ${store.name}`);
|
|
59
|
+
}
|
|
60
|
+
} else if (existingStore.type !== 'web') {
|
|
61
|
+
await destroyServices(services);
|
|
62
|
+
throw new Error(`Store "${storeIdOrName}" exists but is not a web store (type: ${existingStore.type})`);
|
|
63
|
+
} else {
|
|
64
|
+
store = existingStore;
|
|
34
65
|
}
|
|
35
66
|
|
|
36
67
|
const maxPages = cmdOptions.maxPages !== undefined ? parseInt(cmdOptions.maxPages) : 50;
|
|
@@ -47,7 +78,10 @@ export function createCrawlCommand(getOptions: () => GlobalOptions): Command {
|
|
|
47
78
|
}
|
|
48
79
|
|
|
49
80
|
const crawler = new IntelligentCrawler();
|
|
81
|
+
// Use web preset for larger prose-friendly chunks
|
|
82
|
+
const webChunker = ChunkingService.forContentType('web');
|
|
50
83
|
let pagesIndexed = 0;
|
|
84
|
+
let chunksCreated = 0;
|
|
51
85
|
|
|
52
86
|
// Listen for progress events
|
|
53
87
|
crawler.on('progress', (progress: CrawlProgress) => {
|
|
@@ -78,24 +112,40 @@ export function createCrawlCommand(getOptions: () => GlobalOptions): Command {
|
|
|
78
112
|
...(cmdOptions.simple !== undefined && { simple: cmdOptions.simple }),
|
|
79
113
|
useHeadless: cmdOptions.headless ?? false,
|
|
80
114
|
})) {
|
|
81
|
-
//
|
|
82
|
-
const
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
115
|
+
// Use extracted content if available, otherwise markdown
|
|
116
|
+
const contentToProcess = result.extracted !== undefined ? result.extracted : result.markdown;
|
|
117
|
+
|
|
118
|
+
// Chunk the content using markdown-aware chunking (web content is converted to markdown)
|
|
119
|
+
const chunks = webChunker.chunk(contentToProcess, `${result.url}.md`);
|
|
120
|
+
const fileType = classifyWebContentType(result.url, result.title);
|
|
121
|
+
const urlHash = createHash('md5').update(result.url).digest('hex');
|
|
122
|
+
|
|
123
|
+
for (const chunk of chunks) {
|
|
124
|
+
const chunkId = chunks.length > 1
|
|
125
|
+
? `${store.id}-${urlHash}-${String(chunk.chunkIndex)}`
|
|
126
|
+
: `${store.id}-${urlHash}`;
|
|
127
|
+
const vector = await services.embeddings.embed(chunk.content);
|
|
128
|
+
|
|
129
|
+
docs.push({
|
|
130
|
+
id: createDocumentId(chunkId),
|
|
131
|
+
content: chunk.content,
|
|
132
|
+
vector,
|
|
133
|
+
metadata: {
|
|
134
|
+
type: chunks.length > 1 ? 'chunk' : 'web',
|
|
135
|
+
storeId: store.id,
|
|
136
|
+
url: result.url,
|
|
137
|
+
title: result.title,
|
|
138
|
+
extracted: result.extracted !== undefined,
|
|
139
|
+
depth: result.depth,
|
|
140
|
+
indexedAt: new Date(),
|
|
141
|
+
fileType,
|
|
142
|
+
chunkIndex: chunk.chunkIndex,
|
|
143
|
+
totalChunks: chunk.totalChunks,
|
|
144
|
+
sectionHeader: chunk.sectionHeader,
|
|
145
|
+
},
|
|
146
|
+
});
|
|
147
|
+
chunksCreated++;
|
|
148
|
+
}
|
|
99
149
|
|
|
100
150
|
pagesIndexed++;
|
|
101
151
|
}
|
|
@@ -111,8 +161,10 @@ export function createCrawlCommand(getOptions: () => GlobalOptions): Command {
|
|
|
111
161
|
const crawlResult = {
|
|
112
162
|
success: true,
|
|
113
163
|
store: store.name,
|
|
164
|
+
storeCreated,
|
|
114
165
|
url,
|
|
115
166
|
pagesCrawled: pagesIndexed,
|
|
167
|
+
chunksCreated,
|
|
116
168
|
mode: cmdOptions.simple === true ? 'simple' : 'intelligent',
|
|
117
169
|
hadCrawlInstruction: cmdOptions.crawl !== undefined,
|
|
118
170
|
hadExtractInstruction: cmdOptions.extract !== undefined,
|
|
@@ -121,9 +173,9 @@ export function createCrawlCommand(getOptions: () => GlobalOptions): Command {
|
|
|
121
173
|
if (globalOpts.format === 'json') {
|
|
122
174
|
console.log(JSON.stringify(crawlResult, null, 2));
|
|
123
175
|
} else if (spinner !== undefined) {
|
|
124
|
-
spinner.succeed(`Crawled
|
|
176
|
+
spinner.succeed(`Crawled ${String(pagesIndexed)} pages, indexed ${String(chunksCreated)} chunks`);
|
|
125
177
|
} else if (globalOpts.quiet !== true) {
|
|
126
|
-
console.log(`Crawled
|
|
178
|
+
console.log(`Crawled ${String(pagesIndexed)} pages, indexed ${String(chunksCreated)} chunks`);
|
|
127
179
|
}
|
|
128
180
|
} catch (error) {
|
|
129
181
|
const message = `Crawl failed: ${error instanceof Error ? error.message : String(error)}`;
|
|
package/src/cli/commands/store.test.ts
CHANGED
|
@@ -935,7 +935,7 @@ describe('store command execution', () => {
|
|
|
935
935
|
});
|
|
936
936
|
});
|
|
937
937
|
|
|
938
|
-
it('routes source to path for repo stores', async () => {
|
|
938
|
+
it('routes source to path for repo stores with local path', async () => {
|
|
939
939
|
const mockStore: RepoStore = {
|
|
940
940
|
id: createStoreId('store-2'),
|
|
941
941
|
name: 'repo-store',
|
|
@@ -967,6 +967,73 @@ describe('store command execution', () => {
|
|
|
967
967
|
});
|
|
968
968
|
});
|
|
969
969
|
|
|
970
|
+
it('routes URL source to url for repo stores (Bug #1 fix)', async () => {
|
|
971
|
+
const mockStore: RepoStore = {
|
|
972
|
+
id: createStoreId('store-2'),
|
|
973
|
+
name: 'repo-url-store',
|
|
974
|
+
type: 'repo',
|
|
975
|
+
path: '/cloned/repo/path',
|
|
976
|
+
url: 'https://github.com/user/repo',
|
|
977
|
+
createdAt: new Date(),
|
|
978
|
+
updatedAt: new Date(),
|
|
979
|
+
};
|
|
980
|
+
|
|
981
|
+
mockServices.store.create.mockResolvedValue({
|
|
982
|
+
success: true,
|
|
983
|
+
data: mockStore,
|
|
984
|
+
});
|
|
985
|
+
|
|
986
|
+
const command = createStoreCommand(getOptions);
|
|
987
|
+
const createCommand = command.commands.find(c => c.name() === 'create');
|
|
988
|
+
const actionHandler = createCommand?._actionHandler;
|
|
989
|
+
|
|
990
|
+
createCommand.parseOptions(['--type', 'repo', '--source', 'https://github.com/user/repo']);
|
|
991
|
+
await actionHandler!(['repo-url-store']);
|
|
992
|
+
|
|
993
|
+
// URL should be routed to 'url' parameter, not 'path'
|
|
994
|
+
expect(mockServices.store.create).toHaveBeenCalledWith({
|
|
995
|
+
name: 'repo-url-store',
|
|
996
|
+
type: 'repo',
|
|
997
|
+
path: undefined,
|
|
998
|
+
url: 'https://github.com/user/repo',
|
|
999
|
+
description: undefined,
|
|
1000
|
+
tags: undefined,
|
|
1001
|
+
});
|
|
1002
|
+
});
|
|
1003
|
+
|
|
1004
|
+
it('routes http:// URL source to url for repo stores', async () => {
|
|
1005
|
+
const mockStore: RepoStore = {
|
|
1006
|
+
id: createStoreId('store-2'),
|
|
1007
|
+
name: 'repo-http-store',
|
|
1008
|
+
type: 'repo',
|
|
1009
|
+
path: '/cloned/repo/path',
|
|
1010
|
+
url: 'http://internal-git.example.com/repo',
|
|
1011
|
+
createdAt: new Date(),
|
|
1012
|
+
updatedAt: new Date(),
|
|
1013
|
+
};
|
|
1014
|
+
|
|
1015
|
+
mockServices.store.create.mockResolvedValue({
|
|
1016
|
+
success: true,
|
|
1017
|
+
data: mockStore,
|
|
1018
|
+
});
|
|
1019
|
+
|
|
1020
|
+
const command = createStoreCommand(getOptions);
|
|
1021
|
+
const createCommand = command.commands.find(c => c.name() === 'create');
|
|
1022
|
+
const actionHandler = createCommand?._actionHandler;
|
|
1023
|
+
|
|
1024
|
+
createCommand.parseOptions(['--type', 'repo', '--source', 'http://internal-git.example.com/repo']);
|
|
1025
|
+
await actionHandler!(['repo-http-store']);
|
|
1026
|
+
|
|
1027
|
+
expect(mockServices.store.create).toHaveBeenCalledWith({
|
|
1028
|
+
name: 'repo-http-store',
|
|
1029
|
+
type: 'repo',
|
|
1030
|
+
path: undefined,
|
|
1031
|
+
url: 'http://internal-git.example.com/repo',
|
|
1032
|
+
description: undefined,
|
|
1033
|
+
tags: undefined,
|
|
1034
|
+
});
|
|
1035
|
+
});
|
|
1036
|
+
|
|
970
1037
|
it('routes source to url for web stores', async () => {
|
|
971
1038
|
const mockStore: WebStore = {
|
|
972
1039
|
id: createStoreId('store-3'),
|
|
package/src/cli/commands/store.ts
CHANGED
|
@@ -54,12 +54,15 @@ export function createStoreCommand(getOptions: () => GlobalOptions): Command {
|
|
|
54
54
|
}) => {
|
|
55
55
|
const globalOpts = getOptions();
|
|
56
56
|
const services = await createServices(globalOpts.config, globalOpts.dataDir);
|
|
57
|
+
let exitCode = 0;
|
|
57
58
|
try {
|
|
59
|
+
// Detect if source is a URL (for repo stores that should clone from remote)
|
|
60
|
+
const isUrl = options.source.startsWith('http://') || options.source.startsWith('https://');
|
|
58
61
|
const result = await services.store.create({
|
|
59
62
|
name,
|
|
60
63
|
type: options.type,
|
|
61
|
-
path: options.type
|
|
62
|
-
url: options.type === 'web' ? options.source : undefined,
|
|
64
|
+
path: options.type === 'file' || (options.type === 'repo' && !isUrl) ? options.source : undefined,
|
|
65
|
+
url: options.type === 'web' || (options.type === 'repo' && isUrl) ? options.source : undefined,
|
|
63
66
|
description: options.description,
|
|
64
67
|
tags: options.tags?.split(',').map((t) => t.trim()),
|
|
65
68
|
});
|
|
@@ -72,11 +75,14 @@ export function createStoreCommand(getOptions: () => GlobalOptions): Command {
|
|
|
72
75
|
}
|
|
73
76
|
} else {
|
|
74
77
|
console.error(`Error: ${result.error.message}`);
|
|
75
|
-
|
|
78
|
+
exitCode = 1;
|
|
76
79
|
}
|
|
77
80
|
} finally {
|
|
78
81
|
await destroyServices(services);
|
|
79
82
|
}
|
|
83
|
+
if (exitCode !== 0) {
|
|
84
|
+
process.exit(exitCode);
|
|
85
|
+
}
|
|
80
86
|
});
|
|
81
87
|
|
|
82
88
|
store
|
|
package/src/crawl/article-converter.ts
CHANGED
|
@@ -7,6 +7,9 @@ import { extractFromHtml } from '@extractus/article-extractor';
|
|
|
7
7
|
import TurndownService from 'turndown';
|
|
8
8
|
import { gfm } from 'turndown-plugin-gfm';
|
|
9
9
|
import { preprocessHtmlForCodeBlocks, cleanupMarkdown } from './markdown-utils.js';
|
|
10
|
+
import { createLogger, truncateForLog } from '../logging/index.js';
|
|
11
|
+
|
|
12
|
+
const logger = createLogger('article-converter');
|
|
10
13
|
|
|
11
14
|
export interface ConversionResult {
|
|
12
15
|
markdown: string;
|
|
@@ -28,6 +31,8 @@ export async function convertHtmlToMarkdown(
|
|
|
28
31
|
html: string,
|
|
29
32
|
url: string,
|
|
30
33
|
): Promise<ConversionResult> {
|
|
34
|
+
logger.debug({ url, htmlLength: html.length }, 'Starting HTML conversion');
|
|
35
|
+
|
|
31
36
|
try {
|
|
32
37
|
// Step 1: Extract main article content
|
|
33
38
|
let articleHtml: string;
|
|
@@ -38,13 +43,25 @@ export async function convertHtmlToMarkdown(
|
|
|
38
43
|
if (article !== null && article.content !== undefined && article.content !== '') {
|
|
39
44
|
articleHtml = article.content;
|
|
40
45
|
title = article.title !== undefined && article.title !== '' ? article.title : undefined;
|
|
46
|
+
logger.debug({
|
|
47
|
+
url,
|
|
48
|
+
title,
|
|
49
|
+
extractedLength: articleHtml.length,
|
|
50
|
+
usedFullHtml: false,
|
|
51
|
+
}, 'Article content extracted');
|
|
41
52
|
} else {
|
|
42
53
|
// Fallback to full HTML if extraction fails
|
|
43
54
|
articleHtml = html;
|
|
55
|
+
logger.debug({ url, usedFullHtml: true }, 'Article extraction returned empty, using full HTML');
|
|
44
56
|
}
|
|
45
|
-
} catch {
|
|
57
|
+
} catch (extractError) {
|
|
46
58
|
// Fallback to full HTML if extraction fails
|
|
47
59
|
articleHtml = html;
|
|
60
|
+
logger.debug({
|
|
61
|
+
url,
|
|
62
|
+
usedFullHtml: true,
|
|
63
|
+
error: extractError instanceof Error ? extractError.message : String(extractError),
|
|
64
|
+
}, 'Article extraction failed, using full HTML');
|
|
48
65
|
}
|
|
49
66
|
|
|
50
67
|
// Step 2: Preprocess HTML for code blocks
|
|
@@ -83,12 +100,30 @@ export async function convertHtmlToMarkdown(
|
|
|
83
100
|
// Step 4: Cleanup markdown with comprehensive regex patterns
|
|
84
101
|
const markdown = cleanupMarkdown(rawMarkdown);
|
|
85
102
|
|
|
103
|
+
logger.debug({
|
|
104
|
+
url,
|
|
105
|
+
title,
|
|
106
|
+
rawMarkdownLength: rawMarkdown.length,
|
|
107
|
+
finalMarkdownLength: markdown.length,
|
|
108
|
+
}, 'HTML to markdown conversion complete');
|
|
109
|
+
|
|
110
|
+
// Log markdown preview at trace level
|
|
111
|
+
logger.trace({
|
|
112
|
+
url,
|
|
113
|
+
markdownPreview: truncateForLog(markdown, 1000),
|
|
114
|
+
}, 'Markdown content preview');
|
|
115
|
+
|
|
86
116
|
return {
|
|
87
117
|
markdown,
|
|
88
118
|
...(title !== undefined && { title }),
|
|
89
119
|
success: true,
|
|
90
120
|
};
|
|
91
121
|
} catch (error) {
|
|
122
|
+
logger.error({
|
|
123
|
+
url,
|
|
124
|
+
error: error instanceof Error ? error.message : String(error),
|
|
125
|
+
}, 'HTML to markdown conversion failed');
|
|
126
|
+
|
|
92
127
|
return {
|
|
93
128
|
markdown: '',
|
|
94
129
|
success: false,
|
package/src/crawl/bridge.ts
CHANGED
|
@@ -11,6 +11,9 @@ import {
|
|
|
11
11
|
validateHeadlessResult,
|
|
12
12
|
validateParsePythonResult,
|
|
13
13
|
} from './schemas.js';
|
|
14
|
+
import { createLogger } from '../logging/index.js';
|
|
15
|
+
|
|
16
|
+
const logger = createLogger('python-bridge');
|
|
14
17
|
|
|
15
18
|
// Re-export for backwards compatibility
|
|
16
19
|
export type { CrawledLink, ParsePythonResult };
|
|
@@ -32,24 +35,26 @@ export class PythonBridge {
|
|
|
32
35
|
start(): Promise<void> {
|
|
33
36
|
if (this.process) return Promise.resolve();
|
|
34
37
|
|
|
38
|
+
logger.debug('Starting Python bridge process');
|
|
39
|
+
|
|
35
40
|
this.process = spawn('python3', ['python/crawl_worker.py'], {
|
|
36
41
|
stdio: ['pipe', 'pipe', 'pipe'],
|
|
37
42
|
});
|
|
38
43
|
|
|
39
44
|
// Add error handler for process spawn errors
|
|
40
45
|
this.process.on('error', (err) => {
|
|
41
|
-
|
|
46
|
+
logger.error({ error: err.message, stack: err.stack }, 'Python bridge process error');
|
|
42
47
|
this.rejectAllPending(new Error(`Process error: ${err.message}`));
|
|
43
48
|
});
|
|
44
49
|
|
|
45
50
|
// Add exit handler to detect non-zero exits
|
|
46
51
|
this.process.on('exit', (code, signal) => {
|
|
47
52
|
if (code !== 0 && code !== null) {
|
|
48
|
-
|
|
53
|
+
logger.error({ code }, 'Python bridge process exited with non-zero code');
|
|
49
54
|
this.rejectAllPending(new Error(`Process exited with code ${String(code)}`));
|
|
50
55
|
} else if (signal && !this.stoppingIntentionally) {
|
|
51
56
|
// Only log if we didn't intentionally stop the process
|
|
52
|
-
|
|
57
|
+
logger.error({ signal }, 'Python bridge process killed with signal');
|
|
53
58
|
this.rejectAllPending(new Error(`Process killed with signal ${signal}`));
|
|
54
59
|
}
|
|
55
60
|
this.process = null;
|
|
@@ -60,7 +65,7 @@ export class PythonBridge {
|
|
|
60
65
|
if (this.process.stderr) {
|
|
61
66
|
const stderrRl = createInterface({ input: this.process.stderr });
|
|
62
67
|
stderrRl.on('line', (line) => {
|
|
63
|
-
|
|
68
|
+
logger.warn({ stderr: line }, 'Python bridge stderr output');
|
|
64
69
|
});
|
|
65
70
|
}
|
|
66
71
|
|
|
@@ -106,11 +111,14 @@ export class PythonBridge {
|
|
|
106
111
|
} catch (error: unknown) {
|
|
107
112
|
// Log validation failure with original response for debugging
|
|
108
113
|
if (error instanceof ZodError) {
|
|
109
|
-
|
|
110
|
-
|
|
114
|
+
logger.error({
|
|
115
|
+
issues: error.issues,
|
|
116
|
+
response: JSON.stringify(response.result),
|
|
117
|
+
}, 'Python bridge response validation failed');
|
|
111
118
|
pending.reject(new Error(`Invalid response format from Python bridge: ${error.message}`));
|
|
112
119
|
} else {
|
|
113
120
|
const errorMessage = error instanceof Error ? error.message : String(error);
|
|
121
|
+
logger.error({ error: errorMessage }, 'Response validation error');
|
|
114
122
|
pending.reject(new Error(`Response validation error: ${errorMessage}`));
|
|
115
123
|
}
|
|
116
124
|
}
|
|
@@ -118,7 +126,10 @@ export class PythonBridge {
|
|
|
118
126
|
// If neither result nor error, leave pending (will timeout)
|
|
119
127
|
}
|
|
120
128
|
} catch (err) {
|
|
121
|
-
|
|
129
|
+
logger.error({
|
|
130
|
+
error: err instanceof Error ? err.message : String(err),
|
|
131
|
+
line,
|
|
132
|
+
}, 'Failed to parse JSON response from Python bridge');
|
|
122
133
|
}
|
|
123
134
|
});
|
|
124
135
|
|