bluera-knowledge 0.9.43 → 0.10.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/plugin.json +1 -1
- package/CHANGELOG.md +75 -0
- package/README.md +114 -42
- package/commands/sync.md +96 -0
- package/dist/{chunk-MQE32YY6.js → chunk-6U45VP5Z.js} +42 -6
- package/dist/chunk-6U45VP5Z.js.map +1 -0
- package/dist/{chunk-CUHYSPRV.js → chunk-DP5XBPQV.js} +372 -2
- package/dist/chunk-DP5XBPQV.js.map +1 -0
- package/dist/{chunk-DWAIT2OD.js → chunk-UE4ZIJYA.js} +74 -5
- package/dist/{chunk-DWAIT2OD.js.map → chunk-UE4ZIJYA.js.map} +1 -1
- package/dist/index.js +216 -7
- package/dist/index.js.map +1 -1
- package/dist/mcp/server.js +2 -2
- package/dist/workers/background-worker-cli.js +4 -3
- package/dist/workers/background-worker-cli.js.map +1 -1
- package/hooks/check-dependencies.sh +29 -0
- package/package.json +1 -1
- package/python/crawl_worker.py +6 -1
- package/src/cli/commands/crawl.test.ts +43 -3
- package/src/cli/commands/crawl.ts +3 -3
- package/src/cli/commands/sync.test.ts +54 -0
- package/src/cli/commands/sync.ts +264 -0
- package/src/cli/index.ts +1 -0
- package/src/crawl/claude-client.test.ts +195 -24
- package/src/crawl/claude-client.ts +38 -3
- package/src/crawl/intelligent-crawler.test.ts +65 -0
- package/src/crawl/intelligent-crawler.ts +14 -2
- package/src/index.ts +2 -0
- package/src/mcp/commands/index.ts +2 -0
- package/src/mcp/commands/sync.commands.test.ts +283 -0
- package/src/mcp/commands/sync.commands.ts +233 -0
- package/src/services/gitignore.service.test.ts +157 -0
- package/src/services/gitignore.service.ts +132 -0
- package/src/services/store-definition.service.test.ts +440 -0
- package/src/services/store-definition.service.ts +198 -0
- package/src/services/store.service.test.ts +279 -1
- package/src/services/store.service.ts +101 -4
- package/src/types/index.ts +18 -0
- package/src/types/store-definition.test.ts +492 -0
- package/src/types/store-definition.ts +129 -0
- package/src/workers/background-worker.ts +1 -1
- package/dist/chunk-CUHYSPRV.js.map +0 -1
- package/dist/chunk-MQE32YY6.js.map +0 -1
package/src/crawl/claude-client.test.ts
CHANGED

@@ -81,7 +81,11 @@ describe('ClaudeClient', () => {
 
   describe('determineCrawlUrls', () => {
     it('should successfully parse valid crawl strategy response', async () => {
-      const promise = client.determineCrawlUrls(
+      const promise = client.determineCrawlUrls(
+        'https://example.com',
+        '<html>test</html>',
+        'Find all docs'
+      );
 
       // Simulate successful response
       setTimeout(() => {
@@ -102,8 +106,68 @@ describe('ClaudeClient', () => {
       expect(result.reasoning).toBe('Found documentation pages');
     });
 
+    it('should extract structured_output from Claude CLI wrapper format', async () => {
+      const promise = client.determineCrawlUrls(
+        'https://example.com',
+        '<html>test</html>',
+        'Find all docs'
+      );
+
+      // Claude CLI with --json-schema returns this wrapper format
+      setTimeout(() => {
+        mockProcess.stdout.emit(
+          'data',
+          Buffer.from(
+            JSON.stringify({
+              type: 'result',
+              subtype: 'success',
+              result: '',
+              structured_output: {
+                urls: ['https://example.com/page1', 'https://example.com/page2'],
+                reasoning: 'Found documentation pages',
+              },
+            })
+          )
+        );
+        mockProcess.emit('close', 0);
+      }, 10);
+
+      const result = await promise;
+      expect(result.urls).toEqual(['https://example.com/page1', 'https://example.com/page2']);
+      expect(result.reasoning).toBe('Found documentation pages');
+    });
+
+    it('should fall back to raw response when structured_output is not an object', async () => {
+      const promise = client.determineCrawlUrls(
+        'https://example.com',
+        '<html>test</html>',
+        'Find all docs'
+      );
+
+      // When structured_output is not an object, use the raw response
+      // (which will fail validation if it doesn't have urls/reasoning)
+      setTimeout(() => {
+        mockProcess.stdout.emit(
+          'data',
+          Buffer.from(
+            JSON.stringify({
+              type: 'result',
+              structured_output: 'not an object',
+            })
+          )
+        );
+        mockProcess.emit('close', 0);
+      }, 10);
+
+      await expect(promise).rejects.toThrow('invalid crawl strategy');
+    });
+
     it('should call spawn with correct arguments for determineCrawlUrls', async () => {
-      const promise = client.determineCrawlUrls(
+      const promise = client.determineCrawlUrls(
+        'https://example.com',
+        '<html>test</html>',
+        'Find all docs'
+      );
 
       setTimeout(() => {
         mockProcess.stdout.emit(
@@ -136,7 +200,11 @@ describe('ClaudeClient', () => {
     });
 
     it('should write prompt to stdin', async () => {
-      const promise = client.determineCrawlUrls(
+      const promise = client.determineCrawlUrls(
+        'https://example.com',
+        '<html><body>Test</body></html>',
+        'Find tutorials'
+      );
 
       setTimeout(() => {
         mockProcess.stdout.emit(
@@ -160,7 +228,11 @@ describe('ClaudeClient', () => {
     });
 
     it('should reject when response has no urls array', async () => {
-      const promise = client.determineCrawlUrls(
+      const promise = client.determineCrawlUrls(
+        'https://example.com',
+        '<html>test</html>',
+        'Find all'
+      );
 
       setTimeout(() => {
         mockProcess.stdout.emit(
@@ -178,7 +250,11 @@ describe('ClaudeClient', () => {
     });
 
     it('should reject when response has empty urls array', async () => {
-      const promise = client.determineCrawlUrls(
+      const promise = client.determineCrawlUrls(
+        'https://example.com',
+        '<html>test</html>',
+        'Find all'
+      );
 
       setTimeout(() => {
         mockProcess.stdout.emit(
@@ -197,7 +273,11 @@ describe('ClaudeClient', () => {
     });
 
     it('should reject when response has no reasoning', async () => {
-      const promise = client.determineCrawlUrls(
+      const promise = client.determineCrawlUrls(
+        'https://example.com',
+        '<html>test</html>',
+        'Find all'
+      );
 
       setTimeout(() => {
         mockProcess.stdout.emit(
@@ -215,7 +295,11 @@ describe('ClaudeClient', () => {
     });
 
     it('should reject when urls contains non-string values', async () => {
-      const promise = client.determineCrawlUrls(
+      const promise = client.determineCrawlUrls(
+        'https://example.com',
+        '<html>test</html>',
+        'Find all'
+      );
 
       setTimeout(() => {
         mockProcess.stdout.emit(
@@ -234,7 +318,11 @@ describe('ClaudeClient', () => {
     });
 
     it('should reject when response is not valid JSON', async () => {
-      const promise = client.determineCrawlUrls(
+      const promise = client.determineCrawlUrls(
+        'https://example.com',
+        '<html>test</html>',
+        'Find all'
+      );
 
       setTimeout(() => {
         mockProcess.stdout.emit('data', Buffer.from('Not valid JSON'));
@@ -245,7 +333,11 @@ describe('ClaudeClient', () => {
     });
 
     it('should reject when response is null', async () => {
-      const promise = client.determineCrawlUrls(
+      const promise = client.determineCrawlUrls(
+        'https://example.com',
+        '<html>test</html>',
+        'Find all'
+      );
 
       setTimeout(() => {
         mockProcess.stdout.emit('data', Buffer.from('null'));
@@ -257,7 +349,7 @@ describe('ClaudeClient', () => {
 
     it('should truncate HTML longer than 50000 characters', async () => {
       const longHtml = '<html>' + 'a'.repeat(60000) + '</html>';
-      const promise = client.determineCrawlUrls(longHtml, 'Find all');
+      const promise = client.determineCrawlUrls('https://example.com', longHtml, 'Find all');
 
       setTimeout(() => {
         mockProcess.stdout.emit(
@@ -281,7 +373,7 @@ describe('ClaudeClient', () => {
 
     it('should not truncate HTML shorter than 50000 characters', async () => {
       const shortHtml = '<html><body>Short content</body></html>';
-      const promise = client.determineCrawlUrls(shortHtml, 'Find all');
+      const promise = client.determineCrawlUrls('https://example.com', shortHtml, 'Find all');
 
       setTimeout(() => {
         mockProcess.stdout.emit(
@@ -302,6 +394,33 @@ describe('ClaudeClient', () => {
       expect(writtenPrompt).toContain(shortHtml);
       expect(writtenPrompt).not.toContain('[... HTML truncated ...]');
     });
+
+    it('should include seedUrl in prompt for relative URL resolution', async () => {
+      const promise = client.determineCrawlUrls(
+        'https://code.claude.com/docs',
+        '<html><a href="/docs/en/hooks">Hooks</a></html>',
+        'Find all docs'
+      );
+
+      setTimeout(() => {
+        mockProcess.stdout.emit(
+          'data',
+          Buffer.from(
+            JSON.stringify({
+              urls: ['https://code.claude.com/docs/en/hooks'],
+              reasoning: 'Found hooks documentation',
+            })
+          )
+        );
+        mockProcess.emit('close', 0);
+      }, 10);
+
+      await promise;
+
+      const writtenPrompt = vi.mocked(mockProcess.stdin.write).mock.calls[0]?.[0] as string;
+      expect(writtenPrompt).toContain('Base URL: https://code.claude.com/docs');
+      expect(writtenPrompt).toContain('resolve them against the Base URL');
+    });
   });
 
   describe('extractContent', () => {
@@ -386,7 +505,11 @@ describe('ClaudeClient', () => {
 
   describe('Subprocess Management', () => {
     it('should handle process spawn errors', async () => {
-      const promise = client.determineCrawlUrls(
+      const promise = client.determineCrawlUrls(
+        'https://example.com',
+        '<html>test</html>',
+        'Find all'
+      );
 
       setTimeout(() => {
         mockProcess.emit('error', new Error('spawn ENOENT'));
@@ -410,7 +533,11 @@ describe('ClaudeClient', () => {
     });
 
     it('should collect stderr data', async () => {
-      const promise = client.determineCrawlUrls(
+      const promise = client.determineCrawlUrls(
+        'https://example.com',
+        '<html>test</html>',
+        'Find all'
+      );
 
       setTimeout(() => {
         mockProcess.stderr.emit('data', Buffer.from('Error message 1\n'));
@@ -466,14 +593,22 @@ describe('ClaudeClient', () => {
 
   describe('Timeout Handling', () => {
     it('should timeout after configured timeout period', async () => {
-      const promise = client.determineCrawlUrls(
+      const promise = client.determineCrawlUrls(
+        'https://example.com',
+        '<html>test</html>',
+        'Find all'
+      );
 
       // Don't emit close event - let it timeout
       await expect(promise).rejects.toThrow('timed out after 100ms');
     });
 
     it('should kill process on timeout', async () => {
-      const promise = client.determineCrawlUrls(
+      const promise = client.determineCrawlUrls(
+        'https://example.com',
+        '<html>test</html>',
+        'Find all'
+      );
 
       await expect(promise).rejects.toThrow('timed out');
       expect(mockProcess.kill).toHaveBeenCalledWith('SIGTERM');
@@ -540,7 +675,11 @@ describe('ClaudeClient', () => {
 
   describe('JSON Parsing', () => {
     it('should handle malformed JSON in response', async () => {
-      const promise = client.determineCrawlUrls(
+      const promise = client.determineCrawlUrls(
+        'https://example.com',
+        '<html>test</html>',
+        'Find all'
+      );
 
       setTimeout(() => {
         mockProcess.stdout.emit('data', Buffer.from('{ invalid json }'));
@@ -551,7 +690,11 @@ describe('ClaudeClient', () => {
     });
 
     it('should handle incomplete JSON in response', async () => {
-      const promise = client.determineCrawlUrls(
+      const promise = client.determineCrawlUrls(
+        'https://example.com',
+        '<html>test</html>',
+        'Find all'
+      );
 
       setTimeout(() => {
         mockProcess.stdout.emit('data', Buffer.from('{"urls": ["https://example.com"'));
@@ -562,7 +705,11 @@ describe('ClaudeClient', () => {
     });
 
     it('should handle JSON with extra whitespace', async () => {
-      const promise = client.determineCrawlUrls(
+      const promise = client.determineCrawlUrls(
+        'https://example.com',
+        '<html>test</html>',
+        'Find all'
+      );
 
       setTimeout(() => {
         mockProcess.stdout.emit(
@@ -582,7 +729,11 @@ describe('ClaudeClient', () => {
     });
 
     it('should handle JSON arrays as invalid for determineCrawlUrls', async () => {
-      const promise = client.determineCrawlUrls(
+      const promise = client.determineCrawlUrls(
+        'https://example.com',
+        '<html>test</html>',
+        'Find all'
+      );
 
       setTimeout(() => {
         mockProcess.stdout.emit('data', Buffer.from('[]'));
@@ -593,7 +744,11 @@ describe('ClaudeClient', () => {
     });
 
     it('should handle JSON primitives as invalid for determineCrawlUrls', async () => {
-      const promise = client.determineCrawlUrls(
+      const promise = client.determineCrawlUrls(
+        'https://example.com',
+        '<html>test</html>',
+        'Find all'
+      );
 
       setTimeout(() => {
         mockProcess.stdout.emit('data', Buffer.from('"string response"'));
@@ -606,7 +761,11 @@ describe('ClaudeClient', () => {
 
   describe('Response Validation', () => {
     it('should validate urls is an array', async () => {
-      const promise = client.determineCrawlUrls(
+      const promise = client.determineCrawlUrls(
+        'https://example.com',
+        '<html>test</html>',
+        'Find all'
+      );
 
       setTimeout(() => {
         mockProcess.stdout.emit(
@@ -625,7 +784,11 @@ describe('ClaudeClient', () => {
     });
 
     it('should validate reasoning is a string', async () => {
-      const promise = client.determineCrawlUrls(
+      const promise = client.determineCrawlUrls(
+        'https://example.com',
+        '<html>test</html>',
+        'Find all'
+      );
 
       setTimeout(() => {
         mockProcess.stdout.emit(
@@ -644,7 +807,11 @@ describe('ClaudeClient', () => {
     });
 
     it('should accept valid response with multiple URLs', async () => {
-      const promise = client.determineCrawlUrls(
+      const promise = client.determineCrawlUrls(
+        'https://example.com',
+        '<html>test</html>',
+        'Find all'
+      );
 
       setTimeout(() => {
         mockProcess.stdout.emit(
@@ -671,7 +838,11 @@ describe('ClaudeClient', () => {
 
   describe('Error Messages', () => {
     it('should wrap errors with context for determineCrawlUrls', async () => {
-      const promise = client.determineCrawlUrls(
+      const promise = client.determineCrawlUrls(
+        'https://example.com',
+        '<html>test</html>',
+        'Find all'
+      );
 
       setTimeout(() => {
         mockProcess.emit('close', 1);
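The last new test above pins down why determineCrawlUrls gained a seedUrl parameter: relative hrefs in the seed page only become crawlable absolute URLs if the model knows the base. The resolution the prompt asks Claude to perform is the standard WHATWG URL behavior; a minimal TypeScript sketch (resolveAgainstBase is a hypothetical helper for illustration, not part of this package, which delegates the resolution to Claude via the prompt):

// Resolve a relative href against a base URL, as the updated prompt instructs.
function resolveAgainstBase(href: string, baseUrl: string): string {
  return new URL(href, baseUrl).href;
}

console.log(resolveAgainstBase('/docs/hooks', 'https://example.com/docs'));
// -> https://example.com/docs/hooks (matches the example in the prompt text)
console.log(resolveAgainstBase('/docs/en/hooks', 'https://code.claude.com/docs'));
// -> https://code.claude.com/docs/en/hooks (matches the new test fixture)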
package/src/crawl/claude-client.ts
CHANGED

@@ -69,25 +69,38 @@ export class ClaudeClient {
   /**
    * Determine which URLs to crawl based on natural language instruction
    *
+   * @param seedUrl - The URL of the seed page (for resolving relative URLs)
    * @param seedHtml - HTML content of the seed page
    * @param instruction - Natural language crawl instruction (e.g., "scrape all Getting Started pages")
    * @returns List of URLs to crawl with reasoning
    */
-  async determineCrawlUrls(
+  async determineCrawlUrls(
+    seedUrl: string,
+    seedHtml: string,
+    instruction: string
+  ): Promise<CrawlStrategy> {
     const prompt = `You are analyzing a webpage to determine which pages to crawl based on the user's instruction.
 
+Base URL: ${seedUrl}
+
 Instruction: ${instruction}
 
 Webpage HTML (analyze the navigation structure, links, and content):
 ${this.truncateHtml(seedHtml, 50000)}
 
-Based on the instruction, extract and return a list of absolute URLs that should be crawled.
+Based on the instruction, extract and return a list of absolute URLs that should be crawled. When you encounter relative URLs (starting with "/" or without a protocol), resolve them against the Base URL. For example, if Base URL is "https://example.com/docs" and you see href="/docs/hooks", return "https://example.com/docs/hooks".
+
+Look for navigation menus, sidebars, headers, and link structures that match the instruction.
 
 Return only URLs that are relevant to the instruction. If the instruction mentions specific sections (e.g., "Getting Started"), find links in those sections.`;
 
     try {
       const result = await this.callClaude(prompt, CRAWL_STRATEGY_SCHEMA);
-      const
+      const rawParsed: unknown = JSON.parse(result);
+
+      // Claude CLI with --json-schema returns wrapper: {type, result, structured_output: {...}}
+      // Extract structured_output if present, otherwise use raw response
+      const parsed = this.extractStructuredOutput(rawParsed);
 
       // Validate and narrow type
       if (
@@ -223,4 +236,26 @@ ${this.truncateMarkdown(markdown, 100000)}`;
 
     return `${markdown.substring(0, maxLength)}\n\n[... content truncated ...]`;
   }
+
+  /**
+   * Type guard to check if value is a record (plain object)
+   */
+  private isRecord(value: unknown): value is Record<string, unknown> {
+    return typeof value === 'object' && value !== null && !Array.isArray(value);
+  }
+
+  /**
+   * Extract structured_output from Claude CLI wrapper format if present.
+   * Claude CLI with --json-schema returns: {type, result, structured_output: {...}}
+   * This method extracts the inner structured_output, or returns the raw value if not wrapped.
+   */
+  private extractStructuredOutput(rawParsed: unknown): unknown {
+    if (this.isRecord(rawParsed) && 'structured_output' in rawParsed) {
+      const structuredOutput = rawParsed['structured_output'];
+      if (typeof structuredOutput === 'object') {
+        return structuredOutput;
+      }
+    }
+    return rawParsed;
+  }
 }
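To see what extractStructuredOutput buys, compare the two response shapes the tests exercise. A self-contained sketch of the same unwrapping logic (re-implemented here for illustration; the package keeps it as a private method on ClaudeClient):

// Unwrap the Claude CLI --json-schema envelope, mirroring extractStructuredOutput.
function unwrap(raw: unknown): unknown {
  const isRecord = (v: unknown): v is Record<string, unknown> =>
    typeof v === 'object' && v !== null && !Array.isArray(v);
  if (isRecord(raw) && 'structured_output' in raw) {
    const inner = raw['structured_output'];
    if (typeof inner === 'object') return inner; // wrapped: return the payload
  }
  return raw; // bare or malformed: fall through to schema validation
}

unwrap({ type: 'result', result: '', structured_output: { urls: ['https://example.com/a'], reasoning: 'ok' } });
// -> { urls: [...], reasoning: 'ok' }
unwrap({ type: 'result', structured_output: 'not an object' });
// -> the whole wrapper, which then fails urls/reasoning validation ("invalid crawl strategy")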
package/src/crawl/intelligent-crawler.test.ts
CHANGED

@@ -971,4 +971,69 @@ describe('IntelligentCrawler', () => {
       expect(mockClaudeClient.extractContent).not.toHaveBeenCalled();
     });
   });
+
+  describe('Single-Page Crawl Warning', () => {
+    it('should emit warning when only 1 page crawled with maxPages > 1', async () => {
+      // Mock no links found - single page crawl
+      mockPythonBridge.crawl.mockResolvedValue({ pages: [{ links: [] }] });
+
+      const results = [];
+      for await (const result of crawler.crawl('https://example.com', {
+        simple: true,
+        maxPages: 50,
+      })) {
+        results.push(result);
+      }
+
+      expect(results).toHaveLength(1);
+      const warningEvents = progressEvents.filter(
+        (e) => e.type === 'error' && e.message?.includes('Only crawled 1 page')
+      );
+      expect(warningEvents).toHaveLength(1);
+      expect(warningEvents[0]?.message).toContain('--fast');
+      expect(warningEvents[0]?.message).toContain('maxPages=50');
+    });
+
+    it('should NOT emit warning when maxPages is 1', async () => {
+      mockPythonBridge.crawl.mockResolvedValue({ pages: [{ links: [] }] });
+
+      const results = [];
+      for await (const result of crawler.crawl('https://example.com', {
+        simple: true,
+        maxPages: 1,
+      })) {
+        results.push(result);
+      }
+
+      expect(results).toHaveLength(1);
+      const warningEvents = progressEvents.filter(
+        (e) => e.type === 'error' && e.message?.includes('Only crawled 1 page')
+      );
+      expect(warningEvents).toHaveLength(0);
+    });
+
+    it('should NOT emit warning when multiple pages crawled', async () => {
+      mockPythonBridge.crawl
+        .mockResolvedValueOnce({ pages: [{ links: ['https://example.com/page2'] }] })
+        .mockResolvedValueOnce({ pages: [{ links: [] }] });
+
+      vi.mocked(axios.get)
+        .mockResolvedValueOnce({ data: '<html><body>Page1</body></html>' })
+        .mockResolvedValueOnce({ data: '<html><body>Page2</body></html>' });
+
+      const results = [];
+      for await (const result of crawler.crawl('https://example.com', {
+        simple: true,
+        maxPages: 50,
+      })) {
+        results.push(result);
+      }
+
+      expect(results).toHaveLength(2);
+      const warningEvents = progressEvents.filter(
+        (e) => e.type === 'error' && e.message?.includes('Only crawled 1 page')
+      );
+      expect(warningEvents).toHaveLength(0);
+    });
+  });
 });
package/src/crawl/intelligent-crawler.ts
CHANGED

@@ -109,6 +109,18 @@ export class IntelligentCrawler extends EventEmitter {
       'Crawl complete'
     );
 
+    // Warn if crawl discovered far fewer pages than requested
+    if (this.visited.size === 1 && maxPages > 1) {
+      const warningProgress: CrawlProgress = {
+        type: 'error',
+        pagesVisited: this.visited.size,
+        totalPages: maxPages,
+        message: `Warning: Only crawled 1 page despite maxPages=${String(maxPages)}. Link discovery may have failed. If using --fast mode, try without it for JavaScript-heavy sites.`,
+        error: new Error('Low page discovery'),
+      };
+      this.emit('progress', warningProgress);
+    }
+
     const completeProgress: CrawlProgress = {
       type: 'complete',
       pagesVisited: this.visited.size,
@@ -157,8 +169,8 @@ export class IntelligentCrawler extends EventEmitter {
 
     const seedHtml = await this.fetchHtml(seedUrl, useHeadless);
 
-    // Step 2: Ask Claude which URLs to crawl
-    strategy = await this.claudeClient.determineCrawlUrls(seedHtml, crawlInstruction);
+    // Step 2: Ask Claude which URLs to crawl (pass seedUrl for relative URL resolution)
+    strategy = await this.claudeClient.determineCrawlUrls(seedUrl, seedHtml, crawlInstruction);
 
     const strategyCompleteProgress: CrawlProgress = {
       type: 'strategy',
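The warning rides the existing 'progress' event channel with type 'error', so downstream consumers need no new API. A runnable sketch of a listener (the CrawlProgress shape mirrors the fields used in the diff; the bare EventEmitter stands in for an IntelligentCrawler instance):

import { EventEmitter } from 'node:events';

// Assumed shape, reconstructed from the fields set in warningProgress above.
interface CrawlProgress {
  type: string;
  pagesVisited: number;
  totalPages: number;
  message?: string;
  error?: Error;
}

const crawler = new EventEmitter(); // placeholder for an IntelligentCrawler

crawler.on('progress', (p: CrawlProgress) => {
  if (p.type === 'error' && p.message?.includes('Only crawled 1 page')) {
    console.warn(p.message); // e.g. suggest retrying without --fast on JS-heavy sites
  }
});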
package/src/index.ts
CHANGED
@@ -16,6 +16,7 @@ import { createSearchCommand } from './cli/commands/search.js';
 import { createServeCommand } from './cli/commands/serve.js';
 import { createSetupCommand } from './cli/commands/setup.js';
 import { createStoreCommand } from './cli/commands/store.js';
+import { createSyncCommand } from './cli/commands/sync.js';
 import { createProgram, getGlobalOptions } from './cli/program.js';
 
 // Default paths
@@ -105,6 +106,7 @@ program.addCommand(createIndexCommand(() => getGlobalOptions(program)));
 program.addCommand(createServeCommand(() => getGlobalOptions(program)));
 program.addCommand(createCrawlCommand(() => getGlobalOptions(program)));
 program.addCommand(createSetupCommand(() => getGlobalOptions(program)));
+program.addCommand(createSyncCommand(() => getGlobalOptions(program)));
 program.addCommand(createMCPCommand(() => getGlobalOptions(program)));
 
 // Show comprehensive help when no arguments provided
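createSyncCommand follows the same factory pattern as the other subcommands: it receives a thunk for the program's global options and returns a commander Command. A hedged sketch of that shape (the option field is illustrative, not one of the package's actual flags; the real global options live in cli/program.ts):

import { Command } from 'commander';

interface GlobalOptions {
  verbose?: boolean; // illustrative only
}

function createExampleCommand(getGlobals: () => GlobalOptions): Command {
  return new Command('example')
    .description('sketch of the create*Command factory shape')
    .action(() => {
      const globals = getGlobals(); // resolved at action time, after parsing
      if (globals.verbose) console.log('verbose mode');
    });
}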
package/src/mcp/commands/index.ts
CHANGED

@@ -9,11 +9,13 @@ import { jobCommands } from './job.commands.js';
 import { metaCommands } from './meta.commands.js';
 import { commandRegistry } from './registry.js';
 import { storeCommands } from './store.commands.js';
+import { syncCommands } from './sync.commands.js';
 
 // Register all commands
 commandRegistry.registerAll(storeCommands);
 commandRegistry.registerAll(jobCommands);
 commandRegistry.registerAll(metaCommands);
+commandRegistry.registerAll(syncCommands);
 
 // Re-export for convenience
 export { commandRegistry, executeCommand, generateHelp } from './registry.js';