@cosmocoder/mcp-web-docs 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +22 -0
- package/README.md +368 -0
- package/build/__mocks__/embeddings.d.ts +17 -0
- package/build/__mocks__/embeddings.js +66 -0
- package/build/__mocks__/embeddings.js.map +1 -0
- package/build/config.d.ts +44 -0
- package/build/config.js +158 -0
- package/build/config.js.map +1 -0
- package/build/config.test.d.ts +1 -0
- package/build/config.test.js +165 -0
- package/build/config.test.js.map +1 -0
- package/build/crawler/auth.d.ts +128 -0
- package/build/crawler/auth.js +546 -0
- package/build/crawler/auth.js.map +1 -0
- package/build/crawler/auth.test.d.ts +1 -0
- package/build/crawler/auth.test.js +174 -0
- package/build/crawler/auth.test.js.map +1 -0
- package/build/crawler/base.d.ts +24 -0
- package/build/crawler/base.js +149 -0
- package/build/crawler/base.js.map +1 -0
- package/build/crawler/base.test.d.ts +1 -0
- package/build/crawler/base.test.js +234 -0
- package/build/crawler/base.test.js.map +1 -0
- package/build/crawler/browser-config.d.ts +2 -0
- package/build/crawler/browser-config.js +29 -0
- package/build/crawler/browser-config.js.map +1 -0
- package/build/crawler/browser-config.test.d.ts +1 -0
- package/build/crawler/browser-config.test.js +56 -0
- package/build/crawler/browser-config.test.js.map +1 -0
- package/build/crawler/cheerio.d.ts +11 -0
- package/build/crawler/cheerio.js +134 -0
- package/build/crawler/cheerio.js.map +1 -0
- package/build/crawler/chromium.d.ts +21 -0
- package/build/crawler/chromium.js +596 -0
- package/build/crawler/chromium.js.map +1 -0
- package/build/crawler/content-extractor-types.d.ts +25 -0
- package/build/crawler/content-extractor-types.js +2 -0
- package/build/crawler/content-extractor-types.js.map +1 -0
- package/build/crawler/content-extractors.d.ts +9 -0
- package/build/crawler/content-extractors.js +9 -0
- package/build/crawler/content-extractors.js.map +1 -0
- package/build/crawler/content-utils.d.ts +2 -0
- package/build/crawler/content-utils.js +22 -0
- package/build/crawler/content-utils.js.map +1 -0
- package/build/crawler/content-utils.test.d.ts +1 -0
- package/build/crawler/content-utils.test.js +99 -0
- package/build/crawler/content-utils.test.js.map +1 -0
- package/build/crawler/crawlee-crawler.d.ts +63 -0
- package/build/crawler/crawlee-crawler.js +342 -0
- package/build/crawler/crawlee-crawler.js.map +1 -0
- package/build/crawler/crawlee-crawler.test.d.ts +1 -0
- package/build/crawler/crawlee-crawler.test.js +280 -0
- package/build/crawler/crawlee-crawler.test.js.map +1 -0
- package/build/crawler/default-extractor.d.ts +4 -0
- package/build/crawler/default-extractor.js +26 -0
- package/build/crawler/default-extractor.js.map +1 -0
- package/build/crawler/default-extractor.test.d.ts +1 -0
- package/build/crawler/default-extractor.test.js +200 -0
- package/build/crawler/default-extractor.test.js.map +1 -0
- package/build/crawler/default.d.ts +11 -0
- package/build/crawler/default.js +138 -0
- package/build/crawler/default.js.map +1 -0
- package/build/crawler/docs-crawler.d.ts +26 -0
- package/build/crawler/docs-crawler.js +97 -0
- package/build/crawler/docs-crawler.js.map +1 -0
- package/build/crawler/docs-crawler.test.d.ts +1 -0
- package/build/crawler/docs-crawler.test.js +185 -0
- package/build/crawler/docs-crawler.test.js.map +1 -0
- package/build/crawler/factory.d.ts +6 -0
- package/build/crawler/factory.js +83 -0
- package/build/crawler/factory.js.map +1 -0
- package/build/crawler/github-pages-extractor.d.ts +4 -0
- package/build/crawler/github-pages-extractor.js +33 -0
- package/build/crawler/github-pages-extractor.js.map +1 -0
- package/build/crawler/github-pages-extractor.test.d.ts +1 -0
- package/build/crawler/github-pages-extractor.test.js +184 -0
- package/build/crawler/github-pages-extractor.test.js.map +1 -0
- package/build/crawler/github.d.ts +20 -0
- package/build/crawler/github.js +181 -0
- package/build/crawler/github.js.map +1 -0
- package/build/crawler/github.test.d.ts +1 -0
- package/build/crawler/github.test.js +326 -0
- package/build/crawler/github.test.js.map +1 -0
- package/build/crawler/puppeteer.d.ts +16 -0
- package/build/crawler/puppeteer.js +191 -0
- package/build/crawler/puppeteer.js.map +1 -0
- package/build/crawler/queue-manager.d.ts +43 -0
- package/build/crawler/queue-manager.js +169 -0
- package/build/crawler/queue-manager.js.map +1 -0
- package/build/crawler/queue-manager.test.d.ts +1 -0
- package/build/crawler/queue-manager.test.js +509 -0
- package/build/crawler/queue-manager.test.js.map +1 -0
- package/build/crawler/site-rules.d.ts +11 -0
- package/build/crawler/site-rules.js +104 -0
- package/build/crawler/site-rules.js.map +1 -0
- package/build/crawler/site-rules.test.d.ts +1 -0
- package/build/crawler/site-rules.test.js +139 -0
- package/build/crawler/site-rules.test.js.map +1 -0
- package/build/crawler/storybook-extractor.d.ts +34 -0
- package/build/crawler/storybook-extractor.js +767 -0
- package/build/crawler/storybook-extractor.js.map +1 -0
- package/build/crawler/storybook-extractor.test.d.ts +1 -0
- package/build/crawler/storybook-extractor.test.js +491 -0
- package/build/crawler/storybook-extractor.test.js.map +1 -0
- package/build/embeddings/fastembed.d.ts +25 -0
- package/build/embeddings/fastembed.js +188 -0
- package/build/embeddings/fastembed.js.map +1 -0
- package/build/embeddings/fastembed.test.d.ts +1 -0
- package/build/embeddings/fastembed.test.js +307 -0
- package/build/embeddings/fastembed.test.js.map +1 -0
- package/build/embeddings/openai.d.ts +8 -0
- package/build/embeddings/openai.js +56 -0
- package/build/embeddings/openai.js.map +1 -0
- package/build/embeddings/types.d.ts +4 -0
- package/build/embeddings/types.js +2 -0
- package/build/embeddings/types.js.map +1 -0
- package/build/index.d.ts +2 -0
- package/build/index.js +1007 -0
- package/build/index.js.map +1 -0
- package/build/index.test.d.ts +1 -0
- package/build/index.test.js +364 -0
- package/build/index.test.js.map +1 -0
- package/build/indexing/queue-manager.d.ts +36 -0
- package/build/indexing/queue-manager.js +86 -0
- package/build/indexing/queue-manager.js.map +1 -0
- package/build/indexing/queue-manager.test.d.ts +1 -0
- package/build/indexing/queue-manager.test.js +257 -0
- package/build/indexing/queue-manager.test.js.map +1 -0
- package/build/indexing/status.d.ts +39 -0
- package/build/indexing/status.js +207 -0
- package/build/indexing/status.js.map +1 -0
- package/build/indexing/status.test.d.ts +1 -0
- package/build/indexing/status.test.js +246 -0
- package/build/indexing/status.test.js.map +1 -0
- package/build/processor/content.d.ts +16 -0
- package/build/processor/content.js +286 -0
- package/build/processor/content.js.map +1 -0
- package/build/processor/content.test.d.ts +1 -0
- package/build/processor/content.test.js +369 -0
- package/build/processor/content.test.js.map +1 -0
- package/build/processor/markdown.d.ts +11 -0
- package/build/processor/markdown.js +256 -0
- package/build/processor/markdown.js.map +1 -0
- package/build/processor/markdown.test.d.ts +1 -0
- package/build/processor/markdown.test.js +312 -0
- package/build/processor/markdown.test.js.map +1 -0
- package/build/processor/metadata-parser.d.ts +37 -0
- package/build/processor/metadata-parser.js +245 -0
- package/build/processor/metadata-parser.js.map +1 -0
- package/build/processor/metadata-parser.test.d.ts +1 -0
- package/build/processor/metadata-parser.test.js +357 -0
- package/build/processor/metadata-parser.test.js.map +1 -0
- package/build/processor/processor.d.ts +8 -0
- package/build/processor/processor.js +190 -0
- package/build/processor/processor.js.map +1 -0
- package/build/processor/processor.test.d.ts +1 -0
- package/build/processor/processor.test.js +357 -0
- package/build/processor/processor.test.js.map +1 -0
- package/build/rag/cache.d.ts +10 -0
- package/build/rag/cache.js +10 -0
- package/build/rag/cache.js.map +1 -0
- package/build/rag/code-generator.d.ts +11 -0
- package/build/rag/code-generator.js +30 -0
- package/build/rag/code-generator.js.map +1 -0
- package/build/rag/context-assembler.d.ts +23 -0
- package/build/rag/context-assembler.js +113 -0
- package/build/rag/context-assembler.js.map +1 -0
- package/build/rag/docs-search.d.ts +55 -0
- package/build/rag/docs-search.js +380 -0
- package/build/rag/docs-search.js.map +1 -0
- package/build/rag/pipeline.d.ts +26 -0
- package/build/rag/pipeline.js +91 -0
- package/build/rag/pipeline.js.map +1 -0
- package/build/rag/query-processor.d.ts +14 -0
- package/build/rag/query-processor.js +57 -0
- package/build/rag/query-processor.js.map +1 -0
- package/build/rag/reranker.d.ts +55 -0
- package/build/rag/reranker.js +210 -0
- package/build/rag/reranker.js.map +1 -0
- package/build/rag/response-generator.d.ts +20 -0
- package/build/rag/response-generator.js +101 -0
- package/build/rag/response-generator.js.map +1 -0
- package/build/rag/retriever.d.ts +19 -0
- package/build/rag/retriever.js +111 -0
- package/build/rag/retriever.js.map +1 -0
- package/build/rag/validator.d.ts +22 -0
- package/build/rag/validator.js +128 -0
- package/build/rag/validator.js.map +1 -0
- package/build/rag/version-manager.d.ts +23 -0
- package/build/rag/version-manager.js +98 -0
- package/build/rag/version-manager.js.map +1 -0
- package/build/setupTests.d.ts +4 -0
- package/build/setupTests.js +50 -0
- package/build/setupTests.js.map +1 -0
- package/build/storage/storage.d.ts +38 -0
- package/build/storage/storage.js +700 -0
- package/build/storage/storage.js.map +1 -0
- package/build/storage/storage.test.d.ts +1 -0
- package/build/storage/storage.test.js +338 -0
- package/build/storage/storage.test.js.map +1 -0
- package/build/types/rag.d.ts +27 -0
- package/build/types/rag.js +2 -0
- package/build/types/rag.js.map +1 -0
- package/build/types.d.ts +120 -0
- package/build/types.js +2 -0
- package/build/types.js.map +1 -0
- package/build/util/content-utils.d.ts +31 -0
- package/build/util/content-utils.js +120 -0
- package/build/util/content-utils.js.map +1 -0
- package/build/util/content.d.ts +1 -0
- package/build/util/content.js +16 -0
- package/build/util/content.js.map +1 -0
- package/build/util/docs.d.ts +1 -0
- package/build/util/docs.js +26 -0
- package/build/util/docs.js.map +1 -0
- package/build/util/docs.test.d.ts +1 -0
- package/build/util/docs.test.js +49 -0
- package/build/util/docs.test.js.map +1 -0
- package/build/util/favicon.d.ts +6 -0
- package/build/util/favicon.js +88 -0
- package/build/util/favicon.js.map +1 -0
- package/build/util/favicon.test.d.ts +1 -0
- package/build/util/favicon.test.js +140 -0
- package/build/util/favicon.test.js.map +1 -0
- package/build/util/logger.d.ts +17 -0
- package/build/util/logger.js +72 -0
- package/build/util/logger.js.map +1 -0
- package/build/util/logger.test.d.ts +1 -0
- package/build/util/logger.test.js +46 -0
- package/build/util/logger.test.js.map +1 -0
- package/build/util/security.d.ts +312 -0
- package/build/util/security.js +719 -0
- package/build/util/security.js.map +1 -0
- package/build/util/security.test.d.ts +1 -0
- package/build/util/security.test.js +524 -0
- package/build/util/security.test.js.map +1 -0
- package/build/util/site-detector.d.ts +22 -0
- package/build/util/site-detector.js +42 -0
- package/build/util/site-detector.js.map +1 -0
- package/package.json +112 -0

package/build/crawler/browser-config.js
@@ -0,0 +1,29 @@
import { log } from 'crawlee';
// Suppress Crawlee's stdout logging for MCP compatibility
// MCP servers must only output JSON-RPC messages to stdout
log.setLevel(log.LEVELS.OFF);
export const getBrowserConfig = (requestQueue) => ({
    maxRequestsPerCrawl: 1000,
    requestQueue,
    maxConcurrency: 20,
    maxRequestsPerMinute: 600,
    maxRequestRetries: 0,
    navigationTimeoutSecs: 10,
    browserPoolOptions: {
        maxOpenPagesPerBrowser: 5,
        useFingerprints: false,
        operationTimeoutSecs: 15,
        closeInactiveBrowserAfterSecs: 10,
    },
    preNavigationHooks: [
        async ({ page }) => {
            await page.setViewportSize({ width: 1920, height: 1080 });
            await page.setExtraHTTPHeaders({
                'Accept-Language': 'en-US,en;q=0.9',
                Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            });
        },
    ],
});
//# sourceMappingURL=browser-config.js.map
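
Note that the preNavigationHooks above call Playwright page APIs (setViewportSize, setExtraHTTPHeaders), so the object returned by getBrowserConfig is shaped as Crawlee PlaywrightCrawler options. A minimal consumption sketch under that assumption; the requestHandler and start URL below are illustrative placeholders, not part of the package:

// Sketch only: wiring the exported config into a Crawlee PlaywrightCrawler.
import { PlaywrightCrawler, RequestQueue } from 'crawlee';
import { getBrowserConfig } from './browser-config.js';

const requestQueue = await RequestQueue.open();

const crawler = new PlaywrightCrawler({
    ...getBrowserConfig(requestQueue),
    requestHandler: async ({ request, page, enqueueLinks }) => {
        // Log to stderr so stdout stays reserved for JSON-RPC messages.
        console.error(`Crawled ${request.url}: ${await page.title()}`);
        await enqueueLinks();
    },
});

await crawler.run(['https://example.com/docs']);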

package/build/crawler/browser-config.js.map
@@ -0,0 +1 @@
{"version":3,"file":"browser-config.js","sourceRoot":"","sources":["../../src/crawler/browser-config.ts"],"names":[],"mappings":"AAAA,OAAO,EAA4B,GAAG,EAAE,MAAM,SAAS,CAAC;AAExD,0DAA0D;AAC1D,2DAA2D;AAC3D,GAAG,CAAC,QAAQ,CAAC,GAAG,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC;AAE7B,MAAM,CAAC,MAAM,gBAAgB,GAAG,CAAC,YAAsD,EAAqC,EAAE,CAAC,CAAC;IAC9H,mBAAmB,EAAE,IAAI;IACzB,YAAY;IACZ,cAAc,EAAE,EAAE;IAClB,oBAAoB,EAAE,GAAG;IACzB,iBAAiB,EAAE,CAAC;IACpB,qBAAqB,EAAE,EAAE;IACzB,kBAAkB,EAAE;QAClB,sBAAsB,EAAE,CAAC;QACzB,eAAe,EAAE,KAAK;QACtB,oBAAoB,EAAE,EAAE;QACxB,6BAA6B,EAAE,EAAE;KAClC;IACD,kBAAkB,EAAE;QAClB,KAAK,EAAE,EAAE,IAAI,EAAE,EAAE,EAAE;YACjB,MAAM,IAAI,CAAC,eAAe,CAAC,EAAE,KAAK,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,CAAC,CAAC;YAC1D,MAAM,IAAI,CAAC,mBAAmB,CAAC;gBAC7B,iBAAiB,EAAE,gBAAgB;gBACnC,MAAM,EAAE,4EAA4E;gBACpF,YAAY,EACV,2HAA2H;aAC9H,CAAC,CAAC;QACL,CAAC;KACF;CACF,CAAC,CAAC"}

package/build/crawler/browser-config.test.d.ts
@@ -0,0 +1 @@
export {};

package/build/crawler/browser-config.test.js
@@ -0,0 +1,56 @@
import { getBrowserConfig } from './browser-config.js';
describe('Browser Config', () => {
    describe('getBrowserConfig', () => {
        it('should return config with default values', () => {
            const mockQueue = {};
            const config = getBrowserConfig(mockQueue);
            expect(config.maxRequestsPerCrawl).toBe(1000);
            expect(config.maxConcurrency).toBe(20);
            expect(config.maxRequestsPerMinute).toBe(600);
            expect(config.maxRequestRetries).toBe(0);
            expect(config.navigationTimeoutSecs).toBe(10);
        });
        it('should return config with requestQueue', () => {
            const mockQueue = { name: 'test-queue' };
            const config = getBrowserConfig(mockQueue);
            expect(config.requestQueue).toBe(mockQueue);
        });
        it('should have browser pool options', () => {
            const mockQueue = {};
            const config = getBrowserConfig(mockQueue);
            expect(config.browserPoolOptions).toBeDefined();
            expect(config.browserPoolOptions?.maxOpenPagesPerBrowser).toBe(5);
            expect(config.browserPoolOptions?.useFingerprints).toBe(false);
            expect(config.browserPoolOptions?.operationTimeoutSecs).toBe(15);
            expect(config.browserPoolOptions?.closeInactiveBrowserAfterSecs).toBe(10);
        });
        it('should have preNavigationHooks', () => {
            const mockQueue = {};
            const config = getBrowserConfig(mockQueue);
            expect(config.preNavigationHooks).toBeDefined();
            expect(Array.isArray(config.preNavigationHooks)).toBe(true);
            expect(config.preNavigationHooks?.length).toBe(1);
        });
        it('should configure page in preNavigationHook', async () => {
            const mockQueue = {};
            const config = getBrowserConfig(mockQueue);
            const mockPage = {
                setViewportSize: vi.fn().mockResolvedValue(undefined),
                setExtraHTTPHeaders: vi.fn().mockResolvedValue(undefined),
            };
            const mockContext = { page: mockPage };
            const mockGotoOptions = {};
            const hook = config.preNavigationHooks?.[0];
            if (hook) {
                await hook(mockContext, mockGotoOptions);
            }
            expect(mockPage.setViewportSize).toHaveBeenCalledWith({ width: 1920, height: 1080 });
            expect(mockPage.setExtraHTTPHeaders).toHaveBeenCalledWith(expect.objectContaining({
                'Accept-Language': expect.any(String),
                Accept: expect.any(String),
                'User-Agent': expect.any(String),
            }));
        });
    });
});
//# sourceMappingURL=browser-config.test.js.map
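
The test file calls describe, it, expect, and vi without importing them, which relies on Vitest's globals mode. The package's actual test configuration is not part of this diff; below is a hedged sketch of the kind of vitest.config.ts that would make these tests run, with the setupFiles path inferred from the build/setupTests.js entry in the file list above:

// Hypothetical config, not shipped in the package tarball.
import { defineConfig } from 'vitest/config';

export default defineConfig({
    test: {
        globals: true,                       // exposes describe/it/expect/vi without imports
        setupFiles: ['./src/setupTests.ts'], // assumed source of build/setupTests.js
    },
});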

package/build/crawler/browser-config.test.js.map
@@ -0,0 +1 @@
{"version":3,"file":"browser-config.test.js","sourceRoot":"","sources":["../../src/crawler/browser-config.test.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,gBAAgB,EAAE,MAAM,qBAAqB,CAAC;AAIvD,QAAQ,CAAC,gBAAgB,EAAE,GAAG,EAAE;IAC9B,QAAQ,CAAC,kBAAkB,EAAE,GAAG,EAAE;QAChC,EAAE,CAAC,0CAA0C,EAAE,GAAG,EAAE;YAClD,MAAM,SAAS,GAAG,EAA4C,CAAC;YAC/D,MAAM,MAAM,GAAG,gBAAgB,CAAC,SAAS,CAAC,CAAC;YAE3C,MAAM,CAAC,MAAM,CAAC,mBAAmB,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YAC9C,MAAM,CAAC,MAAM,CAAC,cAAc,CAAC,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;YACvC,MAAM,CAAC,MAAM,CAAC,oBAAoB,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;YAC9C,MAAM,CAAC,MAAM,CAAC,iBAAiB,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;YACzC,MAAM,CAAC,MAAM,CAAC,qBAAqB,CAAC,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;QAChD,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,wCAAwC,EAAE,GAAG,EAAE;YAChD,MAAM,SAAS,GAAG,EAAE,IAAI,EAAE,YAAY,EAA4C,CAAC;YACnF,MAAM,MAAM,GAAG,gBAAgB,CAAC,SAAS,CAAC,CAAC;YAE3C,MAAM,CAAC,MAAM,CAAC,YAAY,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;QAC9C,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,kCAAkC,EAAE,GAAG,EAAE;YAC1C,MAAM,SAAS,GAAG,EAA4C,CAAC;YAC/D,MAAM,MAAM,GAAG,gBAAgB,CAAC,SAAS,CAAC,CAAC;YAE3C,MAAM,CAAC,MAAM,CAAC,kBAAkB,CAAC,CAAC,WAAW,EAAE,CAAC;YAChD,MAAM,CAAC,MAAM,CAAC,kBAAkB,EAAE,sBAAsB,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;YAClE,MAAM,CAAC,MAAM,CAAC,kBAAkB,EAAE,eAAe,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;YAC/D,MAAM,CAAC,MAAM,CAAC,kBAAkB,EAAE,oBAAoB,CAAC,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;YACjE,MAAM,CAAC,MAAM,CAAC,kBAAkB,EAAE,6BAA6B,CAAC,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;QAC5E,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,gCAAgC,EAAE,GAAG,EAAE;YACxC,MAAM,SAAS,GAAG,EAA4C,CAAC;YAC/D,MAAM,MAAM,GAAG,gBAAgB,CAAC,SAAS,CAAC,CAAC;YAE3C,MAAM,CAAC,MAAM,CAAC,kBAAkB,CAAC,CAAC,WAAW,EAAE,CAAC;YAChD,MAAM,CAAC,KAAK,CAAC,OAAO,CAAC,MAAM,CAAC,kBAAkB,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YAC5D,MAAM,CAAC,MAAM,CAAC,kBAAkB,EAAE,MAAM,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QACpD,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,4CAA4C,EAAE,KAAK,IAAI,EAAE;YAC1D,MAAM,SAAS,GAAG,EAA4C,CAAC;YAC/D,MAAM,MAAM,GAAG,gBAAgB,CAAC,SAAS,CAAC,CAAC;YAE3C,MAAM,QAAQ,GAAG;gBACf,eAAe,EAAE,EAAE,CAAC,EAAE,EAAE,CAAC,iBAAiB,CAAC,SAAS,CAAC;gBACrD,mBAAmB,EAAE,EAAE,CAAC,EAAE,EAAE,CAAC,iBAAiB,CAAC,SAAS,CAAC;aACvC,CAAC;YAErB,MAAM,WAAW,GAAG,EAAE,IAAI,EAAE,QAAQ,EAA+B,CAAC;YACpE,MAAM,eAAe,GAAG,EAA2B,CAAC;YAEpD,MAAM,IAAI,GAAG,MAAM,CAAC,kBAAkB,EAAE,CAAC,CAAC,CAAC,CAAC;YAC5C,IAAI,IAAI,EAAE,CAAC;gBACT,MAAM,IAAI,CAAC,WAAW,EAAE,eAAe,CAAC,CAAC;YAC3C,CAAC;YAED,MAAM,CAAC,QAAQ,CAAC,eAAe,CAAC,CAAC,oBAAoB,CAAC,EAAE,KAAK,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,CAAC,CAAC;YACrF,MAAM,CAAC,QAAQ,CAAC,mBAAmB,CAAC,CAAC,oBAAoB,CACvD,MAAM,CAAC,gBAAgB,CAAC;gBACtB,iBAAiB,EAAE,MAAM,CAAC,GAAG,CAAC,MAAM,CAAC;gBACrC,MAAM,EAAE,MAAM,CAAC,GAAG,CAAC,MAAM,CAAC;gBAC1B,YAAY,EAAE,MAAM,CAAC,GAAG,CAAC,MAAM,CAAC;aACjC,CAAC,CACH,CAAC;QACJ,CAAC,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;AACL,CAAC,CAAC,CAAC"}

package/build/crawler/cheerio.d.ts
@@ -0,0 +1,11 @@
import { CrawlResult } from '../types.js';
import { BaseCrawler } from './base.js';
export declare class CheerioCrawler extends BaseCrawler {
    private readonly BATCH_SIZE;
    private readonly FETCH_TIMEOUT;
    constructor(maxDepth?: number, maxRequestsPerCrawl?: number, onProgress?: (progress: number, description: string) => void);
    crawl(url: string): AsyncGenerator<CrawlResult, void, unknown>;
    private processPageWithRetry;
    private extractLinks;
    private extractTitle;
}
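
The declaration shows that crawl() returns an AsyncGenerator, so pages stream out as they are fetched rather than being collected into an array first. A minimal consumption sketch using the constructor parameters documented above; the argument values, start URL, and logging are placeholders:

// Sketch only: iterate the async generator produced by CheerioCrawler.crawl().
import { CheerioCrawler } from './cheerio.js';

const crawler = new CheerioCrawler(2, 200, (progress, description) => {
    console.error(`[crawl] ${progress} ${description}`);
});

for await (const result of crawler.crawl('https://example.com/docs')) {
    console.error(`${result.title} -> ${result.path} (${result.url})`);
}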

package/build/crawler/cheerio.js
@@ -0,0 +1,134 @@
import { URL } from 'url';
import * as cheerio from 'cheerio';
import { BaseCrawler } from './base.js';
export class CheerioCrawler extends BaseCrawler {
    // Batch size for processing URLs
    BATCH_SIZE = 50;
    FETCH_TIMEOUT = 30000; // 30 seconds
    constructor(maxDepth = 4, maxRequestsPerCrawl = 1000, onProgress) {
        super(maxDepth, maxRequestsPerCrawl, onProgress);
    }
    async *crawl(url) {
        console.debug(`[${this.constructor.name}] Starting crawl from: ${url}`);
        if (this.isAborting) {
            console.debug('[CheerioCrawler] Crawl aborted');
            return;
        }
        const startUrl = new URL(url);
        const baseUrl = this.normalizeUrl(startUrl.toString());
        // Track pages to crawl
        const pagesToCrawl = new Map(); // URL -> depth
        pagesToCrawl.set(baseUrl, 0);
        while (pagesToCrawl.size > 0 && !this.isAborting) {
            // Get batch of URLs to process
            const batchEntries = Array.from(pagesToCrawl.entries()).slice(0, this.BATCH_SIZE);
            const batch = new Map(batchEntries);
            // Remove batch from queue
            batchEntries.forEach(([url]) => pagesToCrawl.delete(url));
            try {
                // Process batch in parallel with timeout and rate limiting
                const results = await Promise.all(Array.from(batch.entries()).map(async ([pageUrl]) => {
                    // Apply rate limiting
                    await this.rateLimit();
                    const result = await this.processPageWithRetry(pageUrl);
                    return { pageUrl, ...result };
                }));
                // Handle results
                for (const { pageUrl, content, links, error } of results) {
                    if (error || !content || this.isAborting)
                        continue;
                    this.markUrlAsSeen(pageUrl);
                    yield {
                        url: pageUrl,
                        path: this.getPathFromUrl(pageUrl),
                        content,
                        title: this.extractTitle(content)
                    };
                    // Add new links to queue if within depth limit
                    const currentDepth = batch.get(pageUrl) || 0;
                    if (currentDepth < this.maxDepth) {
                        for (const link of links) {
                            const normalizedLink = this.normalizeUrl(link);
                            if (this.shouldCrawl(normalizedLink) && !pagesToCrawl.has(normalizedLink)) {
                                pagesToCrawl.set(normalizedLink, currentDepth + 1);
                            }
                        }
                    }
                }
                // Add delay between batches
                await new Promise(resolve => setTimeout(resolve, 1000));
            }
            catch (e) {
                console.error('[CheerioCrawler] Error processing batch:', e);
            }
        }
        console.debug('[CheerioCrawler] Crawl completed');
    }
    async processPageWithRetry(url) {
        return this.retryWithBackoff(async () => {
            try {
                // Create fetch request with timeout
                const controller = new AbortController();
                const timeoutId = setTimeout(() => controller.abort(), this.FETCH_TIMEOUT);
                const response = await fetch(url, {
                    signal: controller.signal,
                    headers: {
                        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
                    }
                });
                clearTimeout(timeoutId);
                if (!response.ok) {
                    throw new Error(`HTTP error! status: ${response.status}`);
                }
                const content = await response.text();
                const links = this.extractLinks(content, new URL(url));
                return { content, links };
            }
            catch (e) {
                if (e instanceof Error) {
                    return { content: null, links: [], error: e };
                }
                return { content: null, links: [], error: new Error('Unknown error occurred') };
            }
        });
    }
    extractLinks(html, baseUrl) {
        try {
            const $ = cheerio.load(html);
            const links = new Set();
            // Find all links, including those in navigation elements
            $('a').each((_, element) => {
                const href = $(element).attr('href');
                if (!href)
                    return;
                try {
                    const url = new URL(href, baseUrl);
                    const normalizedUrl = this.normalizeUrl(url.toString());
                    // Use BaseCrawler's URL validation
                    if (this.shouldCrawl(normalizedUrl)) {
                        links.add(normalizedUrl);
                    }
                }
                catch (e) {
                    console.debug(`[CheerioCrawler] Invalid URL ${href}:`, e);
                }
            });
            return Array.from(links);
        }
        catch (e) {
            console.error('[CheerioCrawler] Error extracting links:', e);
            return [];
        }
    }
    extractTitle(html) {
        try {
            const $ = cheerio.load(html);
            return $('title').text().trim() || 'Untitled';
        }
        catch (e) {
            console.error('[CheerioCrawler] Error extracting title:', e);
            return 'Untitled';
        }
    }
}
//# sourceMappingURL=cheerio.js.map
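
processPageWithRetry() delegates retries to BaseCrawler.retryWithBackoff(), whose implementation lives in base.js and is not shown in this slice. For orientation only, here is a generic retry-with-backoff helper of the kind that call presumably resolves to; the attempt count and delays are arbitrary assumptions, not the package's values:

// Generic sketch, NOT the package's base.js implementation.
async function retryWithBackoff<T>(fn: () => Promise<T>, maxAttempts = 3, baseDelayMs = 500): Promise<T> {
    let lastError: unknown;
    for (let attempt = 0; attempt < maxAttempts; attempt++) {
        try {
            return await fn();
        }
        catch (e) {
            lastError = e;
            // Exponential backoff: 500ms, 1000ms, 2000ms, ...
            await new Promise(resolve => setTimeout(resolve, baseDelayMs * 2 ** attempt));
        }
    }
    throw lastError;
}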

package/build/crawler/cheerio.js.map
@@ -0,0 +1 @@
{"version":3,"file":"cheerio.js","sourceRoot":"","sources":["../../src/crawler/cheerio.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,GAAG,EAAE,MAAM,KAAK,CAAC;AAC1B,OAAO,KAAK,OAAO,MAAM,SAAS,CAAC;AAEnC,OAAO,EAAE,WAAW,EAAE,MAAM,WAAW,CAAC;AAExC,MAAM,OAAO,cAAe,SAAQ,WAAW;IAC7C,iCAAiC;IAChB,UAAU,GAAG,EAAE,CAAC;IAChB,aAAa,GAAG,KAAK,CAAC,CAAC,aAAa;IAErD,YACE,WAAmB,CAAC,EACpB,sBAA8B,IAAI,EAClC,UAA4D;QAE5D,KAAK,CAAC,QAAQ,EAAE,mBAAmB,EAAE,UAAU,CAAC,CAAC;IACnD,CAAC;IAED,KAAK,CAAC,CAAC,KAAK,CAAC,GAAW;QACtB,OAAO,CAAC,KAAK,CAAC,IAAI,IAAI,CAAC,WAAW,CAAC,IAAI,0BAA0B,GAAG,EAAE,CAAC,CAAC;QAExE,IAAI,IAAI,CAAC,UAAU,EAAE,CAAC;YACpB,OAAO,CAAC,KAAK,CAAC,gCAAgC,CAAC,CAAC;YAChD,OAAO;QACT,CAAC;QAED,MAAM,QAAQ,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC;QAC9B,MAAM,OAAO,GAAG,IAAI,CAAC,YAAY,CAAC,QAAQ,CAAC,QAAQ,EAAE,CAAC,CAAC;QAEvD,uBAAuB;QACvB,MAAM,YAAY,GAAG,IAAI,GAAG,EAAkB,CAAC,CAAC,eAAe;QAC/D,YAAY,CAAC,GAAG,CAAC,OAAO,EAAE,CAAC,CAAC,CAAC;QAE7B,OAAO,YAAY,CAAC,IAAI,GAAG,CAAC,IAAI,CAAC,IAAI,CAAC,UAAU,EAAE,CAAC;YACjD,+BAA+B;YAC/B,MAAM,YAAY,GAAG,KAAK,CAAC,IAAI,CAAC,YAAY,CAAC,OAAO,EAAE,CAAC,CAAC,KAAK,CAAC,CAAC,EAAE,IAAI,CAAC,UAAU,CAAC,CAAC;YAClF,MAAM,KAAK,GAAG,IAAI,GAAG,CAAC,YAAY,CAAC,CAAC;YAEpC,0BAA0B;YAC1B,YAAY,CAAC,OAAO,CAAC,CAAC,CAAC,GAAG,CAAC,EAAE,EAAE,CAAC,YAAY,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC;YAE1D,IAAI,CAAC;gBACH,2DAA2D;gBAC3D,MAAM,OAAO,GAAG,MAAM,OAAO,CAAC,GAAG,CAC/B,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,OAAO,EAAE,CAAC,CAAC,GAAG,CAAC,KAAK,EAAE,CAAC,OAAO,CAAC,EAAE,EAAE;oBAClD,sBAAsB;oBACtB,MAAM,IAAI,CAAC,SAAS,EAAE,CAAC;oBACvB,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,oBAAoB,CAAC,OAAO,CAAC,CAAC;oBACxD,OAAO,EAAE,OAAO,EAAE,GAAG,MAAM,EAAE,CAAC;gBAChC,CAAC,CAAC,CACH,CAAC;gBAEF,iBAAiB;gBACjB,KAAK,MAAM,EAAE,OAAO,EAAE,OAAO,EAAE,KAAK,EAAE,KAAK,EAAE,IAAI,OAAO,EAAE,CAAC;oBACzD,IAAI,KAAK,IAAI,CAAC,OAAO,IAAI,IAAI,CAAC,UAAU;wBAAE,SAAS;oBAEnD,IAAI,CAAC,aAAa,CAAC,OAAO,CAAC,CAAC;oBAE5B,MAAM;wBACJ,GAAG,EAAE,OAAO;wBACZ,IAAI,EAAE,IAAI,CAAC,cAAc,CAAC,OAAO,CAAC;wBAClC,OAAO;wBACP,KAAK,EAAE,IAAI,CAAC,YAAY,CAAC,OAAO,CAAC;qBAClC,CAAC;oBAEF,+CAA+C;oBAC/C,MAAM,YAAY,GAAG,KAAK,CAAC,GAAG,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC;oBAC7C,IAAI,YAAY,GAAG,IAAI,CAAC,QAAQ,EAAE,CAAC;wBACjC,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;4BACzB,MAAM,cAAc,GAAG,IAAI,CAAC,YAAY,CAAC,IAAI,CAAC,CAAC;4BAC/C,IAAI,IAAI,CAAC,WAAW,CAAC,cAAc,CAAC,IAAI,CAAC,YAAY,CAAC,GAAG,CAAC,cAAc,CAAC,EAAE,CAAC;gCAC1E,YAAY,CAAC,GAAG,CAAC,cAAc,EAAE,YAAY,GAAG,CAAC,CAAC,CAAC;4BACrD,CAAC;wBACH,CAAC;oBACH,CAAC;gBACH,CAAC;gBAED,4BAA4B;gBAC5B,MAAM,IAAI,OAAO,CAAC,OAAO,CAAC,EAAE,CAAC,UAAU,CAAC,OAAO,EAAE,IAAI,CAAC,CAAC,CAAC;YAC1D,CAAC;YAAC,OAAO,CAAC,EAAE,CAAC;gBACX,OAAO,CAAC,KAAK,CAAC,0CAA0C,EAAE,CAAC,CAAC,CAAC;YAC/D,CAAC;QACH,CAAC;QAED,OAAO,CAAC,KAAK,CAAC,kCAAkC,CAAC,CAAC;IACpD,CAAC;IAEO,KAAK,CAAC,oBAAoB,CAAC,GAAW;QAK5C,OAAO,IAAI,CAAC,gBAAgB,CAAC,KAAK,IAAI,EAAE;YACtC,IAAI,CAAC;gBACH,oCAAoC;gBACpC,MAAM,UAAU,GAAG,IAAI,eAAe,EAAE,CAAC;gBACzC,MAAM,SAAS,GAAG,UAAU,CAAC,GAAG,EAAE,CAAC,UAAU,CAAC,KAAK,EAAE,EAAE,IAAI,CAAC,aAAa,CAAC,CAAC;gBAE3E,MAAM,QAAQ,GAAG,MAAM,KAAK,CAAC,GAAG,EAAE;oBAChC,MAAM,EAAE,UAAU,CAAC,MAAM;oBACzB,OAAO,EAAE;wBACP,YAAY,EAAE,qHAAqH;qBACpI;iBACF,CAAC,CAAC;gBAEH,YAAY,CAAC,SAAS,CAAC,CAAC;gBAExB,IAAI,CAAC,QAAQ,CAAC,EAAE,EAAE,CAAC;oBACjB,MAAM,IAAI,KAAK,CAAC,uBAAuB,QAAQ,CAAC,MAAM,EAAE,CAAC,CAAC;gBAC5D,CAAC;gBAED,MAAM,OAAO,GAAG,MAAM,QAAQ,CAAC,IAAI,EAAE,CAAC;gBACtC,MAAM,KAAK,GAAG,IAAI,CAAC,YAAY,CAAC,OAAO,EAAE,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC,CAAC;gBAEvD,OAAO,EAAE,OAAO,EAAE,KAAK,EAAE,CAAC;YAC5B,CAAC;YAAC,OAAO,CAAC,EAAE,CAAC;gBACX,IAAI,CAAC,YAAY,KAAK,EAAE,CAAC;oBACvB,OAAO,EAAE,OAAO,EAAE,IAAI,EAAE,KAAK,EAAE,EAAE,EAAE,KAAK,EAAE,CAAC,EAAE,CAAC;gBAChD,CAA
C;gBACD,OAAO,EAAE,OAAO,EAAE,IAAI,EAAE,KAAK,EAAE,EAAE,EAAE,KAAK,EAAE,IAAI,KAAK,CAAC,wBAAwB,CAAC,EAAE,CAAC;YAClF,CAAC;QACH,CAAC,CAAC,CAAC;IACL,CAAC;IAEO,YAAY,CAAC,IAAY,EAAE,OAAY;QAC7C,IAAI,CAAC;YACH,MAAM,CAAC,GAAG,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YAC7B,MAAM,KAAK,GAAG,IAAI,GAAG,EAAU,CAAC;YAEhC,yDAAyD;YACzD,CAAC,CAAC,GAAG,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,OAAO,EAAE,EAAE;gBACzB,MAAM,IAAI,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;gBACrC,IAAI,CAAC,IAAI;oBAAE,OAAO;gBAElB,IAAI,CAAC;oBACH,MAAM,GAAG,GAAG,IAAI,GAAG,CAAC,IAAI,EAAE,OAAO,CAAC,CAAC;oBACnC,MAAM,aAAa,GAAG,IAAI,CAAC,YAAY,CAAC,GAAG,CAAC,QAAQ,EAAE,CAAC,CAAC;oBAExD,mCAAmC;oBACnC,IAAI,IAAI,CAAC,WAAW,CAAC,aAAa,CAAC,EAAE,CAAC;wBACpC,KAAK,CAAC,GAAG,CAAC,aAAa,CAAC,CAAC;oBAC3B,CAAC;gBACH,CAAC;gBAAC,OAAO,CAAC,EAAE,CAAC;oBACX,OAAO,CAAC,KAAK,CAAC,gCAAgC,IAAI,GAAG,EAAE,CAAC,CAAC,CAAC;gBAC5D,CAAC;YACH,CAAC,CAAC,CAAC;YAEH,OAAO,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;QAC3B,CAAC;QAAC,OAAO,CAAC,EAAE,CAAC;YACX,OAAO,CAAC,KAAK,CAAC,0CAA0C,EAAE,CAAC,CAAC,CAAC;YAC7D,OAAO,EAAE,CAAC;QACZ,CAAC;IACH,CAAC;IAEO,YAAY,CAAC,IAAY;QAC/B,IAAI,CAAC;YACH,MAAM,CAAC,GAAG,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YAC7B,OAAO,CAAC,CAAC,OAAO,CAAC,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,IAAI,UAAU,CAAC;QAChD,CAAC;QAAC,OAAO,CAAC,EAAE,CAAC;YACX,OAAO,CAAC,KAAK,CAAC,0CAA0C,EAAE,CAAC,CAAC,CAAC;YAC7D,OAAO,UAAU,CAAC;QACpB,CAAC;IACH,CAAC;CACF"}

package/build/crawler/chromium.d.ts
@@ -0,0 +1,21 @@
import { CrawlResult } from '../types.js';
import { BaseCrawler } from './base.js';
export declare class ChromiumCrawler extends BaseCrawler {
    private readonly PCR_CONFIG;
    private curCrawlCount;
    private baseHostname;
    private readonly BATCH_SIZE;
    private readonly REACT_WAIT_TIME;
    private readonly NAVIGATION_WAIT_TIME;
    private readonly MAX_CONCURRENT_PAGES;
    private readonly PAGE_TIMEOUT;
    private readonly resourceCache;
    constructor(maxDepth?: number, maxRequestsPerCrawl?: number, onProgress?: (progress: number, description: string) => void);
    crawl(url: string): AsyncGenerator<CrawlResult, void, unknown>;
    private extractPageContent;
    private waitForDynamicContent;
    private gotoPageAndHandleRedirects;
    private configurePage;
    private processSinglePage;
    private getLinksFromPage;
}
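
ChromiumCrawler exposes the same crawl(url) async-generator signature as CheerioCrawler, so callers can swap the HTTP-only crawler for the headless-browser one behind a single variable. The package ships its own selection logic in build/crawler/factory.js (not shown in this slice); the sketch below only illustrates that interchangeability, with a hypothetical needsBrowser flag:

// Illustrative only; see build/crawler/factory.js for the package's real selection logic.
import { CheerioCrawler } from './cheerio.js';
import { ChromiumCrawler } from './chromium.js';

const needsBrowser = true; // hypothetical flag: true for JavaScript-heavy sites

const crawler = needsBrowser ? new ChromiumCrawler() : new CheerioCrawler();
for await (const result of crawler.crawl('https://example.com/docs')) {
    console.error(`${result.title}: ${result.url}`);
}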