@cosmocoder/mcp-web-docs 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (240) hide show
  1. package/LICENSE +22 -0
  2. package/README.md +368 -0
  3. package/build/__mocks__/embeddings.d.ts +17 -0
  4. package/build/__mocks__/embeddings.js +66 -0
  5. package/build/__mocks__/embeddings.js.map +1 -0
  6. package/build/config.d.ts +44 -0
  7. package/build/config.js +158 -0
  8. package/build/config.js.map +1 -0
  9. package/build/config.test.d.ts +1 -0
  10. package/build/config.test.js +165 -0
  11. package/build/config.test.js.map +1 -0
  12. package/build/crawler/auth.d.ts +128 -0
  13. package/build/crawler/auth.js +546 -0
  14. package/build/crawler/auth.js.map +1 -0
  15. package/build/crawler/auth.test.d.ts +1 -0
  16. package/build/crawler/auth.test.js +174 -0
  17. package/build/crawler/auth.test.js.map +1 -0
  18. package/build/crawler/base.d.ts +24 -0
  19. package/build/crawler/base.js +149 -0
  20. package/build/crawler/base.js.map +1 -0
  21. package/build/crawler/base.test.d.ts +1 -0
  22. package/build/crawler/base.test.js +234 -0
  23. package/build/crawler/base.test.js.map +1 -0
  24. package/build/crawler/browser-config.d.ts +2 -0
  25. package/build/crawler/browser-config.js +29 -0
  26. package/build/crawler/browser-config.js.map +1 -0
  27. package/build/crawler/browser-config.test.d.ts +1 -0
  28. package/build/crawler/browser-config.test.js +56 -0
  29. package/build/crawler/browser-config.test.js.map +1 -0
  30. package/build/crawler/cheerio.d.ts +11 -0
  31. package/build/crawler/cheerio.js +134 -0
  32. package/build/crawler/cheerio.js.map +1 -0
  33. package/build/crawler/chromium.d.ts +21 -0
  34. package/build/crawler/chromium.js +596 -0
  35. package/build/crawler/chromium.js.map +1 -0
  36. package/build/crawler/content-extractor-types.d.ts +25 -0
  37. package/build/crawler/content-extractor-types.js +2 -0
  38. package/build/crawler/content-extractor-types.js.map +1 -0
  39. package/build/crawler/content-extractors.d.ts +9 -0
  40. package/build/crawler/content-extractors.js +9 -0
  41. package/build/crawler/content-extractors.js.map +1 -0
  42. package/build/crawler/content-utils.d.ts +2 -0
  43. package/build/crawler/content-utils.js +22 -0
  44. package/build/crawler/content-utils.js.map +1 -0
  45. package/build/crawler/content-utils.test.d.ts +1 -0
  46. package/build/crawler/content-utils.test.js +99 -0
  47. package/build/crawler/content-utils.test.js.map +1 -0
  48. package/build/crawler/crawlee-crawler.d.ts +63 -0
  49. package/build/crawler/crawlee-crawler.js +342 -0
  50. package/build/crawler/crawlee-crawler.js.map +1 -0
  51. package/build/crawler/crawlee-crawler.test.d.ts +1 -0
  52. package/build/crawler/crawlee-crawler.test.js +280 -0
  53. package/build/crawler/crawlee-crawler.test.js.map +1 -0
  54. package/build/crawler/default-extractor.d.ts +4 -0
  55. package/build/crawler/default-extractor.js +26 -0
  56. package/build/crawler/default-extractor.js.map +1 -0
  57. package/build/crawler/default-extractor.test.d.ts +1 -0
  58. package/build/crawler/default-extractor.test.js +200 -0
  59. package/build/crawler/default-extractor.test.js.map +1 -0
  60. package/build/crawler/default.d.ts +11 -0
  61. package/build/crawler/default.js +138 -0
  62. package/build/crawler/default.js.map +1 -0
  63. package/build/crawler/docs-crawler.d.ts +26 -0
  64. package/build/crawler/docs-crawler.js +97 -0
  65. package/build/crawler/docs-crawler.js.map +1 -0
  66. package/build/crawler/docs-crawler.test.d.ts +1 -0
  67. package/build/crawler/docs-crawler.test.js +185 -0
  68. package/build/crawler/docs-crawler.test.js.map +1 -0
  69. package/build/crawler/factory.d.ts +6 -0
  70. package/build/crawler/factory.js +83 -0
  71. package/build/crawler/factory.js.map +1 -0
  72. package/build/crawler/github-pages-extractor.d.ts +4 -0
  73. package/build/crawler/github-pages-extractor.js +33 -0
  74. package/build/crawler/github-pages-extractor.js.map +1 -0
  75. package/build/crawler/github-pages-extractor.test.d.ts +1 -0
  76. package/build/crawler/github-pages-extractor.test.js +184 -0
  77. package/build/crawler/github-pages-extractor.test.js.map +1 -0
  78. package/build/crawler/github.d.ts +20 -0
  79. package/build/crawler/github.js +181 -0
  80. package/build/crawler/github.js.map +1 -0
  81. package/build/crawler/github.test.d.ts +1 -0
  82. package/build/crawler/github.test.js +326 -0
  83. package/build/crawler/github.test.js.map +1 -0
  84. package/build/crawler/puppeteer.d.ts +16 -0
  85. package/build/crawler/puppeteer.js +191 -0
  86. package/build/crawler/puppeteer.js.map +1 -0
  87. package/build/crawler/queue-manager.d.ts +43 -0
  88. package/build/crawler/queue-manager.js +169 -0
  89. package/build/crawler/queue-manager.js.map +1 -0
  90. package/build/crawler/queue-manager.test.d.ts +1 -0
  91. package/build/crawler/queue-manager.test.js +509 -0
  92. package/build/crawler/queue-manager.test.js.map +1 -0
  93. package/build/crawler/site-rules.d.ts +11 -0
  94. package/build/crawler/site-rules.js +104 -0
  95. package/build/crawler/site-rules.js.map +1 -0
  96. package/build/crawler/site-rules.test.d.ts +1 -0
  97. package/build/crawler/site-rules.test.js +139 -0
  98. package/build/crawler/site-rules.test.js.map +1 -0
  99. package/build/crawler/storybook-extractor.d.ts +34 -0
  100. package/build/crawler/storybook-extractor.js +767 -0
  101. package/build/crawler/storybook-extractor.js.map +1 -0
  102. package/build/crawler/storybook-extractor.test.d.ts +1 -0
  103. package/build/crawler/storybook-extractor.test.js +491 -0
  104. package/build/crawler/storybook-extractor.test.js.map +1 -0
  105. package/build/embeddings/fastembed.d.ts +25 -0
  106. package/build/embeddings/fastembed.js +188 -0
  107. package/build/embeddings/fastembed.js.map +1 -0
  108. package/build/embeddings/fastembed.test.d.ts +1 -0
  109. package/build/embeddings/fastembed.test.js +307 -0
  110. package/build/embeddings/fastembed.test.js.map +1 -0
  111. package/build/embeddings/openai.d.ts +8 -0
  112. package/build/embeddings/openai.js +56 -0
  113. package/build/embeddings/openai.js.map +1 -0
  114. package/build/embeddings/types.d.ts +4 -0
  115. package/build/embeddings/types.js +2 -0
  116. package/build/embeddings/types.js.map +1 -0
  117. package/build/index.d.ts +2 -0
  118. package/build/index.js +1007 -0
  119. package/build/index.js.map +1 -0
  120. package/build/index.test.d.ts +1 -0
  121. package/build/index.test.js +364 -0
  122. package/build/index.test.js.map +1 -0
  123. package/build/indexing/queue-manager.d.ts +36 -0
  124. package/build/indexing/queue-manager.js +86 -0
  125. package/build/indexing/queue-manager.js.map +1 -0
  126. package/build/indexing/queue-manager.test.d.ts +1 -0
  127. package/build/indexing/queue-manager.test.js +257 -0
  128. package/build/indexing/queue-manager.test.js.map +1 -0
  129. package/build/indexing/status.d.ts +39 -0
  130. package/build/indexing/status.js +207 -0
  131. package/build/indexing/status.js.map +1 -0
  132. package/build/indexing/status.test.d.ts +1 -0
  133. package/build/indexing/status.test.js +246 -0
  134. package/build/indexing/status.test.js.map +1 -0
  135. package/build/processor/content.d.ts +16 -0
  136. package/build/processor/content.js +286 -0
  137. package/build/processor/content.js.map +1 -0
  138. package/build/processor/content.test.d.ts +1 -0
  139. package/build/processor/content.test.js +369 -0
  140. package/build/processor/content.test.js.map +1 -0
  141. package/build/processor/markdown.d.ts +11 -0
  142. package/build/processor/markdown.js +256 -0
  143. package/build/processor/markdown.js.map +1 -0
  144. package/build/processor/markdown.test.d.ts +1 -0
  145. package/build/processor/markdown.test.js +312 -0
  146. package/build/processor/markdown.test.js.map +1 -0
  147. package/build/processor/metadata-parser.d.ts +37 -0
  148. package/build/processor/metadata-parser.js +245 -0
  149. package/build/processor/metadata-parser.js.map +1 -0
  150. package/build/processor/metadata-parser.test.d.ts +1 -0
  151. package/build/processor/metadata-parser.test.js +357 -0
  152. package/build/processor/metadata-parser.test.js.map +1 -0
  153. package/build/processor/processor.d.ts +8 -0
  154. package/build/processor/processor.js +190 -0
  155. package/build/processor/processor.js.map +1 -0
  156. package/build/processor/processor.test.d.ts +1 -0
  157. package/build/processor/processor.test.js +357 -0
  158. package/build/processor/processor.test.js.map +1 -0
  159. package/build/rag/cache.d.ts +10 -0
  160. package/build/rag/cache.js +10 -0
  161. package/build/rag/cache.js.map +1 -0
  162. package/build/rag/code-generator.d.ts +11 -0
  163. package/build/rag/code-generator.js +30 -0
  164. package/build/rag/code-generator.js.map +1 -0
  165. package/build/rag/context-assembler.d.ts +23 -0
  166. package/build/rag/context-assembler.js +113 -0
  167. package/build/rag/context-assembler.js.map +1 -0
  168. package/build/rag/docs-search.d.ts +55 -0
  169. package/build/rag/docs-search.js +380 -0
  170. package/build/rag/docs-search.js.map +1 -0
  171. package/build/rag/pipeline.d.ts +26 -0
  172. package/build/rag/pipeline.js +91 -0
  173. package/build/rag/pipeline.js.map +1 -0
  174. package/build/rag/query-processor.d.ts +14 -0
  175. package/build/rag/query-processor.js +57 -0
  176. package/build/rag/query-processor.js.map +1 -0
  177. package/build/rag/reranker.d.ts +55 -0
  178. package/build/rag/reranker.js +210 -0
  179. package/build/rag/reranker.js.map +1 -0
  180. package/build/rag/response-generator.d.ts +20 -0
  181. package/build/rag/response-generator.js +101 -0
  182. package/build/rag/response-generator.js.map +1 -0
  183. package/build/rag/retriever.d.ts +19 -0
  184. package/build/rag/retriever.js +111 -0
  185. package/build/rag/retriever.js.map +1 -0
  186. package/build/rag/validator.d.ts +22 -0
  187. package/build/rag/validator.js +128 -0
  188. package/build/rag/validator.js.map +1 -0
  189. package/build/rag/version-manager.d.ts +23 -0
  190. package/build/rag/version-manager.js +98 -0
  191. package/build/rag/version-manager.js.map +1 -0
  192. package/build/setupTests.d.ts +4 -0
  193. package/build/setupTests.js +50 -0
  194. package/build/setupTests.js.map +1 -0
  195. package/build/storage/storage.d.ts +38 -0
  196. package/build/storage/storage.js +700 -0
  197. package/build/storage/storage.js.map +1 -0
  198. package/build/storage/storage.test.d.ts +1 -0
  199. package/build/storage/storage.test.js +338 -0
  200. package/build/storage/storage.test.js.map +1 -0
  201. package/build/types/rag.d.ts +27 -0
  202. package/build/types/rag.js +2 -0
  203. package/build/types/rag.js.map +1 -0
  204. package/build/types.d.ts +120 -0
  205. package/build/types.js +2 -0
  206. package/build/types.js.map +1 -0
  207. package/build/util/content-utils.d.ts +31 -0
  208. package/build/util/content-utils.js +120 -0
  209. package/build/util/content-utils.js.map +1 -0
  210. package/build/util/content.d.ts +1 -0
  211. package/build/util/content.js +16 -0
  212. package/build/util/content.js.map +1 -0
  213. package/build/util/docs.d.ts +1 -0
  214. package/build/util/docs.js +26 -0
  215. package/build/util/docs.js.map +1 -0
  216. package/build/util/docs.test.d.ts +1 -0
  217. package/build/util/docs.test.js +49 -0
  218. package/build/util/docs.test.js.map +1 -0
  219. package/build/util/favicon.d.ts +6 -0
  220. package/build/util/favicon.js +88 -0
  221. package/build/util/favicon.js.map +1 -0
  222. package/build/util/favicon.test.d.ts +1 -0
  223. package/build/util/favicon.test.js +140 -0
  224. package/build/util/favicon.test.js.map +1 -0
  225. package/build/util/logger.d.ts +17 -0
  226. package/build/util/logger.js +72 -0
  227. package/build/util/logger.js.map +1 -0
  228. package/build/util/logger.test.d.ts +1 -0
  229. package/build/util/logger.test.js +46 -0
  230. package/build/util/logger.test.js.map +1 -0
  231. package/build/util/security.d.ts +312 -0
  232. package/build/util/security.js +719 -0
  233. package/build/util/security.js.map +1 -0
  234. package/build/util/security.test.d.ts +1 -0
  235. package/build/util/security.test.js +524 -0
  236. package/build/util/security.test.js.map +1 -0
  237. package/build/util/site-detector.d.ts +22 -0
  238. package/build/util/site-detector.js +42 -0
  239. package/build/util/site-detector.js.map +1 -0
  240. package/package.json +112 -0
@@ -0,0 +1,29 @@
1
+ import { log } from 'crawlee';
2
+ // Suppress Crawlee's stdout logging for MCP compatibility
3
+ // MCP servers must only output JSON-RPC messages to stdout
4
+ log.setLevel(log.LEVELS.OFF); // Module-level side effect: runs when this file is first imported, not when getBrowserConfig is called.
5
/**
 * Builds the option object shared by the browser-based crawlers.
 *
 * @param requestQueue Queue instance the crawler should consume; it is placed
 *   on the returned object unchanged (same reference).
 * @returns A fresh options object on every call: crawl limits, browser pool
 *   settings and a single pre-navigation hook.
 */
export const getBrowserConfig = (requestQueue) => {
    // Settings for the pool of browser instances.
    const browserPoolOptions = {
        maxOpenPagesPerBrowser: 5,
        useFingerprints: false,
        operationTimeoutSecs: 15,
        closeInactiveBrowserAfterSecs: 10,
    };
    // Headers attached to every page before it navigates.
    const requestHeaders = {
        'Accept-Language': 'en-US,en;q=0.9',
        Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    };
    // Hook: fix the viewport first, then install the headers, awaiting each step in turn.
    const preparePage = async (crawlingContext) => {
        const { page } = crawlingContext;
        await page.setViewportSize({ width: 1920, height: 1080 });
        await page.setExtraHTTPHeaders(requestHeaders);
    };
    return {
        maxRequestsPerCrawl: 1000,
        requestQueue,
        maxConcurrency: 20,
        maxRequestsPerMinute: 600,
        maxRequestRetries: 0,
        navigationTimeoutSecs: 10,
        browserPoolOptions,
        preNavigationHooks: [preparePage],
    };
};
29
+ //# sourceMappingURL=browser-config.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"browser-config.js","sourceRoot":"","sources":["../../src/crawler/browser-config.ts"],"names":[],"mappings":"AAAA,OAAO,EAA4B,GAAG,EAAE,MAAM,SAAS,CAAC;AAExD,0DAA0D;AAC1D,2DAA2D;AAC3D,GAAG,CAAC,QAAQ,CAAC,GAAG,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC;AAE7B,MAAM,CAAC,MAAM,gBAAgB,GAAG,CAAC,YAAsD,EAAqC,EAAE,CAAC,CAAC;IAC9H,mBAAmB,EAAE,IAAI;IACzB,YAAY;IACZ,cAAc,EAAE,EAAE;IAClB,oBAAoB,EAAE,GAAG;IACzB,iBAAiB,EAAE,CAAC;IACpB,qBAAqB,EAAE,EAAE;IACzB,kBAAkB,EAAE;QAClB,sBAAsB,EAAE,CAAC;QACzB,eAAe,EAAE,KAAK;QACtB,oBAAoB,EAAE,EAAE;QACxB,6BAA6B,EAAE,EAAE;KAClC;IACD,kBAAkB,EAAE;QAClB,KAAK,EAAE,EAAE,IAAI,EAAE,EAAE,EAAE;YACjB,MAAM,IAAI,CAAC,eAAe,CAAC,EAAE,KAAK,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,CAAC,CAAC;YAC1D,MAAM,IAAI,CAAC,mBAAmB,CAAC;gBAC7B,iBAAiB,EAAE,gBAAgB;gBACnC,MAAM,EAAE,4EAA4E;gBACpF,YAAY,EACV,2HAA2H;aAC9H,CAAC,CAAC;QACL,CAAC;KACF;CACF,CAAC,CAAC"}
@@ -0,0 +1 @@
1
+ export {};
@@ -0,0 +1,56 @@
1
+ import { getBrowserConfig } from './browser-config.js';
2
// Test suite for getBrowserConfig: checks the static option values, that the
// queue is passed through by reference, and that the single pre-navigation
// hook configures the page it receives.
describe('Browser Config', () => {
    describe('getBrowserConfig', () => {
        // Each test builds its own config; an empty object stands in for the queue.
        const buildConfig = (queue = {}) => getBrowserConfig(queue);

        it('should return config with default values', () => {
            const { maxRequestsPerCrawl, maxConcurrency, maxRequestsPerMinute, maxRequestRetries, navigationTimeoutSecs, } = buildConfig();
            expect(maxRequestsPerCrawl).toBe(1000);
            expect(maxConcurrency).toBe(20);
            expect(maxRequestsPerMinute).toBe(600);
            expect(maxRequestRetries).toBe(0);
            expect(navigationTimeoutSecs).toBe(10);
        });

        it('should return config with requestQueue', () => {
            const namedQueue = { name: 'test-queue' };
            // Identity check: the very same object must come back.
            expect(buildConfig(namedQueue).requestQueue).toBe(namedQueue);
        });

        it('should have browser pool options', () => {
            const poolOptions = buildConfig().browserPoolOptions;
            expect(poolOptions).toBeDefined();
            expect(poolOptions?.maxOpenPagesPerBrowser).toBe(5);
            expect(poolOptions?.useFingerprints).toBe(false);
            expect(poolOptions?.operationTimeoutSecs).toBe(15);
            expect(poolOptions?.closeInactiveBrowserAfterSecs).toBe(10);
        });

        it('should have preNavigationHooks', () => {
            const hooks = buildConfig().preNavigationHooks;
            expect(hooks).toBeDefined();
            expect(Array.isArray(hooks)).toBe(true);
            expect(hooks?.length).toBe(1);
        });

        it('should configure page in preNavigationHook', async () => {
            // Page double exposing only the two methods the hook calls.
            const resolvingStub = () => vi.fn().mockResolvedValue(undefined);
            const pageDouble = {
                setViewportSize: resolvingStub(),
                setExtraHTTPHeaders: resolvingStub(),
            };
            const firstHook = buildConfig().preNavigationHooks?.[0];
            if (firstHook) {
                await firstHook({ page: pageDouble }, {});
            }
            expect(pageDouble.setViewportSize).toHaveBeenCalledWith({ width: 1920, height: 1080 });
            const expectedHeaders = expect.objectContaining({
                'Accept-Language': expect.any(String),
                Accept: expect.any(String),
                'User-Agent': expect.any(String),
            });
            expect(pageDouble.setExtraHTTPHeaders).toHaveBeenCalledWith(expectedHeaders);
        });
    });
});
56
+ //# sourceMappingURL=browser-config.test.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"browser-config.test.js","sourceRoot":"","sources":["../../src/crawler/browser-config.test.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,gBAAgB,EAAE,MAAM,qBAAqB,CAAC;AAIvD,QAAQ,CAAC,gBAAgB,EAAE,GAAG,EAAE;IAC9B,QAAQ,CAAC,kBAAkB,EAAE,GAAG,EAAE;QAChC,EAAE,CAAC,0CAA0C,EAAE,GAAG,EAAE;YAClD,MAAM,SAAS,GAAG,EAA4C,CAAC;YAC/D,MAAM,MAAM,GAAG,gBAAgB,CAAC,SAAS,CAAC,CAAC;YAE3C,MAAM,CAAC,MAAM,CAAC,mBAAmB,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YAC9C,MAAM,CAAC,MAAM,CAAC,cAAc,CAAC,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;YACvC,MAAM,CAAC,MAAM,CAAC,oBAAoB,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;YAC9C,MAAM,CAAC,MAAM,CAAC,iBAAiB,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;YACzC,MAAM,CAAC,MAAM,CAAC,qBAAqB,CAAC,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;QAChD,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,wCAAwC,EAAE,GAAG,EAAE;YAChD,MAAM,SAAS,GAAG,EAAE,IAAI,EAAE,YAAY,EAA4C,CAAC;YACnF,MAAM,MAAM,GAAG,gBAAgB,CAAC,SAAS,CAAC,CAAC;YAE3C,MAAM,CAAC,MAAM,CAAC,YAAY,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;QAC9C,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,kCAAkC,EAAE,GAAG,EAAE;YAC1C,MAAM,SAAS,GAAG,EAA4C,CAAC;YAC/D,MAAM,MAAM,GAAG,gBAAgB,CAAC,SAAS,CAAC,CAAC;YAE3C,MAAM,CAAC,MAAM,CAAC,kBAAkB,CAAC,CAAC,WAAW,EAAE,CAAC;YAChD,MAAM,CAAC,MAAM,CAAC,kBAAkB,EAAE,sBAAsB,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;YAClE,MAAM,CAAC,MAAM,CAAC,kBAAkB,EAAE,eAAe,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;YAC/D,MAAM,CAAC,MAAM,CAAC,kBAAkB,EAAE,oBAAoB,CAAC,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;YACjE,MAAM,CAAC,MAAM,CAAC,kBAAkB,EAAE,6BAA6B,CAAC,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;QAC5E,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,gCAAgC,EAAE,GAAG,EAAE;YACxC,MAAM,SAAS,GAAG,EAA4C,CAAC;YAC/D,MAAM,MAAM,GAAG,gBAAgB,CAAC,SAAS,CAAC,CAAC;YAE3C,MAAM,CAAC,MAAM,CAAC,kBAAkB,CAAC,CAAC,WAAW,EAAE,CAAC;YAChD,MAAM,CAAC,KAAK,CAAC,OAAO,CAAC,MAAM,CAAC,kBAAkB,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YAC5D,MAAM,CAAC,MAAM,CAAC,kBAAkB,EAAE,MAAM,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QACpD,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,4CAA4C,EAAE,KAAK,IAAI,EAAE;YAC1D,MAAM,SAAS,GAAG,EAA4C,CAAC;YAC/D,MAAM,MAAM,GAAG,gBAAgB,CAAC,SAAS,CAAC,CAAC;YAE3C,MAAM,QAAQ,GAAG;gBACf,eAAe,EAA
E,EAAE,CAAC,EAAE,EAAE,CAAC,iBAAiB,CAAC,SAAS,CAAC;gBACrD,mBAAmB,EAAE,EAAE,CAAC,EAAE,EAAE,CAAC,iBAAiB,CAAC,SAAS,CAAC;aACvC,CAAC;YAErB,MAAM,WAAW,GAAG,EAAE,IAAI,EAAE,QAAQ,EAA+B,CAAC;YACpE,MAAM,eAAe,GAAG,EAA2B,CAAC;YAEpD,MAAM,IAAI,GAAG,MAAM,CAAC,kBAAkB,EAAE,CAAC,CAAC,CAAC,CAAC;YAC5C,IAAI,IAAI,EAAE,CAAC;gBACT,MAAM,IAAI,CAAC,WAAW,EAAE,eAAe,CAAC,CAAC;YAC3C,CAAC;YAED,MAAM,CAAC,QAAQ,CAAC,eAAe,CAAC,CAAC,oBAAoB,CAAC,EAAE,KAAK,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,CAAC,CAAC;YACrF,MAAM,CAAC,QAAQ,CAAC,mBAAmB,CAAC,CAAC,oBAAoB,CACvD,MAAM,CAAC,gBAAgB,CAAC;gBACtB,iBAAiB,EAAE,MAAM,CAAC,GAAG,CAAC,MAAM,CAAC;gBACrC,MAAM,EAAE,MAAM,CAAC,GAAG,CAAC,MAAM,CAAC;gBAC1B,YAAY,EAAE,MAAM,CAAC,GAAG,CAAC,MAAM,CAAC;aACjC,CAAC,CACH,CAAC;QACJ,CAAC,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;AACL,CAAC,CAAC,CAAC"}
@@ -0,0 +1,11 @@
1
+ import { CrawlResult } from '../types.js';
2
+ import { BaseCrawler } from './base.js';
3
+ export declare class CheerioCrawler extends BaseCrawler { // Crawler that downloads pages with fetch() and parses the HTML with cheerio (no browser).
4
+ private readonly BATCH_SIZE; // Maximum number of queued URLs fetched per batch (50 in cheerio.js).
5
+ private readonly FETCH_TIMEOUT; // Milliseconds before a page fetch is aborted (30000 in cheerio.js).
6
+ constructor(maxDepth?: number, maxRequestsPerCrawl?: number, onProgress?: (progress: number, description: string) => void); // Implementation defaults: maxDepth = 4, maxRequestsPerCrawl = 1000; all three arguments are forwarded to BaseCrawler.
7
+ crawl(url: string): AsyncGenerator<CrawlResult, void, unknown>; // Yields one CrawlResult (url, path, content, title) per page fetched without error.
8
+ private processPageWithRetry; // Fetches one URL; resolves to { content, links } or { content: null, links: [], error }.
9
+ private extractLinks; // Resolves each <a href> against the page URL and keeps the normalized ones that pass shouldCrawl().
10
+ private extractTitle; // Trimmed <title> text, or 'Untitled' when it is empty or parsing fails.
11
+ }
@@ -0,0 +1,134 @@
1
+ import { URL } from 'url';
2
+ import * as cheerio from 'cheerio';
3
+ import { BaseCrawler } from './base.js';
4
/**
 * Crawler that downloads pages with `fetch` and parses the returned HTML with
 * cheerio, i.e. without launching a browser.
 *
 * Pages are taken from an in-memory queue (URL -> depth) in batches of
 * `BATCH_SIZE`; every page of a batch is fetched in parallel and each page that
 * loads successfully is yielded as a `CrawlResult`.
 *
 * NOTE(review): `console.debug` writes to stdout in Node.js. If this crawler is
 * used inside the MCP server process, confirm that console output is
 * redirected, since stdout is expected to carry only JSON-RPC messages.
 */
export class CheerioCrawler extends BaseCrawler {
    /** Maximum number of queued URLs fetched in parallel per batch. */
    BATCH_SIZE = 50;
    /** Milliseconds allowed for one page download (headers and body) before it is aborted. */
    FETCH_TIMEOUT = 30000; // 30 seconds

    /**
     * @param maxDepth Link depth (start page = 0) beyond which links are no longer queued.
     * @param maxRequestsPerCrawl Forwarded to BaseCrawler.
     * @param onProgress Optional progress callback, forwarded to BaseCrawler.
     */
    constructor(maxDepth = 4, maxRequestsPerCrawl = 1000, onProgress) {
        super(maxDepth, maxRequestsPerCrawl, onProgress);
    }

    /**
     * Crawls breadth-first from `url`.
     *
     * @param url Absolute start URL; `new URL(url)` throws on an invalid value.
     * @yields `{ url, path, content, title }` for every page fetched without error.
     */
    async *crawl(url) {
        console.debug(`[${this.constructor.name}] Starting crawl from: ${url}`);
        if (this.isAborting) {
            console.debug('[CheerioCrawler] Crawl aborted');
            return;
        }
        const startUrl = new URL(url);
        const baseUrl = this.normalizeUrl(startUrl.toString());
        // Pending pages: URL -> depth at which the URL was discovered.
        const pagesToCrawl = new Map();
        pagesToCrawl.set(baseUrl, 0);
        while (pagesToCrawl.size > 0 && !this.isAborting) {
            // Move the next BATCH_SIZE entries (insertion order) out of the queue.
            const batch = new Map(Array.from(pagesToCrawl.entries()).slice(0, this.BATCH_SIZE));
            for (const queuedUrl of batch.keys()) {
                pagesToCrawl.delete(queuedUrl);
            }
            try {
                // Fetch the whole batch in parallel; rateLimit() is awaited before each request.
                const results = await Promise.all(Array.from(batch.keys(), async (pageUrl) => {
                    await this.rateLimit();
                    const result = await this.processPageWithRetry(pageUrl);
                    return { pageUrl, ...result };
                }));
                for (const { pageUrl, content, links, error } of results) {
                    if (error || !content || this.isAborting)
                        continue;
                    this.markUrlAsSeen(pageUrl);
                    yield {
                        url: pageUrl,
                        path: this.getPathFromUrl(pageUrl),
                        content,
                        title: this.extractTitle(content)
                    };
                    // Queue newly discovered links while still within the depth limit.
                    const currentDepth = batch.get(pageUrl) ?? 0;
                    if (currentDepth < this.maxDepth) {
                        for (const link of links) {
                            const normalizedLink = this.normalizeUrl(link);
                            // URLs of the current batch have already left the queue but are
                            // only marked as seen when their own result is handled. Without
                            // the batch check, a page handled earlier in this loop would put
                            // its batch siblings back on the queue and they would be fetched
                            // and yielded a second time.
                            if (this.shouldCrawl(normalizedLink) &&
                                !pagesToCrawl.has(normalizedLink) &&
                                !batch.has(normalizedLink)) {
                                pagesToCrawl.set(normalizedLink, currentDepth + 1);
                            }
                        }
                    }
                }
                // Pause between batches; pointless once nothing is left to fetch.
                if (pagesToCrawl.size > 0 && !this.isAborting) {
                    await new Promise(resolve => setTimeout(resolve, 1000));
                }
            }
            catch (e) {
                console.error('[CheerioCrawler] Error processing batch:', e);
            }
        }
        console.debug('[CheerioCrawler] Crawl completed');
    }

    /**
     * Downloads one page and extracts its links.
     *
     * Never rejects on a download problem: failures are reported through the
     * `error` field of the resolved value.
     *
     * NOTE(review): because the callback below converts every failure into a
     * resolved value, `retryWithBackoff` never observes a rejection. If that
     * helper only retries on rejection, no retry ever happens here — confirm
     * against BaseCrawler.
     *
     * @param url Absolute URL of the page.
     * @returns `{ content, links }` on success, `{ content: null, links: [], error }` on failure.
     */
    async processPageWithRetry(url) {
        return this.retryWithBackoff(async () => {
            // Abort the request if it has not completed within FETCH_TIMEOUT.
            const controller = new AbortController();
            const timeoutId = setTimeout(() => controller.abort(), this.FETCH_TIMEOUT);
            try {
                const response = await fetch(url, {
                    signal: controller.signal,
                    headers: {
                        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
                    }
                });
                if (!response.ok) {
                    throw new Error(`HTTP error! status: ${response.status}`);
                }
                // The timer is still armed here so that a stalled body download is
                // aborted too instead of blocking the whole batch indefinitely.
                const content = await response.text();
                const links = this.extractLinks(content, new URL(url));
                return { content, links };
            }
            catch (e) {
                const error = e instanceof Error ? e : new Error('Unknown error occurred');
                return { content: null, links: [], error };
            }
            finally {
                // Always release the timer, including when fetch() itself rejected;
                // otherwise it stays pending for up to FETCH_TIMEOUT.
                clearTimeout(timeoutId);
            }
        });
    }

    /**
     * Collects the crawlable link targets of a page.
     *
     * @param html Raw HTML of the page.
     * @param baseUrl URL used to resolve relative `href` values.
     * @returns De-duplicated, normalized URLs accepted by `shouldCrawl`; empty on parse failure.
     */
    extractLinks(html, baseUrl) {
        try {
            const $ = cheerio.load(html);
            const links = new Set();
            // Find all links, including those in navigation elements
            $('a').each((_, element) => {
                const href = $(element).attr('href');
                if (!href)
                    return;
                try {
                    const url = new URL(href, baseUrl);
                    const normalizedUrl = this.normalizeUrl(url.toString());
                    // Use BaseCrawler's URL validation
                    if (this.shouldCrawl(normalizedUrl)) {
                        links.add(normalizedUrl);
                    }
                }
                catch (e) {
                    console.debug(`[CheerioCrawler] Invalid URL ${href}:`, e);
                }
            });
            return Array.from(links);
        }
        catch (e) {
            console.error('[CheerioCrawler] Error extracting links:', e);
            return [];
        }
    }

    /**
     * Reads the document title.
     *
     * @param html Raw HTML of the page.
     * @returns Trimmed `<title>` text, or `'Untitled'` when it is empty or parsing fails.
     */
    extractTitle(html) {
        try {
            const $ = cheerio.load(html);
            return $('title').text().trim() || 'Untitled';
        }
        catch (e) {
            console.error('[CheerioCrawler] Error extracting title:', e);
            return 'Untitled';
        }
    }
}
134
+ //# sourceMappingURL=cheerio.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"cheerio.js","sourceRoot":"","sources":["../../src/crawler/cheerio.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,GAAG,EAAE,MAAM,KAAK,CAAC;AAC1B,OAAO,KAAK,OAAO,MAAM,SAAS,CAAC;AAEnC,OAAO,EAAE,WAAW,EAAE,MAAM,WAAW,CAAC;AAExC,MAAM,OAAO,cAAe,SAAQ,WAAW;IAC7C,iCAAiC;IAChB,UAAU,GAAG,EAAE,CAAC;IAChB,aAAa,GAAG,KAAK,CAAC,CAAC,aAAa;IAErD,YACE,WAAmB,CAAC,EACpB,sBAA8B,IAAI,EAClC,UAA4D;QAE5D,KAAK,CAAC,QAAQ,EAAE,mBAAmB,EAAE,UAAU,CAAC,CAAC;IACnD,CAAC;IAED,KAAK,CAAC,CAAC,KAAK,CAAC,GAAW;QACtB,OAAO,CAAC,KAAK,CAAC,IAAI,IAAI,CAAC,WAAW,CAAC,IAAI,0BAA0B,GAAG,EAAE,CAAC,CAAC;QAExE,IAAI,IAAI,CAAC,UAAU,EAAE,CAAC;YACpB,OAAO,CAAC,KAAK,CAAC,gCAAgC,CAAC,CAAC;YAChD,OAAO;QACT,CAAC;QAED,MAAM,QAAQ,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC;QAC9B,MAAM,OAAO,GAAG,IAAI,CAAC,YAAY,CAAC,QAAQ,CAAC,QAAQ,EAAE,CAAC,CAAC;QAEvD,uBAAuB;QACvB,MAAM,YAAY,GAAG,IAAI,GAAG,EAAkB,CAAC,CAAC,eAAe;QAC/D,YAAY,CAAC,GAAG,CAAC,OAAO,EAAE,CAAC,CAAC,CAAC;QAE7B,OAAO,YAAY,CAAC,IAAI,GAAG,CAAC,IAAI,CAAC,IAAI,CAAC,UAAU,EAAE,CAAC;YACjD,+BAA+B;YAC/B,MAAM,YAAY,GAAG,KAAK,CAAC,IAAI,CAAC,YAAY,CAAC,OAAO,EAAE,CAAC,CAAC,KAAK,CAAC,CAAC,EAAE,IAAI,CAAC,UAAU,CAAC,CAAC;YAClF,MAAM,KAAK,GAAG,IAAI,GAAG,CAAC,YAAY,CAAC,CAAC;YAEpC,0BAA0B;YAC1B,YAAY,CAAC,OAAO,CAAC,CAAC,CAAC,GAAG,CAAC,EAAE,EAAE,CAAC,YAAY,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC;YAE1D,IAAI,CAAC;gBACH,2DAA2D;gBAC3D,MAAM,OAAO,GAAG,MAAM,OAAO,CAAC,GAAG,CAC/B,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,OAAO,EAAE,CAAC,CAAC,GAAG,CAAC,KAAK,EAAE,CAAC,OAAO,CAAC,EAAE,EAAE;oBAClD,sBAAsB;oBACtB,MAAM,IAAI,CAAC,SAAS,EAAE,CAAC;oBACvB,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,oBAAoB,CAAC,OAAO,CAAC,CAAC;oBACxD,OAAO,EAAE,OAAO,EAAE,GAAG,MAAM,EAAE,CAAC;gBAChC,CAAC,CAAC,CACH,CAAC;gBAEF,iBAAiB;gBACjB,KAAK,MAAM,EAAE,OAAO,EAAE,OAAO,EAAE,KAAK,EAAE,KAAK,EAAE,IAAI,OAAO,EAAE,CAAC;oBACzD,IAAI,KAAK,IAAI,CAAC,OAAO,IAAI,IAAI,CAAC,UAAU;wBAAE,SAAS;oBAEnD,IAAI,CAAC,aAAa,CAAC,OAAO,CAAC,CAAC;oBAE5B,MAAM;wBACJ,GAAG,EAAE,OAAO;wBACZ,IAAI,EAAE,IAAI,CAAC,cAAc,CAAC,OAAO,CAAC;wBAClC,OAAO;wBACP,KAAK,EAAE,IAAI,CAAC,YAAY,CAAC,OAAO,CAAC;qBAClC,CAAC;oBAEF,+CAA+C;o
BAC/C,MAAM,YAAY,GAAG,KAAK,CAAC,GAAG,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC;oBAC7C,IAAI,YAAY,GAAG,IAAI,CAAC,QAAQ,EAAE,CAAC;wBACjC,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;4BACzB,MAAM,cAAc,GAAG,IAAI,CAAC,YAAY,CAAC,IAAI,CAAC,CAAC;4BAC/C,IAAI,IAAI,CAAC,WAAW,CAAC,cAAc,CAAC,IAAI,CAAC,YAAY,CAAC,GAAG,CAAC,cAAc,CAAC,EAAE,CAAC;gCAC1E,YAAY,CAAC,GAAG,CAAC,cAAc,EAAE,YAAY,GAAG,CAAC,CAAC,CAAC;4BACrD,CAAC;wBACH,CAAC;oBACH,CAAC;gBACH,CAAC;gBAED,4BAA4B;gBAC5B,MAAM,IAAI,OAAO,CAAC,OAAO,CAAC,EAAE,CAAC,UAAU,CAAC,OAAO,EAAE,IAAI,CAAC,CAAC,CAAC;YAC1D,CAAC;YAAC,OAAO,CAAC,EAAE,CAAC;gBACX,OAAO,CAAC,KAAK,CAAC,0CAA0C,EAAE,CAAC,CAAC,CAAC;YAC/D,CAAC;QACH,CAAC;QAED,OAAO,CAAC,KAAK,CAAC,kCAAkC,CAAC,CAAC;IACpD,CAAC;IAEO,KAAK,CAAC,oBAAoB,CAAC,GAAW;QAK5C,OAAO,IAAI,CAAC,gBAAgB,CAAC,KAAK,IAAI,EAAE;YACtC,IAAI,CAAC;gBACH,oCAAoC;gBACpC,MAAM,UAAU,GAAG,IAAI,eAAe,EAAE,CAAC;gBACzC,MAAM,SAAS,GAAG,UAAU,CAAC,GAAG,EAAE,CAAC,UAAU,CAAC,KAAK,EAAE,EAAE,IAAI,CAAC,aAAa,CAAC,CAAC;gBAE3E,MAAM,QAAQ,GAAG,MAAM,KAAK,CAAC,GAAG,EAAE;oBAChC,MAAM,EAAE,UAAU,CAAC,MAAM;oBACzB,OAAO,EAAE;wBACP,YAAY,EAAE,qHAAqH;qBACpI;iBACF,CAAC,CAAC;gBAEH,YAAY,CAAC,SAAS,CAAC,CAAC;gBAExB,IAAI,CAAC,QAAQ,CAAC,EAAE,EAAE,CAAC;oBACjB,MAAM,IAAI,KAAK,CAAC,uBAAuB,QAAQ,CAAC,MAAM,EAAE,CAAC,CAAC;gBAC5D,CAAC;gBAED,MAAM,OAAO,GAAG,MAAM,QAAQ,CAAC,IAAI,EAAE,CAAC;gBACtC,MAAM,KAAK,GAAG,IAAI,CAAC,YAAY,CAAC,OAAO,EAAE,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC,CAAC;gBAEvD,OAAO,EAAE,OAAO,EAAE,KAAK,EAAE,CAAC;YAC5B,CAAC;YAAC,OAAO,CAAC,EAAE,CAAC;gBACX,IAAI,CAAC,YAAY,KAAK,EAAE,CAAC;oBACvB,OAAO,EAAE,OAAO,EAAE,IAAI,EAAE,KAAK,EAAE,EAAE,EAAE,KAAK,EAAE,CAAC,EAAE,CAAC;gBAChD,CAAC;gBACD,OAAO,EAAE,OAAO,EAAE,IAAI,EAAE,KAAK,EAAE,EAAE,EAAE,KAAK,EAAE,IAAI,KAAK,CAAC,wBAAwB,CAAC,EAAE,CAAC;YAClF,CAAC;QACH,CAAC,CAAC,CAAC;IACL,CAAC;IAEO,YAAY,CAAC,IAAY,EAAE,OAAY;QAC7C,IAAI,CAAC;YACH,MAAM,CAAC,GAAG,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YAC7B,MAAM,KAAK,GAAG,IAAI,GAAG,EAAU,CAAC;YAEhC,yDAAyD;YACzD,CAAC,CAAC,GAAG,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,OAAO,EAAE,EAAE;gBACzB,MAAM,IAAI,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC
;gBACrC,IAAI,CAAC,IAAI;oBAAE,OAAO;gBAElB,IAAI,CAAC;oBACH,MAAM,GAAG,GAAG,IAAI,GAAG,CAAC,IAAI,EAAE,OAAO,CAAC,CAAC;oBACnC,MAAM,aAAa,GAAG,IAAI,CAAC,YAAY,CAAC,GAAG,CAAC,QAAQ,EAAE,CAAC,CAAC;oBAExD,mCAAmC;oBACnC,IAAI,IAAI,CAAC,WAAW,CAAC,aAAa,CAAC,EAAE,CAAC;wBACpC,KAAK,CAAC,GAAG,CAAC,aAAa,CAAC,CAAC;oBAC3B,CAAC;gBACH,CAAC;gBAAC,OAAO,CAAC,EAAE,CAAC;oBACX,OAAO,CAAC,KAAK,CAAC,gCAAgC,IAAI,GAAG,EAAE,CAAC,CAAC,CAAC;gBAC5D,CAAC;YACH,CAAC,CAAC,CAAC;YAEH,OAAO,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;QAC3B,CAAC;QAAC,OAAO,CAAC,EAAE,CAAC;YACX,OAAO,CAAC,KAAK,CAAC,0CAA0C,EAAE,CAAC,CAAC,CAAC;YAC7D,OAAO,EAAE,CAAC;QACZ,CAAC;IACH,CAAC;IAEO,YAAY,CAAC,IAAY;QAC/B,IAAI,CAAC;YACH,MAAM,CAAC,GAAG,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YAC7B,OAAO,CAAC,CAAC,OAAO,CAAC,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,IAAI,UAAU,CAAC;QAChD,CAAC;QAAC,OAAO,CAAC,EAAE,CAAC;YACX,OAAO,CAAC,KAAK,CAAC,0CAA0C,EAAE,CAAC,CAAC,CAAC;YAC7D,OAAO,UAAU,CAAC;QACpB,CAAC;IACH,CAAC;CACF"}
@@ -0,0 +1,21 @@
1
+ import { CrawlResult } from '../types.js';
2
+ import { BaseCrawler } from './base.js';
3
+ export declare class ChromiumCrawler extends BaseCrawler { // Declaration only: private members are listed by name without types; their values live in chromium.js.
4
+ private readonly PCR_CONFIG; // NOTE(review): presumably settings for resolving the Chromium binary — confirm in chromium.js.
5
+ private curCrawlCount;
6
+ private baseHostname;
7
+ private readonly BATCH_SIZE;
8
+ private readonly REACT_WAIT_TIME;
9
+ private readonly NAVIGATION_WAIT_TIME;
10
+ private readonly MAX_CONCURRENT_PAGES;
11
+ private readonly PAGE_TIMEOUT;
12
+ private readonly resourceCache;
13
+ constructor(maxDepth?: number, maxRequestsPerCrawl?: number, onProgress?: (progress: number, description: string) => void); // All arguments optional; their default values are not visible in this declaration.
14
+ crawl(url: string): AsyncGenerator<CrawlResult, void, unknown>; // Async generator entry point: yields CrawlResult items and has no return value.
15
+ private extractPageContent;
16
+ private waitForDynamicContent;
17
+ private gotoPageAndHandleRedirects;
18
+ private configurePage;
19
+ private processSinglePage;
20
+ private getLinksFromPage;
21
+ }