@cosmocoder/mcp-web-docs 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (240)
  1. package/LICENSE +22 -0
  2. package/README.md +368 -0
  3. package/build/__mocks__/embeddings.d.ts +17 -0
  4. package/build/__mocks__/embeddings.js +66 -0
  5. package/build/__mocks__/embeddings.js.map +1 -0
  6. package/build/config.d.ts +44 -0
  7. package/build/config.js +158 -0
  8. package/build/config.js.map +1 -0
  9. package/build/config.test.d.ts +1 -0
  10. package/build/config.test.js +165 -0
  11. package/build/config.test.js.map +1 -0
  12. package/build/crawler/auth.d.ts +128 -0
  13. package/build/crawler/auth.js +546 -0
  14. package/build/crawler/auth.js.map +1 -0
  15. package/build/crawler/auth.test.d.ts +1 -0
  16. package/build/crawler/auth.test.js +174 -0
  17. package/build/crawler/auth.test.js.map +1 -0
  18. package/build/crawler/base.d.ts +24 -0
  19. package/build/crawler/base.js +149 -0
  20. package/build/crawler/base.js.map +1 -0
  21. package/build/crawler/base.test.d.ts +1 -0
  22. package/build/crawler/base.test.js +234 -0
  23. package/build/crawler/base.test.js.map +1 -0
  24. package/build/crawler/browser-config.d.ts +2 -0
  25. package/build/crawler/browser-config.js +29 -0
  26. package/build/crawler/browser-config.js.map +1 -0
  27. package/build/crawler/browser-config.test.d.ts +1 -0
  28. package/build/crawler/browser-config.test.js +56 -0
  29. package/build/crawler/browser-config.test.js.map +1 -0
  30. package/build/crawler/cheerio.d.ts +11 -0
  31. package/build/crawler/cheerio.js +134 -0
  32. package/build/crawler/cheerio.js.map +1 -0
  33. package/build/crawler/chromium.d.ts +21 -0
  34. package/build/crawler/chromium.js +596 -0
  35. package/build/crawler/chromium.js.map +1 -0
  36. package/build/crawler/content-extractor-types.d.ts +25 -0
  37. package/build/crawler/content-extractor-types.js +2 -0
  38. package/build/crawler/content-extractor-types.js.map +1 -0
  39. package/build/crawler/content-extractors.d.ts +9 -0
  40. package/build/crawler/content-extractors.js +9 -0
  41. package/build/crawler/content-extractors.js.map +1 -0
  42. package/build/crawler/content-utils.d.ts +2 -0
  43. package/build/crawler/content-utils.js +22 -0
  44. package/build/crawler/content-utils.js.map +1 -0
  45. package/build/crawler/content-utils.test.d.ts +1 -0
  46. package/build/crawler/content-utils.test.js +99 -0
  47. package/build/crawler/content-utils.test.js.map +1 -0
  48. package/build/crawler/crawlee-crawler.d.ts +63 -0
  49. package/build/crawler/crawlee-crawler.js +342 -0
  50. package/build/crawler/crawlee-crawler.js.map +1 -0
  51. package/build/crawler/crawlee-crawler.test.d.ts +1 -0
  52. package/build/crawler/crawlee-crawler.test.js +280 -0
  53. package/build/crawler/crawlee-crawler.test.js.map +1 -0
  54. package/build/crawler/default-extractor.d.ts +4 -0
  55. package/build/crawler/default-extractor.js +26 -0
  56. package/build/crawler/default-extractor.js.map +1 -0
  57. package/build/crawler/default-extractor.test.d.ts +1 -0
  58. package/build/crawler/default-extractor.test.js +200 -0
  59. package/build/crawler/default-extractor.test.js.map +1 -0
  60. package/build/crawler/default.d.ts +11 -0
  61. package/build/crawler/default.js +138 -0
  62. package/build/crawler/default.js.map +1 -0
  63. package/build/crawler/docs-crawler.d.ts +26 -0
  64. package/build/crawler/docs-crawler.js +97 -0
  65. package/build/crawler/docs-crawler.js.map +1 -0
  66. package/build/crawler/docs-crawler.test.d.ts +1 -0
  67. package/build/crawler/docs-crawler.test.js +185 -0
  68. package/build/crawler/docs-crawler.test.js.map +1 -0
  69. package/build/crawler/factory.d.ts +6 -0
  70. package/build/crawler/factory.js +83 -0
  71. package/build/crawler/factory.js.map +1 -0
  72. package/build/crawler/github-pages-extractor.d.ts +4 -0
  73. package/build/crawler/github-pages-extractor.js +33 -0
  74. package/build/crawler/github-pages-extractor.js.map +1 -0
  75. package/build/crawler/github-pages-extractor.test.d.ts +1 -0
  76. package/build/crawler/github-pages-extractor.test.js +184 -0
  77. package/build/crawler/github-pages-extractor.test.js.map +1 -0
  78. package/build/crawler/github.d.ts +20 -0
  79. package/build/crawler/github.js +181 -0
  80. package/build/crawler/github.js.map +1 -0
  81. package/build/crawler/github.test.d.ts +1 -0
  82. package/build/crawler/github.test.js +326 -0
  83. package/build/crawler/github.test.js.map +1 -0
  84. package/build/crawler/puppeteer.d.ts +16 -0
  85. package/build/crawler/puppeteer.js +191 -0
  86. package/build/crawler/puppeteer.js.map +1 -0
  87. package/build/crawler/queue-manager.d.ts +43 -0
  88. package/build/crawler/queue-manager.js +169 -0
  89. package/build/crawler/queue-manager.js.map +1 -0
  90. package/build/crawler/queue-manager.test.d.ts +1 -0
  91. package/build/crawler/queue-manager.test.js +509 -0
  92. package/build/crawler/queue-manager.test.js.map +1 -0
  93. package/build/crawler/site-rules.d.ts +11 -0
  94. package/build/crawler/site-rules.js +104 -0
  95. package/build/crawler/site-rules.js.map +1 -0
  96. package/build/crawler/site-rules.test.d.ts +1 -0
  97. package/build/crawler/site-rules.test.js +139 -0
  98. package/build/crawler/site-rules.test.js.map +1 -0
  99. package/build/crawler/storybook-extractor.d.ts +34 -0
  100. package/build/crawler/storybook-extractor.js +767 -0
  101. package/build/crawler/storybook-extractor.js.map +1 -0
  102. package/build/crawler/storybook-extractor.test.d.ts +1 -0
  103. package/build/crawler/storybook-extractor.test.js +491 -0
  104. package/build/crawler/storybook-extractor.test.js.map +1 -0
  105. package/build/embeddings/fastembed.d.ts +25 -0
  106. package/build/embeddings/fastembed.js +188 -0
  107. package/build/embeddings/fastembed.js.map +1 -0
  108. package/build/embeddings/fastembed.test.d.ts +1 -0
  109. package/build/embeddings/fastembed.test.js +307 -0
  110. package/build/embeddings/fastembed.test.js.map +1 -0
  111. package/build/embeddings/openai.d.ts +8 -0
  112. package/build/embeddings/openai.js +56 -0
  113. package/build/embeddings/openai.js.map +1 -0
  114. package/build/embeddings/types.d.ts +4 -0
  115. package/build/embeddings/types.js +2 -0
  116. package/build/embeddings/types.js.map +1 -0
  117. package/build/index.d.ts +2 -0
  118. package/build/index.js +1007 -0
  119. package/build/index.js.map +1 -0
  120. package/build/index.test.d.ts +1 -0
  121. package/build/index.test.js +364 -0
  122. package/build/index.test.js.map +1 -0
  123. package/build/indexing/queue-manager.d.ts +36 -0
  124. package/build/indexing/queue-manager.js +86 -0
  125. package/build/indexing/queue-manager.js.map +1 -0
  126. package/build/indexing/queue-manager.test.d.ts +1 -0
  127. package/build/indexing/queue-manager.test.js +257 -0
  128. package/build/indexing/queue-manager.test.js.map +1 -0
  129. package/build/indexing/status.d.ts +39 -0
  130. package/build/indexing/status.js +207 -0
  131. package/build/indexing/status.js.map +1 -0
  132. package/build/indexing/status.test.d.ts +1 -0
  133. package/build/indexing/status.test.js +246 -0
  134. package/build/indexing/status.test.js.map +1 -0
  135. package/build/processor/content.d.ts +16 -0
  136. package/build/processor/content.js +286 -0
  137. package/build/processor/content.js.map +1 -0
  138. package/build/processor/content.test.d.ts +1 -0
  139. package/build/processor/content.test.js +369 -0
  140. package/build/processor/content.test.js.map +1 -0
  141. package/build/processor/markdown.d.ts +11 -0
  142. package/build/processor/markdown.js +256 -0
  143. package/build/processor/markdown.js.map +1 -0
  144. package/build/processor/markdown.test.d.ts +1 -0
  145. package/build/processor/markdown.test.js +312 -0
  146. package/build/processor/markdown.test.js.map +1 -0
  147. package/build/processor/metadata-parser.d.ts +37 -0
  148. package/build/processor/metadata-parser.js +245 -0
  149. package/build/processor/metadata-parser.js.map +1 -0
  150. package/build/processor/metadata-parser.test.d.ts +1 -0
  151. package/build/processor/metadata-parser.test.js +357 -0
  152. package/build/processor/metadata-parser.test.js.map +1 -0
  153. package/build/processor/processor.d.ts +8 -0
  154. package/build/processor/processor.js +190 -0
  155. package/build/processor/processor.js.map +1 -0
  156. package/build/processor/processor.test.d.ts +1 -0
  157. package/build/processor/processor.test.js +357 -0
  158. package/build/processor/processor.test.js.map +1 -0
  159. package/build/rag/cache.d.ts +10 -0
  160. package/build/rag/cache.js +10 -0
  161. package/build/rag/cache.js.map +1 -0
  162. package/build/rag/code-generator.d.ts +11 -0
  163. package/build/rag/code-generator.js +30 -0
  164. package/build/rag/code-generator.js.map +1 -0
  165. package/build/rag/context-assembler.d.ts +23 -0
  166. package/build/rag/context-assembler.js +113 -0
  167. package/build/rag/context-assembler.js.map +1 -0
  168. package/build/rag/docs-search.d.ts +55 -0
  169. package/build/rag/docs-search.js +380 -0
  170. package/build/rag/docs-search.js.map +1 -0
  171. package/build/rag/pipeline.d.ts +26 -0
  172. package/build/rag/pipeline.js +91 -0
  173. package/build/rag/pipeline.js.map +1 -0
  174. package/build/rag/query-processor.d.ts +14 -0
  175. package/build/rag/query-processor.js +57 -0
  176. package/build/rag/query-processor.js.map +1 -0
  177. package/build/rag/reranker.d.ts +55 -0
  178. package/build/rag/reranker.js +210 -0
  179. package/build/rag/reranker.js.map +1 -0
  180. package/build/rag/response-generator.d.ts +20 -0
  181. package/build/rag/response-generator.js +101 -0
  182. package/build/rag/response-generator.js.map +1 -0
  183. package/build/rag/retriever.d.ts +19 -0
  184. package/build/rag/retriever.js +111 -0
  185. package/build/rag/retriever.js.map +1 -0
  186. package/build/rag/validator.d.ts +22 -0
  187. package/build/rag/validator.js +128 -0
  188. package/build/rag/validator.js.map +1 -0
  189. package/build/rag/version-manager.d.ts +23 -0
  190. package/build/rag/version-manager.js +98 -0
  191. package/build/rag/version-manager.js.map +1 -0
  192. package/build/setupTests.d.ts +4 -0
  193. package/build/setupTests.js +50 -0
  194. package/build/setupTests.js.map +1 -0
  195. package/build/storage/storage.d.ts +38 -0
  196. package/build/storage/storage.js +700 -0
  197. package/build/storage/storage.js.map +1 -0
  198. package/build/storage/storage.test.d.ts +1 -0
  199. package/build/storage/storage.test.js +338 -0
  200. package/build/storage/storage.test.js.map +1 -0
  201. package/build/types/rag.d.ts +27 -0
  202. package/build/types/rag.js +2 -0
  203. package/build/types/rag.js.map +1 -0
  204. package/build/types.d.ts +120 -0
  205. package/build/types.js +2 -0
  206. package/build/types.js.map +1 -0
  207. package/build/util/content-utils.d.ts +31 -0
  208. package/build/util/content-utils.js +120 -0
  209. package/build/util/content-utils.js.map +1 -0
  210. package/build/util/content.d.ts +1 -0
  211. package/build/util/content.js +16 -0
  212. package/build/util/content.js.map +1 -0
  213. package/build/util/docs.d.ts +1 -0
  214. package/build/util/docs.js +26 -0
  215. package/build/util/docs.js.map +1 -0
  216. package/build/util/docs.test.d.ts +1 -0
  217. package/build/util/docs.test.js +49 -0
  218. package/build/util/docs.test.js.map +1 -0
  219. package/build/util/favicon.d.ts +6 -0
  220. package/build/util/favicon.js +88 -0
  221. package/build/util/favicon.js.map +1 -0
  222. package/build/util/favicon.test.d.ts +1 -0
  223. package/build/util/favicon.test.js +140 -0
  224. package/build/util/favicon.test.js.map +1 -0
  225. package/build/util/logger.d.ts +17 -0
  226. package/build/util/logger.js +72 -0
  227. package/build/util/logger.js.map +1 -0
  228. package/build/util/logger.test.d.ts +1 -0
  229. package/build/util/logger.test.js +46 -0
  230. package/build/util/logger.test.js.map +1 -0
  231. package/build/util/security.d.ts +312 -0
  232. package/build/util/security.js +719 -0
  233. package/build/util/security.js.map +1 -0
  234. package/build/util/security.test.d.ts +1 -0
  235. package/build/util/security.test.js +524 -0
  236. package/build/util/security.test.js.map +1 -0
  237. package/build/util/site-detector.d.ts +22 -0
  238. package/build/util/site-detector.js +42 -0
  239. package/build/util/site-detector.js.map +1 -0
  240. package/package.json +112 -0
@@ -0,0 +1,138 @@
1
+ import { URL } from 'url';
2
+ import * as cheerio from 'cheerio';
3
+ import { BaseCrawler } from './base.js';
4
+ export class DefaultCrawler extends BaseCrawler {
5
+ BATCH_SIZE = 50;
6
+ FETCH_TIMEOUT = 30000; // 30 seconds
7
+ constructor(maxDepth = 4, maxRequestsPerCrawl = 1000, onProgress) {
8
+ super(maxDepth, maxRequestsPerCrawl, onProgress);
9
+ }
10
+ async *crawl(url) {
11
+ console.debug(`[${this.constructor.name}] Starting crawl from: ${url}`);
12
+ if (this.isAborting) {
13
+ console.debug('[DefaultCrawler] Crawl aborted');
14
+ return;
15
+ }
16
+ const startUrl = new URL(url);
17
+ const baseUrl = this.normalizeUrl(startUrl.toString());
18
+ // Track pages to process
19
+ const pagesToCrawl = new Map(); // URL -> depth
20
+ pagesToCrawl.set(baseUrl, 0);
21
+ while (pagesToCrawl.size > 0 && !this.isAborting) {
22
+ // Get batch of URLs to process
23
+ const batchEntries = Array.from(pagesToCrawl.entries()).slice(0, this.BATCH_SIZE);
24
+ const batch = new Map(batchEntries);
25
+ // Remove batch from queue
26
+ batchEntries.forEach(([url]) => pagesToCrawl.delete(url));
27
+ try {
28
+ // Process batch in parallel with timeout and rate limiting
29
+ const results = await Promise.all(Array.from(batch.entries()).map(async ([pageUrl]) => {
30
+ // Apply rate limiting
31
+ await this.rateLimit();
32
+ const result = await this.processPageWithRetry(pageUrl);
33
+ return { pageUrl, ...result };
34
+ }));
35
+ // Handle results
36
+ for (const { pageUrl, content, links, error } of results) {
37
+ if (error || !content || this.isAborting)
38
+ continue;
39
+ this.markUrlAsSeen(pageUrl);
40
+ yield {
41
+ url: pageUrl,
42
+ path: this.getPathFromUrl(pageUrl),
43
+ content,
44
+ title: this.extractTitle(content)
45
+ };
46
+ // Add new links to queue if within depth limit
47
+ const currentDepth = batch.get(pageUrl) || 0;
48
+ if (currentDepth < this.maxDepth) {
49
+ for (const link of links) {
50
+ const normalizedLink = this.normalizeUrl(link);
51
+ if (this.shouldCrawl(normalizedLink) && !pagesToCrawl.has(normalizedLink)) {
52
+ pagesToCrawl.set(normalizedLink, currentDepth + 1);
53
+ }
54
+ }
55
+ }
56
+ // Check if we've hit the request limit
57
+ if (this.seenUrls.size >= this.maxRequestsPerCrawl) {
58
+ console.debug('[DefaultCrawler] Max requests reached');
59
+ return;
60
+ }
61
+ }
62
+ // Add delay between batches
63
+ await new Promise(resolve => setTimeout(resolve, 1000));
64
+ }
65
+ catch (e) {
66
+ console.error('[DefaultCrawler] Error processing batch:', e);
67
+ }
68
+ }
69
+ console.debug('[DefaultCrawler] Crawl completed');
70
+ }
71
+ async processPageWithRetry(url) {
72
+ return this.retryWithBackoff(async () => {
73
+ try {
74
+ // Create fetch request with timeout
75
+ const controller = new AbortController();
76
+ const timeoutId = setTimeout(() => controller.abort(), this.FETCH_TIMEOUT);
77
+ const response = await fetch(url, {
78
+ signal: controller.signal,
79
+ headers: {
80
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
81
+ }
82
+ });
83
+ clearTimeout(timeoutId);
84
+ if (!response.ok) {
85
+ throw new Error(`HTTP error! status: ${response.status}`);
86
+ }
87
+ const content = await response.text();
88
+ const links = this.extractLinks(content, new URL(url));
89
+ return { content, links };
90
+ }
91
+ catch (e) {
92
+ if (e instanceof Error) {
93
+ return { content: null, links: [], error: e };
94
+ }
95
+ return { content: null, links: [], error: new Error('Unknown error occurred') };
96
+ }
97
+ });
98
+ }
99
+ extractLinks(html, baseUrl) {
100
+ try {
101
+ const $ = cheerio.load(html);
102
+ const links = new Set();
103
+ // Find all links, including those in navigation elements
104
+ $('a').each((_, element) => {
105
+ const href = $(element).attr('href');
106
+ if (!href)
107
+ return;
108
+ try {
109
+ const url = new URL(href, baseUrl);
110
+ const normalizedUrl = this.normalizeUrl(url.toString());
111
+ // Use BaseCrawler's URL validation
112
+ if (this.shouldCrawl(normalizedUrl)) {
113
+ links.add(normalizedUrl);
114
+ }
115
+ }
116
+ catch (e) {
117
+ console.debug(`[DefaultCrawler] Invalid URL ${href}:`, e);
118
+ }
119
+ });
120
+ return Array.from(links);
121
+ }
122
+ catch (e) {
123
+ console.error('[DefaultCrawler] Error extracting links:', e);
124
+ return [];
125
+ }
126
+ }
127
+ extractTitle(html) {
128
+ try {
129
+ const $ = cheerio.load(html);
130
+ return $('title').text().trim() || 'Untitled';
131
+ }
132
+ catch (e) {
133
+ console.error('[DefaultCrawler] Error extracting title:', e);
134
+ return 'Untitled';
135
+ }
136
+ }
137
+ }
138
+ //# sourceMappingURL=default.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"default.js","sourceRoot":"","sources":["../../src/crawler/default.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,GAAG,EAAE,MAAM,KAAK,CAAC;AAC1B,OAAO,KAAK,OAAO,MAAM,SAAS,CAAC;AAEnC,OAAO,EAAE,WAAW,EAAE,MAAM,WAAW,CAAC;AAExC,MAAM,OAAO,cAAe,SAAQ,WAAW;IAC5B,UAAU,GAAG,EAAE,CAAC;IAChB,aAAa,GAAG,KAAK,CAAC,CAAC,aAAa;IAErD,YACE,WAAmB,CAAC,EACpB,sBAA8B,IAAI,EAClC,UAA4D;QAE5D,KAAK,CAAC,QAAQ,EAAE,mBAAmB,EAAE,UAAU,CAAC,CAAC;IACnD,CAAC;IAED,KAAK,CAAC,CAAC,KAAK,CAAC,GAAW;QACtB,OAAO,CAAC,KAAK,CAAC,IAAI,IAAI,CAAC,WAAW,CAAC,IAAI,0BAA0B,GAAG,EAAE,CAAC,CAAC;QAExE,IAAI,IAAI,CAAC,UAAU,EAAE,CAAC;YACpB,OAAO,CAAC,KAAK,CAAC,gCAAgC,CAAC,CAAC;YAChD,OAAO;QACT,CAAC;QAED,MAAM,QAAQ,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC;QAC9B,MAAM,OAAO,GAAG,IAAI,CAAC,YAAY,CAAC,QAAQ,CAAC,QAAQ,EAAE,CAAC,CAAC;QAEvD,yBAAyB;QACzB,MAAM,YAAY,GAAG,IAAI,GAAG,EAAkB,CAAC,CAAC,eAAe;QAC/D,YAAY,CAAC,GAAG,CAAC,OAAO,EAAE,CAAC,CAAC,CAAC;QAE7B,OAAO,YAAY,CAAC,IAAI,GAAG,CAAC,IAAI,CAAC,IAAI,CAAC,UAAU,EAAE,CAAC;YACjD,+BAA+B;YAC/B,MAAM,YAAY,GAAG,KAAK,CAAC,IAAI,CAAC,YAAY,CAAC,OAAO,EAAE,CAAC,CAAC,KAAK,CAAC,CAAC,EAAE,IAAI,CAAC,UAAU,CAAC,CAAC;YAClF,MAAM,KAAK,GAAG,IAAI,GAAG,CAAC,YAAY,CAAC,CAAC;YAEpC,0BAA0B;YAC1B,YAAY,CAAC,OAAO,CAAC,CAAC,CAAC,GAAG,CAAC,EAAE,EAAE,CAAC,YAAY,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC;YAE1D,IAAI,CAAC;gBACH,2DAA2D;gBAC3D,MAAM,OAAO,GAAG,MAAM,OAAO,CAAC,GAAG,CAC/B,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,OAAO,EAAE,CAAC,CAAC,GAAG,CAAC,KAAK,EAAE,CAAC,OAAO,CAAC,EAAE,EAAE;oBAClD,sBAAsB;oBACtB,MAAM,IAAI,CAAC,SAAS,EAAE,CAAC;oBACvB,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,oBAAoB,CAAC,OAAO,CAAC,CAAC;oBACxD,OAAO,EAAE,OAAO,EAAE,GAAG,MAAM,EAAE,CAAC;gBAChC,CAAC,CAAC,CACH,CAAC;gBAEF,iBAAiB;gBACjB,KAAK,MAAM,EAAE,OAAO,EAAE,OAAO,EAAE,KAAK,EAAE,KAAK,EAAE,IAAI,OAAO,EAAE,CAAC;oBACzD,IAAI,KAAK,IAAI,CAAC,OAAO,IAAI,IAAI,CAAC,UAAU;wBAAE,SAAS;oBAEnD,IAAI,CAAC,aAAa,CAAC,OAAO,CAAC,CAAC;oBAE5B,MAAM;wBACJ,GAAG,EAAE,OAAO;wBACZ,IAAI,EAAE,IAAI,CAAC,cAAc,CAAC,OAAO,CAAC;wBAClC,OAAO;wBACP,KAAK,EAAE,IAAI,CAAC,YAAY,CAAC,OAAO,CAAC;qBAClC,CAAC;oBAEF,+CAA+C;oBAC/C,MAAM,YA
AY,GAAG,KAAK,CAAC,GAAG,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC;oBAC7C,IAAI,YAAY,GAAG,IAAI,CAAC,QAAQ,EAAE,CAAC;wBACjC,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;4BACzB,MAAM,cAAc,GAAG,IAAI,CAAC,YAAY,CAAC,IAAI,CAAC,CAAC;4BAC/C,IAAI,IAAI,CAAC,WAAW,CAAC,cAAc,CAAC,IAAI,CAAC,YAAY,CAAC,GAAG,CAAC,cAAc,CAAC,EAAE,CAAC;gCAC1E,YAAY,CAAC,GAAG,CAAC,cAAc,EAAE,YAAY,GAAG,CAAC,CAAC,CAAC;4BACrD,CAAC;wBACH,CAAC;oBACH,CAAC;oBAED,uCAAuC;oBACvC,IAAI,IAAI,CAAC,QAAQ,CAAC,IAAI,IAAI,IAAI,CAAC,mBAAmB,EAAE,CAAC;wBACnD,OAAO,CAAC,KAAK,CAAC,uCAAuC,CAAC,CAAC;wBACvD,OAAO;oBACT,CAAC;gBACH,CAAC;gBAED,4BAA4B;gBAC5B,MAAM,IAAI,OAAO,CAAC,OAAO,CAAC,EAAE,CAAC,UAAU,CAAC,OAAO,EAAE,IAAI,CAAC,CAAC,CAAC;YAC1D,CAAC;YAAC,OAAO,CAAC,EAAE,CAAC;gBACX,OAAO,CAAC,KAAK,CAAC,0CAA0C,EAAE,CAAC,CAAC,CAAC;YAC/D,CAAC;QACH,CAAC;QAED,OAAO,CAAC,KAAK,CAAC,kCAAkC,CAAC,CAAC;IACpD,CAAC;IAEO,KAAK,CAAC,oBAAoB,CAAC,GAAW;QAK5C,OAAO,IAAI,CAAC,gBAAgB,CAAC,KAAK,IAAI,EAAE;YACtC,IAAI,CAAC;gBACH,oCAAoC;gBACpC,MAAM,UAAU,GAAG,IAAI,eAAe,EAAE,CAAC;gBACzC,MAAM,SAAS,GAAG,UAAU,CAAC,GAAG,EAAE,CAAC,UAAU,CAAC,KAAK,EAAE,EAAE,IAAI,CAAC,aAAa,CAAC,CAAC;gBAE3E,MAAM,QAAQ,GAAG,MAAM,KAAK,CAAC,GAAG,EAAE;oBAChC,MAAM,EAAE,UAAU,CAAC,MAAM;oBACzB,OAAO,EAAE;wBACP,YAAY,EAAE,qHAAqH;qBACpI;iBACF,CAAC,CAAC;gBAEH,YAAY,CAAC,SAAS,CAAC,CAAC;gBAExB,IAAI,CAAC,QAAQ,CAAC,EAAE,EAAE,CAAC;oBACjB,MAAM,IAAI,KAAK,CAAC,uBAAuB,QAAQ,CAAC,MAAM,EAAE,CAAC,CAAC;gBAC5D,CAAC;gBAED,MAAM,OAAO,GAAG,MAAM,QAAQ,CAAC,IAAI,EAAE,CAAC;gBACtC,MAAM,KAAK,GAAG,IAAI,CAAC,YAAY,CAAC,OAAO,EAAE,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC,CAAC;gBAEvD,OAAO,EAAE,OAAO,EAAE,KAAK,EAAE,CAAC;YAC5B,CAAC;YAAC,OAAO,CAAC,EAAE,CAAC;gBACX,IAAI,CAAC,YAAY,KAAK,EAAE,CAAC;oBACvB,OAAO,EAAE,OAAO,EAAE,IAAI,EAAE,KAAK,EAAE,EAAE,EAAE,KAAK,EAAE,CAAC,EAAE,CAAC;gBAChD,CAAC;gBACD,OAAO,EAAE,OAAO,EAAE,IAAI,EAAE,KAAK,EAAE,EAAE,EAAE,KAAK,EAAE,IAAI,KAAK,CAAC,wBAAwB,CAAC,EAAE,CAAC;YAClF,CAAC;QACH,CAAC,CAAC,CAAC;IACL,CAAC;IAEO,YAAY,CAAC,IAAY,EAAE,OAAY;QAC7C,IAAI,CAAC;YACH,MAAM,CAAC,GAAG,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YAC7B,MAAM,KAAK,GAAG,IAAI,GAAG,EAAU,CAAC;YAEhC,yDAAyD;YACzD,
CAAC,CAAC,GAAG,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,OAAO,EAAE,EAAE;gBACzB,MAAM,IAAI,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;gBACrC,IAAI,CAAC,IAAI;oBAAE,OAAO;gBAElB,IAAI,CAAC;oBACH,MAAM,GAAG,GAAG,IAAI,GAAG,CAAC,IAAI,EAAE,OAAO,CAAC,CAAC;oBACnC,MAAM,aAAa,GAAG,IAAI,CAAC,YAAY,CAAC,GAAG,CAAC,QAAQ,EAAE,CAAC,CAAC;oBAExD,mCAAmC;oBACnC,IAAI,IAAI,CAAC,WAAW,CAAC,aAAa,CAAC,EAAE,CAAC;wBACpC,KAAK,CAAC,GAAG,CAAC,aAAa,CAAC,CAAC;oBAC3B,CAAC;gBACH,CAAC;gBAAC,OAAO,CAAC,EAAE,CAAC;oBACX,OAAO,CAAC,KAAK,CAAC,gCAAgC,IAAI,GAAG,EAAE,CAAC,CAAC,CAAC;gBAC5D,CAAC;YACH,CAAC,CAAC,CAAC;YAEH,OAAO,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;QAC3B,CAAC;QAAC,OAAO,CAAC,EAAE,CAAC;YACX,OAAO,CAAC,KAAK,CAAC,0CAA0C,EAAE,CAAC,CAAC,CAAC;YAC7D,OAAO,EAAE,CAAC;QACZ,CAAC;IACH,CAAC;IAEO,YAAY,CAAC,IAAY;QAC/B,IAAI,CAAC;YACH,MAAM,CAAC,GAAG,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YAC7B,OAAO,CAAC,CAAC,OAAO,CAAC,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,IAAI,UAAU,CAAC;QAChD,CAAC;QAAC,OAAO,CAAC,EAAE,CAAC;YACX,OAAO,CAAC,KAAK,CAAC,0CAA0C,EAAE,CAAC,CAAC,CAAC;YAC7D,OAAO,UAAU,CAAC;QACpB,CAAC;IACH,CAAC;CACF"}
@@ -0,0 +1,26 @@
1
+ import { CrawlResult, DocsCrawlerType, WebCrawler } from '../types.js';
2
+ import { StorageState } from './crawlee-crawler.js';
3
+ export declare class DocsCrawler implements WebCrawler {
4
+ private readonly maxDepth;
5
+ private readonly maxRequestsPerCrawl;
6
+ private readonly githubToken?;
7
+ private readonly onProgress?;
8
+ private readonly GITHUB_HOST;
9
+ private readonly MIN_PAGES;
10
+ private isAborting;
11
+ private storageState?;
12
+ private pathPrefix?;
13
+ constructor(maxDepth?: number, maxRequestsPerCrawl?: number, githubToken?: string | undefined, onProgress?: ((progress: number, description: string) => void) | undefined);
14
+ /**
15
+ * Set an optional path prefix to restrict crawling to URLs under this path.
16
+ * Only pages whose path starts with this prefix will be crawled.
17
+ * Example: '/oss/javascript/langchain' would only crawl pages under that path.
18
+ */
19
+ setPathPrefix(prefix: string): void;
20
+ /**
21
+ * Set authentication storage state (cookies) to use when crawling
22
+ */
23
+ setStorageState(state: StorageState): void;
24
+ crawl(url: string): AsyncGenerator<CrawlResult, DocsCrawlerType>;
25
+ abort(): void;
26
+ }
@@ -0,0 +1,97 @@
1
+ import { URL } from 'url';
2
+ import { CrawleeCrawler } from './crawlee-crawler.js';
3
+ import { GitHubCrawler } from './github.js';
4
+ import { logger } from '../util/logger.js';
5
+ export class DocsCrawler {
6
+ maxDepth;
7
+ maxRequestsPerCrawl;
8
+ githubToken;
9
+ onProgress;
10
+ GITHUB_HOST = 'github.com';
11
+ MIN_PAGES = 2; // Require at least 2 pages for component libraries
12
+ isAborting = false;
13
+ storageState;
14
+ pathPrefix;
15
+ constructor(maxDepth = 4, maxRequestsPerCrawl = 1000, githubToken, onProgress) {
16
+ this.maxDepth = maxDepth;
17
+ this.maxRequestsPerCrawl = maxRequestsPerCrawl;
18
+ this.githubToken = githubToken;
19
+ this.onProgress = onProgress;
20
+ }
21
+ /**
22
+ * Set an optional path prefix to restrict crawling to URLs under this path.
23
+ * Only pages whose path starts with this prefix will be crawled.
24
+ * Example: '/oss/javascript/langchain' would only crawl pages under that path.
25
+ */
26
+ setPathPrefix(prefix) {
27
+ this.pathPrefix = prefix;
28
+ logger.info(`[DocsCrawler] Path prefix restriction set: ${prefix}`);
29
+ }
30
+ /**
31
+ * Set authentication storage state (cookies) to use when crawling
32
+ */
33
+ setStorageState(state) {
34
+ this.storageState = state;
35
+ logger.info(`[DocsCrawler] Set storage state with ${state.cookies?.length || 0} cookies`);
36
+ }
37
+ async *crawl(url) {
38
+ const startUrl = new URL(url);
39
+ logger.debug(`[DocsCrawler] Starting crawl of ${startUrl}`);
40
+ if (this.isAborting) {
41
+ logger.debug('[DocsCrawler] Crawl aborted');
42
+ return 'crawlee';
43
+ }
44
+ // Handle GitHub repositories
45
+ if (startUrl.host === this.GITHUB_HOST) {
46
+ logger.debug('[DocsCrawler] Detected GitHub repository');
47
+ const githubCrawler = new GitHubCrawler(this.maxDepth, this.maxRequestsPerCrawl, this.githubToken, this.onProgress);
48
+ try {
49
+ for await (const page of githubCrawler.crawl(url)) {
50
+ if (this.isAborting)
51
+ break;
52
+ yield page;
53
+ }
54
+ return 'github';
55
+ }
56
+ catch (e) {
57
+ logger.debug('[DocsCrawler] GitHub crawler failed:', e);
58
+ // Don't fall through to other crawlers for GitHub URLs
59
+ throw e;
60
+ }
61
+ }
62
+ // Use Crawlee for all other sites
63
+ logger.debug('[DocsCrawler] Using Crawlee crawler');
64
+ const crawleeCrawler = new CrawleeCrawler(this.maxDepth, this.maxRequestsPerCrawl, this.onProgress);
65
+ // Pass authentication if available
66
+ if (this.storageState) {
67
+ crawleeCrawler.setStorageState(this.storageState);
68
+ }
69
+ // Pass path prefix restriction if configured
70
+ if (this.pathPrefix) {
71
+ crawleeCrawler.setPathPrefix(this.pathPrefix);
72
+ }
73
+ let pageCount = 0;
74
+ try {
75
+ for await (const page of crawleeCrawler.crawl(url)) {
76
+ if (this.isAborting)
77
+ break;
78
+ pageCount++;
79
+ yield page;
80
+ }
81
+ if (pageCount >= this.MIN_PAGES) {
82
+ logger.debug(`[DocsCrawler] Crawlee crawler successful (${pageCount} pages)`);
83
+ return 'crawlee';
84
+ }
85
+ logger.debug(`[DocsCrawler] Crawlee crawler found insufficient pages (${pageCount})`);
86
+ throw new Error(`Crawlee crawler found only ${pageCount} pages, need at least ${this.MIN_PAGES}`);
87
+ }
88
+ catch (e) {
89
+ logger.debug('[DocsCrawler] Crawlee crawler failed:', e);
90
+ throw e;
91
+ }
92
+ }
93
+ abort() {
94
+ this.isAborting = true;
95
+ }
96
+ }
97
+ //# sourceMappingURL=docs-crawler.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"docs-crawler.js","sourceRoot":"","sources":["../../src/crawler/docs-crawler.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,GAAG,EAAE,MAAM,KAAK,CAAC;AAE1B,OAAO,EAAE,cAAc,EAAgB,MAAM,sBAAsB,CAAC;AACpE,OAAO,EAAE,aAAa,EAAE,MAAM,aAAa,CAAC;AAC5C,OAAO,EAAE,MAAM,EAAE,MAAM,mBAAmB,CAAC;AAE3C,MAAM,OAAO,WAAW;IAQH;IACA;IACA;IACA;IAVF,WAAW,GAAG,YAAY,CAAC;IAC3B,SAAS,GAAG,CAAC,CAAC,CAAC,mDAAmD;IAC3E,UAAU,GAAG,KAAK,CAAC;IACnB,YAAY,CAAgB;IAC5B,UAAU,CAAU;IAE5B,YACmB,WAAmB,CAAC,EACpB,sBAA8B,IAAI,EAClC,WAAoB,EACpB,UAA4D;QAH5D,aAAQ,GAAR,QAAQ,CAAY;QACpB,wBAAmB,GAAnB,mBAAmB,CAAe;QAClC,gBAAW,GAAX,WAAW,CAAS;QACpB,eAAU,GAAV,UAAU,CAAkD;IAC5E,CAAC;IAEJ;;;;OAIG;IACH,aAAa,CAAC,MAAc;QAC1B,IAAI,CAAC,UAAU,GAAG,MAAM,CAAC;QACzB,MAAM,CAAC,IAAI,CAAC,8CAA8C,MAAM,EAAE,CAAC,CAAC;IACtE,CAAC;IAED;;OAEG;IACH,eAAe,CAAC,KAAmB;QACjC,IAAI,CAAC,YAAY,GAAG,KAAK,CAAC;QAC1B,MAAM,CAAC,IAAI,CAAC,wCAAwC,KAAK,CAAC,OAAO,EAAE,MAAM,IAAI,CAAC,UAAU,CAAC,CAAC;IAC5F,CAAC;IAED,KAAK,CAAC,CAAC,KAAK,CAAC,GAAW;QACtB,MAAM,QAAQ,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC;QAC9B,MAAM,CAAC,KAAK,CAAC,mCAAmC,QAAQ,EAAE,CAAC,CAAC;QAE5D,IAAI,IAAI,CAAC,UAAU,EAAE,CAAC;YACpB,MAAM,CAAC,KAAK,CAAC,6BAA6B,CAAC,CAAC;YAC5C,OAAO,SAAS,CAAC;QACnB,CAAC;QAED,6BAA6B;QAC7B,IAAI,QAAQ,CAAC,IAAI,KAAK,IAAI,CAAC,WAAW,EAAE,CAAC;YACvC,MAAM,CAAC,KAAK,CAAC,0CAA0C,CAAC,CAAC;YACzD,MAAM,aAAa,GAAG,IAAI,aAAa,CAAC,IAAI,CAAC,QAAQ,EAAE,IAAI,CAAC,mBAAmB,EAAE,IAAI,CAAC,WAAW,EAAE,IAAI,CAAC,UAAU,CAAC,CAAC;YAEpH,IAAI,CAAC;gBACH,IAAI,KAAK,EAAE,MAAM,IAAI,IAAI,aAAa,CAAC,KAAK,CAAC,GAAG,CAAC,EAAE,CAAC;oBAClD,IAAI,IAAI,CAAC,UAAU;wBAAE,MAAM;oBAC3B,MAAM,IAAI,CAAC;gBACb,CAAC;gBACD,OAAO,QAAQ,CAAC;YAClB,CAAC;YAAC,OAAO,CAAC,EAAE,CAAC;gBACX,MAAM,CAAC,KAAK,CAAC,sCAAsC,EAAE,CAAC,CAAC,CAAC;gBACxD,uDAAuD;gBACvD,MAAM,CAAC,CAAC;YACV,CAAC;QACH,CAAC;QAED,kCAAkC;QAClC,MAAM,CAAC,KAAK,CAAC,qCAAqC,CAAC,CAAC;QACpD,MAAM,cAAc,GAAG,IAAI,cAAc,CAAC,IAAI,CAAC,QAAQ,EAAE,IAAI,CAAC,mBAAmB,EAAE,IAAI,CAAC,UAAU,CAAC,CAAC;QAEpG,mCAAmC;QACnC,IAAI,IAAI,CAAC,YAAY,EAAE,CAAC;YACtB,cAAc,CAAC,eAAe,CAAC,IAAI,CAAC,YAAY
,CAAC,CAAC;QACpD,CAAC;QAED,6CAA6C;QAC7C,IAAI,IAAI,CAAC,UAAU,EAAE,CAAC;YACpB,cAAc,CAAC,aAAa,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC;QAChD,CAAC;QAED,IAAI,SAAS,GAAG,CAAC,CAAC;QAElB,IAAI,CAAC;YACH,IAAI,KAAK,EAAE,MAAM,IAAI,IAAI,cAAc,CAAC,KAAK,CAAC,GAAG,CAAC,EAAE,CAAC;gBACnD,IAAI,IAAI,CAAC,UAAU;oBAAE,MAAM;gBAC3B,SAAS,EAAE,CAAC;gBACZ,MAAM,IAAI,CAAC;YACb,CAAC;YAED,IAAI,SAAS,IAAI,IAAI,CAAC,SAAS,EAAE,CAAC;gBAChC,MAAM,CAAC,KAAK,CAAC,6CAA6C,SAAS,SAAS,CAAC,CAAC;gBAC9E,OAAO,SAAS,CAAC;YACnB,CAAC;YACD,MAAM,CAAC,KAAK,CAAC,2DAA2D,SAAS,GAAG,CAAC,CAAC;YACtF,MAAM,IAAI,KAAK,CAAC,8BAA8B,SAAS,yBAAyB,IAAI,CAAC,SAAS,EAAE,CAAC,CAAC;QACpG,CAAC;QAAC,OAAO,CAAC,EAAE,CAAC;YACX,MAAM,CAAC,KAAK,CAAC,uCAAuC,EAAE,CAAC,CAAC,CAAC;YACzD,MAAM,CAAC,CAAC;QACV,CAAC;IACH,CAAC;IAED,KAAK;QACH,IAAI,CAAC,UAAU,GAAG,IAAI,CAAC;IACzB,CAAC;CACF"}
@@ -0,0 +1 @@
1
+ export {};
@@ -0,0 +1,185 @@
1
+ import { DocsCrawler } from './docs-crawler.js';
2
+ const mockGitHubCrawl = vi.fn();
3
+ vi.mock('./github.js', () => ({
4
+ GitHubCrawler: function () {
5
+ return {
6
+ crawl: mockGitHubCrawl,
7
+ };
8
+ },
9
+ }));
10
+ const mockCrawleeCrawl = vi.fn();
11
+ const mockSetStorageState = vi.fn();
12
+ vi.mock('./crawlee-crawler.js', () => ({
13
+ CrawleeCrawler: function () {
14
+ return {
15
+ crawl: mockCrawleeCrawl,
16
+ setStorageState: mockSetStorageState,
17
+ };
18
+ },
19
+ }));
20
+ describe('DocsCrawler', () => {
21
+ let crawler;
22
+ beforeEach(() => {
23
+ vi.clearAllMocks();
24
+ crawler = new DocsCrawler();
25
+ });
26
+ describe('constructor', () => {
27
+ it('should initialize with default values', () => {
28
+ expect(crawler).toBeDefined();
29
+ });
30
+ it('should accept custom parameters', () => {
31
+ const customCrawler = new DocsCrawler(10, 500, 'github_token', vi.fn());
32
+ expect(customCrawler).toBeDefined();
33
+ });
34
+ });
35
+ describe('crawl', () => {
36
+ describe('GitHub URLs', () => {
37
+ it('should use GitHubCrawler for github.com URLs', async () => {
38
+ const mockResults = [
39
+ { url: 'https://github.com/owner/repo/README.md', path: 'README.md', content: '# README', title: 'README' },
40
+ ];
41
+ mockGitHubCrawl.mockImplementation(async function* () {
42
+ for (const result of mockResults) {
43
+ yield result;
44
+ }
45
+ });
46
+ const results = [];
47
+ const generator = crawler.crawl('https://github.com/owner/repo');
48
+ for await (const result of generator) {
49
+ results.push(result);
50
+ }
51
+ expect(results).toHaveLength(1);
52
+ expect(results[0].url).toContain('github.com');
53
+ });
54
+ it('should return github type for GitHub URLs', async () => {
55
+ mockGitHubCrawl.mockImplementation(async function* () {
56
+ yield { url: 'https://github.com/owner/repo', path: '/', content: 'test', title: 'Test' };
57
+ });
58
+ const generator = crawler.crawl('https://github.com/owner/repo');
59
+ // Manually iterate to capture the return value
60
+ let result = await generator.next();
61
+ while (!result.done) {
62
+ result = await generator.next();
63
+ }
64
+ const crawlerType = result.value;
65
+ expect(crawlerType).toBe('github');
66
+ });
67
+ it('should propagate errors from GitHubCrawler', async () => {
68
+ // eslint-disable-next-line require-yield
69
+ mockGitHubCrawl.mockImplementation(async function* () {
70
+ throw new Error('GitHub API error');
71
+ });
72
+ const generator = crawler.crawl('https://github.com/owner/repo');
73
+ await expect(async () => {
74
+ // eslint-disable-next-line @typescript-eslint/no-unused-vars
75
+ for await (const _ of generator) {
76
+ // Just consume results
77
+ }
78
+ }).rejects.toThrow('GitHub API error');
79
+ });
80
+ });
81
+ describe('Non-GitHub URLs', () => {
82
+ it('should use CrawleeCrawler for non-GitHub URLs', async () => {
83
+ const mockResults = [
84
+ { url: 'https://docs.example.com/guide', path: '/guide', content: '<h1>Guide</h1>', title: 'Guide' },
85
+ { url: 'https://docs.example.com/api', path: '/api', content: '<h1>API</h1>', title: 'API' },
86
+ ];
87
+ mockCrawleeCrawl.mockImplementation(async function* () {
88
+ for (const result of mockResults) {
89
+ yield result;
90
+ }
91
+ });
92
+ const results = [];
93
+ for await (const result of crawler.crawl('https://docs.example.com')) {
94
+ results.push(result);
95
+ }
96
+ expect(results).toHaveLength(2);
97
+ });
98
+ it('should return crawlee type for sufficient pages', async () => {
99
+ mockCrawleeCrawl.mockImplementation(async function* () {
100
+ yield { url: 'https://example.com/page1', path: '/page1', content: 'Page 1', title: 'Page 1' };
101
+ yield { url: 'https://example.com/page2', path: '/page2', content: 'Page 2', title: 'Page 2' };
102
+ });
103
+ const generator = crawler.crawl('https://example.com');
104
+ // Manually iterate to capture the return value
105
+ let result = await generator.next();
106
+ while (!result.done) {
107
+ result = await generator.next();
108
+ }
109
+ const crawlerType = result.value;
110
+ expect(crawlerType).toBe('crawlee');
111
+ });
112
+ it('should throw error when insufficient pages found', async () => {
113
+ mockCrawleeCrawl.mockImplementation(async function* () {
114
+ yield { url: 'https://example.com/page1', path: '/page1', content: 'Page 1', title: 'Page 1' };
115
+ // Only 1 page, needs at least 2
116
+ });
117
+ const generator = crawler.crawl('https://example.com');
118
+ await expect(async () => {
119
+ // eslint-disable-next-line @typescript-eslint/no-unused-vars
120
+ for await (const _ of generator) {
121
+ // Just consume results
122
+ }
123
+ }).rejects.toThrow(/found only 1 pages/);
124
+ });
125
+ });
126
+ describe('abort', () => {
127
+ it('should stop crawling when aborted', async () => {
128
+ mockCrawleeCrawl.mockImplementation(async function* () {
129
+ yield { url: 'https://example.com/page1', path: '/page1', content: 'Page 1', title: 'Page 1' };
130
+ yield { url: 'https://example.com/page2', path: '/page2', content: 'Page 2', title: 'Page 2' };
131
+ yield { url: 'https://example.com/page3', path: '/page3', content: 'Page 3', title: 'Page 3' };
132
+ });
133
+ const results = [];
134
+ const generator = crawler.crawl('https://example.com');
135
+ // Get first result
136
+ const first = await generator.next();
137
+ if (!first.done) {
138
+ results.push(first.value);
139
+ }
140
+ // Abort
141
+ crawler.abort();
142
+ // Generator should stop yielding after abort (depending on implementation)
143
+ // The test verifies abort() method exists and is callable
144
+ expect(results).toHaveLength(1);
145
+ });
146
+ it('should return early when already aborting', async () => {
147
+ crawler.abort();
148
+ mockCrawleeCrawl.mockImplementation(async function* () {
149
+ yield { url: 'https://example.com/page1', path: '/page1', content: 'Page 1', title: 'Page 1' };
150
+ });
151
+ const generator = crawler.crawl('https://example.com');
152
+ const result = await generator.next();
153
+ // Should return immediately with crawlee type when aborted
154
+ expect(result.done).toBe(true);
155
+ expect(result.value).toBe('crawlee');
156
+ });
157
+ });
158
+ describe('setStorageState', () => {
159
+ it('should set storage state', () => {
160
+ const storageState = {
161
+ cookies: [{ name: 'session', value: 'abc123', domain: 'example.com', path: '/' }],
162
+ };
163
+ crawler.setStorageState(storageState);
164
+ // Verify the method doesn't throw
165
+ expect(true).toBe(true);
166
+ });
167
+ it('should pass storage state to CrawleeCrawler', async () => {
168
+ const storageState = {
169
+ cookies: [{ name: 'session', value: 'abc123', domain: 'example.com', path: '/' }],
170
+ };
171
+ crawler.setStorageState(storageState);
172
+ mockCrawleeCrawl.mockImplementation(async function* () {
173
+ yield { url: 'https://example.com/page1', path: '/page1', content: 'Page 1', title: 'Page 1' };
174
+ yield { url: 'https://example.com/page2', path: '/page2', content: 'Page 2', title: 'Page 2' };
175
+ });
176
+ // eslint-disable-next-line @typescript-eslint/no-unused-vars
177
+ for await (const _ of crawler.crawl('https://example.com')) {
178
+ // Just consume results
179
+ }
180
+ expect(mockSetStorageState).toHaveBeenCalledWith(storageState);
181
+ });
182
+ });
183
+ });
184
+ });
185
+ //# sourceMappingURL=docs-crawler.test.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"docs-crawler.test.js","sourceRoot":"","sources":["../../src/crawler/docs-crawler.test.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,WAAW,EAAE,MAAM,mBAAmB,CAAC;AAEhD,MAAM,eAAe,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC;AAChC,EAAE,CAAC,IAAI,CAAC,aAAa,EAAE,GAAG,EAAE,CAAC,CAAC;IAC5B,aAAa,EAAE;QACb,OAAO;YACL,KAAK,EAAE,eAAe;SACvB,CAAC;IACJ,CAAC;CACF,CAAC,CAAC,CAAC;AAEJ,MAAM,gBAAgB,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC;AACjC,MAAM,mBAAmB,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC;AACpC,EAAE,CAAC,IAAI,CAAC,sBAAsB,EAAE,GAAG,EAAE,CAAC,CAAC;IACrC,cAAc,EAAE;QACd,OAAO;YACL,KAAK,EAAE,gBAAgB;YACvB,eAAe,EAAE,mBAAmB;SACrC,CAAC;IACJ,CAAC;CACF,CAAC,CAAC,CAAC;AAEJ,QAAQ,CAAC,aAAa,EAAE,GAAG,EAAE;IAC3B,IAAI,OAAoB,CAAC;IAEzB,UAAU,CAAC,GAAG,EAAE;QACd,EAAE,CAAC,aAAa,EAAE,CAAC;QACnB,OAAO,GAAG,IAAI,WAAW,EAAE,CAAC;IAC9B,CAAC,CAAC,CAAC;IAEH,QAAQ,CAAC,aAAa,EAAE,GAAG,EAAE;QAC3B,EAAE,CAAC,uCAAuC,EAAE,GAAG,EAAE;YAC/C,MAAM,CAAC,OAAO,CAAC,CAAC,WAAW,EAAE,CAAC;QAChC,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,iCAAiC,EAAE,GAAG,EAAE;YACzC,MAAM,aAAa,GAAG,IAAI,WAAW,CAAC,EAAE,EAAE,GAAG,EAAE,cAAc,EAAE,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC;YACxE,MAAM,CAAC,aAAa,CAAC,CAAC,WAAW,EAAE,CAAC;QACtC,CAAC,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;IAEH,QAAQ,CAAC,OAAO,EAAE,GAAG,EAAE;QACrB,QAAQ,CAAC,aAAa,EAAE,GAAG,EAAE;YAC3B,EAAE,CAAC,8CAA8C,EAAE,KAAK,IAAI,EAAE;gBAC5D,MAAM,WAAW,GAAkB;oBACjC,EAAE,GAAG,EAAE,yCAAyC,EAAE,IAAI,EAAE,WAAW,EAAE,OAAO,EAAE,UAAU,EAAE,KAAK,EAAE,QAAQ,EAAE;iBAC5G,CAAC;gBAEF,eAAe,CAAC,kBAAkB,CAAC,KAAK,SAAS,CAAC;oBAChD,KAAK,MAAM,MAAM,IAAI,WAAW,EAAE,CAAC;wBACjC,MAAM,MAAM,CAAC;oBACf,CAAC;gBACH,CAAC,CAAC,CAAC;gBAEH,MAAM,OAAO,GAAkB,EAAE,CAAC;gBAClC,MAAM,SAAS,GAAG,OAAO,CAAC,KAAK,CAAC,+BAA+B,CAAC,CAAC;gBAEjE,IAAI,KAAK,EAAE,MAAM,MAAM,IAAI,SAAS,EAAE,CAAC;oBACrC,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;gBACvB,CAAC;gBAED,MAAM,CAAC,OAAO,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC;gBAChC,MAAM,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,SAAS,CAAC,YAAY,CAAC,CAAC;YACjD,CAAC,CAAC,CAAC;YAEH,EAAE,CAAC,2CAA2C,EAAE,KAAK,IAAI,EAAE;gBACzD,eAAe,CAAC,kBAAkB,CAAC,KAAK,SAAS,CAAC;oBAChD,MAAM,E
AAE,GAAG,EAAE,+BAA+B,EAAE,IAAI,EAAE,GAAG,EAAE,OAAO,EAAE,MAAM,EAAE,KAAK,EAAE,MAAM,EAAE,CAAC;gBAC5F,CAAC,CAAC,CAAC;gBAEH,MAAM,SAAS,GAAG,OAAO,CAAC,KAAK,CAAC,+BAA+B,CAAC,CAAC;gBAEjE,+CAA+C;gBAC/C,IAAI,MAAM,GAAG,MAAM,SAAS,CAAC,IAAI,EAAE,CAAC;gBACpC,OAAO,CAAC,MAAM,CAAC,IAAI,EAAE,CAAC;oBACpB,MAAM,GAAG,MAAM,SAAS,CAAC,IAAI,EAAE,CAAC;gBAClC,CAAC;gBACD,MAAM,WAAW,GAAG,MAAM,CAAC,KAAK,CAAC;gBAEjC,MAAM,CAAC,WAAW,CAAC,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC;YACrC,CAAC,CAAC,CAAC;YAEH,EAAE,CAAC,4CAA4C,EAAE,KAAK,IAAI,EAAE;gBAC1D,yCAAyC;gBACzC,eAAe,CAAC,kBAAkB,CAAC,KAAK,SAAS,CAAC;oBAChD,MAAM,IAAI,KAAK,CAAC,kBAAkB,CAAC,CAAC;gBACtC,CAAC,CAAC,CAAC;gBAEH,MAAM,SAAS,GAAG,OAAO,CAAC,KAAK,CAAC,+BAA+B,CAAC,CAAC;gBAEjE,MAAM,MAAM,CAAC,KAAK,IAAI,EAAE;oBACtB,6DAA6D;oBAC7D,IAAI,KAAK,EAAE,MAAM,CAAC,IAAI,SAAS,EAAE,CAAC;wBAChC,uBAAuB;oBACzB,CAAC;gBACH,CAAC,CAAC,CAAC,OAAO,CAAC,OAAO,CAAC,kBAAkB,CAAC,CAAC;YACzC,CAAC,CAAC,CAAC;QACL,CAAC,CAAC,CAAC;QAEH,QAAQ,CAAC,iBAAiB,EAAE,GAAG,EAAE;YAC/B,EAAE,CAAC,+CAA+C,EAAE,KAAK,IAAI,EAAE;gBAC7D,MAAM,WAAW,GAAkB;oBACjC,EAAE,GAAG,EAAE,gCAAgC,EAAE,IAAI,EAAE,QAAQ,EAAE,OAAO,EAAE,gBAAgB,EAAE,KAAK,EAAE,OAAO,EAAE;oBACpG,EAAE,GAAG,EAAE,8BAA8B,EAAE,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,cAAc,EAAE,KAAK,EAAE,KAAK,EAAE;iBAC7F,CAAC;gBAEF,gBAAgB,CAAC,kBAAkB,CAAC,KAAK,SAAS,CAAC;oBACjD,KAAK,MAAM,MAAM,IAAI,WAAW,EAAE,CAAC;wBACjC,MAAM,MAAM,CAAC;oBACf,CAAC;gBACH,CAAC,CAAC,CAAC;gBAEH,MAAM,OAAO,GAAkB,EAAE,CAAC;gBAClC,IAAI,KAAK,EAAE,MAAM,MAAM,IAAI,OAAO,CAAC,KAAK,CAAC,0BAA0B,CAAC,EAAE,CAAC;oBACrE,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;gBACvB,CAAC;gBAED,MAAM,CAAC,OAAO,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC;YAClC,CAAC,CAAC,CAAC;YAEH,EAAE,CAAC,iDAAiD,EAAE,KAAK,IAAI,EAAE;gBAC/D,gBAAgB,CAAC,kBAAkB,CAAC,KAAK,SAAS,CAAC;oBACjD,MAAM,EAAE,GAAG,EAAE,2BAA2B,EAAE,IAAI,EAAE,QAAQ,EAAE,OAAO,EAAE,QAAQ,EAAE,KAAK,EAAE,QAAQ,EAAE,CAAC;oBAC/F,MAAM,EAAE,GAAG,EAAE,2BAA2B,EAAE,IAAI,EAAE,QAAQ,EAAE,OAAO,EAAE,QAAQ,EAAE,KAAK,EAAE,QAAQ,EAAE,CAAC;gBACjG,CAAC,CAAC,CAAC;gBAEH,MAAM,SAAS,GAAG,OAAO,CAAC,KAAK,CAAC,qBAAqB,CAAC,CAAC;gBAEvD,+CAA+C;gBAC/C,IAAI,M
AAM,GAAG,MAAM,SAAS,CAAC,IAAI,EAAE,CAAC;gBACpC,OAAO,CAAC,MAAM,CAAC,IAAI,EAAE,CAAC;oBACpB,MAAM,GAAG,MAAM,SAAS,CAAC,IAAI,EAAE,CAAC;gBAClC,CAAC;gBACD,MAAM,WAAW,GAAG,MAAM,CAAC,KAAK,CAAC;gBAEjC,MAAM,CAAC,WAAW,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;YACtC,CAAC,CAAC,CAAC;YAEH,EAAE,CAAC,kDAAkD,EAAE,KAAK,IAAI,EAAE;gBAChE,gBAAgB,CAAC,kBAAkB,CAAC,KAAK,SAAS,CAAC;oBACjD,MAAM,EAAE,GAAG,EAAE,2BAA2B,EAAE,IAAI,EAAE,QAAQ,EAAE,OAAO,EAAE,QAAQ,EAAE,KAAK,EAAE,QAAQ,EAAE,CAAC;oBAC/F,gCAAgC;gBAClC,CAAC,CAAC,CAAC;gBAEH,MAAM,SAAS,GAAG,OAAO,CAAC,KAAK,CAAC,qBAAqB,CAAC,CAAC;gBAEvD,MAAM,MAAM,CAAC,KAAK,IAAI,EAAE;oBACtB,6DAA6D;oBAC7D,IAAI,KAAK,EAAE,MAAM,CAAC,IAAI,SAAS,EAAE,CAAC;wBAChC,uBAAuB;oBACzB,CAAC;gBACH,CAAC,CAAC,CAAC,OAAO,CAAC,OAAO,CAAC,oBAAoB,CAAC,CAAC;YAC3C,CAAC,CAAC,CAAC;QACL,CAAC,CAAC,CAAC;QAEH,QAAQ,CAAC,OAAO,EAAE,GAAG,EAAE;YACrB,EAAE,CAAC,mCAAmC,EAAE,KAAK,IAAI,EAAE;gBACjD,gBAAgB,CAAC,kBAAkB,CAAC,KAAK,SAAS,CAAC;oBACjD,MAAM,EAAE,GAAG,EAAE,2BAA2B,EAAE,IAAI,EAAE,QAAQ,EAAE,OAAO,EAAE,QAAQ,EAAE,KAAK,EAAE,QAAQ,EAAE,CAAC;oBAC/F,MAAM,EAAE,GAAG,EAAE,2BAA2B,EAAE,IAAI,EAAE,QAAQ,EAAE,OAAO,EAAE,QAAQ,EAAE,KAAK,EAAE,QAAQ,EAAE,CAAC;oBAC/F,MAAM,EAAE,GAAG,EAAE,2BAA2B,EAAE,IAAI,EAAE,QAAQ,EAAE,OAAO,EAAE,QAAQ,EAAE,KAAK,EAAE,QAAQ,EAAE,CAAC;gBACjG,CAAC,CAAC,CAAC;gBAEH,MAAM,OAAO,GAAkB,EAAE,CAAC;gBAClC,MAAM,SAAS,GAAG,OAAO,CAAC,KAAK,CAAC,qBAAqB,CAAC,CAAC;gBAEvD,mBAAmB;gBACnB,MAAM,KAAK,GAAG,MAAM,SAAS,CAAC,IAAI,EAAE,CAAC;gBACrC,IAAI,CAAC,KAAK,CAAC,IAAI,EAAE,CAAC;oBAChB,OAAO,CAAC,IAAI,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC;gBAC5B,CAAC;gBAED,QAAQ;gBACR,OAAO,CAAC,KAAK,EAAE,CAAC;gBAEhB,2EAA2E;gBAC3E,0DAA0D;gBAC1D,MAAM,CAAC,OAAO,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC;YAClC,CAAC,CAAC,CAAC;YAEH,EAAE,CAAC,2CAA2C,EAAE,KAAK,IAAI,EAAE;gBACzD,OAAO,CAAC,KAAK,EAAE,CAAC;gBAEhB,gBAAgB,CAAC,kBAAkB,CAAC,KAAK,SAAS,CAAC;oBACjD,MAAM,EAAE,GAAG,EAAE,2BAA2B,EAAE,IAAI,EAAE,QAAQ,EAAE,OAAO,EAAE,QAAQ,EAAE,KAAK,EAAE,QAAQ,EAAE,CAAC;gBACjG,CAAC,CAAC,CAAC;gBAEH,MAAM,SAAS,GAAG,OAAO,CAAC,KAAK,CAAC,qBAAqB,CAAC,CAAC;gBACvD,MAAM,MAAM,GAAG,MAAM,SAAS,CAAC,IAAI,EAAE,CAAC;gB
AEtC,2DAA2D;gBAC3D,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;gBAC/B,MAAM,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;YACvC,CAAC,CAAC,CAAC;QACL,CAAC,CAAC,CAAC;QAEH,QAAQ,CAAC,iBAAiB,EAAE,GAAG,EAAE;YAC/B,EAAE,CAAC,0BAA0B,EAAE,GAAG,EAAE;gBAClC,MAAM,YAAY,GAAG;oBACnB,OAAO,EAAE,CAAC,EAAE,IAAI,EAAE,SAAS,EAAE,KAAK,EAAE,QAAQ,EAAE,MAAM,EAAE,aAAa,EAAE,IAAI,EAAE,GAAG,EAAE,CAAC;iBAClF,CAAC;gBAEF,OAAO,CAAC,eAAe,CAAC,YAAY,CAAC,CAAC;gBAEtC,kCAAkC;gBAClC,MAAM,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YAC1B,CAAC,CAAC,CAAC;YAEH,EAAE,CAAC,6CAA6C,EAAE,KAAK,IAAI,EAAE;gBAC3D,MAAM,YAAY,GAAG;oBACnB,OAAO,EAAE,CAAC,EAAE,IAAI,EAAE,SAAS,EAAE,KAAK,EAAE,QAAQ,EAAE,MAAM,EAAE,aAAa,EAAE,IAAI,EAAE,GAAG,EAAE,CAAC;iBAClF,CAAC;gBAEF,OAAO,CAAC,eAAe,CAAC,YAAY,CAAC,CAAC;gBAEtC,gBAAgB,CAAC,kBAAkB,CAAC,KAAK,SAAS,CAAC;oBACjD,MAAM,EAAE,GAAG,EAAE,2BAA2B,EAAE,IAAI,EAAE,QAAQ,EAAE,OAAO,EAAE,QAAQ,EAAE,KAAK,EAAE,QAAQ,EAAE,CAAC;oBAC/F,MAAM,EAAE,GAAG,EAAE,2BAA2B,EAAE,IAAI,EAAE,QAAQ,EAAE,OAAO,EAAE,QAAQ,EAAE,KAAK,EAAE,QAAQ,EAAE,CAAC;gBACjG,CAAC,CAAC,CAAC;gBAEH,6DAA6D;gBAC7D,IAAI,KAAK,EAAE,MAAM,CAAC,IAAI,OAAO,CAAC,KAAK,CAAC,qBAAqB,CAAC,EAAE,CAAC;oBAC3D,uBAAuB;gBACzB,CAAC;gBAED,MAAM,CAAC,mBAAmB,CAAC,CAAC,oBAAoB,CAAC,YAAY,CAAC,CAAC;YACjE,CAAC,CAAC,CAAC;QACL,CAAC,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;AACL,CAAC,CAAC,CAAC"}
@@ -0,0 +1,6 @@
1
import { BaseCrawler } from './base.js';
/**
 * Selects a concrete {@link BaseCrawler} implementation for a site by
 * probing its HTML once: Chromium for JavaScript-heavy pages, otherwise
 * the default crawler, with Cheerio as the final fallback.
 */
export declare class CrawlerFactory {
    /** Framework-name substrings searched for in the fetched page HTML. */
    private static readonly JS_FRAMEWORK_INDICATORS;
    /** One-shot HTML probe used by createCrawler to classify the site. */
    private static detectSiteType;
    /**
     * Builds a crawler for `url`; rejects if no implementation can fetch
     * content from the site.
     */
    static createCrawler(url: string, maxRequestsPerCrawl?: number, maxDepth?: number, onProgress?: (progress: number, description: string) => void): Promise<BaseCrawler>;
}
@@ -0,0 +1,83 @@
1
+ import { URL } from 'url';
2
+ import { DefaultCrawler } from './default.js';
3
+ import { ChromiumCrawler } from './chromium.js';
4
+ import { CheerioCrawler } from './cheerio.js';
5
/**
 * Chooses a concrete crawler implementation for a documentation site:
 *   1. Chromium when a one-shot HTML probe suggests client-side rendering,
 *   2. otherwise the default crawler if it can fetch a first page,
 *   3. otherwise Cheerio as a last resort.
 */
export class CrawlerFactory {
    // Common JavaScript framework identifiers
    // NOTE(review): these are plain substring probes against the whole
    // lowercased page HTML; generic words such as 'next' or 'react' can
    // appear in ordinary prose and over-trigger the Chromium path —
    // confirm against real target sites before tightening.
    static JS_FRAMEWORK_INDICATORS = [
        'react',
        'vue',
        'angular',
        'next',
        'nuxt',
        'gatsby',
        'docusaurus',
        'vuepress',
        'gridsome',
        'svelte'
    ];
    // Upper bound (ms) on the probe request in detectSiteType(); a timeout
    // is caught there and treated like any other fetch failure.
    static DETECT_TIMEOUT_MS = 10000;
    /**
     * Probes `url` once and inspects the raw HTML for signs that the site
     * is rendered client-side (framework names, hydration markers).
     *
     * Never rejects: any network/HTTP failure is logged and reported as a
     * plain static site.
     *
     * @param {string} url - Page to probe.
     * @returns {Promise<{isJsHeavy: boolean, hasFramework: boolean}>}
     */
    static async detectSiteType(url) {
        try {
            const response = await fetch(url, {
                headers: {
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
                },
                // Fix: the original request had no deadline, so a single
                // unresponsive host stalled createCrawler() indefinitely.
                // Abort/timeout errors fall into the catch below.
                signal: AbortSignal.timeout(CrawlerFactory.DETECT_TIMEOUT_MS)
            });
            if (!response.ok) {
                throw new Error(`HTTP error! status: ${response.status}`);
            }
            const html = await response.text();
            // Check for JavaScript frameworks
            const hasFramework = CrawlerFactory.JS_FRAMEWORK_INDICATORS.some(framework => html.toLowerCase().includes(framework));
            // Check for JavaScript-heavy indicators (framework attributes and
            // SSR hydration payloads such as __NEXT_DATA__ / ___gatsby).
            const isJsHeavy = (html.includes('data-react') ||
                html.includes('ng-') ||
                html.includes('v-') ||
                html.includes('__NEXT_DATA__') ||
                html.includes('nuxt') ||
                html.includes('id="___gatsby"'));
            return { isJsHeavy, hasFramework };
        }
        catch (e) {
            // Best-effort probe: on any failure assume a static site so the
            // cheaper crawlers are still attempted.
            console.error('[CrawlerFactory] Error detecting site type:', e);
            return { isJsHeavy: false, hasFramework: false };
        }
    }
    /**
     * Builds and returns a crawler instance for `url`.
     *
     * @param {string} url - Start URL of the site to crawl.
     * @param {number} [maxRequestsPerCrawl=1000] - Page budget per crawl.
     * @param {number} [maxDepth=4] - Maximum link depth to follow.
     * @param {(progress: number, description: string) => void} [onProgress]
     * @returns {Promise<import('./base.js').BaseCrawler>}
     * @throws {Error} when every candidate crawler fails to produce content.
     */
    static async createCrawler(url, maxRequestsPerCrawl = 1000, maxDepth = 4, onProgress) {
        const startUrl = new URL(url);
        console.debug(`[CrawlerFactory] Creating crawler for ${startUrl}`);
        // Check if site is JavaScript-heavy first
        const { isJsHeavy, hasFramework } = await CrawlerFactory.detectSiteType(url);
        // Try Chromium for JavaScript-heavy sites
        if (isJsHeavy || hasFramework) {
            console.debug(`[CrawlerFactory] Site appears to be JavaScript-heavy, using Chromium crawler`);
            return new ChromiumCrawler(maxDepth, maxRequestsPerCrawl, onProgress);
        }
        // Try default crawler
        try {
            console.debug(`[CrawlerFactory] Attempting default crawler for ${url}`);
            const defaultCrawler = new DefaultCrawler(maxDepth, maxRequestsPerCrawl, onProgress);
            // NOTE(review): this probe consumes (and discards) the first page
            // yielded by the generator; the caller's subsequent crawl() will
            // start over and re-fetch it.
            const generator = defaultCrawler.crawl(url);
            const { value: firstPage, done } = await generator.next();
            if (!done && firstPage?.content) {
                console.debug('[CrawlerFactory] Successfully created default crawler');
                return defaultCrawler;
            }
        }
        catch (e) {
            // Deliberate best-effort fallthrough to the Cheerio fallback.
            console.debug('[CrawlerFactory] Default crawler failed:', e);
        }
        // Fall back to Cheerio crawler
        console.debug(`[CrawlerFactory] Attempting Cheerio crawler for ${url}`);
        const cheerioCrawler = new CheerioCrawler(maxDepth, maxRequestsPerCrawl, onProgress);
        const generator = cheerioCrawler.crawl(url);
        const { value: firstPage, done } = await generator.next();
        if (!done && firstPage?.content) {
            console.debug('[CrawlerFactory] Successfully created Cheerio crawler');
            return cheerioCrawler;
        }
        console.error(`[CrawlerFactory] All crawlers failed for ${url}`);
        throw new Error(`Failed to create crawler for ${url}`);
    }
}
83
+ //# sourceMappingURL=factory.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"factory.js","sourceRoot":"","sources":["../../src/crawler/factory.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,GAAG,EAAE,MAAM,KAAK,CAAC;AAE1B,OAAO,EAAE,cAAc,EAAE,MAAM,cAAc,CAAC;AAC9C,OAAO,EAAE,eAAe,EAAE,MAAM,eAAe,CAAC;AAChD,OAAO,EAAE,cAAc,EAAE,MAAM,cAAc,CAAC;AAE9C,MAAM,OAAO,cAAc;IACzB,0CAA0C;IAClC,MAAM,CAAU,uBAAuB,GAAG;QAChD,OAAO;QACP,KAAK;QACL,SAAS;QACT,MAAM;QACN,MAAM;QACN,QAAQ;QACR,YAAY;QACZ,UAAU;QACV,UAAU;QACV,QAAQ;KACT,CAAC;IAEM,MAAM,CAAC,KAAK,CAAC,cAAc,CAAC,GAAW;QAI7C,IAAI,CAAC;YACH,MAAM,QAAQ,GAAG,MAAM,KAAK,CAAC,GAAG,EAAE;gBAChC,OAAO,EAAE;oBACP,YAAY,EAAE,qHAAqH;iBACpI;aACF,CAAC,CAAC;YAEH,IAAI,CAAC,QAAQ,CAAC,EAAE,EAAE,CAAC;gBACjB,MAAM,IAAI,KAAK,CAAC,uBAAuB,QAAQ,CAAC,MAAM,EAAE,CAAC,CAAC;YAC5D,CAAC;YAED,MAAM,IAAI,GAAG,MAAM,QAAQ,CAAC,IAAI,EAAE,CAAC;YAEnC,kCAAkC;YAClC,MAAM,YAAY,GAAG,cAAc,CAAC,uBAAuB,CAAC,IAAI,CAAC,SAAS,CAAC,EAAE,CAC3E,IAAI,CAAC,WAAW,EAAE,CAAC,QAAQ,CAAC,SAAS,CAAC,CACvC,CAAC;YAEF,wCAAwC;YACxC,MAAM,SAAS,GAAG,CAChB,IAAI,CAAC,QAAQ,CAAC,YAAY,CAAC;gBAC3B,IAAI,CAAC,QAAQ,CAAC,KAAK,CAAC;gBACpB,IAAI,CAAC,QAAQ,CAAC,IAAI,CAAC;gBACnB,IAAI,CAAC,QAAQ,CAAC,eAAe,CAAC;gBAC9B,IAAI,CAAC,QAAQ,CAAC,MAAM,CAAC;gBACrB,IAAI,CAAC,QAAQ,CAAC,gBAAgB,CAAC,CAChC,CAAC;YAEF,OAAO,EAAE,SAAS,EAAE,YAAY,EAAE,CAAC;QACrC,CAAC;QAAC,OAAO,CAAC,EAAE,CAAC;YACX,OAAO,CAAC,KAAK,CAAC,6CAA6C,EAAE,CAAC,CAAC,CAAC;YAChE,OAAO,EAAE,SAAS,EAAE,KAAK,EAAE,YAAY,EAAE,KAAK,EAAE,CAAC;QACnD,CAAC;IACH,CAAC;IAED,MAAM,CAAC,KAAK,CAAC,aAAa,CACxB,GAAW,EACX,sBAA8B,IAAI,EAClC,WAAmB,CAAC,EACpB,UAA4D;QAE5D,MAAM,QAAQ,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC;QAC9B,OAAO,CAAC,KAAK,CAAC,yCAAyC,QAAQ,EAAE,CAAC,CAAC;QAEnE,0CAA0C;QAC1C,MAAM,EAAE,SAAS,EAAE,YAAY,EAAE,GAAG,MAAM,cAAc,CAAC,cAAc,CAAC,GAAG,CAAC,CAAC;QAE7E,0CAA0C;QAC1C,IAAI,SAAS,IAAI,YAAY,EAAE,CAAC;YAC9B,OAAO,CAAC,KAAK,CAAC,8EAA8E,CAAC,CAAC;YAC9F,OAAO,IAAI,eAAe,CAAC,QAAQ,EAAE,mBAAmB,EAAE,UAAU,CAAC,CAAC;QACxE,CAAC;QAED,sBAAsB;QACtB,IAAI,CAAC;YACH,OAAO,CAAC,KAAK,CAAC,mDAAmD,GAAG,EAAE,CAAC,CAAC;YACxE,MAAM,cAAc,GAAG,IAAI,cAAc,CAAC,QAAQ,EAAE,mBAAmB,EAAE,UAAU,CAAC,
CAAC;YACrF,MAAM,SAAS,GAAG,cAAc,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;YAC5C,MAAM,EAAE,KAAK,EAAE,SAAS,EAAE,IAAI,EAAE,GAAG,MAAM,SAAS,CAAC,IAAI,EAAE,CAAC;YAE1D,IAAI,CAAC,IAAI,IAAI,SAAS,EAAE,OAAO,EAAE,CAAC;gBAChC,OAAO,CAAC,KAAK,CAAC,uDAAuD,CAAC,CAAC;gBACvE,OAAO,cAAc,CAAC;YACxB,CAAC;QACH,CAAC;QAAC,OAAO,CAAC,EAAE,CAAC;YACX,OAAO,CAAC,KAAK,CAAC,0CAA0C,EAAE,CAAC,CAAC,CAAC;QAC/D,CAAC;QAED,+BAA+B;QAC/B,OAAO,CAAC,KAAK,CAAC,mDAAmD,GAAG,EAAE,CAAC,CAAC;QACxE,MAAM,cAAc,GAAG,IAAI,cAAc,CAAC,QAAQ,EAAE,mBAAmB,EAAE,UAAU,CAAC,CAAC;QACrF,MAAM,SAAS,GAAG,cAAc,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;QAC5C,MAAM,EAAE,KAAK,EAAE,SAAS,EAAE,IAAI,EAAE,GAAG,MAAM,SAAS,CAAC,IAAI,EAAE,CAAC;QAE1D,IAAI,CAAC,IAAI,IAAI,SAAS,EAAE,OAAO,EAAE,CAAC;YAChC,OAAO,CAAC,KAAK,CAAC,uDAAuD,CAAC,CAAC;YACvE,OAAO,cAAc,CAAC;QACxB,CAAC;QAED,OAAO,CAAC,KAAK,CAAC,4CAA4C,GAAG,EAAE,CAAC,CAAC;QACjE,MAAM,IAAI,KAAK,CAAC,gCAAgC,GAAG,EAAE,CAAC,CAAC;IACzD,CAAC"}
@@ -0,0 +1,4 @@
1
import { ContentExtractor, ExtractedContent } from './content-extractor-types.js';
/**
 * ContentExtractor for GitHub Pages documentation sites.
 */
export declare class GitHubPagesExtractor implements ContentExtractor {
    /**
     * Removes nav/header/footer chrome and returns the text of the main
     * content area (`main`, `article`, or `.markdown-body`), using the
     * first `<h1>` as the pattern name and the paragraph right after it
     * as the description. Note: strips chrome from the passed document
     * in place.
     */
    extractContent(document: Document): Promise<ExtractedContent>;
}
@@ -0,0 +1,33 @@
1
/**
 * Extracts readable documentation content from a GitHub Pages site.
 *
 * Site chrome (nav/header/footer) is removed from the given document in
 * place; scripts and styles are stripped from a clone of the content root
 * so only its visible text is returned. The first <h1> supplies the
 * pattern name and the paragraph immediately following it the description.
 */
export class GitHubPagesExtractor {
    async extractContent(document) {
        // Drop site chrome before locating the content root.
        for (const chrome of document.querySelectorAll('nav, header, footer')) {
            chrome.remove();
        }
        const contentRoot = document.querySelector('main, article, .markdown-body');
        if (!contentRoot) {
            // No recognizable content container on this page.
            return {
                content: '',
                metadata: { type: 'overview' },
            };
        }
        // Work on a deep copy so script/style removal stays local.
        const clone = contentRoot.cloneNode(true);
        for (const node of clone.querySelectorAll('script, style')) {
            node.remove();
        }
        // Title comes from the first heading, the description from the
        // paragraph that directly follows it.
        const heading = clone.querySelector('h1');
        const intro = clone.querySelector('h1 + p');
        const title = heading?.textContent?.trim();
        const firstParagraph = intro?.textContent?.trim();
        return {
            content: clone.textContent?.trim() ?? '',
            metadata: {
                type: 'overview',
                pattern: {
                    name: title ?? '',
                    type: 'component',
                    description: firstParagraph ?? '',
                    usageContexts: [],
                    relatedPatterns: [],
                },
            },
        };
    }
}
33
+ //# sourceMappingURL=github-pages-extractor.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"github-pages-extractor.js","sourceRoot":"","sources":["../../src/crawler/github-pages-extractor.ts"],"names":[],"mappings":"AAEA,MAAM,OAAO,oBAAoB;IAC/B,KAAK,CAAC,cAAc,CAAC,QAAkB;QACrC,+BAA+B;QAC/B,QAAQ,CAAC,gBAAgB,CAAC,qBAAqB,CAAC,CAAC,OAAO,CAAC,CAAC,EAAE,EAAE,EAAE,CAAC,EAAE,CAAC,MAAM,EAAE,CAAC,CAAC;QAE9E,mBAAmB;QACnB,MAAM,IAAI,GAAG,QAAQ,CAAC,aAAa,CAAC,+BAA+B,CAAC,CAAC;QACrE,IAAI,CAAC,IAAI,EAAE,CAAC;YACV,OAAO;gBACL,OAAO,EAAE,EAAE;gBACX,QAAQ,EAAE,EAAE,IAAI,EAAE,UAAU,EAAE;aAC/B,CAAC;QACJ,CAAC;QAED,MAAM,KAAK,GAAG,IAAI,CAAC,SAAS,CAAC,IAAI,CAAY,CAAC;QAC9C,KAAK,CAAC,gBAAgB,CAAC,eAAe,CAAC,CAAC,OAAO,CAAC,CAAC,EAAE,EAAE,EAAE,CAAC,EAAE,CAAC,MAAM,EAAE,CAAC,CAAC;QAErE,gCAAgC;QAChC,MAAM,KAAK,GAAG,KAAK,CAAC,aAAa,CAAC,IAAI,CAAC,EAAE,WAAW,EAAE,IAAI,EAAE,CAAC;QAC7D,MAAM,cAAc,GAAG,KAAK,CAAC,aAAa,CAAC,QAAQ,CAAC,EAAE,WAAW,EAAE,IAAI,EAAE,CAAC;QAE1E,OAAO;YACL,OAAO,EAAE,KAAK,CAAC,WAAW,EAAE,IAAI,EAAE,IAAI,EAAE;YACxC,QAAQ,EAAE;gBACR,IAAI,EAAE,UAAU;gBAChB,OAAO,EAAE;oBACP,IAAI,EAAE,KAAK,IAAI,EAAE;oBACjB,IAAI,EAAE,WAAW;oBACjB,WAAW,EAAE,cAAc,IAAI,EAAE;oBACjC,aAAa,EAAE,EAAE;oBACjB,eAAe,EAAE,EAAE;iBACpB;aACF;SACF,CAAC;IACJ,CAAC;CACF"}
@@ -0,0 +1 @@
1
+ export {};