@cosmocoder/mcp-web-docs 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +22 -0
- package/README.md +368 -0
- package/build/__mocks__/embeddings.d.ts +17 -0
- package/build/__mocks__/embeddings.js +66 -0
- package/build/__mocks__/embeddings.js.map +1 -0
- package/build/config.d.ts +44 -0
- package/build/config.js +158 -0
- package/build/config.js.map +1 -0
- package/build/config.test.d.ts +1 -0
- package/build/config.test.js +165 -0
- package/build/config.test.js.map +1 -0
- package/build/crawler/auth.d.ts +128 -0
- package/build/crawler/auth.js +546 -0
- package/build/crawler/auth.js.map +1 -0
- package/build/crawler/auth.test.d.ts +1 -0
- package/build/crawler/auth.test.js +174 -0
- package/build/crawler/auth.test.js.map +1 -0
- package/build/crawler/base.d.ts +24 -0
- package/build/crawler/base.js +149 -0
- package/build/crawler/base.js.map +1 -0
- package/build/crawler/base.test.d.ts +1 -0
- package/build/crawler/base.test.js +234 -0
- package/build/crawler/base.test.js.map +1 -0
- package/build/crawler/browser-config.d.ts +2 -0
- package/build/crawler/browser-config.js +29 -0
- package/build/crawler/browser-config.js.map +1 -0
- package/build/crawler/browser-config.test.d.ts +1 -0
- package/build/crawler/browser-config.test.js +56 -0
- package/build/crawler/browser-config.test.js.map +1 -0
- package/build/crawler/cheerio.d.ts +11 -0
- package/build/crawler/cheerio.js +134 -0
- package/build/crawler/cheerio.js.map +1 -0
- package/build/crawler/chromium.d.ts +21 -0
- package/build/crawler/chromium.js +596 -0
- package/build/crawler/chromium.js.map +1 -0
- package/build/crawler/content-extractor-types.d.ts +25 -0
- package/build/crawler/content-extractor-types.js +2 -0
- package/build/crawler/content-extractor-types.js.map +1 -0
- package/build/crawler/content-extractors.d.ts +9 -0
- package/build/crawler/content-extractors.js +9 -0
- package/build/crawler/content-extractors.js.map +1 -0
- package/build/crawler/content-utils.d.ts +2 -0
- package/build/crawler/content-utils.js +22 -0
- package/build/crawler/content-utils.js.map +1 -0
- package/build/crawler/content-utils.test.d.ts +1 -0
- package/build/crawler/content-utils.test.js +99 -0
- package/build/crawler/content-utils.test.js.map +1 -0
- package/build/crawler/crawlee-crawler.d.ts +63 -0
- package/build/crawler/crawlee-crawler.js +342 -0
- package/build/crawler/crawlee-crawler.js.map +1 -0
- package/build/crawler/crawlee-crawler.test.d.ts +1 -0
- package/build/crawler/crawlee-crawler.test.js +280 -0
- package/build/crawler/crawlee-crawler.test.js.map +1 -0
- package/build/crawler/default-extractor.d.ts +4 -0
- package/build/crawler/default-extractor.js +26 -0
- package/build/crawler/default-extractor.js.map +1 -0
- package/build/crawler/default-extractor.test.d.ts +1 -0
- package/build/crawler/default-extractor.test.js +200 -0
- package/build/crawler/default-extractor.test.js.map +1 -0
- package/build/crawler/default.d.ts +11 -0
- package/build/crawler/default.js +138 -0
- package/build/crawler/default.js.map +1 -0
- package/build/crawler/docs-crawler.d.ts +26 -0
- package/build/crawler/docs-crawler.js +97 -0
- package/build/crawler/docs-crawler.js.map +1 -0
- package/build/crawler/docs-crawler.test.d.ts +1 -0
- package/build/crawler/docs-crawler.test.js +185 -0
- package/build/crawler/docs-crawler.test.js.map +1 -0
- package/build/crawler/factory.d.ts +6 -0
- package/build/crawler/factory.js +83 -0
- package/build/crawler/factory.js.map +1 -0
- package/build/crawler/github-pages-extractor.d.ts +4 -0
- package/build/crawler/github-pages-extractor.js +33 -0
- package/build/crawler/github-pages-extractor.js.map +1 -0
- package/build/crawler/github-pages-extractor.test.d.ts +1 -0
- package/build/crawler/github-pages-extractor.test.js +184 -0
- package/build/crawler/github-pages-extractor.test.js.map +1 -0
- package/build/crawler/github.d.ts +20 -0
- package/build/crawler/github.js +181 -0
- package/build/crawler/github.js.map +1 -0
- package/build/crawler/github.test.d.ts +1 -0
- package/build/crawler/github.test.js +326 -0
- package/build/crawler/github.test.js.map +1 -0
- package/build/crawler/puppeteer.d.ts +16 -0
- package/build/crawler/puppeteer.js +191 -0
- package/build/crawler/puppeteer.js.map +1 -0
- package/build/crawler/queue-manager.d.ts +43 -0
- package/build/crawler/queue-manager.js +169 -0
- package/build/crawler/queue-manager.js.map +1 -0
- package/build/crawler/queue-manager.test.d.ts +1 -0
- package/build/crawler/queue-manager.test.js +509 -0
- package/build/crawler/queue-manager.test.js.map +1 -0
- package/build/crawler/site-rules.d.ts +11 -0
- package/build/crawler/site-rules.js +104 -0
- package/build/crawler/site-rules.js.map +1 -0
- package/build/crawler/site-rules.test.d.ts +1 -0
- package/build/crawler/site-rules.test.js +139 -0
- package/build/crawler/site-rules.test.js.map +1 -0
- package/build/crawler/storybook-extractor.d.ts +34 -0
- package/build/crawler/storybook-extractor.js +767 -0
- package/build/crawler/storybook-extractor.js.map +1 -0
- package/build/crawler/storybook-extractor.test.d.ts +1 -0
- package/build/crawler/storybook-extractor.test.js +491 -0
- package/build/crawler/storybook-extractor.test.js.map +1 -0
- package/build/embeddings/fastembed.d.ts +25 -0
- package/build/embeddings/fastembed.js +188 -0
- package/build/embeddings/fastembed.js.map +1 -0
- package/build/embeddings/fastembed.test.d.ts +1 -0
- package/build/embeddings/fastembed.test.js +307 -0
- package/build/embeddings/fastembed.test.js.map +1 -0
- package/build/embeddings/openai.d.ts +8 -0
- package/build/embeddings/openai.js +56 -0
- package/build/embeddings/openai.js.map +1 -0
- package/build/embeddings/types.d.ts +4 -0
- package/build/embeddings/types.js +2 -0
- package/build/embeddings/types.js.map +1 -0
- package/build/index.d.ts +2 -0
- package/build/index.js +1007 -0
- package/build/index.js.map +1 -0
- package/build/index.test.d.ts +1 -0
- package/build/index.test.js +364 -0
- package/build/index.test.js.map +1 -0
- package/build/indexing/queue-manager.d.ts +36 -0
- package/build/indexing/queue-manager.js +86 -0
- package/build/indexing/queue-manager.js.map +1 -0
- package/build/indexing/queue-manager.test.d.ts +1 -0
- package/build/indexing/queue-manager.test.js +257 -0
- package/build/indexing/queue-manager.test.js.map +1 -0
- package/build/indexing/status.d.ts +39 -0
- package/build/indexing/status.js +207 -0
- package/build/indexing/status.js.map +1 -0
- package/build/indexing/status.test.d.ts +1 -0
- package/build/indexing/status.test.js +246 -0
- package/build/indexing/status.test.js.map +1 -0
- package/build/processor/content.d.ts +16 -0
- package/build/processor/content.js +286 -0
- package/build/processor/content.js.map +1 -0
- package/build/processor/content.test.d.ts +1 -0
- package/build/processor/content.test.js +369 -0
- package/build/processor/content.test.js.map +1 -0
- package/build/processor/markdown.d.ts +11 -0
- package/build/processor/markdown.js +256 -0
- package/build/processor/markdown.js.map +1 -0
- package/build/processor/markdown.test.d.ts +1 -0
- package/build/processor/markdown.test.js +312 -0
- package/build/processor/markdown.test.js.map +1 -0
- package/build/processor/metadata-parser.d.ts +37 -0
- package/build/processor/metadata-parser.js +245 -0
- package/build/processor/metadata-parser.js.map +1 -0
- package/build/processor/metadata-parser.test.d.ts +1 -0
- package/build/processor/metadata-parser.test.js +357 -0
- package/build/processor/metadata-parser.test.js.map +1 -0
- package/build/processor/processor.d.ts +8 -0
- package/build/processor/processor.js +190 -0
- package/build/processor/processor.js.map +1 -0
- package/build/processor/processor.test.d.ts +1 -0
- package/build/processor/processor.test.js +357 -0
- package/build/processor/processor.test.js.map +1 -0
- package/build/rag/cache.d.ts +10 -0
- package/build/rag/cache.js +10 -0
- package/build/rag/cache.js.map +1 -0
- package/build/rag/code-generator.d.ts +11 -0
- package/build/rag/code-generator.js +30 -0
- package/build/rag/code-generator.js.map +1 -0
- package/build/rag/context-assembler.d.ts +23 -0
- package/build/rag/context-assembler.js +113 -0
- package/build/rag/context-assembler.js.map +1 -0
- package/build/rag/docs-search.d.ts +55 -0
- package/build/rag/docs-search.js +380 -0
- package/build/rag/docs-search.js.map +1 -0
- package/build/rag/pipeline.d.ts +26 -0
- package/build/rag/pipeline.js +91 -0
- package/build/rag/pipeline.js.map +1 -0
- package/build/rag/query-processor.d.ts +14 -0
- package/build/rag/query-processor.js +57 -0
- package/build/rag/query-processor.js.map +1 -0
- package/build/rag/reranker.d.ts +55 -0
- package/build/rag/reranker.js +210 -0
- package/build/rag/reranker.js.map +1 -0
- package/build/rag/response-generator.d.ts +20 -0
- package/build/rag/response-generator.js +101 -0
- package/build/rag/response-generator.js.map +1 -0
- package/build/rag/retriever.d.ts +19 -0
- package/build/rag/retriever.js +111 -0
- package/build/rag/retriever.js.map +1 -0
- package/build/rag/validator.d.ts +22 -0
- package/build/rag/validator.js +128 -0
- package/build/rag/validator.js.map +1 -0
- package/build/rag/version-manager.d.ts +23 -0
- package/build/rag/version-manager.js +98 -0
- package/build/rag/version-manager.js.map +1 -0
- package/build/setupTests.d.ts +4 -0
- package/build/setupTests.js +50 -0
- package/build/setupTests.js.map +1 -0
- package/build/storage/storage.d.ts +38 -0
- package/build/storage/storage.js +700 -0
- package/build/storage/storage.js.map +1 -0
- package/build/storage/storage.test.d.ts +1 -0
- package/build/storage/storage.test.js +338 -0
- package/build/storage/storage.test.js.map +1 -0
- package/build/types/rag.d.ts +27 -0
- package/build/types/rag.js +2 -0
- package/build/types/rag.js.map +1 -0
- package/build/types.d.ts +120 -0
- package/build/types.js +2 -0
- package/build/types.js.map +1 -0
- package/build/util/content-utils.d.ts +31 -0
- package/build/util/content-utils.js +120 -0
- package/build/util/content-utils.js.map +1 -0
- package/build/util/content.d.ts +1 -0
- package/build/util/content.js +16 -0
- package/build/util/content.js.map +1 -0
- package/build/util/docs.d.ts +1 -0
- package/build/util/docs.js +26 -0
- package/build/util/docs.js.map +1 -0
- package/build/util/docs.test.d.ts +1 -0
- package/build/util/docs.test.js +49 -0
- package/build/util/docs.test.js.map +1 -0
- package/build/util/favicon.d.ts +6 -0
- package/build/util/favicon.js +88 -0
- package/build/util/favicon.js.map +1 -0
- package/build/util/favicon.test.d.ts +1 -0
- package/build/util/favicon.test.js +140 -0
- package/build/util/favicon.test.js.map +1 -0
- package/build/util/logger.d.ts +17 -0
- package/build/util/logger.js +72 -0
- package/build/util/logger.js.map +1 -0
- package/build/util/logger.test.d.ts +1 -0
- package/build/util/logger.test.js +46 -0
- package/build/util/logger.test.js.map +1 -0
- package/build/util/security.d.ts +312 -0
- package/build/util/security.js +719 -0
- package/build/util/security.js.map +1 -0
- package/build/util/security.test.d.ts +1 -0
- package/build/util/security.test.js +524 -0
- package/build/util/security.test.js.map +1 -0
- package/build/util/site-detector.d.ts +22 -0
- package/build/util/site-detector.js +42 -0
- package/build/util/site-detector.js.map +1 -0
- package/package.json +112 -0
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
import { URL } from 'url';
|
|
2
|
+
import * as cheerio from 'cheerio';
|
|
3
|
+
import { BaseCrawler } from './base.js';
|
|
4
|
+
export class DefaultCrawler extends BaseCrawler {
  // Number of pages fetched in parallel per batch.
  BATCH_SIZE = 50;
  // Per-request fetch timeout in milliseconds.
  FETCH_TIMEOUT = 30000; // 30 seconds

  /**
   * @param {number} [maxDepth=4] - Maximum link depth to follow from the start URL.
   * @param {number} [maxRequestsPerCrawl=1000] - Hard cap on total pages crawled.
   * @param {(progress: number, description: string) => void} [onProgress] - Optional
   *   progress callback, forwarded to BaseCrawler.
   */
  constructor(maxDepth = 4, maxRequestsPerCrawl = 1000, onProgress) {
    super(maxDepth, maxRequestsPerCrawl, onProgress);
  }

  /**
   * Breadth-first crawl starting from `url`.
   * Yields `{ url, path, content, title }` for each successfully fetched page.
   * Stops when the queue is exhausted, `abort()` was requested, or
   * `maxRequestsPerCrawl` pages have been seen.
   */
  async *crawl(url) {
    console.debug(`[${this.constructor.name}] Starting crawl from: ${url}`);
    if (this.isAborting) {
      console.debug('[DefaultCrawler] Crawl aborted');
      return;
    }
    const startUrl = new URL(url);
    const baseUrl = this.normalizeUrl(startUrl.toString());
    // Track pages to process: URL -> depth.
    const pagesToCrawl = new Map();
    pagesToCrawl.set(baseUrl, 0);
    while (pagesToCrawl.size > 0 && !this.isAborting) {
      // Take the next batch of URLs off the queue.
      const batchEntries = Array.from(pagesToCrawl.entries()).slice(0, this.BATCH_SIZE);
      const batch = new Map(batchEntries);
      // Remove batch from queue.
      batchEntries.forEach(([url]) => pagesToCrawl.delete(url));
      try {
        // Process batch in parallel with timeout and rate limiting.
        const results = await Promise.all(Array.from(batch.entries()).map(async ([pageUrl]) => {
          // Apply rate limiting before each request.
          await this.rateLimit();
          const result = await this.processPageWithRetry(pageUrl);
          return { pageUrl, ...result };
        }));
        for (const { pageUrl, content, links, error } of results) {
          // Failed/empty pages are skipped; the error was already captured
          // inside processPageWithRetry.
          if (error || !content || this.isAborting)
            continue;
          this.markUrlAsSeen(pageUrl);
          yield {
            url: pageUrl,
            path: this.getPathFromUrl(pageUrl),
            content,
            title: this.extractTitle(content)
          };
          // Queue newly discovered links while within the depth limit.
          const currentDepth = batch.get(pageUrl) || 0;
          if (currentDepth < this.maxDepth) {
            for (const link of links) {
              const normalizedLink = this.normalizeUrl(link);
              if (this.shouldCrawl(normalizedLink) && !pagesToCrawl.has(normalizedLink)) {
                pagesToCrawl.set(normalizedLink, currentDepth + 1);
              }
            }
          }
          // Stop once the global request budget is exhausted.
          if (this.seenUrls.size >= this.maxRequestsPerCrawl) {
            console.debug('[DefaultCrawler] Max requests reached');
            return;
          }
        }
        // Politeness delay between batches.
        await new Promise(resolve => setTimeout(resolve, 1000));
      }
      catch (e) {
        console.error('[DefaultCrawler] Error processing batch:', e);
      }
    }
    console.debug('[DefaultCrawler] Crawl completed');
  }

  /**
   * Fetch one page with a timeout, inside BaseCrawler's retry wrapper.
   * Resolves to `{ content, links }` on success, or
   * `{ content: null, links: [], error }` on failure.
   *
   * NOTE(review): failures are caught and RETURNED rather than thrown, so
   * retryWithBackoff never observes an error and cannot actually retry —
   * confirm whether failures should be rethrown to enable retries.
   */
  async processPageWithRetry(url) {
    return this.retryWithBackoff(async () => {
      try {
        // Create fetch request with timeout via AbortController.
        const controller = new AbortController();
        const timeoutId = setTimeout(() => controller.abort(), this.FETCH_TIMEOUT);
        let response;
        try {
          response = await fetch(url, {
            signal: controller.signal,
            headers: {
              'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
            }
          });
        } finally {
          // Always clear the abort timer, even when fetch rejects or times
          // out; the original only cleared it on success, leaking a live timer.
          clearTimeout(timeoutId);
        }
        if (!response.ok) {
          throw new Error(`HTTP error! status: ${response.status}`);
        }
        const content = await response.text();
        const links = this.extractLinks(content, new URL(url));
        return { content, links };
      }
      catch (e) {
        if (e instanceof Error) {
          return { content: null, links: [], error: e };
        }
        return { content: null, links: [], error: new Error('Unknown error occurred') };
      }
    });
  }

  /**
   * Extract absolute, normalized, crawlable links from an HTML document.
   * Invalid hrefs are logged and skipped; a parser failure yields [].
   */
  extractLinks(html, baseUrl) {
    try {
      const $ = cheerio.load(html);
      const links = new Set();
      // Find all links, including those in navigation elements.
      $('a').each((_, element) => {
        const href = $(element).attr('href');
        if (!href)
          return;
        try {
          const url = new URL(href, baseUrl);
          const normalizedUrl = this.normalizeUrl(url.toString());
          // Use BaseCrawler's URL validation.
          if (this.shouldCrawl(normalizedUrl)) {
            links.add(normalizedUrl);
          }
        }
        catch (e) {
          console.debug(`[DefaultCrawler] Invalid URL ${href}:`, e);
        }
      });
      return Array.from(links);
    }
    catch (e) {
      console.error('[DefaultCrawler] Error extracting links:', e);
      return [];
    }
  }

  /**
   * Extract the document <title> text; falls back to 'Untitled' when the
   * title is missing/empty or the HTML cannot be parsed.
   */
  extractTitle(html) {
    try {
      const $ = cheerio.load(html);
      return $('title').text().trim() || 'Untitled';
    }
    catch (e) {
      console.error('[DefaultCrawler] Error extracting title:', e);
      return 'Untitled';
    }
  }
}
|
|
138
|
+
//# sourceMappingURL=default.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"default.js","sourceRoot":"","sources":["../../src/crawler/default.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,GAAG,EAAE,MAAM,KAAK,CAAC;AAC1B,OAAO,KAAK,OAAO,MAAM,SAAS,CAAC;AAEnC,OAAO,EAAE,WAAW,EAAE,MAAM,WAAW,CAAC;AAExC,MAAM,OAAO,cAAe,SAAQ,WAAW;IAC5B,UAAU,GAAG,EAAE,CAAC;IAChB,aAAa,GAAG,KAAK,CAAC,CAAC,aAAa;IAErD,YACE,WAAmB,CAAC,EACpB,sBAA8B,IAAI,EAClC,UAA4D;QAE5D,KAAK,CAAC,QAAQ,EAAE,mBAAmB,EAAE,UAAU,CAAC,CAAC;IACnD,CAAC;IAED,KAAK,CAAC,CAAC,KAAK,CAAC,GAAW;QACtB,OAAO,CAAC,KAAK,CAAC,IAAI,IAAI,CAAC,WAAW,CAAC,IAAI,0BAA0B,GAAG,EAAE,CAAC,CAAC;QAExE,IAAI,IAAI,CAAC,UAAU,EAAE,CAAC;YACpB,OAAO,CAAC,KAAK,CAAC,gCAAgC,CAAC,CAAC;YAChD,OAAO;QACT,CAAC;QAED,MAAM,QAAQ,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC;QAC9B,MAAM,OAAO,GAAG,IAAI,CAAC,YAAY,CAAC,QAAQ,CAAC,QAAQ,EAAE,CAAC,CAAC;QAEvD,yBAAyB;QACzB,MAAM,YAAY,GAAG,IAAI,GAAG,EAAkB,CAAC,CAAC,eAAe;QAC/D,YAAY,CAAC,GAAG,CAAC,OAAO,EAAE,CAAC,CAAC,CAAC;QAE7B,OAAO,YAAY,CAAC,IAAI,GAAG,CAAC,IAAI,CAAC,IAAI,CAAC,UAAU,EAAE,CAAC;YACjD,+BAA+B;YAC/B,MAAM,YAAY,GAAG,KAAK,CAAC,IAAI,CAAC,YAAY,CAAC,OAAO,EAAE,CAAC,CAAC,KAAK,CAAC,CAAC,EAAE,IAAI,CAAC,UAAU,CAAC,CAAC;YAClF,MAAM,KAAK,GAAG,IAAI,GAAG,CAAC,YAAY,CAAC,CAAC;YAEpC,0BAA0B;YAC1B,YAAY,CAAC,OAAO,CAAC,CAAC,CAAC,GAAG,CAAC,EAAE,EAAE,CAAC,YAAY,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC;YAE1D,IAAI,CAAC;gBACH,2DAA2D;gBAC3D,MAAM,OAAO,GAAG,MAAM,OAAO,CAAC,GAAG,CAC/B,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,OAAO,EAAE,CAAC,CAAC,GAAG,CAAC,KAAK,EAAE,CAAC,OAAO,CAAC,EAAE,EAAE;oBAClD,sBAAsB;oBACtB,MAAM,IAAI,CAAC,SAAS,EAAE,CAAC;oBACvB,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,oBAAoB,CAAC,OAAO,CAAC,CAAC;oBACxD,OAAO,EAAE,OAAO,EAAE,GAAG,MAAM,EAAE,CAAC;gBAChC,CAAC,CAAC,CACH,CAAC;gBAEF,iBAAiB;gBACjB,KAAK,MAAM,EAAE,OAAO,EAAE,OAAO,EAAE,KAAK,EAAE,KAAK,EAAE,IAAI,OAAO,EAAE,CAAC;oBACzD,IAAI,KAAK,IAAI,CAAC,OAAO,IAAI,IAAI,CAAC,UAAU;wBAAE,SAAS;oBAEnD,IAAI,CAAC,aAAa,CAAC,OAAO,CAAC,CAAC;oBAE5B,MAAM;wBACJ,GAAG,EAAE,OAAO;wBACZ,IAAI,EAAE,IAAI,CAAC,cAAc,CAAC,OAAO,CAAC;wBAClC,OAAO;wBACP,KAAK,EAAE,IAAI,CAAC,YAAY,CAAC,OAAO,CAAC;qBAClC,CAAC;oBAEF,+CAA+C;oBAC/C,MAAM,YAAY
,GAAG,KAAK,CAAC,GAAG,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC;oBAC7C,IAAI,YAAY,GAAG,IAAI,CAAC,QAAQ,EAAE,CAAC;wBACjC,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;4BACzB,MAAM,cAAc,GAAG,IAAI,CAAC,YAAY,CAAC,IAAI,CAAC,CAAC;4BAC/C,IAAI,IAAI,CAAC,WAAW,CAAC,cAAc,CAAC,IAAI,CAAC,YAAY,CAAC,GAAG,CAAC,cAAc,CAAC,EAAE,CAAC;gCAC1E,YAAY,CAAC,GAAG,CAAC,cAAc,EAAE,YAAY,GAAG,CAAC,CAAC,CAAC;4BACrD,CAAC;wBACH,CAAC;oBACH,CAAC;oBAED,uCAAuC;oBACvC,IAAI,IAAI,CAAC,QAAQ,CAAC,IAAI,IAAI,IAAI,CAAC,mBAAmB,EAAE,CAAC;wBACnD,OAAO,CAAC,KAAK,CAAC,uCAAuC,CAAC,CAAC;wBACvD,OAAO;oBACT,CAAC;gBACH,CAAC;gBAED,4BAA4B;gBAC5B,MAAM,IAAI,OAAO,CAAC,OAAO,CAAC,EAAE,CAAC,UAAU,CAAC,OAAO,EAAE,IAAI,CAAC,CAAC,CAAC;YAC1D,CAAC;YAAC,OAAO,CAAC,EAAE,CAAC;gBACX,OAAO,CAAC,KAAK,CAAC,0CAA0C,EAAE,CAAC,CAAC,CAAC;YAC/D,CAAC;QACH,CAAC;QAED,OAAO,CAAC,KAAK,CAAC,kCAAkC,CAAC,CAAC;IACpD,CAAC;IAEO,KAAK,CAAC,oBAAoB,CAAC,GAAW;QAK5C,OAAO,IAAI,CAAC,gBAAgB,CAAC,KAAK,IAAI,EAAE;YACtC,IAAI,CAAC;gBACH,oCAAoC;gBACpC,MAAM,UAAU,GAAG,IAAI,eAAe,EAAE,CAAC;gBACzC,MAAM,SAAS,GAAG,UAAU,CAAC,GAAG,EAAE,CAAC,UAAU,CAAC,KAAK,EAAE,EAAE,IAAI,CAAC,aAAa,CAAC,CAAC;gBAE3E,MAAM,QAAQ,GAAG,MAAM,KAAK,CAAC,GAAG,EAAE;oBAChC,MAAM,EAAE,UAAU,CAAC,MAAM;oBACzB,OAAO,EAAE;wBACP,YAAY,EAAE,qHAAqH;qBACpI;iBACF,CAAC,CAAC;gBAEH,YAAY,CAAC,SAAS,CAAC,CAAC;gBAExB,IAAI,CAAC,QAAQ,CAAC,EAAE,EAAE,CAAC;oBACjB,MAAM,IAAI,KAAK,CAAC,uBAAuB,QAAQ,CAAC,MAAM,EAAE,CAAC,CAAC;gBAC5D,CAAC;gBAED,MAAM,OAAO,GAAG,MAAM,QAAQ,CAAC,IAAI,EAAE,CAAC;gBACtC,MAAM,KAAK,GAAG,IAAI,CAAC,YAAY,CAAC,OAAO,EAAE,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC,CAAC;gBAEvD,OAAO,EAAE,OAAO,EAAE,KAAK,EAAE,CAAC;YAC5B,CAAC;YAAC,OAAO,CAAC,EAAE,CAAC;gBACX,IAAI,CAAC,YAAY,KAAK,EAAE,CAAC;oBACvB,OAAO,EAAE,OAAO,EAAE,IAAI,EAAE,KAAK,EAAE,EAAE,EAAE,KAAK,EAAE,CAAC,EAAE,CAAC;gBAChD,CAAC;gBACD,OAAO,EAAE,OAAO,EAAE,IAAI,EAAE,KAAK,EAAE,EAAE,EAAE,KAAK,EAAE,IAAI,KAAK,CAAC,wBAAwB,CAAC,EAAE,CAAC;YAClF,CAAC;QACH,CAAC,CAAC,CAAC;IACL,CAAC;IAEO,YAAY,CAAC,IAAY,EAAE,OAAY;QAC7C,IAAI,CAAC;YACH,MAAM,CAAC,GAAG,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YAC7B,MAAM,KAAK,GAAG,IAAI,GAAG,EAAU,CAAC;YAEhC,yDAAyD;YACzD,CA
AC,CAAC,GAAG,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,OAAO,EAAE,EAAE;gBACzB,MAAM,IAAI,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;gBACrC,IAAI,CAAC,IAAI;oBAAE,OAAO;gBAElB,IAAI,CAAC;oBACH,MAAM,GAAG,GAAG,IAAI,GAAG,CAAC,IAAI,EAAE,OAAO,CAAC,CAAC;oBACnC,MAAM,aAAa,GAAG,IAAI,CAAC,YAAY,CAAC,GAAG,CAAC,QAAQ,EAAE,CAAC,CAAC;oBAExD,mCAAmC;oBACnC,IAAI,IAAI,CAAC,WAAW,CAAC,aAAa,CAAC,EAAE,CAAC;wBACpC,KAAK,CAAC,GAAG,CAAC,aAAa,CAAC,CAAC;oBAC3B,CAAC;gBACH,CAAC;gBAAC,OAAO,CAAC,EAAE,CAAC;oBACX,OAAO,CAAC,KAAK,CAAC,gCAAgC,IAAI,GAAG,EAAE,CAAC,CAAC,CAAC;gBAC5D,CAAC;YACH,CAAC,CAAC,CAAC;YAEH,OAAO,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;QAC3B,CAAC;QAAC,OAAO,CAAC,EAAE,CAAC;YACX,OAAO,CAAC,KAAK,CAAC,0CAA0C,EAAE,CAAC,CAAC,CAAC;YAC7D,OAAO,EAAE,CAAC;QACZ,CAAC;IACH,CAAC;IAEO,YAAY,CAAC,IAAY;QAC/B,IAAI,CAAC;YACH,MAAM,CAAC,GAAG,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YAC7B,OAAO,CAAC,CAAC,OAAO,CAAC,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,IAAI,UAAU,CAAC;QAChD,CAAC;QAAC,OAAO,CAAC,EAAE,CAAC;YACX,OAAO,CAAC,KAAK,CAAC,0CAA0C,EAAE,CAAC,CAAC,CAAC;YAC7D,OAAO,UAAU,CAAC;QACpB,CAAC;IACH,CAAC;CACF"}
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
import { CrawlResult, DocsCrawlerType, WebCrawler } from '../types.js';
|
|
2
|
+
import { StorageState } from './crawlee-crawler.js';
|
|
3
|
+
export declare class DocsCrawler implements WebCrawler {
    /** Maximum link depth to follow from the start URL. */
    private readonly maxDepth;
    /** Hard cap on the total number of pages fetched per crawl. */
    private readonly maxRequestsPerCrawl;
    /** Optional GitHub API token, forwarded to the GitHub crawler. */
    private readonly githubToken?;
    /** Optional progress callback invoked during crawling. */
    private readonly onProgress?;
    /** Hostname used to detect GitHub repository URLs. */
    private readonly GITHUB_HOST;
    /** Minimum page count for a Crawlee crawl to be considered successful. */
    private readonly MIN_PAGES;
    /** Set by abort(); checked between yielded pages to stop early. */
    private isAborting;
    /** Authentication cookies passed through to the Crawlee crawler. */
    private storageState?;
    /** Optional URL-path prefix restricting which pages are crawled. */
    private pathPrefix?;
    constructor(maxDepth?: number, maxRequestsPerCrawl?: number, githubToken?: string | undefined, onProgress?: ((progress: number, description: string) => void) | undefined);
    /**
     * Set an optional path prefix to restrict crawling to URLs under this path.
     * Only pages whose path starts with this prefix will be crawled.
     * Example: '/oss/javascript/langchain' would only crawl pages under that path.
     */
    setPathPrefix(prefix: string): void;
    /**
     * Set authentication storage state (cookies) to use when crawling
     */
    setStorageState(state: StorageState): void;
    /** Crawl `url`, yielding page results; returns which crawler type was used. */
    crawl(url: string): AsyncGenerator<CrawlResult, DocsCrawlerType>;
    /** Request that an in-progress crawl stop as soon as possible. */
    abort(): void;
}
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
import { URL } from 'url';
|
|
2
|
+
import { CrawleeCrawler } from './crawlee-crawler.js';
|
|
3
|
+
import { GitHubCrawler } from './github.js';
|
|
4
|
+
import { logger } from '../util/logger.js';
|
|
5
|
+
export class DocsCrawler {
  maxDepth;
  maxRequestsPerCrawl;
  githubToken;
  onProgress;
  // Hostname used to detect GitHub repository URLs.
  GITHUB_HOST = 'github.com';
  MIN_PAGES = 2; // Require at least 2 pages for component libraries
  isAborting = false;
  storageState;
  pathPrefix;

  /**
   * @param {number} [maxDepth=4] - Maximum link depth to follow.
   * @param {number} [maxRequestsPerCrawl=1000] - Hard cap on pages fetched.
   * @param {string} [githubToken] - Optional token for the GitHub crawler.
   * @param {(progress: number, description: string) => void} [onProgress] - Optional progress callback.
   */
  constructor(maxDepth = 4, maxRequestsPerCrawl = 1000, githubToken, onProgress) {
    this.maxDepth = maxDepth;
    this.maxRequestsPerCrawl = maxRequestsPerCrawl;
    this.githubToken = githubToken;
    this.onProgress = onProgress;
  }

  /**
   * Set an optional path prefix to restrict crawling to URLs under this path.
   * Only pages whose path starts with this prefix will be crawled.
   * Example: '/oss/javascript/langchain' would only crawl pages under that path.
   */
  setPathPrefix(prefix) {
    this.pathPrefix = prefix;
    logger.info(`[DocsCrawler] Path prefix restriction set: ${prefix}`);
  }

  /**
   * Set authentication storage state (cookies) to use when crawling
   */
  setStorageState(state) {
    this.storageState = state;
    logger.info(`[DocsCrawler] Set storage state with ${state.cookies?.length || 0} cookies`);
  }

  /**
   * Crawl `url`, dispatching to GitHubCrawler for github.com URLs and
   * CrawleeCrawler otherwise. Yields page results; returns the crawler
   * type ('github' or 'crawlee') as the generator's return value.
   * Throws if the GitHub crawler fails, or if Crawlee yields fewer than
   * MIN_PAGES pages (unless the crawl was aborted).
   */
  async *crawl(url) {
    const startUrl = new URL(url);
    logger.debug(`[DocsCrawler] Starting crawl of ${startUrl}`);
    if (this.isAborting) {
      logger.debug('[DocsCrawler] Crawl aborted');
      return 'crawlee';
    }
    // Handle GitHub repositories.
    if (startUrl.host === this.GITHUB_HOST) {
      logger.debug('[DocsCrawler] Detected GitHub repository');
      const githubCrawler = new GitHubCrawler(this.maxDepth, this.maxRequestsPerCrawl, this.githubToken, this.onProgress);
      try {
        for await (const page of githubCrawler.crawl(url)) {
          if (this.isAborting)
            break;
          yield page;
        }
        return 'github';
      }
      catch (e) {
        logger.debug('[DocsCrawler] GitHub crawler failed:', e);
        // Don't fall through to other crawlers for GitHub URLs.
        throw e;
      }
    }
    // Use Crawlee for all other sites.
    logger.debug('[DocsCrawler] Using Crawlee crawler');
    const crawleeCrawler = new CrawleeCrawler(this.maxDepth, this.maxRequestsPerCrawl, this.onProgress);
    // Pass authentication if available.
    if (this.storageState) {
      crawleeCrawler.setStorageState(this.storageState);
    }
    // Pass path prefix restriction if configured.
    if (this.pathPrefix) {
      crawleeCrawler.setPathPrefix(this.pathPrefix);
    }
    let pageCount = 0;
    try {
      for await (const page of crawleeCrawler.crawl(url)) {
        if (this.isAborting)
          break;
        pageCount++;
        yield page;
      }
      // Fix: a deliberate abort() used to fall through to the MIN_PAGES
      // check below and throw a spurious "insufficient pages" error.
      if (this.isAborting) {
        logger.debug('[DocsCrawler] Crawl aborted');
        return 'crawlee';
      }
      if (pageCount >= this.MIN_PAGES) {
        logger.debug(`[DocsCrawler] Crawlee crawler successful (${pageCount} pages)`);
        return 'crawlee';
      }
      logger.debug(`[DocsCrawler] Crawlee crawler found insufficient pages (${pageCount})`);
      throw new Error(`Crawlee crawler found only ${pageCount} pages, need at least ${this.MIN_PAGES}`);
    }
    catch (e) {
      logger.debug('[DocsCrawler] Crawlee crawler failed:', e);
      throw e;
    }
  }

  /** Request that an in-progress crawl stop as soon as possible. */
  abort() {
    this.isAborting = true;
  }
}
|
|
97
|
+
//# sourceMappingURL=docs-crawler.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"docs-crawler.js","sourceRoot":"","sources":["../../src/crawler/docs-crawler.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,GAAG,EAAE,MAAM,KAAK,CAAC;AAE1B,OAAO,EAAE,cAAc,EAAgB,MAAM,sBAAsB,CAAC;AACpE,OAAO,EAAE,aAAa,EAAE,MAAM,aAAa,CAAC;AAC5C,OAAO,EAAE,MAAM,EAAE,MAAM,mBAAmB,CAAC;AAE3C,MAAM,OAAO,WAAW;IAQH;IACA;IACA;IACA;IAVF,WAAW,GAAG,YAAY,CAAC;IAC3B,SAAS,GAAG,CAAC,CAAC,CAAC,mDAAmD;IAC3E,UAAU,GAAG,KAAK,CAAC;IACnB,YAAY,CAAgB;IAC5B,UAAU,CAAU;IAE5B,YACmB,WAAmB,CAAC,EACpB,sBAA8B,IAAI,EAClC,WAAoB,EACpB,UAA4D;QAH5D,aAAQ,GAAR,QAAQ,CAAY;QACpB,wBAAmB,GAAnB,mBAAmB,CAAe;QAClC,gBAAW,GAAX,WAAW,CAAS;QACpB,eAAU,GAAV,UAAU,CAAkD;IAC5E,CAAC;IAEJ;;;;OAIG;IACH,aAAa,CAAC,MAAc;QAC1B,IAAI,CAAC,UAAU,GAAG,MAAM,CAAC;QACzB,MAAM,CAAC,IAAI,CAAC,8CAA8C,MAAM,EAAE,CAAC,CAAC;IACtE,CAAC;IAED;;OAEG;IACH,eAAe,CAAC,KAAmB;QACjC,IAAI,CAAC,YAAY,GAAG,KAAK,CAAC;QAC1B,MAAM,CAAC,IAAI,CAAC,wCAAwC,KAAK,CAAC,OAAO,EAAE,MAAM,IAAI,CAAC,UAAU,CAAC,CAAC;IAC5F,CAAC;IAED,KAAK,CAAC,CAAC,KAAK,CAAC,GAAW;QACtB,MAAM,QAAQ,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC;QAC9B,MAAM,CAAC,KAAK,CAAC,mCAAmC,QAAQ,EAAE,CAAC,CAAC;QAE5D,IAAI,IAAI,CAAC,UAAU,EAAE,CAAC;YACpB,MAAM,CAAC,KAAK,CAAC,6BAA6B,CAAC,CAAC;YAC5C,OAAO,SAAS,CAAC;QACnB,CAAC;QAED,6BAA6B;QAC7B,IAAI,QAAQ,CAAC,IAAI,KAAK,IAAI,CAAC,WAAW,EAAE,CAAC;YACvC,MAAM,CAAC,KAAK,CAAC,0CAA0C,CAAC,CAAC;YACzD,MAAM,aAAa,GAAG,IAAI,aAAa,CAAC,IAAI,CAAC,QAAQ,EAAE,IAAI,CAAC,mBAAmB,EAAE,IAAI,CAAC,WAAW,EAAE,IAAI,CAAC,UAAU,CAAC,CAAC;YAEpH,IAAI,CAAC;gBACH,IAAI,KAAK,EAAE,MAAM,IAAI,IAAI,aAAa,CAAC,KAAK,CAAC,GAAG,CAAC,EAAE,CAAC;oBAClD,IAAI,IAAI,CAAC,UAAU;wBAAE,MAAM;oBAC3B,MAAM,IAAI,CAAC;gBACb,CAAC;gBACD,OAAO,QAAQ,CAAC;YAClB,CAAC;YAAC,OAAO,CAAC,EAAE,CAAC;gBACX,MAAM,CAAC,KAAK,CAAC,sCAAsC,EAAE,CAAC,CAAC,CAAC;gBACxD,uDAAuD;gBACvD,MAAM,CAAC,CAAC;YACV,CAAC;QACH,CAAC;QAED,kCAAkC;QAClC,MAAM,CAAC,KAAK,CAAC,qCAAqC,CAAC,CAAC;QACpD,MAAM,cAAc,GAAG,IAAI,cAAc,CAAC,IAAI,CAAC,QAAQ,EAAE,IAAI,CAAC,mBAAmB,EAAE,IAAI,CAAC,UAAU,CAAC,CAAC;QAEpG,mCAAmC;QACnC,IAAI,IAAI,CAAC,YAAY,EAAE,CAAC;YACtB,cAAc,CAAC,eAAe,CAAC,IAAI,CAAC,YAAY,C
AAC,CAAC;QACpD,CAAC;QAED,6CAA6C;QAC7C,IAAI,IAAI,CAAC,UAAU,EAAE,CAAC;YACpB,cAAc,CAAC,aAAa,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC;QAChD,CAAC;QAED,IAAI,SAAS,GAAG,CAAC,CAAC;QAElB,IAAI,CAAC;YACH,IAAI,KAAK,EAAE,MAAM,IAAI,IAAI,cAAc,CAAC,KAAK,CAAC,GAAG,CAAC,EAAE,CAAC;gBACnD,IAAI,IAAI,CAAC,UAAU;oBAAE,MAAM;gBAC3B,SAAS,EAAE,CAAC;gBACZ,MAAM,IAAI,CAAC;YACb,CAAC;YAED,IAAI,SAAS,IAAI,IAAI,CAAC,SAAS,EAAE,CAAC;gBAChC,MAAM,CAAC,KAAK,CAAC,6CAA6C,SAAS,SAAS,CAAC,CAAC;gBAC9E,OAAO,SAAS,CAAC;YACnB,CAAC;YACD,MAAM,CAAC,KAAK,CAAC,2DAA2D,SAAS,GAAG,CAAC,CAAC;YACtF,MAAM,IAAI,KAAK,CAAC,8BAA8B,SAAS,yBAAyB,IAAI,CAAC,SAAS,EAAE,CAAC,CAAC;QACpG,CAAC;QAAC,OAAO,CAAC,EAAE,CAAC;YACX,MAAM,CAAC,KAAK,CAAC,uCAAuC,EAAE,CAAC,CAAC,CAAC;YACzD,MAAM,CAAC,CAAC;QACV,CAAC;IACH,CAAC;IAED,KAAK;QACH,IAAI,CAAC,UAAU,GAAG,IAAI,CAAC;IACzB,CAAC;CACF"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
// Empty export marks this compiled test declaration file as an ES module.
export {};
|
|
@@ -0,0 +1,185 @@
|
|
|
1
|
+
import { DocsCrawler } from './docs-crawler.js';
// Mock GitHubCrawler so tests control what its crawl() generator yields.
// NOTE(review): relies on vitest globals (vi/describe/it) being enabled —
// confirm against the project's vitest configuration.
const mockGitHubCrawl = vi.fn();
vi.mock('./github.js', () => ({
  GitHubCrawler: function () {
    return {
      crawl: mockGitHubCrawl,
    };
  },
}));
// Mock CrawleeCrawler, exposing spies for crawl() and setStorageState().
const mockCrawleeCrawl = vi.fn();
const mockSetStorageState = vi.fn();
vi.mock('./crawlee-crawler.js', () => ({
  CrawleeCrawler: function () {
    return {
      crawl: mockCrawleeCrawl,
      setStorageState: mockSetStorageState,
    };
  },
}));
|
20
|
+
// Test suite for DocsCrawler, which dispatches github.com URLs to the
// (mocked) GitHubCrawler and all other URLs to the (mocked) CrawleeCrawler.
// DocsCrawler.crawl is an async generator whose RETURN value (result.value
// when result.done is true) identifies which crawler type was used.
describe('DocsCrawler', () => {
    let crawler;
    beforeEach(() => {
        vi.clearAllMocks();
        // Fresh instance per test so abort state does not leak between tests.
        crawler = new DocsCrawler();
    });
    describe('constructor', () => {
        it('should initialize with default values', () => {
            expect(crawler).toBeDefined();
        });
        it('should accept custom parameters', () => {
            const customCrawler = new DocsCrawler(10, 500, 'github_token', vi.fn());
            expect(customCrawler).toBeDefined();
        });
    });
    describe('crawl', () => {
        describe('GitHub URLs', () => {
            it('should use GitHubCrawler for github.com URLs', async () => {
                const mockResults = [
                    { url: 'https://github.com/owner/repo/README.md', path: 'README.md', content: '# README', title: 'README' },
                ];
                mockGitHubCrawl.mockImplementation(async function* () {
                    for (const result of mockResults) {
                        yield result;
                    }
                });
                const results = [];
                const generator = crawler.crawl('https://github.com/owner/repo');
                for await (const result of generator) {
                    results.push(result);
                }
                expect(results).toHaveLength(1);
                expect(results[0].url).toContain('github.com');
            });
            it('should return github type for GitHub URLs', async () => {
                mockGitHubCrawl.mockImplementation(async function* () {
                    yield { url: 'https://github.com/owner/repo', path: '/', content: 'test', title: 'Test' };
                });
                const generator = crawler.crawl('https://github.com/owner/repo');
                // Manually iterate to capture the return value
                // (for-await-of discards an async generator's return value).
                let result = await generator.next();
                while (!result.done) {
                    result = await generator.next();
                }
                const crawlerType = result.value;
                expect(crawlerType).toBe('github');
            });
            it('should propagate errors from GitHubCrawler', async () => {
                // eslint-disable-next-line require-yield
                mockGitHubCrawl.mockImplementation(async function* () {
                    throw new Error('GitHub API error');
                });
                const generator = crawler.crawl('https://github.com/owner/repo');
                await expect(async () => {
                    // eslint-disable-next-line @typescript-eslint/no-unused-vars
                    for await (const _ of generator) {
                        // Just consume results
                    }
                }).rejects.toThrow('GitHub API error');
            });
        });
        describe('Non-GitHub URLs', () => {
            it('should use CrawleeCrawler for non-GitHub URLs', async () => {
                const mockResults = [
                    { url: 'https://docs.example.com/guide', path: '/guide', content: '<h1>Guide</h1>', title: 'Guide' },
                    { url: 'https://docs.example.com/api', path: '/api', content: '<h1>API</h1>', title: 'API' },
                ];
                mockCrawleeCrawl.mockImplementation(async function* () {
                    for (const result of mockResults) {
                        yield result;
                    }
                });
                const results = [];
                for await (const result of crawler.crawl('https://docs.example.com')) {
                    results.push(result);
                }
                expect(results).toHaveLength(2);
            });
            it('should return crawlee type for sufficient pages', async () => {
                mockCrawleeCrawl.mockImplementation(async function* () {
                    yield { url: 'https://example.com/page1', path: '/page1', content: 'Page 1', title: 'Page 1' };
                    yield { url: 'https://example.com/page2', path: '/page2', content: 'Page 2', title: 'Page 2' };
                });
                const generator = crawler.crawl('https://example.com');
                // Manually iterate to capture the return value
                let result = await generator.next();
                while (!result.done) {
                    result = await generator.next();
                }
                const crawlerType = result.value;
                expect(crawlerType).toBe('crawlee');
            });
            it('should throw error when insufficient pages found', async () => {
                // DocsCrawler apparently requires a minimum page count from the
                // crawlee path (the error message reports the shortfall).
                mockCrawleeCrawl.mockImplementation(async function* () {
                    yield { url: 'https://example.com/page1', path: '/page1', content: 'Page 1', title: 'Page 1' };
                    // Only 1 page, needs at least 2
                });
                const generator = crawler.crawl('https://example.com');
                await expect(async () => {
                    // eslint-disable-next-line @typescript-eslint/no-unused-vars
                    for await (const _ of generator) {
                        // Just consume results
                    }
                }).rejects.toThrow(/found only 1 pages/);
            });
        });
        describe('abort', () => {
            it('should stop crawling when aborted', async () => {
                mockCrawleeCrawl.mockImplementation(async function* () {
                    yield { url: 'https://example.com/page1', path: '/page1', content: 'Page 1', title: 'Page 1' };
                    yield { url: 'https://example.com/page2', path: '/page2', content: 'Page 2', title: 'Page 2' };
                    yield { url: 'https://example.com/page3', path: '/page3', content: 'Page 3', title: 'Page 3' };
                });
                const results = [];
                const generator = crawler.crawl('https://example.com');
                // Get first result
                const first = await generator.next();
                if (!first.done) {
                    results.push(first.value);
                }
                // Abort
                crawler.abort();
                // Generator should stop yielding after abort (depending on implementation)
                // The test verifies abort() method exists and is callable
                // NOTE(review): this only asserts the pre-abort state; it does not
                // prove the generator actually stops yielding afterwards.
                expect(results).toHaveLength(1);
            });
            it('should return early when already aborting', async () => {
                crawler.abort();
                mockCrawleeCrawl.mockImplementation(async function* () {
                    yield { url: 'https://example.com/page1', path: '/page1', content: 'Page 1', title: 'Page 1' };
                });
                const generator = crawler.crawl('https://example.com');
                const result = await generator.next();
                // Should return immediately with crawlee type when aborted
                expect(result.done).toBe(true);
                expect(result.value).toBe('crawlee');
            });
        });
        describe('setStorageState', () => {
            it('should set storage state', () => {
                const storageState = {
                    cookies: [{ name: 'session', value: 'abc123', domain: 'example.com', path: '/' }],
                };
                crawler.setStorageState(storageState);
                // Verify the method doesn't throw
                // NOTE(review): tautological assertion — real coverage is in the
                // pass-through test below.
                expect(true).toBe(true);
            });
            it('should pass storage state to CrawleeCrawler', async () => {
                const storageState = {
                    cookies: [{ name: 'session', value: 'abc123', domain: 'example.com', path: '/' }],
                };
                crawler.setStorageState(storageState);
                mockCrawleeCrawl.mockImplementation(async function* () {
                    yield { url: 'https://example.com/page1', path: '/page1', content: 'Page 1', title: 'Page 1' };
                    yield { url: 'https://example.com/page2', path: '/page2', content: 'Page 2', title: 'Page 2' };
                });
                // eslint-disable-next-line @typescript-eslint/no-unused-vars
                for await (const _ of crawler.crawl('https://example.com')) {
                    // Just consume results
                }
                expect(mockSetStorageState).toHaveBeenCalledWith(storageState);
            });
        });
    });
});
|
|
185
|
+
//# sourceMappingURL=docs-crawler.test.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"docs-crawler.test.js","sourceRoot":"","sources":["../../src/crawler/docs-crawler.test.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,WAAW,EAAE,MAAM,mBAAmB,CAAC;AAEhD,MAAM,eAAe,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC;AAChC,EAAE,CAAC,IAAI,CAAC,aAAa,EAAE,GAAG,EAAE,CAAC,CAAC;IAC5B,aAAa,EAAE;QACb,OAAO;YACL,KAAK,EAAE,eAAe;SACvB,CAAC;IACJ,CAAC;CACF,CAAC,CAAC,CAAC;AAEJ,MAAM,gBAAgB,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC;AACjC,MAAM,mBAAmB,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC;AACpC,EAAE,CAAC,IAAI,CAAC,sBAAsB,EAAE,GAAG,EAAE,CAAC,CAAC;IACrC,cAAc,EAAE;QACd,OAAO;YACL,KAAK,EAAE,gBAAgB;YACvB,eAAe,EAAE,mBAAmB;SACrC,CAAC;IACJ,CAAC;CACF,CAAC,CAAC,CAAC;AAEJ,QAAQ,CAAC,aAAa,EAAE,GAAG,EAAE;IAC3B,IAAI,OAAoB,CAAC;IAEzB,UAAU,CAAC,GAAG,EAAE;QACd,EAAE,CAAC,aAAa,EAAE,CAAC;QACnB,OAAO,GAAG,IAAI,WAAW,EAAE,CAAC;IAC9B,CAAC,CAAC,CAAC;IAEH,QAAQ,CAAC,aAAa,EAAE,GAAG,EAAE;QAC3B,EAAE,CAAC,uCAAuC,EAAE,GAAG,EAAE;YAC/C,MAAM,CAAC,OAAO,CAAC,CAAC,WAAW,EAAE,CAAC;QAChC,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,iCAAiC,EAAE,GAAG,EAAE;YACzC,MAAM,aAAa,GAAG,IAAI,WAAW,CAAC,EAAE,EAAE,GAAG,EAAE,cAAc,EAAE,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC;YACxE,MAAM,CAAC,aAAa,CAAC,CAAC,WAAW,EAAE,CAAC;QACtC,CAAC,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;IAEH,QAAQ,CAAC,OAAO,EAAE,GAAG,EAAE;QACrB,QAAQ,CAAC,aAAa,EAAE,GAAG,EAAE;YAC3B,EAAE,CAAC,8CAA8C,EAAE,KAAK,IAAI,EAAE;gBAC5D,MAAM,WAAW,GAAkB;oBACjC,EAAE,GAAG,EAAE,yCAAyC,EAAE,IAAI,EAAE,WAAW,EAAE,OAAO,EAAE,UAAU,EAAE,KAAK,EAAE,QAAQ,EAAE;iBAC5G,CAAC;gBAEF,eAAe,CAAC,kBAAkB,CAAC,KAAK,SAAS,CAAC;oBAChD,KAAK,MAAM,MAAM,IAAI,WAAW,EAAE,CAAC;wBACjC,MAAM,MAAM,CAAC;oBACf,CAAC;gBACH,CAAC,CAAC,CAAC;gBAEH,MAAM,OAAO,GAAkB,EAAE,CAAC;gBAClC,MAAM,SAAS,GAAG,OAAO,CAAC,KAAK,CAAC,+BAA+B,CAAC,CAAC;gBAEjE,IAAI,KAAK,EAAE,MAAM,MAAM,IAAI,SAAS,EAAE,CAAC;oBACrC,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;gBACvB,CAAC;gBAED,MAAM,CAAC,OAAO,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC;gBAChC,MAAM,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,SAAS,CAAC,YAAY,CAAC,CAAC;YACjD,CAAC,CAAC,CAAC;YAEH,EAAE,CAAC,2CAA2C,EAAE,KAAK,IAAI,EAAE;gBACzD,eAAe,CAAC,kBAAkB,CAAC,KAAK,SAAS,CAAC;oBAChD,MAAM,EAA
E,GAAG,EAAE,+BAA+B,EAAE,IAAI,EAAE,GAAG,EAAE,OAAO,EAAE,MAAM,EAAE,KAAK,EAAE,MAAM,EAAE,CAAC;gBAC5F,CAAC,CAAC,CAAC;gBAEH,MAAM,SAAS,GAAG,OAAO,CAAC,KAAK,CAAC,+BAA+B,CAAC,CAAC;gBAEjE,+CAA+C;gBAC/C,IAAI,MAAM,GAAG,MAAM,SAAS,CAAC,IAAI,EAAE,CAAC;gBACpC,OAAO,CAAC,MAAM,CAAC,IAAI,EAAE,CAAC;oBACpB,MAAM,GAAG,MAAM,SAAS,CAAC,IAAI,EAAE,CAAC;gBAClC,CAAC;gBACD,MAAM,WAAW,GAAG,MAAM,CAAC,KAAK,CAAC;gBAEjC,MAAM,CAAC,WAAW,CAAC,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC;YACrC,CAAC,CAAC,CAAC;YAEH,EAAE,CAAC,4CAA4C,EAAE,KAAK,IAAI,EAAE;gBAC1D,yCAAyC;gBACzC,eAAe,CAAC,kBAAkB,CAAC,KAAK,SAAS,CAAC;oBAChD,MAAM,IAAI,KAAK,CAAC,kBAAkB,CAAC,CAAC;gBACtC,CAAC,CAAC,CAAC;gBAEH,MAAM,SAAS,GAAG,OAAO,CAAC,KAAK,CAAC,+BAA+B,CAAC,CAAC;gBAEjE,MAAM,MAAM,CAAC,KAAK,IAAI,EAAE;oBACtB,6DAA6D;oBAC7D,IAAI,KAAK,EAAE,MAAM,CAAC,IAAI,SAAS,EAAE,CAAC;wBAChC,uBAAuB;oBACzB,CAAC;gBACH,CAAC,CAAC,CAAC,OAAO,CAAC,OAAO,CAAC,kBAAkB,CAAC,CAAC;YACzC,CAAC,CAAC,CAAC;QACL,CAAC,CAAC,CAAC;QAEH,QAAQ,CAAC,iBAAiB,EAAE,GAAG,EAAE;YAC/B,EAAE,CAAC,+CAA+C,EAAE,KAAK,IAAI,EAAE;gBAC7D,MAAM,WAAW,GAAkB;oBACjC,EAAE,GAAG,EAAE,gCAAgC,EAAE,IAAI,EAAE,QAAQ,EAAE,OAAO,EAAE,gBAAgB,EAAE,KAAK,EAAE,OAAO,EAAE;oBACpG,EAAE,GAAG,EAAE,8BAA8B,EAAE,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,cAAc,EAAE,KAAK,EAAE,KAAK,EAAE;iBAC7F,CAAC;gBAEF,gBAAgB,CAAC,kBAAkB,CAAC,KAAK,SAAS,CAAC;oBACjD,KAAK,MAAM,MAAM,IAAI,WAAW,EAAE,CAAC;wBACjC,MAAM,MAAM,CAAC;oBACf,CAAC;gBACH,CAAC,CAAC,CAAC;gBAEH,MAAM,OAAO,GAAkB,EAAE,CAAC;gBAClC,IAAI,KAAK,EAAE,MAAM,MAAM,IAAI,OAAO,CAAC,KAAK,CAAC,0BAA0B,CAAC,EAAE,CAAC;oBACrE,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;gBACvB,CAAC;gBAED,MAAM,CAAC,OAAO,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC;YAClC,CAAC,CAAC,CAAC;YAEH,EAAE,CAAC,iDAAiD,EAAE,KAAK,IAAI,EAAE;gBAC/D,gBAAgB,CAAC,kBAAkB,CAAC,KAAK,SAAS,CAAC;oBACjD,MAAM,EAAE,GAAG,EAAE,2BAA2B,EAAE,IAAI,EAAE,QAAQ,EAAE,OAAO,EAAE,QAAQ,EAAE,KAAK,EAAE,QAAQ,EAAE,CAAC;oBAC/F,MAAM,EAAE,GAAG,EAAE,2BAA2B,EAAE,IAAI,EAAE,QAAQ,EAAE,OAAO,EAAE,QAAQ,EAAE,KAAK,EAAE,QAAQ,EAAE,CAAC;gBACjG,CAAC,CAAC,CAAC;gBAEH,MAAM,SAAS,GAAG,OAAO,CAAC,KAAK,CAAC,qBAAqB,CAAC,CAAC;gBAEvD,+CAA+C;gBAC/C,IAAI,MAA
M,GAAG,MAAM,SAAS,CAAC,IAAI,EAAE,CAAC;gBACpC,OAAO,CAAC,MAAM,CAAC,IAAI,EAAE,CAAC;oBACpB,MAAM,GAAG,MAAM,SAAS,CAAC,IAAI,EAAE,CAAC;gBAClC,CAAC;gBACD,MAAM,WAAW,GAAG,MAAM,CAAC,KAAK,CAAC;gBAEjC,MAAM,CAAC,WAAW,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;YACtC,CAAC,CAAC,CAAC;YAEH,EAAE,CAAC,kDAAkD,EAAE,KAAK,IAAI,EAAE;gBAChE,gBAAgB,CAAC,kBAAkB,CAAC,KAAK,SAAS,CAAC;oBACjD,MAAM,EAAE,GAAG,EAAE,2BAA2B,EAAE,IAAI,EAAE,QAAQ,EAAE,OAAO,EAAE,QAAQ,EAAE,KAAK,EAAE,QAAQ,EAAE,CAAC;oBAC/F,gCAAgC;gBAClC,CAAC,CAAC,CAAC;gBAEH,MAAM,SAAS,GAAG,OAAO,CAAC,KAAK,CAAC,qBAAqB,CAAC,CAAC;gBAEvD,MAAM,MAAM,CAAC,KAAK,IAAI,EAAE;oBACtB,6DAA6D;oBAC7D,IAAI,KAAK,EAAE,MAAM,CAAC,IAAI,SAAS,EAAE,CAAC;wBAChC,uBAAuB;oBACzB,CAAC;gBACH,CAAC,CAAC,CAAC,OAAO,CAAC,OAAO,CAAC,oBAAoB,CAAC,CAAC;YAC3C,CAAC,CAAC,CAAC;QACL,CAAC,CAAC,CAAC;QAEH,QAAQ,CAAC,OAAO,EAAE,GAAG,EAAE;YACrB,EAAE,CAAC,mCAAmC,EAAE,KAAK,IAAI,EAAE;gBACjD,gBAAgB,CAAC,kBAAkB,CAAC,KAAK,SAAS,CAAC;oBACjD,MAAM,EAAE,GAAG,EAAE,2BAA2B,EAAE,IAAI,EAAE,QAAQ,EAAE,OAAO,EAAE,QAAQ,EAAE,KAAK,EAAE,QAAQ,EAAE,CAAC;oBAC/F,MAAM,EAAE,GAAG,EAAE,2BAA2B,EAAE,IAAI,EAAE,QAAQ,EAAE,OAAO,EAAE,QAAQ,EAAE,KAAK,EAAE,QAAQ,EAAE,CAAC;oBAC/F,MAAM,EAAE,GAAG,EAAE,2BAA2B,EAAE,IAAI,EAAE,QAAQ,EAAE,OAAO,EAAE,QAAQ,EAAE,KAAK,EAAE,QAAQ,EAAE,CAAC;gBACjG,CAAC,CAAC,CAAC;gBAEH,MAAM,OAAO,GAAkB,EAAE,CAAC;gBAClC,MAAM,SAAS,GAAG,OAAO,CAAC,KAAK,CAAC,qBAAqB,CAAC,CAAC;gBAEvD,mBAAmB;gBACnB,MAAM,KAAK,GAAG,MAAM,SAAS,CAAC,IAAI,EAAE,CAAC;gBACrC,IAAI,CAAC,KAAK,CAAC,IAAI,EAAE,CAAC;oBAChB,OAAO,CAAC,IAAI,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC;gBAC5B,CAAC;gBAED,QAAQ;gBACR,OAAO,CAAC,KAAK,EAAE,CAAC;gBAEhB,2EAA2E;gBAC3E,0DAA0D;gBAC1D,MAAM,CAAC,OAAO,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC;YAClC,CAAC,CAAC,CAAC;YAEH,EAAE,CAAC,2CAA2C,EAAE,KAAK,IAAI,EAAE;gBACzD,OAAO,CAAC,KAAK,EAAE,CAAC;gBAEhB,gBAAgB,CAAC,kBAAkB,CAAC,KAAK,SAAS,CAAC;oBACjD,MAAM,EAAE,GAAG,EAAE,2BAA2B,EAAE,IAAI,EAAE,QAAQ,EAAE,OAAO,EAAE,QAAQ,EAAE,KAAK,EAAE,QAAQ,EAAE,CAAC;gBACjG,CAAC,CAAC,CAAC;gBAEH,MAAM,SAAS,GAAG,OAAO,CAAC,KAAK,CAAC,qBAAqB,CAAC,CAAC;gBACvD,MAAM,MAAM,GAAG,MAAM,SAAS,CAAC,IAAI,EAAE,CAAC;gBAE
tC,2DAA2D;gBAC3D,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;gBAC/B,MAAM,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;YACvC,CAAC,CAAC,CAAC;QACL,CAAC,CAAC,CAAC;QAEH,QAAQ,CAAC,iBAAiB,EAAE,GAAG,EAAE;YAC/B,EAAE,CAAC,0BAA0B,EAAE,GAAG,EAAE;gBAClC,MAAM,YAAY,GAAG;oBACnB,OAAO,EAAE,CAAC,EAAE,IAAI,EAAE,SAAS,EAAE,KAAK,EAAE,QAAQ,EAAE,MAAM,EAAE,aAAa,EAAE,IAAI,EAAE,GAAG,EAAE,CAAC;iBAClF,CAAC;gBAEF,OAAO,CAAC,eAAe,CAAC,YAAY,CAAC,CAAC;gBAEtC,kCAAkC;gBAClC,MAAM,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YAC1B,CAAC,CAAC,CAAC;YAEH,EAAE,CAAC,6CAA6C,EAAE,KAAK,IAAI,EAAE;gBAC3D,MAAM,YAAY,GAAG;oBACnB,OAAO,EAAE,CAAC,EAAE,IAAI,EAAE,SAAS,EAAE,KAAK,EAAE,QAAQ,EAAE,MAAM,EAAE,aAAa,EAAE,IAAI,EAAE,GAAG,EAAE,CAAC;iBAClF,CAAC;gBAEF,OAAO,CAAC,eAAe,CAAC,YAAY,CAAC,CAAC;gBAEtC,gBAAgB,CAAC,kBAAkB,CAAC,KAAK,SAAS,CAAC;oBACjD,MAAM,EAAE,GAAG,EAAE,2BAA2B,EAAE,IAAI,EAAE,QAAQ,EAAE,OAAO,EAAE,QAAQ,EAAE,KAAK,EAAE,QAAQ,EAAE,CAAC;oBAC/F,MAAM,EAAE,GAAG,EAAE,2BAA2B,EAAE,IAAI,EAAE,QAAQ,EAAE,OAAO,EAAE,QAAQ,EAAE,KAAK,EAAE,QAAQ,EAAE,CAAC;gBACjG,CAAC,CAAC,CAAC;gBAEH,6DAA6D;gBAC7D,IAAI,KAAK,EAAE,MAAM,CAAC,IAAI,OAAO,CAAC,KAAK,CAAC,qBAAqB,CAAC,EAAE,CAAC;oBAC3D,uBAAuB;gBACzB,CAAC;gBAED,MAAM,CAAC,mBAAmB,CAAC,CAAC,oBAAoB,CAAC,YAAY,CAAC,CAAC;YACjE,CAAC,CAAC,CAAC;QACL,CAAC,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;AACL,CAAC,CAAC,CAAC"}
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
import { BaseCrawler } from './base.js';
|
|
2
|
+
/**
 * Static factory that selects an appropriate crawler implementation for a
 * documentation-site URL (see factory.js for the selection heuristics).
 */
export declare class CrawlerFactory {
    /** HTML substrings used to sniff common JS frameworks / site generators. */
    private static readonly JS_FRAMEWORK_INDICATORS;
    /** Fetches the page once and classifies it as JS-heavy / framework-based. */
    private static detectSiteType;
    /**
     * Creates a crawler for `url`: Chromium for JavaScript-heavy sites,
     * otherwise the default crawler with a Cheerio fallback. Rejects when no
     * crawler can retrieve content from the site.
     */
    static createCrawler(url: string, maxRequestsPerCrawl?: number, maxDepth?: number, onProgress?: (progress: number, description: string) => void): Promise<BaseCrawler>;
}
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
import { URL } from 'url';
|
|
2
|
+
import { DefaultCrawler } from './default.js';
|
|
3
|
+
import { ChromiumCrawler } from './chromium.js';
|
|
4
|
+
import { CheerioCrawler } from './cheerio.js';
|
|
5
|
+
/**
 * Static factory that picks the crawler implementation best suited to a
 * documentation site: ChromiumCrawler for JavaScript-heavy sites, otherwise
 * DefaultCrawler with a CheerioCrawler fallback.
 */
export class CrawlerFactory {
    // Substrings that identify common JavaScript frameworks / static-site
    // generators when found anywhere in the fetched HTML (case-insensitive).
    static JS_FRAMEWORK_INDICATORS = [
        'react',
        'vue',
        'angular',
        'next',
        'nuxt',
        'gatsby',
        'docusaurus',
        'vuepress',
        'gridsome',
        'svelte'
    ];
    /**
     * Fetches the page once and heuristically classifies it.
     *
     * @param {string} url - Page to probe.
     * @param {number} [timeoutMs=10000] - Abort the probe after this long so a
     *   hung server cannot stall crawler creation indefinitely (bug fix: the
     *   original fetch had no timeout at all).
     * @returns {Promise<{isJsHeavy: boolean, hasFramework: boolean}>} Both
     *   flags are false on any fetch/HTTP/timeout error — classification is
     *   best-effort and must never prevent crawling.
     */
    static async detectSiteType(url, timeoutMs = 10000) {
        try {
            const response = await fetch(url, {
                headers: {
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
                },
                // Abort hung requests; the catch below maps the TimeoutError to
                // the same "plain site" fallback as any other fetch failure.
                signal: AbortSignal.timeout(timeoutMs)
            });
            if (!response.ok) {
                throw new Error(`HTTP error! status: ${response.status}`);
            }
            const html = await response.text();
            // Check for JavaScript frameworks. Lower-case the document once
            // instead of once per indicator (was O(indicators * page size)).
            const htmlLower = html.toLowerCase();
            const hasFramework = CrawlerFactory.JS_FRAMEWORK_INDICATORS.some((framework) => htmlLower.includes(framework));
            // Check for JavaScript-heavy markers (SPA bootstrap payloads and
            // framework attribute prefixes).
            // NOTE(review): 'ng-' and 'v-' are loose substring checks and can
            // false-positive on unrelated markup (e.g. "nav-bar"); kept as-is
            // for backward-compatible classification.
            const isJsHeavy = (html.includes('data-react') ||
                html.includes('ng-') ||
                html.includes('v-') ||
                html.includes('__NEXT_DATA__') ||
                html.includes('nuxt') ||
                html.includes('id="___gatsby"'));
            return { isJsHeavy, hasFramework };
        }
        catch (e) {
            // Best-effort probe: log and fall back to "plain site" so
            // createCrawler can still try the non-Chromium crawlers.
            console.error('[CrawlerFactory] Error detecting site type:', e);
            return { isJsHeavy: false, hasFramework: false };
        }
    }
    /**
     * Creates a crawler able to retrieve content from `url`.
     *
     * Selection order: Chromium when the site looks JavaScript-heavy, then
     * DefaultCrawler, then CheerioCrawler. Each non-Chromium candidate is
     * probed by pulling one page from its crawl generator.
     *
     * @param {string} url - Start URL; must be a valid absolute URL.
     * @param {number} [maxRequestsPerCrawl=1000] - Page budget per crawl.
     * @param {number} [maxDepth=4] - Maximum link depth to follow.
     * @param {(progress: number, description: string) => void} [onProgress]
     * @returns {Promise<BaseCrawler>} The selected crawler instance.
     * @throws {TypeError} When `url` is not a valid URL.
     * @throws {Error} When every crawler fails to fetch a first page.
     */
    static async createCrawler(url, maxRequestsPerCrawl = 1000, maxDepth = 4, onProgress) {
        // Validates the URL early; `new URL` throws on malformed input.
        const startUrl = new URL(url);
        console.debug(`[CrawlerFactory] Creating crawler for ${startUrl}`);
        // Check if site is JavaScript-heavy first
        const { isJsHeavy, hasFramework } = await CrawlerFactory.detectSiteType(url);
        // Try Chromium for JavaScript-heavy sites
        if (isJsHeavy || hasFramework) {
            console.debug(`[CrawlerFactory] Site appears to be JavaScript-heavy, using Chromium crawler`);
            return new ChromiumCrawler(maxDepth, maxRequestsPerCrawl, onProgress);
        }
        // Try default crawler
        try {
            console.debug(`[CrawlerFactory] Attempting default crawler for ${url}`);
            const defaultCrawler = new DefaultCrawler(maxDepth, maxRequestsPerCrawl, onProgress);
            const generator = defaultCrawler.crawl(url);
            // Probe with a single page to confirm the crawler works on this site.
            // NOTE(review): the probed page is discarded — callers presumably
            // start a fresh crawl(); confirm crawl() is restartable.
            const { value: firstPage, done } = await generator.next();
            if (!done && firstPage?.content) {
                console.debug('[CrawlerFactory] Successfully created default crawler');
                return defaultCrawler;
            }
        }
        catch (e) {
            console.debug('[CrawlerFactory] Default crawler failed:', e);
        }
        // Fall back to Cheerio crawler
        console.debug(`[CrawlerFactory] Attempting Cheerio crawler for ${url}`);
        const cheerioCrawler = new CheerioCrawler(maxDepth, maxRequestsPerCrawl, onProgress);
        const generator = cheerioCrawler.crawl(url);
        const { value: firstPage, done } = await generator.next();
        if (!done && firstPage?.content) {
            console.debug('[CrawlerFactory] Successfully created Cheerio crawler');
            return cheerioCrawler;
        }
        console.error(`[CrawlerFactory] All crawlers failed for ${url}`);
        throw new Error(`Failed to create crawler for ${url}`);
    }
}
|
|
83
|
+
//# sourceMappingURL=factory.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"factory.js","sourceRoot":"","sources":["../../src/crawler/factory.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,GAAG,EAAE,MAAM,KAAK,CAAC;AAE1B,OAAO,EAAE,cAAc,EAAE,MAAM,cAAc,CAAC;AAC9C,OAAO,EAAE,eAAe,EAAE,MAAM,eAAe,CAAC;AAChD,OAAO,EAAE,cAAc,EAAE,MAAM,cAAc,CAAC;AAE9C,MAAM,OAAO,cAAc;IACzB,0CAA0C;IAClC,MAAM,CAAU,uBAAuB,GAAG;QAChD,OAAO;QACP,KAAK;QACL,SAAS;QACT,MAAM;QACN,MAAM;QACN,QAAQ;QACR,YAAY;QACZ,UAAU;QACV,UAAU;QACV,QAAQ;KACT,CAAC;IAEM,MAAM,CAAC,KAAK,CAAC,cAAc,CAAC,GAAW;QAI7C,IAAI,CAAC;YACH,MAAM,QAAQ,GAAG,MAAM,KAAK,CAAC,GAAG,EAAE;gBAChC,OAAO,EAAE;oBACP,YAAY,EAAE,qHAAqH;iBACpI;aACF,CAAC,CAAC;YAEH,IAAI,CAAC,QAAQ,CAAC,EAAE,EAAE,CAAC;gBACjB,MAAM,IAAI,KAAK,CAAC,uBAAuB,QAAQ,CAAC,MAAM,EAAE,CAAC,CAAC;YAC5D,CAAC;YAED,MAAM,IAAI,GAAG,MAAM,QAAQ,CAAC,IAAI,EAAE,CAAC;YAEnC,kCAAkC;YAClC,MAAM,YAAY,GAAG,cAAc,CAAC,uBAAuB,CAAC,IAAI,CAAC,SAAS,CAAC,EAAE,CAC3E,IAAI,CAAC,WAAW,EAAE,CAAC,QAAQ,CAAC,SAAS,CAAC,CACvC,CAAC;YAEF,wCAAwC;YACxC,MAAM,SAAS,GAAG,CAChB,IAAI,CAAC,QAAQ,CAAC,YAAY,CAAC;gBAC3B,IAAI,CAAC,QAAQ,CAAC,KAAK,CAAC;gBACpB,IAAI,CAAC,QAAQ,CAAC,IAAI,CAAC;gBACnB,IAAI,CAAC,QAAQ,CAAC,eAAe,CAAC;gBAC9B,IAAI,CAAC,QAAQ,CAAC,MAAM,CAAC;gBACrB,IAAI,CAAC,QAAQ,CAAC,gBAAgB,CAAC,CAChC,CAAC;YAEF,OAAO,EAAE,SAAS,EAAE,YAAY,EAAE,CAAC;QACrC,CAAC;QAAC,OAAO,CAAC,EAAE,CAAC;YACX,OAAO,CAAC,KAAK,CAAC,6CAA6C,EAAE,CAAC,CAAC,CAAC;YAChE,OAAO,EAAE,SAAS,EAAE,KAAK,EAAE,YAAY,EAAE,KAAK,EAAE,CAAC;QACnD,CAAC;IACH,CAAC;IAED,MAAM,CAAC,KAAK,CAAC,aAAa,CACxB,GAAW,EACX,sBAA8B,IAAI,EAClC,WAAmB,CAAC,EACpB,UAA4D;QAE5D,MAAM,QAAQ,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC;QAC9B,OAAO,CAAC,KAAK,CAAC,yCAAyC,QAAQ,EAAE,CAAC,CAAC;QAEnE,0CAA0C;QAC1C,MAAM,EAAE,SAAS,EAAE,YAAY,EAAE,GAAG,MAAM,cAAc,CAAC,cAAc,CAAC,GAAG,CAAC,CAAC;QAE7E,0CAA0C;QAC1C,IAAI,SAAS,IAAI,YAAY,EAAE,CAAC;YAC9B,OAAO,CAAC,KAAK,CAAC,8EAA8E,CAAC,CAAC;YAC9F,OAAO,IAAI,eAAe,CAAC,QAAQ,EAAE,mBAAmB,EAAE,UAAU,CAAC,CAAC;QACxE,CAAC;QAED,sBAAsB;QACtB,IAAI,CAAC;YACH,OAAO,CAAC,KAAK,CAAC,mDAAmD,GAAG,EAAE,CAAC,CAAC;YACxE,MAAM,cAAc,GAAG,IAAI,cAAc,CAAC,QAAQ,EAAE,mBAAmB,EAAE,UAAU,CAAC,CA
AC;YACrF,MAAM,SAAS,GAAG,cAAc,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;YAC5C,MAAM,EAAE,KAAK,EAAE,SAAS,EAAE,IAAI,EAAE,GAAG,MAAM,SAAS,CAAC,IAAI,EAAE,CAAC;YAE1D,IAAI,CAAC,IAAI,IAAI,SAAS,EAAE,OAAO,EAAE,CAAC;gBAChC,OAAO,CAAC,KAAK,CAAC,uDAAuD,CAAC,CAAC;gBACvE,OAAO,cAAc,CAAC;YACxB,CAAC;QACH,CAAC;QAAC,OAAO,CAAC,EAAE,CAAC;YACX,OAAO,CAAC,KAAK,CAAC,0CAA0C,EAAE,CAAC,CAAC,CAAC;QAC/D,CAAC;QAED,+BAA+B;QAC/B,OAAO,CAAC,KAAK,CAAC,mDAAmD,GAAG,EAAE,CAAC,CAAC;QACxE,MAAM,cAAc,GAAG,IAAI,cAAc,CAAC,QAAQ,EAAE,mBAAmB,EAAE,UAAU,CAAC,CAAC;QACrF,MAAM,SAAS,GAAG,cAAc,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;QAC5C,MAAM,EAAE,KAAK,EAAE,SAAS,EAAE,IAAI,EAAE,GAAG,MAAM,SAAS,CAAC,IAAI,EAAE,CAAC;QAE1D,IAAI,CAAC,IAAI,IAAI,SAAS,EAAE,OAAO,EAAE,CAAC;YAChC,OAAO,CAAC,KAAK,CAAC,uDAAuD,CAAC,CAAC;YACvE,OAAO,cAAc,CAAC;QACxB,CAAC;QAED,OAAO,CAAC,KAAK,CAAC,4CAA4C,GAAG,EAAE,CAAC,CAAC;QACjE,MAAM,IAAI,KAAK,CAAC,gCAAgC,GAAG,EAAE,CAAC,CAAC;IACzD,CAAC"}
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
/** Extracts the primary documentation text from a GitHub-Pages-style DOM. */
export class GitHubPagesExtractor {
    /**
     * Pulls the main content region out of `document` as plain text.
     *
     * Site chrome (nav/header/footer) is removed from `document` in place;
     * the content region itself is deep-cloned before script/style removal
     * so the page's own copy is left intact. When no recognizable content
     * container exists, an empty overview result is returned.
     *
     * @param document - DOM Document (or compatible object) to extract from.
     * @returns {Promise<{content: string, metadata: object}>} Extracted text
     *   plus overview metadata derived from the first <h1> and the paragraph
     *   immediately following it.
     */
    async extractContent(document) {
        // Strip chrome before locating the content region (mutates the input).
        for (const chrome of document.querySelectorAll('nav, header, footer')) {
            chrome.remove();
        }
        const region = document.querySelector('main, article, .markdown-body');
        if (!region) {
            return {
                content: '',
                metadata: { type: 'overview' },
            };
        }
        // Work on a deep copy so removing scripts/styles leaves the page alone.
        const body = region.cloneNode(true);
        for (const junk of body.querySelectorAll('script, style')) {
            junk.remove();
        }
        // Title comes from the first <h1>; the description is the paragraph
        // directly after it, when present.
        const heading = body.querySelector('h1')?.textContent?.trim();
        const lead = body.querySelector('h1 + p')?.textContent?.trim();
        const pattern = {
            name: heading || '',
            type: 'component',
            description: lead || '',
            usageContexts: [],
            relatedPatterns: [],
        };
        return {
            content: body.textContent?.trim() || '',
            metadata: {
                type: 'overview',
                pattern,
            },
        };
    }
}
|
|
33
|
+
//# sourceMappingURL=github-pages-extractor.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"github-pages-extractor.js","sourceRoot":"","sources":["../../src/crawler/github-pages-extractor.ts"],"names":[],"mappings":"AAEA,MAAM,OAAO,oBAAoB;IAC/B,KAAK,CAAC,cAAc,CAAC,QAAkB;QACrC,+BAA+B;QAC/B,QAAQ,CAAC,gBAAgB,CAAC,qBAAqB,CAAC,CAAC,OAAO,CAAC,CAAC,EAAE,EAAE,EAAE,CAAC,EAAE,CAAC,MAAM,EAAE,CAAC,CAAC;QAE9E,mBAAmB;QACnB,MAAM,IAAI,GAAG,QAAQ,CAAC,aAAa,CAAC,+BAA+B,CAAC,CAAC;QACrE,IAAI,CAAC,IAAI,EAAE,CAAC;YACV,OAAO;gBACL,OAAO,EAAE,EAAE;gBACX,QAAQ,EAAE,EAAE,IAAI,EAAE,UAAU,EAAE;aAC/B,CAAC;QACJ,CAAC;QAED,MAAM,KAAK,GAAG,IAAI,CAAC,SAAS,CAAC,IAAI,CAAY,CAAC;QAC9C,KAAK,CAAC,gBAAgB,CAAC,eAAe,CAAC,CAAC,OAAO,CAAC,CAAC,EAAE,EAAE,EAAE,CAAC,EAAE,CAAC,MAAM,EAAE,CAAC,CAAC;QAErE,gCAAgC;QAChC,MAAM,KAAK,GAAG,KAAK,CAAC,aAAa,CAAC,IAAI,CAAC,EAAE,WAAW,EAAE,IAAI,EAAE,CAAC;QAC7D,MAAM,cAAc,GAAG,KAAK,CAAC,aAAa,CAAC,QAAQ,CAAC,EAAE,WAAW,EAAE,IAAI,EAAE,CAAC;QAE1E,OAAO;YACL,OAAO,EAAE,KAAK,CAAC,WAAW,EAAE,IAAI,EAAE,IAAI,EAAE;YACxC,QAAQ,EAAE;gBACR,IAAI,EAAE,UAAU;gBAChB,OAAO,EAAE;oBACP,IAAI,EAAE,KAAK,IAAI,EAAE;oBACjB,IAAI,EAAE,WAAW;oBACjB,WAAW,EAAE,cAAc,IAAI,EAAE;oBACjC,aAAa,EAAE,EAAE;oBACjB,eAAe,EAAE,EAAE;iBACpB;aACF;SACF,CAAC;IACJ,CAAC;CACF"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export {};
|