@cosmocoder/mcp-web-docs 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +22 -0
- package/README.md +368 -0
- package/build/__mocks__/embeddings.d.ts +17 -0
- package/build/__mocks__/embeddings.js +66 -0
- package/build/__mocks__/embeddings.js.map +1 -0
- package/build/config.d.ts +44 -0
- package/build/config.js +158 -0
- package/build/config.js.map +1 -0
- package/build/config.test.d.ts +1 -0
- package/build/config.test.js +165 -0
- package/build/config.test.js.map +1 -0
- package/build/crawler/auth.d.ts +128 -0
- package/build/crawler/auth.js +546 -0
- package/build/crawler/auth.js.map +1 -0
- package/build/crawler/auth.test.d.ts +1 -0
- package/build/crawler/auth.test.js +174 -0
- package/build/crawler/auth.test.js.map +1 -0
- package/build/crawler/base.d.ts +24 -0
- package/build/crawler/base.js +149 -0
- package/build/crawler/base.js.map +1 -0
- package/build/crawler/base.test.d.ts +1 -0
- package/build/crawler/base.test.js +234 -0
- package/build/crawler/base.test.js.map +1 -0
- package/build/crawler/browser-config.d.ts +2 -0
- package/build/crawler/browser-config.js +29 -0
- package/build/crawler/browser-config.js.map +1 -0
- package/build/crawler/browser-config.test.d.ts +1 -0
- package/build/crawler/browser-config.test.js +56 -0
- package/build/crawler/browser-config.test.js.map +1 -0
- package/build/crawler/cheerio.d.ts +11 -0
- package/build/crawler/cheerio.js +134 -0
- package/build/crawler/cheerio.js.map +1 -0
- package/build/crawler/chromium.d.ts +21 -0
- package/build/crawler/chromium.js +596 -0
- package/build/crawler/chromium.js.map +1 -0
- package/build/crawler/content-extractor-types.d.ts +25 -0
- package/build/crawler/content-extractor-types.js +2 -0
- package/build/crawler/content-extractor-types.js.map +1 -0
- package/build/crawler/content-extractors.d.ts +9 -0
- package/build/crawler/content-extractors.js +9 -0
- package/build/crawler/content-extractors.js.map +1 -0
- package/build/crawler/content-utils.d.ts +2 -0
- package/build/crawler/content-utils.js +22 -0
- package/build/crawler/content-utils.js.map +1 -0
- package/build/crawler/content-utils.test.d.ts +1 -0
- package/build/crawler/content-utils.test.js +99 -0
- package/build/crawler/content-utils.test.js.map +1 -0
- package/build/crawler/crawlee-crawler.d.ts +63 -0
- package/build/crawler/crawlee-crawler.js +342 -0
- package/build/crawler/crawlee-crawler.js.map +1 -0
- package/build/crawler/crawlee-crawler.test.d.ts +1 -0
- package/build/crawler/crawlee-crawler.test.js +280 -0
- package/build/crawler/crawlee-crawler.test.js.map +1 -0
- package/build/crawler/default-extractor.d.ts +4 -0
- package/build/crawler/default-extractor.js +26 -0
- package/build/crawler/default-extractor.js.map +1 -0
- package/build/crawler/default-extractor.test.d.ts +1 -0
- package/build/crawler/default-extractor.test.js +200 -0
- package/build/crawler/default-extractor.test.js.map +1 -0
- package/build/crawler/default.d.ts +11 -0
- package/build/crawler/default.js +138 -0
- package/build/crawler/default.js.map +1 -0
- package/build/crawler/docs-crawler.d.ts +26 -0
- package/build/crawler/docs-crawler.js +97 -0
- package/build/crawler/docs-crawler.js.map +1 -0
- package/build/crawler/docs-crawler.test.d.ts +1 -0
- package/build/crawler/docs-crawler.test.js +185 -0
- package/build/crawler/docs-crawler.test.js.map +1 -0
- package/build/crawler/factory.d.ts +6 -0
- package/build/crawler/factory.js +83 -0
- package/build/crawler/factory.js.map +1 -0
- package/build/crawler/github-pages-extractor.d.ts +4 -0
- package/build/crawler/github-pages-extractor.js +33 -0
- package/build/crawler/github-pages-extractor.js.map +1 -0
- package/build/crawler/github-pages-extractor.test.d.ts +1 -0
- package/build/crawler/github-pages-extractor.test.js +184 -0
- package/build/crawler/github-pages-extractor.test.js.map +1 -0
- package/build/crawler/github.d.ts +20 -0
- package/build/crawler/github.js +181 -0
- package/build/crawler/github.js.map +1 -0
- package/build/crawler/github.test.d.ts +1 -0
- package/build/crawler/github.test.js +326 -0
- package/build/crawler/github.test.js.map +1 -0
- package/build/crawler/puppeteer.d.ts +16 -0
- package/build/crawler/puppeteer.js +191 -0
- package/build/crawler/puppeteer.js.map +1 -0
- package/build/crawler/queue-manager.d.ts +43 -0
- package/build/crawler/queue-manager.js +169 -0
- package/build/crawler/queue-manager.js.map +1 -0
- package/build/crawler/queue-manager.test.d.ts +1 -0
- package/build/crawler/queue-manager.test.js +509 -0
- package/build/crawler/queue-manager.test.js.map +1 -0
- package/build/crawler/site-rules.d.ts +11 -0
- package/build/crawler/site-rules.js +104 -0
- package/build/crawler/site-rules.js.map +1 -0
- package/build/crawler/site-rules.test.d.ts +1 -0
- package/build/crawler/site-rules.test.js +139 -0
- package/build/crawler/site-rules.test.js.map +1 -0
- package/build/crawler/storybook-extractor.d.ts +34 -0
- package/build/crawler/storybook-extractor.js +767 -0
- package/build/crawler/storybook-extractor.js.map +1 -0
- package/build/crawler/storybook-extractor.test.d.ts +1 -0
- package/build/crawler/storybook-extractor.test.js +491 -0
- package/build/crawler/storybook-extractor.test.js.map +1 -0
- package/build/embeddings/fastembed.d.ts +25 -0
- package/build/embeddings/fastembed.js +188 -0
- package/build/embeddings/fastembed.js.map +1 -0
- package/build/embeddings/fastembed.test.d.ts +1 -0
- package/build/embeddings/fastembed.test.js +307 -0
- package/build/embeddings/fastembed.test.js.map +1 -0
- package/build/embeddings/openai.d.ts +8 -0
- package/build/embeddings/openai.js +56 -0
- package/build/embeddings/openai.js.map +1 -0
- package/build/embeddings/types.d.ts +4 -0
- package/build/embeddings/types.js +2 -0
- package/build/embeddings/types.js.map +1 -0
- package/build/index.d.ts +2 -0
- package/build/index.js +1007 -0
- package/build/index.js.map +1 -0
- package/build/index.test.d.ts +1 -0
- package/build/index.test.js +364 -0
- package/build/index.test.js.map +1 -0
- package/build/indexing/queue-manager.d.ts +36 -0
- package/build/indexing/queue-manager.js +86 -0
- package/build/indexing/queue-manager.js.map +1 -0
- package/build/indexing/queue-manager.test.d.ts +1 -0
- package/build/indexing/queue-manager.test.js +257 -0
- package/build/indexing/queue-manager.test.js.map +1 -0
- package/build/indexing/status.d.ts +39 -0
- package/build/indexing/status.js +207 -0
- package/build/indexing/status.js.map +1 -0
- package/build/indexing/status.test.d.ts +1 -0
- package/build/indexing/status.test.js +246 -0
- package/build/indexing/status.test.js.map +1 -0
- package/build/processor/content.d.ts +16 -0
- package/build/processor/content.js +286 -0
- package/build/processor/content.js.map +1 -0
- package/build/processor/content.test.d.ts +1 -0
- package/build/processor/content.test.js +369 -0
- package/build/processor/content.test.js.map +1 -0
- package/build/processor/markdown.d.ts +11 -0
- package/build/processor/markdown.js +256 -0
- package/build/processor/markdown.js.map +1 -0
- package/build/processor/markdown.test.d.ts +1 -0
- package/build/processor/markdown.test.js +312 -0
- package/build/processor/markdown.test.js.map +1 -0
- package/build/processor/metadata-parser.d.ts +37 -0
- package/build/processor/metadata-parser.js +245 -0
- package/build/processor/metadata-parser.js.map +1 -0
- package/build/processor/metadata-parser.test.d.ts +1 -0
- package/build/processor/metadata-parser.test.js +357 -0
- package/build/processor/metadata-parser.test.js.map +1 -0
- package/build/processor/processor.d.ts +8 -0
- package/build/processor/processor.js +190 -0
- package/build/processor/processor.js.map +1 -0
- package/build/processor/processor.test.d.ts +1 -0
- package/build/processor/processor.test.js +357 -0
- package/build/processor/processor.test.js.map +1 -0
- package/build/rag/cache.d.ts +10 -0
- package/build/rag/cache.js +10 -0
- package/build/rag/cache.js.map +1 -0
- package/build/rag/code-generator.d.ts +11 -0
- package/build/rag/code-generator.js +30 -0
- package/build/rag/code-generator.js.map +1 -0
- package/build/rag/context-assembler.d.ts +23 -0
- package/build/rag/context-assembler.js +113 -0
- package/build/rag/context-assembler.js.map +1 -0
- package/build/rag/docs-search.d.ts +55 -0
- package/build/rag/docs-search.js +380 -0
- package/build/rag/docs-search.js.map +1 -0
- package/build/rag/pipeline.d.ts +26 -0
- package/build/rag/pipeline.js +91 -0
- package/build/rag/pipeline.js.map +1 -0
- package/build/rag/query-processor.d.ts +14 -0
- package/build/rag/query-processor.js +57 -0
- package/build/rag/query-processor.js.map +1 -0
- package/build/rag/reranker.d.ts +55 -0
- package/build/rag/reranker.js +210 -0
- package/build/rag/reranker.js.map +1 -0
- package/build/rag/response-generator.d.ts +20 -0
- package/build/rag/response-generator.js +101 -0
- package/build/rag/response-generator.js.map +1 -0
- package/build/rag/retriever.d.ts +19 -0
- package/build/rag/retriever.js +111 -0
- package/build/rag/retriever.js.map +1 -0
- package/build/rag/validator.d.ts +22 -0
- package/build/rag/validator.js +128 -0
- package/build/rag/validator.js.map +1 -0
- package/build/rag/version-manager.d.ts +23 -0
- package/build/rag/version-manager.js +98 -0
- package/build/rag/version-manager.js.map +1 -0
- package/build/setupTests.d.ts +4 -0
- package/build/setupTests.js +50 -0
- package/build/setupTests.js.map +1 -0
- package/build/storage/storage.d.ts +38 -0
- package/build/storage/storage.js +700 -0
- package/build/storage/storage.js.map +1 -0
- package/build/storage/storage.test.d.ts +1 -0
- package/build/storage/storage.test.js +338 -0
- package/build/storage/storage.test.js.map +1 -0
- package/build/types/rag.d.ts +27 -0
- package/build/types/rag.js +2 -0
- package/build/types/rag.js.map +1 -0
- package/build/types.d.ts +120 -0
- package/build/types.js +2 -0
- package/build/types.js.map +1 -0
- package/build/util/content-utils.d.ts +31 -0
- package/build/util/content-utils.js +120 -0
- package/build/util/content-utils.js.map +1 -0
- package/build/util/content.d.ts +1 -0
- package/build/util/content.js +16 -0
- package/build/util/content.js.map +1 -0
- package/build/util/docs.d.ts +1 -0
- package/build/util/docs.js +26 -0
- package/build/util/docs.js.map +1 -0
- package/build/util/docs.test.d.ts +1 -0
- package/build/util/docs.test.js +49 -0
- package/build/util/docs.test.js.map +1 -0
- package/build/util/favicon.d.ts +6 -0
- package/build/util/favicon.js +88 -0
- package/build/util/favicon.js.map +1 -0
- package/build/util/favicon.test.d.ts +1 -0
- package/build/util/favicon.test.js +140 -0
- package/build/util/favicon.test.js.map +1 -0
- package/build/util/logger.d.ts +17 -0
- package/build/util/logger.js +72 -0
- package/build/util/logger.js.map +1 -0
- package/build/util/logger.test.d.ts +1 -0
- package/build/util/logger.test.js +46 -0
- package/build/util/logger.test.js.map +1 -0
- package/build/util/security.d.ts +312 -0
- package/build/util/security.js +719 -0
- package/build/util/security.js.map +1 -0
- package/build/util/security.test.d.ts +1 -0
- package/build/util/security.test.js +524 -0
- package/build/util/security.test.js.map +1 -0
- package/build/util/site-detector.d.ts +22 -0
- package/build/util/site-detector.js +42 -0
- package/build/util/site-detector.js.map +1 -0
- package/package.json +112 -0
|
@@ -0,0 +1,326 @@
|
|
|
1
|
+
import { GitHubCrawler } from './github.js';
|
|
2
|
+
describe('GitHubCrawler', () => {
|
|
3
|
+
// Crawler under test; a fresh instance is assigned before every test case.
let crawler;
beforeEach(() => {
    // Clear any fetch responses queued by a previous test.
    fetchMock.resetMocks();
    crawler = new GitHubCrawler();
    // Stub rateLimit (noted in the original as living on BaseCrawler) so tests skip its delays.
    const rateLimitSpy = vi.spyOn(crawler, 'rateLimit');
    rateLimitSpy.mockResolvedValue(undefined);
});
|
|
11
|
+
describe('constructor', () => {
    // Smoke tests: each case only verifies that construction succeeds for one
    // combination of positional arguments.
    it('should initialize with default values', () => {
        expect(new GitHubCrawler()).toBeDefined();
    });
    it('should accept custom maxDepth and maxRequestsPerCrawl', () => {
        // Per the test title, the first two arguments are maxDepth and maxRequestsPerCrawl.
        const instance = new GitHubCrawler(10, 500);
        expect(instance).toBeDefined();
    });
    it('should accept GitHub token', () => {
        const instance = new GitHubCrawler(4, 1000, 'github_token_123');
        expect(instance).toBeDefined();
    });
    it('should accept progress callback', () => {
        // The token slot is left undefined so only the callback argument varies.
        const onProgress = vi.fn();
        const instance = new GitHubCrawler(4, 1000, undefined, onProgress);
        expect(instance).toBeDefined();
    });
});
|
|
30
|
+
describe('crawl', () => {
|
|
31
|
+
it('should reject invalid GitHub URLs', async () => {
    // A URL on a non-GitHub host must yield nothing.
    const nonGitHubUrl = 'https://example.com/owner/repo';
    const pages = [];
    for await (const page of crawler.crawl(nonGitHubUrl)) {
        pages.push(page);
    }
    expect(pages).toHaveLength(0);
});
|
|
39
|
+
it('should reject URLs without owner/repo', async () => {
    // The bare GitHub host carries no owner/repo path segments, so nothing is yielded.
    const bareHostUrl = 'https://github.com';
    const pages = [];
    for await (const page of crawler.crawl(bareHostUrl)) {
        pages.push(page);
    }
    expect(pages).toHaveLength(0);
});
|
|
46
|
+
it('should crawl documentation directory when found', async () => {
    const rootListing = [
        { path: 'docs', type: 'dir', name: 'docs', url: 'https://api.github.com/repos/owner/repo/contents/docs' },
        { path: 'src', type: 'dir', name: 'src', url: 'https://api.github.com/repos/owner/repo/contents/src' },
    ];
    const docsListing = [
        {
            path: 'docs/guide.md',
            type: 'file',
            name: 'guide.md',
            url: 'https://api.github.com/repos/owner/repo/contents/docs/guide.md',
        },
        { path: 'docs/api.md', type: 'file', name: 'api.md', url: 'https://api.github.com/repos/owner/repo/contents/docs/api.md' },
    ];
    // Queued responses are consumed in order: root listing, docs listing,
    // then the body of guide.md followed by the body of api.md.
    fetchMock.mockResponseOnce(JSON.stringify(rootListing));
    fetchMock.mockResponseOnce(JSON.stringify(docsListing));
    fetchMock.mockResponseOnce('# Guide\n\nThis is the guide content.');
    fetchMock.mockResponseOnce('# API Reference\n\nAPI documentation.');
    const pages = [];
    for await (const page of crawler.crawl('https://github.com/owner/repo')) {
        pages.push(page);
    }
    expect(pages).toHaveLength(2);
    const [guidePage, apiPage] = pages;
    expect(guidePage.path).toBe('docs/guide.md');
    expect(guidePage.content).toContain('Guide');
    expect(apiPage.path).toBe('docs/api.md');
});
|
|
75
|
+
it('should handle .git extension in repo URL', async () => {
    const readmeEntry = {
        path: 'README.md',
        type: 'file',
        name: 'README.md',
        url: 'https://api.github.com/repos/owner/repo/contents/README.md',
    };
    const rootListingJson = JSON.stringify([readmeEntry]);
    // The root listing is served twice: once for findDocumentationDirs and once
    // for processDirectory, which re-reads the root when no doc dirs were found.
    fetchMock.mockResponseOnce(rootListingJson);
    fetchMock.mockResponseOnce(rootListingJson);
    // Body of README.md.
    fetchMock.mockResponseOnce('# README\n\nProject readme.');
    const pages = [];
    for await (const page of crawler.crawl('https://github.com/owner/repo.git')) {
        pages.push(page);
    }
    expect(pages).toHaveLength(1);
    expect(pages[0].title).toBe('README');
});
|
|
92
|
+
it('should skip non-markdown files', async () => {
    // Only README.md is markdown; the .js and .css entries should be ignored.
    const rootListing = [
        { path: 'index.js', type: 'file', name: 'index.js', url: 'https://api.github.com/repos/owner/repo/contents/index.js' },
        { path: 'style.css', type: 'file', name: 'style.css', url: 'https://api.github.com/repos/owner/repo/contents/style.css' },
        { path: 'README.md', type: 'file', name: 'README.md', url: 'https://api.github.com/repos/owner/repo/contents/README.md' },
    ];
    const rootListingJson = JSON.stringify(rootListing);
    // Root listing for findDocumentationDirs, then again for processDirectory (no doc dirs found).
    fetchMock.mockResponseOnce(rootListingJson);
    fetchMock.mockResponseOnce(rootListingJson);
    // Body of README.md, the only file expected to be fetched.
    fetchMock.mockResponseOnce('# README');
    const pages = [];
    for await (const page of crawler.crawl('https://github.com/owner/repo')) {
        pages.push(page);
    }
    expect(pages).toHaveLength(1);
    expect(pages[0].path).toBe('README.md');
});
|
|
111
|
+
it('should handle various markdown extensions', async () => {
    // One entry per extension under test: .md, .mdx and .markdown.
    const rootListing = [
        { path: 'doc.md', type: 'file', name: 'doc.md', url: 'https://api.github.com/repos/owner/repo/contents/doc.md' },
        { path: 'page.mdx', type: 'file', name: 'page.mdx', url: 'https://api.github.com/repos/owner/repo/contents/page.mdx' },
        {
            path: 'guide.markdown',
            type: 'file',
            name: 'guide.markdown',
            url: 'https://api.github.com/repos/owner/repo/contents/guide.markdown',
        },
    ];
    const rootListingJson = JSON.stringify(rootListing);
    // Root listing for findDocumentationDirs, then again for processDirectory (no doc dirs found).
    fetchMock.mockResponseOnce(rootListingJson);
    fetchMock.mockResponseOnce(rootListingJson);
    // One body per file, queued in listing order.
    for (const body of ['# Doc', '# Page', '# Guide']) {
        fetchMock.mockResponseOnce(body);
    }
    const pages = [];
    for await (const page of crawler.crawl('https://github.com/owner/repo')) {
        pages.push(page);
    }
    expect(pages).toHaveLength(3);
});
|
|
136
|
+
it('should skip directories like node_modules, vendor, test, etc.', async () => {
    const rootListing = [
        {
            path: 'node_modules',
            type: 'dir',
            name: 'node_modules',
            url: 'https://api.github.com/repos/owner/repo/contents/node_modules',
        },
        { path: 'vendor', type: 'dir', name: 'vendor', url: 'https://api.github.com/repos/owner/repo/contents/vendor' },
        { path: 'test', type: 'dir', name: 'test', url: 'https://api.github.com/repos/owner/repo/contents/test' },
        { path: 'docs', type: 'dir', name: 'docs', url: 'https://api.github.com/repos/owner/repo/contents/docs' },
    ];
    const docsListing = [
        {
            path: 'docs/guide.md',
            type: 'file',
            name: 'guide.md',
            url: 'https://api.github.com/repos/owner/repo/contents/docs/guide.md',
        },
    ];
    // Only three responses are queued: root listing, docs listing, and the guide body.
    // No listing is queued for node_modules, vendor or test.
    fetchMock.mockResponseOnce(JSON.stringify(rootListing));
    fetchMock.mockResponseOnce(JSON.stringify(docsListing));
    fetchMock.mockResponseOnce('# Guide');
    const pages = [];
    for await (const page of crawler.crawl('https://github.com/owner/repo')) {
        pages.push(page);
    }
    // Only docs/guide.md should come back, nothing from the skipped directories.
    expect(pages).toHaveLength(1);
    expect(pages[0].path).toBe('docs/guide.md');
});
|
|
165
|
+
it('should handle GitHub API rate limit error', async () => {
    // The very first request is answered with HTTP 403 and an empty body.
    const rateLimited = { status: 403 };
    fetchMock.mockResponseOnce('', rateLimited);
    const pages = [];
    for await (const page of crawler.crawl('https://github.com/owner/repo')) {
        pages.push(page);
    }
    // The crawl is expected to end without yielding anything.
    expect(pages).toHaveLength(0);
});
|
|
174
|
+
it('should handle API errors gracefully', async () => {
    // The very first request is answered with HTTP 404 and an empty body.
    const notFound = { status: 404 };
    fetchMock.mockResponseOnce('', notFound);
    const pages = [];
    for await (const page of crawler.crawl('https://github.com/owner/repo')) {
        pages.push(page);
    }
    // The crawl must finish without throwing and without results.
    expect(pages).toHaveLength(0);
});
|
|
182
|
+
it('should use GitHub token in headers when provided', async () => {
    // A dedicated instance is needed because the shared crawler has no token.
    const tokenCrawler = new GitHubCrawler(4, 1000, 'test_token_123');
    // The new instance needs its own rateLimit stub to skip delays.
    vi.spyOn(tokenCrawler, 'rateLimit').mockResolvedValue(undefined);
    // An empty root listing is enough: only the outgoing request is inspected.
    fetchMock.mockResponseOnce(JSON.stringify([]));
    const pages = [];
    for await (const page of tokenCrawler.crawl('https://github.com/owner/repo')) {
        pages.push(page);
    }
    // At least one fetch call must carry the token in its Authorization header.
    const headersWithToken = expect.objectContaining({
        Authorization: 'token test_token_123',
    });
    const requestInit = expect.objectContaining({ headers: headersWithToken });
    expect(fetchMock).toHaveBeenCalledWith(expect.any(String), requestInit);
});
|
|
198
|
+
it('should extract title from file path', async () => {
    // Two files whose names use different word separators: hyphen and underscore.
    const rootFiles = [
        {
            path: 'getting-started.md',
            type: 'file',
            name: 'getting-started.md',
            url: 'https://api.github.com/repos/owner/repo/contents/getting-started.md',
        },
        {
            path: 'api_reference.md',
            type: 'file',
            name: 'api_reference.md',
            url: 'https://api.github.com/repos/owner/repo/contents/api_reference.md',
        },
    ];
    // First call: findDocumentationDirs checks root
    fetchMock.mockResponseOnce(JSON.stringify(rootFiles));
    // Second call: processDirectory fetches root again (no doc dirs found)
    fetchMock.mockResponseOnce(JSON.stringify(rootFiles));
    // File content fetches (one per file)
    fetchMock.mockResponseOnce('# Content');
    fetchMock.mockResponseOnce('# Content');
    const results = [];
    for await (const result of crawler.crawl('https://github.com/owner/repo')) {
        results.push(result);
    }
    // Guard the indexed reads below: without this, a short result list fails with
    // a TypeError on `undefined.title` instead of a readable assertion message.
    expect(results).toHaveLength(2);
    expect(results[0].title).toBe('Getting Started');
    expect(results[1].title).toBe('Api Reference');
});
|
|
227
|
+
it('should construct correct GitHub blob URLs', async () => {
    // First call: findDocumentationDirs checks root - find docs directory
    fetchMock.mockResponseOnce(JSON.stringify([{ path: 'docs', type: 'dir', name: 'docs', url: 'https://api.github.com/repos/owner/repo/contents/docs' }]));
    // Second call: processDirectory fetches docs directory contents
    fetchMock.mockResponseOnce(JSON.stringify([
        {
            path: 'docs/guide.md',
            type: 'file',
            name: 'guide.md',
            url: 'https://api.github.com/repos/owner/repo/contents/docs/guide.md',
        },
    ]));
    // Third call: fetch file content
    fetchMock.mockResponseOnce('# Guide');
    const results = [];
    for await (const result of crawler.crawl('https://github.com/owner/repo')) {
        results.push(result);
    }
    // Guard the indexed read below so an empty result list fails with a clear
    // assertion message rather than a TypeError on `undefined.url`.
    expect(results).toHaveLength(1);
    // The page URL is the browsable blob URL, not the API contents URL from the listing.
    expect(results[0].url).toBe('https://github.com/owner/repo/blob/main/docs/guide.md');
});
|
|
247
|
+
it('should handle fetch errors for file content', async () => {
    const rootFiles = [
        { path: 'guide.md', type: 'file', name: 'guide.md', url: 'https://api.github.com/repos/owner/repo/contents/guide.md' },
    ];
    // First call: findDocumentationDirs checks root
    fetchMock.mockResponseOnce(JSON.stringify(rootFiles));
    // Second call: processDirectory fetches root again (no doc dirs found).
    // This response was previously missing, so the rejection below was consumed by
    // the directory listing and the file-content failure path was never exercised.
    fetchMock.mockResponseOnce(JSON.stringify(rootFiles));
    // Third call: fetching guide.md content fails
    fetchMock.mockRejectOnce(new Error('Network error'));
    const results = [];
    for await (const result of crawler.crawl('https://github.com/owner/repo')) {
        results.push(result);
    }
    // Should skip files that fail to fetch
    expect(results).toHaveLength(0);
    // Confirm the crawl got as far as the file-content request (two listings + one file).
    expect(fetchMock.mock.calls.length).toBeGreaterThanOrEqual(3);
});
|
|
259
|
+
it('should validate GitHub API response structure', async () => {
    // The listing endpoint answers with a plain object instead of the array of
    // entries the other tests supply.
    const malformedListing = { invalid: 'structure' };
    fetchMock.mockResponseOnce(JSON.stringify(malformedListing));
    const pages = [];
    for await (const page of crawler.crawl('https://github.com/owner/repo')) {
        pages.push(page);
    }
    expect(pages).toHaveLength(0);
});
|
|
268
|
+
it('should find multiple documentation directories', async () => {
    const rootListing = [
        { path: 'docs', type: 'dir', name: 'docs', url: 'https://api.github.com/repos/owner/repo/contents/docs' },
        { path: 'guide', type: 'dir', name: 'guide', url: 'https://api.github.com/repos/owner/repo/contents/guide' },
    ];
    const docsListing = [
        { path: 'docs/api.md', type: 'file', name: 'api.md', url: 'https://api.github.com/repos/owner/repo/contents/docs/api.md' },
    ];
    const guideListing = [
        {
            path: 'guide/intro.md',
            type: 'file',
            name: 'intro.md',
            url: 'https://api.github.com/repos/owner/repo/contents/guide/intro.md',
        },
    ];
    // Queue order matters: root listing, then each directory's listing
    // immediately followed by the body of its single file.
    fetchMock.mockResponseOnce(JSON.stringify(rootListing));
    fetchMock.mockResponseOnce(JSON.stringify(docsListing));
    fetchMock.mockResponseOnce('# API');
    fetchMock.mockResponseOnce(JSON.stringify(guideListing));
    fetchMock.mockResponseOnce('# Intro');
    const pages = [];
    for await (const page of crawler.crawl('https://github.com/owner/repo')) {
        pages.push(page);
    }
    // One page per documentation directory.
    expect(pages).toHaveLength(2);
});
|
|
292
|
+
it('should stop crawling when aborted', async () => {
    const rootListing = [
        { path: 'doc1.md', type: 'file', name: 'doc1.md', url: 'https://api.github.com/repos/owner/repo/contents/doc1.md' },
        { path: 'doc2.md', type: 'file', name: 'doc2.md', url: 'https://api.github.com/repos/owner/repo/contents/doc2.md' },
    ];
    const rootListingJson = JSON.stringify(rootListing);
    // Root listing for findDocumentationDirs, then again for processDirectory (no doc dirs found).
    fetchMock.mockResponseOnce(rootListingJson);
    fetchMock.mockResponseOnce(rootListingJson);
    // Only doc1.md gets a body; no response is queued for doc2.md.
    fetchMock.mockResponseOnce('# Doc 1');
    const abortableCrawler = new GitHubCrawler();
    // The new instance needs its own rateLimit stub to skip delays.
    vi.spyOn(abortableCrawler, 'rateLimit').mockResolvedValue(undefined);
    const pageStream = abortableCrawler.crawl('https://github.com/owner/repo');
    const pages = [];
    // Pull exactly one page by hand so the abort can be injected between yields.
    const { done, value } = await pageStream.next();
    if (!done) {
        pages.push(value);
    }
    // Raise the abort flag before asking for the second page.
    abortableCrawler.isAborting = true;
    // Drain whatever the generator still yields after the abort.
    for await (const page of pageStream) {
        pages.push(page);
    }
    // Only the page obtained before the abort should be present.
    expect(pages).toHaveLength(1);
});
|
|
324
|
+
});
|
|
325
|
+
});
|
|
326
|
+
//# sourceMappingURL=github.test.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"github.test.js","sourceRoot":"","sources":["../../src/crawler/github.test.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,aAAa,EAAE,MAAM,aAAa,CAAC;AAG5C,QAAQ,CAAC,eAAe,EAAE,GAAG,EAAE;IAC7B,IAAI,OAAsB,CAAC;IAE3B,UAAU,CAAC,GAAG,EAAE;QACd,SAAS,CAAC,UAAU,EAAE,CAAC;QACvB,OAAO,GAAG,IAAI,aAAa,EAAE,CAAC;QAC9B,+DAA+D;QAC/D,8DAA8D;QAC9D,EAAE,CAAC,KAAK,CAAC,OAAc,EAAE,WAAW,CAAC,CAAC,iBAAiB,CAAC,SAAS,CAAC,CAAC;IACrE,CAAC,CAAC,CAAC;IAEH,QAAQ,CAAC,aAAa,EAAE,GAAG,EAAE;QAC3B,EAAE,CAAC,uCAAuC,EAAE,GAAG,EAAE;YAC/C,MAAM,cAAc,GAAG,IAAI,aAAa,EAAE,CAAC;YAC3C,MAAM,CAAC,cAAc,CAAC,CAAC,WAAW,EAAE,CAAC;QACvC,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,uDAAuD,EAAE,GAAG,EAAE;YAC/D,MAAM,aAAa,GAAG,IAAI,aAAa,CAAC,EAAE,EAAE,GAAG,CAAC,CAAC;YACjD,MAAM,CAAC,aAAa,CAAC,CAAC,WAAW,EAAE,CAAC;QACtC,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,4BAA4B,EAAE,GAAG,EAAE;YACpC,MAAM,YAAY,GAAG,IAAI,aAAa,CAAC,CAAC,EAAE,IAAI,EAAE,kBAAkB,CAAC,CAAC;YACpE,MAAM,CAAC,YAAY,CAAC,CAAC,WAAW,EAAE,CAAC;QACrC,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,iCAAiC,EAAE,GAAG,EAAE;YACzC,MAAM,UAAU,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC;YAC3B,MAAM,eAAe,GAAG,IAAI,aAAa,CAAC,CAAC,EAAE,IAAI,EAAE,SAAS,EAAE,UAAU,CAAC,CAAC;YAC1E,MAAM,CAAC,eAAe,CAAC,CAAC,WAAW,EAAE,CAAC;QACxC,CAAC,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;IAEH,QAAQ,CAAC,OAAO,EAAE,GAAG,EAAE;QACrB,EAAE,CAAC,mCAAmC,EAAE,KAAK,IAAI,EAAE;YACjD,MAAM,OAAO,GAAkB,EAAE,CAAC;YAElC,iBAAiB;YACjB,IAAI,KAAK,EAAE,MAAM,MAAM,IAAI,OAAO,CAAC,KAAK,CAAC,gCAAgC,CAAC,EAAE,CAAC;gBAC3E,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;YACvB,CAAC;YAED,MAAM,CAAC,OAAO,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC;QAClC,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,uCAAuC,EAAE,KAAK,IAAI,EAAE;YACrD,MAAM,OAAO,GAAkB,EAAE,CAAC;YAElC,IAAI,KAAK,EAAE,MAAM,MAAM,IAAI,OAAO,CAAC,KAAK,CAAC,oBAAoB,CAAC,EAAE,CAAC;gBAC/D,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;YACvB,CAAC;YAED,MAAM,CAAC,OAAO,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC;QAClC,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,iDAAiD,EAAE,KAAK,IAAI,EAAE;YAC/D,kCAAkC;YAClC,SAAS,CAAC,gBAAgB,CACxB,IAAI,CAAC,SAAS,CAAC;gBACb,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,KAAK,EAAE,IAAI,EAAE,MAAM,EAAE,GA
AG,EAAE,uDAAuD,EAAE;gBACzG,EAAE,IAAI,EAAE,KAAK,EAAE,IAAI,EAAE,KAAK,EAAE,IAAI,EAAE,KAAK,EAAE,GAAG,EAAE,sDAAsD,EAAE;aACvG,CAAC,CACH,CAAC;YACF,mCAAmC;YACnC,SAAS,CAAC,gBAAgB,CACxB,IAAI,CAAC,SAAS,CAAC;gBACb;oBACE,IAAI,EAAE,eAAe;oBACrB,IAAI,EAAE,MAAM;oBACZ,IAAI,EAAE,UAAU;oBAChB,GAAG,EAAE,gEAAgE;iBACtE;gBACD,EAAE,IAAI,EAAE,aAAa,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,QAAQ,EAAE,GAAG,EAAE,8DAA8D,EAAE;aAC3H,CAAC,CACH,CAAC;YACF,qCAAqC;YACrC,SAAS,CAAC,gBAAgB,CAAC,uCAAuC,CAAC,CAAC;YACpE,oCAAoC;YACpC,SAAS,CAAC,gBAAgB,CAAC,uCAAuC,CAAC,CAAC;YAEpE,MAAM,OAAO,GAAkB,EAAE,CAAC;YAClC,IAAI,KAAK,EAAE,MAAM,MAAM,IAAI,OAAO,CAAC,KAAK,CAAC,+BAA+B,CAAC,EAAE,CAAC;gBAC1E,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;YACvB,CAAC;YAED,MAAM,CAAC,OAAO,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC;YAChC,MAAM,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,eAAe,CAAC,CAAC;YAC9C,MAAM,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,SAAS,CAAC,OAAO,CAAC,CAAC;YAC9C,MAAM,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,aAAa,CAAC,CAAC;QAC9C,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,0CAA0C,EAAE,KAAK,IAAI,EAAE;YACxD,MAAM,SAAS,GAAG;gBAChB,EAAE,IAAI,EAAE,WAAW,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,WAAW,EAAE,GAAG,EAAE,4DAA4D,EAAE;aAC1H,CAAC;YACF,gDAAgD;YAChD,SAAS,CAAC,gBAAgB,CAAC,IAAI,CAAC,SAAS,CAAC,SAAS,CAAC,CAAC,CAAC;YACtD,uEAAuE;YACvE,SAAS,CAAC,gBAAgB,CAAC,IAAI,CAAC,SAAS,CAAC,SAAS,CAAC,CAAC,CAAC;YACtD,sCAAsC;YACtC,SAAS,CAAC,gBAAgB,CAAC,6BAA6B,CAAC,CAAC;YAE1D,MAAM,OAAO,GAAkB,EAAE,CAAC;YAClC,IAAI,KAAK,EAAE,MAAM,MAAM,IAAI,OAAO,CAAC,KAAK,CAAC,mCAAmC,CAAC,EAAE,CAAC;gBAC9E,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;YACvB,CAAC;YAED,MAAM,CAAC,OAAO,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC;YAChC,MAAM,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC;QAC1C,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,gCAAgC,EAAE,KAAK,IAAI,EAAE;YAC9C,MAAM,SAAS,GAAG;gBAChB,EAAE,IAAI,EAAE,UAAU,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,UAAU,EAAE,GAAG,EAAE,2DAA2D,EAAE;gBACtH,EAAE,IAAI,EAAE,WAAW,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,WAAW,EAAE,GAAG,EAAE,4DAA4D,EAAE;gBACzH,EAAE,IAAI,EAAE,WAAW,EAAE,IA
AI,EAAE,MAAM,EAAE,IAAI,EAAE,WAAW,EAAE,GAAG,EAAE,4DAA4D,EAAE;aAC1H,CAAC;YACF,gDAAgD;YAChD,SAAS,CAAC,gBAAgB,CAAC,IAAI,CAAC,SAAS,CAAC,SAAS,CAAC,CAAC,CAAC;YACtD,uEAAuE;YACvE,SAAS,CAAC,gBAAgB,CAAC,IAAI,CAAC,SAAS,CAAC,SAAS,CAAC,CAAC,CAAC;YACtD,sCAAsC;YACtC,SAAS,CAAC,gBAAgB,CAAC,UAAU,CAAC,CAAC;YAEvC,MAAM,OAAO,GAAkB,EAAE,CAAC;YAClC,IAAI,KAAK,EAAE,MAAM,MAAM,IAAI,OAAO,CAAC,KAAK,CAAC,+BAA+B,CAAC,EAAE,CAAC;gBAC1E,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;YACvB,CAAC;YAED,MAAM,CAAC,OAAO,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC;YAChC,MAAM,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC;QAC5C,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,2CAA2C,EAAE,KAAK,IAAI,EAAE;YACzD,MAAM,SAAS,GAAG;gBAChB,EAAE,IAAI,EAAE,QAAQ,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,QAAQ,EAAE,GAAG,EAAE,yDAAyD,EAAE;gBAChH,EAAE,IAAI,EAAE,UAAU,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,UAAU,EAAE,GAAG,EAAE,2DAA2D,EAAE;gBACtH;oBACE,IAAI,EAAE,gBAAgB;oBACtB,IAAI,EAAE,MAAM;oBACZ,IAAI,EAAE,gBAAgB;oBACtB,GAAG,EAAE,iEAAiE;iBACvE;aACF,CAAC;YACF,gDAAgD;YAChD,SAAS,CAAC,gBAAgB,CAAC,IAAI,CAAC,SAAS,CAAC,SAAS,CAAC,CAAC,CAAC;YACtD,uEAAuE;YACvE,SAAS,CAAC,gBAAgB,CAAC,IAAI,CAAC,SAAS,CAAC,SAAS,CAAC,CAAC,CAAC;YACtD,uBAAuB;YACvB,SAAS,CAAC,gBAAgB,CAAC,OAAO,CAAC,CAAC;YACpC,SAAS,CAAC,gBAAgB,CAAC,QAAQ,CAAC,CAAC;YACrC,SAAS,CAAC,gBAAgB,CAAC,SAAS,CAAC,CAAC;YAEtC,MAAM,OAAO,GAAkB,EAAE,CAAC;YAClC,IAAI,KAAK,EAAE,MAAM,MAAM,IAAI,OAAO,CAAC,KAAK,CAAC,+BAA+B,CAAC,EAAE,CAAC;gBAC1E,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;YACvB,CAAC;YAED,MAAM,CAAC,OAAO,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC;QAClC,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,+DAA+D,EAAE,KAAK,IAAI,EAAE;YAC7E,SAAS,CAAC,gBAAgB,CACxB,IAAI,CAAC,SAAS,CAAC;gBACb;oBACE,IAAI,EAAE,cAAc;oBACpB,IAAI,EAAE,KAAK;oBACX,IAAI,EAAE,cAAc;oBACpB,GAAG,EAAE,+DAA+D;iBACrE;gBACD,EAAE,IAAI,EAAE,QAAQ,EAAE,IAAI,EAAE,KAAK,EAAE,IAAI,EAAE,QAAQ,EAAE,GAAG,EAAE,yDAAyD,EAAE;gBAC/G,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,KAAK,EAAE,IAAI,EAAE,MAAM,EAAE,GAAG,EAAE,uDAAuD,EAAE;gBACzG,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,KAAK,EAAE,IAAI,EAAE,MAAM,EAAE,GAAG,EAAE,uDAAuD,EAAE;aAC1G,CAAC
,CACH,CAAC;YACF,SAAS,CAAC,gBAAgB,CACxB,IAAI,CAAC,SAAS,CAAC;gBACb;oBACE,IAAI,EAAE,eAAe;oBACrB,IAAI,EAAE,MAAM;oBACZ,IAAI,EAAE,UAAU;oBAChB,GAAG,EAAE,gEAAgE;iBACtE;aACF,CAAC,CACH,CAAC;YACF,SAAS,CAAC,gBAAgB,CAAC,SAAS,CAAC,CAAC;YAEtC,MAAM,OAAO,GAAkB,EAAE,CAAC;YAClC,IAAI,KAAK,EAAE,MAAM,MAAM,IAAI,OAAO,CAAC,KAAK,CAAC,+BAA+B,CAAC,EAAE,CAAC;gBAC1E,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;YACvB,CAAC;YAED,wEAAwE;YACxE,MAAM,CAAC,OAAO,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC;YAChC,MAAM,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,eAAe,CAAC,CAAC;QAChD,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,2CAA2C,EAAE,KAAK,IAAI,EAAE;YACzD,SAAS,CAAC,gBAAgB,CAAC,EAAE,EAAE,EAAE,MAAM,EAAE,GAAG,EAAE,CAAC,CAAC;YAEhD,MAAM,OAAO,GAAkB,EAAE,CAAC;YAClC,IAAI,KAAK,EAAE,MAAM,MAAM,IAAI,OAAO,CAAC,KAAK,CAAC,+BAA+B,CAAC,EAAE,CAAC;gBAC1E,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;YACvB,CAAC;YAED,wCAAwC;YACxC,MAAM,CAAC,OAAO,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC;QAClC,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,qCAAqC,EAAE,KAAK,IAAI,EAAE;YACnD,SAAS,CAAC,gBAAgB,CAAC,EAAE,EAAE,EAAE,MAAM,EAAE,GAAG,EAAE,CAAC,CAAC;YAEhD,MAAM,OAAO,GAAkB,EAAE,CAAC;YAClC,IAAI,KAAK,EAAE,MAAM,MAAM,IAAI,OAAO,CAAC,KAAK,CAAC,+BAA+B,CAAC,EAAE,CAAC;gBAC1E,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;YACvB,CAAC;YAED,MAAM,CAAC,OAAO,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC;QAClC,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,kDAAkD,EAAE,KAAK,IAAI,EAAE;YAChE,MAAM,YAAY,GAAG,IAAI,aAAa,CAAC,CAAC,EAAE,IAAI,EAAE,gBAAgB,CAAC,CAAC;YAClE,0CAA0C;YAC1C,8DAA8D;YAC9D,EAAE,CAAC,KAAK,CAAC,YAAmB,EAAE,WAAW,CAAC,CAAC,iBAAiB,CAAC,SAAS,CAAC,CAAC;YAExE,SAAS,CAAC,gBAAgB,CAAC,IAAI,CAAC,SAAS,CAAC,EAAE,CAAC,CAAC,CAAC;YAE/C,MAAM,OAAO,GAAkB,EAAE,CAAC;YAClC,IAAI,KAAK,EAAE,MAAM,MAAM,IAAI,YAAY,CAAC,KAAK,CAAC,+BAA+B,CAAC,EAAE,CAAC;gBAC/E,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;YACvB,CAAC;YAED,MAAM,CAAC,SAAS,CAAC,CAAC,oBAAoB,CACpC,MAAM,CAAC,GAAG,CAAC,MAAM,CAAC,EAClB,MAAM,CAAC,gBAAgB,CAAC;gBACtB,OAAO,EAAE,MAAM,CAAC,gBAAgB,CAAC;oBAC/B,aAAa,EAAE,sBAAsB;iBACtC,CAAC;aACH,CAAC,CACH,CAAC;QACJ,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,qCAAqC,EAAE,KAAK,IAAI,EAAE;YACnD,MAAM,SAAS
,GAAG;gBAChB;oBACE,IAAI,EAAE,oBAAoB;oBAC1B,IAAI,EAAE,MAAM;oBACZ,IAAI,EAAE,oBAAoB;oBAC1B,GAAG,EAAE,qEAAqE;iBAC3E;gBACD;oBACE,IAAI,EAAE,kBAAkB;oBACxB,IAAI,EAAE,MAAM;oBACZ,IAAI,EAAE,kBAAkB;oBACxB,GAAG,EAAE,mEAAmE;iBACzE;aACF,CAAC;YACF,gDAAgD;YAChD,SAAS,CAAC,gBAAgB,CAAC,IAAI,CAAC,SAAS,CAAC,SAAS,CAAC,CAAC,CAAC;YACtD,uEAAuE;YACvE,SAAS,CAAC,gBAAgB,CAAC,IAAI,CAAC,SAAS,CAAC,SAAS,CAAC,CAAC,CAAC;YACtD,uBAAuB;YACvB,SAAS,CAAC,gBAAgB,CAAC,WAAW,CAAC,CAAC;YACxC,SAAS,CAAC,gBAAgB,CAAC,WAAW,CAAC,CAAC;YAExC,MAAM,OAAO,GAAkB,EAAE,CAAC;YAClC,IAAI,KAAK,EAAE,MAAM,MAAM,IAAI,OAAO,CAAC,KAAK,CAAC,+BAA+B,CAAC,EAAE,CAAC;gBAC1E,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;YACvB,CAAC;YAED,MAAM,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,iBAAiB,CAAC,CAAC;YACjD,MAAM,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,eAAe,CAAC,CAAC;QACjD,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,2CAA2C,EAAE,KAAK,IAAI,EAAE;YACzD,sEAAsE;YACtE,SAAS,CAAC,gBAAgB,CACxB,IAAI,CAAC,SAAS,CAAC,CAAC,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,KAAK,EAAE,IAAI,EAAE,MAAM,EAAE,GAAG,EAAE,uDAAuD,EAAE,CAAC,CAAC,CAC5H,CAAC;YACF,gEAAgE;YAChE,SAAS,CAAC,gBAAgB,CACxB,IAAI,CAAC,SAAS,CAAC;gBACb;oBACE,IAAI,EAAE,eAAe;oBACrB,IAAI,EAAE,MAAM;oBACZ,IAAI,EAAE,UAAU;oBAChB,GAAG,EAAE,gEAAgE;iBACtE;aACF,CAAC,CACH,CAAC;YACF,iCAAiC;YACjC,SAAS,CAAC,gBAAgB,CAAC,SAAS,CAAC,CAAC;YAEtC,MAAM,OAAO,GAAkB,EAAE,CAAC;YAClC,IAAI,KAAK,EAAE,MAAM,MAAM,IAAI,OAAO,CAAC,KAAK,CAAC,+BAA+B,CAAC,EAAE,CAAC;gBAC1E,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;YACvB,CAAC;YAED,MAAM,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,IAAI,CAAC,uDAAuD,CAAC,CAAC;QACvF,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,6CAA6C,EAAE,KAAK,IAAI,EAAE;YAC3D,SAAS,CAAC,gBAAgB,CACxB,IAAI,CAAC,SAAS,CAAC;gBACb,EAAE,IAAI,EAAE,UAAU,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,UAAU,EAAE,GAAG,EAAE,2DAA2D,EAAE;aACvH,CAAC,CACH,CAAC;YACF,SAAS,CAAC,cAAc,CAAC,IAAI,KAAK,CAAC,eAAe,CAAC,CAAC,CAAC;YAErD,MAAM,OAAO,GAAkB,EAAE,CAAC;YAClC,IAAI,KAAK,EAAE,MAAM,MAAM,IAAI,OAAO,CAAC,KAAK,CAAC,+BAA+B,CAAC,EAAE,CAAC;gBAC1E,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;YACvB,CAAC;YAED,uCAAuC;YACvC,
MAAM,CAAC,OAAO,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC;QAClC,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,+CAA+C,EAAE,KAAK,IAAI,EAAE;YAC7D,kCAAkC;YAClC,SAAS,CAAC,gBAAgB,CAAC,IAAI,CAAC,SAAS,CAAC,EAAE,OAAO,EAAE,WAAW,EAAE,CAAC,CAAC,CAAC;YAErE,MAAM,OAAO,GAAkB,EAAE,CAAC;YAClC,IAAI,KAAK,EAAE,MAAM,MAAM,IAAI,OAAO,CAAC,KAAK,CAAC,+BAA+B,CAAC,EAAE,CAAC;gBAC1E,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;YACvB,CAAC;YAED,MAAM,CAAC,OAAO,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC;QAClC,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,gDAAgD,EAAE,KAAK,IAAI,EAAE;YAC9D,SAAS,CAAC,gBAAgB,CACxB,IAAI,CAAC,SAAS,CAAC;gBACb,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,KAAK,EAAE,IAAI,EAAE,MAAM,EAAE,GAAG,EAAE,uDAAuD,EAAE;gBACzG,EAAE,IAAI,EAAE,OAAO,EAAE,IAAI,EAAE,KAAK,EAAE,IAAI,EAAE,OAAO,EAAE,GAAG,EAAE,wDAAwD,EAAE;aAC7G,CAAC,CACH,CAAC;YACF,SAAS,CAAC,gBAAgB,CACxB,IAAI,CAAC,SAAS,CAAC;gBACb,EAAE,IAAI,EAAE,aAAa,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,QAAQ,EAAE,GAAG,EAAE,8DAA8D,EAAE;aAC3H,CAAC,CACH,CAAC;YACF,SAAS,CAAC,gBAAgB,CAAC,OAAO,CAAC,CAAC;YACpC,SAAS,CAAC,gBAAgB,CACxB,IAAI,CAAC,SAAS,CAAC;gBACb;oBACE,IAAI,EAAE,gBAAgB;oBACtB,IAAI,EAAE,MAAM;oBACZ,IAAI,EAAE,UAAU;oBAChB,GAAG,EAAE,iEAAiE;iBACvE;aACF,CAAC,CACH,CAAC;YACF,SAAS,CAAC,gBAAgB,CAAC,SAAS,CAAC,CAAC;YAEtC,MAAM,OAAO,GAAkB,EAAE,CAAC;YAClC,IAAI,KAAK,EAAE,MAAM,MAAM,IAAI,OAAO,CAAC,KAAK,CAAC,+BAA+B,CAAC,EAAE,CAAC;gBAC1E,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;YACvB,CAAC;YAED,MAAM,CAAC,OAAO,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC;QAClC,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,mCAAmC,EAAE,KAAK,IAAI,EAAE;YACjD,MAAM,SAAS,GAAG;gBAChB,EAAE,IAAI,EAAE,SAAS,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,SAAS,EAAE,GAAG,EAAE,0DAA0D,EAAE;gBACnH,EAAE,IAAI,EAAE,SAAS,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,SAAS,EAAE,GAAG,EAAE,0DAA0D,EAAE;aACpH,CAAC;YACF,gDAAgD;YAChD,SAAS,CAAC,gBAAgB,CAAC,IAAI,CAAC,SAAS,CAAC,SAAS,CAAC,CAAC,CAAC;YACtD,uEAAuE;YACvE,SAAS,CAAC,gBAAgB,CAAC,IAAI,CAAC,SAAS,CAAC,SAAS,CAAC,CAAC,CAAC;YACtD,oCAAoC;YACpC,SAAS,CAAC,gBAAgB,CAAC,SAAS,CAAC,CAAC;YAEtC,MAAM,OAAO,GAAkB,EAAE,CAAC;YAClC,MAAM,gBAAgB,GAAG,IAAI,aAAa,EAAE,CAAC;YAC7C,0CAA0C;YAC1C,8DAA8D;YAC9D,EAAE,
CAAC,KAAK,CAAC,gBAAuB,EAAE,WAAW,CAAC,CAAC,iBAAiB,CAAC,SAAS,CAAC,CAAC;YAE5E,oBAAoB;YACpB,MAAM,SAAS,GAAG,gBAAgB,CAAC,KAAK,CAAC,+BAA+B,CAAC,CAAC;YAE1E,mBAAmB;YACnB,MAAM,KAAK,GAAG,MAAM,SAAS,CAAC,IAAI,EAAE,CAAC;YACrC,IAAI,CAAC,KAAK,CAAC,IAAI,EAAE,CAAC;gBAChB,OAAO,CAAC,IAAI,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC;YAC5B,CAAC;YAED,qCAAqC;YACpC,gBAAuD,CAAC,UAAU,GAAG,IAAI,CAAC;YAE3E,0BAA0B;YAC1B,IAAI,KAAK,EAAE,MAAM,MAAM,IAAI,SAAS,EAAE,CAAC;gBACrC,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;YACvB,CAAC;YAED,oCAAoC;YACpC,MAAM,CAAC,OAAO,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC;QAClC,CAAC,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;AACL,CAAC,CAAC,CAAC"}
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
import { CrawlResult } from '../types.js';
|
|
2
|
+
import { BaseCrawler } from './base.js';
|
|
3
|
+
/**
 * Crawler that loads pages in a Puppeteer-controlled headless browser and
 * yields the readable text of each page it manages to process.
 */
export declare class PuppeteerCrawler extends BaseCrawler {
|
|
4
|
+
/** Browser instance assigned by crawl(); closed when the crawl ends and when abort() is called. */
private browser?;
|
|
5
|
+
/** User-agent string applied to the crawl page in setupPage(). */
private readonly userAgent;
|
|
6
|
+
/** Number of links per group produced by groupLinks(). */
private readonly LINK_GROUP_SIZE;
|
|
7
|
+
/** Count of pages processed so far; compared against the inherited maxRequestsPerCrawl. */
private curCrawlCount;
|
|
8
|
+
/**
 * Launches a headless browser, crawls starting at `url`, and yields one
 * CrawlResult per page that was processed. The browser is closed when the
 * generator finishes, including on error.
 */
crawl(url: string): AsyncGenerator<CrawlResult, void, unknown>;
|
|
9
|
+
/** Sets user agent and viewport, aborts image/media/font requests, and logs page errors and console warnings. */
private setupPage;
|
|
10
|
+
/** Recursive async generator: processes one URL, yields its result, then descends into its links. */
private crawlSitePages;
|
|
11
|
+
/** Navigates to a URL (waitUntil 'networkidle2'), then waits briefly for a follow-up navigation. */
private gotoPageAndHandleRedirects;
|
|
12
|
+
/** Extracts content, title and links from the loaded page using JSDOM + Readability. */
private processPage;
|
|
13
|
+
/** Collects anchor hrefs, strips fragments, and keeps same-host links under the current path. */
private getLinksFromPage;
|
|
14
|
+
/** Splits a list of links into consecutive groups of LINK_GROUP_SIZE. */
private groupLinks;
|
|
15
|
+
/** Calls the base-class abort() and closes the browser without awaiting it. */
abort(): void;
|
|
16
|
+
}
|
|
@@ -0,0 +1,191 @@
|
|
|
1
|
+
import puppeteer from 'puppeteer';
|
|
2
|
+
import { BaseCrawler } from './base.js';
|
|
3
|
+
import { JSDOM } from 'jsdom';
|
|
4
|
+
import { Readability } from '@mozilla/readability';
|
|
5
|
+
/**
 * Crawler that renders pages in a headless browser (Puppeteer) and extracts
 * their readable text with Mozilla Readability.
 *
 * A single page/tab is reused for the whole crawl; links are followed
 * depth-first. Depth and request limits, URL filtering, and rate limiting
 * come from the BaseCrawler members used below (maxDepth, maxRequestsPerCrawl,
 * shouldCrawl, rateLimit, markUrlAsSeen, getPathFromUrl).
 */
export class PuppeteerCrawler extends BaseCrawler {
    /** Browser launched by crawl(); undefined when no crawl is running. */
    browser;
    /** User-agent string applied to the crawl page. */
    userAgent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36';
    /** Number of links per group produced by groupLinks(). */
    LINK_GROUP_SIZE = 2;
    /** Pages successfully processed during the current crawl. */
    curCrawlCount = 0;

    /**
     * Crawl a site starting at `url`, yielding one result per processed page.
     *
     * @param url Absolute start URL.
     * @returns Async generator of `{ url, path, content, title }` objects.
     * @throws TypeError if `url` is not a valid absolute URL.
     */
    async *crawl(url) {
        // Parse first so an invalid URL fails before a browser process is spawned.
        const startUrl = new URL(url);
        // Give every crawl a fresh request budget, even if the instance is reused.
        this.curCrawlCount = 0;
        try {
            this.browser = await puppeteer.launch({
                headless: true,
                args: [
                    '--no-sandbox',
                    '--disable-setuid-sandbox',
                    '--disable-dev-shm-usage',
                    '--disable-gpu',
                    '--window-size=1280,800'
                ]
            });
            const page = await this.browser.newPage();
            await this.setupPage(page);
            const visitedUrls = new Set();
            yield* this.crawlSitePages(page, startUrl, 0, visitedUrls);
        }
        finally {
            // Drop the reference before closing so a later abort() does not try
            // to close an already-closed browser.
            const browser = this.browser;
            this.browser = undefined;
            await browser?.close();
        }
    }

    /**
     * Configure the page: user agent, viewport, request filtering and logging.
     *
     * @param page Puppeteer page to configure.
     */
    async setupPage(page) {
        await page.setUserAgent(this.userAgent);
        await page.setViewport({ width: 1280, height: 800 });
        // Block only resources that are irrelevant for text extraction.
        await page.setRequestInterception(true);
        const blockedResourceTypes = new Set(['image', 'media', 'font']);
        page.on('request', async (request) => {
            // abort()/continue() return promises; await them here so a rejection
            // is logged instead of surfacing as an unhandled promise rejection.
            try {
                if (blockedResourceTypes.has(request.resourceType())) {
                    await request.abort();
                }
                else {
                    await request.continue();
                }
            }
            catch (error) {
                console.debug('Request interception failed:', error);
            }
        });
        // Surface JavaScript errors raised inside the page.
        page.on('pageerror', error => {
            console.warn('Page error:', error);
        });
        // Forward only error/warn console messages from the page.
        page.on('console', (msg) => {
            const type = msg.type();
            if (type === 'error' || type === 'warn') {
                console.debug(`Console ${type}:`, msg.text());
            }
        });
    }

    /**
     * Process `curUrl`, yield its result, then recurse into its links.
     *
     * Errors for a single page are logged and do not abort the crawl.
     *
     * @param page Shared Puppeteer page used for every navigation.
     * @param curUrl URL object of the page to crawl.
     * @param depth Link distance from the start URL (start URL is 0).
     * @param visitedUrls Set of URL strings already attempted in this crawl.
     */
    async *crawlSitePages(page, curUrl, depth, visitedUrls) {
        const urlStr = curUrl.toString();
        if (visitedUrls.has(urlStr) || !this.shouldCrawl(urlStr) || depth > this.maxDepth) {
            return;
        }
        // Record the attempt up front: a page that fails to load or parse must
        // not be retried every time another page links to it.
        visitedUrls.add(urlStr);
        try {
            // Rate limiting
            await this.rateLimit();
            // Navigate to page with proper redirect handling
            await this.gotoPageAndHandleRedirects(page, urlStr);
            // Extract content
            const { content, title, links } = await this.processPage(page, curUrl);
            this.markUrlAsSeen(urlStr);
            this.curCrawlCount++;
            yield {
                url: urlStr,
                path: this.getPathFromUrl(urlStr),
                content,
                title
            };
            // Follow links only while both the depth and request budgets allow it.
            if (depth < this.maxDepth && this.curCrawlCount < this.maxRequestsPerCrawl) {
                for (const linkGroup of this.groupLinks(links)) {
                    for (const link of linkGroup) {
                        if (this.curCrawlCount >= this.maxRequestsPerCrawl) {
                            return;
                        }
                        yield* this.crawlSitePages(page, new URL(link), depth + 1, visitedUrls);
                    }
                }
            }
        }
        catch (error) {
            console.error(`Error crawling ${urlStr}:`, error);
        }
    }

    /**
     * Navigate to `url`, then give a follow-up navigation a short chance to happen.
     *
     * After the initial load the method waits 500 ms; if no network response
     * was seen in that window it returns, otherwise it waits up to another
     * MAX_PAGE_WAIT_MS or until a navigation completes, whichever is first.
     *
     * @param page Puppeteer page to navigate.
     * @param url URL string to load.
     */
    async gotoPageAndHandleRedirects(page, url) {
        const MAX_PAGE_WAIT_MS = 5000;
        const RESPONSE_GRACE_MS = 500;
        // NOTE(review): timeout 0 disables Puppeteer's navigation timeout, so a
        // page that never reaches 'networkidle2' blocks the crawl indefinitely.
        await page.goto(url, {
            timeout: 0,
            waitUntil: 'networkidle2'
        });
        let responseEventOccurred = false;
        const responseHandler = () => {
            responseEventOccurred = true;
        };
        const timers = [];
        const responseWatcher = new Promise((resolve) => {
            timers.push(setTimeout(() => {
                if (!responseEventOccurred) {
                    resolve();
                    return;
                }
                timers.push(setTimeout(resolve, MAX_PAGE_WAIT_MS));
            }, RESPONSE_GRACE_MS));
        });
        page.on('response', responseHandler);
        try {
            await Promise.race([responseWatcher, page.waitForNavigation()]);
        }
        finally {
            // Always detach the listener and cancel pending timers, including
            // when waitForNavigation() rejects (e.g. the page was closed).
            page.off('response', responseHandler);
            for (const timer of timers) {
                clearTimeout(timer);
            }
        }
    }

    /**
     * Extract readable content, title and outgoing links from the loaded page.
     *
     * @param page Puppeteer page that has already been navigated.
     * @param url URL object of the page, used as the JSDOM document URL.
     * @returns `{ content, title, links }` for the page.
     * @throws Error if Readability cannot parse the page.
     */
    async processPage(page, url) {
        // Wait (up to 5 s) for a main content container to be populated.
        try {
            await page.waitForFunction(() => {
                const mainContent = document.querySelector('main') || document.querySelector('.content') || document.querySelector('#content');
                return mainContent && mainContent.children.length > 0;
            }, { timeout: 5000 });
        }
        catch {
            console.warn('Timeout waiting for main content, proceeding anyway');
        }
        // Extract content using Readability
        const html = await page.content();
        const dom = new JSDOM(html, { url: url.toString() });
        const reader = new Readability(dom.window.document, {
            charThreshold: 20,
            nbTopCandidates: 5,
            maxElemsToParse: 10000
        });
        const article = reader.parse();
        if (!article) {
            throw new Error('Failed to parse page content');
        }
        // Extract links
        const links = await this.getLinksFromPage(page, url);
        return {
            content: article.textContent,
            title: article.title,
            links
        };
    }

    /**
     * Collect the crawlable links of the current page.
     *
     * Fragments are stripped, and only links on the same hostname whose path
     * starts with the current page's path are kept. The page itself is excluded.
     *
     * @param page Puppeteer page to read anchors from.
     * @param curUrl URL object of the current page.
     * @returns De-duplicated absolute URLs in document order.
     */
    async getLinksFromPage(page, curUrl) {
        const hrefs = await page.$$eval('a', (links) => links.map((a) => a.href));
        // Compare against the fragment-less form of the current URL; candidate
        // links have their fragment removed, so a start URL carrying a fragment
        // would otherwise never match itself and be crawled a second time.
        const selfUrl = new URL(curUrl.href);
        selfUrl.hash = '';
        const uniqueLinks = new Set();
        for (const href of hrefs) {
            let candidate;
            try {
                candidate = new URL(href);
            }
            catch {
                // Anchors without a usable href (e.g. empty string) are skipped.
                continue;
            }
            candidate.hash = ''; // Remove hash
            if (candidate.pathname.startsWith(curUrl.pathname) &&
                candidate.hostname === curUrl.hostname &&
                candidate.href !== selfUrl.href) {
                uniqueLinks.add(candidate.href);
            }
        }
        return Array.from(uniqueLinks);
    }

    /**
     * Split `links` into consecutive groups of LINK_GROUP_SIZE.
     *
     * @param links Array of URL strings.
     * @returns Array of arrays; the last group may be shorter.
     */
    groupLinks(links) {
        const groups = [];
        for (let start = 0; start < links.length; start += this.LINK_GROUP_SIZE) {
            groups.push(links.slice(start, start + this.LINK_GROUP_SIZE));
        }
        return groups;
    }

    /**
     * Abort the crawl: delegate to the base class, then close the browser.
     * The close is not awaited; a failure is logged rather than left as an
     * unhandled promise rejection.
     */
    abort() {
        super.abort();
        void this.browser?.close().catch((error) => {
            console.debug('Error closing browser during abort:', error);
        });
    }
}
|
|
191
|
+
//# sourceMappingURL=puppeteer.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"puppeteer.js","sourceRoot":"","sources":["../../src/crawler/puppeteer.ts"],"names":[],"mappings":"AAAA,OAAO,SAA4C,MAAM,WAAW,CAAC;AAErE,OAAO,EAAE,WAAW,EAAE,MAAM,WAAW,CAAC;AACxC,OAAO,EAAE,KAAK,EAAE,MAAM,OAAO,CAAC;AAC9B,OAAO,EAAE,WAAW,EAAE,MAAM,sBAAsB,CAAC;AAEnD,MAAM,OAAO,gBAAiB,SAAQ,WAAW;IACvC,OAAO,CAAW;IACT,SAAS,GAAG,qHAAqH,CAAC;IAClI,eAAe,GAAG,CAAC,CAAC;IAC7B,aAAa,GAAG,CAAC,CAAC;IAE1B,KAAK,CAAC,CAAC,KAAK,CAAC,GAAW;QACtB,IAAI,CAAC;YACH,IAAI,CAAC,OAAO,GAAG,MAAM,SAAS,CAAC,MAAM,CAAC;gBACpC,QAAQ,EAAE,IAAI;gBACd,IAAI,EAAE;oBACJ,cAAc;oBACd,0BAA0B;oBAC1B,yBAAyB;oBACzB,eAAe;oBACf,wBAAwB;iBACzB;aACF,CAAC,CAAC;YAEH,MAAM,IAAI,GAAG,MAAM,IAAI,CAAC,OAAO,CAAC,OAAO,EAAE,CAAC;YAC1C,MAAM,IAAI,CAAC,SAAS,CAAC,IAAI,CAAC,CAAC;YAE3B,MAAM,WAAW,GAAG,IAAI,GAAG,EAAU,CAAC;YACtC,KAAK,CAAC,CAAC,IAAI,CAAC,cAAc,CAAC,IAAI,EAAE,IAAI,GAAG,CAAC,GAAG,CAAC,EAAE,CAAC,EAAE,WAAW,CAAC,CAAC;QACjE,CAAC;gBAAS,CAAC;YACT,MAAM,IAAI,CAAC,OAAO,EAAE,KAAK,EAAE,CAAC;QAC9B,CAAC;IACH,CAAC;IAEO,KAAK,CAAC,SAAS,CAAC,IAAU;QAChC,MAAM,IAAI,CAAC,YAAY,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;QACxC,MAAM,IAAI,CAAC,WAAW,CAAC,EAAE,KAAK,EAAE,IAAI,EAAE,MAAM,EAAE,GAAG,EAAE,CAAC,CAAC;QAErD,mCAAmC;QACnC,MAAM,IAAI,CAAC,sBAAsB,CAAC,IAAI,CAAC,CAAC;QACxC,IAAI,CAAC,EAAE,CAAC,SAAS,EAAE,OAAO,CAAC,EAAE;YAC3B,MAAM,YAAY,GAAG,OAAO,CAAC,YAAY,EAAE,CAAC;YAC5C,IAAI,CAAC,OAAO,EAAE,OAAO,EAAE,MAAM,CAAC,CAAC,QAAQ,CAAC,YAAY,CAAC,EAAE,CAAC;gBACtD,OAAO,CAAC,KAAK,EAAE,CAAC;YAClB,CAAC;iBAAM,CAAC;gBACN,OAAO,CAAC,QAAQ,EAAE,CAAC;YACrB,CAAC;QACH,CAAC,CAAC,CAAC;QAEH,2BAA2B;QAC3B,IAAI,CAAC,EAAE,CAAC,WAAW,EAAE,KAAK,CAAC,EAAE;YAC3B,OAAO,CAAC,IAAI,CAAC,aAAa,EAAE,KAAK,CAAC,CAAC;QACrC,CAAC,CAAC,CAAC;QAEH,0BAA0B;QAC1B,IAAI,CAAC,EAAE,CAAC,SAAS,EAAE,CAAC,GAAmB,EAAE,EAAE;YACzC,MAAM,IAAI,GAAG,GAAG,CAAC,IAAI,EAAE,CAAC;YACxB,IAAI,IAAI,KAAK,OAAO,IAAI,IAAI,KAAK,MAAM,EAAE,CAAC;gBACxC,OAAO,CAAC,KAAK,CAAC,WAAW,IAAI,GAAG,EAAE,GAAG,CAAC,IAAI,EAAE,CAAC,CAAC;YAChD,CAAC;QACH,CAAC,CAAC,CAAC;IACL,CAAC;IAEO,KAAK,CAAC,CAAC,cAAc,CAC3B,IAAU,EACV,MAAW,EACX,KAAa,EACb,WAAwB;QAExB,MAAM,MA
AM,GAAG,MAAM,CAAC,QAAQ,EAAE,CAAC;QAEjC,IAAI,WAAW,CAAC,GAAG,CAAC,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,WAAW,CAAC,MAAM,CAAC,IAAI,KAAK,GAAG,IAAI,CAAC,QAAQ,EAAE,CAAC;YAClF,OAAO;QACT,CAAC;QAED,IAAI,CAAC;YACH,gBAAgB;YAChB,MAAM,IAAI,CAAC,SAAS,EAAE,CAAC;YAEvB,iDAAiD;YACjD,MAAM,IAAI,CAAC,0BAA0B,CAAC,IAAI,EAAE,MAAM,CAAC,CAAC;YAEpD,kBAAkB;YAClB,MAAM,EAAE,OAAO,EAAE,KAAK,EAAE,KAAK,EAAE,GAAG,MAAM,IAAI,CAAC,WAAW,CAAC,IAAI,EAAE,MAAM,CAAC,CAAC;YAEvE,WAAW,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC;YACxB,IAAI,CAAC,aAAa,CAAC,MAAM,CAAC,CAAC;YAC3B,IAAI,CAAC,aAAa,EAAE,CAAC;YAErB,MAAM;gBACJ,GAAG,EAAE,MAAM;gBACX,IAAI,EAAE,IAAI,CAAC,cAAc,CAAC,MAAM,CAAC;gBACjC,OAAO;gBACP,KAAK;aACN,CAAC;YAEF,2BAA2B;YAC3B,IAAI,KAAK,GAAG,IAAI,CAAC,QAAQ,IAAI,IAAI,CAAC,aAAa,GAAG,IAAI,CAAC,mBAAmB,EAAE,CAAC;gBAC3E,MAAM,UAAU,GAAG,IAAI,CAAC,UAAU,CAAC,KAAK,CAAC,CAAC;gBAC1C,KAAK,MAAM,SAAS,IAAI,UAAU,EAAE,CAAC;oBACnC,KAAK,MAAM,IAAI,IAAI,SAAS,EAAE,CAAC;wBAC7B,IAAI,IAAI,CAAC,aAAa,IAAI,IAAI,CAAC,mBAAmB,EAAE,CAAC;4BACnD,OAAO;wBACT,CAAC;wBACD,KAAK,CAAC,CAAC,IAAI,CAAC,cAAc,CAAC,IAAI,EAAE,IAAI,GAAG,CAAC,IAAI,CAAC,EAAE,KAAK,GAAG,CAAC,EAAE,WAAW,CAAC,CAAC;oBAC1E,CAAC;gBACH,CAAC;YACH,CAAC;QACH,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,OAAO,CAAC,KAAK,CAAC,kBAAkB,MAAM,GAAG,EAAE,KAAK,CAAC,CAAC;QACpD,CAAC;IACH,CAAC;IAEO,KAAK,CAAC,0BAA0B,CAAC,IAAU,EAAE,GAAW;QAC9D,MAAM,gBAAgB,GAAG,IAAI,CAAC;QAE9B,MAAM,IAAI,CAAC,IAAI,CAAC,GAAG,EAAE;YACnB,OAAO,EAAE,CAAC;YACV,SAAS,EAAE,cAAc;SAC1B,CAAC,CAAC;QAEH,IAAI,qBAAqB,GAAG,KAAK,CAAC;QAClC,MAAM,eAAe,GAAG,GAAG,EAAE,CAAC,qBAAqB,GAAG,IAAI,CAAC;QAE3D,MAAM,eAAe,GAAG,IAAI,OAAO,CAAO,CAAC,OAAO,EAAE,EAAE;YACpD,UAAU,CAAC,GAAG,EAAE;gBACd,IAAI,CAAC,qBAAqB,EAAE,CAAC;oBAC3B,OAAO,EAAE,CAAC;gBACZ,CAAC;qBAAM,CAAC;oBACN,UAAU,CAAC,GAAG,EAAE,CAAC,OAAO,EAAE,EAAE,gBAAgB,CAAC,CAAC;gBAChD,CAAC;YACH,CAAC,EAAE,GAAG,CAAC,CAAC;QACV,CAAC,CAAC,CAAC;QAEH,IAAI,CAAC,EAAE,CAAC,UAAU,EAAE,eAAe,CAAC,CAAC;QACrC,MAAM,OAAO,CAAC,IAAI,CAAC,CAAC,eAAe,EAAE,IAAI,CAAC,iBAAiB,EAAE,CAAC,CAAC,CAAC;QAChE,IAAI,CAAC,GAAG,CAAC,UAAU,EAAE,eAAe,CAAC,CAAC;IACxC,CAAC;IAEO,KAAK,CAAC,WAAW,C
AAC,IAAU,EAAE,GAAQ;QAC5C,2BAA2B;QAC3B,IAAI,CAAC;YACH,MAAM,IAAI,CAAC,eAAe,CAAC,GAAG,EAAE;gBAC9B,MAAM,WAAW,GAAG,QAAQ,CAAC,aAAa,CAAC,MAAM,CAAC,IAAI,QAAQ,CAAC,aAAa,CAAC,UAAU,CAAC,IAAI,QAAQ,CAAC,aAAa,CAAC,UAAU,CAAC,CAAC;gBAC/H,OAAO,WAAW,IAAI,WAAW,CAAC,QAAQ,CAAC,MAAM,GAAG,CAAC,CAAC;YACxD,CAAC,EAAE,EAAE,OAAO,EAAE,IAAI,EAAE,CAAC,CAAC;QACxB,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,OAAO,CAAC,IAAI,CAAC,qDAAqD,CAAC,CAAC;QACtE,CAAC;QAED,oCAAoC;QACpC,MAAM,IAAI,GAAG,MAAM,IAAI,CAAC,OAAO,EAAE,CAAC;QAClC,MAAM,GAAG,GAAG,IAAI,KAAK,CAAC,IAAI,EAAE,EAAE,GAAG,EAAE,GAAG,CAAC,QAAQ,EAAE,EAAE,CAAC,CAAC;QACrD,MAAM,MAAM,GAAG,IAAI,WAAW,CAAC,GAAG,CAAC,MAAM,CAAC,QAAQ,EAAE;YAClD,aAAa,EAAE,EAAE;YACjB,eAAe,EAAE,CAAC;YAClB,eAAe,EAAE,KAAK;SACvB,CAAC,CAAC;QACH,MAAM,OAAO,GAAG,MAAM,CAAC,KAAK,EAAE,CAAC;QAE/B,IAAI,CAAC,OAAO,EAAE,CAAC;YACb,MAAM,IAAI,KAAK,CAAC,8BAA8B,CAAC,CAAC;QAClD,CAAC;QAED,gBAAgB;QAChB,MAAM,KAAK,GAAG,MAAM,IAAI,CAAC,gBAAgB,CAAC,IAAI,EAAE,GAAG,CAAC,CAAC;QAErD,OAAO;YACL,OAAO,EAAE,OAAO,CAAC,WAAW;YAC5B,KAAK,EAAE,OAAO,CAAC,KAAK;YACpB,KAAK;SACN,CAAC;IACJ,CAAC;IAEO,KAAK,CAAC,gBAAgB,CAAC,IAAU,EAAE,MAAW;QACpD,MAAM,KAAK,GAAG,MAAM,IAAI,CAAC,MAAM,CAAC,GAAG,EAAE,CAAC,KAAK,EAAE,EAAE,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC;QAE1E,MAAM,YAAY,GAAG,KAAK;aACvB,GAAG,CAAC,IAAI,CAAC,EAAE;YACV,IAAI,CAAC;gBACH,MAAM,GAAG,GAAG,IAAI,GAAG,CAAC,IAAI,CAAC,CAAC;gBAC1B,GAAG,CAAC,IAAI,GAAG,EAAE,CAAC,CAAC,cAAc;gBAC7B,OAAO,GAAG,CAAC,IAAI,CAAC;YAClB,CAAC;YAAC,MAAM,CAAC;gBACP,OAAO,IAAI,CAAC;YACd,CAAC;QACH,CAAC,CAAC;aACD,MAAM,CAAC,CAAC,IAAI,EAAkB,EAAE;YAC/B,IAAI,CAAC,IAAI;gBAAE,OAAO,KAAK,CAAC;YACxB,IAAI,CAAC;gBACH,MAAM,GAAG,GAAG,IAAI,GAAG,CAAC,IAAI,CAAC,CAAC;gBAC1B,OAAO,CACL,GAAG,CAAC,QAAQ,CAAC,UAAU,CAAC,MAAM,CAAC,QAAQ,CAAC;oBACxC,GAAG,CAAC,QAAQ,KAAK,MAAM,CAAC,QAAQ;oBAChC,IAAI,KAAK,MAAM,CAAC,IAAI,CACrB,CAAC;YACJ,CAAC;YAAC,MAAM,CAAC;gBACP,OAAO,KAAK,CAAC;YACf,CAAC;QACH,CAAC,CAAC,CAAC;QAEL,OAAO,KAAK,CAAC,IAAI,CAAC,IAAI,GAAG,CAAC,YAAY,CAAC,CAAC,CAAC;IAC3C,CAAC;IAEO,UAAU,CAAC,KAAe;QAChC,OAAO,KAAK,CAAC,
MAAM,CAAC,CAAC,GAAG,EAAE,IAAI,EAAE,CAAC,EAAE,EAAE;YACnC,MAAM,UAAU,GAAG,IAAI,CAAC,KAAK,CAAC,CAAC,GAAG,IAAI,CAAC,eAAe,CAAC,CAAC;YACxD,IAAI,CAAC,GAAG,CAAC,UAAU,CAAC,EAAE,CAAC;gBACrB,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;YACf,CAAC;YACD,GAAG,CAAC,UAAU,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YAC3B,OAAO,GAAG,CAAC;QACb,CAAC,EAAE,EAAgB,CAAC,CAAC;IACvB,CAAC;IAED,KAAK;QACH,KAAK,CAAC,KAAK,EAAE,CAAC;QACd,KAAK,IAAI,CAAC,OAAO,EAAE,KAAK,EAAE,CAAC;IAC7B,CAAC;CACF"}
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
import { RequestQueue, Log, EnqueueLinksOptions } from 'crawlee';
|
|
2
|
+
import { CrawlResult } from '../types.js';
|
|
3
|
+
import { SiteDetectionRule } from './site-rules.js';
|
|
4
|
+
/**
 * Type declaration for the crawler's queue helper.
 *
 * NOTE(review): only the declaration is visible here. Comments marked
 * "presumably" are inferred from member names and signatures and should be
 * confirmed against the implementation.
 */
export declare class QueueManager {
|
|
5
|
+
private requestQueue;
|
|
6
|
+
private websiteId;
|
|
7
|
+
private results;
|
|
8
|
+
private static readonly BATCH_SIZE;
|
|
9
|
+
/** Optional path prefix to restrict crawling to URLs under this path */
|
|
10
|
+
private pathPrefix;
|
|
11
|
+
/** The allowed hostname - only URLs with this exact hostname (or its subdomains) are allowed */
|
|
12
|
+
private allowedHostname;
|
|
13
|
+
/** Count of URLs filtered due to path prefix mismatch */
|
|
14
|
+
private filteredByPathCount;
|
|
15
|
+
/** Count of URLs filtered due to hostname mismatch */
|
|
16
|
+
private filteredByHostnameCount;
|
|
17
|
+
/** Presumably prepares the queue for `url` and records the optional `pathPrefix` restriction — TODO confirm. */
initialize(url: string, pathPrefix?: string): Promise<void>;
|
|
18
|
+
/** Presumably returns the filteredByPathCount counter — TODO confirm. */
getFilteredByPathCount(): number;
|
|
19
|
+
/** Presumably returns the filteredByHostnameCount counter — TODO confirm. */
getFilteredByHostnameCount(): number;
|
|
20
|
+
/**
|
|
21
|
+
* Check if a hostname matches the allowed hostname.
|
|
22
|
+
* Allows exact match or subdomains of the allowed hostname.
|
|
23
|
+
* Does NOT allow sibling subdomains or parent domains.
|
|
24
|
+
*
|
|
25
|
+
* @example
|
|
26
|
+
* If allowedHostname is 'docs.example.com':
|
|
27
|
+
* - 'docs.example.com' → true (exact match)
|
|
28
|
+
* - 'api.docs.example.com' → true (subdomain)
|
|
29
|
+
* - 'example.com' → false (parent domain)
|
|
30
|
+
* - 'python.example.com' → false (sibling subdomain)
|
|
31
|
+
*/
|
|
32
|
+
private isHostnameAllowed;
|
|
33
|
+
/**
 * Takes the crawler's `enqueueLinks` callback, a logger and a site rule.
 * NOTE(review): presumably enqueues discovered links subject to the path and
 * hostname filters documented above — confirm against the implementation.
 */
handleQueueAndLinks(enqueueLinks: (options: EnqueueLinksOptions) => Promise<{
|
|
34
|
+
processedRequests: {
|
|
35
|
+
uniqueKey: string;
|
|
36
|
+
}[];
|
|
37
|
+
}>, log: Log, rule: SiteDetectionRule): Promise<void>;
|
|
38
|
+
/** Resolves to an array of CrawlResult; presumably the results accumulated so far — TODO confirm. */
processBatch(): Promise<CrawlResult[]>;
|
|
39
|
+
/** Accepts one CrawlResult; presumably appends it to the pending results — TODO confirm. */
addResult(result: CrawlResult): void;
|
|
40
|
+
/** Presumably reports whether the pending results have reached BATCH_SIZE — TODO confirm. */
hasEnoughResults(): boolean;
|
|
41
|
+
/** Returns the request queue, or null per the declared type (presumably before initialize() or after cleanup() — TODO confirm). */
getRequestQueue(): RequestQueue | null;
|
|
42
|
+
/** Presumably releases the request queue and resets internal state — TODO confirm. */
cleanup(): Promise<void>;
|
|
43
|
+
}
|