@cosmocoder/mcp-web-docs 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +22 -0
- package/README.md +368 -0
- package/build/__mocks__/embeddings.d.ts +17 -0
- package/build/__mocks__/embeddings.js +66 -0
- package/build/__mocks__/embeddings.js.map +1 -0
- package/build/config.d.ts +44 -0
- package/build/config.js +158 -0
- package/build/config.js.map +1 -0
- package/build/config.test.d.ts +1 -0
- package/build/config.test.js +165 -0
- package/build/config.test.js.map +1 -0
- package/build/crawler/auth.d.ts +128 -0
- package/build/crawler/auth.js +546 -0
- package/build/crawler/auth.js.map +1 -0
- package/build/crawler/auth.test.d.ts +1 -0
- package/build/crawler/auth.test.js +174 -0
- package/build/crawler/auth.test.js.map +1 -0
- package/build/crawler/base.d.ts +24 -0
- package/build/crawler/base.js +149 -0
- package/build/crawler/base.js.map +1 -0
- package/build/crawler/base.test.d.ts +1 -0
- package/build/crawler/base.test.js +234 -0
- package/build/crawler/base.test.js.map +1 -0
- package/build/crawler/browser-config.d.ts +2 -0
- package/build/crawler/browser-config.js +29 -0
- package/build/crawler/browser-config.js.map +1 -0
- package/build/crawler/browser-config.test.d.ts +1 -0
- package/build/crawler/browser-config.test.js +56 -0
- package/build/crawler/browser-config.test.js.map +1 -0
- package/build/crawler/cheerio.d.ts +11 -0
- package/build/crawler/cheerio.js +134 -0
- package/build/crawler/cheerio.js.map +1 -0
- package/build/crawler/chromium.d.ts +21 -0
- package/build/crawler/chromium.js +596 -0
- package/build/crawler/chromium.js.map +1 -0
- package/build/crawler/content-extractor-types.d.ts +25 -0
- package/build/crawler/content-extractor-types.js +2 -0
- package/build/crawler/content-extractor-types.js.map +1 -0
- package/build/crawler/content-extractors.d.ts +9 -0
- package/build/crawler/content-extractors.js +9 -0
- package/build/crawler/content-extractors.js.map +1 -0
- package/build/crawler/content-utils.d.ts +2 -0
- package/build/crawler/content-utils.js +22 -0
- package/build/crawler/content-utils.js.map +1 -0
- package/build/crawler/content-utils.test.d.ts +1 -0
- package/build/crawler/content-utils.test.js +99 -0
- package/build/crawler/content-utils.test.js.map +1 -0
- package/build/crawler/crawlee-crawler.d.ts +63 -0
- package/build/crawler/crawlee-crawler.js +342 -0
- package/build/crawler/crawlee-crawler.js.map +1 -0
- package/build/crawler/crawlee-crawler.test.d.ts +1 -0
- package/build/crawler/crawlee-crawler.test.js +280 -0
- package/build/crawler/crawlee-crawler.test.js.map +1 -0
- package/build/crawler/default-extractor.d.ts +4 -0
- package/build/crawler/default-extractor.js +26 -0
- package/build/crawler/default-extractor.js.map +1 -0
- package/build/crawler/default-extractor.test.d.ts +1 -0
- package/build/crawler/default-extractor.test.js +200 -0
- package/build/crawler/default-extractor.test.js.map +1 -0
- package/build/crawler/default.d.ts +11 -0
- package/build/crawler/default.js +138 -0
- package/build/crawler/default.js.map +1 -0
- package/build/crawler/docs-crawler.d.ts +26 -0
- package/build/crawler/docs-crawler.js +97 -0
- package/build/crawler/docs-crawler.js.map +1 -0
- package/build/crawler/docs-crawler.test.d.ts +1 -0
- package/build/crawler/docs-crawler.test.js +185 -0
- package/build/crawler/docs-crawler.test.js.map +1 -0
- package/build/crawler/factory.d.ts +6 -0
- package/build/crawler/factory.js +83 -0
- package/build/crawler/factory.js.map +1 -0
- package/build/crawler/github-pages-extractor.d.ts +4 -0
- package/build/crawler/github-pages-extractor.js +33 -0
- package/build/crawler/github-pages-extractor.js.map +1 -0
- package/build/crawler/github-pages-extractor.test.d.ts +1 -0
- package/build/crawler/github-pages-extractor.test.js +184 -0
- package/build/crawler/github-pages-extractor.test.js.map +1 -0
- package/build/crawler/github.d.ts +20 -0
- package/build/crawler/github.js +181 -0
- package/build/crawler/github.js.map +1 -0
- package/build/crawler/github.test.d.ts +1 -0
- package/build/crawler/github.test.js +326 -0
- package/build/crawler/github.test.js.map +1 -0
- package/build/crawler/puppeteer.d.ts +16 -0
- package/build/crawler/puppeteer.js +191 -0
- package/build/crawler/puppeteer.js.map +1 -0
- package/build/crawler/queue-manager.d.ts +43 -0
- package/build/crawler/queue-manager.js +169 -0
- package/build/crawler/queue-manager.js.map +1 -0
- package/build/crawler/queue-manager.test.d.ts +1 -0
- package/build/crawler/queue-manager.test.js +509 -0
- package/build/crawler/queue-manager.test.js.map +1 -0
- package/build/crawler/site-rules.d.ts +11 -0
- package/build/crawler/site-rules.js +104 -0
- package/build/crawler/site-rules.js.map +1 -0
- package/build/crawler/site-rules.test.d.ts +1 -0
- package/build/crawler/site-rules.test.js +139 -0
- package/build/crawler/site-rules.test.js.map +1 -0
- package/build/crawler/storybook-extractor.d.ts +34 -0
- package/build/crawler/storybook-extractor.js +767 -0
- package/build/crawler/storybook-extractor.js.map +1 -0
- package/build/crawler/storybook-extractor.test.d.ts +1 -0
- package/build/crawler/storybook-extractor.test.js +491 -0
- package/build/crawler/storybook-extractor.test.js.map +1 -0
- package/build/embeddings/fastembed.d.ts +25 -0
- package/build/embeddings/fastembed.js +188 -0
- package/build/embeddings/fastembed.js.map +1 -0
- package/build/embeddings/fastembed.test.d.ts +1 -0
- package/build/embeddings/fastembed.test.js +307 -0
- package/build/embeddings/fastembed.test.js.map +1 -0
- package/build/embeddings/openai.d.ts +8 -0
- package/build/embeddings/openai.js +56 -0
- package/build/embeddings/openai.js.map +1 -0
- package/build/embeddings/types.d.ts +4 -0
- package/build/embeddings/types.js +2 -0
- package/build/embeddings/types.js.map +1 -0
- package/build/index.d.ts +2 -0
- package/build/index.js +1007 -0
- package/build/index.js.map +1 -0
- package/build/index.test.d.ts +1 -0
- package/build/index.test.js +364 -0
- package/build/index.test.js.map +1 -0
- package/build/indexing/queue-manager.d.ts +36 -0
- package/build/indexing/queue-manager.js +86 -0
- package/build/indexing/queue-manager.js.map +1 -0
- package/build/indexing/queue-manager.test.d.ts +1 -0
- package/build/indexing/queue-manager.test.js +257 -0
- package/build/indexing/queue-manager.test.js.map +1 -0
- package/build/indexing/status.d.ts +39 -0
- package/build/indexing/status.js +207 -0
- package/build/indexing/status.js.map +1 -0
- package/build/indexing/status.test.d.ts +1 -0
- package/build/indexing/status.test.js +246 -0
- package/build/indexing/status.test.js.map +1 -0
- package/build/processor/content.d.ts +16 -0
- package/build/processor/content.js +286 -0
- package/build/processor/content.js.map +1 -0
- package/build/processor/content.test.d.ts +1 -0
- package/build/processor/content.test.js +369 -0
- package/build/processor/content.test.js.map +1 -0
- package/build/processor/markdown.d.ts +11 -0
- package/build/processor/markdown.js +256 -0
- package/build/processor/markdown.js.map +1 -0
- package/build/processor/markdown.test.d.ts +1 -0
- package/build/processor/markdown.test.js +312 -0
- package/build/processor/markdown.test.js.map +1 -0
- package/build/processor/metadata-parser.d.ts +37 -0
- package/build/processor/metadata-parser.js +245 -0
- package/build/processor/metadata-parser.js.map +1 -0
- package/build/processor/metadata-parser.test.d.ts +1 -0
- package/build/processor/metadata-parser.test.js +357 -0
- package/build/processor/metadata-parser.test.js.map +1 -0
- package/build/processor/processor.d.ts +8 -0
- package/build/processor/processor.js +190 -0
- package/build/processor/processor.js.map +1 -0
- package/build/processor/processor.test.d.ts +1 -0
- package/build/processor/processor.test.js +357 -0
- package/build/processor/processor.test.js.map +1 -0
- package/build/rag/cache.d.ts +10 -0
- package/build/rag/cache.js +10 -0
- package/build/rag/cache.js.map +1 -0
- package/build/rag/code-generator.d.ts +11 -0
- package/build/rag/code-generator.js +30 -0
- package/build/rag/code-generator.js.map +1 -0
- package/build/rag/context-assembler.d.ts +23 -0
- package/build/rag/context-assembler.js +113 -0
- package/build/rag/context-assembler.js.map +1 -0
- package/build/rag/docs-search.d.ts +55 -0
- package/build/rag/docs-search.js +380 -0
- package/build/rag/docs-search.js.map +1 -0
- package/build/rag/pipeline.d.ts +26 -0
- package/build/rag/pipeline.js +91 -0
- package/build/rag/pipeline.js.map +1 -0
- package/build/rag/query-processor.d.ts +14 -0
- package/build/rag/query-processor.js +57 -0
- package/build/rag/query-processor.js.map +1 -0
- package/build/rag/reranker.d.ts +55 -0
- package/build/rag/reranker.js +210 -0
- package/build/rag/reranker.js.map +1 -0
- package/build/rag/response-generator.d.ts +20 -0
- package/build/rag/response-generator.js +101 -0
- package/build/rag/response-generator.js.map +1 -0
- package/build/rag/retriever.d.ts +19 -0
- package/build/rag/retriever.js +111 -0
- package/build/rag/retriever.js.map +1 -0
- package/build/rag/validator.d.ts +22 -0
- package/build/rag/validator.js +128 -0
- package/build/rag/validator.js.map +1 -0
- package/build/rag/version-manager.d.ts +23 -0
- package/build/rag/version-manager.js +98 -0
- package/build/rag/version-manager.js.map +1 -0
- package/build/setupTests.d.ts +4 -0
- package/build/setupTests.js +50 -0
- package/build/setupTests.js.map +1 -0
- package/build/storage/storage.d.ts +38 -0
- package/build/storage/storage.js +700 -0
- package/build/storage/storage.js.map +1 -0
- package/build/storage/storage.test.d.ts +1 -0
- package/build/storage/storage.test.js +338 -0
- package/build/storage/storage.test.js.map +1 -0
- package/build/types/rag.d.ts +27 -0
- package/build/types/rag.js +2 -0
- package/build/types/rag.js.map +1 -0
- package/build/types.d.ts +120 -0
- package/build/types.js +2 -0
- package/build/types.js.map +1 -0
- package/build/util/content-utils.d.ts +31 -0
- package/build/util/content-utils.js +120 -0
- package/build/util/content-utils.js.map +1 -0
- package/build/util/content.d.ts +1 -0
- package/build/util/content.js +16 -0
- package/build/util/content.js.map +1 -0
- package/build/util/docs.d.ts +1 -0
- package/build/util/docs.js +26 -0
- package/build/util/docs.js.map +1 -0
- package/build/util/docs.test.d.ts +1 -0
- package/build/util/docs.test.js +49 -0
- package/build/util/docs.test.js.map +1 -0
- package/build/util/favicon.d.ts +6 -0
- package/build/util/favicon.js +88 -0
- package/build/util/favicon.js.map +1 -0
- package/build/util/favicon.test.d.ts +1 -0
- package/build/util/favicon.test.js +140 -0
- package/build/util/favicon.test.js.map +1 -0
- package/build/util/logger.d.ts +17 -0
- package/build/util/logger.js +72 -0
- package/build/util/logger.js.map +1 -0
- package/build/util/logger.test.d.ts +1 -0
- package/build/util/logger.test.js +46 -0
- package/build/util/logger.test.js.map +1 -0
- package/build/util/security.d.ts +312 -0
- package/build/util/security.js +719 -0
- package/build/util/security.js.map +1 -0
- package/build/util/security.test.d.ts +1 -0
- package/build/util/security.test.js +524 -0
- package/build/util/security.test.js.map +1 -0
- package/build/util/site-detector.d.ts +22 -0
- package/build/util/site-detector.js +42 -0
- package/build/util/site-detector.js.map +1 -0
- package/package.json +112 -0
|
@@ -0,0 +1,280 @@
|
|
|
1
|
+
/**
 * Shared stand-in returned for every QueueManager construction in this file.
 * Each member is a `vi.fn()` so tests can inspect calls and queue one-off
 * return values on top of the defaults configured here.
 */
const mockQueueManager = {
  initialize: vi.fn().mockResolvedValue(undefined),
  getRequestQueue: vi.fn().mockReturnValue({}),
  handleQueueAndLinks: vi.fn().mockResolvedValue(undefined),
  addResult: vi.fn(),
  hasEnoughResults: vi.fn().mockReturnValue(false),
  processBatch: vi.fn().mockResolvedValue([]),
  cleanup: vi.fn().mockResolvedValue(undefined),
};

// Swap the real module for a plain (non-arrow) function that hands back the
// shared mock. `mockQueueManager` is only read when that function is invoked,
// not when the factory itself runs.
vi.mock('./queue-manager.js', () => {
  function QueueManager() {
    return mockQueueManager;
  }
  return { QueueManager };
});
|
|
15
|
+
// Stub the browser configuration module: getBrowserConfig always returns the
// same object with empty placeholders for each option.
vi.mock('./browser-config.js', () => {
  const stubConfig = {
    requestQueue: {},
    preNavigationHooks: [],
    launchContext: {},
    browserPoolOptions: {},
  };
  return {
    getBrowserConfig: vi.fn().mockReturnValue(stubConfig),
  };
});
|
|
23
|
+
// Stub the site rules with a single 'default' rule whose detector resolves to
// true and whose extractor resolves to a fixed payload.
vi.mock('./site-rules.js', () => {
  const extractor = {
    extractContent: vi.fn().mockResolvedValue({ content: 'Extracted content', metadata: {} }),
  };
  const defaultRule = {
    type: 'default',
    extractor,
    detect: vi.fn().mockResolvedValue(true),
  };
  return { siteRules: [defaultRule] };
});
|
|
34
|
+
// Mock PlaywrightCrawler: `run` and `teardown` are module-level spies so the
// tests below can control and inspect them.
const mockCrawlerRun = vi.fn().mockResolvedValue(undefined);
const mockCrawlerTeardown = vi.fn().mockResolvedValue(undefined);

vi.mock('crawlee', () => {
  // Plain function (not an arrow) that returns the fake crawler object. The
  // spies are only read when the function is invoked, not when the factory runs.
  function PlaywrightCrawler(options) {
    // Keep the request handler reachable from tests via a global.
    global.__requestHandler = options.requestHandler;
    const fakeCrawler = {
      run: mockCrawlerRun,
      teardown: mockCrawlerTeardown,
    };
    return fakeCrawler;
  }
  return { PlaywrightCrawler };
});
|
|
47
|
+
// Import after mocking
|
|
48
|
+
import { CrawleeCrawler } from './crawlee-crawler.js';
|
|
49
|
+
describe('CrawleeCrawler', () => {
|
|
50
|
+
let crawler;
|
|
51
|
+
beforeEach(() => {
  // Wipe recorded calls on every mock before building a fresh crawler.
  vi.clearAllMocks();
  crawler = new CrawleeCrawler();
  // Re-arm the queue-manager defaults that individual tests may have overridden.
  const { hasEnoughResults, processBatch } = mockQueueManager;
  hasEnoughResults.mockReturnValue(false);
  processBatch.mockResolvedValue([]);
});
|
|
57
|
+
// Construction smoke tests: each variant only checks that an instance is produced.
describe('constructor', () => {
  it('should initialize with default values', () => {
    expect(crawler).toBeDefined();
  });

  it('should accept custom maxDepth and maxRequestsPerCrawl', () => {
    expect(new CrawleeCrawler(10, 500)).toBeDefined();
  });

  it('should accept progress callback', () => {
    const onProgress = vi.fn();
    expect(new CrawleeCrawler(4, 1000, onProgress)).toBeDefined();
  });
});
|
|
71
|
+
// setStorageState is exercised for acceptance only: the contract pinned here is
// that supplying a storage state object does not throw.
describe('setStorageState', () => {
  it('should accept storage state', () => {
    const state = {
      cookies: [{ name: 'session', value: 'abc123', domain: 'example.com', path: '/' }],
    };
    // Assert the real contract instead of a tautology such as expect(true).toBe(true).
    expect(() => crawler.setStorageState(state)).not.toThrow();
  });

  it('should accept storage state with origins', () => {
    const state = {
      cookies: [{ name: 'session', value: 'abc123', domain: 'example.com', path: '/' }],
      origins: [
        {
          origin: 'https://example.com',
          localStorage: [{ name: 'token', value: 'xyz' }],
        },
      ],
    };
    expect(() => crawler.setStorageState(state)).not.toThrow();
  });
});
|
|
94
|
+
describe('crawl', () => {
  // Runs crawler.crawl(url) to completion and returns everything it yielded.
  // `crawler` is read at call time, so the instance from beforeEach is used.
  const collect = async (url = 'https://example.com') => {
    const yielded = [];
    for await (const page of crawler.crawl(url)) {
      yielded.push(page);
    }
    return yielded;
  };

  it('should initialize queue manager with URL', async () => {
    // One-shot values so run() resolves and the batch is empty for this crawl.
    mockCrawlerRun.mockResolvedValueOnce(undefined);
    mockQueueManager.processBatch.mockResolvedValueOnce([]);

    await collect('https://example.com/docs');

    expect(mockQueueManager.initialize).toHaveBeenCalledWith('https://example.com/docs', undefined);
  });

  it('should yield results from queue manager', async () => {
    const expectedPages = [
      { url: 'https://example.com/page1', path: '/page1', content: 'Page 1', title: 'Page 1' },
      { url: 'https://example.com/page2', path: '/page2', content: 'Page 2', title: 'Page 2' },
    ];
    // hasEnoughResults is stubbed to false, so (per the original author's note)
    // processBatch is only called once, at the end of crawl(); one queued value suffices.
    mockQueueManager.processBatch.mockResolvedValueOnce(expectedPages);

    const yielded = await collect();

    expect(yielded).toEqual(expectedPages);
  });

  it('should cleanup queue manager after crawl', async () => {
    await collect();
    expect(mockQueueManager.cleanup).toHaveBeenCalled();
  });

  it('should process batch when enough results accumulated', async () => {
    const singlePage = [{ url: 'https://example.com/page1', path: '/page1', content: 'Page 1', title: 'Page 1' }];
    mockQueueManager.hasEnoughResults.mockReturnValueOnce(true).mockReturnValue(false);
    mockQueueManager.processBatch.mockResolvedValueOnce(singlePage).mockResolvedValueOnce([]);

    const yielded = await collect();

    expect(yielded).toHaveLength(1);
  });

  it('should set allowed hostname from URL', async () => {
    await collect('https://docs.example.com/guide');
    // Verified indirectly through the arguments passed to initialize.
    expect(mockQueueManager.initialize).toHaveBeenCalledWith('https://docs.example.com/guide', undefined);
  });
});
|
|
145
|
+
describe('abort', () => {
  it('should stop the crawler', async () => {
    // run() stays pending until released, so abort() is called mid-crawl.
    let releaseRun;
    const pendingRun = new Promise((resolve) => {
      releaseRun = resolve;
    });

    // Signal the moment run() is actually invoked. Waiting on this replaces a
    // fixed 10 ms sleep, which only guessed that the crawl had started.
    let markRunStarted;
    const runStarted = new Promise((resolve) => {
      markRunStarted = resolve;
    });
    mockCrawlerRun.mockImplementationOnce(() => {
      markRunStarted();
      return pendingRun;
    });

    const abortableCrawler = new CrawleeCrawler();
    const generator = abortableCrawler.crawl('https://example.com');

    // Requesting the first value starts the generator body.
    const firstStep = generator.next();

    // Abort only once the underlying crawler has begun running.
    await runStarted;
    abortableCrawler.abort();

    // Let run() settle so the generator can finish, then drain it.
    releaseRun();
    await firstStep;
    // eslint-disable-next-line @typescript-eslint/no-unused-vars
    for await (const _ of generator) {
      // Just consume results
    }

    // NOTE(review): if crawl() also tears the crawler down on normal completion,
    // this assertion would pass even without abort() — confirm against the implementation.
    expect(mockCrawlerTeardown).toHaveBeenCalled();
  });
});
|
|
174
|
+
describe('domain restriction', () => {
  // Consumes crawler.crawl(url) to completion, discarding whatever it yields.
  const drain = async (url = 'https://example.com') => {
    // eslint-disable-next-line @typescript-eslint/no-unused-vars
    for await (const _page of crawler.crawl(url)) {
      // Just consume results
    }
  };

  it('should extract hostname from URL for domain restriction', async () => {
    const startUrl = 'https://subdomain.example.com/path';
    await drain(startUrl);
    expect(mockQueueManager.initialize).toHaveBeenCalledWith('https://subdomain.example.com/path', undefined);
  });

  it('should pass path prefix to queue manager when set', async () => {
    crawler.setPathPrefix('/docs/api');
    await drain('https://example.com/docs/api');
    expect(mockQueueManager.initialize).toHaveBeenCalledWith('https://example.com/docs/api', '/docs/api');
  });
});
|
|
191
|
+
describe('authentication', () => {
  it('should configure crawler with storage state when set', async () => {
    const authCookie = { name: 'auth', value: 'token123', domain: 'example.com', path: '/' };
    crawler.setStorageState({ cookies: [authCookie] });

    // eslint-disable-next-line @typescript-eslint/no-unused-vars
    for await (const _page of crawler.crawl('https://example.com')) {
      // Just consume results
    }

    // Only checks that the crawl reached queue initialization; the storage
    // state itself is not inspected here.
    expect(mockQueueManager.initialize).toHaveBeenCalled();
  });
});
|
|
205
|
+
describe('isWithinAllowedDomain', () => {
  // Covered indirectly: the test drives crawl() rather than calling the method itself.
  it('should handle URL parsing for domain check', async () => {
    const walk = crawler.crawl('https://example.com');
    // eslint-disable-next-line @typescript-eslint/no-unused-vars
    for await (const _page of walk) {
      // Just consume results
    }

    // Completing the crawl without an error and reaching initialize is the signal.
    expect(mockQueueManager.initialize).toHaveBeenCalled();
  });
});
|
|
217
|
+
describe('error handling', () => {
  // Consumes crawler.crawl('https://example.com') to completion, discarding output.
  const drain = async () => {
    // eslint-disable-next-line @typescript-eslint/no-unused-vars
    for await (const _page of crawler.crawl('https://example.com')) {
      // Just consume results
    }
  };

  it('should cleanup on error', async () => {
    mockCrawlerRun.mockRejectedValueOnce(new Error('Crawl failed'));

    // The rejection from run() must surface to the consumer of the generator...
    await expect(() => drain()).rejects.toThrow('Crawl failed');

    // ...and the queue manager must still be cleaned up.
    expect(mockQueueManager.cleanup).toHaveBeenCalled();
  });

  it('should handle invalid URLs gracefully', async () => {
    // NOTE(review): the URL used here is well-formed, so this does not exercise
    // an invalid-URL path despite the test name — confirm the intended input.
    await drain();
    expect(mockQueueManager.initialize).toHaveBeenCalled();
  });
});
|
|
237
|
+
});
|
|
238
|
+
// Shape checks on plain storage-state literals; no crawler is involved.
describe('StorageState interface', () => {
  it('should allow cookies with all optional properties', () => {
    const fullCookie = {
      name: 'session',
      value: 'abc',
      domain: 'example.com',
      path: '/',
      expires: 1234567890,
      httpOnly: true,
      secure: true,
      sameSite: 'Strict',
    };
    const state = { cookies: [fullCookie] };

    expect(state.cookies).toHaveLength(1);
    expect(state.cookies[0].sameSite).toBe('Strict');
  });

  it('should allow minimal cookie definition', () => {
    const state = {
      cookies: [{ name: 'token', value: 'xyz', domain: 'test.com', path: '/' }],
    };
    const [onlyCookie] = state.cookies;

    expect(onlyCookie.expires).toBeUndefined();
  });

  it('should allow origins for localStorage', () => {
    const entries = [
      { name: 'key1', value: 'value1' },
      { name: 'key2', value: 'value2' },
    ];
    const state = {
      cookies: [],
      origins: [{ origin: 'https://example.com', localStorage: entries }],
    };

    expect(state.origins).toHaveLength(1);
    expect(state.origins[0].localStorage).toHaveLength(2);
  });
});
|
|
280
|
+
//# sourceMappingURL=crawlee-crawler.test.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"crawlee-crawler.test.js","sourceRoot":"","sources":["../../src/crawler/crawlee-crawler.test.ts"],"names":[],"mappings":"AAEA,MAAM,gBAAgB,GAAG;IACvB,UAAU,EAAE,EAAE,CAAC,EAAE,EAAE,CAAC,iBAAiB,CAAC,SAAS,CAAC;IAChD,eAAe,EAAE,EAAE,CAAC,EAAE,EAAE,CAAC,eAAe,CAAC,EAAE,CAAC;IAC5C,mBAAmB,EAAE,EAAE,CAAC,EAAE,EAAE,CAAC,iBAAiB,CAAC,SAAS,CAAC;IACzD,SAAS,EAAE,EAAE,CAAC,EAAE,EAAE;IAClB,gBAAgB,EAAE,EAAE,CAAC,EAAE,EAAE,CAAC,eAAe,CAAC,KAAK,CAAC;IAChD,YAAY,EAAE,EAAE,CAAC,EAAE,EAAE,CAAC,iBAAiB,CAAC,EAAE,CAAC;IAC3C,OAAO,EAAE,EAAE,CAAC,EAAE,EAAE,CAAC,iBAAiB,CAAC,SAAS,CAAC;CAC9C,CAAC;AAEF,EAAE,CAAC,IAAI,CAAC,oBAAoB,EAAE,GAAG,EAAE,CAAC,CAAC;IACnC,YAAY,EAAE;QACZ,OAAO,gBAAgB,CAAC;IAC1B,CAAC;CACF,CAAC,CAAC,CAAC;AAEJ,EAAE,CAAC,IAAI,CAAC,qBAAqB,EAAE,GAAG,EAAE,CAAC,CAAC;IACpC,gBAAgB,EAAE,EAAE,CAAC,EAAE,EAAE,CAAC,eAAe,CAAC;QACxC,YAAY,EAAE,EAAE;QAChB,kBAAkB,EAAE,EAAE;QACtB,aAAa,EAAE,EAAE;QACjB,kBAAkB,EAAE,EAAE;KACvB,CAAC;CACH,CAAC,CAAC,CAAC;AAEJ,EAAE,CAAC,IAAI,CAAC,iBAAiB,EAAE,GAAG,EAAE,CAAC,CAAC;IAChC,SAAS,EAAE;QACT;YACE,IAAI,EAAE,SAAS;YACf,SAAS,EAAE;gBACT,cAAc,EAAE,EAAE,CAAC,EAAE,EAAE,CAAC,iBAAiB,CAAC,EAAE,OAAO,EAAE,mBAAmB,EAAE,QAAQ,EAAE,EAAE,EAAE,CAAC;aAC1F;YACD,MAAM,EAAE,EAAE,CAAC,EAAE,EAAE,CAAC,iBAAiB,CAAC,IAAI,CAAC;SACxC;KACF;CACF,CAAC,CAAC,CAAC;AAEJ,yBAAyB;AACzB,MAAM,cAAc,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,iBAAiB,CAAC,SAAS,CAAC,CAAC;AAC5D,MAAM,mBAAmB,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,iBAAiB,CAAC,SAAS,CAAC,CAAC;AAEjE,EAAE,CAAC,IAAI,CAAC,SAAS,EAAE,GAAG,EAAE,CAAC,CAAC;IACxB,iBAAiB,EAAE,UAAU,OAAqC;QAChE,wCAAwC;QACvC,MAAyC,CAAC,gBAAgB,GAAG,OAAO,CAAC,cAAc,CAAC;QACrF,OAAO;YACL,GAAG,EAAE,cAAc;YACnB,QAAQ,EAAE,mBAAmB;SAC9B,CAAC;IACJ,CAAC;CACF,CAAC,CAAC,CAAC;AAEJ,uBAAuB;AACvB,OAAO,EAAE,cAAc,EAAgB,MAAM,sBAAsB,CAAC;AAEpE,QAAQ,CAAC,gBAAgB,EAAE,GAAG,EAAE;IAC9B,IAAI,OAAuB,CAAC;IAE5B,UAAU,CAAC,GAAG,EAAE;QACd,EAAE,CAAC,aAAa,EAAE,CAAC;QACnB,OAAO,GAAG,IAAI,cAAc,EAAE,CAAC;QAC/B,gBAAgB,CAAC,gBAAgB,CAAC,eAAe,CAAC,KAAK,CAAC,CAAC;QACzD,gBAAgB,CAAC,YAAY,CAAC,iBAAiB,CAAC,EAAE,CAAC,CAAC;IACtD,CAAC,CAAC,CAAC
;IAEH,QAAQ,CAAC,aAAa,EAAE,GAAG,EAAE;QAC3B,EAAE,CAAC,uCAAuC,EAAE,GAAG,EAAE;YAC/C,MAAM,CAAC,OAAO,CAAC,CAAC,WAAW,EAAE,CAAC;QAChC,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,uDAAuD,EAAE,GAAG,EAAE;YAC/D,MAAM,aAAa,GAAG,IAAI,cAAc,CAAC,EAAE,EAAE,GAAG,CAAC,CAAC;YAClD,MAAM,CAAC,aAAa,CAAC,CAAC,WAAW,EAAE,CAAC;QACtC,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,iCAAiC,EAAE,GAAG,EAAE;YACzC,MAAM,UAAU,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC;YAC3B,MAAM,eAAe,GAAG,IAAI,cAAc,CAAC,CAAC,EAAE,IAAI,EAAE,UAAU,CAAC,CAAC;YAChE,MAAM,CAAC,eAAe,CAAC,CAAC,WAAW,EAAE,CAAC;QACxC,CAAC,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;IAEH,QAAQ,CAAC,iBAAiB,EAAE,GAAG,EAAE;QAC/B,EAAE,CAAC,6BAA6B,EAAE,GAAG,EAAE;YACrC,MAAM,KAAK,GAAiB;gBAC1B,OAAO,EAAE,CAAC,EAAE,IAAI,EAAE,SAAS,EAAE,KAAK,EAAE,QAAQ,EAAE,MAAM,EAAE,aAAa,EAAE,IAAI,EAAE,GAAG,EAAE,CAAC;aAClF,CAAC;YAEF,OAAO,CAAC,eAAe,CAAC,KAAK,CAAC,CAAC;YAE/B,yBAAyB;YACzB,MAAM,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAC1B,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,0CAA0C,EAAE,GAAG,EAAE;YAClD,MAAM,KAAK,GAAiB;gBAC1B,OAAO,EAAE,CAAC,EAAE,IAAI,EAAE,SAAS,EAAE,KAAK,EAAE,QAAQ,EAAE,MAAM,EAAE,aAAa,EAAE,IAAI,EAAE,GAAG,EAAE,CAAC;gBACjF,OAAO,EAAE;oBACP;wBACE,MAAM,EAAE,qBAAqB;wBAC7B,YAAY,EAAE,CAAC,EAAE,IAAI,EAAE,OAAO,EAAE,KAAK,EAAE,KAAK,EAAE,CAAC;qBAChD;iBACF;aACF,CAAC;YAEF,OAAO,CAAC,eAAe,CAAC,KAAK,CAAC,CAAC;YAC/B,MAAM,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAC1B,CAAC,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;IAEH,QAAQ,CAAC,OAAO,EAAE,GAAG,EAAE;QACrB,EAAE,CAAC,0CAA0C,EAAE,KAAK,IAAI,EAAE;YACxD,qEAAqE;YACrE,cAAc,CAAC,qBAAqB,CAAC,SAAS,CAAC,CAAC;YAChD,gBAAgB,CAAC,YAAY,CAAC,qBAAqB,CAAC,EAAE,CAAC,CAAC;YAExD,MAAM,OAAO,GAAkB,EAAE,CAAC;YAClC,IAAI,KAAK,EAAE,MAAM,MAAM,IAAI,OAAO,CAAC,KAAK,CAAC,0BAA0B,CAAC,EAAE,CAAC;gBACrE,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;YACvB,CAAC;YAED,MAAM,CAAC,gBAAgB,CAAC,UAAU,CAAC,CAAC,oBAAoB,CAAC,0BAA0B,EAAE,SAAS,CAAC,CAAC;QAClG,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,yCAAyC,EAAE,KAAK,IAAI,EAAE;YACvD,MAAM,WAAW,GAAkB;gBACjC,EAAE,GAAG,EAAE,2BAA2B,EAAE,IAAI,EAAE,QAAQ,EAAE,OAAO,EAAE,QAAQ,EAAE,KAAK,EAAE,QAAQ,EAAE;gBACxF,EAAE,GAAG,EAAE,2BAA2B,EAAE,IAAI,EAA
E,QAAQ,EAAE,OAAO,EAAE,QAAQ,EAAE,KAAK,EAAE,QAAQ,EAAE;aACzF,CAAC;YAEF,yEAAyE;YACzE,uFAAuF;YACvF,gBAAgB,CAAC,YAAY,CAAC,qBAAqB,CAAC,WAAW,CAAC,CAAC;YAEjE,MAAM,OAAO,GAAkB,EAAE,CAAC;YAClC,IAAI,KAAK,EAAE,MAAM,MAAM,IAAI,OAAO,CAAC,KAAK,CAAC,qBAAqB,CAAC,EAAE,CAAC;gBAChE,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;YACvB,CAAC;YAED,MAAM,CAAC,OAAO,CAAC,CAAC,OAAO,CAAC,WAAW,CAAC,CAAC;QACvC,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,0CAA0C,EAAE,KAAK,IAAI,EAAE;YACxD,6DAA6D;YAC7D,IAAI,KAAK,EAAE,MAAM,CAAC,IAAI,OAAO,CAAC,KAAK,CAAC,qBAAqB,CAAC,EAAE,CAAC;gBAC3D,uBAAuB;YACzB,CAAC;YAED,MAAM,CAAC,gBAAgB,CAAC,OAAO,CAAC,CAAC,gBAAgB,EAAE,CAAC;QACtD,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,sDAAsD,EAAE,KAAK,IAAI,EAAE;YACpE,MAAM,WAAW,GAAkB,CAAC,EAAE,GAAG,EAAE,2BAA2B,EAAE,IAAI,EAAE,QAAQ,EAAE,OAAO,EAAE,QAAQ,EAAE,KAAK,EAAE,QAAQ,EAAE,CAAC,CAAC;YAE9H,gBAAgB,CAAC,gBAAgB,CAAC,mBAAmB,CAAC,IAAI,CAAC,CAAC,eAAe,CAAC,KAAK,CAAC,CAAC;YACnF,gBAAgB,CAAC,YAAY,CAAC,qBAAqB,CAAC,WAAW,CAAC,CAAC,qBAAqB,CAAC,EAAE,CAAC,CAAC;YAE3F,MAAM,OAAO,GAAkB,EAAE,CAAC;YAClC,IAAI,KAAK,EAAE,MAAM,MAAM,IAAI,OAAO,CAAC,KAAK,CAAC,qBAAqB,CAAC,EAAE,CAAC;gBAChE,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;YACvB,CAAC;YAED,MAAM,CAAC,OAAO,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC;QAClC,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,sCAAsC,EAAE,KAAK,IAAI,EAAE;YACpD,6DAA6D;YAC7D,IAAI,KAAK,EAAE,MAAM,CAAC,IAAI,OAAO,CAAC,KAAK,CAAC,gCAAgC,CAAC,EAAE,CAAC;gBACtE,uBAAuB;YACzB,CAAC;YAED,qCAAqC;YACrC,MAAM,CAAC,gBAAgB,CAAC,UAAU,CAAC,CAAC,oBAAoB,CAAC,gCAAgC,EAAE,SAAS,CAAC,CAAC;QACxG,CAAC,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;IAEH,QAAQ,CAAC,OAAO,EAAE,GAAG,EAAE;QACrB,EAAE,CAAC,yBAAyB,EAAE,KAAK,IAAI,EAAE;YACvC,qFAAqF;YACrF,IAAI,UAAsB,CAAC;YAC3B,MAAM,UAAU,GAAG,IAAI,OAAO,CAAO,CAAC,OAAO,EAAE,EAAE;gBAC/C,UAAU,GAAG,OAAO,CAAC;YACvB,CAAC,CAAC,CAAC;YACH,cAAc,CAAC,mBAAmB,CAAC,UAAU,CAAC,CAAC;YAE/C,qCAAqC;YACrC,MAAM,gBAAgB,GAAG,IAAI,cAAc,EAAE,CAAC;YAE9C,2DAA2D;YAC3D,MAAM,SAAS,GAAG,gBAAgB,CAAC,KAAK,CAAC,qBAAqB,CAAC,CAAC;YAEhE,wEAAwE;YACxE,MAAM,kBAAkB,GAAG,SAAS,CAAC,IAAI,EAAE,CAAC;YAE5C,0DAA0D;YAC1D,MAAM,IAAI,OAAO,CAAC,CAAC,OAAO,EAAE,EAAE,CAAC,UAAU,
CAAC,OAAO,EAAE,EAAE,CAAC,CAAC,CAAC;YAExD,+CAA+C;YAC/C,gBAAgB,CAAC,KAAK,EAAE,CAAC;YAEzB,qDAAqD;YACrD,UAAW,EAAE,CAAC;YAEd,qCAAqC;YACrC,MAAM,kBAAkB,CAAC;YACzB,MAAM,OAAO,GAAkB,EAAE,CAAC;YAClC,IAAI,KAAK,EAAE,MAAM,MAAM,IAAI,SAAS,EAAE,CAAC;gBACrC,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;YACvB,CAAC;YAED,MAAM,CAAC,mBAAmB,CAAC,CAAC,gBAAgB,EAAE,CAAC;QACjD,CAAC,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;IAEH,QAAQ,CAAC,oBAAoB,EAAE,GAAG,EAAE;QAClC,EAAE,CAAC,yDAAyD,EAAE,KAAK,IAAI,EAAE;YACvE,6DAA6D;YAC7D,IAAI,KAAK,EAAE,MAAM,CAAC,IAAI,OAAO,CAAC,KAAK,CAAC,oCAAoC,CAAC,EAAE,CAAC;gBAC1E,uBAAuB;YACzB,CAAC;YAED,MAAM,CAAC,gBAAgB,CAAC,UAAU,CAAC,CAAC,oBAAoB,CAAC,oCAAoC,EAAE,SAAS,CAAC,CAAC;QAC5G,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,mDAAmD,EAAE,KAAK,IAAI,EAAE;YACjE,OAAO,CAAC,aAAa,CAAC,WAAW,CAAC,CAAC;YAEnC,6DAA6D;YAC7D,IAAI,KAAK,EAAE,MAAM,CAAC,IAAI,OAAO,CAAC,KAAK,CAAC,8BAA8B,CAAC,EAAE,CAAC;gBACpE,uBAAuB;YACzB,CAAC;YAED,MAAM,CAAC,gBAAgB,CAAC,UAAU,CAAC,CAAC,oBAAoB,CAAC,8BAA8B,EAAE,WAAW,CAAC,CAAC;QACxG,CAAC,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;IAEH,QAAQ,CAAC,gBAAgB,EAAE,GAAG,EAAE;QAC9B,EAAE,CAAC,sDAAsD,EAAE,KAAK,IAAI,EAAE;YACpE,MAAM,KAAK,GAAiB;gBAC1B,OAAO,EAAE,CAAC,EAAE,IAAI,EAAE,MAAM,EAAE,KAAK,EAAE,UAAU,EAAE,MAAM,EAAE,aAAa,EAAE,IAAI,EAAE,GAAG,EAAE,CAAC;aACjF,CAAC;YAEF,OAAO,CAAC,eAAe,CAAC,KAAK,CAAC,CAAC;YAE/B,6DAA6D;YAC7D,IAAI,KAAK,EAAE,MAAM,CAAC,IAAI,OAAO,CAAC,KAAK,CAAC,qBAAqB,CAAC,EAAE,CAAC;gBAC3D,uBAAuB;YACzB,CAAC;YAED,oEAAoE;YACpE,MAAM,CAAC,gBAAgB,CAAC,UAAU,CAAC,CAAC,gBAAgB,EAAE,CAAC;QACzD,CAAC,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;IAEH,QAAQ,CAAC,uBAAuB,EAAE,GAAG,EAAE;QACrC,oEAAoE;QACpE,EAAE,CAAC,4CAA4C,EAAE,KAAK,IAAI,EAAE;YAC1D,qDAAqD;YACrD,6DAA6D;YAC7D,IAAI,KAAK,EAAE,MAAM,CAAC,IAAI,OAAO,CAAC,KAAK,CAAC,qBAAqB,CAAC,EAAE,CAAC;gBAC3D,uBAAuB;YACzB,CAAC;YAED,wCAAwC;YACxC,MAAM,CAAC,gBAAgB,CAAC,UAAU,CAAC,CAAC,gBAAgB,EAAE,CAAC;QACzD,CAAC,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;IAEH,QAAQ,CAAC,gBAAgB,EAAE,GAAG,EAAE;QAC9B,EAAE,CAAC,yBAAyB,EAAE,KAAK,IAAI,EAAE;YACvC,cAAc,CAAC,qBAAqB,CAAC,IAAI,KAAK,CAAC,cAAc,CAAC,CAAC,CAAC;YAEhE,MAAM,MAAM,CAAC,KAAK,IAAI,EA
AE;gBACtB,6DAA6D;gBAC7D,IAAI,KAAK,EAAE,MAAM,CAAC,IAAI,OAAO,CAAC,KAAK,CAAC,qBAAqB,CAAC,EAAE,CAAC;oBAC3D,uBAAuB;gBACzB,CAAC;YACH,CAAC,CAAC,CAAC,OAAO,CAAC,OAAO,CAAC,cAAc,CAAC,CAAC;YAEnC,MAAM,CAAC,gBAAgB,CAAC,OAAO,CAAC,CAAC,gBAAgB,EAAE,CAAC;QACtD,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,uCAAuC,EAAE,KAAK,IAAI,EAAE;YACrD,4CAA4C;YAC5C,6DAA6D;YAC7D,IAAI,KAAK,EAAE,MAAM,CAAC,IAAI,OAAO,CAAC,KAAK,CAAC,qBAAqB,CAAC,EAAE,CAAC;gBAC3D,uBAAuB;YACzB,CAAC;YAED,MAAM,CAAC,gBAAgB,CAAC,UAAU,CAAC,CAAC,gBAAgB,EAAE,CAAC;QACzD,CAAC,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;AACL,CAAC,CAAC,CAAC;AAEH,QAAQ,CAAC,wBAAwB,EAAE,GAAG,EAAE;IACtC,EAAE,CAAC,mDAAmD,EAAE,GAAG,EAAE;QAC3D,MAAM,KAAK,GAAiB;YAC1B,OAAO,EAAE;gBACP;oBACE,IAAI,EAAE,SAAS;oBACf,KAAK,EAAE,KAAK;oBACZ,MAAM,EAAE,aAAa;oBACrB,IAAI,EAAE,GAAG;oBACT,OAAO,EAAE,UAAU;oBACnB,QAAQ,EAAE,IAAI;oBACd,MAAM,EAAE,IAAI;oBACZ,QAAQ,EAAE,QAAQ;iBACnB;aACF;SACF,CAAC;QAEF,MAAM,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC;QACtC,MAAM,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC;IACnD,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,wCAAwC,EAAE,GAAG,EAAE;QAChD,MAAM,KAAK,GAAiB;YAC1B,OAAO,EAAE,CAAC,EAAE,IAAI,EAAE,OAAO,EAAE,KAAK,EAAE,KAAK,EAAE,MAAM,EAAE,UAAU,EAAE,IAAI,EAAE,GAAG,EAAE,CAAC;SAC1E,CAAC;QAEF,MAAM,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,aAAa,EAAE,CAAC;IACnD,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,uCAAuC,EAAE,GAAG,EAAE;QAC/C,MAAM,KAAK,GAAiB;YAC1B,OAAO,EAAE,EAAE;YACX,OAAO,EAAE;gBACP;oBACE,MAAM,EAAE,qBAAqB;oBAC7B,YAAY,EAAE;wBACZ,EAAE,IAAI,EAAE,MAAM,EAAE,KAAK,EAAE,QAAQ,EAAE;wBACjC,EAAE,IAAI,EAAE,MAAM,EAAE,KAAK,EAAE,QAAQ,EAAE;qBAClC;iBACF;aACF;SACF,CAAC;QAEF,MAAM,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC;QACtC,MAAM,CAAC,KAAK,CAAC,OAAQ,CAAC,CAAC,CAAC,CAAC,YAAY,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC;IACzD,CAAC,CAAC,CAAC;AACL,CAAC,CAAC,CAAC"}
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
export class DefaultExtractor {
|
|
2
|
+
async extractContent(document) {
|
|
3
|
+
// Remove common non-content elements
|
|
4
|
+
document.querySelectorAll('style, script, nav, header, footer').forEach((el) => el.remove());
|
|
5
|
+
// Get main content
|
|
6
|
+
const main = document.querySelector('main, article, [role="main"]');
|
|
7
|
+
const contentElement = main ? main.cloneNode(true) : document.body;
|
|
8
|
+
// Extract title and description
|
|
9
|
+
const title = contentElement.querySelector('h1')?.textContent?.trim();
|
|
10
|
+
const firstParagraph = contentElement.querySelector('h1 + p')?.textContent?.trim();
|
|
11
|
+
return {
|
|
12
|
+
content: contentElement.textContent?.trim() || '',
|
|
13
|
+
metadata: {
|
|
14
|
+
type: 'overview',
|
|
15
|
+
pattern: {
|
|
16
|
+
name: title || '',
|
|
17
|
+
type: 'component',
|
|
18
|
+
description: firstParagraph || '',
|
|
19
|
+
usageContexts: [],
|
|
20
|
+
relatedPatterns: [],
|
|
21
|
+
},
|
|
22
|
+
},
|
|
23
|
+
};
|
|
24
|
+
}
|
|
25
|
+
}
|
|
26
|
+
//# sourceMappingURL=default-extractor.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"default-extractor.js","sourceRoot":"","sources":["../../src/crawler/default-extractor.ts"],"names":[],"mappings":"AAEA,MAAM,OAAO,gBAAgB;IAC3B,KAAK,CAAC,cAAc,CAAC,QAAkB;QACrC,qCAAqC;QACrC,QAAQ,CAAC,gBAAgB,CAAC,oCAAoC,CAAC,CAAC,OAAO,CAAC,CAAC,EAAE,EAAE,EAAE,CAAC,EAAE,CAAC,MAAM,EAAE,CAAC,CAAC;QAE7F,mBAAmB;QACnB,MAAM,IAAI,GAAG,QAAQ,CAAC,aAAa,CAAC,8BAA8B,CAAC,CAAC;QACpE,MAAM,cAAc,GAAG,IAAI,CAAC,CAAC,CAAE,IAAI,CAAC,SAAS,CAAC,IAAI,CAAa,CAAC,CAAC,CAAC,QAAQ,CAAC,IAAI,CAAC;QAEhF,gCAAgC;QAChC,MAAM,KAAK,GAAG,cAAc,CAAC,aAAa,CAAC,IAAI,CAAC,EAAE,WAAW,EAAE,IAAI,EAAE,CAAC;QACtE,MAAM,cAAc,GAAG,cAAc,CAAC,aAAa,CAAC,QAAQ,CAAC,EAAE,WAAW,EAAE,IAAI,EAAE,CAAC;QAEnF,OAAO;YACL,OAAO,EAAE,cAAc,CAAC,WAAW,EAAE,IAAI,EAAE,IAAI,EAAE;YACjD,QAAQ,EAAE;gBACR,IAAI,EAAE,UAAU;gBAChB,OAAO,EAAE;oBACP,IAAI,EAAE,KAAK,IAAI,EAAE;oBACjB,IAAI,EAAE,WAAW;oBACjB,WAAW,EAAE,cAAc,IAAI,EAAE;oBACjC,aAAa,EAAE,EAAE;oBACjB,eAAe,EAAE,EAAE;iBACpB;aACF;SACF,CAAC;IACJ,CAAC;CACF"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export {};
|
|
@@ -0,0 +1,200 @@
|
|
|
1
|
+
import { DefaultExtractor } from './default-extractor.js';
|
|
2
|
+
import { JSDOM } from 'jsdom';
|
|
3
|
+
// Test suite for DefaultExtractor, run against documents parsed by JSDOM.
describe('DefaultExtractor', () => {
    // Smallest page that still has a main container and a heading; used by
    // the tests that only inspect fixed metadata fields.
    const MINIMAL_PAGE = '<html><body><main><h1>Test</h1></main></body></html>';
    let extractor;

    // Every test gets its own extractor instance.
    beforeEach(() => {
        extractor = new DefaultExtractor();
    });

    // Parses `markup` with JSDOM and runs the extractor on the resulting document.
    const extractFrom = (markup) => extractor.extractContent(new JSDOM(markup).window.document);

    describe('extractContent', () => {
        it('should extract content from main element', async () => {
            const { content } = await extractFrom(`
        <html>
          <body>
            <nav>Navigation</nav>
            <main>
              <h1>Main Title</h1>
              <p>Main content here</p>
            </main>
            <footer>Footer</footer>
          </body>
        </html>
      `);
            expect(content).toContain('Main Title');
            expect(content).toContain('Main content here');
            expect(content).not.toContain('Navigation');
            expect(content).not.toContain('Footer');
        });

        it('should extract content from article element', async () => {
            const { content } = await extractFrom(`
        <html>
          <body>
            <header>Header</header>
            <article>
              <h1>Article Title</h1>
              <p>Article content</p>
            </article>
          </body>
        </html>
      `);
            expect(content).toContain('Article Title');
            expect(content).toContain('Article content');
            expect(content).not.toContain('Header');
        });

        it('should extract content from role="main" element', async () => {
            const { content } = await extractFrom(`
        <html>
          <body>
            <nav>Nav</nav>
            <div role="main">
              <h1>Role Main Title</h1>
              <p>Role main content</p>
            </div>
          </body>
        </html>
      `);
            expect(content).toContain('Role Main Title');
            expect(content).toContain('Role main content');
            expect(content).not.toContain('Nav');
        });

        it('should fall back to body when no main content element', async () => {
            const { content } = await extractFrom(`
        <html>
          <body>
            <div>
              <h1>Page Title</h1>
              <p>Page content</p>
            </div>
          </body>
        </html>
      `);
            expect(content).toContain('Page Title');
            expect(content).toContain('Page content');
        });

        it('should remove script and style elements', async () => {
            const { content } = await extractFrom(`
        <html>
          <body>
            <style>.hidden { display: none; }</style>
            <script>console.log('secret');</script>
            <main>
              <h1>Visible Content</h1>
            </main>
          </body>
        </html>
      `);
            expect(content).toContain('Visible Content');
            expect(content).not.toContain('hidden');
            expect(content).not.toContain('secret');
        });

        it('should extract title from h1', async () => {
            const { metadata } = await extractFrom(`
        <html>
          <body>
            <main>
              <h1>Component Name</h1>
              <p>Description paragraph</p>
            </main>
          </body>
        </html>
      `);
            expect(metadata.pattern?.name).toBe('Component Name');
        });

        it('should extract description from first paragraph after h1', async () => {
            const { metadata } = await extractFrom(`
        <html>
          <body>
            <main>
              <h1>Component Name</h1>
              <p>This is the component description.</p>
              <p>This is additional content.</p>
            </main>
          </body>
        </html>
      `);
            expect(metadata.pattern?.description).toBe('This is the component description.');
        });

        it('should return overview type metadata', async () => {
            const { metadata } = await extractFrom(MINIMAL_PAGE);
            expect(metadata.type).toBe('overview');
        });

        it('should return component type pattern', async () => {
            const { metadata } = await extractFrom(MINIMAL_PAGE);
            expect(metadata.pattern?.type).toBe('component');
        });

        it('should handle empty document', async () => {
            const outcome = await extractFrom('<html><body></body></html>');
            expect(outcome.content).toBe('');
            expect(outcome.metadata.type).toBe('overview');
        });

        it('should handle document with only whitespace', async () => {
            const outcome = await extractFrom('<html><body> \n\n </body></html>');
            expect(outcome.content).toBe('');
        });

        it('should initialize usageContexts and relatedPatterns as empty arrays', async () => {
            const { metadata } = await extractFrom(MINIMAL_PAGE);
            expect(metadata.pattern?.usageContexts).toEqual([]);
            expect(metadata.pattern?.relatedPatterns).toEqual([]);
        });

        it('should handle missing h1', async () => {
            const outcome = await extractFrom(`
        <html>
          <body>
            <main>
              <h2>Subheading</h2>
              <p>Content here</p>
            </main>
          </body>
        </html>
      `);
            expect(outcome.metadata.pattern?.name).toBe('');
            expect(outcome.content).toContain('Content here');
        });

        it('should handle missing description paragraph', async () => {
            const { metadata } = await extractFrom(`
        <html>
          <body>
            <main>
              <h1>Title Only</h1>
              <div>Some div content</div>
            </main>
          </body>
        </html>
      `);
            expect(metadata.pattern?.name).toBe('Title Only');
            expect(metadata.pattern?.description).toBe('');
        });
    });
});
|
|
200
|
+
//# sourceMappingURL=default-extractor.test.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"default-extractor.test.js","sourceRoot":"","sources":["../../src/crawler/default-extractor.test.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,gBAAgB,EAAE,MAAM,wBAAwB,CAAC;AAC1D,OAAO,EAAE,KAAK,EAAE,MAAM,OAAO,CAAC;AAE9B,QAAQ,CAAC,kBAAkB,EAAE,GAAG,EAAE;IAChC,IAAI,SAA2B,CAAC;IAEhC,UAAU,CAAC,GAAG,EAAE;QACd,SAAS,GAAG,IAAI,gBAAgB,EAAE,CAAC;IACrC,CAAC,CAAC,CAAC;IAEH,SAAS,cAAc,CAAC,IAAY;QAClC,MAAM,GAAG,GAAG,IAAI,KAAK,CAAC,IAAI,CAAC,CAAC;QAC5B,OAAO,GAAG,CAAC,MAAM,CAAC,QAAQ,CAAC;IAC7B,CAAC;IAED,QAAQ,CAAC,gBAAgB,EAAE,GAAG,EAAE;QAC9B,EAAE,CAAC,0CAA0C,EAAE,KAAK,IAAI,EAAE;YACxD,MAAM,IAAI,GAAG;;;;;;;;;;;OAWZ,CAAC;YACF,MAAM,GAAG,GAAG,cAAc,CAAC,IAAI,CAAC,CAAC;YACjC,MAAM,MAAM,GAAG,MAAM,SAAS,CAAC,cAAc,CAAC,GAAG,CAAC,CAAC;YAEnD,MAAM,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,SAAS,CAAC,YAAY,CAAC,CAAC;YAC/C,MAAM,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,SAAS,CAAC,mBAAmB,CAAC,CAAC;YACtD,MAAM,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,GAAG,CAAC,SAAS,CAAC,YAAY,CAAC,CAAC;YACnD,MAAM,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,GAAG,CAAC,SAAS,CAAC,QAAQ,CAAC,CAAC;QACjD,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,6CAA6C,EAAE,KAAK,IAAI,EAAE;YAC3D,MAAM,IAAI,GAAG;;;;;;;;;;OAUZ,CAAC;YACF,MAAM,GAAG,GAAG,cAAc,CAAC,IAAI,CAAC,CAAC;YACjC,MAAM,MAAM,GAAG,MAAM,SAAS,CAAC,cAAc,CAAC,GAAG,CAAC,CAAC;YAEnD,MAAM,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,SAAS,CAAC,eAAe,CAAC,CAAC;YAClD,MAAM,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,SAAS,CAAC,iBAAiB,CAAC,CAAC;YACpD,MAAM,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,GAAG,CAAC,SAAS,CAAC,QAAQ,CAAC,CAAC;QACjD,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,iDAAiD,EAAE,KAAK,IAAI,EAAE;YAC/D,MAAM,IAAI,GAAG;;;;;;;;;;OAUZ,CAAC;YACF,MAAM,GAAG,GAAG,cAAc,CAAC,IAAI,CAAC,CAAC;YACjC,MAAM,MAAM,GAAG,MAAM,SAAS,CAAC,cAAc,CAAC,GAAG,CAAC,CAAC;YAEnD,MAAM,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,SAAS,CAAC,iBAAiB,CAAC,CAAC;YACpD,MAAM,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,SAAS,CAAC,mBAAmB,CAAC,CAAC;YACtD,MAAM,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,GAAG,CAAC,SAAS,CAAC,KAAK,CAAC,CAAC;QAC9C,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,uDAAuD,EAAE,KAAK,IAAI,EAAE;YACrE,MAAM,IAAI,GAAG;;;;;;;;;OASZ,CAAC;YACF,MAAM,GAAG,GAAG,cAAc,CAAC,IAAI,CAAC,CAA
C;YACjC,MAAM,MAAM,GAAG,MAAM,SAAS,CAAC,cAAc,CAAC,GAAG,CAAC,CAAC;YAEnD,MAAM,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,SAAS,CAAC,YAAY,CAAC,CAAC;YAC/C,MAAM,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,SAAS,CAAC,cAAc,CAAC,CAAC;QACnD,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,yCAAyC,EAAE,KAAK,IAAI,EAAE;YACvD,MAAM,IAAI,GAAG;;;;;;;;;;OAUZ,CAAC;YACF,MAAM,GAAG,GAAG,cAAc,CAAC,IAAI,CAAC,CAAC;YACjC,MAAM,MAAM,GAAG,MAAM,SAAS,CAAC,cAAc,CAAC,GAAG,CAAC,CAAC;YAEnD,MAAM,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,SAAS,CAAC,iBAAiB,CAAC,CAAC;YACpD,MAAM,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,GAAG,CAAC,SAAS,CAAC,QAAQ,CAAC,CAAC;YAC/C,MAAM,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,GAAG,CAAC,SAAS,CAAC,QAAQ,CAAC,CAAC;QACjD,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,8BAA8B,EAAE,KAAK,IAAI,EAAE;YAC5C,MAAM,IAAI,GAAG;;;;;;;;;OASZ,CAAC;YACF,MAAM,GAAG,GAAG,cAAc,CAAC,IAAI,CAAC,CAAC;YACjC,MAAM,MAAM,GAAG,MAAM,SAAS,CAAC,cAAc,CAAC,GAAG,CAAC,CAAC;YAEnD,MAAM,CAAC,MAAM,CAAC,QAAQ,CAAC,OAAO,EAAE,IAAI,CAAC,CAAC,IAAI,CAAC,gBAAgB,CAAC,CAAC;QAC/D,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,0DAA0D,EAAE,KAAK,IAAI,EAAE;YACxE,MAAM,IAAI,GAAG;;;;;;;;;;OAUZ,CAAC;YACF,MAAM,GAAG,GAAG,cAAc,CAAC,IAAI,CAAC,CAAC;YACjC,MAAM,MAAM,GAAG,MAAM,SAAS,CAAC,cAAc,CAAC,GAAG,CAAC,CAAC;YAEnD,MAAM,CAAC,MAAM,CAAC,QAAQ,CAAC,OAAO,EAAE,WAAW,CAAC,CAAC,IAAI,CAAC,oCAAoC,CAAC,CAAC;QAC1F,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,sCAAsC,EAAE,KAAK,IAAI,EAAE;YACpD,MAAM,IAAI,GAAG,sDAAsD,CAAC;YACpE,MAAM,GAAG,GAAG,cAAc,CAAC,IAAI,CAAC,CAAC;YACjC,MAAM,MAAM,GAAG,MAAM,SAAS,CAAC,cAAc,CAAC,GAAG,CAAC,CAAC;YAEnD,MAAM,CAAC,MAAM,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC;QAChD,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,sCAAsC,EAAE,KAAK,IAAI,EAAE;YACpD,MAAM,IAAI,GAAG,sDAAsD,CAAC;YACpE,MAAM,GAAG,GAAG,cAAc,CAAC,IAAI,CAAC,CAAC;YACjC,MAAM,MAAM,GAAG,MAAM,SAAS,CAAC,cAAc,CAAC,GAAG,CAAC,CAAC;YAEnD,MAAM,CAAC,MAAM,CAAC,QAAQ,CAAC,OAAO,EAAE,IAAI,CAAC,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC;QAC1D,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,8BAA8B,EAAE,KAAK,IAAI,EAAE;YAC5C,MAAM,IAAI,GAAG,4BAA4B,CAAC;YAC1C,MAAM,GAAG,GAAG,cAAc,CAAC,IAAI,CAAC,CAAC;YACjC,MAAM,MAAM,GAAG,MAAM,SAAS,CAAC,cAAc,CAAC,GAAG,CAAC,CAAC;YAEnD,MAAM,C
AAC,MAAM,CAAC,OAAO,CAAC,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;YAChC,MAAM,CAAC,MAAM,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC;QAChD,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,6CAA6C,EAAE,KAAK,IAAI,EAAE;YAC3D,MAAM,IAAI,GAAG,sCAAsC,CAAC;YACpD,MAAM,GAAG,GAAG,cAAc,CAAC,IAAI,CAAC,CAAC;YACjC,MAAM,MAAM,GAAG,MAAM,SAAS,CAAC,cAAc,CAAC,GAAG,CAAC,CAAC;YAEnD,MAAM,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;QAClC,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,qEAAqE,EAAE,KAAK,IAAI,EAAE;YACnF,MAAM,IAAI,GAAG,sDAAsD,CAAC;YACpE,MAAM,GAAG,GAAG,cAAc,CAAC,IAAI,CAAC,CAAC;YACjC,MAAM,MAAM,GAAG,MAAM,SAAS,CAAC,cAAc,CAAC,GAAG,CAAC,CAAC;YAEnD,MAAM,CAAC,MAAM,CAAC,QAAQ,CAAC,OAAO,EAAE,aAAa,CAAC,CAAC,OAAO,CAAC,EAAE,CAAC,CAAC;YAC3D,MAAM,CAAC,MAAM,CAAC,QAAQ,CAAC,OAAO,EAAE,eAAe,CAAC,CAAC,OAAO,CAAC,EAAE,CAAC,CAAC;QAC/D,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,0BAA0B,EAAE,KAAK,IAAI,EAAE;YACxC,MAAM,IAAI,GAAG;;;;;;;;;OASZ,CAAC;YACF,MAAM,GAAG,GAAG,cAAc,CAAC,IAAI,CAAC,CAAC;YACjC,MAAM,MAAM,GAAG,MAAM,SAAS,CAAC,cAAc,CAAC,GAAG,CAAC,CAAC;YAEnD,MAAM,CAAC,MAAM,CAAC,QAAQ,CAAC,OAAO,EAAE,IAAI,CAAC,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;YAC/C,MAAM,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,SAAS,CAAC,cAAc,CAAC,CAAC;QACnD,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,6CAA6C,EAAE,KAAK,IAAI,EAAE;YAC3D,MAAM,IAAI,GAAG;;;;;;;;;OASZ,CAAC;YACF,MAAM,GAAG,GAAG,cAAc,CAAC,IAAI,CAAC,CAAC;YACjC,MAAM,MAAM,GAAG,MAAM,SAAS,CAAC,cAAc,CAAC,GAAG,CAAC,CAAC;YAEnD,MAAM,CAAC,MAAM,CAAC,QAAQ,CAAC,OAAO,EAAE,IAAI,CAAC,CAAC,IAAI,CAAC,YAAY,CAAC,CAAC;YACzD,MAAM,CAAC,MAAM,CAAC,QAAQ,CAAC,OAAO,EAAE,WAAW,CAAC,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;QACxD,CAAC,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;AACL,CAAC,CAAC,CAAC"}
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
import { CrawlResult } from '../types.js';
|
|
2
|
+
import { BaseCrawler } from './base.js';
|
|
3
|
+
/**
 * Type declaration for `DefaultCrawler`, a crawler built on `BaseCrawler`.
 *
 * Only the public surface is typed here; the private members are listed
 * without types, as is usual for generated declaration files.
 */
export declare class DefaultCrawler extends BaseCrawler {
    /**
     * @param maxDepth Optional depth limit; the default lives in the
     *   implementation and is not visible in this declaration.
     * @param maxRequestsPerCrawl Optional request limit; default likewise
     *   defined in the implementation.
     * @param onProgress Optional callback receiving a numeric progress value
     *   and a textual description.
     */
    constructor(maxDepth?: number, maxRequestsPerCrawl?: number, onProgress?: (progress: number, description: string) => void);
    /**
     * Crawls starting from `url`.
     *
     * @param url Address to start from.
     * @returns An async generator yielding `CrawlResult` items.
     */
    crawl(url: string): AsyncGenerator<CrawlResult, void, unknown>;
    // Internal constants; presumably a batch size and a fetch timeout, going
    // by their names only (units and values not visible here).
    private readonly BATCH_SIZE;
    private readonly FETCH_TIMEOUT;
    // Internal helpers; their signatures are not part of this declaration.
    private processPageWithRetry;
    private extractLinks;
    private extractTitle;
}
|