@houtini/seo-crawler-mcp 2.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.github/workflows/ci.yml +59 -0
- package/LICENSE +190 -0
- package/NOTICE +8 -0
- package/README.md +694 -0
- package/build/analyzers/QueryLoader.d.ts +30 -0
- package/build/analyzers/QueryLoader.d.ts.map +1 -0
- package/build/analyzers/QueryLoader.js +126 -0
- package/build/analyzers/QueryLoader.js.map +1 -0
- package/build/cli.d.ts +3 -0
- package/build/cli.d.ts.map +1 -0
- package/build/cli.js +190 -0
- package/build/cli.js.map +1 -0
- package/build/core/ContentExtractor.d.ts +30 -0
- package/build/core/ContentExtractor.d.ts.map +1 -0
- package/build/core/ContentExtractor.js +362 -0
- package/build/core/ContentExtractor.js.map +1 -0
- package/build/core/CrawlDatabase.d.ts +25 -0
- package/build/core/CrawlDatabase.d.ts.map +1 -0
- package/build/core/CrawlDatabase.js +603 -0
- package/build/core/CrawlDatabase.js.map +1 -0
- package/build/core/CrawlOrchestrator.d.ts +27 -0
- package/build/core/CrawlOrchestrator.d.ts.map +1 -0
- package/build/core/CrawlOrchestrator.js +279 -0
- package/build/core/CrawlOrchestrator.js.map +1 -0
- package/build/core/CrawlStorage.d.ts +33 -0
- package/build/core/CrawlStorage.d.ts.map +1 -0
- package/build/core/CrawlStorage.js +94 -0
- package/build/core/CrawlStorage.js.map +1 -0
- package/build/core/LinkExtractor.d.ts +14 -0
- package/build/core/LinkExtractor.d.ts.map +1 -0
- package/build/core/LinkExtractor.js +91 -0
- package/build/core/LinkExtractor.js.map +1 -0
- package/build/core/UrlManager.d.ts +21 -0
- package/build/core/UrlManager.d.ts.map +1 -0
- package/build/core/UrlManager.js +87 -0
- package/build/core/UrlManager.js.map +1 -0
- package/build/formatters/structured-report-format.d.ts +48 -0
- package/build/formatters/structured-report-format.d.ts.map +1 -0
- package/build/formatters/structured-report-format.js +145 -0
- package/build/formatters/structured-report-format.js.map +1 -0
- package/build/index.d.ts +3 -0
- package/build/index.d.ts.map +1 -0
- package/build/index.js +214 -0
- package/build/index.js.map +1 -0
- package/build/schema/index.d.ts +627 -0
- package/build/schema/index.d.ts.map +1 -0
- package/build/schema/index.js +159 -0
- package/build/schema/index.js.map +1 -0
- package/build/tools/analyze-seo.d.ts +44 -0
- package/build/tools/analyze-seo.d.ts.map +1 -0
- package/build/tools/analyze-seo.js +110 -0
- package/build/tools/analyze-seo.js.map +1 -0
- package/build/tools/list-queries.d.ts +28 -0
- package/build/tools/list-queries.d.ts.map +1 -0
- package/build/tools/list-queries.js +30 -0
- package/build/tools/list-queries.js.map +1 -0
- package/build/tools/query-seo-data.d.ts +15 -0
- package/build/tools/query-seo-data.d.ts.map +1 -0
- package/build/tools/query-seo-data.js +43 -0
- package/build/tools/query-seo-data.js.map +1 -0
- package/build/tools/run-seo-audit.d.ts +3 -0
- package/build/tools/run-seo-audit.d.ts.map +1 -0
- package/build/tools/run-seo-audit.js +54 -0
- package/build/tools/run-seo-audit.js.map +1 -0
- package/build/types/index.d.ts +158 -0
- package/build/types/index.d.ts.map +1 -0
- package/build/types/index.js +2 -0
- package/build/types/index.js.map +1 -0
- package/build/utils/debug.d.ts +2 -0
- package/build/utils/debug.d.ts.map +1 -0
- package/build/utils/debug.js +7 -0
- package/build/utils/debug.js.map +1 -0
- package/package.json +49 -0
- package/server.json +31 -0
- package/src/analyzers/QueryLoader.ts +175 -0
- package/src/analyzers/queries/README.md +228 -0
- package/src/analyzers/queries/content/duplicate-h1.sql +18 -0
- package/src/analyzers/queries/content/duplicate-meta-descriptions.sql +18 -0
- package/src/analyzers/queries/content/duplicate-titles.sql +19 -0
- package/src/analyzers/queries/content/missing-h1.sql +18 -0
- package/src/analyzers/queries/content/missing-meta-descriptions.sql +19 -0
- package/src/analyzers/queries/content/multiple-h1.sql +17 -0
- package/src/analyzers/queries/content/thin-content.sql +18 -0
- package/src/analyzers/queries/critical/404-errors.sql +14 -0
- package/src/analyzers/queries/critical/broken-internal-links.sql +20 -0
- package/src/analyzers/queries/critical/missing-titles.sql +17 -0
- package/src/analyzers/queries/critical/server-errors.sql +15 -0
- package/src/analyzers/queries/opportunities/high-external-links.sql +18 -0
- package/src/analyzers/queries/opportunities/meta-description-length.sql +27 -0
- package/src/analyzers/queries/opportunities/missing-images.sql +18 -0
- package/src/analyzers/queries/opportunities/no-outbound-links.sql +18 -0
- package/src/analyzers/queries/opportunities/title-equals-h1.sql +21 -0
- package/src/analyzers/queries/opportunities/title-length.sql +27 -0
- package/src/analyzers/queries/security/missing-csp.sql +16 -0
- package/src/analyzers/queries/security/missing-hsts.sql +17 -0
- package/src/analyzers/queries/security/missing-referrer-policy.sql +16 -0
- package/src/analyzers/queries/security/missing-x-frame-options.sql +16 -0
- package/src/analyzers/queries/security/protocol-relative-links.sql +16 -0
- package/src/analyzers/queries/security/unsafe-external-links.sql +17 -0
- package/src/analyzers/queries/technical/canonical-issues.sql +20 -0
- package/src/analyzers/queries/technical/heading-hierarchy-issues.sql +19 -0
- package/src/analyzers/queries/technical/non-https.sql +16 -0
- package/src/analyzers/queries/technical/orphan-pages.sql +21 -0
- package/src/analyzers/queries/technical/redirects.sql +15 -0
- package/src/cli.ts +224 -0
- package/src/core/ContentExtractor.ts +480 -0
- package/src/core/CrawlDatabase.ts +736 -0
- package/src/core/CrawlOrchestrator.ts +346 -0
- package/src/core/CrawlStorage.ts +148 -0
- package/src/core/LinkExtractor.ts +123 -0
- package/src/core/UrlManager.ts +114 -0
- package/src/formatters/structured-report-format.ts +254 -0
- package/src/index.ts +259 -0
- package/src/schema/index.ts +176 -0
- package/src/tools/analyze-seo.ts +184 -0
- package/src/tools/list-queries.ts +70 -0
- package/src/tools/query-seo-data.ts +77 -0
- package/src/tools/run-seo-audit.ts +83 -0
- package/src/types/index.ts +179 -0
- package/src/utils/debug.ts +12 -0
- package/tsconfig.json +26 -0
|
@@ -0,0 +1,279 @@
|
|
|
1
|
+
import { HttpCrawler, RequestQueue, Configuration, log } from 'crawlee';
|
|
2
|
+
import { MemoryStorage } from '@crawlee/memory-storage';
|
|
3
|
+
import { load } from 'cheerio';
|
|
4
|
+
import { debug } from '../utils/debug.js';
|
|
5
|
+
export class CrawlOrchestrator {
|
|
6
|
+
config;
|
|
7
|
+
urlManager;
|
|
8
|
+
contentExtractor;
|
|
9
|
+
linkExtractor;
|
|
10
|
+
storage;
|
|
11
|
+
crawler;
|
|
12
|
+
metadata;
|
|
13
|
+
linkBuffer = [];
|
|
14
|
+
LINK_BUFFER_SIZE = 100;
|
|
15
|
+
memoryStorage;
|
|
16
|
+
constructor(config, urlManager, contentExtractor, linkExtractor, storage) {
|
|
17
|
+
this.config = config;
|
|
18
|
+
this.urlManager = urlManager;
|
|
19
|
+
this.contentExtractor = contentExtractor;
|
|
20
|
+
this.linkExtractor = linkExtractor;
|
|
21
|
+
this.storage = storage;
|
|
22
|
+
log.setLevel(log.LEVELS.OFF);
|
|
23
|
+
this.metadata = this.createInitialMetadata();
|
|
24
|
+
}
|
|
25
|
+
createInitialMetadata() {
|
|
26
|
+
return {
|
|
27
|
+
crawlId: this.config.crawlId,
|
|
28
|
+
status: 'queued',
|
|
29
|
+
startedAt: null,
|
|
30
|
+
completedAt: null,
|
|
31
|
+
duration: null,
|
|
32
|
+
stats: {
|
|
33
|
+
discovered: 0,
|
|
34
|
+
crawled: 0,
|
|
35
|
+
failed: 0,
|
|
36
|
+
skipped: 0,
|
|
37
|
+
depth: 0,
|
|
38
|
+
speed: 0
|
|
39
|
+
},
|
|
40
|
+
errors: []
|
|
41
|
+
};
|
|
42
|
+
}
|
|
43
|
+
async initializeCrawler() {
|
|
44
|
+
const storageDir = `./crawlee-storage-${this.config.crawlId}`;
|
|
45
|
+
this.memoryStorage = new MemoryStorage({ localDataDirectory: storageDir });
|
|
46
|
+
const configuration = new Configuration({
|
|
47
|
+
storageClient: this.memoryStorage,
|
|
48
|
+
persistStorage: false,
|
|
49
|
+
});
|
|
50
|
+
const requestQueue = await RequestQueue.open(undefined, { config: configuration });
|
|
51
|
+
const userAgents = {
|
|
52
|
+
chrome: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
|
|
53
|
+
googlebot: 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'
|
|
54
|
+
};
|
|
55
|
+
const userAgent = userAgents[this.config.userAgent] || userAgents.chrome;
|
|
56
|
+
const crawlerConfig = {
|
|
57
|
+
maxRequestsPerCrawl: this.config.maxPages,
|
|
58
|
+
maxConcurrency: 20,
|
|
59
|
+
minConcurrency: 5,
|
|
60
|
+
maxRequestRetries: 5,
|
|
61
|
+
requestHandlerTimeoutSecs: this.config.timeout / 1000,
|
|
62
|
+
navigationTimeoutSecs: 30,
|
|
63
|
+
additionalMimeTypes: ['text/html', 'application/xhtml+xml'],
|
|
64
|
+
requestQueue,
|
|
65
|
+
preNavigationHooks: [
|
|
66
|
+
async ({ request }) => {
|
|
67
|
+
request.headers = {
|
|
68
|
+
'User-Agent': userAgent,
|
|
69
|
+
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
|
|
70
|
+
'Accept-Language': 'en-US,en;q=0.9',
|
|
71
|
+
'Accept-Encoding': 'gzip, deflate, br',
|
|
72
|
+
'Connection': 'keep-alive',
|
|
73
|
+
'Upgrade-Insecure-Requests': '1'
|
|
74
|
+
};
|
|
75
|
+
}
|
|
76
|
+
],
|
|
77
|
+
requestHandler: async (context) => {
|
|
78
|
+
const { request, response, body, crawler } = context;
|
|
79
|
+
if (!response || !body)
|
|
80
|
+
return;
|
|
81
|
+
const $ = load(body.toString());
|
|
82
|
+
await this.processPage(request.url, body.toString(), $, response, crawler);
|
|
83
|
+
},
|
|
84
|
+
failedRequestHandler: async (context, error) => {
|
|
85
|
+
const { request } = context;
|
|
86
|
+
await this.handleFailedRequest(request, error);
|
|
87
|
+
}
|
|
88
|
+
};
|
|
89
|
+
this.crawler = new HttpCrawler(crawlerConfig);
|
|
90
|
+
}
|
|
91
|
+
async run() {
|
|
92
|
+
try {
|
|
93
|
+
debug('[ORCH] Starting orchestrator.run()');
|
|
94
|
+
debug('[ORCH] Config:', JSON.stringify(this.config, null, 2));
|
|
95
|
+
await this.initializeCrawler();
|
|
96
|
+
debug('[ORCH] Crawler initialized with isolated MemoryStorage');
|
|
97
|
+
await this.storage.initialize();
|
|
98
|
+
debug('[ORCH] Storage initialized');
|
|
99
|
+
this.metadata.status = 'running';
|
|
100
|
+
this.metadata.startedAt = new Date().toISOString();
|
|
101
|
+
await this.storage.saveMetadata(this.metadata, this.config);
|
|
102
|
+
debug('[ORCH] Metadata saved, status=running');
|
|
103
|
+
this.urlManager.addDiscovered(this.config.startUrl, 0);
|
|
104
|
+
debug('[ORCH] Start URL added to UrlManager');
|
|
105
|
+
debug('[ORCH] About to call crawler.run()...');
|
|
106
|
+
await this.crawler.run([this.config.startUrl]);
|
|
107
|
+
debug('[ORCH] Crawler.run() completed');
|
|
108
|
+
if (this.linkBuffer.length > 0) {
|
|
109
|
+
await this.storage.saveLinkData(this.linkBuffer);
|
|
110
|
+
this.linkBuffer = [];
|
|
111
|
+
}
|
|
112
|
+
const endTime = Date.now();
|
|
113
|
+
const startTime = new Date(this.metadata.startedAt).getTime();
|
|
114
|
+
this.metadata.status = 'completed';
|
|
115
|
+
this.metadata.completedAt = new Date().toISOString();
|
|
116
|
+
this.metadata.duration = endTime - startTime;
|
|
117
|
+
this.metadata.stats.depth = this.urlManager.getMaxDepth();
|
|
118
|
+
this.metadata.stats.speed = this.metadata.duration > 0
|
|
119
|
+
? (this.metadata.stats.crawled / (this.metadata.duration / 1000))
|
|
120
|
+
: 0;
|
|
121
|
+
}
|
|
122
|
+
catch (error) {
|
|
123
|
+
console.error('[ORCH ERROR] Fatal error in run():', error);
|
|
124
|
+
console.error('[ORCH ERROR] Stack:', error.stack);
|
|
125
|
+
this.metadata.status = 'failed';
|
|
126
|
+
this.metadata.errors.push({
|
|
127
|
+
url: '',
|
|
128
|
+
errorType: 'unknown',
|
|
129
|
+
message: error.message || 'Unknown error',
|
|
130
|
+
timestamp: new Date().toISOString()
|
|
131
|
+
});
|
|
132
|
+
}
|
|
133
|
+
finally {
|
|
134
|
+
if (this.memoryStorage) {
|
|
135
|
+
try {
|
|
136
|
+
await this.memoryStorage.purge();
|
|
137
|
+
debug('[ORCH] MemoryStorage cleaned up successfully');
|
|
138
|
+
}
|
|
139
|
+
catch (cleanupError) {
|
|
140
|
+
console.error('[ORCH WARN] Storage cleanup failed:', cleanupError.message);
|
|
141
|
+
}
|
|
142
|
+
}
|
|
143
|
+
}
|
|
144
|
+
await this.storage.saveMetadata(this.metadata, this.config);
|
|
145
|
+
return this.metadata;
|
|
146
|
+
}
|
|
147
|
+
async processPage(url, html, $, response, crawler) {
|
|
148
|
+
this.urlManager.markVisited(url);
|
|
149
|
+
const pageData = this.contentExtractor.extract(url, html, $, {
|
|
150
|
+
crawlId: this.config.crawlId,
|
|
151
|
+
depth: this.urlManager.getDepth(url),
|
|
152
|
+
statusCode: response.status || 200,
|
|
153
|
+
contentType: response.headers?.['content-type'] || 'text/html',
|
|
154
|
+
responseTime: 0,
|
|
155
|
+
size: html.length,
|
|
156
|
+
isInternal: this.urlManager.isInternal(url),
|
|
157
|
+
linkedFrom: this.urlManager.getSourcePages(url),
|
|
158
|
+
redirects: []
|
|
159
|
+
}, response);
|
|
160
|
+
const links = this.linkExtractor.extract($, url, this.config.crawlId);
|
|
161
|
+
const currentDepth = this.urlManager.getDepth(url);
|
|
162
|
+
const linksToAdd = [];
|
|
163
|
+
for (const link of links) {
|
|
164
|
+
if (this.shouldCrawlUrl(link.targetUrl)) {
|
|
165
|
+
if (currentDepth < this.config.maxDepth) {
|
|
166
|
+
if (!this.urlManager.isDiscovered(link.targetUrl)) {
|
|
167
|
+
this.urlManager.addDiscovered(link.targetUrl, currentDepth + 1, url);
|
|
168
|
+
}
|
|
169
|
+
linksToAdd.push(link.targetUrl);
|
|
170
|
+
}
|
|
171
|
+
else {
|
|
172
|
+
this.metadata.stats.skipped++;
|
|
173
|
+
}
|
|
174
|
+
}
|
|
175
|
+
else {
|
|
176
|
+
this.metadata.stats.skipped++;
|
|
177
|
+
}
|
|
178
|
+
}
|
|
179
|
+
if (linksToAdd.length > 0) {
|
|
180
|
+
await crawler.addRequests(linksToAdd);
|
|
181
|
+
}
|
|
182
|
+
await this.storage.savePageData(pageData);
|
|
183
|
+
this.linkBuffer.push(...links);
|
|
184
|
+
if (this.linkBuffer.length >= this.LINK_BUFFER_SIZE) {
|
|
185
|
+
await this.storage.saveLinkData(this.linkBuffer);
|
|
186
|
+
this.linkBuffer = [];
|
|
187
|
+
}
|
|
188
|
+
this.metadata.stats.crawled++;
|
|
189
|
+
this.metadata.stats.discovered = this.urlManager.getTotalDiscovered();
|
|
190
|
+
if (this.metadata.stats.crawled % 10 === 0) {
|
|
191
|
+
await this.storage.updateMetadata(this.metadata);
|
|
192
|
+
}
|
|
193
|
+
}
|
|
194
|
+
async handleFailedRequest(request, error) {
|
|
195
|
+
this.metadata.stats.failed++;
|
|
196
|
+
const errorType = this.categorizeError(error);
|
|
197
|
+
const errorMessage = error.message || 'Unknown error';
|
|
198
|
+
const errorMessages = request.errorMessages || [];
|
|
199
|
+
const fullErrorDetails = errorMessages.length > 0
|
|
200
|
+
? `${errorMessage} (Retry ${request.retryCount}: ${errorMessages.join(', ')})`
|
|
201
|
+
: errorMessage;
|
|
202
|
+
this.metadata.errors.push({
|
|
203
|
+
url: request.url,
|
|
204
|
+
errorType: errorType,
|
|
205
|
+
message: fullErrorDetails,
|
|
206
|
+
timestamp: new Date().toISOString()
|
|
207
|
+
});
|
|
208
|
+
}
|
|
209
|
+
categorizeError(error) {
|
|
210
|
+
const message = error.message.toLowerCase();
|
|
211
|
+
if (message.includes('timeout'))
|
|
212
|
+
return 'timeout';
|
|
213
|
+
if (message.includes('dns') || message.includes('getaddrinfo'))
|
|
214
|
+
return 'dns';
|
|
215
|
+
if (message.includes('connect') || message.includes('econnrefused'))
|
|
216
|
+
return 'connection';
|
|
217
|
+
if (message.includes('ssl') || message.includes('certificate'))
|
|
218
|
+
return 'ssl';
|
|
219
|
+
if (message.includes('401') || message.includes('403'))
|
|
220
|
+
return 'auth';
|
|
221
|
+
if (message.includes('404'))
|
|
222
|
+
return 'not_found';
|
|
223
|
+
if (message.includes('429') || message.includes('rate limit'))
|
|
224
|
+
return 'rate_limit';
|
|
225
|
+
if (message.includes('500') || message.includes('502') || message.includes('503'))
|
|
226
|
+
return 'server_error';
|
|
227
|
+
return 'network';
|
|
228
|
+
}
|
|
229
|
+
shouldCrawlUrl(url) {
|
|
230
|
+
const ext = this.getFileExtension(url);
|
|
231
|
+
if (ext && this.config.excludeExtensions.includes(ext)) {
|
|
232
|
+
return false;
|
|
233
|
+
}
|
|
234
|
+
if (ext && this.config.includeExtensions.length > 0 &&
|
|
235
|
+
!this.config.includeExtensions.includes(ext)) {
|
|
236
|
+
return false;
|
|
237
|
+
}
|
|
238
|
+
if (this.config.excludePatterns.length > 0) {
|
|
239
|
+
if (this.config.excludePatterns.some(pattern => {
|
|
240
|
+
try {
|
|
241
|
+
return new RegExp(pattern).test(url);
|
|
242
|
+
}
|
|
243
|
+
catch {
|
|
244
|
+
return false;
|
|
245
|
+
}
|
|
246
|
+
})) {
|
|
247
|
+
return false;
|
|
248
|
+
}
|
|
249
|
+
}
|
|
250
|
+
if (this.config.includePatterns.length > 0) {
|
|
251
|
+
if (!this.config.includePatterns.some(pattern => {
|
|
252
|
+
try {
|
|
253
|
+
return new RegExp(pattern).test(url);
|
|
254
|
+
}
|
|
255
|
+
catch {
|
|
256
|
+
return false;
|
|
257
|
+
}
|
|
258
|
+
})) {
|
|
259
|
+
return false;
|
|
260
|
+
}
|
|
261
|
+
}
|
|
262
|
+
if (!this.config.crawlExternal && !this.urlManager.isInternal(url)) {
|
|
263
|
+
return false;
|
|
264
|
+
}
|
|
265
|
+
return true;
|
|
266
|
+
}
|
|
267
|
+
getFileExtension(url) {
|
|
268
|
+
try {
|
|
269
|
+
const pathname = new URL(url).pathname;
|
|
270
|
+
const parts = pathname.split('.');
|
|
271
|
+
if (parts.length > 1) {
|
|
272
|
+
return parts[parts.length - 1].toLowerCase();
|
|
273
|
+
}
|
|
274
|
+
}
|
|
275
|
+
catch { }
|
|
276
|
+
return '';
|
|
277
|
+
}
|
|
278
|
+
}
|
|
279
|
+
//# sourceMappingURL=CrawlOrchestrator.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"CrawlOrchestrator.js","sourceRoot":"","sources":["../../src/core/CrawlOrchestrator.ts"],"names":[],"mappings":"AAcA,OAAO,EAAE,WAAW,EAAE,YAAY,EAAE,aAAa,EAAsC,GAAG,EAAE,MAAM,SAAS,CAAC;AAC5G,OAAO,EAAE,aAAa,EAAE,MAAM,yBAAyB,CAAC;AACxD,OAAO,EAAE,IAAI,EAAE,MAAM,SAAS,CAAC;AAM/B,OAAO,EAAE,KAAK,EAAE,MAAM,mBAAmB,CAAC;AAE1C,MAAM,OAAO,iBAAiB;IAQlB;IACA;IACA;IACA;IACA;IAXF,OAAO,CAAM;IACb,QAAQ,CAAgB;IACxB,UAAU,GAAU,EAAE,CAAC;IACd,gBAAgB,GAAG,GAAG,CAAC;IAChC,aAAa,CAAiB;IAEtC,YACU,MAAmB,EACnB,UAAsB,EACtB,gBAAkC,EAClC,aAA4B,EAC5B,OAAqB;QAJrB,WAAM,GAAN,MAAM,CAAa;QACnB,eAAU,GAAV,UAAU,CAAY;QACtB,qBAAgB,GAAhB,gBAAgB,CAAkB;QAClC,kBAAa,GAAb,aAAa,CAAe;QAC5B,YAAO,GAAP,OAAO,CAAc;QAG7B,GAAG,CAAC,QAAQ,CAAC,GAAG,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC;QAE7B,IAAI,CAAC,QAAQ,GAAG,IAAI,CAAC,qBAAqB,EAAE,CAAC;IAE/C,CAAC;IAEO,qBAAqB;QAC3B,OAAO;YACL,OAAO,EAAE,IAAI,CAAC,MAAM,CAAC,OAAO;YAC5B,MAAM,EAAE,QAAQ;YAChB,SAAS,EAAE,IAAI;YACf,WAAW,EAAE,IAAI;YACjB,QAAQ,EAAE,IAAI;YACd,KAAK,EAAE;gBACL,UAAU,EAAE,CAAC;gBACb,OAAO,EAAE,CAAC;gBACV,MAAM,EAAE,CAAC;gBACT,OAAO,EAAE,CAAC;gBACV,KAAK,EAAE,CAAC;gBACR,KAAK,EAAE,CAAC;aACT;YACD,MAAM,EAAE,EAAE;SACX,CAAC;IACJ,CAAC;IAEO,KAAK,CAAC,iBAAiB;QAI7B,MAAM,UAAU,GAAG,qBAAqB,IAAI,CAAC,MAAM,CAAC,OAAO,EAAE,CAAC;QAC9D,IAAI,CAAC,aAAa,GAAG,IAAI,aAAa,CAAC,EAAE,kBAAkB,EAAE,UAAU,EAAE,CAAC,CAAC;QAG3E,MAAM,aAAa,GAAG,IAAI,aAAa,CAAC;YACtC,aAAa,EAAE,IAAI,CAAC,aAAa;YACjC,cAAc,EAAE,KAAK;SACtB,CAAC,CAAC;QAGH,MAAM,YAAY,GAAG,MAAM,YAAY,CAAC,IAAI,CAAC,SAAS,EAAE,EAAE,MAAM,EAAE,aAAa,EAAE,CAAC,CAAC;QAGnF,MAAM,UAAU,GAAG;YACjB,MAAM,EAAE,iHAAiH;YACzH,SAAS,EAAE,0EAA0E;SACtF,CAAC;QAEF,MAAM,SAAS,GAAG,UAAU,CAAC,IAAI,CAAC,MAAM,CAAC,SAAoC,CAAC,IAAI,UAAU,CAAC,MAAM,CAAC;QAEpG,MAAM,aAAa,GAAG;YACpB,mBAAmB,EAAE,IAAI,CAAC,MAAM,CAAC,QAAQ;YACzC,cAAc,EAAE,EAAE;YAClB,cAAc,EAAE,CAAC;YACjB,iBAAiB,EAAE,CAAC;YACpB,yBAAyB,EAAE,IAAI,CAAC,MAAM,CAAC,OAAO,GAAG,IAAI;YACrD,qBAAqB,EAAE,EAAE;YACzB,mBAAmB,EAAE,CAAC,WAAW,EAAE,uBAAuB,CAAC;YAC3D,YAAY;YACZ,kBAAkB,EAAE;gBAClB,KAAK,EAAE,EAAE,OAAO,EAAO,EAAE,EAAE;oBACzB,OAAO,CAAC,OA
AO,GAAG;wBAChB,YAAY,EAAE,SAAS;wBACvB,QAAQ,EAAE,4EAA4E;wBACtF,iBAAiB,EAAE,gBAAgB;wBACnC,iBAAiB,EAAE,mBAAmB;wBACtC,YAAY,EAAE,YAAY;wBAC1B,2BAA2B,EAAE,GAAG;qBACjC,CAAC;gBACJ,CAAC;aACF;YACD,cAAc,EAAE,KAAK,EAAE,OAAwB,EAAE,EAAE;gBACjD,MAAM,EAAE,OAAO,EAAE,QAAQ,EAAE,IAAI,EAAE,OAAO,EAAE,GAAG,OAAc,CAAC;gBAC5D,IAAI,CAAC,QAAQ,IAAI,CAAC,IAAI;oBAAE,OAAO;gBAE/B,MAAM,CAAC,GAAG,IAAI,CAAC,IAAI,CAAC,QAAQ,EAAE,CAAC,CAAC;gBAChC,MAAM,IAAI,CAAC,WAAW,CAAC,OAAO,CAAC,GAAG,EAAE,IAAI,CAAC,QAAQ,EAAE,EAAE,CAAC,EAAE,QAAQ,EAAE,OAAO,CAAC,CAAC;YAC7E,CAAC;YACD,oBAAoB,EAAE,KAAK,EAAE,OAAwB,EAAE,KAAY,EAAE,EAAE;gBACrE,MAAM,EAAE,OAAO,EAAE,GAAG,OAAc,CAAC;gBACnC,MAAM,IAAI,CAAC,mBAAmB,CAAC,OAAO,EAAE,KAAK,CAAC,CAAC;YACjD,CAAC;SACF,CAAC;QAEF,IAAI,CAAC,OAAO,GAAG,IAAI,WAAW,CAAC,aAAoB,CAAC,CAAC;IACvD,CAAC;IAED,KAAK,CAAC,GAAG;QACP,IAAI,CAAC;YACH,KAAK,CAAC,oCAAoC,CAAC,CAAC;YAC5C,KAAK,CAAC,gBAAgB,EAAE,IAAI,CAAC,SAAS,CAAC,IAAI,CAAC,MAAM,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC,CAAC;YAG9D,MAAM,IAAI,CAAC,iBAAiB,EAAE,CAAC;YAC/B,KAAK,CAAC,wDAAwD,CAAC,CAAC;YAEhE,MAAM,IAAI,CAAC,OAAO,CAAC,UAAU,EAAE,CAAC;YAChC,KAAK,CAAC,4BAA4B,CAAC,CAAC;YAEpC,IAAI,CAAC,QAAQ,CAAC,MAAM,GAAG,SAAS,CAAC;YACjC,IAAI,CAAC,QAAQ,CAAC,SAAS,GAAG,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC;YACnD,MAAM,IAAI,CAAC,OAAO,CAAC,YAAY,CAAC,IAAI,CAAC,QAAQ,EAAE,IAAI,CAAC,MAAM,CAAC,CAAC;YAC5D,KAAK,CAAC,uCAAuC,CAAC,CAAC;YAE/C,IAAI,CAAC,UAAU,CAAC,aAAa,CAAC,IAAI,CAAC,MAAM,CAAC,QAAQ,EAAE,CAAC,CAAC,CAAC;YACvD,KAAK,CAAC,sCAAsC,CAAC,CAAC;YAE9C,KAAK,CAAC,uCAAuC,CAAC,CAAC;YAC/C,MAAM,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,QAAQ,CAAC,CAAC,CAAC;YAC/C,KAAK,CAAC,gCAAgC,CAAC,CAAC;YAExC,IAAI,IAAI,CAAC,UAAU,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBAC/B,MAAM,IAAI,CAAC,OAAO,CAAC,YAAY,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC;gBACjD,IAAI,CAAC,UAAU,GAAG,EAAE,CAAC;YACvB,CAAC;YAED,MAAM,OAAO,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;YAC3B,MAAM,SAAS,GAAG,IAAI,IAAI,CAAC,IAAI,CAAC,QAAQ,CAAC,SAAU,CAAC,CAAC,OAAO,EAAE,CAAC;YAE/D,IAAI,CAAC,QAAQ,CAAC,MAAM,GAAG,WAAW,CAAC;YACnC,IAAI,CAAC,QAAQ,CAAC,WAAW,GAAG,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE,C
AAC;YACrD,IAAI,CAAC,QAAQ,CAAC,QAAQ,GAAG,OAAO,GAAG,SAAS,CAAC;YAC7C,IAAI,CAAC,QAAQ,CAAC,KAAK,CAAC,KAAK,GAAG,IAAI,CAAC,UAAU,CAAC,WAAW,EAAE,CAAC;YAC1D,IAAI,CAAC,QAAQ,CAAC,KAAK,CAAC,KAAK,GAAG,IAAI,CAAC,QAAQ,CAAC,QAAQ,GAAG,CAAC;gBACpD,CAAC,CAAC,CAAC,IAAI,CAAC,QAAQ,CAAC,KAAK,CAAC,OAAO,GAAG,CAAC,IAAI,CAAC,QAAQ,CAAC,QAAQ,GAAG,IAAI,CAAC,CAAC;gBACjE,CAAC,CAAC,CAAC,CAAC;QAER,CAAC;QAAC,OAAO,KAAU,EAAE,CAAC;YACpB,OAAO,CAAC,KAAK,CAAC,oCAAoC,EAAE,KAAK,CAAC,CAAC;YAC3D,OAAO,CAAC,KAAK,CAAC,qBAAqB,EAAE,KAAK,CAAC,KAAK,CAAC,CAAC;YAClD,IAAI,CAAC,QAAQ,CAAC,MAAM,GAAG,QAAQ,CAAC;YAChC,IAAI,CAAC,QAAQ,CAAC,MAAM,CAAC,IAAI,CAAC;gBACxB,GAAG,EAAE,EAAE;gBACP,SAAS,EAAE,SAAS;gBACpB,OAAO,EAAE,KAAK,CAAC,OAAO,IAAI,eAAe;gBACzC,SAAS,EAAE,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE;aACpC,CAAC,CAAC;QACL,CAAC;gBAAS,CAAC;YAGT,IAAI,IAAI,CAAC,aAAa,EAAE,CAAC;gBACvB,IAAI,CAAC;oBACH,MAAM,IAAI,CAAC,aAAa,CAAC,KAAK,EAAE,CAAC;oBACjC,KAAK,CAAC,8CAA8C,CAAC,CAAC;gBACxD,CAAC;gBAAC,OAAO,YAAiB,EAAE,CAAC;oBAC3B,OAAO,CAAC,KAAK,CAAC,qCAAqC,EAAE,YAAY,CAAC,OAAO,CAAC,CAAC;gBAC7E,CAAC;YACH,CAAC;QACH,CAAC;QAED,MAAM,IAAI,CAAC,OAAO,CAAC,YAAY,CAAC,IAAI,CAAC,QAAQ,EAAE,IAAI,CAAC,MAAM,CAAC,CAAC;QAC5D,OAAO,IAAI,CAAC,QAAQ,CAAC;IACvB,CAAC;IAEO,KAAK,CAAC,WAAW,CACvB,GAAW,EACX,IAAY,EACZ,CAAM,EACN,QAAa,EACb,OAAY;QAEZ,IAAI,CAAC,UAAU,CAAC,WAAW,CAAC,GAAG,CAAC,CAAC;QAEjC,MAAM,QAAQ,GAAG,IAAI,CAAC,gBAAgB,CAAC,OAAO,CAAC,GAAG,EAAE,IAAI,EAAE,CAAC,EAAE;YAC3D,OAAO,EAAE,IAAI,CAAC,MAAM,CAAC,OAAO;YAC5B,KAAK,EAAE,IAAI,CAAC,UAAU,CAAC,QAAQ,CAAC,GAAG,CAAC;YACpC,UAAU,EAAE,QAAQ,CAAC,MAAM,IAAI,GAAG;YAClC,WAAW,EAAE,QAAQ,CAAC,OAAO,EAAE,CAAC,cAAc,CAAC,IAAI,WAAW;YAC9D,YAAY,EAAE,CAAC;YACf,IAAI,EAAE,IAAI,CAAC,MAAM;YACjB,UAAU,EAAE,IAAI,CAAC,UAAU,CAAC,UAAU,CAAC,GAAG,CAAC;YAC3C,UAAU,EAAE,IAAI,CAAC,UAAU,CAAC,cAAc,CAAC,GAAG,CAAC;YAC/C,SAAS,EAAE,EAAE;SACd,EAAE,QAAQ,CAAC,CAAC;QAEb,MAAM,KAAK,GAAG,IAAI,CAAC,aAAa,CAAC,OAAO,CAAC,CAAC,EAAE,GAAG,EAAE,IAAI,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;QAEtE,MAAM,YAAY,GAAG,IAAI,CAAC,UAAU,CAAC,QAAQ,CAAC,GAAG,CAAC,CAAC;QACnD,MAAM,UAAU,GAAa,EAAE,CAAC;QAEhC,KAAK,MAAM,IAA
I,IAAI,KAAK,EAAE,CAAC;YACzB,IAAI,IAAI,CAAC,cAAc,CAAC,IAAI,CAAC,SAAS,CAAC,EAAE,CAAC;gBAExC,IAAI,YAAY,GAAG,IAAI,CAAC,MAAM,CAAC,QAAQ,EAAE,CAAC;oBAExC,IAAI,CAAC,IAAI,CAAC,UAAU,CAAC,YAAY,CAAC,IAAI,CAAC,SAAS,CAAC,EAAE,CAAC;wBAClD,IAAI,CAAC,UAAU,CAAC,aAAa,CAAC,IAAI,CAAC,SAAS,EAAE,YAAY,GAAG,CAAC,EAAE,GAAG,CAAC,CAAC;oBACvE,CAAC;oBACD,UAAU,CAAC,IAAI,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;gBAClC,CAAC;qBAAM,CAAC;oBACN,IAAI,CAAC,QAAQ,CAAC,KAAK,CAAC,OAAO,EAAE,CAAC;gBAChC,CAAC;YACH,CAAC;iBAAM,CAAC;gBACN,IAAI,CAAC,QAAQ,CAAC,KAAK,CAAC,OAAO,EAAE,CAAC;YAChC,CAAC;QACH,CAAC;QAED,IAAI,UAAU,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YAC1B,MAAM,OAAO,CAAC,WAAW,CAAC,UAAU,CAAC,CAAC;QACxC,CAAC;QAED,MAAM,IAAI,CAAC,OAAO,CAAC,YAAY,CAAC,QAAQ,CAAC,CAAC;QAE1C,IAAI,CAAC,UAAU,CAAC,IAAI,CAAC,GAAG,KAAK,CAAC,CAAC;QAC/B,IAAI,IAAI,CAAC,UAAU,CAAC,MAAM,IAAI,IAAI,CAAC,gBAAgB,EAAE,CAAC;YACpD,MAAM,IAAI,CAAC,OAAO,CAAC,YAAY,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC;YACjD,IAAI,CAAC,UAAU,GAAG,EAAE,CAAC;QACvB,CAAC;QAED,IAAI,CAAC,QAAQ,CAAC,KAAK,CAAC,OAAO,EAAE,CAAC;QAC9B,IAAI,CAAC,QAAQ,CAAC,KAAK,CAAC,UAAU,GAAG,IAAI,CAAC,UAAU,CAAC,kBAAkB,EAAE,CAAC;QAEtE,IAAI,IAAI,CAAC,QAAQ,CAAC,KAAK,CAAC,OAAO,GAAG,EAAE,KAAK,CAAC,EAAE,CAAC;YAC3C,MAAM,IAAI,CAAC,OAAO,CAAC,cAAc,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC;QACnD,CAAC;IACH,CAAC;IAEO,KAAK,CAAC,mBAAmB,CAAC,OAAgB,EAAE,KAAY;QAC9D,IAAI,CAAC,QAAQ,CAAC,KAAK,CAAC,MAAM,EAAE,CAAC;QAE7B,MAAM,SAAS,GAAG,IAAI,CAAC,eAAe,CAAC,KAAK,CAAC,CAAC;QAC9C,MAAM,YAAY,GAAG,KAAK,CAAC,OAAO,IAAI,eAAe,CAAC;QACtD,MAAM,aAAa,GAAG,OAAO,CAAC,aAAa,IAAI,EAAE,CAAC;QAElD,MAAM,gBAAgB,GAAG,aAAa,CAAC,MAAM,GAAG,CAAC;YAC/C,CAAC,CAAC,GAAG,YAAY,WAAW,OAAO,CAAC,UAAU,KAAK,aAAa,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG;YAC9E,CAAC,CAAC,YAAY,CAAC;QAEjB,IAAI,CAAC,QAAQ,CAAC,MAAM,CAAC,IAAI,CAAC;YACxB,GAAG,EAAE,OAAO,CAAC,GAAG;YAChB,SAAS,EAAE,SAAS;YACpB,OAAO,EAAE,gBAAgB;YACzB,SAAS,EAAE,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE;SACpC,CAAC,CAAC;IACL,CAAC;IAEO,eAAe,CAAC,KAAY;QAClC,MAAM,OAAO,GAAG,KAAK,CAAC,OAAO,CAAC,WAAW,EAAE,CAAC;QAE5C,IAAI,OAAO,CAAC,QAAQ,CAAC,SAAS,CAAC;YAAE,OAAO,SAAS,CAAC;QAClD,IAAI,OAAO,C
AAC,QAAQ,CAAC,KAAK,CAAC,IAAI,OAAO,CAAC,QAAQ,CAAC,aAAa,CAAC;YAAE,OAAO,KAAK,CAAC;QAC7E,IAAI,OAAO,CAAC,QAAQ,CAAC,SAAS,CAAC,IAAI,OAAO,CAAC,QAAQ,CAAC,cAAc,CAAC;YAAE,OAAO,YAAY,CAAC;QACzF,IAAI,OAAO,CAAC,QAAQ,CAAC,KAAK,CAAC,IAAI,OAAO,CAAC,QAAQ,CAAC,aAAa,CAAC;YAAE,OAAO,KAAK,CAAC;QAC7E,IAAI,OAAO,CAAC,QAAQ,CAAC,KAAK,CAAC,IAAI,OAAO,CAAC,QAAQ,CAAC,KAAK,CAAC;YAAE,OAAO,MAAM,CAAC;QACtE,IAAI,OAAO,CAAC,QAAQ,CAAC,KAAK,CAAC;YAAE,OAAO,WAAW,CAAC;QAChD,IAAI,OAAO,CAAC,QAAQ,CAAC,KAAK,CAAC,IAAI,OAAO,CAAC,QAAQ,CAAC,YAAY,CAAC;YAAE,OAAO,YAAY,CAAC;QACnF,IAAI,OAAO,CAAC,QAAQ,CAAC,KAAK,CAAC,IAAI,OAAO,CAAC,QAAQ,CAAC,KAAK,CAAC,IAAI,OAAO,CAAC,QAAQ,CAAC,KAAK,CAAC;YAAE,OAAO,cAAc,CAAC;QAEzG,OAAO,SAAS,CAAC;IACnB,CAAC;IAEO,cAAc,CAAC,GAAW;QAChC,MAAM,GAAG,GAAG,IAAI,CAAC,gBAAgB,CAAC,GAAG,CAAC,CAAC;QAGvC,IAAI,GAAG,IAAI,IAAI,CAAC,MAAM,CAAC,iBAAiB,CAAC,QAAQ,CAAC,GAAG,CAAC,EAAE,CAAC;YACvD,OAAO,KAAK,CAAC;QACf,CAAC;QAID,IAAI,GAAG,IAAI,IAAI,CAAC,MAAM,CAAC,iBAAiB,CAAC,MAAM,GAAG,CAAC;YAC/C,CAAC,IAAI,CAAC,MAAM,CAAC,iBAAiB,CAAC,QAAQ,CAAC,GAAG,CAAC,EAAE,CAAC;YACjD,OAAO,KAAK,CAAC;QACf,CAAC;QAED,IAAI,IAAI,CAAC,MAAM,CAAC,eAAe,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YAC3C,IAAI,IAAI,CAAC,MAAM,CAAC,eAAe,CAAC,IAAI,CAAC,OAAO,CAAC,EAAE;gBAC7C,IAAI,CAAC;oBACH,OAAO,IAAI,MAAM,CAAC,OAAO,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;gBACvC,CAAC;gBAAC,MAAM,CAAC;oBACP,OAAO,KAAK,CAAC;gBACf,CAAC;YACH,CAAC,CAAC,EAAE,CAAC;gBACH,OAAO,KAAK,CAAC;YACf,CAAC;QACH,CAAC;QAED,IAAI,IAAI,CAAC,MAAM,CAAC,eAAe,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YAC3C,IAAI,CAAC,IAAI,CAAC,MAAM,CAAC,eAAe,CAAC,IAAI,CAAC,OAAO,CAAC,EAAE;gBAC9C,IAAI,CAAC;oBACH,OAAO,IAAI,MAAM,CAAC,OAAO,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;gBACvC,CAAC;gBAAC,MAAM,CAAC;oBACP,OAAO,KAAK,CAAC;gBACf,CAAC;YACH,CAAC,CAAC,EAAE,CAAC;gBACH,OAAO,KAAK,CAAC;YACf,CAAC;QACH,CAAC;QAED,IAAI,CAAC,IAAI,CAAC,MAAM,CAAC,aAAa,IAAI,CAAC,IAAI,CAAC,UAAU,CAAC,UAAU,CAAC,GAAG,CAAC,EAAE,CAAC;YACnE,OAAO,KAAK,CAAC;QACf,CAAC;QAED,OAAO,IAAI,CAAC;IACd,CAAC;IAEO,gBAAgB,CAAC,GAAW;QAClC,IAAI,CAAC;YACH,MAAM,QAAQ,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC,QAAQ,CAAC;YACvC,
MAAM,KAAK,GAAG,QAAQ,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;YAClC,IAAI,KAAK,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBACrB,OAAO,KAAK,CAAC,KAAK,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,WAAW,EAAE,CAAC;YAC/C,CAAC;QACH,CAAC;QAAC,MAAM,CAAC,CAAA,CAAC;QACV,OAAO,EAAE,CAAC;IACZ,CAAC;CACF"}
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
import type { CrawlConfig, CrawlMetadata, PageData, LinkData } from '../types/index.js';
|
|
2
|
+
import { CrawlDatabase } from './CrawlDatabase.js';
|
|
3
|
+
/**
 * Persistence facade for one crawl output directory.
 *
 * Page, link and metadata records go through a `CrawlDatabase`; the crawl
 * configuration is kept as a `config.json` file in the same directory.
 */
export declare class CrawlStorage {
    private baseDir;
    private db;
    private dbPath;
    private baseUrl?;

    /**
     * @param outputPath Directory that holds the crawl's files.
     * @param baseUrl Optional base URL passed along when metadata is saved.
     */
    constructor(outputPath: string, baseUrl?: string);

    /** Creates the output directory if needed and opens the database. */
    initialize(): Promise<void>;

    /** Validates `config` and writes it to `config.json`. */
    saveConfig(config: CrawlConfig): Promise<void>;

    /** Reads `config.json` and returns the validated configuration. */
    loadConfig(): Promise<CrawlConfig>;

    /** Stores the crawl metadata, optionally together with its configuration. */
    saveMetadata(metadata: CrawlMetadata, config?: CrawlConfig): Promise<void>;

    /** Loads the metadata of the crawl identified by the stored configuration. */
    loadMetadata(): Promise<CrawlMetadata | null>;

    /** Applies a partial update to the stored crawl metadata. */
    updateMetadata(updates: Partial<CrawlMetadata>): Promise<void>;

    /** Stores a single page record. */
    savePageData(page: PageData): Promise<void>;

    /** Stores several page records in one call. */
    savePageDataBatch(pages: PageData[]): Promise<void>;

    /** Looks up the page record stored for `url`. */
    loadPageData(url: string): Promise<PageData | null>;

    /** Returns every stored page record. */
    loadAllPageData(): Promise<PageData[]>;

    /** Stores a batch of link records. */
    saveLinkData(links: LinkData[]): Promise<void>;

    /** Returns every stored link record. */
    loadLinkData(): Promise<LinkData[]>;

    /** Stores additional link records. */
    appendLinkData(newLinks: LinkData[]): Promise<void>;

    /** Exports the crawl data to `crawl-export.csv` in the output directory. */
    generateCsvExport(): Promise<void>;

    /** Resolves to `true` when the output directory is accessible. */
    exists(): Promise<boolean>;

    /** Summary counts and presence flags for the stored crawl. */
    getStats(): Promise<{
        totalPages: number;
        totalLinks: number;
        hasMetadata: boolean;
        hasConfig: boolean;
    }>;

    /** NOTE(review): presumably closes the underlying database — implementation not shown here. */
    close(): void;

    /** Returns the underlying database object. */
    getDatabase(): CrawlDatabase;
}
|
|
33
|
+
//# sourceMappingURL=CrawlStorage.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"CrawlStorage.d.ts","sourceRoot":"","sources":["../../src/core/CrawlStorage.ts"],"names":[],"mappings":"AAqBA,OAAO,KAAK,EAAE,WAAW,EAAE,aAAa,EAAE,QAAQ,EAAE,QAAQ,EAAE,MAAM,mBAAmB,CAAC;AAExF,OAAO,EAAE,aAAa,EAAE,MAAM,oBAAoB,CAAC;AAEnD,qBAAa,YAAY;IACvB,OAAO,CAAC,OAAO,CAAS;IACxB,OAAO,CAAC,EAAE,CAAiB;IAC3B,OAAO,CAAC,MAAM,CAAS;IACvB,OAAO,CAAC,OAAO,CAAC,CAAS;gBAEb,UAAU,EAAE,MAAM,EAAE,OAAO,CAAC,EAAE,MAAM;IAM1C,UAAU,IAAI,OAAO,CAAC,IAAI,CAAC;IAO3B,UAAU,CAAC,MAAM,EAAE,WAAW,GAAG,OAAO,CAAC,IAAI,CAAC;IAM9C,UAAU,IAAI,OAAO,CAAC,WAAW,CAAC;IAOlC,YAAY,CAAC,QAAQ,EAAE,aAAa,EAAE,MAAM,CAAC,EAAE,WAAW,GAAG,OAAO,CAAC,IAAI,CAAC;IAI1E,YAAY,IAAI,OAAO,CAAC,aAAa,GAAG,IAAI,CAAC;IAK7C,cAAc,CAAC,OAAO,EAAE,OAAO,CAAC,aAAa,CAAC,GAAG,OAAO,CAAC,IAAI,CAAC;IAK9D,YAAY,CAAC,IAAI,EAAE,QAAQ,GAAG,OAAO,CAAC,IAAI,CAAC;IAI3C,iBAAiB,CAAC,KAAK,EAAE,QAAQ,EAAE,GAAG,OAAO,CAAC,IAAI,CAAC;IAInD,YAAY,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO,CAAC,QAAQ,GAAG,IAAI,CAAC;IAInD,eAAe,IAAI,OAAO,CAAC,QAAQ,EAAE,CAAC;IAKtC,YAAY,CAAC,KAAK,EAAE,QAAQ,EAAE,GAAG,OAAO,CAAC,IAAI,CAAC;IAI9C,YAAY,IAAI,OAAO,CAAC,QAAQ,EAAE,CAAC;IAInC,cAAc,CAAC,QAAQ,EAAE,QAAQ,EAAE,GAAG,OAAO,CAAC,IAAI,CAAC;IAKnD,iBAAiB,IAAI,OAAO,CAAC,IAAI,CAAC;IAMlC,MAAM,IAAI,OAAO,CAAC,OAAO,CAAC;IAS1B,QAAQ,IAAI,OAAO,CAAC;QACxB,UAAU,EAAE,MAAM,CAAC;QACnB,UAAU,EAAE,MAAM,CAAC;QACnB,WAAW,EAAE,OAAO,CAAC;QACrB,SAAS,EAAE,OAAO,CAAC;KACpB,CAAC;IAkBF,KAAK,IAAI,IAAI;IAKb,WAAW,IAAI,aAAa;CAG7B"}
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
import { promises as fs } from 'fs';
|
|
2
|
+
import { join } from 'path';
|
|
3
|
+
import { CrawlConfigSchema } from '../schema/index.js';
|
|
4
|
+
import { CrawlDatabase } from './CrawlDatabase.js';
|
|
5
|
+
export class CrawlStorage {
|
|
6
|
+
baseDir;
|
|
7
|
+
db;
|
|
8
|
+
dbPath;
|
|
9
|
+
baseUrl;
|
|
10
|
+
constructor(outputPath, baseUrl) {
|
|
11
|
+
this.baseDir = outputPath;
|
|
12
|
+
this.dbPath = join(outputPath, 'crawl-data.db');
|
|
13
|
+
this.baseUrl = baseUrl;
|
|
14
|
+
}
|
|
15
|
+
async initialize() {
|
|
16
|
+
await fs.mkdir(this.baseDir, { recursive: true });
|
|
17
|
+
this.db = new CrawlDatabase(this.dbPath);
|
|
18
|
+
}
|
|
19
|
+
async saveConfig(config) {
|
|
20
|
+
const validated = CrawlConfigSchema.parse(config);
|
|
21
|
+
const filePath = join(this.baseDir, 'config.json');
|
|
22
|
+
await fs.writeFile(filePath, JSON.stringify(validated, null, 2), 'utf-8');
|
|
23
|
+
}
|
|
24
|
+
async loadConfig() {
|
|
25
|
+
const filePath = join(this.baseDir, 'config.json');
|
|
26
|
+
const content = await fs.readFile(filePath, 'utf-8');
|
|
27
|
+
return CrawlConfigSchema.parse(JSON.parse(content));
|
|
28
|
+
}
|
|
29
|
+
async saveMetadata(metadata, config) {
|
|
30
|
+
this.db.saveCrawlMetadata(metadata, this.baseUrl, config);
|
|
31
|
+
}
|
|
32
|
+
async loadMetadata() {
|
|
33
|
+
const config = await this.loadConfig();
|
|
34
|
+
return this.db.getCrawlMetadata(config.crawlId);
|
|
35
|
+
}
|
|
36
|
+
async updateMetadata(updates) {
|
|
37
|
+
this.db.updateCrawlMetadata(updates);
|
|
38
|
+
}
|
|
39
|
+
async savePageData(page) {
|
|
40
|
+
this.db.savePage(page);
|
|
41
|
+
}
|
|
42
|
+
async savePageDataBatch(pages) {
|
|
43
|
+
this.db.savePageBatch(pages);
|
|
44
|
+
}
|
|
45
|
+
async loadPageData(url) {
|
|
46
|
+
return this.db.getPage(url);
|
|
47
|
+
}
|
|
48
|
+
async loadAllPageData() {
|
|
49
|
+
return this.db.getAllPages();
|
|
50
|
+
}
|
|
51
|
+
async saveLinkData(links) {
|
|
52
|
+
this.db.saveLinks(links);
|
|
53
|
+
}
|
|
54
|
+
async loadLinkData() {
|
|
55
|
+
return this.db.getAllLinks();
|
|
56
|
+
}
|
|
57
|
+
async appendLinkData(newLinks) {
|
|
58
|
+
this.db.saveLinks(newLinks);
|
|
59
|
+
}
|
|
60
|
+
async generateCsvExport() {
|
|
61
|
+
const csvPath = join(this.baseDir, 'crawl-export.csv');
|
|
62
|
+
this.db.exportToCsv(csvPath);
|
|
63
|
+
}
|
|
64
|
+
async exists() {
|
|
65
|
+
try {
|
|
66
|
+
await fs.access(this.baseDir);
|
|
67
|
+
return true;
|
|
68
|
+
}
|
|
69
|
+
catch {
|
|
70
|
+
return false;
|
|
71
|
+
}
|
|
72
|
+
}
|
|
73
|
+
async getStats() {
|
|
74
|
+
const [pageCount, linkCount, hasConfig] = await Promise.all([
|
|
75
|
+
Promise.resolve(this.db.getPageCount()),
|
|
76
|
+
Promise.resolve(this.db.getLinkCount()),
|
|
77
|
+
fs.access(join(this.baseDir, 'config.json')).then(() => true).catch(() => false)
|
|
78
|
+
]);
|
|
79
|
+
const metadata = await this.loadMetadata();
|
|
80
|
+
return {
|
|
81
|
+
totalPages: pageCount,
|
|
82
|
+
totalLinks: linkCount,
|
|
83
|
+
hasMetadata: metadata !== null,
|
|
84
|
+
hasConfig
|
|
85
|
+
};
|
|
86
|
+
}
|
|
87
|
+
close() {
|
|
88
|
+
this.db.close();
|
|
89
|
+
}
|
|
90
|
+
getDatabase() {
|
|
91
|
+
return this.db;
|
|
92
|
+
}
|
|
93
|
+
}
|
|
94
|
+
//# sourceMappingURL=CrawlStorage.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"CrawlStorage.js","sourceRoot":"","sources":["../../src/core/CrawlStorage.ts"],"names":[],"mappings":"AAmBA,OAAO,EAAE,QAAQ,IAAI,EAAE,EAAE,MAAM,IAAI,CAAC;AACpC,OAAO,EAAE,IAAI,EAAE,MAAM,MAAM,CAAC;AAE5B,OAAO,EAAE,iBAAiB,EAAE,MAAM,oBAAoB,CAAC;AACvD,OAAO,EAAE,aAAa,EAAE,MAAM,oBAAoB,CAAC;AAEnD,MAAM,OAAO,YAAY;IACf,OAAO,CAAS;IAChB,EAAE,CAAiB;IACnB,MAAM,CAAS;IACf,OAAO,CAAU;IAEzB,YAAY,UAAkB,EAAE,OAAgB;QAC9C,IAAI,CAAC,OAAO,GAAG,UAAU,CAAC;QAC1B,IAAI,CAAC,MAAM,GAAG,IAAI,CAAC,UAAU,EAAE,eAAe,CAAC,CAAC;QAChD,IAAI,CAAC,OAAO,GAAG,OAAO,CAAC;IACzB,CAAC;IAED,KAAK,CAAC,UAAU;QACd,MAAM,EAAE,CAAC,KAAK,CAAC,IAAI,CAAC,OAAO,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;QAElD,IAAI,CAAC,EAAE,GAAG,IAAI,aAAa,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;IAC3C,CAAC;IAGD,KAAK,CAAC,UAAU,CAAC,MAAmB;QAClC,MAAM,SAAS,GAAG,iBAAiB,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC;QAClD,MAAM,QAAQ,GAAG,IAAI,CAAC,IAAI,CAAC,OAAO,EAAE,aAAa,CAAC,CAAC;QACnD,MAAM,EAAE,CAAC,SAAS,CAAC,QAAQ,EAAE,IAAI,CAAC,SAAS,CAAC,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC,EAAE,OAAO,CAAC,CAAC;IAC5E,CAAC;IAED,KAAK,CAAC,UAAU;QACd,MAAM,QAAQ,GAAG,IAAI,CAAC,IAAI,CAAC,OAAO,EAAE,aAAa,CAAC,CAAC;QACnD,MAAM,OAAO,GAAG,MAAM,EAAE,CAAC,QAAQ,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAC;QACrD,OAAO,iBAAiB,CAAC,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC;IACtD,CAAC;IAGD,KAAK,CAAC,YAAY,CAAC,QAAuB,EAAE,MAAoB;QAC9D,IAAI,CAAC,EAAE,CAAC,iBAAiB,CAAC,QAAQ,EAAE,IAAI,CAAC,OAAO,EAAE,MAAM,CAAC,CAAC;IAC5D,CAAC;IAED,KAAK,CAAC,YAAY;QAChB,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,UAAU,EAAE,CAAC;QACvC,OAAO,IAAI,CAAC,EAAE,CAAC,gBAAgB,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;IAClD,CAAC;IAED,KAAK,CAAC,cAAc,CAAC,OAA+B;QAClD,IAAI,CAAC,EAAE,CAAC,mBAAmB,CAAC,OAAO,CAAC,CAAC;IACvC,CAAC;IAGD,KAAK,CAAC,YAAY,CAAC,IAAc;QAC/B,IAAI,CAAC,EAAE,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC;IACzB,CAAC;IAED,KAAK,CAAC,iBAAiB,CAAC,KAAiB;QACvC,IAAI,CAAC,EAAE,CAAC,aAAa,CAAC,KAAK,CAAC,CAAC;IAC/B,CAAC;IAED,KAAK,CAAC,YAAY,CAAC,GAAW;QAC5B,OAAO,IAAI,CAAC,EAAE,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC;IAC9B,CAAC;IAED,KAAK,CAAC,eAAe;QACnB,OAAO,IAAI,CAAC,EAAE,CAAC,WAAW,EAAE,CAAC;IAC/B,CAAC;I
AGD,KAAK,CAAC,YAAY,CAAC,KAAiB;QAClC,IAAI,CAAC,EAAE,CAAC,SAAS,CAAC,KAAK,CAAC,CAAC;IAC3B,CAAC;IAED,KAAK,CAAC,YAAY;QAChB,OAAO,IAAI,CAAC,EAAE,CAAC,WAAW,EAAE,CAAC;IAC/B,CAAC;IAED,KAAK,CAAC,cAAc,CAAC,QAAoB;QACvC,IAAI,CAAC,EAAE,CAAC,SAAS,CAAC,QAAQ,CAAC,CAAC;IAC9B,CAAC;IAGD,KAAK,CAAC,iBAAiB;QACrB,MAAM,OAAO,GAAG,IAAI,CAAC,IAAI,CAAC,OAAO,EAAE,kBAAkB,CAAC,CAAC;QACvD,IAAI,CAAC,EAAE,CAAC,WAAW,CAAC,OAAO,CAAC,CAAC;IAC/B,CAAC;IAGD,KAAK,CAAC,MAAM;QACV,IAAI,CAAC;YACH,MAAM,EAAE,CAAC,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;YAC9B,OAAO,IAAI,CAAC;QACd,CAAC;QAAC,MAAM,CAAC;YACP,OAAO,KAAK,CAAC;QACf,CAAC;IACH,CAAC;IAED,KAAK,CAAC,QAAQ;QAMZ,MAAM,CAAC,SAAS,EAAE,SAAS,EAAE,SAAS,CAAC,GAAG,MAAM,OAAO,CAAC,GAAG,CAAC;YAC1D,OAAO,CAAC,OAAO,CAAC,IAAI,CAAC,EAAE,CAAC,YAAY,EAAE,CAAC;YACvC,OAAO,CAAC,OAAO,CAAC,IAAI,CAAC,EAAE,CAAC,YAAY,EAAE,CAAC;YACvC,EAAE,CAAC,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,OAAO,EAAE,aAAa,CAAC,CAAC,CAAC,IAAI,CAAC,GAAG,EAAE,CAAC,IAAI,CAAC,CAAC,KAAK,CAAC,GAAG,EAAE,CAAC,KAAK,CAAC;SACjF,CAAC,CAAC;QAEH,MAAM,QAAQ,GAAG,MAAM,IAAI,CAAC,YAAY,EAAE,CAAC;QAE3C,OAAO;YACL,UAAU,EAAE,SAAS;YACrB,UAAU,EAAE,SAAS;YACrB,WAAW,EAAE,QAAQ,KAAK,IAAI;YAC9B,SAAS;SACV,CAAC;IACJ,CAAC;IAGD,KAAK;QACH,IAAI,CAAC,EAAE,CAAC,KAAK,EAAE,CAAC;IAClB,CAAC;IAGD,WAAW;QACT,OAAO,IAAI,CAAC,EAAE,CAAC;IACjB,CAAC;CACF"}
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
import type { CheerioAPI } from 'cheerio';
|
|
2
|
+
import type { LinkData } from '../types/index.js';
|
|
3
|
+
/**
 * Type declaration for LinkExtractor: collects the `a[href]` links of a parsed
 * page and classifies each one relative to a base domain.
 */
export declare class LinkExtractor {
|
|
4
|
+
/** Hostname of the base domain with a leading "www." removed (set by the constructor via normalizeDomain). */
private baseDomain;
|
|
5
|
+
/** @param baseDomain Domain or URL of the crawled site; normalized before being stored. */
constructor(baseDomain: string);
|
|
6
|
+
/**
 * Returns one schema-validated LinkData per usable `a[href]` element.
 * Hrefs beginning with `#`, `mailto:`, `tel:` or `javascript:` are skipped;
 * hrefs that cannot be resolved against `sourceUrl`, or whose record fails
 * validation, are dropped silently.
 */
extract($: CheerioAPI, sourceUrl: string, crawlId: string): LinkData[];
|
|
7
|
+
/** Prefix check that decides whether an href is ignored. */
private shouldSkipLink;
|
|
8
|
+
/** Trimmed element text, cut to 100 characters, or "(no text)" when empty. */
private extractAnchorText;
|
|
9
|
+
/** Rebuilds a URL from protocol, hostname, pathname and query, then drops one trailing "/". */
private cleanUrl;
|
|
10
|
+
/** Reduces a domain or URL string to its hostname without a leading "www.". */
private normalizeDomain;
|
|
11
|
+
/** True when the URL's hostname (minus "www.") equals baseDomain. */
private isInternal;
|
|
12
|
+
/** Walks ancestor elements and returns 'footer', 'navigation' or 'body'. */
private detectPlacement;
|
|
13
|
+
}
|
|
14
|
+
//# sourceMappingURL=LinkExtractor.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"LinkExtractor.d.ts","sourceRoot":"","sources":["../../src/core/LinkExtractor.ts"],"names":[],"mappings":"AAWA,OAAO,KAAK,EAAE,UAAU,EAAE,MAAM,SAAS,CAAC;AAC1C,OAAO,KAAK,EAAE,QAAQ,EAAE,MAAM,mBAAmB,CAAC;AAGlD,qBAAa,aAAa;IACxB,OAAO,CAAC,UAAU,CAAS;gBAEf,UAAU,EAAE,MAAM;IAI9B,OAAO,CAAC,CAAC,EAAE,UAAU,EAAE,SAAS,EAAE,MAAM,EAAE,OAAO,EAAE,MAAM,GAAG,QAAQ,EAAE;IAgCtE,OAAO,CAAC,cAAc;IAKtB,OAAO,CAAC,iBAAiB;IAKzB,OAAO,CAAC,QAAQ;IAchB,OAAO,CAAC,eAAe;IASvB,OAAO,CAAC,UAAU;IASlB,OAAO,CAAC,eAAe;CA0BxB"}
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
import { LinkDataSchema } from '../schema/index.js';
|
|
2
|
+
export class LinkExtractor {
|
|
3
|
+
baseDomain;
|
|
4
|
+
constructor(baseDomain) {
|
|
5
|
+
this.baseDomain = this.normalizeDomain(baseDomain);
|
|
6
|
+
}
|
|
7
|
+
extract($, sourceUrl, crawlId) {
|
|
8
|
+
const links = [];
|
|
9
|
+
$('a[href]').each((_, el) => {
|
|
10
|
+
const href = $(el).attr('href')?.trim();
|
|
11
|
+
if (!href || this.shouldSkipLink(href)) {
|
|
12
|
+
return;
|
|
13
|
+
}
|
|
14
|
+
try {
|
|
15
|
+
const absolute = new URL(href, sourceUrl);
|
|
16
|
+
const targetUrl = this.cleanUrl(absolute);
|
|
17
|
+
const linkData = {
|
|
18
|
+
crawlId,
|
|
19
|
+
sourceUrl,
|
|
20
|
+
targetUrl,
|
|
21
|
+
anchorText: this.extractAnchorText($(el)),
|
|
22
|
+
isInternal: this.isInternal(targetUrl),
|
|
23
|
+
targetDomain: absolute.hostname,
|
|
24
|
+
targetStatus: null,
|
|
25
|
+
placement: this.detectPlacement($(el)),
|
|
26
|
+
discoveredAt: new Date().toISOString()
|
|
27
|
+
};
|
|
28
|
+
links.push(LinkDataSchema.parse(linkData));
|
|
29
|
+
}
|
|
30
|
+
catch { }
|
|
31
|
+
});
|
|
32
|
+
return links;
|
|
33
|
+
}
|
|
34
|
+
shouldSkipLink(href) {
|
|
35
|
+
const skipPrefixes = ['#', 'mailto:', 'tel:', 'javascript:'];
|
|
36
|
+
return skipPrefixes.some(prefix => href.startsWith(prefix));
|
|
37
|
+
}
|
|
38
|
+
extractAnchorText($el) {
|
|
39
|
+
const text = $el.text().trim();
|
|
40
|
+
return text.slice(0, 100) || '(no text)';
|
|
41
|
+
}
|
|
42
|
+
cleanUrl(url) {
|
|
43
|
+
let clean = `${url.protocol}//${url.hostname}${url.pathname}`;
|
|
44
|
+
if (url.search) {
|
|
45
|
+
clean += url.search;
|
|
46
|
+
}
|
|
47
|
+
if (clean.endsWith('/') && clean.length > clean.indexOf('://') + 4) {
|
|
48
|
+
clean = clean.slice(0, -1);
|
|
49
|
+
}
|
|
50
|
+
return clean;
|
|
51
|
+
}
|
|
52
|
+
normalizeDomain(domain) {
|
|
53
|
+
try {
|
|
54
|
+
const url = domain.startsWith('http') ? domain : `https://${domain}`;
|
|
55
|
+
return new URL(url).hostname.replace(/^www\./, '');
|
|
56
|
+
}
|
|
57
|
+
catch {
|
|
58
|
+
return domain.replace(/^www\./, '');
|
|
59
|
+
}
|
|
60
|
+
}
|
|
61
|
+
isInternal(url) {
|
|
62
|
+
try {
|
|
63
|
+
const urlDomain = new URL(url).hostname.replace(/^www\./, '');
|
|
64
|
+
return urlDomain === this.baseDomain;
|
|
65
|
+
}
|
|
66
|
+
catch {
|
|
67
|
+
return false;
|
|
68
|
+
}
|
|
69
|
+
}
|
|
70
|
+
detectPlacement($el) {
|
|
71
|
+
let current = $el.parent();
|
|
72
|
+
while (current && current.length > 0) {
|
|
73
|
+
const tagName = current.prop('tagName')?.toLowerCase();
|
|
74
|
+
const classes = current.attr('class')?.toLowerCase() ?? '';
|
|
75
|
+
const id = current.attr('id')?.toLowerCase() ?? '';
|
|
76
|
+
if (tagName === 'footer' || classes.includes('footer') || id.includes('footer')) {
|
|
77
|
+
return 'footer';
|
|
78
|
+
}
|
|
79
|
+
if (tagName === 'nav' || tagName === 'header') {
|
|
80
|
+
return 'navigation';
|
|
81
|
+
}
|
|
82
|
+
const navKeywords = ['nav', 'menu', 'header'];
|
|
83
|
+
if (navKeywords.some(keyword => classes.includes(keyword) || id.includes(keyword))) {
|
|
84
|
+
return 'navigation';
|
|
85
|
+
}
|
|
86
|
+
current = current.parent();
|
|
87
|
+
}
|
|
88
|
+
return 'body';
|
|
89
|
+
}
|
|
90
|
+
}
|
|
91
|
+
//# sourceMappingURL=LinkExtractor.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"LinkExtractor.js","sourceRoot":"","sources":["../../src/core/LinkExtractor.ts"],"names":[],"mappings":"AAaA,OAAO,EAAE,cAAc,EAAE,MAAM,oBAAoB,CAAC;AAEpD,MAAM,OAAO,aAAa;IAChB,UAAU,CAAS;IAE3B,YAAY,UAAkB;QAC5B,IAAI,CAAC,UAAU,GAAG,IAAI,CAAC,eAAe,CAAC,UAAU,CAAC,CAAC;IACrD,CAAC;IAED,OAAO,CAAC,CAAa,EAAE,SAAiB,EAAE,OAAe;QACvD,MAAM,KAAK,GAAe,EAAE,CAAC;QAE7B,CAAC,CAAC,SAAS,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,EAAE,EAAE;YAC1B,MAAM,IAAI,GAAG,CAAC,CAAC,EAAE,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,EAAE,IAAI,EAAE,CAAC;YACxC,IAAI,CAAC,IAAI,IAAI,IAAI,CAAC,cAAc,CAAC,IAAI,CAAC,EAAE,CAAC;gBACvC,OAAO;YACT,CAAC;YAED,IAAI,CAAC;gBACH,MAAM,QAAQ,GAAG,IAAI,GAAG,CAAC,IAAI,EAAE,SAAS,CAAC,CAAC;gBAC1C,MAAM,SAAS,GAAG,IAAI,CAAC,QAAQ,CAAC,QAAQ,CAAC,CAAC;gBAE1C,MAAM,QAAQ,GAAa;oBACzB,OAAO;oBACP,SAAS;oBACT,SAAS;oBACT,UAAU,EAAE,IAAI,CAAC,iBAAiB,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC;oBACzC,UAAU,EAAE,IAAI,CAAC,UAAU,CAAC,SAAS,CAAC;oBACtC,YAAY,EAAE,QAAQ,CAAC,QAAQ;oBAC/B,YAAY,EAAE,IAAI;oBAClB,SAAS,EAAE,IAAI,CAAC,eAAe,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC;oBACtC,YAAY,EAAE,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE;iBACvC,CAAC;gBAEF,KAAK,CAAC,IAAI,CAAC,cAAc,CAAC,KAAK,CAAC,QAAQ,CAAC,CAAC,CAAC;YAC7C,CAAC;YAAC,MAAM,CAAC,CAAA,CAAC;QACZ,CAAC,CAAC,CAAC;QAEH,OAAO,KAAK,CAAC;IACf,CAAC;IAEO,cAAc,CAAC,IAAY;QACjC,MAAM,YAAY,GAAG,CAAC,GAAG,EAAE,SAAS,EAAE,MAAM,EAAE,aAAa,CAAC,CAAC;QAC7D,OAAO,YAAY,CAAC,IAAI,CAAC,MAAM,CAAC,EAAE,CAAC,IAAI,CAAC,UAAU,CAAC,MAAM,CAAC,CAAC,CAAC;IAC9D,CAAC;IAEO,iBAAiB,CAAC,GAAQ;QAChC,MAAM,IAAI,GAAG,GAAG,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,CAAC;QAC/B,OAAO,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,GAAG,CAAC,IAAI,WAAW,CAAC;IAC3C,CAAC;IAEO,QAAQ,CAAC,GAAQ;QACvB,IAAI,KAAK,GAAG,GAAG,GAAG,CAAC,QAAQ,KAAK,GAAG,CAAC,QAAQ,GAAG,GAAG,CAAC,QAAQ,EAAE,CAAC;QAE9D,IAAI,GAAG,CAAC,MAAM,EAAE,CAAC;YACf,KAAK,IAAI,GAAG,CAAC,MAAM,CAAC;QACtB,CAAC;QAED,IAAI,KAAK,CAAC,QAAQ,CAAC,GAAG,CAAC,IAAI,KAAK,CAAC,MAAM,GAAG,KAAK,CAAC,OAAO,CAAC,KAAK,CAAC,GAAG,CAAC,EAAE,CAAC;YACnE,KAAK,GAAG,KAAK,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC;QAC7B,CAAC;QAED,OAAO,KAAK,CAAC;IACf,CA
AC;IAEO,eAAe,CAAC,MAAc;QACpC,IAAI,CAAC;YACH,MAAM,GAAG,GAAG,MAAM,CAAC,UAAU,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,WAAW,MAAM,EAAE,CAAC;YACrE,OAAO,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC,QAAQ,CAAC,OAAO,CAAC,QAAQ,EAAE,EAAE,CAAC,CAAC;QACrD,CAAC;QAAC,MAAM,CAAC;YACP,OAAO,MAAM,CAAC,OAAO,CAAC,QAAQ,EAAE,EAAE,CAAC,CAAC;QACtC,CAAC;IACH,CAAC;IAEO,UAAU,CAAC,GAAW;QAC5B,IAAI,CAAC;YACH,MAAM,SAAS,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC,QAAQ,CAAC,OAAO,CAAC,QAAQ,EAAE,EAAE,CAAC,CAAC;YAC9D,OAAO,SAAS,KAAK,IAAI,CAAC,UAAU,CAAC;QACvC,CAAC;QAAC,MAAM,CAAC;YACP,OAAO,KAAK,CAAC;QACf,CAAC;IACH,CAAC;IAEO,eAAe,CAAC,GAAQ;QAC9B,IAAI,OAAO,GAAG,GAAG,CAAC,MAAM,EAAE,CAAC;QAE3B,OAAO,OAAO,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YACrC,MAAM,OAAO,GAAG,OAAO,CAAC,IAAI,CAAC,SAAS,CAAC,EAAE,WAAW,EAAE,CAAC;YACvD,MAAM,OAAO,GAAG,OAAO,CAAC,IAAI,CAAC,OAAO,CAAC,EAAE,WAAW,EAAE,IAAI,EAAE,CAAC;YAC3D,MAAM,EAAE,GAAG,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,WAAW,EAAE,IAAI,EAAE,CAAC;YAEnD,IAAI,OAAO,KAAK,QAAQ,IAAI,OAAO,CAAC,QAAQ,CAAC,QAAQ,CAAC,IAAI,EAAE,CAAC,QAAQ,CAAC,QAAQ,CAAC,EAAE,CAAC;gBAChF,OAAO,QAAQ,CAAC;YAClB,CAAC;YAED,IAAI,OAAO,KAAK,KAAK,IAAI,OAAO,KAAK,QAAQ,EAAE,CAAC;gBAC9C,OAAO,YAAY,CAAC;YACtB,CAAC;YAED,MAAM,WAAW,GAAG,CAAC,KAAK,EAAE,MAAM,EAAE,QAAQ,CAAC,CAAC;YAC9C,IAAI,WAAW,CAAC,IAAI,CAAC,OAAO,CAAC,EAAE,CAAC,OAAO,CAAC,QAAQ,CAAC,OAAO,CAAC,IAAI,EAAE,CAAC,QAAQ,CAAC,OAAO,CAAC,CAAC,EAAE,CAAC;gBACnF,OAAO,YAAY,CAAC;YACtB,CAAC;YAED,OAAO,GAAG,OAAO,CAAC,MAAM,EAAE,CAAC;QAC7B,CAAC;QAED,OAAO,MAAM,CAAC;IAChB,CAAC;CACF"}
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
/**
 * Type declaration for UrlManager.
 *
 * Only the signatures are visible here; the implementation is not part of this
 * declaration. NOTE(review): judging by member names alone, the class appears
 * to track discovered and visited URLs (with a depth and source pages per URL)
 * for a crawl of `baseDomain` - confirm against UrlManager.js before relying
 * on any specific behaviour.
 */
export declare class UrlManager {
|
|
2
|
+
private baseDomain;
|
|
3
|
+
private discovered;
|
|
4
|
+
private visited;
|
|
5
|
+
private sourcePagesMap;
|
|
6
|
+
constructor(baseDomain: string);
|
|
7
|
+
normalizeUrl(url: string): string;
|
|
8
|
+
private normalizeDomain;
|
|
9
|
+
isInternal(url: string): boolean;
|
|
10
|
+
// `sourceUrl` is optional in the signature, so callers may omit it.
addDiscovered(url: string, depth: number, sourceUrl?: string): void;
|
|
11
|
+
markVisited(url: string): void;
|
|
12
|
+
isVisited(url: string): boolean;
|
|
13
|
+
isDiscovered(url: string): boolean;
|
|
14
|
+
getSourcePages(url: string): string[];
|
|
15
|
+
getDepth(url: string): number;
|
|
16
|
+
getTotalDiscovered(): number;
|
|
17
|
+
getTotalVisited(): number;
|
|
18
|
+
getMaxDepth(): number;
|
|
19
|
+
getUnvisitedUrls(): string[];
|
|
20
|
+
}
|
|
21
|
+
//# sourceMappingURL=UrlManager.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"UrlManager.d.ts","sourceRoot":"","sources":["../../src/core/UrlManager.ts"],"names":[],"mappings":"AAWA,qBAAa,UAAU;IACrB,OAAO,CAAC,UAAU,CAAS;IAC3B,OAAO,CAAC,UAAU,CAAkC;IACpD,OAAO,CAAC,OAAO,CAA0B;IACzC,OAAO,CAAC,cAAc,CAAuC;gBAEjD,UAAU,EAAE,MAAM;IAI9B,YAAY,CAAC,GAAG,EAAE,MAAM,GAAG,MAAM;IAoBjC,OAAO,CAAC,eAAe;IASvB,UAAU,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO;IAShC,aAAa,CAAC,GAAG,EAAE,MAAM,EAAE,KAAK,EAAE,MAAM,EAAE,SAAS,CAAC,EAAE,MAAM,GAAG,IAAI;IAgBnE,WAAW,CAAC,GAAG,EAAE,MAAM,GAAG,IAAI;IAK9B,SAAS,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO;IAI/B,YAAY,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO;IAIlC,cAAc,CAAC,GAAG,EAAE,MAAM,GAAG,MAAM,EAAE;IAMrC,QAAQ,CAAC,GAAG,EAAE,MAAM,GAAG,MAAM;IAI7B,kBAAkB,IAAI,MAAM;IAI5B,eAAe,IAAI,MAAM;IAIzB,WAAW,IAAI,MAAM;IAIrB,gBAAgB,IAAI,MAAM,EAAE;CAG7B"}
|