@memvid/maw 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +188 -0
- package/dist/bin/maw.d.ts +6 -0
- package/dist/bin/maw.d.ts.map +1 -0
- package/dist/bin/maw.js +275 -0
- package/dist/bin/maw.js.map +1 -0
- package/dist/src/crawler/index.d.ts +71 -0
- package/dist/src/crawler/index.d.ts.map +1 -0
- package/dist/src/crawler/index.js +249 -0
- package/dist/src/crawler/index.js.map +1 -0
- package/dist/src/crawler/robots.d.ts +26 -0
- package/dist/src/crawler/robots.d.ts.map +1 -0
- package/dist/src/crawler/robots.js +179 -0
- package/dist/src/crawler/robots.js.map +1 -0
- package/dist/src/crawler/sitemap.d.ts +36 -0
- package/dist/src/crawler/sitemap.d.ts.map +1 -0
- package/dist/src/crawler/sitemap.js +209 -0
- package/dist/src/crawler/sitemap.js.map +1 -0
- package/dist/src/engine/detector.d.ts +18 -0
- package/dist/src/engine/detector.d.ts.map +1 -0
- package/dist/src/engine/detector.js +155 -0
- package/dist/src/engine/detector.js.map +1 -0
- package/dist/src/engine/fetch.d.ts +18 -0
- package/dist/src/engine/fetch.d.ts.map +1 -0
- package/dist/src/engine/fetch.js +53 -0
- package/dist/src/engine/fetch.js.map +1 -0
- package/dist/src/engine/index.d.ts +39 -0
- package/dist/src/engine/index.d.ts.map +1 -0
- package/dist/src/engine/index.js +116 -0
- package/dist/src/engine/index.js.map +1 -0
- package/dist/src/engine/playwright.d.ts +23 -0
- package/dist/src/engine/playwright.d.ts.map +1 -0
- package/dist/src/engine/playwright.js +88 -0
- package/dist/src/engine/playwright.js.map +1 -0
- package/dist/src/engine/rebrowser.d.ts +22 -0
- package/dist/src/engine/rebrowser.d.ts.map +1 -0
- package/dist/src/engine/rebrowser.js +142 -0
- package/dist/src/engine/rebrowser.js.map +1 -0
- package/dist/src/extractor/cleaner.d.ts +13 -0
- package/dist/src/extractor/cleaner.d.ts.map +1 -0
- package/dist/src/extractor/cleaner.js +122 -0
- package/dist/src/extractor/cleaner.js.map +1 -0
- package/dist/src/extractor/index.d.ts +29 -0
- package/dist/src/extractor/index.d.ts.map +1 -0
- package/dist/src/extractor/index.js +162 -0
- package/dist/src/extractor/index.js.map +1 -0
- package/dist/src/extractor/links.d.ts +22 -0
- package/dist/src/extractor/links.d.ts.map +1 -0
- package/dist/src/extractor/links.js +92 -0
- package/dist/src/extractor/links.js.map +1 -0
- package/dist/src/extractor/markdown.d.ts +13 -0
- package/dist/src/extractor/markdown.d.ts.map +1 -0
- package/dist/src/extractor/markdown.js +94 -0
- package/dist/src/extractor/markdown.js.map +1 -0
- package/dist/src/git/index.d.ts +40 -0
- package/dist/src/git/index.d.ts.map +1 -0
- package/dist/src/git/index.js +303 -0
- package/dist/src/git/index.js.map +1 -0
- package/dist/src/index.d.ts +103 -0
- package/dist/src/index.d.ts.map +1 -0
- package/dist/src/index.js +229 -0
- package/dist/src/index.js.map +1 -0
- package/dist/src/ingestor/index.d.ts +95 -0
- package/dist/src/ingestor/index.d.ts.map +1 -0
- package/dist/src/ingestor/index.js +471 -0
- package/dist/src/ingestor/index.js.map +1 -0
- package/dist/src/utils/dedup.d.ts +66 -0
- package/dist/src/utils/dedup.d.ts.map +1 -0
- package/dist/src/utils/dedup.js +296 -0
- package/dist/src/utils/dedup.js.map +1 -0
- package/dist/src/utils/index.d.ts +3 -0
- package/dist/src/utils/index.d.ts.map +1 -0
- package/dist/src/utils/index.js +3 -0
- package/dist/src/utils/index.js.map +1 -0
- package/dist/src/utils/logger.d.ts +12 -0
- package/dist/src/utils/logger.d.ts.map +1 -0
- package/dist/src/utils/logger.js +49 -0
- package/dist/src/utils/logger.js.map +1 -0
- package/dist/src/utils/ui.d.ts +126 -0
- package/dist/src/utils/ui.d.ts.map +1 -0
- package/dist/src/utils/ui.js +357 -0
- package/dist/src/utils/ui.js.map +1 -0
- package/dist/src/utils/url.d.ts +21 -0
- package/dist/src/utils/url.d.ts.map +1 -0
- package/dist/src/utils/url.js +107 -0
- package/dist/src/utils/url.js.map +1 -0
- package/package.json +71 -0
|
@@ -0,0 +1,249 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Web crawler with rate limiting
|
|
3
|
+
*/
|
|
4
|
+
import PQueue from 'p-queue';
|
|
5
|
+
import { EngineWaterfall } from '../engine/index.js';
|
|
6
|
+
import { Extractor } from '../extractor/index.js';
|
|
7
|
+
import { RobotsParser } from './robots.js';
|
|
8
|
+
import { SitemapParser } from './sitemap.js';
|
|
9
|
+
import { createLogger } from '../utils/logger.js';
|
|
10
|
+
import { normalizeUrl, shouldSkipUrl, getBaseDomain } from '../utils/url.js';
|
|
11
|
+
import { DedupTracker } from '../utils/dedup.js';
|
|
12
|
+
/**
 * Baseline crawler settings. The Crawler constructor spreads these first and
 * the caller-supplied options second, so any key given by the caller wins.
 */
const DEFAULT_OPTIONS = {
    // Maximum link depth followed from a start URL (start URLs are depth 0).
    depth: 2,
    // Number of queue tasks allowed to run at the same time.
    concurrency: 10,
    // Upper bound on the number of URLs that are ever scheduled.
    maxPages: 150,
    // Tasks started per one-second queue interval (requests per second).
    rateLimit: 10,
    // Per-request timeout handed to the fetch engine: 10 seconds.
    timeout: 10 * 1000,
    // Consult robots.txt rules before scheduling discovered links.
    respectRobots: true,
    // Seed the queue with URLs taken from the site's sitemap.
    useSitemap: true,
};
|
|
21
|
+
// Module-wide logger used for per-URL diagnostics.
const log = createLogger();
/**
 * Crawler that fetches pages through a rate-limited queue, extracts their
 * content and follows discovered links up to a configured depth, staying on
 * the base domains of the start URLs.
 *
 * Recognised options (all optional, see DEFAULT_OPTIONS for the defaults):
 * depth, concurrency, maxPages, rateLimit, timeout, respectRobots,
 * useSitemap, plus forceEngine (forwarded to the engine) and
 * includePattern / excludePattern (objects with a RegExp-style `test`).
 */
export class Crawler {
    options;
    engine;
    extractor;
    robots;
    sitemap;
    dedup;
    // Normalized URLs that have already been scheduled.
    visited = new Set();
    queue;
    // Base domains of the start URLs; links outside this set are ignored.
    baseHosts = new Set();
    // Finished pages waiting to be yielded by crawl().
    results = [];
    pending = new Map();
    /**
     * @param options Partial settings merged over DEFAULT_OPTIONS.
     */
    constructor(options = {}) {
        this.options = { ...DEFAULT_OPTIONS, ...options };
        this.engine = new EngineWaterfall();
        this.extractor = new Extractor();
        this.robots = new RobotsParser();
        this.sitemap = new SitemapParser();
        this.dedup = new DedupTracker('en'); // Prefer English content
        // Rate-limited queue: at most `rateLimit` task starts per second and
        // at most `concurrency` tasks in flight.
        this.queue = new PQueue({
            concurrency: this.options.concurrency,
            interval: 1000,
            intervalCap: this.options.rateLimit,
        });
    }
    /**
     * Crawl URLs and yield results as they complete.
     *
     * Each yielded value is the object pushed by processUrl:
     * `{ url, finalUrl, extracted, depth, engine }`.
     * The generator ends when the queue is drained, when `maxPages` results
     * have been yielded, or when no result has arrived for 15 seconds.
     *
     * @param startUrls Seed URLs; they are normalized and queued at depth 0.
     */
    async *crawl(startUrls) {
        const normalizedUrls = startUrls.map(u => normalizeUrl(u));
        this.baseHosts = new Set(normalizedUrls.map(u => getBaseDomain(u)));
        // Add start URLs FIRST (don't wait for robots.txt or the sitemap)
        for (const url of normalizedUrls) {
            this.addToQueue(url, 0);
        }
        // Runs a best-effort step with a deadline. Failures and timeouts are
        // deliberately ignored so the crawl continues with what it has; the
        // timer is always cleared so it cannot outlive the step.
        const settleWithin = async (startWork, ms) => {
            let timer;
            const deadline = new Promise((_, reject) => {
                timer = setTimeout(() => reject(new Error('timeout')), ms);
            });
            try {
                await Promise.race([startWork(), deadline]);
            }
            catch {
                // Timeout or error - continue without this step's data
            }
            finally {
                clearTimeout(timer);
            }
        };
        // Parse robots.txt (3s total budget)
        if (this.options.respectRobots) {
            await settleWithin(() => Promise.all(normalizedUrls.map(url => this.robots.parse(url))), 3000);
        }
        // Get URLs from sitemap (5s total budget)
        if (this.options.useSitemap) {
            await settleWithin(() => this.parseSitemaps(normalizedUrls), 5000);
        }
        // Process queue and yield results with idle timeout
        let lastYieldTime = Date.now();
        let yieldedCount = 0;
        const idleTimeout = 15000; // 15 seconds without new results = done
        while (this.queue.size > 0 || this.queue.pending > 0 || this.results.length > 0) {
            // Yield available results
            while (this.results.length > 0) {
                yield this.results.shift();
                yieldedCount++;
                lastYieldTime = Date.now();
            }
            // Stop if we've hit maxPages
            if (yieldedCount >= this.options.maxPages) {
                break;
            }
            // Check for idle timeout (no new results for too long)
            if (Date.now() - lastYieldTime > idleTimeout) {
                break;
            }
            // Wait a bit for more results
            await new Promise(r => setTimeout(r, 50));
        }
        // Final results
        while (this.results.length > 0) {
            yield this.results.shift();
        }
    }
    /**
     * Seed the queue (at depth 1) with URLs from each start URL's sitemap.
     * Uses the first sitemap listed in robots.txt when one is known,
     * otherwise lets the sitemap parser work from the page URL itself.
     * Per-URL failures are ignored.
     */
    async parseSitemaps(urls) {
        for (const url of urls) {
            try {
                // Only try first sitemap from robots.txt
                const robotsSitemaps = this.robots.getSitemaps(url);
                let sitemapUrls = [];
                if (robotsSitemaps.length > 0) {
                    sitemapUrls = await this.sitemap.parseUrl(robotsSitemaps[0]);
                }
                else {
                    sitemapUrls = await this.sitemap.parse(url);
                }
                // Add limited sitemap URLs (never more than 50 per start URL)
                const limit = Math.min(this.options.maxPages, 50);
                for (const sUrl of sitemapUrls.slice(0, limit)) {
                    if (this.shouldCrawl(sUrl, 1)) {
                        this.addToQueue(sUrl, 1);
                    }
                }
            }
            catch {
                // No sitemap - continue
            }
        }
    }
    /**
     * Schedule a URL unless its normalized form was already scheduled or the
     * `maxPages` budget of scheduled URLs is used up.
     */
    addToQueue(url, depth) {
        const normalized = normalizeUrl(url);
        if (this.visited.has(normalized))
            return;
        if (this.visited.size >= this.options.maxPages)
            return;
        this.visited.add(normalized);
        this.queue.add(async () => {
            await this.processUrl(url, depth);
        });
    }
    /**
     * Fetch one URL, extract it, record the result and schedule its links.
     * Blocked responses, HTTP status >= 400 and pages with fewer than 20
     * words are logged and dropped. Never throws: any failure is logged.
     */
    async processUrl(url, depth) {
        try {
            const result = await this.engine.fetch(url, {
                timeout: this.options.timeout,
                forceEngine: this.options.forceEngine,
            });
            if (result.blocked) {
                log.dim(` Blocked: ${url}`);
                return;
            }
            if (result.statusCode >= 400) {
                log.dim(` HTTP ${result.statusCode}: ${url}`);
                return;
            }
            const extracted = this.extractor.extract(result.html, result.finalUrl);
            // Skip pages with very little content
            if (extracted.wordCount < 20) {
                log.dim(` Low content: ${url}`);
                return;
            }
            // Add to results
            this.results.push({
                url,
                finalUrl: result.finalUrl,
                extracted,
                depth,
                engine: result.engine,
            });
            // Queue discovered links
            if (depth < this.options.depth) {
                for (const link of extracted.links) {
                    if (this.shouldCrawl(link, depth + 1)) {
                        this.addToQueue(link, depth + 1);
                    }
                }
            }
        }
        catch (error) {
            // A thrown value is not guaranteed to be an Error instance.
            const reason = error instanceof Error ? error.message : String(error);
            log.dim(` Failed: ${url} - ${reason}`);
        }
    }
    /**
     * Decide whether a discovered URL may be scheduled at the given depth.
     * Returns false for already-scheduled URLs, an exhausted page budget,
     * excessive depth, URLs rejected by shouldSkipUrl, foreign domains,
     * robots.txt denials, include/exclude pattern mismatches and URLs the
     * dedup tracker asks to skip. Any exception also yields false.
     */
    shouldCrawl(url, depth) {
        try {
            // Normalize
            const normalized = normalizeUrl(url);
            // Already visited
            if (this.visited.has(normalized))
                return false;
            // Max pages
            if (this.visited.size >= this.options.maxPages)
                return false;
            // Depth check
            if (depth > this.options.depth)
                return false;
            // Skip non-HTML
            if (shouldSkipUrl(url))
                return false;
            // Same domain only
            const domain = getBaseDomain(url);
            if (!this.baseHosts.has(domain))
                return false;
            // Robots.txt
            if (this.options.respectRobots && !this.robots.isAllowed(url))
                return false;
            // Include/exclude patterns. A regex with the `g` or `y` flag keeps
            // its position between test() calls, so rewind it first; otherwise
            // the verdict for one URL would depend on the previously tested one.
            const matches = (pattern) => {
                if (pattern.global || pattern.sticky) {
                    pattern.lastIndex = 0;
                }
                return pattern.test(url);
            };
            const { includePattern, excludePattern } = this.options;
            if (includePattern && !matches(includePattern))
                return false;
            if (excludePattern && matches(excludePattern))
                return false;
            // Smart dedup - skip localized versions
            const dedupResult = this.dedup.shouldSkip(url);
            if (dedupResult.skip) {
                log.dim(` Skip: ${url} (${dedupResult.reason})`);
                return false;
            }
            return true;
        }
        catch {
            return false;
        }
    }
    /**
     * Get engine statistics, extended with the number of scheduled URLs,
     * the queue's waiting and running task counts and the dedup statistics.
     */
    getStats() {
        return {
            ...this.engine.getStats(),
            visited: this.visited.size,
            queued: this.queue.size,
            pending: this.queue.pending,
            dedup: this.dedup.getStats(),
        };
    }
    /**
     * Close all resources: drop tasks still waiting in the queue, then close
     * the fetch engine.
     */
    async close() {
        this.queue.clear();
        await this.engine.close();
    }
}
|
|
247
|
+
export { RobotsParser } from './robots.js';
|
|
248
|
+
export { SitemapParser } from './sitemap.js';
|
|
249
|
+
//# sourceMappingURL=index.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../../src/crawler/index.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,MAAM,MAAM,SAAS,CAAC;AAC7B,OAAO,EAAE,eAAe,EAAqB,MAAM,oBAAoB,CAAC;AACxE,OAAO,EAAE,SAAS,EAAsB,MAAM,uBAAuB,CAAC;AACtE,OAAO,EAAE,YAAY,EAAE,MAAM,aAAa,CAAC;AAC3C,OAAO,EAAE,aAAa,EAAE,MAAM,cAAc,CAAC;AAC7C,OAAO,EAAE,YAAY,EAAE,MAAM,oBAAoB,CAAC;AAClD,OAAO,EAAE,YAAY,EAAE,aAAa,EAAE,aAAa,EAAE,MAAM,iBAAiB,CAAC;AAC7E,OAAO,EAAE,YAAY,EAAE,MAAM,mBAAmB,CAAC;AAuBjD,MAAM,eAAe,GAAiB;IACpC,KAAK,EAAE,CAAC,EAAe,sCAAsC;IAC7D,WAAW,EAAE,EAAE,EAAQ,yBAAyB;IAChD,QAAQ,EAAE,GAAG,EAAU,8CAA8C;IACrE,SAAS,EAAE,EAAE,EAAU,kBAAkB;IACzC,OAAO,EAAE,KAAK,EAAS,0BAA0B;IACjD,aAAa,EAAE,IAAI;IACnB,UAAU,EAAE,IAAI;CACjB,CAAC;AAEF,MAAM,GAAG,GAAG,YAAY,EAAE,CAAC;AAE3B,MAAM,OAAO,OAAO;IACV,OAAO,CAAe;IACtB,MAAM,CAAkB;IACxB,SAAS,CAAY;IACrB,MAAM,CAAe;IACrB,OAAO,CAAgB;IACvB,KAAK,CAAe;IACpB,OAAO,GAAgB,IAAI,GAAG,EAAE,CAAC;IACjC,KAAK,CAAS;IACd,SAAS,GAAgB,IAAI,GAAG,EAAE,CAAC;IACnC,OAAO,GAAkB,EAAE,CAAC;IAC5B,OAAO,GAAgD,IAAI,GAAG,EAAE,CAAC;IAEzE,YAAY,UAAiC,EAAE;QAC7C,IAAI,CAAC,OAAO,GAAG,EAAE,GAAG,eAAe,EAAE,GAAG,OAAO,EAAE,CAAC;QAClD,IAAI,CAAC,MAAM,GAAG,IAAI,eAAe,EAAE,CAAC;QACpC,IAAI,CAAC,SAAS,GAAG,IAAI,SAAS,EAAE,CAAC;QACjC,IAAI,CAAC,MAAM,GAAG,IAAI,YAAY,EAAE,CAAC;QACjC,IAAI,CAAC,OAAO,GAAG,IAAI,aAAa,EAAE,CAAC;QACnC,IAAI,CAAC,KAAK,GAAG,IAAI,YAAY,CAAC,IAAI,CAAC,CAAC,CAAC,yBAAyB;QAE9D,qBAAqB;QACrB,IAAI,CAAC,KAAK,GAAG,IAAI,MAAM,CAAC;YACtB,WAAW,EAAE,IAAI,CAAC,OAAO,CAAC,WAAW;YACrC,QAAQ,EAAE,IAAI;YACd,WAAW,EAAE,IAAI,CAAC,OAAO,CAAC,SAAS;SACpC,CAAC,CAAC;IACL,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,CAAC,KAAK,CAAC,SAAmB;QAC9B,MAAM,cAAc,GAAG,SAAS,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC,CAAC;QAC3D,IAAI,CAAC,SAAS,GAAG,IAAI,GAAG,CAAC,cAAc,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,aAAa,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;QAEpE,gDAAgD;QAChD,KAAK,MAAM,GAAG,IAAI,cAAc,EAAE,CAAC;YACjC,IAAI,CAAC,UAAU,CAAC,GAAG,EAAE,CAAC,CAAC,CAAC;QAC1B,CAAC;QAED,sCAAsC;QACtC,IAAI,IAAI,CAAC,OAAO,CAAC,aAAa,EAAE,CAAC;YAC/B,IAAI,CAAC;gBACH,MAAM,OA
AO,CAAC,IAAI,CAAC;oBACjB,OAAO,CAAC,GAAG,CAAC,cAAc,CAAC,GAAG,CAAC,GAAG,CAAC,EAAE,CAAC,IAAI,CAAC,MAAM,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC;oBAC9D,IAAI,OAAO,CAAC,CAAC,CAAC,EAAE,MAAM,EAAE,EAAE,CAAC,UAAU,CAAC,GAAG,EAAE,CAAC,MAAM,CAAC,IAAI,KAAK,CAAC,SAAS,CAAC,CAAC,EAAE,IAAI,CAAC,CAAC;iBACjF,CAAC,CAAC;YACL,CAAC;YAAC,MAAM,CAAC;gBACP,6CAA6C;YAC/C,CAAC;QACH,CAAC;QAED,gDAAgD;QAChD,IAAI,IAAI,CAAC,OAAO,CAAC,UAAU,EAAE,CAAC;YAC5B,IAAI,CAAC;gBACH,MAAM,OAAO,CAAC,IAAI,CAAC;oBACjB,IAAI,CAAC,aAAa,CAAC,cAAc,CAAC;oBAClC,IAAI,OAAO,CAAC,CAAC,CAAC,EAAE,MAAM,EAAE,EAAE,CAAC,UAAU,CAAC,GAAG,EAAE,CAAC,MAAM,CAAC,IAAI,KAAK,CAAC,SAAS,CAAC,CAAC,EAAE,IAAI,CAAC,CAAC;iBACjF,CAAC,CAAC;YACL,CAAC;YAAC,MAAM,CAAC;gBACP,uCAAuC;YACzC,CAAC;QACH,CAAC;QAED,oDAAoD;QACpD,IAAI,aAAa,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;QAC/B,IAAI,YAAY,GAAG,CAAC,CAAC;QACrB,MAAM,WAAW,GAAG,KAAK,CAAC,CAAC,wCAAwC;QAEnE,OAAO,IAAI,CAAC,KAAK,CAAC,IAAI,GAAG,CAAC,IAAI,IAAI,CAAC,KAAK,CAAC,OAAO,GAAG,CAAC,IAAI,IAAI,CAAC,OAAO,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YAChF,0BAA0B;YAC1B,OAAO,IAAI,CAAC,OAAO,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBAC/B,MAAM,IAAI,CAAC,OAAO,CAAC,KAAK,EAAG,CAAC;gBAC5B,YAAY,EAAE,CAAC;gBACf,aAAa,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;YAC7B,CAAC;YAED,6BAA6B;YAC7B,IAAI,YAAY,IAAI,IAAI,CAAC,OAAO,CAAC,QAAQ,EAAE,CAAC;gBAC1C,MAAM;YACR,CAAC;YAED,uDAAuD;YACvD,IAAI,IAAI,CAAC,GAAG,EAAE,GAAG,aAAa,GAAG,WAAW,EAAE,CAAC;gBAC7C,MAAM;YACR,CAAC;YAED,8BAA8B;YAC9B,MAAM,IAAI,OAAO,CAAC,CAAC,CAAC,EAAE,CAAC,UAAU,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC;QAC5C,CAAC;QAED,gBAAgB;QAChB,OAAO,IAAI,CAAC,OAAO,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YAC/B,MAAM,IAAI,CAAC,OAAO,CAAC,KAAK,EAAG,CAAC;QAC9B,CAAC;IACH,CAAC;IAEO,KAAK,CAAC,aAAa,CAAC,IAAc;QACxC,KAAK,MAAM,GAAG,IAAI,IAAI,EAAE,CAAC;YACvB,IAAI,CAAC;gBACH,yCAAyC;gBACzC,MAAM,cAAc,GAAG,IAAI,CAAC,MAAM,CAAC,WAAW,CAAC,GAAG,CAAC,CAAC;gBACpD,IAAI,WAAW,GAAa,EAAE,CAAC;gBAE/B,IAAI,cAAc,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;oBAC9B,yBAAyB;oBACzB,WAAW,GAAG,MAAM,IAAI,CAAC,OAAO,CAAC,QAAQ,CAAC,cAAc,CAAC,CAAC,CAAC,CAAC,CAAC;gBAC/D,CAAC;qBAAM,CAAC;oBACN,WAAW,GAAG,MAAM,IAAI,CAAC,OAAO
,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;gBAC9C,CAAC;gBAED,2BAA2B;gBAC3B,MAAM,KAAK,GAAG,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC,OAAO,CAAC,QAAQ,EAAE,EAAE,CAAC,CAAC;gBAClD,KAAK,MAAM,IAAI,IAAI,WAAW,CAAC,KAAK,CAAC,CAAC,EAAE,KAAK,CAAC,EAAE,CAAC;oBAC/C,IAAI,IAAI,CAAC,WAAW,CAAC,IAAI,EAAE,CAAC,CAAC,EAAE,CAAC;wBAC9B,IAAI,CAAC,UAAU,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC;oBAC3B,CAAC;gBACH,CAAC;YACH,CAAC;YAAC,MAAM,CAAC;gBACP,wBAAwB;YAC1B,CAAC;QACH,CAAC;IACH,CAAC;IAEO,UAAU,CAAC,GAAW,EAAE,KAAa;QAC3C,MAAM,UAAU,GAAG,YAAY,CAAC,GAAG,CAAC,CAAC;QAErC,IAAI,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,UAAU,CAAC;YAAE,OAAO;QACzC,IAAI,IAAI,CAAC,OAAO,CAAC,IAAI,IAAI,IAAI,CAAC,OAAO,CAAC,QAAQ;YAAE,OAAO;QAEvD,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,UAAU,CAAC,CAAC;QAE7B,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,KAAK,IAAI,EAAE;YACxB,MAAM,IAAI,CAAC,UAAU,CAAC,GAAG,EAAE,KAAK,CAAC,CAAC;QACpC,CAAC,CAAC,CAAC;IACL,CAAC;IAEO,KAAK,CAAC,UAAU,CAAC,GAAW,EAAE,KAAa;QACjD,IAAI,CAAC;YACH,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,MAAM,CAAC,KAAK,CAAC,GAAG,EAAE;gBAC1C,OAAO,EAAE,IAAI,CAAC,OAAO,CAAC,OAAO;gBAC7B,WAAW,EAAE,IAAI,CAAC,OAAO,CAAC,WAAW;aACtC,CAAC,CAAC;YAEH,IAAI,MAAM,CAAC,OAAO,EAAE,CAAC;gBACnB,GAAG,CAAC,GAAG,CAAC,cAAc,GAAG,EAAE,CAAC,CAAC;gBAC7B,OAAO;YACT,CAAC;YAED,IAAI,MAAM,CAAC,UAAU,IAAI,GAAG,EAAE,CAAC;gBAC7B,GAAG,CAAC,GAAG,CAAC,UAAU,MAAM,CAAC,UAAU,KAAK,GAAG,EAAE,CAAC,CAAC;gBAC/C,OAAO;YACT,CAAC;YAED,MAAM,SAAS,GAAG,IAAI,CAAC,SAAS,CAAC,OAAO,CAAC,MAAM,CAAC,IAAI,EAAE,MAAM,CAAC,QAAQ,CAAC,CAAC;YAEvE,sCAAsC;YACtC,IAAI,SAAS,CAAC,SAAS,GAAG,EAAE,EAAE,CAAC;gBAC7B,GAAG,CAAC,GAAG,CAAC,kBAAkB,GAAG,EAAE,CAAC,CAAC;gBACjC,OAAO;YACT,CAAC;YAED,iBAAiB;YACjB,IAAI,CAAC,OAAO,CAAC,IAAI,CAAC;gBAChB,GAAG;gBACH,QAAQ,EAAE,MAAM,CAAC,QAAQ;gBACzB,SAAS;gBACT,KAAK;gBACL,MAAM,EAAE,MAAM,CAAC,MAAM;aACtB,CAAC,CAAC;YAEH,yBAAyB;YACzB,IAAI,KAAK,GAAG,IAAI,CAAC,OAAO,CAAC,KAAK,EAAE,CAAC;gBAC/B,KAAK,MAAM,IAAI,IAAI,SAAS,CAAC,KAAK,EAAE,CAAC;oBACnC,IAAI,IAAI,CAAC,WAAW,CAAC,IAAI,EAAE,KAAK,GAAG,CAAC,CAAC,EAAE,CAAC;wBACtC,IAAI,CAAC,UAAU,CAAC,IAAI,EAAE,KAAK,GAAG,CAAC,CAAC,CAAC;oBACnC,CAAC;gBACH,CAAC;YACH,CAAC;QACH,CAAC;QAAC,OAAO,KAAU,
EAAE,CAAC;YACpB,GAAG,CAAC,GAAG,CAAC,aAAa,GAAG,MAAM,KAAK,CAAC,OAAO,EAAE,CAAC,CAAC;QACjD,CAAC;IACH,CAAC;IAEO,WAAW,CAAC,GAAW,EAAE,KAAa;QAC5C,IAAI,CAAC;YACH,YAAY;YACZ,MAAM,UAAU,GAAG,YAAY,CAAC,GAAG,CAAC,CAAC;YAErC,kBAAkB;YAClB,IAAI,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,UAAU,CAAC;gBAAE,OAAO,KAAK,CAAC;YAE/C,YAAY;YACZ,IAAI,IAAI,CAAC,OAAO,CAAC,IAAI,IAAI,IAAI,CAAC,OAAO,CAAC,QAAQ;gBAAE,OAAO,KAAK,CAAC;YAE7D,cAAc;YACd,IAAI,KAAK,GAAG,IAAI,CAAC,OAAO,CAAC,KAAK;gBAAE,OAAO,KAAK,CAAC;YAE7C,gBAAgB;YAChB,IAAI,aAAa,CAAC,GAAG,CAAC;gBAAE,OAAO,KAAK,CAAC;YAErC,mBAAmB;YACnB,MAAM,MAAM,GAAG,aAAa,CAAC,GAAG,CAAC,CAAC;YAClC,IAAI,CAAC,IAAI,CAAC,SAAS,CAAC,GAAG,CAAC,MAAM,CAAC;gBAAE,OAAO,KAAK,CAAC;YAE9C,aAAa;YACb,IAAI,IAAI,CAAC,OAAO,CAAC,aAAa,IAAI,CAAC,IAAI,CAAC,MAAM,CAAC,SAAS,CAAC,GAAG,CAAC;gBAAE,OAAO,KAAK,CAAC;YAE5E,2BAA2B;YAC3B,IAAI,IAAI,CAAC,OAAO,CAAC,cAAc,IAAI,CAAC,IAAI,CAAC,OAAO,CAAC,cAAc,CAAC,IAAI,CAAC,GAAG,CAAC;gBAAE,OAAO,KAAK,CAAC;YACxF,IAAI,IAAI,CAAC,OAAO,CAAC,cAAc,IAAI,IAAI,CAAC,OAAO,CAAC,cAAc,CAAC,IAAI,CAAC,GAAG,CAAC;gBAAE,OAAO,KAAK,CAAC;YAEvF,wCAAwC;YACxC,MAAM,WAAW,GAAG,IAAI,CAAC,KAAK,CAAC,UAAU,CAAC,GAAG,CAAC,CAAC;YAC/C,IAAI,WAAW,CAAC,IAAI,EAAE,CAAC;gBACrB,GAAG,CAAC,GAAG,CAAC,WAAW,GAAG,KAAK,WAAW,CAAC,MAAM,GAAG,CAAC,CAAC;gBAClD,OAAO,KAAK,CAAC;YACf,CAAC;YAED,OAAO,IAAI,CAAC;QACd,CAAC;QAAC,MAAM,CAAC;YACP,OAAO,KAAK,CAAC;QACf,CAAC;IACH,CAAC;IAED;;OAEG;IACH,QAAQ;QACN,OAAO;YACL,GAAG,IAAI,CAAC,MAAM,CAAC,QAAQ,EAAE;YACzB,OAAO,EAAE,IAAI,CAAC,OAAO,CAAC,IAAI;YAC1B,MAAM,EAAE,IAAI,CAAC,KAAK,CAAC,IAAI;YACvB,OAAO,EAAE,IAAI,CAAC,KAAK,CAAC,OAAO;YAC3B,KAAK,EAAE,IAAI,CAAC,KAAK,CAAC,QAAQ,EAAE;SAC7B,CAAC;IACJ,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,KAAK;QACT,IAAI,CAAC,KAAK,CAAC,KAAK,EAAE,CAAC;QACnB,MAAM,IAAI,CAAC,MAAM,CAAC,KAAK,EAAE,CAAC;IAC5B,CAAC;CACF;AAED,OAAO,EAAE,YAAY,EAAE,MAAM,aAAa,CAAC;AAC3C,OAAO,EAAE,aAAa,EAAE,MAAM,cAAc,CAAC"}
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Robots.txt parser
|
|
3
|
+
*/
|
|
4
|
+
export declare class RobotsParser {
    private rules;
    private sitemaps;
    /**
     * Download the robots.txt belonging to the given URL's origin and record
     * what it declares for that host.
     */
    parse(url: string): Promise<void>;
    private parseRobotsTxt;
    /**
     * Tell whether the rules recorded for the URL's host permit fetching it.
     * `userAgent` selects which rule groups are consulted.
     */
    isAllowed(url: string, userAgent?: string): boolean;
    /**
     * Crawl delay recorded for the URL's host, or `undefined` when none is
     * known.
     */
    getCrawlDelay(url: string, userAgent?: string): number | undefined;
    /**
     * Sitemap URLs that the robots.txt of the URL's host declared.
     */
    getSitemaps(url: string): string[];
    private pathMatches;
}
|
|
26
|
+
//# sourceMappingURL=robots.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"robots.d.ts","sourceRoot":"","sources":["../../../src/crawler/robots.ts"],"names":[],"mappings":"AAAA;;GAEG;AASH,qBAAa,YAAY;IACvB,OAAO,CAAC,KAAK,CAAwC;IACrD,OAAO,CAAC,QAAQ,CAAoC;IAEpD;;OAEG;IACG,KAAK,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC;IA6BvC,OAAO,CAAC,cAAc;IAoEtB;;OAEG;IACH,SAAS,CAAC,GAAG,EAAE,MAAM,EAAE,SAAS,SAAM,GAAG,OAAO;IA0ChD;;OAEG;IACH,aAAa,CAAC,GAAG,EAAE,MAAM,EAAE,SAAS,SAAM,GAAG,MAAM,GAAG,SAAS;IAiB/D;;OAEG;IACH,WAAW,CAAC,GAAG,EAAE,MAAM,GAAG,MAAM,EAAE;IASlC,OAAO,CAAC,WAAW;CAoBpB"}
|
|
@@ -0,0 +1,179 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Robots.txt parser
|
|
3
|
+
*/
|
|
4
|
+
/**
 * Fetches robots.txt files and answers allow / crawl-delay / sitemap queries
 * per host. Matching follows the Robots Exclusion Protocol: rule groups are
 * chosen by user-agent (a specific group beats `*`), the longest matching
 * pattern decides, and Allow wins a tie.
 */
export class RobotsParser {
    // hostname -> array of { userAgent, allow[], disallow[], crawlDelay? }
    rules = new Map();
    // hostname -> sitemap URLs listed in that host's robots.txt
    sitemaps = new Map();
    /**
     * Fetch and parse robots.txt for a URL.
     *
     * Never rejects: an invalid URL is ignored, and a failed or non-2xx fetch
     * records an empty rule list for the host, which means "allow everything".
     * A host that already has rules recorded is not fetched again.
     */
    async parse(url) {
        let host;
        let robotsUrl;
        try {
            const parsedUrl = new URL(url);
            host = parsedUrl.hostname;
            robotsUrl = `${parsedUrl.origin}/robots.txt`;
        }
        catch {
            // Not a valid URL: there is no host to fetch or record rules for.
            return;
        }
        // Already parsed
        if (this.rules.has(host))
            return;
        try {
            const response = await fetch(robotsUrl, {
                headers: { 'User-Agent': 'maw/1.0' },
                signal: AbortSignal.timeout(5000),
            });
            if (!response.ok) {
                // No robots.txt - allow everything
                this.rules.set(host, []);
                return;
            }
            const text = await response.text();
            this.parseRobotsTxt(host, text);
        }
        catch {
            // Failed to fetch - allow everything
            this.rules.set(host, []);
        }
    }
    /**
     * Parse robots.txt content and store its rules and sitemaps for `host`.
     *
     * Consecutive `User-agent` lines form one group, so the directives that
     * follow apply to every agent named in that run. Text after `#` on any
     * line is treated as a comment.
     */
    parseRobotsTxt(host, text) {
        const rules = [];
        const sitemaps = [];
        // Rules of the group currently being filled.
        let group = [];
        // True while reading a run of consecutive User-agent lines.
        let readingAgents = false;
        for (const rawLine of text.split('\n')) {
            // Drop full-line and trailing comments, then surrounding whitespace
            const hashIndex = rawLine.indexOf('#');
            const line = (hashIndex === -1 ? rawLine : rawLine.slice(0, hashIndex)).trim();
            if (!line)
                continue;
            const colonIndex = line.indexOf(':');
            if (colonIndex === -1)
                continue;
            const directive = line.slice(0, colonIndex).toLowerCase().trim();
            const value = line.slice(colonIndex + 1).trim();
            switch (directive) {
                case 'user-agent': {
                    if (!readingAgents) {
                        group = [];
                        readingAgents = true;
                    }
                    const rule = {
                        userAgent: value.toLowerCase(),
                        allow: [],
                        disallow: [],
                    };
                    group.push(rule);
                    rules.push(rule);
                    break;
                }
                case 'allow':
                    readingAgents = false;
                    if (value) {
                        for (const rule of group) {
                            rule.allow.push(value);
                        }
                    }
                    break;
                case 'disallow':
                    readingAgents = false;
                    // An empty Disallow means "nothing is disallowed"
                    if (value) {
                        for (const rule of group) {
                            rule.disallow.push(value);
                        }
                    }
                    break;
                case 'crawl-delay': {
                    readingAgents = false;
                    const delay = parseFloat(value);
                    if (!isNaN(delay)) {
                        for (const rule of group) {
                            rule.crawlDelay = delay;
                        }
                    }
                    break;
                }
                case 'sitemap':
                    // Sitemap lines are global and do not end a User-agent run
                    if (value) {
                        sitemaps.push(value);
                    }
                    break;
            }
        }
        this.rules.set(host, rules);
        this.sitemaps.set(host, sitemaps);
    }
    /**
     * Check if a URL is allowed by robots.txt.
     *
     * Groups naming `userAgent` (case-insensitive) are used when present,
     * otherwise the `*` groups. Among all matching Allow and Disallow
     * patterns the longest one decides; Allow wins when lengths are equal.
     * Returns true when the host has no rules, nothing matches, or the URL
     * cannot be parsed.
     */
    isAllowed(url, userAgent = '*') {
        try {
            const parsedUrl = new URL(url);
            const path = parsedUrl.pathname + parsedUrl.search;
            const hostRules = this.rules.get(parsedUrl.hostname);
            if (!hostRules || hostRules.length === 0) {
                return true; // No rules = allow
            }
            const agent = userAgent.toLowerCase();
            let applicable = hostRules.filter(r => r.userAgent === agent);
            if (applicable.length === 0) {
                applicable = hostRules.filter(r => r.userAgent === '*');
            }
            if (applicable.length === 0) {
                return true; // No matching rules = allow
            }
            // Length of the longest matching pattern of each kind (-1 = none)
            let longestAllow = -1;
            let longestDisallow = -1;
            for (const rule of applicable) {
                for (const pattern of rule.allow) {
                    if (pattern.length > longestAllow && this.pathMatches(path, pattern)) {
                        longestAllow = pattern.length;
                    }
                }
                for (const pattern of rule.disallow) {
                    if (pattern.length > longestDisallow && this.pathMatches(path, pattern)) {
                        longestDisallow = pattern.length;
                    }
                }
            }
            return longestAllow >= longestDisallow;
        }
        catch {
            return true;
        }
    }
    /**
     * Get crawl delay for a host.
     *
     * Looks at the groups naming `userAgent`, falling back to the `*` groups,
     * and returns the first crawl delay found there, or `undefined`.
     */
    getCrawlDelay(url, userAgent = '*') {
        try {
            const host = new URL(url).hostname;
            const hostRules = this.rules.get(host);
            if (!hostRules)
                return undefined;
            const agent = userAgent.toLowerCase();
            let applicable = hostRules.filter(r => r.userAgent === agent);
            if (applicable.length === 0) {
                applicable = hostRules.filter(r => r.userAgent === '*');
            }
            return applicable.find(r => r.crawlDelay !== undefined)?.crawlDelay;
        }
        catch {
            return undefined;
        }
    }
    /**
     * Get sitemaps declared in robots.txt for the URL's host.
     * Returns an empty array when none are recorded or the URL is invalid.
     */
    getSitemaps(url) {
        try {
            const host = new URL(url).hostname;
            return this.sitemaps.get(host) || [];
        }
        catch {
            return [];
        }
    }
    /**
     * Test a path against a robots.txt pattern. The pattern is a prefix
     * match in which `*` matches any run of characters and a trailing `$`
     * anchors the end of the path.
     */
    pathMatches(path, pattern) {
        if (pattern === '/')
            return true;
        // Convert pattern to regex
        let regex = pattern
            .replace(/[.+?^${}()|[\]\\]/g, '\\$&') // Escape special chars
            .replace(/\*+/g, '.*'); // a run of * becomes one .* (same matches, less backtracking)
        // $ at end means exact match
        if (regex.endsWith('\\$')) {
            regex = regex.slice(0, -2) + '$';
        }
        try {
            return new RegExp('^' + regex).test(path);
        }
        catch {
            return path.startsWith(pattern);
        }
    }
}
|
|
179
|
+
//# sourceMappingURL=robots.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"robots.js","sourceRoot":"","sources":["../../../src/crawler/robots.ts"],"names":[],"mappings":"AAAA;;GAEG;AASH,MAAM,OAAO,YAAY;IACf,KAAK,GAA8B,IAAI,GAAG,EAAE,CAAC;IAC7C,QAAQ,GAA0B,IAAI,GAAG,EAAE,CAAC;IAEpD;;OAEG;IACH,KAAK,CAAC,KAAK,CAAC,GAAW;QACrB,IAAI,CAAC;YACH,MAAM,SAAS,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC;YAC/B,MAAM,SAAS,GAAG,GAAG,SAAS,CAAC,MAAM,aAAa,CAAC;YACnD,MAAM,IAAI,GAAG,SAAS,CAAC,QAAQ,CAAC;YAEhC,iBAAiB;YACjB,IAAI,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,IAAI,CAAC;gBAAE,OAAO;YAEjC,MAAM,QAAQ,GAAG,MAAM,KAAK,CAAC,SAAS,EAAE;gBACtC,OAAO,EAAE,EAAE,YAAY,EAAE,SAAS,EAAE;gBACpC,MAAM,EAAE,WAAW,CAAC,OAAO,CAAC,IAAI,CAAC;aAClC,CAAC,CAAC;YAEH,IAAI,CAAC,QAAQ,CAAC,EAAE,EAAE,CAAC;gBACjB,mCAAmC;gBACnC,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC;gBACzB,OAAO;YACT,CAAC;YAED,MAAM,IAAI,GAAG,MAAM,QAAQ,CAAC,IAAI,EAAE,CAAC;YACnC,IAAI,CAAC,cAAc,CAAC,IAAI,EAAE,IAAI,CAAC,CAAC;QAClC,CAAC;QAAC,MAAM,CAAC;YACP,qCAAqC;YACrC,MAAM,IAAI,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC,QAAQ,CAAC;YACnC,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC;QAC3B,CAAC;IACH,CAAC;IAEO,cAAc,CAAC,IAAY,EAAE,IAAY;QAC/C,MAAM,KAAK,GAAiB,EAAE,CAAC;QAC/B,MAAM,QAAQ,GAAa,EAAE,CAAC;QAC9B,IAAI,WAAW,GAAsB,IAAI,CAAC;QAE1C,MAAM,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;QAE/B,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;YACzB,MAAM,OAAO,GAAG,IAAI,CAAC,IAAI,EAAE,CAAC;YAE5B,gCAAgC;YAChC,IAAI,CAAC,OAAO,IAAI,OAAO,CAAC,UAAU,CAAC,GAAG,CAAC;gBAAE,SAAS;YAElD,MAAM,UAAU,GAAG,OAAO,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC;YACxC,IAAI,UAAU,KAAK,CAAC,CAAC;gBAAE,SAAS;YAEhC,MAAM,SAAS,GAAG,OAAO,CAAC,KAAK,CAAC,CAAC,EAAE,UAAU,CAAC,CAAC,WAAW,EAAE,CAAC,IAAI,EAAE,CAAC;YACpE,MAAM,KAAK,GAAG,OAAO,CAAC,KAAK,CAAC,UAAU,GAAG,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;YAEnD,QAAQ,SAAS,EAAE,CAAC;gBAClB,KAAK,YAAY;oBACf,IAAI,WAAW,EAAE,CAAC;wBAChB,KAAK,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC;oBAC1B,CAAC;oBACD,WAAW,GAAG;wBACZ,SAAS,EAAE,KAAK,CAAC,WAAW,EAAE;wBAC9B,KAAK,EAAE,EAAE;wBACT,QAAQ,EAAE,EAAE;qBACb,CAAC;oBACF,MAAM;gBAER,KAAK,OAAO;oBACV,IAAI,WAAW,IAAI,KAAK,EAAE,CAAC;wBACzB,WAAW,CAAC,KA
AK,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;oBAChC,CAAC;oBACD,MAAM;gBAER,KAAK,UAAU;oBACb,IAAI,WAAW,IAAI,KAAK,EAAE,CAAC;wBACzB,WAAW,CAAC,QAAQ,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;oBACnC,CAAC;oBACD,MAAM;gBAER,KAAK,aAAa;oBAChB,IAAI,WAAW,EAAE,CAAC;wBAChB,MAAM,KAAK,GAAG,UAAU,CAAC,KAAK,CAAC,CAAC;wBAChC,IAAI,CAAC,KAAK,CAAC,KAAK,CAAC,EAAE,CAAC;4BAClB,WAAW,CAAC,UAAU,GAAG,KAAK,CAAC;wBACjC,CAAC;oBACH,CAAC;oBACD,MAAM;gBAER,KAAK,SAAS;oBACZ,IAAI,KAAK,EAAE,CAAC;wBACV,QAAQ,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;oBACvB,CAAC;oBACD,MAAM;YACV,CAAC;QACH,CAAC;QAED,IAAI,WAAW,EAAE,CAAC;YAChB,KAAK,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC;QAC1B,CAAC;QAED,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,IAAI,EAAE,KAAK,CAAC,CAAC;QAC5B,IAAI,CAAC,QAAQ,CAAC,GAAG,CAAC,IAAI,EAAE,QAAQ,CAAC,CAAC;IACpC,CAAC;IAED;;OAEG;IACH,SAAS,CAAC,GAAW,EAAE,SAAS,GAAG,GAAG;QACpC,IAAI,CAAC;YACH,MAAM,SAAS,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC;YAC/B,MAAM,IAAI,GAAG,SAAS,CAAC,QAAQ,CAAC;YAChC,MAAM,IAAI,GAAG,SAAS,CAAC,QAAQ,GAAG,SAAS,CAAC,MAAM,CAAC;YAEnD,MAAM,SAAS,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC;YACvC,IAAI,CAAC,SAAS,IAAI,SAAS,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;gBACzC,OAAO,IAAI,CAAC,CAAC,mBAAmB;YAClC,CAAC;YAED,iDAAiD;YACjD,MAAM,aAAa,GAAG,SAAS,CAAC,MAAM,CACpC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,SAAS,KAAK,SAAS,CAAC,WAAW,EAAE,IAAI,CAAC,CAAC,SAAS,KAAK,GAAG,CACpE,CAAC;YAEF,IAAI,aAAa,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;gBAC/B,OAAO,IAAI,CAAC,CAAC,4BAA4B;YAC3C,CAAC;YAED,oDAAoD;YACpD,KAAK,MAAM,IAAI,IAAI,aAAa,EAAE,CAAC;gBACjC,uBAAuB;gBACvB,KAAK,MAAM,QAAQ,IAAI,IAAI,CAAC,QAAQ,EAAE,CAAC;oBACrC,IAAI,IAAI,CAAC,WAAW,CAAC,IAAI,EAAE,QAAQ,CAAC,EAAE,CAAC;wBACrC,yCAAyC;wBACzC,KAAK,MAAM,KAAK,IAAI,IAAI,CAAC,KAAK,EAAE,CAAC;4BAC/B,IAAI,IAAI,CAAC,WAAW,CAAC,IAAI,EAAE,KAAK,CAAC,IAAI,KAAK,CAAC,MAAM,GAAG,QAAQ,CAAC,MAAM,EAAE,CAAC;gCACpE,OAAO,IAAI,CAAC;4BACd,CAAC;wBACH,CAAC;wBACD,OAAO,KAAK,CAAC;oBACf,CAAC;gBACH,CAAC;YACH,CAAC;YAED,OAAO,IAAI,CAAC;QACd,CAAC;QAAC,MAAM,CAAC;YACP,OAAO,IAAI,CAAC;QACd,CAAC;IACH,CAAC;IAED;;OAEG;IACH,aAAa,CAAC,GAAW,EAAE,SAAS,GAAG,GAAG;QACxC,IAAI,CAAC;YACH,MAAM,IAAI,GAAG,IAAI,GAAG,CAAC,GAAG,
CAAC,CAAC,QAAQ,CAAC;YACnC,MAAM,SAAS,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC;YAEvC,IAAI,CAAC,SAAS;gBAAE,OAAO,SAAS,CAAC;YAEjC,MAAM,IAAI,GAAG,SAAS,CAAC,IAAI,CACzB,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,SAAS,KAAK,SAAS,CAAC,WAAW,EAAE,IAAI,CAAC,CAAC,SAAS,KAAK,GAAG,CACpE,CAAC;YAEF,OAAO,IAAI,EAAE,UAAU,CAAC;QAC1B,CAAC;QAAC,MAAM,CAAC;YACP,OAAO,SAAS,CAAC;QACnB,CAAC;IACH,CAAC;IAED;;OAEG;IACH,WAAW,CAAC,GAAW;QACrB,IAAI,CAAC;YACH,MAAM,IAAI,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC,QAAQ,CAAC;YACnC,OAAO,IAAI,CAAC,QAAQ,CAAC,GAAG,CAAC,IAAI,CAAC,IAAI,EAAE,CAAC;QACvC,CAAC;QAAC,MAAM,CAAC;YACP,OAAO,EAAE,CAAC;QACZ,CAAC;IACH,CAAC;IAEO,WAAW,CAAC,IAAY,EAAE,OAAe;QAC/C,uDAAuD;QACvD,IAAI,OAAO,KAAK,GAAG;YAAE,OAAO,IAAI,CAAC;QAEjC,2BAA2B;QAC3B,IAAI,KAAK,GAAG,OAAO;aAChB,OAAO,CAAC,oBAAoB,EAAE,MAAM,CAAC,CAAC,uBAAuB;aAC7D,OAAO,CAAC,KAAK,EAAE,IAAI,CAAC,CAAC,CAAC,eAAe;QAExC,6BAA6B;QAC7B,IAAI,KAAK,CAAC,QAAQ,CAAC,KAAK,CAAC,EAAE,CAAC;YAC1B,KAAK,GAAG,KAAK,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,GAAG,GAAG,CAAC;QACnC,CAAC;QAED,IAAI,CAAC;YACH,OAAO,IAAI,MAAM,CAAC,GAAG,GAAG,KAAK,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAC5C,CAAC;QAAC,MAAM,CAAC;YACP,OAAO,IAAI,CAAC,UAAU,CAAC,OAAO,CAAC,CAAC;QAClC,CAAC;IACH,CAAC;CACF"}
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Sitemap.xml parser using sitemapper
|
|
3
|
+
*/
|
|
4
|
+
/**
 * One sitemap entry, as returned by SitemapParser.parseWithMetadata and
 * SitemapParser.parseWithMeta.
 *
 * NOTE(review): the field names mirror the `<loc>`, `<lastmod>`,
 * `<changefreq>` and `<priority>` elements of the sitemap protocol;
 * confirm the exact mapping against the parser implementation.
 */
export interface SitemapUrl {
    /** Required; presumably the entry's page URL (TODO confirm). */
    loc: string;
    /** Optional string value; format is not specified by this declaration. */
    lastmod?: string;
    /** Optional string value. */
    changefreq?: string;
    /** Optional numeric value. */
    priority?: number;
}
|
|
10
|
+
export declare class SitemapParser {
    private cache;
    private sitemapper;
    constructor();
    /**
     * Parse the sitemap for a URL and resolve with all URLs it lists.
     */
    parse(url: string): Promise<string[]>;
    /**
     * Parse the sitemap located at the given sitemap URL and resolve with
     * the URLs it lists.
     */
    parseUrl(sitemapUrl: string): Promise<string[]>;
    private fetchSitemap;
    private parseSitemapContent;
    private parseSitemapIndex;
    private parseUrlset;
    /**
     * Parse the sitemap for a URL and resolve with every entry including its
     * metadata.
     * Uses sitemapper for robust parsing of sitemap indexes and news sitemaps.
     */
    parseWithMetadata(url: string): Promise<SitemapUrl[]>;
    /**
     * Parse sitemap content that is already in memory and return its entries
     * with full metadata.
     */
    parseWithMeta(content: string): SitemapUrl[];
}
|
|
36
|
+
//# sourceMappingURL=sitemap.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"sitemap.d.ts","sourceRoot":"","sources":["../../../src/crawler/sitemap.ts"],"names":[],"mappings":"AAAA;;GAEG;AAKH,MAAM,WAAW,UAAU;IACzB,GAAG,EAAE,MAAM,CAAC;IACZ,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,QAAQ,CAAC,EAAE,MAAM,CAAC;CACnB;AAED,qBAAa,aAAa;IACxB,OAAO,CAAC,KAAK,CAAoC;IACjD,OAAO,CAAC,UAAU,CAAa;;IAW/B;;OAEG;IACG,KAAK,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,EAAE,CAAC;IAuC3C;;OAEG;IACG,QAAQ,CAAC,UAAU,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,EAAE,CAAC;YAYvC,YAAY;YAkBZ,mBAAmB;IA8BjC,OAAO,CAAC,iBAAiB;IAYzB,OAAO,CAAC,WAAW;IAYnB;;;OAGG;IACG,iBAAiB,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO,CAAC,UAAU,EAAE,CAAC;IA8C3D;;OAEG;IACH,aAAa,CAAC,OAAO,EAAE,MAAM,GAAG,UAAU,EAAE;CA0B7C"}
|