@memvid/maw 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86):
  1. package/README.md +188 -0
  2. package/dist/bin/maw.d.ts +6 -0
  3. package/dist/bin/maw.d.ts.map +1 -0
  4. package/dist/bin/maw.js +275 -0
  5. package/dist/bin/maw.js.map +1 -0
  6. package/dist/src/crawler/index.d.ts +71 -0
  7. package/dist/src/crawler/index.d.ts.map +1 -0
  8. package/dist/src/crawler/index.js +249 -0
  9. package/dist/src/crawler/index.js.map +1 -0
  10. package/dist/src/crawler/robots.d.ts +26 -0
  11. package/dist/src/crawler/robots.d.ts.map +1 -0
  12. package/dist/src/crawler/robots.js +179 -0
  13. package/dist/src/crawler/robots.js.map +1 -0
  14. package/dist/src/crawler/sitemap.d.ts +36 -0
  15. package/dist/src/crawler/sitemap.d.ts.map +1 -0
  16. package/dist/src/crawler/sitemap.js +209 -0
  17. package/dist/src/crawler/sitemap.js.map +1 -0
  18. package/dist/src/engine/detector.d.ts +18 -0
  19. package/dist/src/engine/detector.d.ts.map +1 -0
  20. package/dist/src/engine/detector.js +155 -0
  21. package/dist/src/engine/detector.js.map +1 -0
  22. package/dist/src/engine/fetch.d.ts +18 -0
  23. package/dist/src/engine/fetch.d.ts.map +1 -0
  24. package/dist/src/engine/fetch.js +53 -0
  25. package/dist/src/engine/fetch.js.map +1 -0
  26. package/dist/src/engine/index.d.ts +39 -0
  27. package/dist/src/engine/index.d.ts.map +1 -0
  28. package/dist/src/engine/index.js +116 -0
  29. package/dist/src/engine/index.js.map +1 -0
  30. package/dist/src/engine/playwright.d.ts +23 -0
  31. package/dist/src/engine/playwright.d.ts.map +1 -0
  32. package/dist/src/engine/playwright.js +88 -0
  33. package/dist/src/engine/playwright.js.map +1 -0
  34. package/dist/src/engine/rebrowser.d.ts +22 -0
  35. package/dist/src/engine/rebrowser.d.ts.map +1 -0
  36. package/dist/src/engine/rebrowser.js +142 -0
  37. package/dist/src/engine/rebrowser.js.map +1 -0
  38. package/dist/src/extractor/cleaner.d.ts +13 -0
  39. package/dist/src/extractor/cleaner.d.ts.map +1 -0
  40. package/dist/src/extractor/cleaner.js +122 -0
  41. package/dist/src/extractor/cleaner.js.map +1 -0
  42. package/dist/src/extractor/index.d.ts +29 -0
  43. package/dist/src/extractor/index.d.ts.map +1 -0
  44. package/dist/src/extractor/index.js +162 -0
  45. package/dist/src/extractor/index.js.map +1 -0
  46. package/dist/src/extractor/links.d.ts +22 -0
  47. package/dist/src/extractor/links.d.ts.map +1 -0
  48. package/dist/src/extractor/links.js +92 -0
  49. package/dist/src/extractor/links.js.map +1 -0
  50. package/dist/src/extractor/markdown.d.ts +13 -0
  51. package/dist/src/extractor/markdown.d.ts.map +1 -0
  52. package/dist/src/extractor/markdown.js +94 -0
  53. package/dist/src/extractor/markdown.js.map +1 -0
  54. package/dist/src/git/index.d.ts +40 -0
  55. package/dist/src/git/index.d.ts.map +1 -0
  56. package/dist/src/git/index.js +303 -0
  57. package/dist/src/git/index.js.map +1 -0
  58. package/dist/src/index.d.ts +103 -0
  59. package/dist/src/index.d.ts.map +1 -0
  60. package/dist/src/index.js +229 -0
  61. package/dist/src/index.js.map +1 -0
  62. package/dist/src/ingestor/index.d.ts +95 -0
  63. package/dist/src/ingestor/index.d.ts.map +1 -0
  64. package/dist/src/ingestor/index.js +471 -0
  65. package/dist/src/ingestor/index.js.map +1 -0
  66. package/dist/src/utils/dedup.d.ts +66 -0
  67. package/dist/src/utils/dedup.d.ts.map +1 -0
  68. package/dist/src/utils/dedup.js +296 -0
  69. package/dist/src/utils/dedup.js.map +1 -0
  70. package/dist/src/utils/index.d.ts +3 -0
  71. package/dist/src/utils/index.d.ts.map +1 -0
  72. package/dist/src/utils/index.js +3 -0
  73. package/dist/src/utils/index.js.map +1 -0
  74. package/dist/src/utils/logger.d.ts +12 -0
  75. package/dist/src/utils/logger.d.ts.map +1 -0
  76. package/dist/src/utils/logger.js +49 -0
  77. package/dist/src/utils/logger.js.map +1 -0
  78. package/dist/src/utils/ui.d.ts +126 -0
  79. package/dist/src/utils/ui.d.ts.map +1 -0
  80. package/dist/src/utils/ui.js +357 -0
  81. package/dist/src/utils/ui.js.map +1 -0
  82. package/dist/src/utils/url.d.ts +21 -0
  83. package/dist/src/utils/url.d.ts.map +1 -0
  84. package/dist/src/utils/url.js +107 -0
  85. package/dist/src/utils/url.js.map +1 -0
  86. package/package.json +71 -0
@@ -0,0 +1,249 @@
1
+ /**
2
+ * Web crawler with rate limiting
3
+ */
4
+ import PQueue from 'p-queue';
5
+ import { EngineWaterfall } from '../engine/index.js';
6
+ import { Extractor } from '../extractor/index.js';
7
+ import { RobotsParser } from './robots.js';
8
+ import { SitemapParser } from './sitemap.js';
9
+ import { createLogger } from '../utils/logger.js';
10
+ import { normalizeUrl, shouldSkipUrl, getBaseDomain } from '../utils/url.js';
11
+ import { DedupTracker } from '../utils/dedup.js';
12
/**
 * Baseline crawl settings. Callers' options are spread on top of these, so
 * every key here can be overridden per Crawler instance.
 *
 * The object is frozen: it is shared by every Crawler in the process, and it
 * is only ever read (spread into a fresh object), so accidental mutation of
 * the shared defaults is turned into an error instead of a silent change.
 */
const DEFAULT_OPTIONS = Object.freeze({
    depth: 2, // 2 levels deep (fast, good coverage)
    concurrency: 10, // 10 concurrent requests
    maxPages: 150, // 150 pages max (good coverage, avoids bloat)
    rateLimit: 10, // 10 requests/sec
    timeout: 10000, // 10s timeout (fail fast)
    respectRobots: true,
    useSitemap: true,
});
21
const log = createLogger();

/**
 * Web crawler with rate limiting.
 *
 * Fetches pages through an EngineWaterfall, extracts content and links with
 * an Extractor, and follows same-domain links up to `options.depth`.
 * Work is scheduled on a PQueue limited by `options.concurrency` and
 * `options.rateLimit` (tasks started per 1000 ms interval).
 */
export class Crawler {
    /** Effective options: DEFAULT_OPTIONS overlaid with the caller's overrides. */
    options;
    engine;
    extractor;
    robots;
    sitemap;
    dedup;
    /**
     * Normalized URLs that have been handed to the queue. A URL is added when
     * it is queued, not when it succeeds, so failed/blocked pages count
     * towards `maxPages` too.
     */
    visited = new Set();
    queue;
    /** Base domains of the start URLs; only links on these domains are followed. */
    baseHosts = new Set();
    /** Completed page results waiting to be yielded by `crawl()`. */
    results = [];
    /** Not referenced anywhere in this class; kept so the instance shape is unchanged. */
    pending = new Map();

    /**
     * @param options Partial crawl options; missing keys fall back to DEFAULT_OPTIONS.
     */
    constructor(options = {}) {
        this.options = { ...DEFAULT_OPTIONS, ...options };
        this.engine = new EngineWaterfall();
        this.extractor = new Extractor();
        this.robots = new RobotsParser();
        this.sitemap = new SitemapParser();
        this.dedup = new DedupTracker('en'); // Prefer English content
        // Rate-limited queue
        this.queue = new PQueue({
            concurrency: this.options.concurrency,
            interval: 1000,
            intervalCap: this.options.rateLimit,
        });
    }

    /**
     * Crawl URLs and yield results as they complete.
     *
     * Start URLs are queued immediately; robots.txt (3s budget) and sitemaps
     * (5s budget) are then loaded on a best-effort basis. The generator ends
     * when the queue drains, when `maxPages` results have been yielded, or
     * when no result has arrived for 15 seconds.
     *
     * @param startUrls Absolute URLs to start from.
     * @returns Async generator of `{ url, finalUrl, extracted, depth, engine }`.
     */
    async *crawl(startUrls) {
        const normalizedUrls = startUrls.map(u => normalizeUrl(u));
        this.baseHosts = new Set(normalizedUrls.map(u => getBaseDomain(u)));
        // Add start URLs FIRST (don't wait for sitemap)
        for (const url of normalizedUrls) {
            this.addToQueue(url, 0);
        }
        // Parse robots.txt (fast, ~1 request)
        if (this.options.respectRobots) {
            try {
                await this.raceWithTimeout(Promise.all(normalizedUrls.map(url => this.robots.parse(url))), 3000);
            }
            catch {
                // Timeout or error - continue without robots
            }
        }
        // Get URLs from sitemap (with 5s total timeout)
        if (this.options.useSitemap) {
            try {
                await this.raceWithTimeout(this.parseSitemaps(normalizedUrls), 5000);
            }
            catch {
                // Timeout - continue with what we have
            }
        }
        // Process queue and yield results with idle timeout
        let lastYieldTime = Date.now();
        let yieldedCount = 0;
        const idleTimeout = 15000; // 15 seconds without new results = done
        while (this.queue.size > 0 || this.queue.pending > 0 || this.results.length > 0) {
            // Yield available results
            while (this.results.length > 0) {
                yield this.results.shift();
                yieldedCount++;
                lastYieldTime = Date.now();
            }
            // Stop if we've hit maxPages
            if (yieldedCount >= this.options.maxPages) {
                break;
            }
            // Check for idle timeout (no new results for too long)
            if (Date.now() - lastYieldTime > idleTimeout) {
                break;
            }
            // Wait a bit for more results
            await new Promise(r => setTimeout(r, 50));
        }
        // Final results
        while (this.results.length > 0) {
            yield this.results.shift();
        }
    }

    /**
     * Await `task`, rejecting with Error('timeout') if it takes longer than `ms`.
     * The timer is always cleared once the race settles, so a fast task does
     * not leave a pending timer behind that keeps the event loop alive.
     * The task itself is not cancelled on timeout; it keeps running in the background.
     */
    async raceWithTimeout(task, ms) {
        let timer;
        const timeout = new Promise((_, reject) => {
            timer = setTimeout(() => reject(new Error('timeout')), ms);
        });
        try {
            return await Promise.race([task, timeout]);
        }
        finally {
            clearTimeout(timer);
        }
    }

    /**
     * Seed the queue from each start URL's sitemap (depth 1).
     * Uses the first sitemap declared in robots.txt when there is one,
     * otherwise lets the SitemapParser work from the URL itself.
     * At most min(maxPages, 50) sitemap entries are considered per start URL.
     */
    async parseSitemaps(urls) {
        for (const url of urls) {
            try {
                // Only try first sitemap from robots.txt
                const robotsSitemaps = this.robots.getSitemaps(url);
                let sitemapUrls = [];
                if (robotsSitemaps.length > 0) {
                    // Just get first sitemap
                    sitemapUrls = await this.sitemap.parseUrl(robotsSitemaps[0]);
                }
                else {
                    sitemapUrls = await this.sitemap.parse(url);
                }
                // Add limited sitemap URLs
                const limit = Math.min(this.options.maxPages, 50);
                for (const sUrl of sitemapUrls.slice(0, limit)) {
                    if (this.shouldCrawl(sUrl, 1)) {
                        this.addToQueue(sUrl, 1);
                    }
                }
            }
            catch {
                // No sitemap - continue
            }
        }
    }

    /**
     * Mark a URL as visited and schedule it for fetching.
     * No-op when the normalized URL was already seen or `maxPages` URLs have been queued.
     */
    addToQueue(url, depth) {
        const normalized = normalizeUrl(url);
        if (this.visited.has(normalized))
            return;
        if (this.visited.size >= this.options.maxPages)
            return;
        this.visited.add(normalized);
        // processUrl handles its own errors, so the queued task never rejects.
        this.queue.add(async () => {
            await this.processUrl(url, depth);
        });
    }

    /**
     * Fetch one page, extract it, push a result, and queue its links.
     * Blocked responses, HTTP status >= 400, and pages with fewer than
     * 20 words are logged and dropped. Never throws.
     */
    async processUrl(url, depth) {
        try {
            const result = await this.engine.fetch(url, {
                timeout: this.options.timeout,
                forceEngine: this.options.forceEngine,
            });
            if (result.blocked) {
                log.dim(` Blocked: ${url}`);
                return;
            }
            if (result.statusCode >= 400) {
                log.dim(` HTTP ${result.statusCode}: ${url}`);
                return;
            }
            const extracted = this.extractor.extract(result.html, result.finalUrl);
            // Skip pages with very little content
            if (extracted.wordCount < 20) {
                log.dim(` Low content: ${url}`);
                return;
            }
            // Add to results
            this.results.push({
                url,
                finalUrl: result.finalUrl,
                extracted,
                depth,
                engine: result.engine,
            });
            // Queue discovered links
            if (depth < this.options.depth) {
                for (const link of extracted.links) {
                    if (this.shouldCrawl(link, depth + 1)) {
                        this.addToQueue(link, depth + 1);
                    }
                }
            }
        }
        catch (error) {
            // A thrown value is not guaranteed to be an Error instance.
            const reason = error instanceof Error ? error.message : String(error);
            log.dim(` Failed: ${url} - ${reason}`);
        }
    }

    /**
     * Decide whether a discovered URL should be queued at the given depth.
     * Checks, in order: already visited, page budget, depth, skippable URL,
     * same base domain, robots.txt, include/exclude patterns, and dedup.
     * Any exception while checking is treated as "do not crawl".
     */
    shouldCrawl(url, depth) {
        try {
            // Normalize
            const normalized = normalizeUrl(url);
            // Already visited
            if (this.visited.has(normalized))
                return false;
            // Max pages
            if (this.visited.size >= this.options.maxPages)
                return false;
            // Depth check
            if (depth > this.options.depth)
                return false;
            // Skip non-HTML
            if (shouldSkipUrl(url))
                return false;
            // Same domain only
            const domain = getBaseDomain(url);
            if (!this.baseHosts.has(domain))
                return false;
            // Robots.txt
            if (this.options.respectRobots && !this.robots.isAllowed(url))
                return false;
            // Include/exclude patterns
            if (this.options.includePattern && !this.matchesPattern(this.options.includePattern, url))
                return false;
            if (this.options.excludePattern && this.matchesPattern(this.options.excludePattern, url))
                return false;
            // Smart dedup - skip localized versions
            const dedupResult = this.dedup.shouldSkip(url);
            if (dedupResult.skip) {
                log.dim(` Skip: ${url} (${dedupResult.reason})`);
                return false;
            }
            return true;
        }
        catch {
            return false;
        }
    }

    /**
     * Test `url` against a caller-supplied pattern.
     * A RegExp with the `g` or `y` flag remembers `lastIndex` between calls to
     * `test()`, which would make the same URL match on one call and fail on
     * the next; resetting it makes every check start from the beginning.
     */
    matchesPattern(pattern, url) {
        if (pattern.global || pattern.sticky) {
            pattern.lastIndex = 0;
        }
        return pattern.test(url);
    }

    /**
     * Get engine statistics, plus crawl counters:
     * `visited` (URLs queued so far), `queued`/`pending` (queue state) and `dedup`.
     */
    getStats() {
        return {
            ...this.engine.getStats(),
            visited: this.visited.size,
            queued: this.queue.size,
            pending: this.queue.pending,
            dedup: this.dedup.getStats(),
        };
    }

    /**
     * Close all resources: drop tasks still waiting in the queue, then close the engine.
     */
    async close() {
        this.queue.clear();
        await this.engine.close();
    }
}
247
+ export { RobotsParser } from './robots.js';
248
+ export { SitemapParser } from './sitemap.js';
249
+ //# sourceMappingURL=index.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.js","sourceRoot":"","sources":["../../../src/crawler/index.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,MAAM,MAAM,SAAS,CAAC;AAC7B,OAAO,EAAE,eAAe,EAAqB,MAAM,oBAAoB,CAAC;AACxE,OAAO,EAAE,SAAS,EAAsB,MAAM,uBAAuB,CAAC;AACtE,OAAO,EAAE,YAAY,EAAE,MAAM,aAAa,CAAC;AAC3C,OAAO,EAAE,aAAa,EAAE,MAAM,cAAc,CAAC;AAC7C,OAAO,EAAE,YAAY,EAAE,MAAM,oBAAoB,CAAC;AAClD,OAAO,EAAE,YAAY,EAAE,aAAa,EAAE,aAAa,EAAE,MAAM,iBAAiB,CAAC;AAC7E,OAAO,EAAE,YAAY,EAAE,MAAM,mBAAmB,CAAC;AAuBjD,MAAM,eAAe,GAAiB;IACpC,KAAK,EAAE,CAAC,EAAe,sCAAsC;IAC7D,WAAW,EAAE,EAAE,EAAQ,yBAAyB;IAChD,QAAQ,EAAE,GAAG,EAAU,8CAA8C;IACrE,SAAS,EAAE,EAAE,EAAU,kBAAkB;IACzC,OAAO,EAAE,KAAK,EAAS,0BAA0B;IACjD,aAAa,EAAE,IAAI;IACnB,UAAU,EAAE,IAAI;CACjB,CAAC;AAEF,MAAM,GAAG,GAAG,YAAY,EAAE,CAAC;AAE3B,MAAM,OAAO,OAAO;IACV,OAAO,CAAe;IACtB,MAAM,CAAkB;IACxB,SAAS,CAAY;IACrB,MAAM,CAAe;IACrB,OAAO,CAAgB;IACvB,KAAK,CAAe;IACpB,OAAO,GAAgB,IAAI,GAAG,EAAE,CAAC;IACjC,KAAK,CAAS;IACd,SAAS,GAAgB,IAAI,GAAG,EAAE,CAAC;IACnC,OAAO,GAAkB,EAAE,CAAC;IAC5B,OAAO,GAAgD,IAAI,GAAG,EAAE,CAAC;IAEzE,YAAY,UAAiC,EAAE;QAC7C,IAAI,CAAC,OAAO,GAAG,EAAE,GAAG,eAAe,EAAE,GAAG,OAAO,EAAE,CAAC;QAClD,IAAI,CAAC,MAAM,GAAG,IAAI,eAAe,EAAE,CAAC;QACpC,IAAI,CAAC,SAAS,GAAG,IAAI,SAAS,EAAE,CAAC;QACjC,IAAI,CAAC,MAAM,GAAG,IAAI,YAAY,EAAE,CAAC;QACjC,IAAI,CAAC,OAAO,GAAG,IAAI,aAAa,EAAE,CAAC;QACnC,IAAI,CAAC,KAAK,GAAG,IAAI,YAAY,CAAC,IAAI,CAAC,CAAC,CAAC,yBAAyB;QAE9D,qBAAqB;QACrB,IAAI,CAAC,KAAK,GAAG,IAAI,MAAM,CAAC;YACtB,WAAW,EAAE,IAAI,CAAC,OAAO,CAAC,WAAW;YACrC,QAAQ,EAAE,IAAI;YACd,WAAW,EAAE,IAAI,CAAC,OAAO,CAAC,SAAS;SACpC,CAAC,CAAC;IACL,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,CAAC,KAAK,CAAC,SAAmB;QAC9B,MAAM,cAAc,GAAG,SAAS,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC,CAAC;QAC3D,IAAI,CAAC,SAAS,GAAG,IAAI,GAAG,CAAC,cAAc,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,aAAa,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;QAEpE,gDAAgD;QAChD,KAAK,MAAM,GAAG,IAAI,cAAc,EAAE,CAAC;YACjC,IAAI,CAAC,UAAU,CAAC,GAAG,EAAE,CAAC,CAAC,CAAC;QAC1B,CAAC;QAED,sCAAsC;QACtC,IAAI,IAAI,CAAC,OAAO,CAAC,aAAa,EAAE,CAAC;YAC/B,IAAI,CAAC;gBACH,MAAM,
OAAO,CAAC,IAAI,CAAC;oBACjB,OAAO,CAAC,GAAG,CAAC,cAAc,CAAC,GAAG,CAAC,GAAG,CAAC,EAAE,CAAC,IAAI,CAAC,MAAM,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC;oBAC9D,IAAI,OAAO,CAAC,CAAC,CAAC,EAAE,MAAM,EAAE,EAAE,CAAC,UAAU,CAAC,GAAG,EAAE,CAAC,MAAM,CAAC,IAAI,KAAK,CAAC,SAAS,CAAC,CAAC,EAAE,IAAI,CAAC,CAAC;iBACjF,CAAC,CAAC;YACL,CAAC;YAAC,MAAM,CAAC;gBACP,6CAA6C;YAC/C,CAAC;QACH,CAAC;QAED,gDAAgD;QAChD,IAAI,IAAI,CAAC,OAAO,CAAC,UAAU,EAAE,CAAC;YAC5B,IAAI,CAAC;gBACH,MAAM,OAAO,CAAC,IAAI,CAAC;oBACjB,IAAI,CAAC,aAAa,CAAC,cAAc,CAAC;oBAClC,IAAI,OAAO,CAAC,CAAC,CAAC,EAAE,MAAM,EAAE,EAAE,CAAC,UAAU,CAAC,GAAG,EAAE,CAAC,MAAM,CAAC,IAAI,KAAK,CAAC,SAAS,CAAC,CAAC,EAAE,IAAI,CAAC,CAAC;iBACjF,CAAC,CAAC;YACL,CAAC;YAAC,MAAM,CAAC;gBACP,uCAAuC;YACzC,CAAC;QACH,CAAC;QAED,oDAAoD;QACpD,IAAI,aAAa,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;QAC/B,IAAI,YAAY,GAAG,CAAC,CAAC;QACrB,MAAM,WAAW,GAAG,KAAK,CAAC,CAAC,wCAAwC;QAEnE,OAAO,IAAI,CAAC,KAAK,CAAC,IAAI,GAAG,CAAC,IAAI,IAAI,CAAC,KAAK,CAAC,OAAO,GAAG,CAAC,IAAI,IAAI,CAAC,OAAO,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YAChF,0BAA0B;YAC1B,OAAO,IAAI,CAAC,OAAO,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBAC/B,MAAM,IAAI,CAAC,OAAO,CAAC,KAAK,EAAG,CAAC;gBAC5B,YAAY,EAAE,CAAC;gBACf,aAAa,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;YAC7B,CAAC;YAED,6BAA6B;YAC7B,IAAI,YAAY,IAAI,IAAI,CAAC,OAAO,CAAC,QAAQ,EAAE,CAAC;gBAC1C,MAAM;YACR,CAAC;YAED,uDAAuD;YACvD,IAAI,IAAI,CAAC,GAAG,EAAE,GAAG,aAAa,GAAG,WAAW,EAAE,CAAC;gBAC7C,MAAM;YACR,CAAC;YAED,8BAA8B;YAC9B,MAAM,IAAI,OAAO,CAAC,CAAC,CAAC,EAAE,CAAC,UAAU,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC;QAC5C,CAAC;QAED,gBAAgB;QAChB,OAAO,IAAI,CAAC,OAAO,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YAC/B,MAAM,IAAI,CAAC,OAAO,CAAC,KAAK,EAAG,CAAC;QAC9B,CAAC;IACH,CAAC;IAEO,KAAK,CAAC,aAAa,CAAC,IAAc;QACxC,KAAK,MAAM,GAAG,IAAI,IAAI,EAAE,CAAC;YACvB,IAAI,CAAC;gBACH,yCAAyC;gBACzC,MAAM,cAAc,GAAG,IAAI,CAAC,MAAM,CAAC,WAAW,CAAC,GAAG,CAAC,CAAC;gBACpD,IAAI,WAAW,GAAa,EAAE,CAAC;gBAE/B,IAAI,cAAc,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;oBAC9B,yBAAyB;oBACzB,WAAW,GAAG,MAAM,IAAI,CAAC,OAAO,CAAC,QAAQ,CAAC,cAAc,CAAC,CAAC,CAAC,CAAC,CAAC;gBAC/D,CAAC;qBAAM,CAAC;oBACN,WAAW,GAAG,MAAM,IAAI,CAAC,OA
AO,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;gBAC9C,CAAC;gBAED,2BAA2B;gBAC3B,MAAM,KAAK,GAAG,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC,OAAO,CAAC,QAAQ,EAAE,EAAE,CAAC,CAAC;gBAClD,KAAK,MAAM,IAAI,IAAI,WAAW,CAAC,KAAK,CAAC,CAAC,EAAE,KAAK,CAAC,EAAE,CAAC;oBAC/C,IAAI,IAAI,CAAC,WAAW,CAAC,IAAI,EAAE,CAAC,CAAC,EAAE,CAAC;wBAC9B,IAAI,CAAC,UAAU,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC;oBAC3B,CAAC;gBACH,CAAC;YACH,CAAC;YAAC,MAAM,CAAC;gBACP,wBAAwB;YAC1B,CAAC;QACH,CAAC;IACH,CAAC;IAEO,UAAU,CAAC,GAAW,EAAE,KAAa;QAC3C,MAAM,UAAU,GAAG,YAAY,CAAC,GAAG,CAAC,CAAC;QAErC,IAAI,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,UAAU,CAAC;YAAE,OAAO;QACzC,IAAI,IAAI,CAAC,OAAO,CAAC,IAAI,IAAI,IAAI,CAAC,OAAO,CAAC,QAAQ;YAAE,OAAO;QAEvD,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,UAAU,CAAC,CAAC;QAE7B,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,KAAK,IAAI,EAAE;YACxB,MAAM,IAAI,CAAC,UAAU,CAAC,GAAG,EAAE,KAAK,CAAC,CAAC;QACpC,CAAC,CAAC,CAAC;IACL,CAAC;IAEO,KAAK,CAAC,UAAU,CAAC,GAAW,EAAE,KAAa;QACjD,IAAI,CAAC;YACH,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,MAAM,CAAC,KAAK,CAAC,GAAG,EAAE;gBAC1C,OAAO,EAAE,IAAI,CAAC,OAAO,CAAC,OAAO;gBAC7B,WAAW,EAAE,IAAI,CAAC,OAAO,CAAC,WAAW;aACtC,CAAC,CAAC;YAEH,IAAI,MAAM,CAAC,OAAO,EAAE,CAAC;gBACnB,GAAG,CAAC,GAAG,CAAC,cAAc,GAAG,EAAE,CAAC,CAAC;gBAC7B,OAAO;YACT,CAAC;YAED,IAAI,MAAM,CAAC,UAAU,IAAI,GAAG,EAAE,CAAC;gBAC7B,GAAG,CAAC,GAAG,CAAC,UAAU,MAAM,CAAC,UAAU,KAAK,GAAG,EAAE,CAAC,CAAC;gBAC/C,OAAO;YACT,CAAC;YAED,MAAM,SAAS,GAAG,IAAI,CAAC,SAAS,CAAC,OAAO,CAAC,MAAM,CAAC,IAAI,EAAE,MAAM,CAAC,QAAQ,CAAC,CAAC;YAEvE,sCAAsC;YACtC,IAAI,SAAS,CAAC,SAAS,GAAG,EAAE,EAAE,CAAC;gBAC7B,GAAG,CAAC,GAAG,CAAC,kBAAkB,GAAG,EAAE,CAAC,CAAC;gBACjC,OAAO;YACT,CAAC;YAED,iBAAiB;YACjB,IAAI,CAAC,OAAO,CAAC,IAAI,CAAC;gBAChB,GAAG;gBACH,QAAQ,EAAE,MAAM,CAAC,QAAQ;gBACzB,SAAS;gBACT,KAAK;gBACL,MAAM,EAAE,MAAM,CAAC,MAAM;aACtB,CAAC,CAAC;YAEH,yBAAyB;YACzB,IAAI,KAAK,GAAG,IAAI,CAAC,OAAO,CAAC,KAAK,EAAE,CAAC;gBAC/B,KAAK,MAAM,IAAI,IAAI,SAAS,CAAC,KAAK,EAAE,CAAC;oBACnC,IAAI,IAAI,CAAC,WAAW,CAAC,IAAI,EAAE,KAAK,GAAG,CAAC,CAAC,EAAE,CAAC;wBACtC,IAAI,CAAC,UAAU,CAAC,IAAI,EAAE,KAAK,GAAG,CAAC,CAAC,CAAC;oBACnC,CAAC;gBACH,CAAC;YACH,CAAC;QACH,CAAC;QAAC,OAAO,KAA
U,EAAE,CAAC;YACpB,GAAG,CAAC,GAAG,CAAC,aAAa,GAAG,MAAM,KAAK,CAAC,OAAO,EAAE,CAAC,CAAC;QACjD,CAAC;IACH,CAAC;IAEO,WAAW,CAAC,GAAW,EAAE,KAAa;QAC5C,IAAI,CAAC;YACH,YAAY;YACZ,MAAM,UAAU,GAAG,YAAY,CAAC,GAAG,CAAC,CAAC;YAErC,kBAAkB;YAClB,IAAI,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,UAAU,CAAC;gBAAE,OAAO,KAAK,CAAC;YAE/C,YAAY;YACZ,IAAI,IAAI,CAAC,OAAO,CAAC,IAAI,IAAI,IAAI,CAAC,OAAO,CAAC,QAAQ;gBAAE,OAAO,KAAK,CAAC;YAE7D,cAAc;YACd,IAAI,KAAK,GAAG,IAAI,CAAC,OAAO,CAAC,KAAK;gBAAE,OAAO,KAAK,CAAC;YAE7C,gBAAgB;YAChB,IAAI,aAAa,CAAC,GAAG,CAAC;gBAAE,OAAO,KAAK,CAAC;YAErC,mBAAmB;YACnB,MAAM,MAAM,GAAG,aAAa,CAAC,GAAG,CAAC,CAAC;YAClC,IAAI,CAAC,IAAI,CAAC,SAAS,CAAC,GAAG,CAAC,MAAM,CAAC;gBAAE,OAAO,KAAK,CAAC;YAE9C,aAAa;YACb,IAAI,IAAI,CAAC,OAAO,CAAC,aAAa,IAAI,CAAC,IAAI,CAAC,MAAM,CAAC,SAAS,CAAC,GAAG,CAAC;gBAAE,OAAO,KAAK,CAAC;YAE5E,2BAA2B;YAC3B,IAAI,IAAI,CAAC,OAAO,CAAC,cAAc,IAAI,CAAC,IAAI,CAAC,OAAO,CAAC,cAAc,CAAC,IAAI,CAAC,GAAG,CAAC;gBAAE,OAAO,KAAK,CAAC;YACxF,IAAI,IAAI,CAAC,OAAO,CAAC,cAAc,IAAI,IAAI,CAAC,OAAO,CAAC,cAAc,CAAC,IAAI,CAAC,GAAG,CAAC;gBAAE,OAAO,KAAK,CAAC;YAEvF,wCAAwC;YACxC,MAAM,WAAW,GAAG,IAAI,CAAC,KAAK,CAAC,UAAU,CAAC,GAAG,CAAC,CAAC;YAC/C,IAAI,WAAW,CAAC,IAAI,EAAE,CAAC;gBACrB,GAAG,CAAC,GAAG,CAAC,WAAW,GAAG,KAAK,WAAW,CAAC,MAAM,GAAG,CAAC,CAAC;gBAClD,OAAO,KAAK,CAAC;YACf,CAAC;YAED,OAAO,IAAI,CAAC;QACd,CAAC;QAAC,MAAM,CAAC;YACP,OAAO,KAAK,CAAC;QACf,CAAC;IACH,CAAC;IAED;;OAEG;IACH,QAAQ;QACN,OAAO;YACL,GAAG,IAAI,CAAC,MAAM,CAAC,QAAQ,EAAE;YACzB,OAAO,EAAE,IAAI,CAAC,OAAO,CAAC,IAAI;YAC1B,MAAM,EAAE,IAAI,CAAC,KAAK,CAAC,IAAI;YACvB,OAAO,EAAE,IAAI,CAAC,KAAK,CAAC,OAAO;YAC3B,KAAK,EAAE,IAAI,CAAC,KAAK,CAAC,QAAQ,EAAE;SAC7B,CAAC;IACJ,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,KAAK;QACT,IAAI,CAAC,KAAK,CAAC,KAAK,EAAE,CAAC;QACnB,MAAM,IAAI,CAAC,MAAM,CAAC,KAAK,EAAE,CAAC;IAC5B,CAAC;CACF;AAED,OAAO,EAAE,YAAY,EAAE,MAAM,aAAa,CAAC;AAC3C,OAAO,EAAE,aAAa,EAAE,MAAM,cAAc,CAAC"}
@@ -0,0 +1,26 @@
1
+ /**
2
+ * Robots.txt parser
3
+ */
4
/**
 * Fetches robots.txt files and answers allow/deny, crawl-delay and sitemap
 * questions for URLs. Parsed data is kept per hostname for the lifetime of
 * the instance.
 */
export declare class RobotsParser {
    /** Parsed rule groups, stored per hostname. */
    private rules;
    /** Sitemap URLs declared in robots.txt, stored per hostname. */
    private sitemaps;
    /**
     * Fetch and parse robots.txt for a URL.
     *
     * Requests `/robots.txt` on the URL's origin. A hostname that already has
     * stored rules is not fetched again. If the response is not OK or the
     * request fails, the hostname is stored with no rules, which allows everything.
     *
     * @param url Absolute URL on the site whose robots.txt should be loaded.
     */
    parse(url: string): Promise<void>;
    /** Parses robots.txt text and stores its rules and sitemaps for a hostname. */
    private parseRobotsTxt;
    /**
     * Check if a URL is allowed by robots.txt.
     *
     * @param url Absolute URL to check; its path and query string are matched against the rules.
     * @param userAgent User-agent name to match rule groups against; the implementation defaults to `*`.
     * @returns `true` when the URL may be fetched. Also `true` when no rules
     *   are stored for the hostname or the URL cannot be parsed.
     */
    isAllowed(url: string, userAgent?: string): boolean;
    /**
     * Get crawl delay for a host.
     *
     * @param url Absolute URL on the host of interest.
     * @param userAgent User-agent name to match rule groups against; the implementation defaults to `*`.
     * @returns The numeric `Crawl-delay` value (presumably seconds, as written
     *   in robots.txt), or `undefined` when none is known or the URL is invalid.
     */
    getCrawlDelay(url: string, userAgent?: string): number | undefined;
    /**
     * Get sitemaps declared in robots.txt.
     *
     * @param url Absolute URL on the host of interest.
     * @returns The `Sitemap:` values recorded for the URL's hostname, or an
     *   empty array when there are none or the URL is invalid.
     */
    getSitemaps(url: string): string[];
    /** Tests a URL path against one robots.txt path pattern (`*` and trailing `$` supported). */
    private pathMatches;
}
26
+ //# sourceMappingURL=robots.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"robots.d.ts","sourceRoot":"","sources":["../../../src/crawler/robots.ts"],"names":[],"mappings":"AAAA;;GAEG;AASH,qBAAa,YAAY;IACvB,OAAO,CAAC,KAAK,CAAwC;IACrD,OAAO,CAAC,QAAQ,CAAoC;IAEpD;;OAEG;IACG,KAAK,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC;IA6BvC,OAAO,CAAC,cAAc;IAoEtB;;OAEG;IACH,SAAS,CAAC,GAAG,EAAE,MAAM,EAAE,SAAS,SAAM,GAAG,OAAO;IA0ChD;;OAEG;IACH,aAAa,CAAC,GAAG,EAAE,MAAM,EAAE,SAAS,SAAM,GAAG,MAAM,GAAG,SAAS;IAiB/D;;OAEG;IACH,WAAW,CAAC,GAAG,EAAE,MAAM,GAAG,MAAM,EAAE;IASlC,OAAO,CAAC,WAAW;CAoBpB"}
@@ -0,0 +1,179 @@
1
+ /**
2
+ * Robots.txt parser
3
+ */
4
/**
 * Robots.txt parser.
 *
 * Fetches robots.txt once per hostname, stores its rule groups and sitemap
 * declarations, and evaluates URLs against them.
 */
export class RobotsParser {
    /** hostname -> array of rule groups `{ userAgent, allow, disallow, crawlDelay? }`. */
    rules = new Map();
    /** hostname -> sitemap URLs declared in that host's robots.txt. */
    sitemaps = new Map();

    /**
     * Fetch and parse robots.txt for a URL.
     *
     * Never rejects: an invalid URL is ignored, and a missing or unreachable
     * robots.txt is recorded as "no rules" (everything allowed).
     *
     * @param url Absolute URL on the site whose robots.txt should be loaded.
     */
    async parse(url) {
        let parsedUrl;
        try {
            parsedUrl = new URL(url);
        }
        catch {
            // Not an absolute URL: there is no origin to fetch from and no
            // hostname to record. isAllowed() already treats such URLs as allowed.
            return;
        }
        const host = parsedUrl.hostname;
        // Already parsed
        if (this.rules.has(host))
            return;
        try {
            const response = await fetch(`${parsedUrl.origin}/robots.txt`, {
                headers: { 'User-Agent': 'maw/1.0' },
                signal: AbortSignal.timeout(5000),
            });
            if (!response.ok) {
                // No robots.txt - allow everything
                this.rules.set(host, []);
                return;
            }
            this.parseRobotsTxt(host, await response.text());
        }
        catch {
            // Failed to fetch - allow everything
            this.rules.set(host, []);
        }
    }

    /**
     * Parse robots.txt text and store the result for `host`.
     *
     * Consecutive `User-agent` lines form one group and share the directives
     * that follow them; the next `User-agent` line after any directive starts
     * a new group. Text after `#` on a line is a comment and is discarded.
     *
     * @param host Hostname the rules belong to.
     * @param text Raw robots.txt content.
     */
    parseRobotsTxt(host, text) {
        const rules = [];
        const sitemaps = [];
        // Rule objects for the user-agent lines of the group being read.
        let group = [];
        // True once the current group has received a directive; the next
        // User-agent line then opens a new group instead of joining this one.
        let groupHasDirectives = false;
        for (const rawLine of text.split(/\r\n|\n|\r/)) {
            const hashIndex = rawLine.indexOf('#');
            const line = (hashIndex === -1 ? rawLine : rawLine.slice(0, hashIndex)).trim();
            // Skip comments and empty lines
            if (!line)
                continue;
            const colonIndex = line.indexOf(':');
            if (colonIndex === -1)
                continue;
            const directive = line.slice(0, colonIndex).trim().toLowerCase();
            const value = line.slice(colonIndex + 1).trim();
            switch (directive) {
                case 'user-agent': {
                    if (groupHasDirectives) {
                        group = [];
                        groupHasDirectives = false;
                    }
                    const rule = {
                        userAgent: value.toLowerCase(),
                        allow: [],
                        disallow: [],
                    };
                    group.push(rule);
                    rules.push(rule);
                    break;
                }
                case 'allow':
                    groupHasDirectives = true;
                    if (value) {
                        for (const rule of group)
                            rule.allow.push(value);
                    }
                    break;
                case 'disallow':
                    // An empty Disallow means "nothing is disallowed"; it adds
                    // no pattern but still closes the run of User-agent lines.
                    groupHasDirectives = true;
                    if (value) {
                        for (const rule of group)
                            rule.disallow.push(value);
                    }
                    break;
                case 'crawl-delay': {
                    groupHasDirectives = true;
                    const delay = parseFloat(value);
                    if (!isNaN(delay)) {
                        for (const rule of group)
                            rule.crawlDelay = delay;
                    }
                    break;
                }
                case 'sitemap':
                    // Sitemap lines are independent of user-agent groups.
                    if (value) {
                        sitemaps.push(value);
                    }
                    break;
            }
        }
        this.rules.set(host, rules);
        this.sitemaps.set(host, sitemaps);
    }

    /**
     * Check if a URL is allowed by robots.txt.
     *
     * The groups for `userAgent` are used when present, otherwise the `*`
     * groups. Among all matching Allow/Disallow patterns the longest one
     * decides; on a tie, Allow wins.
     *
     * @param url Absolute URL to check (path + query string are matched).
     * @param userAgent User-agent name, compared case-insensitively. Defaults to `*`.
     * @returns `true` if the URL may be fetched; also `true` when the host has
     *   no stored rules or the URL cannot be parsed.
     */
    isAllowed(url, userAgent = '*') {
        try {
            const parsedUrl = new URL(url);
            const path = parsedUrl.pathname + parsedUrl.search;
            const hostRules = this.rules.get(parsedUrl.hostname);
            if (!hostRules || hostRules.length === 0) {
                return true; // No rules = allow
            }
            let longestAllow = -1;
            let longestDisallow = -1;
            for (const rule of this.rulesFor(hostRules, userAgent)) {
                for (const pattern of rule.allow) {
                    if (pattern.length > longestAllow && this.pathMatches(path, pattern)) {
                        longestAllow = pattern.length;
                    }
                }
                for (const pattern of rule.disallow) {
                    if (pattern.length > longestDisallow && this.pathMatches(path, pattern)) {
                        longestDisallow = pattern.length;
                    }
                }
            }
            // Nothing disallows the path, or an equally/more specific Allow overrides it.
            return longestDisallow === -1 || longestAllow >= longestDisallow;
        }
        catch {
            return true;
        }
    }

    /**
     * Get crawl delay for a host.
     *
     * @param url Absolute URL on the host of interest.
     * @param userAgent User-agent name, compared case-insensitively. Defaults to `*`.
     * @returns The first Crawl-delay value found in the applicable groups, or
     *   `undefined` if none is set, the host is unknown, or the URL is invalid.
     */
    getCrawlDelay(url, userAgent = '*') {
        try {
            const host = new URL(url).hostname;
            const hostRules = this.rules.get(host);
            if (!hostRules)
                return undefined;
            const rule = this.rulesFor(hostRules, userAgent).find(r => r.crawlDelay !== undefined);
            return rule?.crawlDelay;
        }
        catch {
            return undefined;
        }
    }

    /**
     * Get sitemaps declared in robots.txt.
     *
     * @param url Absolute URL on the host of interest.
     * @returns Sitemap URLs for the host, or an empty array if none are known
     *   or the URL is invalid.
     */
    getSitemaps(url) {
        try {
            const host = new URL(url).hostname;
            return this.sitemaps.get(host) || [];
        }
        catch {
            return [];
        }
    }

    /**
     * Pick the rule groups that apply to a user agent: the groups naming it
     * exactly if there are any, otherwise the wildcard (`*`) groups.
     */
    rulesFor(hostRules, userAgent) {
        const wanted = userAgent.toLowerCase();
        const exact = hostRules.filter(r => r.userAgent === wanted);
        return exact.length > 0 ? exact : hostRules.filter(r => r.userAgent === '*');
    }

    /**
     * Test a URL path against a robots.txt path pattern.
     * Patterns are prefix matches; `*` matches any run of characters and a
     * trailing `$` anchors the pattern to the end of the path.
     */
    pathMatches(path, pattern) {
        // Simple pattern matching (supports * and $ wildcards)
        if (pattern === '/')
            return true;
        // Convert pattern to regex
        let regex = pattern
            .replace(/[.+?^${}()|[\]\\]/g, '\\$&') // Escape special chars
            // A run of * means the same as one *. Collapsing it keeps a
            // hostile robots.txt from producing a regex like `.*.*.*.*`,
            // which backtracks heavily on non-matching paths.
            .replace(/\*+/g, '.*');
        // $ at end means exact match
        if (regex.endsWith('\\$')) {
            regex = regex.slice(0, -2) + '$';
        }
        try {
            return new RegExp('^' + regex).test(path);
        }
        catch {
            return path.startsWith(pattern);
        }
    }
}
179
+ //# sourceMappingURL=robots.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"robots.js","sourceRoot":"","sources":["../../../src/crawler/robots.ts"],"names":[],"mappings":"AAAA;;GAEG;AASH,MAAM,OAAO,YAAY;IACf,KAAK,GAA8B,IAAI,GAAG,EAAE,CAAC;IAC7C,QAAQ,GAA0B,IAAI,GAAG,EAAE,CAAC;IAEpD;;OAEG;IACH,KAAK,CAAC,KAAK,CAAC,GAAW;QACrB,IAAI,CAAC;YACH,MAAM,SAAS,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC;YAC/B,MAAM,SAAS,GAAG,GAAG,SAAS,CAAC,MAAM,aAAa,CAAC;YACnD,MAAM,IAAI,GAAG,SAAS,CAAC,QAAQ,CAAC;YAEhC,iBAAiB;YACjB,IAAI,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,IAAI,CAAC;gBAAE,OAAO;YAEjC,MAAM,QAAQ,GAAG,MAAM,KAAK,CAAC,SAAS,EAAE;gBACtC,OAAO,EAAE,EAAE,YAAY,EAAE,SAAS,EAAE;gBACpC,MAAM,EAAE,WAAW,CAAC,OAAO,CAAC,IAAI,CAAC;aAClC,CAAC,CAAC;YAEH,IAAI,CAAC,QAAQ,CAAC,EAAE,EAAE,CAAC;gBACjB,mCAAmC;gBACnC,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC;gBACzB,OAAO;YACT,CAAC;YAED,MAAM,IAAI,GAAG,MAAM,QAAQ,CAAC,IAAI,EAAE,CAAC;YACnC,IAAI,CAAC,cAAc,CAAC,IAAI,EAAE,IAAI,CAAC,CAAC;QAClC,CAAC;QAAC,MAAM,CAAC;YACP,qCAAqC;YACrC,MAAM,IAAI,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC,QAAQ,CAAC;YACnC,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC;QAC3B,CAAC;IACH,CAAC;IAEO,cAAc,CAAC,IAAY,EAAE,IAAY;QAC/C,MAAM,KAAK,GAAiB,EAAE,CAAC;QAC/B,MAAM,QAAQ,GAAa,EAAE,CAAC;QAC9B,IAAI,WAAW,GAAsB,IAAI,CAAC;QAE1C,MAAM,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;QAE/B,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;YACzB,MAAM,OAAO,GAAG,IAAI,CAAC,IAAI,EAAE,CAAC;YAE5B,gCAAgC;YAChC,IAAI,CAAC,OAAO,IAAI,OAAO,CAAC,UAAU,CAAC,GAAG,CAAC;gBAAE,SAAS;YAElD,MAAM,UAAU,GAAG,OAAO,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC;YACxC,IAAI,UAAU,KAAK,CAAC,CAAC;gBAAE,SAAS;YAEhC,MAAM,SAAS,GAAG,OAAO,CAAC,KAAK,CAAC,CAAC,EAAE,UAAU,CAAC,CAAC,WAAW,EAAE,CAAC,IAAI,EAAE,CAAC;YACpE,MAAM,KAAK,GAAG,OAAO,CAAC,KAAK,CAAC,UAAU,GAAG,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;YAEnD,QAAQ,SAAS,EAAE,CAAC;gBAClB,KAAK,YAAY;oBACf,IAAI,WAAW,EAAE,CAAC;wBAChB,KAAK,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC;oBAC1B,CAAC;oBACD,WAAW,GAAG;wBACZ,SAAS,EAAE,KAAK,CAAC,WAAW,EAAE;wBAC9B,KAAK,EAAE,EAAE;wBACT,QAAQ,EAAE,EAAE;qBACb,CAAC;oBACF,MAAM;gBAER,KAAK,OAAO;oBACV,IAAI,WAAW,IAAI,KAAK,EAAE,CAAC;wBACzB,WAAW,CAAC,
KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;oBAChC,CAAC;oBACD,MAAM;gBAER,KAAK,UAAU;oBACb,IAAI,WAAW,IAAI,KAAK,EAAE,CAAC;wBACzB,WAAW,CAAC,QAAQ,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;oBACnC,CAAC;oBACD,MAAM;gBAER,KAAK,aAAa;oBAChB,IAAI,WAAW,EAAE,CAAC;wBAChB,MAAM,KAAK,GAAG,UAAU,CAAC,KAAK,CAAC,CAAC;wBAChC,IAAI,CAAC,KAAK,CAAC,KAAK,CAAC,EAAE,CAAC;4BAClB,WAAW,CAAC,UAAU,GAAG,KAAK,CAAC;wBACjC,CAAC;oBACH,CAAC;oBACD,MAAM;gBAER,KAAK,SAAS;oBACZ,IAAI,KAAK,EAAE,CAAC;wBACV,QAAQ,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;oBACvB,CAAC;oBACD,MAAM;YACV,CAAC;QACH,CAAC;QAED,IAAI,WAAW,EAAE,CAAC;YAChB,KAAK,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC;QAC1B,CAAC;QAED,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,IAAI,EAAE,KAAK,CAAC,CAAC;QAC5B,IAAI,CAAC,QAAQ,CAAC,GAAG,CAAC,IAAI,EAAE,QAAQ,CAAC,CAAC;IACpC,CAAC;IAED;;OAEG;IACH,SAAS,CAAC,GAAW,EAAE,SAAS,GAAG,GAAG;QACpC,IAAI,CAAC;YACH,MAAM,SAAS,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC;YAC/B,MAAM,IAAI,GAAG,SAAS,CAAC,QAAQ,CAAC;YAChC,MAAM,IAAI,GAAG,SAAS,CAAC,QAAQ,GAAG,SAAS,CAAC,MAAM,CAAC;YAEnD,MAAM,SAAS,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC;YACvC,IAAI,CAAC,SAAS,IAAI,SAAS,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;gBACzC,OAAO,IAAI,CAAC,CAAC,mBAAmB;YAClC,CAAC;YAED,iDAAiD;YACjD,MAAM,aAAa,GAAG,SAAS,CAAC,MAAM,CACpC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,SAAS,KAAK,SAAS,CAAC,WAAW,EAAE,IAAI,CAAC,CAAC,SAAS,KAAK,GAAG,CACpE,CAAC;YAEF,IAAI,aAAa,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;gBAC/B,OAAO,IAAI,CAAC,CAAC,4BAA4B;YAC3C,CAAC;YAED,oDAAoD;YACpD,KAAK,MAAM,IAAI,IAAI,aAAa,EAAE,CAAC;gBACjC,uBAAuB;gBACvB,KAAK,MAAM,QAAQ,IAAI,IAAI,CAAC,QAAQ,EAAE,CAAC;oBACrC,IAAI,IAAI,CAAC,WAAW,CAAC,IAAI,EAAE,QAAQ,CAAC,EAAE,CAAC;wBACrC,yCAAyC;wBACzC,KAAK,MAAM,KAAK,IAAI,IAAI,CAAC,KAAK,EAAE,CAAC;4BAC/B,IAAI,IAAI,CAAC,WAAW,CAAC,IAAI,EAAE,KAAK,CAAC,IAAI,KAAK,CAAC,MAAM,GAAG,QAAQ,CAAC,MAAM,EAAE,CAAC;gCACpE,OAAO,IAAI,CAAC;4BACd,CAAC;wBACH,CAAC;wBACD,OAAO,KAAK,CAAC;oBACf,CAAC;gBACH,CAAC;YACH,CAAC;YAED,OAAO,IAAI,CAAC;QACd,CAAC;QAAC,MAAM,CAAC;YACP,OAAO,IAAI,CAAC;QACd,CAAC;IACH,CAAC;IAED;;OAEG;IACH,aAAa,CAAC,GAAW,EAAE,SAAS,GAAG,GAAG;QACxC,IAAI,CAAC;YACH,MAAM,IAAI,GAAG,IAAI,GAAG,CAAC,GAA
G,CAAC,CAAC,QAAQ,CAAC;YACnC,MAAM,SAAS,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC;YAEvC,IAAI,CAAC,SAAS;gBAAE,OAAO,SAAS,CAAC;YAEjC,MAAM,IAAI,GAAG,SAAS,CAAC,IAAI,CACzB,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,SAAS,KAAK,SAAS,CAAC,WAAW,EAAE,IAAI,CAAC,CAAC,SAAS,KAAK,GAAG,CACpE,CAAC;YAEF,OAAO,IAAI,EAAE,UAAU,CAAC;QAC1B,CAAC;QAAC,MAAM,CAAC;YACP,OAAO,SAAS,CAAC;QACnB,CAAC;IACH,CAAC;IAED;;OAEG;IACH,WAAW,CAAC,GAAW;QACrB,IAAI,CAAC;YACH,MAAM,IAAI,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC,QAAQ,CAAC;YACnC,OAAO,IAAI,CAAC,QAAQ,CAAC,GAAG,CAAC,IAAI,CAAC,IAAI,EAAE,CAAC;QACvC,CAAC;QAAC,MAAM,CAAC;YACP,OAAO,EAAE,CAAC;QACZ,CAAC;IACH,CAAC;IAEO,WAAW,CAAC,IAAY,EAAE,OAAe;QAC/C,uDAAuD;QACvD,IAAI,OAAO,KAAK,GAAG;YAAE,OAAO,IAAI,CAAC;QAEjC,2BAA2B;QAC3B,IAAI,KAAK,GAAG,OAAO;aAChB,OAAO,CAAC,oBAAoB,EAAE,MAAM,CAAC,CAAC,uBAAuB;aAC7D,OAAO,CAAC,KAAK,EAAE,IAAI,CAAC,CAAC,CAAC,eAAe;QAExC,6BAA6B;QAC7B,IAAI,KAAK,CAAC,QAAQ,CAAC,KAAK,CAAC,EAAE,CAAC;YAC1B,KAAK,GAAG,KAAK,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,GAAG,GAAG,CAAC;QACnC,CAAC;QAED,IAAI,CAAC;YACH,OAAO,IAAI,MAAM,CAAC,GAAG,GAAG,KAAK,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAC5C,CAAC;QAAC,MAAM,CAAC;YACP,OAAO,IAAI,CAAC,UAAU,CAAC,OAAO,CAAC,CAAC;QAClC,CAAC;IACH,CAAC;CACF"}
@@ -0,0 +1,36 @@
1
+ /**
2
+ * Sitemap.xml parser using sitemapper
3
+ */
4
/**
 * One sitemap entry with its optional metadata.
 * NOTE(review): field names follow the sitemap protocol's `<url>` child
 * elements; the code that fills them is not visible here — confirm formats.
 */
export interface SitemapUrl {
    /** URL of the page. */
    loc: string;
    /** Last-modified value as a string, when present (format not established here). */
    lastmod?: string;
    /** Change-frequency hint as a string, when present. */
    changefreq?: string;
    /** Priority hint as a number, when present (range not established here). */
    priority?: number;
}
10
/**
 * Sitemap parser.
 * NOTE(review): only the declaration is visible here; the behaviour notes
 * below come from the existing comments and from how the crawler calls this
 * class — confirm against the implementation.
 */
export declare class SitemapParser {
    /** Private cache; presumably holds previously parsed results — TODO confirm key and value. */
    private cache;
    /** Private handle to the underlying sitemap library instance. */
    private sitemapper;
    constructor();
    /**
     * Parse sitemap for a URL and return all URLs.
     *
     * The crawler calls this with a site's start URL when robots.txt declares
     * no sitemap, so the sitemap location is presumably derived from `url`.
     *
     * @param url URL of the site (or a page on it).
     * @returns The page URLs found, as strings.
     */
    parse(url: string): Promise<string[]>;
    /**
     * Parse sitemap from a specific URL.
     *
     * The crawler calls this with a sitemap URL taken from robots.txt.
     *
     * @param sitemapUrl URL of the sitemap document itself.
     * @returns The page URLs found, as strings.
     */
    parseUrl(sitemapUrl: string): Promise<string[]>;
    private fetchSitemap;
    private parseSitemapContent;
    private parseSitemapIndex;
    private parseUrlset;
    /**
     * Parse sitemap for a URL and return all URLs with metadata.
     * Uses sitemapper for robust parsing of sitemap indexes and news sitemaps.
     *
     * @param url URL of the site (or a page on it).
     * @returns One `SitemapUrl` per entry found.
     */
    parseWithMetadata(url: string): Promise<SitemapUrl[]>;
    /**
     * Parse sitemap with full metadata.
     *
     * Synchronous variant that works on content already in memory.
     *
     * @param content Sitemap document text — presumably raw XML; TODO confirm.
     * @returns One `SitemapUrl` per entry found.
     */
    parseWithMeta(content: string): SitemapUrl[];
}
36
+ //# sourceMappingURL=sitemap.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"sitemap.d.ts","sourceRoot":"","sources":["../../../src/crawler/sitemap.ts"],"names":[],"mappings":"AAAA;;GAEG;AAKH,MAAM,WAAW,UAAU;IACzB,GAAG,EAAE,MAAM,CAAC;IACZ,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,QAAQ,CAAC,EAAE,MAAM,CAAC;CACnB;AAED,qBAAa,aAAa;IACxB,OAAO,CAAC,KAAK,CAAoC;IACjD,OAAO,CAAC,UAAU,CAAa;;IAW/B;;OAEG;IACG,KAAK,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,EAAE,CAAC;IAuC3C;;OAEG;IACG,QAAQ,CAAC,UAAU,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,EAAE,CAAC;YAYvC,YAAY;YAkBZ,mBAAmB;IA8BjC,OAAO,CAAC,iBAAiB;IAYzB,OAAO,CAAC,WAAW;IAYnB;;;OAGG;IACG,iBAAiB,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO,CAAC,UAAU,EAAE,CAAC;IA8C3D;;OAEG;IACH,aAAa,CAAC,OAAO,EAAE,MAAM,GAAG,UAAU,EAAE;CA0B7C"}