@redstone-md/mapr 0.0.1-alpha → 0.0.3-alpha

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/lib/scraper.ts CHANGED
@@ -6,11 +6,15 @@ import {
  discoveredArtifactSchema,
  extractArtifactCandidates,
  extractNestedCandidates,
+ isIgnoredContentType,
+ isAnalyzableArtifactType,
  type ArtifactCandidate,
  type DiscoveredArtifact,
  } from "./artifacts";
  import { WasmModuleSummarizer } from "./wasm";

+ const MAPR_USER_AGENT = "mapr";
+
  const httpUrlSchema = z
  .string()
  .trim()
@@ -18,8 +22,9 @@ const httpUrlSchema = z
  .refine((value) => /^https?:\/\//.test(value), "Expected an http or https URL.");

  const scraperOptionsSchema = z.object({
- maxPages: z.number().int().positive().default(10),
- maxArtifacts: z.number().int().positive().default(200),
+ maxPages: z.number().int().positive().default(20),
+ maxArtifacts: z.number().int().positive().default(400),
+ maxDepth: z.number().int().nonnegative().default(3),
  });

  export interface ScrapeResult {
@@ -30,12 +35,30 @@ export interface ScrapeResult {
  }

  type FetchLike = (input: string | URL | Request, init?: RequestInit) => Promise<Response>;
- type ScraperOptions = z.input<typeof scraperOptionsSchema>;
+ type NumericScraperOptions = z.input<typeof scraperOptionsSchema>;
+ type QueueEntry = { candidate: ArtifactCandidate; depth: number };
+ type CrawlScope = "site" | "page";
+
+ export interface ScraperProgressEvent {
+ message: string;
+ url: string;
+ type: ArtifactCandidate["type"];
+ depth: number;
+ }
+
+ interface ScraperOptions extends NumericScraperOptions {
+ onProgress?: (event: ScraperProgressEvent) => void;
+ }

  function isPageCandidate(candidate: ArtifactCandidate, rootOrigin: string): boolean {
  return candidate.type === "html" && new URL(candidate.url).origin === rootOrigin;
  }

+ function isRootLikeEntry(url: string): boolean {
+ const pathname = new URL(url).pathname.toLowerCase();
+ return pathname === "/" || pathname === "" || pathname.endsWith("/index.html") || pathname.endsWith("/index.htm");
+ }
+
  function shouldFollowCandidate(candidate: ArtifactCandidate, rootOrigin: string): boolean {
  if (candidate.type === "html") {
  return new URL(candidate.url).origin === rootOrigin;
@@ -44,38 +67,133 @@ function shouldFollowCandidate(candidate: ArtifactCandidate, rootOrigin: string)
  return true;
  }

+ function parseSitemapXml(xml: string, rootOrigin: string): ArtifactCandidate[] {
+ const candidates = new Map<string, ArtifactCandidate>();
+ const regex = /<loc>([^<]+)<\/loc>/gi;
+ let match: RegExpExecArray | null;
+
+ while ((match = regex.exec(xml)) !== null) {
+ try {
+ const url = new URL(match[1] ?? "").toString();
+ if (new URL(url).origin !== rootOrigin) {
+ continue;
+ }
+
+ const candidate = artifactCandidateSchema.safeParse({
+ url,
+ type: "html",
+ discoveredFrom: "sitemap:loc",
+ });
+
+ if (candidate.success) {
+ candidates.set(candidate.data.url, candidate.data);
+ }
+ } catch {
+ continue;
+ }
+ }
+
+ return [...candidates.values()];
+ }
+
+ function parseRobotsSitemaps(robotsText: string): string[] {
+ return robotsText
+ .split(/\r?\n/)
+ .map((line) => line.trim())
+ .filter((line) => /^sitemap:/i.test(line))
+ .map((line) => line.replace(/^sitemap:\s*/i, "").trim())
+ .filter(Boolean);
+ }
+
+ function summarizeSourceMap(rawMap: string, mapUrl: string): string {
+ try {
+ const payload = z
+ .object({
+ version: z.number().optional(),
+ file: z.string().optional(),
+ sourceRoot: z.string().optional(),
+ sources: z.array(z.string()).optional(),
+ sourcesContent: z.array(z.string().nullable()).optional(),
+ })
+ .parse(JSON.parse(rawMap) as unknown);
+
+ const sources = payload.sources ?? [];
+ const sourcesContent = payload.sourcesContent ?? [];
+ const lines = [`Source map: ${mapUrl}`, `Mapped sources: ${sources.length}`];
+
+ for (let index = 0; index < sources.length; index += 1) {
+ const sourceName = sources[index];
+ const sourceContent = sourcesContent[index];
+ if (!sourceName) {
+ continue;
+ }
+
+ lines.push(`--- Source: ${sourceName}`);
+ if (typeof sourceContent === "string" && sourceContent.length > 0) {
+ lines.push(sourceContent);
+ }
+ }
+
+ return lines.join("\n");
+ } catch {
+ return rawMap;
+ }
+ }
+
  export class BundleScraper {
  private readonly options: z.infer<typeof scraperOptionsSchema>;
  private readonly wasmSummarizer = new WasmModuleSummarizer();
+ private readonly onProgress: ((event: ScraperProgressEvent) => void) | undefined;

  public constructor(
  private readonly fetcher: FetchLike = fetch,
  options: ScraperOptions = {},
  ) {
  this.options = scraperOptionsSchema.parse(options);
+ this.onProgress = options.onProgress;
  }

  public async scrape(pageUrl: string): Promise<ScrapeResult> {
  const validatedPageUrl = httpUrlSchema.parse(pageUrl);
  const rootOrigin = new URL(validatedPageUrl).origin;
+ const crawlScope: CrawlScope = isRootLikeEntry(validatedPageUrl) ? "site" : "page";
  const visitedUrls = new Set<string>();
  const htmlPages = new Set<string>();
  const artifacts: DiscoveredArtifact[] = [];
- const queue: ArtifactCandidate[] = [
- artifactCandidateSchema.parse({
- url: validatedPageUrl,
- type: "html",
- discoveredFrom: "root",
- }),
+ const queue: QueueEntry[] = [
+ {
+ candidate: artifactCandidateSchema.parse({
+ url: validatedPageUrl,
+ type: "html",
+ discoveredFrom: "root",
+ }),
+ depth: 0,
+ },
  ];

+ if (crawlScope === "site") {
+ queue.push(...(await this.discoverSupplementalPages(rootOrigin)).map((candidate) => ({ candidate, depth: 1 })));
+ }
+
  while (queue.length > 0) {
  if (artifacts.length >= this.options.maxArtifacts) {
  break;
  }

- const candidate = queue.shift();
- if (!candidate || visitedUrls.has(candidate.url)) {
+ const entry = queue.shift();
+ if (!entry || visitedUrls.has(entry.candidate.url)) {
+ continue;
+ }
+
+ const { candidate, depth } = entry;
+
+ if (depth > this.options.maxDepth) {
+ this.emitProgress({
+ message: `Skipping ${candidate.type} beyond crawl depth ${this.options.maxDepth}: ${candidate.url}`,
+ url: candidate.url,
+ type: candidate.type,
+ depth,
+ });
  continue;
  }

@@ -88,19 +206,41 @@ export class BundleScraper {
  }

  visitedUrls.add(candidate.url);
- const artifact = await this.fetchArtifact(candidate);
- artifacts.push(artifact);
+ this.emitProgress({
+ message: `Fetching ${candidate.type} depth ${depth}: ${candidate.url}`,
+ url: candidate.url,
+ type: candidate.type,
+ depth,
+ });
+
+ const artifact = await this.fetchArtifact(candidate, depth, candidate.url === validatedPageUrl);
+ if (!artifact) {
+ continue;
+ }

  if (artifact.type === "html") {
  htmlPages.add(artifact.url);
  }

- const nestedCandidates = extractNestedCandidates(artifact);
+ if (isAnalyzableArtifactType(artifact.type)) {
+ artifacts.push(artifact);
+ }
+
+ const nestedCandidates = this.filterNestedCandidates(extractNestedCandidates(artifact), validatedPageUrl, crawlScope);
  for (const nestedCandidate of nestedCandidates) {
  if (!visitedUrls.has(nestedCandidate.url)) {
- queue.push(nestedCandidate);
+ queue.push({ candidate: nestedCandidate, depth: depth + 1 });
  }
  }
+
+ if (nestedCandidates.length > 0) {
+ this.emitProgress({
+ message: `Discovered ${nestedCandidates.length} nested candidate(s) from ${artifact.url}`,
+ url: artifact.url,
+ type: artifact.type,
+ depth,
+ });
+ }
  }

  return {
@@ -113,10 +253,69 @@ export class BundleScraper {
  };
  }

- private async fetchArtifact(candidate: ArtifactCandidate): Promise<DiscoveredArtifact> {
- const response = await this.fetchResponse(candidate.url, candidate.type);
+ private async discoverSupplementalPages(rootOrigin: string): Promise<ArtifactCandidate[]> {
+ const candidates = new Map<string, ArtifactCandidate>();
+ const directSitemapUrl = new URL("/sitemap.xml", rootOrigin).toString();
+ const robotsUrl = new URL("/robots.txt", rootOrigin).toString();
+
+ const robotsText = await this.fetchOptionalText(robotsUrl);
+ const sitemapUrls = new Set<string>([directSitemapUrl]);
+
+ if (robotsText) {
+ for (const sitemapUrl of parseRobotsSitemaps(robotsText)) {
+ try {
+ const normalizedUrl = new URL(sitemapUrl, rootOrigin).toString();
+ if (new URL(normalizedUrl).origin === rootOrigin) {
+ sitemapUrls.add(normalizedUrl);
+ }
+ } catch {
+ continue;
+ }
+ }
+ }
+
+ for (const sitemapUrl of sitemapUrls) {
+ const sitemapXml = await this.fetchOptionalText(sitemapUrl);
+ if (!sitemapXml) {
+ continue;
+ }
+
+ for (const candidate of parseSitemapXml(sitemapXml, rootOrigin)) {
+ candidates.set(candidate.url, candidate);
+ }
+ }
+
+ return [...candidates.values()];
+ }
+
+ private async fetchArtifact(candidate: ArtifactCandidate, depth: number, required: boolean): Promise<DiscoveredArtifact | null> {
+ const response = await this.fetchResponse(candidate.url, candidate.type, depth, required);
+ if (!response) {
+ return null;
+ }
+
  const contentType = response.headers.get("content-type")?.toLowerCase() ?? "";

+ if (isIgnoredContentType(contentType)) {
+ this.emitProgress({
+ message: `Skipping binary or font asset returned from ${candidate.url}`,
+ url: candidate.url,
+ type: candidate.type,
+ depth,
+ });
+ return null;
+ }
+
+ if (candidate.type === "html" && !contentType.includes("text/html") && !contentType.includes("application/xhtml+xml")) {
+ this.emitProgress({
+ message: `Skipping non-HTML response for discovered page ${candidate.url}`,
+ url: candidate.url,
+ type: candidate.type,
+ depth,
+ });
+ return null;
+ }
+
  if (candidate.type === "wasm" || contentType.includes("application/wasm")) {
  const bytes = new Uint8Array(await response.arrayBuffer());
  return discoveredArtifactSchema.parse({
@@ -131,8 +330,14 @@ export class BundleScraper {
  });
  }

- const content = await response.text();
- const resolvedType = contentType.includes("text/html") ? "html" : candidate.type;
+ const rawContent = await response.text();
+ const resolvedType = contentType.includes("text/html")
+ ? "html"
+ : contentType.includes("application/json") && candidate.type === "source-map"
+ ? "source-map"
+ : candidate.type;
+
+ const content = resolvedType === "source-map" ? summarizeSourceMap(rawContent, candidate.url) : rawContent;

  return discoveredArtifactSchema.parse({
  url: candidate.url,
@@ -143,20 +348,45 @@ export class BundleScraper {
  });
  }

- private async fetchResponse(url: string, artifactType: ArtifactCandidate["type"]): Promise<Response> {
+ private async fetchResponse(
+ url: string,
+ artifactType: ArtifactCandidate["type"],
+ depth: number,
+ required: boolean,
+ ): Promise<Response | null> {
  try {
  const response = await this.fetcher(url, {
  headers: {
- "user-agent": "mapr/0.2.0",
+ "user-agent": MAPR_USER_AGENT,
  },
  });

  if (!response.ok) {
- throw new Error(`Failed to fetch ${artifactType} from ${url}: ${response.status} ${response.statusText}`);
+ if (required) {
+ throw new Error(`Failed to fetch ${artifactType} from ${url}: ${response.status} ${response.statusText}`);
+ }
+
+ this.emitProgress({
+ message: `Skipping ${artifactType} after ${response.status} ${response.statusText}: ${url}`,
+ url,
+ type: artifactType,
+ depth,
+ });
+ return null;
  }

  return response;
  } catch (error) {
+ if (!required) {
+ this.emitProgress({
+ message: `Skipping ${artifactType} after fetch error: ${url}`,
+ url,
+ type: artifactType,
+ depth,
+ });
+ return null;
+ }
+
  if (error instanceof Error) {
  throw new Error(`Unable to fetch ${artifactType} artifact ${url}: ${error.message}`);
  }
@@ -164,6 +394,72 @@ export class BundleScraper {
  throw new Error(`Unable to fetch ${artifactType} artifact ${url}.`);
  }
  }
+
+ private async fetchOptionalText(url: string): Promise<string | null> {
+ try {
+ const response = await this.fetcher(url, {
+ headers: {
+ "user-agent": MAPR_USER_AGENT,
+ },
+ });
+
+ if (!response.ok) {
+ return null;
+ }
+
+ return await response.text();
+ } catch {
+ return null;
+ }
+ }
+
+ private emitProgress(event: ScraperProgressEvent): void {
+ this.onProgress?.(event);
+ }
+
+ private filterNestedCandidates(
+ candidates: ArtifactCandidate[],
+ entryUrl: string,
+ crawlScope: CrawlScope,
+ ): ArtifactCandidate[] {
+ if (crawlScope === "site") {
+ return candidates;
+ }
+
+ const entryPath = new URL(entryUrl).pathname.toLowerCase();
+ const entryStem = entryPath.replace(/(?:index)?\.html?$/i, "").replace(/\/+$/, "") || entryPath;
+ const entryDirectory = entryPath.includes("/") ? entryPath.slice(0, entryPath.lastIndexOf("/") + 1) : "/";
+
+ return candidates.filter((candidate) => {
+ if (candidate.type !== "html") {
+ return true;
+ }
+
+ const discoveredFrom = candidate.discoveredFrom.toLowerCase();
+ if (discoveredFrom.includes("iframe") || discoveredFrom.includes("form")) {
+ return true;
+ }
+
+ const candidatePath = new URL(candidate.url).pathname.toLowerCase();
+ if (candidatePath === entryPath) {
+ return true;
+ }
+
+ if (entryDirectory !== "/") {
+ return candidatePath.startsWith(entryDirectory);
+ }
+
+ if (entryStem !== entryPath && candidatePath.startsWith(entryStem)) {
+ return true;
+ }
+
+ if (candidatePath.startsWith(`${entryPath}/`)) {
+ return true;
+ }
+
+ return false;
+ });
+ }
  }

  export { extractArtifactCandidates };
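
Note: the new crawl controls introduced in this version (maxPages, maxArtifacts, maxDepth, and the onProgress callback) can be wired up roughly as in the minimal TypeScript sketch below. The import path, use of the global fetch, and the console logging are assumptions for illustration; only the constructor signature, the option names, and the ScraperProgressEvent shape come from the diff above.

// Minimal usage sketch (assumed module path; global fetch used as the FetchLike implementation).
import { BundleScraper, type ScraperProgressEvent } from "./lib/scraper";

const scraper = new BundleScraper(fetch, {
  maxPages: 20,
  maxArtifacts: 400,
  maxDepth: 2,
  onProgress: (event: ScraperProgressEvent) => {
    // Every fetch, skip, and nested-candidate discovery reports a message, URL, artifact type, and depth.
    console.error(`[depth ${event.depth}] ${event.message}`);
  },
});

// A root-like entry URL (e.g. "/" or ".../index.html") selects the "site" crawl scope,
// which also consults /robots.txt and /sitemap.xml for supplemental pages to enqueue.
const result = await scraper.scrape("https://example.com/");
console.log(result);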
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
  "name": "@redstone-md/mapr",
- "version": "0.0.1-alpha",
+ "version": "0.0.3-alpha",
  "type": "module",
  "description": "Bun-native CLI/TUI for reverse-engineering frontend websites, bundles, WASM, and service workers",
  "license": "SEE LICENSE IN LICENSE",
@@ -30,6 +30,7 @@
  "mapr": "./bin/mapr"
  },
  "files": [
+ "assets",
  "bin",
  "index.ts",
  "lib",