@redstone-md/mapr 0.0.1-alpha → 0.0.3-alpha

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/lib/scraper.ts CHANGED
@@ -6,11 +6,15 @@ import {
  discoveredArtifactSchema,
  extractArtifactCandidates,
  extractNestedCandidates,
+ isIgnoredContentType,
+ isAnalyzableArtifactType,
  type ArtifactCandidate,
  type DiscoveredArtifact,
  } from "./artifacts";
  import { WasmModuleSummarizer } from "./wasm";

+ const MAPR_USER_AGENT = "mapr";
+
  const httpUrlSchema = z
  .string()
  .trim()
@@ -18,8 +22,9 @@ const httpUrlSchema = z
  .refine((value) => /^https?:\/\//.test(value), "Expected an http or https URL.");

  const scraperOptionsSchema = z.object({
- maxPages: z.number().int().positive().default(10),
- maxArtifacts: z.number().int().positive().default(200),
+ maxPages: z.number().int().positive().default(20),
+ maxArtifacts: z.number().int().positive().default(400),
+ maxDepth: z.number().int().nonnegative().default(3),
  });

  export interface ScrapeResult {
@@ -30,12 +35,30 @@ export interface ScrapeResult {
  }

  type FetchLike = (input: string | URL | Request, init?: RequestInit) => Promise<Response>;
- type ScraperOptions = z.input<typeof scraperOptionsSchema>;
+ type NumericScraperOptions = z.input<typeof scraperOptionsSchema>;
+ type QueueEntry = { candidate: ArtifactCandidate; depth: number };
+ type CrawlScope = "site" | "page";
+
+ export interface ScraperProgressEvent {
+ message: string;
+ url: string;
+ type: ArtifactCandidate["type"];
+ depth: number;
+ }
+
+ interface ScraperOptions extends NumericScraperOptions {
+ onProgress?: (event: ScraperProgressEvent) => void;
+ }

  function isPageCandidate(candidate: ArtifactCandidate, rootOrigin: string): boolean {
  return candidate.type === "html" && new URL(candidate.url).origin === rootOrigin;
  }

+ function isRootLikeEntry(url: string): boolean {
+ const pathname = new URL(url).pathname.toLowerCase();
+ return pathname === "/" || pathname === "" || pathname.endsWith("/index.html") || pathname.endsWith("/index.htm");
+ }
+
  function shouldFollowCandidate(candidate: ArtifactCandidate, rootOrigin: string): boolean {
  if (candidate.type === "html") {
  return new URL(candidate.url).origin === rootOrigin;
@@ -44,38 +67,133 @@ function shouldFollowCandidate(candidate: ArtifactCandidate, rootOrigin: string)
  return true;
  }

+ function parseSitemapXml(xml: string, rootOrigin: string): ArtifactCandidate[] {
+ const candidates = new Map<string, ArtifactCandidate>();
+ const regex = /<loc>([^<]+)<\/loc>/gi;
+ let match: RegExpExecArray | null;
+
+ while ((match = regex.exec(xml)) !== null) {
+ try {
+ const url = new URL(match[1] ?? "").toString();
+ if (new URL(url).origin !== rootOrigin) {
+ continue;
+ }
+
+ const candidate = artifactCandidateSchema.safeParse({
+ url,
+ type: "html",
+ discoveredFrom: "sitemap:loc",
+ });
+
+ if (candidate.success) {
+ candidates.set(candidate.data.url, candidate.data);
+ }
+ } catch {
+ continue;
+ }
+ }
+
+ return [...candidates.values()];
+ }
+
+ function parseRobotsSitemaps(robotsText: string): string[] {
+ return robotsText
+ .split(/\r?\n/)
+ .map((line) => line.trim())
+ .filter((line) => /^sitemap:/i.test(line))
+ .map((line) => line.replace(/^sitemap:\s*/i, "").trim())
+ .filter(Boolean);
+ }
+
+ function summarizeSourceMap(rawMap: string, mapUrl: string): string {
+ try {
+ const payload = z
+ .object({
+ version: z.number().optional(),
+ file: z.string().optional(),
+ sourceRoot: z.string().optional(),
+ sources: z.array(z.string()).optional(),
+ sourcesContent: z.array(z.string().nullable()).optional(),
+ })
+ .parse(JSON.parse(rawMap) as unknown);
+
+ const sources = payload.sources ?? [];
+ const sourcesContent = payload.sourcesContent ?? [];
+ const lines = [`Source map: ${mapUrl}`, `Mapped sources: ${sources.length}`];
+
+ for (let index = 0; index < sources.length; index += 1) {
+ const sourceName = sources[index];
+ const sourceContent = sourcesContent[index];
+ if (!sourceName) {
+ continue;
+ }
+
+ lines.push(`--- Source: ${sourceName}`);
+ if (typeof sourceContent === "string" && sourceContent.length > 0) {
+ lines.push(sourceContent);
+ }
+ }
+
+ return lines.join("\n");
+ } catch {
+ return rawMap;
+ }
+ }
+
  export class BundleScraper {
  private readonly options: z.infer<typeof scraperOptionsSchema>;
  private readonly wasmSummarizer = new WasmModuleSummarizer();
+ private readonly onProgress: ((event: ScraperProgressEvent) => void) | undefined;

  public constructor(
  private readonly fetcher: FetchLike = fetch,
  options: ScraperOptions = {},
  ) {
  this.options = scraperOptionsSchema.parse(options);
+ this.onProgress = options.onProgress;
  }

  public async scrape(pageUrl: string): Promise<ScrapeResult> {
  const validatedPageUrl = httpUrlSchema.parse(pageUrl);
  const rootOrigin = new URL(validatedPageUrl).origin;
+ const crawlScope: CrawlScope = isRootLikeEntry(validatedPageUrl) ? "site" : "page";
  const visitedUrls = new Set<string>();
  const htmlPages = new Set<string>();
  const artifacts: DiscoveredArtifact[] = [];
- const queue: ArtifactCandidate[] = [
- artifactCandidateSchema.parse({
- url: validatedPageUrl,
- type: "html",
- discoveredFrom: "root",
- }),
+ const queue: QueueEntry[] = [
+ {
+ candidate: artifactCandidateSchema.parse({
+ url: validatedPageUrl,
+ type: "html",
+ discoveredFrom: "root",
+ }),
+ depth: 0,
+ },
  ];

+ if (crawlScope === "site") {
+ queue.push(...(await this.discoverSupplementalPages(rootOrigin)).map((candidate) => ({ candidate, depth: 1 })));
+ }
+
  while (queue.length > 0) {
  if (artifacts.length >= this.options.maxArtifacts) {
  break;
  }

- const candidate = queue.shift();
- if (!candidate || visitedUrls.has(candidate.url)) {
+ const entry = queue.shift();
+ if (!entry || visitedUrls.has(entry.candidate.url)) {
+ continue;
+ }
+
+ const { candidate, depth } = entry;
+
+ if (depth > this.options.maxDepth) {
+ this.emitProgress({
+ message: `Skipping ${candidate.type} beyond crawl depth ${this.options.maxDepth}: ${candidate.url}`,
+ url: candidate.url,
+ type: candidate.type,
+ depth,
+ });
  continue;
  }

@@ -88,19 +206,41 @@ export class BundleScraper {
  }

  visitedUrls.add(candidate.url);
- const artifact = await this.fetchArtifact(candidate);
- artifacts.push(artifact);
+ this.emitProgress({
+ message: `Fetching ${candidate.type} depth ${depth}: ${candidate.url}`,
+ url: candidate.url,
+ type: candidate.type,
+ depth,
+ });
+
+ const artifact = await this.fetchArtifact(candidate, depth, candidate.url === validatedPageUrl);
+ if (!artifact) {
+ continue;
+ }

  if (artifact.type === "html") {
  htmlPages.add(artifact.url);
  }

- const nestedCandidates = extractNestedCandidates(artifact);
+ if (isAnalyzableArtifactType(artifact.type)) {
+ artifacts.push(artifact);
+ }
+
+ const nestedCandidates = this.filterNestedCandidates(extractNestedCandidates(artifact), validatedPageUrl, crawlScope);
  for (const nestedCandidate of nestedCandidates) {
  if (!visitedUrls.has(nestedCandidate.url)) {
- queue.push(nestedCandidate);
+ queue.push({ candidate: nestedCandidate, depth: depth + 1 });
  }
  }
+
+ if (nestedCandidates.length > 0) {
+ this.emitProgress({
+ message: `Discovered ${nestedCandidates.length} nested candidate(s) from ${artifact.url}`,
+ url: artifact.url,
+ type: artifact.type,
+ depth,
+ });
+ }
  }

  return {
@@ -113,10 +253,69 @@ export class BundleScraper {
  };
  }

- private async fetchArtifact(candidate: ArtifactCandidate): Promise<DiscoveredArtifact> {
- const response = await this.fetchResponse(candidate.url, candidate.type);
+ private async discoverSupplementalPages(rootOrigin: string): Promise<ArtifactCandidate[]> {
+ const candidates = new Map<string, ArtifactCandidate>();
+ const directSitemapUrl = new URL("/sitemap.xml", rootOrigin).toString();
+ const robotsUrl = new URL("/robots.txt", rootOrigin).toString();
+
+ const robotsText = await this.fetchOptionalText(robotsUrl);
+ const sitemapUrls = new Set<string>([directSitemapUrl]);
+
+ if (robotsText) {
+ for (const sitemapUrl of parseRobotsSitemaps(robotsText)) {
+ try {
+ const normalizedUrl = new URL(sitemapUrl, rootOrigin).toString();
+ if (new URL(normalizedUrl).origin === rootOrigin) {
+ sitemapUrls.add(normalizedUrl);
+ }
+ } catch {
+ continue;
+ }
+ }
+ }
+
+ for (const sitemapUrl of sitemapUrls) {
+ const sitemapXml = await this.fetchOptionalText(sitemapUrl);
+ if (!sitemapXml) {
+ continue;
+ }
+
+ for (const candidate of parseSitemapXml(sitemapXml, rootOrigin)) {
+ candidates.set(candidate.url, candidate);
+ }
+ }
+
+ return [...candidates.values()];
+ }
+
+ private async fetchArtifact(candidate: ArtifactCandidate, depth: number, required: boolean): Promise<DiscoveredArtifact | null> {
+ const response = await this.fetchResponse(candidate.url, candidate.type, depth, required);
+ if (!response) {
+ return null;
+ }
+
  const contentType = response.headers.get("content-type")?.toLowerCase() ?? "";

+ if (isIgnoredContentType(contentType)) {
+ this.emitProgress({
+ message: `Skipping binary or font asset returned from ${candidate.url}`,
+ url: candidate.url,
+ type: candidate.type,
+ depth,
+ });
+ return null;
+ }
+
+ if (candidate.type === "html" && !contentType.includes("text/html") && !contentType.includes("application/xhtml+xml")) {
+ this.emitProgress({
+ message: `Skipping non-HTML response for discovered page ${candidate.url}`,
+ url: candidate.url,
+ type: candidate.type,
+ depth,
+ });
+ return null;
+ }
+
  if (candidate.type === "wasm" || contentType.includes("application/wasm")) {
  const bytes = new Uint8Array(await response.arrayBuffer());
  return discoveredArtifactSchema.parse({
@@ -131,8 +330,14 @@ export class BundleScraper {
  });
  }

- const content = await response.text();
- const resolvedType = contentType.includes("text/html") ? "html" : candidate.type;
+ const rawContent = await response.text();
+ const resolvedType = contentType.includes("text/html")
+ ? "html"
+ : contentType.includes("application/json") && candidate.type === "source-map"
+ ? "source-map"
+ : candidate.type;
+
+ const content = resolvedType === "source-map" ? summarizeSourceMap(rawContent, candidate.url) : rawContent;

  return discoveredArtifactSchema.parse({
  url: candidate.url,
@@ -143,20 +348,45 @@ export class BundleScraper {
  });
  }

- private async fetchResponse(url: string, artifactType: ArtifactCandidate["type"]): Promise<Response> {
+ private async fetchResponse(
+ url: string,
+ artifactType: ArtifactCandidate["type"],
+ depth: number,
+ required: boolean,
+ ): Promise<Response | null> {
  try {
  const response = await this.fetcher(url, {
  headers: {
- "user-agent": "mapr/0.2.0",
+ "user-agent": MAPR_USER_AGENT,
  },
  });

  if (!response.ok) {
- throw new Error(`Failed to fetch ${artifactType} from ${url}: ${response.status} ${response.statusText}`);
+ if (required) {
+ throw new Error(`Failed to fetch ${artifactType} from ${url}: ${response.status} ${response.statusText}`);
+ }
+
+ this.emitProgress({
+ message: `Skipping ${artifactType} after ${response.status} ${response.statusText}: ${url}`,
+ url,
+ type: artifactType,
+ depth,
+ });
+ return null;
  }

  return response;
  } catch (error) {
+ if (!required) {
+ this.emitProgress({
+ message: `Skipping ${artifactType} after fetch error: ${url}`,
+ url,
+ type: artifactType,
+ depth,
+ });
+ return null;
+ }
+
  if (error instanceof Error) {
  throw new Error(`Unable to fetch ${artifactType} artifact ${url}: ${error.message}`);
  }
@@ -164,6 +394,72 @@ export class BundleScraper {
  throw new Error(`Unable to fetch ${artifactType} artifact ${url}.`);
  }
  }
+
+ private async fetchOptionalText(url: string): Promise<string | null> {
+ try {
+ const response = await this.fetcher(url, {
+ headers: {
+ "user-agent": MAPR_USER_AGENT,
+ },
+ });
+
+ if (!response.ok) {
+ return null;
+ }
+
+ return await response.text();
+ } catch {
+ return null;
+ }
+ }
+
+ private emitProgress(event: ScraperProgressEvent): void {
+ this.onProgress?.(event);
+ }
+
+ private filterNestedCandidates(
+ candidates: ArtifactCandidate[],
+ entryUrl: string,
+ crawlScope: CrawlScope,
+ ): ArtifactCandidate[] {
+ if (crawlScope === "site") {
+ return candidates;
+ }
+
+ const entryPath = new URL(entryUrl).pathname.toLowerCase();
+ const entryStem = entryPath.replace(/(?:index)?\.html?$/i, "").replace(/\/+$/, "") || entryPath;
+ const entryDirectory = entryPath.includes("/") ? entryPath.slice(0, entryPath.lastIndexOf("/") + 1) : "/";
+
+ return candidates.filter((candidate) => {
+ if (candidate.type !== "html") {
+ return true;
+ }
+
+ const discoveredFrom = candidate.discoveredFrom.toLowerCase();
+ if (discoveredFrom.includes("iframe") || discoveredFrom.includes("form")) {
+ return true;
+ }
+
+ const candidatePath = new URL(candidate.url).pathname.toLowerCase();
+ if (candidatePath === entryPath) {
+ return true;
+ }
+
+ if (entryDirectory !== "/") {
+ return candidatePath.startsWith(entryDirectory);
+ }
+
+ if (entryStem !== entryPath && candidatePath.startsWith(entryStem)) {
+ return true;
+ }
+
+ if (candidatePath.startsWith(`${entryPath}/`)) {
+ return true;
+ }
+
+ return false;
+ });
+ }
  }

  export { extractArtifactCandidates };
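
Note: the new crawl controls introduced in this version (maxPages, maxArtifacts, maxDepth, and the onProgress callback) can be wired up roughly as in the minimal TypeScript sketch below. The import path, use of the global fetch, and the console logging are assumptions for illustration; only the constructor signature, the option names, and the ScraperProgressEvent shape come from the diff above.

// Minimal usage sketch (assumed module path; global fetch used as the FetchLike implementation).
import { BundleScraper, type ScraperProgressEvent } from "./lib/scraper";

const scraper = new BundleScraper(fetch, {
  maxPages: 20,
  maxArtifacts: 400,
  maxDepth: 2,
  onProgress: (event: ScraperProgressEvent) => {
    // Every fetch, skip, and nested-candidate discovery reports a message, URL, artifact type, and depth.
    console.error(`[depth ${event.depth}] ${event.message}`);
  },
});

// A root-like entry URL (e.g. "/" or ".../index.html") selects the "site" crawl scope,
// which also consults /robots.txt and /sitemap.xml for supplemental pages to enqueue.
const result = await scraper.scrape("https://example.com/");
console.log(result);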
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
  "name": "@redstone-md/mapr",
- "version": "0.0.1-alpha",
+ "version": "0.0.3-alpha",
  "type": "module",
  "description": "Bun-native CLI/TUI for reverse-engineering frontend websites, bundles, WASM, and service workers",
  "license": "SEE LICENSE IN LICENSE",
@@ -30,6 +30,7 @@
  "mapr": "./bin/mapr"
  },
  "files": [
+ "assets",
  "bin",
  "index.ts",
  "lib",