npm - messi-crawler - Versions diffs - 1.0.0 - Mend

messi-crawler 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (73) hide show

package/README.md +201 -0
package/dist/cli/renderer.js +71 -0
package/dist/config.js +18 -0
package/dist/db/clear.js +16 -0
package/dist/db/client.js +20 -0
package/dist/db/queries.js +179 -0
package/dist/frontier/frontier.js +44 -0
package/dist/frontier/logger.js +65 -0
package/dist/frontier/robots.js +46 -0
package/dist/frontier/scheduler.js +98 -0
package/dist/index.js +533 -0
package/dist/normalizer.js +33 -0
package/dist/output/db-strategy.js +16 -0
package/dist/output/index.js +23 -0
package/dist/output/pdf-strategy.js +316 -0
package/dist/output/strategy.js +1 -0
package/dist/security/ssrf.js +45 -0
package/dist/security/validate-url.js +41 -0
package/dist/seed.js +14 -0
package/dist/setup.js +148 -0
package/dist/test/client.test.js +33 -0
package/dist/test/downloader.test.js +84 -0
package/dist/test/extractor.test.js +126 -0
package/dist/test/frontier.test.js +43 -0
package/dist/test/logger.test.js +55 -0
package/dist/test/normalizer.test.js +36 -0
package/dist/test/pdf-strategy.test.js +68 -0
package/dist/test/queries.test.js +173 -0
package/dist/test/robots.test.js +46 -0
package/dist/test/scheduler.test.js +73 -0
package/dist/test/seed.test.js +26 -0
package/dist/test/worker.test.js +118 -0
package/dist/worker/downloader.js +114 -0
package/dist/worker/extractor.js +197 -0
package/dist/worker/worker.js +87 -0
package/package.json +48 -0
package/seeds.txt +4 -0
package/src/cli/renderer.ts +83 -0
package/src/config.ts +22 -0
package/src/db/clear.ts +16 -0
package/src/db/client.ts +26 -0
package/src/db/queries.ts +255 -0
package/src/db/schema.sql +43 -0
package/src/frontier/frontier.ts +60 -0
package/src/frontier/logger.ts +75 -0
package/src/frontier/robots.ts +50 -0
package/src/frontier/scheduler.ts +119 -0
package/src/index.ts +596 -0
package/src/normalizer.ts +37 -0
package/src/output/db-strategy.ts +20 -0
package/src/output/index.ts +32 -0
package/src/output/pdf-strategy.ts +388 -0
package/src/output/strategy.ts +16 -0
package/src/security/ssrf.ts +48 -0
package/src/security/validate-url.ts +49 -0
package/src/seed.ts +18 -0
package/src/setup.ts +170 -0
package/src/test/client.test.ts +38 -0
package/src/test/downloader.test.ts +101 -0
package/src/test/extractor.test.ts +139 -0
package/src/test/frontier.test.ts +53 -0
package/src/test/logger.test.ts +71 -0
package/src/test/normalizer.test.ts +43 -0
package/src/test/pdf-strategy.test.ts +84 -0
package/src/test/queries.test.ts +247 -0
package/src/test/robots.test.ts +56 -0
package/src/test/scheduler.test.ts +90 -0
package/src/test/seed.test.ts +35 -0
package/src/test/worker.test.ts +144 -0
package/src/worker/downloader.ts +149 -0
package/src/worker/extractor.ts +235 -0
package/src/worker/worker.ts +100 -0
package/tsconfig.json +15 -0

package/src/worker/downloader.ts ADDED Viewed

@@ -0,0 +1,149 @@
+import { request } from "undici";
+import { config } from "../config.js";
+import { isBlockedAddress } from "../security/ssrf.js";
+export interface DownloaderResult { // Value returned by downloadPage for a successfully fetched page.
+  url: string; // URL that actually served the HTML, i.e. the last URL reached after following redirects
+  html: string; // response body read as text
+  statusCode: number; // status of the final response; downloadPage throws on anything other than 200
+}
+/**
+ * SSRF guard run before a URL is requested: extracts the host from `url` and
+ * rejects it when `isBlockedAddress` flags it (per the original note, private,
+ * loopback, or link-local addresses).
+ *
+ * @param url - Absolute URL that is about to be fetched.
+ * @throws TypeError when `url` cannot be parsed; Error when the host is blocked.
+ */
+async function assertNotBlocked(url: string): Promise<void> {
+  const { hostname } = new URL(url);
+  const blocked = await isBlockedAddress(hostname);
+  if (!blocked) {
+    return;
+  }
+  throw new Error(`SSRF blocked: "${hostname}" resolves to a private or internal address`);
+}
+/** Response shape produced by undici's `request`, derived so no extra import is needed. */
+type DownloadResponse = Awaited<ReturnType<typeof request>>;
+/**
+ * Returns the first value of a header that may be absent, a single string,
+ * or a list of strings.
+ */
+function firstHeaderValue(value: string | string[] | undefined): string | undefined {
+  return Array.isArray(value) ? value[0] : value;
+}
+/**
+ * Shared fetch loop for pages and images.
+ *
+ * Issues GET requests starting at `initialUrl`, follows up to
+ * `config.MAX_REDIRECTS` redirects, runs the SSRF check on the initial URL and
+ * on every redirect target, and returns the first 200 response with its body
+ * still unread.
+ *
+ * @param initialUrl - URL to start from.
+ * @param errorSuffix - Text appended to the "Too many redirects" and
+ *   "HTTP status" error messages so callers keep their distinct wording.
+ * @returns The final URL and the 200 response; the caller must consume the body.
+ * @throws Error on a blocked host, too many redirects, or a non-200 status.
+ */
+async function fetchFollowingRedirects(
+  initialUrl: string,
+  errorSuffix: string,
+): Promise<{ url: string; res: DownloadResponse }> {
+  let currentUrl = initialUrl;
+  // SSRF check on the initial URL before any network activity
+  await assertNotBlocked(currentUrl);
+  for (let redirectCount = 0; ; redirectCount++) {
+    const res = await request(currentUrl, {
+      method: "GET",
+      headersTimeout: config.REQUEST_TIMEOUT_MS,
+      bodyTimeout: config.REQUEST_TIMEOUT_MS,
+    });
+    const statusCode = res.statusCode;
+    const location = firstHeaderValue(res.headers.location);
+    // Handle redirects (301, 302, 303, 307, 308)
+    if (statusCode >= 300 && statusCode < 400 && location) {
+      // Drain the body first: every step below may throw, and an unread
+      // body would keep the connection from being released.
+      await res.body.text();
+      if (redirectCount >= config.MAX_REDIRECTS) {
+        throw new Error(`Too many redirects${errorSuffix}`);
+      }
+      const nextUrl = new URL(location, currentUrl).href;
+      // SSRF check on every redirect target before following
+      await assertNotBlocked(nextUrl);
+      currentUrl = nextUrl;
+      continue;
+    }
+    // Error on non-200 responses (including a 3xx without a Location header)
+    if (statusCode !== 200) {
+      await res.body.text();
+      throw new Error(`HTTP status ${statusCode}${errorSuffix}`);
+    }
+    return { url: currentUrl, res };
+  }
+}
+/**
+ * Fetches the HTML content of a page, following redirects up to MAX_REDIRECTS.
+ * Tracks the final URL, enforces a request timeout, and blocks SSRF targets.
+ *
+ * @param initialUrl - URL of the page to download.
+ * @returns The final URL, the body as text, and the status code (always 200).
+ * @throws Error when the host is blocked, the redirect limit is exceeded, the
+ *   status is not 200, or a Content-Type is present and is not `text/html`.
+ */
+export async function downloadPage(initialUrl: string): Promise<DownloaderResult> {
+  const { url, res } = await fetchFollowingRedirects(initialUrl, "");
+  // Skip non-HTML content types; media types are case-insensitive, and a
+  // missing header is let through.
+  const contentType = firstHeaderValue(res.headers["content-type"]);
+  if (contentType && !contentType.toLowerCase().includes("text/html")) {
+    await res.body.text();
+    throw new Error(`Non-HTML content type: ${contentType}`);
+  }
+  const html = await res.body.text();
+  return { url, html, statusCode: res.statusCode };
+}
+/**
+ * Downloads image assets securely.
+ * Enforces the same SSRF/blocklist checks, redirection limits, and timeouts as pages.
+ *
+ * @param initialUrl - URL of the image to download.
+ * @returns The raw response body.
+ * @throws Error when the host is blocked, the redirect limit is exceeded, the
+ *   status is not 200, or a Content-Type is present and is not `image/*`.
+ */
+export async function downloadImage(initialUrl: string): Promise<Buffer> {
+  const { res } = await fetchFollowingRedirects(initialUrl, " fetching image");
+  // Validate that it's an image; a missing header is let through.
+  const contentType = firstHeaderValue(res.headers["content-type"]);
+  if (contentType && !contentType.toLowerCase().startsWith("image/")) {
+    await res.body.text();
+    throw new Error(`Non-image content type: ${contentType}`);
+  }
+  const arrayBuffer = await res.body.arrayBuffer();
+  return Buffer.from(arrayBuffer);
+}

package/src/worker/extractor.ts ADDED Viewed

@@ -0,0 +1,235 @@
+import * as cheerio from "cheerio";
+export interface ContentBlock { // One piece of page content, emitted in document order by extractPageData.
+  type: "heading" | "paragraph" | "list" | "image"; // discriminator: decides which of the optional fields below are set
+  text?: string; // whitespace-collapsed text; set on "heading" and "paragraph" blocks
+  level?: number; // heading level (1-6) parsed from the tag name; set on "heading" blocks
+  items?: string[]; // non-empty <li> texts; set on "list" blocks
+  src?: string; // image source, made absolute when a usable base URL exists; set on "image" blocks
+  alt?: string; // alt text, "" when the attribute is missing; set on "image" blocks
+}
+export interface ExtractedImage { // An <img> with a non-empty src found inside the selected main-content node.
+  src: string; // image source, made absolute when a usable base URL exists, otherwise as written in the HTML
+  alt: string; // trimmed alt text, "" when the attribute is missing
+}
+export interface ExtractedData { // Everything extractPageData pulls out of one HTML document.
+  title: string | null; // trimmed <title> text, null when empty
+  description: string | null; // content of <meta name=description>, null when missing or empty
+  canonicalUrl: string | null; // href of <link rel=canonical> exactly as written (not resolved), null when missing
+  headings: { // trimmed, non-empty heading texts from the whole document, grouped by level
+    h1: string[];
+    h2: string[];
+    h3: string[];
+  };
+  textContent: string | null; // heading, paragraph and list text of the main content joined by spaces; null when empty
+  links: string[]; // trimmed href of every <a href>, unresolved and not de-duplicated
+  blocks?: ContentBlock[]; // main content as ordered blocks
+  images?: ExtractedImage[]; // all images inside the main-content node
+}
+/** Tags treated as site chrome or non-content; removed before block extraction. */
+const NON_CONTENT_TAGS = ["script", "style", "noscript", "iframe", "nav", "footer", "header"];
+/** Collapses every whitespace run to one space and trims both ends. */
+function collapseWhitespace(raw: string): string {
+  return raw.replace(/\s+/g, " ").trim();
+}
+/**
+ * Chooses the absolute URL that relative image sources are resolved against.
+ * The canonical URL is preferred; a relative canonical is first resolved
+ * against `baseUrl`, because a relative string cannot itself serve as a base.
+ * Returns null when no absolute base can be formed.
+ */
+function pickResolutionBase(canonicalUrl: string | null, baseUrl?: string): string | null {
+  if (canonicalUrl) {
+    try {
+      return new URL(canonicalUrl, baseUrl).href;
+    } catch {
+      // canonical cannot be made absolute; fall back to baseUrl below
+    }
+  }
+  if (baseUrl) {
+    try {
+      return new URL(baseUrl).href;
+    } catch {
+      return null;
+    }
+  }
+  return null;
+}
+/** Resolves `src` against `base`; returns `src` unchanged when there is no base or resolution fails. */
+function resolveImageSrc(src: string, base: string | null): string {
+  if (!base) {
+    return src;
+  }
+  try {
+    return new URL(src, base).href;
+  } catch {
+    return src;
+  }
+}
+/**
+ * Extracts metadata, headings, structured text content blocks, images, and outgoing links from HTML.
+ * Strips site chrome and uses a text-density heuristic if no main content container is found.
+ *
+ * @param html - Raw HTML of the page.
+ * @param baseUrl - URL the page was fetched from; used to resolve relative
+ *   image sources (and a relative canonical link).
+ * @returns Document-wide title, description, canonical href, h1-h3 texts and
+ *   raw link hrefs, plus ordered content blocks, images and flattened text
+ *   taken from the main-content node only.
+ */
+export function extractPageData(html: string, baseUrl?: string): ExtractedData {
+  const $ = cheerio.load(html);
+  const title = $("title").text().trim() || null;
+  const description = $("meta[name=description]").attr("content")?.trim() || null;
+  const canonicalUrl = $("link[rel=canonical]").attr("href")?.trim() || null;
+  // Trimmed, non-empty text of every element matching the selector, in document order
+  const collectTexts = (selector: string): string[] => {
+    const texts: string[] = [];
+    $(selector).each((_, el) => {
+      const text = $(el).text().trim();
+      if (text) texts.push(text);
+    });
+    return texts;
+  };
+  const h1 = collectTexts("h1");
+  const h2 = collectTexts("h2");
+  const h3 = collectTexts("h3");
+  const links: string[] = [];
+  $("a[href]").each((_, el) => {
+    const href = $(el).attr("href")?.trim();
+    if (href) {
+      links.push(href);
+    }
+  });
+  // Determine resolution base URL for images
+  const resolutionBase = pickResolutionBase(canonicalUrl, baseUrl);
+  // 1. Main-content heuristic selection: <article>, then <main>, then [role=main]
+  let mainNode = $("article").first();
+  if (mainNode.length === 0) {
+    mainNode = $("main").first();
+  }
+  if (mainNode.length === 0) {
+    mainNode = $("[role=main]").first();
+  }
+  // Fallback text-density heuristic: pick the div/section with the most text
+  // per descendant element, ignoring containers with too little text.
+  if (mainNode.length === 0) {
+    const totalBodyText = $("body").text().trim();
+    const minTextLength = Math.min(200, totalBodyText.length * 0.1);
+    let bestNode = $("body");
+    let maxScore = -1;
+    $("div, section").each((_, el) => {
+      const $el = $(el);
+      const textLength = $el.text().trim().length;
+      if (textLength < minTextLength) return;
+      const tagCount = $el.find("*").length;
+      const score = textLength / (tagCount + 1);
+      if (score > maxScore) {
+        maxScore = score;
+        bestNode = $el;
+      }
+    });
+    mainNode = bestNode;
+  }
+  // 2. Clone and clean the chosen node so the parsed document is not mutated
+  const cleanedNode = mainNode.clone();
+  cleanedNode.find(NON_CONTENT_TAGS.join(", ")).remove();
+  // 3. Extract in-order content blocks and overall images list
+  const blocks: ContentBlock[] = [];
+  const images: ExtractedImage[] = [];
+  // Extract all images inside the cleaned main node
+  cleanedNode.find("img").each((_, img) => {
+    const src = $(img).attr("src")?.trim();
+    const alt = $(img).attr("alt")?.trim() || "";
+    if (src) {
+      images.push({ src: resolveImageSrc(src, resolutionBase), alt });
+    }
+  });
+  // Track if we need to force a new paragraph on the next text node
+  let forceNewParagraph = true;
+  // Depth-first walk that appends blocks in document order. `node` is a
+  // parsed DOM node; its exact type is not imported here, hence `any`.
+  function walk(node: any): void {
+    if (node.type === "text") {
+      const text = collapseWhitespace(node.data);
+      if (text) {
+        const lastBlock = blocks[blocks.length - 1];
+        if (!forceNewParagraph && lastBlock && lastBlock.type === "paragraph") {
+          // Loose inline text continues the paragraph started by earlier loose text
+          lastBlock.text = collapseWhitespace(lastBlock.text + " " + text);
+        } else {
+          blocks.push({ type: "paragraph", text });
+          forceNewParagraph = false;
+        }
+      }
+      return;
+    }
+    if (node.type !== "tag") {
+      return;
+    }
+    const el = node;
+    const tagName = el.tagName?.toLowerCase();
+    // Skip removed elements just in case
+    if (NON_CONTENT_TAGS.includes(tagName)) {
+      return;
+    }
+    if (/^h[1-6]$/.test(tagName)) {
+      const level = parseInt(tagName.substring(1), 10);
+      const text = collapseWhitespace($(el).text());
+      if (text) {
+        blocks.push({ type: "heading", level, text });
+      }
+      forceNewParagraph = true;
+    } else if (tagName === "p") {
+      const text = collapseWhitespace($(el).text());
+      if (text) {
+        blocks.push({ type: "paragraph", text });
+      }
+      forceNewParagraph = true;
+    } else if (tagName === "ul" || tagName === "ol") {
+      const items: string[] = [];
+      $(el).find("li").each((_, li) => {
+        // Keep only items owned by this list. An <li> of a nested list is
+        // already part of its parent item's text, so listing it again would
+        // duplicate content.
+        if ($(li).closest("ul, ol")[0] !== el) return;
+        const itemText = collapseWhitespace($(li).text());
+        if (itemText) items.push(itemText);
+      });
+      if (items.length > 0) {
+        blocks.push({ type: "list", items });
+      }
+      forceNewParagraph = true;
+    } else if (tagName === "img") {
+      const src = $(el).attr("src")?.trim();
+      const alt = $(el).attr("alt")?.trim() || "";
+      if (src) {
+        blocks.push({ type: "image", src: resolveImageSrc(src, resolutionBase), alt });
+      }
+      forceNewParagraph = true;
+    } else if (tagName === "br") {
+      forceNewParagraph = true;
+    } else {
+      // For general container tags (div, span, etc.), walk contents recursively
+      $(el).contents().each((_, child) => {
+        walk(child);
+      });
+    }
+  }
+  cleanedNode.contents().each((_, child) => {
+    walk(child);
+  });
+  // Fallback textContent: concatenated headings / paragraphs / lists for backwards compatibility
+  const textContentParts: string[] = [];
+  for (const block of blocks) {
+    if ((block.type === "paragraph" || block.type === "heading") && block.text) {
+      textContentParts.push(block.text);
+    } else if (block.type === "list" && block.items) {
+      textContentParts.push(block.items.join(" "));
+    }
+  }
+  const textContent = collapseWhitespace(textContentParts.join(" ")) || null;
+  return {
+    title,
+    description,
+    canonicalUrl,
+    headings: { h1, h2, h3 },
+    textContent,
+    links,
+    blocks,
+    images,
+  };
+}

package/src/worker/worker.ts ADDED Viewed

@@ -0,0 +1,100 @@
+import { downloadPage } from "./downloader.js";
+import { extractPageData } from "./extractor.js";
+import { normalizeURL, getDomain } from "../normalizer.js";
+import { insertURL, insertLink, markFailed } from "../db/queries.js";
+import { config } from "../config.js";
+import { isAllowedByRobots } from "../frontier/robots.js";
+import { getStrategy } from "../output/index.js";
+/**
+ * Tells whether links to `domain` may be enqueued.
+ * An unset or empty `config.ALLOWED_DOMAINS` means no restriction; otherwise
+ * the domain must appear in the list exactly.
+ */
+function isDomainAllowed(domain: string): boolean {
+  const allowList = config.ALLOWED_DOMAINS;
+  if (allowList && allowList.length > 0) {
+    return allowList.includes(domain);
+  }
+  return true;
+}
+/**
+ * Handles the complete crawling workflow for a single URL:
+ * 0. Checks robots.txt; a disallowed URL is marked failed and skipped.
+ * 1. Downloads the page HTML (handling redirects & timeouts).
+ * 2. Extracts title, description, canonical, headings, text content, and outgoing links.
+ * 3. Delegates persistence to the active OutputStrategy (DB or PDF).
+ * 4. Filters, normalizes, and enqueues discovered links, establishing link graph relations.
+ *
+ * @param urlRow - Queue row for the page: its id, URL, and crawl depth.
+ * @throws Rethrows any error from the steps above after recording its message
+ *   via `markFailed`.
+ */
+export async function processPage(urlRow: { id: number; url: string; depth: number }): Promise<void> {
+  const { id: urlId, url: pageUrl, depth: currentDepth } = urlRow;
+  try {
+    // 0. Check robots.txt compliance
+    const allowed = await isAllowedByRobots(pageUrl);
+    if (!allowed) {
+      await markFailed(urlId, "Disallowed by robots.txt");
+      return;
+    }
+    // 1. Download page content
+    const downloadResult = await downloadPage(pageUrl);
+    // 2. Extract content & outgoing links
+    const extracted = extractPageData(downloadResult.html, downloadResult.url);
+    // Resolve final URL using canonical link if present
+    let finalUrl = downloadResult.url;
+    if (extracted.canonicalUrl) {
+      const normalizedCanonical = normalizeURL(extracted.canonicalUrl, finalUrl);
+      if (normalizedCanonical) {
+        finalUrl = normalizedCanonical;
+      }
+    }
+    // 3. Persist content via the active output strategy (DB or PDF)
+    const strategy = getStrategy();
+    await strategy.save(urlId, finalUrl, {
+      title: extracted.title,
+      description: extracted.description,
+      canonicalUrl: extracted.canonicalUrl,
+      headings: extracted.headings,
+      textContent: extracted.textContent,
+      blocks: extracted.blocks,
+      images: extracted.images,
+    });
+    // 4. Process outgoing links
+    // Enforce MAX_DEPTH limit once: every discovered link sits at the same
+    // depth, so at the limit none of them can be enqueued.
+    const nextDepth = currentDepth + 1;
+    if (nextDepth > config.MAX_DEPTH) {
+      return;
+    }
+    // normalized URL -> domain, de-duplicated in first-seen order; keeping the
+    // domain alongside the URL avoids recomputing (and asserting) it later
+    const targets = new Map<string, string>();
+    for (const link of extracted.links) {
+      const normalized = normalizeURL(link, finalUrl);
+      if (!normalized) continue;
+      // Skip self-referential links
+      if (normalized === finalUrl || normalized === pageUrl) continue;
+      const linkDomain = getDomain(normalized);
+      if (!linkDomain || !isDomainAllowed(linkDomain)) continue;
+      targets.set(normalized, linkDomain);
+    }
+    for (const [targetUrl, targetDomain] of targets) {
+      // Insert target URL (ON CONFLICT DO NOTHING, per the original note) and get its ID
+      const targetUrlId = await insertURL(targetUrl, targetDomain, nextDepth);
+      // Establish link graph relation
+      await insertLink(urlId, targetUrlId);
+    }
+  } catch (error: unknown) {
+    const errorMsg = error instanceof Error ? error.message : String(error);
+    await markFailed(urlId, errorMsg);
+    throw error;
+  }
+}

package/tsconfig.json ADDED Viewed

@@ -0,0 +1,15 @@
+{
+  "compilerOptions": {
+    "target": "ES2022",
+    "module": "NodeNext",
+    "moduleResolution": "NodeNext",
+    "esModuleInterop": true,
+    "forceConsistentCasingInFileNames": true,
+    "strict": true,
+    "skipLibCheck": true,
+    "outDir": "./dist",
+    "rootDir": "./src"
+  },
+  "include": ["src/**/*"],
+  "exclude": ["node_modules", "dist"]
+}