npm - messi-crawler - Versions diffs - 1.0.0 - Mend

messi-crawler 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (73) hide show

package/README.md +201 -0
package/dist/cli/renderer.js +71 -0
package/dist/config.js +18 -0
package/dist/db/clear.js +16 -0
package/dist/db/client.js +20 -0
package/dist/db/queries.js +179 -0
package/dist/frontier/frontier.js +44 -0
package/dist/frontier/logger.js +65 -0
package/dist/frontier/robots.js +46 -0
package/dist/frontier/scheduler.js +98 -0
package/dist/index.js +533 -0
package/dist/normalizer.js +33 -0
package/dist/output/db-strategy.js +16 -0
package/dist/output/index.js +23 -0
package/dist/output/pdf-strategy.js +316 -0
package/dist/output/strategy.js +1 -0
package/dist/security/ssrf.js +45 -0
package/dist/security/validate-url.js +41 -0
package/dist/seed.js +14 -0
package/dist/setup.js +148 -0
package/dist/test/client.test.js +33 -0
package/dist/test/downloader.test.js +84 -0
package/dist/test/extractor.test.js +126 -0
package/dist/test/frontier.test.js +43 -0
package/dist/test/logger.test.js +55 -0
package/dist/test/normalizer.test.js +36 -0
package/dist/test/pdf-strategy.test.js +68 -0
package/dist/test/queries.test.js +173 -0
package/dist/test/robots.test.js +46 -0
package/dist/test/scheduler.test.js +73 -0
package/dist/test/seed.test.js +26 -0
package/dist/test/worker.test.js +118 -0
package/dist/worker/downloader.js +114 -0
package/dist/worker/extractor.js +197 -0
package/dist/worker/worker.js +87 -0
package/package.json +48 -0
package/seeds.txt +4 -0
package/src/cli/renderer.ts +83 -0
package/src/config.ts +22 -0
package/src/db/clear.ts +16 -0
package/src/db/client.ts +26 -0
package/src/db/queries.ts +255 -0
package/src/db/schema.sql +43 -0
package/src/frontier/frontier.ts +60 -0
package/src/frontier/logger.ts +75 -0
package/src/frontier/robots.ts +50 -0
package/src/frontier/scheduler.ts +119 -0
package/src/index.ts +596 -0
package/src/normalizer.ts +37 -0
package/src/output/db-strategy.ts +20 -0
package/src/output/index.ts +32 -0
package/src/output/pdf-strategy.ts +388 -0
package/src/output/strategy.ts +16 -0
package/src/security/ssrf.ts +48 -0
package/src/security/validate-url.ts +49 -0
package/src/seed.ts +18 -0
package/src/setup.ts +170 -0
package/src/test/client.test.ts +38 -0
package/src/test/downloader.test.ts +101 -0
package/src/test/extractor.test.ts +139 -0
package/src/test/frontier.test.ts +53 -0
package/src/test/logger.test.ts +71 -0
package/src/test/normalizer.test.ts +43 -0
package/src/test/pdf-strategy.test.ts +84 -0
package/src/test/queries.test.ts +247 -0
package/src/test/robots.test.ts +56 -0
package/src/test/scheduler.test.ts +90 -0
package/src/test/seed.test.ts +35 -0
package/src/test/worker.test.ts +144 -0
package/src/worker/downloader.ts +149 -0
package/src/worker/extractor.ts +235 -0
package/src/worker/worker.ts +100 -0
package/tsconfig.json +15 -0

package/src/test/client.test.ts ADDED Viewed

@@ -0,0 +1,38 @@
+import { describe, it, expect, vi } from "vitest";
+// Replace the "pg" module with a stub; declared ahead of the "../db/client.js" import that follows
+vi.mock("pg", () => {
+  const queryMock = vi.fn().mockResolvedValue({ rows: [] }); // every query resolves to an empty row set
+  const endMock = vi.fn().mockResolvedValue(undefined);
+  class PoolMock { // every PoolMock instance shares the same two mock functions
+    query = queryMock;
+    end = endMock;
+  }
+  return {
+    default: { // the stub is exposed both on the default export and as a named export
+      Pool: PoolMock,
+    },
+    Pool: PoolMock,
+  };
+});
+import { pool, query, closePool } from "../db/client.js";
+describe("Database Client", () => { // exercises pool, query and closePool imported from ../db/client.js
+  it("should expose pool and query function", async () => {
+    expect(pool).toBeDefined();
+    expect(query).toBeDefined();
+    expect(closePool).toBeDefined(); // closePool is checked too, although the test title does not name it
+  });
+  it("should delegate query call to pool", async () => {
+    const res = await query("SELECT 1");
+    expect(res).toEqual({ rows: [] }); // same shape the pg stub's query resolves with
+    expect(pool.query).toHaveBeenCalledWith("SELECT 1", undefined); // only SQL text was passed, so the second argument is expected as undefined
+  });
+  it("should call end on pool when closing", async () => {
+    await closePool();
+    expect(pool.end).toHaveBeenCalled();
+  });
+});

package/src/test/downloader.test.ts ADDED Viewed

@@ -0,0 +1,101 @@
+import { describe, it, expect, vi, beforeEach } from "vitest";
+// Replace undici's `request` with a bare mock; each test below supplies its own responses
+vi.mock("undici", () => {
+  return {
+    request: vi.fn(),
+  };
+});
+vi.mock("../config.js", () => { // fixed config values for these tests
+  return {
+    config: {
+      REQUEST_TIMEOUT_MS: 1000,
+      MAX_REDIRECTS: 2, // the redirect-limit test below expects 3 requests with this value
+    },
+  };
+});
+import { request } from "undici";
+import { downloadPage } from "../worker/downloader.js";
+const mockedRequest = vi.mocked(request); // typed handle on the mocked undici `request`
+describe("HTTP Downloader", () => {
+  beforeEach(() => {
+    vi.clearAllMocks(); // start every test without recorded calls from the previous one
+  });
+  it("should successfully download HTML page", async () => {
+    mockedRequest.mockResolvedValue({
+      statusCode: 200,
+      headers: { "content-type": "text/html" },
+      body: { text: async () => "<html>Hello</html>" },
+    } as any); // partial response object carrying only the fields listed here, hence the cast
+    const result = await downloadPage("https://react.dev");
+    expect(result).toEqual({
+      url: "https://react.dev", // no redirect occurred, so the requested URL is expected back
+      html: "<html>Hello</html>",
+      statusCode: 200,
+    });
+    expect(mockedRequest).toHaveBeenCalledTimes(1);
+  });
+  it("should follow redirects manually and return final URL", async () => {
+    mockedRequest
+      .mockResolvedValueOnce({ // first response: redirect carrying a Location header
+        statusCode: 301,
+        headers: { location: "https://react.dev/docs" },
+        body: { text: async () => "" },
+      } as any)
+      .mockResolvedValueOnce({ // second response: the final HTML page
+        statusCode: 200,
+        headers: { "content-type": "text/html; charset=utf-8" },
+        body: { text: async () => "docs html" },
+      } as any);
+    const result = await downloadPage("https://react.dev");
+    expect(result).toEqual({
+      url: "https://react.dev/docs", // the redirect target, not the originally requested URL
+      html: "docs html",
+      statusCode: 200,
+    });
+    expect(mockedRequest).toHaveBeenCalledTimes(2);
+    expect(mockedRequest).toHaveBeenNthCalledWith(1, "https://react.dev", expect.any(Object));
+    expect(mockedRequest).toHaveBeenNthCalledWith(2, "https://react.dev/docs", expect.any(Object));
+  });
+  it("should throw error if redirect limit is exceeded", async () => {
+    mockedRequest.mockResolvedValue({ // every request answers with another redirect
+      statusCode: 302,
+      headers: { location: "https://react.dev/loop" },
+      body: { text: async () => "" },
+    } as any);
+    await expect(downloadPage("https://react.dev")).rejects.toThrow("Too many redirects");
+    expect(mockedRequest).toHaveBeenCalledTimes(3); // 1 initial + 2 redirects (max redirects is 2)
+  });
+  it("should throw error for non-200 HTTP status code", async () => {
+    mockedRequest.mockResolvedValue({
+      statusCode: 404,
+      headers: {},
+      body: { text: async () => "Not Found" },
+    } as any);
+    await expect(downloadPage("https://react.dev")).rejects.toThrow("HTTP status 404");
+  });
+  it("should throw error for non-HTML content types", async () => {
+    mockedRequest.mockResolvedValue({
+      statusCode: 200, // status is fine; only the content type should cause the rejection
+      headers: { "content-type": "application/json" },
+      body: { text: async () => "{}" },
+    } as any);
+    await expect(downloadPage("https://react.dev")).rejects.toThrow("Non-HTML content type");
+  });
+});

package/src/test/extractor.test.ts ADDED Viewed

@@ -0,0 +1,139 @@
+import { describe, it, expect } from "vitest";
+import { extractPageData } from "../worker/extractor.js";
+describe("HTML Extractor", () => { // extractPageData is called directly on HTML strings; nothing is mocked in this file
+  it("should extract metadata, headings, clean text, and links", () => {
+    const sampleHtml = `
+      <!DOCTYPE html>
+      <html>
+        <head>
+          <title>Test Page Title</title>
+          <meta name="description" content="This is a test description.">
+          <link rel="canonical" href="https://example.com/canonical-url">
+        </head>
+        <body>
+          <style>body { color: red; }</style>
+          <h1>Heading One</h1>
+          <h2>Heading Two</h2>
+          <h3>Heading Three</h3>
+          <p>This is some body text. <a href="/about">About Us</a> and <a href="https://google.com">Google</a>.</p>
+          <script>console.log("hello");</script>
+        </body>
+      </html>
+    `;
+    const result = extractPageData(sampleHtml); // called with the HTML only, no second argument
+    expect(result.title).toBe("Test Page Title");
+    expect(result.description).toBe("This is a test description.");
+    expect(result.canonicalUrl).toBe("https://example.com/canonical-url");
+    expect(result.headings).toEqual({
+      h1: ["Heading One"],
+      h2: ["Heading Two"],
+      h3: ["Heading Three"],
+    });
+    // Style and script contents must not leak into the text; headings and the paragraph appear as one space-joined string
+    expect(result.textContent).toContain("Heading One Heading Two Heading Three This is some body text. About Us and Google.");
+    expect(result.textContent).not.toContain("color: red");
+    expect(result.textContent).not.toContain("console.log");
+    expect(result.links).toEqual(["/about", "https://google.com"]); // hrefs are expected exactly as written; "/about" stays relative
+  });
+  it("should handle missing tags gracefully", () => {
+    const sampleHtml = `
+      <html>
+        <body>
+          <p>Just some text</p>
+        </body>
+      </html>
+    `;
+    const result = extractPageData(sampleHtml);
+    expect(result.title).toBeNull(); // absent metadata is expected as null
+    expect(result.description).toBeNull();
+    expect(result.canonicalUrl).toBeNull();
+    expect(result.headings).toEqual({ h1: [], h2: [], h3: [] }); // heading keys are present even when empty
+    expect(result.textContent).toBe("Just some text");
+    expect(result.links).toEqual([]);
+  });
+  it("should select main content via tags (article/main/role=main) and remove chrome", () => {
+    const html = `
+      <html>
+        <body>
+          <header><nav>Header navigation links</nav></header>
+          <div role="main">
+            <article>
+              <h1>Article Title</h1>
+              <p>This is the actual article content.</p>
+              <footer>Article footer inside main</footer>
+            </article>
+          </div>
+          <footer>Site footer chrome</footer>
+        </body>
+      </html>
+    `;
+    const result = extractPageData(html);
+    // The exact-match assertion below also excludes the footer nested inside the article, not just the site-level header and footer
+    expect(result.textContent).toBe("Article Title This is the actual article content.");
+    expect(result.textContent).not.toContain("Header navigation links");
+    expect(result.textContent).not.toContain("Site footer chrome");
+  });
+  it("should select main content via text density score when no tag is present", () => {
+    const html = `
+      <html>
+        <body>
+          <div class="sidebar">
+            <p>Nav 1</p>
+            <p>Nav 2</p>
+          </div>
+          <div class="content">
+            <p>This is a much longer paragraph with a lot of text to ensure it has a higher text density compared to the sidebar. It contains many words and represents the main article body.</p>
+            <p>Another paragraph to increase text density even more.</p>
+          </div>
+        </body>
+      </html>
+    `;
+    const result = extractPageData(html);
+    expect(result.textContent).toContain("This is a much longer paragraph"); // text from the longer "content" div is kept
+    expect(result.textContent).not.toContain("Nav 1"); // text from the short "sidebar" div is dropped
+  });
+  it("should extract structured blocks and resolve image URLs", () => {
+    const html = `
+      <html>
+        <body>
+          <article>
+            <h1>Title</h1>
+            <p>Intro paragraph.</p>
+            <ul>
+              <li>Item A</li>
+              <li>Item B</li>
+            </ul>
+            <img src="/assets/photo.jpg" alt="A nice photo">
+          </article>
+        </body>
+      </html>
+    `;
+    const result = extractPageData(html, "https://example.com/blog/post-1"); // second argument looks like the page URL; the relative img src is expected back as absolute
+    expect(result.blocks).toBeDefined();
+    expect(result.blocks!.length).toBe(4); // heading, paragraph, list, image, in document order
+    expect(result.blocks![0]).toEqual({ type: "heading", level: 1, text: "Title" });
+    expect(result.blocks![1]).toEqual({ type: "paragraph", text: "Intro paragraph." });
+    expect(result.blocks![2]).toEqual({ type: "list", items: ["Item A", "Item B"] });
+    expect(result.blocks![3]).toEqual({
+      type: "image",
+      src: "https://example.com/assets/photo.jpg",
+      alt: "A nice photo",
+    });
+    expect(result.images).toEqual([ // the same image is also expected in the separate images list
+      { src: "https://example.com/assets/photo.jpg", alt: "A nice photo" },
+    ]);
+  });
+});

package/src/test/frontier.test.ts ADDED Viewed

@@ -0,0 +1,53 @@
+import { describe, it, expect, vi, beforeEach } from "vitest";
+// Replace the db client module so the frontier functions run against a mocked `query`
+vi.mock("../db/client.js", () => {
+  return {
+    query: vi.fn(),
+  };
+});
+import { query } from "../db/client.js";
+import { getPendingDomains, getPendingCounts } from "../frontier/frontier.js";
+const mockedQuery = vi.mocked(query); // typed handle on the mocked db `query`
+describe("URL Frontier", () => {
+  beforeEach(() => {
+    vi.clearAllMocks(); // start every test without recorded calls from the previous one
+  });
+  describe("getPendingDomains", () => {
+    it("should query and return active pending domains", async () => {
+      mockedQuery.mockResolvedValue({
+        rows: [{ domain: "react.dev" }, { domain: "typescriptlang.org" }],
+      } as any); // only `rows` is provided, hence the cast
+      const domains = await getPendingDomains();
+      expect(mockedQuery).toHaveBeenCalledTimes(1);
+      expect(mockedQuery).toHaveBeenCalledWith(expect.stringContaining("SELECT DISTINCT domain")); // only a fragment of the SQL text is asserted
+      expect(domains).toEqual(["react.dev", "typescriptlang.org"]); // rows are expected flattened to plain domain strings
+    });
+  });
+  describe("getPendingCounts", () => {
+    it("should query and return count breakdown of pending domains", async () => {
+      mockedQuery.mockResolvedValue({
+        rows: [
+          { domain: "react.dev", count: "10" }, // counts are supplied as strings here
+          { domain: "typescriptlang.org", count: "5" },
+        ],
+      } as any);
+      const counts = await getPendingCounts();
+      expect(mockedQuery).toHaveBeenCalledTimes(1);
+      expect(mockedQuery).toHaveBeenCalledWith(expect.stringContaining("COUNT(*)"));
+      expect(counts).toEqual({ // and are expected back as numbers keyed by domain
+        "react.dev": 10,
+        "typescriptlang.org": 5,
+      });
+    });
+  });
+});

package/src/test/logger.test.ts ADDED Viewed

@@ -0,0 +1,71 @@
+import { describe, it, expect, vi, beforeEach, afterEach } from "vitest";
+vi.mock("../db/queries.js", () => { // stub the three query helpers this test file imports from ../db/queries.js
+  return {
+    getGlobalStats: vi.fn(),
+    refreshDomainStats: vi.fn(),
+    getDomainStats: vi.fn(),
+  };
+});
+import { getGlobalStats, refreshDomainStats, getDomainStats } from "../db/queries.js";
+import { startProgressLogger, stopProgressLogger } from "../frontier/logger.js";
+const mockGetGlobalStats = vi.mocked(getGlobalStats); // typed handles on the mocked query helpers
+const mockRefreshDomainStats = vi.mocked(refreshDomainStats);
+const mockGetDomainStats = vi.mocked(getDomainStats);
+describe("Progress Logger", () => {
+  beforeEach(() => {
+    vi.useFakeTimers(); // lets the tests advance time manually instead of waiting
+    vi.clearAllMocks();
+    vi.spyOn(console, "log").mockImplementation(() => {}); // silence console output while keeping it inspectable
+    vi.spyOn(console, "error").mockImplementation(() => {});
+  });
+  afterEach(() => {
+    stopProgressLogger(); // stop the logger started by the test before restoring real timers
+    vi.useRealTimers();
+  });
+  it("should initialize stats and periodically log progress report", async () => {
+    mockGetGlobalStats.mockResolvedValue({ pending: 10, fetching: 2, done: 20, failed: 1 });
+    mockGetDomainStats.mockResolvedValue([
+      {
+        domain: "react.dev",
+        pending_count: 10,
+        fetching_count: 2,
+        done_count: 20,
+        failed_count: 1,
+        last_crawled_at: new Date("2026-06-05T12:00:00Z"),
+      },
+    ]);
+    await startProgressLogger(5000); // 5000 matches the 5000 ms the timers are advanced by below
+    // Initial query should be called to establish baseline
+    expect(mockGetGlobalStats).toHaveBeenCalledTimes(1);
+    // Fast-forward 5 seconds
+    await vi.advanceTimersByTimeAsync(5000);
+    expect(mockRefreshDomainStats).toHaveBeenCalledTimes(1); // one tick: one refresh, one further global read, one per-domain read
+    expect(mockGetGlobalStats).toHaveBeenCalledTimes(2);
+    expect(mockGetDomainStats).toHaveBeenCalledTimes(1);
+    expect(console.log).toHaveBeenCalledWith(expect.stringContaining("Crawler Progress Report"));
+    expect(console.log).toHaveBeenCalledWith(expect.stringContaining("PENDING : 10"));
+    expect(console.log).toHaveBeenCalledWith(expect.stringContaining("react.dev"));
+  });
+  it("should handle query errors gracefully", async () => {
+    mockGetGlobalStats.mockRejectedValue(new Error("Database connection lost")); // every getGlobalStats call rejects in this test
+    await startProgressLogger(5000);
+    await vi.advanceTimersByTimeAsync(5000);
+    expect(console.error).toHaveBeenCalledWith( // the failure is expected to be logged with this exact prefix
+      "Error generating crawler progress logs:",
+      expect.any(Error)
+    );
+  });
+});

package/src/test/normalizer.test.ts ADDED Viewed

@@ -0,0 +1,43 @@
+import { describe, it, expect } from "vitest";
+import { normalizeURL, getDomain } from "../normalizer.js";
+describe("URL Normalizer", () => { // normalizeURL(url, base) and getDomain(url) are called directly; nothing is mocked
+  describe("normalizeURL", () => {
+    it("should resolve relative URLs against base URL", () => {
+      expect(normalizeURL("/relative/path", "https://react.dev")).toBe("https://react.dev/relative/path");
+      expect(normalizeURL("relative/path", "https://react.dev/sub/")).toBe("https://react.dev/sub/relative/path"); // no leading slash: resolved under the base path
+    });
+    it("should strip trailing slash (including bare domain)", () => {
+      expect(normalizeURL("https://example.com/", "https://react.dev")).toBe("https://example.com"); // absolute input: the base does not appear in the result
+      expect(normalizeURL("https://example.com/about/", "https://react.dev")).toBe("https://example.com/about");
+    });
+    it("should strip fragments", () => {
+      expect(normalizeURL("https://example.com#section", "https://react.dev")).toBe("https://example.com");
+      expect(normalizeURL("https://example.com/about#team", "https://react.dev")).toBe("https://example.com/about");
+    });
+    it("should lowercase scheme and host", () => {
+      expect(normalizeURL("HTTPS://EXAMPLE.COM/About", "https://react.dev")).toBe("https://example.com/About"); // the path keeps its original case
+    });
+    it("should filter out unsupported protocols", () => {
+      expect(normalizeURL("ftp://example.com", "https://react.dev")).toBeNull(); // rejected inputs are expected as null
+      expect(normalizeURL("javascript:void(0)", "https://react.dev")).toBeNull();
+      expect(normalizeURL("mailto:test@example.com", "https://react.dev")).toBeNull();
+    });
+    it("should preserve query parameters", () => {
+      expect(normalizeURL("https://example.com/search?q=typescript", "https://react.dev")).toBe("https://example.com/search?q=typescript");
+    });
+  });
+  describe("getDomain", () => {
+    it("should extract hostname correctly", () => {
+      expect(getDomain("https://react.dev/docs/getting-started")).toBe("react.dev");
+      expect(getDomain("http://localhost:3000/test")).toBe("localhost"); // the port is not part of the expected value
+      expect(getDomain("invalid-url")).toBeNull(); // unparseable input is expected as null
+    });
+  });
+});

package/src/test/pdf-strategy.test.ts ADDED Viewed

@@ -0,0 +1,84 @@
+import { describe, it, expect, vi, beforeEach, afterEach } from "vitest";
+import fs from "fs";
+import path from "path";
+import { PdfStrategy } from "../output/pdf-strategy.js";
+// Replace the downloader module so no image is fetched; only downloadImage is provided
+vi.mock("../worker/downloader.js", () => {
+  return {
+    downloadImage: vi.fn(),
+  };
+});
+// Replace the db queries module; markDone resolves immediately with undefined
+vi.mock("../db/queries.js", () => {
+  return {
+    markDone: vi.fn().mockResolvedValue(undefined),
+  };
+});
+import { downloadImage } from "../worker/downloader.js";
+import { markDone } from "../db/queries.js";
+const mockDownloadImage = vi.mocked(downloadImage); // typed handle on the mocked downloadImage
+describe("PdfStrategy Integration", () => {
+  let strategy: PdfStrategy; // recreated in beforeEach, cleaned up in afterEach
+  beforeEach(() => {
+    vi.clearAllMocks();
+    // Mock downloadImage to return a valid 1x1 PNG buffer
+    const oneByOnePng = Buffer.from(
+      "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mNkYAAAAAYAAjCB0C8AAAAASUVORK5CYII=",
+      "base64"
+    );
+    mockDownloadImage.mockResolvedValue(oneByOnePng);
+    strategy = new PdfStrategy();
+  });
+  afterEach(() => {
+    // Delete only the file at this strategy instance's pdfPath, if one exists
+    if (strategy && (strategy as any).pdfPath) { // pdfPath is read through an any-cast; presumably not publicly typed on PdfStrategy — confirm
+      const filePath = (strategy as any).pdfPath;
+      if (fs.existsSync(filePath)) {
+        try {
+          fs.unlinkSync(filePath);
+        } catch {
+          // Best-effort cleanup: a failed delete must not fail the test
+        }
+      }
+    }
+  });
+  it("should successfully generate a PDF document from structured blocks", async () => {
+    await strategy.init(); // lifecycle exercised here: init, then save, then finish
+    await strategy.save(1, "https://react.dev/docs", {
+      title: "React Documentation",
+      description: "Learn React library",
+      canonicalUrl: "https://react.dev/docs",
+      headings: { h1: ["React"], h2: [], h3: [] },
+      textContent: "Learn React details...",
+      blocks: [ // one block of each type: heading, paragraph, list, image
+        { type: "heading", level: 1, text: "React Basics" },
+        { type: "paragraph", text: "React is a JavaScript library for building user interfaces." },
+        { type: "list", items: ["Component-Based", "Declarative UI", "Learn Once, Write Anywhere"] },
+        { type: "image", src: "https://react.dev/logo.png", alt: "React Logo" },
+      ],
+      images: [
+        { src: "https://react.dev/logo.png", alt: "React Logo" },
+      ],
+    });
+    await strategy.finish();
+    // Verify the mocked markDone was called exactly once
+    expect(markDone).toHaveBeenCalledTimes(1);
+    // Verify a file exists at the path the strategy recorded in pdfPath
+    const pdfPath = (strategy as any).pdfPath;
+    expect(pdfPath).toBeDefined();
+    expect(fs.existsSync(pdfPath)).toBe(true);
+  });
+});