messi-crawler 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +201 -0
- package/dist/cli/renderer.js +71 -0
- package/dist/config.js +18 -0
- package/dist/db/clear.js +16 -0
- package/dist/db/client.js +20 -0
- package/dist/db/queries.js +179 -0
- package/dist/frontier/frontier.js +44 -0
- package/dist/frontier/logger.js +65 -0
- package/dist/frontier/robots.js +46 -0
- package/dist/frontier/scheduler.js +98 -0
- package/dist/index.js +533 -0
- package/dist/normalizer.js +33 -0
- package/dist/output/db-strategy.js +16 -0
- package/dist/output/index.js +23 -0
- package/dist/output/pdf-strategy.js +316 -0
- package/dist/output/strategy.js +1 -0
- package/dist/security/ssrf.js +45 -0
- package/dist/security/validate-url.js +41 -0
- package/dist/seed.js +14 -0
- package/dist/setup.js +148 -0
- package/dist/test/client.test.js +33 -0
- package/dist/test/downloader.test.js +84 -0
- package/dist/test/extractor.test.js +126 -0
- package/dist/test/frontier.test.js +43 -0
- package/dist/test/logger.test.js +55 -0
- package/dist/test/normalizer.test.js +36 -0
- package/dist/test/pdf-strategy.test.js +68 -0
- package/dist/test/queries.test.js +173 -0
- package/dist/test/robots.test.js +46 -0
- package/dist/test/scheduler.test.js +73 -0
- package/dist/test/seed.test.js +26 -0
- package/dist/test/worker.test.js +118 -0
- package/dist/worker/downloader.js +114 -0
- package/dist/worker/extractor.js +197 -0
- package/dist/worker/worker.js +87 -0
- package/package.json +48 -0
- package/seeds.txt +4 -0
- package/src/cli/renderer.ts +83 -0
- package/src/config.ts +22 -0
- package/src/db/clear.ts +16 -0
- package/src/db/client.ts +26 -0
- package/src/db/queries.ts +255 -0
- package/src/db/schema.sql +43 -0
- package/src/frontier/frontier.ts +60 -0
- package/src/frontier/logger.ts +75 -0
- package/src/frontier/robots.ts +50 -0
- package/src/frontier/scheduler.ts +119 -0
- package/src/index.ts +596 -0
- package/src/normalizer.ts +37 -0
- package/src/output/db-strategy.ts +20 -0
- package/src/output/index.ts +32 -0
- package/src/output/pdf-strategy.ts +388 -0
- package/src/output/strategy.ts +16 -0
- package/src/security/ssrf.ts +48 -0
- package/src/security/validate-url.ts +49 -0
- package/src/seed.ts +18 -0
- package/src/setup.ts +170 -0
- package/src/test/client.test.ts +38 -0
- package/src/test/downloader.test.ts +101 -0
- package/src/test/extractor.test.ts +139 -0
- package/src/test/frontier.test.ts +53 -0
- package/src/test/logger.test.ts +71 -0
- package/src/test/normalizer.test.ts +43 -0
- package/src/test/pdf-strategy.test.ts +84 -0
- package/src/test/queries.test.ts +247 -0
- package/src/test/robots.test.ts +56 -0
- package/src/test/scheduler.test.ts +90 -0
- package/src/test/seed.test.ts +35 -0
- package/src/test/worker.test.ts +144 -0
- package/src/worker/downloader.ts +149 -0
- package/src/worker/extractor.ts +235 -0
- package/src/worker/worker.ts +100 -0
- package/tsconfig.json +15 -0
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
import { describe, it, expect, vi, beforeEach } from "vitest";

// Stub undici's `request` so the suite never performs real network I/O.
vi.mock("undici", () => ({
    request: vi.fn(),
}));

// Fixed configuration for this suite. MAX_REDIRECTS: 2 is the value the
// redirect-limit test below bases its expected call count on.
vi.mock("../config.js", () => ({
    config: {
        REQUEST_TIMEOUT_MS: 1000,
        MAX_REDIRECTS: 2,
    },
}));

import { request } from "undici";
import { downloadPage } from "../worker/downloader.js";

const mockedRequest = vi.mocked(request);

// URL every test passes to downloadPage.
const START_URL = "https://react.dev";

/**
 * Builds the response object shape these tests hand to the mocked `request`.
 *
 * @param {number} statusCode - HTTP status code to report.
 * @param {Record<string, string>} headers - Response headers.
 * @param {string} text - Value that `body.text()` resolves to.
 * @returns {{ statusCode: number, headers: Record<string, string>, body: { text: () => Promise<string> } }}
 */
function fakeResponse(statusCode, headers, text) {
    return {
        statusCode,
        headers,
        body: { text: async () => text },
    };
}

describe("HTTP Downloader", () => {
    beforeEach(() => {
        // resetAllMocks rather than clearAllMocks: clearing only wipes call
        // history, so a mockResolvedValue installed by one test would still
        // answer requests in the next one. Resetting drops it as well.
        vi.resetAllMocks();
    });

    it("should successfully download HTML page", async () => {
        mockedRequest.mockResolvedValue(
            fakeResponse(200, { "content-type": "text/html" }, "<html>Hello</html>"),
        );

        const result = await downloadPage(START_URL);

        expect(result).toEqual({
            url: "https://react.dev",
            html: "<html>Hello</html>",
            statusCode: 200,
        });
        expect(mockedRequest).toHaveBeenCalledTimes(1);
    });

    it("should follow redirects manually and return final URL", async () => {
        // First call answers with a redirect, second call with the final page.
        mockedRequest
            .mockResolvedValueOnce(
                fakeResponse(301, { location: "https://react.dev/docs" }, ""),
            )
            .mockResolvedValueOnce(
                fakeResponse(200, { "content-type": "text/html; charset=utf-8" }, "docs html"),
            );

        const result = await downloadPage(START_URL);

        expect(result).toEqual({
            url: "https://react.dev/docs",
            html: "docs html",
            statusCode: 200,
        });
        expect(mockedRequest).toHaveBeenCalledTimes(2);
        expect(mockedRequest).toHaveBeenNthCalledWith(1, "https://react.dev", expect.any(Object));
        expect(mockedRequest).toHaveBeenNthCalledWith(2, "https://react.dev/docs", expect.any(Object));
    });

    it("should throw error if redirect limit is exceeded", async () => {
        // Every response redirects again, so the downloader can never finish.
        mockedRequest.mockResolvedValue(
            fakeResponse(302, { location: "https://react.dev/loop" }, ""),
        );

        await expect(downloadPage(START_URL)).rejects.toThrow("Too many redirects");
        // 1 initial request + 2 followed redirects (MAX_REDIRECTS is mocked to 2).
        expect(mockedRequest).toHaveBeenCalledTimes(3);
    });

    it("should throw error for non-200 HTTP status code", async () => {
        mockedRequest.mockResolvedValue(fakeResponse(404, {}, "Not Found"));

        await expect(downloadPage(START_URL)).rejects.toThrow("HTTP status 404");
    });

    it("should throw error for non-HTML content types", async () => {
        mockedRequest.mockResolvedValue(
            fakeResponse(200, { "content-type": "application/json" }, "{}"),
        );

        await expect(downloadPage(START_URL)).rejects.toThrow("Non-HTML content type");
    });
});
|
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
import { describe, it, expect } from "vitest";
import { extractPageData } from "../worker/extractor.js";

// HTML fixtures are kept flush-left at module scope so the template strings
// contain exactly the characters shown, with no extra indentation.

// Page with full <head> metadata, three heading levels, two links, and
// <style>/<script> elements that must not leak into the extracted text.
const FULL_PAGE_HTML = `
<!DOCTYPE html>
<html>
<head>
<title>Test Page Title</title>
<meta name="description" content="This is a test description.">
<link rel="canonical" href="https://example.com/canonical-url">
</head>
<body>
<style>body { color: red; }</style>
<h1>Heading One</h1>
<h2>Heading Two</h2>
<h3>Heading Three</h3>
<p>This is some body text. <a href="/about">About Us</a> and <a href="https://google.com">Google</a>.</p>
<script>console.log("hello");</script>
</body>
</html>
`;

// Page with no <head>, no headings and no links.
const BARE_PAGE_HTML = `
<html>
<body>
<p>Just some text</p>
</body>
</html>
`;

// Page whose main content is marked up with role="main" / <article>,
// surrounded by header, nav and footer chrome.
const TAGGED_MAIN_HTML = `
<html>
<body>
<header><nav>Header navigation links</nav></header>
<div role="main">
<article>
<h1>Article Title</h1>
<p>This is the actual article content.</p>
<footer>Article footer inside main</footer>
</article>
</div>
<footer>Site footer chrome</footer>
</body>
</html>
`;

// Page without semantic main-content tags: a short sidebar next to a
// text-heavy content div.
const DENSITY_HTML = `
<html>
<body>
<div class="sidebar">
<p>Nav 1</p>
<p>Nav 2</p>
</div>
<div class="content">
<p>This is a much longer paragraph with a lot of text to ensure it has a higher text density compared to the sidebar. It contains many words and represents the main article body.</p>
<p>Another paragraph to increase text density even more.</p>
</div>
</body>
</html>
`;

// Article containing one heading, one paragraph, one list and one image
// with a root-relative src.
const BLOCKS_HTML = `
<html>
<body>
<article>
<h1>Title</h1>
<p>Intro paragraph.</p>
<ul>
<li>Item A</li>
<li>Item B</li>
</ul>
<img src="/assets/photo.jpg" alt="A nice photo">
</article>
</body>
</html>
`;

describe("HTML Extractor", () => {
    it("should extract metadata, headings, clean text, and links", () => {
        const page = extractPageData(FULL_PAGE_HTML);

        expect(page.title).toBe("Test Page Title");
        expect(page.description).toBe("This is a test description.");
        expect(page.canonicalUrl).toBe("https://example.com/canonical-url");
        expect(page.headings).toEqual({
            h1: ["Heading One"],
            h2: ["Heading Two"],
            h3: ["Heading Three"],
        });

        // Only heading and paragraph text is expected; the CSS rule and the
        // inline script source must be absent.
        const text = page.textContent;
        expect(text).toContain("Heading One Heading Two Heading Three This is some body text. About Us and Google.");
        expect(text).not.toContain("color: red");
        expect(text).not.toContain("console.log");

        expect(page.links).toEqual(["/about", "https://google.com"]);
    });

    it("should handle missing tags gracefully", () => {
        const page = extractPageData(BARE_PAGE_HTML);

        // Absent metadata is reported as null, absent collections as empty.
        expect(page.title).toBeNull();
        expect(page.description).toBeNull();
        expect(page.canonicalUrl).toBeNull();
        expect(page.headings).toEqual({ h1: [], h2: [], h3: [] });
        expect(page.textContent).toBe("Just some text");
        expect(page.links).toEqual([]);
    });

    it("should select main content via tags (article/main/role=main) and remove chrome", () => {
        const { textContent } = extractPageData(TAGGED_MAIN_HTML);

        // The exact match shows that the footer nested inside the article is
        // dropped along with the page-level header and footer.
        expect(textContent).toBe("Article Title This is the actual article content.");
        expect(textContent).not.toContain("Header navigation links");
        expect(textContent).not.toContain("Site footer chrome");
    });

    it("should select main content via text density score when no tag is present", () => {
        const { textContent } = extractPageData(DENSITY_HTML);

        expect(textContent).toContain("This is a much longer paragraph");
        expect(textContent).not.toContain("Nav 1");
    });

    it("should extract structured blocks and resolve image URLs", () => {
        // The second argument is the page URL that the relative image src is
        // expected to be resolved against.
        const { blocks, images } = extractPageData(BLOCKS_HTML, "https://example.com/blog/post-1");
        const resolvedImage = { src: "https://example.com/assets/photo.jpg", alt: "A nice photo" };

        expect(blocks).toBeDefined();
        expect(blocks.length).toBe(4);

        const [headingBlock, paragraphBlock, listBlock, imageBlock] = blocks;
        expect(headingBlock).toEqual({ type: "heading", level: 1, text: "Title" });
        expect(paragraphBlock).toEqual({ type: "paragraph", text: "Intro paragraph." });
        expect(listBlock).toEqual({ type: "list", items: ["Item A", "Item B"] });
        expect(imageBlock).toEqual({ type: "image", ...resolvedImage });

        expect(images).toEqual([resolvedImage]);
    });
});
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
import { describe, it, expect, vi, beforeEach } from "vitest";

// Swap the database client for a mock so the frontier helpers never reach a
// real database.
vi.mock("../db/client.js", () => ({
    query: vi.fn(),
}));

import { query } from "../db/client.js";
import { getPendingDomains, getPendingCounts } from "../frontier/frontier.js";

const mockedQuery = vi.mocked(query);

describe("URL Frontier", () => {
    beforeEach(() => {
        vi.clearAllMocks();
    });

    describe("getPendingDomains", () => {
        it("should query and return active pending domains", async () => {
            const dbRows = [{ domain: "react.dev" }, { domain: "typescriptlang.org" }];
            mockedQuery.mockResolvedValue({ rows: dbRows });

            const pendingDomains = await getPendingDomains();

            // One query is issued, and the row objects are flattened to names.
            expect(mockedQuery).toHaveBeenCalledTimes(1);
            expect(mockedQuery).toHaveBeenCalledWith(expect.stringContaining("SELECT DISTINCT domain"));
            expect(pendingDomains).toEqual(["react.dev", "typescriptlang.org"]);
        });
    });

    describe("getPendingCounts", () => {
        it("should query and return count breakdown of pending domains", async () => {
            // Counts are supplied as strings here and expected back as numbers.
            const dbRows = [
                { domain: "react.dev", count: "10" },
                { domain: "typescriptlang.org", count: "5" },
            ];
            mockedQuery.mockResolvedValue({ rows: dbRows });

            const pendingCounts = await getPendingCounts();

            expect(mockedQuery).toHaveBeenCalledTimes(1);
            expect(mockedQuery).toHaveBeenCalledWith(expect.stringContaining("COUNT(*)"));
            expect(pendingCounts).toEqual({
                "react.dev": 10,
                "typescriptlang.org": 5,
            });
        });
    });
});
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
import { describe, it, expect, vi, beforeEach, afterEach } from "vitest";

// The logger's three query helpers are mocked so each test controls the
// statistics it reports.
vi.mock("../db/queries.js", () => ({
    getGlobalStats: vi.fn(),
    refreshDomainStats: vi.fn(),
    getDomainStats: vi.fn(),
}));

import { getGlobalStats, refreshDomainStats, getDomainStats } from "../db/queries.js";
import { startProgressLogger, stopProgressLogger } from "../frontier/logger.js";

const mockGetGlobalStats = vi.mocked(getGlobalStats);
const mockRefreshDomainStats = vi.mocked(refreshDomainStats);
const mockGetDomainStats = vi.mocked(getDomainStats);

// Reporting interval handed to startProgressLogger, in milliseconds.
const REPORT_INTERVAL_MS = 5000;

describe("Progress Logger", () => {
    beforeEach(() => {
        // Fake timers let the tests jump ahead by one interval instead of waiting.
        vi.useFakeTimers();
        vi.clearAllMocks();
        // Silence console output while still recording it for assertions.
        vi.spyOn(console, "log").mockImplementation(() => { });
        vi.spyOn(console, "error").mockImplementation(() => { });
    });

    afterEach(() => {
        // Stop the logger before real timers come back.
        stopProgressLogger();
        vi.useRealTimers();
    });

    it("should initialize stats and periodically log progress report", async () => {
        const globalStats = { pending: 10, fetching: 2, done: 20, failed: 1 };
        const domainStats = [
            {
                domain: "react.dev",
                pending_count: 10,
                fetching_count: 2,
                done_count: 20,
                failed_count: 1,
                last_crawled_at: new Date("2026-06-05T12:00:00Z"),
            },
        ];
        mockGetGlobalStats.mockResolvedValue(globalStats);
        mockGetDomainStats.mockResolvedValue(domainStats);

        await startProgressLogger(REPORT_INTERVAL_MS);

        // Starting the logger performs a single baseline stats query.
        expect(mockGetGlobalStats).toHaveBeenCalledTimes(1);

        // Jump ahead by exactly one reporting interval.
        await vi.advanceTimersByTimeAsync(REPORT_INTERVAL_MS);

        expect(mockRefreshDomainStats).toHaveBeenCalledTimes(1);
        expect(mockGetGlobalStats).toHaveBeenCalledTimes(2);
        expect(mockGetDomainStats).toHaveBeenCalledTimes(1);

        for (const fragment of ["Crawler Progress Report", "PENDING : 10", "react.dev"]) {
            expect(console.log).toHaveBeenCalledWith(expect.stringContaining(fragment));
        }
    });

    it("should handle query errors gracefully", async () => {
        mockGetGlobalStats.mockRejectedValue(new Error("Database connection lost"));

        await startProgressLogger(REPORT_INTERVAL_MS);
        await vi.advanceTimersByTimeAsync(REPORT_INTERVAL_MS);

        // The failure is expected to be reported via console.error.
        expect(console.error).toHaveBeenCalledWith("Error generating crawler progress logs:", expect.any(Error));
    });
});
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
import { describe, it, expect } from "vitest";
import { normalizeURL, getDomain } from "../normalizer.js";

// Base URL passed as the second argument in most normalizeURL cases.
const BASE_URL = "https://react.dev";

describe("URL Normalizer", () => {
    describe("normalizeURL", () => {
        it("should resolve relative URLs against base URL", () => {
            // Root-relative path against a bare origin.
            const fromRoot = normalizeURL("/relative/path", BASE_URL);
            expect(fromRoot).toBe("https://react.dev/relative/path");

            // Document-relative path against a base that ends in a directory.
            const fromDirectory = normalizeURL("relative/path", "https://react.dev/sub/");
            expect(fromDirectory).toBe("https://react.dev/sub/relative/path");
        });

        it("should strip trailing slash (including bare domain)", () => {
            const bareDomain = normalizeURL("https://example.com/", BASE_URL);
            expect(bareDomain).toBe("https://example.com");

            const nestedPath = normalizeURL("https://example.com/about/", BASE_URL);
            expect(nestedPath).toBe("https://example.com/about");
        });

        it("should strip fragments", () => {
            const bareDomain = normalizeURL("https://example.com#section", BASE_URL);
            expect(bareDomain).toBe("https://example.com");

            const nestedPath = normalizeURL("https://example.com/about#team", BASE_URL);
            expect(nestedPath).toBe("https://example.com/about");
        });

        it("should lowercase scheme and host", () => {
            // The path keeps its original case; only scheme and host change.
            const normalized = normalizeURL("HTTPS://EXAMPLE.COM/About", BASE_URL);
            expect(normalized).toBe("https://example.com/About");
        });

        it("should filter out unsupported protocols", () => {
            const rejectedInputs = [
                "ftp://example.com",
                "javascript:void(0)",
                "mailto:test@example.com",
            ];
            for (const candidate of rejectedInputs) {
                expect(normalizeURL(candidate, BASE_URL)).toBeNull();
            }
        });

        it("should preserve query parameters", () => {
            const searchUrl = "https://example.com/search?q=typescript";
            expect(normalizeURL(searchUrl, BASE_URL)).toBe("https://example.com/search?q=typescript");
        });
    });

    describe("getDomain", () => {
        it("should extract hostname correctly", () => {
            expect(getDomain("https://react.dev/docs/getting-started")).toBe("react.dev");
            // The expected value has no port, only the hostname.
            expect(getDomain("http://localhost:3000/test")).toBe("localhost");
            // Input that is not a URL is expected to produce null.
            expect(getDomain("invalid-url")).toBeNull();
        });
    });
});
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
import { describe, it, expect, vi, beforeEach, afterEach } from "vitest";
import fs from "fs";
import { PdfStrategy } from "../output/pdf-strategy.js";

// Image downloads are mocked so the test never touches the network.
vi.mock("../worker/downloader.js", () => ({
    downloadImage: vi.fn(),
}));

// markDone is mocked so no database is needed.
vi.mock("../db/queries.js", () => ({
    markDone: vi.fn().mockResolvedValue(undefined),
}));

import { downloadImage } from "../worker/downloader.js";
import { markDone } from "../db/queries.js";

const mockDownloadImage = vi.mocked(downloadImage);

// Base64 payload of a valid 1x1 PNG, returned for every mocked image download.
const ONE_BY_ONE_PNG_BASE64 = "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mNkYAAAAAYAAjCB0C8AAAAASUVORK5CYII=";

describe("PdfStrategy Integration", () => {
    let strategy;

    beforeEach(() => {
        vi.clearAllMocks();
        mockDownloadImage.mockResolvedValue(Buffer.from(ONE_BY_ONE_PNG_BASE64, "base64"));
        strategy = new PdfStrategy();
    });

    afterEach(() => {
        // Remove only the file produced by this test's strategy instance.
        const generatedPath = strategy && strategy.pdfPath;
        if (!generatedPath || !fs.existsSync(generatedPath)) {
            return;
        }
        try {
            fs.unlinkSync(generatedPath);
        }
        catch {
            // Best-effort cleanup: a file that cannot be deleted is left behind.
        }
    });

    it("should successfully generate a PDF document from structured blocks", async () => {
        const logoImage = { src: "https://react.dev/logo.png", alt: "React Logo" };
        // One block of every type used in this suite: heading, paragraph,
        // list and image.
        const pageContent = {
            title: "React Documentation",
            description: "Learn React library",
            canonicalUrl: "https://react.dev/docs",
            headings: { h1: ["React"], h2: [], h3: [] },
            textContent: "Learn React details...",
            blocks: [
                { type: "heading", level: 1, text: "React Basics" },
                { type: "paragraph", text: "React is a JavaScript library for building user interfaces." },
                { type: "list", items: ["Component-Based", "Declarative UI", "Learn Once, Write Anywhere"] },
                { type: "image", ...logoImage },
            ],
            images: [{ ...logoImage }],
        };

        // Full lifecycle: init, save one page, finish.
        await strategy.init();
        await strategy.save(1, "https://react.dev/docs", pageContent);
        await strategy.finish();

        // The page must have been marked as done exactly once.
        expect(markDone).toHaveBeenCalledTimes(1);

        // A PDF file must exist at the path the strategy reports.
        const pdfPath = strategy.pdfPath;
        expect(pdfPath).toBeDefined();
        expect(fs.existsSync(pdfPath)).toBe(true);
    });
});
|
|
@@ -0,0 +1,173 @@
|
|
|
1
|
+
import { describe, it, expect, vi, beforeEach } from "vitest";

// Replace the database client with mocks: `query` for single statements and
// `pool.connect` for the transactional path.
vi.mock("../db/client.js", () => ({
    query: vi.fn(),
    pool: {
        connect: vi.fn(),
    },
}));

// These imports resolve to the mocked client above.
import { query, pool } from "../db/client.js";
import { claimNextURL, markDone, markFailed, insertURL, insertLink, resetStaleLocks, getGlobalStats, refreshDomainStats, getDomainStats, } from "../db/queries.js";

const mockedQuery = vi.mocked(query);
const mockedPool = vi.mocked(pool);

/**
 * Creates a stand-in for a pooled client as handed out by `pool.connect()`.
 *
 * @param {Function} queryMock - Mock used as the client's `query` method.
 * @returns {{ query: Function, release: Function }} Fake client with a fresh `release` mock.
 */
const makeFakeClient = (queryMock) => ({
    query: queryMock,
    release: vi.fn(),
});

/**
 * Returns a fresh page-content object for the markDone tests.
 *
 * @returns {object} Extracted page content fixture.
 */
const makePageContent = () => ({
    title: "React",
    description: "Library",
    canonicalUrl: "https://react.dev",
    headings: { h1: ["H1"], h2: [], h3: [] },
    textContent: "Body content",
});

describe("Database Queries", () => {
    beforeEach(() => {
        vi.clearAllMocks();
        // Default every query to an empty result set; tests override as needed.
        mockedQuery.mockReset().mockResolvedValue({ rows: [] });
        mockedPool.connect.mockReset();
    });

    describe("claimNextURL", () => {
        it("should return the row if a PENDING URL is found", async () => {
            const claimedRow = { id: 1, url: "https://react.dev", domain: "react.dev", status: "FETCHING", depth: 0 };
            mockedQuery.mockResolvedValue({ rows: [claimedRow] });

            const claimed = await claimNextURL("react.dev");

            expect(mockedQuery).toHaveBeenCalledTimes(1);
            expect(mockedQuery).toHaveBeenCalledWith(expect.stringContaining("UPDATE urls"), ["react.dev"]);
            expect(claimed).toEqual(claimedRow);
        });

        it("should return null if no PENDING URL is found", async () => {
            mockedQuery.mockResolvedValue({ rows: [] });

            const claimed = await claimNextURL("react.dev");

            expect(claimed).toBeNull();
        });
    });

    describe("markDone", () => {
        it("should execute queries in a transaction", async () => {
            const fakeClient = makeFakeClient(vi.fn().mockResolvedValue({}));
            mockedPool.connect.mockResolvedValue(fakeClient);
            const content = makePageContent();

            await markDone(1, content);

            expect(mockedPool.connect).toHaveBeenCalledTimes(1);
            // Expected statements: BEGIN, page insert, URL status update, COMMIT.
            expect(fakeClient.query).toHaveBeenCalledWith("BEGIN");
            expect(fakeClient.query).toHaveBeenCalledWith(expect.stringContaining("INSERT INTO crawled_pages"), [1, "React", "Library", "https://react.dev", JSON.stringify(content.headings), "Body content"]);
            expect(fakeClient.query).toHaveBeenCalledWith(expect.stringContaining("UPDATE urls"), [1]);
            expect(fakeClient.query).toHaveBeenCalledWith("COMMIT");
            // The client must be released once.
            expect(fakeClient.release).toHaveBeenCalledTimes(1);
        });

        it("should rollback transaction on error", async () => {
            // Fail only the page insert; every other statement succeeds.
            const failOnInsert = vi.fn().mockImplementation((sql) => {
                if (sql.includes("INSERT INTO crawled_pages")) {
                    throw new Error("DB Error");
                }
                return Promise.resolve({});
            });
            const fakeClient = makeFakeClient(failOnInsert);
            mockedPool.connect.mockResolvedValue(fakeClient);

            await expect(markDone(1, makePageContent())).rejects.toThrow("DB Error");

            expect(fakeClient.query).toHaveBeenCalledWith("ROLLBACK");
            // The client must be released on the failure path too.
            expect(fakeClient.release).toHaveBeenCalledTimes(1);
        });
    });

    describe("markFailed", () => {
        it("should update status to FAILED with error message", async () => {
            mockedQuery.mockResolvedValue({ rows: [] });

            await markFailed(1, "Connection timeout");

            expect(mockedQuery).toHaveBeenCalledTimes(1);
            expect(mockedQuery).toHaveBeenCalledWith(expect.stringContaining("UPDATE urls"), [1, "Connection timeout"]);
        });
    });

    describe("insertURL", () => {
        it("should return the ID of the URL", async () => {
            mockedQuery.mockResolvedValue({ rows: [{ id: 42 }] });

            const urlId = await insertURL("https://react.dev", "react.dev", 1);

            expect(urlId).toBe(42);
            expect(mockedQuery).toHaveBeenCalledTimes(1);
            expect(mockedQuery).toHaveBeenCalledWith(expect.stringContaining("WITH ins AS"), ["https://react.dev", "react.dev", 1]);
        });
    });

    describe("insertLink", () => {
        it("should insert edge into links table", async () => {
            mockedQuery.mockResolvedValue({ rows: [] });

            await insertLink(1, 2);

            expect(mockedQuery).toHaveBeenCalledTimes(1);
            expect(mockedQuery).toHaveBeenCalledWith(expect.stringContaining("INSERT INTO links"), [1, 2]);
        });
    });

    describe("resetStaleLocks", () => {
        it("should reset FETCHING urls back to PENDING", async () => {
            mockedQuery.mockResolvedValue({ rows: [] });

            await resetStaleLocks();

            expect(mockedQuery).toHaveBeenCalledTimes(1);
            // The single statement must mention the table and both statuses.
            const requiredFragments = ["UPDATE urls", "status = 'PENDING'", "status = 'FETCHING'"];
            for (const fragment of requiredFragments) {
                expect(mockedQuery).toHaveBeenCalledWith(expect.stringContaining(fragment));
            }
        });
    });

    describe("getGlobalStats", () => {
        it("should map query results to GlobalStats structure", async () => {
            // Only two statuses come back; the missing ones are expected as 0.
            const statusRows = [
                { status: "PENDING", count: "10" },
                { status: "DONE", count: "5" },
            ];
            mockedQuery.mockResolvedValue({ rows: statusRows });

            const globalStats = await getGlobalStats();

            expect(globalStats).toEqual({
                pending: 10,
                fetching: 0,
                done: 5,
                failed: 0,
            });
            expect(mockedQuery).toHaveBeenCalledWith(expect.stringContaining("SELECT status, COUNT(*)"));
        });
    });

    describe("refreshDomainStats", () => {
        it("should run CREATE TABLE and INSERT INTO domain_stats query", async () => {
            mockedQuery.mockResolvedValue({ rows: [] });

            await refreshDomainStats();

            // Two statements in order: ensure the table exists, then fill it.
            expect(mockedQuery).toHaveBeenCalledTimes(2);
            expect(mockedQuery).toHaveBeenNthCalledWith(1, expect.stringContaining("CREATE TABLE IF NOT EXISTS domain_stats"));
            expect(mockedQuery).toHaveBeenNthCalledWith(2, expect.stringContaining("INSERT INTO domain_stats"));
        });
    });

    describe("getDomainStats", () => {
        it("should fetch and format domain stats", async () => {
            const lastCrawled = new Date();
            // Counts are supplied as strings and the timestamp as an ISO
            // string; numbers and a Date are expected back.
            const statsRow = {
                domain: "react.dev",
                pending_count: "5",
                fetching_count: "1",
                done_count: "10",
                failed_count: "2",
                last_crawled_at: lastCrawled.toISOString(),
            };
            mockedQuery.mockResolvedValue({ rows: [statsRow] });

            const domainStats = await getDomainStats();

            expect(domainStats).toEqual([
                {
                    domain: "react.dev",
                    pending_count: 5,
                    fetching_count: 1,
                    done_count: 10,
                    failed_count: 2,
                    last_crawled_at: lastCrawled,
                },
            ]);
            expect(mockedQuery).toHaveBeenCalledWith(expect.stringContaining("SELECT domain, pending_count"));
        });
    });
});
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
import { describe, it, expect, vi, beforeEach } from "vitest";

// Stub undici's `request` so robots.txt is never fetched over the network.
vi.mock("undici", () => ({
    request: vi.fn(),
}));

// Fixed configuration for this suite.
vi.mock("../config.js", () => ({
    config: {
        REQUEST_TIMEOUT_MS: 1000,
    },
}));

import { request } from "undici";
import { isAllowedByRobots } from "../frontier/robots.js";

const mockedRequest = vi.mocked(request);

// robots.txt body that blocks /private/ for every user agent. Kept flush-left
// so the served text has no extra indentation.
const ROBOTS_TXT_BLOCKING_PRIVATE = `
User-agent: *
Disallow: /private/
`;

/**
 * Builds the response object shape these tests hand to the mocked `request`.
 *
 * @param {number} statusCode - HTTP status code to report.
 * @param {string} text - Value that `body.text()` resolves to.
 * @returns {{ statusCode: number, body: { text: () => Promise<string> } }}
 */
function fakeRobotsResponse(statusCode, text) {
    return {
        statusCode,
        body: {
            text: async () => text,
        },
    };
}

// NOTE(review): both tests query the react.dev host. If isAllowedByRobots
// caches robots.txt per host, the 404 case may be answered from the rules
// fetched in the first test rather than from its own mock. Confirm against
// robots.js.
describe("robots.txt compliance", () => {
    beforeEach(() => {
        // resetAllMocks rather than clearAllMocks: clearing only wipes call
        // history, so the robots.txt response installed by one test would
        // still be served in the next one. Resetting drops it as well.
        vi.resetAllMocks();
    });

    it("should allow URLs if robots.txt allows it", async () => {
        mockedRequest.mockResolvedValue(fakeRobotsResponse(200, ROBOTS_TXT_BLOCKING_PRIVATE));

        // A path outside /private/ is permitted.
        const allowed = await isAllowedByRobots("https://react.dev/docs");
        expect(allowed).toBe(true);

        // A path under the disallowed prefix is rejected.
        const disallowed = await isAllowedByRobots("https://react.dev/private/secret");
        expect(disallowed).toBe(false);
    });

    it("should allow URLs if robots.txt returns 404", async () => {
        mockedRequest.mockResolvedValue(fakeRobotsResponse(404, "Not Found"));

        const allowed = await isAllowedByRobots("https://react.dev/docs");

        expect(allowed).toBe(true);
    });
});
|