messi-crawler 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +201 -0
- package/dist/cli/renderer.js +71 -0
- package/dist/config.js +18 -0
- package/dist/db/clear.js +16 -0
- package/dist/db/client.js +20 -0
- package/dist/db/queries.js +179 -0
- package/dist/frontier/frontier.js +44 -0
- package/dist/frontier/logger.js +65 -0
- package/dist/frontier/robots.js +46 -0
- package/dist/frontier/scheduler.js +98 -0
- package/dist/index.js +533 -0
- package/dist/normalizer.js +33 -0
- package/dist/output/db-strategy.js +16 -0
- package/dist/output/index.js +23 -0
- package/dist/output/pdf-strategy.js +316 -0
- package/dist/output/strategy.js +1 -0
- package/dist/security/ssrf.js +45 -0
- package/dist/security/validate-url.js +41 -0
- package/dist/seed.js +14 -0
- package/dist/setup.js +148 -0
- package/dist/test/client.test.js +33 -0
- package/dist/test/downloader.test.js +84 -0
- package/dist/test/extractor.test.js +126 -0
- package/dist/test/frontier.test.js +43 -0
- package/dist/test/logger.test.js +55 -0
- package/dist/test/normalizer.test.js +36 -0
- package/dist/test/pdf-strategy.test.js +68 -0
- package/dist/test/queries.test.js +173 -0
- package/dist/test/robots.test.js +46 -0
- package/dist/test/scheduler.test.js +73 -0
- package/dist/test/seed.test.js +26 -0
- package/dist/test/worker.test.js +118 -0
- package/dist/worker/downloader.js +114 -0
- package/dist/worker/extractor.js +197 -0
- package/dist/worker/worker.js +87 -0
- package/package.json +48 -0
- package/seeds.txt +4 -0
- package/src/cli/renderer.ts +83 -0
- package/src/config.ts +22 -0
- package/src/db/clear.ts +16 -0
- package/src/db/client.ts +26 -0
- package/src/db/queries.ts +255 -0
- package/src/db/schema.sql +43 -0
- package/src/frontier/frontier.ts +60 -0
- package/src/frontier/logger.ts +75 -0
- package/src/frontier/robots.ts +50 -0
- package/src/frontier/scheduler.ts +119 -0
- package/src/index.ts +596 -0
- package/src/normalizer.ts +37 -0
- package/src/output/db-strategy.ts +20 -0
- package/src/output/index.ts +32 -0
- package/src/output/pdf-strategy.ts +388 -0
- package/src/output/strategy.ts +16 -0
- package/src/security/ssrf.ts +48 -0
- package/src/security/validate-url.ts +49 -0
- package/src/seed.ts +18 -0
- package/src/setup.ts +170 -0
- package/src/test/client.test.ts +38 -0
- package/src/test/downloader.test.ts +101 -0
- package/src/test/extractor.test.ts +139 -0
- package/src/test/frontier.test.ts +53 -0
- package/src/test/logger.test.ts +71 -0
- package/src/test/normalizer.test.ts +43 -0
- package/src/test/pdf-strategy.test.ts +84 -0
- package/src/test/queries.test.ts +247 -0
- package/src/test/robots.test.ts +56 -0
- package/src/test/scheduler.test.ts +90 -0
- package/src/test/seed.test.ts +35 -0
- package/src/test/worker.test.ts +144 -0
- package/src/worker/downloader.ts +149 -0
- package/src/worker/extractor.ts +235 -0
- package/src/worker/worker.ts +100 -0
- package/tsconfig.json +15 -0
|
@@ -0,0 +1,247 @@
|
|
|
1
|
+
import { describe, it, expect, vi, beforeEach } from "vitest";
|
|
2
|
+
|
|
3
|
+
// Mock the client module directly
|
|
4
|
+
// Swap the database client for stubs: `query` and `pool.connect` are bare
// mocks, and each test programs whatever result it needs on them.
vi.mock("../db/client.js", () => ({
  query: vi.fn(),
  pool: { connect: vi.fn() },
}));
|
|
12
|
+
|
|
13
|
+
// Import the mocked query and pool
|
|
14
|
+
import { query, pool } from "../db/client.js";
|
|
15
|
+
import {
|
|
16
|
+
claimNextURL,
|
|
17
|
+
markDone,
|
|
18
|
+
markFailed,
|
|
19
|
+
insertURL,
|
|
20
|
+
insertLink,
|
|
21
|
+
resetStaleLocks,
|
|
22
|
+
getGlobalStats,
|
|
23
|
+
refreshDomainStats,
|
|
24
|
+
getDomainStats,
|
|
25
|
+
} from "../db/queries.js";
|
|
26
|
+
|
|
27
|
+
// Typed handles on the mocked `query` and `pool` exports of ../db/client.js,
// so the tests below can program results and inspect calls.
const mockedQuery = vi.mocked(query);
|
|
28
|
+
const mockedPool = vi.mocked(pool);
|
|
29
|
+
|
|
30
|
+
describe("Database Queries", () => {
|
|
31
|
+
// Reset shared mock state before every test. `query` defaults to resolving
// with an empty result set; `pool.connect` is left unprogrammed until a test
// supplies a client.
beforeEach(() => {
  vi.clearAllMocks();
  mockedPool.connect.mockReset();
  mockedQuery.mockReset();
  mockedQuery.mockResolvedValue({ rows: [] } as any);
});
|
|
36
|
+
|
|
37
|
+
// These tests expect claimNextURL(domain) to issue one `UPDATE urls` statement
// parameterised by the domain, and to return the first row of the result, or
// null when the result set is empty.
describe("claimNextURL", () => {
  it("should return the row if a PENDING URL is found", async () => {
    const claimedRow = { id: 1, url: "https://react.dev", domain: "react.dev", status: "FETCHING", depth: 0 };
    mockedQuery.mockResolvedValue({ rows: [claimedRow] } as any);

    const claimed = await claimNextURL("react.dev");

    expect(mockedQuery).toHaveBeenCalledTimes(1);
    expect(mockedQuery).toHaveBeenCalledWith(expect.stringContaining("UPDATE urls"), ["react.dev"]);
    expect(claimed).toEqual(claimedRow);
  });

  it("should return null if no PENDING URL is found", async () => {
    mockedQuery.mockResolvedValue({ rows: [] } as any);

    await expect(claimNextURL("react.dev")).resolves.toBeNull();
  });
});
|
|
57
|
+
|
|
58
|
+
// These tests expect markDone(id, content) to check a client out of the pool,
// wrap its writes in BEGIN/COMMIT, roll back and rethrow when a statement
// fails, and release the client in both outcomes.
describe("markDone", () => {
  // A fresh payload per test so no case can observe another's mutations.
  const buildContent = () => ({
    title: "React",
    description: "Library",
    canonicalUrl: "https://react.dev",
    headings: { h1: ["H1"], h2: [], h3: [] },
    textContent: "Body content",
  });

  it("should execute queries in a transaction", async () => {
    const mockClient = {
      query: vi.fn().mockResolvedValue({}),
      release: vi.fn(),
    };
    mockedPool.connect.mockResolvedValue(mockClient as any);
    const content = buildContent();

    await markDone(1, content);

    expect(mockedPool.connect).toHaveBeenCalledTimes(1);
    expect(mockClient.query).toHaveBeenCalledWith("BEGIN");
    expect(mockClient.query).toHaveBeenCalledWith(
      expect.stringContaining("INSERT INTO crawled_pages"),
      [1, "React", "Library", "https://react.dev", JSON.stringify(content.headings), "Body content"]
    );
    expect(mockClient.query).toHaveBeenCalledWith(
      expect.stringContaining("UPDATE urls"),
      [1]
    );
    expect(mockClient.query).toHaveBeenCalledWith("COMMIT");
    // A successful write must not also be rolled back.
    expect(mockClient.query).not.toHaveBeenCalledWith("ROLLBACK");
    expect(mockClient.release).toHaveBeenCalledTimes(1);
  });

  it("should rollback transaction on error", async () => {
    const mockClient = {
      // Fail only the page insert; every other statement succeeds.
      query: vi.fn().mockImplementation((sql: string) => {
        if (sql.includes("INSERT INTO crawled_pages")) {
          throw new Error("DB Error");
        }
        return Promise.resolve({});
      }),
      release: vi.fn(),
    };
    mockedPool.connect.mockResolvedValue(mockClient as any);

    await expect(markDone(1, buildContent())).rejects.toThrow("DB Error");

    expect(mockClient.query).toHaveBeenCalledWith("ROLLBACK");
    // A failed transaction must never be committed.
    expect(mockClient.query).not.toHaveBeenCalledWith("COMMIT");
    expect(mockClient.release).toHaveBeenCalledTimes(1);
  });
});
|
|
116
|
+
|
|
117
|
+
// This test expects markFailed(id, message) to issue exactly one `UPDATE urls`
// statement carrying the id and the error message as parameters.
describe("markFailed", () => {
  it("should update status to FAILED with error message", async () => {
    mockedQuery.mockResolvedValue({ rows: [] } as any);

    await markFailed(1, "Connection timeout");

    const expectedParams = [1, "Connection timeout"];
    expect(mockedQuery).toHaveBeenCalledTimes(1);
    expect(mockedQuery).toHaveBeenCalledWith(expect.stringContaining("UPDATE urls"), expectedParams);
  });
});
|
|
129
|
+
|
|
130
|
+
// This test expects insertURL(url, domain, depth) to run one statement
// containing `WITH ins AS` and to return the `id` of the first returned row.
describe("insertURL", () => {
  it("should return the ID of the URL", async () => {
    mockedQuery.mockResolvedValue({ rows: [{ id: 42 }] } as any);

    const insertedId = await insertURL("https://react.dev", "react.dev", 1);

    expect(mockedQuery).toHaveBeenCalledTimes(1);
    expect(mockedQuery).toHaveBeenCalledWith(
      expect.stringContaining("WITH ins AS"),
      ["https://react.dev", "react.dev", 1]
    );
    expect(insertedId).toBe(42);
  });
});
|
|
144
|
+
|
|
145
|
+
// This test expects insertLink(a, b) to run a single `INSERT INTO links`
// statement with the two ids as parameters, in the order given.
describe("insertLink", () => {
  it("should insert edge into links table", async () => {
    mockedQuery.mockResolvedValue({ rows: [] } as any);

    await insertLink(1, 2);

    const edgeParams = [1, 2];
    expect(mockedQuery).toHaveBeenCalledTimes(1);
    expect(mockedQuery).toHaveBeenCalledWith(expect.stringContaining("INSERT INTO links"), edgeParams);
  });
});
|
|
157
|
+
|
|
158
|
+
// This test expects resetStaleLocks() to run one parameterless statement whose
// SQL mentions the `urls` update and both the PENDING and FETCHING statuses.
describe("resetStaleLocks", () => {
  it("should reset FETCHING urls back to PENDING", async () => {
    mockedQuery.mockResolvedValue({ rows: [] } as any);

    await resetStaleLocks();

    expect(mockedQuery).toHaveBeenCalledTimes(1);
    const requiredFragments = ["UPDATE urls", "status = 'PENDING'", "status = 'FETCHING'"];
    for (const fragment of requiredFragments) {
      expect(mockedQuery).toHaveBeenCalledWith(expect.stringContaining(fragment));
    }
  });
});
|
|
175
|
+
|
|
176
|
+
// This test expects getGlobalStats() to turn per-status rows with string
// counts into a numeric { pending, fetching, done, failed } object, using 0
// for statuses missing from the result.
describe("getGlobalStats", () => {
  it("should map query results to GlobalStats structure", async () => {
    const statusRows = [
      { status: "PENDING", count: "10" },
      { status: "DONE", count: "5" },
    ];
    mockedQuery.mockResolvedValue({ rows: statusRows } as any);

    const stats = await getGlobalStats();

    expect(stats).toEqual({ pending: 10, fetching: 0, done: 5, failed: 0 });
    expect(mockedQuery).toHaveBeenCalledWith(expect.stringContaining("SELECT status, COUNT(*)"));
  });
});
|
|
197
|
+
|
|
198
|
+
// This test expects refreshDomainStats() to run exactly two statements in
// order: first the `CREATE TABLE IF NOT EXISTS domain_stats`, then the
// `INSERT INTO domain_stats`.
describe("refreshDomainStats", () => {
  it("should run CREATE TABLE and INSERT INTO domain_stats query", async () => {
    mockedQuery.mockResolvedValue({ rows: [] } as any);

    await refreshDomainStats();

    const statementsInOrder = ["CREATE TABLE IF NOT EXISTS domain_stats", "INSERT INTO domain_stats"];
    expect(mockedQuery).toHaveBeenCalledTimes(statementsInOrder.length);
    statementsInOrder.forEach((fragment, index) => {
      expect(mockedQuery).toHaveBeenNthCalledWith(index + 1, expect.stringContaining(fragment));
    });
  });
});
|
|
214
|
+
|
|
215
|
+
// This test expects getDomainStats() to return one entry per row, with the
// string counters converted to numbers and the ISO timestamp converted to a
// Date.
describe("getDomainStats", () => {
  it("should fetch and format domain stats", async () => {
    const lastCrawled = new Date();
    const rawRow = {
      domain: "react.dev",
      pending_count: "5",
      fetching_count: "1",
      done_count: "10",
      failed_count: "2",
      last_crawled_at: lastCrawled.toISOString(),
    };
    mockedQuery.mockResolvedValue({ rows: [rawRow] } as any);

    const stats = await getDomainStats();

    expect(mockedQuery).toHaveBeenCalledWith(expect.stringContaining("SELECT domain, pending_count"));
    expect(stats).toEqual([
      {
        domain: "react.dev",
        pending_count: 5,
        fetching_count: 1,
        done_count: 10,
        failed_count: 2,
        last_crawled_at: lastCrawled,
      },
    ]);
  });
});
|
|
247
|
+
});
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
import { describe, it, expect, vi, beforeEach } from "vitest";
|
|
2
|
+
|
|
3
|
+
// Stub the HTTP client so no test performs a real robots.txt fetch.
vi.mock("undici", () => ({
  request: vi.fn(),
}));

// Provide only the config value this suite supplies: the request timeout.
vi.mock("../config.js", () => ({
  config: { REQUEST_TIMEOUT_MS: 1000 },
}));
|
|
16
|
+
|
|
17
|
+
import { request } from "undici";
|
|
18
|
+
import { isAllowedByRobots } from "../frontier/robots.js";
|
|
19
|
+
|
|
20
|
+
// Typed handle on the mocked undici `request`.
const mockedRequest = vi.mocked(request);

// Exercises isAllowedByRobots(url) against canned robots.txt responses.
// NOTE(review): both cases use the react.dev host. If robots.ts caches rules
// per host, the 404 case may be answered from the first case's cache rather
// than from the 404 response — confirm against the implementation.
describe("robots.txt compliance", () => {
  beforeEach(() => {
    vi.clearAllMocks();
  });

  it("should allow URLs if robots.txt allows it", async () => {
    const robotsResponse = {
      statusCode: 200,
      body: {
        text: async () => `
User-agent: *
Disallow: /private/
`,
      },
    };
    mockedRequest.mockResolvedValue(robotsResponse as any);

    // A path outside /private/ is allowed...
    expect(await isAllowedByRobots("https://react.dev/docs")).toBe(true);

    // ...while a path under the disallowed prefix is rejected.
    expect(await isAllowedByRobots("https://react.dev/private/secret")).toBe(false);
  });

  it("should allow URLs if robots.txt returns 404", async () => {
    const notFoundResponse = {
      statusCode: 404,
      body: { text: async () => "Not Found" },
    };
    mockedRequest.mockResolvedValue(notFoundResponse as any);

    expect(await isAllowedByRobots("https://react.dev/docs")).toBe(true);
  });
});
|
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
import { describe, it, expect, vi, beforeEach, afterEach } from "vitest";
|
|
2
|
+
|
|
3
|
+
// Mock dependencies
|
|
4
|
+
// Scheduler settings supplied for this suite: two workers, a 1000 ms
// crawl delay and a 100-page cap.
vi.mock("../config.js", () => ({
  config: {
    WORKER_COUNT: 2,
    CRAWL_DELAY_MS: 1000,
    MAX_PAGES: 100,
  },
}));

// Database layer: claims are programmed per test; global stats always report zeros.
vi.mock("../db/queries.js", () => ({
  claimNextURL: vi.fn(),
  getGlobalStats: vi.fn().mockResolvedValue({ pending: 0, fetching: 0, done: 0, failed: 0 }),
}));

// Frontier: the list of pending domains is programmed per test.
vi.mock("../frontier/frontier.js", () => ({
  getPendingDomains: vi.fn(),
}));

// Worker: page processing resolves with undefined without doing any work.
vi.mock("../worker/worker.js", () => ({
  processPage: vi.fn().mockResolvedValue(undefined),
}));
|
|
32
|
+
|
|
33
|
+
import { claimNextURL } from "../db/queries.js";
|
|
34
|
+
import { getPendingDomains } from "../frontier/frontier.js";
|
|
35
|
+
import { startScheduler, stopScheduler, getCooldown } from "../frontier/scheduler.js";
|
|
36
|
+
|
|
37
|
+
// Typed handles on the mocked query and frontier functions.
const mockClaimNextURL = vi.mocked(claimNextURL);
const mockGetPendingDomains = vi.mocked(getPendingDomains);

// Drives the scheduler loop under fake timers and checks that each domain is
// claimed once, in order, and then sits on cooldown.
describe("Scheduler", () => {
  beforeEach(() => {
    vi.clearAllMocks();
    vi.useFakeTimers();
  });

  afterEach(() => {
    vi.useRealTimers();
    stopScheduler();
  });

  it("should respect politeness delay (cooldowns) and round-robin domains", async () => {
    // Two domains have pending work.
    mockGetPendingDomains.mockResolvedValue(["react.dev", "typescriptlang.org"]);

    // One claimable URL per domain, handed out in call order.
    mockClaimNextURL.mockResolvedValueOnce({ id: 1, url: "https://react.dev", domain: "react.dev", status: "FETCHING", depth: 0 });
    mockClaimNextURL.mockResolvedValueOnce({ id: 2, url: "https://typescriptlang.org", domain: "typescriptlang.org", status: "FETCHING", depth: 0 });

    const running = startScheduler();

    // Let the first loop iteration run.
    await vi.advanceTimersByTimeAsync(0);

    // react.dev is claimed first and is now on cooldown.
    expect(mockClaimNextURL).toHaveBeenNthCalledWith(1, "react.dev");
    expect(getCooldown("react.dev")).toBeGreaterThan(0);

    // Advance 100 ms, well short of the 1000 ms crawl delay.
    await vi.advanceTimersByTimeAsync(100);

    // The next domain in rotation has no cooldown yet, so it is claimed.
    expect(mockClaimNextURL).toHaveBeenNthCalledWith(2, "typescriptlang.org");
    expect(getCooldown("typescriptlang.org")).toBeGreaterThan(0);

    // After another 100 ms both domains are still cooling down: no third claim.
    await vi.advanceTimersByTimeAsync(100);
    expect(mockClaimNextURL).toHaveBeenCalledTimes(2);

    // Ask the loop to stop and give it time to observe the request.
    stopScheduler();
    await vi.advanceTimersByTimeAsync(100);
    await running;
  });
});
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
import { describe, it, expect, vi, beforeEach } from "vitest";
|
|
2
|
+
|
|
3
|
+
// Database client stub: every query resolves with an empty result set.
vi.mock("../db/client.js", () => ({
  query: vi.fn().mockResolvedValue({ rows: [] }),
}));

// Seed list with one well-formed URL and one malformed entry.
vi.mock("../config.js", () => ({
  config: {
    SEED_URLS: ["https://react.dev", "not-a-url"],
  },
}));
|
|
16
|
+
|
|
17
|
+
import { seedDatabase } from "../seed.js";
|
|
18
|
+
import { query } from "../db/client.js";
|
|
19
|
+
|
|
20
|
+
// Checks that seedDatabase() inserts only the well-formed seed URL from the
// mocked config and issues no query for the malformed one.
describe("Seeding Logic", () => {
  beforeEach(() => {
    vi.clearAllMocks();
  });

  it("should insert valid seed URLs and skip invalid ones", async () => {
    await seedDatabase();

    // Exactly one insert, for "https://react.dev".
    const expectedParams = ["https://react.dev", "react.dev"];
    expect(query).toHaveBeenCalledTimes(1);
    expect(query).toHaveBeenCalledWith(expect.stringContaining("INSERT INTO urls"), expectedParams);
  });
});
|
|
@@ -0,0 +1,144 @@
|
|
|
1
|
+
import { describe, it, expect, vi, beforeEach } from "vitest";
|
|
2
|
+
|
|
3
|
+
// Mock downloader
|
|
4
|
+
// Downloader stub: each test decides whether the fetch resolves or rejects.
vi.mock("../worker/downloader.js", () => ({
  downloadPage: vi.fn(),
}));

// Extractor stub: each test supplies the parsed page it wants.
vi.mock("../worker/extractor.js", () => ({
  extractPageData: vi.fn(),
}));

// Database query stubs, inspected by the assertions.
vi.mock("../db/queries.js", () => ({
  insertURL: vi.fn(),
  insertLink: vi.fn(),
  markDone: vi.fn(),
  markFailed: vi.fn(),
}));

// Crawl limits supplied for this suite: one allowed domain and a maximum depth of 2.
vi.mock("../config.js", () => ({
  config: {
    ALLOWED_DOMAINS: ["react.dev"],
    MAX_DEPTH: 2,
  },
}));

// robots.txt check stub; the suite's beforeEach sets its default answer.
vi.mock("../frontier/robots.js", () => ({
  isAllowedByRobots: vi.fn(),
}));
|
|
41
|
+
|
|
42
|
+
import { downloadPage } from "../worker/downloader.js";
|
|
43
|
+
import { extractPageData } from "../worker/extractor.js";
|
|
44
|
+
import { insertURL, insertLink, markDone, markFailed } from "../db/queries.js";
|
|
45
|
+
import { isAllowedByRobots } from "../frontier/robots.js";
|
|
46
|
+
import { processPage } from "../worker/worker.js";
|
|
47
|
+
|
|
48
|
+
// Typed handles on every mocked collaborator of processPage.
const mockDownloadPage = vi.mocked(downloadPage);
const mockExtractPageData = vi.mocked(extractPageData);
const mockInsertURL = vi.mocked(insertURL);
const mockInsertLink = vi.mocked(insertLink);
const mockMarkDone = vi.mocked(markDone);
const mockMarkFailed = vi.mocked(markFailed);
const mockIsAllowedByRobots = vi.mocked(isAllowedByRobots);

// End-to-end checks of processPage({ id, url, depth }) with every collaborator
// mocked: the happy path, download failure, the depth limit and robots.txt
// denial.
describe("Worker Pipeline", () => {
  // Successful download of the docs page, shared by the cases that get past the fetch.
  const docsDownload = () => ({
    url: "https://react.dev/docs",
    html: "<html>...</html>",
    statusCode: 200,
  });

  // Page fields the tests expect to be forwarded to markDone (everything but `links`).
  const docsContent = () => ({
    title: "React Docs",
    description: "Learn React",
    canonicalUrl: "https://react.dev/docs",
    headings: { h1: ["Docs"], h2: [], h3: [] },
    textContent: "Learn React content",
  });

  beforeEach(() => {
    vi.clearAllMocks();
    // robots.txt allows everything unless a test says otherwise.
    mockIsAllowedByRobots.mockResolvedValue(true);
  });

  it("should successfully process a page, extract content, and insert links", async () => {
    mockDownloadPage.mockResolvedValue(docsDownload());
    mockExtractPageData.mockReturnValue({
      ...docsContent(),
      links: ["/tutorial", "https://external.com", "https://react.dev/docs"],
    });
    mockInsertURL.mockResolvedValue(100);

    await processPage({ id: 42, url: "https://react.dev/docs", depth: 1 });

    expect(mockMarkDone).toHaveBeenCalledTimes(1);
    expect(mockMarkDone).toHaveBeenCalledWith(42, docsContent());

    // Of the three extracted links, only /tutorial is expected to be queued, one level deeper.
    expect(mockInsertURL).toHaveBeenCalledTimes(1);
    expect(mockInsertURL).toHaveBeenCalledWith("https://react.dev/tutorial", "react.dev", 2);

    // The edge goes from the crawled page (42) to the id returned by insertURL (100).
    expect(mockInsertLink).toHaveBeenCalledTimes(1);
    expect(mockInsertLink).toHaveBeenCalledWith(42, 100);

    expect(mockMarkFailed).not.toHaveBeenCalled();
  });

  it("should mark URL as FAILED if download fails", async () => {
    mockDownloadPage.mockRejectedValue(new Error("Network Error"));

    const attempt = processPage({ id: 42, url: "https://react.dev/docs", depth: 1 });
    await expect(attempt).rejects.toThrow("Network Error");

    expect(mockMarkDone).not.toHaveBeenCalled();
    expect(mockMarkFailed).toHaveBeenCalledTimes(1);
    expect(mockMarkFailed).toHaveBeenCalledWith(42, "Network Error");
  });

  it("should discard links that exceed MAX_DEPTH", async () => {
    mockDownloadPage.mockResolvedValue(docsDownload());
    mockExtractPageData.mockReturnValue({
      ...docsContent(),
      links: ["/tutorial"],
    });

    // Current depth is 2, so discovered links would sit at depth 3, above MAX_DEPTH = 2.
    await processPage({ id: 42, url: "https://react.dev/docs", depth: 2 });

    expect(mockMarkDone).toHaveBeenCalledTimes(1);
    expect(mockInsertLink).not.toHaveBeenCalled();
    expect(mockInsertURL).not.toHaveBeenCalled();
  });

  it("should abort crawl if URL is disallowed by robots.txt", async () => {
    mockIsAllowedByRobots.mockResolvedValue(false);

    await processPage({ id: 42, url: "https://react.dev/private", depth: 1 });

    // The page is never fetched; it is recorded as failed with the robots reason.
    expect(mockDownloadPage).not.toHaveBeenCalled();
    expect(mockMarkDone).not.toHaveBeenCalled();
    expect(mockMarkFailed).toHaveBeenCalledTimes(1);
    expect(mockMarkFailed).toHaveBeenCalledWith(42, "Disallowed by robots.txt");
  });
});
|