messi-crawler 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +201 -0
- package/dist/cli/renderer.js +71 -0
- package/dist/config.js +18 -0
- package/dist/db/clear.js +16 -0
- package/dist/db/client.js +20 -0
- package/dist/db/queries.js +179 -0
- package/dist/frontier/frontier.js +44 -0
- package/dist/frontier/logger.js +65 -0
- package/dist/frontier/robots.js +46 -0
- package/dist/frontier/scheduler.js +98 -0
- package/dist/index.js +533 -0
- package/dist/normalizer.js +33 -0
- package/dist/output/db-strategy.js +16 -0
- package/dist/output/index.js +23 -0
- package/dist/output/pdf-strategy.js +316 -0
- package/dist/output/strategy.js +1 -0
- package/dist/security/ssrf.js +45 -0
- package/dist/security/validate-url.js +41 -0
- package/dist/seed.js +14 -0
- package/dist/setup.js +148 -0
- package/dist/test/client.test.js +33 -0
- package/dist/test/downloader.test.js +84 -0
- package/dist/test/extractor.test.js +126 -0
- package/dist/test/frontier.test.js +43 -0
- package/dist/test/logger.test.js +55 -0
- package/dist/test/normalizer.test.js +36 -0
- package/dist/test/pdf-strategy.test.js +68 -0
- package/dist/test/queries.test.js +173 -0
- package/dist/test/robots.test.js +46 -0
- package/dist/test/scheduler.test.js +73 -0
- package/dist/test/seed.test.js +26 -0
- package/dist/test/worker.test.js +118 -0
- package/dist/worker/downloader.js +114 -0
- package/dist/worker/extractor.js +197 -0
- package/dist/worker/worker.js +87 -0
- package/package.json +48 -0
- package/seeds.txt +4 -0
- package/src/cli/renderer.ts +83 -0
- package/src/config.ts +22 -0
- package/src/db/clear.ts +16 -0
- package/src/db/client.ts +26 -0
- package/src/db/queries.ts +255 -0
- package/src/db/schema.sql +43 -0
- package/src/frontier/frontier.ts +60 -0
- package/src/frontier/logger.ts +75 -0
- package/src/frontier/robots.ts +50 -0
- package/src/frontier/scheduler.ts +119 -0
- package/src/index.ts +596 -0
- package/src/normalizer.ts +37 -0
- package/src/output/db-strategy.ts +20 -0
- package/src/output/index.ts +32 -0
- package/src/output/pdf-strategy.ts +388 -0
- package/src/output/strategy.ts +16 -0
- package/src/security/ssrf.ts +48 -0
- package/src/security/validate-url.ts +49 -0
- package/src/seed.ts +18 -0
- package/src/setup.ts +170 -0
- package/src/test/client.test.ts +38 -0
- package/src/test/downloader.test.ts +101 -0
- package/src/test/extractor.test.ts +139 -0
- package/src/test/frontier.test.ts +53 -0
- package/src/test/logger.test.ts +71 -0
- package/src/test/normalizer.test.ts +43 -0
- package/src/test/pdf-strategy.test.ts +84 -0
- package/src/test/queries.test.ts +247 -0
- package/src/test/robots.test.ts +56 -0
- package/src/test/scheduler.test.ts +90 -0
- package/src/test/seed.test.ts +35 -0
- package/src/test/worker.test.ts +144 -0
- package/src/worker/downloader.ts +149 -0
- package/src/worker/extractor.ts +235 -0
- package/src/worker/worker.ts +100 -0
- package/tsconfig.json +15 -0
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
import { describe, it, expect, vi, beforeEach, afterEach } from "vitest";

// Collaborator doubles: a fixed config plus stubbed DB, frontier and worker modules.
vi.mock("../config.js", () => ({
  config: {
    WORKER_COUNT: 2,
    CRAWL_DELAY_MS: 1000,
    MAX_PAGES: 100,
  },
}));

vi.mock("../db/queries.js", () => ({
  claimNextURL: vi.fn(),
  getGlobalStats: vi.fn().mockResolvedValue({ pending: 0, fetching: 0, done: 0, failed: 0 }),
}));

vi.mock("../frontier/frontier.js", () => ({
  getPendingDomains: vi.fn(),
}));

vi.mock("../worker/worker.js", () => ({
  processPage: vi.fn().mockResolvedValue(undefined),
}));

import { claimNextURL } from "../db/queries.js";
import { getPendingDomains } from "../frontier/frontier.js";
import { startScheduler, stopScheduler, getCooldown } from "../frontier/scheduler.js";

const mockClaimNextURL = vi.mocked(claimNextURL);
const mockGetPendingDomains = vi.mocked(getPendingDomains);

describe("Scheduler", () => {
  beforeEach(() => {
    vi.clearAllMocks();
    vi.useFakeTimers();
  });

  afterEach(() => {
    vi.useRealTimers();
    stopScheduler();
  });

  it("should respect politeness delay (cooldowns) and round-robin domains", async () => {
    const firstDomain = "react.dev";
    const secondDomain = "typescriptlang.org";
    // Shape of the row handed back by the mocked claimNextURL.
    const claimedRow = (id, domain) => ({
      id,
      url: `https://${domain}`,
      domain,
      status: "FETCHING",
      depth: 0,
    });

    // Two domains are pending, and each yields exactly one claimable URL.
    mockGetPendingDomains.mockResolvedValue([firstDomain, secondDomain]);
    mockClaimNextURL
      .mockResolvedValueOnce(claimedRow(1, firstDomain))
      .mockResolvedValueOnce(claimedRow(2, secondDomain));

    const schedulerPromise = startScheduler();

    // Let the first loop iteration run: react.dev is claimed and put on cooldown.
    await vi.advanceTimersByTimeAsync(0);
    expect(mockClaimNextURL).toHaveBeenNthCalledWith(1, firstDomain);
    expect(getCooldown(firstDomain)).toBeGreaterThan(0);

    // 100ms is well below the 1000ms cooldown, so the round-robin moves on to
    // typescriptlang.org, which has no cooldown yet and therefore gets claimed.
    await vi.advanceTimersByTimeAsync(100);
    expect(mockClaimNextURL).toHaveBeenNthCalledWith(2, secondDomain);
    expect(getCooldown(secondDomain)).toBeGreaterThan(0);

    // Both domains are cooling down now: no third claim may happen.
    await vi.advanceTimersByTimeAsync(100);
    expect(mockClaimNextURL).toHaveBeenCalledTimes(2);

    // Stop the scheduler so its loop exits and the promise settles.
    stopScheduler();
    await vi.advanceTimersByTimeAsync(100);
    await schedulerPromise;
  });
});
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
import { describe, it, expect, vi, beforeEach } from "vitest";

// The DB client is replaced by a spy that always resolves to an empty result set.
vi.mock("../db/client.js", () => ({
  query: vi.fn().mockResolvedValue({ rows: [] }),
}));

// One well-formed seed and one malformed seed.
vi.mock("../config.js", () => ({
  config: {
    SEED_URLS: ["https://react.dev", "not-a-url"],
  },
}));

import { seedDatabase } from "../seed.js";
import { query } from "../db/client.js";

describe("Seeding Logic", () => {
  beforeEach(() => {
    vi.clearAllMocks();
  });

  it("should insert valid seed URLs and skip invalid ones", async () => {
    await seedDatabase();

    // Only "https://react.dev" should reach the database; "not-a-url" is skipped.
    const expectedParams = ["https://react.dev", "react.dev"];
    expect(query).toHaveBeenCalledTimes(1);
    expect(query).toHaveBeenCalledWith(expect.stringContaining("INSERT INTO urls"), expectedParams);
  });
});
|
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
import { describe, it, expect, vi, beforeEach } from "vitest";

// Stub the downloader, extractor, DB queries, config and robots modules.
vi.mock("../worker/downloader.js", () => ({
  downloadPage: vi.fn(),
}));

vi.mock("../worker/extractor.js", () => ({
  extractPageData: vi.fn(),
}));

vi.mock("../db/queries.js", () => ({
  insertURL: vi.fn(),
  insertLink: vi.fn(),
  markDone: vi.fn(),
  markFailed: vi.fn(),
}));

vi.mock("../config.js", () => ({
  config: {
    ALLOWED_DOMAINS: ["react.dev"],
    MAX_DEPTH: 2,
  },
}));

vi.mock("../frontier/robots.js", () => ({
  isAllowedByRobots: vi.fn(),
}));

import { downloadPage } from "../worker/downloader.js";
import { extractPageData } from "../worker/extractor.js";
import { insertURL, insertLink, markDone, markFailed } from "../db/queries.js";
import { isAllowedByRobots } from "../frontier/robots.js";
import { processPage } from "../worker/worker.js";

const mockDownloadPage = vi.mocked(downloadPage);
const mockExtractPageData = vi.mocked(extractPageData);
const mockInsertURL = vi.mocked(insertURL);
const mockInsertLink = vi.mocked(insertLink);
const mockMarkDone = vi.mocked(markDone);
const mockMarkFailed = vi.mocked(markFailed);
const mockIsAllowedByRobots = vi.mocked(isAllowedByRobots);

const DOCS_URL = "https://react.dev/docs";

/** Page content the mocked extractor reports (everything except the links). */
const docsContent = () => ({
  title: "React Docs",
  description: "Learn React",
  canonicalUrl: DOCS_URL,
  headings: { h1: ["Docs"], h2: [], h3: [] },
  textContent: "Learn React content",
});

/** Arranges a 200 download of the docs page whose extraction yields `links`. */
const stubSuccessfulFetch = (links) => {
  mockDownloadPage.mockResolvedValue({
    url: DOCS_URL,
    html: "<html>...</html>",
    statusCode: 200,
  });
  mockExtractPageData.mockReturnValue({ ...docsContent(), links });
};

describe("Worker Pipeline", () => {
  beforeEach(() => {
    vi.clearAllMocks();
    // robots.txt permits everything unless a test says otherwise.
    mockIsAllowedByRobots.mockResolvedValue(true);
  });

  it("should successfully process a page, extract content, and insert links", async () => {
    stubSuccessfulFetch(["/tutorial", "https://external.com", DOCS_URL]);
    mockInsertURL.mockResolvedValue(100);

    await processPage({ id: 42, url: DOCS_URL, depth: 1 });

    expect(mockMarkDone).toHaveBeenCalledTimes(1);
    expect(mockMarkDone).toHaveBeenCalledWith(42, docsContent());

    // Of the three links only /tutorial is expected to be enqueued.
    expect(mockInsertURL).toHaveBeenCalledTimes(1);
    expect(mockInsertURL).toHaveBeenCalledWith("https://react.dev/tutorial", "react.dev", 2);
    expect(mockInsertLink).toHaveBeenCalledTimes(1);
    expect(mockInsertLink).toHaveBeenCalledWith(42, 100);

    expect(mockMarkFailed).not.toHaveBeenCalled();
  });

  it("should mark URL as FAILED if download fails", async () => {
    mockDownloadPage.mockRejectedValue(new Error("Network Error"));

    const attempt = processPage({ id: 42, url: DOCS_URL, depth: 1 });
    await expect(attempt).rejects.toThrow("Network Error");

    expect(mockMarkFailed).toHaveBeenCalledTimes(1);
    expect(mockMarkFailed).toHaveBeenCalledWith(42, "Network Error");
    expect(mockMarkDone).not.toHaveBeenCalled();
  });

  it("should discard links that exceed MAX_DEPTH", async () => {
    stubSuccessfulFetch(["/tutorial"]);

    // Current depth is 2, so the next depth (3) exceeds MAX_DEPTH = 2.
    await processPage({ id: 42, url: DOCS_URL, depth: 2 });

    expect(mockMarkDone).toHaveBeenCalledTimes(1);
    expect(mockInsertURL).not.toHaveBeenCalled();
    expect(mockInsertLink).not.toHaveBeenCalled();
  });

  it("should abort crawl if URL is disallowed by robots.txt", async () => {
    mockIsAllowedByRobots.mockResolvedValue(false);

    await processPage({ id: 42, url: "https://react.dev/private", depth: 1 });

    expect(mockMarkFailed).toHaveBeenCalledTimes(1);
    expect(mockMarkFailed).toHaveBeenCalledWith(42, "Disallowed by robots.txt");
    expect(mockDownloadPage).not.toHaveBeenCalled();
    expect(mockMarkDone).not.toHaveBeenCalled();
  });
});
|
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
import { request } from "undici";
|
|
2
|
+
import { config } from "../config.js";
|
|
3
|
+
import { isBlockedAddress } from "../security/ssrf.js";
|
|
4
|
+
/**
 * Guards against SSRF by checking the URL's host before the request is made.
 *
 * The host is taken from the parsed URL and handed to `isBlockedAddress`; when
 * that resolves truthy the fetch is refused.
 *
 * @param {string} url - Absolute URL that is about to be requested.
 * @returns {Promise<void>} Resolves when the host is not reported as blocked.
 * @throws {TypeError} If `url` cannot be parsed as an absolute URL.
 * @throws {Error} "SSRF blocked: ..." when `isBlockedAddress` reports the host
 *   as blocked.
 */
async function assertNotBlocked(url) {
  const { hostname } = new URL(url);
  // WHATWG URL keeps the square brackets around IPv6 literals ("[::1]").
  // Strip them so the checker receives a bare address rather than a string that
  // is neither a valid IP literal nor a resolvable DNS name.
  // NOTE(review): presumably isBlockedAddress classifies bare IP literals — confirm.
  const host =
    hostname.startsWith("[") && hostname.endsWith("]") ? hostname.slice(1, -1) : hostname;
  if (await isBlockedAddress(host)) {
    throw new Error(`SSRF blocked: "${hostname}" resolves to a private or internal address`);
  }
}
|
|
14
|
+
/**
 * Fetches the HTML content of a page, following redirects up to MAX_REDIRECTS.
 * Tracks the final URL, enforces a request timeout, and blocks SSRF targets.
 *
 * @param {string} initialUrl - Absolute URL of the page to fetch.
 * @returns {Promise<{url: string, html: string, statusCode: number}>} The final
 *   (post-redirect) URL, the response body as text, and the status code (200).
 * @throws {Error} On an SSRF-blocked host, too many redirects, a non-200 final
 *   status, or a Content-Type that does not mention `text/html`.
 */
export async function downloadPage(initialUrl) {
  const { res, url, statusCode } = await fetchFollowingRedirects(initialUrl);
  const contentType = firstHeaderValue(res.headers["content-type"]);
  // A missing Content-Type is tolerated; media types are case-insensitive.
  if (contentType && !contentType.toLowerCase().includes("text/html")) {
    await res.body.text(); // consume body to release connection
    throw new Error(`Non-HTML content type: ${contentType}`);
  }
  const html = await res.body.text();
  return { url, html, statusCode };
}

/**
 * Downloads image assets securely.
 * Enforces the same SSRF/blocklist checks, redirection limits, and timeouts as pages.
 *
 * @param {string} initialUrl - Absolute URL of the image to fetch.
 * @returns {Promise<Buffer>} The raw image bytes.
 * @throws {Error} On an SSRF-blocked host, too many redirects, a non-200 final
 *   status, or a Content-Type that does not start with `image/`.
 */
export async function downloadImage(initialUrl) {
  const { res } = await fetchFollowingRedirects(initialUrl, " fetching image");
  const contentType = firstHeaderValue(res.headers["content-type"]);
  // A missing Content-Type is tolerated; media types are case-insensitive.
  if (contentType && !contentType.trim().toLowerCase().startsWith("image/")) {
    await res.body.text(); // consume body to release connection
    throw new Error(`Non-image content type: ${contentType}`);
  }
  return Buffer.from(await res.body.arrayBuffer());
}

/** Returns the first value of a header that may be a string, an array, or absent. */
function firstHeaderValue(value) {
  return Array.isArray(value) ? value[0] : value;
}

/**
 * Issues GET requests starting at `initialUrl`, manually following 3xx
 * redirects so that every hop can be SSRF-checked before it is contacted.
 *
 * @param {string} initialUrl - Absolute URL to start from.
 * @param {string} [errorSuffix=""] - Text appended to the redirect/status error
 *   messages (lets callers keep their distinct wording).
 * @returns {Promise<{res: object, url: string, statusCode: number}>} The 200
 *   response with its body still unread, plus the URL that produced it.
 * @throws {Error} When a hop is SSRF-blocked, the redirect limit is exceeded,
 *   or the final status is not 200.
 */
async function fetchFollowingRedirects(initialUrl, errorSuffix = "") {
  let currentUrl = initialUrl;
  // SSRF check on the initial URL before any network activity
  await assertNotBlocked(currentUrl);

  for (let redirectCount = 0; ; redirectCount++) {
    const res = await request(currentUrl, {
      method: "GET",
      headersTimeout: config.REQUEST_TIMEOUT_MS,
      bodyTimeout: config.REQUEST_TIMEOUT_MS,
    });
    const { statusCode } = res;
    const location = firstHeaderValue(res.headers.location);

    // Handle redirects (301, 302, 303, 307, 308)
    if (statusCode >= 300 && statusCode < 400 && location) {
      // Drain first: every exit below (limit reached, unparseable Location,
      // SSRF-blocked target) would otherwise leave the body unread and the
      // connection unreleased.
      await res.body.text();
      if (redirectCount >= config.MAX_REDIRECTS) {
        throw new Error(`Too many redirects${errorSuffix}`);
      }
      const nextUrl = new URL(location, currentUrl).href;
      // SSRF check on every redirect target before following
      await assertNotBlocked(nextUrl);
      currentUrl = nextUrl;
      continue;
    }

    // Error on non-200 responses
    if (statusCode !== 200) {
      await res.body.text();
      throw new Error(`HTTP status ${statusCode}${errorSuffix}`);
    }

    return { res, url: currentUrl, statusCode };
  }
}
|
|
@@ -0,0 +1,197 @@
|
|
|
1
|
+
import * as cheerio from "cheerio";
|
|
2
|
+
/** Tags dropped from the main content node and never walked for blocks. */
const NON_CONTENT_TAGS = ["script", "style", "noscript", "iframe", "nav", "footer", "header"];

/**
 * Extracts metadata, headings, structured text content blocks, images, and outgoing links from HTML.
 * Strips site chrome and uses a text-density heuristic if no main content container is found.
 *
 * @param {string} html - Raw HTML of the page.
 * @param {string} [baseUrl] - URL the page was fetched from; used to resolve
 *   image sources (and a relative canonical link) to absolute URLs.
 * @returns {{
 *   title: string|null,
 *   description: string|null,
 *   canonicalUrl: string|null,
 *   headings: {h1: string[], h2: string[], h3: string[]},
 *   textContent: string|null,
 *   links: string[],
 *   blocks: object[],
 *   images: {src: string, alt: string}[]
 * }} `links` holds the raw (unresolved) href values of every anchor in the
 *   document; `blocks` lists paragraph/heading/list/image blocks of the main
 *   content in document order; `canonicalUrl` is returned exactly as written
 *   in the page.
 */
export function extractPageData(html, baseUrl) {
  const $ = cheerio.load(html);

  const title = $("title").text().trim() || null;
  const description = $("meta[name=description]").attr("content")?.trim() || null;
  const canonicalUrl = $("link[rel=canonical]").attr("href")?.trim() || null;

  // Headings and links are collected from the whole document, not just the
  // main content node.
  const headings = {
    h1: collectTrimmedTexts($, "h1"),
    h2: collectTrimmedTexts($, "h2"),
    h3: collectTrimmedTexts($, "h3"),
  };
  const links = $("a[href]")
    .toArray()
    .map((el) => $(el).attr("href")?.trim())
    .filter(Boolean);

  // Determine resolution base URL for images
  const resolutionBase = pickResolutionBase(canonicalUrl, baseUrl);

  // Work on a clone so stripping chrome does not mutate the parsed document.
  const cleanedNode = selectMainNode($).clone();
  cleanedNode.find(NON_CONTENT_TAGS.join(", ")).remove();

  // Flat list of every image inside the cleaned main node.
  const images = cleanedNode
    .find("img")
    .toArray()
    .map((img) => readImage($, img, resolutionBase))
    .filter((image) => image !== null);

  const blocks = [];
  // When true, the next loose text node starts a new paragraph instead of
  // being appended to the previous paragraph block.
  let forceNewParagraph = true;

  const walk = (node) => {
    if (node.type === "text") {
      const text = collapseWhitespace(node.data);
      if (!text) {
        return;
      }
      const lastBlock = blocks[blocks.length - 1];
      if (!forceNewParagraph && lastBlock && lastBlock.type === "paragraph") {
        lastBlock.text = collapseWhitespace(`${lastBlock.text} ${text}`);
      } else {
        blocks.push({ type: "paragraph", text });
        forceNewParagraph = false;
      }
      return;
    }
    if (node.type !== "tag") {
      return;
    }

    const tagName = node.tagName?.toLowerCase();
    // Skip removed elements just in case
    if (NON_CONTENT_TAGS.includes(tagName)) {
      return;
    }

    const $el = $(node);
    if (/^h[1-6]$/.test(tagName)) {
      const text = collapseWhitespace($el.text());
      if (text) {
        blocks.push({ type: "heading", level: parseInt(tagName.substring(1), 10), text });
      }
      forceNewParagraph = true;
    } else if (tagName === "p") {
      const text = collapseWhitespace($el.text());
      if (text) {
        blocks.push({ type: "paragraph", text });
      }
      forceNewParagraph = true;
    } else if (tagName === "ul" || tagName === "ol") {
      // NOTE(review): find("li") also matches items of nested lists, whose
      // text is already part of their parent item's text.
      const items = $el
        .find("li")
        .toArray()
        .map((li) => collapseWhitespace($(li).text()))
        .filter(Boolean);
      if (items.length > 0) {
        blocks.push({ type: "list", items });
      }
      forceNewParagraph = true;
    } else if (tagName === "img") {
      const image = readImage($, node, resolutionBase);
      if (image !== null) {
        blocks.push({ type: "image", ...image });
      }
      forceNewParagraph = true;
    } else if (tagName === "br") {
      forceNewParagraph = true;
    } else {
      // For general container tags (div, span, etc.), walk contents recursively
      for (const child of $el.contents().toArray()) {
        walk(child);
      }
    }
  };

  for (const child of cleanedNode.contents().toArray()) {
    walk(child);
  }

  // Fallback textContent: concatenated paragraphs / headings / lists for
  // backwards compatibility
  const textParts = [];
  for (const block of blocks) {
    if ((block.type === "paragraph" || block.type === "heading") && block.text) {
      textParts.push(block.text);
    } else if (block.type === "list" && block.items) {
      textParts.push(block.items.join(" "));
    }
  }
  const textContent = collapseWhitespace(textParts.join(" ")) || null;

  return {
    title,
    description,
    canonicalUrl,
    headings,
    textContent,
    links,
    blocks,
    images,
  };
}

/** Collapses every whitespace run to a single space and trims the ends. */
function collapseWhitespace(text) {
  return text.replace(/\s+/g, " ").trim();
}

/** Returns the trimmed, non-empty text of every element matching `selector`. */
function collectTrimmedTexts($, selector) {
  return $(selector)
    .toArray()
    .map((el) => $(el).text().trim())
    .filter(Boolean);
}

/** Resolves `ref` against `base` (if given); returns null when it is not a valid URL. */
function tryResolveUrl(ref, base) {
  try {
    return new URL(ref, base).href;
  } catch {
    // Not resolvable with this base; the caller picks a fallback.
    return null;
  }
}

/**
 * Chooses the base URL for resolving image sources: the canonical URL when it
 * is absolute, otherwise the canonical URL resolved against the page URL,
 * otherwise the page URL itself (or null when none is usable).
 */
function pickResolutionBase(canonicalUrl, baseUrl) {
  if (canonicalUrl) {
    if (tryResolveUrl(canonicalUrl) !== null) {
      return canonicalUrl;
    }
    // A relative canonical link is useless as a base on its own; anchor it to
    // the page URL instead of letting every image resolution fail.
    const anchored = baseUrl ? tryResolveUrl(canonicalUrl, baseUrl) : null;
    if (anchored !== null) {
      return anchored;
    }
  }
  return baseUrl || null;
}

/**
 * Reads `src`/`alt` from an <img> element, resolving `src` against `base`.
 * Returns null when the element has no usable `src`; the original `src` is
 * kept when it cannot be resolved.
 */
function readImage($, img, base) {
  const $img = $(img);
  const src = $img.attr("src")?.trim();
  if (!src) {
    return null;
  }
  const alt = $img.attr("alt")?.trim() || "";
  const resolved = base ? tryResolveUrl(src, base) : null;
  return { src: resolved ?? src, alt };
}

/**
 * Picks the node treated as main content: the first <article>, <main> or
 * [role=main]; failing that, the <div>/<section> with the highest ratio of
 * text length to descendant count; failing that, <body>.
 */
function selectMainNode($) {
  for (const selector of ["article", "main", "[role=main]"]) {
    const candidate = $(selector).first();
    if (candidate.length > 0) {
      return candidate;
    }
  }

  // Fallback text-density heuristic
  const body = $("body");
  // Candidates must hold at least 10% of the body text, capped at 200 chars.
  const minTextLength = Math.min(200, body.text().trim().length * 0.1);
  let bestNode = body;
  let bestScore = -1;
  $("div, section").each((_, el) => {
    const $el = $(el);
    const textLength = $el.text().trim().length;
    if (textLength < minTextLength) {
      return;
    }
    const score = textLength / ($el.find("*").length + 1);
    if (score > bestScore) {
      bestScore = score;
      bestNode = $el;
    }
  });
  return bestNode;
}
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
import { downloadPage } from "./downloader.js";
|
|
2
|
+
import { extractPageData } from "./extractor.js";
|
|
3
|
+
import { normalizeURL, getDomain } from "../normalizer.js";
|
|
4
|
+
import { insertURL, insertLink, markFailed } from "../db/queries.js";
|
|
5
|
+
import { config } from "../config.js";
|
|
6
|
+
import { isAllowedByRobots } from "../frontier/robots.js";
|
|
7
|
+
import { getStrategy } from "../output/index.js";
|
|
8
|
+
/**
 * Reports whether links on the given domain may be enqueued.
 * A missing or empty `config.ALLOWED_DOMAINS` accepts every domain; otherwise
 * the domain must appear in that list exactly.
 */
function isDomainAllowed(domain) {
  const allowList = config.ALLOWED_DOMAINS;
  const hasRestrictions = Boolean(allowList) && allowList.length !== 0;
  return hasRestrictions ? allowList.includes(domain) : true;
}
|
|
14
|
+
/**
 * Handles the complete crawling workflow for a single URL:
 * 0. Checks robots.txt; a disallowed URL is marked failed and nothing is fetched.
 * 1. Downloads the page HTML (handling redirects & timeouts).
 * 2. Extracts title, description, canonical, headings, text content, and outgoing links.
 * 3. Delegates persistence to the active OutputStrategy (DB or PDF).
 * 4. Filters, normalizes, and enqueues discovered links, establishing link graph relations.
 *
 * @param {{id: number, url: string, depth: number}} urlRow - The claimed
 *   frontier row to process.
 * @returns {Promise<void>}
 * @throws Re-throws any error raised while processing, after recording its
 *   message via `markFailed`.
 */
export async function processPage(urlRow) {
  const { id: urlId, url: pageUrl, depth: currentDepth } = urlRow;
  try {
    // 0. Check robots.txt compliance
    if (!(await isAllowedByRobots(pageUrl))) {
      await markFailed(urlId, "Disallowed by robots.txt");
      return;
    }

    // 1. Download page content
    const downloadResult = await downloadPage(pageUrl);

    // 2. Extract content & outgoing links
    const extracted = extractPageData(downloadResult.html, downloadResult.url);

    // Resolve final URL using canonical link if present (and normalizable)
    let finalUrl = downloadResult.url;
    if (extracted.canonicalUrl) {
      finalUrl = normalizeURL(extracted.canonicalUrl, finalUrl) || finalUrl;
    }

    // 3. Persist content via the active output strategy (DB or PDF)
    await getStrategy().save(urlId, finalUrl, {
      title: extracted.title,
      description: extracted.description,
      canonicalUrl: extracted.canonicalUrl,
      headings: extracted.headings,
      textContent: extracted.textContent,
      blocks: extracted.blocks,
      images: extracted.images,
    });

    // 4. Process outgoing links
    // Every link found on this page sits at the same depth, so the MAX_DEPTH
    // check is decided once up front instead of per link after normalizing.
    const nextDepth = currentDepth + 1;
    if (nextDepth > config.MAX_DEPTH) {
      return;
    }

    // normalized URL -> its domain; the Map de-duplicates links and keeps the
    // domain computed during filtering so it is not derived a second time.
    const targets = new Map();
    for (const link of extracted.links) {
      const normalized = normalizeURL(link, finalUrl);
      if (!normalized || targets.has(normalized)) {
        continue;
      }
      // Skip self-referential links
      if (normalized === finalUrl || normalized === pageUrl) {
        continue;
      }
      const linkDomain = getDomain(normalized);
      if (!linkDomain || !isDomainAllowed(linkDomain)) {
        continue;
      }
      targets.set(normalized, linkDomain);
    }

    for (const [targetUrl, targetDomain] of targets) {
      // Insert target URL (ON CONFLICT DO NOTHING) and get its ID
      const targetUrlId = await insertURL(targetUrl, targetDomain, nextDepth);
      // Establish link graph relation
      await insertLink(urlId, targetUrlId);
    }
  } catch (error) {
    const errorMsg = error instanceof Error ? error.message : String(error);
    await markFailed(urlId, errorMsg);
    throw error;
  }
}
|
package/package.json
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "messi-crawler",
|
|
3
|
+
"version": "1.0.0",
|
|
4
|
+
"description": "A web crawler built with Node.js and TypeScript for collecting programming-related documentation and web content.",
|
|
5
|
+
"main": "dist/index.js",
|
|
6
|
+
"bin": {
|
|
7
|
+
"messi-crawler": "./dist/index.js"
|
|
8
|
+
},
|
|
9
|
+
"type": "module",
|
|
10
|
+
"scripts": {
|
|
11
|
+
"build": "tsc",
|
|
12
|
+
"start": "node dist/index.js",
|
|
13
|
+
"dev": "tsx src/index.ts",
|
|
14
|
+
"test": "vitest",
|
|
15
|
+
"config": "tsx src/setup.ts",
|
|
16
|
+
"crawl": "npm run build && node dist/index.js",
|
|
17
|
+
"db:clear": "tsx src/db/clear.ts"
|
|
18
|
+
},
|
|
19
|
+
"repository": {
|
|
20
|
+
"type": "git",
|
|
21
|
+
"url": "git+https://github.com/lightning4747/Web-crawler-cli.git"
|
|
22
|
+
},
|
|
23
|
+
"keywords": [],
|
|
24
|
+
"author": "",
|
|
25
|
+
"license": "ISC",
|
|
26
|
+
"bugs": {
|
|
27
|
+
"url": "https://github.com/lightning4747/Web-crawler-cli/issues"
|
|
28
|
+
},
|
|
29
|
+
"homepage": "https://github.com/lightning4747/Web-crawler-cli#readme",
|
|
30
|
+
"dependencies": {
|
|
31
|
+
"cheerio": "^1.2.0",
|
|
32
|
+
"dotenv": "^17.4.2",
|
|
33
|
+
"inquirer": "^14.0.2",
|
|
34
|
+
"pdfkit": "^0.19.1",
|
|
35
|
+
"pg": "^8.21.0",
|
|
36
|
+
"robots-parser": "^3.0.1",
|
|
37
|
+
"undici": "^8.3.0"
|
|
38
|
+
},
|
|
39
|
+
"devDependencies": {
|
|
40
|
+
"@types/inquirer": "^9.0.10",
|
|
41
|
+
"@types/node": "^25.9.1",
|
|
42
|
+
"@types/pdfkit": "^0.17.6",
|
|
43
|
+
"@types/pg": "^8.20.0",
|
|
44
|
+
"tsx": "^4.22.4",
|
|
45
|
+
"typescript": "^6.0.3",
|
|
46
|
+
"vitest": "^4.1.8"
|
|
47
|
+
}
|
|
48
|
+
}
|
package/seeds.txt
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
1
|
+
https://www.akc.org/dog-breeds/
|
|
2
|
+
https://www.royalkennelclub.org/breeds-a-z/
|
|
3
|
+
https://dogapi.dog/api/v2/breeds
|
|
4
|
+
https://dog.ceo/api/breeds/list/all
|