messi-crawler 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73) hide show
  1. package/README.md +201 -0
  2. package/dist/cli/renderer.js +71 -0
  3. package/dist/config.js +18 -0
  4. package/dist/db/clear.js +16 -0
  5. package/dist/db/client.js +20 -0
  6. package/dist/db/queries.js +179 -0
  7. package/dist/frontier/frontier.js +44 -0
  8. package/dist/frontier/logger.js +65 -0
  9. package/dist/frontier/robots.js +46 -0
  10. package/dist/frontier/scheduler.js +98 -0
  11. package/dist/index.js +533 -0
  12. package/dist/normalizer.js +33 -0
  13. package/dist/output/db-strategy.js +16 -0
  14. package/dist/output/index.js +23 -0
  15. package/dist/output/pdf-strategy.js +316 -0
  16. package/dist/output/strategy.js +1 -0
  17. package/dist/security/ssrf.js +45 -0
  18. package/dist/security/validate-url.js +41 -0
  19. package/dist/seed.js +14 -0
  20. package/dist/setup.js +148 -0
  21. package/dist/test/client.test.js +33 -0
  22. package/dist/test/downloader.test.js +84 -0
  23. package/dist/test/extractor.test.js +126 -0
  24. package/dist/test/frontier.test.js +43 -0
  25. package/dist/test/logger.test.js +55 -0
  26. package/dist/test/normalizer.test.js +36 -0
  27. package/dist/test/pdf-strategy.test.js +68 -0
  28. package/dist/test/queries.test.js +173 -0
  29. package/dist/test/robots.test.js +46 -0
  30. package/dist/test/scheduler.test.js +73 -0
  31. package/dist/test/seed.test.js +26 -0
  32. package/dist/test/worker.test.js +118 -0
  33. package/dist/worker/downloader.js +114 -0
  34. package/dist/worker/extractor.js +197 -0
  35. package/dist/worker/worker.js +87 -0
  36. package/package.json +48 -0
  37. package/seeds.txt +4 -0
  38. package/src/cli/renderer.ts +83 -0
  39. package/src/config.ts +22 -0
  40. package/src/db/clear.ts +16 -0
  41. package/src/db/client.ts +26 -0
  42. package/src/db/queries.ts +255 -0
  43. package/src/db/schema.sql +43 -0
  44. package/src/frontier/frontier.ts +60 -0
  45. package/src/frontier/logger.ts +75 -0
  46. package/src/frontier/robots.ts +50 -0
  47. package/src/frontier/scheduler.ts +119 -0
  48. package/src/index.ts +596 -0
  49. package/src/normalizer.ts +37 -0
  50. package/src/output/db-strategy.ts +20 -0
  51. package/src/output/index.ts +32 -0
  52. package/src/output/pdf-strategy.ts +388 -0
  53. package/src/output/strategy.ts +16 -0
  54. package/src/security/ssrf.ts +48 -0
  55. package/src/security/validate-url.ts +49 -0
  56. package/src/seed.ts +18 -0
  57. package/src/setup.ts +170 -0
  58. package/src/test/client.test.ts +38 -0
  59. package/src/test/downloader.test.ts +101 -0
  60. package/src/test/extractor.test.ts +139 -0
  61. package/src/test/frontier.test.ts +53 -0
  62. package/src/test/logger.test.ts +71 -0
  63. package/src/test/normalizer.test.ts +43 -0
  64. package/src/test/pdf-strategy.test.ts +84 -0
  65. package/src/test/queries.test.ts +247 -0
  66. package/src/test/robots.test.ts +56 -0
  67. package/src/test/scheduler.test.ts +90 -0
  68. package/src/test/seed.test.ts +35 -0
  69. package/src/test/worker.test.ts +144 -0
  70. package/src/worker/downloader.ts +149 -0
  71. package/src/worker/extractor.ts +235 -0
  72. package/src/worker/worker.ts +100 -0
  73. package/tsconfig.json +15 -0
@@ -0,0 +1,73 @@
1
import { describe, it, expect, vi, beforeEach, afterEach } from "vitest";

// Replace every collaborator of the scheduler with a controllable double.
vi.mock("../config.js", () => ({
  config: {
    WORKER_COUNT: 2,
    CRAWL_DELAY_MS: 1000,
    MAX_PAGES: 100,
  },
}));

vi.mock("../db/queries.js", () => ({
  claimNextURL: vi.fn(),
  getGlobalStats: vi.fn().mockResolvedValue({ pending: 0, fetching: 0, done: 0, failed: 0 }),
}));

vi.mock("../frontier/frontier.js", () => ({
  getPendingDomains: vi.fn(),
}));

vi.mock("../worker/worker.js", () => ({
  processPage: vi.fn().mockResolvedValue(undefined),
}));

import { claimNextURL } from "../db/queries.js";
import { getPendingDomains } from "../frontier/frontier.js";
import { startScheduler, stopScheduler, getCooldown } from "../frontier/scheduler.js";

const mockClaimNextURL = vi.mocked(claimNextURL);
const mockGetPendingDomains = vi.mocked(getPendingDomains);

describe("Scheduler", () => {
  beforeEach(() => {
    vi.clearAllMocks();
    vi.useFakeTimers();
  });

  afterEach(() => {
    vi.useRealTimers();
    stopScheduler();
  });

  it("should respect politeness delay (cooldowns) and round-robin domains", async () => {
    // Two domains have pending work.
    const pendingDomains = ["react.dev", "typescriptlang.org"];
    mockGetPendingDomains.mockResolvedValue(pendingDomains);

    // One claimable row per domain, handed out in call order.
    const reactRow = { id: 1, url: "https://react.dev", domain: "react.dev", status: "FETCHING", depth: 0 };
    const tsRow = { id: 2, url: "https://typescriptlang.org", domain: "typescriptlang.org", status: "FETCHING", depth: 0 };
    mockClaimNextURL.mockResolvedValueOnce(reactRow).mockResolvedValueOnce(tsRow);

    const running = startScheduler();

    // Let the first loop iteration run.
    await vi.advanceTimersByTimeAsync(0);

    // react.dev is claimed first and is put on cooldown.
    expect(mockClaimNextURL).toHaveBeenNthCalledWith(1, "react.dev");
    expect(getCooldown("react.dev")).toBeGreaterThan(0);

    // 100ms is well below the 1000ms crawl delay.
    await vi.advanceTimersByTimeAsync(100);

    // Round-robin moves on to typescriptlang.org, which has no cooldown yet.
    expect(mockClaimNextURL).toHaveBeenNthCalledWith(2, "typescriptlang.org");
    expect(getCooldown("typescriptlang.org")).toBeGreaterThan(0);

    await vi.advanceTimersByTimeAsync(100);

    // Both domains are cooling down, so no third claim may have happened.
    expect(mockClaimNextURL).toHaveBeenCalledTimes(2);

    // Stop the scheduler so its loop can exit, then wait for it to settle.
    stopScheduler();
    await vi.advanceTimersByTimeAsync(100);
    await running;
  });
});
@@ -0,0 +1,26 @@
1
import { describe, it, expect, vi, beforeEach } from "vitest";

// The database client is replaced so no real connection is opened.
vi.mock("../db/client.js", () => ({
  query: vi.fn().mockResolvedValue({ rows: [] }),
}));

// One well-formed seed and one malformed seed.
vi.mock("../config.js", () => ({
  config: {
    SEED_URLS: ["https://react.dev", "not-a-url"],
  },
}));

import { seedDatabase } from "../seed.js";
import { query } from "../db/client.js";

describe("Seeding Logic", () => {
  beforeEach(() => {
    vi.clearAllMocks();
  });

  it("should insert valid seed URLs and skip invalid ones", async () => {
    await seedDatabase();

    // Only "https://react.dev" may reach the database; "not-a-url" is skipped.
    expect(query).toHaveBeenCalledTimes(1);

    const expectedSql = expect.stringContaining("INSERT INTO urls");
    const expectedParams = ["https://react.dev", "react.dev"];
    expect(query).toHaveBeenCalledWith(expectedSql, expectedParams);
  });
});
@@ -0,0 +1,118 @@
1
import { describe, it, expect, vi, beforeEach } from "vitest";

// Network access, HTML parsing, persistence and robots checks are all doubled.
vi.mock("../worker/downloader.js", () => ({
  downloadPage: vi.fn(),
}));

vi.mock("../worker/extractor.js", () => ({
  extractPageData: vi.fn(),
}));

vi.mock("../db/queries.js", () => ({
  insertURL: vi.fn(),
  insertLink: vi.fn(),
  markDone: vi.fn(),
  markFailed: vi.fn(),
}));

vi.mock("../config.js", () => ({
  config: {
    ALLOWED_DOMAINS: ["react.dev"],
    MAX_DEPTH: 2,
  },
}));

vi.mock("../frontier/robots.js", () => ({
  isAllowedByRobots: vi.fn(),
}));

import { downloadPage } from "../worker/downloader.js";
import { extractPageData } from "../worker/extractor.js";
import { insertURL, insertLink, markDone, markFailed } from "../db/queries.js";
import { isAllowedByRobots } from "../frontier/robots.js";
import { processPage } from "../worker/worker.js";

const mockDownloadPage = vi.mocked(downloadPage);
const mockExtractPageData = vi.mocked(extractPageData);
const mockInsertURL = vi.mocked(insertURL);
const mockInsertLink = vi.mocked(insertLink);
const mockMarkDone = vi.mocked(markDone);
const mockMarkFailed = vi.mocked(markFailed);
const mockIsAllowedByRobots = vi.mocked(isAllowedByRobots);

// Fixture builders shared by the scenarios below.
const docsDownload = () => ({
  url: "https://react.dev/docs",
  html: "<html>...</html>",
  statusCode: 200,
});

const docsMetadata = () => ({
  title: "React Docs",
  description: "Learn React",
  canonicalUrl: "https://react.dev/docs",
  headings: { h1: ["Docs"], h2: [], h3: [] },
  textContent: "Learn React content",
});

const docsRow = (depth) => ({ id: 42, url: "https://react.dev/docs", depth });

describe("Worker Pipeline", () => {
  beforeEach(() => {
    vi.clearAllMocks();
    mockIsAllowedByRobots.mockResolvedValue(true);
  });

  it("should successfully process a page, extract content, and insert links", async () => {
    mockDownloadPage.mockResolvedValue(docsDownload());
    mockExtractPageData.mockReturnValue({
      ...docsMetadata(),
      links: ["/tutorial", "https://external.com", "https://react.dev/docs"],
    });
    mockInsertURL.mockResolvedValue(100);

    await processPage(docsRow(1));

    expect(mockMarkDone).toHaveBeenCalledTimes(1);
    expect(mockMarkDone).toHaveBeenCalledWith(42, docsMetadata());

    // Only the in-domain, non-self link is enqueued, one level deeper.
    expect(mockInsertURL).toHaveBeenCalledTimes(1);
    expect(mockInsertURL).toHaveBeenCalledWith("https://react.dev/tutorial", "react.dev", 2);
    expect(mockInsertLink).toHaveBeenCalledTimes(1);
    expect(mockInsertLink).toHaveBeenCalledWith(42, 100);
    expect(mockMarkFailed).not.toHaveBeenCalled();
  });

  it("should mark URL as FAILED if download fails", async () => {
    mockDownloadPage.mockRejectedValue(new Error("Network Error"));

    const outcome = processPage(docsRow(1));
    await expect(outcome).rejects.toThrow("Network Error");

    expect(mockMarkFailed).toHaveBeenCalledTimes(1);
    expect(mockMarkFailed).toHaveBeenCalledWith(42, "Network Error");
    expect(mockMarkDone).not.toHaveBeenCalled();
  });

  it("should discard links that exceed MAX_DEPTH", async () => {
    mockDownloadPage.mockResolvedValue(docsDownload());
    mockExtractPageData.mockReturnValue({
      ...docsMetadata(),
      links: ["/tutorial"],
    });

    // Current depth 2 means discovered links would sit at depth 3 > MAX_DEPTH (2).
    await processPage(docsRow(2));

    expect(mockMarkDone).toHaveBeenCalledTimes(1);
    expect(mockInsertURL).not.toHaveBeenCalled();
    expect(mockInsertLink).not.toHaveBeenCalled();
  });

  it("should abort crawl if URL is disallowed by robots.txt", async () => {
    mockIsAllowedByRobots.mockResolvedValue(false);

    await processPage({ id: 42, url: "https://react.dev/private", depth: 1 });

    expect(mockMarkFailed).toHaveBeenCalledTimes(1);
    expect(mockMarkFailed).toHaveBeenCalledWith(42, "Disallowed by robots.txt");
    expect(mockDownloadPage).not.toHaveBeenCalled();
    expect(mockMarkDone).not.toHaveBeenCalled();
  });
});
@@ -0,0 +1,114 @@
1
+ import { request } from "undici";
2
+ import { config } from "../config.js";
3
+ import { isBlockedAddress } from "../security/ssrf.js";
4
/**
 * SSRF guard run before a request is issued.
 *
 * Extracts the hostname from `url` and asks `isBlockedAddress` whether it may
 * be contacted; which addresses count as blocked is decided by that helper.
 *
 * @param {string} url - Absolute URL about to be requested.
 * @returns {Promise<void>} Resolves when the hostname is not blocked.
 * @throws {TypeError} If `url` cannot be parsed by the URL constructor.
 * @throws {Error} If `isBlockedAddress` reports the hostname as blocked.
 */
async function assertNotBlocked(url) {
  const { hostname } = new URL(url);
  const blocked = await isBlockedAddress(hostname);
  if (!blocked) {
    return;
  }
  throw new Error(`SSRF blocked: "${hostname}" resolves to a private or internal address`);
}
14
/**
 * Fetches the HTML of a page, following redirects manually so that every hop
 * can be SSRF-checked before it is requested.
 *
 * @param {string} initialUrl - Absolute URL to start from.
 * @returns {Promise<{url: string, html: string, statusCode: number}>} The final
 *   URL after redirects, the response body as text, and the status code (200).
 * @throws {Error} "Too many redirects" once config.MAX_REDIRECTS hops were followed,
 *   `HTTP status N` for any non-200 final response, `Non-HTML content type: …`
 *   when a Content-Type header is present and is not text/html, or whatever
 *   `assertNotBlocked` / `request` / the URL constructor throw.
 */
export async function downloadPage(initialUrl) {
  let currentUrl = initialUrl;
  let redirectCount = 0;

  // SSRF check on the initial URL before any network activity.
  await assertNotBlocked(currentUrl);

  while (true) {
    const res = await request(currentUrl, {
      method: "GET",
      headersTimeout: config.REQUEST_TIMEOUT_MS,
      bodyTimeout: config.REQUEST_TIMEOUT_MS,
    });
    const { statusCode } = res;

    // Redirects (301, 302, 303, 307, 308 …) that carry a Location header.
    if (statusCode >= 300 && statusCode < 400 && res.headers.location) {
      // Drain the body first: the checks below may throw, and an unread body
      // would keep the underlying connection from being released.
      await res.body.text();

      if (redirectCount >= config.MAX_REDIRECTS) {
        throw new Error("Too many redirects");
      }

      const location = Array.isArray(res.headers.location)
        ? res.headers.location[0]
        : res.headers.location;
      const nextUrl = new URL(location, currentUrl).href;

      // SSRF check on every redirect target before following it.
      await assertNotBlocked(nextUrl);

      currentUrl = nextUrl;
      redirectCount++;
      continue;
    }

    // Anything other than 200 is treated as a failure.
    if (statusCode !== 200) {
      await res.body.text();
      throw new Error(`HTTP status ${statusCode}`);
    }

    // Reject non-HTML payloads. Media types are case-insensitive, so compare
    // in lower case; a missing header is let through as before.
    const contentTypeHeader = res.headers["content-type"];
    const contentType = Array.isArray(contentTypeHeader)
      ? contentTypeHeader[0]
      : contentTypeHeader;
    if (contentType && !contentType.toLowerCase().includes("text/html")) {
      await res.body.text();
      throw new Error(`Non-HTML content type: ${contentType}`);
    }

    const html = await res.body.text();
    return { url: currentUrl, html, statusCode };
  }
}
65
/**
 * Downloads an image asset, following redirects manually so that every hop
 * can be SSRF-checked before it is requested.
 *
 * @param {string} initialUrl - Absolute URL of the image.
 * @returns {Promise<Buffer>} The raw response body.
 * @throws {Error} "Too many redirects fetching image" once config.MAX_REDIRECTS
 *   hops were followed, `HTTP status N fetching image` for a non-200 final
 *   response, `Non-image content type: …` when a Content-Type header is present
 *   and does not start with image/, or whatever `assertNotBlocked` / `request` /
 *   the URL constructor throw.
 */
export async function downloadImage(initialUrl) {
  let currentUrl = initialUrl;
  let redirectCount = 0;

  // SSRF check on the initial URL before any network activity.
  await assertNotBlocked(currentUrl);

  while (true) {
    const res = await request(currentUrl, {
      method: "GET",
      headersTimeout: config.REQUEST_TIMEOUT_MS,
      bodyTimeout: config.REQUEST_TIMEOUT_MS,
    });
    const { statusCode } = res;

    // Redirects (301, 302, 303, 307, 308 …) that carry a Location header.
    if (statusCode >= 300 && statusCode < 400 && res.headers.location) {
      // Drain the body first: the checks below may throw, and an unread body
      // would keep the underlying connection from being released.
      await res.body.text();

      if (redirectCount >= config.MAX_REDIRECTS) {
        throw new Error("Too many redirects fetching image");
      }

      const location = Array.isArray(res.headers.location)
        ? res.headers.location[0]
        : res.headers.location;
      const nextUrl = new URL(location, currentUrl).href;

      // SSRF check on the target before redirecting.
      await assertNotBlocked(nextUrl);

      currentUrl = nextUrl;
      redirectCount++;
      continue;
    }

    if (statusCode !== 200) {
      await res.body.text();
      throw new Error(`HTTP status ${statusCode} fetching image`);
    }

    // Validate that the payload is declared as an image. Media types are
    // case-insensitive; a missing header is let through as before.
    const contentTypeHeader = res.headers["content-type"];
    const contentType = Array.isArray(contentTypeHeader)
      ? contentTypeHeader[0]
      : contentTypeHeader;
    if (contentType && !contentType.trim().toLowerCase().startsWith("image/")) {
      await res.body.text();
      throw new Error(`Non-image content type: ${contentType}`);
    }

    const arrayBuffer = await res.body.arrayBuffer();
    return Buffer.from(arrayBuffer);
  }
}
@@ -0,0 +1,197 @@
1
+ import * as cheerio from "cheerio";
2
/** Tags whose subtrees are treated as site chrome / non-content and dropped. */
const NON_CONTENT_TAGS = ["script", "style", "noscript", "iframe", "nav", "footer", "header"];

/** Collapses every run of whitespace to a single space and trims both ends. */
function collapseWhitespace(text) {
  return text.replace(/\s+/g, " ").trim();
}

/**
 * Picks the absolute URL that image sources are resolved against.
 * The canonical link wins, but it is itself resolved against `baseUrl` first,
 * because a relative canonical href is not a usable base for `new URL()`.
 */
function pickResolutionBase(canonicalUrl, baseUrl) {
  if (canonicalUrl) {
    try {
      return new URL(canonicalUrl, baseUrl || undefined).href;
    }
    catch {
      // Unusable canonical href: fall back to the page URL below.
    }
  }
  return baseUrl || null;
}

/** Resolves an image `src` against `resolutionBase`; returns `src` unchanged when that is impossible. */
function resolveImageSrc(src, resolutionBase) {
  if (!resolutionBase) {
    return src;
  }
  try {
    return new URL(src, resolutionBase).href;
  }
  catch {
    // Keep the source as written if it cannot be resolved.
    return src;
  }
}

/**
 * Extracts metadata, headings, outgoing links, ordered content blocks and
 * images from an HTML document.
 *
 * The main content node is the first of `<article>`, `<main>`, `[role=main]`;
 * if none exists, the `div`/`section` with the highest text-length-per-descendant
 * score is used (falling back to `<body>`). Non-content tags are removed from a
 * clone of that node before blocks and images are collected.
 *
 * @param {string} html - Raw HTML source.
 * @param {string} [baseUrl] - URL the document was fetched from; used to
 *   resolve image sources (and a relative canonical href).
 * @returns {{
 *   title: string|null, description: string|null, canonicalUrl: string|null,
 *   headings: {h1: string[], h2: string[], h3: string[]},
 *   textContent: string|null, links: string[],
 *   blocks: object[], images: {src: string, alt: string}[]
 * }} `canonicalUrl` and `links` are returned exactly as written in the document.
 */
export function extractPageData(html, baseUrl) {
  const $ = cheerio.load(html);

  const title = $("title").text().trim() || null;
  const description = $("meta[name=description]").attr("content")?.trim() || null;
  const canonicalUrl = $("link[rel=canonical]").attr("href")?.trim() || null;

  // Document-wide heading texts (trimmed only, not whitespace-collapsed).
  const collectTexts = (selector) => {
    const texts = [];
    $(selector).each((_, el) => {
      const text = $(el).text().trim();
      if (text) {
        texts.push(text);
      }
    });
    return texts;
  };
  const h1 = collectTexts("h1");
  const h2 = collectTexts("h2");
  const h3 = collectTexts("h3");

  // Raw href values of every anchor in the document.
  const links = [];
  $("a[href]").each((_, el) => {
    const href = $(el).attr("href")?.trim();
    if (href) {
      links.push(href);
    }
  });

  const resolutionBase = pickResolutionBase(canonicalUrl, baseUrl);

  // 1. Main-content selection: semantic containers first.
  let mainNode = $("article").first();
  if (mainNode.length === 0) {
    mainNode = $("main").first();
  }
  if (mainNode.length === 0) {
    mainNode = $("[role=main]").first();
  }

  // Fallback: text-density heuristic over div/section elements.
  if (mainNode.length === 0) {
    const totalBodyText = $("body").text().trim();
    const minTextLength = Math.min(200, totalBodyText.length * 0.1);
    let bestNode = $("body");
    let maxScore = -1;
    $("div, section").each((_, el) => {
      const $el = $(el);
      const textLength = $el.text().trim().length;
      if (textLength < minTextLength) {
        return;
      }
      const tagCount = $el.find("*").length;
      const score = textLength / (tagCount + 1);
      if (score > maxScore) {
        maxScore = score;
        bestNode = $el;
      }
    });
    mainNode = bestNode;
  }

  // 2. Work on a cleaned clone so the original document stays intact.
  const cleanedNode = mainNode.clone();
  cleanedNode.find(NON_CONTENT_TAGS.join(", ")).remove();

  // 3. Collect the flat image list and the in-order content blocks.
  const blocks = [];
  const images = [];

  cleanedNode.find("img").each((_, img) => {
    const src = $(img).attr("src")?.trim();
    const alt = $(img).attr("alt")?.trim() || "";
    if (src) {
      images.push({ src: resolveImageSrc(src, resolutionBase), alt });
    }
  });

  // When true, the next bare text node starts a new paragraph block instead of
  // being appended to the previous one.
  let forceNewParagraph = true;

  function walk(node) {
    if (node.type === "text") {
      const text = collapseWhitespace(node.data);
      if (text) {
        const lastBlock = blocks[blocks.length - 1];
        if (!forceNewParagraph && lastBlock && lastBlock.type === "paragraph") {
          lastBlock.text = collapseWhitespace(`${lastBlock.text} ${text}`);
        }
        else {
          blocks.push({ type: "paragraph", text });
          forceNewParagraph = false;
        }
      }
      return;
    }
    if (node.type !== "tag") {
      return;
    }

    const el = node;
    const tagName = el.tagName?.toLowerCase();

    // Already removed from the clone; skipped again defensively.
    if (NON_CONTENT_TAGS.includes(tagName)) {
      return;
    }

    if (/^h[1-6]$/.test(tagName)) {
      const level = parseInt(tagName.substring(1), 10);
      const text = collapseWhitespace($(el).text());
      if (text) {
        blocks.push({ type: "heading", level, text });
      }
      forceNewParagraph = true;
    }
    else if (tagName === "p") {
      const text = collapseWhitespace($(el).text());
      if (text) {
        blocks.push({ type: "paragraph", text });
      }
      forceNewParagraph = true;
    }
    else if (tagName === "ul" || tagName === "ol") {
      const items = [];
      $(el).find("li").each((_, li) => {
        const itemText = collapseWhitespace($(li).text());
        if (itemText) {
          items.push(itemText);
        }
      });
      if (items.length > 0) {
        blocks.push({ type: "list", items });
      }
      forceNewParagraph = true;
    }
    else if (tagName === "img") {
      const src = $(el).attr("src")?.trim();
      const alt = $(el).attr("alt")?.trim() || "";
      if (src) {
        blocks.push({ type: "image", src: resolveImageSrc(src, resolutionBase), alt });
      }
      forceNewParagraph = true;
    }
    else if (tagName === "br") {
      forceNewParagraph = true;
    }
    else {
      // Generic containers (div, span, …): descend into the children.
      $(el).contents().each((_, child) => {
        walk(child);
      });
    }
  }

  cleanedNode.contents().each((_, child) => {
    walk(child);
  });

  // Flat textContent kept for backwards compatibility: paragraph, heading and
  // list text joined with single spaces.
  const textContentParts = [];
  for (const block of blocks) {
    if (block.type === "paragraph" && block.text) {
      textContentParts.push(block.text);
    }
    else if (block.type === "heading" && block.text) {
      textContentParts.push(block.text);
    }
    else if (block.type === "list" && block.items) {
      textContentParts.push(block.items.join(" "));
    }
  }
  const textContent = collapseWhitespace(textContentParts.join(" ")) || null;

  return {
    title,
    description,
    canonicalUrl,
    headings: { h1, h2, h3 },
    textContent,
    links,
    blocks,
    images,
  };
}
@@ -0,0 +1,87 @@
1
+ import { downloadPage } from "./downloader.js";
2
+ import { extractPageData } from "./extractor.js";
3
+ import { normalizeURL, getDomain } from "../normalizer.js";
4
+ import { insertURL, insertLink, markFailed } from "../db/queries.js";
5
+ import { config } from "../config.js";
6
+ import { isAllowedByRobots } from "../frontier/robots.js";
7
+ import { getStrategy } from "../output/index.js";
8
/**
 * Reports whether links on `domain` may be enqueued.
 * An unset or empty `config.ALLOWED_DOMAINS` means no restriction; otherwise
 * the domain must appear in that list exactly.
 */
function isDomainAllowed(domain) {
  const allowList = config.ALLOWED_DOMAINS;
  const unrestricted = !allowList || allowList.length === 0;
  return unrestricted || allowList.includes(domain);
}
14
/**
 * Runs the complete crawl workflow for one claimed URL row:
 *   0. Checks robots.txt; a disallowed URL is marked failed and skipped.
 *   1. Downloads the page.
 *   2. Extracts metadata, content and outgoing links.
 *   3. Hands the extracted content to the active output strategy.
 *   4. Normalizes, filters and de-duplicates outgoing links, then inserts each
 *      one at depth + 1 together with a link-graph edge from this page.
 *
 * @param {{id: number, url: string, depth: number}} urlRow - Row being processed.
 * @returns {Promise<void>}
 * @throws Re-throws whatever failed inside the workflow after recording the
 *   failure via `markFailed`. If recording the failure itself fails, that
 *   secondary error is logged and the original error is still the one thrown.
 */
export async function processPage(urlRow) {
  const urlId = urlRow.id;
  const pageUrl = urlRow.url;
  const currentDepth = urlRow.depth;

  try {
    // 0. robots.txt compliance.
    const allowed = await isAllowedByRobots(pageUrl);
    if (!allowed) {
      await markFailed(urlId, "Disallowed by robots.txt");
      return;
    }

    // 1. Download page content.
    const downloadResult = await downloadPage(pageUrl);

    // 2. Extract content and outgoing links.
    const extracted = extractPageData(downloadResult.html, downloadResult.url);

    // Prefer the canonical URL (when it normalizes) over the fetched URL.
    let finalUrl = downloadResult.url;
    if (extracted.canonicalUrl) {
      const normalizedCanonical = normalizeURL(extracted.canonicalUrl, finalUrl);
      if (normalizedCanonical) {
        finalUrl = normalizedCanonical;
      }
    }

    // 3. Persist content via the active output strategy.
    const strategy = getStrategy();
    await strategy.save(urlId, finalUrl, {
      title: extracted.title,
      description: extracted.description,
      canonicalUrl: extracted.canonicalUrl,
      headings: extracted.headings,
      textContent: extracted.textContent,
      blocks: extracted.blocks,
      images: extracted.images,
    });

    // 4. Outgoing links. Every link on this page lands at the same depth, so
    // the MAX_DEPTH limit is checked once instead of per link.
    const nextDepth = currentDepth + 1;
    if (nextDepth > config.MAX_DEPTH) {
      return;
    }

    const uniqueNormalizedLinks = new Set();
    for (const link of extracted.links) {
      const normalized = normalizeURL(link, finalUrl);
      if (!normalized) {
        continue;
      }
      // Skip self-referential links.
      if (normalized === finalUrl || normalized === pageUrl) {
        continue;
      }
      const linkDomain = getDomain(normalized);
      if (!linkDomain || !isDomainAllowed(linkDomain)) {
        continue;
      }
      uniqueNormalizedLinks.add(normalized);
    }

    for (const normalizedLink of uniqueNormalizedLinks) {
      const targetDomain = getDomain(normalizedLink);
      // insertURL returns the target's id; it is used for the graph edge.
      const targetUrlId = await insertURL(normalizedLink, targetDomain, nextDepth);
      await insertLink(urlId, targetUrlId);
    }
  }
  catch (error) {
    const errorMsg = error instanceof Error ? error.message : String(error);
    try {
      await markFailed(urlId, errorMsg);
    }
    catch (markError) {
      // Do not let a bookkeeping failure hide the real cause of the crawl error.
      console.error(`Failed to mark URL ${urlId} as failed:`, markError);
    }
    throw error;
  }
}
package/package.json ADDED
@@ -0,0 +1,48 @@
1
+ {
2
+ "name": "messi-crawler",
3
+ "version": "1.0.0",
4
+ "description": "A web crawler built with Node.js and TypeScript for collecting programming-related documentation and web content.",
5
+ "main": "dist/index.js",
6
+ "bin": {
7
+ "messi-crawler": "./dist/index.js"
8
+ },
9
+ "type": "module",
10
+ "scripts": {
11
+ "build": "tsc",
12
+ "start": "node dist/index.js",
13
+ "dev": "tsx src/index.ts",
14
+ "test": "vitest",
15
+ "config": "tsx src/setup.ts",
16
+ "crawl": "npm run build && node dist/index.js",
17
+ "db:clear": "tsx src/db/clear.ts"
18
+ },
19
+ "repository": {
20
+ "type": "git",
21
+ "url": "git+https://github.com/lightning4747/Web-crawler-cli.git"
22
+ },
23
+ "keywords": [],
24
+ "author": "",
25
+ "license": "ISC",
26
+ "bugs": {
27
+ "url": "https://github.com/lightning4747/Web-crawler-cli/issues"
28
+ },
29
+ "homepage": "https://github.com/lightning4747/Web-crawler-cli#readme",
30
+ "dependencies": {
31
+ "cheerio": "^1.2.0",
32
+ "dotenv": "^17.4.2",
33
+ "inquirer": "^14.0.2",
34
+ "pdfkit": "^0.19.1",
35
+ "pg": "^8.21.0",
36
+ "robots-parser": "^3.0.1",
37
+ "undici": "^8.3.0"
38
+ },
39
+ "devDependencies": {
40
+ "@types/inquirer": "^9.0.10",
41
+ "@types/node": "^25.9.1",
42
+ "@types/pdfkit": "^0.17.6",
43
+ "@types/pg": "^8.20.0",
44
+ "tsx": "^4.22.4",
45
+ "typescript": "^6.0.3",
46
+ "vitest": "^4.1.8"
47
+ }
48
+ }
package/seeds.txt ADDED
@@ -0,0 +1,4 @@
1
+ https://www.akc.org/dog-breeds/
2
+ https://www.royalkennelclub.org/breeds-a-z/
3
+ https://dogapi.dog/api/v2/breeds
4
+ https://dog.ceo/api/breeds/list/all