messi-crawler 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73)
  1. package/README.md +201 -0
  2. package/dist/cli/renderer.js +71 -0
  3. package/dist/config.js +18 -0
  4. package/dist/db/clear.js +16 -0
  5. package/dist/db/client.js +20 -0
  6. package/dist/db/queries.js +179 -0
  7. package/dist/frontier/frontier.js +44 -0
  8. package/dist/frontier/logger.js +65 -0
  9. package/dist/frontier/robots.js +46 -0
  10. package/dist/frontier/scheduler.js +98 -0
  11. package/dist/index.js +533 -0
  12. package/dist/normalizer.js +33 -0
  13. package/dist/output/db-strategy.js +16 -0
  14. package/dist/output/index.js +23 -0
  15. package/dist/output/pdf-strategy.js +316 -0
  16. package/dist/output/strategy.js +1 -0
  17. package/dist/security/ssrf.js +45 -0
  18. package/dist/security/validate-url.js +41 -0
  19. package/dist/seed.js +14 -0
  20. package/dist/setup.js +148 -0
  21. package/dist/test/client.test.js +33 -0
  22. package/dist/test/downloader.test.js +84 -0
  23. package/dist/test/extractor.test.js +126 -0
  24. package/dist/test/frontier.test.js +43 -0
  25. package/dist/test/logger.test.js +55 -0
  26. package/dist/test/normalizer.test.js +36 -0
  27. package/dist/test/pdf-strategy.test.js +68 -0
  28. package/dist/test/queries.test.js +173 -0
  29. package/dist/test/robots.test.js +46 -0
  30. package/dist/test/scheduler.test.js +73 -0
  31. package/dist/test/seed.test.js +26 -0
  32. package/dist/test/worker.test.js +118 -0
  33. package/dist/worker/downloader.js +114 -0
  34. package/dist/worker/extractor.js +197 -0
  35. package/dist/worker/worker.js +87 -0
  36. package/package.json +48 -0
  37. package/seeds.txt +4 -0
  38. package/src/cli/renderer.ts +83 -0
  39. package/src/config.ts +22 -0
  40. package/src/db/clear.ts +16 -0
  41. package/src/db/client.ts +26 -0
  42. package/src/db/queries.ts +255 -0
  43. package/src/db/schema.sql +43 -0
  44. package/src/frontier/frontier.ts +60 -0
  45. package/src/frontier/logger.ts +75 -0
  46. package/src/frontier/robots.ts +50 -0
  47. package/src/frontier/scheduler.ts +119 -0
  48. package/src/index.ts +596 -0
  49. package/src/normalizer.ts +37 -0
  50. package/src/output/db-strategy.ts +20 -0
  51. package/src/output/index.ts +32 -0
  52. package/src/output/pdf-strategy.ts +388 -0
  53. package/src/output/strategy.ts +16 -0
  54. package/src/security/ssrf.ts +48 -0
  55. package/src/security/validate-url.ts +49 -0
  56. package/src/seed.ts +18 -0
  57. package/src/setup.ts +170 -0
  58. package/src/test/client.test.ts +38 -0
  59. package/src/test/downloader.test.ts +101 -0
  60. package/src/test/extractor.test.ts +139 -0
  61. package/src/test/frontier.test.ts +53 -0
  62. package/src/test/logger.test.ts +71 -0
  63. package/src/test/normalizer.test.ts +43 -0
  64. package/src/test/pdf-strategy.test.ts +84 -0
  65. package/src/test/queries.test.ts +247 -0
  66. package/src/test/robots.test.ts +56 -0
  67. package/src/test/scheduler.test.ts +90 -0
  68. package/src/test/seed.test.ts +35 -0
  69. package/src/test/worker.test.ts +144 -0
  70. package/src/worker/downloader.ts +149 -0
  71. package/src/worker/extractor.ts +235 -0
  72. package/src/worker/worker.ts +100 -0
  73. package/tsconfig.json +15 -0
@@ -0,0 +1,149 @@
1
+ import { request } from "undici";
2
+ import { config } from "../config.js";
3
+ import { isBlockedAddress } from "../security/ssrf.js";
4
+
/** Outcome of a successful page download. */
export interface DownloaderResult {
  /** URL the body was finally served from, i.e. after any redirects were followed. */
  url: string;
  /** Response body decoded as text. */
  html: string;
  /** HTTP status code of the final response. */
  statusCode: number;
}
10
+
/**
 * SSRF guard: rejects a URL whose hostname `isBlockedAddress` reports as blocked.
 *
 * The decision itself (name resolution, which address ranges count as
 * internal) lives in `isBlockedAddress`; this wrapper only extracts the
 * hostname and converts a positive answer into an exception.
 *
 * @param url - Absolute URL about to be requested.
 * @throws Error when the hostname is reported as blocked. `new URL` also
 *   throws if `url` is not a valid absolute URL.
 */
async function assertNotBlocked(url: string): Promise<void> {
  const { hostname } = new URL(url);
  const blocked = await isBlockedAddress(hostname);

  if (!blocked) {
    return;
  }

  throw new Error(`SSRF blocked: "${hostname}" resolves to a private or internal address`);
}
21
+
/**
 * Fetches a page as HTML, following redirects manually so that every hop can be
 * checked against the SSRF guard before it is requested.
 *
 * @param initialUrl - Absolute URL to start from.
 * @returns The URL the content was finally served from, the body as text, and
 *   the status code (200 whenever this function returns).
 * @throws Error when a hop is rejected by `assertNotBlocked`, when more than
 *   `config.MAX_REDIRECTS` redirects would be needed, when the final status is
 *   not 200, or when the response declares a content type that is not
 *   `text/html`. Failures from `request` (timeouts, network errors) and from
 *   parsing a malformed `Location` header propagate unchanged.
 */
export async function downloadPage(initialUrl: string): Promise<DownloaderResult> {
  let currentUrl = initialUrl;
  let redirectCount = 0;

  // SSRF check on the initial URL before any network activity.
  await assertNotBlocked(currentUrl);

  while (true) {
    const res = await request(currentUrl, {
      method: "GET",
      headersTimeout: config.REQUEST_TIMEOUT_MS,
      bodyTimeout: config.REQUEST_TIMEOUT_MS,
    });

    const statusCode = res.statusCode;
    const locationHeader = res.headers.location;

    // Any 3xx response that carries a Location header is treated as a redirect.
    if (statusCode >= 300 && statusCode < 400 && locationHeader) {
      // Drain the body before anything below can throw (redirect limit,
      // malformed Location, SSRF rejection) so the connection is always released.
      await res.body.text();

      if (redirectCount >= config.MAX_REDIRECTS) {
        throw new Error("Too many redirects");
      }

      const location = Array.isArray(locationHeader) ? locationHeader[0] : locationHeader;

      // Location may be relative; resolve it against the URL that produced it.
      const nextUrl = new URL(location, currentUrl).href;

      // SSRF check on every redirect target before following it.
      await assertNotBlocked(nextUrl);

      currentUrl = nextUrl;
      redirectCount++;
      continue;
    }

    // Only a plain 200 is accepted; the body is still drained to release the connection.
    if (statusCode !== 200) {
      await res.body.text();
      throw new Error(`HTTP status ${statusCode}`);
    }

    const contentTypeHeader = res.headers["content-type"];
    const contentType = Array.isArray(contentTypeHeader)
      ? contentTypeHeader[0]
      : contentTypeHeader;

    // Media types are case-insensitive, so compare in lower case. A response
    // without a Content-Type header is let through.
    if (contentType && !contentType.toLowerCase().includes("text/html")) {
      await res.body.text();
      throw new Error(`Non-HTML content type: ${contentType}`);
    }

    const html = await res.body.text();
    return { url: currentUrl, html, statusCode };
  }
}
86
+
/**
 * Downloads an image, applying the same per-hop SSRF guard, redirect limit and
 * timeouts as page downloads.
 *
 * @param initialUrl - Absolute URL of the image.
 * @returns The raw response body.
 * @throws Error when a hop is rejected by `assertNotBlocked`, when more than
 *   `config.MAX_REDIRECTS` redirects would be needed, when the final status is
 *   not 200, or when the response declares a content type outside `image/*`.
 *   Failures from `request` and from parsing a malformed `Location` header
 *   propagate unchanged.
 */
export async function downloadImage(initialUrl: string): Promise<Buffer> {
  let currentUrl = initialUrl;
  let redirectCount = 0;

  // SSRF check on the initial URL before any network activity.
  await assertNotBlocked(currentUrl);

  while (true) {
    const res = await request(currentUrl, {
      method: "GET",
      headersTimeout: config.REQUEST_TIMEOUT_MS,
      bodyTimeout: config.REQUEST_TIMEOUT_MS,
    });

    const statusCode = res.statusCode;
    const locationHeader = res.headers.location;

    // Any 3xx response that carries a Location header is treated as a redirect.
    if (statusCode >= 300 && statusCode < 400 && locationHeader) {
      // Drain the body before anything below can throw (redirect limit,
      // malformed Location, SSRF rejection) so the connection is always released.
      await res.body.text();

      if (redirectCount >= config.MAX_REDIRECTS) {
        throw new Error("Too many redirects fetching image");
      }

      const location = Array.isArray(locationHeader) ? locationHeader[0] : locationHeader;

      // Location may be relative; resolve it against the URL that produced it.
      const nextUrl = new URL(location, currentUrl).href;

      // SSRF check on the target before redirecting.
      await assertNotBlocked(nextUrl);

      currentUrl = nextUrl;
      redirectCount++;
      continue;
    }

    if (statusCode !== 200) {
      await res.body.text();
      throw new Error(`HTTP status ${statusCode} fetching image`);
    }

    const contentTypeHeader = res.headers["content-type"];
    const contentType = Array.isArray(contentTypeHeader)
      ? contentTypeHeader[0]
      : contentTypeHeader;

    // Media types are case-insensitive, so compare in lower case. A response
    // without a Content-Type header is let through.
    if (contentType && !contentType.toLowerCase().startsWith("image/")) {
      await res.body.text();
      throw new Error(`Non-image content type: ${contentType}`);
    }

    const arrayBuffer = await res.body.arrayBuffer();
    return Buffer.from(arrayBuffer);
  }
}
149
+
@@ -0,0 +1,235 @@
1
+ import * as cheerio from "cheerio";
2
+
/**
 * One unit of page content, kept in document order.
 *
 * Which optional fields are populated depends on `type`:
 * - `"heading"`: `level` and `text`
 * - `"paragraph"`: `text`
 * - `"list"`: `items`
 * - `"image"`: `src` and `alt`
 */
export interface ContentBlock {
  /** Discriminates the kind of block. */
  type: "heading" | "paragraph" | "list" | "image";
  /** Whitespace-collapsed text of a heading or paragraph. */
  text?: string;
  /** Heading level, 1 through 6. */
  level?: number;
  /** Text of each list item. */
  items?: string[];
  /** Image source URL. */
  src?: string;
  /** Image alternative text; empty string when the attribute is missing. */
  alt?: string;
}
11
+
/** An image found inside the main content of a page. */
export interface ExtractedImage {
  /** Image source URL. */
  src: string;
  /** Alternative text; empty string when the `alt` attribute is missing. */
  alt: string;
}
16
+
/** Everything `extractPageData` pulls out of one HTML document. */
export interface ExtractedData {
  /** Trimmed `<title>` text, or `null` when absent or empty. */
  title: string | null;
  /** Trimmed `content` of `<meta name="description">`, or `null`. */
  description: string | null;
  /** Trimmed `href` of `<link rel="canonical">` exactly as written in the page, or `null`. */
  canonicalUrl: string | null;
  /** Non-empty heading texts grouped by level, in document order. */
  headings: {
    h1: string[];
    h2: string[];
    h3: string[];
  };
  /** Heading, paragraph and list text joined into one string, or `null` when there is none. */
  textContent: string | null;
  /** Raw, unresolved `href` values of every anchor in the document. */
  links: string[];
  /** Main-content blocks in document order. */
  blocks?: ContentBlock[];
  /** Every image found inside the main content. */
  images?: ExtractedImage[];
}
31
+
/**
 * Extracts metadata, headings, outgoing links, ordered content blocks and
 * images from an HTML document.
 *
 * Metadata, headings and links are read from the whole document. Blocks and
 * images are read from a single "main content" node: the first `<article>`,
 * else the first `<main>`, else the first `[role=main]`, else the `<div>` or
 * `<section>` with the highest ratio of text length to descendant count
 * (falling back to `<body>`). Scripts, styles, iframes, nav, header and
 * footer elements are stripped from a clone of that node before walking it.
 *
 * @param html - Raw HTML of the page.
 * @param baseUrl - URL the document was fetched from. Relative image `src`
 *   values are resolved against it; when omitted the canonical URL is used
 *   instead, and when neither yields a valid URL the `src` is kept as written.
 * @returns The extracted data. `links` holds raw `href` values; they are not
 *   resolved here.
 */
export function extractPageData(html: string, baseUrl?: string): ExtractedData {
  const $ = cheerio.load(html);

  /** Collapses every whitespace run to a single space and trims the ends. */
  const squash = (value: string): string => value.replace(/\s+/g, " ").trim();

  /** Trimmed, non-empty text of every element matching `selector`, in document order. */
  const collectTexts = (selector: string): string[] =>
    $(selector)
      .toArray()
      .map((el) => $(el).text().trim())
      .filter((text) => text.length > 0);

  // Inline SVGs carry their own <title> elements; reading every <title> would
  // glue their text onto the page title, so take the first one outside any <svg>.
  const title =
    $("title")
      .filter((_, el) => $(el).closest("svg").length === 0)
      .first()
      .text()
      .trim() || null;
  const description = $("meta[name=description]").attr("content")?.trim() || null;
  const canonicalUrl = $("link[rel=canonical]").attr("href")?.trim() || null;

  const headings = {
    h1: collectTexts("h1"),
    h2: collectTexts("h2"),
    h3: collectTexts("h3"),
  };

  const links = $("a[href]")
    .toArray()
    .map((el) => $(el).attr("href")?.trim())
    .filter((href): href is string => Boolean(href));

  // Relative URLs in a document are relative to the document's own URL, so the
  // fetched URL takes precedence. The canonical URL may live on another path
  // (or be relative itself) and is only a fallback when no baseUrl was given.
  const resolutionBase = baseUrl || canonicalUrl || null;

  /** Absolutizes an image `src`; returns it unchanged when that is not possible. */
  const resolveSrc = (src: string): string => {
    if (!resolutionBase) return src;
    try {
      return new URL(src, resolutionBase).href;
    } catch {
      // Base or src is not a usable URL; keep what the page said.
      return src;
    }
  };

  // 1. Pick the main-content node from explicit landmarks first.
  let mainNode = $("article").first();
  if (mainNode.length === 0) {
    mainNode = $("main").first();
  }
  if (mainNode.length === 0) {
    mainNode = $("[role=main]").first();
  }

  // No landmark: fall back to the densest <div>/<section>.
  if (mainNode.length === 0) {
    const totalBodyText = $("body").text().trim();
    // Candidates need at least 10% of the body text, capped at 200 characters.
    const minTextLength = Math.min(200, totalBodyText.length * 0.1);
    let bestNode = $("body");
    let maxScore = -1;

    $("div, section").each((_, el) => {
      const $el = $(el);
      const textLength = $el.text().trim().length;
      if (textLength < minTextLength) return;

      // Text per descendant element: favours text-heavy, markup-light containers.
      const score = textLength / ($el.find("*").length + 1);
      if (score > maxScore) {
        maxScore = score;
        bestNode = $el;
      }
    });

    mainNode = bestNode;
  }

  // 2. Work on a clone so stripping site chrome does not affect the document.
  const cleanedNode = mainNode.clone();
  cleanedNode.find("script, style, noscript, iframe, nav, footer, header").remove();

  // 3. Collect every image in the cleaned node, then the ordered blocks.
  const images: ExtractedImage[] = [];
  cleanedNode.find("img").each((_, img) => {
    const src = $(img).attr("src")?.trim();
    if (!src) return;
    images.push({ src: resolveSrc(src), alt: $(img).attr("alt")?.trim() || "" });
  });

  const blocks: ContentBlock[] = [];
  const skippedTags = new Set(["script", "style", "noscript", "iframe", "nav", "footer", "header"]);

  // True when the next loose text node must start a new paragraph instead of
  // being appended to the previous one.
  let forceNewParagraph = true;

  // The node type is left loose because cheerio's DOM node union is not imported in this file.
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  const walk = (node: any): void => {
    if (node.type === "text") {
      const text = squash(node.data);
      if (!text) return;

      const lastBlock = blocks[blocks.length - 1];
      if (!forceNewParagraph && lastBlock && lastBlock.type === "paragraph") {
        // Loose text following loose text (e.g. across inline tags) is one paragraph.
        lastBlock.text = squash(lastBlock.text + " " + text);
      } else {
        blocks.push({ type: "paragraph", text });
        forceNewParagraph = false;
      }
      return;
    }

    if (node.type !== "tag") return;

    const tagName: string = node.tagName?.toLowerCase() ?? "";

    // Already removed from the clone; skipped again defensively.
    if (skippedTags.has(tagName)) return;

    if (/^h[1-6]$/.test(tagName)) {
      const text = squash($(node).text());
      if (text) {
        blocks.push({ type: "heading", level: parseInt(tagName.substring(1), 10), text });
      }
      forceNewParagraph = true;
    } else if (tagName === "p") {
      const text = squash($(node).text());
      if (text) {
        blocks.push({ type: "paragraph", text });
      }
      forceNewParagraph = true;
    } else if (tagName === "ul" || tagName === "ol") {
      // Nested items are flattened into the same list. Each item contributes
      // only its own text so a nested list is not repeated inside its parent.
      const items = $(node)
        .find("li")
        .toArray()
        .map((li) => {
          const own = $(li).clone();
          own.find("ul, ol").remove();
          return squash(own.text());
        })
        .filter((item) => item.length > 0);
      if (items.length > 0) {
        blocks.push({ type: "list", items });
      }
      forceNewParagraph = true;
    } else if (tagName === "img") {
      const src = $(node).attr("src")?.trim();
      if (src) {
        blocks.push({
          type: "image",
          src: resolveSrc(src),
          alt: $(node).attr("alt")?.trim() || "",
        });
      }
      forceNewParagraph = true;
    } else if (tagName === "br") {
      forceNewParagraph = true;
    } else {
      // Generic containers (div, span, ...): descend into their children.
      $(node)
        .contents()
        .each((_, child) => {
          walk(child);
        });
    }
  };

  cleanedNode.contents().each((_, child) => {
    walk(child);
  });

  // Flat text kept for consumers that predate `blocks`: headings, paragraphs
  // and list items in order, images skipped.
  const textContentParts = blocks.flatMap((block) => {
    if (block.type === "list") return block.items ? [block.items.join(" ")] : [];
    if (block.type === "image") return [];
    return block.text ? [block.text] : [];
  });
  const textContent = squash(textContentParts.join(" ")) || null;

  return {
    title,
    description,
    canonicalUrl,
    headings,
    textContent,
    links,
    blocks,
    images,
  };
}
@@ -0,0 +1,100 @@
1
+ import { downloadPage } from "./downloader.js";
2
+ import { extractPageData } from "./extractor.js";
3
+ import { normalizeURL, getDomain } from "../normalizer.js";
4
+ import { insertURL, insertLink, markFailed } from "../db/queries.js";
5
+ import { config } from "../config.js";
6
+ import { isAllowedByRobots } from "../frontier/robots.js";
7
+ import { getStrategy } from "../output/index.js";
8
+
/**
 * Tells whether links to `domain` may be enqueued.
 *
 * A missing or empty `config.ALLOWED_DOMAINS` means no restriction; otherwise
 * the domain must appear in that list exactly.
 */
function isDomainAllowed(domain: string): boolean {
  const allowList = config.ALLOWED_DOMAINS;

  if (allowList && allowList.length > 0) {
    return allowList.includes(domain);
  }

  return true;
}
15
+
/**
 * Runs the full crawl workflow for one queued URL:
 * 0. Checks robots.txt; a disallowed URL is marked failed and skipped.
 * 1. Downloads the page.
 * 2. Extracts metadata, content and outgoing links.
 * 3. Hands the content to the active output strategy, keyed by the canonical
 *    URL when the page declares one that normalizes successfully.
 * 4. Normalizes and filters the outgoing links, then inserts each target URL
 *    and the link relation from this page to it.
 *
 * @param urlRow - Queue row: database id, URL to fetch, and crawl depth.
 * @throws Re-throws any error raised along the way after recording its
 *   message through `markFailed`.
 */
export async function processPage(urlRow: { id: number; url: string; depth: number }): Promise<void> {
  const { id: urlId, url: pageUrl, depth: currentDepth } = urlRow;

  try {
    // 0. Check robots.txt compliance.
    if (!(await isAllowedByRobots(pageUrl))) {
      await markFailed(urlId, "Disallowed by robots.txt");
      return;
    }

    // 1. Download page content. `documentUrl` is where the HTML actually came
    //    from, i.e. after redirects.
    const downloadResult = await downloadPage(pageUrl);
    const documentUrl = downloadResult.url;

    // 2. Extract content & outgoing links.
    const extracted = extractPageData(downloadResult.html, documentUrl);

    // The page is stored under its canonical URL when one is declared.
    let finalUrl = documentUrl;
    if (extracted.canonicalUrl) {
      const normalizedCanonical = normalizeURL(extracted.canonicalUrl, documentUrl);
      if (normalizedCanonical) {
        finalUrl = normalizedCanonical;
      }
    }

    // 3. Persist content via the active output strategy (DB or PDF).
    await getStrategy().save(urlId, finalUrl, {
      title: extracted.title,
      description: extracted.description,
      canonicalUrl: extracted.canonicalUrl,
      headings: extracted.headings,
      textContent: extracted.textContent,
      blocks: extracted.blocks,
      images: extracted.images,
    });

    // 4. Process outgoing links. Every link would land at the same depth, so
    //    the MAX_DEPTH check is made once up front.
    const nextDepth = currentDepth + 1;
    if (nextDepth > config.MAX_DEPTH) {
      return;
    }

    // Normalized target URL -> its domain; the Map also de-duplicates targets.
    const targets = new Map<string, string>();

    for (const link of extracted.links) {
      // Relative hrefs are relative to the document that contained them, not
      // to its canonical URL, which may sit on a different path.
      const normalized = normalizeURL(link, documentUrl);
      if (!normalized || targets.has(normalized)) continue;

      // Skip self-referential links.
      if (normalized === finalUrl || normalized === pageUrl) continue;

      const linkDomain = getDomain(normalized);
      if (!linkDomain || !isDomainAllowed(linkDomain)) continue;

      targets.set(normalized, linkDomain);
    }

    for (const [targetUrl, targetDomain] of targets) {
      // Insert target URL (ON CONFLICT DO NOTHING) and get its ID.
      const targetUrlId = await insertURL(targetUrl, targetDomain, nextDepth);

      // Establish link graph relation.
      await insertLink(urlId, targetUrlId);
    }
  } catch (error: unknown) {
    const errorMsg = error instanceof Error ? error.message : String(error);
    await markFailed(urlId, errorMsg);
    throw error;
  }
}
package/tsconfig.json ADDED
@@ -0,0 +1,15 @@
1
+ {
2
+ "compilerOptions": {
3
+ "target": "ES2022",
4
+ "module": "NodeNext",
5
+ "moduleResolution": "NodeNext",
6
+ "esModuleInterop": true,
7
+ "forceConsistentCasingInFileNames": true,
8
+ "strict": true,
9
+ "skipLibCheck": true,
10
+ "outDir": "./dist",
11
+ "rootDir": "./src"
12
+ },
13
+ "include": ["src/**/*"],
14
+ "exclude": ["node_modules", "dist"]
15
+ }