messi-crawler 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +201 -0
- package/dist/cli/renderer.js +71 -0
- package/dist/config.js +18 -0
- package/dist/db/clear.js +16 -0
- package/dist/db/client.js +20 -0
- package/dist/db/queries.js +179 -0
- package/dist/frontier/frontier.js +44 -0
- package/dist/frontier/logger.js +65 -0
- package/dist/frontier/robots.js +46 -0
- package/dist/frontier/scheduler.js +98 -0
- package/dist/index.js +533 -0
- package/dist/normalizer.js +33 -0
- package/dist/output/db-strategy.js +16 -0
- package/dist/output/index.js +23 -0
- package/dist/output/pdf-strategy.js +316 -0
- package/dist/output/strategy.js +1 -0
- package/dist/security/ssrf.js +45 -0
- package/dist/security/validate-url.js +41 -0
- package/dist/seed.js +14 -0
- package/dist/setup.js +148 -0
- package/dist/test/client.test.js +33 -0
- package/dist/test/downloader.test.js +84 -0
- package/dist/test/extractor.test.js +126 -0
- package/dist/test/frontier.test.js +43 -0
- package/dist/test/logger.test.js +55 -0
- package/dist/test/normalizer.test.js +36 -0
- package/dist/test/pdf-strategy.test.js +68 -0
- package/dist/test/queries.test.js +173 -0
- package/dist/test/robots.test.js +46 -0
- package/dist/test/scheduler.test.js +73 -0
- package/dist/test/seed.test.js +26 -0
- package/dist/test/worker.test.js +118 -0
- package/dist/worker/downloader.js +114 -0
- package/dist/worker/extractor.js +197 -0
- package/dist/worker/worker.js +87 -0
- package/package.json +48 -0
- package/seeds.txt +4 -0
- package/src/cli/renderer.ts +83 -0
- package/src/config.ts +22 -0
- package/src/db/clear.ts +16 -0
- package/src/db/client.ts +26 -0
- package/src/db/queries.ts +255 -0
- package/src/db/schema.sql +43 -0
- package/src/frontier/frontier.ts +60 -0
- package/src/frontier/logger.ts +75 -0
- package/src/frontier/robots.ts +50 -0
- package/src/frontier/scheduler.ts +119 -0
- package/src/index.ts +596 -0
- package/src/normalizer.ts +37 -0
- package/src/output/db-strategy.ts +20 -0
- package/src/output/index.ts +32 -0
- package/src/output/pdf-strategy.ts +388 -0
- package/src/output/strategy.ts +16 -0
- package/src/security/ssrf.ts +48 -0
- package/src/security/validate-url.ts +49 -0
- package/src/seed.ts +18 -0
- package/src/setup.ts +170 -0
- package/src/test/client.test.ts +38 -0
- package/src/test/downloader.test.ts +101 -0
- package/src/test/extractor.test.ts +139 -0
- package/src/test/frontier.test.ts +53 -0
- package/src/test/logger.test.ts +71 -0
- package/src/test/normalizer.test.ts +43 -0
- package/src/test/pdf-strategy.test.ts +84 -0
- package/src/test/queries.test.ts +247 -0
- package/src/test/robots.test.ts +56 -0
- package/src/test/scheduler.test.ts +90 -0
- package/src/test/seed.test.ts +35 -0
- package/src/test/worker.test.ts +144 -0
- package/src/worker/downloader.ts +149 -0
- package/src/worker/extractor.ts +235 -0
- package/src/worker/worker.ts +100 -0
- package/tsconfig.json +15 -0
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
import { DatabaseStrategy } from "./db-strategy.js";
|
|
2
|
+
import { PdfStrategy } from "./pdf-strategy.js";
|
|
3
|
+
let activeStrategy = null;
|
|
4
|
+
/**
 * Installs the output strategy that later `getStrategy()` calls hand back.
 *
 * @param {object} strategy - Strategy instance to make active.
 */
export function setStrategy(strategy) {
    activeStrategy = strategy;
}
|
|
7
|
+
/**
 * Returns the active output strategy, creating a `DatabaseStrategy` on the
 * spot when none has been installed yet.
 *
 * @returns {object} The active strategy instance.
 */
export function getStrategy() {
    // Any falsy value counts as "not configured", exactly like a plain `!value` check.
    activeStrategy ||= new DatabaseStrategy();
    return activeStrategy;
}
|
|
14
|
+
/**
 * Builds a fresh strategy instance for the given output mode.
 *
 * @param {string} mode - `"pdf"` selects the PDF strategy; `"database"` and
 *   every other value select the database strategy.
 * @returns {PdfStrategy | DatabaseStrategy} A new, un-initialised strategy.
 */
export function createStrategy(mode) {
    if (mode === "pdf") {
        return new PdfStrategy();
    }
    // "database" and any unrecognised mode share the same fallback.
    return new DatabaseStrategy();
}
|
|
23
|
+
export { DatabaseStrategy, PdfStrategy };
|
|
@@ -0,0 +1,316 @@
|
|
|
1
|
+
import fs from "fs";
|
|
2
|
+
import path from "path";
|
|
3
|
+
import PDFDocumentCtor from "pdfkit";
|
|
4
|
+
import { markDone } from "../db/queries.js";
|
|
5
|
+
import { downloadImage } from "../worker/downloader.js";
|
|
6
|
+
const OUTPUT_DIR = "output";
|
|
7
|
+
const BASE_NAME = "documentation";
|
|
8
|
+
/**
 * Picks the first unused PDF path inside OUTPUT_DIR.
 *
 * Tries `<BASE_NAME>.pdf` first, then `<BASE_NAME>2.pdf`, `<BASE_NAME>3.pdf`, …
 *
 * @returns {string} A path for which no file existed when it was checked.
 */
function resolveOutputPath() {
    const pathFor = (suffix) => path.join(OUTPUT_DIR, `${BASE_NAME}${suffix}.pdf`);
    let candidate = pathFor("");
    // Numbered variants start at 2; the un-numbered name is implicitly "1".
    for (let n = 2; fs.existsSync(candidate); n++) {
        candidate = pathFor(n);
    }
    return candidate;
}
|
|
17
|
+
// ─── Layout constants ─────────────────────────────────────────────────────────
|
|
18
|
+
const MARGIN = 64;
|
|
19
|
+
const FOOTER_HEIGHT = 28; // reserved space at bottom for footer
|
|
20
|
+
const MAX_TEXT_CHARS = 5000;
|
|
21
|
+
// ─── Colour palette ───────────────────────────────────────────────────────────
|
|
22
|
+
const C = {
|
|
23
|
+
title: "#1a1a2e",
|
|
24
|
+
url: "#4361ee",
|
|
25
|
+
desc: "#444444",
|
|
26
|
+
section: "#2d6a4f",
|
|
27
|
+
body: "#222222",
|
|
28
|
+
truncated: "#aaaaaa",
|
|
29
|
+
rule: "#dddddd",
|
|
30
|
+
coverBg: "#1a1a2e",
|
|
31
|
+
coverFg: "#ffffff",
|
|
32
|
+
coverSub: "#a8dadc",
|
|
33
|
+
badge: "#888888",
|
|
34
|
+
footer: "#aaaaaa",
|
|
35
|
+
};
|
|
36
|
+
// ─── Helpers ──────────────────────────────────────────────────────────────────
|
|
37
|
+
/**
 * Strokes a thin horizontal line between the left and right margins at the
 * cursor's current Y, then moves the cursor down by `gap` points.
 * The cursor is assigned an absolute Y, so the advance does not depend on
 * whichever font metrics happen to be active.
 *
 * @param {object} doc - Document being drawn on.
 * @param {string} [color=C.rule] - Stroke colour.
 * @param {number} [gap=10] - Points to advance the cursor after the line.
 */
function rule(doc, color = C.rule, gap = 10) {
    const lineY = doc.y;
    const segment = doc
        .moveTo(MARGIN, lineY)
        .lineTo(doc.page.width - MARGIN, lineY);
    segment.strokeColor(color).lineWidth(0.5).stroke();
    // Exact points, not line-height multiples.
    doc.y = lineY + gap;
}
|
|
51
|
+
/**
 * Flattens scraped text onto a single line: every run of whitespace
 * (spaces, tabs, newlines) becomes one space and both ends are trimmed.
 * Stray newlines in the extractor's output would otherwise render as large
 * blank gaps in the paragraph.
 *
 * @param {string} raw - Text as extracted from the page.
 * @returns {string} The whitespace-normalised text.
 */
function normaliseText(raw) {
    return raw.trim().split(/\s+/).join(" ");
}
|
|
61
|
+
/**
 * Returns the Y coordinate below which chapter content must not be drawn on
 * the current page: the page height minus the bottom margin and the space
 * reserved for the footer.
 *
 * @param {object} doc - Document whose current page is measured.
 * @returns {number} Y coordinate of the content boundary, in points.
 */
function contentBottom(doc) {
    const { height } = doc.page;
    return height - MARGIN - FOOTER_HEIGHT;
}
|
|
68
|
+
// ─── Strategy ────────────────────────────────────────────────────────────────
|
|
69
|
+
/** Project link shown on the cover page, in every running header and in the PDF metadata. */
const REPO_URL = "https://github.com/lightning4747/Web-crawler-cli";

/**
 * Output strategy that compiles crawled pages into a single PDF "book":
 * one cover page followed by one chapter per saved URL.
 *
 * Lifecycle: `init()` once, `save()` for each crawled page, `finish()` once.
 *
 * NOTE(review): `save()` awaits image downloads in the middle of rendering a
 * chapter. If several workers call `save()` concurrently on one instance,
 * their drawing calls could interleave on the shared document — confirm that
 * callers serialise `save()`.
 */
export class PdfStrategy {
    /** PDF document being built; created in init(). */
    doc;
    /** File write stream the document is piped into. */
    stream;
    /** Number of chapters written so far (incremented once per save()). */
    pageCount = 0;
    /** Path of the PDF being written, chosen by resolveOutputPath(). */
    pdfPath;

    /**
     * Creates the output directory if needed, opens the PDF file and draws
     * the cover page.
     */
    async init() {
        if (!fs.existsSync(OUTPUT_DIR))
            fs.mkdirSync(OUTPUT_DIR, { recursive: true });
        this.pdfPath = resolveOutputPath();
        this.doc = new PDFDocumentCtor({
            autoFirstPage: false,
            // Pages stay buffered so finish() can revisit them to add headers/footers.
            bufferPages: true,
            // Explicit margins so pdfkit never auto-paginates into blank pages
            // due to cursor running past the bottom margin.
            margins: { top: MARGIN, bottom: MARGIN + FOOTER_HEIGHT, left: MARGIN, right: MARGIN },
            info: {
                Title: `Checkout the repo ${REPO_URL}`,
                Author: "Web Crawler",
                Subject: "Compiled documentation from crawled pages",
            },
        });
        this.stream = fs.createWriteStream(this.pdfPath);
        this.doc.pipe(this.stream);
        this.renderCover();
        console.log(`[PDF] Output file: ${this.pdfPath}`);
    }

    // ── Cover page ──────────────────────────────────────────────────────────────
    /** Adds the cover page: full-bleed background, two title lines, repo link and date. */
    renderCover() {
        const doc = this.doc;
        doc.addPage();
        doc.rect(0, 0, doc.page.width, doc.page.height).fill(C.coverBg);
        const midY = doc.page.height / 2 - 60;
        doc.fontSize(38).font("Helvetica-Bold").fillColor(C.coverFg)
            .text("Web Crawler", MARGIN, midY, { align: "center" });
        // Advance by exact points to avoid font-size-based gaps
        doc.y += 8;
        doc.fontSize(32).font("Helvetica-Bold").fillColor(C.coverSub)
            .text("Documentation Book", { align: "center" });
        doc.y += 28;
        doc.fontSize(11).font("Helvetica").fillColor(C.coverFg)
            .text(REPO_URL, { align: "center", link: REPO_URL });
        doc.y += 8;
        doc.fontSize(10).font("Helvetica").fillColor(C.coverSub)
            .text(new Date().toUTCString(), { align: "center" });
    }

    // ── Chapter page ────────────────────────────────────────────────────────────
    /**
     * Marks the URL as done in the database, then renders it as a new chapter.
     *
     * @param {*} urlId - Identifier passed through to markDone().
     * @param {string} url - Source URL; printed as a link and used as the
     *   title when `content.title` is null/undefined.
     * @param {object} content - Extracted page content. Fields read here:
     *   `title`, `description`, `headings.{h1,h2,h3}`, `blocks`, `textContent`.
     */
    async save(urlId, url, content) {
        await markDone(urlId, content);
        const doc = this.doc;
        this.pageCount++;
        doc.addPage();
        const W = doc.page.width - MARGIN * 2; // usable text width
        const limit = contentBottom(doc); // y-coordinate of content boundary

        // ── Chapter badge (top-right, absolute position) ─────────────────────────
        doc.fontSize(8).font("Helvetica").fillColor(C.badge)
            .text(`CHAPTER ${this.pageCount}`, MARGIN, MARGIN, { width: W, align: "right" });
        // Place cursor just below the badge — use exact points, not moveDown
        doc.y = MARGIN + 14;

        // ── Title ─────────────────────────────────────────────────────────────────
        doc.fontSize(22).font("Helvetica-Bold").fillColor(C.title)
            .text(content.title ?? url, { width: W, lineGap: 2 });
        doc.y += 10;
        rule(doc, "#4361ee", 10);

        // ── Source URL ────────────────────────────────────────────────────────────
        doc.fontSize(8.5).font("Helvetica-Oblique").fillColor(C.url)
            .text(url, { width: W, link: url, underline: true, lineGap: 1 });
        doc.y += 12;

        // ── Description ───────────────────────────────────────────────────────────
        if (content.description && doc.y < limit) {
            doc.fontSize(11).font("Helvetica-Oblique").fillColor(C.desc)
                .text(content.description.trim(), { width: W, lineGap: 2, align: "left" });
            doc.y += 12;
        }

        // ── Headings summary ──────────────────────────────────────────────────────
        // A missing `headings` object (or a missing level) is treated as "no headings",
        // mirroring how `blocks` is guarded below.
        const { h1 = [], h2 = [], h3 = [] } = content.headings ?? {};
        const allHeadings = [...h1, ...h2, ...h3].slice(0, 12);
        if (allHeadings.length > 0 && doc.y < limit) {
            doc.fontSize(9).font("Helvetica-Bold").fillColor(C.section)
                .text("CONTENTS OVERVIEW", { width: W, characterSpacing: 1 });
            doc.y += 6;
            for (const h of allHeadings) {
                if (doc.y >= limit)
                    break;
                const bulletX = MARGIN;
                const textX = MARGIN + 14;
                const y = doc.y;
                // Bullet dot — drawn absolutely, no cursor movement
                doc.circle(bulletX + 3, y + 5, 2).fill(C.section);
                doc.fontSize(10).font("Helvetica").fillColor(C.body)
                    .text(h, textX, y, { width: W - 14, lineGap: 2 });
                // Advance by 4pt padding between bullet items
                doc.y += 4;
            }
            doc.y += 8;
            if (doc.y < limit)
                rule(doc, C.rule, 10);
        }

        // ── Page Content ──────────────────────────────────────────────────────────
        if (content.blocks && content.blocks.length > 0) {
            doc.fontSize(9).font("Helvetica-Bold").fillColor(C.section)
                .text("PAGE CONTENT", { width: W, characterSpacing: 1 });
            doc.y += 8;
            for (const block of content.blocks) {
                if (doc.y >= limit) {
                    doc.addPage();
                }
                if (block.type === "heading" && block.text) {
                    const headingSize = block.level === 1 ? 16 : block.level === 2 ? 14 : 12;
                    const headingOptions = { width: W, lineGap: 2 };
                    // Select the heading font BEFORE measuring; otherwise the height is
                    // computed with whatever font the previous block left active.
                    doc.fontSize(headingSize).font("Helvetica-Bold");
                    const headingHeight = doc.heightOfString(block.text, headingOptions);
                    // Keep 40pt of room under a heading so it is not stranded at the page bottom.
                    if (doc.y + headingHeight + 40 > limit) {
                        doc.addPage();
                    }
                    doc.fontSize(headingSize).font("Helvetica-Bold").fillColor(C.title)
                        .text(block.text, headingOptions);
                    doc.y += 6;
                }
                else if (block.type === "paragraph" && block.text) {
                    const text = block.text.trim();
                    if (!text)
                        continue;
                    if (doc.y + 20 > limit) {
                        doc.addPage();
                    }
                    doc.fontSize(10).font("Helvetica").fillColor(C.body)
                        .text(text, { width: W, lineGap: 3 });
                    doc.y += 8;
                }
                else if (block.type === "list" && block.items && block.items.length > 0) {
                    if (doc.y + 20 > limit) {
                        doc.addPage();
                    }
                    for (const item of block.items) {
                        const itemText = item.trim();
                        if (!itemText)
                            continue;
                        const bulletX = MARGIN + 10;
                        const textX = MARGIN + 22;
                        const itemOptions = { width: W - 22, lineGap: 2 };
                        // Measure with the list font, not the previous block's font.
                        doc.fontSize(9.5).font("Helvetica");
                        const itemHeight = doc.heightOfString(itemText, itemOptions);
                        if (doc.y + itemHeight > limit) {
                            doc.addPage();
                        }
                        const y = doc.y;
                        doc.circle(bulletX + 3, y + 5, 2).fill(C.body);
                        doc.fontSize(9.5).font("Helvetica").fillColor(C.body)
                            .text(itemText, textX, y, itemOptions);
                        doc.y += 4;
                    }
                    doc.y += 4;
                }
                else if (block.type === "image" && block.src) {
                    try {
                        const imageBuffer = await downloadImage(block.src);
                        const maxImageHeight = 200;
                        if (doc.y + maxImageHeight + 20 > limit) {
                            doc.addPage();
                        }
                        const imageTop = doc.y;
                        doc.image(imageBuffer, {
                            fit: [W, maxImageHeight],
                            align: "center",
                        });
                        // An image placed in the text flow already moves the cursor below
                        // the drawn image, so adding the full box height again would leave
                        // up to 200pt of blank space. Only reserve the box ourselves if the
                        // cursor did not move.
                        if (doc.y === imageTop) {
                            doc.y = imageTop + maxImageHeight;
                        }
                        doc.y += 10;
                        if (block.alt) {
                            doc.fontSize(8.5).font("Helvetica-Oblique").fillColor(C.desc)
                                .text(block.alt, { width: W, align: "center" });
                            doc.y += 8;
                        }
                    }
                    catch {
                        // Download or decode failed: draw a placeholder box instead of
                        // failing the whole chapter.
                        const fallbackText = `[Image: ${block.alt || "No description available"} (${block.src})]`;
                        const boxHeight = 40;
                        if (doc.y + boxHeight > limit) {
                            doc.addPage();
                        }
                        const currentY = doc.y;
                        doc.rect(MARGIN, currentY, W, boxHeight)
                            .strokeColor(C.rule)
                            .lineWidth(0.5)
                            .stroke();
                        doc.fontSize(9).font("Helvetica-Oblique").fillColor(C.truncated)
                            .text(fallbackText, MARGIN + 10, currentY + 14, { width: W - 20, align: "center" });
                        doc.y = currentY + boxHeight + 10;
                    }
                }
            }
        }
        else if (content.textContent && doc.y < limit) {
            // No structured blocks: fall back to the flattened page text, capped at
            // MAX_TEXT_CHARS characters.
            doc.fontSize(9).font("Helvetica-Bold").fillColor(C.section)
                .text("PAGE CONTENT", { width: W, characterSpacing: 1 });
            doc.y += 8;
            const raw = normaliseText(content.textContent);
            const body = raw.slice(0, MAX_TEXT_CHARS);
            const truncated = raw.length > MAX_TEXT_CHARS;
            doc.fontSize(10.5).font("Helvetica").fillColor(C.body)
                .text(body, {
                width: W,
                lineGap: 3,
                align: "left",
            });
            if (truncated && doc.y < limit) {
                doc.y += 8;
                doc.fontSize(8.5).font("Helvetica-Oblique").fillColor(C.truncated)
                    .text("[ content truncated for brevity ]", { width: W, align: "center" });
            }
        }
    }

    /**
     * Stamps a running header and a "Page i of N" footer on every buffered
     * page except the first (the cover added by init()), ends the document
     * and resolves once the file stream has finished writing.
     *
     * @returns {Promise<void>} Rejects if the write stream emits "error".
     */
    async finish() {
        const doc = this.doc;
        const totalPages = doc.bufferedPageRange().count;
        for (let i = 1; i < totalPages; i++) {
            doc.switchToPage(i);
            const W = doc.page.width - MARGIN * 2;
            const footerY = doc.page.height - MARGIN - FOOTER_HEIGHT + 8;
            // Draw running header
            doc.fontSize(8).font("Helvetica").fillColor(C.badge)
                .text(REPO_URL, MARGIN, MARGIN - 24, { width: W, align: "left", link: REPO_URL });
            doc
                .moveTo(MARGIN, MARGIN - 14)
                .lineTo(doc.page.width - MARGIN, MARGIN - 14)
                .strokeColor(C.rule)
                .lineWidth(0.4)
                .stroke();
            // Draw running footer
            doc
                .moveTo(MARGIN, footerY)
                .lineTo(doc.page.width - MARGIN, footerY)
                .strokeColor(C.rule)
                .lineWidth(0.4)
                .stroke();
            // The page number is drawn at footerY + 5, which lies below the bottom
            // margin configured in init() (MARGIN + FOOTER_HEIGHT). Drop the bottom
            // margin while drawing it so the text wrapper cannot treat the line as
            // overflow and continue it on a new page.
            const margins = doc.page.margins;
            const savedBottom = margins?.bottom;
            if (margins)
                margins.bottom = 0;
            try {
                doc.fontSize(8).font("Helvetica").fillColor(C.footer)
                    .text(`Page ${i} of ${totalPages - 1}`, MARGIN, footerY + 5, {
                    width: W,
                    align: "center",
                    lineBreak: false,
                });
            }
            finally {
                if (margins)
                    margins.bottom = savedBottom;
            }
        }
        await new Promise((resolve, reject) => {
            // Listeners are attached before end() so neither event can be missed.
            this.stream.on("finish", resolve);
            this.stream.on("error", reject);
            this.doc.end();
        });
        console.log(`[PDF] Done — ${this.pageCount} chapter(s) written to ${this.pdfPath}`);
    }
}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export {};
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
import { lookup } from "node:dns/promises";
import { BlockList, isIP } from "node:net";
|
|
2
|
+
/**
 * IP ranges that must never be requested.
 * Covers loopback, private, link-local (cloud metadata), shared address
 * space and the unspecified address, for both IPv4 and IPv6.
 *
 * Held in a `net.BlockList` so membership is decided on the parsed address
 * and full CIDR prefixes, not on how the address happens to be spelled.
 */
const BLOCKED_IP_RANGES = new BlockList();
for (const [network, prefix, family] of [
    ["0.0.0.0", 8, "ipv4"], // current network
    ["10.0.0.0", 8, "ipv4"], // private Class A
    ["100.64.0.0", 10, "ipv4"], // shared address space (RFC 6598)
    ["127.0.0.0", 8, "ipv4"], // loopback
    ["169.254.0.0", 16, "ipv4"], // link-local / cloud IMDS (AWS, GCP, Azure)
    ["172.16.0.0", 12, "ipv4"], // private Class B
    ["192.168.0.0", 16, "ipv4"], // private Class C
    ["::", 128, "ipv6"], // IPv6 unspecified
    ["::1", 128, "ipv6"], // IPv6 loopback
    ["fc00::", 7, "ipv6"], // IPv6 unique local (fc00–fdff)
    ["fe80::", 10, "ipv6"], // IPv6 link-local (fe80–febf)
]) {
    BLOCKED_IP_RANGES.addSubnet(network, prefix, family);
}
/**
 * Hostnames that are blocked regardless of DNS resolution.
 */
const BLOCKED_HOSTNAMES = new Set([
    "localhost",
    "metadata.google.internal",
    "169.254.169.254",
]);
/** Matches a canonical IPv4-mapped IPv6 address, capturing the two 16-bit halves of the IPv4 part. */
const IPV4_MAPPED = /^::ffff:([0-9a-f]{1,4}):([0-9a-f]{1,4})$/;
/**
 * Returns the canonical (compressed, lower-case, all-hex) spelling of an IPv6
 * literal, or null when the URL parser rejects it (e.g. zone identifiers).
 */
function canonicalIpv6(address) {
    try {
        return new URL(`http://[${address}]/`).hostname.slice(1, -1);
    }
    catch {
        return null;
    }
}
/**
 * Returns true when `address` is an IP literal inside a blocked range.
 * Anything that is not a parseable IP literal is treated as blocked.
 */
function isBlockedIp(address) {
    switch (isIP(address)) {
        case 4:
            return BLOCKED_IP_RANGES.check(address, "ipv4");
        case 6: {
            const canonical = canonicalIpv6(address);
            if (canonical === null)
                return true;
            // ::ffff:a.b.c.d reaches the same host as a.b.c.d, so judge it by the IPv4 rules.
            const mapped = IPV4_MAPPED.exec(canonical);
            if (mapped) {
                const hi = Number.parseInt(mapped[1], 16);
                const lo = Number.parseInt(mapped[2], 16);
                return BLOCKED_IP_RANGES.check(`${hi >> 8}.${hi & 0xff}.${lo >> 8}.${lo & 0xff}`, "ipv4");
            }
            return BLOCKED_IP_RANGES.check(canonical, "ipv6");
        }
        default:
            return true;
    }
}
/**
 * Returns true if the hostname is, or resolves to, a private/internal address.
 * Fails closed — if DNS lookup throws or yields nothing, the address is
 * considered blocked.
 *
 * Range checks are applied only to real IP literals and to resolved
 * addresses, so a public name such as `10.example.com` is judged by what it
 * resolves to rather than by its spelling. Every address returned by DNS is
 * checked; one blocked answer blocks the hostname.
 *
 * NOTE(review): this validates the resolution made here. If the caller
 * resolves the name again when fetching, the answer could differ (DNS
 * rebinding) — confirm whether the downloader pins the checked address.
 *
 * @param {string} hostname - Host as found in a URL; may be a name, an IPv4
 *   literal, or an IPv6 literal with or without surrounding brackets.
 * @returns {Promise<boolean>} true when the request must not be made.
 */
export async function isBlockedAddress(hostname) {
    let host = hostname.toLowerCase();
    // URL.hostname wraps IPv6 literals in brackets; DNS names may carry a root dot.
    if (host.startsWith("[") && host.endsWith("]"))
        host = host.slice(1, -1);
    if (host.endsWith("."))
        host = host.slice(0, -1);
    if (host === "" || BLOCKED_HOSTNAMES.has(host))
        return true;
    // IP literals are decided directly, without a DNS lookup.
    if (isIP(host) !== 0)
        return isBlockedIp(host);
    try {
        const records = await lookup(host, { all: true });
        return records.length === 0 || records.some(({ address }) => isBlockedIp(address));
    }
    catch {
        // DNS resolution failed — fail closed
        return true;
    }
}
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
/**
 * Checks that `raw` parses as an absolute URL using the http or https scheme.
 *
 * @param {string} raw - Candidate seed URL.
 * @returns {{ url: URL, error: null } | { url: null, error: string }}
 *   The parsed URL on success, otherwise a human-readable reason.
 */
export function validateSeedUrl(raw) {
    const reject = (error) => ({ url: null, error });
    let candidate;
    try {
        candidate = new URL(raw);
    }
    catch {
        return reject(`"${raw}" is not a valid URL`);
    }
    const scheme = candidate.protocol;
    if (!["http:", "https:"].includes(scheme)) {
        return reject(`"${raw}" uses scheme "${scheme.replace(":", "")}" — only http and https are allowed`);
    }
    if (!candidate.hostname) {
        return reject(`"${raw}" has no hostname`);
    }
    return { url: candidate, error: null };
}
|
|
24
|
+
/**
 * Splits a list of URL strings into accepted and rejected entries.
 *
 * @param {Iterable<string>} raws - Candidate seed URLs.
 * @returns {{ valid: string[], invalid: { input: string, reason: string }[] }}
 *   `valid` holds the original strings that passed validateSeedUrl();
 *   `invalid` pairs each rejected string with the reason it was rejected.
 */
export function validateSeedUrls(raws) {
    const valid = [];
    const invalid = [];
    for (const input of raws) {
        const { error } = validateSeedUrl(input);
        if (!error) {
            valid.push(input);
            continue;
        }
        invalid.push({ input, reason: error });
    }
    return { valid, invalid };
}
|
package/dist/seed.js
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
import { config } from "./config.js";
|
|
2
|
+
import { query } from "./db/client.js";
|
|
3
|
+
import { getDomain } from "./normalizer.js";
|
|
4
|
+
/**
 * Inserts every configured seed URL into the `urls` table as a PENDING row
 * at depth 0. URLs already present are left untouched, and URLs for which
 * getDomain() returns nothing are skipped. Rows are inserted one at a time,
 * in configuration order.
 */
export async function seedDatabase() {
    for (const seedUrl of config.SEED_URLS) {
        const seedDomain = getDomain(seedUrl);
        if (seedDomain) {
            await query(`INSERT INTO urls (url, domain, status, depth)
             VALUES ($1, $2, 'PENDING', 0)
             ON CONFLICT (url) DO NOTHING`, [seedUrl, seedDomain]);
        }
    }
}
|
package/dist/setup.js
ADDED
|
@@ -0,0 +1,148 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Standalone CLI configuration wizard.
|
|
3
|
+
*
|
|
4
|
+
* Usage: npm run config
|
|
5
|
+
*
|
|
6
|
+
* Steps:
|
|
7
|
+
* 1. Reads seed URLs from seeds.txt
|
|
8
|
+
* 2. Prompts for crawler performance settings and output mode
|
|
9
|
+
* 3. Writes updated values to .env
|
|
10
|
+
* 4. Patches SEED_URLS and ALLOWED_DOMAINS in src/config.ts
|
|
11
|
+
*/
|
|
12
|
+
import { select, number, confirm } from "@inquirer/prompts";
|
|
13
|
+
import fs from "fs";
|
|
14
|
+
import path from "path";
|
|
15
|
+
import { validateSeedUrls } from "./security/validate-url.js";
|
|
16
|
+
const SEEDS_FILE = "seeds.txt";
|
|
17
|
+
const ENV_FILE = ".env";
|
|
18
|
+
const CONFIG_FILE = path.join("src", "config.ts");
|
|
19
|
+
// ─── Helpers ─────────────────────────────────────────────────────────────────
|
|
20
|
+
/**
 * Loads seed URLs from SEEDS_FILE.
 *
 * Blank lines and lines starting with `#` are ignored. Entries rejected by
 * validateSeedUrls() are reported with console.warn and left out.
 *
 * @returns {string[]} The valid seed URLs; empty when the file is missing.
 */
function readSeedsFile() {
    if (!fs.existsSync(SEEDS_FILE)) {
        console.warn(`[setup] ${SEEDS_FILE} not found. No seed URLs will be loaded.`);
        return [];
    }
    const candidates = [];
    for (const line of fs.readFileSync(SEEDS_FILE, "utf-8").split("\n")) {
        const entry = line.trim();
        if (entry.length > 0 && !entry.startsWith("#")) {
            candidates.push(entry);
        }
    }
    const { valid, invalid } = validateSeedUrls(candidates);
    if (invalid.length > 0) {
        console.warn(`[setup] Skipping ${invalid.length} invalid URL(s) in ${SEEDS_FILE}:`);
        invalid.forEach(({ reason }) => {
            console.warn(` ✗ ${reason}`);
        });
    }
    return valid;
}
|
|
39
|
+
/**
 * Collects the distinct hostnames of the given URLs, in first-seen order.
 *
 * @param {Iterable<string>} urls - URL strings.
 * @returns {string[]} Unique hostnames (as normalised by the URL parser).
 */
function extractDomains(urls) {
    // A Set keeps insertion order and makes the duplicate check O(1) per URL.
    const hostnames = new Set();
    for (const url of urls) {
        try {
            const { hostname } = new URL(url);
            if (hostname) {
                hostnames.add(hostname);
            }
        }
        catch {
            // Deliberately skipped: an entry that does not parse as a URL simply
            // contributes no domain.
        }
    }
    return [...hostnames];
}
|
|
50
|
+
/**
 * Merges `values` into the env file and writes it back.
 *
 * Existing assignments keep their position and receive the new value when
 * their key is in `values`; keys not yet present are appended. Comments,
 * blank lines and any other non-assignment lines are preserved verbatim, so
 * running the wizard does not strip the user's annotations. When a key is
 * assigned more than once, a single line is kept at the first position with
 * the last value (or the value from `values`).
 *
 * @param {Record<string, string>} values - Keys to set.
 * @param {string} [envPath=ENV_FILE] - File to update; created if missing.
 */
function writeEnvFile(values, envPath = ENV_FILE) {
    const updates = new Map(Object.entries(values));
    const sourceLines = fs.existsSync(envPath)
        ? fs.readFileSync(envPath, "utf-8").split(/\r?\n/)
        : [];
    // The file's final newline yields trailing empty entries; drop them so the
    // single newline added on write does not accumulate blank lines.
    while (sourceLines.length > 0 && sourceLines.at(-1).trim() === "") {
        sourceLines.pop();
    }
    // Returns { key, value } for a KEY=VALUE line, null for anything else.
    const parseAssignment = (line) => {
        const trimmed = line.trim();
        const eqIdx = trimmed.indexOf("=");
        if (!trimmed || trimmed.startsWith("#") || eqIdx === -1) {
            return null;
        }
        return { key: trimmed.slice(0, eqIdx).trim(), value: trimmed.slice(eqIdx + 1).trim() };
    };
    // Pass 1: effective value per key — later duplicates win, then `values` override.
    const effective = new Map();
    for (const line of sourceLines) {
        const assignment = parseAssignment(line);
        if (assignment) {
            effective.set(assignment.key, assignment.value);
        }
    }
    for (const [key, value] of updates) {
        effective.set(key, value);
    }
    // Pass 2: re-emit the file in its original order.
    const written = new Set();
    const output = [];
    for (const line of sourceLines) {
        const assignment = parseAssignment(line);
        if (!assignment) {
            output.push(line);
            continue;
        }
        if (written.has(assignment.key)) {
            continue;
        }
        written.add(assignment.key);
        output.push(`${assignment.key}=${effective.get(assignment.key)}`);
    }
    for (const [key, value] of updates) {
        if (!written.has(key)) {
            output.push(`${key}=${value}`);
        }
    }
    fs.writeFileSync(envPath, `${output.join("\n")}\n`, "utf-8");
    console.log(`[setup] .env updated.`);
}
|
|
69
|
+
/**
 * Rewrites the `SEED_URLS`, `ALLOWED_DOMAINS` and `OUTPUT_MODE` entries of
 * the config source file in place. `OUTPUT_MODE` is inserted before the
 * first closing `};` when the file does not have it yet. Does nothing
 * (beyond a warning) when the file is missing.
 *
 * Values are emitted with JSON.stringify so quotes and backslashes cannot
 * break the generated source, and replacements are supplied as functions so
 * `$&`, `$1`, `$$` … inside a URL are written literally instead of being
 * interpreted as replacement patterns.
 *
 * NOTE(review): the array patterns stop at the first `]`, so an existing
 * array whose string values contain `]` would be matched only partially.
 *
 * @param {string[]} seedUrls - New SEED_URLS contents.
 * @param {string[]} allowedDomains - New ALLOWED_DOMAINS contents.
 * @param {string} outputMode - New OUTPUT_MODE value.
 * @param {string} [configPath=CONFIG_FILE] - File to patch.
 */
function patchConfigTs(seedUrls, allowedDomains, outputMode, configPath = CONFIG_FILE) {
    if (!fs.existsSync(configPath)) {
        console.warn(`[setup] ${configPath} not found — skipping patch.`);
        return;
    }
    // An empty list must become `[]`; a lone trailing comma would create a
    // one-element sparse array.
    const toArrayLiteral = (items) => (items.length === 0
        ? "[]"
        : `[\n${items.map((item) => ` ${JSON.stringify(item)}`).join(",\n")},\n ]`);
    const modeLiteral = JSON.stringify(outputMode);
    let src = fs.readFileSync(configPath, "utf-8");
    src = src.replace(/SEED_URLS:\s*\[[\s\S]*?\]/, () => `SEED_URLS: ${toArrayLiteral(seedUrls)}`);
    src = src.replace(/ALLOWED_DOMAINS:\s*\[[\s\S]*?\]/, () => `ALLOWED_DOMAINS: ${toArrayLiteral(allowedDomains)}`);
    if (/OUTPUT_MODE:/.test(src)) {
        src = src.replace(/OUTPUT_MODE:\s*["'][^"']*["']/, () => `OUTPUT_MODE: ${modeLiteral}`);
    }
    else {
        src = src.replace(/(\n};)/, () => `\n OUTPUT_MODE: ${modeLiteral},\n};`);
    }
    fs.writeFileSync(configPath, src, "utf-8");
    console.log(`[setup] src/config.ts patched.`);
}
|
|
88
|
+
// ─── Main wizard ──────────────────────────────────────────────────────────────
|
|
89
|
+
/**
 * Runs the interactive wizard: shows the seed URLs read from the seeds file,
 * prompts for the output mode and the four numeric crawler settings, and —
 * once confirmed — writes them to the env file and patches the config source.
 * Declining the confirmation exits the process with status 0.
 */
async function main() {
    const banner = [
        "\n╔══════════════════════════════════════════╗",
        "║ Web Crawler — Interactive Setup Wizard ║",
        "╚══════════════════════════════════════════╝\n",
    ];
    for (const bannerLine of banner) {
        console.log(bannerLine);
    }
    const seedUrls = readSeedsFile();
    if (seedUrls.length > 0) {
        console.log(`[setup] Found ${seedUrls.length} seed URL(s):`);
        for (const seedUrl of seedUrls) {
            console.log(` ${seedUrl}`);
        }
        console.log();
    }
    else {
        console.warn(`[setup] No URLs found in ${SEEDS_FILE}. Add target URLs and re-run.\n`);
    }
    // Builds a prompt validator that accepts any defined number >= `min`.
    const atLeast = (min, complaint) => (v) => (v !== undefined && v >= min ? true : complaint);
    const nonNegative = atLeast(0, "Must be 0 or greater");
    const outputMode = await select({
        message: "OUTPUT_MODE — where should crawled data be stored?",
        choices: [
            { name: "PostgreSQL database (structured data)", value: "database" },
            { name: "PDF eBook (compiled document)", value: "pdf" },
        ],
        default: "database",
    });
    const maxDepth = await number({
        message: "MAX_DEPTH — link hops from seed URLs:",
        default: 3,
        validate: nonNegative,
    });
    const crawlDelayMs = await number({
        message: "CRAWL_DELAY_MS — politeness delay per domain (ms):",
        default: 1000,
        validate: nonNegative,
    });
    const workerCount = await number({
        message: "WORKER_COUNT — concurrent workers:",
        default: 5,
        validate: atLeast(1, "Must be at least 1"),
    });
    const maxPages = await number({
        message: "MAX_PAGES — page limit (0 = unlimited):",
        default: 1000,
        validate: nonNegative,
    });
    const confirmed = await confirm({ message: "Save these settings?", default: true });
    if (!confirmed) {
        console.log("\nAborted.\n");
        process.exit(0);
    }
    writeEnvFile({
        MAX_DEPTH: String(maxDepth),
        CRAWL_DELAY_MS: String(crawlDelayMs),
        WORKER_COUNT: String(workerCount),
        MAX_PAGES: String(maxPages),
        OUTPUT_MODE: outputMode,
    });
    patchConfigTs(seedUrls, extractDomains(seedUrls), outputMode);
    console.log("\n✓ Configuration saved. Run npm run crawl to start.\n");
}
// Entry point: any rejection from the wizard is reported and ends the process with status 1.
main().catch((fatal) => {
    console.error("[setup] Fatal error:", fatal);
    process.exit(1);
});
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
import { describe, it, expect, vi } from "vitest";
|
|
2
|
+
// Mock pg module before importing client.
// The stub Pool's `query` resolves to `{ rows: [] }` and its `end` resolves to
// undefined. Both mocks are created once inside the factory, so every PoolMock
// instance shares the same two vi.fn() objects.
vi.mock("pg", () => {
    const queryMock = vi.fn().mockResolvedValue({ rows: [] });
    const endMock = vi.fn().mockResolvedValue(undefined);
    class PoolMock {
        query = queryMock;
        end = endMock;
    }
    // Pool is exposed both on the default export and as a named export, so
    // either import style resolves to the stub.
    return {
        default: {
            Pool: PoolMock,
        },
        Pool: PoolMock,
    };
});
|
|
17
|
+
import { pool, query, closePool } from "../db/client.js";
|
|
18
|
+
// Exercises ../db/client.js against the mocked `pg` Pool.
describe("Database Client", () => {
    // Smoke check: the three public exports exist.
    it("should expose pool and query function", () => {
        for (const exported of [pool, query, closePool]) {
            expect(exported).toBeDefined();
        }
    });
    // query() must forward its SQL (and absent params) to pool.query and return its result.
    it("should delegate query call to pool", async () => {
        const result = await query("SELECT 1");
        expect(result).toEqual({ rows: [] });
        expect(pool.query).toHaveBeenCalledWith("SELECT 1", undefined);
    });
    // closePool() must end the underlying pool.
    it("should call end on pool when closing", async () => {
        await closePool();
        expect(pool.end).toHaveBeenCalled();
    });
});
|