messi-crawler 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73):
  1. package/README.md +201 -0
  2. package/dist/cli/renderer.js +71 -0
  3. package/dist/config.js +18 -0
  4. package/dist/db/clear.js +16 -0
  5. package/dist/db/client.js +20 -0
  6. package/dist/db/queries.js +179 -0
  7. package/dist/frontier/frontier.js +44 -0
  8. package/dist/frontier/logger.js +65 -0
  9. package/dist/frontier/robots.js +46 -0
  10. package/dist/frontier/scheduler.js +98 -0
  11. package/dist/index.js +533 -0
  12. package/dist/normalizer.js +33 -0
  13. package/dist/output/db-strategy.js +16 -0
  14. package/dist/output/index.js +23 -0
  15. package/dist/output/pdf-strategy.js +316 -0
  16. package/dist/output/strategy.js +1 -0
  17. package/dist/security/ssrf.js +45 -0
  18. package/dist/security/validate-url.js +41 -0
  19. package/dist/seed.js +14 -0
  20. package/dist/setup.js +148 -0
  21. package/dist/test/client.test.js +33 -0
  22. package/dist/test/downloader.test.js +84 -0
  23. package/dist/test/extractor.test.js +126 -0
  24. package/dist/test/frontier.test.js +43 -0
  25. package/dist/test/logger.test.js +55 -0
  26. package/dist/test/normalizer.test.js +36 -0
  27. package/dist/test/pdf-strategy.test.js +68 -0
  28. package/dist/test/queries.test.js +173 -0
  29. package/dist/test/robots.test.js +46 -0
  30. package/dist/test/scheduler.test.js +73 -0
  31. package/dist/test/seed.test.js +26 -0
  32. package/dist/test/worker.test.js +118 -0
  33. package/dist/worker/downloader.js +114 -0
  34. package/dist/worker/extractor.js +197 -0
  35. package/dist/worker/worker.js +87 -0
  36. package/package.json +48 -0
  37. package/seeds.txt +4 -0
  38. package/src/cli/renderer.ts +83 -0
  39. package/src/config.ts +22 -0
  40. package/src/db/clear.ts +16 -0
  41. package/src/db/client.ts +26 -0
  42. package/src/db/queries.ts +255 -0
  43. package/src/db/schema.sql +43 -0
  44. package/src/frontier/frontier.ts +60 -0
  45. package/src/frontier/logger.ts +75 -0
  46. package/src/frontier/robots.ts +50 -0
  47. package/src/frontier/scheduler.ts +119 -0
  48. package/src/index.ts +596 -0
  49. package/src/normalizer.ts +37 -0
  50. package/src/output/db-strategy.ts +20 -0
  51. package/src/output/index.ts +32 -0
  52. package/src/output/pdf-strategy.ts +388 -0
  53. package/src/output/strategy.ts +16 -0
  54. package/src/security/ssrf.ts +48 -0
  55. package/src/security/validate-url.ts +49 -0
  56. package/src/seed.ts +18 -0
  57. package/src/setup.ts +170 -0
  58. package/src/test/client.test.ts +38 -0
  59. package/src/test/downloader.test.ts +101 -0
  60. package/src/test/extractor.test.ts +139 -0
  61. package/src/test/frontier.test.ts +53 -0
  62. package/src/test/logger.test.ts +71 -0
  63. package/src/test/normalizer.test.ts +43 -0
  64. package/src/test/pdf-strategy.test.ts +84 -0
  65. package/src/test/queries.test.ts +247 -0
  66. package/src/test/robots.test.ts +56 -0
  67. package/src/test/scheduler.test.ts +90 -0
  68. package/src/test/seed.test.ts +35 -0
  69. package/src/test/worker.test.ts +144 -0
  70. package/src/worker/downloader.ts +149 -0
  71. package/src/worker/extractor.ts +235 -0
  72. package/src/worker/worker.ts +100 -0
  73. package/tsconfig.json +15 -0
@@ -0,0 +1,23 @@
1
+ import { DatabaseStrategy } from "./db-strategy.js";
2
+ import { PdfStrategy } from "./pdf-strategy.js";
3
/** Strategy currently in use; stays null until configured or first requested. */
let activeStrategy = null;

/**
 * Installs the output strategy returned by later `getStrategy()` calls.
 * @param {object|null} strategy - Strategy instance; a falsy value re-enables the lazy default.
 */
export function setStrategy(strategy) {
  activeStrategy = strategy;
}

/**
 * Returns the active output strategy, creating a `DatabaseStrategy` on demand
 * when nothing has been configured yet.
 * @returns {object} The active strategy instance.
 */
export function getStrategy() {
  if (activeStrategy) {
    return activeStrategy;
  }
  // Nothing configured: fall back to the database strategy and remember it.
  activeStrategy = new DatabaseStrategy();
  return activeStrategy;
}
14
/**
 * Builds a fresh output strategy for the given mode.
 * @param {string} mode - "pdf" selects the PDF strategy; "database" and any
 *   other value select the database strategy.
 * @returns {object} A new strategy instance.
 */
export function createStrategy(mode) {
  if (mode === "pdf") {
    return new PdfStrategy();
  }
  // "database" and every unrecognised mode share the same default.
  return new DatabaseStrategy();
}
23
+ export { DatabaseStrategy, PdfStrategy };
@@ -0,0 +1,316 @@
1
+ import fs from "fs";
2
+ import path from "path";
3
+ import PDFDocumentCtor from "pdfkit";
4
+ import { markDone } from "../db/queries.js";
5
+ import { downloadImage } from "../worker/downloader.js";
6
const OUTPUT_DIR = "output";
const BASE_NAME = "documentation";

/**
 * Picks a PDF path inside OUTPUT_DIR that does not exist yet.
 * Tries "documentation.pdf" first, then "documentation2.pdf",
 * "documentation3.pdf", ... and returns the first free name.
 * @returns {string} Relative path of the file to create.
 */
function resolveOutputPath() {
  const candidate = (suffix) => path.join(OUTPUT_DIR, `${BASE_NAME}${suffix}.pdf`);
  const unnumbered = candidate("");
  if (!fs.existsSync(unnumbered)) {
    return unnumbered;
  }
  // Numbering starts at 2 because the unnumbered file counts as the first.
  for (let n = 2; ; n += 1) {
    const numbered = candidate(n);
    if (!fs.existsSync(numbered)) {
      return numbered;
    }
  }
}
17
// ─── Layout constants ─────────────────────────────────────────────────────────
// All measurements are in PDF points.
const MARGIN = 64; // page margin applied on every side
const FOOTER_HEIGHT = 28; // extra band reserved above the bottom margin for the footer
const MAX_TEXT_CHARS = 5000; // cap on plain-text body length per chapter

// ─── Colour palette ───────────────────────────────────────────────────────────
const C = {
  // chapter pages
  title: "#1a1a2e",
  url: "#4361ee",
  desc: "#444444",
  section: "#2d6a4f",
  body: "#222222",
  truncated: "#aaaaaa",
  rule: "#dddddd",
  // cover page
  coverBg: "#1a1a2e",
  coverFg: "#ffffff",
  coverSub: "#a8dadc",
  // running header / footer
  badge: "#888888",
  footer: "#aaaaaa",
};
36
+ // ─── Helpers ──────────────────────────────────────────────────────────────────
37
/**
 * Strokes a thin horizontal line across the text column at the current
 * cursor position, then moves the cursor down by `gap` points.
 *
 * @param {object} doc - PDF document being drawn on.
 * @param {string} [color=C.rule] - Stroke colour.
 * @param {number} [gap=10] - Points to advance the cursor after drawing.
 */
function rule(doc, color = C.rule, gap = 10) {
  const lineY = doc.y;
  doc
    .moveTo(MARGIN, lineY)
    .lineTo(doc.page.width - MARGIN, lineY)
    .strokeColor(color)
    .lineWidth(0.5)
    .stroke();
  // Set the cursor from the remembered y so the gap is an exact point count.
  doc.y = lineY + gap;
}
51
/**
 * Flattens scraped text into a single line: every run of whitespace
 * (spaces, tabs, newlines) becomes one space and the ends are trimmed.
 * Stray newlines in extractor output would otherwise show up as blank
 * gaps in the rendered body text.
 *
 * @param {string} raw - Text as produced by the extractor.
 * @returns {string} The whitespace-normalised text.
 */
function normaliseText(raw) {
  const words = raw.trim().split(/\s+/);
  return words.join(" ");
}
61
/**
 * Returns the y-coordinate below which chapter content must not be drawn on
 * the current page: the page height minus the bottom margin and the band
 * reserved for the footer.
 *
 * @param {object} doc - PDF document whose current page is measured.
 * @returns {number} Lowest usable y position, in points.
 */
function contentBottom(doc) {
  const { height } = doc.page;
  const reservedAtBottom = MARGIN + FOOTER_HEIGHT;
  return height - reservedAtBottom;
}
68
+ // ─── Strategy ────────────────────────────────────────────────────────────────
69
/**
 * Output strategy that compiles crawled pages into one PDF file: a cover page
 * followed by one chapter per saved URL.
 *
 * Lifecycle: `init()` opens the output file and draws the cover, `save()`
 * appends a chapter, and `finish()` stamps the running header/footer on the
 * buffered pages and waits for the write stream to finish.
 */
export class PdfStrategy {
  /** pdfkit document, created in `init()`. */
  doc;
  /** Write stream the document is piped into. */
  stream;
  /** Number of chapters written so far (one per `save()` call). */
  pageCount = 0;
  /** Path of the PDF file being written. */
  pdfPath;

  /**
   * Creates the output directory if needed, opens the PDF file and renders
   * the cover page.
   * @returns {Promise<void>}
   */
  async init() {
    if (!fs.existsSync(OUTPUT_DIR)) {
      fs.mkdirSync(OUTPUT_DIR, { recursive: true });
    }
    this.pdfPath = resolveOutputPath();
    this.doc = new PDFDocumentCtor({
      autoFirstPage: false,
      // Pages stay buffered so finish() can revisit them via switchToPage().
      bufferPages: true,
      // Explicit margins so pdfkit never auto-paginates into blank pages
      // due to cursor running past the bottom margin.
      margins: { top: MARGIN, bottom: MARGIN + FOOTER_HEIGHT, left: MARGIN, right: MARGIN },
      info: {
        Title: "Checkout the repo https://github.com/lightning4747/Web-crawler-cli",
        Author: "Web Crawler",
        Subject: "Compiled documentation from crawled pages",
      },
    });
    this.stream = fs.createWriteStream(this.pdfPath);
    this.doc.pipe(this.stream);
    this.renderCover();
    console.log(`[PDF] Output file: ${this.pdfPath}`);
  }

  // ── Cover page ──────────────────────────────────────────────────────────────
  /** Adds the cover page: filled background, two title lines, repo link and timestamp. */
  renderCover() {
    const doc = this.doc;
    doc.addPage();
    doc.rect(0, 0, doc.page.width, doc.page.height).fill(C.coverBg);
    const midY = doc.page.height / 2 - 60;
    doc.fontSize(38).font("Helvetica-Bold").fillColor(C.coverFg)
      .text("Web Crawler", MARGIN, midY, { align: "center" });
    // Advance by exact points to avoid font-size-based gaps
    doc.y += 8;
    doc.fontSize(32).font("Helvetica-Bold").fillColor(C.coverSub)
      .text("Documentation Book", { align: "center" });
    doc.y += 28;
    doc.fontSize(11).font("Helvetica").fillColor(C.coverFg)
      .text("https://github.com/lightning4747/Web-crawler-cli", { align: "center", link: "https://github.com/lightning4747/Web-crawler-cli" });
    doc.y += 8;
    doc.fontSize(10).font("Helvetica").fillColor(C.coverSub)
      .text(new Date().toUTCString(), { align: "center" });
  }

  // ── Chapter page ────────────────────────────────────────────────────────────
  /**
   * Marks the URL as done via `markDone`, then appends a chapter for it.
   *
   * @param {*} urlId - Identifier forwarded to `markDone`.
   * @param {string} url - Source URL; shown as a link and used as the title fallback.
   * @param {object} content - Extracted page data. Fields read here: `title`,
   *   `description`, `headings.{h1,h2,h3}`, `blocks` (entries with `type`,
   *   `level`, `text`, `items`, `src`, `alt`) and `textContent`.
   * @returns {Promise<void>}
   */
  async save(urlId, url, content) {
    await markDone(urlId, content);
    const doc = this.doc;
    this.pageCount++;
    doc.addPage();
    const W = doc.page.width - MARGIN * 2; // usable text width
    const limit = contentBottom(doc); // y-coordinate of content boundary

    // ── Chapter badge (top-right, absolute position) ─────────────────────────
    doc.fontSize(8).font("Helvetica").fillColor(C.badge)
      .text(`CHAPTER ${this.pageCount}`, MARGIN, MARGIN, { width: W, align: "right" });
    // Place cursor just below the badge — use exact points, not moveDown
    doc.y = MARGIN + 14;

    // ── Title ─────────────────────────────────────────────────────────────────
    doc.fontSize(22).font("Helvetica-Bold").fillColor(C.title)
      .text(content.title ?? url, { width: W, lineGap: 2 });
    doc.y += 10;
    rule(doc, "#4361ee", 10);

    // ── Source URL ────────────────────────────────────────────────────────────
    doc.fontSize(8.5).font("Helvetica-Oblique").fillColor(C.url)
      .text(url, { width: W, link: url, underline: true, lineGap: 1 });
    doc.y += 12;

    // ── Description ───────────────────────────────────────────────────────────
    if (content.description && doc.y < limit) {
      doc.fontSize(11).font("Helvetica-Oblique").fillColor(C.desc)
        .text(content.description.trim(), { width: W, lineGap: 2, align: "left" });
      doc.y += 12;
    }

    // ── Headings summary ──────────────────────────────────────────────────────
    // A page without extracted headings must not abort the chapter, so missing
    // heading lists are treated as empty.
    const headings = content.headings ?? {};
    const allHeadings = [
      ...(headings.h1 ?? []),
      ...(headings.h2 ?? []),
      ...(headings.h3 ?? []),
    ].slice(0, 12);
    if (allHeadings.length > 0 && doc.y < limit) {
      doc.fontSize(9).font("Helvetica-Bold").fillColor(C.section)
        .text("CONTENTS OVERVIEW", { width: W, characterSpacing: 1 });
      doc.y += 6;
      for (const h of allHeadings) {
        if (doc.y >= limit) {
          break;
        }
        const bulletX = MARGIN;
        const textX = MARGIN + 14;
        const y = doc.y;
        // Bullet dot — drawn absolutely, no cursor movement
        doc.circle(bulletX + 3, y + 5, 2).fill(C.section);
        doc.fontSize(10).font("Helvetica").fillColor(C.body)
          .text(h, textX, y, { width: W - 14, lineGap: 2 });
        // Advance by 4pt padding between bullet items
        doc.y += 4;
      }
      doc.y += 8;
      if (doc.y < limit) {
        rule(doc, C.rule, 10);
      }
    }

    // ── Page Content ──────────────────────────────────────────────────────────
    if (content.blocks && content.blocks.length > 0) {
      doc.fontSize(9).font("Helvetica-Bold").fillColor(C.section)
        .text("PAGE CONTENT", { width: W, characterSpacing: 1 });
      doc.y += 8;
      for (const block of content.blocks) {
        if (doc.y >= limit) {
          doc.addPage();
        }
        if (block.type === "heading" && block.text) {
          const headingSize = block.level === 1 ? 16 : block.level === 2 ? 14 : 12;
          // Select the heading font BEFORE measuring: heightOfString uses the
          // document's current font and size, so measuring first would size
          // the heading with whatever the previous block left active.
          doc.fontSize(headingSize).font("Helvetica-Bold");
          const headingHeight = doc.heightOfString(block.text, { width: W, lineGap: 2 });
          // Keep 40pt of room under the heading so it is not stranded at the
          // bottom of a page.
          if (doc.y + headingHeight + 40 > limit) {
            doc.addPage();
          }
          doc.fontSize(headingSize).font("Helvetica-Bold").fillColor(C.title)
            .text(block.text, { width: W, lineGap: 2 });
          doc.y += 6;
        }
        else if (block.type === "paragraph" && block.text) {
          const text = block.text.trim();
          if (!text) {
            continue;
          }
          // Only require room for the first line or so; longer paragraphs are
          // left to flow onto the next page during text().
          if (doc.y + 20 > limit) {
            doc.addPage();
          }
          doc.fontSize(10).font("Helvetica").fillColor(C.body)
            .text(text, { width: W, lineGap: 3 });
          doc.y += 8;
        }
        else if (block.type === "list" && block.items && block.items.length > 0) {
          if (doc.y + 20 > limit) {
            doc.addPage();
          }
          // Same ordering rule as headings: activate the list font before any
          // item is measured.
          doc.fontSize(9.5).font("Helvetica");
          for (const item of block.items) {
            const itemText = item.trim();
            if (!itemText) {
              continue;
            }
            const bulletX = MARGIN + 10;
            const textX = MARGIN + 22;
            const itemHeight = doc.heightOfString(itemText, { width: W - 22, lineGap: 2 });
            if (doc.y + itemHeight > limit) {
              doc.addPage();
            }
            const y = doc.y;
            doc.circle(bulletX + 3, y + 5, 2).fill(C.body);
            doc.fontSize(9.5).font("Helvetica").fillColor(C.body)
              .text(itemText, textX, y, { width: W - 22, lineGap: 2 });
            doc.y += 4;
          }
          doc.y += 4;
        }
        else if (block.type === "image" && block.src) {
          try {
            const imageBuffer = await downloadImage(block.src);
            const maxImageHeight = 200;
            if (doc.y + maxImageHeight + 20 > limit) {
              doc.addPage();
            }
            doc.image(imageBuffer, {
              fit: [W, maxImageHeight],
              align: "center",
            });
            // NOTE(review): pdfkit appears to advance doc.y past an in-flow
            // image by itself, so this may leave an extra ~200pt gap — confirm
            // against the pdfkit image docs before changing the layout.
            doc.y += maxImageHeight + 10;
            if (block.alt) {
              doc.fontSize(8.5).font("Helvetica-Oblique").fillColor(C.desc)
                .text(block.alt, { width: W, align: "center" });
              doc.y += 8;
            }
          }
          catch (err) {
            // Download or decode failed: draw a bordered placeholder box
            // carrying the alt text and source URL instead.
            const fallbackText = `[Image: ${block.alt || "No description available"} (${block.src})]`;
            const boxHeight = 40;
            if (doc.y + boxHeight > limit) {
              doc.addPage();
            }
            const currentY = doc.y;
            doc.rect(MARGIN, currentY, W, boxHeight)
              .strokeColor(C.rule)
              .lineWidth(0.5)
              .stroke();
            doc.fontSize(9).font("Helvetica-Oblique").fillColor(C.truncated)
              .text(fallbackText, MARGIN + 10, currentY + 14, { width: W - 20, align: "center" });
            doc.y = currentY + boxHeight + 10;
          }
        }
      }
    }
    else if (content.textContent && doc.y < limit) {
      // No structured blocks: fall back to the flattened page text, capped at
      // MAX_TEXT_CHARS characters.
      doc.fontSize(9).font("Helvetica-Bold").fillColor(C.section)
        .text("PAGE CONTENT", { width: W, characterSpacing: 1 });
      doc.y += 8;
      const raw = normaliseText(content.textContent);
      const body = raw.slice(0, MAX_TEXT_CHARS);
      const truncated = raw.length > MAX_TEXT_CHARS;
      doc.fontSize(10.5).font("Helvetica").fillColor(C.body)
        .text(body, {
          width: W,
          lineGap: 3,
          align: "left",
        });
      if (truncated && doc.y < limit) {
        doc.y += 8;
        doc.fontSize(8.5).font("Helvetica-Oblique").fillColor(C.truncated)
          .text("[ content truncated for brevity ]", { width: W, align: "center" });
      }
    }
  }

  /**
   * Draws the running header and "Page X of Y" footer on every buffered page
   * except the cover, ends the document and waits for the file stream.
   * @returns {Promise<void>} Resolves on the stream's "finish" event, rejects on "error".
   */
  async finish() {
    const doc = this.doc;
    const range = doc.bufferedPageRange();
    const totalPages = range.count;
    // Index 0 is the cover, so numbering starts at 1 and the total excludes it.
    for (let i = 1; i < totalPages; i++) {
      doc.switchToPage(i);
      const W = doc.page.width - MARGIN * 2;
      const footerY = doc.page.height - MARGIN - FOOTER_HEIGHT + 8;
      // Draw running header
      doc.fontSize(8).font("Helvetica").fillColor(C.badge)
        .text("https://github.com/lightning4747/Web-crawler-cli", MARGIN, MARGIN - 24, { width: W, align: "left", link: "https://github.com/lightning4747/Web-crawler-cli" });
      doc
        .moveTo(MARGIN, MARGIN - 14)
        .lineTo(doc.page.width - MARGIN, MARGIN - 14)
        .strokeColor(C.rule)
        .lineWidth(0.4)
        .stroke();
      // Draw running footer
      doc
        .moveTo(MARGIN, footerY)
        .lineTo(doc.page.width - MARGIN, footerY)
        .strokeColor(C.rule)
        .lineWidth(0.4)
        .stroke();
      doc.fontSize(8).font("Helvetica").fillColor(C.footer)
        .text(`Page ${i} of ${totalPages - 1}`, MARGIN, footerY + 5, {
          width: W,
          align: "center",
          lineBreak: false,
        });
    }
    await new Promise((resolve, reject) => {
      this.stream.on("finish", resolve);
      this.stream.on("error", reject);
      this.doc.end();
    });
    console.log(`[PDF] Done — ${this.pageCount} chapter(s) written to ${this.pdfPath}`);
  }
}
@@ -0,0 +1 @@
1
+ export {};
@@ -0,0 +1,45 @@
1
+ import { lookup } from "node:dns/promises";
2
/**
 * IP range patterns that must never be requested.
 * Covers loopback, private, link-local, shared and cloud metadata addresses.
 * Patterns are matched against textual IP addresses.
 */
const BLOCKED_IP_RANGES = [
  /^127\./, // loopback
  /^10\./, // private Class A
  /^172\.(1[6-9]|2\d|3[01])\./, // private Class B
  /^192\.168\./, // private Class C
  /^169\.254\./, // link-local / cloud IMDS (AWS, GCP, Azure)
  /^0\./, // current network
  /^100\.(6[4-9]|[7-9]\d|1[01]\d|12[0-7])\./, // shared address space 100.64.0.0/10 (RFC 6598)
  /^::1$/, // IPv6 loopback
  /^::$/, // IPv6 unspecified address
  /^f[cd][0-9a-f]{2}:/i, // IPv6 unique local fc00::/7 (fc.. and fd..)
  /^fe[89ab][0-9a-f]:/i, // IPv6 link-local fe80::/10 (fe80..febf)
];

/** Matches an IPv4-mapped IPv6 address in dotted form, capturing the IPv4 part. */
const IPV4_MAPPED = /^::ffff:(\d{1,3}(?:\.\d{1,3}){3})$/i;

/**
 * Hostnames that are blocked regardless of DNS resolution.
 */
const BLOCKED_HOSTNAMES = new Set([
  "localhost",
  "metadata.google.internal",
  "169.254.169.254",
]);

/**
 * Tests one textual address against the blocked ranges. An IPv4-mapped IPv6
 * address such as "::ffff:10.0.0.1" is judged by its embedded IPv4 address.
 * (The hexadecimal mapped form, e.g. "::ffff:a00:1", is not unwrapped here.)
 */
function isBlockedIp(address) {
  const mapped = IPV4_MAPPED.exec(address);
  const candidate = mapped ? mapped[1] : address;
  return BLOCKED_IP_RANGES.some((range) => range.test(candidate));
}

/**
 * Returns true if the hostname is, or resolves to, a private/internal address.
 * Every address returned by DNS is checked, so a name with several records is
 * blocked as soon as any one of them is internal.
 * Fails closed — if DNS lookup throws or returns nothing, the address is
 * considered blocked.
 *
 * @param {string} hostname - Host name or IP literal to vet.
 * @returns {Promise<boolean>} true when the request must not be made.
 */
export async function isBlockedAddress(hostname) {
  if (BLOCKED_HOSTNAMES.has(hostname.toLowerCase())) {
    return true;
  }
  // Reject raw IP literals that match blocked ranges without a DNS lookup
  if (isBlockedIp(hostname)) {
    return true;
  }
  try {
    const addresses = await lookup(hostname, { all: true });
    return addresses.length === 0 || addresses.some(({ address }) => isBlockedIp(address));
  }
  catch {
    // DNS resolution failed — fail closed
    return true;
  }
}
@@ -0,0 +1,41 @@
1
/**
 * Checks that a string parses as a URL with an http or https scheme.
 *
 * @param {string} raw - Candidate URL text.
 * @returns {{url: URL|null, error: string|null}} `{ url, error: null }` with
 *   the parsed URL on success, or `{ url: null, error }` with a descriptive
 *   message on failure.
 */
export function validateSeedUrl(raw) {
  const reject = (error) => ({ url: null, error });

  let candidate;
  try {
    candidate = new URL(raw);
  }
  catch {
    return reject(`"${raw}" is not a valid URL`);
  }

  const scheme = candidate.protocol;
  const isWebScheme = scheme === "http:" || scheme === "https:";
  if (!isWebScheme) {
    return reject(`"${raw}" uses scheme "${scheme.replace(":", "")}" — only http and https are allowed`);
  }
  if (!candidate.hostname) {
    return reject(`"${raw}" has no hostname`);
  }
  return { url: candidate, error: null };
}
24
/**
 * Runs `validateSeedUrl` over a list of URL strings and sorts them into two
 * buckets, preserving input order.
 *
 * @param {string[]} raws - Candidate URL strings.
 * @returns {{valid: string[], invalid: {input: string, reason: string}[]}}
 *   `valid` holds the accepted input strings unchanged; `invalid` pairs each
 *   rejected input with the reason it was rejected.
 */
export function validateSeedUrls(raws) {
  return raws.reduce(
    (buckets, raw) => {
      const { error } = validateSeedUrl(raw);
      if (error) {
        buckets.invalid.push({ input: raw, reason: error });
      }
      else {
        buckets.valid.push(raw);
      }
      return buckets;
    },
    { valid: [], invalid: [] },
  );
}
package/dist/seed.js ADDED
@@ -0,0 +1,14 @@
1
+ import { config } from "./config.js";
2
+ import { query } from "./db/client.js";
3
+ import { getDomain } from "./normalizer.js";
4
/**
 * Inserts every configured seed URL into the `urls` table as a PENDING row
 * at depth 0. Seeds for which `getDomain` returns nothing are skipped, and
 * URLs already present are left alone (ON CONFLICT DO NOTHING).
 * Inserts run one at a time, in configuration order.
 *
 * @returns {Promise<void>}
 */
export async function seedDatabase() {
  for (const seedUrl of config.SEED_URLS) {
    const seedDomain = getDomain(seedUrl);
    if (seedDomain) {
      await query(`INSERT INTO urls (url, domain, status, depth)
 VALUES ($1, $2, 'PENDING', 0)
 ON CONFLICT (url) DO NOTHING`, [seedUrl, seedDomain]);
    }
  }
}
package/dist/setup.js ADDED
@@ -0,0 +1,148 @@
1
+ /**
2
+ * Standalone CLI configuration wizard.
3
+ *
4
+ * Usage: npm run config
5
+ *
6
+ * Steps:
7
+ * 1. Reads seed URLs from seeds.txt
8
+ * 2. Prompts for crawler performance settings and output mode
9
+ * 3. Writes updated values to .env
10
+ * 4. Patches SEED_URLS and ALLOWED_DOMAINS in src/config.ts
11
+ */
12
+ import { select, number, confirm } from "@inquirer/prompts";
13
+ import fs from "fs";
14
+ import path from "path";
15
+ import { validateSeedUrls } from "./security/validate-url.js";
16
const SEEDS_FILE = "seeds.txt";
const ENV_FILE = ".env";
const CONFIG_FILE = path.join("src", "config.ts");

// ─── Helpers ─────────────────────────────────────────────────────────────────
/**
 * Loads seed URLs from SEEDS_FILE, one per line. Blank lines and lines
 * starting with "#" are ignored; entries rejected by `validateSeedUrls` are
 * reported with a warning and left out.
 *
 * @returns {string[]} The valid seed URLs, or an empty list when the file is missing.
 */
function readSeedsFile() {
  if (!fs.existsSync(SEEDS_FILE)) {
    console.warn(`[setup] ${SEEDS_FILE} not found. No seed URLs will be loaded.`);
    return [];
  }

  const candidates = [];
  for (const line of fs.readFileSync(SEEDS_FILE, "utf-8").split("\n")) {
    const entry = line.trim();
    if (entry.length === 0 || entry.startsWith("#")) {
      continue;
    }
    candidates.push(entry);
  }

  const { valid, invalid } = validateSeedUrls(candidates);
  if (invalid.length > 0) {
    console.warn(`[setup] Skipping ${invalid.length} invalid URL(s) in ${SEEDS_FILE}:`);
    invalid.forEach(({ reason }) => {
      console.warn(` ✗ ${reason}`);
    });
  }
  return valid;
}
39
/**
 * Collects the distinct hostnames of the given URLs, in first-seen order.
 * Strings that do not parse as URLs are skipped on purpose (best effort).
 *
 * @param {string[]} urls - URL strings.
 * @returns {string[]} Unique hostnames.
 */
function extractDomains(urls) {
  const seen = new Set();
  for (const url of urls) {
    let hostname;
    try {
      ({ hostname } = new URL(url));
    }
    catch {
      continue;
    }
    if (hostname) {
      seen.add(hostname);
    }
  }
  return [...seen];
}
50
/**
 * Writes the given settings into ENV_FILE, merging with what is already there.
 *
 * Existing `KEY=value` lines are updated in place when the key is in `values`
 * and otherwise kept (normalised to `KEY=value`). Comments, blank lines and
 * lines without "=" are preserved as written, so hand-maintained notes in the
 * file survive a re-run of the wizard. Keys not yet present are appended.
 *
 * @param {Record<string, string>} values - Settings to write, keyed by variable name.
 */
function writeEnvFile(values) {
  const updates = new Map(Object.entries(values));
  const applied = new Set();
  const lines = [];

  if (fs.existsSync(ENV_FILE)) {
    const existing = fs.readFileSync(ENV_FILE, "utf-8").split(/\r?\n/);
    // Drop trailing blank lines so exactly one newline ends the file.
    while (existing.length > 0 && existing[existing.length - 1].trim() === "") {
      existing.pop();
    }
    for (const line of existing) {
      const trimmed = line.trim();
      const eqIdx = trimmed.indexOf("=");
      if (!trimmed || trimmed.startsWith("#") || eqIdx === -1) {
        // Not an assignment: keep the line untouched.
        lines.push(line);
        continue;
      }
      const key = trimmed.slice(0, eqIdx).trim();
      if (updates.has(key)) {
        lines.push(`${key}=${updates.get(key)}`);
        applied.add(key);
      }
      else {
        lines.push(`${key}=${trimmed.slice(eqIdx + 1).trim()}`);
      }
    }
  }

  for (const [key, value] of updates) {
    if (!applied.has(key)) {
      lines.push(`${key}=${value}`);
    }
  }

  fs.writeFileSync(ENV_FILE, `${lines.join("\n")}\n`, "utf-8");
  console.log(`[setup] .env updated.`);
}
69
/**
 * Rewrites SEED_URLS, ALLOWED_DOMAINS and OUTPUT_MODE inside CONFIG_FILE.
 * Does nothing (beyond a warning) when the file does not exist. OUTPUT_MODE is
 * replaced when present and otherwise inserted before the closing "};".
 *
 * Values are emitted with JSON.stringify so quotes and backslashes cannot
 * break out of the generated string literal, and every replacement goes
 * through a replacer function so "$&", "$1" etc. inside a URL are written
 * literally instead of being expanded by String.prototype.replace.
 *
 * @param {string[]} seedUrls - URLs for the SEED_URLS array.
 * @param {string[]} allowedDomains - Hostnames for the ALLOWED_DOMAINS array.
 * @param {string} outputMode - Value for OUTPUT_MODE.
 */
function patchConfigTs(seedUrls, allowedDomains, outputMode) {
  if (!fs.existsSync(CONFIG_FILE)) {
    console.warn(`[setup] ${CONFIG_FILE} not found — skipping patch.`);
    return;
  }

  // An empty list must render as "[]"; the multi-line form would otherwise
  // leave a lone comma, i.e. an array with one empty slot.
  const toArrayLiteral = (items) => {
    if (items.length === 0) {
      return "[]";
    }
    const body = items.map((item) => ` ${JSON.stringify(item)}`).join(",\n");
    return `[\n${body},\n ]`;
  };
  const modeLiteral = JSON.stringify(outputMode);

  let src = fs.readFileSync(CONFIG_FILE, "utf-8");
  src = src.replace(/SEED_URLS:\s*\[[\s\S]*?\]/, () => `SEED_URLS: ${toArrayLiteral(seedUrls)}`);
  src = src.replace(/ALLOWED_DOMAINS:\s*\[[\s\S]*?\]/, () => `ALLOWED_DOMAINS: ${toArrayLiteral(allowedDomains)}`);
  if (/OUTPUT_MODE:/.test(src)) {
    src = src.replace(/OUTPUT_MODE:\s*["'][^"']*["']/, () => `OUTPUT_MODE: ${modeLiteral}`);
  }
  else {
    src = src.replace(/(\n};)/, () => `\n OUTPUT_MODE: ${modeLiteral},\n};`);
  }
  fs.writeFileSync(CONFIG_FILE, src, "utf-8");
  console.log(`[setup] src/config.ts patched.`);
}
88
+ // ─── Main wizard ──────────────────────────────────────────────────────────────
89
/**
 * Interactive wizard: shows the seed URLs found in SEEDS_FILE, prompts for the
 * output mode and the numeric crawl settings, and — once confirmed — writes
 * them to the .env file and patches the config source. Declining the
 * confirmation exits with status 0 without writing anything.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const banner = [
    "\n╔══════════════════════════════════════════╗",
    "║ Web Crawler — Interactive Setup Wizard ║",
    "╚══════════════════════════════════════════╝\n",
  ];
  for (const line of banner) {
    console.log(line);
  }

  const seedUrls = readSeedsFile();
  if (seedUrls.length > 0) {
    console.log(`[setup] Found ${seedUrls.length} seed URL(s):`);
    for (const seedUrl of seedUrls) {
      console.log(` ${seedUrl}`);
    }
    console.log();
  }
  else {
    console.warn(`[setup] No URLs found in ${SEEDS_FILE}. Add target URLs and re-run.\n`);
  }

  // Builds a prompt validator accepting defined numbers >= `min`.
  const atLeast = (min, message) => (v) => (v !== undefined && v >= min ? true : message);
  const nonNegative = atLeast(0, "Must be 0 or greater");

  const outputMode = await select({
    message: "OUTPUT_MODE — where should crawled data be stored?",
    choices: [
      { name: "PostgreSQL database (structured data)", value: "database" },
      { name: "PDF eBook (compiled document)", value: "pdf" },
    ],
    default: "database",
  });
  const maxDepth = await number({
    message: "MAX_DEPTH — link hops from seed URLs:",
    default: 3,
    validate: nonNegative,
  });
  const crawlDelayMs = await number({
    message: "CRAWL_DELAY_MS — politeness delay per domain (ms):",
    default: 1000,
    validate: nonNegative,
  });
  const workerCount = await number({
    message: "WORKER_COUNT — concurrent workers:",
    default: 5,
    validate: atLeast(1, "Must be at least 1"),
  });
  const maxPages = await number({
    message: "MAX_PAGES — page limit (0 = unlimited):",
    default: 1000,
    validate: nonNegative,
  });

  const confirmed = await confirm({ message: "Save these settings?", default: true });
  if (!confirmed) {
    console.log("\nAborted.\n");
    process.exit(0);
  }

  const envValues = {
    MAX_DEPTH: String(maxDepth),
    CRAWL_DELAY_MS: String(crawlDelayMs),
    WORKER_COUNT: String(workerCount),
    MAX_PAGES: String(maxPages),
    OUTPUT_MODE: outputMode,
  };
  writeEnvFile(envValues);
  patchConfigTs(seedUrls, extractDomains(seedUrls), outputMode);
  console.log("\n✓ Configuration saved. Run npm run crawl to start.\n");
}

// Entry point: any unhandled failure is reported and exits with status 1.
main().catch((err) => {
  console.error("[setup] Fatal error:", err);
  process.exit(1);
});
@@ -0,0 +1,33 @@
1
+ import { describe, it, expect, vi } from "vitest";
2
// Replace the pg module with a stub before the client module is imported.
// Every FakePool instance shares the same two mock functions, so the tests
// can assert on `pool.query` / `pool.end` directly.
vi.mock("pg", () => {
  const sharedQuery = vi.fn().mockResolvedValue({ rows: [] });
  const sharedEnd = vi.fn().mockResolvedValue(undefined);
  class FakePool {
    constructor() {
      this.query = sharedQuery;
      this.end = sharedEnd;
    }
  }
  // Exposed both as a named export and on the default export.
  return {
    default: { Pool: FakePool },
    Pool: FakePool,
  };
});
17
+ import { pool, query, closePool } from "../db/client.js";
18
// Exercises the thin db client wrapper against the mocked pg Pool.
describe("Database Client", () => {
  it("should expose pool and query function", async () => {
    for (const exported of [pool, query, closePool]) {
      expect(exported).toBeDefined();
    }
  });

  it("should delegate query call to pool", async () => {
    const result = await query("SELECT 1");
    expect(result).toEqual({ rows: [] });
    // No parameters were given, so the second argument is passed as undefined.
    expect(pool.query).toHaveBeenCalledWith("SELECT 1", undefined);
  });

  it("should call end on pool when closing", async () => {
    await closePool();
    expect(pool.end).toHaveBeenCalled();
  });
});