messi-crawler 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73) hide show
  1. package/README.md +201 -0
  2. package/dist/cli/renderer.js +71 -0
  3. package/dist/config.js +18 -0
  4. package/dist/db/clear.js +16 -0
  5. package/dist/db/client.js +20 -0
  6. package/dist/db/queries.js +179 -0
  7. package/dist/frontier/frontier.js +44 -0
  8. package/dist/frontier/logger.js +65 -0
  9. package/dist/frontier/robots.js +46 -0
  10. package/dist/frontier/scheduler.js +98 -0
  11. package/dist/index.js +533 -0
  12. package/dist/normalizer.js +33 -0
  13. package/dist/output/db-strategy.js +16 -0
  14. package/dist/output/index.js +23 -0
  15. package/dist/output/pdf-strategy.js +316 -0
  16. package/dist/output/strategy.js +1 -0
  17. package/dist/security/ssrf.js +45 -0
  18. package/dist/security/validate-url.js +41 -0
  19. package/dist/seed.js +14 -0
  20. package/dist/setup.js +148 -0
  21. package/dist/test/client.test.js +33 -0
  22. package/dist/test/downloader.test.js +84 -0
  23. package/dist/test/extractor.test.js +126 -0
  24. package/dist/test/frontier.test.js +43 -0
  25. package/dist/test/logger.test.js +55 -0
  26. package/dist/test/normalizer.test.js +36 -0
  27. package/dist/test/pdf-strategy.test.js +68 -0
  28. package/dist/test/queries.test.js +173 -0
  29. package/dist/test/robots.test.js +46 -0
  30. package/dist/test/scheduler.test.js +73 -0
  31. package/dist/test/seed.test.js +26 -0
  32. package/dist/test/worker.test.js +118 -0
  33. package/dist/worker/downloader.js +114 -0
  34. package/dist/worker/extractor.js +197 -0
  35. package/dist/worker/worker.js +87 -0
  36. package/package.json +48 -0
  37. package/seeds.txt +4 -0
  38. package/src/cli/renderer.ts +83 -0
  39. package/src/config.ts +22 -0
  40. package/src/db/clear.ts +16 -0
  41. package/src/db/client.ts +26 -0
  42. package/src/db/queries.ts +255 -0
  43. package/src/db/schema.sql +43 -0
  44. package/src/frontier/frontier.ts +60 -0
  45. package/src/frontier/logger.ts +75 -0
  46. package/src/frontier/robots.ts +50 -0
  47. package/src/frontier/scheduler.ts +119 -0
  48. package/src/index.ts +596 -0
  49. package/src/normalizer.ts +37 -0
  50. package/src/output/db-strategy.ts +20 -0
  51. package/src/output/index.ts +32 -0
  52. package/src/output/pdf-strategy.ts +388 -0
  53. package/src/output/strategy.ts +16 -0
  54. package/src/security/ssrf.ts +48 -0
  55. package/src/security/validate-url.ts +49 -0
  56. package/src/seed.ts +18 -0
  57. package/src/setup.ts +170 -0
  58. package/src/test/client.test.ts +38 -0
  59. package/src/test/downloader.test.ts +101 -0
  60. package/src/test/extractor.test.ts +139 -0
  61. package/src/test/frontier.test.ts +53 -0
  62. package/src/test/logger.test.ts +71 -0
  63. package/src/test/normalizer.test.ts +43 -0
  64. package/src/test/pdf-strategy.test.ts +84 -0
  65. package/src/test/queries.test.ts +247 -0
  66. package/src/test/robots.test.ts +56 -0
  67. package/src/test/scheduler.test.ts +90 -0
  68. package/src/test/seed.test.ts +35 -0
  69. package/src/test/worker.test.ts +144 -0
  70. package/src/worker/downloader.ts +149 -0
  71. package/src/worker/extractor.ts +235 -0
  72. package/src/worker/worker.ts +100 -0
  73. package/tsconfig.json +15 -0
@@ -0,0 +1,388 @@
1
+ import fs from "fs";
2
+ import path from "path";
3
+ import PDFDocumentCtor from "pdfkit";
4
+ import type { OutputStrategy } from "./strategy.js";
5
+ import { markDone, type CrawledPageContent } from "../db/queries.js";
6
+ import { downloadImage } from "../worker/downloader.js";
7
+
8
+ type PDFDocumentInstance = InstanceType<typeof PDFDocumentCtor>;
9
+
10
/** Directory, relative to the working directory, that receives the generated PDF files. */
const OUTPUT_DIR = "output";

/** File-name stem used for generated PDFs (the ".pdf" extension is added by the path resolver). */
const BASE_NAME = "documentation";
12
+
13
/**
 * Picks a PDF path inside OUTPUT_DIR that does not exist yet.
 *
 * The un-numbered name (`documentation.pdf`) is tried first; if it is taken,
 * numeric suffixes starting at 2 are tried in order (`documentation2.pdf`,
 * `documentation3.pdf`, …) until a free name is found.
 *
 * @returns A path that did not exist at the time of the check.
 */
function resolveOutputPath(): string {
  const candidateFor = (suffix: string): string =>
    path.join(OUTPUT_DIR, `${BASE_NAME}${suffix}.pdf`);

  let candidate = candidateFor("");
  for (let n = 2; fs.existsSync(candidate); n++) {
    candidate = candidateFor(String(n));
  }
  return candidate;
}
20
+
21
+ // ─── Layout constants ─────────────────────────────────────────────────────────
22
+
23
/** Base page margin in PDF points (used for the top, left and right margins). */
const MARGIN = 64;

/** Extra space, in points, reserved above the bottom margin for the running footer. */
const FOOTER_HEIGHT = 28;

/** Upper bound on the number of characters of plain-text page content rendered per chapter. */
const MAX_TEXT_CHARS = 5000;
26
+
27
+ // ─── Colour palette ───────────────────────────────────────────────────────────
28
+
29
/** Hex colours used throughout the generated PDF, keyed by the role they play. */
const C = {
  // Chapter pages
  title: "#1a1a2e", // chapter titles and in-content headings
  url: "#4361ee", // source URL line
  desc: "#444444", // description text and image captions
  section: "#2d6a4f", // section labels and overview bullets
  body: "#222222", // body text and list bullets
  truncated: "#aaaaaa", // truncation notice and image fallback text
  rule: "#dddddd", // horizontal rules and box outlines

  // Cover page
  coverBg: "#1a1a2e",
  coverFg: "#ffffff",
  coverSub: "#a8dadc",

  // Running header / footer
  badge: "#888888",
  footer: "#aaaaaa",
};
43
+
44
+ // ─── Helpers ──────────────────────────────────────────────────────────────────
45
+
46
/**
 * Strokes a thin horizontal line between the left and right margins at the
 * current cursor Y, then moves the cursor down by `gap` points.
 *
 * @param doc   Document to draw on.
 * @param color Stroke colour; defaults to the palette's rule colour.
 * @param gap   Distance in points to advance the cursor after drawing.
 */
function rule(doc: PDFDocumentInstance, color = C.rule, gap = 10): void {
  const lineY = doc.y;
  const rightEdge = doc.page.width - MARGIN;

  doc.moveTo(MARGIN, lineY).lineTo(rightEdge, lineY);
  doc.strokeColor(color).lineWidth(0.5).stroke();

  // Position the cursor with an absolute assignment so the spacing is exactly
  // `gap` points and does not depend on the current font's line height.
  doc.y = lineY + gap;
}
60
+
61
/**
 * Flattens raw scraped text into a single line of space-separated words.
 *
 * Every run of whitespace (spaces, tabs, newlines) becomes one space and
 * leading/trailing whitespace is removed, so stray newlines in the extracted
 * text cannot turn into blank paragraph gaps in the PDF.
 *
 * @param raw Text as produced by the extractor.
 * @returns The whitespace-normalised text (empty when `raw` is blank).
 */
function normaliseText(raw: string): string {
  const words = raw.split(/\s+/).filter((word) => word.length > 0);
  return words.join(" ");
}
71
+
72
/**
 * Returns the Y coordinate at which the content area of the current page ends:
 * the page height minus the bottom margin and the space reserved for the footer.
 * Content whose cursor reaches this value no longer fits on the page.
 */
function contentBottom(doc: PDFDocumentInstance): number {
  const aboveBottomMargin = doc.page.height - MARGIN;
  return aboveBottomMargin - FOOTER_HEIGHT;
}
79
+
80
+ // ─── Strategy ────────────────────────────────────────────────────────────────
81
+
82
/**
 * Output strategy that compiles crawled pages into a single PDF.
 *
 * Lifecycle:
 *  - `init()`   opens the output file and renders the cover page;
 *  - `save()`   marks the URL as done and appends one chapter per crawled page;
 *  - `finish()` stamps a running header and "Page X of Y" footer on every page
 *               after the cover, then closes the document and waits for the
 *               file stream to flush.
 */
export class PdfStrategy implements OutputStrategy {
  private doc!: PDFDocumentInstance;
  private stream!: fs.WriteStream;
  private pageCount = 0;
  private pdfPath!: string;

  /** Creates the output directory if needed, opens the PDF stream and renders the cover. */
  async init(): Promise<void> {
    if (!fs.existsSync(OUTPUT_DIR)) fs.mkdirSync(OUTPUT_DIR, { recursive: true });

    this.pdfPath = resolveOutputPath();

    this.doc = new PDFDocumentCtor({
      autoFirstPage: false,
      // Pages stay buffered so finish() can revisit them to draw headers/footers.
      bufferPages: true,
      // Explicit margins so pdfkit never auto-paginates into blank pages
      // due to cursor running past the bottom margin.
      margins: { top: MARGIN, bottom: MARGIN + FOOTER_HEIGHT, left: MARGIN, right: MARGIN },
      info: {
        Title: "Checkout the repo https://github.com/lightning4747/Web-crawler-cli",
        Author: "Web Crawler",
        Subject: "Compiled documentation from crawled pages",
      },
    });

    this.stream = fs.createWriteStream(this.pdfPath);
    this.doc.pipe(this.stream);
    this.renderCover();
    console.log(`[PDF] Output file: ${this.pdfPath}`);
  }

  // ── Cover page ──────────────────────────────────────────────────────────────

  /** Renders the full-bleed cover page (title, subtitle, repository link, timestamp). */
  private renderCover(): void {
    const doc = this.doc;
    doc.addPage();

    doc.rect(0, 0, doc.page.width, doc.page.height).fill(C.coverBg);

    const midY = doc.page.height / 2 - 60;

    doc.fontSize(38).font("Helvetica-Bold").fillColor(C.coverFg)
      .text("Web Crawler", MARGIN, midY, { align: "center" });

    // Advance by exact points to avoid font-size-based gaps
    doc.y += 8;

    doc.fontSize(32).font("Helvetica-Bold").fillColor(C.coverSub)
      .text("Documentation Book", { align: "center" });

    doc.y += 28;

    doc.fontSize(11).font("Helvetica").fillColor(C.coverFg)
      .text("https://github.com/lightning4747/Web-crawler-cli", {
        align: "center",
        link: "https://github.com/lightning4747/Web-crawler-cli",
      });

    doc.y += 8;

    doc.fontSize(10).font("Helvetica").fillColor(C.coverSub)
      .text(new Date().toUTCString(), { align: "center" });
  }

  /** Draws a small upper-case section label ("PAGE CONTENT", …) at the cursor. */
  private sectionLabel(label: string, width: number): void {
    this.doc.fontSize(9).font("Helvetica-Bold").fillColor(C.section)
      .text(label, { width, characterSpacing: 1 });
  }

  // ── Chapter page ────────────────────────────────────────────────────────────

  /**
   * Marks the URL as done in the database and appends a chapter for it.
   *
   * A chapter consists of a badge, title, source URL, optional description,
   * an overview of up to 12 headings, and the page content: structured blocks
   * when available, otherwise the normalised plain text (truncated to
   * MAX_TEXT_CHARS).
   *
   * @param urlId   Database id of the crawled URL.
   * @param url     The crawled URL; also used as the title when none was extracted.
   * @param content Extracted page content.
   */
  async save(urlId: number, url: string, content: CrawledPageContent): Promise<void> {
    await markDone(urlId, content);

    const doc = this.doc;
    this.pageCount++;
    doc.addPage();

    const W = doc.page.width - MARGIN * 2; // usable text width
    const limit = contentBottom(doc); // y-coordinate of content boundary

    // ── Chapter badge (top-right, absolute position) ─────────────────────────
    doc.fontSize(8).font("Helvetica").fillColor(C.badge)
      .text(`CHAPTER ${this.pageCount}`, MARGIN, MARGIN, { width: W, align: "right" });

    // Place cursor just below the badge — use exact points, not moveDown
    doc.y = MARGIN + 14;

    // ── Title ─────────────────────────────────────────────────────────────────
    // Fall back to the URL for missing *and* blank titles so a chapter never
    // starts with an empty heading.
    const title = content.title?.trim() || url;
    doc.fontSize(22).font("Helvetica-Bold").fillColor(C.title)
      .text(title, { width: W, lineGap: 2 });

    doc.y += 10;
    rule(doc, "#4361ee", 10);

    // ── Source URL ────────────────────────────────────────────────────────────
    doc.fontSize(8.5).font("Helvetica-Oblique").fillColor(C.url)
      .text(url, { width: W, link: url, underline: true, lineGap: 1 });

    doc.y += 12;

    // ── Description ───────────────────────────────────────────────────────────
    if (content.description && doc.y < limit) {
      doc.fontSize(11).font("Helvetica-Oblique").fillColor(C.desc)
        .text(content.description.trim(), { width: W, lineGap: 2, align: "left" });

      doc.y += 12;
    }

    // ── Headings summary ──────────────────────────────────────────────────────
    const allHeadings = [
      ...content.headings.h1,
      ...content.headings.h2,
      ...content.headings.h3,
    ].slice(0, 12);

    if (allHeadings.length > 0 && doc.y < limit) {
      this.sectionLabel("CONTENTS OVERVIEW", W);

      doc.y += 6;

      for (const h of allHeadings) {
        if (doc.y >= limit) break;

        const bulletX = MARGIN;
        const textX = MARGIN + 14;
        const y = doc.y;

        // Bullet dot — drawn absolutely, no cursor movement
        doc.circle(bulletX + 3, y + 5, 2).fill(C.section);

        doc.fontSize(10).font("Helvetica").fillColor(C.body)
          .text(h, textX, y, { width: W - 14, lineGap: 2 });

        // Advance by 4pt padding between bullet items
        doc.y += 4;
      }

      // text() with an explicit x leaves the cursor's x at that position; move
      // it back to the margin so the following full-width text is not shifted
      // right and pushed past the right margin.
      doc.x = MARGIN;
      doc.y += 8;

      if (doc.y < limit) rule(doc, C.rule, 10);
    }

    // ── Page Content ──────────────────────────────────────────────────────────
    if (content.blocks && content.blocks.length > 0) {
      this.sectionLabel("PAGE CONTENT", W);

      doc.y += 8;

      for (const block of content.blocks) {
        if (doc.y >= limit) {
          doc.addPage();
        }

        if (block.type === "heading" && block.text) {
          const headingSize = block.level === 1 ? 16 : block.level === 2 ? 14 : 12;

          // Select the heading font BEFORE measuring; otherwise the height is
          // computed with whatever font the previous block left active.
          doc.fontSize(headingSize).font("Helvetica-Bold");
          const headingHeight = doc.heightOfString(block.text, { width: W, lineGap: 2 });

          // Keep a heading together with at least ~40pt of following content.
          if (doc.y + headingHeight + 40 > limit) {
            doc.addPage();
          }

          doc.fontSize(headingSize).font("Helvetica-Bold").fillColor(C.title)
            .text(block.text, { width: W, lineGap: 2 });
          doc.y += 6;
        } else if (block.type === "paragraph" && block.text) {
          const text = block.text.trim();
          if (!text) continue;

          // Require room for roughly one line; longer paragraphs are left to
          // pdfkit's own pagination.
          if (doc.y + 20 > limit) {
            doc.addPage();
          }

          doc.fontSize(10).font("Helvetica").fillColor(C.body)
            .text(text, { width: W, lineGap: 3 });
          doc.y += 8;
        } else if (block.type === "list" && block.items && block.items.length > 0) {
          if (doc.y + 20 > limit) {
            doc.addPage();
          }

          for (const item of block.items) {
            const itemText = item.trim();
            if (!itemText) continue;

            const bulletX = MARGIN + 10;
            const textX = MARGIN + 22;

            // Measure with the font the item is actually rendered in.
            doc.fontSize(9.5).font("Helvetica");
            const itemHeight = doc.heightOfString(itemText, { width: W - 22, lineGap: 2 });

            if (doc.y + itemHeight > limit) {
              doc.addPage();
            }

            const y = doc.y;
            doc.circle(bulletX + 3, y + 5, 2).fill(C.body);

            doc.fontSize(9.5).font("Helvetica").fillColor(C.body)
              .text(itemText, textX, y, { width: W - 22, lineGap: 2 });
            doc.y += 4;
          }

          // Undo the list indentation for whatever block comes next.
          doc.x = MARGIN;
          doc.y += 4;
        } else if (block.type === "image" && block.src) {
          try {
            const imageBuffer = await downloadImage(block.src);
            const maxImageHeight = 200;

            if (doc.y + maxImageHeight + 20 > limit) {
              doc.addPage();
            }

            // Place the image at an explicit position and then set the cursor
            // absolutely: doc.image() may already advance the cursor by the
            // rendered height, so adding the box height on top of that would
            // leave a large blank gap below every image.
            const imageTop = doc.y;
            doc.image(imageBuffer, MARGIN, imageTop, {
              fit: [W, maxImageHeight],
              align: "center",
            });
            doc.x = MARGIN;
            doc.y = imageTop + maxImageHeight + 10;

            if (block.alt) {
              doc.fontSize(8.5).font("Helvetica-Oblique").fillColor(C.desc)
                .text(block.alt, { width: W, align: "center" });
              doc.y += 8;
            }
          } catch {
            // Download or decoding failed — render a placeholder box instead of
            // aborting the whole chapter.
            const fallbackText = `[Image: ${block.alt || "No description available"} (${block.src})]`;
            const boxHeight = 40;

            if (doc.y + boxHeight > limit) {
              doc.addPage();
            }

            const currentY = doc.y;
            doc.rect(MARGIN, currentY, W, boxHeight)
              .strokeColor(C.rule)
              .lineWidth(0.5)
              .stroke();

            // Clip long sources (e.g. data: URIs) so the text stays inside the box.
            doc.fontSize(9).font("Helvetica-Oblique").fillColor(C.truncated)
              .text(fallbackText, MARGIN + 10, currentY + 14, {
                width: W - 20,
                align: "center",
                height: boxHeight - 14,
                ellipsis: true,
              });

            doc.x = MARGIN;
            doc.y = currentY + boxHeight + 10;
          }
        }
      }
    } else if (content.textContent && doc.y < limit) {
      this.sectionLabel("PAGE CONTENT", W);

      doc.y += 8;

      const raw = normaliseText(content.textContent);
      const body = raw.slice(0, MAX_TEXT_CHARS);
      const truncated = raw.length > MAX_TEXT_CHARS;

      doc.fontSize(10.5).font("Helvetica").fillColor(C.body)
        .text(body, {
          width: W,
          lineGap: 3,
          align: "left",
        });

      if (truncated && doc.y < limit) {
        doc.y += 8;
        doc.fontSize(8.5).font("Helvetica-Oblique").fillColor(C.truncated)
          .text("[ content truncated for brevity ]", { width: W, align: "center" });
      }
    }
  }

  /**
   * Draws the running header and "Page X of Y" footer on every buffered page
   * except the cover, ends the document and resolves once the file stream has
   * finished writing (rejects on a stream error).
   */
  async finish(): Promise<void> {
    const doc = this.doc;
    const { start, count } = doc.bufferedPageRange();
    const numberedPages = count - 1; // the cover page is not numbered

    // offset 0 is the cover page, which gets neither header nor footer.
    for (let offset = 1; offset < count; offset++) {
      doc.switchToPage(start + offset);

      const W = doc.page.width - MARGIN * 2;
      const footerY = doc.page.height - MARGIN - FOOTER_HEIGHT + 8;

      // Draw running header
      doc.fontSize(8).font("Helvetica").fillColor(C.badge)
        .text("https://github.com/lightning4747/Web-crawler-cli", MARGIN, MARGIN - 24, {
          width: W,
          align: "left",
          link: "https://github.com/lightning4747/Web-crawler-cli",
        });

      doc
        .moveTo(MARGIN, MARGIN - 14)
        .lineTo(doc.page.width - MARGIN, MARGIN - 14)
        .strokeColor(C.rule)
        .lineWidth(0.4)
        .stroke();

      // Draw running footer
      doc
        .moveTo(MARGIN, footerY)
        .lineTo(doc.page.width - MARGIN, footerY)
        .strokeColor(C.rule)
        .lineWidth(0.4)
        .stroke();

      // The footer label lives inside the reserved bottom margin. pdfkit can
      // start a new page when text is placed below the bottom margin, so the
      // margin is lifted while the label is drawn and restored afterwards.
      const savedBottomMargin = doc.page.margins.bottom;
      doc.page.margins.bottom = 0;
      doc.fontSize(8).font("Helvetica").fillColor(C.footer)
        .text(`Page ${offset} of ${numberedPages}`, MARGIN, footerY + 5, {
          width: W,
          align: "center",
          lineBreak: false,
        });
      doc.page.margins.bottom = savedBottomMargin;
    }

    await new Promise<void>((resolve, reject) => {
      // Listeners are attached before end() so neither event can be missed.
      this.stream.once("finish", resolve);
      this.stream.once("error", reject);
      this.doc.end();
    });
    console.log(`[PDF] Done — ${this.pageCount} chapter(s) written to ${this.pdfPath}`);
  }
}
@@ -0,0 +1,16 @@
1
+ import type { CrawledPageContent } from "../db/queries.js";
2
+
3
/**
 * Contract for an output destination.
 *
 * An implementation decides where and how extracted page content is persisted.
 * The methods form a lifecycle: `init` once, `save` per page, `finish` once.
 */
export interface OutputStrategy {
  /** Prepares the destination; invoked a single time before crawling starts. */
  init(): Promise<void>;

  /**
   * Persists one successfully crawled page.
   *
   * @param urlId   Identifier of the crawled URL.
   * @param url     The crawled URL.
   * @param content Content extracted from the page.
   */
  save(urlId: number, url: string, content: CrawledPageContent): Promise<void>;

  /** Flushes and closes any open resources; invoked a single time after crawling ends. */
  finish(): Promise<void>;
}
@@ -0,0 +1,48 @@
1
+ import { lookup } from "node:dns/promises";
2
+
3
/**
 * IPv4 prefixes (dotted-decimal form) that must never be requested.
 * Kept separate so the same list can be reused for IPv4-mapped IPv6 addresses.
 */
const BLOCKED_IPV4_RANGES: RegExp[] = [
  /^127\./, // 127.0.0.0/8    loopback
  /^10\./, // 10.0.0.0/8     private
  /^172\.(1[6-9]|2\d|3[01])\./, // 172.16.0.0/12  private
  /^192\.168\./, // 192.168.0.0/16 private
  /^169\.254\./, // 169.254.0.0/16 link-local / cloud IMDS (AWS, GCP, Azure)
  /^0\./, // 0.0.0.0/8      current network
  /^100\.(6[4-9]|[7-9]\d|1[01]\d|12[0-7])\./, // 100.64.0.0/10  shared address space (RFC 6598)
];

/**
 * IP range patterns that must never be requested.
 * Covers loopback, private, link-local, shared and cloud metadata addresses
 * for IPv4, their IPv4-mapped IPv6 spellings, and the IPv6 loopback,
 * unspecified, unique-local and link-local ranges.
 */
const BLOCKED_IP_RANGES: RegExp[] = [
  ...BLOCKED_IPV4_RANGES,

  // IPv4-mapped IPv6 in dotted form ("::ffff:10.0.0.1"): apply the IPv4 rules
  // to the embedded address (the leading "^" of each source is re-anchored).
  ...BLOCKED_IPV4_RANGES.map((r) => new RegExp(`^::ffff:${r.source.slice(1)}`, "i")),

  // IPv4-mapped IPv6 in hex form ("::ffff:7f00:1") cannot be classified by a
  // prefix match, so it is rejected outright (fail closed).
  /^::ffff:[0-9a-f]{1,4}:[0-9a-f]{1,4}$/i,

  /^::1$/, // IPv6 loopback
  /^::$/, // IPv6 unspecified address
  /^f[cd][0-9a-f]{2}:/i, // fc00::/7  IPv6 unique local (fc00–fdff)
  /^fe[89ab][0-9a-f]:/i, // fe80::/10 IPv6 link-local (fe80–febf)
];
19
+
20
/**
 * Host names rejected by exact (lower-case) match, without consulting DNS.
 */
const BLOCKED_HOSTNAMES = new Set<string>([
  "localhost",
  "metadata.google.internal",
  "169.254.169.254",
]);
28
+
29
/**
 * Returns true if the hostname is, or resolves to, a private/internal address.
 *
 * Checks, in order: the hostname blocklist, the blocked IP patterns applied to
 * the hostname itself (for raw IP literals), and finally every address the
 * hostname resolves to. Fails closed — an empty hostname, a failed DNS lookup
 * or an empty result set all count as blocked.
 *
 * NOTE(review): this vets the addresses seen at check time only; a caller that
 * resolves the name again when connecting may get a different answer.
 *
 * @param hostname Host name or IP literal to vet.
 * @returns `true` when the host must not be requested.
 */
export async function isBlockedAddress(hostname: string): Promise<boolean> {
  // Normalise for the static checks: case-insensitive, and without the
  // trailing root dot so "localhost." cannot slip past the hostname blocklist.
  const host = hostname.toLowerCase().replace(/\.$/, "");

  if (host.length === 0) return true;
  if (BLOCKED_HOSTNAMES.has(host)) return true;

  const matchesBlockedRange = (ip: string): boolean =>
    BLOCKED_IP_RANGES.some((r) => r.test(ip));

  // Reject raw IP literals that match blocked ranges without a DNS lookup
  if (matchesBlockedRange(host)) return true;

  try {
    // Inspect every record, not just the first: a name with one public and
    // one private address must still be rejected.
    const records = await lookup(hostname, { all: true });
    if (records.length === 0) return true;
    return records.some(({ address }) => matchesBlockedRange(address));
  } catch {
    // DNS resolution failed — fail closed
    return true;
  }
}
@@ -0,0 +1,49 @@
1
/**
 * Checks that `raw` parses as a URL, uses the http or https scheme and has a
 * hostname.
 *
 * @param raw Candidate URL string.
 * @returns `{ url, error: null }` with the parsed URL when valid, otherwise
 *          `{ url: null, error }` with a human-readable reason.
 */
export function validateSeedUrl(raw: string): { url: URL; error: null } | { url: null; error: string } {
  const reject = (error: string): { url: null; error: string } => ({ url: null, error });

  let candidate: URL;
  try {
    candidate = new URL(raw);
  } catch {
    return reject(`"${raw}" is not a valid URL`);
  }

  const scheme = candidate.protocol;
  const isWebScheme = scheme === "http:" || scheme === "https:";
  if (!isWebScheme) {
    return reject(`"${raw}" uses scheme "${scheme.replace(":", "")}" — only http and https are allowed`);
  }

  if (!candidate.hostname) {
    return reject(`"${raw}" has no hostname`);
  }

  return { url: candidate, error: null };
}
27
+
28
/**
 * Runs `validateSeedUrl` over each input and partitions the results.
 *
 * @param raws Candidate URL strings.
 * @returns `valid` — the inputs that passed, unchanged and in input order;
 *          `invalid` — one `{ input, reason }` entry per rejected input.
 */
export function validateSeedUrls(raws: string[]): {
  valid: string[];
  invalid: Array<{ input: string; reason: string }>;
} {
  const accepted: string[] = [];
  const rejected: Array<{ input: string; reason: string }> = [];

  raws.forEach((input) => {
    const { error } = validateSeedUrl(input);
    if (error) {
      rejected.push({ input, reason: error });
      return;
    }
    accepted.push(input);
  });

  return { valid: accepted, invalid: rejected };
}
package/src/seed.ts ADDED
@@ -0,0 +1,18 @@
1
+ import { config } from "./config.js";
2
+ import { query } from "./db/client.js";
3
+ import { getDomain } from "./normalizer.js";
4
+
5
/**
 * Inserts every configured seed URL into the `urls` table as a PENDING entry
 * at depth 0. URLs for which no domain can be derived are skipped, and URLs
 * already present are left untouched (ON CONFLICT DO NOTHING).
 * Inserts are awaited one at a time, in configuration order.
 */
export async function seedDatabase() {
  for (const seedUrl of config.SEED_URLS) {
    const domain = getDomain(seedUrl);
    if (domain) {
      await query(
        `INSERT INTO urls (url, domain, status, depth)
         VALUES ($1, $2, 'PENDING', 0)
         ON CONFLICT (url) DO NOTHING`,
        [seedUrl, domain]
      );
    }
  }
}
package/src/setup.ts ADDED
@@ -0,0 +1,170 @@
1
+ /**
2
+ * Standalone CLI configuration wizard.
3
+ *
4
+ * Usage: npm run config
5
+ *
6
+ * Steps:
7
+ * 1. Reads seed URLs from seeds.txt
8
+ * 2. Prompts for crawler performance settings and output mode
9
+ * 3. Writes updated values to .env
10
+ * 4. Patches SEED_URLS and ALLOWED_DOMAINS in src/config.ts
11
+ */
12
+
13
+ import { select, number, confirm } from "@inquirer/prompts";
14
+ import fs from "fs";
15
+ import path from "path";
16
+ import { validateSeedUrls } from "./security/validate-url.js";
17
+
18
/** Text file listing the seed URLs, read relative to the working directory. */
const SEEDS_FILE = "seeds.txt";

/** Environment file that receives the crawler settings as KEY=value lines. */
const ENV_FILE = ".env";

/** TypeScript config module that the wizard patches in place. */
const CONFIG_FILE = path.join("src", "config.ts");
21
+
22
+ // ─── Helpers ─────────────────────────────────────────────────────────────────
23
+
24
/**
 * Loads seed URLs from SEEDS_FILE.
 *
 * Blank lines and lines starting with `#` are ignored; the remaining entries
 * are validated, and each rejected entry is reported with a warning.
 *
 * @returns The entries that passed validation, or an empty list when the file
 *          does not exist.
 */
function readSeedsFile(): string[] {
  if (!fs.existsSync(SEEDS_FILE)) {
    console.warn(`[setup] ${SEEDS_FILE} not found. No seed URLs will be loaded.`);
    return [];
  }

  const candidates: string[] = [];
  for (const line of fs.readFileSync(SEEDS_FILE, "utf-8").split("\n")) {
    const entry = line.trim();
    const isBlank = entry.length === 0;
    if (isBlank || entry.startsWith("#")) continue;
    candidates.push(entry);
  }

  const { valid, invalid } = validateSeedUrls(candidates);

  if (invalid.length > 0) {
    console.warn(`[setup] Skipping ${invalid.length} invalid URL(s) in ${SEEDS_FILE}:`);
    invalid.forEach(({ reason }) => {
      console.warn(` ✗ ${reason}`);
    });
  }

  return valid;
}
46
+
47
/**
 * Collects the distinct hostnames of the given URLs, in order of first
 * appearance. Entries that cannot be parsed as URLs, or that have an empty
 * hostname, contribute nothing.
 */
function extractDomains(urls: string[]): string[] {
  const hostnames = new Set<string>();

  for (const url of urls) {
    let hostname: string;
    try {
      hostname = new URL(url).hostname;
    } catch {
      // Not a parseable URL — skip it.
      continue;
    }
    if (hostname) hostnames.add(hostname);
  }

  return Array.from(hostnames);
}
56
+
57
/**
 * Merges `values` into ENV_FILE, creating the file when it does not exist.
 *
 * The existing file is updated in place rather than regenerated: comments,
 * blank lines and entries for keys not present in `values` are kept exactly
 * as written, so hand-maintained notes and unrelated settings (e.g. database
 * credentials) survive a re-run of the wizard. A key present in `values` is
 * rewritten at its first occurrence (later duplicates of that key are
 * dropped); keys not yet in the file are appended at the end.
 *
 * @param values Settings to write, as KEY → value.
 */
function writeEnvFile(values: Record<string, string>): void {
  const updates = new Map(Object.entries(values));
  const written = new Set<string>();
  const output: string[] = [];

  if (fs.existsSync(ENV_FILE)) {
    const lines = fs.readFileSync(ENV_FILE, "utf-8").split(/\r?\n/);
    // A file ending in a newline yields one trailing empty element; drop it
    // so the final join does not accumulate blank lines across runs.
    if (lines.length > 0 && lines[lines.length - 1] === "") lines.pop();

    for (const line of lines) {
      const trimmed = line.trim();
      const eqIdx = trimmed.indexOf("=");

      // Comments, blank lines and anything that is not KEY=value pass through.
      if (!trimmed || trimmed.startsWith("#") || eqIdx === -1) {
        output.push(line);
        continue;
      }

      const key = trimmed.slice(0, eqIdx).trim();
      const replacement = updates.get(key);
      if (replacement === undefined) {
        output.push(line);
        continue;
      }

      if (!written.has(key)) {
        output.push(`${key}=${replacement}`);
        written.add(key);
      }
    }
  }

  for (const [key, value] of updates) {
    if (!written.has(key)) output.push(`${key}=${value}`);
  }

  fs.writeFileSync(ENV_FILE, output.join("\n") + "\n", "utf-8");
  console.log(`[setup] .env updated.`);
}
75
+
76
/**
 * Rewrites the SEED_URLS and ALLOWED_DOMAINS array literals and the
 * OUTPUT_MODE string in CONFIG_FILE. Does nothing (beyond a warning) when the
 * file does not exist.
 *
 * Values are emitted with JSON.stringify so that quotes or backslashes in a
 * URL cannot terminate the string literal and inject code into the generated
 * module, and all replacements go through replacer functions so `$` sequences
 * in a value are not interpreted as replacement patterns.
 *
 * @param seedUrls       URLs written to SEED_URLS.
 * @param allowedDomains Hostnames written to ALLOWED_DOMAINS.
 * @param outputMode     Value written to OUTPUT_MODE.
 */
function patchConfigTs(seedUrls: string[], allowedDomains: string[], outputMode: string): void {
  if (!fs.existsSync(CONFIG_FILE)) {
    console.warn(`[setup] ${CONFIG_FILE} not found — skipping patch.`);
    return;
  }
  let src = fs.readFileSync(CONFIG_FILE, "utf-8");

  // An empty list must render as `[]`; joining zero items would otherwise
  // leave a lone comma, i.e. an array containing a hole.
  const toArrayLiteral = (items: string[]): string =>
    items.length === 0
      ? "[]"
      : "[\n" + items.map((item) => ` ${JSON.stringify(item)}`).join(",\n") + ",\n ]";

  // Array matcher that steps over quoted strings, so a `]` inside a value
  // (e.g. "http://[::1]/") does not end the match early.
  const stringAwareArray = /\[(?:"(?:[^"\\]|\\.)*"|'(?:[^'\\]|\\.)*'|[^\]"'])*\]/.source;
  // Simple lazy matcher, used when the string-aware one cannot match (for
  // instance when a comment inside the array contains an unbalanced quote).
  const lazyArray = /\[[\s\S]*?\]/.source;

  const replaceArray = (source: string, key: string, literal: string): string => {
    const strict = new RegExp(`${key}:\\s*${stringAwareArray}`);
    const lenient = new RegExp(`${key}:\\s*${lazyArray}`);
    const pattern = strict.test(source) ? strict : lenient;
    if (!pattern.test(source)) {
      console.warn(`[setup] ${key} array not found in ${CONFIG_FILE} — left unchanged.`);
      return source;
    }
    return source.replace(pattern, () => `${key}: ${literal}`);
  };

  src = replaceArray(src, "SEED_URLS", toArrayLiteral(seedUrls));
  src = replaceArray(src, "ALLOWED_DOMAINS", toArrayLiteral(allowedDomains));

  const outputModeLiteral = JSON.stringify(outputMode);
  if (/OUTPUT_MODE:/.test(src)) {
    src = src.replace(/OUTPUT_MODE:\s*["'][^"']*["']/, () => `OUTPUT_MODE: ${outputModeLiteral}`);
  } else {
    // No OUTPUT_MODE entry yet: add one just before the first closing `};`.
    src = src.replace(/(\n};)/, () => `\n OUTPUT_MODE: ${outputModeLiteral},\n};`);
  }

  fs.writeFileSync(CONFIG_FILE, src, "utf-8");
  console.log(`[setup] src/config.ts patched.`);
}
98
+
99
+ // ─── Main wizard ──────────────────────────────────────────────────────────────
100
+
101
/**
 * Runs the interactive setup wizard: loads the seed URLs, prompts for the
 * output mode and crawler limits, and — after confirmation — writes the
 * answers to the .env file and patches the config module.
 * Exits the process with status 0 when the user declines to save.
 */
async function main() {
  const banner = [
    "\n╔══════════════════════════════════════════╗",
    "║ Web Crawler — Interactive Setup Wizard ║",
    "╚══════════════════════════════════════════╝\n",
  ];
  for (const line of banner) console.log(line);

  const seedUrls = readSeedsFile();
  if (seedUrls.length > 0) {
    console.log(`[setup] Found ${seedUrls.length} seed URL(s):`);
    for (const u of seedUrls) console.log(` ${u}`);
    console.log();
  } else {
    console.warn(`[setup] No URLs found in ${SEEDS_FILE}. Add target URLs and re-run.\n`);
  }

  // Shared validators for the numeric prompts.
  const zeroOrMore = (v: number | undefined) =>
    v !== undefined && v >= 0 ? true : "Must be 0 or greater";
  const oneOrMore = (v: number | undefined) =>
    v !== undefined && v >= 1 ? true : "Must be at least 1";

  const outputMode = await select<string>({
    message: "OUTPUT_MODE — where should crawled data be stored?",
    choices: [
      { name: "PostgreSQL database (structured data)", value: "database" },
      { name: "PDF eBook (compiled document)", value: "pdf" },
    ],
    default: "database",
  });

  const maxDepth = await number({
    message: "MAX_DEPTH — link hops from seed URLs:",
    default: 3,
    validate: zeroOrMore,
  });

  const crawlDelayMs = await number({
    message: "CRAWL_DELAY_MS — politeness delay per domain (ms):",
    default: 1000,
    validate: zeroOrMore,
  });

  const workerCount = await number({
    message: "WORKER_COUNT — concurrent workers:",
    default: 5,
    validate: oneOrMore,
  });

  const maxPages = await number({
    message: "MAX_PAGES — page limit (0 = unlimited):",
    default: 1000,
    validate: zeroOrMore,
  });

  const shouldSave = await confirm({ message: "Save these settings?", default: true });
  if (!shouldSave) {
    console.log("\nAborted.\n");
    process.exit(0);
  }

  const envValues: Record<string, string> = {
    MAX_DEPTH: String(maxDepth),
    CRAWL_DELAY_MS: String(crawlDelayMs),
    WORKER_COUNT: String(workerCount),
    MAX_PAGES: String(maxPages),
    OUTPUT_MODE: outputMode,
  };
  writeEnvFile(envValues);

  const allowedDomains = extractDomains(seedUrls);
  patchConfigTs(seedUrls, allowedDomains, outputMode);

  console.log("\n✓ Configuration saved. Run npm run crawl to start.\n");
}

// Script entry point: any unhandled failure is reported and exits with status 1.
main().catch((err) => {
  console.error("[setup] Fatal error:", err);
  process.exit(1);
});