messi-crawler 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73) hide show
  1. package/README.md +201 -0
  2. package/dist/cli/renderer.js +71 -0
  3. package/dist/config.js +18 -0
  4. package/dist/db/clear.js +16 -0
  5. package/dist/db/client.js +20 -0
  6. package/dist/db/queries.js +179 -0
  7. package/dist/frontier/frontier.js +44 -0
  8. package/dist/frontier/logger.js +65 -0
  9. package/dist/frontier/robots.js +46 -0
  10. package/dist/frontier/scheduler.js +98 -0
  11. package/dist/index.js +533 -0
  12. package/dist/normalizer.js +33 -0
  13. package/dist/output/db-strategy.js +16 -0
  14. package/dist/output/index.js +23 -0
  15. package/dist/output/pdf-strategy.js +316 -0
  16. package/dist/output/strategy.js +1 -0
  17. package/dist/security/ssrf.js +45 -0
  18. package/dist/security/validate-url.js +41 -0
  19. package/dist/seed.js +14 -0
  20. package/dist/setup.js +148 -0
  21. package/dist/test/client.test.js +33 -0
  22. package/dist/test/downloader.test.js +84 -0
  23. package/dist/test/extractor.test.js +126 -0
  24. package/dist/test/frontier.test.js +43 -0
  25. package/dist/test/logger.test.js +55 -0
  26. package/dist/test/normalizer.test.js +36 -0
  27. package/dist/test/pdf-strategy.test.js +68 -0
  28. package/dist/test/queries.test.js +173 -0
  29. package/dist/test/robots.test.js +46 -0
  30. package/dist/test/scheduler.test.js +73 -0
  31. package/dist/test/seed.test.js +26 -0
  32. package/dist/test/worker.test.js +118 -0
  33. package/dist/worker/downloader.js +114 -0
  34. package/dist/worker/extractor.js +197 -0
  35. package/dist/worker/worker.js +87 -0
  36. package/package.json +48 -0
  37. package/seeds.txt +4 -0
  38. package/src/cli/renderer.ts +83 -0
  39. package/src/config.ts +22 -0
  40. package/src/db/clear.ts +16 -0
  41. package/src/db/client.ts +26 -0
  42. package/src/db/queries.ts +255 -0
  43. package/src/db/schema.sql +43 -0
  44. package/src/frontier/frontier.ts +60 -0
  45. package/src/frontier/logger.ts +75 -0
  46. package/src/frontier/robots.ts +50 -0
  47. package/src/frontier/scheduler.ts +119 -0
  48. package/src/index.ts +596 -0
  49. package/src/normalizer.ts +37 -0
  50. package/src/output/db-strategy.ts +20 -0
  51. package/src/output/index.ts +32 -0
  52. package/src/output/pdf-strategy.ts +388 -0
  53. package/src/output/strategy.ts +16 -0
  54. package/src/security/ssrf.ts +48 -0
  55. package/src/security/validate-url.ts +49 -0
  56. package/src/seed.ts +18 -0
  57. package/src/setup.ts +170 -0
  58. package/src/test/client.test.ts +38 -0
  59. package/src/test/downloader.test.ts +101 -0
  60. package/src/test/extractor.test.ts +139 -0
  61. package/src/test/frontier.test.ts +53 -0
  62. package/src/test/logger.test.ts +71 -0
  63. package/src/test/normalizer.test.ts +43 -0
  64. package/src/test/pdf-strategy.test.ts +84 -0
  65. package/src/test/queries.test.ts +247 -0
  66. package/src/test/robots.test.ts +56 -0
  67. package/src/test/scheduler.test.ts +90 -0
  68. package/src/test/seed.test.ts +35 -0
  69. package/src/test/worker.test.ts +144 -0
  70. package/src/worker/downloader.ts +149 -0
  71. package/src/worker/extractor.ts +235 -0
  72. package/src/worker/worker.ts +100 -0
  73. package/tsconfig.json +15 -0
package/src/index.ts ADDED
@@ -0,0 +1,596 @@
1
+ #!/usr/bin/env node
2
+ import { input, select } from "@inquirer/prompts";
3
+ import * as readline from "node:readline/promises";
4
+ import { stdin as input$, stdout } from "node:process";
5
+ import fs from "fs";
6
+ import { seedDatabase } from "./seed.js";
7
+ import { resetStaleLocks, clearPendingURLs } from "./db/queries.js";
8
+ import { startScheduler } from "./frontier/scheduler.js";
9
+ import { pool } from "./db/client.js";
10
+ import { config } from "./config.js";
11
+ import { createStrategy, setStrategy, type OutputMode } from "./output/index.js";
12
+ import { validateSeedUrls } from "./security/validate-url.js";
13
+
14
/** Relative path of the optional seed list; read line by line by `readSeedsFile`. */
const SEEDS_FILE = "seeds.txt";

/** Smallest crawl delay (in milliseconds) the CLI will run with. */
const MIN_CRAWL_DELAY_MS = 500;

/** Number of columns between the vertical borders of every boxed panel. */
const INNER_WIDTH = 72;
19
+
20
+ // ─── Terminal primitives ──────────────────────────────────────────────────────
21
+
22
const ESC = "\x1b[";
const RESET = "\x1b[0m";

/**
 * Colours `text` with a left-to-right green-to-blue 24-bit gradient.
 *
 * Every visible character is prefixed with one of eight foreground colour
 * codes chosen by its relative position, and a single reset sequence is
 * appended at the end.
 *
 * @param text - Plain text to colour.
 * @returns The coloured string; for empty input only the reset sequence.
 */
const gradient = (text: string): string => {
  const colors = [
    "\x1b[38;2;0;255;127m", // Spring Green
    "\x1b[38;2;0;255;150m",
    "\x1b[38;2;0;230;180m",
    "\x1b[38;2;0;200;200m",
    "\x1b[38;2;0;170;220m",
    "\x1b[38;2;0;140;240m",
    "\x1b[38;2;0;100;255m", // Dodger Blue
    "\x1b[38;2;0;70;255m",
  ];

  // Iterate by code point, not UTF-16 unit: splitting a surrogate pair would
  // put an escape sequence between its two halves and corrupt the character.
  const glyphs = Array.from(text);

  let result = "";
  glyphs.forEach((glyph, i) => {
    const colorIndex = Math.floor((i / glyphs.length) * colors.length);
    result += colors[colorIndex % colors.length] + glyph;
  });
  return result + RESET;
};
46
+
47
+ // Clean structural geometric patterns
48
/** Decorative glyphs used as bullets and panel ornaments. */
const STARS = {
  full: "✦",
  half: "✧",
  small: "⋆",
  star: "★",
  outline: "☆",
};

/**
 * Builds a colouring function that wraps its argument in the given SGR
 * parameter string and terminates it with a reset.
 */
const paint = (sgr: string) => (s: string): string => `\x1b[${sgr}m${s}${RESET}`;

const G = paint("32"); // green
const DG = paint("2;32"); // dim green
const YL = paint("33"); // yellow
const RD = paint("31"); // red
const W = paint("97"); // bright white
const DIM = paint("2"); // dim
const BOLD = paint("1"); // bold
const BLUE = paint("34"); // blue
const CYAN = paint("36"); // cyan

/** Box-drawing characters: corners, edges, and junctions. */
const BOX = {
  tl: "┌",
  tr: "┐",
  bl: "└",
  br: "┘",
  h: "─",
  v: "│",
  lm: "├",
  rm: "┤",
  tm: "┬",
  bm: "┴",
  cross: "┼",
};
71
+
72
/**
 * Returns the length of `s` once every SGR colour/style escape sequence
 * (`ESC [ ... m`) has been removed. The count is in UTF-16 code units.
 */
function getVisibleLength(s: string): number {
  const withoutSgr = s.replace(/\x1b\[[0-9;]*m/g, "");
  return withoutSgr.length;
}
75
+
76
+ // ─── Box Layout Engine ────────────────────────────────────────────────────────
77
+
78
/**
 * Renders the top edge of a panel with `label` embedded after the corner.
 * The rule is filled so that the visible line is `INNER_WIDTH + 4` columns
 * wide; a label too long to fit simply gets no fill.
 */
function drawTopBorder(label: string): string {
  const targetWidth = INNER_WIDTH + 4;
  const head = ` ┌─ ${label} `;
  const fillCount = Math.max(0, targetWidth - getVisibleLength(head) - 1);
  return DG(`${head}${BOX.h.repeat(fillCount)}┐`);
}
84
+
85
/** Renders the bottom edge of a panel: two corners joined by `INNER_WIDTH` rule characters. */
function drawBottomBorder(): string {
  const rule = BOX.h.repeat(INNER_WIDTH);
  return DG(` └${rule}┘`);
}
88
+
89
/**
 * Renders one two-column row of a panel.
 *
 * @param leftContent - Label for the left column (may contain colour codes); padded to 16 visible columns.
 * @param rightContentRendered - Right-column text exactly as it should be printed (may contain colour codes).
 * @param rightContentRawLength - Visible length of the right-column text, used to compute the trailing padding.
 * @returns The row including both vertical borders. Over-long content is not truncated.
 */
function drawRow(leftContent: string, rightContentRendered: string, rightContentRawLength: number): string {
  const labelGap = Math.max(0, 16 - getVisibleLength(leftContent));
  const trailingGap = Math.max(0, INNER_WIDTH - 18 - rightContentRawLength);

  return [
    DG(" │"),
    " ",
    leftContent,
    " ".repeat(labelGap),
    rightContentRendered,
    " ".repeat(trailingGap),
    DG("│"),
  ].join("");
}
97
+
98
+ // ─── Background Glowing Animation Engine ──────────────────────────────────────
99
+
100
/** Handle of the running animation timer, or `null` while no animation is active. */
let glowInterval: NodeJS.Timeout | null = null;
/** Text currently being animated; the empty string means "no status line". */
let currentGlowText = "";
/** Monotonically increasing frame counter; reduced modulo the frame count when drawing. */
let glowFrameIndex = 0;

// Rising half of the pulse. The base values are deliberately not too dark so
// the text never appears to vanish.
const glowRamp = [
  "\x1b[38;2;0;140;100m", // Baseline low (Crisp Medium Mint)
  "\x1b[38;2;0;175;130m",
  "\x1b[38;2;0;205;155m",
  "\x1b[38;2;0;235;185m",
  "\x1b[38;2;0;255;210m", // Vivid Cyan-Green
  "\x1b[38;2;100;255;225m", // High Glow
  "\x1b[38;2;180;255;245m", // Peak Brightness
];

// Full cycle: ramp up, then back down without repeating the two end points.
const glowFrames = [...glowRamp, ...glowRamp.slice(1, -1).reverse()];
119
+
120
/**
 * Repaints the animated status line in place using the current frame colour,
 * then advances the frame counter. Does nothing while no status text is set.
 */
function drawGlowLine(): void {
  if (currentGlowText === "") return;

  const frameColor = glowFrames[glowFrameIndex % glowFrames.length];
  const marker = CYAN(STARS.full);
  const label = BOLD(currentGlowText);

  // "\r\x1b[K" returns to column 0 and erases the line before redrawing.
  process.stdout.write(`\r\x1b[K ${marker} ${frameColor}${label}${RESET}`);
  glowFrameIndex += 1;
}
126
+
127
/**
 * Starts (or restarts) the animated status line.
 *
 * Hides the terminal cursor, draws the first frame immediately, and schedules
 * a redraw every 120 ms until `stopGlowStatus` is called.
 *
 * @param text - Status text to animate.
 */
export function startGlowStatus(text: string): void {
  // A second start without an intervening stop must not leave the previous
  // timer running: its handle would be overwritten and could never be cleared.
  if (glowInterval) {
    clearInterval(glowInterval);
    glowInterval = null;
  }

  currentGlowText = text;
  glowFrameIndex = 0;
  // Hide native terminal cursor to keep rendering clean
  process.stdout.write("\x1b[?25l");
  drawGlowLine();
  glowInterval = setInterval(drawGlowLine, 120); // Frequency locked at 120ms
}
135
+
136
/**
 * Stops the animated status line: cancels the timer if one is running,
 * forgets the status text, erases the current line, and shows the cursor again.
 * Safe to call when no animation is active.
 */
export function stopGlowStatus(): void {
  const activeTimer = glowInterval;
  glowInterval = null;
  if (activeTimer) clearInterval(activeTimer);

  currentGlowText = "";
  // Erase the animation line, then re-enable the cursor hidden by startGlowStatus.
  process.stdout.write("\r\x1b[K\x1b[?25h");
}
145
+
146
+ // ─── Logging Helpers (Interception-Aware) ─────────────────────────────────────
147
+
148
/** Current time as a dimmed `HH:MM:SS` stamp (taken from the ISO string, i.e. UTC). */
const ts = (): string => DIM(new Date().toISOString().slice(11, 19));

/**
 * Prints `line` on its own row. While the glow status line is active, that
 * line is erased first and redrawn afterwards so log output never overwrites
 * or interleaves with the animation.
 */
function printAboveGlow(line: string): void {
  const glowing = currentGlowText !== "";
  if (glowing) process.stdout.write("\r\x1b[K");
  console.log(line);
  if (glowing) drawGlowLine();
}

/** Logs a success message with a green check mark. */
export function ok(msg: string): void {
  printAboveGlow(` ${ts()} ${G("✓")} ${msg}`);
}

/** Logs a warning with a yellow warning sign. */
export function warn(msg: string): void {
  printAboveGlow(` ${ts()} ${YL("⚠")} ${msg}`);
}

/** Logs an error with a red cross. */
export function err(msg: string): void {
  printAboveGlow(` ${ts()} ${RD("✗")} ${msg}`);
}

/** Logs a dimmed informational message with a blue info sign. */
export function info(msg: string): void {
  printAboveGlow(` ${ts()} ${BLUE("ℹ")} ${DIM(msg)}`);
}

/** Prints an empty line. */
export function blank(): void {
  printAboveGlow("");
}
179
+
180
+ // ─── Banner ───────────────────────────────────────────────────────────────────
181
+
182
/**
 * Clears the screen and prints the start-up banner: the gradient ASCII-art
 * title with white star accents, a technology tagline with the version, and a
 * horizontal rule.
 */
export function printBanner(): void {
  console.clear();
  blank();

  const artRows = [
    " ██╗ ██╗███████╗██████╗ ✦ ██████╗██████╗ █████╗ ██╗ ██╗██╗ ███████╗██████╗ ",
    " ██║ ██║██╔════╝██╔══██╗ ██╔════╝██╔══██╗██╔══██╗██║ ██║██║ ✦ ██╔════╝██╔══██╗",
    " ██║ █╗ ██║█████╗ ██████╔╝ ✦ ██║ ██████╔╝███████║██║ █╗ ██║██║ █████╗ ██████╔╝",
    " ██║███╗██║██╔══╝ ██╔══██╗ ██║ ██╔══██╗██╔══██║██║███╗██║██║ ★ ██╔══╝ ██╔══██╗",
    " ╚███╔███╔╝███████╗██████╔╝ ★ ╚██████╗██║ ██║██║ ██║╚███╔███╔╝███████╗███████╗██║ ██║",
    " ╚══╝╚══╝ ╚══════╝╚═════╝ ╚═════╝╚═╝ ╚═╝╚═╝ ╚═╝ ╚══╝╚══╝ ╚══════╝╚══════╝╚═╝ ╚═╝",
  ];

  const edge = CYAN(STARS.full);
  artRows
    // Apply the gradient first, then repaint the embedded stars in white.
    .map((row) => gradient(row).replace(/✦/g, W("✦")).replace(/★/g, W("★")))
    .forEach((painted) => console.log(` ${edge} ${painted} ${edge}`));

  blank();
  const bullet = CYAN(STARS.half);
  console.log(` ${bullet} ${gradient("Node.js · TypeScript · PostgreSQL")} ${bullet} ${DIM("v1.0.0")}`);
  console.log(DG(" " + BOX.h.repeat(INNER_WIDTH + 2)));
  blank();
}
206
+
207
+ // ─── Status board ─────────────────────────────────────────────────────────────
208
+
209
/**
 * Prints the "CONFIG" panel showing the current crawl settings.
 *
 * @param s - Settings to display.
 * @param seedsFromFile - URLs read from seeds.txt; only their count is shown.
 */
export function printSummary(s: CrawlSettings, seedsFromFile: string[]): void {
  const pluralSuffix = (count: number): string => (count !== 1 ? "s" : "");

  let seedLabel: string;
  if (s.seedSource === "file") {
    seedLabel = `seeds.txt (${seedsFromFile.length} url${pluralSuffix(seedsFromFile.length)})`;
  } else if (s.seedSource === "custom") {
    seedLabel = `custom (${s.customUrls.length} url${pluralSuffix(s.customUrls.length)})`;
  } else {
    seedLabel = "config.ts";
  }

  const outputLabel = s.outputMode === "pdf" ? "pdf -> output/documentation*.pdf" : "database -> postgresql";
  const delayNote = s.crawlDelayMs < MIN_CRAWL_DELAY_MS ? " ! below safe minimum" : "";
  const pageCap = s.maxPages === 0 ? "unlimited" : String(s.maxPages);

  const rows: Array<[string, string]> = [
    ["output", outputLabel],
    ["seeds", seedLabel],
    ["depth", String(s.maxDepth)],
    ["delay", `${s.crawlDelayMs}ms${delayNote}`],
    ["workers", String(s.workerCount)],
    ["maxpages", pageCap],
    ["start", "Start the crawl"],
  ];

  blank();
  console.log(CYAN(" " + STARS.small + " " + gradient("CONFIGURATION") + " " + STARS.small));
  console.log(drawTopBorder("CONFIG"));

  rows.forEach(([key, value]) => {
    // Values flagged with "!" (currently only the delay warning) are shown in yellow.
    const rendered = value.includes("!") ? YL(value) : gradient(value);
    console.log(drawRow(CYAN(BOLD("/" + key)), rendered, getVisibleLength(value)));
  });

  console.log(drawBottomBorder());
  blank();
}
239
+
240
+ // ─── Help panel ───────────────────────────────────────────────────────────────
241
+
242
/** Prints the "COMMANDS" panel listing every slash command with a one-line description. */
export function printHelp(): void {
  const commandTable: ReadonlyArray<readonly [string, string]> = [
    ["/output", "switch output destination (interactive)"],
    ["/seeds", "choose seed URL source (interactive)"],
    ["/depth <n>", "max link traversal depth"],
    ["/delay <ms>", "politeness delay (min 500)"],
    ["/workers <n>", "concurrent worker count"],
    ["/maxpages <n>", "page cap per session (0 = unlimited)"],
    ["/status", "print current config"],
    ["/help", "show this panel"],
    ["/start", "validate and begin crawl"],
    ["/quit", "exit without crawling"],
  ];

  blank();
  console.log(CYAN(" " + STARS.star + " " + gradient("AVAILABLE COMMANDS") + " " + STARS.star));
  console.log(drawTopBorder("COMMANDS"));

  commandTable.forEach(([usage, description]) => {
    console.log(drawRow(CYAN(usage), DIM(description), getVisibleLength(description)));
  });

  console.log(drawBottomBorder());
  blank();
}
267
+
268
+ // ─── Interactive Setup Helpers ───────────────────────────────────────────────
269
+
270
/**
 * Reads seed URLs from `SEEDS_FILE`.
 *
 * @returns The trimmed, non-empty lines that do not start with `#`, or an
 * empty array when the file does not exist. The URLs are not validated here.
 */
function readSeedsFile(): string[] {
  if (!fs.existsSync(SEEDS_FILE)) return [];

  const seeds: string[] = [];
  for (const rawLine of fs.readFileSync(SEEDS_FILE, "utf-8").split("\n")) {
    const line = rawLine.trim();
    const isBlank = line.length === 0;
    const isComment = line.startsWith("#");
    if (!isBlank && !isComment) seeds.push(line);
  }
  return seeds;
}
278
+
279
/**
 * Collects the distinct hostnames of `urls`, in first-seen order.
 * Entries that are not parseable absolute URLs, or that have an empty
 * hostname, are skipped.
 */
function extractDomains(urls: string[]): string[] {
  const hosts = new Set<string>();
  for (const candidate of urls) {
    let host: string;
    try {
      host = new URL(candidate).hostname;
    } catch {
      // Not a valid absolute URL — deliberately ignored.
      continue;
    }
    if (host) hosts.add(host);
  }
  return [...hosts];
}
288
+
289
/**
 * Clamps a crawl delay to the safe minimum.
 *
 * @param raw - Requested delay in milliseconds.
 * @returns `raw` unless it is below `MIN_CRAWL_DELAY_MS`, in which case a
 * warning is logged and the minimum is returned.
 */
function enforceCrawlDelay(raw: number): number {
  const belowMinimum = raw < MIN_CRAWL_DELAY_MS;
  if (!belowMinimum) return raw;

  warn(`crawl_delay_ms ${raw}ms is below safe minimum — raised to ${MIN_CRAWL_DELAY_MS}ms`);
  return MIN_CRAWL_DELAY_MS;
}
296
+
297
/**
 * Splits a line such as `/depth 3` into a command key and its argument.
 *
 * @param input - Raw line typed by the user; surrounding whitespace is ignored.
 * @returns The lower-cased key and the trimmed argument (empty when absent),
 * or `null` when the line is not of the form `/word [argument]`.
 */
function parseSlashCommand(input: string): { key: string; value: string } | null {
  const parts = /^\/(\w+)(?:\s+(.+))?$/.exec(input.trim());
  if (parts === null) return null;

  const [, rawKey = "", rawValue = ""] = parts;
  return { key: rawKey.toLowerCase(), value: rawValue.trim() };
}
302
+
303
/**
 * Parses a command argument as a whole number with a lower bound.
 *
 * The whole argument must be an optionally signed run of decimal digits
 * (surrounding whitespace is ignored). Inputs such as `"12abc"` or `"1.5"`
 * are rejected rather than silently truncated.
 *
 * @param value - Raw argument text.
 * @param label - Setting name used in the error message.
 * @param min - Smallest accepted value (default 0).
 * @returns The parsed integer, or an error message string when the input is
 * not an integer or is below `min`.
 */
function parseIntValue(value: string, label: string, min = 0): number | string {
  const failure = `${label} must be an integer >= ${min}`;

  const text = value.trim();
  if (!/^[+-]?\d+$/.test(text)) return failure;

  const n = Number(text);
  // Digit runs too long to be represented exactly are rejected as well.
  if (!Number.isSafeInteger(n) || n < min) return failure;
  return n;
}
308
+
309
/** Crawl options assembled by the command REPL before a crawl starts. */
interface CrawlSettings {
  /** Where crawled pages are written. */
  outputMode: OutputMode;
  /** Which list of seed URLs to use: seeds.txt, the built-in config, or `customUrls`. */
  seedSource: "file" | "config" | "custom";
  /** URLs entered by the user; only consulted when `seedSource` is `"custom"`. */
  customUrls: string[];
  /** Maximum link traversal depth. */
  maxDepth: number;
  /** Politeness delay in milliseconds. */
  crawlDelayMs: number;
  /** Number of concurrent workers. */
  workerCount: number;
  /** Page cap for the session; 0 is displayed as "unlimited". */
  maxPages: number;
}
318
+
319
/**
 * Builds the initial settings from `config`.
 *
 * @param seedsFromFile - URLs found in seeds.txt; when non-empty the file
 * becomes the default seed source, otherwise the config seeds are used.
 */
function defaultSettings(seedsFromFile: string[]): CrawlSettings {
  const hasFileSeeds = seedsFromFile.length > 0;

  const settings: CrawlSettings = {
    outputMode: config.OUTPUT_MODE,
    seedSource: hasFileSeeds ? "file" : "config",
    customUrls: [],
    maxDepth: config.MAX_DEPTH,
    crawlDelayMs: config.CRAWL_DELAY_MS,
    workerCount: config.WORKER_COUNT,
    maxPages: config.MAX_PAGES,
  };
  return settings;
}
330
+
331
/** Shows an interactive menu and resolves with the output mode the user picks. */
async function handleOutputMode(): Promise<OutputMode> {
  // Typing the choices lets the prompt's result be inferred as OutputMode
  // without a type assertion.
  const choices: Array<{ name: string; value: OutputMode }> = [
    { name: "PDF - Generate documentation PDF", value: "pdf" },
    { name: "Database - Store in PostgreSQL", value: "database" },
  ];

  return select({
    message: gradient("Select output mode:"),
    choices,
  });
}
341
+
342
/**
 * Shows an interactive menu for the seed source and resolves with the chosen
 * source together with the URLs it provides.
 *
 * @param seedsFromFile - URLs read from seeds.txt; the "File" choice is
 * disabled when this is empty.
 * @returns For "custom" the URLs are collected with a follow-up prompt.
 */
async function handleSeedSource(seedsFromFile: string[]): Promise<{ source: "file" | "config" | "custom"; urls: string[] }> {
  const fileChoiceDisabled = seedsFromFile.length === 0;

  const picked = await select({
    message: gradient("Select seed source:"),
    choices: [
      { name: `File (seeds.txt) - ${seedsFromFile.length} URLs found`, value: "file", disabled: fileChoiceDisabled },
      { name: "Config - Use default seeds from config.ts", value: "config" },
      { name: "Custom - Enter your own URLs", value: "custom" },
    ],
  });

  switch (picked) {
    case "file":
      return { source: "file", urls: seedsFromFile };
    case "config":
      return { source: "config", urls: config.SEED_URLS };
    default: {
      const urls = await handleCustomUrls();
      return { source: "custom", urls };
    }
  }
}
355
+
356
/**
 * Prompts for a comma-separated list of URLs and validates it with
 * `validateSeedUrls`. If any entry is rejected, each rejection reason is
 * logged and the prompt is shown again.
 *
 * @returns The accepted URLs. This is empty when nothing was entered, since
 * an empty list has no rejected entries.
 */
async function handleCustomUrls(): Promise<string[]> {
  for (;;) {
    const answer = await input({ message: gradient("Enter URLs (comma-separated):") });
    const candidates = answer
      .split(",")
      .map((piece) => piece.trim())
      .filter((piece) => piece.length > 0);

    const { valid, invalid } = validateSeedUrls(candidates);
    if (invalid.length === 0) return valid;

    invalid.forEach((rejected) => err(rejected.reason));
  }
}
368
+
369
/**
 * Prompts for a whole number within `[min, max]`.
 *
 * The prompt's own `validate` hook keeps asking until the input is accepted,
 * so no outer retry loop is needed. Input must consist solely of an optionally
 * signed run of digits; values such as `"12abc"` or `"1.5"` are rejected
 * rather than truncated.
 *
 * @param prompt - Label shown to the user.
 * @param currentValue - Current setting, displayed for reference.
 * @param min - Smallest accepted value (default 0).
 * @param max - Largest accepted value; unbounded when omitted.
 * @returns The number the user entered.
 */
async function handleNumericConfig(prompt: string, currentValue: number, min = 0, max?: number): Promise<number> {
  const value = await input({
    message: gradient(`${prompt} (current: ${currentValue}):`),
    validate: (v) => {
      const text = v.trim();
      if (!/^[+-]?\d+$/.test(text)) return "Please enter a valid number";
      const num = Number(text);
      if (!Number.isSafeInteger(num)) return "Please enter a valid number";
      if (num < min) return `Value must be >= ${min}`;
      if (max !== undefined && num > max) return `Value must be <= ${max}`;
      return true;
    },
  });
  return Number(value.trim());
}
384
+
385
+ // ─── Command REPL ─────────────────────────────────────────────────────────────
386
+
387
/**
 * Runs the interactive slash-command loop used to configure a crawl.
 *
 * Starts from `defaultSettings`, prints the current configuration, then reads
 * commands until `/start` succeeds. `/quit`, `/exit`, and `/q` terminate the
 * process with exit code 0.
 *
 * Numeric commands accept the value inline (`/depth 3`) or, when no value is
 * given, ask for it with an interactive prompt. Both paths apply the same
 * bounds; in particular the worker count is capped at 100 either way.
 *
 * @param seedsFromFile - URLs read from seeds.txt.
 * @returns The settings in effect when the user issued `/start`.
 */
async function runCommandRepl(seedsFromFile: string[]): Promise<CrawlSettings> {
  // Upper bound for the worker count, shared by the inline and prompted paths.
  const MAX_WORKER_COUNT = 100;

  const s = defaultSettings(seedsFromFile);
  printSummary(s, seedsFromFile);

  console.log(
    CYAN(STARS.small) + DIM(" type a command to configure the crawler.") +
    " " + G("/help") + DIM(" for options,") +
    " " + W("/start") + DIM(" to begin.")
  );
  blank();

  while (true) {
    // A fresh readline interface is created per line and closed before the
    // command is handled, so it is never open while an interactive prompt
    // is reading from the same stdin.
    const rl = readline.createInterface({ input: input$, output: stdout });
    let line = "";
    try {
      line = await rl.question(CYAN(" " + STARS.half + " › "));
    } finally {
      rl.close();
    }

    const trimmed = line.trim();
    if (!trimmed) continue;

    const cmd = parseSlashCommand(trimmed);
    if (!cmd) {
      warn("Commands must start with / — try " + G("/help"));
      continue;
    }

    switch (cmd.key) {
      case "help":
        printHelp();
        break;

      case "status":
        printSummary(s, seedsFromFile);
        break;

      case "output":
        s.outputMode = await handleOutputMode();
        ok(`output -> ${G(s.outputMode)}`);
        break;

      case "seeds": {
        const result = await handleSeedSource(seedsFromFile);
        s.seedSource = result.source;
        // File and config seeds are looked up again at start time; only
        // custom URLs need to be remembered here.
        if (result.source === "custom") s.customUrls = result.urls;
        ok(`seeds source -> ${G(result.source)} (${result.urls.length} URLs)`);
        break;
      }

      case "depth":
        if (cmd.value) {
          const v = parseIntValue(cmd.value, "depth", 0);
          if (typeof v === "string") { err(v); break; }
          s.maxDepth = v;
        } else {
          s.maxDepth = await handleNumericConfig("Max depth", s.maxDepth, 0);
        }
        ok(`max_depth -> ${G(String(s.maxDepth))}`);
        break;

      case "delay":
        if (cmd.value) {
          const v = parseIntValue(cmd.value, "delay", 0);
          if (typeof v === "string") { err(v); break; }
          // Inline values below the minimum are raised (with a warning)
          // rather than rejected.
          s.crawlDelayMs = enforceCrawlDelay(v);
        } else {
          s.crawlDelayMs = await handleNumericConfig("Crawl delay (ms)", s.crawlDelayMs, MIN_CRAWL_DELAY_MS);
        }
        ok(`crawl_delay_ms -> ${G(String(s.crawlDelayMs) + "ms")}`);
        break;

      case "workers":
        if (cmd.value) {
          const v = parseIntValue(cmd.value, "workers", 1);
          if (typeof v === "string") { err(v); break; }
          // Same cap as the interactive prompt below.
          if (v > MAX_WORKER_COUNT) { err(`workers must be an integer <= ${MAX_WORKER_COUNT}`); break; }
          s.workerCount = v;
        } else {
          s.workerCount = await handleNumericConfig("Worker count", s.workerCount, 1, MAX_WORKER_COUNT);
        }
        ok(`worker_count -> ${G(String(s.workerCount))}`);
        break;

      case "maxpages":
      case "maxcount":
        if (cmd.value) {
          const v = parseIntValue(cmd.value, "maxpages", 0);
          if (typeof v === "string") { err(v); break; }
          s.maxPages = v;
        } else {
          s.maxPages = await handleNumericConfig("Max pages (0 = unlimited)", s.maxPages, 0);
        }
        ok(`max_pages -> ${G(s.maxPages === 0 ? "unlimited" : String(s.maxPages))}`);
        break;

      case "start": {
        const seedCount =
          s.seedSource === "file" ? seedsFromFile.length :
          s.seedSource === "custom" ? s.customUrls.length :
          config.SEED_URLS.length;

        if (seedCount === 0) {
          err("No seed URLs configured — use /seeds first");
          break;
        }
        return s;
      }

      case "quit":
      case "exit":
      case "q":
        process.exit(0);

      default:
        warn(`Unknown command ${G("/" + cmd.key)} — try ${G("/help")}`);
    }
  }
}
506
+
507
/**
 * Prints the "LAUNCHING" panel shown right before the crawl starts.
 *
 * Depth, worker count, and page cap are read from the settings passed in, so
 * the panel does not depend on the caller having copied them into `config`
 * first. The domain list has no counterpart in the settings and is still
 * taken from `config.ALLOWED_DOMAINS`.
 *
 * @param seedUrls - Validated seed URLs; only their count is shown.
 * @param s - Settings chosen in the command REPL.
 * @param crawlDelayMs - Effective crawl delay after enforcing the minimum.
 */
function printLaunchSummary(seedUrls: string[], s: CrawlSettings, crawlDelayMs: number): void {
  blank();
  console.log(CYAN(" " + STARS.full + " " + gradient("CRAWLER LAUNCHED") + " " + STARS.full));
  console.log(drawTopBorder("LAUNCHING"));

  const rows: Array<[string, string]> = [
    ["seeds", `${seedUrls.length} url${seedUrls.length !== 1 ? "s" : ""} queued`],
    ["domains", config.ALLOWED_DOMAINS.join(", ")],
    ["output", s.outputMode === "pdf" ? "pdf -> output/documentation*.pdf" : "postgresql"],
    ["depth", String(s.maxDepth)],
    ["delay", `${crawlDelayMs}ms`],
    ["workers", String(s.workerCount)],
    ["maxpages", s.maxPages === 0 ? "unlimited" : String(s.maxPages)],
  ];

  for (const [k, v] of rows) {
    console.log(drawRow(k, gradient(v), getVisibleLength(v)));
  }

  console.log(drawBottomBorder());
  blank();
}
529
+
530
+ // ─── Main Execution ───────────────────────────────────────────────────────────
531
+
532
/**
 * CLI entry point: prints the banner, previews seeds.txt, runs the command
 * REPL, applies the chosen settings to `config`, and runs the crawl.
 *
 * Failure handling:
 * - Seeds rejected by `validateSeedUrls` are reported, and the crawl is
 *   aborted when none remain.
 * - Strategy initialisation runs inside the guarded section, so a failure
 *   there is reported like any other crawl failure.
 * - The connection pool is closed even if finalising the output throws.
 * - Any failure sets a non-zero process exit code.
 */
async function main(): Promise<void> {
  printBanner();
  const seedsFromFile = readSeedsFile();

  if (seedsFromFile.length > 0) {
    console.log(CYAN(" " + STARS.half + " " + gradient(`Found ${seedsFromFile.length} URLs in seeds.txt`)));
    console.log(drawTopBorder(`seeds.txt (${seedsFromFile.length})`));
    // Only the first five entries are previewed.
    for (const u of seedsFromFile.slice(0, 5)) {
      const innerLeft = " " + u;
      console.log(DG(" │") + CYAN(innerLeft) + " ".repeat(Math.max(0, INNER_WIDTH - getVisibleLength(innerLeft))) + DG("│"));
    }
    console.log(drawBottomBorder());
    blank();
  }

  const s = await runCommandRepl(seedsFromFile);
  const crawlDelayMs = enforceCrawlDelay(s.crawlDelayMs);

  config.MAX_DEPTH = s.maxDepth;
  config.CRAWL_DELAY_MS = crawlDelayMs;
  config.WORKER_COUNT = s.workerCount;
  config.MAX_PAGES = s.maxPages;
  config.OUTPUT_MODE = s.outputMode;

  const rawSeeds = s.seedSource === "file" ? seedsFromFile : s.seedSource === "custom" ? s.customUrls : config.SEED_URLS;
  const { valid: seedUrls, invalid: rejectedSeeds } = validateSeedUrls(rawSeeds);

  // Surface rejected seeds instead of dropping them silently.
  for (const rejected of rejectedSeeds) warn(rejected.reason);

  if (seedUrls.length === 0) {
    // /start only checks the raw count, so every seed may still be invalid here.
    err("No valid seed URLs — nothing to crawl.");
    process.exitCode = 1;
    await pool.end();
    return;
  }

  config.SEED_URLS = seedUrls;
  config.ALLOWED_DOMAINS = extractDomains(seedUrls);

  printLaunchSummary(seedUrls, s, crawlDelayMs);

  const strategy = createStrategy(s.outputMode);
  setStrategy(strategy);
  let strategyReady = false;

  try {
    await strategy.init();
    strategyReady = true;

    await resetStaleLocks();
    await clearPendingURLs(config.ALLOWED_DOMAINS);
    await seedDatabase();

    // Background status text engine starts up before launching the scheduler
    blank();
    startGlowStatus("CRAWLING THROUGH THE SITE...");

    await startScheduler();

    stopGlowStatus();
    ok("Crawl finished successfully.");
  } catch (error) {
    stopGlowStatus();
    process.exitCode = 1;
    err("Fatal error in main loop.");
    console.error(error);
  } finally {
    try {
      // Only finalise an output strategy that was successfully initialised.
      if (strategyReady) await strategy.finish();
    } catch (error) {
      process.exitCode = 1;
      err("Failed to finalize output.");
      console.error(error);
    } finally {
      // Must run even when finalising the output failed.
      await pool.end();
    }
    blank();
    console.log(CYAN(" " + STARS.full + " " + gradient("Session complete") + " " + STARS.full));
    ok("Database connection closed.");
    blank();
  }
}

main().catch((error: unknown) => {
  // Last-resort handler so a rejection never goes unhandled.
  console.error(error);
  process.exitCode = 1;
});
@@ -0,0 +1,37 @@
1
/**
 * Resolves `rawUrl` against `baseUrl` and returns a canonical string form.
 *
 * Canonicalisation steps:
 * - relative references are resolved (the URL parser also lower-cases the
 *   scheme and host and drops default ports);
 * - anything other than http/https is rejected;
 * - the fragment is dropped;
 * - one trailing slash is removed from the path;
 * - the query string is kept as is.
 *
 * @returns The normalised URL, or `null` when the input cannot be parsed or
 * uses a scheme other than http/https.
 */
export function normalizeURL(rawUrl: string, baseUrl: string): string | null {
  let parsed: URL;
  try {
    parsed = new URL(rawUrl, baseUrl);
  } catch {
    return null;
  }

  const isWebScheme = parsed.protocol === "http:" || parsed.protocol === "https:";
  if (!isWebScheme) return null;

  const path = parsed.pathname.endsWith("/") ? parsed.pathname.slice(0, -1) : parsed.pathname;

  // `host` is the hostname plus ":port" when a non-default port is present.
  // The fragment is dropped by not including `hash` in the output.
  return `${parsed.protocol}//${parsed.host}${path}${parsed.search}`;
}
29
+
30
/**
 * Extracts the hostname from an absolute URL.
 *
 * @returns The hostname, or `null` when `urlStr` is not a parseable absolute URL.
 */
export function getDomain(urlStr: string): string | null {
  let parsed: URL;
  try {
    parsed = new URL(urlStr);
  } catch {
    return null;
  }
  return parsed.hostname;
}
@@ -0,0 +1,20 @@
1
+ import type { OutputStrategy } from "./strategy.js";
2
+ import { markDone, type CrawledPageContent } from "../db/queries.js";
3
+
4
/**
 * Output strategy that stores each crawled page by handing it to `markDone`
 * from the database query layer.
 * This is the default structured-data output pipeline.
 */
export class DatabaseStrategy implements OutputStrategy {
  /** No set-up required: the connection pool is created by client.ts. */
  async init(): Promise<void> {
    // Intentionally empty.
  }

  /**
   * Records the content for the URL row identified by `urlId`.
   * The URL string itself is not needed by this strategy.
   */
  async save(urlId: number, _url: string, content: CrawledPageContent): Promise<void> {
    const persisted = markDone(urlId, content);
    await persisted;
  }

  /** No tear-down required: the connection pool is closed by index.ts. */
  async finish(): Promise<void> {
    // Intentionally empty.
  }
}
@@ -0,0 +1,32 @@
1
+ import type { OutputStrategy } from "./strategy.js";
2
+ import { DatabaseStrategy } from "./db-strategy.js";
3
+ import { PdfStrategy } from "./pdf-strategy.js";
4
+
5
/** Supported output destinations. */
export type OutputMode = "database" | "pdf";

/** Strategy currently in use; `null` until one is set or lazily created. */
let activeStrategy: OutputStrategy | null = null;

/** Installs `strategy` as the active output strategy, replacing any previous one. */
export function setStrategy(strategy: OutputStrategy): void {
  activeStrategy = strategy;
}
12
+
13
/**
 * Returns the active output strategy. If none has been installed yet, a
 * `DatabaseStrategy` is created, remembered, and returned.
 */
export function getStrategy(): OutputStrategy {
  if (activeStrategy !== null) return activeStrategy;

  // Default to database strategy if none was configured
  const fallback = new DatabaseStrategy();
  activeStrategy = fallback;
  return fallback;
}
20
+
21
/**
 * Creates a new strategy instance for `mode`.
 * `"pdf"` yields a `PdfStrategy`; every other value yields a `DatabaseStrategy`.
 */
export function createStrategy(mode: OutputMode): OutputStrategy {
  if (mode === "pdf") {
    return new PdfStrategy();
  }
  return new DatabaseStrategy();
}
30
+
31
+ export { DatabaseStrategy, PdfStrategy };
32
+ export type { OutputStrategy };