messi-crawler 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73) hide show
  1. package/README.md +201 -0
  2. package/dist/cli/renderer.js +71 -0
  3. package/dist/config.js +18 -0
  4. package/dist/db/clear.js +16 -0
  5. package/dist/db/client.js +20 -0
  6. package/dist/db/queries.js +179 -0
  7. package/dist/frontier/frontier.js +44 -0
  8. package/dist/frontier/logger.js +65 -0
  9. package/dist/frontier/robots.js +46 -0
  10. package/dist/frontier/scheduler.js +98 -0
  11. package/dist/index.js +533 -0
  12. package/dist/normalizer.js +33 -0
  13. package/dist/output/db-strategy.js +16 -0
  14. package/dist/output/index.js +23 -0
  15. package/dist/output/pdf-strategy.js +316 -0
  16. package/dist/output/strategy.js +1 -0
  17. package/dist/security/ssrf.js +45 -0
  18. package/dist/security/validate-url.js +41 -0
  19. package/dist/seed.js +14 -0
  20. package/dist/setup.js +148 -0
  21. package/dist/test/client.test.js +33 -0
  22. package/dist/test/downloader.test.js +84 -0
  23. package/dist/test/extractor.test.js +126 -0
  24. package/dist/test/frontier.test.js +43 -0
  25. package/dist/test/logger.test.js +55 -0
  26. package/dist/test/normalizer.test.js +36 -0
  27. package/dist/test/pdf-strategy.test.js +68 -0
  28. package/dist/test/queries.test.js +173 -0
  29. package/dist/test/robots.test.js +46 -0
  30. package/dist/test/scheduler.test.js +73 -0
  31. package/dist/test/seed.test.js +26 -0
  32. package/dist/test/worker.test.js +118 -0
  33. package/dist/worker/downloader.js +114 -0
  34. package/dist/worker/extractor.js +197 -0
  35. package/dist/worker/worker.js +87 -0
  36. package/package.json +48 -0
  37. package/seeds.txt +4 -0
  38. package/src/cli/renderer.ts +83 -0
  39. package/src/config.ts +22 -0
  40. package/src/db/clear.ts +16 -0
  41. package/src/db/client.ts +26 -0
  42. package/src/db/queries.ts +255 -0
  43. package/src/db/schema.sql +43 -0
  44. package/src/frontier/frontier.ts +60 -0
  45. package/src/frontier/logger.ts +75 -0
  46. package/src/frontier/robots.ts +50 -0
  47. package/src/frontier/scheduler.ts +119 -0
  48. package/src/index.ts +596 -0
  49. package/src/normalizer.ts +37 -0
  50. package/src/output/db-strategy.ts +20 -0
  51. package/src/output/index.ts +32 -0
  52. package/src/output/pdf-strategy.ts +388 -0
  53. package/src/output/strategy.ts +16 -0
  54. package/src/security/ssrf.ts +48 -0
  55. package/src/security/validate-url.ts +49 -0
  56. package/src/seed.ts +18 -0
  57. package/src/setup.ts +170 -0
  58. package/src/test/client.test.ts +38 -0
  59. package/src/test/downloader.test.ts +101 -0
  60. package/src/test/extractor.test.ts +139 -0
  61. package/src/test/frontier.test.ts +53 -0
  62. package/src/test/logger.test.ts +71 -0
  63. package/src/test/normalizer.test.ts +43 -0
  64. package/src/test/pdf-strategy.test.ts +84 -0
  65. package/src/test/queries.test.ts +247 -0
  66. package/src/test/robots.test.ts +56 -0
  67. package/src/test/scheduler.test.ts +90 -0
  68. package/src/test/seed.test.ts +35 -0
  69. package/src/test/worker.test.ts +144 -0
  70. package/src/worker/downloader.ts +149 -0
  71. package/src/worker/extractor.ts +235 -0
  72. package/src/worker/worker.ts +100 -0
  73. package/tsconfig.json +15 -0
@@ -0,0 +1,98 @@
1
+ import { config } from "../config.js";
2
+ import { claimNextURL } from "../db/queries.js";
3
+ import { getPendingDomains } from "./frontier.js";
4
+ import { processPage } from "../worker/worker.js";
5
+ import { startProgressLogger, stopProgressLogger } from "./logger.js";
6
// ─── Scheduler state (module-private, shared by the functions below) ─────────
// True while the dispatch loop in startScheduler() is running.
let isRunning = false;
// Number of processPage() calls dispatched but not yet settled.
let activeWorkers = 0;
// Pages dispatched in this session (in-memory counter, not cumulative DB total)
let sessionPageCount = 0;
// Position in the pending-domain list where the next round-robin pass starts.
let lastDomainIndex = 0;
// domain -> epoch-ms timestamp before which the domain is skipped by the loop.
const cooldowns = new Map();
12
/**
 * Returns a promise that resolves (with no value) once `ms` milliseconds
 * have elapsed on the timer queue.
 */
export const sleep = (ms) => new Promise((done) => {
    setTimeout(done, ms);
});
13
/**
 * Starts the round-robin scheduler loop.
 * Respects politeness delays per domain and concurrency limits.
 *
 * The loop exits when stopScheduler() clears the running flag, when the
 * MAX_PAGES budget for this session has been dispatched (after waiting for
 * in-flight workers), or when no domain has pending URLs and no worker is
 * active. Calling it while a loop is already running is a no-op.
 *
 * @returns {Promise<void>} Resolves once the loop has exited.
 */
export async function startScheduler() {
    if (isRunning)
        return;
    isRunning = true;
    sessionPageCount = 0;
    try {
        // Start the periodic progress logger
        await startProgressLogger();
        while (isRunning) {
            // Enforce MAX_PAGES limit against this session's dispatched count
            if (config.MAX_PAGES > 0 && sessionPageCount >= config.MAX_PAGES) {
                // Wait for any in-flight workers to finish before stopping
                while (activeWorkers > 0) {
                    await sleep(100);
                }
                console.log(`\n✓ Crawl complete — ${sessionPageCount} page(s) processed.\n`);
                break;
            }
            // 1. Enforce worker concurrency limit
            if (activeWorkers >= config.WORKER_COUNT) {
                await sleep(50);
                continue;
            }
            // 2. Fetch active pending domains from the frontier
            const domains = await getPendingDomains();
            if (domains.length === 0) {
                // Exit if there are no pending URLs and all workers are idle
                if (activeWorkers === 0) {
                    break;
                }
                await sleep(100);
                continue;
            }
            let claimed = false;
            const now = Date.now();
            // 3. Round-robin traversal over domains
            for (let i = 0; i < domains.length; i++) {
                // The modulo keeps the index valid even if the domain list
                // shrank since lastDomainIndex was recorded.
                const idx = (lastDomainIndex + i) % domains.length;
                const domain = domains[idx];
                // Enforce politeness delay
                const nextAllowed = cooldowns.get(domain) || 0;
                if (now < nextAllowed) {
                    continue;
                }
                // Try to atomically claim a URL for this domain
                const urlRow = await claimNextURL(domain);
                if (urlRow) {
                    // Set the domain cooldown
                    cooldowns.set(domain, Date.now() + config.CRAWL_DELAY_MS);
                    // Update round-robin start index for the next tick
                    lastDomainIndex = (idx + 1) % domains.length;
                    // Dispatch worker (fire-and-forget; completion is tracked
                    // through the activeWorkers counter)
                    activeWorkers++;
                    sessionPageCount++;
                    processPage(urlRow)
                        .catch((err) => {
                            console.error(`Error processing ${urlRow.url}:`, err);
                        })
                        .finally(() => {
                            activeWorkers--;
                        });
                    claimed = true;
                    break;
                }
            }
            // If no URL was claimed (e.g. all domains in cooldown or DB lock contention), sleep
            if (!claimed) {
                await sleep(50);
            }
        }
    }
    finally {
        // Release the running flag and the logger even when the logger start
        // or a frontier/DB call rejects; otherwise every later
        // startScheduler() call would return immediately.
        isRunning = false;
        stopProgressLogger();
    }
}
89
/**
 * Asks the scheduler loop to exit by clearing the running flag, then stops
 * the progress logger. Workers that are already in flight are not awaited
 * here.
 */
export function stopScheduler() {
    isRunning = false;
    stopProgressLogger();
}
93
/**
 * Reports how many dispatched page workers have not settled yet.
 */
export function getActiveWorkersCount() {
    const inFlight = activeWorkers;
    return inFlight;
}
96
/**
 * Looks up the cooldown timestamp recorded for `domain`.
 * Returns 0 when the domain has no entry (or a falsy one).
 */
export function getCooldown(domain) {
    const nextAllowedAt = cooldowns.get(domain);
    return nextAllowedAt ? nextAllowedAt : 0;
}
package/dist/index.js ADDED
@@ -0,0 +1,533 @@
1
+ #!/usr/bin/env node
2
+ import { input, select } from "@inquirer/prompts";
3
+ import * as readline from "node:readline/promises";
4
+ import { stdin as input$, stdout } from "node:process";
5
+ import fs from "fs";
6
+ import { seedDatabase } from "./seed.js";
7
+ import { resetStaleLocks, clearPendingURLs } from "./db/queries.js";
8
+ import { startScheduler } from "./frontier/scheduler.js";
9
+ import { pool } from "./db/client.js";
10
+ import { config } from "./config.js";
11
+ import { createStrategy, setStrategy } from "./output/index.js";
12
+ import { validateSeedUrls } from "./security/validate-url.js";
13
// Seed list file name; a relative path, so it is looked up in the current
// working directory.
const SEEDS_FILE = "seeds.txt";
// Smallest crawl delay the CLI accepts; lower values are raised to this.
const MIN_CRAWL_DELAY_MS = 500;
// Unified layout grid definition (inner width of every box, in columns)
const INNER_WIDTH = 72;
// ─── Terminal primitives ──────────────────────────────────────────────────────
// Control Sequence Introducer and the "reset all attributes" sequence.
const ESC = "\x1b[";
const RESET = `${ESC}0m`;
20
/**
 * Colours `text` with a left-to-right green-to-blue gradient.
 *
 * Each character is prefixed with a 24-bit foreground colour escape chosen
 * by its relative position in the string, and the result ends with RESET.
 * The text is split by Unicode code point (not UTF-16 unit) so characters
 * outside the Basic Multilingual Plane are never cut into lone surrogates
 * with an escape sequence between the halves.
 *
 * @param {string} text Plain text to colour.
 * @returns {string} The text with ANSI colour codes; just RESET for "".
 */
const gradient = (text) => {
    const colors = [
        "\x1b[38;2;0;255;127m", // Spring Green
        "\x1b[38;2;0;255;150m",
        "\x1b[38;2;0;230;180m",
        "\x1b[38;2;0;200;200m",
        "\x1b[38;2;0;170;220m",
        "\x1b[38;2;0;140;240m",
        "\x1b[38;2;0;100;255m", // Dodger Blue
        "\x1b[38;2;0;70;255m",
    ];
    // Array.from iterates by code point, keeping surrogate pairs intact.
    const chars = Array.from(text);
    let result = "";
    for (let i = 0; i < chars.length; i++) {
        const colorIndex = Math.floor((i / chars.length) * colors.length);
        result += colors[colorIndex % colors.length] + chars[i];
    }
    return result + RESET;
};
40
// Decorative glyphs used in headings, prompts and the banner.
const STARS = {
    full: "✦", half: "✧", small: "⋆",
    star: "★", outline: "☆",
};
48
// Builds a styling function that wraps its argument in the given SGR
// parameter string and terminates it with RESET.
const sgr = (params) => (s) => `\x1b[${params}m${s}${RESET}`;
const G = sgr("32"); // green
const DG = sgr("2;32"); // dim green
const YL = sgr("33"); // yellow
const RD = sgr("31"); // red
const W = sgr("97"); // bright white
const DIM = sgr("2"); // dim
const BOLD = sgr("1"); // bold
const BLUE = sgr("34"); // blue
const CYAN = sgr("36"); // cyan
57
// Box-drawing characters, keyed by position (t/b = top/bottom, l/r = left/right,
// m = middle junction).
const BOX = {
    tl: "┌",
    tr: "┐",
    bl: "└",
    br: "┘",
    h: "─",
    v: "│",
    lm: "├",
    rm: "┤",
    tm: "┬",
    bm: "┴",
    cross: "┼",
};
62
/**
 * Length of `s` after removing ANSI SGR colour/style sequences
 * (ESC [ ... m). Measured in UTF-16 code units, like String#length.
 */
function getVisibleLength(s) {
    const sgrSequence = /\x1b\[[0-9;]*m/g;
    const plain = s.replace(sgrSequence, "");
    return plain.length;
}
65
+ // ─── Box Layout Engine ────────────────────────────────────────────────────────
66
/**
 * Renders the top edge of a box with `label` embedded after the corner,
 * padded with horizontal rules so the line spans INNER_WIDTH + 4 columns.
 * The whole line is dim green.
 */
function drawTopBorder(label) {
    const head = ` ┌─ ${label} `;
    const fillCount = Math.max(0, INNER_WIDTH + 4 - getVisibleLength(head) - 1);
    return DG(`${head}${BOX.h.repeat(fillCount)}┐`);
}
72
/**
 * Renders the bottom edge of a box: corner, INNER_WIDTH rules, corner,
 * all in dim green.
 */
function drawBottomBorder() {
    const rule = BOX.h.repeat(INNER_WIDTH);
    return DG(` └${rule}┘`);
}
75
/**
 * Renders one two-column row inside a box.
 *
 * @param leftContent Left cell (may contain ANSI codes); padded to 16
 *   visible columns.
 * @param rightContentRendered Right cell exactly as it should be printed.
 * @param rightContentRawLength Visible width of the right cell, used to
 *   compute the trailing padding up to the right border.
 */
function drawRow(leftContent, rightContentRendered, rightContentRawLength) {
    const labelPad = " ".repeat(Math.max(0, 16 - getVisibleLength(leftContent)));
    const tailPad = " ".repeat(Math.max(0, INNER_WIDTH - 18 - rightContentRawLength));
    const leftBorder = DG(" │");
    const rightBorder = DG("│");
    return `${leftBorder} ${leftContent}${labelPad}${rightContentRendered}${tailPad}${rightBorder}`;
}
81
+ // ─── Background Glowing Animation Engine ──────────────────────────────────────
82
// Animation state for the single-line "glow" status indicator.
let glowInterval = null; // timer handle while the animation runs
let currentGlowText = ""; // text being animated; "" means inactive
let glowFrameIndex = 0; // running frame counter

// High-density frames with elevated base values to prevent vanishing artifacts.
// The ramp rises from the baseline to peak brightness; the full cycle plays
// the ramp and then walks back down without repeating either end point.
const glowRamp = [
    [0, 140, 100], // Baseline low (Crisp Medium Mint)
    [0, 175, 130],
    [0, 205, 155],
    [0, 235, 185],
    [0, 255, 210], // Vivid Cyan-Green
    [100, 255, 225], // High Glow
    [180, 255, 245], // Peak Brightness
];
const glowFrames = [...glowRamp, ...glowRamp.slice(1, -1).reverse()]
    .map(([r, g, b]) => `\x1b[38;2;${r};${g};${b}m`);
100
/**
 * Repaints the status line with the current glow frame and advances the
 * frame counter. Does nothing while no glow text is set.
 */
function drawGlowLine() {
    if (currentGlowText) {
        const frameColor = glowFrames[glowFrameIndex % glowFrames.length];
        // "\r\x1b[K" returns to column 0 and erases the line before redrawing.
        process.stdout.write(`\r\x1b[K ${CYAN(STARS.full)} ${frameColor}${BOLD(currentGlowText)}${RESET}`);
        glowFrameIndex += 1;
    }
}
107
/**
 * Starts (or restarts) the animated status line with the given text.
 *
 * Hides the terminal cursor, draws the first frame immediately and then
 * redraws every 120ms until stopGlowStatus() is called. If an animation is
 * already running, its timer is cleared first so repeated calls never leave
 * an orphaned interval behind.
 *
 * @param {string} text Status text to animate.
 */
export function startGlowStatus(text) {
    // A previous animation may still be ticking; stop it before replacing
    // the handle, otherwise that timer could never be cleared again.
    if (glowInterval) {
        clearInterval(glowInterval);
        glowInterval = null;
    }
    currentGlowText = text;
    glowFrameIndex = 0;
    // Hide native terminal cursor to keep rendering clean
    process.stdout.write("\x1b[?25l");
    drawGlowLine();
    glowInterval = setInterval(drawGlowLine, 120); // Frequency locked at 120ms
}
115
/**
 * Stops the animated status line: cancels the redraw timer if one is set,
 * clears the glow text, erases the status line and shows the cursor again.
 * Safe to call when no animation is running.
 */
export function stopGlowStatus() {
    const timer = glowInterval;
    if (timer) {
        clearInterval(timer);
        glowInterval = null;
    }
    currentGlowText = "";
    // Clear the animation frame line and restore cursor
    process.stdout.write("\r\x1b[K\x1b[?25h");
}
124
+ // ─── Logging Helpers (Interception-Aware) ─────────────────────────────────────
125
// Dimmed HH:MM:SS timestamp taken from the ISO (UTC) representation of now.
const ts = () => {
    const isoNow = new Date().toISOString();
    return DIM(isoNow.substring(11, 19));
};
126
/**
 * Logs a success line (green check mark) with a timestamp.
 * While the glow status is active, the status line is erased first and
 * redrawn afterwards so the message does not collide with it.
 */
export function ok(msg) {
    const glowActive = Boolean(currentGlowText);
    if (glowActive) {
        process.stdout.write("\r\x1b[K");
    }
    console.log(` ${ts()} ${G("✓")} ${msg}`);
    if (glowActive) {
        drawGlowLine();
    }
}
133
/**
 * Logs a warning line (yellow warning sign) with a timestamp.
 * While the glow status is active, the status line is erased first and
 * redrawn afterwards so the message does not collide with it.
 */
export function warn(msg) {
    const glowActive = Boolean(currentGlowText);
    if (glowActive) {
        process.stdout.write("\r\x1b[K");
    }
    console.log(` ${ts()} ${YL("⚠")} ${msg}`);
    if (glowActive) {
        drawGlowLine();
    }
}
140
/**
 * Logs an error line (red cross) with a timestamp. Output goes through
 * console.log, like the other helpers.
 * While the glow status is active, the status line is erased first and
 * redrawn afterwards so the message does not collide with it.
 */
export function err(msg) {
    const glowActive = Boolean(currentGlowText);
    if (glowActive) {
        process.stdout.write("\r\x1b[K");
    }
    console.log(` ${ts()} ${RD("✗")} ${msg}`);
    if (glowActive) {
        drawGlowLine();
    }
}
147
/**
 * Logs an informational line (blue info sign, dimmed message) with a
 * timestamp.
 * While the glow status is active, the status line is erased first and
 * redrawn afterwards so the message does not collide with it.
 */
export function info(msg) {
    const glowActive = Boolean(currentGlowText);
    if (glowActive) {
        process.stdout.write("\r\x1b[K");
    }
    console.log(` ${ts()} ${BLUE("ℹ")} ${DIM(msg)}`);
    if (glowActive) {
        drawGlowLine();
    }
}
154
/**
 * Prints an empty line.
 * While the glow status is active, the status line is erased first and
 * redrawn afterwards so the output does not collide with it.
 */
export function blank() {
    const glowActive = Boolean(currentGlowText);
    if (glowActive) {
        process.stdout.write("\r\x1b[K");
    }
    console.log("");
    if (glowActive) {
        drawGlowLine();
    }
}
161
+ // ─── Banner ───────────────────────────────────────────────────────────────────
162
/**
 * Clears the terminal and prints the start-up banner: the gradient-coloured
 * ASCII-art logo (with its star glyphs highlighted in bright white), a
 * tagline with the version, and a horizontal rule.
 */
export function printBanner() {
    const artRows = [
        " ██╗ ██╗███████╗██████╗ ✦ ██████╗██████╗ █████╗ ██╗ ██╗██╗ ███████╗██████╗ ",
        " ██║ ██║██╔════╝██╔══██╗ ██╔════╝██╔══██╗██╔══██╗██║ ██║██║ ✦ ██╔════╝██╔══██╗",
        " ██║ █╗ ██║█████╗ ██████╔╝ ✦ ██║ ██████╔╝███████║██║ █╗ ██║██║ █████╗ ██████╔╝",
        " ██║███╗██║██╔══╝ ██╔══██╗ ██║ ██╔══██╗██╔══██║██║███╗██║██║ ★ ██╔══╝ ██╔══██╗",
        " ╚███╔███╔╝███████╗██████╔╝ ★ ╚██████╗██║ ██║██║ ██║╚███╔███╔╝███████╗███████╗██║ ██║",
        " ╚══╝╚══╝ ╚══════╝╚═════╝ ╚═════╝╚═╝ ╚═╝╚═╝ ╚═╝ ╚══╝╚══╝ ╚══════╝╚══════╝╚═╝ ╚═╝",
    ];
    console.clear();
    blank();
    artRows.forEach((row) => {
        // Colour the row first, then re-colour the star glyphs on top.
        const tinted = gradient(row)
            .replace(/✦/g, W("✦"))
            .replace(/★/g, W("★"));
        console.log(` ${CYAN(STARS.full)} ${tinted} ${CYAN(STARS.full)}`);
    });
    blank();
    const tagline = gradient("Node.js · TypeScript · PostgreSQL");
    console.log(` ${CYAN(STARS.half)} ${tagline} ${CYAN(STARS.half)} ${DIM("v1.0.0")}`);
    console.log(DG(` ${BOX.h.repeat(INNER_WIDTH + 2)}`));
    blank();
}
183
+ // ─── Status board ─────────────────────────────────────────────────────────────
184
/**
 * Prints the current settings as a boxed "CONFIG" table.
 *
 * @param s Settings object (outputMode, seedSource, customUrls, maxDepth,
 *   crawlDelayMs, workerCount, maxPages).
 * @param seedsFromFile URLs read from seeds.txt; only their count is shown.
 */
export function printSummary(s, seedsFromFile) {
    const pluralS = (count) => (count !== 1 ? "s" : "");
    let seedLabel;
    if (s.seedSource === "file") {
        seedLabel = `seeds.txt (${seedsFromFile.length} url${pluralS(seedsFromFile.length)})`;
    }
    else if (s.seedSource === "custom") {
        seedLabel = `custom (${s.customUrls.length} url${pluralS(s.customUrls.length)})`;
    }
    else {
        seedLabel = "config.ts";
    }
    const outputLabel = s.outputMode === "pdf"
        ? "pdf -> output/documentation*.pdf"
        : "database -> postgresql";
    const delayNote = s.crawlDelayMs < MIN_CRAWL_DELAY_MS ? " ! below safe minimum" : "";
    const rows = [
        ["output", outputLabel],
        ["seeds", seedLabel],
        ["depth", String(s.maxDepth)],
        ["delay", `${s.crawlDelayMs}ms${delayNote}`],
        ["workers", String(s.workerCount)],
        ["maxpages", s.maxPages === 0 ? "unlimited" : String(s.maxPages)],
        ["start", "Start the crawl"],
    ];
    blank();
    console.log(CYAN(` ${STARS.small} ${gradient("CONFIGURATION")} ${STARS.small}`));
    console.log(drawTopBorder("CONFIG"));
    rows.forEach(([key, value]) => {
        // Values carrying a "!" marker are rendered in yellow as warnings.
        const rendered = value.includes("!") ? YL(value) : gradient(value);
        console.log(drawRow(CYAN(BOLD(`/${key}`)), rendered, getVisibleLength(value)));
    });
    console.log(drawBottomBorder());
    blank();
}
209
+ // ─── Help panel ───────────────────────────────────────────────────────────────
210
/**
 * Prints the boxed list of slash commands understood by the command REPL.
 */
export function printHelp() {
    const commands = [
        ["/output", "switch output destination (interactive)"],
        ["/seeds", "choose seed URL source (interactive)"],
        ["/depth <n>", "max link traversal depth"],
        ["/delay <ms>", "politeness delay (min 500)"],
        ["/workers <n>", "concurrent worker count"],
        ["/maxpages <n>", "page cap per session (0 = unlimited)"],
        ["/status", "print current config"],
        ["/help", "show this panel"],
        ["/start", "validate and begin crawl"],
        ["/quit", "exit without crawling"],
    ];
    blank();
    console.log(CYAN(` ${STARS.star} ${gradient("AVAILABLE COMMANDS")} ${STARS.star}`));
    console.log(drawTopBorder("COMMANDS"));
    commands.forEach(([usage, description]) => {
        console.log(drawRow(CYAN(usage), DIM(description), getVisibleLength(description)));
    });
    console.log(drawBottomBorder());
    blank();
}
232
+ // ─── Interactive Setup Helpers ───────────────────────────────────────────────
233
/**
 * Reads SEEDS_FILE and returns its entries, one per line, trimmed.
 * Blank lines and lines starting with "#" are skipped. Returns an empty
 * array when the file does not exist.
 */
function readSeedsFile() {
    if (!fs.existsSync(SEEDS_FILE)) {
        return [];
    }
    const entries = [];
    for (const rawLine of fs.readFileSync(SEEDS_FILE, "utf-8").split("\n")) {
        const entry = rawLine.trim();
        if (entry.length === 0 || entry.startsWith("#")) {
            continue;
        }
        entries.push(entry);
    }
    return entries;
}
242
/**
 * Collects the distinct hostnames of `urls`, in order of first appearance.
 * Entries that are not parseable URLs, or that have an empty hostname, are
 * skipped on purpose.
 */
function extractDomains(urls) {
    const hosts = new Set();
    for (const candidate of urls) {
        let host;
        try {
            host = new URL(candidate).hostname;
        }
        catch {
            // Not a valid absolute URL — ignore it.
            continue;
        }
        if (host) {
            hosts.add(host);
        }
    }
    return [...hosts];
}
253
/**
 * Clamps a crawl delay to MIN_CRAWL_DELAY_MS. Values below the minimum are
 * raised (with a warning); every other value is returned unchanged.
 */
function enforceCrawlDelay(raw) {
    const belowMinimum = raw < MIN_CRAWL_DELAY_MS;
    if (!belowMinimum) {
        return raw;
    }
    warn(`crawl_delay_ms ${raw}ms is below safe minimum — raised to ${MIN_CRAWL_DELAY_MS}ms`);
    return MIN_CRAWL_DELAY_MS;
}
260
/**
 * Parses a "/command [argument]" line.
 *
 * @returns {{key: string, value: string} | null} The lower-cased command
 *   name and trimmed argument ("" when absent), or null when the line is
 *   not a slash command.
 */
function parseSlashCommand(input) {
    const parts = /^\/(\w+)(?:\s+(.+))?$/.exec(input.trim());
    if (parts === null) {
        return null;
    }
    const [, name, argument = ""] = parts;
    return { key: name.toLowerCase(), value: argument.trim() };
}
266
/**
 * Parses a command argument as a whole number.
 *
 * The whole (trimmed) text must be an optionally signed run of decimal
 * digits. Plain parseInt would accept a numeric prefix and ignore the rest
 * ("12abc" -> 12, "3.7" -> 3), contradicting the "must be an integer" error
 * this function reports.
 *
 * @param {string} value Raw text typed by the user.
 * @param {string} label Setting name used in the error message.
 * @param {number} [min=0] Smallest accepted value.
 * @returns {number | string} The parsed integer, or an error message when
 *   the text is not an integer or is below `min`.
 */
function parseIntValue(value, label, min = 0) {
    const failure = `${label} must be an integer >= ${min}`;
    const text = String(value).trim();
    if (!/^[+-]?\d+$/.test(text)) {
        return failure;
    }
    const n = Number.parseInt(text, 10);
    if (Number.isNaN(n) || n < min) {
        return failure;
    }
    return n;
}
272
/**
 * Builds the initial settings for the REPL from the loaded config.
 * The seed source defaults to "file" when seeds.txt yielded at least one
 * URL, otherwise to "config".
 */
function defaultSettings(seedsFromFile) {
    const hasFileSeeds = seedsFromFile.length > 0;
    const { OUTPUT_MODE, MAX_DEPTH, CRAWL_DELAY_MS, WORKER_COUNT, MAX_PAGES } = config;
    return {
        outputMode: OUTPUT_MODE,
        seedSource: hasFileSeeds ? "file" : "config",
        customUrls: [],
        maxDepth: MAX_DEPTH,
        crawlDelayMs: CRAWL_DELAY_MS,
        workerCount: WORKER_COUNT,
        maxPages: MAX_PAGES,
    };
}
283
/**
 * Asks the user to pick an output mode.
 * Resolves to "pdf" or "database".
 */
async function handleOutputMode() {
    const choices = [
        { name: "PDF - Generate documentation PDF", value: "pdf" },
        { name: "Database - Store in PostgreSQL", value: "database" },
    ];
    const picked = await select({ message: gradient("Select output mode:"), choices });
    return picked;
}
293
/**
 * Asks the user where seed URLs should come from and resolves to
 * `{ source, urls }`. The "file" choice is disabled when seeds.txt yielded
 * nothing; "custom" prompts for URLs via handleCustomUrls().
 */
async function handleSeedSource(seedsFromFile) {
    const fileSeedCount = seedsFromFile.length;
    const picked = await select({
        message: gradient("Select seed source:"),
        choices: [
            { name: `File (seeds.txt) - ${fileSeedCount} URLs found`, value: "file", disabled: fileSeedCount === 0 },
            { name: "Config - Use default seeds from config.ts", value: "config" },
            { name: "Custom - Enter your own URLs", value: "custom" },
        ],
    });
    switch (picked) {
        case "file":
            return { source: "file", urls: seedsFromFile };
        case "config":
            return { source: "config", urls: config.SEED_URLS };
        default: {
            const urls = await handleCustomUrls();
            return { source: "custom", urls };
        }
    }
}
308
/**
 * Prompts for a comma-separated list of URLs until every entry passes
 * validateSeedUrls(). Each rejected entry's reason is logged before asking
 * again. Resolves to the list of valid URLs.
 */
async function handleCustomUrls() {
    for (;;) {
        const answer = await input({ message: gradient("Enter URLs (comma-separated):") });
        const candidates = answer
            .split(",")
            .map((piece) => piece.trim())
            .filter(Boolean);
        const { valid, invalid } = validateSeedUrls(candidates);
        if (invalid.length === 0) {
            return valid;
        }
        invalid.forEach((rejected) => {
            err(rejected.reason);
        });
    }
}
321
/**
 * Prompts for a whole number within [min, max] and resolves to it.
 *
 * Range checking is done in the prompt's `validate` callback, so this
 * resolves once with an accepted value; no outer retry loop is needed.
 * The input must consist solely of an optionally signed run of digits;
 * plain parseInt would let values such as "12abc" or "3.7" through.
 *
 * @param {string} prompt Label shown to the user.
 * @param {number} currentValue Current setting, shown in the prompt.
 * @param {number} [min=0] Smallest accepted value.
 * @param {number} [max] Largest accepted value; unbounded when omitted.
 * @returns {Promise<number>} The accepted integer.
 */
async function handleNumericConfig(prompt, currentValue, min = 0, max) {
    const wholeNumber = /^[+-]?\d+$/;
    const value = await input({
        message: gradient(`${prompt} (current: ${currentValue}):`),
        validate: (v) => {
            const text = String(v).trim();
            if (!wholeNumber.test(text))
                return "Please enter a valid number";
            const num = Number.parseInt(text, 10);
            if (num < min)
                return `Value must be >= ${min}`;
            if (max !== undefined && num > max)
                return `Value must be <= ${max}`;
            return true;
        },
    });
    return Number.parseInt(value, 10);
}
339
+ // ─── Command REPL ─────────────────────────────────────────────────────────────
340
/**
 * Runs the interactive slash-command loop used to configure a crawl.
 *
 * Starts from defaultSettings(), prints the summary, then reads commands
 * until "/start" is entered with at least one seed URL available, at which
 * point the settings object is returned. "/quit", "/exit" and "/q"
 * terminate the process.
 *
 * Numeric commands accept an inline value ("/depth 3") or, without one,
 * fall back to an interactive prompt. Both paths apply the same bounds;
 * in particular "/workers <n>" is capped at the limit the prompt enforces.
 *
 * @param {string[]} seedsFromFile URLs read from seeds.txt.
 * @returns {Promise<object>} The settings chosen by the user.
 */
async function runCommandRepl(seedsFromFile) {
    // Upper bound for the worker count, shared by the inline and prompt paths.
    const MAX_WORKER_COUNT = 100;
    const s = defaultSettings(seedsFromFile);
    // Parses an inline numeric argument; logs the problem and yields null
    // when it is not an integer within [min, max].
    const parseInline = (raw, label, min, max) => {
        const parsed = parseIntValue(raw, label, min);
        if (typeof parsed === "string") {
            err(parsed);
            return null;
        }
        if (max !== undefined && parsed > max) {
            err(`${label} must be an integer <= ${max}`);
            return null;
        }
        return parsed;
    };
    printSummary(s, seedsFromFile);
    console.log(CYAN(STARS.small) + DIM(" type a command to configure the crawler.") +
        " " + G("/help") + DIM(" for options,") +
        " " + W("/start") + DIM(" to begin."));
    blank();
    while (true) {
        // A fresh readline interface is created per line and closed right
        // after — presumably so it does not hold stdin while the
        // @inquirer prompts below run (TODO confirm).
        const rl = readline.createInterface({ input: input$, output: stdout });
        let line = "";
        try {
            line = await rl.question(CYAN(" " + STARS.half + " › "));
        }
        finally {
            rl.close();
        }
        const trimmed = line.trim();
        if (!trimmed)
            continue;
        const cmd = parseSlashCommand(trimmed);
        if (!cmd) {
            warn("Commands must start with / — try " + G("/help"));
            continue;
        }
        switch (cmd.key) {
            case "help":
                printHelp();
                break;
            case "status":
                printSummary(s, seedsFromFile);
                break;
            case "output":
                s.outputMode = await handleOutputMode();
                ok(`output -> ${G(s.outputMode)}`);
                break;
            case "seeds": {
                const result = await handleSeedSource(seedsFromFile);
                s.seedSource = result.source;
                if (result.source === "custom")
                    s.customUrls = result.urls;
                ok(`seeds source -> ${G(result.source)} (${result.urls.length} URLs)`);
                break;
            }
            case "depth": {
                const next = cmd.value
                    ? parseInline(cmd.value, "depth", 0)
                    : await handleNumericConfig("Max depth", s.maxDepth, 0);
                if (next === null)
                    break;
                s.maxDepth = next;
                ok(`max_depth -> ${G(String(s.maxDepth))}`);
                break;
            }
            case "delay": {
                // Inline values below the minimum are raised (with a
                // warning); the prompt rejects them outright instead.
                let next;
                if (cmd.value) {
                    const parsed = parseInline(cmd.value, "delay", 0);
                    if (parsed === null)
                        break;
                    next = enforceCrawlDelay(parsed);
                }
                else {
                    next = await handleNumericConfig("Crawl delay (ms)", s.crawlDelayMs, MIN_CRAWL_DELAY_MS);
                }
                s.crawlDelayMs = next;
                ok(`crawl_delay_ms -> ${G(String(s.crawlDelayMs) + "ms")}`);
                break;
            }
            case "workers": {
                const next = cmd.value
                    ? parseInline(cmd.value, "workers", 1, MAX_WORKER_COUNT)
                    : await handleNumericConfig("Worker count", s.workerCount, 1, MAX_WORKER_COUNT);
                if (next === null)
                    break;
                s.workerCount = next;
                ok(`worker_count -> ${G(String(s.workerCount))}`);
                break;
            }
            case "maxpages":
            case "maxcount": {
                const next = cmd.value
                    ? parseInline(cmd.value, "maxpages", 0)
                    : await handleNumericConfig("Max pages (0 = unlimited)", s.maxPages, 0);
                if (next === null)
                    break;
                s.maxPages = next;
                ok(`max_pages -> ${G(s.maxPages === 0 ? "unlimited" : String(s.maxPages))}`);
                break;
            }
            case "start": {
                const seedCount = s.seedSource === "file" ? seedsFromFile.length :
                    s.seedSource === "custom" ? s.customUrls.length :
                        config.SEED_URLS.length;
                if (seedCount === 0) {
                    err("No seed URLs configured — use /seeds first");
                    break;
                }
                return s;
            }
            case "quit":
            case "exit":
            case "q":
                process.exit(0);
                break;
            default:
                warn(`Unknown command ${G("/" + cmd.key)} — try ${G("/help")}`);
        }
    }
}
459
/**
 * Prints the boxed "LAUNCHING" table shown right before the crawl starts.
 * Depth, workers, page cap and allowed domains are read from `config`;
 * the output mode comes from `s` and the delay from `crawlDelayMs`.
 */
function printLaunchSummary(seedUrls, s, crawlDelayMs) {
    const seedCount = seedUrls.length;
    const outputLabel = s.outputMode === "pdf" ? "pdf -> output/documentation*.pdf" : "postgresql";
    const pageCap = config.MAX_PAGES === 0 ? "unlimited" : String(config.MAX_PAGES);
    blank();
    console.log(CYAN(` ${STARS.full} ${gradient("CRAWLER LAUNCHED")} ${STARS.full}`));
    console.log(drawTopBorder("LAUNCHING"));
    const rows = [
        ["seeds", `${seedCount} url${seedCount !== 1 ? "s" : ""} queued`],
        ["domains", config.ALLOWED_DOMAINS.join(", ")],
        ["output", outputLabel],
        ["depth", String(config.MAX_DEPTH)],
        ["delay", `${crawlDelayMs}ms`],
        ["workers", String(config.WORKER_COUNT)],
        ["maxpages", pageCap],
    ];
    rows.forEach(([key, value]) => {
        console.log(drawRow(key, gradient(value), getVisibleLength(value)));
    });
    console.log(drawBottomBorder());
    blank();
}
478
+ // ─── Main Execution ───────────────────────────────────────────────────────────
479
/**
 * CLI entry point.
 *
 * Prints the banner and a preview of seeds.txt, runs the command REPL,
 * copies the chosen settings into `config`, validates the seed URLs,
 * initialises the output strategy, then resets locks, clears pending URLs,
 * seeds the database and runs the scheduler. The strategy is always
 * finished and the connection pool always closed afterwards.
 *
 * A failure during the crawl is reported and sets a non-zero exit code
 * rather than being rethrown.
 */
async function main() {
    printBanner();
    const seedsFromFile = readSeedsFile();
    if (seedsFromFile.length > 0) {
        console.log(CYAN(" " + STARS.half + " " + gradient(`Found ${seedsFromFile.length} URLs in seeds.txt`)));
        console.log(drawTopBorder(`seeds.txt (${seedsFromFile.length})`));
        // Only the first five entries are previewed.
        for (const u of seedsFromFile.slice(0, 5)) {
            const innerLeft = " " + u;
            console.log(DG(" │") + CYAN(innerLeft) + " ".repeat(Math.max(0, INNER_WIDTH - getVisibleLength(innerLeft))) + DG("│"));
        }
        console.log(drawBottomBorder());
        blank();
    }
    const s = await runCommandRepl(seedsFromFile);
    const crawlDelayMs = enforceCrawlDelay(s.crawlDelayMs);
    config.MAX_DEPTH = s.maxDepth;
    config.CRAWL_DELAY_MS = crawlDelayMs;
    config.WORKER_COUNT = s.workerCount;
    config.MAX_PAGES = s.maxPages;
    config.OUTPUT_MODE = s.outputMode;
    const rawSeeds = s.seedSource === "file" ? seedsFromFile : s.seedSource === "custom" ? s.customUrls : config.SEED_URLS;
    const { valid: seedUrls } = validateSeedUrls(rawSeeds);
    config.SEED_URLS = seedUrls;
    config.ALLOWED_DOMAINS = extractDomains(seedUrls);
    printLaunchSummary(seedUrls, s, crawlDelayMs);
    const strategy = createStrategy(s.outputMode);
    setStrategy(strategy);
    await strategy.init();
    try {
        await resetStaleLocks();
        await clearPendingURLs(config.ALLOWED_DOMAINS);
        await seedDatabase();
        // Background status text engine starts up before launching the scheduler
        blank();
        startGlowStatus("CRAWLING THROUGH THE SITE...");
        await startScheduler();
        stopGlowStatus();
        ok("Crawl finished successfully.");
    }
    catch (error) {
        stopGlowStatus();
        err("Fatal error in main loop.");
        console.error(error);
        // Let shells and CI see that the crawl did not complete.
        process.exitCode = 1;
    }
    finally {
        stopGlowStatus();
        try {
            await strategy.finish();
        }
        finally {
            // Close the pool even when finishing the output strategy throws.
            await pool.end();
        }
        blank();
        console.log(CYAN(" " + STARS.full + " " + gradient("Session complete") + " " + STARS.full));
        ok("Database connection closed.");
        blank();
    }
}
// Failures outside the guarded crawl section (REPL, strategy init, cleanup)
// would otherwise surface as an unhandled promise rejection.
main().catch((error) => {
    console.error(error);
    process.exitCode = 1;
});
@@ -0,0 +1,33 @@
1
/**
 * Produces the canonical form of a URL used for de-duplication.
 *
 * Steps:
 *   1. Resolve `rawUrl` against `baseUrl` (the URL constructor lower-cases
 *      scheme and host).
 *   2. Reject anything that is not http or https.
 *   3. Drop one trailing slash from the path.
 *   4. Rebuild the URL from protocol, hostname, port, path and query only —
 *      the fragment (and any credentials) are therefore left out.
 *
 * @returns {string | null} The normalised URL, or null when the input
 *   cannot be parsed or uses another scheme.
 */
export function normalizeURL(rawUrl, baseUrl) {
    let parsed;
    try {
        parsed = new URL(rawUrl, baseUrl);
    }
    catch {
        return null;
    }
    if (!["http:", "https:"].includes(parsed.protocol)) {
        return null;
    }
    const path = parsed.pathname.endsWith("/")
        ? parsed.pathname.slice(0, -1)
        : parsed.pathname;
    const port = parsed.port ? `:${parsed.port}` : "";
    const query = parsed.search || "";
    return `${parsed.protocol}//${parsed.hostname}${port}${path}${query}`;
}
25
/**
 * Returns the hostname of an absolute URL string, or null when the string
 * cannot be parsed as a URL.
 */
export function getDomain(urlStr) {
    let parsed;
    try {
        parsed = new URL(urlStr);
    }
    catch {
        return null;
    }
    return parsed.hostname;
}
@@ -0,0 +1,16 @@
1
+ import { markDone } from "../db/queries.js";
2
/**
 * Output strategy that stores crawled page content in PostgreSQL.
 * This is the default structured-data output pipeline.
 *
 * init() and finish() are intentionally empty: per the notes in this file,
 * the pool is set up by client.ts and closed by index.ts.
 */
export class DatabaseStrategy {
    /** No set-up required; the pool is already initialised by client.ts. */
    async init() { }
    /**
     * Marks the URL row as done and stores its content.
     * The URL string itself is not used by this strategy.
     */
    async save(urlId, _url, content) {
        await markDone(urlId, content);
    }
    /** No tear-down required; the connection pool is closed by index.ts. */
    async finish() { }
}