messi-crawler 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +201 -0
- package/dist/cli/renderer.js +71 -0
- package/dist/config.js +18 -0
- package/dist/db/clear.js +16 -0
- package/dist/db/client.js +20 -0
- package/dist/db/queries.js +179 -0
- package/dist/frontier/frontier.js +44 -0
- package/dist/frontier/logger.js +65 -0
- package/dist/frontier/robots.js +46 -0
- package/dist/frontier/scheduler.js +98 -0
- package/dist/index.js +533 -0
- package/dist/normalizer.js +33 -0
- package/dist/output/db-strategy.js +16 -0
- package/dist/output/index.js +23 -0
- package/dist/output/pdf-strategy.js +316 -0
- package/dist/output/strategy.js +1 -0
- package/dist/security/ssrf.js +45 -0
- package/dist/security/validate-url.js +41 -0
- package/dist/seed.js +14 -0
- package/dist/setup.js +148 -0
- package/dist/test/client.test.js +33 -0
- package/dist/test/downloader.test.js +84 -0
- package/dist/test/extractor.test.js +126 -0
- package/dist/test/frontier.test.js +43 -0
- package/dist/test/logger.test.js +55 -0
- package/dist/test/normalizer.test.js +36 -0
- package/dist/test/pdf-strategy.test.js +68 -0
- package/dist/test/queries.test.js +173 -0
- package/dist/test/robots.test.js +46 -0
- package/dist/test/scheduler.test.js +73 -0
- package/dist/test/seed.test.js +26 -0
- package/dist/test/worker.test.js +118 -0
- package/dist/worker/downloader.js +114 -0
- package/dist/worker/extractor.js +197 -0
- package/dist/worker/worker.js +87 -0
- package/package.json +48 -0
- package/seeds.txt +4 -0
- package/src/cli/renderer.ts +83 -0
- package/src/config.ts +22 -0
- package/src/db/clear.ts +16 -0
- package/src/db/client.ts +26 -0
- package/src/db/queries.ts +255 -0
- package/src/db/schema.sql +43 -0
- package/src/frontier/frontier.ts +60 -0
- package/src/frontier/logger.ts +75 -0
- package/src/frontier/robots.ts +50 -0
- package/src/frontier/scheduler.ts +119 -0
- package/src/index.ts +596 -0
- package/src/normalizer.ts +37 -0
- package/src/output/db-strategy.ts +20 -0
- package/src/output/index.ts +32 -0
- package/src/output/pdf-strategy.ts +388 -0
- package/src/output/strategy.ts +16 -0
- package/src/security/ssrf.ts +48 -0
- package/src/security/validate-url.ts +49 -0
- package/src/seed.ts +18 -0
- package/src/setup.ts +170 -0
- package/src/test/client.test.ts +38 -0
- package/src/test/downloader.test.ts +101 -0
- package/src/test/extractor.test.ts +139 -0
- package/src/test/frontier.test.ts +53 -0
- package/src/test/logger.test.ts +71 -0
- package/src/test/normalizer.test.ts +43 -0
- package/src/test/pdf-strategy.test.ts +84 -0
- package/src/test/queries.test.ts +247 -0
- package/src/test/robots.test.ts +56 -0
- package/src/test/scheduler.test.ts +90 -0
- package/src/test/seed.test.ts +35 -0
- package/src/test/worker.test.ts +144 -0
- package/src/worker/downloader.ts +149 -0
- package/src/worker/extractor.ts +235 -0
- package/src/worker/worker.ts +100 -0
- package/tsconfig.json +15 -0
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
import { config } from "../config.js";
|
|
2
|
+
import { claimNextURL } from "../db/queries.js";
|
|
3
|
+
import { getPendingDomains } from "./frontier.js";
|
|
4
|
+
import { processPage } from "../worker/worker.js";
|
|
5
|
+
import { startProgressLogger, stopProgressLogger } from "./logger.js";
|
|
6
|
+
const cooldowns = new Map();
|
|
7
|
+
let activeWorkers = 0;
|
|
8
|
+
let lastDomainIndex = 0;
|
|
9
|
+
let isRunning = false;
|
|
10
|
+
// Pages dispatched in this session (in-memory counter, not cumulative DB total)
|
|
11
|
+
let sessionPageCount = 0;
|
|
12
|
+
/**
 * Returns a promise that settles (with `undefined`) once roughly `ms`
 * milliseconds have elapsed. Used by the scheduler to back off without
 * blocking the event loop.
 *
 * @param {number} ms - Delay in milliseconds.
 * @returns {Promise<void>}
 */
export const sleep = (ms) =>
  new Promise((wake) => {
    setTimeout(wake, ms);
  });
|
|
13
|
+
/**
 * Starts the round-robin scheduler loop and resolves once the crawl is over.
 *
 * Every tick claims at most one URL: the pending domains reported by the
 * frontier are walked round-robin, any domain still inside its politeness
 * cooldown is skipped, and the first successful claim is handed to a worker.
 * The loop ends when MAX_PAGES URLs have been dispatched in this session,
 * when no domain is pending and no worker is in flight, or when
 * stopScheduler() clears the running flag.
 *
 * Calling this while a loop is already running is a no-op.
 *
 * @returns {Promise<void>}
 */
export async function startScheduler() {
  if (isRunning) {
    return;
  }
  isRunning = true;
  sessionPageCount = 0;
  try {
    // Start the periodic progress logger
    await startProgressLogger();
    while (isRunning) {
      // Enforce MAX_PAGES limit against this session's dispatched count
      if (config.MAX_PAGES > 0 && sessionPageCount >= config.MAX_PAGES) {
        // Wait for any in-flight workers to finish before stopping
        while (activeWorkers > 0) {
          await sleep(100);
        }
        console.log(`\n✓ Crawl complete — ${sessionPageCount} page(s) processed.\n`);
        break;
      }
      // 1. Enforce worker concurrency limit
      if (activeWorkers >= config.WORKER_COUNT) {
        await sleep(50);
        continue;
      }
      // 2. Fetch active pending domains from the frontier
      const domains = await getPendingDomains();
      if (domains.length === 0) {
        // Exit if there are no pending URLs and all workers are idle
        if (activeWorkers === 0) {
          break;
        }
        await sleep(100);
        continue;
      }
      let claimed = false;
      const now = Date.now();
      // 3. Round-robin traversal over domains
      for (let i = 0; i < domains.length; i++) {
        const idx = (lastDomainIndex + i) % domains.length;
        const domain = domains[idx];
        // Enforce politeness delay
        const nextAllowed = cooldowns.get(domain) || 0;
        if (now < nextAllowed) {
          continue;
        }
        // Try to atomically claim a URL for this domain
        const urlRow = await claimNextURL(domain);
        if (urlRow) {
          // Set the domain cooldown
          cooldowns.set(domain, Date.now() + config.CRAWL_DELAY_MS);
          // Update round-robin start index for the next tick
          lastDomainIndex = (idx + 1) % domains.length;
          // Dispatch worker; it is deliberately not awaited so that other
          // domains can be served while this page is being processed.
          activeWorkers++;
          sessionPageCount++;
          processPage(urlRow)
            .catch((err) => {
              console.error(`Error processing ${urlRow.url}:`, err);
            })
            .finally(() => {
              activeWorkers--;
            });
          claimed = true;
          break;
        }
      }
      // If no URL was claimed (e.g. all domains in cooldown or DB lock contention), sleep
      if (!claimed) {
        await sleep(50);
      }
    }
  } finally {
    // Runs on normal exit AND when a frontier/DB call throws, so the running
    // flag never stays stuck at true and the progress logger is always stopped.
    isRunning = false;
    stopProgressLogger();
  }
}
|
|
89
|
+
/**
 * Asks the scheduler loop to stop: clears the running flag, which the loop in
 * startScheduler() checks at the top of each iteration, and stops the
 * progress logger.
 */
export function stopScheduler() {
  isRunning = false;
  stopProgressLogger();
}
|
|
93
|
+
/**
 * Reports how many dispatched pages have not yet settled.
 *
 * @returns {number} Current value of the in-memory worker counter.
 */
export function getActiveWorkersCount() {
  return activeWorkers;
}
|
|
96
|
+
/**
 * Looks up the epoch-millisecond timestamp recorded as the end of a domain's
 * cooldown.
 *
 * @param {string} domain - Domain key as used by the scheduler.
 * @returns {number} The recorded timestamp, or 0 when none is recorded.
 */
export function getCooldown(domain) {
  const nextAllowedAt = cooldowns.get(domain);
  return nextAllowedAt ? nextAllowedAt : 0;
}
|
package/dist/index.js
ADDED
|
@@ -0,0 +1,533 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
import { input, select } from "@inquirer/prompts";
|
|
3
|
+
import * as readline from "node:readline/promises";
|
|
4
|
+
import { stdin as input$, stdout } from "node:process";
|
|
5
|
+
import fs from "fs";
|
|
6
|
+
import { seedDatabase } from "./seed.js";
|
|
7
|
+
import { resetStaleLocks, clearPendingURLs } from "./db/queries.js";
|
|
8
|
+
import { startScheduler } from "./frontier/scheduler.js";
|
|
9
|
+
import { pool } from "./db/client.js";
|
|
10
|
+
import { config } from "./config.js";
|
|
11
|
+
import { createStrategy, setStrategy } from "./output/index.js";
|
|
12
|
+
import { validateSeedUrls } from "./security/validate-url.js";
|
|
13
|
+
const SEEDS_FILE = "seeds.txt";
|
|
14
|
+
const MIN_CRAWL_DELAY_MS = 500;
|
|
15
|
+
// Unified layout grid definition
|
|
16
|
+
const INNER_WIDTH = 72;
|
|
17
|
+
// ─── Terminal primitives ──────────────────────────────────────────────────────
|
|
18
|
+
const ESC = "\x1b[";
|
|
19
|
+
const RESET = "\x1b[0m";
|
|
20
|
+
/**
 * Paints `text` with a left-to-right green-to-blue 24-bit colour gradient.
 *
 * Every character is prefixed with the colour for its relative position and
 * a single RESET is appended at the end. The text is walked by Unicode code
 * point, so characters outside the BMP (emoji, etc.) are never split into
 * surrogate halves with an escape sequence between them.
 *
 * @param {string} text - Plain text to colourise.
 * @returns {string} The text with ANSI colour codes, terminated by RESET.
 */
const gradient = (text) => {
  const colors = [
    "\x1b[38;2;0;255;127m", // Spring Green
    "\x1b[38;2;0;255;150m",
    "\x1b[38;2;0;230;180m",
    "\x1b[38;2;0;200;200m",
    "\x1b[38;2;0;170;220m",
    "\x1b[38;2;0;140;240m",
    "\x1b[38;2;0;100;255m", // Dodger Blue
    "\x1b[38;2;0;70;255m",
  ];
  // Spread iterates by code point; split("") would cut surrogate pairs apart.
  const chars = [...text];
  let result = "";
  chars.forEach((ch, i) => {
    const colorIndex = Math.floor((i / chars.length) * colors.length);
    result += colors[colorIndex % colors.length] + ch;
  });
  return result + RESET;
};
|
|
40
|
+
// Clean structural geometric patterns
|
|
41
|
+
const STARS = {
|
|
42
|
+
full: "✦",
|
|
43
|
+
half: "✧",
|
|
44
|
+
small: "⋆",
|
|
45
|
+
star: "★",
|
|
46
|
+
outline: "☆",
|
|
47
|
+
};
|
|
48
|
+
// Builds a wrapper that emits the SGR sequence for `code`, then the text,
// then RESET so the style never leaks into following output.
const sgr = (code) => (s) => `\x1b[${code}m${s}${RESET}`;
const G = sgr("32"); // green
const DG = sgr("2;32"); // dim green
const YL = sgr("33"); // yellow
const RD = sgr("31"); // red
const W = sgr("97"); // bright white
const DIM = sgr("2"); // dim
const BOLD = sgr("1"); // bold
const BLUE = sgr("34"); // blue
const CYAN = sgr("36"); // cyan
|
|
57
|
+
const BOX = {
|
|
58
|
+
tl: "┌", tr: "┐", bl: "└", br: "┘",
|
|
59
|
+
h: "─", v: "│", lm: "├", rm: "┤",
|
|
60
|
+
tm: "┬", bm: "┴", cross: "┼",
|
|
61
|
+
};
|
|
62
|
+
/**
 * Measures how many UTF-16 code units of `s` remain after removing ANSI SGR
 * colour/style sequences (ESC [ ... m), i.e. its on-screen width for the
 * box-drawing helpers.
 *
 * @param {string} s - Text that may contain SGR escape sequences.
 * @returns {number} Length of the text without those sequences.
 */
function getVisibleLength(s) {
  const withoutSgr = s.replace(/\x1b\[[0-9;]*m/g, "");
  return withoutSgr.length;
}
|
|
65
|
+
// ─── Box Layout Engine ────────────────────────────────────────────────────────
|
|
66
|
+
/**
 * Renders the top edge of a box with an embedded label, padded with
 * horizontal rules so the closing corner lines up with the other rows.
 *
 * @param {string} label - Caption shown inside the top border.
 * @returns {string} The dim-green border line.
 */
function drawTopBorder(label) {
  const head = ` ┌─ ${label} `;
  // Total target width is INNER_WIDTH + 4; one cell is reserved for the corner.
  const fill = Math.max(0, INNER_WIDTH + 4 - getVisibleLength(head) - 1);
  return DG(`${head}${BOX.h.repeat(fill)}┐`);
}
|
|
72
|
+
/**
 * Renders the bottom edge of a box: corner, INNER_WIDTH horizontal rules,
 * corner, all in dim green.
 *
 * @returns {string} The border line.
 */
function drawBottomBorder() {
  const rule = BOX.h.repeat(INNER_WIDTH);
  return DG(` └${rule}┘`);
}
|
|
75
|
+
/**
 * Renders one two-column row of a box.
 *
 * @param {string} leftContent - Left cell (may contain ANSI codes); padded
 *   with spaces to 16 visible characters.
 * @param {string} rightContentRendered - Right cell exactly as it should be
 *   printed (may contain ANSI codes).
 * @param {number} rightContentRawLength - Visible width of the right cell,
 *   used to compute the trailing padding before the closing border.
 * @returns {string} The full row including both vertical borders.
 */
function drawRow(leftContent, rightContentRendered, rightContentRawLength) {
  const leftGap = Math.max(0, 16 - getVisibleLength(leftContent));
  const rightGap = Math.max(0, INNER_WIDTH - 18 - rightContentRawLength);
  const leftCell = `${leftContent}${" ".repeat(leftGap)}`;
  const rightCell = `${rightContentRendered}${" ".repeat(rightGap)}`;
  return `${DG(" │")} ${leftCell}${rightCell}${DG("│")}`;
}
|
|
81
|
+
// ─── Background Glowing Animation Engine ──────────────────────────────────────
|
|
82
|
+
let glowInterval = null;
|
|
83
|
+
let currentGlowText = "";
|
|
84
|
+
let glowFrameIndex = 0;
|
|
85
|
+
// High-density frames with elevated base values to prevent vanishing artifacts
|
|
86
|
+
const glowFrames = [
|
|
87
|
+
"\x1b[38;2;0;140;100m", // Baseline low (Crisp Medium Mint)
|
|
88
|
+
"\x1b[38;2;0;175;130m",
|
|
89
|
+
"\x1b[38;2;0;205;155m",
|
|
90
|
+
"\x1b[38;2;0;235;185m",
|
|
91
|
+
"\x1b[38;2;0;255;210m", // Vivid Cyan-Green
|
|
92
|
+
"\x1b[38;2;100;255;225m", // High Glow
|
|
93
|
+
"\x1b[38;2;180;255;245m", // Peak Brightness
|
|
94
|
+
"\x1b[38;2;100;255;225m",
|
|
95
|
+
"\x1b[38;2;0;255;210m",
|
|
96
|
+
"\x1b[38;2;0;235;185m",
|
|
97
|
+
"\x1b[38;2;0;205;155m",
|
|
98
|
+
"\x1b[38;2;0;175;130m"
|
|
99
|
+
];
|
|
100
|
+
/**
 * Repaints the animated status line in place using the next colour from
 * glowFrames, then advances the frame counter. Does nothing when no status
 * text is active.
 */
function drawGlowLine() {
  if (currentGlowText) {
    const frameColor = glowFrames[glowFrameIndex % glowFrames.length];
    // "\r\x1b[K" returns to column 0 and erases the line before redrawing.
    process.stdout.write(`\r\x1b[K ${CYAN(STARS.full)} ${frameColor}${BOLD(currentGlowText)}${RESET}`);
    glowFrameIndex += 1;
  }
}
|
|
107
|
+
/**
 * Shows `text` as an animated "glowing" status line and keeps repainting it
 * every 120 ms until stopGlowStatus() is called.
 *
 * Safe to call while a status is already showing: the previous timer is
 * cleared first, so repeated calls replace the text instead of stacking
 * timers that could never be stopped.
 *
 * @param {string} text - Status message to animate.
 */
export function startGlowStatus(text) {
  // Drop any timer left over from an earlier call; otherwise its handle is
  // overwritten below and it would keep firing for the life of the process.
  if (glowInterval) {
    clearInterval(glowInterval);
    glowInterval = null;
  }
  currentGlowText = text;
  glowFrameIndex = 0;
  // Hide native terminal cursor to keep rendering clean
  process.stdout.write("\x1b[?25l");
  drawGlowLine();
  glowInterval = setInterval(drawGlowLine, 120); // Frequency locked at 120ms
}
|
|
115
|
+
/**
 * Stops the status-line animation: cancels the repaint timer if one is
 * running, forgets the status text, erases the current line and makes the
 * terminal cursor visible again.
 */
export function stopGlowStatus() {
  const timer = glowInterval;
  if (timer) {
    clearInterval(timer);
    glowInterval = null;
  }
  currentGlowText = "";
  // Clear the animation frame line and restore cursor
  process.stdout.write("\r\x1b[K\x1b[?25h");
}
|
|
124
|
+
// ─── Logging Helpers (Interception-Aware) ─────────────────────────────────────
|
|
125
|
+
// Dimmed HH:MM:SS clock (taken from the UTC ISO timestamp) used as the log-line prefix.
const ts = () => {
  const isoNow = new Date().toISOString();
  return DIM(isoNow.substring(11, 19));
};
|
|
126
|
+
/**
 * Logs a success line (timestamp, green check mark, message). When a glow
 * status line is active it is erased first and redrawn underneath.
 *
 * @param {string} msg - Message to print.
 */
export function ok(msg) {
  if (currentGlowText) {
    process.stdout.write("\r\x1b[K");
  }
  console.log(` ${ts()} ${G("✓")} ${msg}`);
  // drawGlowLine() returns immediately when no status text is active.
  drawGlowLine();
}
|
|
133
|
+
/**
 * Logs a warning line (timestamp, yellow warning sign, message). When a glow
 * status line is active it is erased first and redrawn underneath.
 *
 * @param {string} msg - Message to print.
 */
export function warn(msg) {
  if (currentGlowText) {
    process.stdout.write("\r\x1b[K");
  }
  console.log(` ${ts()} ${YL("⚠")} ${msg}`);
  // drawGlowLine() returns immediately when no status text is active.
  drawGlowLine();
}
|
|
140
|
+
/**
 * Logs an error line (timestamp, red cross, message) to standard output.
 * When a glow status line is active it is erased first and redrawn
 * underneath.
 *
 * @param {string} msg - Message to print.
 */
export function err(msg) {
  if (currentGlowText) {
    process.stdout.write("\r\x1b[K");
  }
  console.log(` ${ts()} ${RD("✗")} ${msg}`);
  // drawGlowLine() returns immediately when no status text is active.
  drawGlowLine();
}
|
|
147
|
+
/**
 * Logs an informational line (timestamp, blue info sign, dimmed message).
 * When a glow status line is active it is erased first and redrawn
 * underneath.
 *
 * @param {string} msg - Message to print.
 */
export function info(msg) {
  if (currentGlowText) {
    process.stdout.write("\r\x1b[K");
  }
  console.log(` ${ts()} ${BLUE("ℹ")} ${DIM(msg)}`);
  // drawGlowLine() returns immediately when no status text is active.
  drawGlowLine();
}
|
|
154
|
+
/**
 * Prints an empty line. When a glow status line is active it is erased first
 * and redrawn underneath, so the animation always stays on the last line.
 */
export function blank() {
  if (currentGlowText) {
    process.stdout.write("\r\x1b[K");
  }
  console.log("");
  // drawGlowLine() returns immediately when no status text is active.
  drawGlowLine();
}
|
|
161
|
+
// ─── Banner ───────────────────────────────────────────────────────────────────
|
|
162
|
+
/**
 * Clears the terminal and prints the start-up banner: the gradient ASCII-art
 * title with white star accents, a tagline with the version, and a rule.
 */
export function printBanner() {
  console.clear();
  blank();
  const artRows = [
    " ██╗ ██╗███████╗██████╗ ✦ ██████╗██████╗ █████╗ ██╗ ██╗██╗ ███████╗██████╗ ",
    " ██║ ██║██╔════╝██╔══██╗ ██╔════╝██╔══██╗██╔══██╗██║ ██║██║ ✦ ██╔════╝██╔══██╗",
    " ██║ █╗ ██║█████╗ ██████╔╝ ✦ ██║ ██████╔╝███████║██║ █╗ ██║██║ █████╗ ██████╔╝",
    " ██║███╗██║██╔══╝ ██╔══██╗ ██║ ██╔══██╗██╔══██║██║███╗██║██║ ★ ██╔══╝ ██╔══██╗",
    " ╚███╔███╔╝███████╗██████╔╝ ★ ╚██████╗██║ ██║██║ ██║╚███╔███╔╝███████╗███████╗██║ ██║",
    " ╚══╝╚══╝ ╚══════╝╚═════╝ ╚═════╝╚═╝ ╚═╝╚═╝ ╚═╝ ╚══╝╚══╝ ╚══════╝╚══════╝╚═╝ ╚═╝",
  ];
  artRows.forEach((row) => {
    // Colour the whole row first, then repaint the star accents in white.
    const accented = gradient(row)
      .replace(/✦/g, W("✦"))
      .replace(/★/g, W("★"));
    console.log(` ${CYAN(STARS.full)} ${accented} ${CYAN(STARS.full)}`);
  });
  blank();
  const tagline = gradient("Node.js · TypeScript · PostgreSQL");
  console.log(` ${CYAN(STARS.half)} ${tagline} ${CYAN(STARS.half)} ${DIM("v1.0.0")}`);
  console.log(DG(" " + BOX.h.repeat(INNER_WIDTH + 2)));
  blank();
}
|
|
183
|
+
// ─── Status board ─────────────────────────────────────────────────────────────
|
|
184
|
+
/**
 * Prints the current crawler settings as a boxed table, one `/command` row
 * per setting. Values containing "!" (the below-minimum delay warning) are
 * rendered in yellow, everything else with the gradient.
 *
 * @param {object} s - Settings object (outputMode, seedSource, customUrls,
 *   maxDepth, crawlDelayMs, workerCount, maxPages).
 * @param {string[]} seedsFromFile - URLs read from seeds.txt.
 */
export function printSummary(s, seedsFromFile) {
  const describeCount = (name, n) => `${name} (${n} url${n !== 1 ? "s" : ""})`;
  let seedLabel;
  if (s.seedSource === "file") {
    seedLabel = describeCount("seeds.txt", seedsFromFile.length);
  } else if (s.seedSource === "custom") {
    seedLabel = describeCount("custom", s.customUrls.length);
  } else {
    seedLabel = "config.ts";
  }
  const delayNote = s.crawlDelayMs < MIN_CRAWL_DELAY_MS ? " ! below safe minimum" : "";
  const rows = [
    ["output", s.outputMode === "pdf" ? "pdf -> output/documentation*.pdf" : "database -> postgresql"],
    ["seeds", seedLabel],
    ["depth", String(s.maxDepth)],
    ["delay", `${s.crawlDelayMs}ms${delayNote}`],
    ["workers", String(s.workerCount)],
    ["maxpages", s.maxPages === 0 ? "unlimited" : String(s.maxPages)],
    ["start", "Start the crawl"],
  ];
  blank();
  console.log(CYAN(" " + STARS.small + " " + gradient("CONFIGURATION") + " " + STARS.small));
  console.log(drawTopBorder("CONFIG"));
  rows.forEach(([key, value]) => {
    const rendered = value.includes("!") ? YL(value) : gradient(value);
    console.log(drawRow(CYAN(BOLD("/" + key)), rendered, getVisibleLength(value)));
  });
  console.log(drawBottomBorder());
  blank();
}
|
|
209
|
+
// ─── Help panel ───────────────────────────────────────────────────────────────
|
|
210
|
+
/**
 * Prints the boxed list of slash commands understood by the setup REPL,
 * each with a short dimmed description.
 */
export function printHelp() {
  const commands = [
    ["/output", "switch output destination (interactive)"],
    ["/seeds", "choose seed URL source (interactive)"],
    ["/depth <n>", "max link traversal depth"],
    ["/delay <ms>", "politeness delay (min 500)"],
    ["/workers <n>", "concurrent worker count"],
    ["/maxpages <n>", "page cap per session (0 = unlimited)"],
    ["/status", "print current config"],
    ["/help", "show this panel"],
    ["/start", "validate and begin crawl"],
    ["/quit", "exit without crawling"],
  ];
  blank();
  console.log(CYAN(" " + STARS.star + " " + gradient("AVAILABLE COMMANDS") + " " + STARS.star));
  console.log(drawTopBorder("COMMANDS"));
  commands.forEach(([usage, description]) => {
    console.log(drawRow(CYAN(usage), DIM(description), getVisibleLength(description)));
  });
  console.log(drawBottomBorder());
  blank();
}
|
|
232
|
+
// ─── Interactive Setup Helpers ───────────────────────────────────────────────
|
|
233
|
+
/**
 * Reads seed URLs from SEEDS_FILE.
 *
 * Lines are trimmed; blank lines and lines starting with "#" are skipped.
 *
 * @returns {string[]} The remaining lines in file order, or an empty array
 *   when the file does not exist.
 */
function readSeedsFile() {
  if (!fs.existsSync(SEEDS_FILE)) {
    return [];
  }
  const seeds = [];
  for (const rawLine of fs.readFileSync(SEEDS_FILE, "utf-8").split("\n")) {
    const entry = rawLine.trim();
    if (entry.length === 0 || entry.startsWith("#")) {
      continue;
    }
    seeds.push(entry);
  }
  return seeds;
}
|
|
242
|
+
/**
 * Collects the distinct hostnames of the given URLs, in first-seen order.
 *
 * @param {string[]} urls - Absolute URLs.
 * @returns {string[]} Unique hostnames; entries that cannot be parsed or
 *   have an empty hostname are left out.
 */
function extractDomains(urls) {
  // A Set keeps insertion order and makes the duplicate check O(1) per URL.
  const hostnames = new Set();
  for (const url of urls) {
    try {
      const { hostname } = new URL(url);
      if (hostname) {
        hostnames.add(hostname);
      }
    } catch {
      // Best effort: an unparsable URL contributes no domain and is skipped.
    }
  }
  return [...hostnames];
}
|
|
253
|
+
/**
 * Clamps a requested politeness delay to the safe minimum.
 *
 * The input is coerced to a number first. A value that is not a finite
 * number (NaN, undefined, non-numeric text) would otherwise slip past the
 * `<` comparison and effectively disable the per-domain cooldown, so it is
 * replaced by the minimum as well. A warning is logged whenever the value
 * is changed.
 *
 * @param {number} raw - Requested delay in milliseconds.
 * @returns {number} `raw` as a number when it is at least
 *   MIN_CRAWL_DELAY_MS, otherwise MIN_CRAWL_DELAY_MS.
 */
function enforceCrawlDelay(raw) {
  const delay = Number(raw);
  if (!Number.isFinite(delay)) {
    warn(`crawl_delay_ms is not a valid number — using ${MIN_CRAWL_DELAY_MS}ms`);
    return MIN_CRAWL_DELAY_MS;
  }
  if (delay < MIN_CRAWL_DELAY_MS) {
    warn(`crawl_delay_ms ${raw}ms is below safe minimum — raised to ${MIN_CRAWL_DELAY_MS}ms`);
    return MIN_CRAWL_DELAY_MS;
  }
  return delay;
}
|
|
260
|
+
/**
 * Splits a REPL line of the form `/name [argument…]` into its parts.
 *
 * @param {string} input - Raw line typed by the user.
 * @returns {{key: string, value: string} | null} The lower-cased command
 *   name and the trimmed argument text ("" when absent), or null when the
 *   line is not a slash command.
 */
function parseSlashCommand(input) {
  const parsed = /^\/(\w+)(?:\s+(.+))?$/.exec(input.trim());
  if (parsed === null) {
    return null;
  }
  const [, name, argument = ""] = parsed;
  return { key: name.toLowerCase(), value: argument.trim() };
}
|
|
266
|
+
/**
 * Parses a command argument as a base-10 integer with a lower bound.
 *
 * The whole argument (ignoring surrounding whitespace) must be an optionally
 * signed run of digits. Bare parseInt would accept a numeric prefix and
 * silently turn "12abc" into 12 or "1.5" into 1, contradicting the error
 * message below.
 *
 * @param {string} value - Argument text.
 * @param {string} label - Setting name used in the error message.
 * @param {number} [min=0] - Smallest accepted value.
 * @returns {number | string} The parsed integer, or an error message string
 *   when the text is not an integer or is below `min`.
 */
function parseIntValue(value, label, min = 0) {
  const text = String(value).trim();
  const n = /^[+-]?\d+$/.test(text) ? Number.parseInt(text, 10) : Number.NaN;
  if (Number.isNaN(n) || n < min) {
    return `${label} must be an integer >= ${min}`;
  }
  return n;
}
|
|
272
|
+
/**
 * Builds the initial REPL settings from the loaded config.
 *
 * @param {string[]} seedsFromFile - URLs read from seeds.txt; when non-empty
 *   the file becomes the default seed source, otherwise the config is used.
 * @returns {object} Fresh settings object with an empty custom URL list.
 */
function defaultSettings(seedsFromFile) {
  const hasFileSeeds = seedsFromFile.length > 0;
  const settings = {
    outputMode: config.OUTPUT_MODE,
    seedSource: "config",
    customUrls: [],
    maxDepth: config.MAX_DEPTH,
    crawlDelayMs: config.CRAWL_DELAY_MS,
    workerCount: config.WORKER_COUNT,
    maxPages: config.MAX_PAGES,
  };
  if (hasFileSeeds) {
    settings.seedSource = "file";
  }
  return settings;
}
|
|
283
|
+
/**
 * Asks the user to pick the output destination.
 *
 * @returns {Promise<string>} "pdf" or "database".
 */
async function handleOutputMode() {
  const choices = [
    { name: "PDF - Generate documentation PDF", value: "pdf" },
    { name: "Database - Store in PostgreSQL", value: "database" },
  ];
  const chosen = await select({ message: gradient("Select output mode:"), choices });
  return chosen;
}
|
|
293
|
+
/**
 * Asks the user where seed URLs should come from and resolves the matching
 * URL list. The seeds.txt option is disabled when the file yielded no URLs;
 * choosing "custom" prompts for the URLs.
 *
 * @param {string[]} seedsFromFile - URLs read from seeds.txt.
 * @returns {Promise<{source: string, urls: string[]}>} Chosen source
 *   ("file", "config" or "custom") with its URLs.
 */
async function handleSeedSource(seedsFromFile) {
  const fileCount = seedsFromFile.length;
  const choice = await select({
    message: gradient("Select seed source:"),
    choices: [
      { name: `File (seeds.txt) - ${fileCount} URLs found`, value: "file", disabled: fileCount === 0 },
      { name: "Config - Use default seeds from config.ts", value: "config" },
      { name: "Custom - Enter your own URLs", value: "custom" },
    ],
  });
  switch (choice) {
    case "file":
      return { source: "file", urls: seedsFromFile };
    case "config":
      return { source: "config", urls: config.SEED_URLS };
    default:
      return { source: "custom", urls: await handleCustomUrls() };
  }
}
|
|
308
|
+
/**
 * Prompts for a comma-separated list of URLs and keeps asking until no entry
 * is rejected by validateSeedUrls; each rejection reason is printed as an
 * error before the prompt repeats.
 *
 * @returns {Promise<string[]>} The validated URLs (may be empty when nothing
 *   was entered).
 */
async function handleCustomUrls() {
  for (;;) {
    const answer = await input({ message: gradient("Enter URLs (comma-separated):") });
    const entries = answer
      .split(",")
      .map((part) => part.trim())
      .filter((part) => part.length > 0);
    const { valid, invalid } = validateSeedUrls(entries);
    if (invalid.length === 0) {
      return valid;
    }
    invalid.forEach((rejected) => {
      err(rejected.reason);
    });
  }
}
|
|
321
|
+
/**
 * Prompts for an integer setting and returns it once it passes validation.
 *
 * The entry must consist solely of an optionally signed run of digits;
 * bare parseInt would accept inputs such as "5abc" or "1.5" by reading only
 * their numeric prefix. The prompt's own `validate` hook re-asks on invalid
 * input, so no surrounding loop is needed.
 *
 * @param {string} prompt - Human-readable setting name shown in the prompt.
 * @param {number} currentValue - Value displayed as the current setting.
 * @param {number} [min=0] - Smallest accepted value.
 * @param {number} [max] - Largest accepted value; unbounded when omitted.
 * @returns {Promise<number>} The integer entered by the user.
 */
async function handleNumericConfig(prompt, currentValue, min = 0, max) {
  const value = await input({
    message: gradient(`${prompt} (current: ${currentValue}):`),
    validate: (v) => {
      const text = String(v).trim();
      if (!/^[+-]?\d+$/.test(text)) {
        return "Please enter a valid number";
      }
      const num = Number.parseInt(text, 10);
      if (num < min) {
        return `Value must be >= ${min}`;
      }
      if (max !== undefined && num > max) {
        return `Value must be <= ${max}`;
      }
      return true;
    },
  });
  return Number.parseInt(value, 10);
}
|
|
339
|
+
// ─── Command REPL ─────────────────────────────────────────────────────────────
|
|
340
|
+
/**
 * Runs the interactive setup prompt until the user starts the crawl.
 *
 * Starts from defaultSettings(), prints the summary, then reads slash
 * commands one line at a time. Numeric commands accept their value inline
 * (`/depth 3`) or, when no value is given, ask for it interactively.
 * `/workers` is limited to 100 on both paths. `/quit`, `/exit` and `/q`
 * terminate the process.
 *
 * @param {string[]} seedsFromFile - URLs read from seeds.txt.
 * @returns {Promise<object>} The settings in effect when `/start` was
 *   accepted (only possible with at least one seed URL configured).
 */
async function runCommandRepl(seedsFromFile) {
  // Upper bound for the worker count, shared by the inline and prompt paths.
  const MAX_WORKERS = 100;
  const s = defaultSettings(seedsFromFile);
  printSummary(s, seedsFromFile);
  console.log(CYAN(STARS.small) + DIM(" type a command to configure the crawler.") +
    " " + G("/help") + DIM(" for options,") +
    " " + W("/start") + DIM(" to begin."));
  blank();
  while (true) {
    // A fresh readline interface per line: it is closed before any command
    // handler runs, so the interactive prompts get stdin to themselves.
    const rl = readline.createInterface({ input: input$, output: stdout });
    let line = "";
    try {
      line = await rl.question(CYAN(" " + STARS.half + " › "));
    }
    finally {
      rl.close();
    }
    const trimmed = line.trim();
    if (!trimmed) {
      continue;
    }
    const cmd = parseSlashCommand(trimmed);
    if (!cmd) {
      warn("Commands must start with / — try " + G("/help"));
      continue;
    }
    switch (cmd.key) {
      case "help":
        printHelp();
        break;
      case "status":
        printSummary(s, seedsFromFile);
        break;
      case "output":
        s.outputMode = await handleOutputMode();
        ok(`output -> ${G(s.outputMode)}`);
        break;
      case "seeds": {
        const result = await handleSeedSource(seedsFromFile);
        s.seedSource = result.source;
        if (result.source === "custom") {
          s.customUrls = result.urls;
        }
        ok(`seeds source -> ${G(result.source)} (${result.urls.length} URLs)`);
        break;
      }
      case "depth":
        if (cmd.value) {
          const v = parseIntValue(cmd.value, "depth", 0);
          if (typeof v === "string") {
            err(v);
            break;
          }
          s.maxDepth = v;
        }
        else {
          s.maxDepth = await handleNumericConfig("Max depth", s.maxDepth, 0);
        }
        ok(`max_depth -> ${G(String(s.maxDepth))}`);
        break;
      case "delay":
        if (cmd.value) {
          const v = parseIntValue(cmd.value, "delay", 0);
          if (typeof v === "string") {
            err(v);
            break;
          }
          // Inline values below the minimum are raised (with a warning)
          // rather than rejected.
          s.crawlDelayMs = enforceCrawlDelay(v);
        }
        else {
          s.crawlDelayMs = await handleNumericConfig("Crawl delay (ms)", s.crawlDelayMs, MIN_CRAWL_DELAY_MS);
        }
        ok(`crawl_delay_ms -> ${G(String(s.crawlDelayMs) + "ms")}`);
        break;
      case "workers":
        if (cmd.value) {
          const v = parseIntValue(cmd.value, "workers", 1);
          if (typeof v === "string") {
            err(v);
            break;
          }
          // Same ceiling the interactive prompt enforces below.
          if (v > MAX_WORKERS) {
            err(`workers must be an integer <= ${MAX_WORKERS}`);
            break;
          }
          s.workerCount = v;
        }
        else {
          s.workerCount = await handleNumericConfig("Worker count", s.workerCount, 1, MAX_WORKERS);
        }
        ok(`worker_count -> ${G(String(s.workerCount))}`);
        break;
      case "maxpages":
      case "maxcount":
        if (cmd.value) {
          const v = parseIntValue(cmd.value, "maxpages", 0);
          if (typeof v === "string") {
            err(v);
            break;
          }
          s.maxPages = v;
        }
        else {
          s.maxPages = await handleNumericConfig("Max pages (0 = unlimited)", s.maxPages, 0);
        }
        ok(`max_pages -> ${G(s.maxPages === 0 ? "unlimited" : String(s.maxPages))}`);
        break;
      case "start": {
        const seedCount = s.seedSource === "file" ? seedsFromFile.length :
          s.seedSource === "custom" ? s.customUrls.length :
            config.SEED_URLS.length;
        if (seedCount === 0) {
          err("No seed URLs configured — use /seeds first");
          break;
        }
        return s;
      }
      case "quit":
      case "exit":
      case "q":
        // process.exit() does not return, so nothing falls through to default.
        process.exit(0);
      default:
        warn(`Unknown command ${G("/" + cmd.key)} — try ${G("/help")}`);
    }
  }
}
|
|
459
|
+
/**
 * Prints the boxed "launching" summary shown right before the crawl starts.
 * Depth, workers, page cap and allowed domains are read from the shared
 * config object; the remaining rows come from the arguments.
 *
 * @param {string[]} seedUrls - Validated seed URLs that will be queued.
 * @param {object} s - Settings object; only `outputMode` is read here.
 * @param {number} crawlDelayMs - Effective politeness delay in milliseconds.
 */
function printLaunchSummary(seedUrls, s, crawlDelayMs) {
  const seedCount = seedUrls.length;
  const rows = [
    ["seeds", `${seedCount} url${seedCount !== 1 ? "s" : ""} queued`],
    ["domains", config.ALLOWED_DOMAINS.join(", ")],
    ["output", s.outputMode === "pdf" ? "pdf -> output/documentation*.pdf" : "postgresql"],
    ["depth", String(config.MAX_DEPTH)],
    ["delay", `${crawlDelayMs}ms`],
    ["workers", String(config.WORKER_COUNT)],
    ["maxpages", config.MAX_PAGES === 0 ? "unlimited" : String(config.MAX_PAGES)],
  ];
  blank();
  console.log(CYAN(" " + STARS.full + " " + gradient("CRAWLER LAUNCHED") + " " + STARS.full));
  console.log(drawTopBorder("LAUNCHING"));
  rows.forEach(([key, value]) => {
    console.log(drawRow(key, gradient(value), getVisibleLength(value)));
  });
  console.log(drawBottomBorder());
  blank();
}
|
|
478
|
+
// ─── Main Execution ───────────────────────────────────────────────────────────
|
|
479
|
+
/**
 * CLI entry point: shows the banner, runs the setup REPL, applies the chosen
 * settings to the shared config, then initialises the output strategy, seeds
 * the database and runs the scheduler.
 *
 * Cleanup is unconditional: once the strategy object exists, the connection
 * pool is closed on every path, including a failing `strategy.init()` or
 * `strategy.finish()`. A fatal error sets a non-zero exit code.
 *
 * @returns {Promise<void>}
 */
async function main() {
  printBanner();
  const seedsFromFile = readSeedsFile();
  if (seedsFromFile.length > 0) {
    console.log(CYAN(" " + STARS.half + " " + gradient(`Found ${seedsFromFile.length} URLs in seeds.txt`)));
    console.log(drawTopBorder(`seeds.txt (${seedsFromFile.length})`));
    // Preview at most the first five seeds.
    for (const u of seedsFromFile.slice(0, 5)) {
      const innerLeft = " " + u;
      console.log(DG(" │") + CYAN(innerLeft) + " ".repeat(Math.max(0, INNER_WIDTH - getVisibleLength(innerLeft))) + DG("│"));
    }
    console.log(drawBottomBorder());
    blank();
  }
  const s = await runCommandRepl(seedsFromFile);
  const crawlDelayMs = enforceCrawlDelay(s.crawlDelayMs);
  config.MAX_DEPTH = s.maxDepth;
  config.CRAWL_DELAY_MS = crawlDelayMs;
  config.WORKER_COUNT = s.workerCount;
  config.MAX_PAGES = s.maxPages;
  config.OUTPUT_MODE = s.outputMode;
  const rawSeeds = s.seedSource === "file" ? seedsFromFile : s.seedSource === "custom" ? s.customUrls : config.SEED_URLS;
  const { valid: seedUrls, invalid = [] } = validateSeedUrls(rawSeeds);
  // Tell the user which seeds were dropped instead of discarding them silently.
  for (const rejected of invalid) {
    warn(rejected.reason);
  }
  config.SEED_URLS = seedUrls;
  config.ALLOWED_DOMAINS = extractDomains(seedUrls);
  printLaunchSummary(seedUrls, s, crawlDelayMs);
  const strategy = createStrategy(s.outputMode);
  setStrategy(strategy);
  // finish() is only meaningful after a successful init().
  let strategyReady = false;
  try {
    await strategy.init();
    strategyReady = true;
    await resetStaleLocks();
    await clearPendingURLs(config.ALLOWED_DOMAINS);
    await seedDatabase();
    // Background status text engine starts up before launching the scheduler
    blank();
    startGlowStatus("CRAWLING THROUGH THE SITE...");
    await startScheduler();
    stopGlowStatus();
    ok("Crawl finished successfully.");
  }
  catch (error) {
    stopGlowStatus();
    err("Fatal error in main loop.");
    console.error(error);
    process.exitCode = 1;
  }
  finally {
    stopGlowStatus();
    try {
      if (strategyReady) {
        await strategy.finish();
      }
    }
    catch (error) {
      err("Failed to finalise output.");
      console.error(error);
      process.exitCode = 1;
    }
    finally {
      // Always release the pool, even when finish() failed.
      await pool.end();
    }
    blank();
    console.log(CYAN(" " + STARS.full + " " + gradient("Session complete") + " " + STARS.full));
    ok("Database connection closed.");
    blank();
  }
}
// Anything that escapes main() (e.g. an aborted prompt) is reported here
// instead of being left as a floating rejection; the cursor is restored first.
main().catch((error) => {
  stopGlowStatus();
  console.error(error);
  process.exitCode = 1;
});
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
/**
 * Produces the canonical string form of a link.
 *
 * The link is resolved against `baseUrl` (the URL parser lower-cases scheme
 * and host), restricted to http/https, and rebuilt from protocol, hostname,
 * port, path and query only. The fragment is therefore dropped, and one
 * trailing "/" is removed from the path.
 *
 * @param {string} rawUrl - Absolute or relative URL.
 * @param {string} [baseUrl] - Base used to resolve relative URLs.
 * @returns {string | null} The normalised URL, or null when the input cannot
 *   be parsed or uses another scheme.
 */
export function normalizeURL(rawUrl, baseUrl) {
  let parsed;
  try {
    parsed = new URL(rawUrl, baseUrl);
  } catch {
    return null;
  }
  const { protocol, hostname, port, pathname, search } = parsed;
  if (!(protocol === "http:" || protocol === "https:")) {
    return null;
  }
  const path = pathname.endsWith("/") ? pathname.slice(0, -1) : pathname;
  const portSuffix = port ? `:${port}` : "";
  return `${protocol}//${hostname}${portSuffix}${path}${search}`;
}
|
|
25
|
+
/**
 * Extracts the hostname of an absolute URL.
 *
 * @param {string} urlStr - Absolute URL.
 * @returns {string | null} The hostname, or null when `urlStr` cannot be
 *   parsed.
 */
export function getDomain(urlStr) {
  let parsed;
  try {
    parsed = new URL(urlStr);
  } catch {
    return null;
  }
  return parsed.hostname;
}
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
import { markDone } from "../db/queries.js";
|
|
2
|
+
/**
 * Output strategy that stores crawled page content in PostgreSQL by marking
 * the URL row as done with its content. This is the default structured-data
 * output pipeline.
 */
export class DatabaseStrategy {
  /**
   * No set-up required: the connection pool is created by client.ts.
   *
   * @returns {Promise<void>}
   */
  async init() {}

  /**
   * Persists one page.
   *
   * @param {*} urlId - Identifier of the URL row to mark as done.
   * @param {string} _url - Page URL; unused by this strategy.
   * @param {*} content - Extracted content handed to markDone.
   * @returns {Promise<void>}
   */
  async save(urlId, _url, content) {
    await markDone(urlId, content);
  }

  /**
   * No tear-down required: the connection pool is closed by index.ts.
   *
   * @returns {Promise<void>}
   */
  async finish() {}
}
|