messi-crawler 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +201 -0
- package/dist/cli/renderer.js +71 -0
- package/dist/config.js +18 -0
- package/dist/db/clear.js +16 -0
- package/dist/db/client.js +20 -0
- package/dist/db/queries.js +179 -0
- package/dist/frontier/frontier.js +44 -0
- package/dist/frontier/logger.js +65 -0
- package/dist/frontier/robots.js +46 -0
- package/dist/frontier/scheduler.js +98 -0
- package/dist/index.js +533 -0
- package/dist/normalizer.js +33 -0
- package/dist/output/db-strategy.js +16 -0
- package/dist/output/index.js +23 -0
- package/dist/output/pdf-strategy.js +316 -0
- package/dist/output/strategy.js +1 -0
- package/dist/security/ssrf.js +45 -0
- package/dist/security/validate-url.js +41 -0
- package/dist/seed.js +14 -0
- package/dist/setup.js +148 -0
- package/dist/test/client.test.js +33 -0
- package/dist/test/downloader.test.js +84 -0
- package/dist/test/extractor.test.js +126 -0
- package/dist/test/frontier.test.js +43 -0
- package/dist/test/logger.test.js +55 -0
- package/dist/test/normalizer.test.js +36 -0
- package/dist/test/pdf-strategy.test.js +68 -0
- package/dist/test/queries.test.js +173 -0
- package/dist/test/robots.test.js +46 -0
- package/dist/test/scheduler.test.js +73 -0
- package/dist/test/seed.test.js +26 -0
- package/dist/test/worker.test.js +118 -0
- package/dist/worker/downloader.js +114 -0
- package/dist/worker/extractor.js +197 -0
- package/dist/worker/worker.js +87 -0
- package/package.json +48 -0
- package/seeds.txt +4 -0
- package/src/cli/renderer.ts +83 -0
- package/src/config.ts +22 -0
- package/src/db/clear.ts +16 -0
- package/src/db/client.ts +26 -0
- package/src/db/queries.ts +255 -0
- package/src/db/schema.sql +43 -0
- package/src/frontier/frontier.ts +60 -0
- package/src/frontier/logger.ts +75 -0
- package/src/frontier/robots.ts +50 -0
- package/src/frontier/scheduler.ts +119 -0
- package/src/index.ts +596 -0
- package/src/normalizer.ts +37 -0
- package/src/output/db-strategy.ts +20 -0
- package/src/output/index.ts +32 -0
- package/src/output/pdf-strategy.ts +388 -0
- package/src/output/strategy.ts +16 -0
- package/src/security/ssrf.ts +48 -0
- package/src/security/validate-url.ts +49 -0
- package/src/seed.ts +18 -0
- package/src/setup.ts +170 -0
- package/src/test/client.test.ts +38 -0
- package/src/test/downloader.test.ts +101 -0
- package/src/test/extractor.test.ts +139 -0
- package/src/test/frontier.test.ts +53 -0
- package/src/test/logger.test.ts +71 -0
- package/src/test/normalizer.test.ts +43 -0
- package/src/test/pdf-strategy.test.ts +84 -0
- package/src/test/queries.test.ts +247 -0
- package/src/test/robots.test.ts +56 -0
- package/src/test/scheduler.test.ts +90 -0
- package/src/test/seed.test.ts +35 -0
- package/src/test/worker.test.ts +144 -0
- package/src/worker/downloader.ts +149 -0
- package/src/worker/extractor.ts +235 -0
- package/src/worker/worker.ts +100 -0
- package/tsconfig.json +15 -0
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Terminal rendering utilities — colours, symbols, layout helpers.
|
|
3
|
+
* Keeps all ANSI logic in one place so the wizard stays readable.
|
|
4
|
+
*/
|
|
5
|
+
|
|
6
|
+
/**
 * ANSI SGR escape sequences used for terminal styling.
 * Every entry is the complete sequence (`ESC [ <code> m`), ready to be
 * concatenated in front of text; `reset` clears all active attributes.
 */
export const c = (() => {
  // Builds one Select-Graphic-Rendition sequence from its numeric code(s).
  const sgr = (code: string): string => `\x1b[${code}m`;

  return {
    reset: sgr("0"),
    bold: sgr("1"),
    dim: sgr("2"),
    italic: sgr("3"),

    // foreground
    white: sgr("97"),
    gray: sgr("90"),
    cyan: sgr("96"),
    green: sgr("92"),
    yellow: sgr("93"),
    red: sgr("91"),
    blue: sgr("94"),
    magenta: sgr("95"),
    orange: sgr("38;5;208"),
  };
})();
|
|
23
|
+
|
|
24
|
+
/**
 * Unicode glyphs used by the terminal layout helpers.
 * Written as escapes so the values survive a mis-encoded source file;
 * the trailing comment shows the rendered character.
 */
export const sym = {
  dot: "\u00b7", // ·
  bullet: "\u2022", // •
  arrow: "\u203a", // ›
  check: "\u2713", // ✓
  cross: "\u2717", // ✗
  warn: "\u26a0", // ⚠
  info: "\u2139", // ℹ
  sparkle: "\u25c6", // ◆
  bar: "\u2502", // │
  corner: "\u2570", // ╰
  tee: "\u251c", // ├
  horiz: "\u2500", // ─
};
|
|
38
|
+
|
|
39
|
+
/**
 * Prefixes `text` with the given ANSI sequences and appends the reset
 * sequence so the styling does not leak into whatever is printed next.
 *
 * @param text   The text to decorate.
 * @param styles Zero or more escape sequences (typically members of `c`).
 * @returns The styled string; with no styles, `text` followed by the reset sequence.
 */
export function style(text: string, ...styles: string[]): string {
  const prefix = styles.join("");
  return `${prefix}${text}${c.reset}`;
}
|
|
43
|
+
|
|
44
|
+
/** Emits an empty line on stdout, used as vertical spacing between sections. */
export function blank(): void {
  console.log();
}
|
|
46
|
+
|
|
47
|
+
/**
 * Prints a dim gray horizontal rule.
 *
 * @param width Number of `sym.horiz` characters to draw (defaults to 52).
 */
export function divider(width = 52): void {
  const rule = sym.horiz.repeat(width);
  console.log(style(rule, c.dim, c.gray));
}
|
|
51
|
+
|
|
52
|
+
/**
 * Prints a bold cyan section heading preceded by the sparkle glyph,
 * e.g. `◆ Seeds`.
 *
 * @param label Heading text.
 */
export function section(label: string): void {
  const heading = `${sym.sparkle} ${label}`;
  console.log(style(heading, c.bold, c.cyan));
}
|
|
56
|
+
|
|
57
|
+
/**
 * Prints one key/value summary row, e.g. `› depth 3`.
 * The key is right-padded to a fixed column width so values line up.
 *
 * @param key        Label shown in gray.
 * @param value      Value text.
 * @param valueColor ANSI sequence applied to the value (defaults to `c.white`).
 */
export function row(key: string, value: string, valueColor = c.white): void {
  const keyColumnWidth = 16;
  const marker = style(sym.arrow, c.dim, c.gray);
  const label = style(key.padEnd(keyColumnWidth), c.gray);
  const rendered = style(value, valueColor);
  console.log(` ${marker} ${label}${rendered}`);
}
|
|
64
|
+
|
|
65
|
+
/**
 * Prints a success line: a green check mark followed by the message in white.
 *
 * @param msg Message text.
 */
export function ok(msg: string): void {
  const icon = style(sym.check, c.green);
  const body = style(msg, c.white);
  console.log(` ${icon} ${body}`);
}
|
|
69
|
+
|
|
70
|
+
/**
 * Prints a warning line: warning glyph and message, both in yellow.
 * Output goes to stdout via `console.log`.
 *
 * @param msg Message text.
 */
export function warn(msg: string): void {
  const icon = style(sym.warn, c.yellow);
  const body = style(msg, c.yellow);
  console.log(` ${icon} ${body}`);
}
|
|
74
|
+
|
|
75
|
+
/**
 * Prints an error line: cross glyph and message, both in red.
 * Output goes to stdout via `console.log`, not stderr.
 *
 * @param msg Message text.
 */
export function err(msg: string): void {
  const icon = style(sym.cross, c.red);
  const body = style(msg, c.red);
  console.log(` ${icon} ${body}`);
}
|
|
79
|
+
|
|
80
|
+
/**
 * Prints an informational line: blue info glyph followed by the message in gray.
 *
 * @param msg Message text.
 */
export function info(msg: string): void {
  const icon = style(sym.info, c.blue);
  const body = style(msg, c.gray);
  console.log(` ${icon} ${body}`);
}
|
package/src/config.ts
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
import dotenv from "dotenv";
|
|
2
|
+
dotenv.config();
|
|
3
|
+
|
|
4
|
+
/**
 * Reads an integer from the environment.
 * Falls back to `fallback` when the variable is unset, empty, or does not
 * start with a base-10 integer, so a typo in `.env` can never inject `NaN`
 * into timeouts, delays or worker counts.
 */
function envInt(name: string, fallback: number): number {
  const raw = process.env[name];
  if (!raw) return fallback;

  const parsed = parseInt(raw, 10);
  if (Number.isNaN(parsed)) {
    console.warn(`[config] Ignoring non-numeric ${name}="${raw}"; using ${fallback}.`);
    return fallback;
  }
  return parsed;
}

/**
 * Reads OUTPUT_MODE and validates it against the supported values.
 * Anything other than "pdf" or "database" falls back to "database".
 */
function envOutputMode(): "database" | "pdf" {
  const raw = process.env.OUTPUT_MODE;
  if (!raw || raw === "database") return "database";
  if (raw === "pdf") return "pdf";

  console.warn(`[config] Unknown OUTPUT_MODE="${raw}"; using "database".`);
  return "database";
}

/**
 * Crawler settings, resolved once at module load from environment
 * variables with built-in defaults.
 */
export const config = {
  MAX_DEPTH: envInt("MAX_DEPTH", 3),
  CRAWL_DELAY_MS: envInt("CRAWL_DELAY_MS", 1000),
  WORKER_COUNT: envInt("WORKER_COUNT", 10),
  REQUEST_TIMEOUT_MS: envInt("REQUEST_TIMEOUT_MS", 10000),
  MAX_REDIRECTS: envInt("MAX_REDIRECTS", 5),
  MAX_PAGES: envInt("MAX_PAGES", 1000),

  /** Output destination: "database" | "pdf" */
  OUTPUT_MODE: envOutputMode(),

  SEED_URLS: [
    "https://www.akc.org/dog-breeds/",
  ],

  ALLOWED_DOMAINS: [
    "www.akc.org",
  ],
};
|
package/src/db/clear.ts
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
import { pool } from "./client.js";
|
|
2
|
+
|
|
3
|
+
/**
 * One-shot maintenance script: truncates every crawler table and resets
 * their identity sequences, then closes the connection pool.
 *
 * A failed TRUNCATE is logged and reflected in the process exit code so
 * shell scripts and CI can detect that the database was not cleared.
 */
async function clearDatabase(): Promise<void> {
  console.log("Clearing database...");
  try {
    // Truncate all crawler tables and reset the auto-increment IDs
    await pool.query("TRUNCATE TABLE links, crawled_pages, urls, domain_stats RESTART IDENTITY CASCADE;");
    console.log("Database cleared successfully.");
  } catch (error) {
    console.error("Error clearing database:", error);
    process.exitCode = 1;
  } finally {
    await pool.end();
  }
}

// The only remaining rejection path is pool.end() failing in `finally`;
// handle it explicitly instead of leaving a floating promise.
clearDatabase().catch((error: unknown) => {
  console.error("Error closing database connection:", error);
  process.exitCode = 1;
});
|
package/src/db/client.ts
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
import pg from "pg";
|
|
2
|
+
import dotenv from "dotenv";
|
|
3
|
+
|
|
4
|
+
dotenv.config();
|
|
5
|
+
|
|
6
|
+
// A full connection string (DATABASE_URL) takes precedence; otherwise the
// pool is assembled from the individual PG* variables with local defaults.
const databaseUrl = process.env.DATABASE_URL;

/** Shared PostgreSQL connection pool used by every query in the crawler. */
export const pool = new pg.Pool(
  databaseUrl
    ? { connectionString: databaseUrl }
    : {
        host: process.env.PGHOST || "localhost",
        port: parseInt(process.env.PGPORT || "5432", 10),
        user: process.env.PGUSER || "postgres",
        password: process.env.PGPASSWORD || "",
        database: process.env.PGDATABASE || "web_crawler",
      }
);
|
|
19
|
+
|
|
20
|
+
/**
 * Runs a single parameterised statement on the shared pool.
 *
 * @param text   SQL text, using `$1`, `$2`, … placeholders.
 * @param params Values bound to the placeholders, in order.
 * @returns The result object produced by `pool.query`.
 */
export async function query(text: string, params?: any[]) {
  const result = await pool.query(text, params);
  return result;
}
|
|
23
|
+
|
|
24
|
+
/** Shuts down the shared connection pool by awaiting `pool.end()`. */
export async function closePool(): Promise<void> {
  await pool.end();
}
|
|
@@ -0,0 +1,255 @@
|
|
|
1
|
+
import { query, pool } from "./client.js";
|
|
2
|
+
import type { ContentBlock, ExtractedImage } from "../worker/extractor.js";
|
|
3
|
+
|
|
4
|
+
/** The subset of `urls` columns returned when a URL is claimed for crawling. */
export interface URLRow {
  /** Primary key of the row in `urls`. */
  id: number;
  /** The URL to fetch. */
  url: string;
  /** Domain bucket the URL is scheduled under. */
  domain: string;
  /** Crawl state; the queries in this module write PENDING, FETCHING, DONE and FAILED. */
  status: string;
  /** Value of the `depth` column; lower depths are claimed first. */
  depth: number;
}
|
|
11
|
+
|
|
12
|
+
/** Content extracted from a fetched page, as handed to `markDone`. */
export interface CrawledPageContent {
  title: string | null;
  description: string | null;
  canonicalUrl: string | null;
  /** Heading texts grouped by level. */
  headings: Record<"h1" | "h2" | "h3", string[]>;
  textContent: string | null;
  /** Optional structured content blocks produced by the extractor. */
  blocks?: ContentBlock[];
  /** Optional images produced by the extractor. */
  images?: ExtractedImage[];
}
|
|
21
|
+
|
|
22
|
+
/**
 * Atomically claims one PENDING URL of the given domain by flipping it to
 * FETCHING and stamping `fetched_at`.
 *
 * Candidates are ordered by shallowest depth, then earliest discovery. The
 * inner SELECT uses FOR UPDATE SKIP LOCKED, so a row already locked by
 * another worker is skipped rather than waited on.
 *
 * @param domain Domain whose queue should be drained.
 * @returns The claimed row, or `null` when nothing is available.
 */
export async function claimNextURL(domain: string): Promise<URLRow | null> {
  const claimSql = `UPDATE urls
     SET status = 'FETCHING', fetched_at = NOW()
     WHERE id = (
       SELECT id FROM urls
       WHERE status = 'PENDING' AND domain = $1
       ORDER BY depth ASC, discovered_at ASC
       LIMIT 1
       FOR UPDATE SKIP LOCKED
     )
     RETURNING id, url, domain, status, depth`;

  const { rows } = await query(claimSql, [domain]);
  return rows.length > 0 ? (rows[0] as URLRow) : null;
}
|
|
47
|
+
|
|
48
|
+
/**
 * Stores the extracted content for a URL and marks the URL as DONE, both
 * inside a single transaction so neither change is visible without the other.
 *
 * Only title, description, canonical URL, headings and text content are
 * written; `content.blocks` and `content.images` are not persisted here.
 *
 * @param urlId   Primary key of the row in `urls`.
 * @param content Extracted page content.
 * @throws Re-throws the error that caused the transaction to fail.
 */
export async function markDone(urlId: number, content: CrawledPageContent): Promise<void> {
  const client = await pool.connect();
  try {
    await client.query("BEGIN");

    await client.query(
      `INSERT INTO crawled_pages (url_id, title, description, canonical_url, headings, text_content)
       VALUES ($1, $2, $3, $4, $5, $6)`,
      [
        urlId,
        content.title,
        content.description,
        content.canonicalUrl,
        JSON.stringify(content.headings),
        content.textContent,
      ]
    );

    await client.query(
      `UPDATE urls
       SET status = 'DONE', fetched_at = NOW()
       WHERE id = $1`,
      [urlId]
    );

    await client.query("COMMIT");
  } catch (error) {
    // ROLLBACK can itself fail (e.g. the connection dropped). Do not let
    // that secondary failure replace the error the caller needs to see.
    try {
      await client.query("ROLLBACK");
    } catch (rollbackError) {
      console.error("Failed to roll back markDone transaction:", rollbackError);
    }
    throw error;
  } finally {
    client.release();
  }
}
|
|
84
|
+
|
|
85
|
+
/**
 * Flags a URL as FAILED, recording the reason and the time of the attempt.
 *
 * @param urlId        Primary key of the row in `urls`.
 * @param errorMessage Text stored in the `error_message` column.
 */
export async function markFailed(urlId: number, errorMessage: string): Promise<void> {
  const failSql = `UPDATE urls
     SET status = 'FAILED', error_message = $2, fetched_at = NOW()
     WHERE id = $1`;

  await query(failSql, [urlId, errorMessage]);
}
|
|
96
|
+
|
|
97
|
+
/**
 * Inserts a URL as PENDING if it doesn't already exist.
 * Returns the ID of the URL (whether newly inserted or already existing).
 *
 * @param url    URL to register.
 * @param domain Domain bucket for the URL.
 * @param depth  Crawl depth at which the URL was discovered.
 * @returns The `urls.id` of the new or pre-existing row.
 * @throws Error if the row cannot be found even after the follow-up lookup.
 */
export async function insertURL(url: string, domain: string, depth: number): Promise<number> {
  const res = await query(
    `WITH ins AS (
       INSERT INTO urls (url, domain, status, depth)
       VALUES ($1, $2, 'PENDING', $3)
       ON CONFLICT (url) DO NOTHING
       RETURNING id
     )
     SELECT id FROM ins
     UNION ALL
     SELECT id FROM urls WHERE url = $1
     LIMIT 1`,
    [url, domain, depth]
  );

  if (res.rows.length > 0) {
    return res.rows[0].id;
  }

  // Both branches can come back empty when another worker inserted the same
  // URL concurrently: ON CONFLICT skips our insert, but the row committed
  // after this statement's snapshot, so the fallback SELECT cannot see it.
  // A fresh statement gets a fresh snapshot and will find the row.
  const retry = await query(`SELECT id FROM urls WHERE url = $1`, [url]);
  if (retry.rows.length === 0) {
    throw new Error(`insertURL: could not resolve an id for ${url}`);
  }
  return retry.rows[0].id;
}
|
|
118
|
+
|
|
119
|
+
/**
 * Records a directed edge in the link graph.
 * Re-inserting an existing (from, to) pair is a no-op thanks to ON CONFLICT.
 *
 * @param fromUrlId `urls.id` of the page containing the link.
 * @param toUrlId   `urls.id` of the link target.
 */
export async function insertLink(fromUrlId: number, toUrlId: number): Promise<void> {
  const linkSql = `INSERT INTO links (from_url_id, to_url_id)
     VALUES ($1, $2)
     ON CONFLICT (from_url_id, to_url_id) DO NOTHING`;

  await query(linkSql, [fromUrlId, toUrlId]);
}
|
|
130
|
+
|
|
131
|
+
/**
 * Crash recovery: returns every URL left in FETCHING back to PENDING.
 * The update is unconditional — it affects all FETCHING rows, whatever
 * their age — so it is meant to run at startup before workers begin.
 */
export async function resetStaleLocks(): Promise<void> {
  const resetSql = `UPDATE urls
     SET status = 'PENDING'
     WHERE status = 'FETCHING'`;

  await query(resetSql);
}
|
|
142
|
+
|
|
143
|
+
/**
 * Removes PENDING URLs that belong to domains outside `allowedDomains`,
 * so a session scoped to new seeds does not inherit queue entries (including
 * discovered child links) from an earlier crawl.
 *
 * An empty list is treated as "no restriction": nothing is deleted.
 * Logs the number of removed rows when at least one was deleted.
 *
 * @param allowedDomains Domains that are in scope for the current session.
 */
export async function clearPendingURLs(allowedDomains: string[]): Promise<void> {
  if (!allowedDomains.length) {
    return;
  }

  const outcome = await query(
    `DELETE FROM urls
     WHERE status = 'PENDING'
       AND domain <> ALL($1::text[])`,
    [allowedDomains]
  );

  const removed = (outcome as any).rowCount ?? 0;
  if (removed <= 0) {
    return;
  }
  console.log(`[setup] Cleared ${removed} stale PENDING URL(s) outside allowed domains.`);
}
|
|
163
|
+
|
|
164
|
+
/** Number of URLs in each crawl state, across all domains. */
export interface GlobalStats {
  /** URLs waiting to be claimed. */
  pending: number;
  /** URLs currently claimed by a worker. */
  fetching: number;
  /** URLs marked DONE. */
  done: number;
  /** URLs marked FAILED. */
  failed: number;
}
|
|
170
|
+
|
|
171
|
+
/** One row of the `domain_stats` table: per-domain counts by crawl state. */
export interface DomainStats {
  domain: string;
  pending_count: number;
  fetching_count: number;
  done_count: number;
  failed_count: number;
  /** Latest `fetched_at` among the domain's URLs, or `null` if none has been fetched. */
  last_crawled_at: Date | null;
}
|
|
179
|
+
|
|
180
|
+
/**
 * Counts URLs per status across the whole `urls` table.
 * Statuses that have no rows stay at 0; statuses other than the four
 * known ones are ignored.
 *
 * @returns Totals for pending, fetching, done and failed URLs.
 */
export async function getGlobalStats(): Promise<GlobalStats> {
  const { rows } = await query(
    `SELECT status, COUNT(*) as count
     FROM urls
     GROUP BY status`
  );

  const totals: GlobalStats = { pending: 0, fetching: 0, done: 0, failed: 0 };
  for (const entry of rows) {
    const bucket = entry.status.toLowerCase();
    // COUNT(*) arrives as a string, hence the parse.
    const amount = parseInt(entry.count, 10);
    switch (bucket) {
      case "pending":
        totals.pending = amount;
        break;
      case "fetching":
        totals.fetching = amount;
        break;
      case "done":
        totals.done = amount;
        break;
      case "failed":
        totals.failed = amount;
        break;
    }
  }
  return totals;
}
|
|
201
|
+
|
|
202
|
+
/**
 * Rebuilds the per-domain counters in `domain_stats` from the current
 * contents of `urls`.
 *
 * The table is created on demand, then one upsert per domain overwrites
 * the counts and the latest `fetched_at`. Domains that no longer have any
 * row in `urls` are left untouched by this statement.
 */
export async function refreshDomainStats(): Promise<void> {
  const ensureTableSql = `
    CREATE TABLE IF NOT EXISTS domain_stats (
      domain TEXT PRIMARY KEY,
      pending_count INTEGER NOT NULL DEFAULT 0,
      fetching_count INTEGER NOT NULL DEFAULT 0,
      done_count INTEGER NOT NULL DEFAULT 0,
      failed_count INTEGER NOT NULL DEFAULT 0,
      last_crawled_at TIMESTAMPTZ
    )
  `;

  const upsertSql = `
    INSERT INTO domain_stats (domain, pending_count, fetching_count, done_count, failed_count, last_crawled_at)
    SELECT
      domain,
      COUNT(*) FILTER (WHERE status = 'PENDING') as pending_count,
      COUNT(*) FILTER (WHERE status = 'FETCHING') as fetching_count,
      COUNT(*) FILTER (WHERE status = 'DONE') as done_count,
      COUNT(*) FILTER (WHERE status = 'FAILED') as failed_count,
      MAX(fetched_at) as last_crawled_at
    FROM urls
    GROUP BY domain
    ON CONFLICT (domain) DO UPDATE SET
      pending_count = EXCLUDED.pending_count,
      fetching_count = EXCLUDED.fetching_count,
      done_count = EXCLUDED.done_count,
      failed_count = EXCLUDED.failed_count,
      last_crawled_at = EXCLUDED.last_crawled_at
  `;

  await query(ensureTableSql);
  await query(upsertSql);
}
|
|
236
|
+
|
|
237
|
+
/**
 * Reads every row of `domain_stats`, ordered by domain name, and converts
 * the raw column values into numbers and `Date` objects.
 *
 * @returns One entry per domain present in `domain_stats`.
 */
export async function getDomainStats(): Promise<DomainStats[]> {
  const { rows } = await query(
    `SELECT domain, pending_count, fetching_count, done_count, failed_count, last_crawled_at
     FROM domain_stats
     ORDER BY domain ASC`
  );

  const stats: DomainStats[] = [];
  for (const record of rows) {
    const lastCrawled = record.last_crawled_at ? new Date(record.last_crawled_at) : null;
    stats.push({
      domain: record.domain,
      pending_count: parseInt(record.pending_count, 10),
      fetching_count: parseInt(record.fetching_count, 10),
      done_count: parseInt(record.done_count, 10),
      failed_count: parseInt(record.failed_count, 10),
      last_crawled_at: lastCrawled,
    });
  }
  return stats;
}
|
|
255
|
+
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
-- All discovered URLs and their crawl state
CREATE TABLE IF NOT EXISTS urls (
  id            SERIAL PRIMARY KEY,
  url           TEXT NOT NULL UNIQUE,
  domain        TEXT NOT NULL,
  status        TEXT NOT NULL DEFAULT 'PENDING', -- PENDING | FETCHING | DONE | FAILED
  depth         INTEGER NOT NULL DEFAULT 0,
  error_message TEXT,
  discovered_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
  fetched_at    TIMESTAMPTZ
);

-- Supports the scheduler's "PENDING rows for domain X" lookups.
CREATE INDEX IF NOT EXISTS idx_urls_status_domain ON urls (status, domain);

-- Extracted page content
CREATE TABLE IF NOT EXISTS crawled_pages (
  id            SERIAL PRIMARY KEY,
  url_id        INTEGER NOT NULL REFERENCES urls(id) ON DELETE CASCADE,
  title         TEXT,
  description   TEXT,
  canonical_url TEXT,
  headings      JSONB, -- { h1: [...], h2: [...], h3: [...] }
  text_content  TEXT,
  crawled_at    TIMESTAMPTZ NOT NULL DEFAULT NOW()
);

-- PostgreSQL does not index the referencing side of a foreign key on its
-- own. Without this index every DELETE on urls scans crawled_pages to
-- apply the ON DELETE CASCADE.
CREATE INDEX IF NOT EXISTS idx_crawled_pages_url_id ON crawled_pages (url_id);

-- Link graph
CREATE TABLE IF NOT EXISTS links (
  from_url_id INTEGER NOT NULL REFERENCES urls(id) ON DELETE CASCADE,
  to_url_id   INTEGER NOT NULL REFERENCES urls(id) ON DELETE CASCADE,
  PRIMARY KEY (from_url_id, to_url_id)
);

-- The primary key covers lookups by from_url_id only; cascades and
-- "who links here" queries go through to_url_id.
CREATE INDEX IF NOT EXISTS idx_links_to_url_id ON links (to_url_id);

-- Domain statistics for observability
CREATE TABLE IF NOT EXISTS domain_stats (
  domain          TEXT PRIMARY KEY,
  pending_count   INTEGER NOT NULL DEFAULT 0,
  fetching_count  INTEGER NOT NULL DEFAULT 0,
  done_count      INTEGER NOT NULL DEFAULT 0,
  failed_count    INTEGER NOT NULL DEFAULT 0,
  last_crawled_at TIMESTAMPTZ
);
|
|
43
|
+
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
import { query } from "../db/client.js";
|
|
2
|
+
import { config } from "../config.js";
|
|
3
|
+
|
|
4
|
+
/**
 * Lists the distinct domains that still have at least one PENDING URL.
 *
 * When `config.ALLOWED_DOMAINS` is non-empty the filter is applied in SQL,
 * so rows outside the current session's scope never reach the scheduler;
 * with an empty allow-list every pending domain is returned.
 */
export async function getPendingDomains(): Promise<string[]> {
  const allowed = config.ALLOWED_DOMAINS;
  const restrictToAllowed = !!allowed && allowed.length > 0;

  const res = restrictToAllowed
    ? await query(
        `SELECT DISTINCT domain
         FROM urls
         WHERE status = 'PENDING'
           AND domain = ANY($1::text[])`,
        [allowed]
      )
    : await query(
        `SELECT DISTINCT domain
         FROM urls
         WHERE status = 'PENDING'`
      );

  const domains: string[] = [];
  for (const entry of res.rows) {
    domains.push(entry.domain);
  }
  return domains;
}
|
|
31
|
+
|
|
32
|
+
/**
 * Counts PENDING URLs per domain.
 *
 * The result is limited to `config.ALLOWED_DOMAINS` when that list is
 * non-empty; otherwise all domains with pending work are included.
 *
 * @returns A map from domain name to its number of PENDING URLs.
 */
export async function getPendingCounts(): Promise<Record<string, number>> {
  const allowed = config.ALLOWED_DOMAINS;
  const hasAllowList = !!allowed && allowed.length > 0;

  const pending = await (hasAllowList
    ? query(
        `SELECT domain, COUNT(*) as count
         FROM urls
         WHERE status = 'PENDING'
           AND domain = ANY($1::text[])
         GROUP BY domain`,
        [allowed]
      )
    : query(
        `SELECT domain, COUNT(*) as count
         FROM urls
         WHERE status = 'PENDING'
         GROUP BY domain`
      ));

  const tally: Record<string, number> = {};
  for (const { domain, count } of pending.rows) {
    // COUNT(*) arrives as a string, hence the parse.
    tally[domain] = parseInt(count, 10);
  }
  return tally;
}
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
import { getGlobalStats, refreshDomainStats, getDomainStats } from "../db/queries.js";
|
|
2
|
+
|
|
3
|
+
// Handle of the active reporting timer, or null when the logger is stopped.
// Kept as `any` because the function below feature-tests `.unref`.
let loggerInterval: any = null;
// done + failed total captured when the logger started; baseline for the rate.
let lastDoneAndFailedCount = 0;
// Epoch milliseconds at which the logger was started.
let startTime = 0;

/**
 * Starts a background interval to log crawler progress periodically.
 *
 * Each tick refreshes `domain_stats`, reads global and per-domain counters,
 * and prints a report with the average pages/min since the logger started.
 * Calling this while a logger is already running is a no-op.
 *
 * @param intervalMs Delay between reports in milliseconds (default 5000).
 */
export async function startProgressLogger(intervalMs: number = 5000): Promise<void> {
  if (loggerInterval) return;

  const startedAt = Date.now();
  let baseline = 0;
  try {
    const initialStats = await getGlobalStats();
    baseline = initialStats.done + initialStats.failed;
  } catch (error) {
    // Best effort: without a baseline the rate is simply measured from zero.
    console.error("Error reading initial crawler stats:", error);
  }

  // A second caller may have started the logger while we were awaiting the
  // baseline; re-check so we never create (and leak) two intervals.
  if (loggerInterval) return;

  startTime = startedAt;
  lastDoneAndFailedCount = baseline;

  // Prevents a slow report from overlapping with the next tick.
  let reportInFlight = false;

  loggerInterval = setInterval(async () => {
    if (reportInFlight) return;
    reportInFlight = true;
    try {
      // 1. Sync statistics to domain_stats table
      await refreshDomainStats();

      // 2. Fetch global statistics
      const globalStats = await getGlobalStats();

      // 3. Fetch domain-level statistics
      const domainStats = await getDomainStats();

      // 4. Calculate crawl rates
      const currentCompleted = globalStats.done + globalStats.failed;
      const completedSinceStart = currentCompleted - lastDoneAndFailedCount;
      const elapsedMinutes = (Date.now() - startTime) / 60000;
      const crawlRate = elapsedMinutes > 0 ? (completedSinceStart / elapsedMinutes).toFixed(1) : "0.0";

      // 5. Build and output the formatted log messages
      console.log(`\n=== Crawler Progress Report ===`);
      console.log(`Speed: ${crawlRate} pages/min`);
      console.log(`Global Status Breakdown:`);
      console.log(` PENDING : ${globalStats.pending}`);
      console.log(` FETCHING: ${globalStats.fetching}`);
      console.log(` DONE : ${globalStats.done}`);
      console.log(` FAILED : ${globalStats.failed}`);

      if (domainStats.length > 0) {
        console.log(`Domain Breakdown:`);
        for (const ds of domainStats) {
          const lastCrawledStr = ds.last_crawled_at ? ds.last_crawled_at.toISOString() : "never";
          console.log(
            ` - ${ds.domain}: PENDING: ${ds.pending_count} | FETCHING: ${ds.fetching_count} | DONE: ${ds.done_count} | FAILED: ${ds.failed_count} (Last Crawled: ${lastCrawledStr})`
          );
        }
      }
      console.log(`================================\n`);
    } catch (error) {
      console.error("Error generating crawler progress logs:", error);
    } finally {
      reportInFlight = false;
    }
  }, intervalMs);

  // Do not let the reporting timer keep the process alive on its own.
  if (loggerInterval && typeof loggerInterval.unref === "function") {
    loggerInterval.unref();
  }
}
|
|
66
|
+
|
|
67
|
+
/**
 * Cancels the periodic progress report, if one is running, and clears the
 * stored handle so the logger can be started again later.
 */
export function stopProgressLogger(): void {
  if (!loggerInterval) {
    return;
  }
  clearInterval(loggerInterval);
  loggerInterval = null;
}
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
import { request } from "undici";
|
|
2
|
+
// @ts-ignore
|
|
3
|
+
import robotsParser from "robots-parser";
|
|
4
|
+
import { getDomain } from "../normalizer.js";
|
|
5
|
+
import { config } from "../config.js";
|
|
6
|
+
|
|
7
|
+
// Parsed robots.txt rules, keyed by origin (scheme + host + port).
const robotsCache = new Map<string, any>();
// Origins whose robots.txt answered with a non-200 status; treated as allow-all.
const fetchFailures = new Set<string>();

/**
 * Picks the origin whose robots.txt governs `urlStr`.
 * Uses the URL's own scheme and port; falls back to `https://<domain>` when
 * the URL cannot be parsed or is not http(s).
 */
function robotsOriginFor(urlStr: string, domain: string): string {
  try {
    const parsed = new URL(urlStr);
    if (parsed.protocol === "http:" || parsed.protocol === "https:") {
      return parsed.origin;
    }
  } catch {
    // fall through to the https default below
  }
  return `https://${domain}`;
}

/**
 * Checks if a URL is allowed to be crawled according to its site's robots.txt rules.
 * Caches robots.txt rules per origin to avoid duplicate requests.
 *
 * Policy, as implemented here:
 * - no resolvable domain            -> disallowed
 * - robots.txt answers non-200      -> allowed, and remembered for the origin
 * - network/request error           -> allowed, not remembered (retried next call)
 * - parser has no verdict for a URL -> allowed
 *
 * @param urlStr Absolute URL about to be fetched.
 * @returns `true` when crawling is permitted for user agent "WebCrawler".
 */
export async function isAllowedByRobots(urlStr: string): Promise<boolean> {
  const domain = getDomain(urlStr);
  if (!domain) return false;

  // robots.txt is scoped to scheme + host + port. Fetching it from a
  // hard-coded https:// origin makes the parser return `undefined` for
  // every http:// URL, which would then be treated as allowed.
  const origin = robotsOriginFor(urlStr, domain);

  if (fetchFailures.has(origin)) {
    return true;
  }

  let parser = robotsCache.get(origin);

  if (!parser) {
    const robotsUrl = `${origin}/robots.txt`;
    try {
      const res = await request(robotsUrl, {
        method: "GET",
        headersTimeout: config.REQUEST_TIMEOUT_MS,
        bodyTimeout: config.REQUEST_TIMEOUT_MS,
      });

      if (res.statusCode !== 200) {
        // undici requires the body to be consumed or discarded, otherwise
        // the underlying connection is not released.
        try {
          await res.body.dump();
        } catch {
          // best effort: a failed discard must not change the verdict
        }
        fetchFailures.add(origin);
        return true;
      }

      const content = await res.body.text();
      const parserCreator = robotsParser as any;
      parser = parserCreator(robotsUrl, content);
      robotsCache.set(origin, parser);
    } catch {
      // On network/request errors, default to allowed but do not permanently cache failure
      return true;
    }
  }

  const isAllowed = parser.isAllowed(urlStr, "WebCrawler");
  return isAllowed === undefined ? true : isAllowed;
}
|