messi-crawler 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +201 -0
- package/dist/cli/renderer.js +71 -0
- package/dist/config.js +18 -0
- package/dist/db/clear.js +16 -0
- package/dist/db/client.js +20 -0
- package/dist/db/queries.js +179 -0
- package/dist/frontier/frontier.js +44 -0
- package/dist/frontier/logger.js +65 -0
- package/dist/frontier/robots.js +46 -0
- package/dist/frontier/scheduler.js +98 -0
- package/dist/index.js +533 -0
- package/dist/normalizer.js +33 -0
- package/dist/output/db-strategy.js +16 -0
- package/dist/output/index.js +23 -0
- package/dist/output/pdf-strategy.js +316 -0
- package/dist/output/strategy.js +1 -0
- package/dist/security/ssrf.js +45 -0
- package/dist/security/validate-url.js +41 -0
- package/dist/seed.js +14 -0
- package/dist/setup.js +148 -0
- package/dist/test/client.test.js +33 -0
- package/dist/test/downloader.test.js +84 -0
- package/dist/test/extractor.test.js +126 -0
- package/dist/test/frontier.test.js +43 -0
- package/dist/test/logger.test.js +55 -0
- package/dist/test/normalizer.test.js +36 -0
- package/dist/test/pdf-strategy.test.js +68 -0
- package/dist/test/queries.test.js +173 -0
- package/dist/test/robots.test.js +46 -0
- package/dist/test/scheduler.test.js +73 -0
- package/dist/test/seed.test.js +26 -0
- package/dist/test/worker.test.js +118 -0
- package/dist/worker/downloader.js +114 -0
- package/dist/worker/extractor.js +197 -0
- package/dist/worker/worker.js +87 -0
- package/package.json +48 -0
- package/seeds.txt +4 -0
- package/src/cli/renderer.ts +83 -0
- package/src/config.ts +22 -0
- package/src/db/clear.ts +16 -0
- package/src/db/client.ts +26 -0
- package/src/db/queries.ts +255 -0
- package/src/db/schema.sql +43 -0
- package/src/frontier/frontier.ts +60 -0
- package/src/frontier/logger.ts +75 -0
- package/src/frontier/robots.ts +50 -0
- package/src/frontier/scheduler.ts +119 -0
- package/src/index.ts +596 -0
- package/src/normalizer.ts +37 -0
- package/src/output/db-strategy.ts +20 -0
- package/src/output/index.ts +32 -0
- package/src/output/pdf-strategy.ts +388 -0
- package/src/output/strategy.ts +16 -0
- package/src/security/ssrf.ts +48 -0
- package/src/security/validate-url.ts +49 -0
- package/src/seed.ts +18 -0
- package/src/setup.ts +170 -0
- package/src/test/client.test.ts +38 -0
- package/src/test/downloader.test.ts +101 -0
- package/src/test/extractor.test.ts +139 -0
- package/src/test/frontier.test.ts +53 -0
- package/src/test/logger.test.ts +71 -0
- package/src/test/normalizer.test.ts +43 -0
- package/src/test/pdf-strategy.test.ts +84 -0
- package/src/test/queries.test.ts +247 -0
- package/src/test/robots.test.ts +56 -0
- package/src/test/scheduler.test.ts +90 -0
- package/src/test/seed.test.ts +35 -0
- package/src/test/worker.test.ts +144 -0
- package/src/worker/downloader.ts +149 -0
- package/src/worker/extractor.ts +235 -0
- package/src/worker/worker.ts +100 -0
- package/tsconfig.json +15 -0
package/README.md
ADDED
|
@@ -0,0 +1,201 @@
|
|
|
1
|
+
# Web Crawler
|
|
2
|
+
|
|
3
|
+
An interactive, CLI-driven web crawler built with Node.js, TypeScript, and a PostgreSQL backend. Specifically engineered to systematically extract and compile programming-related documentation from seed URLs, the crawler offers flexible output strategies — structured database records or compiled PDF eBooks — with politeness constraints, domain filtering, and depth controls.
|
|
4
|
+
|
|
5
|
+
## System Requirements
|
|
6
|
+
|
|
7
|
+
- **Node.js** (version 18 or greater)
|
|
8
|
+
- **PostgreSQL** (local or hosted instance like Supabase)
|
|
9
|
+
|
|
10
|
+
## Features
|
|
11
|
+
|
|
12
|
+
### Interactive CLI Wizard
|
|
13
|
+
Run `npm run crawl` to launch an interactive setup wizard that guides you through:
|
|
14
|
+
- **Output mode selection** — Database (structured records) or PDF (compiled eBook)
|
|
15
|
+
- **Seed URL source** — Use URLs from `seeds.txt`, config defaults, or enter custom URLs
|
|
16
|
+
- **Performance tuning** — Configure depth, crawl delay, worker count, and page limits
|
|
17
|
+
|
|
18
|
+
### Flexible Output Strategies
|
|
19
|
+
- **Database mode** — Stores extracted content (URL, title, description, headings, text) as structured records in PostgreSQL with link graph tracking
|
|
20
|
+
- **PDF mode** — Compiles all crawled pages into a formatted PDF eBook with cover page, table of contents, and styled chapters. PDFs are auto-versioned (`documentation.pdf`, `documentation2.pdf`, etc.) to avoid overwrites
|
|
21
|
+
|
|
22
|
+
### Safety & Politeness
|
|
23
|
+
- **Minimum crawl delay** — Enforces a 500ms floor on `CRAWL_DELAY_MS` to prevent accidental aggressive request rates
|
|
24
|
+
- **Robots.txt compliance** — Respects disallow directives per domain
|
|
25
|
+
- **Domain filtering** — Restricts crawling to seed domains only; child links outside allowed domains are ignored
|
|
26
|
+
- **Concurrency limits** — Configurable worker pool to control concurrent requests
|
|
27
|
+
|
|
28
|
+
### Session-Scoped Crawling
|
|
29
|
+
- Each run filters the DB queue to only process URLs matching the current session's allowed domains
|
|
30
|
+
- Stale pending URLs from previous runs whose domain is outside the current session's allowed domains are automatically cleared at startup
|
|
31
|
+
- Session page counter tracks progress independently of cumulative DB totals
|
|
32
|
+
|
|
33
|
+
## Configuration
|
|
34
|
+
|
|
35
|
+
The crawler is configured interactively via the CLI wizard, but you can also pre-set defaults using environment variables or `src/config.ts`.
|
|
36
|
+
|
|
37
|
+
### Environment Variables (`.env`)
|
|
38
|
+
Create a `.env` file in the root directory with:
|
|
39
|
+
|
|
40
|
+
```env
|
|
41
|
+
DATABASE_URL=postgresql://user:password@host:5432/dbname
|
|
42
|
+
MAX_DEPTH=3
|
|
43
|
+
CRAWL_DELAY_MS=1000
|
|
44
|
+
WORKER_COUNT=5
|
|
45
|
+
MAX_PAGES=1000
|
|
46
|
+
OUTPUT_MODE=database
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
### Seed URLs (`seeds.txt`)
|
|
50
|
+
Add target URLs to `seeds.txt`, one per line:
|
|
51
|
+
|
|
52
|
+
```
|
|
53
|
+
# Programming documentation sources
|
|
54
|
+
https://react.dev
|
|
55
|
+
https://developer.mozilla.org
|
|
56
|
+
https://www.typescriptlang.org/
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
Lines starting with `#` are ignored. The wizard defaults to using `seeds.txt` if present.
|
|
60
|
+
|
|
61
|
+
### Configuration Fields
|
|
62
|
+
|
|
63
|
+
| Field | Description | Default |
|
|
64
|
+
|--------------------|-----------------------------------------------------------------------------|---------------|
|
|
65
|
+
| `MAX_DEPTH` | Maximum link hops from seed URLs (0 = seeds only) | `3` |
|
|
66
|
+
| `CRAWL_DELAY_MS` | Politeness delay per domain (min 500ms enforced) | `1000` |
|
|
67
|
+
| `WORKER_COUNT` | Number of concurrent workers | `10` |
|
|
68
|
+
| `MAX_PAGES` | Page limit per session (0 = unlimited) | `1000` |
|
|
69
|
+
| `OUTPUT_MODE` | Output destination: `database` or `pdf` | `database` |
|
|
70
|
+
| `DATABASE_URL` | PostgreSQL connection string | (required) |
|
|
71
|
+
|
|
72
|
+
## Database Setup
|
|
73
|
+
|
|
74
|
+
Before running the crawler, initialize the database schema.
|
|
75
|
+
|
|
76
|
+
### 1. Create the database
|
|
77
|
+
```sql
|
|
78
|
+
CREATE DATABASE web_crawler;
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
### 2. Apply the schema
|
|
82
|
+
```bash
|
|
83
|
+
psql -U postgres -d web_crawler -f src/db/schema.sql
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
Or if using a hosted service like Supabase, run the contents of `src/db/schema.sql` in the SQL editor.
|
|
87
|
+
|
|
88
|
+
### 3. Configure the connection
|
|
89
|
+
Update `DATABASE_URL` in `.env` with your connection string:
|
|
90
|
+
```
|
|
91
|
+
DATABASE_URL=postgresql://postgres:password@db.example.supabase.co:5432/postgres
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
## Usage
|
|
95
|
+
|
|
96
|
+
### Start the Crawler
|
|
97
|
+
```bash
|
|
98
|
+
npm run crawl
|
|
99
|
+
```
|
|
100
|
+
This launches the interactive wizard, then starts crawling with your chosen settings.
|
|
101
|
+
|
|
102
|
+
### Pre-configure (optional)
|
|
103
|
+
```bash
|
|
104
|
+
npm run config
|
|
105
|
+
```
|
|
106
|
+
Runs the standalone configuration wizard, writes settings to `.env` and patches `src/config.ts`.
|
|
107
|
+
|
|
108
|
+
### Clear the Database
|
|
109
|
+
```bash
|
|
110
|
+
npm run db:clear
|
|
111
|
+
```
|
|
112
|
+
Truncates all tables and resets the crawler state.
|
|
113
|
+
|
|
114
|
+
### Run Tests
|
|
115
|
+
```bash
|
|
116
|
+
npm test
|
|
117
|
+
```
|
|
118
|
+
|
|
119
|
+
## Output
|
|
120
|
+
|
|
121
|
+
### Database Mode
|
|
122
|
+
Crawled data is stored across four tables:
|
|
123
|
+
- **`urls`** — All discovered URLs with status tracking (`PENDING`, `FETCHING`, `DONE`, `FAILED`)
|
|
124
|
+
- **`crawled_pages`** — Extracted content (title, description, headings, text)
|
|
125
|
+
- **`links`** — Link graph edges (from → to relationships)
|
|
126
|
+
- **`domain_stats`** — Per-domain aggregate statistics
|
|
127
|
+
|
|
128
|
+
Query examples:
|
|
129
|
+
```sql
|
|
130
|
+
-- Get all successfully crawled pages
|
|
131
|
+
SELECT url, title FROM crawled_pages
|
|
132
|
+
JOIN urls ON crawled_pages.url_id = urls.id;
|
|
133
|
+
|
|
134
|
+
-- View domain statistics
|
|
135
|
+
SELECT * FROM domain_stats;
|
|
136
|
+
```
|
|
137
|
+
|
|
138
|
+
### PDF Mode
|
|
139
|
+
Each crawl generates a compiled PDF in `output/`:
|
|
140
|
+
- `documentation.pdf` (first run)
|
|
141
|
+
- `documentation2.pdf` (second run)
|
|
142
|
+
- etc.
|
|
143
|
+
|
|
144
|
+
PDFs include:
|
|
145
|
+
- Styled cover page with generation timestamp
|
|
146
|
+
- One chapter per crawled page with title, URL, description, headings outline, and body text
|
|
147
|
+
- Footer with page numbers
|
|
148
|
+
|
|
149
|
+
## Architecture
|
|
150
|
+
|
|
151
|
+
### Core Components
|
|
152
|
+
|
|
153
|
+
| Module | Purpose |
|
|
154
|
+
|----------------------------|----------------------------------------------------------------------|
|
|
155
|
+
| `src/index.ts` | Main entry point; runs CLI wizard and orchestrates crawl session |
|
|
156
|
+
| `src/setup.ts` | Standalone configuration wizard (for `npm run config`) |
|
|
157
|
+
| `src/frontier/scheduler.ts`| Round-robin scheduler with politeness delays and concurrency limits |
|
|
158
|
+
| `src/worker/worker.ts` | Processes individual URLs: download, extract, persist |
|
|
159
|
+
| `src/worker/downloader.ts` | HTTP client with redirect handling and timeouts |
|
|
160
|
+
| `src/worker/extractor.ts` | Cheerio-based HTML parser for metadata and content |
|
|
161
|
+
| `src/output/` | Strategy pattern for output destinations (DB or PDF) |
|
|
162
|
+
| `src/db/queries.ts` | Database queries for URL state management and link tracking |
|
|
163
|
+
| `src/frontier/robots.ts` | Robots.txt parser with per-domain caching |
|
|
164
|
+
|
|
165
|
+
### Design Patterns
|
|
166
|
+
|
|
167
|
+
- **Strategy Pattern** — Output destinations (`DatabaseStrategy`, `PdfStrategy`) implement a common `OutputStrategy` interface, allowing runtime switching
|
|
168
|
+
- **Round-robin scheduling** — Domains are processed in rotation with per-domain cooldowns to enforce politeness delays
|
|
169
|
+
- **Row-level locking** — PostgreSQL `FOR UPDATE SKIP LOCKED` locks the row a worker claims and makes other workers skip it, which prevents workers from claiming the same URL
|
|
170
|
+
|
|
171
|
+
## Safety & Best Practices
|
|
172
|
+
|
|
173
|
+
- **Politeness floor** — `CRAWL_DELAY_MS` cannot be set below 500ms; attempts to do so are flagged and auto-corrected
|
|
174
|
+
- **Domain scoping** — Only URLs matching `ALLOWED_DOMAINS` (derived from seeds) are crawled
|
|
175
|
+
- **Robots.txt compliance** — URLs disallowed by `robots.txt` are marked failed without download
|
|
176
|
+
- **Graceful shutdown** — On reaching `MAX_PAGES`, the scheduler waits for in-flight workers to complete before closing the DB pool
|
|
177
|
+
- **Crash recovery** — On startup, any URLs stuck in `FETCHING` state are reset to `PENDING`
|
|
178
|
+
|
|
179
|
+
## Troubleshooting
|
|
180
|
+
|
|
181
|
+
### Database connection errors
|
|
182
|
+
- Verify `DATABASE_URL` is correct and the database exists
|
|
183
|
+
- Check that the host/port is reachable (port 5432 is commonly blocked on public networks; use Supabase's connection pooler on port 6543 if needed)
|
|
184
|
+
- Ensure the password is URL-encoded if it contains special characters
|
|
185
|
+
|
|
186
|
+
### Crawler picks up wrong URLs
|
|
187
|
+
- Run `npm run db:clear` to wipe stale data from previous runs
|
|
188
|
+
- Verify `seeds.txt` contains only the URLs you want
|
|
189
|
+
- Check that `ALLOWED_DOMAINS` in the wizard output matches your intent
|
|
190
|
+
|
|
191
|
+
### Crawl delay too aggressive
|
|
192
|
+
- The minimum is 500ms. If you set a lower value, it's automatically raised with a warning.
|
|
193
|
+
- Increase `CRAWL_DELAY_MS` if target servers rate-limit or block requests
|
|
194
|
+
|
|
195
|
+
## License
|
|
196
|
+
|
|
197
|
+
ISC
|
|
198
|
+
|
|
199
|
+
## Repository
|
|
200
|
+
|
|
201
|
+
[github.com/lightning4747/Web-crawler](https://github.com/lightning4747/Web-crawler)
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Terminal rendering utilities — colours, symbols, layout helpers.
|
|
3
|
+
* Keeps all ANSI logic in one place so the wizard stays readable.
|
|
4
|
+
*/
|
|
5
|
+
/**
 * ANSI SGR escape sequences used for terminal styling.
 * `reset` clears all attributes; the remaining entries are text attributes
 * (bold/dim/italic) and bright foreground colours.
 */
export const c = (() => {
    // Builds one "Select Graphic Rendition" sequence: ESC [ <code> m
    const sgr = (code) => `\x1b[${code}m`;
    return {
        reset: sgr(0),
        bold: sgr(1),
        dim: sgr(2),
        italic: sgr(3),
        // foreground
        white: sgr(97),
        gray: sgr(90),
        cyan: sgr(96),
        green: sgr(92),
        yellow: sgr(93),
        red: sgr(91),
        blue: sgr(94),
        magenta: sgr(95),
        // 256-colour palette entry 208
        orange: sgr("38;5;208"),
    };
})();
|
|
21
|
+
/**
 * Unicode glyphs used by the CLI renderer.
 * Written as escapes so the file survives tools that mishandle non-ASCII
 * source; the trailing comment shows the rendered character.
 */
export const sym = {
    dot: "\u00b7", // ·
    bullet: "\u2022", // •
    arrow: "\u203a", // ›
    check: "\u2713", // ✓
    cross: "\u2717", // ✗
    warn: "\u26a0", // ⚠
    info: "\u2139", // ℹ
    sparkle: "\u25c6", // ◆
    bar: "\u2502", // │
    corner: "\u2570", // ╰
    tee: "\u251c", // ├
    horiz: "\u2500", // ─
};
|
|
35
|
+
/**
 * Wraps `text` in ANSI styling.
 *
 * @param text Value to decorate.
 * @param styles Escape sequences (typically entries of `c`) emitted before the text.
 * @returns The concatenated sequences, the text, and a trailing reset sequence.
 */
export function style(text, ...styles) {
    const opening = styles.join("");
    return opening + text + c.reset;
}
|
|
39
|
+
/** Writes an empty line to stdout. */
export function blank() {
    console.log();
}
|
|
41
|
+
/**
 * Prints a dim gray horizontal rule.
 *
 * @param width Number of rule characters to print (52 by default).
 */
export function divider(width = 52) {
    const rule = sym.horiz.repeat(width);
    console.log(style(rule, c.dim, c.gray));
}
|
|
45
|
+
/**
 * Prints a bold cyan section heading prefixed with the sparkle glyph.
 *
 * @param label Heading text.
 */
export function section(label) {
    const heading = `${sym.sparkle} ${label}`;
    console.log(style(heading, c.bold, c.cyan));
}
|
|
49
|
+
/**
 * Prints one key/value summary row: a dim arrow, the key padded to a fixed
 * 16-character column in gray, then the value.
 *
 * @param key Label for the row; must be a string (it is padded with `padEnd`).
 * @param value Value to display.
 * @param valueColor Escape sequence applied to the value (white by default).
 */
export function row(key, value, valueColor = c.white) {
    const keyColumnWidth = 16;
    const label = style(key.padEnd(keyColumnWidth), c.gray);
    const shown = style(value, valueColor);
    const marker = style(sym.arrow, c.dim, c.gray);
    console.log(` ${marker} ${label}${shown}`);
}
|
|
56
|
+
/**
 * Prints a success line: a green check mark followed by the message in white.
 *
 * @param msg Message to print.
 */
export function ok(msg) {
    const mark = style(sym.check, c.green);
    const body = style(msg, c.white);
    console.log(` ${mark} ${body}`);
}
|
|
60
|
+
/**
 * Prints a warning line: the warning glyph and the message, both in yellow.
 *
 * @param msg Message to print.
 */
export function warn(msg) {
    const mark = style(sym.warn, c.yellow);
    const body = style(msg, c.yellow);
    console.log(` ${mark} ${body}`);
}
|
|
64
|
+
/**
 * Prints an error line: a cross glyph and the message, both in red.
 * Note that this writes to stdout via `console.log`, not to stderr.
 *
 * @param msg Message to print.
 */
export function err(msg) {
    const mark = style(sym.cross, c.red);
    const body = style(msg, c.red);
    console.log(` ${mark} ${body}`);
}
|
|
68
|
+
/**
 * Prints an informational line: a blue info glyph followed by the message in gray.
 *
 * @param msg Message to print.
 */
export function info(msg) {
    const mark = style(sym.info, c.blue);
    const body = style(msg, c.gray);
    console.log(` ${mark} ${body}`);
}
|
package/dist/config.js
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
import dotenv from "dotenv";
|
|
2
|
+
dotenv.config();
|
|
3
|
+
/**
 * Reads an integer setting from the environment.
 *
 * An unset, empty, or non-numeric value yields `fallback`. Without this guard
 * a typo such as `MAX_PAGES=abc` would produce NaN, and every comparison
 * against NaN is false, silently disabling the limit.
 *
 * @param name Environment variable name.
 * @param fallback Value used when the variable is missing or not an integer.
 * @returns The parsed integer or `fallback`.
 */
function readIntEnv(name, fallback) {
    const parsed = parseInt(process.env[name] || "", 10);
    return Number.isNaN(parsed) ? fallback : parsed;
}
/**
 * Crawler settings, resolved once at module load from environment variables
 * with hard-coded defaults.
 */
export const config = {
    MAX_DEPTH: readIntEnv("MAX_DEPTH", 3),
    CRAWL_DELAY_MS: readIntEnv("CRAWL_DELAY_MS", 1000),
    WORKER_COUNT: readIntEnv("WORKER_COUNT", 10),
    REQUEST_TIMEOUT_MS: readIntEnv("REQUEST_TIMEOUT_MS", 10000),
    MAX_REDIRECTS: readIntEnv("MAX_REDIRECTS", 5),
    MAX_PAGES: readIntEnv("MAX_PAGES", 1000),
    /** Output destination: "database" | "pdf" */
    OUTPUT_MODE: (process.env.OUTPUT_MODE || "database"),
    SEED_URLS: [
        "https://www.akc.org/dog-breeds/",
    ],
    ALLOWED_DOMAINS: [
        "www.akc.org",
    ],
};
|
package/dist/db/clear.js
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
import { pool } from "./client.js";
|
|
2
|
+
/**
 * Truncates every crawler table, resets their identity sequences, and closes
 * the connection pool. Intended to run as a one-shot script.
 *
 * A failure is logged and reflected in the process exit code so that shell
 * scripts and CI can detect that the wipe did not happen.
 */
async function clearDatabase() {
    console.log("Clearing database...");
    try {
        // Truncate all crawler tables and reset the auto-increment IDs
        await pool.query("TRUNCATE TABLE links, crawled_pages, urls, domain_stats RESTART IDENTITY CASCADE;");
        console.log("Database cleared successfully.");
    }
    catch (error) {
        console.error("Error clearing database:", error);
        // Previously the script still exited with status 0 after a failure.
        process.exitCode = 1;
    }
    finally {
        await pool.end();
    }
}
clearDatabase();
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
import pg from "pg";
|
|
2
|
+
import dotenv from "dotenv";
|
|
3
|
+
dotenv.config();
|
|
4
|
+
const { Pool } = pg;
const connectionString = process.env.DATABASE_URL;
// Prefer a single DATABASE_URL; otherwise assemble the connection from the
// individual PG* variables, falling back to local defaults.
const poolOptions = connectionString
    ? { connectionString }
    : {
        host: process.env.PGHOST || "localhost",
        port: parseInt(process.env.PGPORT || "5432", 10),
        user: process.env.PGUSER || "postgres",
        password: process.env.PGPASSWORD || "",
        database: process.env.PGDATABASE || "web_crawler",
    };
/** Shared PostgreSQL connection pool used by all database helpers. */
export const pool = new Pool(poolOptions);
|
|
15
|
+
/**
 * Runs a single statement on the shared pool.
 *
 * @param text SQL text, optionally containing `$n` placeholders.
 * @param params Values bound to the placeholders.
 * @returns The result object produced by `pool.query`.
 */
export async function query(text, params) {
    const result = await pool.query(text, params);
    return result;
}
|
|
18
|
+
/**
 * Shuts down the shared connection pool by awaiting `pool.end()`.
 * Resolves with no value.
 */
export async function closePool() {
    await pool.end();
}
|
|
@@ -0,0 +1,179 @@
|
|
|
1
|
+
import { query, pool } from "./client.js";
|
|
2
|
+
/**
 * Claims the next PENDING URL for `domain` by flipping it to FETCHING.
 *
 * Candidates are ordered by depth, then discovery time. The inner SELECT uses
 * FOR UPDATE SKIP LOCKED so concurrent workers never pick the same row.
 *
 * @param domain Domain whose queue should be consulted.
 * @returns The claimed row (`id`, `url`, `domain`, `status`, `depth`), or
 *          `null` when nothing is pending for the domain.
 */
export async function claimNextURL(domain) {
    const claimSql = `UPDATE urls
        SET status = 'FETCHING', fetched_at = NOW()
        WHERE id = (
            SELECT id FROM urls
            WHERE status = 'PENDING' AND domain = $1
            ORDER BY depth ASC, discovered_at ASC
            LIMIT 1
            FOR UPDATE SKIP LOCKED
        )
        RETURNING id, url, domain, status, depth`;
    const { rows } = await query(claimSql, [domain]);
    return rows.length === 0 ? null : rows[0];
}
|
|
22
|
+
/**
 * Stores a crawled page and marks its URL as DONE in one transaction.
 *
 * @param urlId Primary key of the row in `urls`.
 * @param content Extracted page data; reads `title`, `description`,
 *        `canonicalUrl`, `headings` (serialized as JSON) and `textContent`.
 * @throws Re-throws any error after rolling the transaction back.
 */
export async function markDone(urlId, content) {
    const insertPageSql = `INSERT INTO crawled_pages (url_id, title, description, canonical_url, headings, text_content)
        VALUES ($1, $2, $3, $4, $5, $6)`;
    const finishUrlSql = `UPDATE urls
        SET status = 'DONE', fetched_at = NOW()
        WHERE id = $1`;
    const client = await pool.connect();
    try {
        await client.query("BEGIN");
        const pageValues = [
            urlId,
            content.title,
            content.description,
            content.canonicalUrl,
            JSON.stringify(content.headings),
            content.textContent,
        ];
        await client.query(insertPageSql, pageValues);
        await client.query(finishUrlSql, [urlId]);
        await client.query("COMMIT");
    }
    catch (failure) {
        await client.query("ROLLBACK");
        throw failure;
    }
    finally {
        // Always hand the connection back to the pool.
        client.release();
    }
}
|
|
51
|
+
/**
 * Flags a URL as FAILED, recording the reason and the time of the attempt.
 *
 * @param urlId Primary key of the row in `urls`.
 * @param errorMessage Text stored in `error_message`.
 */
export async function markFailed(urlId, errorMessage) {
    const failSql = `UPDATE urls
        SET status = 'FAILED', error_message = $2, fetched_at = NOW()
        WHERE id = $1`;
    await query(failSql, [urlId, errorMessage]);
}
|
|
59
|
+
/**
 * Inserts a URL as PENDING if it doesn't already exist.
 * Returns the ID of the URL (whether newly inserted or already existing).
 *
 * @param url Normalized URL (unique key of `urls`).
 * @param domain Domain the URL belongs to.
 * @param depth Link distance from the seed that led here.
 * @returns The `id` of the new or pre-existing row.
 * @throws Error when the row can neither be inserted nor found.
 */
export async function insertURL(url, domain, depth) {
    const upsert = await query(`WITH ins AS (
            INSERT INTO urls (url, domain, status, depth)
            VALUES ($1, $2, 'PENDING', $3)
            ON CONFLICT (url) DO NOTHING
            RETURNING id
        )
        SELECT id FROM ins
        UNION ALL
        SELECT id FROM urls WHERE url = $1
        LIMIT 1`, [url, domain, depth]);
    if (upsert.rows.length > 0) {
        return upsert.rows[0].id;
    }
    // Another worker committed the same URL while this statement was running:
    // ON CONFLICT DO NOTHING skipped the insert, and the SELECT above ran
    // against a snapshot taken before that commit, so it saw nothing. A fresh
    // statement gets a new snapshot and can see the row.
    const existing = await query(`SELECT id FROM urls WHERE url = $1`, [url]);
    if (existing.rows.length === 0) {
        throw new Error(`insertURL: could not insert or find URL ${url}`);
    }
    return existing.rows[0].id;
}
|
|
76
|
+
/**
 * Records a directed edge in the link graph; an edge that already exists is
 * left untouched.
 *
 * @param fromUrlId `urls.id` of the page containing the link.
 * @param toUrlId `urls.id` of the link target.
 */
export async function insertLink(fromUrlId, toUrlId) {
    const edgeSql = `INSERT INTO links (from_url_id, to_url_id)
        VALUES ($1, $2)
        ON CONFLICT (from_url_id, to_url_id) DO NOTHING`;
    await query(edgeSql, [fromUrlId, toUrlId]);
}
|
|
84
|
+
/**
 * Crash recovery: puts every URL still marked FETCHING back to PENDING so it
 * can be claimed again.
 */
export async function resetStaleLocks() {
    const releaseSql = `UPDATE urls
        SET status = 'PENDING'
        WHERE status = 'FETCHING'`;
    await query(releaseSql);
}
|
|
93
|
+
/**
 * Removes PENDING URLs whose domain is not in `allowedDomains`, so a session
 * with different seeds does not inherit another session's queue.
 * Does nothing when the list is empty. Logs how many rows were removed when
 * at least one was.
 *
 * @param allowedDomains Domains that may stay in the queue.
 */
export async function clearPendingURLs(allowedDomains) {
    if (allowedDomains.length === 0) {
        return;
    }
    const purgeSql = `DELETE FROM urls
        WHERE status = 'PENDING'
        AND domain <> ALL($1::text[])`;
    const { rowCount } = await query(purgeSql, [allowedDomains]);
    const removed = rowCount ?? 0;
    if (removed > 0) {
        console.log(`[setup] Cleared ${removed} stale PENDING URL(s) outside allowed domains.`);
    }
}
|
|
109
|
+
/**
 * Counts URLs per status across the whole `urls` table.
 *
 * @returns `{ pending, fetching, done, failed }`; a status with no rows stays
 *          at 0 and unrecognized statuses are ignored.
 */
export async function getGlobalStats() {
    const { rows } = await query(`SELECT status, COUNT(*) as count
        FROM urls
        GROUP BY status`);
    const totals = { pending: 0, fetching: 0, done: 0, failed: 0 };
    for (const { status, count } of rows) {
        const bucket = status.toLowerCase();
        // Only the four known buckets are filled in.
        if (Object.hasOwn(totals, bucket)) {
            totals[bucket] = parseInt(count, 10);
        }
    }
    return totals;
}
|
|
131
|
+
/**
 * Rebuilds the per-domain counters in `domain_stats` from the `urls` table.
 * Creates the table first if it is missing, then upserts one row per domain
 * currently present in `urls`.
 */
export async function refreshDomainStats() {
    const ensureTableSql = `
        CREATE TABLE IF NOT EXISTS domain_stats (
            domain TEXT PRIMARY KEY,
            pending_count INTEGER NOT NULL DEFAULT 0,
            fetching_count INTEGER NOT NULL DEFAULT 0,
            done_count INTEGER NOT NULL DEFAULT 0,
            failed_count INTEGER NOT NULL DEFAULT 0,
            last_crawled_at TIMESTAMPTZ
        )
    `;
    const upsertCountsSql = `
        INSERT INTO domain_stats (domain, pending_count, fetching_count, done_count, failed_count, last_crawled_at)
        SELECT
            domain,
            COUNT(*) FILTER (WHERE status = 'PENDING') as pending_count,
            COUNT(*) FILTER (WHERE status = 'FETCHING') as fetching_count,
            COUNT(*) FILTER (WHERE status = 'DONE') as done_count,
            COUNT(*) FILTER (WHERE status = 'FAILED') as failed_count,
            MAX(fetched_at) as last_crawled_at
        FROM urls
        GROUP BY domain
        ON CONFLICT (domain) DO UPDATE SET
            pending_count = EXCLUDED.pending_count,
            fetching_count = EXCLUDED.fetching_count,
            done_count = EXCLUDED.done_count,
            failed_count = EXCLUDED.failed_count,
            last_crawled_at = EXCLUDED.last_crawled_at
    `;
    await query(ensureTableSql);
    await query(upsertCountsSql);
}
|
|
164
|
+
/**
 * Reads every row of `domain_stats`, ordered by domain.
 *
 * @returns One object per domain with integer counters and `last_crawled_at`
 *          as a `Date`, or `null` when the stored value is falsy.
 */
export async function getDomainStats() {
    const { rows } = await query(`SELECT domain, pending_count, fetching_count, done_count, failed_count, last_crawled_at
        FROM domain_stats
        ORDER BY domain ASC`);
    const toInt = (value) => parseInt(value, 10);
    return rows.map((entry) => {
        const lastCrawled = entry.last_crawled_at ? new Date(entry.last_crawled_at) : null;
        return {
            domain: entry.domain,
            pending_count: toInt(entry.pending_count),
            fetching_count: toInt(entry.fetching_count),
            done_count: toInt(entry.done_count),
            failed_count: toInt(entry.failed_count),
            last_crawled_at: lastCrawled,
        };
    });
}
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
import { query } from "../db/client.js";
|
|
2
|
+
import { config } from "../config.js";
|
|
3
|
+
/**
 * Lists the distinct domains that still have at least one PENDING URL.
 *
 * When `config.ALLOWED_DOMAINS` is non-empty the filter is applied in SQL, so
 * pending rows outside the current session's scope are never returned.
 *
 * @returns Array of domain names.
 */
export async function getPendingDomains() {
    const allowed = config.ALLOWED_DOMAINS;
    const restrictToAllowed = allowed?.length > 0;
    const result = restrictToAllowed
        ? await query(`SELECT DISTINCT domain
            FROM urls
            WHERE status = 'PENDING'
            AND domain = ANY($1::text[])`, [allowed])
        : await query(`SELECT DISTINCT domain
            FROM urls
            WHERE status = 'PENDING'`);
    return result.rows.map(({ domain }) => domain);
}
|
|
23
|
+
/**
 * Counts PENDING URLs per domain, limited to `config.ALLOWED_DOMAINS` when
 * that list is non-empty.
 *
 * @returns Plain object mapping domain name to its pending count.
 */
export async function getPendingCounts() {
    const allowed = config.ALLOWED_DOMAINS;
    let result;
    if (allowed?.length > 0) {
        result = await query(`SELECT domain, COUNT(*) as count
            FROM urls
            WHERE status = 'PENDING'
            AND domain = ANY($1::text[])
            GROUP BY domain`, [allowed]);
    }
    else {
        result = await query(`SELECT domain, COUNT(*) as count
            FROM urls
            WHERE status = 'PENDING'
            GROUP BY domain`);
    }
    const pendingByDomain = {};
    for (const { domain, count } of result.rows) {
        pendingByDomain[domain] = parseInt(count, 10);
    }
    return pendingByDomain;
}
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
import { getGlobalStats, refreshDomainStats, getDomainStats } from "../db/queries.js";
|
|
2
|
+
// Handle of the running progress timer; null while the logger is stopped.
let loggerInterval = null;
// done + failed total captured when the logger starts; baseline for the crawl rate.
let lastDoneAndFailedCount = 0;
// Date.now() timestamp taken when the logger starts.
let startTime = 0;
|
|
5
|
+
/**
 * Starts a background interval that periodically prints crawl progress.
 *
 * Each tick refreshes `domain_stats`, then prints the crawl rate since start,
 * the global status breakdown, and a per-domain breakdown. Calling this while
 * a logger is already running is a no-op.
 *
 * @param intervalMs Delay between reports in milliseconds (5000 by default).
 */
export async function startProgressLogger(intervalMs = 5000) {
    if (loggerInterval)
        return;
    startTime = Date.now();
    try {
        const initialStats = await getGlobalStats();
        lastDoneAndFailedCount = initialStats.done + initialStats.failed;
    }
    catch {
        // Best effort: without a baseline the rate is computed from zero.
        lastDoneAndFailedCount = 0;
    }
    // A second call may have started the logger while the baseline query was
    // in flight. Creating another interval here would overwrite its handle
    // and leave a timer that stopProgressLogger() can never clear.
    if (loggerInterval)
        return;
    loggerInterval = setInterval(async () => {
        try {
            // 1. Sync statistics to domain_stats table
            await refreshDomainStats();
            // 2. Fetch global statistics
            const globalStats = await getGlobalStats();
            // 3. Fetch domain-level statistics
            const domainStats = await getDomainStats();
            // 4. Calculate crawl rates
            const currentCompleted = globalStats.done + globalStats.failed;
            const completedSinceStart = currentCompleted - lastDoneAndFailedCount;
            const elapsedMinutes = (Date.now() - startTime) / 60000;
            const crawlRate = elapsedMinutes > 0 ? (completedSinceStart / elapsedMinutes).toFixed(1) : "0.0";
            // 5. Build and output the formatted log messages
            console.log(`\n=== Crawler Progress Report ===`);
            console.log(`Speed: ${crawlRate} pages/min`);
            console.log(`Global Status Breakdown:`);
            console.log(` PENDING : ${globalStats.pending}`);
            console.log(` FETCHING: ${globalStats.fetching}`);
            console.log(` DONE : ${globalStats.done}`);
            console.log(` FAILED : ${globalStats.failed}`);
            if (domainStats.length > 0) {
                console.log(`Domain Breakdown:`);
                for (const ds of domainStats) {
                    const lastCrawledStr = ds.last_crawled_at ? ds.last_crawled_at.toISOString() : "never";
                    console.log(` - ${ds.domain}: PENDING: ${ds.pending_count} | FETCHING: ${ds.fetching_count} | DONE: ${ds.done_count} | FAILED: ${ds.failed_count} (Last Crawled: ${lastCrawledStr})`);
                }
            }
            console.log(`================================\n`);
        }
        catch (error) {
            // A failed report must not stop the timer or the crawl.
            console.error("Error generating crawler progress logs:", error);
        }
    }, intervalMs);
    // Do not let the reporting timer keep the process alive on its own.
    if (loggerInterval && typeof loggerInterval.unref === "function") {
        loggerInterval.unref();
    }
}
|
|
57
|
+
/**
 * Cancels the progress timer, if one is running, and clears its handle so the
 * logger can be started again later.
 */
export function stopProgressLogger() {
    if (!loggerInterval) {
        return;
    }
    clearInterval(loggerInterval);
    loggerInterval = null;
}
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
import { request } from "undici";
|
|
2
|
+
// @ts-ignore
|
|
3
|
+
import robotsParser from "robots-parser";
|
|
4
|
+
import { getDomain } from "../normalizer.js";
|
|
5
|
+
import { config } from "../config.js";
|
|
6
|
+
// Parsed robots.txt rules keyed by domain; filled on the first successful fetch.
const robotsCache = new Map();
// Domains whose robots.txt request answered with a non-200 status; these are treated as allowed.
const fetchFailures = new Set();
|
|
8
|
+
/**
 * Checks if a URL is allowed to be crawled according to the domain's robots.txt rules.
 * Caches robots.txt rules per domain to avoid duplicate requests.
 *
 * Outcomes:
 * - no domain can be derived from the URL: `false`;
 * - robots.txt answers with a non-200 status: allowed, and remembered for the domain;
 * - the request itself fails: allowed, but not remembered, so a later call retries;
 * - otherwise the parsed rules decide, using the "WebCrawler" user agent.
 *
 * @param urlStr Absolute URL to check.
 * @returns `true` when crawling is permitted, `false` otherwise.
 */
export async function isAllowedByRobots(urlStr) {
    const domain = getDomain(urlStr);
    if (!domain)
        return false;
    if (fetchFailures.has(domain)) {
        return true;
    }
    let parser = robotsCache.get(domain);
    if (!parser) {
        const robotsUrl = `https://${domain}/robots.txt`;
        try {
            const res = await request(robotsUrl, {
                method: "GET",
                headersTimeout: config.REQUEST_TIMEOUT_MS,
                bodyTimeout: config.REQUEST_TIMEOUT_MS,
            });
            if (res.statusCode !== 200) {
                // Record the failure first so it sticks even if draining throws.
                fetchFailures.add(domain);
                // undici holds the connection until the body is consumed;
                // discard the unused payload so it is released.
                await res.body.dump();
                return true;
            }
            const content = await res.body.text();
            parser = robotsParser(robotsUrl, content);
            robotsCache.set(domain, parser);
        }
        catch (e) {
            // On network/request errors, default to allowed but do not permanently cache failure
            return true;
        }
    }
    // The parser returns undefined when the URL is outside this robots.txt's scope.
    const isAllowed = parser.isAllowed(urlStr, "WebCrawler");
    return isAllowed === undefined ? true : isAllowed;
}
|