messi-crawler 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73) hide show
  1. package/README.md +201 -0
  2. package/dist/cli/renderer.js +71 -0
  3. package/dist/config.js +18 -0
  4. package/dist/db/clear.js +16 -0
  5. package/dist/db/client.js +20 -0
  6. package/dist/db/queries.js +179 -0
  7. package/dist/frontier/frontier.js +44 -0
  8. package/dist/frontier/logger.js +65 -0
  9. package/dist/frontier/robots.js +46 -0
  10. package/dist/frontier/scheduler.js +98 -0
  11. package/dist/index.js +533 -0
  12. package/dist/normalizer.js +33 -0
  13. package/dist/output/db-strategy.js +16 -0
  14. package/dist/output/index.js +23 -0
  15. package/dist/output/pdf-strategy.js +316 -0
  16. package/dist/output/strategy.js +1 -0
  17. package/dist/security/ssrf.js +45 -0
  18. package/dist/security/validate-url.js +41 -0
  19. package/dist/seed.js +14 -0
  20. package/dist/setup.js +148 -0
  21. package/dist/test/client.test.js +33 -0
  22. package/dist/test/downloader.test.js +84 -0
  23. package/dist/test/extractor.test.js +126 -0
  24. package/dist/test/frontier.test.js +43 -0
  25. package/dist/test/logger.test.js +55 -0
  26. package/dist/test/normalizer.test.js +36 -0
  27. package/dist/test/pdf-strategy.test.js +68 -0
  28. package/dist/test/queries.test.js +173 -0
  29. package/dist/test/robots.test.js +46 -0
  30. package/dist/test/scheduler.test.js +73 -0
  31. package/dist/test/seed.test.js +26 -0
  32. package/dist/test/worker.test.js +118 -0
  33. package/dist/worker/downloader.js +114 -0
  34. package/dist/worker/extractor.js +197 -0
  35. package/dist/worker/worker.js +87 -0
  36. package/package.json +48 -0
  37. package/seeds.txt +4 -0
  38. package/src/cli/renderer.ts +83 -0
  39. package/src/config.ts +22 -0
  40. package/src/db/clear.ts +16 -0
  41. package/src/db/client.ts +26 -0
  42. package/src/db/queries.ts +255 -0
  43. package/src/db/schema.sql +43 -0
  44. package/src/frontier/frontier.ts +60 -0
  45. package/src/frontier/logger.ts +75 -0
  46. package/src/frontier/robots.ts +50 -0
  47. package/src/frontier/scheduler.ts +119 -0
  48. package/src/index.ts +596 -0
  49. package/src/normalizer.ts +37 -0
  50. package/src/output/db-strategy.ts +20 -0
  51. package/src/output/index.ts +32 -0
  52. package/src/output/pdf-strategy.ts +388 -0
  53. package/src/output/strategy.ts +16 -0
  54. package/src/security/ssrf.ts +48 -0
  55. package/src/security/validate-url.ts +49 -0
  56. package/src/seed.ts +18 -0
  57. package/src/setup.ts +170 -0
  58. package/src/test/client.test.ts +38 -0
  59. package/src/test/downloader.test.ts +101 -0
  60. package/src/test/extractor.test.ts +139 -0
  61. package/src/test/frontier.test.ts +53 -0
  62. package/src/test/logger.test.ts +71 -0
  63. package/src/test/normalizer.test.ts +43 -0
  64. package/src/test/pdf-strategy.test.ts +84 -0
  65. package/src/test/queries.test.ts +247 -0
  66. package/src/test/robots.test.ts +56 -0
  67. package/src/test/scheduler.test.ts +90 -0
  68. package/src/test/seed.test.ts +35 -0
  69. package/src/test/worker.test.ts +144 -0
  70. package/src/worker/downloader.ts +149 -0
  71. package/src/worker/extractor.ts +235 -0
  72. package/src/worker/worker.ts +100 -0
  73. package/tsconfig.json +15 -0
package/README.md ADDED
@@ -0,0 +1,201 @@
1
+ # Web Crawler
2
+
3
+ An interactive, CLI-driven web crawler built with Node.js, TypeScript, and a PostgreSQL backend. Specifically engineered to systematically extract and compile programming-related documentation from seed URLs, the crawler offers flexible output strategies — structured database records or compiled PDF eBooks — with politeness constraints, domain filtering, and depth controls.
4
+
5
+ ## System Requirements
6
+
7
+ - **Node.js** (version 18 or greater)
8
+ - **PostgreSQL** (local or hosted instance like Supabase)
9
+
10
+ ## Features
11
+
12
+ ### Interactive CLI Wizard
13
+ Run `npm run crawl` to launch an interactive setup wizard that guides you through:
14
+ - **Output mode selection** — Database (structured records) or PDF (compiled eBook)
15
+ - **Seed URL source** — Use URLs from `seeds.txt`, config defaults, or enter custom URLs
16
+ - **Performance tuning** — Configure depth, crawl delay, worker count, and page limits
17
+
18
+ ### Flexible Output Strategies
19
+ - **Database mode** — Stores extracted content (URL, title, description, headings, text) as structured records in PostgreSQL with link graph tracking
20
+ - **PDF mode** — Compiles all crawled pages into a formatted PDF eBook with cover page, table of contents, and styled chapters. PDFs are auto-versioned (`documentation.pdf`, `documentation2.pdf`, etc.) to avoid overwrites
21
+
22
+ ### Safety & Politeness
23
+ - **Minimum crawl delay** — Enforces a 500ms floor on `CRAWL_DELAY_MS` to prevent accidental aggressive request rates
24
+ - **Robots.txt compliance** — Respects disallow directives per domain
25
+ - **Domain filtering** — Restricts crawling to seed domains only; child links outside allowed domains are ignored
26
+ - **Concurrency limits** — Configurable worker pool to control concurrent requests
27
+
28
+ ### Session-Scoped Crawling
29
+ - Each run filters the DB queue to only process URLs matching the current session's allowed domains
30
+ - Stale pending URLs from previous runs are automatically cleared at startup
31
+ - Session page counter tracks progress independently of cumulative DB totals
32
+
33
+ ## Configuration
34
+
35
+ The crawler is configured interactively via the CLI wizard, but you can also pre-set defaults using environment variables or `src/config.ts`.
36
+
37
+ ### Environment Variables (`.env`)
38
+ Create a `.env` file in the root directory with:
39
+
40
+ ```env
41
+ DATABASE_URL=postgresql://user:password@host:5432/dbname
42
+ MAX_DEPTH=3
43
+ CRAWL_DELAY_MS=1000
44
+ WORKER_COUNT=5
45
+ MAX_PAGES=1000
46
+ OUTPUT_MODE=database
47
+ ```
48
+
49
+ ### Seed URLs (`seeds.txt`)
50
+ Add target URLs to `seeds.txt`, one per line:
51
+
52
+ ```
53
+ # Programming documentation sources
54
+ https://react.dev
55
+ https://developer.mozilla.org
56
+ https://www.typescriptlang.org/
57
+ ```
58
+
59
+ Lines starting with `#` are ignored. The wizard defaults to using `seeds.txt` if present.
60
+
61
+ ### Configuration Fields
62
+
63
+ | Field | Description | Default |
64
+ |--------------------|-----------------------------------------------------------------------------|---------------|
65
+ | `MAX_DEPTH` | Maximum link hops from seed URLs (0 = seeds only) | `3` |
66
+ | `CRAWL_DELAY_MS` | Politeness delay per domain (min 500ms enforced) | `1000` |
67
+ | `WORKER_COUNT` | Number of concurrent workers | `10` |
68
+ | `MAX_PAGES` | Page limit per session (0 = unlimited) | `1000` |
69
+ | `OUTPUT_MODE` | Output destination: `database` or `pdf` | `database` |
70
+ | `DATABASE_URL` | PostgreSQL connection string | (required) |
71
+
72
+ ## Database Setup
73
+
74
+ Before running the crawler, initialize the database schema.
75
+
76
+ ### 1. Create the database
77
+ ```sql
78
+ CREATE DATABASE web_crawler;
79
+ ```
80
+
81
+ ### 2. Apply the schema
82
+ ```bash
83
+ psql -U postgres -d web_crawler -f src/db/schema.sql
84
+ ```
85
+
86
+ Or if using a hosted service like Supabase, run the contents of `src/db/schema.sql` in the SQL editor.
87
+
88
+ ### 3. Configure the connection
89
+ Update `DATABASE_URL` in `.env` with your connection string:
90
+ ```
91
+ DATABASE_URL=postgresql://postgres:password@db.example.supabase.co:5432/postgres
92
+ ```
93
+
94
+ ## Usage
95
+
96
+ ### Start the Crawler
97
+ ```bash
98
+ npm run crawl
99
+ ```
100
+ This launches the interactive wizard, then starts crawling with your chosen settings.
101
+
102
+ ### Pre-configure (optional)
103
+ ```bash
104
+ npm run config
105
+ ```
106
+ Runs the standalone configuration wizard, which writes settings to `.env` and patches `src/config.ts`.
107
+
108
+ ### Clear the Database
109
+ ```bash
110
+ npm run db:clear
111
+ ```
112
+ Truncates all tables and resets the crawler state.
113
+
114
+ ### Run Tests
115
+ ```bash
116
+ npm test
117
+ ```
118
+
119
+ ## Output
120
+
121
+ ### Database Mode
122
+ Crawled data is stored across four tables:
123
+ - **`urls`** — All discovered URLs with status tracking (`PENDING`, `FETCHING`, `DONE`, `FAILED`)
124
+ - **`crawled_pages`** — Extracted content (title, description, headings, text)
125
+ - **`links`** — Link graph edges (from → to relationships)
126
+ - **`domain_stats`** — Per-domain aggregate statistics
127
+
128
+ Query examples:
129
+ ```sql
130
+ -- Get all successfully crawled pages
131
+ SELECT url, title FROM crawled_pages
132
+ JOIN urls ON crawled_pages.url_id = urls.id;
133
+
134
+ -- View domain statistics
135
+ SELECT * FROM domain_stats;
136
+ ```
137
+
138
+ ### PDF Mode
139
+ Each crawl generates a compiled PDF in `output/`:
140
+ - `documentation.pdf` (first run)
141
+ - `documentation2.pdf` (second run)
142
+ - etc.
143
+
144
+ PDFs include:
145
+ - Styled cover page with generation timestamp
146
+ - One chapter per crawled page with title, URL, description, headings outline, and body text
147
+ - Footer with page numbers
148
+
149
+ ## Architecture
150
+
151
+ ### Core Components
152
+
153
+ | Module | Purpose |
154
+ |----------------------------|----------------------------------------------------------------------|
155
+ | `src/index.ts` | Main entry point; runs CLI wizard and orchestrates crawl session |
156
+ | `src/setup.ts` | Standalone configuration wizard (for `npm run config`) |
157
+ | `src/frontier/scheduler.ts`| Round-robin scheduler with politeness delays and concurrency limits |
158
+ | `src/worker/worker.ts` | Processes individual URLs: download, extract, persist |
159
+ | `src/worker/downloader.ts` | HTTP client with redirect handling and timeouts |
160
+ | `src/worker/extractor.ts` | Cheerio-based HTML parser for metadata and content |
161
+ | `src/output/` | Strategy pattern for output destinations (DB or PDF) |
162
+ | `src/db/queries.ts` | Database queries for URL state management and link tracking |
163
+ | `src/frontier/robots.ts` | Robots.txt parser with per-domain caching |
164
+
165
+ ### Design Patterns
166
+
167
+ - **Strategy Pattern** — Output destinations (`DatabaseStrategy`, `PdfStrategy`) implement a common `OutputStrategy` interface, allowing runtime switching
168
+ - **Round-robin scheduling** — Domains are processed in rotation with per-domain cooldowns to enforce politeness delays
169
+ - **Row-level locking** — PostgreSQL `FOR UPDATE SKIP LOCKED` prevents workers from claiming the same URL
170
+
171
+ ## Safety & Best Practices
172
+
173
+ - **Politeness floor** — `CRAWL_DELAY_MS` cannot be set below 500ms; attempts to do so are flagged and auto-corrected
174
+ - **Domain scoping** — Only URLs matching `ALLOWED_DOMAINS` (derived from seeds) are crawled
175
+ - **Robots.txt compliance** — URLs disallowed by `robots.txt` are marked failed without download
176
+ - **Graceful shutdown** — On reaching `MAX_PAGES`, the scheduler waits for in-flight workers to complete before closing the DB pool
177
+ - **Crash recovery** — On startup, any URLs stuck in `FETCHING` state are reset to `PENDING`
178
+
179
+ ## Troubleshooting
180
+
181
+ ### Database connection errors
182
+ - Verify `DATABASE_URL` is correct and the database exists
183
+ - Check that the host/port is reachable (port 5432 is commonly blocked on public networks; use Supabase's connection pooler on port 6543 if needed)
184
+ - Ensure the password is URL-encoded if it contains special characters
185
+
186
+ ### Crawler picks up wrong URLs
187
+ - Run `npm run db:clear` to wipe stale data from previous runs
188
+ - Verify `seeds.txt` contains only the URLs you want
189
+ - Check that `ALLOWED_DOMAINS` in the wizard output matches your intent
190
+
191
+ ### Crawl delay too aggressive
192
+ - The minimum is 500ms. If you set a lower value, it's automatically raised with a warning.
193
+ - Increase `CRAWL_DELAY_MS` if target servers rate-limit or block requests
194
+
195
+ ## License
196
+
197
+ ISC
198
+
199
+ ## Repository
200
+
201
+ [github.com/lightning4747/Web-crawler](https://github.com/lightning4747/Web-crawler)
@@ -0,0 +1,71 @@
1
/**
 * Terminal rendering helpers: ANSI colour codes, glyphs, and small console
 * printers. All escape-sequence handling lives here so callers only deal
 * with plain strings.
 */

/** ANSI SGR escape sequences keyed by style name. */
export const c = (() => {
    // Wraps an SGR parameter list in the ESC[ ... m envelope.
    const sgr = (params) => `\x1b[${params}m`;
    return {
        reset: sgr(0),
        bold: sgr(1),
        dim: sgr(2),
        italic: sgr(3),
        // foreground
        white: sgr(97),
        gray: sgr(90),
        cyan: sgr(96),
        green: sgr(92),
        yellow: sgr(93),
        red: sgr(91),
        blue: sgr(94),
        magenta: sgr(95),
        orange: sgr("38;5;208"),
    };
})();

/** Glyphs used by the printers below. */
export const sym = {
    dot: "\u00b7", // ·
    bullet: "\u2022", // •
    arrow: "\u203a", // ›
    check: "\u2713", // ✓
    cross: "\u2717", // ✗
    warn: "\u26a0", // ⚠
    info: "\u2139", // ℹ
    sparkle: "\u25c6", // ◆
    bar: "\u2502", // │
    corner: "\u2570", // ╰
    tee: "\u251c", // ├
    horiz: "\u2500", // ─
};

/**
 * Prefixes `text` with the given ANSI sequences (in order) and appends a reset.
 * The reset is appended even when no styles are passed.
 */
export function style(text, ...styles) {
    const opening = styles.join("");
    return opening + text + c.reset;
}

/** Writes an empty line to stdout. */
export function blank() {
    console.log();
}

/** Writes a dim gray horizontal rule `width` glyphs long (default 52). */
export function divider(width = 52) {
    const rule = sym.horiz.repeat(width);
    console.log(style(rule, c.dim, c.gray));
}

/** Writes a bold cyan section heading, e.g. "◆ Seeds". */
export function section(label) {
    const heading = `${sym.sparkle} ${label}`;
    console.log(style(heading, c.bold, c.cyan));
}

/**
 * Writes a key/value summary row. The key is padded to 16 characters and
 * shown in gray; the value uses `valueColor` (white by default).
 */
export function row(key, value, valueColor = c.white) {
    const labelWidth = 16;
    const label = style(key.padEnd(labelWidth), c.gray);
    const shown = style(value, valueColor);
    const marker = style(sym.arrow, c.dim, c.gray);
    console.log(` ${marker} ${label}${shown}`);
}

/** Shared layout for the one-line status printers: coloured glyph, then coloured message. */
function statusLine(glyph, glyphColor, msg, msgColor) {
    console.log(` ${style(glyph, glyphColor)} ${style(msg, msgColor)}`);
}

/** Success line: green check mark, white message. */
export function ok(msg) {
    statusLine(sym.check, c.green, msg, c.white);
}

/** Warning line: yellow warning sign and message. */
export function warn(msg) {
    statusLine(sym.warn, c.yellow, msg, c.yellow);
}

/** Error line: red cross and message. */
export function err(msg) {
    statusLine(sym.cross, c.red, msg, c.red);
}

/** Info line: blue info glyph, gray message. */
export function info(msg) {
    statusLine(sym.info, c.blue, msg, c.gray);
}
package/dist/config.js ADDED
@@ -0,0 +1,18 @@
1
+ import dotenv from "dotenv";
2
+ dotenv.config();
3
/**
 * Reads an integer setting from the environment.
 *
 * Falls back to `fallback` when the variable is unset, empty, or does not
 * start with a parseable integer, so a typo such as `MAX_DEPTH=abc` cannot
 * put NaN into the crawler's limits and timers.
 *
 * @param {string} name - Environment variable name.
 * @param {number} fallback - Value used when no integer can be parsed.
 * @returns {number} The parsed integer or the fallback.
 */
function envInt(name, fallback) {
    const parsed = Number.parseInt(process.env[name] ?? "", 10);
    return Number.isNaN(parsed) ? fallback : parsed;
}

/**
 * Crawler settings, resolved once at module load from environment variables
 * with built-in defaults.
 */
export const config = {
    MAX_DEPTH: envInt("MAX_DEPTH", 3),
    CRAWL_DELAY_MS: envInt("CRAWL_DELAY_MS", 1000),
    WORKER_COUNT: envInt("WORKER_COUNT", 10),
    REQUEST_TIMEOUT_MS: envInt("REQUEST_TIMEOUT_MS", 10000),
    MAX_REDIRECTS: envInt("MAX_REDIRECTS", 5),
    MAX_PAGES: envInt("MAX_PAGES", 1000),
    /** Output destination: "database" | "pdf" */
    OUTPUT_MODE: (process.env.OUTPUT_MODE || "database"),
    SEED_URLS: [
        "https://www.akc.org/dog-breeds/",
    ],
    ALLOWED_DOMAINS: [
        "www.akc.org",
    ],
};
@@ -0,0 +1,16 @@
1
+ import { pool } from "./client.js";
2
/**
 * Truncates the crawler tables (links, crawled_pages, urls, domain_stats),
 * resets their identity sequences, and then closes the connection pool.
 *
 * A failed truncate is logged and recorded in `process.exitCode` so that
 * shells and npm scripts can detect it; the pool is closed either way.
 */
async function clearDatabase() {
    console.log("Clearing database...");
    try {
        // Truncate all crawler tables and reset the auto-increment IDs
        await pool.query("TRUNCATE TABLE links, crawled_pages, urls, domain_stats RESTART IDENTITY CASCADE;");
        console.log("Database cleared successfully.");
    }
    catch (error) {
        console.error("Error clearing database:", error);
        // Without this the script would still exit with status 0.
        process.exitCode = 1;
    }
    finally {
        await pool.end();
    }
}
// The only rejection that can reach this point comes from pool.end() in the
// finally block; report it instead of leaving an unhandled rejection.
clearDatabase().catch((error) => {
    console.error("Error closing database pool:", error);
    process.exitCode = 1;
});
@@ -0,0 +1,20 @@
1
+ import pg from "pg";
2
+ import dotenv from "dotenv";
3
+ dotenv.config();
4
const { Pool } = pg;

/**
 * Picks the pool options: a single connection string when DATABASE_URL is
 * set, otherwise discrete PG* variables with local-development defaults.
 */
function resolvePoolOptions() {
    const connectionString = process.env.DATABASE_URL;
    if (connectionString) {
        return { connectionString };
    }
    return {
        host: process.env.PGHOST || "localhost",
        port: parseInt(process.env.PGPORT || "5432", 10),
        user: process.env.PGUSER || "postgres",
        password: process.env.PGPASSWORD || "",
        database: process.env.PGDATABASE || "web_crawler",
    };
}

/** Shared connection pool used by every query helper. */
export const pool = new Pool(resolvePoolOptions());

/** Runs a parameterised statement on the shared pool and resolves with its result. */
export async function query(text, params) {
    const result = await pool.query(text, params);
    return result;
}

/** Ends the shared pool. */
export async function closePool() {
    await pool.end();
}
@@ -0,0 +1,179 @@
1
+ import { query, pool } from "./client.js";
2
/**
 * Atomically picks the next PENDING URL of `domain` (lowest depth first,
 * then oldest discovery) and flips it to FETCHING.
 * The inner SELECT uses FOR UPDATE SKIP LOCKED so concurrent workers do not
 * claim the same row.
 *
 * @returns the claimed row (id, url, domain, status, depth), or null when
 *          the domain has nothing pending.
 */
export async function claimNextURL(domain) {
    const claimSql = `UPDATE urls
        SET status = 'FETCHING', fetched_at = NOW()
        WHERE id = (
        SELECT id FROM urls
        WHERE status = 'PENDING' AND domain = $1
        ORDER BY depth ASC, discovered_at ASC
        LIMIT 1
        FOR UPDATE SKIP LOCKED
        )
        RETURNING id, url, domain, status, depth`;
    const { rows } = await query(claimSql, [domain]);
    return rows.length === 0 ? null : rows[0];
}
22
/**
 * Stores the extracted page content and marks the URL as DONE in a single
 * transaction, so a page row never exists without the matching status change.
 *
 * @param urlId - id of the row in `urls`.
 * @param content - extracted page fields: title, description, canonicalUrl,
 *                  headings (stored as JSON text) and textContent.
 * @throws rethrows the error that caused the transaction to fail.
 */
export async function markDone(urlId, content) {
    const client = await pool.connect();
    try {
        await client.query("BEGIN");
        await client.query(`INSERT INTO crawled_pages (url_id, title, description, canonical_url, headings, text_content)
            VALUES ($1, $2, $3, $4, $5, $6)`, [
            urlId,
            content.title,
            content.description,
            content.canonicalUrl,
            JSON.stringify(content.headings),
            content.textContent,
        ]);
        await client.query(`UPDATE urls
            SET status = 'DONE', fetched_at = NOW()
            WHERE id = $1`, [urlId]);
        await client.query("COMMIT");
    }
    catch (error) {
        try {
            await client.query("ROLLBACK");
        }
        catch (rollbackError) {
            // A failed rollback (e.g. dropped connection) must not replace
            // the error that actually caused the failure.
            console.error("Rollback failed while marking URL as done:", rollbackError);
        }
        throw error;
    }
    finally {
        client.release();
    }
}
51
/**
 * Flags a URL as FAILED, recording the reason and stamping fetched_at.
 */
export async function markFailed(urlId, errorMessage) {
    const failSql = `UPDATE urls
        SET status = 'FAILED', error_message = $2, fetched_at = NOW()
        WHERE id = $1`;
    const params = [urlId, errorMessage];
    await query(failSql, params);
}
59
/**
 * Inserts a URL as PENDING if it doesn't already exist.
 * Returns the ID of the URL (whether newly inserted or already existing).
 *
 * @throws Error when the URL can neither be inserted nor found afterwards.
 */
export async function insertURL(url, domain, depth) {
    const res = await query(`WITH ins AS (
        INSERT INTO urls (url, domain, status, depth)
        VALUES ($1, $2, 'PENDING', $3)
        ON CONFLICT (url) DO NOTHING
        RETURNING id
        )
        SELECT id FROM ins
        UNION ALL
        SELECT id FROM urls WHERE url = $1
        LIMIT 1`, [url, domain, depth]);
    if (res.rows.length > 0) {
        return res.rows[0].id;
    }
    // If another transaction inserted the same URL while this statement was
    // running, DO NOTHING skips the insert but the statement's snapshot still
    // cannot see the new row, so both branches come back empty. A fresh
    // statement can see the committed row.
    const existing = await query(`SELECT id FROM urls WHERE url = $1`, [url]);
    if (existing.rows.length === 0) {
        throw new Error(`insertURL: could not insert or find URL: ${url}`);
    }
    return existing.rows[0].id;
}
76
/**
 * Records a directed edge between two URL ids; an existing edge is left as is.
 */
export async function insertLink(fromUrlId, toUrlId) {
    const edgeSql = `INSERT INTO links (from_url_id, to_url_id)
        VALUES ($1, $2)
        ON CONFLICT (from_url_id, to_url_id) DO NOTHING`;
    const edge = [fromUrlId, toUrlId];
    await query(edgeSql, edge);
}
84
/**
 * Crash recovery: moves every URL still marked FETCHING back to PENDING so
 * it can be claimed again.
 */
export async function resetStaleLocks() {
    const releaseSql = `UPDATE urls
        SET status = 'PENDING'
        WHERE status = 'FETCHING'`;
    await query(releaseSql);
}
93
/**
 * Removes PENDING URLs whose domain is not one of `allowedDomains`, so a new
 * session does not inherit queue entries from differently-scoped runs.
 * Does nothing when the list is empty. Logs the number of rows removed when
 * it is non-zero.
 */
export async function clearPendingURLs(allowedDomains) {
    if (allowedDomains.length > 0) {
        const purgeSql = `DELETE FROM urls
            WHERE status = 'PENDING'
            AND domain <> ALL($1::text[])`;
        const { rowCount } = await query(purgeSql, [allowedDomains]);
        const removed = rowCount ?? 0;
        if (removed > 0) {
            console.log(`[setup] Cleared ${removed} stale PENDING URL(s) outside allowed domains.`);
        }
    }
}
109
/**
 * Counts URLs per status across the whole table.
 *
 * @returns an object with pending, fetching, done and failed counts;
 *          statuses with no rows stay at 0.
 */
export async function getGlobalStats() {
    const { rows } = await query(`SELECT status, COUNT(*) as count
        FROM urls
        GROUP BY status`);
    const totals = { pending: 0, fetching: 0, done: 0, failed: 0 };
    for (const { status, count } of rows) {
        const tally = parseInt(count, 10);
        switch (status.toLowerCase()) {
            case "pending":
                totals.pending = tally;
                break;
            case "fetching":
                totals.fetching = tally;
                break;
            case "done":
                totals.done = tally;
                break;
            case "failed":
                totals.failed = tally;
                break;
            default:
                // Any other status is ignored.
                break;
        }
    }
    return totals;
}
131
/**
 * Rebuilds the per-domain counters in domain_stats from the urls table.
 * Creates the table first if it does not exist, then upserts one row per
 * domain currently present in urls.
 */
export async function refreshDomainStats() {
    const ensureTableSql = `
        CREATE TABLE IF NOT EXISTS domain_stats (
        domain TEXT PRIMARY KEY,
        pending_count INTEGER NOT NULL DEFAULT 0,
        fetching_count INTEGER NOT NULL DEFAULT 0,
        done_count INTEGER NOT NULL DEFAULT 0,
        failed_count INTEGER NOT NULL DEFAULT 0,
        last_crawled_at TIMESTAMPTZ
        )
        `;
    const upsertSql = `
        INSERT INTO domain_stats (domain, pending_count, fetching_count, done_count, failed_count, last_crawled_at)
        SELECT
        domain,
        COUNT(*) FILTER (WHERE status = 'PENDING') as pending_count,
        COUNT(*) FILTER (WHERE status = 'FETCHING') as fetching_count,
        COUNT(*) FILTER (WHERE status = 'DONE') as done_count,
        COUNT(*) FILTER (WHERE status = 'FAILED') as failed_count,
        MAX(fetched_at) as last_crawled_at
        FROM urls
        GROUP BY domain
        ON CONFLICT (domain) DO UPDATE SET
        pending_count = EXCLUDED.pending_count,
        fetching_count = EXCLUDED.fetching_count,
        done_count = EXCLUDED.done_count,
        failed_count = EXCLUDED.failed_count,
        last_crawled_at = EXCLUDED.last_crawled_at
        `;
    // The table must exist before the upsert runs, so these stay sequential.
    await query(ensureTableSql);
    await query(upsertSql);
}
164
/**
 * Reads domain_stats ordered by domain and converts each row: counters are
 * parsed as base-10 integers and last_crawled_at becomes a Date (or null).
 */
export async function getDomainStats() {
    const { rows } = await query(`SELECT domain, pending_count, fetching_count, done_count, failed_count, last_crawled_at
        FROM domain_stats
        ORDER BY domain ASC`);
    const asInt = (value) => parseInt(value, 10);
    const summaries = [];
    for (const row of rows) {
        summaries.push({
            domain: row.domain,
            pending_count: asInt(row.pending_count),
            fetching_count: asInt(row.fetching_count),
            done_count: asInt(row.done_count),
            failed_count: asInt(row.failed_count),
            last_crawled_at: row.last_crawled_at ? new Date(row.last_crawled_at) : null,
        });
    }
    return summaries;
}
@@ -0,0 +1,44 @@
1
+ import { query } from "../db/client.js";
2
+ import { config } from "../config.js";
3
/**
 * Lists the distinct domains that still have PENDING URLs.
 * When config.ALLOWED_DOMAINS is non-empty the filter is applied in SQL, so
 * rows outside the current session's scope are never returned.
 */
export async function getPendingDomains() {
    const allowed = config.ALLOWED_DOMAINS;
    const scoped = Boolean(allowed && allowed.length > 0);
    const res = scoped
        ? await query(`SELECT DISTINCT domain
            FROM urls
            WHERE status = 'PENDING'
            AND domain = ANY($1::text[])`, [allowed])
        : await query(`SELECT DISTINCT domain
            FROM urls
            WHERE status = 'PENDING'`);
    return res.rows.map(({ domain }) => domain);
}
23
/**
 * Returns a { domain: pendingCount } map, limited to config.ALLOWED_DOMAINS
 * when that list is non-empty.
 */
export async function getPendingCounts() {
    const allowed = config.ALLOWED_DOMAINS;
    let res;
    if (allowed && allowed.length > 0) {
        res = await query(`SELECT domain, COUNT(*) as count
            FROM urls
            WHERE status = 'PENDING'
            AND domain = ANY($1::text[])
            GROUP BY domain`, [allowed]);
    }
    else {
        res = await query(`SELECT domain, COUNT(*) as count
            FROM urls
            WHERE status = 'PENDING'
            GROUP BY domain`);
    }
    const counts = {};
    for (const { domain, count } of res.rows) {
        counts[domain] = parseInt(count, 10);
    }
    return counts;
}
@@ -0,0 +1,65 @@
1
+ import { getGlobalStats, refreshDomainStats, getDomainStats } from "../db/queries.js";
2
+ let loggerInterval = null;
3
+ let lastDoneAndFailedCount = 0;
4
+ let startTime = 0;
5
/** Prints one progress report block to stdout. */
function printProgressReport(crawlRate, globalStats, domainStats) {
    console.log(`\n=== Crawler Progress Report ===`);
    console.log(`Speed: ${crawlRate} pages/min`);
    console.log(`Global Status Breakdown:`);
    console.log(` PENDING : ${globalStats.pending}`);
    console.log(` FETCHING: ${globalStats.fetching}`);
    console.log(` DONE : ${globalStats.done}`);
    console.log(` FAILED : ${globalStats.failed}`);
    if (domainStats.length > 0) {
        console.log(`Domain Breakdown:`);
        domainStats.forEach((ds) => {
            const lastCrawledStr = ds.last_crawled_at ? ds.last_crawled_at.toISOString() : "never";
            console.log(` - ${ds.domain}: PENDING: ${ds.pending_count} | FETCHING: ${ds.fetching_count} | DONE: ${ds.done_count} | FAILED: ${ds.failed_count} (Last Crawled: ${lastCrawledStr})`);
        });
    }
    console.log(`================================\n`);
}
/**
 * Starts the periodic progress report (every `intervalMs`, default 5000).
 * Returns immediately if a logger is already running. The done+failed total
 * at start is stored as a baseline so the reported rate covers this run
 * only; if that lookup fails, the baseline is 0.
 */
export async function startProgressLogger(intervalMs = 5000) {
    if (loggerInterval) {
        return;
    }
    startTime = Date.now();
    try {
        const { done, failed } = await getGlobalStats();
        lastDoneAndFailedCount = done + failed;
    }
    catch (err) {
        lastDoneAndFailedCount = 0;
    }
    const tick = async () => {
        try {
            // Refresh domain_stats first so the per-domain read below is current.
            await refreshDomainStats();
            const globalStats = await getGlobalStats();
            const domainStats = await getDomainStats();
            // Rate = URLs finished (done or failed) since start, per elapsed minute.
            const finishedNow = globalStats.done + globalStats.failed;
            const finishedThisRun = finishedNow - lastDoneAndFailedCount;
            const elapsedMinutes = (Date.now() - startTime) / 60000;
            let crawlRate = "0.0";
            if (elapsedMinutes > 0) {
                crawlRate = (finishedThisRun / elapsedMinutes).toFixed(1);
            }
            printProgressReport(crawlRate, globalStats, domainStats);
        }
        catch (error) {
            console.error("Error generating crawler progress logs:", error);
        }
    };
    loggerInterval = setInterval(tick, intervalMs);
    // unref (where available) keeps the timer from holding the process open.
    if (loggerInterval && typeof loggerInterval.unref === "function") {
        loggerInterval.unref();
    }
}
57
/**
 * Cancels the periodic progress report, if one is running, and clears the
 * stored handle so the logger can be started again.
 */
export function stopProgressLogger() {
    if (!loggerInterval) {
        return;
    }
    clearInterval(loggerInterval);
    loggerInterval = null;
}
@@ -0,0 +1,46 @@
1
+ import { request } from "undici";
2
+ // @ts-ignore
3
+ import robotsParser from "robots-parser";
4
+ import { getDomain } from "../normalizer.js";
5
+ import { config } from "../config.js";
6
+ const robotsCache = new Map();
7
+ const fetchFailures = new Set();
8
/**
 * Checks whether `urlStr` may be crawled according to its domain's robots.txt.
 *
 * Parsed rules are cached per domain. A domain whose robots.txt answered with
 * a non-200 status is remembered and treated as "everything allowed" without
 * further requests. Network or request errors also default to allowed, but
 * are not remembered, so the next call tries again.
 *
 * @param urlStr - absolute URL to check.
 * @returns false when no domain can be derived from the URL or the rules
 *          disallow it for the "WebCrawler" agent; true otherwise.
 */
export async function isAllowedByRobots(urlStr) {
    const domain = getDomain(urlStr);
    if (!domain)
        return false;
    if (fetchFailures.has(domain)) {
        return true;
    }
    let parser = robotsCache.get(domain);
    if (!parser) {
        const robotsUrl = `https://${domain}/robots.txt`;
        try {
            const res = await request(robotsUrl, {
                method: "GET",
                headersTimeout: config.REQUEST_TIMEOUT_MS,
                bodyTimeout: config.REQUEST_TIMEOUT_MS,
            });
            if (res.statusCode !== 200) {
                // Remember the failure before draining, so it is recorded
                // even if the drain itself errors.
                fetchFailures.add(domain);
                // undici needs every response body consumed or discarded;
                // otherwise the connection stays tied up.
                await res.body.dump();
                return true;
            }
            const content = await res.body.text();
            parser = robotsParser(robotsUrl, content);
            robotsCache.set(domain, parser);
        }
        catch (e) {
            // On network/request errors, default to allowed but do not permanently cache failure
            return true;
        }
    }
    const isAllowed = parser.isAllowed(urlStr, "WebCrawler");
    // isAllowed is undefined when the parser gives no verdict; treat that as allowed.
    return isAllowed === undefined ? true : isAllowed;
}