messi-crawler 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +201 -0
- package/dist/cli/renderer.js +71 -0
- package/dist/config.js +18 -0
- package/dist/db/clear.js +16 -0
- package/dist/db/client.js +20 -0
- package/dist/db/queries.js +179 -0
- package/dist/frontier/frontier.js +44 -0
- package/dist/frontier/logger.js +65 -0
- package/dist/frontier/robots.js +46 -0
- package/dist/frontier/scheduler.js +98 -0
- package/dist/index.js +533 -0
- package/dist/normalizer.js +33 -0
- package/dist/output/db-strategy.js +16 -0
- package/dist/output/index.js +23 -0
- package/dist/output/pdf-strategy.js +316 -0
- package/dist/output/strategy.js +1 -0
- package/dist/security/ssrf.js +45 -0
- package/dist/security/validate-url.js +41 -0
- package/dist/seed.js +14 -0
- package/dist/setup.js +148 -0
- package/dist/test/client.test.js +33 -0
- package/dist/test/downloader.test.js +84 -0
- package/dist/test/extractor.test.js +126 -0
- package/dist/test/frontier.test.js +43 -0
- package/dist/test/logger.test.js +55 -0
- package/dist/test/normalizer.test.js +36 -0
- package/dist/test/pdf-strategy.test.js +68 -0
- package/dist/test/queries.test.js +173 -0
- package/dist/test/robots.test.js +46 -0
- package/dist/test/scheduler.test.js +73 -0
- package/dist/test/seed.test.js +26 -0
- package/dist/test/worker.test.js +118 -0
- package/dist/worker/downloader.js +114 -0
- package/dist/worker/extractor.js +197 -0
- package/dist/worker/worker.js +87 -0
- package/package.json +48 -0
- package/seeds.txt +4 -0
- package/src/cli/renderer.ts +83 -0
- package/src/config.ts +22 -0
- package/src/db/clear.ts +16 -0
- package/src/db/client.ts +26 -0
- package/src/db/queries.ts +255 -0
- package/src/db/schema.sql +43 -0
- package/src/frontier/frontier.ts +60 -0
- package/src/frontier/logger.ts +75 -0
- package/src/frontier/robots.ts +50 -0
- package/src/frontier/scheduler.ts +119 -0
- package/src/index.ts +596 -0
- package/src/normalizer.ts +37 -0
- package/src/output/db-strategy.ts +20 -0
- package/src/output/index.ts +32 -0
- package/src/output/pdf-strategy.ts +388 -0
- package/src/output/strategy.ts +16 -0
- package/src/security/ssrf.ts +48 -0
- package/src/security/validate-url.ts +49 -0
- package/src/seed.ts +18 -0
- package/src/setup.ts +170 -0
- package/src/test/client.test.ts +38 -0
- package/src/test/downloader.test.ts +101 -0
- package/src/test/extractor.test.ts +139 -0
- package/src/test/frontier.test.ts +53 -0
- package/src/test/logger.test.ts +71 -0
- package/src/test/normalizer.test.ts +43 -0
- package/src/test/pdf-strategy.test.ts +84 -0
- package/src/test/queries.test.ts +247 -0
- package/src/test/robots.test.ts +56 -0
- package/src/test/scheduler.test.ts +90 -0
- package/src/test/seed.test.ts +35 -0
- package/src/test/worker.test.ts +144 -0
- package/src/worker/downloader.ts +149 -0
- package/src/worker/extractor.ts +235 -0
- package/src/worker/worker.ts +100 -0
- package/tsconfig.json +15 -0
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
import { config } from "../config.js";
|
|
2
|
+
import { claimNextURL } from "../db/queries.js";
|
|
3
|
+
import { getPendingDomains } from "./frontier.js";
|
|
4
|
+
import { processPage } from "../worker/worker.js";
|
|
5
|
+
import { startProgressLogger, stopProgressLogger } from "./logger.js";
|
|
6
|
+
|
|
7
|
+
// Per-domain politeness state: domain -> epoch-ms timestamp before which the scheduler will not claim another URL for that domain.
const cooldowns = new Map<string, number>();
|
|
8
|
+
|
|
9
|
+
// Number of processPage() calls currently in flight; incremented on dispatch and decremented when the call settles.
let activeWorkers = 0;
|
|
10
|
+
// Offset into the pending-domain list at which the next round-robin scan begins.
let lastDomainIndex = 0;
|
|
11
|
+
// True while the scheduler loop is active; stopScheduler() clears it to make the loop exit.
let isRunning = false;
|
|
12
|
+
|
|
13
|
+
// Pages dispatched in this session (in-memory counter, not cumulative DB total)
|
|
14
|
+
// Reset to 0 by each startScheduler() call, bumped once per dispatched page, and compared against config.MAX_PAGES.
let sessionPageCount = 0;
|
|
15
|
+
|
|
16
|
+
/**
 * Returns a promise that settles (with no value) once `ms` milliseconds have
 * elapsed on a timer.
 */
export const sleep = (ms: number) =>
  new Promise((done) => {
    setTimeout(done, ms);
  });
|
|
17
|
+
|
|
18
|
+
/**
 * Starts the round-robin scheduler loop.
 * Respects politeness delays per domain and concurrency limits.
 *
 * A call made while the scheduler is already running returns immediately.
 * The loop exits when one of the following happens:
 *  - `config.MAX_PAGES` is positive and that many pages were dispatched in this
 *    session (in-flight workers are awaited first);
 *  - the frontier reports no pending domains and no worker is in flight;
 *  - `stopScheduler()` clears the running flag.
 *
 * @returns A promise that resolves once the loop has exited. If the progress
 *   logger, the frontier lookup or the claim query throws, the promise rejects
 *   with that error, but only after the running flag has been reset and the
 *   progress logger stopped, so the scheduler can be started again.
 */
export async function startScheduler(): Promise<void> {
  if (isRunning) return;
  isRunning = true;
  sessionPageCount = 0;

  try {
    // Start the periodic progress logger
    await startProgressLogger();

    while (isRunning) {
      // Enforce MAX_PAGES limit against this session's dispatched count
      if (config.MAX_PAGES > 0 && sessionPageCount >= config.MAX_PAGES) {
        // Wait for any in-flight workers to finish before stopping
        while (activeWorkers > 0) {
          await sleep(100);
        }
        console.log(`\n✓ Crawl complete — ${sessionPageCount} page(s) processed.\n`);
        break;
      }

      // 1. Enforce worker concurrency limit
      if (activeWorkers >= config.WORKER_COUNT) {
        await sleep(50);
        continue;
      }

      // 2. Fetch active pending domains from the frontier
      const domains = await getPendingDomains();
      if (domains.length === 0) {
        // Exit if there are no pending URLs and all workers are idle
        if (activeWorkers === 0) {
          break;
        }
        // Workers still running may enqueue new URLs; poll again shortly
        await sleep(100);
        continue;
      }

      let claimed = false;
      const now = Date.now();

      // 3. Round-robin traversal over domains
      for (let i = 0; i < domains.length; i++) {
        const idx = (lastDomainIndex + i) % domains.length;
        const domain = domains[idx];

        // Enforce politeness delay (a domain with no entry has no cooldown)
        const nextAllowed = cooldowns.get(domain) ?? 0;
        if (now < nextAllowed) {
          continue;
        }

        // Try to atomically claim a URL for this domain
        const urlRow = await claimNextURL(domain);
        if (urlRow) {
          // Set the domain cooldown
          cooldowns.set(domain, Date.now() + config.CRAWL_DELAY_MS);

          // Update round-robin start index for the next tick
          lastDomainIndex = (idx + 1) % domains.length;

          // Dispatch worker without awaiting it; the chain below handles
          // rejection and always releases the concurrency slot.
          activeWorkers++;
          sessionPageCount++;
          void processPage(urlRow)
            .catch((err: unknown) => {
              console.error(`Error processing ${urlRow.url}:`, err);
            })
            .finally(() => {
              activeWorkers--;
            });

          claimed = true;
          break;
        }
      }

      // If no URL was claimed (e.g. all domains in cooldown or DB lock contention), sleep
      if (!claimed) {
        await sleep(50);
      }
    }
  } finally {
    // Runs on normal exit and on error alike: without this, a thrown error
    // would leave isRunning stuck at true (blocking every later start) and
    // the progress logger still active.
    isRunning = false;
    stopProgressLogger();
  }
}
|
|
106
|
+
|
|
107
|
+
/**
 * Asks the scheduler to stop and shuts down the progress logger.
 *
 * This only clears the running flag and calls `stopProgressLogger()`; it does
 * not wait for, or cancel, pages that were already dispatched.
 */
export function stopScheduler(): void {
  isRunning = false;
  stopProgressLogger();
}
|
|
111
|
+
|
|
112
|
+
/** Reports how many dispatched pages have not yet settled. */
export function getActiveWorkersCount(): number {
  const inFlight = activeWorkers;
  return inFlight;
}
|
|
115
|
+
|
|
116
|
+
/**
 * Looks up the cooldown recorded for a domain.
 *
 * @param domain - Domain key as stored in the cooldown map.
 * @returns The stored epoch-ms timestamp, or 0 when nothing (or a falsy value)
 *   is recorded for the domain.
 */
export function getCooldown(domain: string): number {
  const nextAllowedAt = cooldowns.get(domain);
  return nextAllowedAt ? nextAllowedAt : 0;
}
|
|
119
|
+
|