messi-crawler 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73)
  1. package/README.md +201 -0
  2. package/dist/cli/renderer.js +71 -0
  3. package/dist/config.js +18 -0
  4. package/dist/db/clear.js +16 -0
  5. package/dist/db/client.js +20 -0
  6. package/dist/db/queries.js +179 -0
  7. package/dist/frontier/frontier.js +44 -0
  8. package/dist/frontier/logger.js +65 -0
  9. package/dist/frontier/robots.js +46 -0
  10. package/dist/frontier/scheduler.js +98 -0
  11. package/dist/index.js +533 -0
  12. package/dist/normalizer.js +33 -0
  13. package/dist/output/db-strategy.js +16 -0
  14. package/dist/output/index.js +23 -0
  15. package/dist/output/pdf-strategy.js +316 -0
  16. package/dist/output/strategy.js +1 -0
  17. package/dist/security/ssrf.js +45 -0
  18. package/dist/security/validate-url.js +41 -0
  19. package/dist/seed.js +14 -0
  20. package/dist/setup.js +148 -0
  21. package/dist/test/client.test.js +33 -0
  22. package/dist/test/downloader.test.js +84 -0
  23. package/dist/test/extractor.test.js +126 -0
  24. package/dist/test/frontier.test.js +43 -0
  25. package/dist/test/logger.test.js +55 -0
  26. package/dist/test/normalizer.test.js +36 -0
  27. package/dist/test/pdf-strategy.test.js +68 -0
  28. package/dist/test/queries.test.js +173 -0
  29. package/dist/test/robots.test.js +46 -0
  30. package/dist/test/scheduler.test.js +73 -0
  31. package/dist/test/seed.test.js +26 -0
  32. package/dist/test/worker.test.js +118 -0
  33. package/dist/worker/downloader.js +114 -0
  34. package/dist/worker/extractor.js +197 -0
  35. package/dist/worker/worker.js +87 -0
  36. package/package.json +48 -0
  37. package/seeds.txt +4 -0
  38. package/src/cli/renderer.ts +83 -0
  39. package/src/config.ts +22 -0
  40. package/src/db/clear.ts +16 -0
  41. package/src/db/client.ts +26 -0
  42. package/src/db/queries.ts +255 -0
  43. package/src/db/schema.sql +43 -0
  44. package/src/frontier/frontier.ts +60 -0
  45. package/src/frontier/logger.ts +75 -0
  46. package/src/frontier/robots.ts +50 -0
  47. package/src/frontier/scheduler.ts +119 -0
  48. package/src/index.ts +596 -0
  49. package/src/normalizer.ts +37 -0
  50. package/src/output/db-strategy.ts +20 -0
  51. package/src/output/index.ts +32 -0
  52. package/src/output/pdf-strategy.ts +388 -0
  53. package/src/output/strategy.ts +16 -0
  54. package/src/security/ssrf.ts +48 -0
  55. package/src/security/validate-url.ts +49 -0
  56. package/src/seed.ts +18 -0
  57. package/src/setup.ts +170 -0
  58. package/src/test/client.test.ts +38 -0
  59. package/src/test/downloader.test.ts +101 -0
  60. package/src/test/extractor.test.ts +139 -0
  61. package/src/test/frontier.test.ts +53 -0
  62. package/src/test/logger.test.ts +71 -0
  63. package/src/test/normalizer.test.ts +43 -0
  64. package/src/test/pdf-strategy.test.ts +84 -0
  65. package/src/test/queries.test.ts +247 -0
  66. package/src/test/robots.test.ts +56 -0
  67. package/src/test/scheduler.test.ts +90 -0
  68. package/src/test/seed.test.ts +35 -0
  69. package/src/test/worker.test.ts +144 -0
  70. package/src/worker/downloader.ts +149 -0
  71. package/src/worker/extractor.ts +235 -0
  72. package/src/worker/worker.ts +100 -0
  73. package/tsconfig.json +15 -0
@@ -0,0 +1,119 @@
1
+ import { config } from "../config.js";
2
+ import { claimNextURL } from "../db/queries.js";
3
+ import { getPendingDomains } from "./frontier.js";
4
+ import { processPage } from "../worker/worker.js";
5
+ import { startProgressLogger, stopProgressLogger } from "./logger.js";
6
+
7
// ---------------------------------------------------------------------------
// Scheduler state — module-scoped and shared by every function in this file.
// ---------------------------------------------------------------------------

/** True while the scheduler loop is active; also used as the re-entrancy guard. */
let isRunning = false;

/** Number of dispatched page jobs that have not settled yet. */
let activeWorkers = 0;

/** Index into the pending-domain list at which the next round-robin sweep starts. */
let lastDomainIndex = 0;

/** Pages dispatched in this session (in-memory counter, not cumulative DB total). */
let sessionPageCount = 0;

/** Politeness gate: domain -> epoch-millisecond timestamp before which it is skipped. */
const cooldowns = new Map<string, number>();
15
+
16
/**
 * Returns a promise that resolves (with no value) once a timer of `ms`
 * milliseconds has fired.
 *
 * @param ms - Delay passed to `setTimeout`, in milliseconds.
 * @returns A promise that resolves after the timer fires; it never rejects.
 */
export const sleep = (ms: number): Promise<void> =>
  new Promise<void>((resolve) => {
    setTimeout(resolve, ms);
  });
17
+
18
/**
 * Starts the round-robin scheduler loop.
 * Respects politeness delays per domain and concurrency limits.
 *
 * Each tick dispatches at most one page: the pending domains are scanned in
 * round-robin order starting at `lastDomainIndex`, domains still in cooldown
 * are skipped, and the first domain that yields a claimed URL gets a worker.
 *
 * The loop ends when one of the following holds:
 *  - `config.MAX_PAGES` is positive and that many pages have been dispatched
 *    in this session (in-flight workers are awaited before returning);
 *  - the frontier reports no pending domains and no worker is active;
 *  - `stopScheduler()` has cleared the running flag (in-flight workers are
 *    NOT awaited in this case).
 *
 * Calling this while a loop is already running is a no-op.
 *
 * @returns A promise that settles when the loop has ended. It rejects if
 *   starting the progress logger, fetching pending domains, or claiming a URL
 *   throws. Errors from individual page jobs are logged and do not reject.
 */
export async function startScheduler(): Promise<void> {
  if (isRunning) return;
  isRunning = true;
  sessionPageCount = 0;

  // try/finally guarantees the running flag is cleared and the progress logger
  // is stopped even when a frontier/DB call throws. Without it a single failure
  // would leave `isRunning` stuck at true and make every later start a no-op.
  try {
    // Start the periodic progress logger
    await startProgressLogger();

    while (isRunning) {
      // Enforce MAX_PAGES limit against this session's dispatched count
      if (config.MAX_PAGES > 0 && sessionPageCount >= config.MAX_PAGES) {
        // Wait for any in-flight workers to finish before stopping
        while (activeWorkers > 0) {
          await sleep(100);
        }
        console.log(`\n✓ Crawl complete — ${sessionPageCount} page(s) processed.\n`);
        break;
      }

      // 1. Enforce worker concurrency limit
      if (activeWorkers >= config.WORKER_COUNT) {
        await sleep(50);
        continue;
      }

      // 2. Fetch active pending domains from the frontier
      const domains = await getPendingDomains();
      if (domains.length === 0) {
        // Exit if there are no pending URLs and all workers are idle
        if (activeWorkers === 0) {
          break;
        }
        // Workers still running may enqueue more URLs; poll again shortly.
        await sleep(100);
        continue;
      }

      let claimed = false;
      // Snapshot taken once per tick so every domain is judged against the same instant.
      const now = Date.now();

      // 3. Round-robin traversal over domains
      for (let i = 0; i < domains.length; i++) {
        const idx = (lastDomainIndex + i) % domains.length;
        const domain = domains[idx];

        // Enforce politeness delay (domains never seen before have no entry -> 0)
        const nextAllowed = cooldowns.get(domain) ?? 0;
        if (now < nextAllowed) {
          continue;
        }

        // Try to atomically claim a URL for this domain
        const urlRow = await claimNextURL(domain);
        if (urlRow) {
          // Set the domain cooldown (fresh timestamp: the claim above was awaited)
          cooldowns.set(domain, Date.now() + config.CRAWL_DELAY_MS);

          // Update round-robin start index for the next tick
          lastDomainIndex = (idx + 1) % domains.length;

          // Dispatch worker; intentionally not awaited so the loop keeps scheduling.
          activeWorkers++;
          sessionPageCount++;
          void processPage(urlRow)
            .catch((err: unknown) => {
              console.error(`Error processing ${urlRow.url}:`, err);
            })
            .finally(() => {
              activeWorkers--;
            });

          claimed = true;
          break;
        }
      }

      // If no URL was claimed (e.g. all domains in cooldown or DB lock contention), sleep
      if (!claimed) {
        await sleep(50);
      }
    }
  } finally {
    isRunning = false;
    stopProgressLogger();
  }
}
106
+
107
/**
 * Requests that the scheduler stop.
 *
 * Clears the running flag, which the scheduler loop checks at the top of each
 * iteration, and stops the periodic progress logger right away. Page jobs that
 * are already in flight are not cancelled or awaited here.
 */
export function stopScheduler(): void {
  isRunning = false;
  stopProgressLogger();
}
111
+
112
/**
 * Reports how many dispatched page jobs have not settled yet.
 *
 * @returns The current value of the in-memory active-worker counter.
 */
export function getActiveWorkersCount(): number {
  return activeWorkers;
}
115
+
116
/**
 * Looks up the politeness cooldown recorded for a domain.
 *
 * @param domain - Domain key as stored in the cooldown map.
 * @returns The stored timestamp for the domain, or 0 when no entry exists
 *   (or the stored value is falsy).
 */
export function getCooldown(domain: string): number {
  const nextAllowedAt = cooldowns.get(domain);
  return nextAllowedAt ? nextAllowedAt : 0;
}
119
+