ef-dl 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,635 @@
1
+ import path from "path";
2
+ import fs from "fs";
3
+ import chalk from "chalk";
4
+ import { PromptType } from "../types/enums";
5
+ import { JUSTICE_GOV_SEARCH_URL } from "../types/constants";
6
+ import { TaskQueue } from "./task-queue.js";
7
+ import { WorkerPool } from "./worker-pool.js";
8
+ import {
9
+ closeBrowser,
10
+ fetchPageContent,
11
+ } from "../browserless/browser-client.js";
12
+ import { prompt } from "../utils/prompt";
13
+ import { logger } from "../utils/logger";
14
+ import type {
15
+ CoordinatorOptions,
16
+ CoordinatorResult,
17
+ PdfTask,
18
+ JusticeGovJson,
19
+ QueueProgress,
20
+ } from "./types.js";
21
+ import {
22
+ initProgressBars,
23
+ addJsonProgressTask,
24
+ addPdfProgressTask,
25
+ updateJsonProgress,
26
+ updatePdfProgress,
27
+ closeProgressBars,
28
+ } from "../utils/progress.js";
29
+
30
+ /**
31
+ * Coordinator (Producer)
32
+ *
33
+ * Manages the parallel download process:
34
+ * 1. Checks for existing queue (resume detection)
35
+ * 2. Discovers total pages from initial JSON fetch
36
+ * 3. Starts workers
37
+ * 4. Fetches JSON metadata and populates queue (streaming)
38
+ * 5. Waits for workers to complete
39
+ * 6. Shows summary and cleanup prompt
40
+ */
41
+ export class Coordinator {
42
/** Search phrase sent to the search endpoint; also used as the cache sub-folder name. */
private searchTerm: string;
/** Root directory handed to the task queue and the worker pool. */
private downloadDir: string;
/** Caller options merged with the defaults applied in the constructor. */
private options: CoordinatorOptions;
/** Task queue shared between this producer and the workers. */
private queue: TaskQueue;
/** Epoch milliseconds captured at construction; used to compute the run duration. */
private startTime: number;
/** Number of result pages covered by this run. */
private totalPages: number;
/** Number of PDFs expected for this run (drives the PDF progress bar total). */
private totalPdfs: number;
/** Handle of the 1-second PDF progress polling interval, or null when not polling. */
private progressTimer: ReturnType<typeof setInterval> | null;
/** Set once the queue has been deleted so later code does not touch it again. */
private queueDeleted: boolean;

/**
 * Create a coordinator for one search term.
 *
 * @param searchTerm - Search phrase to download results for.
 * @param downloadDir - Root directory for downloads and cache.
 * @param options - Optional overrides. Missing, `undefined` or `null` values
 *   fall back to: startPage 1, workers 5, fresh false, verbose false.
 */
constructor(
  searchTerm: string,
  downloadDir: string,
  options: CoordinatorOptions = {},
) {
  this.searchTerm = searchTerm;
  this.downloadDir = downloadDir;
  // Spread the caller's options first and then resolve each default with
  // `??`. Spreading the options over a defaults literal would let an
  // explicitly passed `undefined` (e.g. an unset CLI flag) erase the default.
  this.options = {
    ...options,
    startPage: options.startPage ?? 1,
    workers: options.workers ?? 5,
    fresh: options.fresh ?? false,
    verbose: options.verbose ?? false,
  };
  this.queue = new TaskQueue(downloadDir, searchTerm);
  this.startTime = Date.now();
  this.totalPages = 0;
  this.totalPdfs = 0;
  this.progressTimer = null;
  this.queueDeleted = false;
}
73
+
74
/**
 * Run the whole pipeline: resume check, totals discovery, worker start-up,
 * JSON producer loop, wait for workers, summary, cleanup prompt.
 *
 * @returns The summary of the run, or an all-zero result when the user
 *   aborts at the resume prompt.
 * @throws Re-throws any error raised by a phase after logging it.
 */
async run(): Promise<CoordinatorResult> {
  try {
    // Phase 1: Check for resume
    const resumeAction = await this.checkResume();

    if (resumeAction === "abort") {
      return {
        totalPages: 0,
        totalPdfs: 0,
        completedPdfs: 0,
        failedPdfs: 0,
        duration: 0,
        workersUsed: 0,
      };
    }

    // Phase 2: Discover totals
    await this.discoverTotals();

    // Phase 3: Start workers
    const workerPool = new WorkerPool(
      this.queue,
      this.options.workers!,
      this.searchTerm,
      this.downloadDir,
      {
        verbose: this.options.verbose,
        onProgress: (progress) => {
          // Failed downloads count as "done" for the progress bar.
          const total = this.totalPdfs || progress.total;
          const completed = progress.completed + progress.failed;
          updatePdfProgress("PDF Downloads", completed, total);
        },
      },
    );

    await workerPool.start();

    // Give workers time to initialize
    await sleep(1000);

    // Phase 4: Initialize progress bars
    this.initializeProgressBars();
    this.startPdfProgressPolling();

    // Phase 5: Producer loop (fetch JSONs)
    await this.producerLoop();

    // Phase 6: Signal completion and wait for workers
    this.queue.setMetadata("json_fetch_complete", "true");
    await workerPool.waitForCompletion();
    this.stopPdfProgressPolling();
    this.finalizePdfProgress();

    // Phase 7: Show summary
    const result = await this.showSummary();

    // Phase 8: Cleanup prompt
    await this.promptForCleanup(result);

    return result;
  } catch (error: unknown) {
    // Anything can be thrown; only read `.message` from real Error objects so
    // that logging never throws and hides the original failure.
    const message = error instanceof Error ? error.message : String(error);
    logger.error(chalk.red(`\nCoordinator error: ${message}`));
    throw error;
  } finally {
    // Runs on success, abort and failure alike.
    this.stopPdfProgressPolling();
    this.finalizePdfProgress();
    if (!this.queueDeleted) {
      this.queue.close();
    }
    closeProgressBars();
  }
}
149
+
150
/**
 * Decide how to treat a queue left over from an earlier run.
 *
 * A missing queue, the `fresh` option, or a queue with nothing completed and
 * nothing in progress all re-initialize the queue without asking. Otherwise
 * the stored progress is printed and the user chooses between resuming,
 * wiping the queue, or aborting.
 *
 * @returns "resume", "fresh" or "abort" depending on the outcome.
 */
private async checkResume(): Promise<"resume" | "fresh" | "abort"> {
  // Fresh start
  if (this.options.fresh || !this.queue.exists()) {
    this.queue.initialize();
    return "fresh";
  }

  const snapshot = this.queue.getProgress();

  // Empty queue, treat as fresh
  if (snapshot.completed === 0 && snapshot.inProgress === 0) {
    this.queue.initialize();
    return "fresh";
  }

  // Print what the previous run left behind.
  const divider = " ─────────────────────────────";
  logger.info(chalk.cyan("\n🔍 Found previous download:"));
  logger.info(chalk.white(` Search: ${this.searchTerm}`));
  logger.info(chalk.gray(divider));
  logger.info(chalk.green(` ✓ Completed: ${snapshot.completed} PDFs`));
  if (snapshot.inProgress > 0) {
    logger.info(chalk.yellow(` ⏳ In Progress: ${snapshot.inProgress} PDFs`));
  } else {
    logger.info(chalk.gray(` ⏳ In Progress: ${snapshot.inProgress} PDFs`));
  }
  logger.info(chalk.gray(` ⏸ Pending: ${snapshot.pending} PDFs`));
  if (snapshot.failed > 0) {
    logger.info(chalk.red(` ✗ Failed: ${snapshot.failed} PDFs`));
  }
  logger.info(chalk.gray(divider));
  logger.info(chalk.white(` Total: ${snapshot.total} PDFs`));
  logger.info("");

  const wantsResume = await prompt({
    type: PromptType.Confirm,
    message: "Resume where you left off?",
    default: true,
    cleanup: () => this.cleanupAfterPromptExit(),
  });

  if (wantsResume) {
    // Tasks that were mid-download go back to pending.
    this.queue.resetInProgress();
    logger.info(chalk.green("✓ Resuming previous download\n"));
    return "resume";
  }

  // Declined to resume: wiping progress needs a second confirmation.
  const wipeConfirmed = await prompt({
    type: PromptType.Confirm,
    message: "Start fresh? This will delete previous progress.",
    default: false,
    cleanup: () => this.cleanupAfterPromptExit(),
  });

  if (!wipeConfirmed) {
    logger.info(chalk.gray("Aborted."));
    return "abort";
  }

  this.queue.initialize();
  logger.info(chalk.green("✓ Starting fresh\n"));
  return "fresh";
}
218
+
219
/**
 * Determine `totalPages` and `totalPdfs` for this run and seed the queue
 * with the PDFs of the start page.
 *
 * When the start page is already in the queue (resume), totals come from the
 * queue and its metadata and nothing is fetched. Otherwise the start page is
 * fetched, its PDFs are inserted, and for multi-page runs the totals are
 * stored as queue metadata.
 *
 * @throws Error when the start page returns no JSON data.
 */
private async discoverTotals(): Promise<void> {
  // Results-per-page figure assumed by all page/PDF arithmetic below.
  const pageSize = 10;
  const startPage = this.options.startPage || 1;
  const endPage = this.options.endPage;

  // A run is "single page" when an end page is given and equals the start page.
  const isSinglePage = endPage !== undefined && startPage === endPage;

  if (isSinglePage) {
    logger.info(chalk.blue(`Fetching page ${startPage}...`));
  } else {
    logger.info(chalk.blue("Discovering total pages..."));
  }

  // Check if we already have the start page in queue (resume scenario)
  if (this.queue.hasPage(startPage)) {
    if (isSinglePage) {
      // NOTE: this is the total of the whole queue, not only of this page.
      this.totalPdfs = this.queue.getProgress().total;
      this.totalPages = 1;
    } else {
      // Get totals from metadata or calculate from queue
      const totalPagesStr = this.queue.getMetadata("total_pages");
      const totalPdfsStr = this.queue.getMetadata("total_pdfs");

      // Treat unparsable metadata like missing metadata, so that NaN never
      // reaches the progress bars or the producer loop.
      const storedPages = totalPagesStr ? parseInt(totalPagesStr, 10) : NaN;
      this.totalPages = Number.isFinite(storedPages)
        ? storedPages
        : Math.ceil(this.queue.getProgress().total / pageSize);

      const storedPdfs = totalPdfsStr ? parseInt(totalPdfsStr, 10) : NaN;
      if (Number.isFinite(storedPdfs)) {
        // Results that lie before the start page are not part of this run.
        const remaining = Math.max(
          0,
          storedPdfs - (startPage - 1) * pageSize,
        );
        this.totalPdfs = Math.min(this.totalPages * pageSize, remaining);
      } else {
        this.totalPdfs = this.totalPages * pageSize;
      }
    }

    logger.info(
      chalk.green(` ✓ Found ${this.totalPdfs} PDFs (from queue)\n`),
    );
    return;
  }

  // Fetch the start page
  const jsonDir = path.join(
    this.downloadDir,
    "cache",
    this.searchTerm,
    "json",
  );

  const { jsonData } = await fetchPageContent(
    `${JUSTICE_GOV_SEARCH_URL}?keys=${encodeURIComponent(
      this.searchTerm,
    )}&page=${startPage}`,
    {
      saveJson: true,
      jsonOutputDir: jsonDir,
    },
  );

  if (!jsonData) {
    throw new Error(`Failed to fetch page ${startPage}`);
  }

  const data = jsonData as JusticeGovJson;

  if (isSinglePage) {
    // Single page: only count PDFs on this page
    const pdfs = this.extractPdfsFromJson(data, startPage);
    this.totalPdfs = pdfs.length;
    this.totalPages = 1;

    // Insert PDFs from this page only
    this.queue.insertPdfs(pdfs);

    // Report the queue size after insertion as a sanity check
    const progress = this.queue.getProgress();
    logger.info(
      chalk.green(
        ` ✓ Found ${this.totalPdfs} PDFs on page ${startPage} (queue: ${progress.total} total)\n`,
      ),
    );
  } else {
    // Full or range download: discover total pages, then calculate pages to fetch
    const totalPdfsOverall = data.hits?.total?.value || 0;
    const totalPagesOverall = Math.ceil(totalPdfsOverall / pageSize);
    // A requested end page beyond the last available page is clamped.
    const effectiveEndPage = endPage
      ? Math.min(endPage, totalPagesOverall)
      : totalPagesOverall;
    this.totalPages = Math.max(0, effectiveEndPage - startPage + 1);
    const remaining = Math.max(
      0,
      totalPdfsOverall - (startPage - 1) * pageSize,
    );
    this.totalPdfs = Math.min(this.totalPages * pageSize, remaining);

    // Store in metadata
    this.queue.setMetadata("total_pages", String(this.totalPages));
    this.queue.setMetadata("total_pdfs", String(totalPdfsOverall));
    this.queue.setMetadata("start_time", String(Date.now()));

    // Insert PDFs from the start page into the queue
    const pdfs = this.extractPdfsFromJson(data, startPage);
    this.queue.insertPdfs(pdfs);

    logger.info(
      chalk.green(
        ` ✓ Found ${this.totalPdfs} PDFs across ${this.totalPages} pages\n`,
      ),
    );
  }
}
338
+
339
/**
 * Create the progress display and register its two bars: one sized by the
 * number of pages, one sized by the number of expected PDFs.
 */
private initializeProgressBars(): void {
  const { totalPages, totalPdfs } = this;

  initProgressBars();
  addJsonProgressTask("JSON Metadata", totalPages);
  addPdfProgressTask("PDF Downloads", totalPdfs);
}
347
+
348
/**
 * Cleanup hook passed to every prompt: stops progress polling, closes the
 * progress bars and closes the browser. A rejected browser shutdown is
 * deliberately ignored.
 */
private async cleanupAfterPromptExit(): Promise<void> {
  this.stopPdfProgressPolling();
  closeProgressBars();

  const shutdown = closeBrowser();
  try {
    await shutdown;
  } catch {
    // Best effort: a failed browser shutdown must not block the exit path.
  }
}
353
+
354
/**
 * Start refreshing the PDF progress bar from the queue once per second.
 * Does nothing when polling is already active. The bar is refreshed once
 * immediately before the interval is installed.
 */
private startPdfProgressPolling(): void {
  if (this.progressTimer) return;

  const refresh = (): void => {
    const { completed, failed, total } = this.queue.getProgress();
    // Failed downloads count as finished; fall back to the queue total when
    // no expected total is known yet.
    updatePdfProgress(
      "PDF Downloads",
      completed + failed,
      this.totalPdfs || total,
    );
  };

  refresh();
  this.progressTimer = setInterval(refresh, 1000);
}
369
+
370
/**
 * Cancel the progress polling interval, if one is running, and forget its
 * handle. Safe to call repeatedly.
 */
private stopPdfProgressPolling(): void {
  if (this.progressTimer) {
    clearInterval(this.progressTimer);
    this.progressTimer = null;
  }
}
378
+
379
/**
 * Snap the PDF progress bar to "all done" using the number of tasks that
 * actually finished (completed + failed), and store that number as
 * `totalPdfs`. Skipped when the queue was deleted or nothing finished.
 */
private finalizePdfProgress(): void {
  if (this.queueDeleted) return;

  const { completed, failed } = this.queue.getProgress();
  const settled = completed + failed;

  if (settled !== 0) {
    this.totalPdfs = settled;
    updatePdfProgress("PDF Downloads", settled, settled);
  }
}
392
+
393
/**
 * Producer loop: fetch the JSON of every page in range and insert the PDFs
 * it lists into the queue.
 *
 * Pages already present in the queue are skipped without a network request.
 * A page that fails (no data or a thrown error) is logged and counted, and
 * the loop carries on with the next page. The one-second rate-limit delay is
 * applied after every network attempt, successful or not.
 */
private async producerLoop(): Promise<void> {
  logger.info(chalk.blue("Fetching JSON metadata...\n"));

  const startPage = this.options.startPage || 1;
  // NOTE(review): an explicit endPage is used as-is here and is not clamped
  // to the page count found by discoverTotals — confirm whether it should be.
  const endPage = this.options.endPage
    ? this.options.endPage
    : startPage + this.totalPages - 1;
  const jsonDir = path.join(
    this.downloadDir,
    "cache",
    this.searchTerm,
    "json",
  );
  let processedPages = 0;
  let failedPages = 0;

  for (let page = startPage; page <= endPage; page++) {
    // Skip if page already in queue (resume); no request, so no delay needed.
    if (this.queue.hasPage(page)) {
      if (this.options.verbose) {
        logger.info(chalk.gray(` Page ${page}: Already in queue`));
      }
      processedPages++;
      updateJsonProgress("JSON Metadata", processedPages, this.totalPages);
      continue;
    }

    try {
      // Fetch JSON
      const { jsonData } = await fetchPageContent(
        `${JUSTICE_GOV_SEARCH_URL}?keys=${encodeURIComponent(
          this.searchTerm,
        )}&page=${page}`,
        {
          saveJson: true,
          jsonOutputDir: jsonDir,
        },
      );

      if (!jsonData) {
        failedPages++;
        logger.error(chalk.red(` Page ${page}: Failed to fetch JSON`));
      } else {
        // Extract and insert PDFs
        const data = jsonData as JusticeGovJson;
        const pdfs = this.extractPdfsFromJson(data, page);
        this.queue.insertPdfs(pdfs);

        if (this.options.verbose) {
          logger.info(
            chalk.gray(` Page ${page}: ${pdfs.length} PDFs added to queue`),
          );
        }

        // Update progress
        processedPages++;
        updateJsonProgress("JSON Metadata", processedPages, this.totalPages);
      }
    } catch (error: unknown) {
      failedPages++;
      const message = error instanceof Error ? error.message : String(error);
      logger.error(chalk.red(` Page ${page}: Error - ${message}`));
    }

    // Rate limiting: also after a failed attempt, so a run of failing pages
    // does not turn into back-to-back requests.
    if (page < endPage) {
      await sleep(1000);
    }
  }

  if (failedPages === 0) {
    logger.info(chalk.green("\n✓ All JSON metadata fetched\n"));
  } else {
    // Do not claim success when some pages could not be fetched.
    logger.info(
      chalk.yellow(
        `\n⚠ JSON metadata fetched with ${failedPages} failed page(s)\n`,
      ),
    );
  }
}
465
+
466
/**
 * Turn the hits of one search-result page into queue tasks.
 *
 * Hits without both a file name and a file URI are dropped. Every task id
 * combines the search term, page number, file name and one timestamp taken
 * per call.
 *
 * @param data - Parsed JSON of one result page.
 * @param pageNumber - Page the JSON was fetched from; stored on each task.
 * @returns One task per usable hit, in hit order.
 */
private extractPdfsFromJson(
  data: JusticeGovJson,
  pageNumber: number,
): PdfTask[] {
  const stamp = Date.now();
  const tasks: PdfTask[] = [];

  for (const hit of data.hits?.hits || []) {
    const doc = hit._source;
    if (!doc?.ORIGIN_FILE_NAME || !doc?.ORIGIN_FILE_URI) {
      continue;
    }

    const taskId = `${this.searchTerm}_${pageNumber}_${doc.ORIGIN_FILE_NAME}_${stamp}`;
    tasks.push({
      id: taskId,
      searchTerm: this.searchTerm,
      pageNumber,
      pdfName: doc.ORIGIN_FILE_NAME,
      pdfUrl: doc.ORIGIN_FILE_URI,
      fileSize: doc.fileSize || 0,
    });
  }

  return tasks;
}
493
+
494
/**
 * Print the boxed end-of-run summary and return the same figures as a
 * result object.
 *
 * The JSON section prints `totalPages` as both total and downloaded and
 * always prints 0 failed pages. The PDF section reads completed and failed
 * counts from the queue. The failed row is red when at least one PDF failed.
 *
 * @returns Totals, completed/failed counts, elapsed milliseconds since
 *   construction, and the configured worker count.
 */
private async showSummary(): Promise<CoordinatorResult> {
  const duration = Date.now() - this.startTime;
  const progress = this.queue.getProgress();

  const blankRow = "║ ║";
  const pdfFailedRow =
    progress.failed > 0
      ? chalk.red(`║ ✗ Failed: ${progress.failed.toString().padEnd(36)} ║`)
      : chalk.white(`║ ✗ Failed: ${(0).toString().padEnd(36)} ║`);

  // Every row is rendered up front, then emitted in order.
  const rows: string[] = [
    chalk.white("\n╔══════════════════════════════════════════════════╗"),
    chalk.white("║ SUMMARY ║"),
    chalk.white("╠══════════════════════════════════════════════════╣"),
    chalk.white("║ JSON Metadata ║"),
    chalk.white(`║ Total Pages: ${this.totalPages.toString().padEnd(33)} ║`),
    chalk.white(`║ ✓ Downloaded: ${this.totalPages.toString().padEnd(32)} ║`),
    chalk.white(`║ ✗ Failed: ${(0).toString().padEnd(36)} ║`),
    chalk.white(blankRow),
    chalk.white("║ PDF Downloads ║"),
    chalk.white(`║ Total PDFs: ${this.totalPdfs.toString().padEnd(34)} ║`),
    chalk.white(`║ ✓ Downloaded: ${progress.completed.toString().padEnd(32)} ║`),
    pdfFailedRow,
    chalk.white(
      `║ Workers Used: ${this.options.workers!.toString().padEnd(32)} ║`,
    ),
    chalk.white(blankRow),
    chalk.white("║ Performance ║"),
    chalk.white(`║ Duration: ${this.formatDuration(duration).padEnd(35)} ║`),
    chalk.white(
      `║ Average: ${this.formatSpeed(duration, progress.completed).padEnd(36)} ║`,
    ),
    chalk.white("╚══════════════════════════════════════════════════╝"),
    "",
  ];

  for (const row of rows) {
    logger.info(row);
  }

  return {
    totalPages: this.totalPages,
    totalPdfs: this.totalPdfs,
    completedPdfs: progress.completed,
    failedPdfs: progress.failed,
    duration,
    workersUsed: this.options.workers!,
  };
}
578
+
579
/**
 * Ask whether the cache should be removed and, if confirmed, delete the
 * queue and mark it as deleted so later code does not touch it.
 *
 * The prompt defaults to "yes" only when no PDF failed, so an incomplete run
 * keeps its cache for a later resume unless the user opts in.
 *
 * @param result - Summary of the finished run; only `failedPdfs` is read.
 */
private async promptForCleanup(result: CoordinatorResult): Promise<void> {
  const allSuccessful = result.failedPdfs === 0;

  const shouldCleanup = await prompt({
    type: PromptType.Confirm,
    message: "Clean up cache folder? (removes JSON files and queue database)",
    default: allSuccessful, // Yes if all successful, No if incomplete
    cleanup: () => this.cleanupAfterPromptExit(),
  });

  if (shouldCleanup) {
    // Stop polling first: the poller reads from the queue being deleted.
    this.stopPdfProgressPolling();
    this.queue.delete();
    this.queueDeleted = true;
    logger.info(chalk.green("✓ Cache cleaned up"));
  } else {
    logger.info(chalk.gray("Cache preserved for potential resume"));
  }
}
602
+
603
/**
 * Render a millisecond span as "Hh Mm Ss", "Mm Ss" or "Ss", using the
 * largest unit that is greater than zero as the leading one.
 *
 * @param ms - Duration in milliseconds.
 * @returns The formatted duration, truncated to whole seconds.
 */
private formatDuration(ms: number): string {
  const wholeSeconds = Math.floor(ms / 1000);
  const wholeMinutes = Math.floor(wholeSeconds / 60);
  const wholeHours = Math.floor(wholeMinutes / 60);

  const secondsPart = `${wholeSeconds % 60}s`;

  if (wholeHours > 0) {
    return `${wholeHours}h ${wholeMinutes % 60}m ${secondsPart}`;
  }
  if (wholeMinutes > 0) {
    return `${wholeMinutes}m ${secondsPart}`;
  }
  return `${wholeSeconds}s`;
}
619
+
620
/**
 * Render the average download rate as "<rate> PDFs/second" with one decimal.
 *
 * @param duration - Elapsed time in milliseconds.
 * @param completed - Number of PDFs completed in that time.
 * @returns The formatted rate; "0.0 PDFs/second" when no time has elapsed.
 */
private formatSpeed(duration: number, completed: number): string {
  const seconds = duration / 1000;
  // Dividing by a zero (or NaN) duration would print "NaN" or "Infinity".
  const rate = seconds > 0 ? completed / seconds : 0;
  return `${rate.toFixed(1)} PDFs/second`;
}
628
+ }
629
+
630
/**
 * Pause helper: returns a promise that resolves (with no value) once the
 * given number of milliseconds has elapsed.
 */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((done) => {
    setTimeout(done, ms);
  });
}
@@ -0,0 +1,40 @@
1
+ /**
2
+ * Parallel Download Module
3
+ *
4
+ * Producer-Consumer Pipeline for parallel PDF downloading.
5
+ *
6
+ * Usage:
7
+ * import { Coordinator } from "./src/workers/index.js";
8
+ *
9
+ * const coordinator = new Coordinator("search term", "./downloads", {
10
+ * workers: 5,
11
+ * fresh: false,
12
+ * verbose: false
13
+ * });
14
+ *
15
+ * await coordinator.run();
16
+ */
17
+
18
+ // Main classes
19
+ export { Coordinator } from "./coordinator.js";
20
+ export { WorkerPool } from "./worker-pool.js";
21
+ export { TaskQueue } from "./task-queue.js";
22
+
23
+ // Worker function
24
+ export { runWorker } from "./worker.js";
25
+
26
+ // Types
27
+ export type {
28
+ PdfTask,
29
+ PdfTaskRecord,
30
+ TaskStatus,
31
+ QueueProgress,
32
+ CoordinatorOptions,
33
+ CoordinatorResult,
34
+ WorkerPoolOptions,
35
+ WorkerPoolResult,
36
+ WorkerOptions,
37
+ WorkerResult,
38
+ MetadataKey,
39
+ JusticeGovJson,
40
+ } from "./types.js";