ef-dl 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +521 -0
- package/fonts/Sub-Zero.flf +629 -0
- package/index.ts +866 -0
- package/package.json +65 -0
- package/src/browserless/browser-client.ts +307 -0
- package/src/browserless/challenger.ts +352 -0
- package/src/browserless/helpers.ts +171 -0
- package/src/types/browserless.d.ts +31 -0
- package/src/types/constants.ts +3 -0
- package/src/types/enums.ts +5 -0
- package/src/utils/ascii.ts +66 -0
- package/src/utils/helpers.ts +260 -0
- package/src/utils/logger.ts +42 -0
- package/src/utils/progress.ts +130 -0
- package/src/utils/prompt.ts +87 -0
- package/src/workers/coordinator.ts +635 -0
- package/src/workers/index.ts +40 -0
- package/src/workers/task-queue.ts +388 -0
- package/src/workers/types.ts +135 -0
- package/src/workers/worker-pool.ts +227 -0
- package/src/workers/worker.ts +290 -0
|
@@ -0,0 +1,635 @@
|
|
|
1
|
+
import path from "path";
|
|
2
|
+
import fs from "fs";
|
|
3
|
+
import chalk from "chalk";
|
|
4
|
+
import { PromptType } from "../types/enums";
|
|
5
|
+
import { JUSTICE_GOV_SEARCH_URL } from "../types/constants";
|
|
6
|
+
import { TaskQueue } from "./task-queue.js";
|
|
7
|
+
import { WorkerPool } from "./worker-pool.js";
|
|
8
|
+
import {
|
|
9
|
+
closeBrowser,
|
|
10
|
+
fetchPageContent,
|
|
11
|
+
} from "../browserless/browser-client.js";
|
|
12
|
+
import { prompt } from "../utils/prompt";
|
|
13
|
+
import { logger } from "../utils/logger";
|
|
14
|
+
import type {
|
|
15
|
+
CoordinatorOptions,
|
|
16
|
+
CoordinatorResult,
|
|
17
|
+
PdfTask,
|
|
18
|
+
JusticeGovJson,
|
|
19
|
+
QueueProgress,
|
|
20
|
+
} from "./types.js";
|
|
21
|
+
import {
|
|
22
|
+
initProgressBars,
|
|
23
|
+
addJsonProgressTask,
|
|
24
|
+
addPdfProgressTask,
|
|
25
|
+
updateJsonProgress,
|
|
26
|
+
updatePdfProgress,
|
|
27
|
+
closeProgressBars,
|
|
28
|
+
} from "../utils/progress.js";
|
|
29
|
+
|
|
30
|
+
/**
 * Coordinator (Producer)
 *
 * Manages the parallel download process:
 * 1. Checks for existing queue (resume detection)
 * 2. Discovers total pages from initial JSON fetch
 * 3. Starts workers
 * 4. Fetches JSON metadata and populates queue (streaming)
 * 5. Waits for workers to complete
 * 6. Shows summary and cleanup prompt
 */
export class Coordinator {
  /** Number of search hits assumed per result page when converting hit counts to page counts. */
  private static readonly RESULTS_PER_PAGE = 10;

  private searchTerm: string;
  private downloadDir: string;
  private options: CoordinatorOptions;
  private queue: TaskQueue;
  private startTime: number;
  private totalPages: number;
  private totalPdfs: number;
  private progressTimer: ReturnType<typeof setInterval> | null;
  private queueDeleted: boolean;
  /** Pages whose JSON fetch failed (no data returned or an error thrown) in the producer loop. */
  private failedPages: number;
  /** Last result page known to exist for this search, or null when that is unknown. */
  private lastAvailablePage: number | null;

  /**
   * @param searchTerm  Search keys sent to the search endpoint; also names the cache folder.
   * @param downloadDir Root directory that receives the downloads and the cache.
   * @param options     Optional overrides; defaults are startPage 1, 5 workers, fresh false, verbose false.
   */
  constructor(
    searchTerm: string,
    downloadDir: string,
    options: CoordinatorOptions = {},
  ) {
    this.searchTerm = searchTerm;
    this.downloadDir = downloadDir;
    this.options = {
      startPage: 1,
      workers: 5,
      fresh: false,
      verbose: false,
      ...options,
    };
    this.queue = new TaskQueue(downloadDir, searchTerm);
    this.startTime = Date.now();
    this.totalPages = 0;
    this.totalPdfs = 0;
    this.progressTimer = null;
    this.queueDeleted = false;
    this.failedPages = 0;
    this.lastAvailablePage = null;
  }

  /**
   * Main run method: executes every phase in order and returns the final tallies.
   *
   * @returns The summary of the run; an all-zero result when the user aborts at the resume prompt.
   * @throws Re-throws any error raised by a phase after logging it.
   */
  async run(): Promise<CoordinatorResult> {
    try {
      // Phase 1: Check for resume
      const resumeAction = await this.checkResume();

      if (resumeAction === "abort") {
        return {
          totalPages: 0,
          totalPdfs: 0,
          completedPdfs: 0,
          failedPdfs: 0,
          duration: 0,
          workersUsed: 0,
        };
      }

      // Phase 2: Discover totals
      await this.discoverTotals();

      // Phase 3: Start workers
      const workerPool = new WorkerPool(
        this.queue,
        this.options.workers!,
        this.searchTerm,
        this.downloadDir,
        {
          verbose: this.options.verbose,
          onProgress: (progress) => {
            const total = this.totalPdfs || progress.total;
            // Failed tasks count as "done" so the bar can still reach 100%.
            const completed = progress.completed + progress.failed;
            updatePdfProgress("PDF Downloads", completed, total);
          },
        },
      );

      await workerPool.start();

      // Give workers time to initialize
      await sleep(1000);

      // Phase 4: Initialize progress bars
      this.initializeProgressBars();
      this.startPdfProgressPolling();

      // Phase 5: Producer loop (fetch JSONs)
      await this.producerLoop();

      // Phase 6: Signal completion and wait for workers
      this.queue.setMetadata("json_fetch_complete", "true");
      await workerPool.waitForCompletion();
      this.stopPdfProgressPolling();
      this.finalizePdfProgress();

      // Phase 7: Show summary
      const result = await this.showSummary();

      // Phase 8: Cleanup prompt
      await this.promptForCleanup(result);

      return result;
    } catch (error: unknown) {
      logger.error(
        chalk.red(`\nCoordinator error: ${Coordinator.describeError(error)}`),
      );
      throw error;
    } finally {
      this.stopPdfProgressPolling();
      this.finalizePdfProgress();
      // The queue must not be touched again once the cleanup prompt deleted it.
      if (!this.queueDeleted) {
        this.queue.close();
      }
      closeProgressBars();
    }
  }

  /**
   * Check for existing queue and handle resume.
   *
   * @returns "fresh" when the queue was (re)initialized, "resume" when the user
   *          continues a previous run, "abort" when the user declines both.
   */
  private async checkResume(): Promise<"resume" | "fresh" | "abort"> {
    if (this.options.fresh || !this.queue.exists()) {
      // Fresh start
      this.queue.initialize();
      return "fresh";
    }

    // Check progress
    const progress = this.queue.getProgress();

    if (progress.completed === 0 && progress.inProgress === 0) {
      // Empty queue, treat as fresh
      this.queue.initialize();
      return "fresh";
    }

    // Show resume prompt
    logger.info(chalk.cyan("\n🔍 Found previous download:"));
    logger.info(chalk.white(` Search: ${this.searchTerm}`));
    logger.info(chalk.gray(" ─────────────────────────────"));
    logger.info(chalk.green(` ✓ Completed: ${progress.completed} PDFs`));
    logger.info(
      progress.inProgress > 0
        ? chalk.yellow(` ⏳ In Progress: ${progress.inProgress} PDFs`)
        : chalk.gray(` ⏳ In Progress: ${progress.inProgress} PDFs`),
    );
    logger.info(chalk.gray(` ⏸ Pending: ${progress.pending} PDFs`));
    if (progress.failed > 0) {
      logger.info(chalk.red(` ✗ Failed: ${progress.failed} PDFs`));
    }
    logger.info(chalk.gray(" ─────────────────────────────"));
    logger.info(chalk.white(` Total: ${progress.total} PDFs`));
    logger.info("");

    const shouldResume = await prompt({
      type: PromptType.Confirm,
      message: "Resume where you left off?",
      default: true,
      cleanup: () => this.cleanupAfterPromptExit(),
    });

    if (shouldResume) {
      // Reset in-progress tasks back to pending
      this.queue.resetInProgress();
      logger.info(chalk.green("✓ Resuming previous download\n"));
      return "resume";
    }

    // Declined to resume: starting fresh is destructive, so ask for confirmation.
    const confirmFresh = await prompt({
      type: PromptType.Confirm,
      message: "Start fresh? This will delete previous progress.",
      default: false,
      cleanup: () => this.cleanupAfterPromptExit(),
    });

    if (!confirmFresh) {
      logger.info(chalk.gray("Aborted."));
      return "abort";
    }

    this.queue.initialize();
    logger.info(chalk.green("✓ Starting fresh\n"));
    return "fresh";
  }

  /**
   * Discover total pages and PDFs, either from the existing queue (resume) or
   * by fetching the start page. Sets `totalPages` and `totalPdfs`, and queues
   * the PDFs of the start page when it had to be fetched.
   *
   * @throws Error when the start page returns no JSON data.
   */
  private async discoverTotals(): Promise<void> {
    const perPage = Coordinator.RESULTS_PER_PAGE;
    const startPage = this.options.startPage || 1;
    const endPage = this.options.endPage;

    // Check if this is a single page or range download
    const isSinglePage = endPage !== undefined && startPage === endPage;

    if (isSinglePage) {
      logger.info(chalk.blue(`Fetching page ${startPage}...`));
    } else {
      logger.info(chalk.blue("Discovering total pages..."));
    }

    // Check if we already have the start page in queue (resume scenario)
    if (this.queue.hasPage(startPage)) {
      if (isSinglePage) {
        // For single page, just count the PDFs in that page
        const progress = this.queue.getProgress();
        this.totalPdfs = progress.total;
        this.totalPages = 1;
      } else {
        // Get totals from metadata or calculate from queue
        const totalPagesStr = this.queue.getMetadata("total_pages");
        const totalPdfsStr = this.queue.getMetadata("total_pdfs");
        if (totalPagesStr) {
          this.totalPages = parseInt(totalPagesStr, 10);
        } else {
          const progress = this.queue.getProgress();
          this.totalPages = Math.ceil(progress.total / perPage);
        }
        if (totalPdfsStr) {
          const totalPdfsOverall = parseInt(totalPdfsStr, 10);
          this.rememberLastAvailablePage(totalPdfsOverall);
          const remaining = Math.max(
            0,
            totalPdfsOverall - (startPage - 1) * perPage,
          );
          this.totalPdfs = Math.min(this.totalPages * perPage, remaining);
        } else {
          this.totalPdfs = this.totalPages * perPage;
        }
      }

      logger.info(
        chalk.green(` ✓ Found ${this.totalPdfs} PDFs (from queue)\n`),
      );
      return;
    }

    // Fetch the start page
    const jsonDir = path.join(
      this.downloadDir,
      "cache",
      this.searchTerm,
      "json",
    );

    const { jsonData } = await fetchPageContent(
      `${JUSTICE_GOV_SEARCH_URL}?keys=${encodeURIComponent(
        this.searchTerm,
      )}&page=${startPage}`,
      {
        saveJson: true,
        jsonOutputDir: jsonDir,
      },
    );

    if (!jsonData) {
      throw new Error(`Failed to fetch page ${startPage}`);
    }

    const data = jsonData as JusticeGovJson;

    if (isSinglePage) {
      // Single page: only count PDFs on this page
      const pdfs = this.extractPdfsFromJson(data, startPage);
      this.totalPdfs = pdfs.length;
      this.totalPages = 1;

      // Insert PDFs from this page only
      this.queue.insertPdfs(pdfs);

      // Verify insertion
      const progress = this.queue.getProgress();
      logger.info(
        chalk.green(
          ` ✓ Found ${this.totalPdfs} PDFs on page ${startPage} (queue: ${progress.total} total)\n`,
        ),
      );
    } else {
      // Full or range download: discover total pages, then calculate pages to fetch
      const totalPdfsOverall = data.hits?.total?.value || 0;
      const totalPagesOverall = Math.ceil(totalPdfsOverall / perPage);
      this.rememberLastAvailablePage(totalPdfsOverall);
      const effectiveEndPage = endPage
        ? Math.min(endPage, totalPagesOverall)
        : totalPagesOverall;
      this.totalPages = Math.max(0, effectiveEndPage - startPage + 1);
      const remaining = Math.max(
        0,
        totalPdfsOverall - (startPage - 1) * perPage,
      );
      this.totalPdfs = Math.min(this.totalPages * perPage, remaining);

      // Store in metadata
      this.queue.setMetadata("total_pages", String(this.totalPages));
      this.queue.setMetadata("total_pdfs", String(totalPdfsOverall));
      this.queue.setMetadata("start_time", String(Date.now()));

      // Insert PDFs from the start page into queue
      const pdfs = this.extractPdfsFromJson(data, startPage);
      this.queue.insertPdfs(pdfs);

      logger.info(
        chalk.green(
          ` ✓ Found ${this.totalPdfs} PDFs across ${this.totalPages} pages\n`,
        ),
      );
    }
  }

  /**
   * Record the last result page implied by the overall hit count so the
   * producer loop never requests pages past the end of the results.
   * A missing, zero or unparsable count leaves the limit unknown (null).
   */
  private rememberLastAvailablePage(totalPdfsOverall: number): void {
    const pages = Math.ceil(totalPdfsOverall / Coordinator.RESULTS_PER_PAGE);
    this.lastAvailablePage = Number.isFinite(pages) && pages > 0 ? pages : null;
  }

  /**
   * Initialize progress bars for the JSON and PDF phases.
   */
  private initializeProgressBars(): void {
    initProgressBars();
    addJsonProgressTask("JSON Metadata", this.totalPages);
    addPdfProgressTask("PDF Downloads", this.totalPdfs);
  }

  /**
   * Cleanup hook handed to every prompt: stops polling, closes the progress
   * bars and closes the browser (best effort; a close failure is ignored).
   */
  private async cleanupAfterPromptExit(): Promise<void> {
    this.stopPdfProgressPolling();
    closeProgressBars();
    await closeBrowser().catch(() => {});
  }

  /**
   * Start refreshing the PDF progress bar from the queue once per second.
   * Does nothing when polling is already running.
   */
  private startPdfProgressPolling(): void {
    if (this.progressTimer) {
      return;
    }

    const update = () => {
      const progress = this.queue.getProgress();
      const total = this.totalPdfs || progress.total;
      const completed = progress.completed + progress.failed;
      updatePdfProgress("PDF Downloads", completed, total);
    };

    // Paint once immediately instead of waiting for the first tick.
    update();
    this.progressTimer = setInterval(update, 1000);
  }

  /**
   * Stop the progress polling timer, if one is running.
   */
  private stopPdfProgressPolling(): void {
    if (!this.progressTimer) {
      return;
    }

    clearInterval(this.progressTimer);
    this.progressTimer = null;
  }

  /**
   * Snap the PDF bar to 100% using the number of finished (completed + failed)
   * tasks as the total. Skipped once the queue is deleted or when nothing finished.
   */
  private finalizePdfProgress(): void {
    if (this.queueDeleted) {
      return;
    }
    const progress = this.queue.getProgress();
    const total = progress.completed + progress.failed;
    if (total === 0) {
      return;
    }

    this.totalPdfs = total;
    updatePdfProgress("PDF Downloads", total, total);
  }

  /**
   * Producer loop: fetch JSONs and populate queue.
   *
   * Pages already present in the queue are skipped. A page that fails to
   * fetch is logged, counted in `failedPages`, and does not stop the loop.
   */
  private async producerLoop(): Promise<void> {
    logger.info(chalk.blue("Fetching JSON metadata...\n"));

    const startPage = this.options.startPage || 1;
    const requestedEndPage = this.options.endPage
      ? this.options.endPage
      : startPage + this.totalPages - 1;
    // Never walk past the last page the search is known to have; an oversized
    // endPage would otherwise trigger one request per non-existent page.
    const endPage =
      this.lastAvailablePage !== null
        ? Math.min(requestedEndPage, this.lastAvailablePage)
        : requestedEndPage;
    const jsonDir = path.join(
      this.downloadDir,
      "cache",
      this.searchTerm,
      "json",
    );
    let processedPages = 0;
    this.failedPages = 0;

    for (let page = startPage; page <= endPage; page++) {
      // Skip if page already in queue (resume)
      if (this.queue.hasPage(page)) {
        if (this.options.verbose) {
          logger.info(chalk.gray(` Page ${page}: Already in queue`));
        }
        processedPages++;
        updateJsonProgress("JSON Metadata", processedPages, this.totalPages);
        continue;
      }

      try {
        // Fetch JSON
        const { jsonData } = await fetchPageContent(
          `${JUSTICE_GOV_SEARCH_URL}?keys=${encodeURIComponent(
            this.searchTerm,
          )}&page=${page}`,
          {
            saveJson: true,
            jsonOutputDir: jsonDir,
          },
        );

        if (!jsonData) {
          this.failedPages++;
          logger.error(chalk.red(` Page ${page}: Failed to fetch JSON`));
        } else {
          // Extract and insert PDFs
          const data = jsonData as JusticeGovJson;
          const pdfs = this.extractPdfsFromJson(data, page);
          this.queue.insertPdfs(pdfs);

          if (this.options.verbose) {
            logger.info(
              chalk.gray(` Page ${page}: ${pdfs.length} PDFs added to queue`),
            );
          }

          // Update progress
          processedPages++;
          updateJsonProgress("JSON Metadata", processedPages, this.totalPages);
        }
      } catch (error: unknown) {
        this.failedPages++;
        logger.error(
          chalk.red(
            ` Page ${page}: Error - ${Coordinator.describeError(error)}`,
          ),
        );
      }

      // Rate limiting: pause after every network attempt, including failed
      // ones, so a run of errors does not hit the server back to back.
      if (page < endPage) {
        await sleep(1000);
      }
    }

    if (this.failedPages === 0) {
      logger.info(chalk.green("\n✓ All JSON metadata fetched\n"));
    } else {
      logger.info(
        chalk.yellow(
          `\n⚠ JSON metadata fetched with ${this.failedPages} failed page(s)\n`,
        ),
      );
    }
  }

  /**
   * Extract PDFs from JSON data.
   *
   * @param data       Parsed search response for one page.
   * @param pageNumber Page the response belongs to; stored on every task.
   * @returns One task per hit that carries both a file name and a file URI;
   *          hits missing either field are ignored.
   */
  private extractPdfsFromJson(
    data: JusticeGovJson,
    pageNumber: number,
  ): PdfTask[] {
    const pdfs: PdfTask[] = [];
    const hits = data.hits?.hits || [];
    // One timestamp per call, shared by every id generated for this page.
    const timestamp = Date.now();

    for (const hit of hits) {
      const source = hit._source;
      if (source?.ORIGIN_FILE_NAME && source?.ORIGIN_FILE_URI) {
        pdfs.push({
          id: `${this.searchTerm}_${pageNumber}_${source.ORIGIN_FILE_NAME}_${timestamp}`,
          searchTerm: this.searchTerm,
          pageNumber,
          pdfName: source.ORIGIN_FILE_NAME,
          pdfUrl: source.ORIGIN_FILE_URI,
          fileSize: source.fileSize || 0,
        });
      }
    }

    return pdfs;
  }

  /**
   * Show final summary and build the result object returned by `run()`.
   */
  private async showSummary(): Promise<CoordinatorResult> {
    const duration = Date.now() - this.startTime;
    const progress = this.queue.getProgress();
    // Report what the producer loop actually achieved instead of assuming success.
    const pagesDownloaded = Math.max(0, this.totalPages - this.failedPages);
    const paintPageFailures = this.failedPages > 0 ? chalk.red : chalk.white;

    logger.info(
      chalk.white("\n╔══════════════════════════════════════════════════╗"),
    );
    logger.info(
      chalk.white("║ SUMMARY ║"),
    );
    logger.info(
      chalk.white("╠══════════════════════════════════════════════════╣"),
    );
    logger.info(
      chalk.white("║ JSON Metadata ║"),
    );
    logger.info(
      chalk.white(
        `║ Total Pages: ${this.totalPages.toString().padEnd(33)} ║`,
      ),
    );
    logger.info(
      chalk.white(
        `║ ✓ Downloaded: ${pagesDownloaded.toString().padEnd(32)} ║`,
      ),
    );
    logger.info(
      paintPageFailures(
        `║ ✗ Failed: ${this.failedPages.toString().padEnd(36)} ║`,
      ),
    );
    logger.info(
      chalk.white("║ ║"),
    );
    logger.info(
      chalk.white("║ PDF Downloads ║"),
    );
    logger.info(
      chalk.white(`║ Total PDFs: ${this.totalPdfs.toString().padEnd(34)} ║`),
    );
    logger.info(
      chalk.white(
        `║ ✓ Downloaded: ${progress.completed.toString().padEnd(32)} ║`,
      ),
    );
    logger.info(
      progress.failed > 0
        ? chalk.red(`║ ✗ Failed: ${progress.failed.toString().padEnd(36)} ║`)
        : chalk.white(`║ ✗ Failed: ${(0).toString().padEnd(36)} ║`),
    );
    logger.info(
      chalk.white(
        `║ Workers Used: ${this.options.workers!.toString().padEnd(32)} ║`,
      ),
    );
    logger.info(
      chalk.white("║ ║"),
    );
    logger.info(
      chalk.white("║ Performance ║"),
    );
    logger.info(
      chalk.white(
        `║ Duration: ${this.formatDuration(duration).padEnd(35)} ║`,
      ),
    );
    logger.info(
      chalk.white(
        `║ Average: ${this.formatSpeed(duration, progress.completed).padEnd(36)} ║`,
      ),
    );
    logger.info(
      chalk.white("╚══════════════════════════════════════════════════╝"),
    );
    logger.info("");

    return {
      totalPages: this.totalPages,
      totalPdfs: this.totalPdfs,
      completedPdfs: progress.completed,
      failedPdfs: progress.failed,
      duration,
      workersUsed: this.options.workers!,
    };
  }

  /**
   * Prompt for cleanup of the cache; deletes the queue when confirmed.
   *
   * @param result Run summary; the prompt defaults to "yes" only when no PDF failed.
   */
  private async promptForCleanup(result: CoordinatorResult): Promise<void> {
    const allSuccessful = result.failedPdfs === 0;

    const shouldCleanup = await prompt({
      type: PromptType.Confirm,
      message: "Clean up cache folder? (removes JSON files and queue database)",
      default: allSuccessful, // Yes if all successful, No if incomplete
      cleanup: () => this.cleanupAfterPromptExit(),
    });

    if (shouldCleanup) {
      this.stopPdfProgressPolling();
      this.queue.delete();
      // Tells run()'s finally block not to use or close the deleted queue.
      this.queueDeleted = true;
      logger.info(chalk.green("✓ Cache cleaned up"));
    } else {
      logger.info(chalk.gray("Cache preserved for potential resume"));
    }
  }

  /**
   * Format duration for display.
   *
   * @param ms Elapsed time in milliseconds.
   * @returns "Xh Ym Zs", "Ym Zs" or "Zs", using the largest non-zero unit first.
   */
  private formatDuration(ms: number): string {
    const seconds = Math.floor(ms / 1000);
    const minutes = Math.floor(seconds / 60);
    const hours = Math.floor(minutes / 60);

    if (hours > 0) {
      return `${hours}h ${minutes % 60}m ${seconds % 60}s`;
    } else if (minutes > 0) {
      return `${minutes}m ${seconds % 60}s`;
    } else {
      return `${seconds}s`;
    }
  }

  /**
   * Format speed for display.
   *
   * @param duration  Elapsed time in milliseconds.
   * @param completed Number of PDFs completed in that time.
   * @returns The rate with one decimal, e.g. "2.5 PDFs/second"; a zero (or
   *          negative) duration yields "0.0 PDFs/second" instead of NaN/Infinity.
   */
  private formatSpeed(duration: number, completed: number): string {
    const seconds = duration / 1000;
    const rate = seconds > 0 ? completed / seconds : 0;
    return `${rate.toFixed(1)} PDFs/second`;
  }

  /**
   * Turn a caught value into a printable message without assuming it is an Error.
   */
  private static describeError(error: unknown): string {
    if (error instanceof Error) {
      return error.message;
    }
    if (typeof error === "object" && error !== null && "message" in error) {
      return String((error as { message: unknown }).message);
    }
    return String(error);
  }
}
|
|
629
|
+
|
|
630
|
+
/**
 * Pause helper: returns a promise that settles (with no value) once the
 * timer for `ms` milliseconds fires.
 */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((done) => {
    setTimeout(done, ms);
  });
}
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Parallel Download Module
|
|
3
|
+
*
|
|
4
|
+
* Producer-Consumer Pipeline for parallel PDF downloading.
|
|
5
|
+
*
|
|
6
|
+
* Usage:
|
|
7
|
+
* import { Coordinator } from "./src/workers/index.js";
|
|
8
|
+
*
|
|
9
|
+
* const coordinator = new Coordinator("search term", "./downloads", {
|
|
10
|
+
* workers: 5,
|
|
11
|
+
* fresh: false,
|
|
12
|
+
* verbose: false
|
|
13
|
+
* });
|
|
14
|
+
*
|
|
15
|
+
* await coordinator.run();
|
|
16
|
+
*/
|
|
17
|
+
|
|
18
|
+
// Main classes
|
|
19
|
+
export { Coordinator } from "./coordinator.js";
|
|
20
|
+
export { WorkerPool } from "./worker-pool.js";
|
|
21
|
+
export { TaskQueue } from "./task-queue.js";
|
|
22
|
+
|
|
23
|
+
// Worker function
|
|
24
|
+
export { runWorker } from "./worker.js";
|
|
25
|
+
|
|
26
|
+
// Types
|
|
27
|
+
export type {
|
|
28
|
+
PdfTask,
|
|
29
|
+
PdfTaskRecord,
|
|
30
|
+
TaskStatus,
|
|
31
|
+
QueueProgress,
|
|
32
|
+
CoordinatorOptions,
|
|
33
|
+
CoordinatorResult,
|
|
34
|
+
WorkerPoolOptions,
|
|
35
|
+
WorkerPoolResult,
|
|
36
|
+
WorkerOptions,
|
|
37
|
+
WorkerResult,
|
|
38
|
+
MetadataKey,
|
|
39
|
+
JusticeGovJson,
|
|
40
|
+
} from "./types.js";
|