ef-dl 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +521 -0
- package/fonts/Sub-Zero.flf +629 -0
- package/index.ts +866 -0
- package/package.json +65 -0
- package/src/browserless/browser-client.ts +307 -0
- package/src/browserless/challenger.ts +352 -0
- package/src/browserless/helpers.ts +171 -0
- package/src/types/browserless.d.ts +31 -0
- package/src/types/constants.ts +3 -0
- package/src/types/enums.ts +5 -0
- package/src/utils/ascii.ts +66 -0
- package/src/utils/helpers.ts +260 -0
- package/src/utils/logger.ts +42 -0
- package/src/utils/progress.ts +130 -0
- package/src/utils/prompt.ts +87 -0
- package/src/workers/coordinator.ts +635 -0
- package/src/workers/index.ts +40 -0
- package/src/workers/task-queue.ts +388 -0
- package/src/workers/types.ts +135 -0
- package/src/workers/worker-pool.ts +227 -0
- package/src/workers/worker.ts +290 -0
package/index.ts
ADDED
|
@@ -0,0 +1,866 @@
|
|
|
1
|
+
#!/usr/bin/env bun
|
|
2
|
+
/**
|
|
3
|
+
* Epstein Files Downloader CLI
|
|
4
|
+
*
|
|
5
|
+
* An educational tool for downloading documents from the US DOJ Epstein Files portal.
|
|
6
|
+
* This CLI automates the process of searching, downloading metadata (JSON), and
|
|
7
|
+
* downloading PDF files with support for pagination, prefixes, and deduplication.
|
|
8
|
+
*
|
|
9
|
+
* @module index
|
|
10
|
+
* @version 1.0.0
|
|
11
|
+
* @license MIT
|
|
12
|
+
*/
|
|
13
|
+
|
|
14
|
+
// Guard: abort immediately with installation instructions when the process is
// not running under Bun (Bun is the runtime that populates process.versions.bun).
const isRunningOnBun = Boolean(process.versions.bun);
if (isRunningOnBun === false) {
  const bunRequiredMessage =
    "\n❌ Error: This package requires Bun runtime and will not work with Node.js.\n" +
    " The package uses Bun-specific APIs (bun:sqlite) for performance.\n\n" +
    " Install Bun: https://bun.sh\n" +
    " Then run: bunx ef-dl\n";
  console.error(bunRequiredMessage);
  process.exit(1);
}
|
|
24
|
+
|
|
25
|
+
// ============================================================================
|
|
26
|
+
// SECTION 1: IMPORTS
|
|
27
|
+
// ============================================================================
|
|
28
|
+
|
|
29
|
+
import { Command } from "commander";
|
|
30
|
+
import chalk from "chalk";
|
|
31
|
+
import fs from "fs";
|
|
32
|
+
import path from "path";
|
|
33
|
+
import { PromptType } from "./src/types/enums";
|
|
34
|
+
import { downloadPdf, closeBrowser } from "./src/browserless/browser-client";
|
|
35
|
+
import {
|
|
36
|
+
installConsoleBridge,
|
|
37
|
+
logger,
|
|
38
|
+
setVerboseMode,
|
|
39
|
+
} from "./src/utils/logger";
|
|
40
|
+
import { prompt } from "./src/utils/prompt";
|
|
41
|
+
import {
|
|
42
|
+
initProgressBars,
|
|
43
|
+
addJsonProgressTask,
|
|
44
|
+
addPdfProgressTask,
|
|
45
|
+
updateJsonProgress,
|
|
46
|
+
updatePdfProgress,
|
|
47
|
+
markTaskDone,
|
|
48
|
+
closeProgressBars,
|
|
49
|
+
} from "./src/utils/progress.js";
|
|
50
|
+
import {
|
|
51
|
+
cleanupAfterPromptExit,
|
|
52
|
+
fetchSearchResults,
|
|
53
|
+
findExistingPdfFile,
|
|
54
|
+
promptForCleanup,
|
|
55
|
+
showConfiguration,
|
|
56
|
+
showDisclaimerAndVerifyAge,
|
|
57
|
+
showDownloadSummary,
|
|
58
|
+
showHeader,
|
|
59
|
+
type SearchResult,
|
|
60
|
+
} from "./src/utils/helpers";
|
|
61
|
+
import { Coordinator } from "./src/workers/index.js";
|
|
62
|
+
|
|
63
|
+
// ============================================================================
|
|
64
|
+
// SECTION 2: CONSTANTS & CONFIGURATION
|
|
65
|
+
// ============================================================================
|
|
66
|
+
|
|
67
|
+
/**
 * Application version, read from the package.json that sits next to this file.
 *
 * The manifest is resolved against this module's own URL instead of the
 * current working directory, so the CLI reports its own version regardless of
 * the directory it is launched from (a cwd-relative "./package.json" would be
 * missing, or belong to an unrelated project, when run elsewhere).
 */
const packageJson = JSON.parse(
  fs.readFileSync(new URL("./package.json", import.meta.url), "utf-8"),
);
const VERSION = packageJson.version;

/** True only when the USE_DEFAULT_DIR environment variable is exactly "true". */
const USE_DEFAULT_DIR = process.env.USE_DEFAULT_DIR === "true";

/** Directory used when no download directory is supplied and USE_DEFAULT_DIR is enabled. */
const DEFAULT_DOWNLOAD_DIR = "./downloads";

/** Commander.js program instance */
const program = new Command();

// Bridge installed once at module load; its behaviour is defined in ./src/utils/logger.
installConsoleBridge();
|
|
77
|
+
|
|
78
|
+
// ============================================================================
|
|
79
|
+
// SECTION 3: PDF DOWNLOAD LOGIC
|
|
80
|
+
// ============================================================================
|
|
81
|
+
|
|
82
|
+
/**
 * Downloads the PDFs listed in one page of search results, skipping files that
 * are already present.
 *
 * For each hit:
 * - hits without `_source`, a file name, or a file URL are counted as failures;
 * - `findExistingPdfFile` is asked whether the file already exists; if so it is
 *   renamed when it reports `needsRename`, otherwise skipped, and counted as a success;
 * - otherwise the file is fetched with `downloadPdf`.
 *
 * A failed download or rename is logged with its reason and counted as a
 * failure; it never aborts the remaining files. A 500 ms pause follows each
 * download attempt except the last one. Hits that are skipped, renamed, or
 * rejected as invalid do not pause.
 *
 * @param jsonData - Search results containing PDF metadata
 * @param searchTerm - The search query (used as a directory name)
 * @param baseDirectory - Base download directory; files go to {baseDirectory}/{searchTerm}/pdfs/
 * @param pageNumber - Current page number (not read by this function; kept for interface compatibility)
 * @param prefix - Prefix prepended to each filename as `{prefix}-{name}`; no prefix when empty/undefined
 * @param onProgress - Optional callback invoked once per hit with (1-based index, total)
 * @returns Success and failure counts
 */
async function downloadPdfsFromJson(
  jsonData: SearchResult,
  searchTerm: string,
  baseDirectory: string,
  pageNumber: number,
  prefix: string | undefined,
  onProgress?: (current: number, total: number) => void,
): Promise<{ successCount: number; failCount: number }> {
  const pdfs = jsonData.hits?.hits || [];
  const totalPdfs = pdfs.length;

  if (totalPdfs === 0) {
    return { successCount: 0, failCount: 0 };
  }

  // Create PDF directory: {baseDirectory}/{searchTerm}/pdfs/
  const pdfOutputDir = path.join(baseDirectory, searchTerm, "pdfs");
  if (!fs.existsSync(pdfOutputDir)) {
    fs.mkdirSync(pdfOutputDir, { recursive: true });
  }

  let successCount = 0;
  let failCount = 0;

  // Reports the 1-based position of the hit that was just handled.
  const reportProgress = (index: number): void => {
    if (onProgress) onProgress(index + 1, totalPdfs);
  };

  for (let i = 0; i < totalPdfs; i++) {
    const source = pdfs[i]?._source;
    const fileName = source?.ORIGIN_FILE_NAME;
    const fileUrl = source?.ORIGIN_FILE_URI;

    // Malformed hit: nothing to download.
    if (!source || !fileName || !fileUrl) {
      failCount++;
      reportProgress(i);
      continue;
    }

    const fileSize = source.fileSize;

    try {
      // Determine target filename with prefix
      const targetFileName = prefix ? `${prefix}-${fileName}` : fileName;
      const targetFilePath = path.join(pdfOutputDir, targetFileName);

      // Check if file already exists (by name and size)
      const existingFile = findExistingPdfFile(
        fileName,
        pdfOutputDir,
        fileSize,
        prefix,
      );

      if (existingFile) {
        if (existingFile.needsRename) {
          // Rename to match current prefix preference
          fs.renameSync(existingFile.filePath, targetFilePath);
          console.log(
            chalk.gray(
              ` Renamed: ${path.basename(existingFile.filePath)} → ${targetFileName}`,
            ),
          );
        } else {
          console.log(
            chalk.gray(` Skipping (already exists): ${targetFileName}`),
          );
        }
        successCount++;
        reportProgress(i);
        // No network request was made, so no politeness delay is needed.
        continue;
      }

      // File doesn't exist or size mismatch, download it
      await downloadPdf(fileUrl, pdfOutputDir, fileName, prefix);
      successCount++;
    } catch (error: unknown) {
      failCount++;
      // Surface the reason instead of silently counting the failure.
      const reason = error instanceof Error ? error.message : String(error);
      console.error(chalk.red(` Failed: ${fileName} (${reason})`));
    }

    // Update progress after each PDF
    reportProgress(i);

    // Small delay between downloads to be respectful to the server
    if (i < totalPdfs - 1) {
      await new Promise((resolve) => setTimeout(resolve, 500));
    }
  }

  return { successCount, failCount };
}
|
|
193
|
+
|
|
194
|
+
// ============================================================================
|
|
195
|
+
// SECTION 4: TERMINAL USER INTERFACE
|
|
196
|
+
// ============================================================================
|
|
197
|
+
|
|
198
|
+
// ============================================================================
|
|
199
|
+
// SECTION 8: DOWNLOAD WORKFLOWS
|
|
200
|
+
// ============================================================================
|
|
201
|
+
|
|
202
|
+
/**
 * Sequential workflow that downloads every results page from `startPage`
 * through the last page.
 *
 * Phase 1 fetches the JSON metadata for each page (the start page is fetched
 * once and reused both for the totals and as the first page of data). A page
 * whose fetch fails is logged and replaced by an empty result so page numbering
 * stays aligned. Phase 2 downloads the PDFs page by page via
 * `downloadPdfsFromJson`, driving the "JSON Pages" / "PDF Downloads" progress
 * tasks, and finally prints the download summary.
 *
 * @param searchTerm - The search query
 * @param baseDirectory - Base download directory
 * @param startPage - First page to download
 * @param options - `prefix`: custom filename prefix (falls back to the page
 *   number when empty); `verbose` is accepted but not read here
 */
async function downloadAllPagesWorkflow(
  searchTerm: string,
  baseDirectory: string,
  startPage: number,
  options: { prefix?: string; verbose: boolean },
): Promise<void> {
  const describeError = (error: unknown): string =>
    error instanceof Error ? error.message : String(error);
  const pause = (ms: number): Promise<void> =>
    new Promise((resolve) => setTimeout(resolve, ms));

  // Fetch first page to get total results count
  const { jsonData: firstPageData } = await fetchSearchResults(
    searchTerm,
    startPage,
    baseDirectory,
  );

  const totalResults = firstPageData.hits?.total?.value || 0;
  // NOTE(review): assumes the search endpoint returns 10 hits per page — confirm.
  const resultsPerPage = 10;
  const totalPages = Math.ceil(totalResults / resultsPerPage);
  const endPage = totalPages;

  console.log(chalk.cyan(`\nDownload Mode: All Pages`));
  console.log(chalk.cyan(` Total Results: ${totalResults}`));
  console.log(chalk.cyan(` Total pages: ${totalPages}`));
  console.log(chalk.cyan(` Starting from page: ${startPage}`));
  console.log(chalk.gray(`\nFetching all JSON data first to count PDFs...\n`));

  // First pass: collect all JSON data and count PDFs
  const allJsonData: SearchResult[] = [];
  let totalPdfCount = 0;

  for (let page = startPage; page <= endPage; page++) {
    try {
      // The start page was already fetched above; do not request it twice.
      const jsonData =
        page === startPage
          ? firstPageData
          : (await fetchSearchResults(searchTerm, page, baseDirectory))
              .jsonData;

      allJsonData.push(jsonData);
      const pagePdfCount = jsonData.hits?.hits?.length || 0;
      totalPdfCount += pagePdfCount;
      console.log(chalk.gray(` Page ${page}: ${pagePdfCount} PDFs`));

      // Small delay between JSON fetches
      if (page < endPage) {
        await pause(1000);
      }
    } catch (error: unknown) {
      console.error(
        chalk.red(` Failed to fetch page ${page}: ${describeError(error)}`),
      );
      // Add empty data for failed pages so index -> page number stays aligned
      allJsonData.push({ hits: { total: { value: 0 }, hits: [] } });
    }
  }

  console.log(chalk.cyan(`\n Total PDFs to download: ${totalPdfCount}\n`));

  // Initialize progress tracking
  initProgressBars();
  addJsonProgressTask("JSON Pages", allJsonData.length);
  addPdfProgressTask("PDF Downloads", totalPdfCount);
  updateJsonProgress("JSON Pages", allJsonData.length, allJsonData.length);

  // Download PDFs
  let totalSuccessCount = 0;
  let totalFailCount = 0;
  let currentPdfCount = 0;

  for (const [i, jsonData] of allJsonData.entries()) {
    const page = startPage + i;
    const pagePdfCount = jsonData.hits?.hits?.length || 0;
    // Remember where the counter stood so a mid-page failure cannot push the
    // progress bar past the total.
    const countBeforePage = currentPdfCount;

    try {
      // Calculate prefix for this specific page
      const pagePrefix = options.prefix || String(page);

      const { successCount, failCount } = await downloadPdfsFromJson(
        jsonData,
        searchTerm,
        baseDirectory,
        page,
        pagePrefix,
        () => {
          currentPdfCount++;
          updatePdfProgress("PDF Downloads", currentPdfCount, totalPdfCount);
        },
      );

      totalSuccessCount += successCount;
      totalFailCount += failCount;

      // Delay between pages
      if (i < allJsonData.length - 1) {
        await pause(2000);
      }
    } catch (error: unknown) {
      console.error(
        chalk.red(`\nFailed to process page ${page}: ${describeError(error)}`),
      );
      // The whole page is treated as failed; advance progress to the end of
      // this page without double-counting items already reported.
      totalFailCount += pagePdfCount;
      currentPdfCount = countBeforePage + pagePdfCount;
      updatePdfProgress("PDF Downloads", currentPdfCount, totalPdfCount);
    }
  }

  // Mark tasks as done
  markTaskDone("JSON Pages", "Complete ✓", chalk.blue);
  markTaskDone(
    "PDF Downloads",
    `${totalSuccessCount} downloaded ✓`,
    chalk.green,
  );

  // Cleanup and show summary
  await pause(500);
  closeProgressBars();
  showDownloadSummary(totalSuccessCount, totalFailCount);
}
|
|
328
|
+
|
|
329
|
+
/**
 * Sequential workflow for exactly one results page.
 *
 * Fetches the page's JSON metadata, downloads its PDFs through
 * `downloadPdfsFromJson` while updating the progress tasks, then prints the
 * download summary.
 *
 * @param searchTerm - The search query
 * @param baseDirectory - Base download directory
 * @param pageNumber - The page to download
 * @param effectivePrefix - Filename prefix passed through to the downloader
 */
async function downloadSinglePageWorkflow(
  searchTerm: string,
  baseDirectory: string,
  pageNumber: number,
  effectivePrefix: string,
): Promise<void> {
  const JSON_TASK = "JSON Pages";
  const PDF_TASK = "PDF Downloads";

  console.log(chalk.cyan(`\nDownload Mode: Single Page (${pageNumber})`));
  console.log(chalk.gray(`\nFetching JSON data to count PDFs...\n`));

  // The metadata is fetched first so the progress bar knows its real total.
  const fetched = await fetchSearchResults(
    searchTerm,
    pageNumber,
    baseDirectory,
  );
  const pageData = fetched.jsonData;

  const pdfsOnPage = pageData.hits?.hits?.length || 0;
  console.log(chalk.gray(` Found ${pdfsOnPage} PDFs on this page\n`));

  // Progress tasks: the single JSON page is already complete at this point.
  initProgressBars();
  addJsonProgressTask(JSON_TASK, 1);
  addPdfProgressTask(PDF_TASK, pdfsOnPage);
  updateJsonProgress(JSON_TASK, 1, 1);

  // Each callback invocation represents one more handled PDF.
  let handledPdfs = 0;
  const advancePdfProgress = (): void => {
    handledPdfs += 1;
    updatePdfProgress(PDF_TASK, handledPdfs, pdfsOnPage);
  };

  const outcome = await downloadPdfsFromJson(
    pageData,
    searchTerm,
    baseDirectory,
    pageNumber,
    effectivePrefix,
    advancePdfProgress,
  );
  const downloaded = outcome.successCount;
  const failed = outcome.failCount;

  markTaskDone(JSON_TASK, "Complete ✓", chalk.blue);
  markTaskDone(PDF_TASK, `${downloaded} downloaded ✓`, chalk.green);

  // Short pause before tearing the bars down, then report totals.
  await new Promise((resolve) => setTimeout(resolve, 500));
  closeProgressBars();
  showDownloadSummary(downloaded, failed);
}
|
|
390
|
+
|
|
391
|
+
// Note: showDownloadSummary (displays the download summary to the user) is
// imported from ./src/utils/helpers; it is not defined in this file.
|
|
394
|
+
/**
 * Interactive mode: prompts the user for every configuration option.
 * Values supplied as command line flags are used as the prompt defaults.
 *
 * Prompts, in order: search term (required), download directory (skipped when
 * USE_DEFAULT_DIR is enabled), page number (empty means "all pages"; otherwise
 * it must be a positive integer, matching the rule applied to the `-p` flag),
 * download mode (only asked when a page was entered), custom filename prefix,
 * number of workers (1-10) and verbose output.
 *
 * @param initialOptions - Options pre-filled from command line flags
 * @returns Complete configuration from user input. `endPage` equals `pageNum`
 *   for a single-page download and is `undefined` otherwise.
 */
async function runInteractiveMode(initialOptions: {
  search?: string;
  directory?: string;
  page?: string;
  all?: boolean;
  prefix?: string;
  verbose?: boolean;
  workers?: string;
}): Promise<{
  searchTerm: string;
  baseDirectory: string;
  pageNum: number;
  isPageExplicitlySet: boolean;
  allFlag: boolean;
  effectivePrefix: string;
  hasCustomPrefix: boolean;
  isVerbose: boolean;
  workers: number;
  endPage?: number;
}> {
  console.log(chalk.cyan("\nInteractive Mode\n"));
  console.log(
    chalk.gray("Press Enter to accept default values shown in brackets.\n"),
  );

  // Search term prompt
  const searchTerm: string = await prompt({
    type: PromptType.Input,
    message: "Search term:",
    default: initialOptions.search || "",
    validate: (value) => {
      if (!value || value.trim() === "") {
        return "Search term is required";
      }
      return true;
    },
    cleanup: cleanupAfterPromptExit,
  });

  // Directory prompt (not asked when the default directory is forced via env)
  const baseDirectory: string = USE_DEFAULT_DIR
    ? initialOptions.directory || DEFAULT_DOWNLOAD_DIR
    : await prompt({
        type: PromptType.Input,
        message: "Download directory:",
        default: initialOptions.directory || DEFAULT_DOWNLOAD_DIR,
        validate: (value) => {
          if (!value || value.trim() === "") {
            return "Download directory is required";
          }
          return true;
        },
        cleanup: cleanupAfterPromptExit,
      });

  // Page number prompt
  const pageInput: string = await prompt({
    type: PromptType.Input,
    message: "Page number (leave empty to download all pages):",
    default: initialOptions.page || "",
    validate: (value) => {
      const trimmed = value ? value.trim() : "";
      // Empty input is valid: it selects "download all pages".
      if (trimmed === "") {
        return true;
      }
      // Reject anything that is not a whole number >= 1 instead of silently
      // falling back to page 1 (or accepting a negative page).
      if (!/^\d+$/.test(trimmed) || parseInt(trimmed, 10) < 1) {
        return "Page must be a positive integer";
      }
      return true;
    },
    cleanup: cleanupAfterPromptExit,
  });
  const isPageExplicitlySet = pageInput.trim() !== "";
  const pageNum = isPageExplicitlySet ? parseInt(pageInput, 10) || 1 : 1;

  // Download mode selection (only meaningful when a page was entered)
  let allFlag = initialOptions.all || false;
  if (isPageExplicitlySet) {
    const modeChoice = await prompt({
      type: PromptType.Select,
      message: "Download mode:",
      choices: [
        { name: "Download only this page", value: "single" },
        { name: "Download from this page to the end", value: "all" },
      ],
      default: initialOptions.all ? "all" : "single",
      cleanup: cleanupAfterPromptExit,
    });
    allFlag = modeChoice === "all";
  }

  // Custom prefix prompt
  const customPrefix: string = await prompt({
    type: PromptType.Input,
    message: "Custom prefix for filenames (leave empty to use page number):",
    default: initialOptions.prefix || "",
    cleanup: cleanupAfterPromptExit,
  });
  const hasCustomPrefix = customPrefix.trim() !== "";
  const effectivePrefix = hasCustomPrefix ? customPrefix : String(pageNum);

  // Parallel workers
  const workersInput: string = await prompt({
    type: PromptType.Input,
    message: "Number of parallel workers (slowest 1-10 fastest):",
    default: initialOptions.workers || "5",
    validate: (value) => {
      const num = parseInt(value, 10);
      if (isNaN(num) || num < 1 || num > 10) {
        return "Please enter a number between 1 and 10";
      }
      return true;
    },
    cleanup: cleanupAfterPromptExit,
  });
  const workers = parseInt(workersInput, 10) || 5;

  // Verbose mode
  const isVerbose: boolean = await prompt({
    type: PromptType.Confirm,
    message: "Enable verbose output?",
    default: initialOptions.verbose || false,
    cleanup: cleanupAfterPromptExit,
  });

  // Calculate endPage for single page downloads
  const endPage = isPageExplicitlySet && !allFlag ? pageNum : undefined;

  return {
    searchTerm,
    baseDirectory,
    pageNum,
    isPageExplicitlySet,
    allFlag,
    effectivePrefix,
    hasCustomPrefix,
    isVerbose,
    workers,
    endPage,
  };
}
|
|
532
|
+
|
|
533
|
+
// ============================================================================
|
|
534
|
+
// SECTION 9: MAIN APPLICATION
|
|
535
|
+
// ============================================================================
|
|
536
|
+
|
|
537
|
+
/**
 * Main application entry point.
 *
 * Steps:
 * 1. Declares the CLI options and parses `process.argv`.
 * 2. Installs handlers for uncaught errors, cancelled prompts, SIGINT and
 *    SIGTERM that release resources before exiting (130 for Ctrl+C / prompt
 *    cancel, 143 for SIGTERM, 1 for unexpected errors).
 * 3. Resolves the configuration either interactively (`-i`, or no arguments at
 *    all) or from flags, exiting with status 1 on invalid or missing input.
 * 4. Runs the parallel `Coordinator` by default, or the sequential workflows
 *    when `--sequential` is given.
 */
async function main(): Promise<void> {
  // -------------------------------------------------------------------------
  // CLI Setup
  // -------------------------------------------------------------------------
  program
    .name("ef-dl")
    .description("CLI to download Epstein files from justice.gov")
    .version(VERSION)
    .option("-s, --search <term>", "Search term (required)")
    .option(
      "-p, --page <number>",
      "Page number to download (if not specified, downloads all pages starting from page 1)",
    )
    .option(
      "-a, --all",
      "Download all pages from the specified page number (requires -p). If -p is not set, this is automatically enabled.",
      false,
    )
    .option("-d, --directory <path>", "Download directory (Required)")
    .option(
      "--prefix <string>",
      "Custom prefix for PDF filenames. If not provided, page number is used (e.g., page 7 creates '7-filename.pdf')",
    )
    .option("-v, --verbose", "Show verbose debug output", false)
    .option(
      "-i, --interactive",
      "Interactive mode: prompt for all options (flags provided will be pre-filled)",
      false,
    )
    .option(
      "--workers <number>",
      "Number of parallel workers (1-10, default: 5)",
      "5",
    )
    .option("--fresh", "Force fresh start, ignore resume", false)
    .option("--sequential", "Use sequential download (no parallel)", false)
    .configureHelp({
      sortSubcommands: true,
      helpWidth: 80,
    })
    .addHelpText(
      "after",
      `
Notes:
- If -p is not specified, the tool will download ALL pages by default
- Parallel downloads use 5 workers by default (configurable with --workers)
- Use --sequential to disable parallel downloads
- Use --fresh to ignore previous download and start fresh
- Use -p without -a to download a single specific page
- Use -p with -a to download from that page to the end
- Use -i for interactive mode to configure all options via prompts
- In interactive mode, any flags provided will be pre-filled as defaults
- PDF filenames include page number as prefix by default (e.g., '7-filename.pdf')
- Use --prefix to override with a custom prefix (e.g., --prefix EPSTEIN)
- Existing files are detected by filename AND size to prevent duplicates
- Existing files will be renamed if prefix doesn't match current preference
- JSON metadata files are saved in {directory}/cache/{search-term}/json/
- PDF files are saved in {directory}/files/{search-term}/
- Queue database is saved in {directory}/cache/{search-term}/{search-term}.db
`,
    )
    .parse();

  // -------------------------------------------------------------------------
  // Parse Options Early
  // -------------------------------------------------------------------------
  const options = program.opts();

  // -------------------------------------------------------------------------
  // Setup Signal Handlers for Graceful Interruption
  // -------------------------------------------------------------------------
  // Shared latch so that only the first shutdown path performs cleanup.
  let isShuttingDown = false;

  // Errors named "ExitPromptError" are treated as the user cancelling a prompt.
  const isExitPromptError = (error: unknown): boolean => {
    return error instanceof Error && error.name === "ExitPromptError";
  };

  const handlePromptExit = (_error: unknown): void => {
    if (isShuttingDown) return;
    isShuttingDown = true;
    logger.info(chalk.yellow("\n\n⚠ Prompt cancelled by user (Ctrl+C)"));
    logger.info(chalk.gray("Cleaning up resources..."));
    cleanupAfterPromptExit();
    logger.info(chalk.gray("Exiting..."));
    process.exit(130);
  };

  process.on("uncaughtException", (error) => {
    if (isExitPromptError(error)) {
      handlePromptExit(error);
      return;
    }
    logger.error(error);
    process.exit(1);
  });

  process.on("unhandledRejection", (reason) => {
    if (isExitPromptError(reason)) {
      handlePromptExit(reason);
      return;
    }
    logger.error(reason);
    process.exit(1);
  });

  process.on("SIGINT", async () => {
    if (isShuttingDown) return;
    isShuttingDown = true;

    logger.info(chalk.yellow("\n\n⚠ Interrupted by user (Ctrl+C)"));
    logger.info(chalk.gray("Cleaning up resources..."));

    // Close progress bars
    logger.info(chalk.gray("- Closing progress bars"));
    closeProgressBars();

    // Close browser instance (best effort: a close failure must not block exit)
    logger.info(chalk.gray("- Closing browser sessions"));
    await closeBrowser().catch(() => {});

    logger.info(chalk.gray("Exiting..."));
    process.exit(130); // 130 = Ctrl+C exit code
  });

  process.on("SIGTERM", async () => {
    if (isShuttingDown) return;
    isShuttingDown = true;

    logger.info(chalk.gray("\n\n⚠ Received SIGTERM"));
    logger.info(chalk.gray("Cleaning up resources..."));
    // Same teardown as SIGINT so progress bars are not left open.
    closeProgressBars();
    logger.info(chalk.gray("- Closing browser sessions"));
    await closeBrowser().catch(() => {});
    process.exit(143); // 143 = SIGTERM exit code
  });

  // -------------------------------------------------------------------------
  // Check for Interactive Mode or No Arguments
  // -------------------------------------------------------------------------
  // If no arguments provided, default to interactive mode
  const isInteractiveMode = options.interactive || process.argv.length <= 2;

  let searchTerm: string;
  let baseDirectory: string;
  let pageNum: number;
  let startPage: number;
  let endPage: number | undefined;
  let isPageExplicitlySet: boolean;
  let allFlag: boolean;
  let effectivePrefix: string;
  let hasCustomPrefix: boolean;
  let isVerbose: boolean;
  let downloadAllPages: boolean;
  let workers: number;

  if (isInteractiveMode) {
    // Interactive mode: show header, then age verification
    showHeader(VERSION);

    // Show disclaimer and verify age before proceeding
    await showDisclaimerAndVerifyAge();

    const config = await runInteractiveMode({
      search: options.search,
      directory: options.directory,
      page: options.page,
      all: options.all,
      prefix: options.prefix,
      verbose: options.verbose,
      workers: options.workers,
    });

    searchTerm = config.searchTerm;
    baseDirectory = config.baseDirectory;
    pageNum = config.pageNum;
    startPage = pageNum;
    endPage = config.endPage;
    isPageExplicitlySet = config.isPageExplicitlySet;
    allFlag = config.allFlag;
    effectivePrefix = config.effectivePrefix;
    hasCustomPrefix = config.hasCustomPrefix;
    isVerbose = config.isVerbose;
    workers = config.workers;
    downloadAllPages = !isPageExplicitlySet || allFlag;
  } else {
    // -------------------------------------------------------------------------
    // Validate required options
    // -------------------------------------------------------------------------
    if (!options.search) {
      showHeader(VERSION);
      console.error(
        chalk.red("\nError: Search term is required. Use -s or --search"),
      );
      // help() terminates the process itself; `error: true` makes it exit
      // with a non-zero status instead of reporting success.
      program.help({ error: true });
      process.exit(1);
    }
    if (!options.directory && !USE_DEFAULT_DIR) {
      showHeader(VERSION);
      console.error(
        chalk.red(
          "\nError: Download directory is required. Use -d or --directory",
        ),
      );
      program.help({ error: true });
      process.exit(1);
    }

    // -------------------------------------------------------------------------
    // EF-DL Header
    // -------------------------------------------------------------------------
    showHeader(VERSION);

    // Show disclaimer and verify age before proceeding
    await showDisclaimerAndVerifyAge();

    searchTerm = options.search;
    baseDirectory = options.directory || DEFAULT_DOWNLOAD_DIR;
    // Rely on the parsed option rather than scanning argv, so that forms such
    // as `--page=3` and `-p3` are recognised too.
    isPageExplicitlySet = options.page !== undefined;
    allFlag = options.all;
    hasCustomPrefix = !!options.prefix;
    isVerbose = options.verbose;

    // Parse page number
    if (isPageExplicitlySet) {
      pageNum = parseInt(options.page, 10);
      if (isNaN(pageNum) || pageNum < 1) {
        console.error(chalk.red("Error: Page must be a positive integer"));
        process.exit(1);
      }
    } else {
      pageNum = 1;
    }

    effectivePrefix = options.prefix || String(pageNum);
    startPage = pageNum;
    downloadAllPages = !isPageExplicitlySet || allFlag;
    // Mirror interactive mode: a single-page request is bounded to that page.
    endPage = downloadAllPages ? undefined : pageNum;

    workers = parseInt(options.workers, 10);
    if (isNaN(workers) || workers < 1 || workers > 10) {
      console.error(
        chalk.red("Error: Workers must be a number between 1 and 10"),
      );
      process.exit(1);
    }
  }

  // -------------------------------------------------------------------------
  // Apply Settings
  // -------------------------------------------------------------------------
  setVerboseMode(isVerbose);

  // -------------------------------------------------------------------------
  // Display Configuration
  // -------------------------------------------------------------------------
  const useParallel = !options.sequential;

  showConfiguration(
    searchTerm,
    baseDirectory,
    pageNum,
    isPageExplicitlySet,
    allFlag,
    effectivePrefix,
    hasCustomPrefix,
    isVerbose,
    useParallel,
    workers,
  );

  console.log(chalk.green("\nStarting download process...\n"));

  // Ensure base directory exists
  if (!fs.existsSync(baseDirectory)) {
    fs.mkdirSync(baseDirectory, { recursive: true });
  }

  // -------------------------------------------------------------------------
  // Execute Download Workflow
  // -------------------------------------------------------------------------
  if (useParallel) {
    // Use parallel download coordinator
    console.log(chalk.blue(`Using parallel mode with ${workers} workers\n`));

    const coordinator = new Coordinator(searchTerm, baseDirectory, {
      startPage,
      endPage,
      workers,
      fresh: options.fresh,
      // Use the resolved value so the interactive answer is honoured.
      verbose: isVerbose,
    });

    await coordinator.run();
  } else {
    // Use sequential download (legacy mode)
    console.log(chalk.blue("Using sequential mode\n"));

    if (downloadAllPages) {
      await downloadAllPagesWorkflow(searchTerm, baseDirectory, startPage, {
        // Only forward a prefix the user actually chose (flag or prompt);
        // otherwise the workflow falls back to each page's number.
        prefix: hasCustomPrefix ? effectivePrefix : undefined,
        verbose: isVerbose,
      });
    } else {
      await downloadSinglePageWorkflow(
        searchTerm,
        baseDirectory,
        startPage,
        effectivePrefix,
      );
    }

    // Legacy cleanup prompt (coordinator has its own)
    await promptForCleanup(baseDirectory, searchTerm);
  }

  console.log(chalk.green.bold("\nProcess completed successfully!"));
}
|
|
851
|
+
|
|
852
|
+
// ============================================================================
|
|
853
|
+
// SECTION 10: ERROR HANDLING
|
|
854
|
+
// ============================================================================
|
|
855
|
+
|
|
856
|
+
// Program entry: run main(), report any failure, and always release the
// browser before terminating the process explicitly.
void (async () => {
  try {
    await main();
  } catch (err) {
    // Failure path: tear down the UI, print the error, exit non-zero.
    closeProgressBars();
    console.error(chalk.red(err));
    await closeBrowser().catch(() => {});
    process.exit(1);
  } finally {
    // Success path (the failure path has already exited above).
    await closeBrowser().catch(() => {});
    process.exit(0);
  }
})();
|