ef-dl 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +521 -0
- package/fonts/Sub-Zero.flf +629 -0
- package/index.ts +866 -0
- package/package.json +65 -0
- package/src/browserless/browser-client.ts +307 -0
- package/src/browserless/challenger.ts +352 -0
- package/src/browserless/helpers.ts +171 -0
- package/src/types/browserless.d.ts +31 -0
- package/src/types/constants.ts +3 -0
- package/src/types/enums.ts +5 -0
- package/src/utils/ascii.ts +66 -0
- package/src/utils/helpers.ts +260 -0
- package/src/utils/logger.ts +42 -0
- package/src/utils/progress.ts +130 -0
- package/src/utils/prompt.ts +87 -0
- package/src/workers/coordinator.ts +635 -0
- package/src/workers/index.ts +40 -0
- package/src/workers/task-queue.ts +388 -0
- package/src/workers/types.ts +135 -0
- package/src/workers/worker-pool.ts +227 -0
- package/src/workers/worker.ts +290 -0
package/package.json
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "ef-dl",
|
|
3
|
+
"version": "1.0.0",
|
|
4
|
+
"description": "Educational tool to download files from the US DOJ Epstein Files search portal (Bun runtime required)",
|
|
5
|
+
"module": "index.ts",
|
|
6
|
+
"type": "module",
|
|
7
|
+
"bin": {
|
|
8
|
+
"ef-dl": "./index.ts"
|
|
9
|
+
},
|
|
10
|
+
"files": [
|
|
11
|
+
"index.ts",
|
|
12
|
+
"src",
|
|
13
|
+
"fonts",
|
|
14
|
+
"README.md",
|
|
15
|
+
"LICENSE"
|
|
16
|
+
],
|
|
17
|
+
"scripts": {
|
|
18
|
+
"dev": "bun --watch --hot index.ts",
|
|
19
|
+
"start": "bun index.ts",
|
|
20
|
+
"build": "bun build index.ts --outdir dist",
|
|
21
|
+
"typecheck": "tsc --noEmit",
|
|
22
|
+
"test:browser": "bun src/browser-client.ts"
|
|
23
|
+
},
|
|
24
|
+
"engines": {
|
|
25
|
+
"bun": ">=1.0.0"
|
|
26
|
+
},
|
|
27
|
+
"keywords": [
|
|
28
|
+
"bun",
|
|
29
|
+
"bun-only",
|
|
30
|
+
"cli",
|
|
31
|
+
"downloader",
|
|
32
|
+
"epstein files",
|
|
33
|
+
"epstein documents",
|
|
34
|
+
"epstein case files",
|
|
35
|
+
"educational"
|
|
36
|
+
],
|
|
37
|
+
"publishConfig": {
|
|
38
|
+
"access": "public",
|
|
39
|
+
"registry": "https://registry.npmjs.org"
|
|
40
|
+
},
|
|
41
|
+
"author": "",
|
|
42
|
+
"license": "MIT",
|
|
43
|
+
"repository": {
|
|
44
|
+
"type": "git",
|
|
45
|
+
"url": "https://github.com/iammorpheuszion/ef-dl.git"
|
|
46
|
+
},
|
|
47
|
+
"bugs": {
|
|
48
|
+
"url": "https://github.com/iammorpheuszion/ef-dl/issues"
|
|
49
|
+
},
|
|
50
|
+
"homepage": "https://github.com/iammorpheuszion/ef-dl",
|
|
51
|
+
"devDependencies": {
|
|
52
|
+
"@types/bun": "latest",
|
|
53
|
+
"@types/figlet": "^1.7.0",
|
|
54
|
+
"typescript": "^5.9.3"
|
|
55
|
+
},
|
|
56
|
+
"dependencies": {
|
|
57
|
+
"@inquirer/prompts": "^8.2.0",
|
|
58
|
+
"browserless": "^10.9.18",
|
|
59
|
+
"chalk": "^5.6.2",
|
|
60
|
+
"commander": "^14.0.3",
|
|
61
|
+
"figlet": "^1.10.0",
|
|
62
|
+
"multi-progress-bars": "^5.0.3",
|
|
63
|
+
"puppeteer": "^24.36.1"
|
|
64
|
+
}
|
|
65
|
+
}
|
|
@@ -0,0 +1,307 @@
|
|
|
1
|
+
import createBrowserless from "browserless";
|
|
2
|
+
import fs from "fs";
|
|
3
|
+
import path from "path";
|
|
4
|
+
import { JUSTICE_GOV_COOKIE_DOMAIN } from "../types/constants";
|
|
5
|
+
import { handleSecurityChallenges, isInterstitialPage } from "./challenger";
|
|
6
|
+
import {
|
|
7
|
+
extractSearchResultsJson,
|
|
8
|
+
parseCookieHeader,
|
|
9
|
+
saveJsonToFile,
|
|
10
|
+
} from "./helpers";
|
|
11
|
+
import { logger } from "../utils/logger";
|
|
12
|
+
|
|
13
|
+
// Single shared browserless instance for this module. Each exported function
// creates its own context from it and destroys that context in a `finally`
// block, so only this top-level instance persists across calls.
const browserless = createBrowserless({
  adblock: false, // Disable adblocker to avoid interference with security challenges
});
|
|
16
|
+
|
|
17
|
+
/**
|
|
18
|
+
* Logger function that only logs in verbose mode
|
|
19
|
+
*/
|
|
20
|
+
function debugLog(...args: any[]): void {
|
|
21
|
+
logger.debug(...args);
|
|
22
|
+
}
|
|
23
|
+
|
|
24
|
+
/**
 * Result shape returned by {@link fetchPageContent}.
 */
type PageContentResult = {
  // Visible body text of the final page (post-challenge, post-redirect).
  text: string;
  // Structured summary of the same page.
  json: {
    // Document title of the final page.
    title: string;
    // Same value as `text`, duplicated for JSON consumers.
    bodyText: string;
    // Final URL after any redirects / challenge navigation.
    url: string;
  };
};
|
|
32
|
+
|
|
33
|
+
/**
 * Load a justice.gov search URL in a fresh browser context and return the
 * page's text plus any embedded search-results JSON.
 *
 * Flow: set the age-verification cookie, replay any caller-supplied cookies,
 * navigate, run the security-challenge handler, then scrape title/body text
 * and attempt to extract the search-results JSON (optionally saving it).
 *
 * @param url - Page to fetch.
 * @param options.cookieHeader - Raw `Cookie` header string parsed into
 *   individual cookies and applied to the page before navigation.
 * @param options.saveJson - Persist extracted JSON to disk (default: true).
 * @param options.jsonOutputDir - Directory for saved JSON
 *   (default: "./downloads/json").
 * @returns Body text, a small page summary, and — when extraction succeeded —
 *   the parsed `jsonData` plus the `jsonFilePath` it was saved to (either may
 *   be absent if extraction or saving did not happen).
 */
export async function fetchPageContent(
  url: string,
  options?: {
    cookieHeader?: string;
    saveJson?: boolean;
    jsonOutputDir?: string;
  },
): Promise<PageContentResult & { jsonData?: any; jsonFilePath?: string }> {
  const cookies = parseCookieHeader(options?.cookieHeader, url);
  // Fresh context per call; torn down in the `finally` block below.
  const context = await browserless.createContext();
  let page;

  const saveJson = options?.saveJson ?? true;
  const jsonOutputDir = options?.jsonOutputDir ?? "./downloads/json";

  try {
    page = await context.page();

    // Set age verification cookie to skip age check
    const urlObj = new URL(url);
    await page.setCookie({
      name: "justiceGovAgeVerified",
      value: "true",
      domain: JUSTICE_GOV_COOKIE_DOMAIN,
      path: "/",
      url: `${urlObj.protocol}//${urlObj.hostname}`,
    });

    // Caller-supplied cookies are applied after the age cookie.
    if (cookies.length) {
      await page.setCookie(...cookies);
    }

    // Use goto directly - disable adblock to avoid interference with security challenges
    const goto = context.goto;
    // `goto` reports navigation failures via `{ error }` rather than throwing.
    const { error } = await goto(page, {
      url,
      timeout: 30000,
      waitUntil: "networkidle2",
      adblock: false,
    } as any);
    if (error) throw error;

    await handleSecurityChallenges(page, context, url, null, debugLog);

    // Always extract after any navigation
    debugLog(`\n[Debug] Final URL: ${page.url()}`);

    // Check if we're still on interstitial
    const isStillInterstitial = await isInterstitialPage(page, debugLog);
    if (isStillInterstitial) {
      // Non-fatal: we still scrape whatever the page shows.
      debugLog(
        "[Warning] Still on interstitial page after all challenge attempts",
      );
    }

    const title = await page.title();
    const bodyText = await page.evaluate(() => document.body?.innerText ?? "");
    debugLog(`[Debug] Page title: "${title}"`);
    debugLog(`[Debug] Body text length: ${bodyText.length}`);

    // Check if page content is actually the search results
    const pageContent = await page.content();

    // Print first 300 chars of content to see what we're dealing with
    debugLog(`[Debug] Page content preview (first 300 chars):`);
    debugLog(pageContent.slice(0, 300));

    if (bodyText.length < 100) {
      debugLog(
        "[Warning] Page body text is very short, might be on a challenge page",
      );
    }

    // Extract JSON data from the page
    let jsonData: any = null;
    let jsonFilePath: string | undefined;

    try {
      jsonData = await extractSearchResultsJson(page, debugLog);

      if (saveJson && jsonData) {
        // Build a filename from the search term / page number in the final
        // URL's query string, e.g. search-<keys>-page-<n>-<timestamp>.json.
        const timestamp = new Date().toISOString().replace(/[:.]/g, "-");
        const urlObj = new URL(page.url());
        const searchParams = new URLSearchParams(urlObj.search);
        const searchTerm = searchParams.get("keys") || "unknown";
        const pageNum = searchParams.get("page") || "1";
        const filename = `search-${searchTerm}-page-${pageNum}-${timestamp}.json`;

        jsonFilePath = saveJsonToFile(jsonData, jsonOutputDir, filename);
        logger.info(`[Save] JSON data saved to: ${jsonFilePath}`);
      }
    } catch (extractError) {
      // Extraction failures are non-fatal: we still return the page text.
      logger.error("[Extract] Failed to extract JSON data:", extractError);
    }

    return {
      text: bodyText,
      json: {
        title,
        bodyText,
        url: page.url(),
      },
      jsonData,
      jsonFilePath,
    };
  } finally {
    // Always release the browser context, even on failure.
    await context.destroyContext().catch(() => undefined);
  }
}
|
|
142
|
+
|
|
143
|
+
/**
|
|
144
|
+
* Download a PDF file from justice.gov
|
|
145
|
+
* Handles the security challenges (robot button, interstitial)
|
|
146
|
+
*/
|
|
147
|
+
export async function downloadPdf(
|
|
148
|
+
pdfUrl: string,
|
|
149
|
+
outputDir: string,
|
|
150
|
+
fileName: string,
|
|
151
|
+
prefix?: string,
|
|
152
|
+
): Promise<string> {
|
|
153
|
+
debugLog(`[PDF Download] Starting download from: ${pdfUrl}`);
|
|
154
|
+
|
|
155
|
+
const context = await browserless.createContext();
|
|
156
|
+
let page;
|
|
157
|
+
|
|
158
|
+
try {
|
|
159
|
+
page = await context.page();
|
|
160
|
+
|
|
161
|
+
// Set age verification cookie to skip age check
|
|
162
|
+
const pdfUrlObj = new URL(pdfUrl);
|
|
163
|
+
await page.setCookie({
|
|
164
|
+
name: "justiceGovAgeVerified",
|
|
165
|
+
value: "true",
|
|
166
|
+
domain: JUSTICE_GOV_COOKIE_DOMAIN,
|
|
167
|
+
path: "/",
|
|
168
|
+
url: `${pdfUrlObj.protocol}//${pdfUrlObj.hostname}`,
|
|
169
|
+
});
|
|
170
|
+
|
|
171
|
+
// Navigate to PDF URL - disable adblock to avoid interference with security challenges
|
|
172
|
+
const goto = context.goto;
|
|
173
|
+
const { error } = await goto(page, {
|
|
174
|
+
url: pdfUrl,
|
|
175
|
+
timeout: 60000, // Longer timeout for PDF downloads
|
|
176
|
+
waitUntil: "networkidle2",
|
|
177
|
+
adblock: false,
|
|
178
|
+
} as any);
|
|
179
|
+
|
|
180
|
+
if (error) throw error;
|
|
181
|
+
|
|
182
|
+
await handleSecurityChallenges(page, context, pdfUrl, "PDF", debugLog);
|
|
183
|
+
|
|
184
|
+
// Check if we're on the PDF or if it triggered a download
|
|
185
|
+
const currentUrl = page.url();
|
|
186
|
+
debugLog(`[PDF Download] Final URL: ${currentUrl}`);
|
|
187
|
+
|
|
188
|
+
// Try to get the PDF buffer
|
|
189
|
+
let pdfBuffer: Buffer | null = null;
|
|
190
|
+
|
|
191
|
+
// Check if the current page is displaying a PDF
|
|
192
|
+
const contentType = await page.evaluate(() => {
|
|
193
|
+
return document.contentType || "";
|
|
194
|
+
});
|
|
195
|
+
|
|
196
|
+
if (contentType.includes("pdf") || currentUrl.endsWith(".pdf")) {
|
|
197
|
+
debugLog("[PDF Download] Page is displaying PDF, capturing...");
|
|
198
|
+
|
|
199
|
+
// Use CDP to capture the PDF
|
|
200
|
+
const client = await page.target().createCDPSession();
|
|
201
|
+
const { data } = await client.send("Page.captureSnapshot", {
|
|
202
|
+
format: "mhtml",
|
|
203
|
+
});
|
|
204
|
+
|
|
205
|
+
// For PDFs, we need to fetch the content directly
|
|
206
|
+
pdfBuffer = await page.evaluate(async () => {
|
|
207
|
+
const response = await fetch(window.location.href);
|
|
208
|
+
const blob = await response.blob();
|
|
209
|
+
const arrayBuffer = await blob.arrayBuffer();
|
|
210
|
+
return Array.from(new Uint8Array(arrayBuffer));
|
|
211
|
+
});
|
|
212
|
+
|
|
213
|
+
// Convert array back to Buffer
|
|
214
|
+
pdfBuffer = Buffer.from(pdfBuffer as any);
|
|
215
|
+
} else {
|
|
216
|
+
// Try to download via fetch in page context
|
|
217
|
+
debugLog("[PDF Download] Attempting to download via fetch...");
|
|
218
|
+
const pdfData = await page.evaluate(async (url: string) => {
|
|
219
|
+
try {
|
|
220
|
+
const response = await fetch(url, {
|
|
221
|
+
credentials: "include",
|
|
222
|
+
});
|
|
223
|
+
if (!response.ok) {
|
|
224
|
+
throw new Error(`HTTP ${response.status}: ${response.statusText}`);
|
|
225
|
+
}
|
|
226
|
+
const blob = await response.blob();
|
|
227
|
+
const arrayBuffer = await blob.arrayBuffer();
|
|
228
|
+
return {
|
|
229
|
+
success: true,
|
|
230
|
+
data: Array.from(new Uint8Array(arrayBuffer)),
|
|
231
|
+
contentType: response.headers.get("content-type"),
|
|
232
|
+
};
|
|
233
|
+
} catch (error: any) {
|
|
234
|
+
return { success: false, error: error.message };
|
|
235
|
+
}
|
|
236
|
+
}, pdfUrl);
|
|
237
|
+
|
|
238
|
+
if (!pdfData.success) {
|
|
239
|
+
throw new Error(`Failed to download PDF: ${pdfData.error}`);
|
|
240
|
+
}
|
|
241
|
+
|
|
242
|
+
pdfBuffer = Buffer.from(pdfData.data);
|
|
243
|
+
}
|
|
244
|
+
|
|
245
|
+
if (!pdfBuffer || pdfBuffer.length === 0) {
|
|
246
|
+
throw new Error("Downloaded PDF is empty");
|
|
247
|
+
}
|
|
248
|
+
|
|
249
|
+
// Ensure output directory exists
|
|
250
|
+
if (!fs.existsSync(outputDir)) {
|
|
251
|
+
fs.mkdirSync(outputDir, { recursive: true });
|
|
252
|
+
}
|
|
253
|
+
|
|
254
|
+
// Apply prefix to filename if provided
|
|
255
|
+
const finalFileName = prefix ? `${prefix}-${fileName}` : fileName;
|
|
256
|
+
|
|
257
|
+
// Save PDF file
|
|
258
|
+
const filePath = path.join(outputDir, finalFileName);
|
|
259
|
+
fs.writeFileSync(filePath, pdfBuffer);
|
|
260
|
+
|
|
261
|
+
debugLog(
|
|
262
|
+
`[PDF Download] Successfully saved ${pdfBuffer.length} bytes to ${filePath}`,
|
|
263
|
+
);
|
|
264
|
+
|
|
265
|
+
return filePath;
|
|
266
|
+
} finally {
|
|
267
|
+
await context.destroyContext().catch(() => undefined);
|
|
268
|
+
}
|
|
269
|
+
}
|
|
270
|
+
|
|
271
|
+
/**
|
|
272
|
+
* Close the browserless instance and clean up all browser resources.
|
|
273
|
+
* Should be called when the application exits or is interrupted.
|
|
274
|
+
*/
|
|
275
|
+
export async function closeBrowser(): Promise<void> {
|
|
276
|
+
try {
|
|
277
|
+
await browserless.close();
|
|
278
|
+
debugLog("[Browser] Browser instance closed successfully");
|
|
279
|
+
} catch (error) {
|
|
280
|
+
// Browser might already be closed or never started
|
|
281
|
+
debugLog("[Browser] Error closing browser (may already be closed):", error);
|
|
282
|
+
}
|
|
283
|
+
}
|
|
284
|
+
|
|
285
|
+
if (import.meta.main) {
|
|
286
|
+
const url = process.argv[2] ?? "https://example.com";
|
|
287
|
+
fetchPageContent(url, {})
|
|
288
|
+
.then(({ text, json, jsonData, jsonFilePath }) => {
|
|
289
|
+
logger.info("\n=== Page Text Preview ===\n");
|
|
290
|
+
logger.info(text.slice(0, 2000));
|
|
291
|
+
logger.info("\n=== JSON Preview ===\n");
|
|
292
|
+
logger.info(JSON.stringify(json, null, 2).slice(0, 2000));
|
|
293
|
+
|
|
294
|
+
if (jsonData) {
|
|
295
|
+
logger.info("\n=== Extracted JSON Data Preview ===\n");
|
|
296
|
+
logger.info(JSON.stringify(jsonData, null, 2).slice(0, 2000));
|
|
297
|
+
}
|
|
298
|
+
|
|
299
|
+
if (jsonFilePath) {
|
|
300
|
+
logger.info(`\n[Saved] JSON data saved to: ${jsonFilePath}`);
|
|
301
|
+
}
|
|
302
|
+
})
|
|
303
|
+
.catch((err) => {
|
|
304
|
+
logger.error("Fetch failed:", err);
|
|
305
|
+
process.exit(1);
|
|
306
|
+
});
|
|
307
|
+
}
|