ef-dl 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json ADDED
@@ -0,0 +1,65 @@
1
+ {
2
+ "name": "ef-dl",
3
+ "version": "1.0.0",
4
+ "description": "Educational tool to download files from the US DOJ Epstein Files search portal (Bun runtime required)",
5
+ "module": "index.ts",
6
+ "type": "module",
7
+ "bin": {
8
+ "ef-dl": "./index.ts"
9
+ },
10
+ "files": [
11
+ "index.ts",
12
+ "src",
13
+ "fonts",
14
+ "README.md",
15
+ "LICENSE"
16
+ ],
17
+ "scripts": {
18
+ "dev": "bun --watch --hot index.ts",
19
+ "start": "bun index.ts",
20
+ "build": "bun build index.ts --outdir dist",
21
+ "typecheck": "tsc --noEmit",
22
+ "test:browser": "bun src/browser-client.ts"
23
+ },
24
+ "engines": {
25
+ "bun": ">=1.0.0"
26
+ },
27
+ "keywords": [
28
+ "bun",
29
+ "bun-only",
30
+ "cli",
31
+ "downloader",
32
+ "epstein files",
33
+ "epstein documents",
34
+ "epstein case files",
35
+ "educational"
36
+ ],
37
+ "publishConfig": {
38
+ "access": "public",
39
+ "registry": "https://registry.npmjs.org"
40
+ },
41
+ "author": "",
42
+ "license": "MIT",
43
+ "repository": {
44
+ "type": "git",
45
+ "url": "https://github.com/iammorpheuszion/ef-dl.git"
46
+ },
47
+ "bugs": {
48
+ "url": "https://github.com/iammorpheuszion/ef-dl/issues"
49
+ },
50
+ "homepage": "https://github.com/iammorpheuszion/ef-dl",
51
+ "devDependencies": {
52
+ "@types/bun": "latest",
53
+ "@types/figlet": "^1.7.0",
54
+ "typescript": "^5.9.3"
55
+ },
56
+ "dependencies": {
57
+ "@inquirer/prompts": "^8.2.0",
58
+ "browserless": "^10.9.18",
59
+ "chalk": "^5.6.2",
60
+ "commander": "^14.0.3",
61
+ "figlet": "^1.10.0",
62
+ "multi-progress-bars": "^5.0.3",
63
+ "puppeteer": "^24.36.1"
64
+ }
65
+ }
@@ -0,0 +1,307 @@
1
+ import createBrowserless from "browserless";
2
+ import fs from "fs";
3
+ import path from "path";
4
+ import { JUSTICE_GOV_COOKIE_DOMAIN } from "../types/constants";
5
+ import { handleSecurityChallenges, isInterstitialPage } from "./challenger";
6
+ import {
7
+ extractSearchResultsJson,
8
+ parseCookieHeader,
9
+ saveJsonToFile,
10
+ } from "./helpers";
11
+ import { logger } from "../utils/logger";
12
+
13
// Shared browserless instance used by every fetch/download in this module.
// Adblock is disabled because the blocker can strip scripts or resources
// that the justice.gov security challenges need in order to complete.
const browserless = createBrowserless({
  adblock: false, // Disable adblocker to avoid interference with security challenges
});
16
+
17
+ /**
18
+ * Logger function that only logs in verbose mode
19
+ */
20
+ function debugLog(...args: any[]): void {
21
+ logger.debug(...args);
22
+ }
23
+
24
// Shape returned by fetchPageContent.
type PageContentResult = {
  // Visible body text of the final page (document.body.innerText).
  text: string;
  // Structured summary of the page after navigation and challenge handling.
  json: {
    title: string; // document.title of the final page
    bodyText: string; // same value as `text`, duplicated for JSON consumers
    url: string; // final URL after any redirects / challenge hops
  };
};
32
+
33
/**
 * Navigate to `url` in a fresh browser context, work through the site's
 * security challenges, and return the rendered page's text plus any
 * search-results JSON extracted from it.
 *
 * Flow: create context → set age-verification cookie (and any caller
 * cookies) → goto with adblock disabled → handleSecurityChallenges →
 * read title/body → extractSearchResultsJson → optionally save the JSON
 * to disk. The context is always destroyed in `finally`.
 *
 * @param url - Page to fetch.
 * @param options.cookieHeader - Raw Cookie header to parse and set on the page.
 * @param options.saveJson - Write extracted JSON to disk (default true).
 * @param options.jsonOutputDir - Where JSON files go (default "./downloads/json").
 * @returns Page text/summary plus `jsonData` (null-ish if extraction found
 *   nothing or failed) and `jsonFilePath` (undefined unless a file was saved).
 */
export async function fetchPageContent(
  url: string,
  options?: {
    cookieHeader?: string;
    saveJson?: boolean;
    jsonOutputDir?: string;
  },
): Promise<PageContentResult & { jsonData?: any; jsonFilePath?: string }> {
  const cookies = parseCookieHeader(options?.cookieHeader, url);
  const context = await browserless.createContext();
  let page;

  const saveJson = options?.saveJson ?? true;
  const jsonOutputDir = options?.jsonOutputDir ?? "./downloads/json";

  try {
    page = await context.page();

    // Set age verification cookie to skip age check
    const urlObj = new URL(url);
    await page.setCookie({
      name: "justiceGovAgeVerified",
      value: "true",
      domain: JUSTICE_GOV_COOKIE_DOMAIN,
      path: "/",
      url: `${urlObj.protocol}//${urlObj.hostname}`,
    });

    if (cookies.length) {
      await page.setCookie(...cookies);
    }

    // Use goto directly - disable adblock to avoid interference with security challenges
    const goto = context.goto;
    const { error } = await goto(page, {
      url,
      timeout: 30000,
      waitUntil: "networkidle2",
      adblock: false,
    } as any);
    if (error) throw error;

    await handleSecurityChallenges(page, context, url, null, debugLog);

    // Always extract after any navigation
    debugLog(`\n[Debug] Final URL: ${page.url()}`);

    // Check if we're still on interstitial; extraction proceeds anyway —
    // this only produces a warning for diagnosis.
    const isStillInterstitial = await isInterstitialPage(page, debugLog);
    if (isStillInterstitial) {
      debugLog(
        "[Warning] Still on interstitial page after all challenge attempts",
      );
    }

    const title = await page.title();
    const bodyText = await page.evaluate(() => document.body?.innerText ?? "");
    debugLog(`[Debug] Page title: "${title}"`);
    debugLog(`[Debug] Body text length: ${bodyText.length}`);

    // Check if page content is actually the search results
    const pageContent = await page.content();

    // Print first 300 chars of content to see what we're dealing with
    debugLog(`[Debug] Page content preview (first 300 chars):`);
    debugLog(pageContent.slice(0, 300));

    if (bodyText.length < 100) {
      debugLog(
        "[Warning] Page body text is very short, might be on a challenge page",
      );
    }

    // Extract JSON data from the page; extraction failures are logged and
    // swallowed so callers still receive the page text.
    let jsonData: any = null;
    let jsonFilePath: string | undefined;

    try {
      jsonData = await extractSearchResultsJson(page, debugLog);

      if (saveJson && jsonData) {
        // Filename encodes search term, page number, and a timestamp with
        // ":" / "." replaced (both are unsafe in filenames on some systems).
        const timestamp = new Date().toISOString().replace(/[:.]/g, "-");
        const urlObj = new URL(page.url());
        const searchParams = new URLSearchParams(urlObj.search);
        const searchTerm = searchParams.get("keys") || "unknown";
        const pageNum = searchParams.get("page") || "1";
        const filename = `search-${searchTerm}-page-${pageNum}-${timestamp}.json`;

        jsonFilePath = saveJsonToFile(jsonData, jsonOutputDir, filename);
        logger.info(`[Save] JSON data saved to: ${jsonFilePath}`);
      }
    } catch (extractError) {
      logger.error("[Extract] Failed to extract JSON data:", extractError);
    }

    return {
      text: bodyText,
      json: {
        title,
        bodyText,
        url: page.url(),
      },
      jsonData,
      jsonFilePath,
    };
  } finally {
    // Always tear the context down, even on throw; teardown errors ignored.
    await context.destroyContext().catch(() => undefined);
  }
}
142
+
143
+ /**
144
+ * Download a PDF file from justice.gov
145
+ * Handles the security challenges (robot button, interstitial)
146
+ */
147
+ export async function downloadPdf(
148
+ pdfUrl: string,
149
+ outputDir: string,
150
+ fileName: string,
151
+ prefix?: string,
152
+ ): Promise<string> {
153
+ debugLog(`[PDF Download] Starting download from: ${pdfUrl}`);
154
+
155
+ const context = await browserless.createContext();
156
+ let page;
157
+
158
+ try {
159
+ page = await context.page();
160
+
161
+ // Set age verification cookie to skip age check
162
+ const pdfUrlObj = new URL(pdfUrl);
163
+ await page.setCookie({
164
+ name: "justiceGovAgeVerified",
165
+ value: "true",
166
+ domain: JUSTICE_GOV_COOKIE_DOMAIN,
167
+ path: "/",
168
+ url: `${pdfUrlObj.protocol}//${pdfUrlObj.hostname}`,
169
+ });
170
+
171
+ // Navigate to PDF URL - disable adblock to avoid interference with security challenges
172
+ const goto = context.goto;
173
+ const { error } = await goto(page, {
174
+ url: pdfUrl,
175
+ timeout: 60000, // Longer timeout for PDF downloads
176
+ waitUntil: "networkidle2",
177
+ adblock: false,
178
+ } as any);
179
+
180
+ if (error) throw error;
181
+
182
+ await handleSecurityChallenges(page, context, pdfUrl, "PDF", debugLog);
183
+
184
+ // Check if we're on the PDF or if it triggered a download
185
+ const currentUrl = page.url();
186
+ debugLog(`[PDF Download] Final URL: ${currentUrl}`);
187
+
188
+ // Try to get the PDF buffer
189
+ let pdfBuffer: Buffer | null = null;
190
+
191
+ // Check if the current page is displaying a PDF
192
+ const contentType = await page.evaluate(() => {
193
+ return document.contentType || "";
194
+ });
195
+
196
+ if (contentType.includes("pdf") || currentUrl.endsWith(".pdf")) {
197
+ debugLog("[PDF Download] Page is displaying PDF, capturing...");
198
+
199
+ // Use CDP to capture the PDF
200
+ const client = await page.target().createCDPSession();
201
+ const { data } = await client.send("Page.captureSnapshot", {
202
+ format: "mhtml",
203
+ });
204
+
205
+ // For PDFs, we need to fetch the content directly
206
+ pdfBuffer = await page.evaluate(async () => {
207
+ const response = await fetch(window.location.href);
208
+ const blob = await response.blob();
209
+ const arrayBuffer = await blob.arrayBuffer();
210
+ return Array.from(new Uint8Array(arrayBuffer));
211
+ });
212
+
213
+ // Convert array back to Buffer
214
+ pdfBuffer = Buffer.from(pdfBuffer as any);
215
+ } else {
216
+ // Try to download via fetch in page context
217
+ debugLog("[PDF Download] Attempting to download via fetch...");
218
+ const pdfData = await page.evaluate(async (url: string) => {
219
+ try {
220
+ const response = await fetch(url, {
221
+ credentials: "include",
222
+ });
223
+ if (!response.ok) {
224
+ throw new Error(`HTTP ${response.status}: ${response.statusText}`);
225
+ }
226
+ const blob = await response.blob();
227
+ const arrayBuffer = await blob.arrayBuffer();
228
+ return {
229
+ success: true,
230
+ data: Array.from(new Uint8Array(arrayBuffer)),
231
+ contentType: response.headers.get("content-type"),
232
+ };
233
+ } catch (error: any) {
234
+ return { success: false, error: error.message };
235
+ }
236
+ }, pdfUrl);
237
+
238
+ if (!pdfData.success) {
239
+ throw new Error(`Failed to download PDF: ${pdfData.error}`);
240
+ }
241
+
242
+ pdfBuffer = Buffer.from(pdfData.data);
243
+ }
244
+
245
+ if (!pdfBuffer || pdfBuffer.length === 0) {
246
+ throw new Error("Downloaded PDF is empty");
247
+ }
248
+
249
+ // Ensure output directory exists
250
+ if (!fs.existsSync(outputDir)) {
251
+ fs.mkdirSync(outputDir, { recursive: true });
252
+ }
253
+
254
+ // Apply prefix to filename if provided
255
+ const finalFileName = prefix ? `${prefix}-${fileName}` : fileName;
256
+
257
+ // Save PDF file
258
+ const filePath = path.join(outputDir, finalFileName);
259
+ fs.writeFileSync(filePath, pdfBuffer);
260
+
261
+ debugLog(
262
+ `[PDF Download] Successfully saved ${pdfBuffer.length} bytes to ${filePath}`,
263
+ );
264
+
265
+ return filePath;
266
+ } finally {
267
+ await context.destroyContext().catch(() => undefined);
268
+ }
269
+ }
270
+
271
+ /**
272
+ * Close the browserless instance and clean up all browser resources.
273
+ * Should be called when the application exits or is interrupted.
274
+ */
275
+ export async function closeBrowser(): Promise<void> {
276
+ try {
277
+ await browserless.close();
278
+ debugLog("[Browser] Browser instance closed successfully");
279
+ } catch (error) {
280
+ // Browser might already be closed or never started
281
+ debugLog("[Browser] Error closing browser (may already be closed):", error);
282
+ }
283
+ }
284
+
285
+ if (import.meta.main) {
286
+ const url = process.argv[2] ?? "https://example.com";
287
+ fetchPageContent(url, {})
288
+ .then(({ text, json, jsonData, jsonFilePath }) => {
289
+ logger.info("\n=== Page Text Preview ===\n");
290
+ logger.info(text.slice(0, 2000));
291
+ logger.info("\n=== JSON Preview ===\n");
292
+ logger.info(JSON.stringify(json, null, 2).slice(0, 2000));
293
+
294
+ if (jsonData) {
295
+ logger.info("\n=== Extracted JSON Data Preview ===\n");
296
+ logger.info(JSON.stringify(jsonData, null, 2).slice(0, 2000));
297
+ }
298
+
299
+ if (jsonFilePath) {
300
+ logger.info(`\n[Saved] JSON data saved to: ${jsonFilePath}`);
301
+ }
302
+ })
303
+ .catch((err) => {
304
+ logger.error("Fetch failed:", err);
305
+ process.exit(1);
306
+ });
307
+ }