openrxiv-cli 0.0.2

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their public registries.
Files changed (67)
  1. package/dist/api/api-client.d.ts +96 -0
  2. package/dist/api/api-client.d.ts.map +1 -0
  3. package/dist/api/api-client.js +257 -0
  4. package/dist/aws/bucket-explorer.d.ts +26 -0
  5. package/dist/aws/bucket-explorer.d.ts.map +1 -0
  6. package/dist/aws/bucket-explorer.js +220 -0
  7. package/dist/aws/config.d.ts +5 -0
  8. package/dist/aws/config.d.ts.map +1 -0
  9. package/dist/aws/config.js +36 -0
  10. package/dist/aws/downloader.d.ts +13 -0
  11. package/dist/aws/downloader.d.ts.map +1 -0
  12. package/dist/aws/downloader.js +115 -0
  13. package/dist/aws/month-lister.d.ts +18 -0
  14. package/dist/aws/month-lister.d.ts.map +1 -0
  15. package/dist/aws/month-lister.js +90 -0
  16. package/dist/commands/batch-info.d.ts +3 -0
  17. package/dist/commands/batch-info.d.ts.map +1 -0
  18. package/dist/commands/batch-info.js +213 -0
  19. package/dist/commands/batch-process.d.ts +3 -0
  20. package/dist/commands/batch-process.d.ts.map +1 -0
  21. package/dist/commands/batch-process.js +557 -0
  22. package/dist/commands/download.d.ts +3 -0
  23. package/dist/commands/download.d.ts.map +1 -0
  24. package/dist/commands/download.js +76 -0
  25. package/dist/commands/index.d.ts +6 -0
  26. package/dist/commands/index.d.ts.map +1 -0
  27. package/dist/commands/index.js +5 -0
  28. package/dist/commands/list.d.ts +3 -0
  29. package/dist/commands/list.d.ts.map +1 -0
  30. package/dist/commands/list.js +18 -0
  31. package/dist/commands/summary.d.ts +3 -0
  32. package/dist/commands/summary.d.ts.map +1 -0
  33. package/dist/commands/summary.js +249 -0
  34. package/dist/index.d.ts +7 -0
  35. package/dist/index.d.ts.map +1 -0
  36. package/dist/index.js +35 -0
  37. package/dist/utils/batches.d.ts +9 -0
  38. package/dist/utils/batches.d.ts.map +1 -0
  39. package/dist/utils/batches.js +61 -0
  40. package/dist/utils/batches.test.d.ts +2 -0
  41. package/dist/utils/batches.test.d.ts.map +1 -0
  42. package/dist/utils/batches.test.js +119 -0
  43. package/dist/utils/default-server.d.ts +3 -0
  44. package/dist/utils/default-server.d.ts.map +1 -0
  45. package/dist/utils/default-server.js +20 -0
  46. package/dist/utils/index.d.ts +5 -0
  47. package/dist/utils/index.d.ts.map +1 -0
  48. package/dist/utils/index.js +5 -0
  49. package/dist/utils/meca-processor.d.ts +28 -0
  50. package/dist/utils/meca-processor.d.ts.map +1 -0
  51. package/dist/utils/meca-processor.js +503 -0
  52. package/dist/utils/meca-processor.test.d.ts +2 -0
  53. package/dist/utils/meca-processor.test.d.ts.map +1 -0
  54. package/dist/utils/meca-processor.test.js +123 -0
  55. package/dist/utils/months.d.ts +36 -0
  56. package/dist/utils/months.d.ts.map +1 -0
  57. package/dist/utils/months.js +135 -0
  58. package/dist/utils/months.test.d.ts +2 -0
  59. package/dist/utils/months.test.d.ts.map +1 -0
  60. package/dist/utils/months.test.js +209 -0
  61. package/dist/utils/requester-pays-error.d.ts +6 -0
  62. package/dist/utils/requester-pays-error.d.ts.map +1 -0
  63. package/dist/utils/requester-pays-error.js +20 -0
  64. package/dist/version.d.ts +3 -0
  65. package/dist/version.d.ts.map +1 -0
  66. package/dist/version.js +2 -0
  67. package/package.json +67 -0
@@ -0,0 +1,557 @@
+ import { Command, Option } from 'commander';
+ import fs from 'fs';
+ import path from 'path';
+ import axios from 'axios';
+ import pLimit from 'p-limit';
+ import { listMonthFiles } from '../aws/month-lister.js';
+ import { downloadFile } from '../aws/downloader.js';
+ import { processMecaFile } from '../utils/meca-processor.js';
+ import { getFolderStructure, removeDuplicateFolders, sortFoldersChronologically, } from 'openrxiv-utils';
+ import { generateMonthRange, parseMonthInput, validateMonthFormat, getDefaultServer, } from '../utils/index.js';
+ import { parseBatchInput, validateBatchFormat } from '../utils/batches.js';
+ import { getBucketName } from '../aws/bucket-explorer.js';
+ export const batchProcessCommand = new Command('batch-process')
+   .description('Batch process MECA files for a given month or batch.')
+   .option('-m, --month <month>', 'Month(s) to process. Supports: YYYY-MM, comma-separated list (2025-01,2025-02), or wildcard pattern (2025-*). If not specified, processes backwards from current month to 2018-12')
+   .option('-b, --batch <batch>', 'Batch to process. Supports: single batch (e.g., "1"), range (e.g., "1-10"), or comma-separated list (e.g., "1,2,3"). Use this for historical content before 2018-12.')
+   .option('-s, --server <server>', 'Server type: biorxiv or medrxiv', getDefaultServer())
+   .option('-l, --limit <number>', 'Maximum number of files to process. If not specified, processes all available files')
+   .option('-a, --api-url <url>', 'API base URL', 'https://openrxiv.csf.now')
+   .addOption(new Option('-k, --api-key <key>', 'API key for authentication (or use OPENRXIV_BATCH_PROCESSING_API_KEY env var)').env('OPENRXIV_BATCH_PROCESSING_API_KEY'))
+   .option('-o, --output <dir>', 'Output directory for extracted files', './batch-extracted')
+   .option('--dry-run', 'List files without processing them', false)
+   .option('--force', 'Force reprocessing of existing files', false)
+   .option('--keep', 'Keep MECA files after processing (default: false)', false)
+   .option('--full-extract', 'Extract entire MECA file instead of selective extraction (default: false)', false)
+   .option('-c, --concurrency <number>', 'Number of files to process concurrently (default: 1)', '1')
+   .option('--max-file-size <size>', 'Skip files larger than this size (e.g., 100MB, 2GB)', '')
+   .option('--aws-bucket <bucket>', 'AWS S3 bucket name (auto-set based on server if not specified)')
+   .option('--aws-region <region>', 'AWS region', 'us-east-1')
+   .option('--check-individual-limit <number>', 'Threshold for individual checking (default: 100)', '100')
+   .action(async (options) => {
+     if (!options.apiKey && !options.dryRun) {
+       console.error('❌ API key is required. Please provide a valid API key using --api-key or set the OPENRXIV_BATCH_PROCESSING_API_KEY environment variable.');
+       process.exit(1);
+     }
+     const response = await axios.get(`${options.apiUrl}/health`).catch((error) => {
+       console.error('❌ API is not healthy. Please check the API URL and API key.');
+       process.exit(1);
+     });
+     if (response.status !== 200) {
+       console.error('❌ API is not healthy. Please check the API URL and API key.');
+       process.exit(1);
+     }
+     try {
+       if (options.batch && options.month) {
+         console.log(`🚀 Starting batch processing for batch: ${options.batch} and month: ${options.month}`);
+       }
+       else if (options.batch) {
+         console.log(`🚀 Starting batch processing for batch: ${options.batch}`);
+       }
+       else if (options.month) {
+         console.log(`🚀 Starting batch processing for month: ${options.month}`);
+       }
+       else {
+         console.log(`🚀 Starting backwards batch processing`);
+       }
+       console.log(`📊 Processing limit: ${options.limit ? `${options.limit} files` : 'all available files'}`);
+       console.log(`🔍 Dry run mode: ${options.dryRun ? 'enabled' : 'disabled'}`);
+       console.log(`⚡ Concurrency: ${options.concurrency} files`);
+       console.log(`🌐 Server: ${options.server}`);
+       if (!options.server) {
+         // Default to biorxiv if no server is specified
+         options.server = getDefaultServer();
+       }
+       if (!['biorxiv', 'medrxiv'].includes(options.server)) {
+         console.error('❌ Invalid server. Please use "biorxiv" or "medrxiv".');
+         process.exit(1);
+       }
+       // Auto-set AWS bucket based on server if not explicitly provided
+       const awsBucket = getBucketName(options.server);
+       console.log(`🪣 AWS Bucket: ${awsBucket}`);
+       // Create output directory
+       if (!fs.existsSync(options.output)) {
+         fs.mkdirSync(options.output, { recursive: true });
+       }
+       // Determine which folders to process
+       let foldersToProcess = [];
+       if (options.month) {
+         try {
+           const monthsToProcess = parseMonthInput(options.month);
+           // Validate all months after wildcard expansion
+           const invalidMonths = monthsToProcess.filter((m) => !validateMonthFormat(m));
+           if (invalidMonths.length > 0) {
+             console.error(`❌ Invalid month format(s): ${invalidMonths.join(', ')}`);
+             console.error('Expected format: YYYY-MM (e.g., 2025-01) or wildcard pattern (e.g., 2025-*)');
+             process.exit(1);
+           }
+           // Convert months to content structures
+           const monthStructures = monthsToProcess.map((month) => getFolderStructure({ month, server: options.server }));
+           foldersToProcess.push(...monthStructures);
+         }
+         catch (error) {
+           console.error(`❌ Error parsing month input: ${error instanceof Error ? error.message : String(error)}`);
+           process.exit(1);
+         }
+       }
+       if (options.batch) {
+         // Process batch(es) - support ranges like "1-10" or comma-separated lists
+         try {
+           const batchesToProcess = parseBatchInput(options.batch);
+           // Validate all batches
+           const invalidBatches = batchesToProcess.filter((b) => !validateBatchFormat(b));
+           if (invalidBatches.length > 0) {
+             console.error(`❌ Invalid batch format(s): ${invalidBatches.join(', ')}`);
+             console.error('Expected format: single batch (e.g., "1"), range (e.g., "1-10"), or comma-separated list (e.g., "1,2,3")');
+             process.exit(1);
+           }
+           // Convert batches to content structures
+           const batchStructures = batchesToProcess.map((batch) => getFolderStructure({ batch, server: options.server }));
+           foldersToProcess.push(...batchStructures);
+         }
+         catch (error) {
+           console.error(`❌ Error parsing batch input: ${error instanceof Error ? error.message : String(error)}`);
+           process.exit(1);
+         }
+       }
+       // Only generate month range if no other folders were specified
+       if (foldersToProcess.length === 0) {
+         // Generate month range and convert to content structures
+         const monthRange = generateMonthRange();
+         const monthStructures = monthRange.map((month) => getFolderStructure({ month, server: options.server }));
+         foldersToProcess.push(...monthStructures);
+       }
+       // Remove duplicates and sort chronologically for all cases
+       const uniqueFolders = removeDuplicateFolders(foldersToProcess);
+       foldersToProcess = sortFoldersChronologically(uniqueFolders);
+       console.log(`🚀 Starting processing for ${foldersToProcess.length} folder(s)`);
+       console.log(`📅 Processing folders: ${foldersToProcess.map((s) => s.batch).join(', ')}`);
+       const allStats = [];
+       for (const folder of foldersToProcess) {
+         const displayName = folder.type === 'back' ? `batch ${folder.batch}` : `month ${folder.batch}`;
+         console.log(`\n📅 Processing ${displayName}`);
+         const result = await processBatch(folder, options);
+         if (!result.success) {
+           console.error(`❌ Failed to process ${displayName}:`, result.error);
+           // Continue with next folder instead of exiting
+           continue;
+         }
+         // Collect statistics
+         if (result.stats) {
+           allStats.push(result.stats);
+         }
+         console.log(`✅ ${displayName} completed successfully`);
+       }
+       // Display summary table
+       if (allStats.length > 0) {
+         console.log('\n📊 Processing Summary');
+         console.log('═'.repeat(80));
+         console.log('Folder'.padEnd(20) +
+           'Total'.padStart(8) +
+           'Processed'.padStart(12) +
+           'New'.padStart(8) +
+           'Cached'.padStart(8) +
+           'Errors'.padStart(8) +
+           'Filtered'.padStart(10));
+         console.log('─'.repeat(80));
+         for (const stats of allStats) {
+           const folderName = stats.folderName.padEnd(20);
+           const total = stats.totalFiles.toString().padStart(8);
+           const processed = stats.totalProcessed.toString().padStart(12);
+           const newlyProcessed = stats.newlyProcessed.toString().padStart(8);
+           const alreadyProcessed = stats.alreadyProcessed.toString().padStart(8);
+           const errors = stats.errors.toString().padStart(8);
+           const filtered = stats.filteredCount.toString().padStart(10);
+           console.log(`${folderName}${total}${processed}${newlyProcessed}${alreadyProcessed}${errors}${filtered}`);
+         }
+         console.log('─'.repeat(80));
+         // Calculate totals
+         const totalFiles = allStats.reduce((sum, stat) => sum + stat.totalFiles, 0);
+         const totalProcessed = allStats.reduce((sum, stat) => sum + stat.totalProcessed, 0);
+         const totalNewlyProcessed = allStats.reduce((sum, stat) => sum + stat.newlyProcessed, 0);
+         const totalAlreadyProcessed = allStats.reduce((sum, stat) => sum + stat.alreadyProcessed, 0);
+         const totalErrors = allStats.reduce((sum, stat) => sum + stat.errors, 0);
+         const totalFiltered = allStats.reduce((sum, stat) => sum + stat.filteredCount, 0);
+         const totalFolderName = 'TOTAL'.padEnd(20);
+         const totalTotal = totalFiles.toString().padStart(8);
+         const totalProcessedStr = totalProcessed.toString().padStart(12);
+         const totalNewlyProcessedStr = totalNewlyProcessed.toString().padStart(8);
+         const totalAlreadyProcessedStr = totalAlreadyProcessed.toString().padStart(8);
+         const totalErrorsStr = totalErrors.toString().padStart(8);
+         const totalFilteredStr = totalFiltered.toString().padStart(10);
+         console.log(`${totalFolderName}${totalTotal}${totalProcessedStr}${totalNewlyProcessedStr}${totalAlreadyProcessedStr}${totalErrorsStr}${totalFilteredStr}`);
+         console.log('═'.repeat(80));
+         // Final summary message
+         if (foldersToProcess.length > 1) {
+           const summaryType = options.month ? 'batch processing' : 'backwards batch processing';
+           console.log(`\n🎉 ${summaryType} completed!`);
+           console.log(`📅 Processed ${foldersToProcess.length} folders`);
+         }
+         else {
+           console.log(`\n🎉 Folder processing completed!`);
+           console.log(`📅 Processed folder: ${foldersToProcess[0].batch}`);
+         }
+       }
+     }
+     catch (error) {
+       console.error('❌ Error in batch processing:', error);
+       process.exit(1);
+     }
+   });
+ /**
+  * Process a single batch or month
+  */
+ async function processBatch(folder, options) {
+   try {
+     // Step 1: List available MECA files for the folder
+     const availableFiles = await listAvailableFiles(folder, options.limit, options);
+     console.log(`📋 Found ${availableFiles.length} available files`);
+     if (availableFiles.length === 0) {
+       console.log('❌ No files found for the specified folder');
+       return { success: false, error: 'No files found' };
+     }
+     // Step 2: Check which files are already processed
+     const processingStatus = await checkProcessingStatus(availableFiles, options.apiUrl, folder, options.checkIndividualLimit);
+     let filesToProcess = options.force
+       ? availableFiles
+       : availableFiles.filter((file) => { var _a; return !((_a = processingStatus[file.s3Key]) === null || _a === void 0 ? void 0 : _a.exists); });
+     // Apply file size filter if specified
+     let filteredCount = 0;
+     if (options.maxFileSize) {
+       const maxSizeBytes = parseFileSize(options.maxFileSize);
+       if (maxSizeBytes === null) {
+         console.error(`❌ Invalid max file size format: ${options.maxFileSize}. Use format like "100MB" or "2GB"`);
+         process.exit(1);
+       }
+       const originalCount = filesToProcess.length;
+       filesToProcess = filesToProcess.filter((file) => file.fileSize <= maxSizeBytes);
+       filteredCount = originalCount - filesToProcess.length;
+       if (filteredCount > 0) {
+         console.log(`📏 File size filter: ${options.maxFileSize} max (${formatFileSize(maxSizeBytes)})`);
+         console.log(`🚫 Skipped ${filteredCount} files larger than ${options.maxFileSize}`);
+         // Show size distribution of remaining files
+         const remainingSizes = filesToProcess.map((f) => f.fileSize);
+         const avgSize = remainingSizes.reduce((a, b) => a + b, 0) / remainingSizes.length;
+         const maxSize = Math.max(...remainingSizes);
+         console.log(`📊 Remaining files: avg ${formatFileSize(avgSize)}, max ${formatFileSize(maxSize)}`);
+       }
+     }
+     console.log(`📊 Files to process: ${filesToProcess.length}`);
+     console.log(`✅ Already processed: ${availableFiles.length - filesToProcess.length}`);
+     // Prepare statistics (for both dry-run and actual processing)
+     const stats = {
+       folderName: folder.batch,
+       totalFiles: availableFiles.length,
+       totalProcessed: availableFiles.length - filesToProcess.length, // already processed
+       newlyProcessed: filesToProcess.length, // files that would be processed (for dry-run) or were processed (for actual)
+       alreadyProcessed: availableFiles.length - filesToProcess.length,
+       errors: 0, // will be updated during actual processing
+       filteredCount: filteredCount,
+     };
+     if (options.dryRun) {
+       console.log('\n📋 Files that would be processed:');
+       filesToProcess.slice(0, 10).forEach((file) => {
+         console.log(` - ${file.s3Key} (${formatFileSize(file.fileSize)}, ${file.lastModified.toLocaleDateString()})`);
+       });
+       if (filesToProcess.length > 10) {
+         console.log(` - ${filesToProcess.length - 10} more files...`);
+       }
+       return { success: true, stats };
+     }
+     // Step 3: Process files with concurrency control
+     let processedCount = 0;
+     let errorCount = 0;
+     const startTime = Date.now();
+     // Create concurrency limiter
+     const limit = pLimit(parseInt(options.concurrency.toString(), 10));
+     console.log(`📦 Processing ${filesToProcess.length} files with concurrency limit of ${options.concurrency}`);
+     // Create array of processing functions
+     const processingFunctions = filesToProcess.map((file) => {
+       return limit(async () => {
+         try {
+           console.log(` 📥 Starting ${file.s3Key}...`);
+           // Download the MECA file first
+           await downloadFile(file.s3Key, {
+             output: options.output,
+             server: options.server,
+           });
+           // Get the local file path
+           const localFilePath = path.join(options.output, path.basename(file.s3Key));
+           // Get API key from command line or environment variable
+           const apiKey = options.apiKey || process.env.OPENRXIV_BATCH_PROCESSING_API_KEY;
+           // Process the MECA file using the utility function
+           const result = await processMecaFile(localFilePath, {
+             batch: file.batch,
+             server: folder.server,
+             apiUrl: options.apiUrl,
+             output: options.output,
+             s3Key: file.s3Key, // Pass the full S3 key for database storage
+             apiKey,
+             selective: !options.fullExtract, // Enable selective extraction unless --full-extract is used
+           });
+           // Clean up files after processing
+           await cleanupFiles(localFilePath, file, options);
+           if (result.success) {
+             console.log(` ✅ Successfully processed: ${file.s3Key}`);
+             return { success: true, file, localFilePath };
+           }
+           else {
+             console.log(` ❌ Failed to process: ${file.s3Key} - ${result.error}`);
+             return { success: false, file, localFilePath, error: result.error };
+           }
+         }
+         catch (error) {
+           console.error(` ❌ Error processing ${file.s3Key}:`, error);
+           const errorMessage = error instanceof Error ? error.message : String(error);
+           return { success: false, file, localFilePath: null, error: errorMessage };
+         }
+       });
+     });
+     // Process all files with concurrency control
+     const results = await Promise.all(processingFunctions);
+     // Process results and cleanup
+     for (const result of results) {
+       if (result && typeof result === 'object' && 'success' in result) {
+         const { success } = result;
+         if (success) {
+           processedCount++;
+         }
+         else {
+           errorCount++;
+         }
+       }
+       else {
+         // Invalid result format
+         errorCount++;
+         console.error(` ❌ Invalid result format:`, result);
+       }
+     }
+     // Show final progress
+     const elapsed = Math.round((Date.now() - startTime) / 1000);
+     const avgTimePerFile = processedCount > 0 ? elapsed / processedCount : 0;
+     console.log(`📊 Processing complete. Progress: ${processedCount}/${filesToProcess.length} (${Math.round((processedCount / filesToProcess.length) * 100)}%)`);
+     console.log(`⏱️ Elapsed: ${elapsed}s, Avg: ${avgTimePerFile.toFixed(1)}s/file`);
+     // Summary
+     console.log(`\n🎉 Batch processing completed!`);
+     console.log(`📊 Total files: ${availableFiles.length}`);
+     console.log(`✅ Successfully processed: ${processedCount}`);
+     if (errorCount > 0) {
+       console.log(`❌ Errors: ${errorCount}`);
+     }
+     console.log(`⏭️ Skipped (already processed): ${availableFiles.length - filesToProcess.length}`);
+     // Show file size filtering summary if any files were filtered
+     if (filteredCount > 0) {
+       console.log(`🚫 Skipped ${filteredCount} files larger than ${options.maxFileSize}`);
+     }
+     // Cleanup summary
+     if (!options.keep) {
+       console.log(`🧹 Cleanup: MECA files and extracted content removed`);
+     }
+     else {
+       console.log(`💾 Cleanup: MECA files and extracted content preserved`);
+     }
+     // Update statistics with actual processing results
+     stats.newlyProcessed = processedCount;
+     stats.totalProcessed = availableFiles.length - filesToProcess.length + processedCount; // already processed + newly processed
+     stats.errors = errorCount;
+     return { success: true, stats };
+   }
+   catch (error) {
+     const errorMessage = error instanceof Error ? error.message : String(error);
+     return { success: false, error: errorMessage };
+   }
+ }
+ /**
+  * Clean up files after processing
+  */
+ async function cleanupFiles(localFilePath, file, options) {
+   if (!localFilePath)
+     return;
+   try {
+     if (!options.keep) {
+       // Remove the downloaded MECA file
+       if (fs.existsSync(localFilePath)) {
+         fs.unlinkSync(localFilePath);
+         console.log(` 🧹 Cleaned up MECA file: ${path.basename(file.s3Key)}`);
+       }
+       // Also clean up any extracted content directory
+       const extractedDir = localFilePath.replace('.meca', '');
+       if (fs.existsSync(extractedDir)) {
+         fs.rmSync(extractedDir, { recursive: true, force: true });
+         console.log(` 🧹 Cleaned up extracted content: ${path.basename(extractedDir)}`);
+       }
+       // Clean up any temporary files that might have been created
+       const tempFiles = [
+         localFilePath + '.tmp',
+         localFilePath + '.download',
+         path.dirname(localFilePath) + '/.temp_' + path.basename(localFilePath),
+       ];
+       for (const tempFile of tempFiles) {
+         if (fs.existsSync(tempFile)) {
+           try {
+             if (fs.statSync(tempFile).isDirectory()) {
+               fs.rmSync(tempFile, { recursive: true, force: true });
+             }
+             else {
+               fs.unlinkSync(tempFile);
+             }
+             console.log(` 🧹 Cleaned up temp file: ${path.basename(tempFile)}`);
+           }
+           catch (tempError) {
+             // Ignore temp file cleanup errors
+           }
+         }
+       }
+     }
+     else {
+       console.log(` 💾 Keeping MECA file: ${path.basename(file.s3Key)}`);
+       // Even when keeping files, clean up extracted content if it's not needed
+       if (!options.keep) {
+         try {
+           const extractedDir = localFilePath.replace('.meca', '');
+           if (fs.existsSync(extractedDir)) {
+             fs.rmSync(extractedDir, { recursive: true, force: true });
+             console.log(` 🧹 Cleaned up extracted content (keeping MECA): ${path.basename(extractedDir)}`);
+           }
+         }
+         catch (cleanupError) {
+           // Ignore extracted content cleanup errors when keeping MECA
+         }
+       }
+     }
+   }
+   catch (cleanupError) {
+     console.warn(` ⚠️ Warning: Could not clean up files for ${file.s3Key}:`, cleanupError);
+   }
+ }
+ async function listAvailableFiles(folder, limit, options) {
+   // If no limit specified, use a very large number to get all files
+   const actualLimit = limit || 999999;
+   return listMonthFiles({
+     month: folder.type === 'current' ? folder.batch : undefined,
+     batch: folder.type === 'back' ? folder.batch : undefined,
+     server: options.server,
+     limit: actualLimit,
+   });
+ }
+ /**
+  * Check the processing status of individual files.
+  *
+  * This is necessary if the list coming back from a large query misses some files.
+  */
+ async function checkIndividualProcessingStatus(files, apiUrl, status) {
+   console.log(' 🔍 Performing individual file status checks...');
+   // Create a concurrency limiter for API requests
+   const limit = pLimit(10);
+   // Create array of checking functions
+   const checkingFunctions = files.map((file) => {
+     return limit(async () => {
+       var _a;
+       try {
+         // Check individual file status using the bucket endpoint
+         const response = await axios.get(`${apiUrl}/v1/bucket?key=${encodeURIComponent(file.s3Key)}`);
+         if (response.status === 200 && response.data) {
+           // File exists and has data
+           status[file.s3Key] = { exists: true, paper: response.data };
+           console.log(` ✅ ${file.s3Key} - Found in database`);
+         }
+         else {
+           // File not found or no data
+           status[file.s3Key] = { exists: false };
+           console.log(` ❌ ${file.s3Key} - Not found in database`);
+         }
+       }
+       catch (error) {
+         if (((_a = error.response) === null || _a === void 0 ? void 0 : _a.status) === 404) {
+           // File not found
+           status[file.s3Key] = { exists: false };
+           console.log(` ❌ ${file.s3Key} - Not found in database (404)`);
+         }
+         else {
+           // Other error - assume not processed
+           status[file.s3Key] = { exists: false };
+           console.log(` ⚠️ ${file.s3Key} - Error checking status: ${error.message}`);
+         }
+       }
+     });
+   });
+   // Execute all checks concurrently
+   await Promise.all(checkingFunctions);
+   const processedCount = Object.values(status).filter((s) => s.exists).length;
+   console.log(` 📊 Individual check complete: ${processedCount}/${files.length} files actually processed`);
+ }
+ async function checkProcessingStatus(files, apiUrl, folder, checkIndividualLimit = 100) {
+   const status = {};
+   const processedFiles = new Set();
+   console.log('🔍 Checking processing status using batch endpoint...');
+   // Use the folder.batch directly instead of trying to extract month from S3 keys
+   const folderParam = folder.batch;
+   let offset = 0;
+   const limit = 1000; // Use the API's default limit
+   let hasMore = true;
+   while (hasMore) {
+     try {
+       const response = await axios.get(`${apiUrl}/v1/bucket/list?folder=${encodeURIComponent(folderParam)}&server=${folder.server}&limit=${limit}&offset=${offset}`);
+       const { items: batchItems, pagination } = response.data;
+       // Mark all files in this batch as processed
+       for (const item of batchItems) {
+         if (item.s3Key) {
+           processedFiles.add(item.s3Key);
+           status[item.s3Key] = { exists: true, paper: item };
+         }
+       }
+       // Check if we have more pages
+       hasMore = pagination.hasMore;
+       offset = pagination.nextOffset || offset + limit;
+       console.log(` 📄 Processed batch page: ${batchItems.length} items (offset: ${pagination.offset})`);
+     }
+     catch (error) {
+       console.warn(`⚠️ Error fetching batch at offset ${offset}:`, error);
+       hasMore = false;
+     }
+   }
+   // Now check which of our requested files exist in the processed set
+   const finalStatus = {};
+   for (const file of files) {
+     if (processedFiles.has(file.s3Key)) {
+       finalStatus[file.s3Key] = status[file.s3Key];
+     }
+     else {
+       finalStatus[file.s3Key] = { exists: false };
+     }
+   }
+   console.log(` ✅ Found ${processedFiles.size} processed items in batch`);
+   console.log(` 📊 Requested files status: ${Object.values(finalStatus).filter((s) => s.exists).length}/${files.length} already processed`);
+   const filesToCheck = files.filter((file) => { var _a; return !((_a = finalStatus[file.s3Key]) === null || _a === void 0 ? void 0 : _a.exists); });
+   // If individual checking is enabled and we have fewer files than the limit, do individual checks
+   if (filesToCheck.length > 0 && filesToCheck.length < checkIndividualLimit) {
+     console.log(`🔍 Individual checking enabled (${filesToCheck.length} files < ${checkIndividualLimit} limit)`);
+     await checkIndividualProcessingStatus(filesToCheck, apiUrl, finalStatus);
+   }
+   return finalStatus;
+ }
+ function parseFileSize(sizeStr) {
+   if (!sizeStr)
+     return null;
+   const match = sizeStr.match(/^(\d+(?:\.\d+)?)\s*(B|KB|MB|GB|TB)$/i);
+   if (!match)
+     return null;
+   const value = parseFloat(match[1]);
+   const unit = match[2].toUpperCase();
+   const multipliers = {
+     B: 1,
+     KB: 1024,
+     MB: 1024 * 1024,
+     GB: 1024 * 1024 * 1024,
+     TB: 1024 * 1024 * 1024 * 1024,
+   };
+   return value * multipliers[unit];
+ }
+ function formatFileSize(bytes) {
+   if (bytes === 0)
+     return '0 B';
+   const k = 1024;
+   const sizes = ['B', 'KB', 'MB', 'GB', 'TB'];
+   const i = Math.floor(Math.log(bytes) / Math.log(k));
+   return parseFloat((bytes / Math.pow(k, i)).toFixed(2)) + ' ' + sizes[i];
+ }
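The two helpers above define how --max-file-size is interpreted: parseFileSize converts a human-readable size into bytes using 1024-based multipliers, and formatFileSize renders bytes back into the largest fitting unit, rounded to two decimals. A minimal sketch of the arithmetic (illustrative values only, not code from the package):

    // parseFileSize: '100MB' -> 100 * 1024 * 1024 = 104857600 bytes
    const maxSizeBytes = parseFileSize('100MB'); // 104857600
    // formatFileSize reverses the mapping, rounding to two decimals:
    formatFileSize(104857600); // '100 MB'
    formatFileSize(1536);      // '1.5 KB'
    // In processBatch, files with fileSize > maxSizeBytes are filtered out
    // before download, so a 200MB MECA file would be skipped under this limit.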
@@ -0,0 +1,3 @@
+ import { Command } from 'commander';
+ export declare const downloadCommand: Command;
+ //# sourceMappingURL=download.d.ts.map
@@ -0,0 +1 @@
+ {"version":3,"file":"download.d.ts","sourceRoot":"","sources":["../../src/commands/download.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AAMpC,eAAO,MAAM,eAAe,SA0ExB,CAAC"}
@@ -0,0 +1,76 @@
+ import { Command } from 'commander';
+ import axios from 'axios';
+ import { downloadFile } from '../aws/downloader.js';
+ import { setGlobalRequesterPays } from '../aws/config.js';
+ import { displayRequesterPaysError } from '../utils/requester-pays-error.js';
+ export const downloadCommand = new Command('download')
+   .description('Download MECA files from the bioRxiv/medRxiv S3 bucket by DOI')
+   .argument('<doi>', 'DOI of the paper (e.g., "10.1101/2024.01.15.123456")')
+   .option('-o, --output <dir>', 'Output directory for downloaded files', './downloads')
+   .option('-a, --api-url <url>', 'API base URL', 'https://openrxiv.csf.now')
+   .option('--requester-pays', 'Enable requester-pays for S3 bucket access')
+   .action(async (doi, options) => {
+     var _a, _b, _c;
+     try {
+       // Validate DOI format
+       if (!doi.includes('/')) {
+         console.error('❌ Invalid DOI format. Expected format: 10.1101/2024.01.15.123456');
+         process.exit(1);
+       }
+       // Split DOI into prefix and suffix
+       const [doiPrefix, doiSuffix] = doi.split('/', 2);
+       console.log(`🔍 Looking up paper with DOI: ${doi}`);
+       console.log(`📡 API URL: ${options.apiUrl}`);
+       // Look up the paper in the API
+       const response = await axios.get(`${options.apiUrl}/v1/works/${doiPrefix}/${doiSuffix}`);
+       if (!response.data || !response.data.s3Key) {
+         console.error('❌ Paper not found or no S3 key available');
+         process.exit(1);
+       }
+       const paper = response.data;
+       console.log(`📄 Found paper: ${paper.title || 'Unknown title'}`);
+       console.log(`📦 S3 Key: ${paper.s3Key}`);
+       // Set requester-pays if flag is provided
+       if (options.requesterPays) {
+         setGlobalRequesterPays(true);
+         console.log(`💰 Requester-pays enabled for S3 access`);
+       }
+       // Create a filesystem-safe filename from the DOI
+       const safeDoi = doi.replace(/[^a-zA-Z0-9.-]/g, '_');
+       const filename = `${safeDoi}.meca`;
+       console.log(`📥 Downloading MECA file as: ${filename}`);
+       // Download the file using the S3 key from the API
+       try {
+         await downloadFile(paper.s3Key, { ...options, filename });
+         console.log(`✅ Successfully downloaded MECA file for DOI: ${doi}`);
+       }
+       catch (downloadError) {
+         // Check if it's a requester-pays related error
+         const errorMessage = downloadError instanceof Error ? downloadError.message : String(downloadError);
+         if (errorMessage.includes('UnknownError') || errorMessage.includes('AccessDenied')) {
+           displayRequesterPaysError();
+         }
+         else {
+           console.error('❌ Download failed:', errorMessage);
+         }
+         process.exit(1);
+       }
+     }
+     catch (error) {
+       if (axios.isAxiosError(error)) {
+         if (((_a = error.response) === null || _a === void 0 ? void 0 : _a.status) === 404) {
+           console.error('❌ Article not found with the specified DOI');
+         }
+         else if (((_b = error.response) === null || _b === void 0 ? void 0 : _b.status) === 401) {
+           console.error('❌ Authentication failed. Please check your API key');
+         }
+         else {
+           console.error('❌ API error:', ((_c = error.response) === null || _c === void 0 ? void 0 : _c.data) || error.message);
+         }
+       }
+       else {
+         console.error('❌ Error looking up paper:', error);
+       }
+       process.exit(1);
+     }
+   });
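One detail worth noting in the download command: the local filename is derived from the DOI by replacing every character outside [a-zA-Z0-9.-] with an underscore, so the slash in a DOI never becomes a path separator. A small illustration using the example DOI from the command's own help text (illustration only, not code from the package):

    const doi = '10.1101/2024.01.15.123456';
    const safeDoi = doi.replace(/[^a-zA-Z0-9.-]/g, '_');
    const filename = `${safeDoi}.meca`;
    console.log(filename); // '10.1101_2024.01.15.123456.meca'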
@@ -0,0 +1,6 @@
+ export * from './list.js';
+ export * from './download.js';
+ export * from './summary.js';
+ export * from './batch-info.js';
+ export * from './batch-process.js';
+ //# sourceMappingURL=index.d.ts.map
@@ -0,0 +1 @@
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/commands/index.ts"],"names":[],"mappings":"AAAA,cAAc,WAAW,CAAC;AAC1B,cAAc,eAAe,CAAC;AAC9B,cAAc,cAAc,CAAC;AAC7B,cAAc,iBAAiB,CAAC;AAChC,cAAc,oBAAoB,CAAC"}
@@ -0,0 +1,5 @@
+ export * from './list.js';
+ export * from './download.js';
+ export * from './summary.js';
+ export * from './batch-info.js';
+ export * from './batch-process.js';
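The barrel above re-exports each subcommand as a preconfigured commander Command, so a host CLI only needs to register them. A minimal sketch of that wiring (the program name and surrounding setup are assumptions for illustration, not taken from the package):

    import { Command } from 'commander';
    import { listCommand, downloadCommand, batchProcessCommand } from './commands/index.js';

    const program = new Command('openrxiv'); // hypothetical program name
    // Each export is a fully configured commander Command instance.
    program.addCommand(listCommand);
    program.addCommand(downloadCommand);
    program.addCommand(batchProcessCommand);
    program.parse(process.argv);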
@@ -0,0 +1,3 @@
+ import { Command } from 'commander';
+ export declare const listCommand: Command;
+ //# sourceMappingURL=list.d.ts.map
@@ -0,0 +1 @@
+ {"version":3,"file":"list.d.ts","sourceRoot":"","sources":["../../src/commands/list.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AAIpC,eAAO,MAAM,WAAW,SAapB,CAAC"}