openrxiv-cli 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/api/api-client.d.ts +96 -0
- package/dist/api/api-client.d.ts.map +1 -0
- package/dist/api/api-client.js +257 -0
- package/dist/aws/bucket-explorer.d.ts +26 -0
- package/dist/aws/bucket-explorer.d.ts.map +1 -0
- package/dist/aws/bucket-explorer.js +220 -0
- package/dist/aws/config.d.ts +5 -0
- package/dist/aws/config.d.ts.map +1 -0
- package/dist/aws/config.js +36 -0
- package/dist/aws/downloader.d.ts +13 -0
- package/dist/aws/downloader.d.ts.map +1 -0
- package/dist/aws/downloader.js +115 -0
- package/dist/aws/month-lister.d.ts +18 -0
- package/dist/aws/month-lister.d.ts.map +1 -0
- package/dist/aws/month-lister.js +90 -0
- package/dist/commands/batch-info.d.ts +3 -0
- package/dist/commands/batch-info.d.ts.map +1 -0
- package/dist/commands/batch-info.js +213 -0
- package/dist/commands/batch-process.d.ts +3 -0
- package/dist/commands/batch-process.d.ts.map +1 -0
- package/dist/commands/batch-process.js +557 -0
- package/dist/commands/download.d.ts +3 -0
- package/dist/commands/download.d.ts.map +1 -0
- package/dist/commands/download.js +76 -0
- package/dist/commands/index.d.ts +6 -0
- package/dist/commands/index.d.ts.map +1 -0
- package/dist/commands/index.js +5 -0
- package/dist/commands/list.d.ts +3 -0
- package/dist/commands/list.d.ts.map +1 -0
- package/dist/commands/list.js +18 -0
- package/dist/commands/summary.d.ts +3 -0
- package/dist/commands/summary.d.ts.map +1 -0
- package/dist/commands/summary.js +249 -0
- package/dist/index.d.ts +7 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +35 -0
- package/dist/utils/batches.d.ts +9 -0
- package/dist/utils/batches.d.ts.map +1 -0
- package/dist/utils/batches.js +61 -0
- package/dist/utils/batches.test.d.ts +2 -0
- package/dist/utils/batches.test.d.ts.map +1 -0
- package/dist/utils/batches.test.js +119 -0
- package/dist/utils/default-server.d.ts +3 -0
- package/dist/utils/default-server.d.ts.map +1 -0
- package/dist/utils/default-server.js +20 -0
- package/dist/utils/index.d.ts +5 -0
- package/dist/utils/index.d.ts.map +1 -0
- package/dist/utils/index.js +5 -0
- package/dist/utils/meca-processor.d.ts +28 -0
- package/dist/utils/meca-processor.d.ts.map +1 -0
- package/dist/utils/meca-processor.js +503 -0
- package/dist/utils/meca-processor.test.d.ts +2 -0
- package/dist/utils/meca-processor.test.d.ts.map +1 -0
- package/dist/utils/meca-processor.test.js +123 -0
- package/dist/utils/months.d.ts +36 -0
- package/dist/utils/months.d.ts.map +1 -0
- package/dist/utils/months.js +135 -0
- package/dist/utils/months.test.d.ts +2 -0
- package/dist/utils/months.test.d.ts.map +1 -0
- package/dist/utils/months.test.js +209 -0
- package/dist/utils/requester-pays-error.d.ts +6 -0
- package/dist/utils/requester-pays-error.d.ts.map +1 -0
- package/dist/utils/requester-pays-error.js +20 -0
- package/dist/version.d.ts +3 -0
- package/dist/version.d.ts.map +1 -0
- package/dist/version.js +2 -0
- package/package.json +67 -0
package/dist/commands/batch-process.js
@@ -0,0 +1,557 @@
+import { Command, Option } from 'commander';
+import fs from 'fs';
+import path from 'path';
+import axios from 'axios';
+import pLimit from 'p-limit';
+import { listMonthFiles } from '../aws/month-lister.js';
+import { downloadFile } from '../aws/downloader.js';
+import { processMecaFile } from '../utils/meca-processor.js';
+import { getFolderStructure, removeDuplicateFolders, sortFoldersChronologically, } from 'openrxiv-utils';
+import { generateMonthRange, parseMonthInput, validateMonthFormat, getDefaultServer, } from '../utils/index.js';
+import { parseBatchInput, validateBatchFormat } from '../utils/batches.js';
+import { getBucketName } from '../aws/bucket-explorer.js';
+export const batchProcessCommand = new Command('batch-process')
+    .description('Batch process MECA files for a given month or batch.')
+    .option('-m, --month <month>', 'Month(s) to process. Supports: YYYY-MM, comma-separated list (2025-01,2025-02), or wildcard pattern (2025-*). If not specified, processes backwards from current month to 2018-12')
+    .option('-b, --batch <batch>', 'Batch to process. Supports: single batch (e.g., "1"), range (e.g., "1-10"), or comma-separated list (e.g., "1,2,3"). Use this for historical content before 2018-12.')
+    .option('-s, --server <server>', 'Server type: biorxiv or medrxiv', getDefaultServer())
+    .option('-l, --limit <number>', 'Maximum number of files to process. If not specified, processes all available files')
+    .option('-a, --api-url <url>', 'API base URL', 'https://openrxiv.csf.now')
+    .addOption(new Option('-k, --api-key <key>', 'API key for authentication (or use OPENRXIV_BATCH_PROCESSING_API_KEY env var)').env('OPENRXIV_BATCH_PROCESSING_API_KEY'))
+    .option('-o, --output <dir>', 'Output directory for extracted files', './batch-extracted')
+    .option('--dry-run', 'List files without processing them', false)
+    .option('--force', 'Force reprocessing of existing files', false)
+    .option('--keep', 'Keep MECA files after processing (default: false)', false)
+    .option('--full-extract', 'Extract entire MECA file instead of selective extraction (default: false)', false)
+    .option('-c, --concurrency <number>', 'Number of files to process concurrently (default: 1)', '1')
+    .option('--max-file-size <size>', 'Skip files larger than this size (e.g., 100MB, 2GB)', '')
+    .option('--aws-bucket <bucket>', 'AWS S3 bucket name (auto-set based on server if not specified)')
+    .option('--aws-region <region>', 'AWS region', 'us-east-1')
+    .option('--check-individual-limit <number>', 'Threshold for individual checking (default: 100)', '100')
+    .action(async (options) => {
+    if (!options.apiKey && !options.dryRun) {
+        console.error('❌ API key is required. Please provide a valid API key using --api-key or set the OPENRXIV_BATCH_PROCESSING_API_KEY environment variable.');
+        process.exit(1);
+    }
+    const response = await axios.get(`${options.apiUrl}/health`).catch((error) => {
+        console.error('❌ API is not healthy. Please check the API URL and API key.');
+        process.exit(1);
+    });
+    if (response.status !== 200) {
+        console.error('❌ API is not healthy. Please check the API URL and API key.');
+        process.exit(1);
+    }
+    try {
+        if (options.batch && options.month) {
+            console.log(`🚀 Starting batch processing for batch: ${options.batch} and month: ${options.month}`);
+        }
+        else if (options.batch) {
+            console.log(`🚀 Starting batch processing for batch: ${options.batch}`);
+        }
+        else if (options.month) {
+            console.log(`🚀 Starting batch processing for month: ${options.month}`);
+        }
+        else {
+            console.log(`🚀 Starting backwards batch processing`);
+        }
+        console.log(`📊 Processing limit: ${options.limit ? `${options.limit} files` : 'all available files'}`);
+        console.log(`🔍 Dry run mode: ${options.dryRun ? 'enabled' : 'disabled'}`);
+        console.log(`⚡ Concurrency: ${options.concurrency} files`);
+        console.log(`🌐 Server: ${options.server}`);
+        if (!options.server) {
+            // Default to biorxiv if no server is specified
+            options.server = getDefaultServer();
+        }
+        if (!['biorxiv', 'medrxiv'].includes(options.server)) {
+            console.error('❌ Invalid server. Please use "biorxiv" or "medrxiv".');
+            process.exit(1);
+        }
+        // Auto-set AWS bucket based on server if not explicitly provided
+        const awsBucket = getBucketName(options.server);
+        console.log(`🪣 AWS Bucket: ${awsBucket}`);
+        // Create output directory
+        if (!fs.existsSync(options.output)) {
+            fs.mkdirSync(options.output, { recursive: true });
+        }
+        // Determine which folders to process
+        let foldersToProcess = [];
+        if (options.month) {
+            try {
+                const monthsToProcess = parseMonthInput(options.month);
+                // Validate all months after wildcard expansion
+                const invalidMonths = monthsToProcess.filter((m) => !validateMonthFormat(m));
+                if (invalidMonths.length > 0) {
+                    console.error(`❌ Invalid month format(s): ${invalidMonths.join(', ')}`);
+                    console.error('Expected format: YYYY-MM (e.g., 2025-01) or wildcard pattern (e.g., 2025-*)');
+                    process.exit(1);
+                }
+                // Convert months to content structures
+                const monthStructures = monthsToProcess.map((month) => getFolderStructure({ month, server: options.server }));
+                foldersToProcess.push(...monthStructures);
+            }
+            catch (error) {
+                console.error(`❌ Error parsing month input: ${error instanceof Error ? error.message : String(error)}`);
+                process.exit(1);
+            }
+        }
+        if (options.batch) {
+            // Process batch(es) - support ranges like "1-10" or comma-separated lists
+            try {
+                const batchesToProcess = parseBatchInput(options.batch);
+                // Validate all batches
+                const invalidBatches = batchesToProcess.filter((b) => !validateBatchFormat(b));
+                if (invalidBatches.length > 0) {
+                    console.error(`❌ Invalid batch format(s): ${invalidBatches.join(', ')}`);
+                    console.error('Expected format: single batch (e.g., "1"), range (e.g., "1-10"), or comma-separated list (e.g., "1,2,3")');
+                    process.exit(1);
+                }
+                // Convert batches to content structures
+                const batchStructures = batchesToProcess.map((batch) => getFolderStructure({ batch, server: options.server }));
+                foldersToProcess.push(...batchStructures);
+            }
+            catch (error) {
+                console.error(`❌ Error parsing batch input: ${error instanceof Error ? error.message : String(error)}`);
+                process.exit(1);
+            }
+        }
+        // Only generate month range if no other folders were specified
+        if (foldersToProcess.length === 0) {
+            // Generate month range and convert to content structures
+            const monthRange = generateMonthRange();
+            const monthStructures = monthRange.map((month) => getFolderStructure({ month, server: options.server }));
+            foldersToProcess.push(...monthStructures);
+        }
+        // Remove duplicates and sort chronologically for all cases
+        const uniqueFolders = removeDuplicateFolders(foldersToProcess);
+        foldersToProcess = sortFoldersChronologically(uniqueFolders);
+        console.log(`🚀 Starting processing for ${foldersToProcess.length} folders(s)`);
+        console.log(`📅 Processing folders: ${foldersToProcess.map((s) => s.batch).join(', ')}`);
+        const allStats = [];
+        for (const folder of foldersToProcess) {
+            const displayName = folder.type === 'back' ? `batch ${folder.batch}` : `month ${folder.batch}`;
+            console.log(`\n📅 Processing ${displayName}`);
+            const result = await processBatch(folder, options);
+            if (!result.success) {
+                console.error(`❌ Failed to process ${displayName}:`, result.error);
+                // Continue with next folder instead of exiting
+                continue;
+            }
+            // Collect statistics
+            if (result.stats) {
+                allStats.push(result.stats);
+            }
+            console.log(`✅ ${displayName} completed successfully`);
+        }
+        // Display summary table
+        if (allStats.length > 0) {
+            console.log('\n📊 Processing Summary');
+            console.log('═'.repeat(80));
+            console.log('Folder'.padEnd(20) +
+                'Total'.padStart(8) +
+                'Processed'.padStart(12) +
+                'New'.padStart(8) +
+                'Cached'.padStart(8) +
+                'Errors'.padStart(8) +
+                'Filtered'.padStart(10));
+            console.log('─'.repeat(80));
+            for (const stats of allStats) {
+                const folderName = stats.folderName.padEnd(20);
+                const total = stats.totalFiles.toString().padStart(8);
+                const processed = stats.totalProcessed.toString().padStart(12);
+                const newlyProcessed = stats.newlyProcessed.toString().padStart(8);
+                const alreadyProcessed = stats.alreadyProcessed.toString().padStart(8);
+                const errors = stats.errors.toString().padStart(8);
+                const filtered = stats.filteredCount.toString().padStart(10);
+                console.log(`${folderName}${total}${processed}${newlyProcessed}${alreadyProcessed}${errors}${filtered}`);
+            }
+            console.log('─'.repeat(80));
+            // Calculate totals
+            const totalFiles = allStats.reduce((sum, stat) => sum + stat.totalFiles, 0);
+            const totalProcessed = allStats.reduce((sum, stat) => sum + stat.totalProcessed, 0);
+            const totalNewlyProcessed = allStats.reduce((sum, stat) => sum + stat.newlyProcessed, 0);
+            const totalAlreadyProcessed = allStats.reduce((sum, stat) => sum + stat.alreadyProcessed, 0);
+            const totalErrors = allStats.reduce((sum, stat) => sum + stat.errors, 0);
+            const totalFiltered = allStats.reduce((sum, stat) => sum + stat.filteredCount, 0);
+            const totalFolderName = 'TOTAL'.padEnd(20);
+            const totalTotal = totalFiles.toString().padStart(8);
+            const totalProcessedStr = totalProcessed.toString().padStart(12);
+            const totalNewlyProcessedStr = totalNewlyProcessed.toString().padStart(8);
+            const totalAlreadyProcessedStr = totalAlreadyProcessed.toString().padStart(8);
+            const totalErrorsStr = totalErrors.toString().padStart(8);
+            const totalFilteredStr = totalFiltered.toString().padStart(10);
+            console.log(`${totalFolderName}${totalTotal}${totalProcessedStr}${totalNewlyProcessedStr}${totalAlreadyProcessedStr}${totalErrorsStr}${totalFilteredStr}`);
+            console.log('═'.repeat(80));
+            // Final summary message
+            if (foldersToProcess.length > 1) {
+                const summaryType = options.month ? 'batch processing' : 'backwards batch processing';
+                console.log(`\n🎉 ${summaryType} completed!`);
+                console.log(`📅 Processed ${foldersToProcess.length} folders`);
+            }
+            else {
+                console.log(`\n🎉 Folder processing completed!`);
+                console.log(`📅 Processed folder: ${foldersToProcess[0].batch}`);
+            }
+        }
+    }
+    catch (error) {
+        console.error('❌ Error in batch processing:', error);
+        process.exit(1);
+    }
+});
+/**
+ * Process a single batch or month
+ */
+async function processBatch(folder, options) {
+    try {
+        // Step 1: List available MECA files for the folder
+        const availableFiles = await listAvailableFiles(folder, options.limit, options);
+        console.log(`📋 Found ${availableFiles.length} available files`);
+        if (availableFiles.length === 0) {
+            console.log('❌ No files found for the specified folder');
+            return { success: false, error: 'No files found' };
+        }
+        // Step 2: Check which files are already processed
+        const processingStatus = await checkProcessingStatus(availableFiles, options.apiUrl, folder, options.checkIndividualLimit);
+        let filesToProcess = options.force
+            ? availableFiles
+            : availableFiles.filter((file) => { var _a; return !((_a = processingStatus[file.s3Key]) === null || _a === void 0 ? void 0 : _a.exists); });
+        // Apply file size filter if specified
+        let filteredCount = 0;
+        if (options.maxFileSize) {
+            const maxSizeBytes = parseFileSize(options.maxFileSize);
+            if (maxSizeBytes === null) {
+                console.error(`❌ Invalid max file size format: ${options.maxFileSize}. Use format like "100MB" or "2GB"`);
+                process.exit(1);
+            }
+            const originalCount = filesToProcess.length;
+            filesToProcess = filesToProcess.filter((file) => file.fileSize <= maxSizeBytes);
+            filteredCount = originalCount - filesToProcess.length;
+            if (filteredCount > 0) {
+                console.log(`📏 File size filter: ${options.maxFileSize} max (${formatFileSize(maxSizeBytes)})`);
+                console.log(`🚫 Skipped ${filteredCount} files larger than ${options.maxFileSize}`);
+                // Show size distribution of remaining files
+                const remainingSizes = filesToProcess.map((f) => f.fileSize);
+                const avgSize = remainingSizes.reduce((a, b) => a + b, 0) / remainingSizes.length;
+                const maxSize = Math.max(...remainingSizes);
+                console.log(`📊 Remaining files: avg ${formatFileSize(avgSize)}, max ${formatFileSize(maxSize)}`);
+            }
+        }
+        console.log(`📊 Files to process: ${filesToProcess.length}`);
+        console.log(`✅ Already processed: ${availableFiles.length - filesToProcess.length}`);
+        // Prepare statistics (for both dry-run and actual processing)
+        const stats = {
+            folderName: folder.batch,
+            totalFiles: availableFiles.length,
+            totalProcessed: availableFiles.length - filesToProcess.length, // already processed
+            newlyProcessed: filesToProcess.length, // files that would be processed (for dry-run) or were processed (for actual)
+            alreadyProcessed: availableFiles.length - filesToProcess.length,
+            errors: 0, // will be updated during actual processing
+            filteredCount: filteredCount,
+        };
+        if (options.dryRun) {
+            console.log('\n📋 Files that would be processed:');
+            filesToProcess.slice(0, 10).forEach((file) => {
+                console.log(` - ${file.s3Key} (${formatFileSize(file.fileSize)}, ${file.lastModified.toLocaleDateString()})`);
+            });
+            if (filesToProcess.length > 10) {
+                console.log(` - ${filesToProcess.length - 10} more files...`);
+            }
+            return { success: true, stats };
+        }
+        // Step 3: Process files with concurrency control
+        let processedCount = 0;
+        let errorCount = 0;
+        const startTime = Date.now();
+        // Create concurrency limiter
+        const limit = pLimit(parseInt(options.concurrency.toString(), 10));
+        console.log(`📦 Processing ${filesToProcess.length} files with concurrency limit of ${options.concurrency}`);
+        // Create array of processing functions
+        const processingFunctions = filesToProcess.map((file) => {
+            return limit(async () => {
+                try {
+                    console.log(` 📥 Starting ${file.s3Key}...`);
+                    // Download the MECA file first
+                    await downloadFile(file.s3Key, {
+                        output: options.output,
+                        server: options.server,
+                    });
+                    // Get the local file path
+                    const localFilePath = path.join(options.output, path.basename(file.s3Key));
+                    // Get API key from command line or environment variable
+                    const apiKey = options.apiKey || process.env.OPENRXIV_BATCH_PROCESSING_API_KEY;
+                    // Process the MECA file using the utility function
+                    const result = await processMecaFile(localFilePath, {
+                        batch: file.batch,
+                        server: folder.server,
+                        apiUrl: options.apiUrl,
+                        output: options.output,
+                        s3Key: file.s3Key, // Pass the full S3 key for database storage
+                        apiKey,
+                        selective: !options.fullExtract, // Enable selective extraction unless --full-extract is used
+                    });
+                    // Clean up files after processing
+                    await cleanupFiles(localFilePath, file, options);
+                    if (result.success) {
+                        console.log(` ✅ Successfully processed: ${file.s3Key}`);
+                        return { success: true, file, localFilePath };
+                    }
+                    else {
+                        console.log(` ❌ Failed to process: ${file.s3Key} - ${result.error}`);
+                        return { success: false, file, localFilePath, error: result.error };
+                    }
+                }
+                catch (error) {
+                    console.error(` ❌ Error processing ${file.s3Key}:`, error);
+                    const errorMessage = error instanceof Error ? error.message : String(error);
+                    return { success: false, file, localFilePath: null, error: errorMessage };
+                }
+            });
+        });
+        // Process all files with concurrency control
+        const results = await Promise.all(processingFunctions);
+        // Process results and cleanup
+        for (const result of results) {
+            if (result && typeof result === 'object' && 'success' in result) {
+                const { success } = result;
+                if (success) {
+                    processedCount++;
+                }
+                else {
+                    errorCount++;
+                }
+            }
+            else {
+                // Invalid result format
+                errorCount++;
+                console.error(` ❌ Invalid result format:`, result);
+            }
+        }
+        // Show final progress
+        const elapsed = Math.round((Date.now() - startTime) / 1000);
+        const avgTimePerFile = processedCount > 0 ? elapsed / processedCount : 0;
+        console.log(`📊 Processing complete. Progress: ${processedCount}/${filesToProcess.length} (${Math.round((processedCount / filesToProcess.length) * 100)}%)`);
+        console.log(`⏱️ Elapsed: ${elapsed}s, Avg: ${avgTimePerFile.toFixed(1)}s/file`);
+        // Summary
+        console.log(`\n🎉 Batch processing completed!`);
+        console.log(`📊 Total files: ${availableFiles.length}`);
+        console.log(`✅ Successfully processed: ${processedCount}`);
+        if (errorCount > 0) {
+            console.log(`❌ Errors: ${errorCount}`);
+        }
+        console.log(`⏭️ Skipped (already processed): ${availableFiles.length - filesToProcess.length}`);
+        // Show file size filtering summary if any files were filtered
+        if (filteredCount > 0) {
+            console.log(`🚫 Skipped ${filteredCount} files larger than ${options.maxFileSize}`);
+        }
+        // Cleanup summary
+        if (!options.keep) {
+            console.log(`🧹 Cleanup: MECA files and extracted content removed`);
+        }
+        else {
+            console.log(`💾 Cleanup: MECA files and extracted content preserved`);
+        }
+        // Update statistics with actual processing results
+        stats.newlyProcessed = processedCount;
+        stats.totalProcessed = availableFiles.length - filesToProcess.length + processedCount; // already processed + newly processed
+        stats.errors = errorCount;
+        return { success: true, stats };
+    }
+    catch (error) {
+        const errorMessage = error instanceof Error ? error.message : String(error);
+        return { success: false, error: errorMessage };
+    }
+}
+/**
+ * Clean up files after processing
+ */
+async function cleanupFiles(localFilePath, file, options) {
+    if (!localFilePath)
+        return;
+    try {
+        if (!options.keep) {
+            // Remove the downloaded MECA file
+            if (fs.existsSync(localFilePath)) {
+                fs.unlinkSync(localFilePath);
+                console.log(` 🧹 Cleaned up MECA file: ${path.basename(file.s3Key)}`);
+            }
+            // Also clean up any extracted content directory
+            const extractedDir = localFilePath.replace('.meca', '');
+            if (fs.existsSync(extractedDir)) {
+                fs.rmSync(extractedDir, { recursive: true, force: true });
+                console.log(` 🧹 Cleaned up extracted content: ${path.basename(extractedDir)}`);
+            }
+            // Clean up any temporary files that might have been created
+            const tempFiles = [
+                localFilePath + '.tmp',
+                localFilePath + '.download',
+                path.dirname(localFilePath) + '/.temp_' + path.basename(localFilePath),
+            ];
+            for (const tempFile of tempFiles) {
+                if (fs.existsSync(tempFile)) {
+                    try {
+                        if (fs.statSync(tempFile).isDirectory()) {
+                            fs.rmSync(tempFile, { recursive: true, force: true });
+                        }
+                        else {
+                            fs.unlinkSync(tempFile);
+                        }
+                        console.log(` 🧹 Cleaned up temp file: ${path.basename(tempFile)}`);
+                    }
+                    catch (tempError) {
+                        // Ignore temp file cleanup errors
+                    }
+                }
+            }
+        }
+        else {
+            console.log(` 💾 Keeping MECA file: ${path.basename(file.s3Key)}`);
+            // Even when keeping files, clean up extracted content if it's not needed
+            if (!options.keep) {
+                try {
+                    const extractedDir = localFilePath.replace('.meca', '');
+                    if (fs.existsSync(extractedDir)) {
+                        fs.rmSync(extractedDir, { recursive: true, force: true });
+                        console.log(` 🧹 Cleaned up extracted content (keeping MECA): ${path.basename(extractedDir)}`);
+                    }
+                }
+                catch (cleanupError) {
+                    // Ignore extracted content cleanup errors when keeping MECA
+                }
+            }
+        }
+    }
+    catch (cleanupError) {
+        console.warn(` ⚠️ Warning: Could not clean up files for ${file.s3Key}:`, cleanupError);
+    }
+}
+async function listAvailableFiles(folder, limit, options) {
+    // If no limit specified, use a very large number to get all files
+    const actualLimit = limit || 999999;
+    return listMonthFiles({
+        month: folder.type === 'current' ? folder.batch : undefined,
+        batch: folder.type === 'back' ? folder.batch : undefined,
+        server: options.server,
+        limit: actualLimit,
+    });
+}
+/**
+ * Check the processing status of individual files.
+ *
+ * This is necessary if the list coming back from a large query misses some files.
+ */
+async function checkIndividualProcessingStatus(files, apiUrl, status) {
+    console.log(' 🔍 Performing individual file status checks...');
+    // Create a concurrency limiter for API requests
+    const limit = pLimit(10);
+    // Create array of checking functions
+    const checkingFunctions = files.map((file) => {
+        return limit(async () => {
+            var _a;
+            try {
+                // Check individual file status using the bucket endpoint
+                const response = await axios.get(`${apiUrl}/v1/bucket?key=${encodeURIComponent(file.s3Key)}`);
+                if (response.status === 200 && response.data) {
+                    // File exists and has data
+                    status[file.s3Key] = { exists: true, paper: response.data };
+                    console.log(` ✅ ${file.s3Key} - Found in database`);
+                }
+                else {
+                    // File not found or no data
+                    status[file.s3Key] = { exists: false };
+                    console.log(` ❌ ${file.s3Key} - Not found in database`);
+                }
+            }
+            catch (error) {
+                if (((_a = error.response) === null || _a === void 0 ? void 0 : _a.status) === 404) {
+                    // File not found
+                    status[file.s3Key] = { exists: false };
+                    console.log(` ❌ ${file.s3Key} - Not found in database (404)`);
+                }
+                else {
+                    // Other error - assume not processed
+                    status[file.s3Key] = { exists: false };
+                    console.log(` ⚠️ ${file.s3Key} - Error checking status: ${error.message}`);
+                }
+            }
+        });
+    });
+    // Execute all checks concurrently
+    await Promise.all(checkingFunctions);
+    const processedCount = Object.values(status).filter((s) => s.exists).length;
+    console.log(` 📊 Individual check complete: ${processedCount}/${files.length} files actually processed`);
+}
+async function checkProcessingStatus(files, apiUrl, folder, checkIndividualLimit = 100) {
+    const status = {};
+    const processedFiles = new Set();
+    console.log('🔍 Checking processing status using batch endpoint...');
+    // Use the folder.batch directly instead of trying to extract month from S3 keys
+    const folderParam = folder.batch;
+    let offset = 0;
+    const limit = 1000; // Use the API's default limit
+    let hasMore = true;
+    while (hasMore) {
+        try {
+            const response = await axios.get(`${apiUrl}/v1/bucket/list?folder=${encodeURIComponent(folderParam)}&server=${folder.server}&limit=${limit}&offset=${offset}`);
+            const { items: batchItems, pagination } = response.data;
+            // Mark all files in this batch as processed
+            for (const item of batchItems) {
+                if (item.s3Key) {
+                    processedFiles.add(item.s3Key);
+                    status[item.s3Key] = { exists: true, paper: item };
+                }
+            }
+            // Check if we have more pages
+            hasMore = pagination.hasMore;
+            offset = pagination.nextOffset || offset + limit;
+            console.log(` 📄 Processed batch page: ${batchItems.length} items (offset: ${pagination.offset})`);
+        }
+        catch (error) {
+            console.warn(`⚠️ Error fetching batch at offset ${offset}:`, error);
+            hasMore = false;
+        }
+    }
+    // Now check which of our requested files exist in the processed set
+    const finalStatus = {};
+    for (const file of files) {
+        if (processedFiles.has(file.s3Key)) {
+            finalStatus[file.s3Key] = status[file.s3Key];
+        }
+        else {
+            finalStatus[file.s3Key] = { exists: false };
+        }
+    }
+    console.log(` ✅ Found ${processedFiles.size} processed items in batch`);
+    console.log(` 📊 Requested files status: ${Object.values(finalStatus).filter((s) => s.exists).length}/${files.length} already processed`);
+    const filesToCheck = files.filter((file) => { var _a; return !((_a = finalStatus[file.s3Key]) === null || _a === void 0 ? void 0 : _a.exists); });
+    // If individual checking is enabled and we have fewer files than the limit, do individual checks
+    if (filesToCheck.length > 0 && filesToCheck.length < checkIndividualLimit) {
+        console.log(`🔍 Individual checking enabled (${filesToCheck.length} files < ${checkIndividualLimit} limit)`);
+        await checkIndividualProcessingStatus(filesToCheck, apiUrl, finalStatus);
+    }
+    return finalStatus;
+}
+function parseFileSize(sizeStr) {
+    if (!sizeStr)
+        return null;
+    const match = sizeStr.match(/^(\d+(?:\.\d+)?)\s*(B|KB|MB|GB|TB)$/i);
+    if (!match)
+        return null;
+    const value = parseFloat(match[1]);
+    const unit = match[2].toUpperCase();
+    const multipliers = {
+        B: 1,
+        KB: 1024,
+        MB: 1024 * 1024,
+        GB: 1024 * 1024 * 1024,
+        TB: 1024 * 1024 * 1024 * 1024,
+    };
+    return value * multipliers[unit];
+}
+function formatFileSize(bytes) {
+    if (bytes === 0)
+        return '0 B';
+    const k = 1024;
+    const sizes = ['B', 'KB', 'MB', 'GB', 'TB'];
+    const i = Math.floor(Math.log(bytes) / Math.log(k));
+    return parseFloat((bytes / Math.pow(k, i)).toFixed(2)) + ' ' + sizes[i];
+}
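Note on the --max-file-size flag defined above: values are parsed by the module-private parseFileSize helper at the bottom of this file and echoed back via formatFileSize, both using binary (1024-based) units. The following standalone sketch shows the round-trip behaviour; the helpers are copied inline here only because the compiled module does not export them.

// Inlined copies of the two private size helpers from batch-process.js,
// shown only to illustrate what --max-file-size accepts and reports.
function parseFileSize(sizeStr) {
    if (!sizeStr)
        return null;
    // Accepts an optional decimal part, optional whitespace, and a case-insensitive unit.
    const match = sizeStr.match(/^(\d+(?:\.\d+)?)\s*(B|KB|MB|GB|TB)$/i);
    if (!match)
        return null;
    const multipliers = { B: 1, KB: 1024, MB: 1024 ** 2, GB: 1024 ** 3, TB: 1024 ** 4 };
    return parseFloat(match[1]) * multipliers[match[2].toUpperCase()];
}
function formatFileSize(bytes) {
    if (bytes === 0)
        return '0 B';
    const k = 1024;
    const sizes = ['B', 'KB', 'MB', 'GB', 'TB'];
    const i = Math.floor(Math.log(bytes) / Math.log(k));
    return parseFloat((bytes / Math.pow(k, i)).toFixed(2)) + ' ' + sizes[i];
}
console.log(parseFileSize('100MB'));    // 104857600
console.log(parseFileSize('1.5 gb'));   // 1610612736 (decimals and lowercase units are accepted)
console.log(parseFileSize('100'));      // null (a unit is required)
console.log(formatFileSize(104857600)); // "100 MB"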
package/dist/commands/download.d.ts.map
@@ -0,0 +1 @@
+{"version":3,"file":"download.d.ts","sourceRoot":"","sources":["../../src/commands/download.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AAMpC,eAAO,MAAM,eAAe,SA0ExB,CAAC"}
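The one-line .d.ts.map files in this release, like the one above, are standard source-map JSON: "file" names the generated declaration file and "sources" points back to the TypeScript source it was compiled from. A quick inspection sketch, assuming it is run from the installed package root:

// Prints where a declaration map points back to in the original TypeScript tree.
import fs from 'fs';

const map = JSON.parse(fs.readFileSync('dist/commands/download.d.ts.map', 'utf8'));
console.log(map.file);    // "download.d.ts"
console.log(map.sources); // ["../../src/commands/download.ts"]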
package/dist/commands/download.js
@@ -0,0 +1,76 @@
+import { Command } from 'commander';
+import axios from 'axios';
+import { downloadFile } from '../aws/downloader.js';
+import { setGlobalRequesterPays } from '../aws/config.js';
+import { displayRequesterPaysError } from '../utils/requester-pays-error.js';
+export const downloadCommand = new Command('download')
+    .description('Download MECA files from the bioRxiv/medRxiv S3 bucket by DOI')
+    .argument('<doi>', 'DOI of the paper (e.g., "10.1101/2024.01.15.123456")')
+    .option('-o, --output <dir>', 'Output directory for downloaded files', './downloads')
+    .option('-a, --api-url <url>', 'API base URL', 'https://openrxiv.csf.now')
+    .option('--requester-pays', 'Enable requester-pays for S3 bucket access')
+    .action(async (doi, options) => {
+    var _a, _b, _c;
+    try {
+        // Validate DOI format
+        if (!doi.includes('/')) {
+            console.error('❌ Invalid DOI format. Expected format: 10.1101/2024.01.15.123456');
+            process.exit(1);
+        }
+        // Split DOI into prefix and suffix
+        const [doiPrefix, doiSuffix] = doi.split('/', 2);
+        console.log(`🔍 Looking up paper with DOI: ${doi}`);
+        console.log(`📡 API URL: ${options.apiUrl}`);
+        // Look up the paper in the API
+        const response = await axios.get(`${options.apiUrl}/v1/works/${doiPrefix}/${doiSuffix}`);
+        if (!response.data || !response.data.s3Key) {
+            console.error('❌ Paper not found or no S3 key available');
+            process.exit(1);
+        }
+        const paper = response.data;
+        console.log(`📄 Found paper: ${paper.title || 'Unknown title'}`);
+        console.log(`📦 S3 Key: ${paper.s3Key}`);
+        // Set requester-pays if flag is provided
+        if (options.requesterPays) {
+            setGlobalRequesterPays(true);
+            console.log(`💰 Requester-pays enabled for S3 access`);
+        }
+        // Create a filesystem-safe filename from the DOI
+        const safeDoi = doi.replace(/[^a-zA-Z0-9.-]/g, '_');
+        const filename = `${safeDoi}.meca`;
+        console.log(`📥 Downloading MECA file as: ${filename}`);
+        // Download the file using the S3 key from the API
+        try {
+            await downloadFile(paper.s3Key, { ...options, filename });
+            console.log(`✅ Successfully downloaded MECA file for DOI: ${doi}`);
+        }
+        catch (downloadError) {
+            // Check if it's a requester-pays related error
+            const errorMessage = downloadError instanceof Error ? downloadError.message : String(downloadError);
+            if (errorMessage.includes('UnknownError') || errorMessage.includes('AccessDenied')) {
+                displayRequesterPaysError();
+            }
+            else {
+                console.error('❌ Download failed:', errorMessage);
+            }
+            process.exit(1);
+        }
+    }
+    catch (error) {
+        if (axios.isAxiosError(error)) {
+            if (((_a = error.response) === null || _a === void 0 ? void 0 : _a.status) === 404) {
+                console.error('❌ Article not found with the specified DOI');
+            }
+            else if (((_b = error.response) === null || _b === void 0 ? void 0 : _b.status) === 401) {
+                console.error('❌ Authentication failed. Please check your API key');
+            }
+            else {
+                console.error('❌ API error:', ((_c = error.response) === null || _c === void 0 ? void 0 : _c.data) || error.message);
+            }
+        }
+        else {
+            console.error('❌ Error looking up paper:', error);
+        }
+        process.exit(1);
+    }
+});
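The download command above derives its output filename from the DOI with a single character-class replacement. A minimal sketch of what it produces, using the example DOI from the command's own help text:

// Mirrors the sanitization step in download.js: every character outside
// [a-zA-Z0-9.-] (here, the '/') becomes an underscore before '.meca' is appended.
const doi = '10.1101/2024.01.15.123456';
const safeDoi = doi.replace(/[^a-zA-Z0-9.-]/g, '_');
console.log(`${safeDoi}.meca`); // "10.1101_2024.01.15.123456.meca"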
package/dist/commands/index.d.ts.map
@@ -0,0 +1 @@
+{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/commands/index.ts"],"names":[],"mappings":"AAAA,cAAc,WAAW,CAAC;AAC1B,cAAc,eAAe,CAAC;AAC9B,cAAc,cAAc,CAAC;AAC7B,cAAc,iBAAiB,CAAC;AAChC,cAAc,oBAAoB,CAAC"}
package/dist/commands/list.d.ts.map
@@ -0,0 +1 @@
+{"version":3,"file":"list.d.ts","sourceRoot":"","sources":["../../src/commands/list.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AAIpC,eAAO,MAAM,WAAW,SAapB,CAAC"}