openrxiv-cli 0.0.2 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/aws/bucket-explorer.d.ts +1 -0
- package/dist/aws/bucket-explorer.d.ts.map +1 -1
- package/dist/aws/bucket-explorer.js +145 -43
- package/dist/commands/download.d.ts.map +1 -1
- package/dist/commands/download.js +2 -2
- package/dist/commands/list.d.ts.map +1 -1
- package/dist/commands/list.js +1 -0
- package/dist/version.d.ts +1 -1
- package/dist/version.js +1 -1
- package/package.json +15 -15
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"bucket-explorer.d.ts","sourceRoot":"","sources":["../../src/aws/bucket-explorer.ts"],"names":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"bucket-explorer.d.ts","sourceRoot":"","sources":["../../src/aws/bucket-explorer.ts"],"names":[],"mappings":"AAQA;;GAEG;AACH,wBAAgB,aAAa,CAAC,MAAM,GAAE,SAAS,GAAG,SAA8B,GAAG,MAAM,CAUxF;AAED,MAAM,WAAW,WAAW;IAC1B,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,MAAM,CAAC,EAAE,SAAS,GAAG,SAAS,CAAC;IAC/B,MAAM,CAAC,EAAE,MAAM,CAAC;CACjB;AAED,MAAM,WAAW,aAAa;IAC5B,IAAI,CAAC,EAAE,KAAK,GAAG,KAAK,GAAG,KAAK,CAAC;IAC7B,KAAK,CAAC,EAAE,MAAM,CAAC;CAChB;AAED,MAAM,WAAW,WAAW;IAC1B,GAAG,EAAE,MAAM,CAAC;IACZ,IAAI,EAAE,MAAM,CAAC;IACb,YAAY,EAAE,IAAI,CAAC;IACnB,IAAI,EAAE,MAAM,GAAG,KAAK,GAAG,KAAK,GAAG,OAAO,CAAC;CACxC;AAED,wBAAsB,iBAAiB,CAAC,OAAO,EAAE,WAAW,GAAG,OAAO,CAAC,IAAI,CAAC,CA0J3E;AAuJD,wBAAsB,cAAc,CAClC,IAAI,EAAE,MAAM,EACZ,OAAO,GAAE;IAAE,QAAQ,CAAC,EAAE,OAAO,CAAC;IAAC,MAAM,CAAC,EAAE,SAAS,GAAG,SAAS,CAAA;CAAO,GACnE,OAAO,CAAC,IAAI,CAAC,CAyCf"}
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import { ListObjectsV2Command, HeadObjectCommand } from '@aws-sdk/client-s3';
|
|
2
2
|
import chalk from 'chalk';
|
|
3
|
+
import { writeFile } from 'fs/promises';
|
|
3
4
|
import { getS3Client } from './config.js';
|
|
4
5
|
import { getFolderStructure } from 'openrxiv-utils';
|
|
5
6
|
import { getDefaultServer } from '../utils/default-server.js';
|
|
@@ -19,14 +20,14 @@ export function getBucketName(server = getDefaultServer()) {
|
|
|
19
20
|
}
|
|
20
21
|
export async function listBucketContent(options) {
|
|
21
22
|
const client = await getS3Client();
|
|
22
|
-
const { month, batch, limit = 50, server = getDefaultServer() } = options;
|
|
23
|
+
const { month, batch, limit = 50, server = getDefaultServer(), output } = options;
|
|
23
24
|
const bucketName = getBucketName(server);
|
|
24
25
|
console.log(chalk.blue(`Listing ${server} bucket content...`));
|
|
25
26
|
console.log(chalk.blue('===================================='));
|
|
26
27
|
try {
|
|
27
28
|
// If no month or batch specified, show the available content structure
|
|
28
29
|
if (!month && !batch) {
|
|
29
|
-
await listFolder(client, server);
|
|
30
|
+
await listFolder(client, server, output);
|
|
30
31
|
return;
|
|
31
32
|
}
|
|
32
33
|
let prefix = '';
|
|
@@ -40,29 +41,93 @@ export async function listBucketContent(options) {
|
|
|
40
41
|
console.log(chalk.gray(`🔍 Batch: ${folder.batch}`));
|
|
41
42
|
}
|
|
42
43
|
}
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
const
|
|
51
|
-
if (
|
|
44
|
+
// Collect all items across pages
|
|
45
|
+
const allItems = [];
|
|
46
|
+
let continuationToken = undefined;
|
|
47
|
+
const maxKeysPerRequest = 1000; // S3 maximum
|
|
48
|
+
const requestedLimit = parseInt(limit.toString());
|
|
49
|
+
let pageNumber = 1;
|
|
50
|
+
// Prepare data for CSV export if output is specified
|
|
51
|
+
const csvRows = [];
|
|
52
|
+
if (output) {
|
|
53
|
+
csvRows.push('Key,Size (bytes),Date Modified');
|
|
54
|
+
}
|
|
55
|
+
// Paginate through all results, respecting the limit
|
|
56
|
+
do {
|
|
57
|
+
const remainingItems = requestedLimit - allItems.length;
|
|
58
|
+
const keysToFetch = Math.min(maxKeysPerRequest, remainingItems);
|
|
59
|
+
if (keysToFetch <= 0) {
|
|
60
|
+
break;
|
|
61
|
+
}
|
|
62
|
+
// Show progress when exporting to CSV
|
|
63
|
+
if (output) {
|
|
64
|
+
console.log(chalk.gray(`📄 Fetching page ${pageNumber}...`));
|
|
65
|
+
}
|
|
66
|
+
const commandOptions = {
|
|
67
|
+
Bucket: bucketName,
|
|
68
|
+
Prefix: prefix,
|
|
69
|
+
MaxKeys: keysToFetch,
|
|
70
|
+
RequestPayer: 'requester',
|
|
71
|
+
};
|
|
72
|
+
if (continuationToken) {
|
|
73
|
+
commandOptions.ContinuationToken = continuationToken;
|
|
74
|
+
}
|
|
75
|
+
const command = new ListObjectsV2Command(commandOptions);
|
|
76
|
+
const response = await client.send(command);
|
|
77
|
+
if (response.Contents && response.Contents.length > 0) {
|
|
78
|
+
const validItems = response.Contents.filter((item) => item.Key !== undefined).map((item) => ({
|
|
79
|
+
Key: item.Key,
|
|
80
|
+
Size: item.Size,
|
|
81
|
+
LastModified: item.LastModified,
|
|
82
|
+
}));
|
|
83
|
+
const itemsToAdd = validItems.slice(0, remainingItems);
|
|
84
|
+
allItems.push(...itemsToAdd);
|
|
85
|
+
if (output) {
|
|
86
|
+
console.log(chalk.gray(` ✓ Page ${pageNumber}: ${itemsToAdd.length} items (Total: ${allItems.length})`));
|
|
87
|
+
}
|
|
88
|
+
}
|
|
89
|
+
if (allItems.length >= requestedLimit) {
|
|
90
|
+
break;
|
|
91
|
+
}
|
|
92
|
+
continuationToken = response.NextContinuationToken;
|
|
93
|
+
pageNumber++;
|
|
94
|
+
} while (continuationToken && allItems.length < requestedLimit);
|
|
95
|
+
if (allItems.length === 0) {
|
|
52
96
|
console.log(chalk.yellow('No content found'));
|
|
53
97
|
return;
|
|
54
98
|
}
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
99
|
+
const escapeCsvValue = (value) => {
|
|
100
|
+
if (value.includes(',') || value.includes('"') || value.includes('\n')) {
|
|
101
|
+
return `"${value.replace(/"/g, '""')}"`;
|
|
102
|
+
}
|
|
103
|
+
return value;
|
|
104
|
+
};
|
|
105
|
+
if (!output) {
|
|
106
|
+
console.log(chalk.green(`Found ${allItems.length} items:`));
|
|
107
|
+
console.log('');
|
|
108
|
+
}
|
|
109
|
+
for (const item of allItems) {
|
|
58
110
|
if (!item.Key)
|
|
59
111
|
continue;
|
|
60
112
|
const type = getContentType(item.Key);
|
|
61
113
|
const size = formatFileSize(item.Size || 0);
|
|
62
114
|
const date = item.LastModified ? item.LastModified.toLocaleDateString() : 'Unknown';
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
115
|
+
const sizeBytes = item.Size || 0;
|
|
116
|
+
const dateModified = item.LastModified ? item.LastModified.toISOString() : 'Unknown';
|
|
117
|
+
if (!output) {
|
|
118
|
+
console.log(`${chalk.cyan(item.Key)}`);
|
|
119
|
+
console.log(` Type: ${chalk.yellow(type)} | Size: ${chalk.blue(size)} | Modified: ${chalk.gray(date)}`);
|
|
120
|
+
console.log('');
|
|
121
|
+
}
|
|
122
|
+
if (output) {
|
|
123
|
+
csvRows.push(`${escapeCsvValue(item.Key)},${sizeBytes},${escapeCsvValue(dateModified)}`);
|
|
124
|
+
}
|
|
125
|
+
}
|
|
126
|
+
if (output) {
|
|
127
|
+
const csvContent = csvRows.join('\n');
|
|
128
|
+
const csvFileName = output.endsWith('.csv') ? output : `${output}.csv`;
|
|
129
|
+
await writeFile(csvFileName, csvContent, 'utf-8');
|
|
130
|
+
console.log(chalk.green(`✅ Exported ${allItems.length} items to ${csvFileName}`));
|
|
66
131
|
}
|
|
67
132
|
}
|
|
68
133
|
catch (error) {
|
|
@@ -76,15 +141,19 @@ export async function listBucketContent(options) {
|
|
|
76
141
|
* Lists the available content structure in the specified server bucket
|
|
77
142
|
* Shows available months and batches
|
|
78
143
|
*/
|
|
79
|
-
async function listFolder(client, server = getDefaultServer()) {
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
// List Current_Content folders (monthly content)
|
|
85
|
-
console.log(chalk.blue('📅 Current Content (Monthly):'));
|
|
86
|
-
console.log(chalk.gray(' Recent content organized by month'));
|
|
144
|
+
async function listFolder(client, server = getDefaultServer(), output) {
|
|
145
|
+
const folderNames = [];
|
|
146
|
+
if (!output) {
|
|
147
|
+
console.log(chalk.cyan('📁 Available Content Structure'));
|
|
148
|
+
console.log(chalk.cyan('=============================='));
|
|
87
149
|
console.log('');
|
|
150
|
+
}
|
|
151
|
+
try {
|
|
152
|
+
if (!output) {
|
|
153
|
+
console.log(chalk.blue('📅 Current Content (Monthly):'));
|
|
154
|
+
console.log(chalk.gray(' Recent content organized by month'));
|
|
155
|
+
console.log('');
|
|
156
|
+
}
|
|
88
157
|
const bucketName = getBucketName(server);
|
|
89
158
|
const currentContentCommand = new ListObjectsV2Command({
|
|
90
159
|
Bucket: bucketName,
|
|
@@ -120,17 +189,26 @@ async function listFolder(client, server = getDefaultServer()) {
|
|
|
120
189
|
return monthOrder.indexOf(monthB) - monthOrder.indexOf(monthA);
|
|
121
190
|
});
|
|
122
191
|
for (const month of months) {
|
|
123
|
-
|
|
192
|
+
if (output) {
|
|
193
|
+
folderNames.push(month);
|
|
194
|
+
}
|
|
195
|
+
else {
|
|
196
|
+
console.log(` ${chalk.green('📁')} ${chalk.cyan(month)}`);
|
|
197
|
+
}
|
|
124
198
|
}
|
|
125
199
|
}
|
|
126
200
|
else {
|
|
127
|
-
|
|
201
|
+
if (!output) {
|
|
202
|
+
console.log(chalk.gray(' No monthly content found'));
|
|
203
|
+
}
|
|
204
|
+
}
|
|
205
|
+
if (!output) {
|
|
206
|
+
console.log('');
|
|
207
|
+
// List Back_Content batches
|
|
208
|
+
console.log(chalk.blue('📦 Back Content (Historical Batches):'));
|
|
209
|
+
console.log(chalk.gray(' Legacy content organized in batches'));
|
|
210
|
+
console.log('');
|
|
128
211
|
}
|
|
129
|
-
console.log('');
|
|
130
|
-
// List Back_Content batches
|
|
131
|
-
console.log(chalk.blue('📦 Back Content (Historical Batches):'));
|
|
132
|
-
console.log(chalk.gray(' Legacy content organized in batches'));
|
|
133
|
-
console.log('');
|
|
134
212
|
const backContentCommand = new ListObjectsV2Command({
|
|
135
213
|
Bucket: bucketName,
|
|
136
214
|
Prefix: 'Back_Content/',
|
|
@@ -144,24 +222,48 @@ async function listFolder(client, server = getDefaultServer()) {
|
|
|
144
222
|
.filter(Boolean)
|
|
145
223
|
.sort();
|
|
146
224
|
for (const batch of batches) {
|
|
147
|
-
|
|
225
|
+
if (output) {
|
|
226
|
+
folderNames.push(batch);
|
|
227
|
+
}
|
|
228
|
+
else {
|
|
229
|
+
console.log(` ${chalk.green('📁')} ${chalk.cyan(batch)}`);
|
|
230
|
+
}
|
|
148
231
|
}
|
|
149
232
|
}
|
|
150
233
|
else {
|
|
151
|
-
|
|
234
|
+
if (!output) {
|
|
235
|
+
console.log(chalk.gray(' No historical batches found'));
|
|
236
|
+
}
|
|
237
|
+
}
|
|
238
|
+
if (!output) {
|
|
239
|
+
console.log('');
|
|
240
|
+
console.log(chalk.blue('💡 Usage Examples:'));
|
|
241
|
+
console.log(chalk.gray(` List specific month:\t${server} list --month 2024-01`));
|
|
242
|
+
console.log(chalk.gray(` List specific batch:\t${server} list --batch Batch_01`));
|
|
243
|
+
console.log(chalk.gray(` List with limit:\t${server} list --month 2024-01 --limit 100`));
|
|
244
|
+
console.log(chalk.gray(` File listing (CSV):\t${server} list -m 2025-01 --limit 10000 -o 2025-01.csv`));
|
|
245
|
+
console.log(chalk.gray(` Folder overview:\t${server} list -o folders.txt`));
|
|
246
|
+
console.log('');
|
|
247
|
+
}
|
|
248
|
+
else {
|
|
249
|
+
const textContent = folderNames.join('\n');
|
|
250
|
+
await writeFile(output, textContent, 'utf-8');
|
|
251
|
+
console.log(chalk.green(`✅ Exported ${folderNames.length} folders to ${output}`));
|
|
152
252
|
}
|
|
153
|
-
console.log('');
|
|
154
|
-
console.log(chalk.blue('💡 Usage Examples:'));
|
|
155
|
-
console.log(chalk.gray(` List specific month: ${server} list --month 2024-01`));
|
|
156
|
-
console.log(chalk.gray(` List specific batch: ${server} list --batch Batch_01`));
|
|
157
|
-
console.log(chalk.gray(` List with limit: ${server} list --month 2024-01 --limit 100`));
|
|
158
|
-
console.log('');
|
|
159
253
|
}
|
|
160
254
|
catch (error) {
|
|
161
255
|
if (error instanceof Error) {
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
256
|
+
if (output) {
|
|
257
|
+
throw new Error(`Failed to list folder structure: ${error.message}`);
|
|
258
|
+
}
|
|
259
|
+
else {
|
|
260
|
+
console.log(chalk.yellow(`⚠️ Warning: Could not fetch content structure: ${error.message}`));
|
|
261
|
+
console.log(chalk.gray(' This may be due to AWS permissions or network issues'));
|
|
262
|
+
console.log('');
|
|
263
|
+
}
|
|
264
|
+
}
|
|
265
|
+
else {
|
|
266
|
+
throw error;
|
|
165
267
|
}
|
|
166
268
|
}
|
|
167
269
|
}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"download.d.ts","sourceRoot":"","sources":["../../src/commands/download.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AAMpC,eAAO,MAAM,eAAe,
|
|
1
|
+
{"version":3,"file":"download.d.ts","sourceRoot":"","sources":["../../src/commands/download.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AAMpC,eAAO,MAAM,eAAe,SA+ExB,CAAC"}
|
|
@@ -5,7 +5,7 @@ import { setGlobalRequesterPays } from '../aws/config.js';
|
|
|
5
5
|
import { displayRequesterPaysError } from '../utils/requester-pays-error.js';
|
|
6
6
|
export const downloadCommand = new Command('download')
|
|
7
7
|
.description('Download MECA files from the bioRxiv/medRxiv S3 bucket by DOI')
|
|
8
|
-
.argument('<doi>', 'DOI of the paper (e.g., "10.1101/2024.01.15.123456")')
|
|
8
|
+
.argument('<doi>', 'DOI of the paper (e.g., "10.1101/2024.01.15.123456" or "10.64898/2025.12.15.123456")')
|
|
9
9
|
.option('-o, --output <dir>', 'Output directory for downloaded files', './downloads')
|
|
10
10
|
.option('-a, --api-url <url>', 'API base URL', 'https://openrxiv.csf.now')
|
|
11
11
|
.option('--requester-pays', 'Enable requester-pays for S3 bucket access')
|
|
@@ -14,7 +14,7 @@ export const downloadCommand = new Command('download')
|
|
|
14
14
|
try {
|
|
15
15
|
// Validate DOI format
|
|
16
16
|
if (!doi.includes('/')) {
|
|
17
|
-
console.error('❌ Invalid DOI format. Expected format: 10.1101/2024.01.15.123456');
|
|
17
|
+
console.error('❌ Invalid DOI format. Expected format: 10.1101 or 10.64898/2024.01.15.123456 (or legacy /XXXXXX)');
|
|
18
18
|
process.exit(1);
|
|
19
19
|
}
|
|
20
20
|
// Split DOI into prefix and suffix
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"list.d.ts","sourceRoot":"","sources":["../../src/commands/list.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AAIpC,eAAO,MAAM,WAAW,
|
|
1
|
+
{"version":3,"file":"list.d.ts","sourceRoot":"","sources":["../../src/commands/list.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AAIpC,eAAO,MAAM,WAAW,SAiBpB,CAAC"}
|
package/dist/commands/list.js
CHANGED
|
@@ -7,6 +7,7 @@ export const listCommand = new Command('list')
|
|
|
7
7
|
.option('-b, --batch <batch>', 'Filter by specific batch (e.g., "Batch_01")')
|
|
8
8
|
.option('-l, --limit <number>', 'Limit the number of results', '50')
|
|
9
9
|
.option('-s, --server <server>', 'Server to use: "biorxiv" or "medrxiv"', getDefaultServer())
|
|
10
|
+
.option('-o, --output <file>', 'Export results to file (CSV for file listings with .csv extension; text for folder overview)')
|
|
10
11
|
.action(async (options) => {
|
|
11
12
|
try {
|
|
12
13
|
await listBucketContent(options);
|
package/dist/version.d.ts
CHANGED
package/dist/version.js
CHANGED
|
@@ -1,2 +1,2 @@
|
|
|
1
|
-
const version = '0.0.2';
|
|
1
|
+
const version = '0.0.3';
|
|
2
2
|
export default version;
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "openrxiv-cli",
|
|
3
|
-
"version": "0.0.2",
|
|
3
|
+
"version": "0.0.3",
|
|
4
4
|
"description": "CLI tool to download openRxiv MECA files from AWS S3 for text and data mining",
|
|
5
5
|
"main": "dist/index.js",
|
|
6
6
|
"files": [
|
|
@@ -34,27 +34,27 @@
|
|
|
34
34
|
"node": ">=18.0.0"
|
|
35
35
|
},
|
|
36
36
|
"dependencies": {
|
|
37
|
-
"@aws-sdk/client-s3": "^3.
|
|
38
|
-
"@aws-sdk/s3-request-presigner": "^3.
|
|
39
|
-
"axios": "^1.
|
|
40
|
-
"openrxiv-utils": "^0.0.
|
|
37
|
+
"@aws-sdk/client-s3": "^3.995.0",
|
|
38
|
+
"@aws-sdk/s3-request-presigner": "^3.995.0",
|
|
39
|
+
"axios": "^1.13.5",
|
|
40
|
+
"openrxiv-utils": "^0.0.3",
|
|
41
41
|
"boxen": "^8.0.1",
|
|
42
42
|
"character-entities": "^2.0.2",
|
|
43
|
-
"chalk": "^5.
|
|
43
|
+
"chalk": "^5.6.2",
|
|
44
44
|
"cli-progress": "^3.12.0",
|
|
45
|
-
"commander": "^14.0.
|
|
46
|
-
"conf": "^
|
|
47
|
-
"inquirer": "^
|
|
45
|
+
"commander": "^14.0.3",
|
|
46
|
+
"conf": "^15.1.0",
|
|
47
|
+
"inquirer": "^13.2.5",
|
|
48
48
|
"jszip": "^3.10.1",
|
|
49
|
-
"ora": "^
|
|
50
|
-
"adm-zip": "^0.5.
|
|
51
|
-
"unified": "^11.0.
|
|
49
|
+
"ora": "^9.3.0",
|
|
50
|
+
"adm-zip": "^0.5.16",
|
|
51
|
+
"unified": "^11.0.5",
|
|
52
52
|
"xast-util-from-xml": "^4.0.0",
|
|
53
|
-
"p-limit": "^7.
|
|
53
|
+
"p-limit": "^7.3.0"
|
|
54
54
|
},
|
|
55
55
|
"devDependencies": {
|
|
56
|
-
"@types/cli-progress": "^3.11.
|
|
57
|
-
"@types/inquirer": "^9.0.
|
|
56
|
+
"@types/cli-progress": "^3.11.6",
|
|
57
|
+
"@types/inquirer": "^9.0.9"
|
|
58
58
|
},
|
|
59
59
|
"repository": {
|
|
60
60
|
"type": "git",
|