openrxiv-cli 0.0.2 → 0.0.3

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
@@ -7,6 +7,7 @@ export interface ListOptions {
7
7
  batch?: string;
8
8
  limit?: number;
9
9
  server?: 'biorxiv' | 'medrxiv';
10
+ output?: string;
10
11
  }
11
12
  export interface SearchOptions {
12
13
  type?: 'pdf' | 'xml' | 'all';
@@ -1 +1 @@
1
- {"version":3,"file":"bucket-explorer.d.ts","sourceRoot":"","sources":["../../src/aws/bucket-explorer.ts"],"names":[],"mappings":"AAOA;;GAEG;AACH,wBAAgB,aAAa,CAAC,MAAM,GAAE,SAAS,GAAG,SAA8B,GAAG,MAAM,CAUxF;AAED,MAAM,WAAW,WAAW;IAC1B,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,MAAM,CAAC,EAAE,SAAS,GAAG,SAAS,CAAC;CAChC;AAED,MAAM,WAAW,aAAa;IAC5B,IAAI,CAAC,EAAE,KAAK,GAAG,KAAK,GAAG,KAAK,CAAC;IAC7B,KAAK,CAAC,EAAE,MAAM,CAAC;CAChB;AAED,MAAM,WAAW,WAAW;IAC1B,GAAG,EAAE,MAAM,CAAC;IACZ,IAAI,EAAE,MAAM,CAAC;IACb,YAAY,EAAE,IAAI,CAAC;IACnB,IAAI,EAAE,MAAM,GAAG,KAAK,GAAG,KAAK,GAAG,OAAO,CAAC;CACxC;AAED,wBAAsB,iBAAiB,CAAC,OAAO,EAAE,WAAW,GAAG,OAAO,CAAC,IAAI,CAAC,CAuE3E;AA+GD,wBAAsB,cAAc,CAClC,IAAI,EAAE,MAAM,EACZ,OAAO,GAAE;IAAE,QAAQ,CAAC,EAAE,OAAO,CAAC;IAAC,MAAM,CAAC,EAAE,SAAS,GAAG,SAAS,CAAA;CAAO,GACnE,OAAO,CAAC,IAAI,CAAC,CAyCf"}
1
+ {"version":3,"file":"bucket-explorer.d.ts","sourceRoot":"","sources":["../../src/aws/bucket-explorer.ts"],"names":[],"mappings":"AAQA;;GAEG;AACH,wBAAgB,aAAa,CAAC,MAAM,GAAE,SAAS,GAAG,SAA8B,GAAG,MAAM,CAUxF;AAED,MAAM,WAAW,WAAW;IAC1B,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,MAAM,CAAC,EAAE,SAAS,GAAG,SAAS,CAAC;IAC/B,MAAM,CAAC,EAAE,MAAM,CAAC;CACjB;AAED,MAAM,WAAW,aAAa;IAC5B,IAAI,CAAC,EAAE,KAAK,GAAG,KAAK,GAAG,KAAK,CAAC;IAC7B,KAAK,CAAC,EAAE,MAAM,CAAC;CAChB;AAED,MAAM,WAAW,WAAW;IAC1B,GAAG,EAAE,MAAM,CAAC;IACZ,IAAI,EAAE,MAAM,CAAC;IACb,YAAY,EAAE,IAAI,CAAC;IACnB,IAAI,EAAE,MAAM,GAAG,KAAK,GAAG,KAAK,GAAG,OAAO,CAAC;CACxC;AAED,wBAAsB,iBAAiB,CAAC,OAAO,EAAE,WAAW,GAAG,OAAO,CAAC,IAAI,CAAC,CA0J3E;AAuJD,wBAAsB,cAAc,CAClC,IAAI,EAAE,MAAM,EACZ,OAAO,GAAE;IAAE,QAAQ,CAAC,EAAE,OAAO,CAAC;IAAC,MAAM,CAAC,EAAE,SAAS,GAAG,SAAS,CAAA;CAAO,GACnE,OAAO,CAAC,IAAI,CAAC,CAyCf"}
@@ -1,5 +1,6 @@
1
1
  import { ListObjectsV2Command, HeadObjectCommand } from '@aws-sdk/client-s3';
2
2
  import chalk from 'chalk';
3
+ import { writeFile } from 'fs/promises';
3
4
  import { getS3Client } from './config.js';
4
5
  import { getFolderStructure } from 'openrxiv-utils';
5
6
  import { getDefaultServer } from '../utils/default-server.js';
@@ -19,14 +20,14 @@ export function getBucketName(server = getDefaultServer()) {
19
20
  }
20
21
  export async function listBucketContent(options) {
21
22
  const client = await getS3Client();
22
- const { month, batch, limit = 50, server = getDefaultServer() } = options;
23
+ const { month, batch, limit = 50, server = getDefaultServer(), output } = options;
23
24
  const bucketName = getBucketName(server);
24
25
  console.log(chalk.blue(`Listing ${server} bucket content...`));
25
26
  console.log(chalk.blue('===================================='));
26
27
  try {
27
28
  // If no month or batch specified, show the available content structure
28
29
  if (!month && !batch) {
29
- await listFolder(client, server);
30
+ await listFolder(client, server, output);
30
31
  return;
31
32
  }
32
33
  let prefix = '';
@@ -40,29 +41,93 @@ export async function listBucketContent(options) {
40
41
  console.log(chalk.gray(`🔍 Batch: ${folder.batch}`));
41
42
  }
42
43
  }
43
- const commandOptions = {
44
- Bucket: bucketName,
45
- Prefix: prefix,
46
- MaxKeys: parseInt(limit.toString()),
47
- RequestPayer: 'requester',
48
- };
49
- const command = new ListObjectsV2Command(commandOptions);
50
- const response = await client.send(command);
51
- if (!response.Contents || response.Contents.length === 0) {
44
+ // Collect all items across pages
45
+ const allItems = [];
46
+ let continuationToken = undefined;
47
+ const maxKeysPerRequest = 1000; // S3 maximum
48
+ const requestedLimit = parseInt(limit.toString());
49
+ let pageNumber = 1;
50
+ // Prepare data for CSV export if output is specified
51
+ const csvRows = [];
52
+ if (output) {
53
+ csvRows.push('Key,Size (bytes),Date Modified');
54
+ }
55
+ // Paginate through all results, respecting the limit
56
+ do {
57
+ const remainingItems = requestedLimit - allItems.length;
58
+ const keysToFetch = Math.min(maxKeysPerRequest, remainingItems);
59
+ if (keysToFetch <= 0) {
60
+ break;
61
+ }
62
+ // Show progress when exporting to CSV
63
+ if (output) {
64
+ console.log(chalk.gray(`📄 Fetching page ${pageNumber}...`));
65
+ }
66
+ const commandOptions = {
67
+ Bucket: bucketName,
68
+ Prefix: prefix,
69
+ MaxKeys: keysToFetch,
70
+ RequestPayer: 'requester',
71
+ };
72
+ if (continuationToken) {
73
+ commandOptions.ContinuationToken = continuationToken;
74
+ }
75
+ const command = new ListObjectsV2Command(commandOptions);
76
+ const response = await client.send(command);
77
+ if (response.Contents && response.Contents.length > 0) {
78
+ const validItems = response.Contents.filter((item) => item.Key !== undefined).map((item) => ({
79
+ Key: item.Key,
80
+ Size: item.Size,
81
+ LastModified: item.LastModified,
82
+ }));
83
+ const itemsToAdd = validItems.slice(0, remainingItems);
84
+ allItems.push(...itemsToAdd);
85
+ if (output) {
86
+ console.log(chalk.gray(` ✓ Page ${pageNumber}: ${itemsToAdd.length} items (Total: ${allItems.length})`));
87
+ }
88
+ }
89
+ if (allItems.length >= requestedLimit) {
90
+ break;
91
+ }
92
+ continuationToken = response.NextContinuationToken;
93
+ pageNumber++;
94
+ } while (continuationToken && allItems.length < requestedLimit);
95
+ if (allItems.length === 0) {
52
96
  console.log(chalk.yellow('No content found'));
53
97
  return;
54
98
  }
55
- console.log(chalk.green(`Found ${response.Contents.length} items:`));
56
- console.log('');
57
- for (const item of response.Contents) {
99
+ const escapeCsvValue = (value) => {
100
+ if (value.includes(',') || value.includes('"') || value.includes('\n')) {
101
+ return `"${value.replace(/"/g, '""')}"`;
102
+ }
103
+ return value;
104
+ };
105
+ if (!output) {
106
+ console.log(chalk.green(`Found ${allItems.length} items:`));
107
+ console.log('');
108
+ }
109
+ for (const item of allItems) {
58
110
  if (!item.Key)
59
111
  continue;
60
112
  const type = getContentType(item.Key);
61
113
  const size = formatFileSize(item.Size || 0);
62
114
  const date = item.LastModified ? item.LastModified.toLocaleDateString() : 'Unknown';
63
- console.log(`${chalk.cyan(item.Key)}`);
64
- console.log(` Type: ${chalk.yellow(type)} | Size: ${chalk.blue(size)} | Modified: ${chalk.gray(date)}`);
65
- console.log('');
115
+ const sizeBytes = item.Size || 0;
116
+ const dateModified = item.LastModified ? item.LastModified.toISOString() : 'Unknown';
117
+ if (!output) {
118
+ console.log(`${chalk.cyan(item.Key)}`);
119
+ console.log(` Type: ${chalk.yellow(type)} | Size: ${chalk.blue(size)} | Modified: ${chalk.gray(date)}`);
120
+ console.log('');
121
+ }
122
+ if (output) {
123
+ csvRows.push(`${escapeCsvValue(item.Key)},${sizeBytes},${escapeCsvValue(dateModified)}`);
124
+ }
125
+ }
126
+ if (output) {
127
+ const csvContent = csvRows.join('\n');
128
+ const csvFileName = output.endsWith('.csv') ? output : `${output}.csv`;
129
+ await writeFile(csvFileName, csvContent, 'utf-8');
130
+ console.log(chalk.green(`✅ Exported ${allItems.length} items to ${csvFileName}`));
66
131
  }
67
132
  }
68
133
  catch (error) {
@@ -76,15 +141,19 @@ export async function listBucketContent(options) {
76
141
  * Lists the available content structure in the specified server bucket
77
142
  * Shows available months and batches
78
143
  */
79
- async function listFolder(client, server = getDefaultServer()) {
80
- console.log(chalk.cyan('📁 Available Content Structure'));
81
- console.log(chalk.cyan('=============================='));
82
- console.log('');
83
- try {
84
- // List Current_Content folders (monthly content)
85
- console.log(chalk.blue('📅 Current Content (Monthly):'));
86
- console.log(chalk.gray(' Recent content organized by month'));
144
+ async function listFolder(client, server = getDefaultServer(), output) {
145
+ const folderNames = [];
146
+ if (!output) {
147
+ console.log(chalk.cyan('📁 Available Content Structure'));
148
+ console.log(chalk.cyan('=============================='));
87
149
  console.log('');
150
+ }
151
+ try {
152
+ if (!output) {
153
+ console.log(chalk.blue('📅 Current Content (Monthly):'));
154
+ console.log(chalk.gray(' Recent content organized by month'));
155
+ console.log('');
156
+ }
88
157
  const bucketName = getBucketName(server);
89
158
  const currentContentCommand = new ListObjectsV2Command({
90
159
  Bucket: bucketName,
@@ -120,17 +189,26 @@ async function listFolder(client, server = getDefaultServer()) {
120
189
  return monthOrder.indexOf(monthB) - monthOrder.indexOf(monthA);
121
190
  });
122
191
  for (const month of months) {
123
- console.log(` ${chalk.green('📁')} ${chalk.cyan(month)}`);
192
+ if (output) {
193
+ folderNames.push(month);
194
+ }
195
+ else {
196
+ console.log(` ${chalk.green('📁')} ${chalk.cyan(month)}`);
197
+ }
124
198
  }
125
199
  }
126
200
  else {
127
- console.log(chalk.gray(' No monthly content found'));
201
+ if (!output) {
202
+ console.log(chalk.gray(' No monthly content found'));
203
+ }
204
+ }
205
+ if (!output) {
206
+ console.log('');
207
+ // List Back_Content batches
208
+ console.log(chalk.blue('📦 Back Content (Historical Batches):'));
209
+ console.log(chalk.gray(' Legacy content organized in batches'));
210
+ console.log('');
128
211
  }
129
- console.log('');
130
- // List Back_Content batches
131
- console.log(chalk.blue('📦 Back Content (Historical Batches):'));
132
- console.log(chalk.gray(' Legacy content organized in batches'));
133
- console.log('');
134
212
  const backContentCommand = new ListObjectsV2Command({
135
213
  Bucket: bucketName,
136
214
  Prefix: 'Back_Content/',
@@ -144,24 +222,48 @@ async function listFolder(client, server = getDefaultServer()) {
144
222
  .filter(Boolean)
145
223
  .sort();
146
224
  for (const batch of batches) {
147
- console.log(` ${chalk.green('📁')} ${chalk.cyan(batch)}`);
225
+ if (output) {
226
+ folderNames.push(batch);
227
+ }
228
+ else {
229
+ console.log(` ${chalk.green('📁')} ${chalk.cyan(batch)}`);
230
+ }
148
231
  }
149
232
  }
150
233
  else {
151
- console.log(chalk.gray(' No historical batches found'));
234
+ if (!output) {
235
+ console.log(chalk.gray(' No historical batches found'));
236
+ }
237
+ }
238
+ if (!output) {
239
+ console.log('');
240
+ console.log(chalk.blue('💡 Usage Examples:'));
241
+ console.log(chalk.gray(` List specific month:\t${server} list --month 2024-01`));
242
+ console.log(chalk.gray(` List specific batch:\t${server} list --batch Batch_01`));
243
+ console.log(chalk.gray(` List with limit:\t${server} list --month 2024-01 --limit 100`));
244
+ console.log(chalk.gray(` File listing (CSV):\t${server} list -m 2025-01 --limit 10000 -o 2025-01.csv`));
245
+ console.log(chalk.gray(` Folder overview:\t${server} list -o folders.txt`));
246
+ console.log('');
247
+ }
248
+ else {
249
+ const textContent = folderNames.join('\n');
250
+ await writeFile(output, textContent, 'utf-8');
251
+ console.log(chalk.green(`✅ Exported ${folderNames.length} folders to ${output}`));
152
252
  }
153
- console.log('');
154
- console.log(chalk.blue('💡 Usage Examples:'));
155
- console.log(chalk.gray(` List specific month: ${server} list --month 2024-01`));
156
- console.log(chalk.gray(` List specific batch: ${server} list --batch Batch_01`));
157
- console.log(chalk.gray(` List with limit: ${server} list --month 2024-01 --limit 100`));
158
- console.log('');
159
253
  }
160
254
  catch (error) {
161
255
  if (error instanceof Error) {
162
- console.log(chalk.yellow(`⚠️ Warning: Could not fetch content structure: ${error.message}`));
163
- console.log(chalk.gray(' This may be due to AWS permissions or network issues'));
164
- console.log('');
256
+ if (output) {
257
+ throw new Error(`Failed to list folder structure: ${error.message}`);
258
+ }
259
+ else {
260
+ console.log(chalk.yellow(`⚠️ Warning: Could not fetch content structure: ${error.message}`));
261
+ console.log(chalk.gray(' This may be due to AWS permissions or network issues'));
262
+ console.log('');
263
+ }
264
+ }
265
+ else {
266
+ throw error;
165
267
  }
166
268
  }
167
269
  }
@@ -1 +1 @@
1
- {"version":3,"file":"download.d.ts","sourceRoot":"","sources":["../../src/commands/download.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AAMpC,eAAO,MAAM,eAAe,SA0ExB,CAAC"}
1
+ {"version":3,"file":"download.d.ts","sourceRoot":"","sources":["../../src/commands/download.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AAMpC,eAAO,MAAM,eAAe,SA+ExB,CAAC"}
@@ -5,7 +5,7 @@ import { setGlobalRequesterPays } from '../aws/config.js';
5
5
  import { displayRequesterPaysError } from '../utils/requester-pays-error.js';
6
6
  export const downloadCommand = new Command('download')
7
7
  .description('Download MECA files from the bioRxiv/medRxiv S3 bucket by DOI')
8
- .argument('<doi>', 'DOI of the paper (e.g., "10.1101/2024.01.15.123456")')
8
+ .argument('<doi>', 'DOI of the paper (e.g., "10.1101/2024.01.15.123456" or "10.64898/2025.12.15.123456")')
9
9
  .option('-o, --output <dir>', 'Output directory for downloaded files', './downloads')
10
10
  .option('-a, --api-url <url>', 'API base URL', 'https://openrxiv.csf.now')
11
11
  .option('--requester-pays', 'Enable requester-pays for S3 bucket access')
@@ -14,7 +14,7 @@ export const downloadCommand = new Command('download')
14
14
  try {
15
15
  // Validate DOI format
16
16
  if (!doi.includes('/')) {
17
- console.error('❌ Invalid DOI format. Expected format: 10.1101/2024.01.15.123456');
17
+ console.error('❌ Invalid DOI format. Expected format: 10.1101 or 10.64898/2024.01.15.123456 (or legacy /XXXXXX)');
18
18
  process.exit(1);
19
19
  }
20
20
  // Split DOI into prefix and suffix
@@ -1 +1 @@
1
- {"version":3,"file":"list.d.ts","sourceRoot":"","sources":["../../src/commands/list.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AAIpC,eAAO,MAAM,WAAW,SAapB,CAAC"}
1
+ {"version":3,"file":"list.d.ts","sourceRoot":"","sources":["../../src/commands/list.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AAIpC,eAAO,MAAM,WAAW,SAiBpB,CAAC"}
@@ -7,6 +7,7 @@ export const listCommand = new Command('list')
7
7
  .option('-b, --batch <batch>', 'Filter by specific batch (e.g., "Batch_01")')
8
8
  .option('-l, --limit <number>', 'Limit the number of results', '50')
9
9
  .option('-s, --server <server>', 'Server to use: "biorxiv" or "medrxiv"', getDefaultServer())
10
+ .option('-o, --output <file>', 'Export results to file (CSV for file listings with .csv extension; text for folder overview)')
10
11
  .action(async (options) => {
11
12
  try {
12
13
  await listBucketContent(options);
package/dist/version.d.ts CHANGED
@@ -1,3 +1,3 @@
1
- declare const version = "0.0.2";
1
+ declare const version = "0.0.3";
2
2
  export default version;
3
3
  //# sourceMappingURL=version.d.ts.map
package/dist/version.js CHANGED
@@ -1,2 +1,2 @@
1
- const version = '0.0.2';
1
+ const version = '0.0.3';
2
2
  export default version;
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "openrxiv-cli",
3
- "version": "0.0.2",
3
+ "version": "0.0.3",
4
4
  "description": "CLI tool to download openRxiv MECA files from AWS S3 for text and data mining",
5
5
  "main": "dist/index.js",
6
6
  "files": [
@@ -34,27 +34,27 @@
34
34
  "node": ">=18.0.0"
35
35
  },
36
36
  "dependencies": {
37
- "@aws-sdk/client-s3": "^3.0.0",
38
- "@aws-sdk/s3-request-presigner": "^3.0.0",
39
- "axios": "^1.6.0",
40
- "openrxiv-utils": "^0.0.2",
37
+ "@aws-sdk/client-s3": "^3.995.0",
38
+ "@aws-sdk/s3-request-presigner": "^3.995.0",
39
+ "axios": "^1.13.5",
40
+ "openrxiv-utils": "^0.0.3",
41
41
  "boxen": "^8.0.1",
42
42
  "character-entities": "^2.0.2",
43
- "chalk": "^5.0.0",
43
+ "chalk": "^5.6.2",
44
44
  "cli-progress": "^3.12.0",
45
- "commander": "^14.0.0",
46
- "conf": "^10.0.0",
47
- "inquirer": "^9.0.0",
45
+ "commander": "^14.0.3",
46
+ "conf": "^15.1.0",
47
+ "inquirer": "^13.2.5",
48
48
  "jszip": "^3.10.1",
49
- "ora": "^7.0.0",
50
- "adm-zip": "^0.5.10",
51
- "unified": "^11.0.0",
49
+ "ora": "^9.3.0",
50
+ "adm-zip": "^0.5.16",
51
+ "unified": "^11.0.5",
52
52
  "xast-util-from-xml": "^4.0.0",
53
- "p-limit": "^7.0.0"
53
+ "p-limit": "^7.3.0"
54
54
  },
55
55
  "devDependencies": {
56
- "@types/cli-progress": "^3.11.0",
57
- "@types/inquirer": "^9.0.0"
56
+ "@types/cli-progress": "^3.11.6",
57
+ "@types/inquirer": "^9.0.9"
58
58
  },
59
59
  "repository": {
60
60
  "type": "git",