openrxiv-utils 0.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,257 @@
1
+ /**
2
+ * Utility functions for determining bioRxiv folder structure
3
+ * based on the date requested.
4
+ *
5
+ * The bioRxiv structure is:
6
+ * - Before late 2018: Files are in Back_Content/Batch_[nn]/ folders
7
+ * - After late 2018: Files are in Current_Content/[Month]_[Year]/ folders
8
+ */
9
+
10
/**
 * A resolved storage folder for one preprint server.
 */
export interface FolderStructure {
  /** Preprint server the folder belongs to. */
  server: 'biorxiv' | 'medrxiv';
  /** `'back'` for `Back_Content/` batch folders, `'current'` for `Current_Content/` month folders. */
  type: 'back' | 'current';
  /** Folder path including the content root and a trailing slash, e.g. `Back_Content/Batch_01/`. */
  prefix: string;
  /** Folder name without the content root, e.g. `Batch_01` or `January_2019`. */
  batch: string;
}
16
+
17
/**
 * Input accepted by `getFolderStructure`. Exactly one of `month` or `batch`
 * is expected; supplying both or neither makes `getFolderStructure` throw.
 */
export interface FolderStructureOptions {
  /** Preprint server; `getFolderStructure` falls back to `'biorxiv'` when omitted. */
  server?: 'biorxiv' | 'medrxiv';
  /** Requested month, as `YYYY-MM` or `Month_YYYY` (e.g. `2019-01`, `January_2019`). */
  month?: string;
  /** Requested back-content batch, e.g. `"1"`, `"batch-1"` or `"Batch_01"`. */
  batch?: string;
}
22
+
23
/**
 * Normalizes a batch identifier to the folder name used for back content.
 *
 * Accepted inputs are a positive integer, or a string holding a positive
 * integer optionally prefixed with "batch" / "medrxiv_batch" (any case, with
 * `-` or `_` separators), e.g. "1", "01", "batch-1", "Batch_01",
 * "medRxiv_Batch_01". Surrounding whitespace is ignored.
 *
 * @param batch - Batch number or batch identifier string.
 * @param server - Server name, compared case-insensitively. "medrxiv" yields
 *   `medRxiv_Batch_XX`; any other value yields `Batch_XX`.
 * @returns The batch folder name, with the number zero-padded to at least two digits.
 * @throws Error if `batch` is not a positive whole number (this includes NaN,
 *   fractions, zero, negatives and strings that are not a batch identifier).
 */
export function normalizeBatch(batch: string | number, server: string = 'biorxiv'): string {
  let batchNum: number;

  if (typeof batch === 'number') {
    batchNum = batch;
  } else {
    // Trim first so leading whitespace cannot hide the "batch" prefix from the
    // anchored patterns below; trim again in case a separator was followed by spaces.
    const digits = batch
      .trim()
      .toLowerCase()
      .replace(/^batch[-_]?/i, '') // Remove "batch", "batch-", "batch_"
      .replace(/^medrxiv[-_]?batch[-_]?/i, '') // Remove "medrxiv_batch", "medrxiv-batch", etc.
      .trim();

    // parseInt with radix 10 already ignores leading zeros ("007" -> 7).
    batchNum = /^\d+$/.test(digits) ? Number.parseInt(digits, 10) : Number.NaN;
  }

  // A single validation for both input kinds: rejects NaN, Infinity, fractions,
  // values too large to represent exactly, zero and negatives.
  if (!Number.isSafeInteger(batchNum) || batchNum < 1) {
    throw new Error(
      `Invalid batch format: ${batch}. Expected a positive number or batch identifier.`,
    );
  }

  const paddedBatchNum = String(batchNum).padStart(2, '0');
  return server.toLowerCase() === 'medrxiv'
    ? `medRxiv_Batch_${paddedBatchNum}`
    : `Batch_${paddedBatchNum}`;
}
68
+
69
/**
 * Resolves the storage folder for a request that names either a month or a batch.
 *
 * - A batch maps to `Back_Content/<normalized batch>/`.
 * - A month on or after December 2018 maps to `Current_Content/<Month>_<Year>/`.
 * - A month before December 2018 cannot be mapped to a batch automatically and
 *   is rejected with a hint to pass a batch instead.
 *
 * @param options - Must carry exactly one of `month` or `batch`; `server`
 *   defaults to `'biorxiv'`.
 * @returns The resolved folder description.
 * @throws Error when both or neither of `month`/`batch` are given, when the
 *   month cannot be parsed, or when the month predates the cutoff.
 */
export function getFolderStructure(options: FolderStructureOptions): FolderStructure {
  const { month, batch } = options;

  if (month && batch) {
    throw new Error('Either month or batch must be specified, not both');
  }
  if (!month && !batch) {
    throw new Error('Either month or batch must be specified');
  }

  if (batch) {
    // Explicit batches always live under Back_Content.
    const batchFolder = normalizeBatch(batch, options.server);
    return {
      server: options.server || 'biorxiv',
      type: 'back',
      prefix: `Back_Content/${batchFolder}/`,
      batch: batchFolder,
    };
  }

  // The checks above leave `month` set; this guard only narrows the type.
  if (!month) {
    throw new Error('Invalid folder structure options');
  }

  const yearMonth = normalizeMonthToYYYYMM(month);
  if (!yearMonth) {
    throw new Error(`Invalid month format: ${month}. Expected YYYY-MM or Month_YYYY format.`);
  }

  const [yearText, monthText] = yearMonth.split('-');
  const year = Number(yearText);
  const monthNum = Number(monthText);

  // Content moved from Back_Content to Current_Content in late 2018;
  // December 1, 2018 is used as a conservative cutoff (Date months are 0-indexed).
  const cutoffTime = new Date(2018, 11, 1).getTime();
  const requestedTime = new Date(year, monthNum - 1, 1).getTime();

  if (requestedTime < cutoffTime) {
    // The month alone does not identify which batch holds the content.
    throw new Error(
      `Date ${month} is in the Back_Content period. Please specify a batch using --batch option. ` +
        `Available batches can be listed with 'biorxiv list' command.`,
    );
  }

  const monthFolder = `${getMonthName(monthNum)}_${year}`;
  return {
    server: options.server || 'biorxiv',
    type: 'current',
    prefix: `Current_Content/${monthFolder}/`,
    batch: monthFolder,
  };
}
130
+
131
/**
 * Returns a new array without duplicate folders, keeping the first occurrence
 * of each and preserving input order. The input array is not modified.
 *
 * Two folders are duplicates when `server`, `type`, `prefix` and `batch` are
 * all equal.
 *
 * @param folders - Folders to de-duplicate.
 * @returns The distinct folders in first-seen order.
 */
export function removeDuplicateFolders(folders: FolderStructure[]): FolderStructure[] {
  const seenKeys = new Set<string>();
  const uniqueFolders: FolderStructure[] = [];

  for (const folder of folders) {
    // JSON-encoding the tuple keeps field boundaries unambiguous, so two
    // different folders can never collapse to the same key.
    const key = JSON.stringify([folder.server, folder.type, folder.prefix, folder.batch]);
    if (!seenKeys.has(key)) {
      seenKeys.add(key);
      uniqueFolders.push(folder);
    }
  }

  return uniqueFolders;
}
143
+
144
/** Lower-case month names in calendar order, used to order `Month_YYYY` folder names. */
const CALENDAR_MONTHS = [
  'january',
  'february',
  'march',
  'april',
  'may',
  'june',
  'july',
  'august',
  'september',
  'october',
  'november',
  'december',
];

/**
 * Converts a month folder name ("January_2019", "Jan-2019", "2019-01") into a
 * sortable month ordinal (`year * 12 + monthIndex`), or NaN when unrecognized.
 */
function monthFolderOrdinal(batch: string): number {
  const name = batch.trim();

  const numeric = /^(\d{4})-(\d{2})$/.exec(name);
  if (numeric) {
    const monthNum = Number(numeric[2]);
    return monthNum >= 1 && monthNum <= 12 ? Number(numeric[1]) * 12 + (monthNum - 1) : Number.NaN;
  }

  const named = /^([A-Za-z]+)[-_ ](\d{4})$/.exec(name);
  if (!named) {
    return Number.NaN;
  }
  const wanted = named[1].toLowerCase();
  const monthIndex = CALENDAR_MONTHS.findIndex(
    (month) => month === wanted || month.slice(0, 3) === wanted,
  );
  return monthIndex === -1 ? Number.NaN : Number(named[2]) * 12 + monthIndex;
}

/** Ascending numeric comparison that places NaN keys after every valid key. */
function compareInvalidLast(a: number, b: number): number {
  const aInvalid = Number.isNaN(a);
  const bInvalid = Number.isNaN(b);
  if (aInvalid || bInvalid) {
    if (aInvalid && bInvalid) return 0;
    return aInvalid ? 1 : -1;
  }
  return a - b;
}

/**
 * Sorts folders chronologically (oldest first), putting batches before months.
 *
 * Back-content folders are ordered by the digits in their batch name;
 * current-content folders are ordered by the month and year in their name.
 * Names that cannot be interpreted are placed at the end of their group.
 *
 * The month is parsed explicitly rather than through `new Date(string)`,
 * whose handling of "Month_YYYY" strings is implementation-defined.
 *
 * @param folders - Folders to sort. The array is sorted in place.
 * @returns The same array instance, now sorted.
 */
export function sortFoldersChronologically(folders: FolderStructure[]): FolderStructure[] {
  return folders.sort((a, b) => {
    // Put batches before months
    if (a.type !== b.type) {
      return a.type === 'back' ? -1 : 1;
    }

    if (a.type === 'back') {
      // For batches, sort by batch number (all digits in the name, base 10)
      const aNum = Number.parseInt(a.batch.replace(/\D/g, ''), 10);
      const bNum = Number.parseInt(b.batch.replace(/\D/g, ''), 10);
      return compareInvalidLast(aNum, bNum);
    }

    // For months, sort chronologically (oldest first)
    return compareInvalidLast(monthFolderOrdinal(a.batch), monthFolderOrdinal(b.batch));
  });
}
170
+
171
/**
 * Converts a month given as `YYYY-MM` or `<MonthName>_YYYY` / `<MonthName>-YYYY`
 * into `YYYY-MM`.
 *
 * @param month - Month string, e.g. "2018-11", "November_2018" or "Nov-2018".
 * @returns The month as `YYYY-MM`, or null when the input matches neither
 *   format, the numeric month is outside 1-12, or the month name is unknown.
 */
export function normalizeMonthToYYYYMM(month: string): string | null {
  // Numeric form: pass through unchanged once the month part is range-checked.
  if (/^\d{4}-\d{2}$/.test(month)) {
    const monthPart = Number(month.slice(5));
    return monthPart >= 1 && monthPart <= 12 ? month : null;
  }

  // Named form: month name, one "-" or "_" separator, four-digit year.
  const named = /^([A-Za-z]+)(?:[-_])(\d{4})$/.exec(month);
  if (named === null) {
    return null;
  }

  const [, name, year] = named;
  const monthNum = getMonthNumber(name);
  return monthNum === null ? null : `${year}-${String(monthNum).padStart(2, '0')}`;
}
200
+
201
/**
 * Looks up the calendar number of a month by its English name.
 *
 * Both the full name ("september") and the three-letter abbreviation ("sep")
 * are recognized, in any letter case.
 *
 * @param monthName - Full or three-letter month name.
 * @returns 1 for January through 12 for December, or null when not recognized.
 */
function getMonthNumber(monthName: string): number | null {
  const fullNames = [
    'january',
    'february',
    'march',
    'april',
    'may',
    'june',
    'july',
    'august',
    'september',
    'october',
    'november',
    'december',
  ];

  // Register each month under its abbreviation and its full name.
  const numberByName = new Map<string, number>();
  fullNames.forEach((name, position) => {
    numberByName.set(name.slice(0, 3), position + 1);
    numberByName.set(name, position + 1);
  });

  return numberByName.get(monthName.toLowerCase()) ?? null;
}
230
+
231
/**
 * Gets the capitalized English month name for a month number.
 *
 * @param monthNum - Month number, a whole number from 1 to 12.
 * @returns Month name (e.g., "January").
 * @throws Error if `monthNum` is not a whole number in the range 1-12
 *   (this includes NaN and fractional values).
 */
function getMonthName(monthNum: number): string {
  const monthNames = [
    'January',
    'February',
    'March',
    'April',
    'May',
    'June',
    'July',
    'August',
    'September',
    'October',
    'November',
    'December',
  ];

  // Range comparisons alone let NaN and fractions through, which would index
  // the table with a non-existent key and return undefined.
  if (!Number.isInteger(monthNum) || monthNum < 1 || monthNum > 12) {
    throw new Error(`Invalid month number: ${monthNum}. Must be 1-12.`);
  }

  return monthNames[monthNum - 1];
}
package/src/index.ts ADDED
@@ -0,0 +1,3 @@
1
+ // Export all utility functions
2
+ export * from './biorxiv-parser.js';
3
+ export * from './folder-structure.js';
package/tsconfig.json ADDED
@@ -0,0 +1,31 @@
1
+ {
2
+ "compilerOptions": {
3
+ "target": "ES2022",
4
+ "module": "ESNext",
5
+ "moduleResolution": "node",
6
+ "allowSyntheticDefaultImports": true,
7
+ "esModuleInterop": true,
8
+ "allowJs": true,
9
+ "strict": true,
10
+ "skipLibCheck": true,
11
+ "forceConsistentCasingInFileNames": true,
12
+ "resolveJsonModule": true,
13
+ "isolatedModules": true,
14
+ "declaration": true,
15
+ "outDir": "./dist",
16
+ "rootDir": "./src",
17
+ "sourceMap": true,
18
+ "declarationMap": true,
19
+ "baseUrl": ".",
20
+ "paths": {
21
+ "@/*": ["./src/*"]
22
+ }
23
+ },
24
+ "include": [
25
+ "src/**/*"
26
+ ],
27
+ "exclude": [
28
+ "node_modules",
29
+ "dist"
30
+ ]
31
+ }