openrxiv 0.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli/src/api/api-client.d.ts +96 -0
- package/dist/cli/src/api/api-client.d.ts.map +1 -0
- package/dist/cli/src/api/api-client.js +257 -0
- package/dist/cli/src/aws/bucket-explorer.d.ts +26 -0
- package/dist/cli/src/aws/bucket-explorer.d.ts.map +1 -0
- package/dist/cli/src/aws/bucket-explorer.js +220 -0
- package/dist/cli/src/aws/config.d.ts +18 -0
- package/dist/cli/src/aws/config.d.ts.map +1 -0
- package/dist/cli/src/aws/config.js +191 -0
- package/dist/cli/src/aws/downloader.d.ts +13 -0
- package/dist/cli/src/aws/downloader.d.ts.map +1 -0
- package/dist/cli/src/aws/downloader.js +115 -0
- package/dist/cli/src/aws/month-lister.d.ts +18 -0
- package/dist/cli/src/aws/month-lister.d.ts.map +1 -0
- package/dist/cli/src/aws/month-lister.js +90 -0
- package/dist/cli/src/commands/batch-process.d.ts +3 -0
- package/dist/cli/src/commands/batch-process.d.ts.map +1 -0
- package/dist/cli/src/commands/batch-process.js +557 -0
- package/dist/cli/src/commands/config.d.ts +3 -0
- package/dist/cli/src/commands/config.d.ts.map +1 -0
- package/dist/cli/src/commands/config.js +42 -0
- package/dist/cli/src/commands/download.d.ts +3 -0
- package/dist/cli/src/commands/download.d.ts.map +1 -0
- package/dist/cli/src/commands/download.js +76 -0
- package/dist/cli/src/commands/list.d.ts +3 -0
- package/dist/cli/src/commands/list.d.ts.map +1 -0
- package/dist/cli/src/commands/list.js +18 -0
- package/dist/cli/src/commands/month-info.d.ts +3 -0
- package/dist/cli/src/commands/month-info.d.ts.map +1 -0
- package/dist/cli/src/commands/month-info.js +213 -0
- package/dist/cli/src/commands/summary.d.ts +3 -0
- package/dist/cli/src/commands/summary.d.ts.map +1 -0
- package/dist/cli/src/commands/summary.js +249 -0
- package/dist/cli/src/index.d.ts +3 -0
- package/dist/cli/src/index.d.ts.map +1 -0
- package/dist/cli/src/index.js +35 -0
- package/dist/cli/src/utils/batches.d.ts +9 -0
- package/dist/cli/src/utils/batches.d.ts.map +1 -0
- package/dist/cli/src/utils/batches.js +61 -0
- package/dist/cli/src/utils/batches.test.d.ts +2 -0
- package/dist/cli/src/utils/batches.test.d.ts.map +1 -0
- package/dist/cli/src/utils/batches.test.js +119 -0
- package/dist/cli/src/utils/default-server.d.ts +3 -0
- package/dist/cli/src/utils/default-server.d.ts.map +1 -0
- package/dist/cli/src/utils/default-server.js +20 -0
- package/dist/cli/src/utils/index.d.ts +5 -0
- package/dist/cli/src/utils/index.d.ts.map +1 -0
- package/dist/cli/src/utils/index.js +5 -0
- package/dist/cli/src/utils/meca-processor.d.ts +28 -0
- package/dist/cli/src/utils/meca-processor.d.ts.map +1 -0
- package/dist/cli/src/utils/meca-processor.js +503 -0
- package/dist/cli/src/utils/meca-processor.test.d.ts +2 -0
- package/dist/cli/src/utils/meca-processor.test.d.ts.map +1 -0
- package/dist/cli/src/utils/meca-processor.test.js +123 -0
- package/dist/cli/src/utils/months.d.ts +36 -0
- package/dist/cli/src/utils/months.d.ts.map +1 -0
- package/dist/cli/src/utils/months.js +135 -0
- package/dist/cli/src/utils/months.test.d.ts +2 -0
- package/dist/cli/src/utils/months.test.d.ts.map +1 -0
- package/dist/cli/src/utils/months.test.js +209 -0
- package/dist/cli/src/utils/requester-pays-error.d.ts +6 -0
- package/dist/cli/src/utils/requester-pays-error.d.ts.map +1 -0
- package/dist/cli/src/utils/requester-pays-error.js +20 -0
- package/dist/cli/src/version.d.ts +3 -0
- package/dist/cli/src/version.d.ts.map +1 -0
- package/dist/cli/src/version.js +2 -0
- package/dist/cli.cjs +98815 -0
- package/dist/utils/src/biorxiv-parser.d.ts +51 -0
- package/dist/utils/src/biorxiv-parser.d.ts.map +1 -0
- package/dist/utils/src/biorxiv-parser.js +126 -0
- package/dist/utils/src/folder-structure.d.ts +44 -0
- package/dist/utils/src/folder-structure.d.ts.map +1 -0
- package/dist/utils/src/folder-structure.js +207 -0
- package/dist/utils/src/index.d.ts +3 -0
- package/dist/utils/src/index.d.ts.map +1 -0
- package/dist/utils/src/index.js +3 -0
- package/package.json +76 -0
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Utility functions for parsing bioRxiv URLs and DOIs
|
|
3
|
+
*/
|
|
4
|
+
/**
 * Result returned by `parseBiorxivURL` when the input yields a valid DOI.
 */
export interface ParsedBiorxivURL {
    /** DOI extracted from the input, including any trailing `vN` version suffix. */
    doi: string;
    /** The DOI with a trailing `vN` version suffix removed. */
    baseDOI: string;
    /** Version digits without the leading "v" (e.g. "2"), or null when the DOI carries no version. */
    version: string | null;
    /** The input string exactly as it was passed to the parser. */
    fullURL: string;
    /** Validity flag; `parseBiorxivURL` sets it to true on every object it returns (invalid input yields null instead). */
    isValid: boolean;
}
|
|
11
|
+
/**
 * Components of a bioRxiv DOI as produced by `parseDOI`.
 */
export interface DOIParts {
    /** The DOI string that was parsed, unchanged. */
    doi: string;
    /** Text before the "/" (always "10.1101" for DOIs the parser accepts). */
    prefix: string;
    /** Text after the "/" with any trailing `vN` version removed. */
    suffix: string;
    /** "YYYY-MM-DD" taken from a date-based DOI; null for legacy numeric DOIs. */
    date: string | null;
    /** The 6-8 digit numeric identifier. */
    identifier: string;
    /** Version including its leading "v" (e.g. "v2"), or null when absent. Note: `extractVersion` returns the digits only. */
    version: string | null;
}
|
|
19
|
+
/**
 * Extract DOI from a bioRxiv URL
 *
 * @param url - A biorxiv.org or medrxiv.org content URL, a doi.org URL, or a bare DOI starting with "10.1101/"
 * @returns The text following the recognised URL marker (query string and fragment excluded) with a
 *   trailing page extension such as ".full" or ".pdf" removed, or null when the input matches none
 *   of the supported forms. The result is not validated; use `isValidBiorxivDOI` for that.
 */
export declare function extractDOIFromURL(url: string): string | null;
|
|
23
|
+
/**
 * Parse a bioRxiv DOI into its components
 * Supports both legacy numeric format (2019 and earlier) and current date-based format (2019+)
 *
 * @param doi - DOI such as "10.1101/2024.01.25.577295v3" or "10.1101/123456"
 * @returns The parsed components, or null when the string matches neither format
 */
export declare function parseDOI(doi: string): DOIParts | null;
|
|
28
|
+
/**
 * Extract base DOI (without version)
 * Works with both legacy numeric and current date-based formats
 *
 * @param doi - DOI, optionally ending in a `vN` version suffix
 * @returns The input with a trailing `vN` removed; returned unchanged when no such suffix is present
 */
export declare function extractBaseDOI(doi: string): string;
|
|
33
|
+
/**
 * Extract version from DOI
 * Works with both legacy numeric and current date-based formats
 *
 * @param doi - DOI, optionally ending in a `vN` version suffix
 * @returns The version digits without the leading "v" (e.g. "3"), or null when the DOI has no version suffix
 */
export declare function extractVersion(doi: string): string | null;
|
|
38
|
+
/**
 * Check if a DOI is a valid bioRxiv DOI
 * Supports both legacy numeric and current date-based formats
 *
 * @param doi - DOI string to check
 * @returns true exactly when `parseDOI` accepts the string
 */
export declare function isValidBiorxivDOI(doi: string): boolean;
|
|
43
|
+
/**
 * Check if a URL is a valid bioRxiv URL
 *
 * @param url - URL (or bare DOI) to check
 * @returns true when a DOI can be extracted from the input and that DOI is a valid bioRxiv DOI
 */
export declare function isValidBiorxivURL(url: string): boolean;
|
|
47
|
+
/**
 * Parse a bioRxiv URL and extract all relevant information
 *
 * @param url - URL (or bare DOI) to parse
 * @returns The extracted DOI, its base DOI and version together with the original input,
 *   or null when no valid bioRxiv DOI can be extracted
 */
export declare function parseBiorxivURL(url: string): ParsedBiorxivURL | null;
|
|
51
|
+
//# sourceMappingURL=biorxiv-parser.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"biorxiv-parser.d.ts","sourceRoot":"","sources":["../../../../utils/src/biorxiv-parser.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,MAAM,WAAW,gBAAgB;IAC/B,GAAG,EAAE,MAAM,CAAC;IACZ,OAAO,EAAE,MAAM,CAAC;IAChB,OAAO,EAAE,MAAM,GAAG,IAAI,CAAC;IACvB,OAAO,EAAE,MAAM,CAAC;IAChB,OAAO,EAAE,OAAO,CAAC;CAClB;AAED,MAAM,WAAW,QAAQ;IACvB,GAAG,EAAE,MAAM,CAAC;IACZ,MAAM,EAAE,MAAM,CAAC;IACf,MAAM,EAAE,MAAM,CAAC;IACf,IAAI,EAAE,MAAM,GAAG,IAAI,CAAC;IACpB,UAAU,EAAE,MAAM,CAAC;IACnB,OAAO,EAAE,MAAM,GAAG,IAAI,CAAC;CACxB;AAED;;GAEG;AACH,wBAAgB,iBAAiB,CAAC,GAAG,EAAE,MAAM,GAAG,MAAM,GAAG,IAAI,CAoC5D;AAED;;;GAGG;AACH,wBAAgB,QAAQ,CAAC,GAAG,EAAE,MAAM,GAAG,QAAQ,GAAG,IAAI,CAsCrD;AAED;;;GAGG;AACH,wBAAgB,cAAc,CAAC,GAAG,EAAE,MAAM,GAAG,MAAM,CAGlD;AAED;;;GAGG;AACH,wBAAgB,cAAc,CAAC,GAAG,EAAE,MAAM,GAAG,MAAM,GAAG,IAAI,CAGzD;AAED;;;GAGG;AACH,wBAAgB,iBAAiB,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO,CAEtD;AAED;;GAEG;AACH,wBAAgB,iBAAiB,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO,CAGtD;AAED;;GAEG;AACH,wBAAgB,eAAe,CAAC,GAAG,EAAE,MAAM,GAAG,gBAAgB,GAAG,IAAI,CAiBpE"}
|
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Utility functions for parsing bioRxiv URLs and DOIs
|
|
3
|
+
*/
|
|
4
|
+
/**
 * Extract DOI from a bioRxiv URL
 *
 * Supported inputs, checked in this order (the first marker found in the
 * input decides which form is used):
 *  1. biorxiv.org content URLs (`.../biorxiv.org/content/<doi>`)
 *  2. medrxiv.org content URLs (`.../medrxiv.org/content/<doi>`)
 *  3. doi.org URLs (`.../doi.org/<doi>`)
 *  4. a bare DOI starting with `10.1101/`
 *
 * The query string and fragment are never part of the result. Trailing page
 * extensions are removed, including stacked ones such as `.full.pdf`, as are
 * trailing slashes.
 *
 * @param url - URL or bare DOI to inspect
 * @returns The extracted DOI text, or null when nothing usable was found.
 *   The result is not validated as a bioRxiv DOI.
 */
export function extractDOIFromURL(url) {
    // Ordered [marker, pattern] pairs; order matters because the first marker present wins.
    const sources = [
        ['biorxiv.org/content/', /biorxiv\.org\/content\/([^?#]+)/],
        ['medrxiv.org/content/', /medrxiv\.org\/content\/([^?#]+)/],
        ['doi.org/', /doi\.org\/([^?#]+)/],
    ];
    let doi = null;
    let markerFound = false;
    for (const [marker, pattern] of sources) {
        if (url.includes(marker)) {
            markerFound = true;
            const match = url.match(pattern);
            if (match && match[1]) {
                doi = match[1];
            }
            break;
        }
    }
    // A bare DOI is only considered when no URL marker was present at all.
    if (!markerFound && url.startsWith('10.1101/')) {
        doi = url;
    }
    if (!doi) {
        return null;
    }
    // Strip every trailing page extension and slash, e.g. "v1.full.pdf" or "v1.full-text/".
    // "full-text" must precede "full" in the alternation so the longer name is tried first.
    const cleaned = doi.replace(/(?:\.(?:article-info|full-text|full|abstract|pdf|suppl)|\/)+$/, '');
    return cleaned || null;
}
|
|
41
|
+
/**
 * Split a bioRxiv DOI into its parts.
 *
 * Two shapes are recognised:
 *  - date-based: `10.1101/YYYY.MM.DD.XXXXXX` with an optional `vN`
 *  - legacy numeric: `10.1101/XXXXXX` with an optional `vN`
 * where `XXXXXX` is 6 to 8 digits.
 *
 * @param doi - DOI string to parse
 * @returns An object with `doi`, `prefix`, `suffix` (version removed), `date`
 *   ("YYYY-MM-DD" or null for legacy DOIs), `identifier` and `version`
 *   (including the leading "v", or null), or null when neither shape matches
 */
export function parseDOI(doi) {
    // Drops a trailing "vN" so the suffix never carries the version.
    const withoutVersion = (text) => text.replace(/(v\d+)$/, '');

    // Date-based shape: 10.1101/YYYY.MM.DD.XXXXXXvN
    const dated = doi.match(/^10\.1101\/(\d{4})\.(\d{2})\.(\d{2})\.(\d{6,8})(v\d+)?$/);
    if (dated) {
        const pieces = doi.split('/');
        return {
            doi,
            prefix: pieces[0],
            suffix: withoutVersion(pieces[1]),
            date: `${dated[1]}-${dated[2]}-${dated[3]}`,
            identifier: dated[4],
            version: dated[5] || null,
        };
    }

    // Legacy numeric shape: 10.1101/XXXXXX
    const legacy = doi.match(/^10\.1101\/(\d{6,8})(v\d+)?$/);
    if (!legacy) {
        return null;
    }
    const pieces = doi.split('/');
    return {
        doi,
        prefix: pieces[0],
        suffix: withoutVersion(pieces[1]),
        date: null,
        identifier: legacy[1],
        version: legacy[2] || null,
    };
}
|
|
79
|
+
/**
 * Return a DOI without its version marker.
 * Applies equally to legacy numeric and date-based DOIs.
 *
 * @param doi - DOI, optionally ending in `vN`
 * @returns The DOI with a trailing `vN` removed; unchanged when there is none
 */
export function extractBaseDOI(doi) {
    const trailingVersion = /v\d+$/;
    const base = doi.replace(trailingVersion, '');
    return base;
}
|
|
87
|
+
/**
 * Read the version number at the end of a DOI.
 * Applies equally to legacy numeric and date-based DOIs.
 *
 * @param doi - DOI, optionally ending in `vN`
 * @returns The digits following the final "v" (without the "v" itself), or null when the DOI has no version suffix
 */
export function extractVersion(doi) {
    const found = doi.match(/v(\d+)$/);
    if (!found) {
        return null;
    }
    const [, digits] = found;
    return digits;
}
|
|
95
|
+
/**
 * Tell whether a string is a bioRxiv DOI.
 * A DOI is valid exactly when `parseDOI` can parse it, so both the legacy
 * numeric and the date-based formats are accepted.
 *
 * @param doi - DOI string to check
 * @returns true when the DOI parses, false otherwise
 */
export function isValidBiorxivDOI(doi) {
    const parts = parseDOI(doi);
    return parts !== null;
}
|
|
102
|
+
/**
 * Tell whether a URL points at a bioRxiv DOI.
 *
 * @param url - URL (or bare DOI) to check
 * @returns true when a DOI can be extracted from the input and that DOI is valid
 */
export function isValidBiorxivURL(url) {
    const candidate = extractDOIFromURL(url);
    if (candidate === null) {
        return false;
    }
    return isValidBiorxivDOI(candidate);
}
|
|
109
|
+
/**
 * Break a bioRxiv URL down into DOI, base DOI and version.
 *
 * @param url - URL (or bare DOI) to parse
 * @returns `{ doi, baseDOI, version, fullURL, isValid }` where `fullURL` is the
 *   input as given and `isValid` is always true, or null when no valid
 *   bioRxiv DOI can be extracted from the input
 */
export function parseBiorxivURL(url) {
    const doi = extractDOIFromURL(url);
    // Short-circuits so the validity check only runs on a non-empty DOI.
    const usable = Boolean(doi) && isValidBiorxivDOI(doi);
    if (!usable) {
        return null;
    }
    return {
        doi,
        baseDOI: extractBaseDOI(doi),
        version: extractVersion(doi),
        fullURL: url,
        isValid: true,
    };
}
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Utility functions for determining bioRxiv folder structure
|
|
3
|
+
* based on the date requested.
|
|
4
|
+
*
|
|
5
|
+
* The bioRxiv structure is:
|
|
6
|
+
* - Before late 2018: Files are in Back_Content/Batch_[nn]/ folders
|
|
7
|
+
* - After late 2018: Files are in Current_Content/[Month]_[Year]/ folders
|
|
8
|
+
*/
|
|
9
|
+
/**
 * Location of one content folder, as returned by `getFolderStructure`.
 */
export interface FolderStructure {
    /** Preprint server the folder belongs to. */
    server: 'biorxiv' | 'medrxiv';
    /** 'back' for Back_Content batch folders, 'current' for Current_Content month folders. */
    type: 'current' | 'back';
    /** Folder path with a trailing slash, e.g. "Back_Content/Batch_01/" or "Current_Content/January_2020/". */
    prefix: string;
    /** Folder name without the parent directory: a normalized batch name or "<Month>_<Year>". */
    batch: string;
}
|
|
15
|
+
/**
 * Input for `getFolderStructure`. Exactly one of `month` or `batch` must be given.
 */
export interface FolderStructureOptions {
    /** Preprint server; the returned structure reports 'biorxiv' when this is omitted. */
    server?: 'biorxiv' | 'medrxiv';
    /** Month as "YYYY-MM" or as a month name joined to a year by "_" or "-" (e.g. "November_2018"). */
    month?: string;
    /** Batch identifier in any form accepted by `normalizeBatch` (e.g. "1", "batch-1", "Batch_01"). */
    batch?: string;
}
|
|
20
|
+
/**
 * Normalizes batch input to the standard "Batch_XX" format
 * @param batch - Batch input in various formats (e.g., "1", "batch-1", "Batch_01", "batch_01")
 * @param server - Server type to determine batch format (e.g., "biorxiv", "medrxiv"); compared
 *   case-insensitively, and "medrxiv" selects the "medRxiv_Batch_XX" form
 * @returns Normalized batch string in appropriate format, with the number zero-padded to at least two digits
 * @throws Error when the input does not reduce to a positive number
 */
export declare function normalizeBatch(batch: string | number, server?: string): string;
|
|
27
|
+
/**
 * Determines the folder structure for a given month or batch
 * @param options - Options containing month or batch (exactly one of the two)
 * @returns FolderStructure with the appropriate prefix and type
 * @throws Error when both or neither of month/batch are given, when the month cannot be
 *   normalized, or when the month is earlier than December 2018 (such content lives in
 *   Back_Content and must be addressed by batch)
 */
export declare function getFolderStructure(options: FolderStructureOptions): FolderStructure;
|
|
33
|
+
/**
 * Removes duplicate folder entries, keeping the first occurrence of each.
 * Two entries are duplicates when their batch, server, type and prefix are all equal.
 * @param folders - Folder list to filter; the input array is not modified
 * @returns A new array with duplicates removed, in the original order
 */
export declare function removeDuplicateFolders(folders: FolderStructure[]): FolderStructure[];
|
|
34
|
+
/**
 * Sort folders chronologically, putting batches before months
 *
 * Batches are ordered by ascending batch number and months from oldest to newest.
 * @param folders - Folder list to sort; the array is sorted in place
 * @returns The same array instance that was passed in
 */
export declare function sortFoldersChronologically(folders: FolderStructure[]): FolderStructure[];
|
|
38
|
+
/**
 * Normalizes various month formats to YYYY-MM
 * @param month - Month in various formats: "YYYY-MM", or a full or three-letter month name
 *   joined to a four-digit year by "_" or "-" (e.g. "November_2018", "nov-2018")
 * @returns Normalized YYYY-MM format or null if invalid
 */
export declare function normalizeMonthToYYYYMM(month: string): string | null;
|
|
44
|
+
//# sourceMappingURL=folder-structure.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"folder-structure.d.ts","sourceRoot":"","sources":["../../../../utils/src/folder-structure.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AAEH,MAAM,WAAW,eAAe;IAC9B,MAAM,EAAE,SAAS,GAAG,SAAS,CAAC;IAC9B,IAAI,EAAE,SAAS,GAAG,MAAM,CAAC;IACzB,MAAM,EAAE,MAAM,CAAC;IACf,KAAK,EAAE,MAAM,CAAC;CACf;AAED,MAAM,WAAW,sBAAsB;IACrC,MAAM,CAAC,EAAE,SAAS,GAAG,SAAS,CAAC;IAC/B,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,KAAK,CAAC,EAAE,MAAM,CAAC;CAChB;AAED;;;;;GAKG;AACH,wBAAgB,cAAc,CAC5B,KAAK,EAAE,MAAM,GAAG,MAAM,EACtB,MAAM,GAAE,MAA2B,GAClC,MAAM,CAsCR;AAED;;;;GAIG;AACH,wBAAgB,kBAAkB,CAAC,OAAO,EAAE,sBAAsB,GAAG,eAAe,CAuDnF;AAED,wBAAgB,sBAAsB,CAAC,OAAO,EAAE,eAAe,EAAE,GAAG,eAAe,EAAE,CAWpF;AAED;;GAEG;AACH,wBAAgB,0BAA0B,CAAC,OAAO,EAAE,eAAe,EAAE,GAAG,eAAe,EAAE,CAsBxF;AAED;;;;GAIG;AACH,wBAAgB,sBAAsB,CAAC,KAAK,EAAE,MAAM,GAAG,MAAM,GAAG,IAAI,CAuBnE"}
|
|
@@ -0,0 +1,207 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Utility functions for determining bioRxiv folder structure
|
|
3
|
+
* based on the date requested.
|
|
4
|
+
*
|
|
5
|
+
* The bioRxiv structure is:
|
|
6
|
+
* - Before late 2018: Files are in Back_Content/Batch_[nn]/ folders
|
|
7
|
+
* - After late 2018: Files are in Current_Content/[Month]_[Year]/ folders
|
|
8
|
+
*/
|
|
9
|
+
/**
 * Normalizes batch input to the standard "Batch_XX" format
 *
 * Numeric input must be a positive integer. String input is trimmed,
 * lower-cased, stripped of a leading "batch" / "medrxiv batch" prefix
 * (separator "-" or "_" optional) and of leading zeros; what remains must
 * consist of digits only.
 *
 * @param batch - Batch input in various formats (e.g., 1, "1", "batch-1", "Batch_01", "medRxiv_Batch_01")
 * @param server - Server type to determine batch format (e.g., "biorxiv", "medrxiv"); compared
 *   case-insensitively. When omitted, the value comes from `getDefaultServer()`.
 * @returns "medRxiv_Batch_XX" for the medrxiv server, otherwise "Batch_XX", with the number
 *   zero-padded to at least two digits
 * @throws Error when the input is not a positive whole number or a recognised batch identifier
 */
export function normalizeBatch(batch, server = getDefaultServer()) {
    const invalidBatch = () => new Error(`Invalid batch format: ${batch}. Expected a positive number or batch identifier.`);
    let batchNum;
    if (typeof batch === 'number') {
        // Reject NaN, Infinity and fractions, which would otherwise produce names like "Batch_1.5".
        if (!Number.isSafeInteger(batch) || batch < 1) {
            throw invalidBatch();
        }
        batchNum = batch;
    }
    else {
        // Remove common prefixes and normalize
        const digits = batch
            .trim() // Trim first so surrounding whitespace cannot hide a prefix
            .toLowerCase()
            .replace(/^batch[-_]?/i, '') // Remove "batch", "batch-", "batch_"
            .replace(/^medrxiv[-_]?batch[-_]?/i, '') // Remove "medrxiv_batch", "medrxiv-batch", etc.
            .replace(/^0+/, '') // Remove leading zeros
            .trim();
        // With leading zeros gone, an all-digit remainder is always >= 1; "0" becomes "" and is rejected here.
        if (!/^\d+$/.test(digits)) {
            throw invalidBatch();
        }
        batchNum = parseInt(digits, 10);
    }
    const formattedBatchNum = batchNum.toString().padStart(2, '0');
    return server.toLowerCase() === 'medrxiv'
        ? `medRxiv_Batch_${formattedBatchNum}`
        : `Batch_${formattedBatchNum}`;
}
|
|
44
|
+
/**
 * Determines the folder structure for a given month or batch
 *
 * A batch always maps to `Back_Content/<batch>/`. A month maps to
 * `Current_Content/<Month>_<Year>/`, but only from December 2018 onwards;
 * earlier months live in Back_Content batches that cannot be derived from
 * the date, so the caller has to name the batch explicitly.
 *
 * @param options - Options containing month or batch (exactly one of the two) and an optional
 *   server, which defaults to 'biorxiv'
 * @returns FolderStructure with the appropriate prefix and type
 * @throws Error when both or neither of month/batch are given, when the batch or month is
 *   malformed, or when the month falls before December 2018
 */
export function getFolderStructure(options) {
    if (options.month && options.batch) {
        throw new Error('Either month or batch must be specified, not both');
    }
    if (!options.month && !options.batch) {
        throw new Error('Either month or batch must be specified');
    }
    // Resolve the server once so the batch name and the reported server always agree,
    // and so an omitted server never reaches normalizeBatch's own default.
    const server = options.server || 'biorxiv';
    if (options.batch) {
        // If batch is specified, use Back_Content structure
        const normalizedBatch = normalizeBatch(options.batch, server);
        return {
            server,
            type: 'back',
            prefix: `Back_Content/${normalizedBatch}/`,
            batch: normalizedBatch,
        };
    }
    // From here on options.month is set: the guards above rule out every other combination.
    // Normalize month format to YYYY-MM
    const normalizedMonth = normalizeMonthToYYYYMM(options.month);
    if (!normalizedMonth) {
        throw new Error(`Invalid month format: ${options.month}. Expected YYYY-MM or Month_YYYY format.`);
    }
    const [year, monthNum] = normalizedMonth.split('-').map(Number);
    // bioRxiv switched from Back_Content to Current_Content in late 2018
    // We'll use December 2018 as the cutoff point to be safe
    const cutoffDate = new Date(2018, 11, 1); // December 1, 2018 (0-indexed month)
    const requestedDate = new Date(year, monthNum - 1, 1);
    if (requestedDate < cutoffDate) {
        // Back_Content period - the batch cannot be inferred from the date,
        // so the user has to specify it explicitly.
        throw new Error(`Date ${options.month} is in the Back_Content period. Please specify a batch using --batch option. ` +
            `Available batches can be listed with 'biorxiv list' command.`);
    }
    // Use Current_Content structure
    const monthName = getMonthName(monthNum);
    return {
        server,
        type: 'current',
        prefix: `Current_Content/${monthName}_${year}/`,
        batch: `${monthName}_${year}`,
    };
}
|
|
96
|
+
/**
 * Drops repeated folder entries, keeping the first occurrence of each.
 * Entries count as equal when batch, server, type and prefix all match.
 *
 * @param folders - Folder list to filter (left untouched)
 * @returns A new array containing only the first occurrence of every distinct folder
 */
export function removeDuplicateFolders(folders) {
    const sameFolder = (left, right) => left.batch === right.batch &&
        left.server === right.server &&
        left.type === right.type &&
        left.prefix === right.prefix;
    return folders.filter((candidate, position) => {
        // An entry survives only if the earliest equal entry is the entry itself.
        const firstSeenAt = folders.findIndex((other) => sameFolder(other, candidate));
        return firstSeenAt === position;
    });
}
|
|
102
|
+
/**
 * Sort folders chronologically, putting batches before months
 *
 * Ordering rules:
 *  - 'back' folders come before 'current' folders
 *  - 'back' folders are ordered by ascending batch number
 *  - 'current' folders are ordered from oldest to newest month
 *
 * The array is sorted in place and the same instance is returned.
 *
 * @param folders - Folder list to sort
 * @returns The input array, now sorted
 */
export function sortFoldersChronologically(folders) {
    const MONTH_NAMES = [
        'january',
        'february',
        'march',
        'april',
        'may',
        'june',
        'july',
        'august',
        'september',
        'october',
        'november',
        'december',
    ];
    // Numeric part of a batch name, e.g. "medRxiv_Batch_07" -> 7.
    const batchNumber = (folder) => parseInt(folder.batch.replace(/\D/g, ''), 10);
    // Timestamp of the first day of a "<Month>_<Year>" folder. The name is parsed
    // explicitly because Date parsing of such non-ISO strings is implementation-defined.
    const monthTimestamp = (folder) => {
        const parts = /^([A-Za-z]+)[-_ ](\d{4})$/.exec(folder.batch);
        if (parts) {
            const name = parts[1].toLowerCase();
            const monthIndex = MONTH_NAMES.findIndex((full) => full === name || full.slice(0, 3) === name);
            if (monthIndex !== -1) {
                return new Date(Number(parts[2]), monthIndex, 1).getTime();
            }
        }
        // Unrecognised shape: fall back to the engine's own date parsing.
        return new Date(folder.batch).getTime();
    };
    // An undeterminable order (NaN) is reported as "equal", which is how sort treats NaN anyway.
    const orderOf = (difference) => (Number.isNaN(difference) ? 0 : difference);
    return folders.sort((a, b) => {
        // Put batches before months
        if (a.type === 'back' && b.type === 'current')
            return -1;
        if (a.type === 'current' && b.type === 'back')
            return 1;
        // For batches, sort by batch number
        if (a.type === 'back' && b.type === 'back') {
            return orderOf(batchNumber(a) - batchNumber(b));
        }
        // For months, sort chronologically (oldest first)
        if (a.type === 'current' && b.type === 'current') {
            return orderOf(monthTimestamp(a) - monthTimestamp(b));
        }
        return 0;
    });
}
|
|
127
|
+
/**
 * Converts a month given in one of the supported notations to "YYYY-MM".
 *
 * Supported notations:
 *  - "YYYY-MM" (returned as-is when the month is 01-12)
 *  - a month name and a four-digit year joined by "_" or "-", e.g. "November_2018"
 *
 * @param month - Month in one of the notations above
 * @returns The month as "YYYY-MM", or null when the input is not recognised
 */
export function normalizeMonthToYYYYMM(month) {
    // Numeric notation: only the month range needs checking.
    if (month.match(/^\d{4}-\d{2}$/)) {
        const monthValue = Number(month.slice(5));
        const inRange = monthValue >= 1 && monthValue <= 12;
        return inRange ? month : null;
    }
    // Named notation, e.g. "November_2018".
    const named = month.match(/^([A-Za-z]+)(?:[-_])(\d{4})$/);
    if (!named) {
        return null;
    }
    const [, name, year] = named;
    const monthValue = getMonthNumber(name);
    if (monthValue === null) {
        return null;
    }
    return `${year}-${monthValue.toString().padStart(2, '0')}`;
}
|
|
153
|
+
/**
 * Looks up the calendar number of a month.
 * Full names are tried first, then three-letter abbreviations; matching ignores case.
 *
 * @param monthName - Full month name or its first three letters
 * @returns Month number (1-12) or null if the name is not recognised
 */
function getMonthNumber(monthName) {
    const wanted = monthName.toLowerCase();
    const fullNames = [
        'january',
        'february',
        'march',
        'april',
        'may',
        'june',
        'july',
        'august',
        'september',
        'october',
        'november',
        'december',
    ];
    const byFullName = fullNames.indexOf(wanted);
    if (byFullName !== -1) {
        return byFullName + 1;
    }
    // No full-name hit: compare against the three-letter abbreviations.
    const byAbbreviation = fullNames.findIndex((name) => name.slice(0, 3).toLowerCase() === wanted);
    return byAbbreviation === -1 ? null : byAbbreviation + 1;
}
|
|
180
|
+
/**
 * Looks up the capitalised English name of a month.
 *
 * @param monthNum - Month number (1-12)
 * @returns Month name (e.g., "January")
 * @throws Error when the number is below 1 or above 12
 */
function getMonthName(monthNum) {
    // Validate before touching the lookup table.
    if (monthNum < 1 || monthNum > 12) {
        throw new Error(`Invalid month number: ${monthNum}. Must be 1-12.`);
    }
    const labels = [
        'January',
        'February',
        'March',
        'April',
        'May',
        'June',
        'July',
        'August',
        'September',
        'October',
        'November',
        'December',
    ];
    const offset = monthNum - 1;
    return labels[offset];
}
|
|
205
|
+
/**
 * Server assumed when a caller does not name one.
 *
 * Supplies the default for `normalizeBatch`'s optional `server` parameter.
 * It returns 'biorxiv', the same fallback `getFolderStructure` applies to an
 * omitted server, instead of throwing as the previous placeholder did.
 *
 * @returns The default server identifier, 'biorxiv'
 */
function getDefaultServer() {
    return 'biorxiv';
}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../../utils/src/index.ts"],"names":[],"mappings":"AACA,cAAc,qBAAqB,CAAC;AACpC,cAAc,uBAAuB,CAAC"}
|
package/package.json
ADDED
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "openrxiv",
|
|
3
|
+
"version": "0.0.0",
|
|
4
|
+
"description": "CLI tool to download openRxiv MECA files from AWS S3 for text and data mining",
|
|
5
|
+
"main": "dist/src/index.js",
|
|
6
|
+
"bin": {
|
|
7
|
+
"biorxiv": "./dist/cli.cjs",
|
|
8
|
+
"openrxiv": "./dist/cli.cjs",
|
|
9
|
+
"medrxiv": "./dist/cli.cjs"
|
|
10
|
+
},
|
|
11
|
+
"files": [
|
|
12
|
+
"dist"
|
|
13
|
+
],
|
|
14
|
+
"type": "module",
|
|
15
|
+
"scripts": {
|
|
16
|
+
"clean": "rimraf dist",
|
|
17
|
+
"unlink": "npm uninstall -g biorxiv",
|
|
18
|
+
"link": "npm run unlink; npm link;",
|
|
19
|
+
"dev": "npm run link && esbuild src/index.ts --bundle --outfile=dist/cli.cjs --platform=node --external:fsevents --target=node14 --watch",
|
|
20
|
+
"lint": "eslint \"src/**/!(*.spec).ts\" -c ../../.eslintrc.cjs",
|
|
21
|
+
"lint:format": "npx prettier --check \"src/**/*.ts\"",
|
|
22
|
+
"test": "vitest run",
|
|
23
|
+
"copy:version": "echo \"const version = '\"$npm_package_version\"';\nexport default version;\" > src/version.ts",
|
|
24
|
+
"test:watch": "vitest watch",
|
|
25
|
+
"build:esm": "tsc",
|
|
26
|
+
"build:cli": "esbuild src/index.ts --bundle --outfile=dist/cli.cjs --platform=node --external:fsevents --target=node14",
|
|
27
|
+
"build": "npm-run-all -l clean copy:version -p build:cli build:esm"
|
|
28
|
+
},
|
|
29
|
+
"keywords": [
|
|
30
|
+
"biorxiv",
|
|
31
|
+
"cli",
|
|
32
|
+
"aws",
|
|
33
|
+
"s3",
|
|
34
|
+
"download",
|
|
35
|
+
"meca",
|
|
36
|
+
"research",
|
|
37
|
+
"text-mining",
|
|
38
|
+
"data-mining"
|
|
39
|
+
],
|
|
40
|
+
"author": "Curvenote",
|
|
41
|
+
"license": "MIT",
|
|
42
|
+
"engines": {
|
|
43
|
+
"node": ">=18.0.0"
|
|
44
|
+
},
|
|
45
|
+
"dependencies": {
|
|
46
|
+
"@aws-sdk/client-s3": "^3.0.0",
|
|
47
|
+
"@aws-sdk/s3-request-presigner": "^3.0.0",
|
|
48
|
+
"axios": "^1.6.0",
|
|
49
|
+
"biorxiv-utils": "^0.0.0",
|
|
50
|
+
"boxen": "^8.0.1",
|
|
51
|
+
"character-entities": "^2.0.2",
|
|
52
|
+
"chalk": "^5.0.0",
|
|
53
|
+
"cli-progress": "^3.12.0",
|
|
54
|
+
"commander": "^11.0.0",
|
|
55
|
+
"conf": "^10.0.0",
|
|
56
|
+
"inquirer": "^9.0.0",
|
|
57
|
+
"jszip": "^3.10.1",
|
|
58
|
+
"ora": "^7.0.0",
|
|
59
|
+
"adm-zip": "^0.5.10",
|
|
60
|
+
"unified": "^11.0.0",
|
|
61
|
+
"xast-util-from-xml": "^4.0.0",
|
|
62
|
+
"p-limit": "^7.0.0"
|
|
63
|
+
},
|
|
64
|
+
"devDependencies": {
|
|
65
|
+
"@types/cli-progress": "^3.11.0",
|
|
66
|
+
"@types/inquirer": "^9.0.0"
|
|
67
|
+
},
|
|
68
|
+
"repository": {
|
|
69
|
+
"type": "git",
|
|
70
|
+
"url": "https://github.com/continuous-foundation/openrxiv.git"
|
|
71
|
+
},
|
|
72
|
+
"bugs": {
|
|
73
|
+
"url": "https://github.com/continuous-foundation/openrxiv/issues"
|
|
74
|
+
},
|
|
75
|
+
"homepage": "https://github.com/continuous-foundation/openrxiv#readme"
|
|
76
|
+
}
|