openrxiv-utils 0.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,258 @@
1
+ import { describe, it, expect } from 'vitest';
2
+ import {
3
+ extractDOIFromURL,
4
+ parseDOI,
5
+ extractBaseDOI,
6
+ extractVersion,
7
+ isValidBiorxivDOI,
8
+ isValidBiorxivURL,
9
+ parseBiorxivURL,
10
+ } from './biorxiv-parser.js';
11
+
12
/**
 * Specification for the bioRxiv/medRxiv URL and DOI parsing helpers.
 *
 * Table-driven cases embed their input in the test title (`%s` / `%j`) so a
 * failing row can be identified directly from the reporter output.
 */
describe('BioRxiv URL Parser', () => {
  describe('extractDOIFromURL', () => {
    it.each([
      [
        'https://www.biorxiv.org/content/10.1101/2024.01.25.577295v3',
        '10.1101/2024.01.25.577295v3',
      ],
      [
        'https://www.medrxiv.org/content/10.1101/2020.03.19.20039131v2',
        '10.1101/2020.03.19.20039131v2',
      ],
      [
        'https://www.biorxiv.org/content/10.1101/2024.01.25.577295v3.article-info',
        '10.1101/2024.01.25.577295v3',
      ],
      [
        'https://www.biorxiv.org/content/10.1101/2024.01.25.577295v3.full',
        '10.1101/2024.01.25.577295v3',
      ],
      [
        'https://www.biorxiv.org/content/10.1101/2024.01.25.577295v3.abstract',
        '10.1101/2024.01.25.577295v3',
      ],
      [
        'https://www.biorxiv.org/content/10.1101/2024.01.25.577295v3.pdf',
        '10.1101/2024.01.25.577295v3',
      ],
      [
        'https://www.biorxiv.org/content/10.1101/2024.01.25.577295v3.suppl',
        '10.1101/2024.01.25.577295v3',
      ],
      ['https://doi.org/10.1101/2024.01.25.577295v3', '10.1101/2024.01.25.577295v3'],
      ['10.1101/2024.01.25.577295v3', '10.1101/2024.01.25.577295v3'],
    ])('should extract DOI from %s', (url, expected) => {
      expect(extractDOIFromURL(url)).toBe(expected);
    });

    it.each([
      'https://example.com/not-biorxiv',
      'https://biorxiv.org/invalid-path',
      'not-a-url',
      '',
      'https://biorxiv.org/',
    ])('should return null for unrecognised input %j', (url) => {
      expect(extractDOIFromURL(url)).toBeNull();
    });
  });

  describe('parseDOI', () => {
    it('should parse DOI with version', () => {
      const doi = '10.1101/2024.01.25.577295v3';

      expect(parseDOI(doi)).toEqual({
        doi: '10.1101/2024.01.25.577295v3',
        prefix: '10.1101',
        date: '2024-01-25',
        identifier: '577295',
        suffix: '2024.01.25.577295',
        version: 'v3',
      });
    });

    it('should parse DOI without version', () => {
      const doi = '10.1101/2024.01.25.577295';

      expect(parseDOI(doi)).toEqual({
        doi: '10.1101/2024.01.25.577295',
        prefix: '10.1101',
        date: '2024-01-25',
        identifier: '577295',
        suffix: '2024.01.25.577295',
        version: null,
      });
    });

    it('should parse DOI with zero-padded month and day', () => {
      const doi = '10.1101/2024.01.05.123456v1';

      expect(parseDOI(doi)).toEqual({
        doi: '10.1101/2024.01.05.123456v1',
        prefix: '10.1101',
        date: '2024-01-05',
        identifier: '123456',
        suffix: '2024.01.05.123456',
        version: 'v1',
      });
    });

    it('should parse legacy numeric DOI', () => {
      const doi = '10.1101/486050v2';

      expect(parseDOI(doi)).toEqual({
        doi: '10.1101/486050v2',
        prefix: '10.1101',
        date: null,
        identifier: '486050',
        suffix: '486050',
        version: 'v2',
      });
    });

    it.each([
      '10.1000/123.456.789', // wrong registrant prefix
      '10.1101/2024.1.25.577295', // month is not zero-padded
      '10.1101/2024.01.25.57729', // identifier too short (5 digits)
      '10.1101/2024.01.25.5772955666', // identifier too long (10 digits)
      '10.1101/2024.01.25.577295v', // version marker without a number
      'invalid-doi',
      '',
    ])('should return null for invalid DOI %j', (doi) => {
      expect(parseDOI(doi)).toBeNull();
    });
  });

  describe('extractBaseDOI', () => {
    it.each([
      ['10.1101/2024.01.25.577295', '10.1101/2024.01.25.577295'], // Same DOI
      ['10.1101/2024.01.25.577295v3', '10.1101/2024.01.25.577295'], // Remove version
      ['10.1101/2024.01.25.577295v12', '10.1101/2024.01.25.577295'], // Remove double digit version
      ['10.1101/2020.03.19.20039131v2', '10.1101/2020.03.19.20039131'], // medrxiv variant
    ])('should extract base DOI from %s', (doi, expected) => {
      expect(extractBaseDOI(doi)).toBe(expected);
    });
  });

  describe('extractVersion', () => {
    it.each([
      ['10.1101/2024.01.25.577295v3', '3'],
      ['10.1101/2024.01.25.577295', null],
      ['10.1101/2024.01.25.577295v12', '12'],
    ])('should extract version from %s', (doi, expected) => {
      expect(extractVersion(doi)).toBe(expected);
    });
  });

  describe('isValidBiorxivDOI', () => {
    it.each([
      // Valid DOIs
      ['10.1101/2024.01.25.577295v3', true],
      ['10.1101/2024.01.25.577295', true],
      ['10.1101/2020.01.15.123456v2', true],
      ['10.1101/2018.01.15.789012', true],
      ['10.1101/789012', true],
      ['10.1101/789012v12', true],
      ['10.1101/789012v3', true],
      ['10.1101/2020.03.19.20039131v2', true],
      // Invalid DOIs
      ['10.1101/2024.1.25.577295', false],
      ['10.1101/2024.01.25.57729', false],
      ['invalid-doi', false],
      ['10.1101/78901', false],
      ['10.1101/78901v3', false],
      ['', false],
    ])('should report validity of %j as %s', (doi, expected) => {
      expect(isValidBiorxivDOI(doi)).toBe(expected);
    });
  });

  describe('isValidBiorxivURL', () => {
    it.each([
      // Valid
      ['https://www.biorxiv.org/content/10.1101/2024.01.25.577295v3', true],
      ['https://www.biorxiv.org/content/10.1101/2024.01.25.577295v3.article-info', true],
      ['https://doi.org/10.1101/2024.01.25.577295v3', true],
      ['10.1101/2024.01.25.577295v3', true],
      ['https://www.biorxiv.org/content/10.1101/486050v2.article-info', true],
      ['https://www.biorxiv.org/content/10.1101/486050', true],
      // Invalid
      ['https://example.com/not-biorxiv', false],
      ['https://biorxiv.org/invalid-path', false],
      ['10.1000/123.456.789', false],
      ['invalid-url', false],
      ['', false],
    ])('should report validity of %j as %s', (url, expected) => {
      expect(isValidBiorxivURL(url)).toBe(expected);
    });
  });

  describe('parseBiorxivURL', () => {
    it('should parse valid bioRxiv URL', () => {
      const url = 'https://www.biorxiv.org/content/10.1101/2024.01.25.577295v3.article-info';

      expect(parseBiorxivURL(url)).toEqual({
        doi: '10.1101/2024.01.25.577295v3',
        baseDOI: '10.1101/2024.01.25.577295',
        version: '3',
        fullURL: url,
        isValid: true,
      });
    });

    it('should parse URL without version', () => {
      const url = 'https://www.biorxiv.org/content/10.1101/2024.01.25.577295';

      expect(parseBiorxivURL(url)).toEqual({
        doi: '10.1101/2024.01.25.577295',
        baseDOI: '10.1101/2024.01.25.577295',
        version: null,
        fullURL: url,
        isValid: true,
      });
    });

    it('should return null for invalid URL', () => {
      const url = 'https://example.com/not-biorxiv';

      expect(parseBiorxivURL(url)).toBeNull();
    });
  });

  describe('Edge Cases', () => {
    it.each([
      ['10.1101/2024.01.25.123456v1', '123456'], // 6-digit identifier
      ['10.1101/2020.03.19.20039131v2', '20039131'], // 8-digit identifier
    ])('should handle identifier length in %s', (doi, expected) => {
      expect(parseDOI(doi)?.identifier).toBe(expected);
    });

    it('should handle URLs with query parameters', () => {
      const url = 'https://www.biorxiv.org/content/10.1101/2024.01.25.577295v3?query=test';

      expect(extractDOIFromURL(url)).toBe('10.1101/2024.01.25.577295v3');
    });

    it('should handle URLs with fragments', () => {
      const url = 'https://www.biorxiv.org/content/10.1101/2024.01.25.577295v3#section';

      expect(extractDOIFromURL(url)).toBe('10.1101/2024.01.25.577295v3');
    });
  });
});
@@ -0,0 +1,161 @@
1
+ /**
2
+ * Utility functions for parsing bioRxiv and medRxiv URLs and DOIs
3
+ */
4
+
5
+ export interface ParsedBiorxivURL { // Result produced by parseBiorxivURL for an input it accepts
6
+ doi: string; // DOI as returned by extractDOIFromURL, including any trailing `vN` version
7
+ baseDOI: string; // Same DOI with the trailing `vN` version removed (extractBaseDOI)
8
+ version: string | null; // Version digits without the leading `v` (e.g. '3'); null when the DOI has no version
9
+ fullURL: string; // The input string exactly as it was passed to parseBiorxivURL
10
+ isValid: boolean; // parseBiorxivURL always sets this to true; rejected inputs yield null instead of an object
11
+ }
12
+
13
+ export interface DOIParts { // Components of a DOI as returned by parseDOI
14
+ doi: string; // The DOI string exactly as it was passed to parseDOI
15
+ prefix: string; // Text before the '/'; '10.1101' for every DOI parseDOI accepts
16
+ suffix: string; // Text after the '/' with any trailing `vN` version removed
17
+ date: string | null; // 'YYYY-MM-DD' assembled from a date-based DOI; null for legacy numeric DOIs
18
+ identifier: string; // The 6-8 digit numeric identifier captured from the DOI
19
+ version: string | null; // Version including the leading `v` (e.g. 'v3'), unlike extractVersion; null when absent
20
+ }
21
+
22
/**
 * Extract the DOI from a bioRxiv/medRxiv content URL, a doi.org URL, or a bare DOI.
 *
 * Recognised inputs (checked in this order):
 * - anything containing `biorxiv.org/content/<doi>`
 * - anything containing `medrxiv.org/content/<doi>`
 * - anything containing `doi.org/<doi>`
 * - a bare DOI starting with `10.1101/`
 *
 * For URL inputs the query string and fragment are dropped. Trailing slashes
 * and trailing page suffixes (`.article-info`, `.full`, `.full-text`,
 * `.abstract`, `.pdf`, `.pdf+html`, `.suppl`, `.supplementary-material`) are
 * removed, including chained suffixes such as `.full.pdf`.
 *
 * The result is NOT validated as a bioRxiv DOI; use `isValidBiorxivDOI` for that.
 *
 * @param url - URL or bare DOI; surrounding whitespace is ignored
 * @returns the extracted DOI (still carrying any `vN` version), or `null`
 *          when the input matches none of the recognised forms
 */
export function extractDOIFromURL(url: string): string | null {
  const input = url.trim();
  let doi: string | null = null;

  if (input.includes('biorxiv.org/content/')) {
    doi = input.match(/biorxiv\.org\/content\/([^?#]+)/)?.[1] ?? null;
  } else if (input.includes('medrxiv.org/content/')) {
    doi = input.match(/medrxiv\.org\/content\/([^?#]+)/)?.[1] ?? null;
  } else if (input.includes('doi.org/')) {
    doi = input.match(/doi\.org\/([^?#]+)/)?.[1] ?? null;
  } else if (input.startsWith('10.1101/')) {
    // Direct DOI input
    doi = input;
  }

  if (!doi) {
    return null;
  }

  const cleaned = doi
    // A trailing slash would otherwise hide the suffix from the `$` anchor below.
    .replace(/\/+$/, '')
    // `+` on the group strips stacked suffixes (e.g. `.full.pdf`) in one pass.
    .replace(
      /(?:\.(?:article-info|full-text|full|abstract|pdf(?:\+html)?|supplementary-material|suppl))+$/,
      '',
    );

  // Never hand back an empty string from a `string | null` API.
  return cleaned.length > 0 ? cleaned : null;
}
62
+
63
/**
 * Split a bioRxiv DOI into its components.
 *
 * Two layouts are accepted:
 * - date-based: `10.1101/YYYY.MM.DD.<6-8 digits>[vN]`
 * - legacy numeric: `10.1101/<6-8 digits>[vN]`
 *
 * @param doi - DOI string, optionally ending in a `vN` version
 * @returns the parsed parts, or `null` when the string matches neither layout.
 *          `date` is `null` for legacy DOIs; `version` keeps its leading `v`.
 */
export function parseDOI(doi: string): DOIParts | null {
  // The date-based layout is tried first; the legacy pattern is only consulted when it fails.
  const datedMatch = /^10\.1101\/(\d{4})\.(\d{2})\.(\d{2})\.(\d{6,8})(v\d+)?$/.exec(doi);
  const matched = datedMatch ?? /^10\.1101\/(\d{6,8})(v\d+)?$/.exec(doi);

  if (matched === null) {
    return null;
  }

  // A matching DOI contains exactly one '/', separating prefix from suffix.
  const slashAt = doi.indexOf('/');
  const prefix = doi.slice(0, slashAt);
  const suffix = doi.slice(slashAt + 1).replace(/(v\d+)$/, '');

  // Both patterns end with the same two capture groups: identifier, then optional version.
  const [identifier, version] = matched.slice(-2);

  const date =
    datedMatch === null ? null : `${datedMatch[1]}-${datedMatch[2]}-${datedMatch[3]}`;

  return {
    doi,
    prefix,
    suffix,
    date,
    identifier,
    version: version ?? null,
  };
}
106
+
107
/**
 * Return the DOI with its trailing `vN` version marker removed.
 *
 * The input is not validated: any string ending in `v` followed by digits has
 * that ending cut off, and every other string is returned unchanged.
 *
 * @param doi - DOI string in either the legacy numeric or date-based format
 * @returns the DOI without its version suffix
 */
export function extractBaseDOI(doi: string): string {
  const versionStart = doi.search(/v\d+$/);
  if (versionStart === -1) {
    return doi;
  }
  return doi.slice(0, versionStart);
}
115
+
116
/**
 * Read the version number from the end of a DOI.
 *
 * @param doi - DOI string in either the legacy numeric or date-based format
 * @returns the digits following the final `v` (e.g. `'3'` for `…v3`), or
 *          `null` when the DOI does not end in `v` followed by digits
 */
export function extractVersion(doi: string): string | null {
  const found = /v(\d+)$/.exec(doi);
  if (found === null) {
    return null;
  }
  return found[1];
}
124
+
125
/**
 * Tell whether a string is a DOI that `parseDOI` accepts.
 *
 * @param doi - candidate DOI, with or without a `vN` version
 * @returns `true` when `parseDOI` yields a result, `false` when it yields `null`
 */
export function isValidBiorxivDOI(doi: string): boolean {
  const parts = parseDOI(doi);
  return parts !== null;
}
132
+
133
/**
 * Tell whether a URL (or bare DOI) points at a DOI that `isValidBiorxivDOI` accepts.
 *
 * @param url - URL or bare DOI to check
 * @returns `true` only when a DOI can be extracted from the input and that
 *          DOI passes `isValidBiorxivDOI`
 */
export function isValidBiorxivURL(url: string): boolean {
  const candidate = extractDOIFromURL(url);
  if (candidate === null) {
    return false;
  }
  return isValidBiorxivDOI(candidate);
}
140
+
141
/**
 * Parse a URL (or bare DOI) into its DOI, base DOI and version.
 *
 * @param url - URL or bare DOI to parse
 * @returns the parsed record with `fullURL` set to the input and `isValid`
 *          set to `true`, or `null` when no DOI can be extracted or the
 *          extracted DOI fails `isValidBiorxivDOI`
 */
export function parseBiorxivURL(url: string): ParsedBiorxivURL | null {
  const doi = extractDOIFromURL(url);

  if (doi && isValidBiorxivDOI(doi)) {
    return {
      doi,
      baseDOI: extractBaseDOI(doi),
      version: extractVersion(doi),
      fullURL: url,
      isValid: true,
    };
  }

  return null;
}