arxiv-api-wrapper 1.1.1 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +69 -1
- package/package.json +2 -2
- package/src/index.ts +45 -0
- package/src/oaiClient.ts +425 -0
- package/src/oaiParser.ts +264 -0
- package/src/oaiToArxiv.ts +204 -0
- package/src/oaiTypes.ts +248 -0
- package/tests/oai.integration.test.ts +222 -0
- package/tests/oai.test.ts +248 -0
- package/tests/oaiToArxiv.test.ts +131 -0
package/README.md
CHANGED
|
@@ -41,6 +41,60 @@ const papers = await getArxivEntriesById(['2101.01234', '2101.05678']);
|
|
|
41
41
|
- **Retry Logic**: Automatic retries with exponential backoff for transient failures
|
|
42
42
|
- **Pagination**: Support for paginated results with configurable page size
|
|
43
43
|
- **Sorting**: Multiple sort options (relevance, submission date, last updated)
|
|
44
|
+
- **OAI-PMH**: Support for the [arXiv Open Archives Initiative](https://info.arxiv.org/help/oa/index.html#open-archives-initiative-oai) interface (Identify, ListSets, GetRecord, ListRecords, ListIdentifiers, ListMetadataFormats)
|
|
45
|
+
|
|
46
|
+
## OAI-PMH interface
|
|
47
|
+
|
|
48
|
+
The package also supports the arXiv OAI-PMH endpoint (`https://oaipmh.arxiv.org/oai`), which is useful for metadata harvesting and bulk access. See the [arXiv OAI help](https://info.arxiv.org/help/oa/index.html#open-archives-initiative-oai) and the [OAI-PMH v2.0 protocol](https://www.openarchives.org/OAI/openarchivesprotocol.html) for details.
|
|
49
|
+
|
|
50
|
+
```typescript
|
|
51
|
+
import {
|
|
52
|
+
oaiIdentify,
|
|
53
|
+
oaiListRecords,
|
|
54
|
+
oaiListRecordsAsyncIterator,
|
|
55
|
+
oaiGetRecord,
|
|
56
|
+
oaiListSets,
|
|
57
|
+
oaiListIdentifiers,
|
|
58
|
+
oaiListMetadataFormats,
|
|
59
|
+
} from 'arxiv-api-wrapper';
|
|
60
|
+
|
|
61
|
+
// Repository info
|
|
62
|
+
const identify = await oaiIdentify();
|
|
63
|
+
console.log(identify.repositoryName, identify.protocolVersion);
|
|
64
|
+
|
|
65
|
+
// One page of records (e.g. Dublin Core)
|
|
66
|
+
const result = await oaiListRecords('oai_dc', {
|
|
67
|
+
from: '2024-01-01',
|
|
68
|
+
until: '2024-01-31',
|
|
69
|
+
set: 'math:math:LO', // optional: restrict to a set
|
|
70
|
+
rateLimit: { tokensPerInterval: 1, intervalMs: 1000 },
|
|
71
|
+
});
|
|
72
|
+
result.records.forEach((rec) => {
|
|
73
|
+
console.log(rec.header.identifier, rec.metadata);
|
|
74
|
+
});
|
|
75
|
+
if (result.resumptionToken) {
|
|
76
|
+
// Fetch the next page: oaiListRecords('oai_dc', { resumptionToken: result.resumptionToken.value })
|
|
77
|
+
}
|
|
78
|
+
|
|
79
|
+
// Single record by identifier (full or short form)
|
|
80
|
+
const record = await oaiGetRecord('cs/0112017', 'oai_dc');
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
For an intermediate option between manual page-by-page pagination and `*All` helpers, use async iterators:
|
|
84
|
+
|
|
85
|
+
```typescript
|
|
86
|
+
for await (const rec of oaiListRecordsAsyncIterator('oai_dc', {
|
|
87
|
+
from: '2024-01-01',
|
|
88
|
+
until: '2024-01-02',
|
|
89
|
+
maxRecords: 50,
|
|
90
|
+
})) {
|
|
91
|
+
console.log(rec.header.identifier);
|
|
92
|
+
}
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
The `oaiListRecordsAll` / `oaiListIdentifiersAll` / `oaiListSetsAll` helpers are convenience wrappers that collect from the corresponding async iterators.
|
|
96
|
+
|
|
97
|
+
All OAI functions accept optional `timeoutMs`, `retries`, `userAgent`, and `rateLimit` (same as the Atom API). OAI errors (e.g. `idDoesNotExist`, `noRecordsMatch`) are thrown as `OaiError` with a `code` and `messageText`.
|
|
44
98
|
|
|
45
99
|
## API Reference
|
|
46
100
|
|
|
@@ -234,7 +288,21 @@ import type {
|
|
|
234
288
|
ArxivSortOrder,
|
|
235
289
|
ArxivRateLimitConfig,
|
|
236
290
|
ArxivDateRange,
|
|
237
|
-
|
|
291
|
+
// OAI-PMH types
|
|
292
|
+
OaiIdentifyResponse,
|
|
293
|
+
OaiRecord,
|
|
294
|
+
OaiHeader,
|
|
295
|
+
OaiSet,
|
|
296
|
+
OaiMetadataFormat,
|
|
297
|
+
OaiResumptionToken,
|
|
298
|
+
OaiListRecordsResult,
|
|
299
|
+
OaiListIdentifiersResult,
|
|
300
|
+
OaiListSetsResult,
|
|
301
|
+
OaiRequestOptions,
|
|
302
|
+
OaiListOptions,
|
|
303
|
+
OaiErrorCode,
|
|
304
|
+
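OaiError // exported as a class: use a regular `import { OaiError }` (not `import type`) when you need `instanceof OaiError`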
|
|
305
|
+
} from 'arxiv-api-wrapper';
|
|
238
306
|
```
|
|
239
307
|
|
|
240
308
|
## License
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "arxiv-api-wrapper",
|
|
3
|
-
"version": "
|
|
3
|
+
"version": "2.0.0",
|
|
4
4
|
"description": "Provides functions wrapping the arXiv API",
|
|
5
5
|
"keywords": [
|
|
6
6
|
"arxiv"
|
|
@@ -30,7 +30,7 @@
|
|
|
30
30
|
},
|
|
31
31
|
"devDependencies": {
|
|
32
32
|
"@types/node": "^25.0.0",
|
|
33
|
-
"typedoc": "^0.
|
|
33
|
+
"typedoc": "^0.28.17",
|
|
34
34
|
"typescript": "^5.0.0",
|
|
35
35
|
"vitest": "^4.0.18"
|
|
36
36
|
}
|
package/src/index.ts
CHANGED
|
@@ -14,6 +14,7 @@
|
|
|
14
14
|
* - **Retry Logic**: Automatic retries with exponential backoff for transient failures
|
|
15
15
|
* - **Pagination**: Support for paginated results with configurable page size
|
|
16
16
|
* - **Sorting**: Multiple sort options (relevance, submission date, last updated)
|
|
17
|
+
* - **OAI-PMH**: Support for the arXiv Open Archives Initiative interface (Identify, ListSets, GetRecord, ListRecords, etc.)
|
|
17
18
|
*
|
|
18
19
|
* ## Quick Start
|
|
19
20
|
*
|
|
@@ -41,6 +42,26 @@
|
|
|
41
42
|
|
|
42
43
|
// Main entry point for the arXiv API wrapper package
|
|
43
44
|
export { getArxivEntries, getArxivEntriesById } from './arxivAPIRead.js';
|
|
45
|
+
export {
|
|
46
|
+
oaiIdentify,
|
|
47
|
+
oaiListMetadataFormats,
|
|
48
|
+
oaiListSets,
|
|
49
|
+
oaiListSetsAsyncIterator,
|
|
50
|
+
oaiListSetsAll,
|
|
51
|
+
oaiGetRecord,
|
|
52
|
+
oaiListIdentifiers,
|
|
53
|
+
oaiListIdentifiersAsyncIterator,
|
|
54
|
+
oaiListIdentifiersAll,
|
|
55
|
+
oaiListRecords,
|
|
56
|
+
oaiListRecordsAsyncIterator,
|
|
57
|
+
oaiListRecordsAll,
|
|
58
|
+
normalizeOaiIdentifier,
|
|
59
|
+
} from './oaiClient.js';
|
|
60
|
+
export {
|
|
61
|
+
oaiRecordToArxivEntry,
|
|
62
|
+
oaiRecordsToArxivEntries,
|
|
63
|
+
oaiListRecordsToArxivQueryResult,
|
|
64
|
+
} from './oaiToArxiv.js';
|
|
44
65
|
export type {
|
|
45
66
|
ArxivQueryOptions,
|
|
46
67
|
ArxivQueryResult,
|
|
@@ -54,4 +75,28 @@ export type {
|
|
|
54
75
|
ArxivRateLimitConfig,
|
|
55
76
|
ArxivDateRange,
|
|
56
77
|
} from './types.js';
|
|
78
|
+
export type {
|
|
79
|
+
OaiRequestOptions,
|
|
80
|
+
OaiIdentifyResponse,
|
|
81
|
+
OaiMetadataFormat,
|
|
82
|
+
OaiSet,
|
|
83
|
+
OaiResumptionToken,
|
|
84
|
+
OaiHeader,
|
|
85
|
+
OaiRecord,
|
|
86
|
+
OaiMetadataPrefix,
|
|
87
|
+
OaiMetadata,
|
|
88
|
+
OaiMetadataByPrefix,
|
|
89
|
+
OaiDcMetadata,
|
|
90
|
+
OaiArxivAuthor,
|
|
91
|
+
OaiArxivMetadata,
|
|
92
|
+
OaiArxivOldMetadata,
|
|
93
|
+
OaiArxivRawVersion,
|
|
94
|
+
OaiArxivRawMetadata,
|
|
95
|
+
OaiListIdentifiersResult,
|
|
96
|
+
OaiListRecordsResult,
|
|
97
|
+
OaiListSetsResult,
|
|
98
|
+
OaiListOptions,
|
|
99
|
+
OaiErrorCode,
|
|
100
|
+
} from './oaiTypes.js';
|
|
101
|
+
export { OaiError } from './oaiTypes.js';
|
|
57
102
|
|
package/src/oaiClient.ts
ADDED
|
@@ -0,0 +1,425 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Client for the arXiv OAI-PMH interface.
|
|
3
|
+
* @see https://info.arxiv.org/help/oa/index.html#open-archives-initiative-oai
|
|
4
|
+
* @see https://www.openarchives.org/OAI/openarchivesprotocol.html
|
|
5
|
+
*/
|
|
6
|
+
|
|
7
|
+
import { TokenBucketLimiter } from './rateLimiter.js';
|
|
8
|
+
import { fetchWithRetry } from './http.js';
|
|
9
|
+
import {
|
|
10
|
+
type OaiRequestOptions,
|
|
11
|
+
type OaiListOptions,
|
|
12
|
+
type OaiListIdentifiersResult,
|
|
13
|
+
type OaiListRecordsResult,
|
|
14
|
+
type OaiListSetsResult,
|
|
15
|
+
} from './oaiTypes.js';
|
|
16
|
+
import type { ArxivRateLimitConfig } from './types.js';
|
|
17
|
+
import {
|
|
18
|
+
parseIdentify,
|
|
19
|
+
parseListMetadataFormats,
|
|
20
|
+
parseListSets,
|
|
21
|
+
parseGetRecord,
|
|
22
|
+
parseListIdentifiers,
|
|
23
|
+
parseListRecords,
|
|
24
|
+
} from './oaiParser.js';
|
|
25
|
+
import type {
|
|
26
|
+
OaiIdentifyResponse,
|
|
27
|
+
OaiMetadataFormat,
|
|
28
|
+
OaiMetadataPrefix,
|
|
29
|
+
OaiRecord,
|
|
30
|
+
OaiHeader,
|
|
31
|
+
OaiSet,
|
|
32
|
+
} from './oaiTypes.js';
|
|
33
|
+
|
|
34
|
+
// Endpoint of the arXiv OAI-PMH service; buildOaiUrl appends the query string to it.
const OAI_BASE_URL = 'https://oaipmh.arxiv.org/oai';
|
|
35
|
+
|
|
36
|
+
// Fallback userAgent handed to fetchWithRetry when the caller's options do not provide one.
const DEFAULT_USER_AGENT = 'arxiv-api-wrapper/1.0 (+https://export.arxiv.org)';
|
|
37
|
+
|
|
38
|
+
/** Request verbs this client can put in the `verb` query parameter. */
type OaiVerb =
  | 'Identify' | 'ListMetadataFormats' | 'ListSets'
  | 'GetRecord' | 'ListIdentifiers' | 'ListRecords';
|
|
45
|
+
|
|
46
|
+
/**
 * Optional query arguments accompanying an OAI-PMH verb.
 * Which of them is meaningful depends on the verb being requested.
 */
interface OaiParams {
  /** Item identifier (used by GetRecord and ListMetadataFormats in this file). */
  identifier?: string;
  /** Metadata format requested for records or headers. */
  metadataPrefix?: OaiMetadataPrefix;
  /** Lower bound for selective harvesting. */
  from?: string;
  /** Upper bound for selective harvesting. */
  until?: string;
  /** Set specification restricting the harvest. */
  set?: string;
  /** Token from a previous incomplete list response, used to request the next page. */
  resumptionToken?: string;
}
|
|
54
|
+
|
|
55
|
+
/**
 * Assemble the request URL for an OAI-PMH verb (exported for unit tests).
 *
 * The query always starts with `verb`, followed by whichever of identifier,
 * metadataPrefix, from, until, set and resumptionToken are present, in that order.
 *
 * @param verb - OAI-PMH verb to request.
 * @param params - Optional arguments; null/undefined values are skipped, and empty
 *   strings are skipped for every argument except metadataPrefix.
 * @returns The base URL followed by `?` and the encoded query string.
 */
export function buildOaiUrl(verb: OaiVerb, params: OaiParams): string {
  const query = new URLSearchParams({ verb });

  // Adds a key when a value is supplied; empty strings only pass when explicitly allowed.
  const put = (key: string, value: string | undefined, allowEmpty = false): void => {
    if (value == null) return;
    if (value === '' && !allowEmpty) return;
    query.set(key, value);
  };

  put('identifier', params.identifier);
  // metadataPrefix is only checked against null/undefined, so an empty string is kept.
  put('metadataPrefix', params.metadataPrefix, true);
  put('from', params.from);
  put('until', params.until);
  put('set', params.set);
  put('resumptionToken', params.resumptionToken);

  return `${OAI_BASE_URL}?${query.toString()}`;
}
|
|
70
|
+
|
|
71
|
+
/**
 * Normalize an arXiv identifier to canonical OAI form (`oai:arXiv.org:<local-id>`).
 *
 * Accepts the full form (`oai:arXiv.org:cs/0112017`) or the short form
 * (`cs/0112017`, `2101.01234`). Surrounding whitespace is removed. A prefix written
 * with different letter case (e.g. `OAI:ARXIV.ORG:`) is recognised and rewritten to
 * the canonical spelling so the identifier sent to the server is always in one form.
 *
 * @param identifier - Full or short arXiv identifier.
 * @returns The identifier with the canonical `oai:arXiv.org:` prefix, or an empty
 *   string when the input is empty or whitespace only.
 */
export function normalizeOaiIdentifier(identifier: string): string {
  const canonicalPrefix = 'oai:arXiv.org:';
  const trimmed = identifier.trim();
  if (!trimmed) return trimmed;

  // The prefix is matched case-insensitively; the matched text has the same length
  // as the canonical prefix, so slicing by that length leaves the local identifier.
  const localId = /^oai:arXiv\.org:/i.test(trimmed)
    ? trimmed.slice(canonicalPrefix.length)
    : trimmed;

  return `${canonicalPrefix}${localId}`;
}
|
|
81
|
+
|
|
82
|
+
/**
 * Fill in defaults for the per-request options.
 *
 * @param opts - Caller-supplied options; may be omitted.
 * @returns timeoutMs (default 10000), retries (default 3), userAgent
 *   (default DEFAULT_USER_AGENT) and the caller's rateLimit, if any.
 */
function mergeOptions(opts?: OaiRequestOptions): {
  timeoutMs: number;
  retries: number;
  userAgent: string;
  rateLimit?: ArxivRateLimitConfig;
} {
  // `??` is used so that only null/undefined fall back; 0 and '' are kept as given.
  const timeoutMs = opts?.timeoutMs ?? 10000;
  const retries = opts?.retries ?? 3;
  const userAgent = opts?.userAgent ?? DEFAULT_USER_AGENT;
  const rateLimit = opts?.rateLimit;
  return { timeoutMs, retries, userAgent, rateLimit };
}
|
|
95
|
+
|
|
96
|
+
/**
 * Limiters shared between requests, keyed by their rate-limit configuration.
 *
 * A limiter created per request cannot throttle a sequence of requests (e.g. the pages
 * fetched by the async iterators), so one limiter per configuration is kept here.
 * NOTE(review): assumes a freshly constructed TokenBucketLimiter starts with tokens
 * available and is intended to be reused across calls — confirm against rateLimiter.ts.
 */
const oaiLimiters = new Map<string, TokenBucketLimiter>();

/** Return the shared limiter for a rate-limit configuration, creating it on first use. */
function getOaiLimiter(rateLimit: ArxivRateLimitConfig): TokenBucketLimiter {
  const key = `${rateLimit.tokensPerInterval}/${rateLimit.intervalMs}`;
  let limiter = oaiLimiters.get(key);
  if (limiter === undefined) {
    limiter = new TokenBucketLimiter(rateLimit.tokensPerInterval, rateLimit.intervalMs);
    oaiLimiters.set(key, limiter);
  }
  return limiter;
}

/**
 * Perform one OAI-PMH GET request and return the response body as text.
 *
 * @param verb - OAI-PMH verb to request.
 * @param params - Verb arguments, encoded by buildOaiUrl.
 * @param options - Request options; defaults are applied by mergeOptions.
 * @returns The non-empty response body.
 * @throws Error when the response status is not ok (the message includes the status and
 *   up to 300 characters of the body) or when the body is empty or whitespace only.
 */
async function oaiRequest(
  verb: OaiVerb,
  params: OaiParams,
  options: OaiRequestOptions | undefined
): Promise<string> {
  const { timeoutMs, retries, userAgent, rateLimit } = mergeOptions(options);
  const url = buildOaiUrl(verb, params);

  // Wait for a token from the limiter shared by every request with this configuration.
  if (rateLimit) await getOaiLimiter(rateLimit).acquire();

  const res = await fetchWithRetry(
    url,
    { method: 'GET', headers: { Accept: 'text/xml' } },
    { retries, timeoutMs, userAgent }
  );

  if (!res.ok) {
    // Reading the body is best effort; a failure here must not mask the status error.
    const text = await res.text().catch(() => '');
    throw new Error(
      `OAI request failed: ${res.status} ${res.statusText} for ${verb}. ${text.substring(0, 300)}`
    );
  }

  const text = await res.text();
  if (!text || text.trim().length === 0) {
    throw new Error(`OAI request returned empty response for ${verb}`);
  }
  return text;
}
|
|
124
|
+
|
|
125
|
+
/**
 * Query the arXiv OAI repository for its self-description (Identify verb).
 *
 * @param options - Optional request configuration (timeout, retries, userAgent, rateLimit).
 * @returns The Identify response as produced by parseIdentify.
 * @see https://info.arxiv.org/help/oa/index.html#open-archives-initiative-oai
 * @see https://www.openarchives.org/OAI/openarchivesprotocol.html
 */
export async function oaiIdentify(options?: OaiRequestOptions): Promise<OaiIdentifyResponse> {
  // Identify takes no arguments besides the verb itself.
  const responseXml = await oaiRequest('Identify', {}, options);
  const identity = parseIdentify(responseXml);
  return identity;
}
|
|
137
|
+
|
|
138
|
+
/**
 * List metadata formats supported by the repository, optionally for a specific item
 * (ListMetadataFormats verb).
 *
 * @param identifier - Optional item identifier, in full form (oai:arXiv.org:cs/0112017) or
 *   short form (cs/0112017, 2101.01234). It is normalized with normalizeOaiIdentifier, the
 *   same way oaiGetRecord treats its identifier. When omitted, empty or whitespace only,
 *   no identifier is sent.
 * @param options - Optional request configuration (timeout, retries, userAgent, rateLimit).
 * @returns The metadata formats as produced by parseListMetadataFormats.
 */
export async function oaiListMetadataFormats(
  identifier?: string,
  options?: OaiRequestOptions
): Promise<OaiMetadataFormat[]> {
  const params: OaiParams = {};
  if (identifier != null) {
    // Short-form ids would otherwise be sent without the oai:arXiv.org: prefix.
    const normalizedId = normalizeOaiIdentifier(identifier);
    if (normalizedId !== '') params.identifier = normalizedId;
  }
  const xml = await oaiRequest('ListMetadataFormats', params, options);
  return parseListMetadataFormats(xml);
}
|
|
154
|
+
|
|
155
|
+
/**
 * Fetch one page of the sets available for selective harvesting (ListSets verb).
 *
 * @param resumptionToken - Token from a previous ListSets response; omit (or pass an empty
 *   string) to request the first page.
 * @param options - Optional request configuration (timeout, retries, userAgent, rateLimit).
 * @returns The result of parseListSets for the response (sets plus optional resumptionToken).
 */
export async function oaiListSets(
  resumptionToken?: string,
  options?: OaiRequestOptions
): Promise<OaiListSetsResult> {
  const query: OaiParams = {};
  const resuming = resumptionToken != null && resumptionToken !== '';
  if (resuming) {
    query.resumptionToken = resumptionToken;
  }
  return parseListSets(await oaiRequest('ListSets', query, options));
}
|
|
171
|
+
|
|
172
|
+
/**
 * Fetch a single record in the requested metadata format (GetRecord verb).
 *
 * @param identifier - Item identifier; the full form (oai:arXiv.org:cs/0112017) and the short
 *   form (cs/0112017, 2101.01234) are both accepted and passed through normalizeOaiIdentifier.
 * @param metadataPrefix - Metadata format (e.g. oai_dc, arXiv, arXivRaw).
 * @param options - Optional request configuration (timeout, retries, userAgent, rateLimit).
 * @returns The record as produced by parseGetRecord.
 */
export async function oaiGetRecord(
  identifier: string,
  metadataPrefix: OaiMetadataPrefix,
  options?: OaiRequestOptions
): Promise<OaiRecord> {
  const query: OaiParams = {
    identifier: normalizeOaiIdentifier(identifier),
    metadataPrefix,
  };
  const responseXml = await oaiRequest('GetRecord', query, options);
  return parseGetRecord(responseXml);
}
|
|
193
|
+
|
|
194
|
+
/**
 * List identifiers (headers only) for selective harvesting (ListIdentifiers verb).
 *
 * OAI-PMH defines resumptionToken as an exclusive argument: when a non-empty token is
 * supplied, it is the only argument sent besides the verb, and metadataPrefix, from,
 * until and set are left out of the request.
 *
 * @param metadataPrefix - Required metadata format (e.g. oai_dc, arXiv, arXivRaw). Sent only
 *   on the initial request, i.e. when no resumptionToken is supplied.
 * @param listOptions - Optional from, until, set, resumptionToken and request options
 *   (timeout, retries, userAgent, rateLimit).
 * @returns Headers and optional resumptionToken, as produced by parseListIdentifiers.
 */
export async function oaiListIdentifiers(
  metadataPrefix: OaiMetadataPrefix,
  listOptions?: OaiListOptions
): Promise<OaiListIdentifiersResult> {
  const params: OaiParams = {};
  const token = listOptions?.resumptionToken;
  if (token != null && token !== '') {
    // Resuming: the token alone identifies the original request.
    params.resumptionToken = token;
  } else {
    params.metadataPrefix = metadataPrefix;
    if (listOptions?.from != null && listOptions.from !== '') params.from = listOptions.from;
    if (listOptions?.until != null && listOptions.until !== '') params.until = listOptions.until;
    if (listOptions?.set != null && listOptions.set !== '') params.set = listOptions.set;
  }
  // listOptions also carries the request options read by oaiRequest.
  const xml = await oaiRequest('ListIdentifiers', params, listOptions);
  return parseListIdentifiers(xml);
}
|
|
218
|
+
|
|
219
|
+
/**
 * List records (full metadata) for selective harvesting (ListRecords verb).
 *
 * OAI-PMH defines resumptionToken as an exclusive argument: when a non-empty token is
 * supplied, it is the only argument sent besides the verb, and metadataPrefix, from,
 * until and set are left out of the request.
 *
 * @param metadataPrefix - Required metadata format (e.g. oai_dc, arXiv, arXivRaw). Sent only
 *   on the initial request, i.e. when no resumptionToken is supplied.
 * @param listOptions - Optional from, until, set, resumptionToken and request options
 *   (timeout, retries, userAgent, rateLimit).
 * @returns Records and optional resumptionToken, as produced by parseListRecords.
 */
export async function oaiListRecords(
  metadataPrefix: OaiMetadataPrefix,
  listOptions?: OaiListOptions
): Promise<OaiListRecordsResult> {
  const params: OaiParams = {};
  const token = listOptions?.resumptionToken;
  if (token != null && token !== '') {
    // Resuming: the token alone identifies the original request.
    params.resumptionToken = token;
  } else {
    params.metadataPrefix = metadataPrefix;
    if (listOptions?.from != null && listOptions.from !== '') params.from = listOptions.from;
    if (listOptions?.until != null && listOptions.until !== '') params.until = listOptions.until;
    if (listOptions?.set != null && listOptions.set !== '') params.set = listOptions.set;
  }
  // listOptions also carries the request options read by oaiRequest.
  const xml = await oaiRequest('ListRecords', params, listOptions);
  return parseListRecords(xml);
}
|
|
243
|
+
|
|
244
|
+
/** List options without resumptionToken, plus an optional cap on the number of records yielded. */
type OaiListRecordsAllOptions = Omit<OaiListOptions, 'resumptionToken'> & { maxRecords?: number };
|
|
247
|
+
|
|
248
|
+
/** List options without resumptionToken, plus an optional cap on the number of headers yielded. */
type OaiListIdentifiersAllOptions = Omit<OaiListOptions, 'resumptionToken'> & { maxHeaders?: number };
|
|
251
|
+
|
|
252
|
+
/** Request options plus an optional cap on the number of sets yielded. */
type OaiListSetsAllOptions = OaiRequestOptions & { maxSets?: number };
|
|
255
|
+
|
|
256
|
+
/**
 * Iterate records across all pages for a given metadataPrefix and optional selective
 * harvesting options.
 *
 * Pages are fetched with oaiListRecords, following each page's resumption token, and records
 * are yielded one-by-one. Iteration ends when a page has no records, when a page carries no
 * resumption token, or as soon as maxRecords records have been yielded. No further page is
 * requested once the cap is reached, and no request is made at all when maxRecords is zero
 * or negative.
 *
 * @param metadataPrefix - Required metadata format (e.g. oai_dc, arXiv, arXivRaw).
 * @param listOptions - Optional from, until, set, request options (timeout, retries,
 *   userAgent, rateLimit) and maxRecords. resumptionToken is not part of the accepted
 *   option type; pagination is handled internally.
 * @returns Async iterator yielding records one-by-one.
 */
export async function* oaiListRecordsAsyncIterator(
  metadataPrefix: OaiMetadataPrefix,
  listOptions?: OaiListRecordsAllOptions
): AsyncGenerator<OaiRecord, void, void> {
  const { maxRecords, ...restOptions } = listOptions ?? {};
  // A non-positive cap can never yield anything, so skip the network round trip.
  if (maxRecords != null && maxRecords <= 0) return;

  let emitted = 0;
  let resumptionToken: string | undefined;

  do {
    const pageOptions: OaiListOptions = resumptionToken
      ? { ...restOptions, resumptionToken }
      : restOptions;

    const page = await oaiListRecords(metadataPrefix, pageOptions);
    const records = page.records ?? [];
    if (records.length === 0) break;

    for (const record of records) {
      yield record;
      emitted += 1;
      // Checked right after yielding so a cap hit on a page's last record
      // does not trigger a request for the following page.
      if (maxRecords != null && emitted >= maxRecords) return;
    }

    resumptionToken = page.resumptionToken?.value;
  } while (resumptionToken);
}
|
|
293
|
+
|
|
294
|
+
/**
 * Iterate identifiers (headers only) across all pages for a given metadataPrefix and
 * optional selective harvesting options.
 *
 * Pages are fetched with oaiListIdentifiers, following each page's resumption token, and
 * headers are yielded one-by-one. Iteration ends when a page has no headers, when a page
 * carries no resumption token, or as soon as maxHeaders headers have been yielded. No further
 * page is requested once the cap is reached, and no request is made at all when maxHeaders is
 * zero or negative.
 *
 * @param metadataPrefix - Required metadata format (e.g. oai_dc, arXiv, arXivRaw).
 * @param listOptions - Optional from, until, set, request options (timeout, retries,
 *   userAgent, rateLimit) and maxHeaders. resumptionToken is not part of the accepted
 *   option type; pagination is handled internally.
 * @returns Async iterator yielding headers one-by-one.
 */
export async function* oaiListIdentifiersAsyncIterator(
  metadataPrefix: OaiMetadataPrefix,
  listOptions?: OaiListIdentifiersAllOptions
): AsyncGenerator<OaiHeader, void, void> {
  const { maxHeaders, ...restOptions } = listOptions ?? {};
  // A non-positive cap can never yield anything, so skip the network round trip.
  if (maxHeaders != null && maxHeaders <= 0) return;

  let emitted = 0;
  let resumptionToken: string | undefined;

  do {
    const pageOptions: OaiListOptions = resumptionToken
      ? { ...restOptions, resumptionToken }
      : restOptions;

    const page = await oaiListIdentifiers(metadataPrefix, pageOptions);
    const headers = page.headers ?? [];
    if (headers.length === 0) break;

    for (const header of headers) {
      yield header;
      emitted += 1;
      // Checked right after yielding so a cap hit on a page's last header
      // does not trigger a request for the following page.
      if (maxHeaders != null && emitted >= maxHeaders) return;
    }

    resumptionToken = page.resumptionToken?.value;
  } while (resumptionToken);
}
|
|
331
|
+
|
|
332
|
+
/**
 * Iterate sets available for selective harvesting across all pages.
 *
 * Pages are fetched with oaiListSets, following each page's resumption token, and sets are
 * yielded one-by-one. Iteration ends when a page has no sets, when a page carries no
 * resumption token, or as soon as maxSets sets have been yielded. No further page is
 * requested once the cap is reached, and no request is made at all when maxSets is zero
 * or negative.
 *
 * @param options - Optional request configuration (timeout, retries, userAgent, rateLimit)
 *   and maxSets.
 * @returns Async iterator yielding sets one-by-one.
 */
export async function* oaiListSetsAsyncIterator(
  options?: OaiListSetsAllOptions
): AsyncGenerator<OaiSet, void, void> {
  const { maxSets, ...requestOptions } = options ?? {};
  // A non-positive cap can never yield anything, so skip the network round trip.
  if (maxSets != null && maxSets <= 0) return;

  let emitted = 0;
  let resumptionToken: string | undefined;

  do {
    const page = await oaiListSets(resumptionToken, requestOptions);
    const sets = page.sets ?? [];
    if (sets.length === 0) break;

    for (const set of sets) {
      yield set;
      emitted += 1;
      // Checked right after yielding so a cap hit on a page's last set
      // does not trigger a request for the following page.
      if (maxSets != null && emitted >= maxSets) return;
    }

    resumptionToken = page.resumptionToken?.value;
  } while (resumptionToken);
}
|
|
362
|
+
|
|
363
|
+
/**
 * Collect every record produced by oaiListRecordsAsyncIterator into one array.
 *
 * Collection stops when the iterator finishes, which includes reaching the optional
 * maxRecords cap.
 *
 * @param metadataPrefix - Required metadata format (e.g. oai_dc, arXiv, arXivRaw).
 * @param listOptions - Optional from, until, set, request options (timeout, retries,
 *   userAgent, rateLimit) and maxRecords; forwarded unchanged to the iterator.
 * @returns An object whose `records` array holds the collected records in yield order.
 */
export async function oaiListRecordsAll(
  metadataPrefix: OaiMetadataPrefix,
  listOptions?: OaiListRecordsAllOptions
): Promise<{ records: OaiRecord[] }> {
  const records: OaiRecord[] = [];
  const source = oaiListRecordsAsyncIterator(metadataPrefix, listOptions);
  for await (const item of source) {
    records.push(item);
  }
  return { records };
}
|
|
384
|
+
|
|
385
|
+
/**
 * Collect every header produced by oaiListIdentifiersAsyncIterator into one array.
 *
 * Collection stops when the iterator finishes, which includes reaching the optional
 * maxHeaders cap.
 *
 * @param metadataPrefix - Required metadata format (e.g. oai_dc, arXiv, arXivRaw).
 * @param listOptions - Optional from, until, set, request options (timeout, retries,
 *   userAgent, rateLimit) and maxHeaders; forwarded unchanged to the iterator.
 * @returns An object whose `headers` array holds the collected headers in yield order.
 */
export async function oaiListIdentifiersAll(
  metadataPrefix: OaiMetadataPrefix,
  listOptions?: OaiListIdentifiersAllOptions
): Promise<{ headers: OaiHeader[] }> {
  const headers: OaiHeader[] = [];
  const source = oaiListIdentifiersAsyncIterator(metadataPrefix, listOptions);
  for await (const item of source) {
    headers.push(item);
  }
  return { headers };
}
|
|
406
|
+
|
|
407
|
+
/**
 * Collect every set produced by oaiListSetsAsyncIterator into one array.
 *
 * Collection stops when the iterator finishes, which includes reaching the optional
 * maxSets cap.
 *
 * @param options - Optional request configuration (timeout, retries, userAgent, rateLimit)
 *   and maxSets; forwarded unchanged to the iterator.
 * @returns An object whose `sets` array holds the collected sets in yield order.
 */
export async function oaiListSetsAll(
  options?: OaiListSetsAllOptions
): Promise<{ sets: OaiSet[] }> {
  const sets: OaiSet[] = [];
  const source = oaiListSetsAsyncIterator(options);
  for await (const item of source) {
    sets.push(item);
  }
  return { sets };
}
|
|
425
|
+
|