arxiv-api-wrapper 1.1.1 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,264 @@
1
+ /**
2
+ * Parser for OAI-PMH XML responses from the arXiv OAI endpoint.
3
+ */
4
+
5
+ import {
6
+ type OaiErrorCode,
7
+ type OaiIdentifyResponse,
8
+ type OaiMetadataFormat,
9
+ type OaiMetadataPrefix,
10
+ type OaiSet,
11
+ type OaiResumptionToken,
12
+ type OaiHeader,
13
+ type OaiRecord,
14
+ type OaiMetadata,
15
+ OaiError,
16
+ } from './oaiTypes.js';
17
+ import { XMLParser } from 'fast-xml-parser';
18
+
19
/**
 * Shared fast-xml-parser instance used for every OAI-PMH response.
 * Attributes are kept and exposed without a name prefix, namespace prefixes
 * are removed from names, values are trimmed, and tag values are not parsed
 * (`parseTagValue: false`).
 */
const parser = new XMLParser({
  parseTagValue: false,
  trimValues: true,
  removeNSPrefix: true,
  attributeNamePrefix: '',
  ignoreAttributes: false,
});

/** Error codes recognised by this module. */
const VALID_ERROR_CODES: OaiErrorCode[] = [
  'badArgument',
  'badResumptionToken',
  'badVerb',
  'cannotDisseminateFormat',
  'idDoesNotExist',
  'noMetadataFormats',
  'noRecordsMatch',
  'noSetHierarchy',
];

/** Metadata prefixes recognised by this module. */
const VALID_METADATA_PREFIXES: OaiMetadataPrefix[] = [
  'oai_dc',
  'arXiv',
  'arXivOld',
  'arXivRaw',
];
39
+
40
/**
 * Normalise a value that may be missing, a single item, or already a list.
 *
 * @param x - Value to normalise.
 * @returns `[]` for null/undefined, the same array instance for arrays,
 *          otherwise a one-element array wrapping `x`.
 */
function asArray<T>(x: T | T[] | undefined): T[] {
  if (Array.isArray(x)) {
    return x;
  }
  return x == null ? [] : [x];
}
44
+
45
/**
 * Coerce a parsed XML value to a trimmed string.
 *
 * A non-array object is treated as an element node: its `#text` entry is
 * used, or `''` when it has none, instead of the useless "[object Object]"
 * that a plain `String()` call would give.
 *
 * @param v - Parsed value (string, element object, array, null/undefined, ...).
 * @returns The trimmed text, or `''` for null/undefined and text-less objects.
 */
function str(v: unknown): string {
  if (v == null) return '';
  if (typeof v === 'object' && !Array.isArray(v)) {
    const text = (v as Record<string, unknown>)['#text'];
    return text == null ? '' : String(text).trim();
  }
  return String(v).trim();
}
49
+
50
/**
 * Collect the `error` entries of a parsed OAI-PMH root as OaiError objects.
 *
 * Each entry may be an object (a `code` property and an optional `#text`
 * message) or a bare value holding only the message. A missing or empty
 * code is reported as `badArgument`. A code outside VALID_ERROR_CODES is also
 * reported as `badArgument`, with the unknown code used as the message when
 * there is no message text.
 *
 * @param root - Parsed root object of the response.
 * @returns One OaiError per entry; empty when `root.error` is absent.
 */
function parseErrors(root: Record<string, unknown>): OaiError[] {
  const errors: OaiError[] = [];
  for (const e of asArray(root.error)) {
    let code: string | undefined;
    let messageText: string;
    if (e != null && typeof e === 'object') {
      const node = e as Record<string, unknown>;
      if (typeof node.code === 'string' && node.code !== '') code = node.code;
      // A node without text has no message; never stringify the node itself.
      const text = node['#text'];
      messageText = text == null ? '' : String(text);
    } else {
      messageText = e == null ? '' : String(e);
    }
    const codeStr = (code ?? 'badArgument') as OaiErrorCode;
    if (VALID_ERROR_CODES.includes(codeStr)) {
      errors.push(new OaiError(codeStr, messageText));
    } else {
      errors.push(new OaiError('badArgument', messageText || codeStr));
    }
  }
  return errors;
}
68
+
69
/**
 * Convert a parsed `resumptionToken` value into an OaiResumptionToken.
 *
 * Accepts both shapes the value can take: an object (token text under
 * `#text` or `_`, plus `expirationDate` / `completeListSize` / `cursor`
 * properties) and a bare primitive holding only the token text.
 *
 * @param el - Parsed resumptionToken value.
 * @returns The token, or `undefined` when the value is absent or its text is
 *          empty. `completeListSize` and `cursor` are set only when they
 *          convert to finite numbers.
 */
function parseResumptionToken(el: unknown): OaiResumptionToken | undefined {
  if (el == null) return undefined;
  const attrs: Record<string, unknown> =
    typeof el === 'object' ? (el as Record<string, unknown>) : { '#text': el };

  const value = String(attrs['#text'] ?? attrs['_'] ?? '').trim();
  if (!value) return undefined;

  // Blank or non-numeric values are dropped rather than stored as 0 / NaN.
  const toCount = (raw: unknown): number | undefined => {
    if (raw == null || String(raw).trim() === '') return undefined;
    const n = Number(raw);
    return Number.isFinite(n) ? n : undefined;
  };

  const token: OaiResumptionToken = { value };
  if (attrs.expirationDate != null) token.expirationDate = String(attrs.expirationDate).trim();
  const completeListSize = toCount(attrs.completeListSize);
  if (completeListSize !== undefined) token.completeListSize = completeListSize;
  const cursor = toCount(attrs.cursor);
  if (cursor !== undefined) token.cursor = cursor;
  return token;
}
80
+
81
/**
 * Convert a parsed `header` value into an OaiHeader.
 *
 * A non-object input is treated as an empty header, so `identifier` and
 * `datestamp` become `''` and `setSpec` becomes `[]`. The `status` key is
 * included only when its value is exactly `deleted`.
 */
function parseHeader(el: unknown): OaiHeader {
  const node = (el != null && typeof el === 'object' ? el : {}) as Record<string, unknown>;
  const isDeleted = node.status != null && str(node.status) === 'deleted';
  const base = {
    identifier: str(node.identifier),
    datestamp: str(node.datestamp),
    setSpec: asArray(node.setSpec).map((spec) => str(spec)),
  };
  return isDeleted ? { ...base, status: 'deleted' as const } : base;
}
92
+
93
/**
 * Copy the child entries of a parsed `metadata` value into a plain object.
 *
 * Keys starting with `@` and the text keys `#text` / `_` are skipped; every
 * other key is copied with its value unchanged (a shallow copy, nothing is
 * flattened).
 *
 * @param el - Parsed metadata value.
 * @returns The copied object, or `undefined` when `el` is not an object or
 *          has no keys left after filtering.
 */
function parseMetadata(el: unknown): OaiMetadata | undefined {
  if (el == null || typeof el !== 'object') return undefined;
  const source = el as Record<string, unknown>;
  const out: Record<string, unknown> = {};
  let copied = 0;
  for (const key of Object.keys(source)) {
    if (key.startsWith('@') || key === '#text' || key === '_') continue;
    out[key] = source[key];
    copied += 1;
  }
  return copied === 0 ? undefined : (out as unknown as OaiMetadata);
}
111
+
112
/**
 * Convert a parsed `record` value into an OaiRecord.
 *
 * The header is always present. `metadata` and `about` are included only
 * when the corresponding source entry exists (and, for metadata, parses to
 * something). `about` is always returned as an array.
 */
function parseRecord(el: unknown): OaiRecord {
  const node = (el != null && typeof el === 'object' ? el : {}) as Record<string, unknown>;
  const metadata = node.metadata == null ? undefined : parseMetadata(node.metadata);
  const about = node.about == null ? undefined : asArray(node.about);
  const optional = {
    ...(metadata == null ? {} : { metadata }),
    ...(about == null ? {} : { about }),
  };
  return { header: parseHeader(node.header), ...optional };
}
119
+
120
/**
 * Parse the XML text and return the object to read the response from.
 *
 * Looks for an `OAI-PMH` (or `OAIPMH`) entry in the parsed document and
 * falls back to the parsed document itself when neither is present.
 *
 * @throws OaiError (`badArgument`) when the selected value is not an object.
 */
function getRoot(xml: string): Record<string, unknown> {
  const parsed = parser.parse(xml) as Record<string, unknown>;
  const candidate: unknown = parsed['OAI-PMH'] ?? parsed['OAIPMH'] ?? parsed;
  const usable = candidate != null && typeof candidate === 'object';
  if (!usable) {
    throw new OaiError('badArgument', 'Invalid OAI-PMH response: no root element');
  }
  return candidate as Record<string, unknown>;
}
128
+
129
/**
 * Parse an OAI-PMH response and return its root object.
 *
 * @param xml - Raw response body.
 * @returns The parsed root, which holds the verb-specific payload
 *          (e.g. Identify, ListRecords).
 * @throws OaiError - the first error found when the response contains error
 *         elements, or `badArgument` when no root object can be obtained.
 */
export function parseOaiResponse(xml: string): Record<string, unknown> {
  const root = getRoot(xml);
  const [firstError] = parseErrors(root);
  if (firstError !== undefined) {
    throw firstError;
  }
  return root;
}
142
+
143
/**
 * Parse an Identify response body.
 *
 * Missing `deletedRecord` / `granularity` values default to `no` and
 * `YYYY-MM-DD`. Empty compression entries are dropped. `description` is an
 * array when present and `undefined` otherwise.
 *
 * @throws OaiError when the response carries errors or has no Identify element.
 */
export function parseIdentify(xml: string): OaiIdentifyResponse {
  const root = parseOaiResponse(xml);
  const node = (root.Identify ?? root.identify) as Record<string, unknown> | undefined;
  if (!node || typeof node !== 'object') {
    throw new OaiError('badArgument', 'Invalid Identify response: missing Identify element');
  }

  const adminEmail = asArray(node.adminEmail).map((email) => str(email));
  const compression = asArray(node.compression)
    .map((entry) => str(entry))
    .filter((entry) => entry !== '');
  const description = node.description != null ? asArray(node.description) : undefined;

  return {
    repositoryName: str(node.repositoryName),
    baseURL: str(node.baseURL),
    protocolVersion: str(node.protocolVersion),
    adminEmail,
    earliestDatestamp: str(node.earliestDatestamp),
    deletedRecord: (str(node.deletedRecord) || 'no') as 'no' | 'persistent' | 'transient',
    granularity: (str(node.granularity) || 'YYYY-MM-DD') as 'YYYY-MM-DD' | 'YYYY-MM-DDThh:mm:ssZ',
    compression,
    description,
  };
}
164
+
165
/**
 * Parse a ListMetadataFormats response body.
 *
 * @returns One entry per `metadataFormat` element, in document order.
 * @throws OaiError when the response carries errors, has no
 *         ListMetadataFormats element, or lists a prefix that is not in
 *         VALID_METADATA_PREFIXES (`cannotDisseminateFormat`).
 */
export function parseListMetadataFormats(xml: string): OaiMetadataFormat[] {
  const root = parseOaiResponse(xml);
  const list = root.ListMetadataFormats ?? root.listMetadataFormats;
  if (!list || typeof list !== 'object') {
    throw new OaiError('badArgument', 'Invalid ListMetadataFormats response');
  }

  const formats: OaiMetadataFormat[] = [];
  for (const entry of asArray((list as Record<string, unknown>).metadataFormat)) {
    const node = (entry && typeof entry === 'object' ? entry : {}) as Record<string, unknown>;
    const metadataPrefix = str(node.metadataPrefix);
    const known = VALID_METADATA_PREFIXES.includes(metadataPrefix as OaiMetadataPrefix);
    if (!known) {
      throw new OaiError(
        'cannotDisseminateFormat',
        `Unsupported metadataPrefix in ListMetadataFormats response: ${metadataPrefix}`
      );
    }
    formats.push({
      metadataPrefix: metadataPrefix as OaiMetadataPrefix,
      schema: str(node.schema),
      metadataNamespace: str(node.metadataNamespace),
    });
  }
  return formats;
}
192
+
193
/**
 * Parse a ListSets response body.
 *
 * @returns The sets in document order, plus `resumptionToken` when the
 *          response carries a non-empty one. `setDescription` is passed
 *          through unparsed when present.
 * @throws OaiError when the response carries errors or has no ListSets element.
 */
export function parseListSets(xml: string): { sets: OaiSet[]; resumptionToken?: OaiResumptionToken } {
  const root = parseOaiResponse(xml);
  const list = root.ListSets ?? root.listSets;
  if (!list || typeof list !== 'object') {
    throw new OaiError('badArgument', 'Invalid ListSets response');
  }
  const body = list as Record<string, unknown>;

  const toSet = (raw: unknown): OaiSet => {
    const node = (raw && typeof raw === 'object' ? raw : {}) as Record<string, unknown>;
    return {
      setSpec: str(node.setSpec),
      setName: str(node.setName),
      ...(node.setDescription != null ? { setDescription: node.setDescription } : {}),
    };
  };

  const sets: OaiSet[] = asArray(body.set).map(toSet);
  const resumptionToken = parseResumptionToken(body.resumptionToken);
  return resumptionToken ? { sets, resumptionToken } : { sets };
}
215
+
216
/**
 * Parse a GetRecord response body.
 *
 * @returns The single record contained in the response.
 * @throws OaiError when the response carries errors, has no GetRecord
 *         element, or the GetRecord element has no record.
 */
export function parseGetRecord(xml: string): OaiRecord {
  const root = parseOaiResponse(xml);
  const container = root.GetRecord ?? root.getRecord;
  if (!container || typeof container !== 'object') {
    throw new OaiError('badArgument', 'Invalid GetRecord response');
  }
  const { record } = container as Record<string, unknown>;
  if (!record) {
    throw new OaiError('badArgument', 'Invalid GetRecord response: missing record');
  }
  return parseRecord(record);
}
229
+
230
/**
 * Parse a ListIdentifiers response body.
 *
 * @returns The headers in document order, plus `resumptionToken` when the
 *          response carries a non-empty one.
 * @throws OaiError when the response carries errors or has no
 *         ListIdentifiers element.
 */
export function parseListIdentifiers(
  xml: string
): { headers: OaiHeader[]; resumptionToken?: OaiResumptionToken } {
  const root = parseOaiResponse(xml);
  const list = root.ListIdentifiers ?? root.listIdentifiers;
  if (!list || typeof list !== 'object') {
    throw new OaiError('badArgument', 'Invalid ListIdentifiers response');
  }
  const body = list as Record<string, unknown>;
  const headers = asArray(body.header).map((raw: unknown) => parseHeader(raw));
  const resumptionToken = parseResumptionToken(body.resumptionToken);
  return resumptionToken ? { headers, resumptionToken } : { headers };
}
247
+
248
/**
 * Parse a ListRecords response body.
 *
 * @returns The records in document order, plus `resumptionToken` when the
 *          response carries a non-empty one.
 * @throws OaiError when the response carries errors or has no ListRecords
 *         element.
 */
export function parseListRecords(
  xml: string
): { records: OaiRecord[]; resumptionToken?: OaiResumptionToken } {
  const root = parseOaiResponse(xml);
  const list = root.ListRecords ?? root.listRecords;
  if (!list || typeof list !== 'object') {
    throw new OaiError('badArgument', 'Invalid ListRecords response');
  }
  const body = list as Record<string, unknown>;
  const records = asArray(body.record).map((raw: unknown) => parseRecord(raw));
  const resumptionToken = parseResumptionToken(body.resumptionToken);
  return resumptionToken ? { records, resumptionToken } : { records };
}
@@ -0,0 +1,204 @@
1
+ import type { ArxivAuthor, ArxivEntry, ArxivLink, ArxivQueryResult } from './types.js';
2
+ import type { OaiListRecordsResult, OaiMetadata, OaiRecord } from './oaiTypes.js';
3
+
4
/** Prefix for abstract-page URLs; the arXiv id is appended directly. */
const ARXIV_ABS_BASE = 'https://arxiv.org/abs/';
/** Prefix for PDF URLs; the arXiv id is appended directly. */
const ARXIV_PDF_BASE = 'https://arxiv.org/pdf/';
/** OAI endpoint URL, used as the id and link of the synthesized feed. */
const OAI_BASE_URL = 'https://oaipmh.arxiv.org/oai';
7
+
8
/** Collapse every run of whitespace to one space and strip both ends. */
function normalizeWhitespace(value: string): string {
  const collapsed = value.replace(/\s+/g, ' ');
  return collapsed.trim();
}
11
+
12
/**
 * Normalise a value that may be missing, a single item, or already a list.
 *
 * @returns `[]` for null/undefined, the same array instance for arrays,
 *          otherwise a one-element array wrapping `value`.
 */
function asArray<T>(value: T | T[] | undefined): T[] {
  if (Array.isArray(value)) {
    return value;
  }
  return value == null ? [] : [value];
}
16
+
17
/**
 * Return the first value that is non-empty after whitespace normalisation
 * (in its normalised form), or `''` when there is none.
 */
function firstNonEmpty(values: string[]): string {
  const cleaned = values.map((candidate) => normalizeWhitespace(candidate));
  const hit = cleaned.find((candidate) => candidate.length > 0);
  return hit === undefined ? '' : hit;
}
20
+
21
/**
 * Turn an optional string or string list into a list of whitespace-normalised,
 * non-empty strings (order preserved).
 */
function toStringArray(value: string | string[] | undefined): string[] {
  const result: string[] = [];
  for (const item of asArray(value)) {
    const text = normalizeWhitespace(String(item));
    if (text) {
      result.push(text);
    }
  }
  return result;
}
24
+
25
/**
 * Strip a leading `oai:arXiv.org:` (case-insensitive) from an OAI identifier
 * and trim the result. Identifiers without that prefix are only trimmed.
 */
function oaiIdentifierToArxivId(identifier: string): string {
  const withoutPrefix = identifier.replace(/^oai:arXiv\.org:/i, '');
  return withoutPrefix.trim();
}
28
+
29
/**
 * Build the two default links for an arXiv id: the abstract page
 * (`alternate`, text/html) followed by the PDF (`related`, application/pdf).
 *
 * @returns An empty list when `arxivId` is empty.
 */
function buildDefaultLinks(arxivId: string): ArxivLink[] {
  if (!arxivId) {
    return [];
  }
  const abstractLink: ArxivLink = {
    href: `${ARXIV_ABS_BASE}${arxivId}`,
    rel: 'alternate',
    type: 'text/html',
  };
  const pdfLink: ArxivLink = {
    href: `${ARXIV_PDF_BASE}${arxivId}`,
    rel: 'related',
    type: 'application/pdf',
    title: 'pdf',
  };
  return [abstractLink, pdfLink];
}
36
+
37
/**
 * Split a whitespace-separated category string into its tokens.
 *
 * @returns The tokens in order; `[]` for a missing, empty, or blank value.
 */
function splitCategories(value: string | undefined): string[] {
  if (!value) {
    return [];
  }
  const tokens = value.trim().split(/\s+/);
  return tokens.filter((token) => token.length > 0);
}
41
+
42
/**
 * Split a comma-separated author string into author objects.
 *
 * Only commas separate names; each name is whitespace-normalised and empty
 * pieces are dropped.
 *
 * @returns One `{ name }` per piece; `[]` for a missing or empty value.
 */
function parseAuthorsList(value: string | undefined): ArxivAuthor[] {
  if (!value) {
    return [];
  }
  const authors: ArxivAuthor[] = [];
  for (const piece of value.split(/\s*,\s*/)) {
    const name = normalizeWhitespace(piece);
    if (name) {
      authors.push({ name });
    }
  }
  return authors;
}
50
+
51
/**
 * Append a version suffix to an id unless the id already ends in one.
 *
 * @param baseId - arXiv id, with or without a trailing `v<digits>`.
 * @param version - Suffix to append verbatim; ignored when empty or missing.
 */
function withVersionSuffix(baseId: string, version: string | undefined): string {
  const alreadyVersioned = /v\d+$/i.test(baseId);
  return !version || alreadyVersioned ? baseId : `${baseId}${version}`;
}
56
+
57
/**
 * Pick the entry with the highest version number from one or more version
 * entries.
 *
 * Version labels are compared numerically after stripping a leading `v`.
 * When either label of a pair is not numeric the pair is treated as equal.
 *
 * @returns The last entry after sorting, or `{ date: '' }` when there are none.
 */
function extractLatestRawVersion(
  versionValue: { version?: string; date: string; size?: string; source_type?: string } | { version?: string; date: string; size?: string; source_type?: string }[]
): { version?: string; date: string; size?: string; source_type?: string } {
  const versions = asArray(versionValue);
  if (versions.length === 0) {
    return { date: '' };
  }
  const versionNumber = (entry: { version?: string }): number =>
    Number((entry.version ?? '').replace(/^v/i, ''));
  // Sort a copy so the caller's array is left untouched.
  const ordered = versions.slice().sort((left, right) => {
    const l = versionNumber(left);
    const r = versionNumber(right);
    return Number.isNaN(l) || Number.isNaN(r) ? 0 : l - r;
  });
  return ordered[ordered.length - 1];
}
70
+
71
/**
 * Build an ArxivEntry from one record's metadata.
 *
 * The formats are tried in the order `arXiv`, `arXivRaw`, `arXivOld`;
 * anything else is read as Dublin Core via `metadata.dc`. The record header
 * supplies the arXiv id and timestamps whenever the metadata does not.
 *
 * @param record - Record whose header provides the fallback id and datestamp.
 * @param metadata - Metadata of that record.
 */
function metadataToEntry(record: OaiRecord, metadata: OaiMetadata): ArxivEntry {
  const headerId = oaiIdentifierToArxivId(record.header.identifier);
  const headerDate = record.header.datestamp;

  if ('arXiv' in metadata) {
    const src = metadata.arXiv;
    const authors: ArxivAuthor[] = asArray(src.authors?.author).map((person) => {
      const fullName = [person.forenames, person.keyname, person.suffix].filter(Boolean).join(' ').trim();
      // Only the first affiliation is kept.
      const affiliation = toStringArray(person.affiliation)[0];
      return { name: fullName || person.keyname, ...(affiliation ? { affiliation } : {}) };
    });
    const arxivId = src.id || headerId;
    const categories = splitCategories(src.categories);
    const primaryCategory = categories[0];
    return {
      id: `${ARXIV_ABS_BASE}${arxivId}`,
      arxivId,
      title: normalizeWhitespace(src.title ?? ''),
      summary: normalizeWhitespace(src.abstract ?? ''),
      published: src.created ?? headerDate,
      updated: src.updated ?? headerDate,
      authors,
      categories,
      ...(primaryCategory ? { primaryCategory } : {}),
      links: buildDefaultLinks(arxivId),
      ...(src.doi ? { doi: src.doi } : {}),
      ...(src['journal-ref'] ? { journalRef: src['journal-ref'] } : {}),
      ...(src.comments ? { comment: normalizeWhitespace(src.comments) } : {}),
    };
  }

  if ('arXivRaw' in metadata) {
    const raw = metadata.arXivRaw;
    const latest = extractLatestRawVersion(raw.version);
    // The id carries the latest version label unless it is already versioned.
    const arxivId = withVersionSuffix(raw.id || headerId, latest.version);
    const categories = splitCategories(raw.categories);
    const primaryCategory = categories[0];
    return {
      id: `${ARXIV_ABS_BASE}${arxivId}`,
      arxivId,
      title: normalizeWhitespace(raw.title ?? ''),
      summary: normalizeWhitespace(raw.abstract ?? ''),
      // First listed version gives the publication date.
      published: asArray(raw.version)[0]?.date ?? headerDate,
      updated: latest.date || headerDate,
      authors: parseAuthorsList(raw.authors),
      categories,
      ...(primaryCategory ? { primaryCategory } : {}),
      links: buildDefaultLinks(arxivId),
      ...(raw.doi ? { doi: raw.doi } : {}),
      ...(raw['journal-ref'] ? { journalRef: raw['journal-ref'] } : {}),
      ...(raw.comments ? { comment: normalizeWhitespace(raw.comments) } : {}),
    };
  }

  if ('arXivOld' in metadata) {
    const old = metadata.arXivOld;
    const arxivId = old.id || headerId;
    const categories = splitCategories(old.categories);
    const primaryCategory = categories[0];
    return {
      id: `${ARXIV_ABS_BASE}${arxivId}`,
      arxivId,
      title: normalizeWhitespace(old.title ?? ''),
      summary: normalizeWhitespace(old.abstract ?? ''),
      // Both dates come from the record header in this branch.
      published: headerDate,
      updated: headerDate,
      authors: parseAuthorsList(old.authors),
      categories,
      ...(primaryCategory ? { primaryCategory } : {}),
      links: buildDefaultLinks(arxivId),
      ...(old.doi ? { doi: old.doi } : {}),
      ...(old['journal-ref'] ? { journalRef: old['journal-ref'] } : {}),
      ...(old.comments ? { comment: normalizeWhitespace(old.comments) } : {}),
    };
  }

  const dc = metadata.dc;
  const dcIdentifier = firstNonEmpty(toStringArray(dc.identifier));
  // The identifier may be an OAI id or an abstract-page URL; strip either prefix.
  const arxivId =
    oaiIdentifierToArxivId(dcIdentifier).replace(/^https?:\/\/arxiv\.org\/abs\//i, '') ||
    headerId;
  const authors = toStringArray(dc.creator).map((name) => ({ name }));
  const categories = toStringArray(dc.subject);
  const primaryCategory = categories[0];
  const published = firstNonEmpty(toStringArray(dc.date)) || headerDate;
  const summary = firstNonEmpty(toStringArray(dc.description));
  const title = firstNonEmpty(toStringArray(dc.title));

  return {
    id: `${ARXIV_ABS_BASE}${arxivId}`,
    arxivId,
    title,
    summary,
    published,
    updated: headerDate,
    authors,
    categories,
    ...(primaryCategory ? { primaryCategory } : {}),
    links: buildDefaultLinks(arxivId),
  };
}
167
+
168
/**
 * Convert one OAI record to the package's ArxivEntry shape.
 *
 * @returns The converted entry, or `null` when the record is marked deleted
 *          or carries no metadata.
 */
export function oaiRecordToArxivEntry(record: OaiRecord): ArxivEntry | null {
  const { header, metadata } = record;
  if (header.status === 'deleted') {
    return null;
  }
  if (metadata == null) {
    return null;
  }
  return metadataToEntry(record, metadata);
}
176
+
177
/**
 * Convert OAI records to ArxivEntry objects, preserving order and skipping
 * records that are deleted or have no metadata.
 */
export function oaiRecordsToArxivEntries(records: OaiRecord[]): ArxivEntry[] {
  const entries: ArxivEntry[] = [];
  for (const record of records) {
    const entry = oaiRecordToArxivEntry(record);
    if (entry != null) {
      entries.push(entry);
    }
  }
  return entries;
}
183
+
184
/**
 * Convert an OAI ListRecords result to the same shape returned by getArxivEntries().
 * Feed values are synthesized from OAI response data:
 *
 * - `updated` is the most recent `updated` value among the converted entries.
 *   Values are compared as timestamps when both parse as dates and as plain
 *   strings otherwise, so non-ISO date strings are ordered correctly.
 * - `startIndex` is the resumption token's `cursor`, which OAI-PMH defines as
 *   the offset of this page's first record in the complete list; it is 0 when
 *   no usable cursor is available.
 * - `totalResults` is the token's `completeListSize`, falling back to the
 *   number of converted entries.
 * - `itemsPerPage` is the number of converted entries (deleted and
 *   metadata-less records are not counted).
 *
 * @param result - Parsed ListRecords result.
 * @returns Feed header plus the converted entries.
 */
export function oaiListRecordsToArxivQueryResult(result: OaiListRecordsResult): ArxivQueryResult {
  const entries = oaiRecordsToArxivEntries(result.records);
  const token = result.resumptionToken;

  // Select the newest `updated` value without relying on lexicographic order.
  let latestUpdated = '';
  let latestTime = Number.NaN;
  for (const { updated } of entries) {
    if (!updated) continue;
    const time = Date.parse(updated);
    const comparable = !Number.isNaN(time) && !Number.isNaN(latestTime);
    const newer =
      latestUpdated === '' || (comparable ? time > latestTime : updated > latestUpdated);
    if (newer) {
      latestUpdated = updated;
      latestTime = time;
    }
  }

  const isUsableCount = (n: number | undefined): n is number =>
    typeof n === 'number' && Number.isFinite(n) && n >= 0;

  const startIndex = isUsableCount(token?.cursor) ? token.cursor : 0;
  const totalResults = isUsableCount(token?.completeListSize)
    ? token.completeListSize
    : entries.length;

  return {
    feed: {
      id: OAI_BASE_URL,
      updated: latestUpdated,
      title: 'arXiv OAI converted records',
      link: OAI_BASE_URL,
      totalResults,
      startIndex,
      itemsPerPage: entries.length,
    },
    entries,
  };
}