arxiv-api-wrapper 2.1.1 → 2.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +2 -2
- package/src/atom.ts +12 -2
- package/src/oaiParser.ts +10 -2
- package/tests/oai.integration.test.ts +38 -0
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "arxiv-api-wrapper",
|
|
3
|
-
"version": "2.1.1",
|
|
3
|
+
"version": "2.1.2",
|
|
4
4
|
"description": "Provides functions wrapping the arXiv API",
|
|
5
5
|
"keywords": [
|
|
6
6
|
"arxiv"
|
|
@@ -26,7 +26,7 @@
|
|
|
26
26
|
"docs:serve": "npx serve docs"
|
|
27
27
|
},
|
|
28
28
|
"dependencies": {
|
|
29
|
-
"fast-xml-parser": "^5.5.
|
|
29
|
+
"fast-xml-parser": "^5.5.7"
|
|
30
30
|
},
|
|
31
31
|
"devDependencies": {
|
|
32
32
|
"@types/node": "^25.0.0",
|
package/src/atom.ts
CHANGED
|
@@ -23,7 +23,12 @@ function normalizeWhitespace(str: string): string {
|
|
|
23
23
|
}
|
|
24
24
|
|
|
25
25
|
export function parseFeedMeta(xml: string): ArxivFeedMeta {
|
|
26
|
-
|
|
26
|
+
let doc: any;
|
|
27
|
+
try {
|
|
28
|
+
doc = parser.parse(xml) as any;
|
|
29
|
+
} catch (error) {
|
|
30
|
+
throw new Error('Failed to parse Atom feed XML: ' + (error as Error).message);
|
|
31
|
+
}
|
|
27
32
|
const feed = doc.feed || {};
|
|
28
33
|
|
|
29
34
|
const title: string = feed.title ?? '';
|
|
@@ -46,7 +51,12 @@ export function parseFeedMeta(xml: string): ArxivFeedMeta {
|
|
|
46
51
|
}
|
|
47
52
|
|
|
48
53
|
export function parseEntries(xml: string): ArxivEntry[] {
|
|
49
|
-
|
|
54
|
+
let doc: any;
|
|
55
|
+
try {
|
|
56
|
+
doc = parser.parse(xml) as any;
|
|
57
|
+
} catch (error) {
|
|
58
|
+
throw new Error('Failed to parse Atom feed XML: ' + (error as Error).message);
|
|
59
|
+
}
|
|
50
60
|
const feed = doc.feed || {};
|
|
51
61
|
const rawEntries = Array.isArray(feed.entry) ? feed.entry : (feed.entry ? [feed.entry] : []);
|
|
52
62
|
|
package/src/oaiParser.ts
CHANGED
|
@@ -16,6 +16,8 @@ import {
|
|
|
16
16
|
} from './oaiTypes.js';
|
|
17
17
|
import { XMLParser } from 'fast-xml-parser';
|
|
18
18
|
|
|
19
|
+
// ListRecords pages can hold ~1500 records; each text node can contribute several entity
|
|
20
|
+
// expansions, so the library default (1000) is too low. Keep a high finite cap (trusted HTTPS).
|
|
19
21
|
const parser = new XMLParser({
|
|
20
22
|
ignoreAttributes: false,
|
|
21
23
|
attributeNamePrefix: '',
|
|
@@ -23,7 +25,7 @@ const parser = new XMLParser({
|
|
|
23
25
|
trimValues: true,
|
|
24
26
|
parseTagValue: false,
|
|
25
27
|
processEntities: {
|
|
26
|
-
maxTotalExpansions:
|
|
28
|
+
maxTotalExpansions: 10_000,
|
|
27
29
|
},
|
|
28
30
|
});
|
|
29
31
|
|
|
@@ -121,7 +123,13 @@ function parseRecord(el: unknown): OaiRecord {
|
|
|
121
123
|
}
|
|
122
124
|
|
|
123
125
|
function getRoot(xml: string): Record<string, unknown> {
|
|
124
|
-
|
|
126
|
+
let doc: Record<string, unknown>;
|
|
127
|
+
try {
|
|
128
|
+
doc = parser.parse(xml) as Record<string, unknown>;
|
|
129
|
+
} catch (error) {
|
|
130
|
+
throw new OaiError('badArgument', 'Failed to parse OAI-PMH response XML: ' + (error as Error).message);
|
|
131
|
+
}
|
|
132
|
+
|
|
125
133
|
const root = doc['OAI-PMH'] ?? doc['OAIPMH'] ?? doc;
|
|
126
134
|
if (root == null || typeof root !== 'object') {
|
|
127
135
|
throw new OaiError('badArgument', 'Invalid OAI-PMH response: no root element');
|
package/tests/oai.integration.test.ts
CHANGED
|
@@ -22,6 +22,10 @@ const OAI_OPTIONS = {
|
|
|
22
22
|
userAgent: 'arxiv-api-wrapper-tests/1.0',
|
|
23
23
|
};
|
|
24
24
|
|
|
25
|
+
/** arXiv OAI earliest datestamp day; dense enough for a max-sized ListRecords page + resumption. */
|
|
26
|
+
const HARVEST_FIRST_DAY_FROM = '2005-09-16';
|
|
27
|
+
const HARVEST_FIRST_DAY_UNTIL = '2005-09-17';
|
|
28
|
+
|
|
25
29
|
afterEach(() => {
|
|
26
30
|
vi.restoreAllMocks();
|
|
27
31
|
});
|
|
@@ -73,6 +77,40 @@ describe('OAI-PMH integration', () => {
|
|
|
73
77
|
}
|
|
74
78
|
}, 30000);
|
|
75
79
|
|
|
80
|
+
it(
|
|
81
|
+
'oaiListRecords parses a full first page and paginates for 2005-09-16 .. 2005-09-17',
|
|
82
|
+
async () => {
|
|
83
|
+
// arXiv OAI returns up to 1500 records per page when the list continues (resumptionToken set).
|
|
84
|
+
const largePageOptions = {
|
|
85
|
+
...OAI_OPTIONS,
|
|
86
|
+
timeoutMs: 120000,
|
|
87
|
+
from: HARVEST_FIRST_DAY_FROM,
|
|
88
|
+
until: HARVEST_FIRST_DAY_UNTIL,
|
|
89
|
+
};
|
|
90
|
+
|
|
91
|
+
const firstPage = await oaiListRecords('oai_dc', largePageOptions);
|
|
92
|
+
expect(firstPage.records).toHaveLength(1500);
|
|
93
|
+
expect(firstPage.resumptionToken?.value).toBeTruthy();
|
|
94
|
+
|
|
95
|
+
const assertRecordShape = (rec: (typeof firstPage.records)[0]) => {
|
|
96
|
+
expect(rec.header.identifier).toBeTruthy();
|
|
97
|
+
expect(rec.header.datestamp).toBeTruthy();
|
|
98
|
+
expect(rec.metadata).toBeDefined();
|
|
99
|
+
expect(typeof rec.metadata).toBe('object');
|
|
100
|
+
};
|
|
101
|
+
assertRecordShape(firstPage.records[0]);
|
|
102
|
+
assertRecordShape(firstPage.records[firstPage.records.length - 1]);
|
|
103
|
+
|
|
104
|
+
const secondPage = await oaiListRecords('oai_dc', {
|
|
105
|
+
...OAI_OPTIONS,
|
|
106
|
+
timeoutMs: 120000,
|
|
107
|
+
resumptionToken: firstPage.resumptionToken!.value,
|
|
108
|
+
});
|
|
109
|
+
expect(secondPage.records.length).toBeGreaterThan(0);
|
|
110
|
+
},
|
|
111
|
+
120000
|
|
112
|
+
);
|
|
113
|
+
|
|
76
114
|
it('oaiListRecords continuation requests work with resumptionToken-only options', async () => {
|
|
77
115
|
const firstPage = await oaiListRecords('oai_dc', OAI_OPTIONS);
|
|
78
116
|
expect(firstPage.resumptionToken?.value).toBeTruthy();
|