arxiv-api-wrapper 1.1.0 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +318 -250
- package/package.json +6 -4
- package/src/arxivAPIRead.ts +316 -316
- package/src/atom.ts +1 -1
- package/src/index.ts +102 -57
- package/src/oaiClient.ts +425 -0
- package/src/oaiParser.ts +264 -0
- package/src/oaiToArxiv.ts +204 -0
- package/src/oaiTypes.ts +248 -0
- package/src/types.ts +265 -265
- package/tests/arxivAPI.integration.test.ts +144 -144
- package/tests/arxivAPIRead.test.ts +1 -1
- package/tests/fixtures/parseEntries/2507.17541.json.ts +1 -1
- package/tests/fixtures/parseEntries/search_agdur.json.ts +1 -1
- package/tests/oai.integration.test.ts +222 -0
- package/tests/oai.test.ts +248 -0
- package/tests/oaiToArxiv.test.ts +131 -0
- package/tsconfig.json +13 -0
package/src/arxivAPIRead.ts
CHANGED
|
@@ -1,316 +1,316 @@
|
|
|
1
|
-
import { ArxivQueryOptions, ArxivQueryResult, ArxivSearchFilters, ArxivRateLimitConfig } from './types';
|
|
2
|
-
import { TokenBucketLimiter } from './rateLimiter';
|
|
3
|
-
import { fetchWithRetry } from './http';
|
|
4
|
-
import { parseEntries, parseFeedMeta } from './atom';
|
|
5
|
-
|
|
6
|
-
const ARXIV_BASE_URL = 'https://export.arxiv.org/api/query';
|
|
7
|
-
|
|
8
|
-
function encodeAuthor(term: string): string {
|
|
9
|
-
// Always quote terms to match arXiv's expected format
|
|
10
|
-
// Keep spaces - they'll be URL-encoded as %20
|
|
11
|
-
const normalized = term.trim();
|
|
12
|
-
return '"' + normalized + '"';
|
|
13
|
-
}
|
|
14
|
-
|
|
15
|
-
function encodePhrase(term: string, phraseExact?: boolean): string {
|
|
16
|
-
// Always quote terms to match arXiv's expected format
|
|
17
|
-
// Keep spaces - they'll be URL-encoded as %20
|
|
18
|
-
const normalized = term.trim();
|
|
19
|
-
return '"' + normalized + '"';
|
|
20
|
-
}
|
|
21
|
-
|
|
22
|
-
function fieldExpr(field: string, terms: string[] = [], phraseExact?: boolean): string[] {
|
|
23
|
-
if (!terms.length) return [];
|
|
24
|
-
if (field === 'au') {
|
|
25
|
-
return terms.map((t) => `${field}:${encodeAuthor(t)}`);
|
|
26
|
-
}
|
|
27
|
-
return terms.map((t) => `${field}:${encodePhrase(t, phraseExact)}`);
|
|
28
|
-
}
|
|
29
|
-
|
|
30
|
-
function rangeExpr(field: string, from: string, to: string): string {
|
|
31
|
-
return `${field}:[${from}+TO+${to}]`;
|
|
32
|
-
}
|
|
33
|
-
|
|
34
|
-
function groupOr(subfilters: string[]): string {
|
|
35
|
-
if (subfilters.length === 0) return '';
|
|
36
|
-
if (subfilters.length === 1) return subfilters[0];
|
|
37
|
-
return `(${subfilters.join('+OR+')})`;
|
|
38
|
-
}
|
|
39
|
-
|
|
40
|
-
function groupParen(expr: string): string {
|
|
41
|
-
return `(${expr})`;
|
|
42
|
-
}
|
|
43
|
-
|
|
44
|
-
function joinAnd(parts: string[]): string {
|
|
45
|
-
return parts.filter(Boolean).join('+AND+');
|
|
46
|
-
}
|
|
47
|
-
|
|
48
|
-
/**
|
|
49
|
-
* Builds an arXiv search query string from search filters.
|
|
50
|
-
*
|
|
51
|
-
* This function converts the structured `ArxivSearchFilters` object into
|
|
52
|
-
* a query string compatible with the arXiv API search syntax. Multiple terms
|
|
53
|
-
* in the same field are combined with AND, and multiple fields are combined
|
|
54
|
-
* with AND. OR groups and negation (ANDNOT) are also supported.
|
|
55
|
-
*
|
|
56
|
-
* @param filters - Search filters to convert to query string
|
|
57
|
-
* @returns URL-encoded query string ready for arXiv API
|
|
58
|
-
*
|
|
59
|
-
* @example
|
|
60
|
-
* ```typescript
|
|
61
|
-
* const query = buildSearchQuery({
|
|
62
|
-
* title: ['machine learning'],
|
|
63
|
-
* author: ['Geoffrey Hinton'],
|
|
64
|
-
* });
|
|
65
|
-
* // Returns: "ti:\"machine learning\"+AND+au:\"Geoffrey Hinton\""
|
|
66
|
-
* ```
|
|
67
|
-
*
|
|
68
|
-
* @example
|
|
69
|
-
* ```typescript
|
|
70
|
-
* // Complex query with OR groups
|
|
71
|
-
* const query = buildSearchQuery({
|
|
72
|
-
* or: [
|
|
73
|
-
* { title: ['quantum'] },
|
|
74
|
-
* { abstract: ['quantum'] },
|
|
75
|
-
* ],
|
|
76
|
-
* category: ['quant-ph'],
|
|
77
|
-
* });
|
|
78
|
-
* ```
|
|
79
|
-
*
|
|
80
|
-
* @see {@link ArxivSearchFilters} for filter options
|
|
81
|
-
*/
|
|
82
|
-
export function buildSearchQuery(filters: ArxivSearchFilters): string {
|
|
83
|
-
const parts: string[] = [];
|
|
84
|
-
const phraseExact = filters.phraseExact;
|
|
85
|
-
|
|
86
|
-
parts.push(...fieldExpr('all', filters.all, phraseExact)); // "all:" is supported per manual
|
|
87
|
-
parts.push(...fieldExpr('ti', filters.title, phraseExact));
|
|
88
|
-
parts.push(...fieldExpr('au', filters.author, phraseExact));
|
|
89
|
-
parts.push(...fieldExpr('abs', filters.abstract, phraseExact));
|
|
90
|
-
parts.push(...fieldExpr('co', filters.comment, phraseExact));
|
|
91
|
-
parts.push(...fieldExpr('jr', filters.journalRef, phraseExact));
|
|
92
|
-
parts.push(...fieldExpr('cat', filters.category, false));
|
|
93
|
-
|
|
94
|
-
if (filters.submittedDateRange) {
|
|
95
|
-
const { from, to } = filters.submittedDateRange;
|
|
96
|
-
parts.push(rangeExpr('submittedDate', from, to));
|
|
97
|
-
}
|
|
98
|
-
|
|
99
|
-
// OR group: each subfilter becomes an AND-joined clause, then ORed as a group
|
|
100
|
-
if (filters.or && filters.or.length > 0) {
|
|
101
|
-
const orClauses = filters.or.map((sf) => buildSearchQuery({ ...sf, or: undefined, andNot: undefined }));
|
|
102
|
-
const grouped = groupOr(orClauses);
|
|
103
|
-
if (grouped) parts.push(grouped);
|
|
104
|
-
}
|
|
105
|
-
|
|
106
|
-
// Build the base query from regular parts
|
|
107
|
-
const baseQuery = joinAnd(parts);
|
|
108
|
-
|
|
109
|
-
// ANDNOT group: a single negated clause (appended separately, not joined with AND)
|
|
110
|
-
if (filters.andNot) {
|
|
111
|
-
const neg = buildSearchQuery({ ...filters.andNot, or: undefined, andNot: undefined });
|
|
112
|
-
if (neg) {
|
|
113
|
-
if (baseQuery) {
|
|
114
|
-
return `${baseQuery}+ANDNOT+${groupParen(neg)}`;
|
|
115
|
-
}
|
|
116
|
-
return `ANDNOT+${groupParen(neg)}`;
|
|
117
|
-
}
|
|
118
|
-
}
|
|
119
|
-
|
|
120
|
-
return baseQuery;
|
|
121
|
-
}
|
|
122
|
-
|
|
123
|
-
function buildUrl(opts: ArxivQueryOptions): string {
|
|
124
|
-
const params: string[] = [];
|
|
125
|
-
|
|
126
|
-
// Add id_list if it exists and has at least one item
|
|
127
|
-
if (opts.idList && Array.isArray(opts.idList) && opts.idList.length > 0) {
|
|
128
|
-
params.push('id_list=' + encodeURIComponent(opts.idList.join(',')));
|
|
129
|
-
}
|
|
130
|
-
|
|
131
|
-
// Add search_query if search is provided (can be used together with id_list)
|
|
132
|
-
if (opts.search) {
|
|
133
|
-
const q = buildSearchQuery(opts.search);
|
|
134
|
-
// Encode the query properly: use encodeURIComponent to encode all special characters,
|
|
135
|
-
// then replace %2B back to + so that + signs decode as spaces (arXiv expects spaces around AND/OR)
|
|
136
|
-
const encodedQuery = encodeURIComponent(q).replace(/%2B/g, '+');
|
|
137
|
-
params.push('search_query=' + encodedQuery);
|
|
138
|
-
}
|
|
139
|
-
if (typeof opts.start === 'number') params.push('start=' + String(opts.start));
|
|
140
|
-
if (typeof opts.maxResults === 'number') params.push('max_results=' + String(opts.maxResults));
|
|
141
|
-
if (opts.sortBy) params.push('sortBy=' + encodeURIComponent(opts.sortBy));
|
|
142
|
-
if (opts.sortOrder) params.push('sortOrder=' + encodeURIComponent(opts.sortOrder));
|
|
143
|
-
const qs = params.join('&');
|
|
144
|
-
return `${ARXIV_BASE_URL}?${qs}`;
|
|
145
|
-
}
|
|
146
|
-
|
|
147
|
-
/**
|
|
148
|
-
* Queries the arXiv API and returns matching paper entries.
|
|
149
|
-
*
|
|
150
|
-
* This is the main function for interacting with the arXiv API. It supports
|
|
151
|
-
* searching by various criteria, fetching specific papers by ID, pagination,
|
|
152
|
-
* sorting, rate limiting, and automatic retries with exponential backoff.
|
|
153
|
-
*
|
|
154
|
-
* @param options - Query options including search filters, pagination, and request configuration
|
|
155
|
-
* @returns Promise resolving to query results with feed metadata and paper entries
|
|
156
|
-
*
|
|
157
|
-
* @throws {Error} If the API request fails after all retries
|
|
158
|
-
* @throws {Error} If the API returns a non-2xx status code
|
|
159
|
-
* @throws {Error} If the API returns an empty response
|
|
160
|
-
*
|
|
161
|
-
* @example
|
|
162
|
-
* ```typescript
|
|
163
|
-
* // Simple search
|
|
164
|
-
* const result = await getArxivEntries({
|
|
165
|
-
* search: {
|
|
166
|
-
* title: ['quantum computing'],
|
|
167
|
-
* author: ['John Doe'],
|
|
168
|
-
* },
|
|
169
|
-
* maxResults: 10,
|
|
170
|
-
* });
|
|
171
|
-
*
|
|
172
|
-
* console.log(`Found ${result.feed.totalResults} papers`);
|
|
173
|
-
* result.entries.forEach(entry => {
|
|
174
|
-
* console.log(`${entry.arxivId}: ${entry.title}`);
|
|
175
|
-
* });
|
|
176
|
-
* ```
|
|
177
|
-
*
|
|
178
|
-
* @example
|
|
179
|
-
* ```typescript
|
|
180
|
-
* // Fetch specific papers by ID
|
|
181
|
-
* const result = await getArxivEntries({
|
|
182
|
-
* idList: ['2101.01234', '2101.05678'],
|
|
183
|
-
* });
|
|
184
|
-
* ```
|
|
185
|
-
*
|
|
186
|
-
* @example
|
|
187
|
-
* ```typescript
|
|
188
|
-
* // With rate limiting and custom timeout
|
|
189
|
-
* const result = await getArxivEntries({
|
|
190
|
-
* search: { title: ['neural networks'] },
|
|
191
|
-
* rateLimit: {
|
|
192
|
-
* tokensPerInterval: 1,
|
|
193
|
-
* intervalMs: 3000, // 1 request per 3 seconds
|
|
194
|
-
* },
|
|
195
|
-
* timeoutMs: 15000,
|
|
196
|
-
* retries: 5,
|
|
197
|
-
* });
|
|
198
|
-
* ```
|
|
199
|
-
*
|
|
200
|
-
* @see {@link ArxivQueryOptions} for all available options
|
|
201
|
-
* @see {@link ArxivQueryResult} for the return type structure
|
|
202
|
-
* @see {@link ArxivSearchFilters} for search filter options
|
|
203
|
-
*/
|
|
204
|
-
export async function getArxivEntries(options: ArxivQueryOptions): Promise<ArxivQueryResult> {
|
|
205
|
-
const timeoutMs = options.timeoutMs ?? 10000;
|
|
206
|
-
const retries = options.retries ?? 3;
|
|
207
|
-
const userAgent = options.userAgent ?? 'arxiv-api-wrapper/1.0 (+https://export.arxiv.org)';
|
|
208
|
-
|
|
209
|
-
const limiter = options.rateLimit
|
|
210
|
-
? new TokenBucketLimiter(options.rateLimit.tokensPerInterval, options.rateLimit.intervalMs)
|
|
211
|
-
: undefined;
|
|
212
|
-
|
|
213
|
-
const url = buildUrl(options);
|
|
214
|
-
if (limiter) await limiter.acquire();
|
|
215
|
-
|
|
216
|
-
const res = await fetchWithRetry(url, { method: 'GET', headers: { Accept: 'application/atom+xml' } }, { retries, timeoutMs, userAgent });
|
|
217
|
-
|
|
218
|
-
// Check response status before parsing
|
|
219
|
-
if (!res.ok) {
|
|
220
|
-
const errorText = await res.text().catch(() => 'Unable to read error response');
|
|
221
|
-
throw new Error(
|
|
222
|
-
`arXiv API returned status ${res.status} ${res.statusText} for URL: ${url}. ` +
|
|
223
|
-
`Response: ${errorText.substring(0, 500)}`
|
|
224
|
-
);
|
|
225
|
-
}
|
|
226
|
-
|
|
227
|
-
const text = await res.text();
|
|
228
|
-
|
|
229
|
-
// Log the response for debugging if it appears empty
|
|
230
|
-
if (!text || text.trim().length === 0) {
|
|
231
|
-
console.error(`Empty response from arXiv API. URL: ${url}, Status: ${res.status}`);
|
|
232
|
-
throw new Error(`arXiv API returned empty response for URL: ${url}`);
|
|
233
|
-
}
|
|
234
|
-
|
|
235
|
-
const feed = parseFeedMeta(text);
|
|
236
|
-
const entries = parseEntries(text);
|
|
237
|
-
|
|
238
|
-
// Log if parsing resulted in empty data
|
|
239
|
-
if (feed.totalResults === 0 && entries.length === 0 && text.length > 0) {
|
|
240
|
-
console.warn(`Parsed empty results from non-empty response. URL: ${url}, Response length: ${text.length}`);
|
|
241
|
-
console.warn(`Response preview (first 500 chars): ${text.substring(0, 500)}`);
|
|
242
|
-
}
|
|
243
|
-
|
|
244
|
-
return { feed, entries };
|
|
245
|
-
}
|
|
246
|
-
|
|
247
|
-
/**
|
|
248
|
-
* Fetches arXiv papers by their IDs using the simpler id_list API mode.
|
|
249
|
-
*
|
|
250
|
-
* This is a convenience function for the simpler arXiv API mode where you provide
|
|
251
|
-
* a comma-delimited list of paper IDs and get back the data for those papers.
|
|
252
|
-
* It's simpler than using search queries when you already know the paper IDs.
|
|
253
|
-
*
|
|
254
|
-
* @param ids - Array of arXiv paper IDs (e.g., ['2101.01234', '2101.05678']). Maximum 100 IDs allowed.
|
|
255
|
-
* @param options - Optional request configuration
|
|
256
|
-
* @param options.rateLimit - Rate limiting configuration to respect arXiv API guidelines
|
|
257
|
-
* @param options.retries - Number of retry attempts for failed requests (default: 3)
|
|
258
|
-
* @param options.timeoutMs - Request timeout in milliseconds (default: 10000)
|
|
259
|
-
* @param options.userAgent - Custom User-Agent header for requests
|
|
260
|
-
* @returns Promise resolving to query results with feed metadata and paper entries
|
|
261
|
-
*
|
|
262
|
-
* @throws {Error} If more than 100 IDs are provided
|
|
263
|
-
* @throws {Error} If the API request fails after all retries
|
|
264
|
-
* @throws {Error} If the API returns a non-2xx status code
|
|
265
|
-
* @throws {Error} If the API returns an empty response
|
|
266
|
-
*
|
|
267
|
-
* @example
|
|
268
|
-
* ```typescript
|
|
269
|
-
* // Fetch papers by ID
|
|
270
|
-
* const result = await getArxivEntriesById(['2101.01234', '2101.05678']);
|
|
271
|
-
*
|
|
272
|
-
* result.entries.forEach(entry => {
|
|
273
|
-
* console.log(`${entry.arxivId}: ${entry.title}`);
|
|
274
|
-
* });
|
|
275
|
-
* ```
|
|
276
|
-
*
|
|
277
|
-
* @example
|
|
278
|
-
* ```typescript
|
|
279
|
-
* // With rate limiting
|
|
280
|
-
* const result = await getArxivEntriesById(
|
|
281
|
-
* ['2101.01234'],
|
|
282
|
-
* {
|
|
283
|
-
* rateLimit: {
|
|
284
|
-
* tokensPerInterval: 1,
|
|
285
|
-
* intervalMs: 3000, // 1 request per 3 seconds
|
|
286
|
-
* },
|
|
287
|
-
* timeoutMs: 15000,
|
|
288
|
-
* }
|
|
289
|
-
* );
|
|
290
|
-
* ```
|
|
291
|
-
*
|
|
292
|
-
* @see {@link getArxivEntries} for more advanced querying with search filters
|
|
293
|
-
* @see {@link ArxivQueryResult} for the return type structure
|
|
294
|
-
*/
|
|
295
|
-
export async function getArxivEntriesById(
|
|
296
|
-
ids: string[],
|
|
297
|
-
options?: {
|
|
298
|
-
rateLimit?: ArxivRateLimitConfig;
|
|
299
|
-
retries?: number;
|
|
300
|
-
timeoutMs?: number;
|
|
301
|
-
userAgent?: string;
|
|
302
|
-
}
|
|
303
|
-
): Promise<ArxivQueryResult> {
|
|
304
|
-
if (ids.length > 100) {
|
|
305
|
-
throw new Error(`Maximum of 100 IDs allowed, but ${ids.length} IDs were provided`);
|
|
306
|
-
}
|
|
307
|
-
|
|
308
|
-
return getArxivEntries({
|
|
309
|
-
idList: ids,
|
|
310
|
-
rateLimit: options?.rateLimit,
|
|
311
|
-
retries: options?.retries,
|
|
312
|
-
timeoutMs: options?.timeoutMs,
|
|
313
|
-
userAgent: options?.userAgent,
|
|
314
|
-
});
|
|
315
|
-
}
|
|
316
|
-
|
|
1
|
+
import { ArxivQueryOptions, ArxivQueryResult, ArxivSearchFilters, ArxivRateLimitConfig } from './types.js';
|
|
2
|
+
import { TokenBucketLimiter } from './rateLimiter.js';
|
|
3
|
+
import { fetchWithRetry } from './http.js';
|
|
4
|
+
import { parseEntries, parseFeedMeta } from './atom.js';
|
|
5
|
+
|
|
6
|
+
const ARXIV_BASE_URL = 'https://export.arxiv.org/api/query';
|
|
7
|
+
|
|
8
|
+
/**
 * Wraps an author search term in double quotes so it is sent as a single phrase.
 *
 * Double-quote characters already present in the term are removed first. An
 * embedded quote would otherwise end the phrase early, and an already-quoted
 * term would come out double-quoted (`""name""`), corrupting the query.
 * Inner spaces are kept as-is; percent-encoding happens later, when the URL
 * is assembled.
 *
 * @param term - Raw author name, e.g. `Geoffrey Hinton`
 * @returns The trimmed term enclosed in exactly one pair of double quotes
 */
function encodeAuthor(term: string): string {
  // Strip quotes before trimming so whitespace exposed by the removal is trimmed too.
  const normalized = term.replace(/"/g, '').trim();
  return `"${normalized}"`;
}
|
|
14
|
+
|
|
15
|
+
/**
 * Wraps a free-text search term in double quotes so it is sent as a single phrase.
 *
 * Double-quote characters already present in the term are removed first. An
 * embedded quote would otherwise end the phrase early, and an already-quoted
 * term would come out double-quoted, corrupting the query. Inner spaces are
 * kept as-is; percent-encoding happens later, when the URL is assembled.
 *
 * @param term - Raw search term, e.g. `machine learning`
 * @param phraseExact - Accepted for interface compatibility; this function
 *   does not read it and quotes every term regardless of its value
 * @returns The trimmed term enclosed in exactly one pair of double quotes
 */
function encodePhrase(term: string, phraseExact?: boolean): string {
  // Strip quotes before trimming so whitespace exposed by the removal is trimmed too.
  const normalized = term.replace(/"/g, '').trim();
  return `"${normalized}"`;
}
|
|
21
|
+
|
|
22
|
+
/**
 * Turns a list of terms into `field:"term"` clauses, one per term.
 *
 * Author terms (`field === 'au'`) go through `encodeAuthor`; every other
 * field goes through `encodePhrase`, which also receives `phraseExact`.
 *
 * @param field - arXiv field prefix such as `ti`, `au`, `abs` or `cat`
 * @param terms - Terms to search for in that field; defaults to none
 * @param phraseExact - Forwarded to `encodePhrase` for non-author fields
 * @returns One clause per term, in input order; empty when there are no terms
 */
function fieldExpr(field: string, terms: string[] = [], phraseExact?: boolean): string[] {
  if (terms.length === 0) {
    return [];
  }

  const isAuthorField = field === 'au';
  const clauses: string[] = [];
  for (const rawTerm of terms) {
    const quoted = isAuthorField ? encodeAuthor(rawTerm) : encodePhrase(rawTerm, phraseExact);
    clauses.push(field + ':' + quoted);
  }
  return clauses;
}
|
|
29
|
+
|
|
30
|
+
/**
 * Formats an inclusive range clause of the form `field:[from+TO+to]`.
 *
 * The bounds are inserted verbatim; no validation or reformatting is done.
 *
 * @param field - Field name the range applies to, e.g. `submittedDate`
 * @param from - Lower bound, copied as given
 * @param to - Upper bound, copied as given
 * @returns The assembled range clause
 */
function rangeExpr(field: string, from: string, to: string): string {
  const bounds = [from, 'TO', to].join('+');
  return field + ':[' + bounds + ']';
}
|
|
33
|
+
|
|
34
|
+
/**
 * Combines already-built clauses into a single OR group.
 *
 * Empty clauses are dropped before grouping. Without that, an empty entry
 * would produce a dangling operator such as `(+OR+x)`, and a lone real clause
 * sitting next to an empty one would be wrapped in a pointless OR group.
 *
 * @param subfilters - Clause strings to OR together; empty strings are ignored
 * @returns `''` when no non-empty clause remains, the clause itself when
 *   exactly one remains, otherwise `(a+OR+b+...)`
 */
function groupOr(subfilters: string[]): string {
  const clauses = subfilters.filter((clause) => clause.length > 0);
  if (clauses.length === 0) return '';
  // A single clause needs neither parentheses nor an operator.
  if (clauses.length === 1) return clauses[0];
  return `(${clauses.join('+OR+')})`;
}
|
|
39
|
+
|
|
40
|
+
/**
 * Encloses an expression in a single pair of parentheses.
 *
 * @param expr - Expression text to wrap; used verbatim
 * @returns `(` + expr + `)`
 */
function groupParen(expr: string): string {
  return '(' + expr + ')';
}
|
|
43
|
+
|
|
44
|
+
/**
 * Joins clauses with the `+AND+` operator, skipping empty ones.
 *
 * @param parts - Clause strings; falsy entries (such as `''`) are left out
 * @returns The remaining clauses joined by `+AND+`, or `''` when none remain
 */
function joinAnd(parts: string[]): string {
  const kept: string[] = [];
  for (const piece of parts) {
    if (piece) {
      kept.push(piece);
    }
  }
  return kept.join('+AND+');
}
|
|
47
|
+
|
|
48
|
+
/**
 * Builds an arXiv `search_query` expression from structured search filters.
 *
 * Every term of every field filter (`all`, `title`, `author`, `abstract`,
 * `comment`, `journalRef`, `category`) becomes a quoted `field:"term"` clause,
 * and all clauses are joined with `+AND+`. An optional `submittedDateRange`
 * adds a `submittedDate:[from+TO+to]` clause. Entries of `or` are each built
 * into their own AND-joined clause and then combined into one OR group, which
 * is ANDed with the rest. `andNot` is built the same way and appended as
 * `+ANDNOT+(...)`.
 *
 * The returned string is NOT percent-encoded: `+` stands for the separator
 * around operators and terms keep their raw spaces and quotes. Encoding is
 * done by the caller that places the query into a URL.
 *
 * Nested `or` / `andNot` properties inside an `or` entry or inside `andNot`
 * are ignored. OR entries that produce no clause at all (for example `{}`)
 * are skipped so that they cannot leave a dangling `+OR+` in the group.
 *
 * NOTE(review): when only `andNot` yields clauses, the result starts with
 * `ANDNOT+(...)`; whether the arXiv API accepts a leading ANDNOT should be
 * confirmed.
 *
 * @param filters - Search filters to convert
 * @returns The query expression, or `''` when the filters produce no clause
 *
 * @example
 * ```typescript
 * const query = buildSearchQuery({
 *   title: ['machine learning'],
 *   author: ['Geoffrey Hinton'],
 * });
 * // Returns: "ti:\"machine learning\"+AND+au:\"Geoffrey Hinton\""
 * ```
 *
 * @example
 * ```typescript
 * // OR group combined with a category filter
 * const query = buildSearchQuery({
 *   or: [
 *     { title: ['quantum'] },
 *     { abstract: ['quantum'] },
 *   ],
 *   category: ['quant-ph'],
 * });
 * // Returns: "cat:\"quant-ph\"+AND+(ti:\"quantum\"+OR+abs:\"quantum\")"
 * ```
 *
 * @see {@link ArxivSearchFilters} for filter options
 */
export function buildSearchQuery(filters: ArxivSearchFilters): string {
  const parts: string[] = [];
  const phraseExact = filters.phraseExact;

  parts.push(...fieldExpr('all', filters.all, phraseExact)); // "all:" is supported per manual
  parts.push(...fieldExpr('ti', filters.title, phraseExact));
  parts.push(...fieldExpr('au', filters.author, phraseExact));
  parts.push(...fieldExpr('abs', filters.abstract, phraseExact));
  parts.push(...fieldExpr('co', filters.comment, phraseExact));
  parts.push(...fieldExpr('jr', filters.journalRef, phraseExact));
  // Categories are always passed with phraseExact = false.
  parts.push(...fieldExpr('cat', filters.category, false));

  if (filters.submittedDateRange) {
    const { from, to } = filters.submittedDateRange;
    parts.push(rangeExpr('submittedDate', from, to));
  }

  // OR group: each subfilter becomes an AND-joined clause, then ORed as a group.
  // Nested or/andNot are cleared so the recursion is exactly one level deep.
  if (filters.or && filters.or.length > 0) {
    const orClauses = filters.or
      .map((sf: ArxivSearchFilters) => buildSearchQuery({ ...sf, or: undefined, andNot: undefined }))
      // Drop sub-filters that produced nothing; an empty clause would yield "(+OR+...)".
      .filter((clause: string) => clause !== '');
    const grouped = groupOr(orClauses);
    if (grouped) parts.push(grouped);
  }

  // Build the base query from regular parts
  const baseQuery = joinAnd(parts);

  // ANDNOT group: a single negated clause (appended separately, not joined with AND)
  if (filters.andNot) {
    const neg = buildSearchQuery({ ...filters.andNot, or: undefined, andNot: undefined });
    if (neg) {
      if (baseQuery) {
        return `${baseQuery}+ANDNOT+${groupParen(neg)}`;
      }
      return `ANDNOT+${groupParen(neg)}`;
    }
  }

  return baseQuery;
}
|
|
122
|
+
|
|
123
|
+
/**
 * Assembles the full arXiv query URL for a set of query options.
 *
 * Parameters are appended in a fixed order and only when present:
 * `id_list`, `search_query`, `start`, `max_results`, `sortBy`, `sortOrder`.
 * `id_list` and `search_query` may both appear in the same URL.
 *
 * @param opts - Query options to translate into URL parameters
 * @returns `ARXIV_BASE_URL` followed by `?` and the `&`-joined parameters
 */
function buildUrl(opts: ArxivQueryOptions): string {
  const pairs: string[] = [];

  // id_list is emitted only for a non-empty array of IDs.
  const ids = opts.idList;
  if (ids && Array.isArray(ids) && ids.length > 0) {
    const joinedIds = ids.join(',');
    pairs.push(`id_list=${encodeURIComponent(joinedIds)}`);
  }

  if (opts.search) {
    const rawQuery = buildSearchQuery(opts.search);
    // Percent-encode everything, then restore "+" so it still reads as the
    // space separator that surrounds AND/OR in the query.
    // NOTE(review): this also restores any "+" that came from a search term
    // itself (e.g. "C++"); confirm that is acceptable.
    const percentEncoded = encodeURIComponent(rawQuery);
    pairs.push(`search_query=${percentEncoded.replace(/%2B/g, '+')}`);
  }

  if (typeof opts.start === 'number') {
    pairs.push(`start=${String(opts.start)}`);
  }
  if (typeof opts.maxResults === 'number') {
    pairs.push(`max_results=${String(opts.maxResults)}`);
  }
  if (opts.sortBy) {
    pairs.push(`sortBy=${encodeURIComponent(opts.sortBy)}`);
  }
  if (opts.sortOrder) {
    pairs.push(`sortOrder=${encodeURIComponent(opts.sortOrder)}`);
  }

  return ARXIV_BASE_URL + '?' + pairs.join('&');
}
|
|
146
|
+
|
|
147
|
+
/**
 * Runs one query against the arXiv API and returns the parsed feed.
 *
 * Steps, in order: resolve request defaults (`timeoutMs` 10000, `retries` 3,
 * a built-in User-Agent), construct a `TokenBucketLimiter` when
 * `options.rateLimit` is given, build the request URL, wait for a limiter
 * token, issue the GET through `fetchWithRetry`, then parse the body with
 * `parseFeedMeta` and `parseEntries`.
 *
 * A new limiter is constructed on every call.
 * NOTE(review): whether that throttles a sequence of separate calls depends
 * on how `TokenBucketLimiter` initialises its bucket — confirm.
 *
 * When parsing yields zero total results and zero entries, a warning with a
 * 500-character preview of the body is logged and the empty result is still
 * returned.
 *
 * @param options - Search filters and/or ID list, pagination, sorting, and
 *   request configuration
 * @returns Promise resolving to `{ feed, entries }`
 *
 * @throws {Error} If the response status is not OK; the message includes the
 *   status, the URL and up to 500 characters of the response body
 * @throws {Error} If the response body is empty or whitespace-only
 * @throws Whatever `fetchWithRetry` rejects with — presumably after its
 *   retries are exhausted
 *
 * @example
 * ```typescript
 * const result = await getArxivEntries({
 *   search: { title: ['quantum computing'], author: ['John Doe'] },
 *   maxResults: 10,
 * });
 * console.log(`Found ${result.feed.totalResults} papers`);
 * ```
 *
 * @example
 * ```typescript
 * // Specific papers, throttled to one request per 3 seconds
 * const result = await getArxivEntries({
 *   idList: ['2101.01234', '2101.05678'],
 *   rateLimit: { tokensPerInterval: 1, intervalMs: 3000 },
 *   timeoutMs: 15000,
 *   retries: 5,
 * });
 * ```
 *
 * @see {@link ArxivQueryOptions} for all available options
 * @see {@link ArxivQueryResult} for the return type structure
 * @see {@link ArxivSearchFilters} for search filter options
 */
export async function getArxivEntries(options: ArxivQueryOptions): Promise<ArxivQueryResult> {
  const requestSettings = {
    retries: options.retries ?? 3,
    timeoutMs: options.timeoutMs ?? 10000,
    userAgent: options.userAgent ?? 'arxiv-api-wrapper/1.0 (+https://export.arxiv.org)',
  };

  // The limiter is created before the URL is built, but only awaited after.
  const { rateLimit } = options;
  let bucket: TokenBucketLimiter | undefined;
  if (rateLimit) {
    bucket = new TokenBucketLimiter(rateLimit.tokensPerInterval, rateLimit.intervalMs);
  }

  const requestUrl = buildUrl(options);
  if (bucket) {
    await bucket.acquire();
  }

  const response = await fetchWithRetry(
    requestUrl,
    { method: 'GET', headers: { Accept: 'application/atom+xml' } },
    requestSettings
  );

  // Fail fast on a non-OK status, attaching a bounded excerpt of the body.
  if (!response.ok) {
    const failureBody = await response.text().catch(() => 'Unable to read error response');
    const excerpt = failureBody.substring(0, 500);
    throw new Error(
      `arXiv API returned status ${response.status} ${response.statusText} for URL: ${requestUrl}. Response: ${excerpt}`
    );
  }

  const body = await response.text();

  // A blank body is logged and then rejected.
  const bodyIsBlank = !body || body.trim().length === 0;
  if (bodyIsBlank) {
    console.error(`Empty response from arXiv API. URL: ${requestUrl}, Status: ${response.status}`);
    throw new Error(`arXiv API returned empty response for URL: ${requestUrl}`);
  }

  const feed = parseFeedMeta(body);
  const entries = parseEntries(body);

  // Nothing parsed out of a non-empty body: warn, but still return the result.
  const nothingParsed = feed.totalResults === 0 && entries.length === 0 && body.length > 0;
  if (nothingParsed) {
    console.warn(`Parsed empty results from non-empty response. URL: ${requestUrl}, Response length: ${body.length}`);
    console.warn(`Response preview (first 500 chars): ${body.substring(0, 500)}`);
  }

  return { feed, entries };
}
|
|
246
|
+
|
|
247
|
+
/**
 * Fetches arXiv papers by ID using the `id_list` mode of the API.
 *
 * This is a thin wrapper over {@link getArxivEntries}: it checks the size of
 * the ID list and forwards the IDs together with the optional request
 * settings. No search filters, pagination or sorting are passed.
 *
 * @param ids - arXiv paper IDs (e.g., ['2101.01234', '2101.05678']); at most 100
 * @param options - Optional request configuration
 * @param options.rateLimit - Rate limiting configuration, forwarded unchanged
 * @param options.retries - Retry count, forwarded unchanged (the callee defaults to 3)
 * @param options.timeoutMs - Timeout in milliseconds, forwarded unchanged (the callee defaults to 10000)
 * @param options.userAgent - Custom User-Agent, forwarded unchanged
 * @returns Promise resolving to query results with feed metadata and paper entries
 *
 * @throws {Error} If more than 100 IDs are provided
 * @throws {Error} Any error raised by {@link getArxivEntries}
 *
 * @example
 * ```typescript
 * const result = await getArxivEntriesById(['2101.01234', '2101.05678']);
 * result.entries.forEach(entry => {
 *   console.log(`${entry.arxivId}: ${entry.title}`);
 * });
 * ```
 *
 * @example
 * ```typescript
 * // One request per 3 seconds, with a longer timeout
 * const result = await getArxivEntriesById(['2101.01234'], {
 *   rateLimit: { tokensPerInterval: 1, intervalMs: 3000 },
 *   timeoutMs: 15000,
 * });
 * ```
 *
 * @see {@link getArxivEntries} for more advanced querying with search filters
 * @see {@link ArxivQueryResult} for the return type structure
 */
export async function getArxivEntriesById(
  ids: string[],
  options?: {
    rateLimit?: ArxivRateLimitConfig;
    retries?: number;
    timeoutMs?: number;
    userAgent?: string;
  }
): Promise<ArxivQueryResult> {
  const idCount = ids.length;
  if (idCount > 100) {
    throw new Error(`Maximum of 100 IDs allowed, but ${idCount} IDs were provided`);
  }

  // Each setting is forwarded explicitly, as undefined when not supplied.
  const forwarded = {
    rateLimit: options?.rateLimit,
    retries: options?.retries,
    timeoutMs: options?.timeoutMs,
    userAgent: options?.userAgent,
  };

  return getArxivEntries({ idList: ids, ...forwarded });
}
|
|
316
|
+
|
package/src/atom.ts
CHANGED