@zenalexa/unicli 0.225.2 → 0.225.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +2 -2
- package/README.md +3 -3
- package/README.zh-CN.md +3 -3
- package/dist/adapters/acl-anthology/papers.d.ts +16 -9
- package/dist/adapters/acl-anthology/papers.d.ts.map +1 -1
- package/dist/adapters/acl-anthology/papers.js +322 -58
- package/dist/adapters/acl-anthology/papers.js.map +1 -1
- package/dist/adapters/arxiv/papers.d.ts +22 -4
- package/dist/adapters/arxiv/papers.d.ts.map +1 -1
- package/dist/adapters/arxiv/papers.js +202 -4
- package/dist/adapters/arxiv/papers.js.map +1 -1
- package/dist/adapters/baidu-scholar/search.d.ts +15 -1
- package/dist/adapters/baidu-scholar/search.d.ts.map +1 -1
- package/dist/adapters/baidu-scholar/search.js +72 -8
- package/dist/adapters/baidu-scholar/search.js.map +1 -1
- package/dist/adapters/biorxiv/preprints.d.ts +9 -0
- package/dist/adapters/biorxiv/preprints.d.ts.map +1 -0
- package/dist/adapters/biorxiv/preprints.js +78 -0
- package/dist/adapters/biorxiv/preprints.js.map +1 -0
- package/dist/adapters/cnki/search.d.ts +82 -0
- package/dist/adapters/cnki/search.d.ts.map +1 -0
- package/dist/adapters/cnki/search.js +236 -0
- package/dist/adapters/cnki/search.js.map +1 -0
- package/dist/adapters/cvf/papers.d.ts +12 -7
- package/dist/adapters/cvf/papers.d.ts.map +1 -1
- package/dist/adapters/cvf/papers.js +210 -27
- package/dist/adapters/cvf/papers.js.map +1 -1
- package/dist/adapters/dblp/publications.d.ts +12 -5
- package/dist/adapters/dblp/publications.d.ts.map +1 -1
- package/dist/adapters/dblp/publications.js +31 -8
- package/dist/adapters/dblp/publications.js.map +1 -1
- package/dist/adapters/google-scholar/search.d.ts +22 -1
- package/dist/adapters/google-scholar/search.d.ts.map +1 -1
- package/dist/adapters/google-scholar/search.js +129 -14
- package/dist/adapters/google-scholar/search.js.map +1 -1
- package/dist/adapters/hf/paper.d.ts +12 -3
- package/dist/adapters/hf/paper.d.ts.map +1 -1
- package/dist/adapters/hf/paper.js +65 -5
- package/dist/adapters/hf/paper.js.map +1 -1
- package/dist/adapters/medrxiv/preprints.d.ts +9 -0
- package/dist/adapters/medrxiv/preprints.d.ts.map +1 -0
- package/dist/adapters/medrxiv/preprints.js +78 -0
- package/dist/adapters/medrxiv/preprints.js.map +1 -0
- package/dist/adapters/neurips/proceedings.d.ts +8 -7
- package/dist/adapters/neurips/proceedings.d.ts.map +1 -1
- package/dist/adapters/neurips/proceedings.js +209 -21
- package/dist/adapters/neurips/proceedings.js.map +1 -1
- package/dist/adapters/openalex/works.d.ts +21 -5
- package/dist/adapters/openalex/works.d.ts.map +1 -1
- package/dist/adapters/openalex/works.js +108 -8
- package/dist/adapters/openalex/works.js.map +1 -1
- package/dist/adapters/openreview/papers.d.ts +10 -4
- package/dist/adapters/openreview/papers.d.ts.map +1 -1
- package/dist/adapters/openreview/papers.js +351 -24
- package/dist/adapters/openreview/papers.js.map +1 -1
- package/dist/adapters/pmlr/proceedings.d.ts +6 -6
- package/dist/adapters/pmlr/proceedings.d.ts.map +1 -1
- package/dist/adapters/pmlr/proceedings.js +92 -12
- package/dist/adapters/pmlr/proceedings.js.map +1 -1
- package/dist/adapters/pubmed/articles.d.ts +8 -4
- package/dist/adapters/pubmed/articles.d.ts.map +1 -1
- package/dist/adapters/pubmed/articles.js +272 -39
- package/dist/adapters/pubmed/articles.js.map +1 -1
- package/dist/adapters/rxiv/preprints.d.ts +75 -0
- package/dist/adapters/rxiv/preprints.d.ts.map +1 -0
- package/dist/adapters/rxiv/preprints.js +651 -0
- package/dist/adapters/rxiv/preprints.js.map +1 -0
- package/dist/adapters/scholar-artifacts/pdf-read.d.ts +49 -0
- package/dist/adapters/scholar-artifacts/pdf-read.d.ts.map +1 -0
- package/dist/adapters/scholar-artifacts/pdf-read.js +204 -0
- package/dist/adapters/scholar-artifacts/pdf-read.js.map +1 -0
- package/dist/adapters/scholar-artifacts/pdf.d.ts +16 -0
- package/dist/adapters/scholar-artifacts/pdf.d.ts.map +1 -0
- package/dist/adapters/scholar-artifacts/pdf.js +122 -0
- package/dist/adapters/scholar-artifacts/pdf.js.map +1 -0
- package/dist/adapters/semantic-scholar/papers.d.ts +6 -6
- package/dist/adapters/semantic-scholar/papers.d.ts.map +1 -1
- package/dist/adapters/semantic-scholar/papers.js +80 -6
- package/dist/adapters/semantic-scholar/papers.js.map +1 -1
- package/dist/adapters/unpaywall/works.d.ts +7 -7
- package/dist/adapters/unpaywall/works.d.ts.map +1 -1
- package/dist/adapters/unpaywall/works.js +104 -12
- package/dist/adapters/unpaywall/works.js.map +1 -1
- package/dist/adapters/wanfang/search.d.ts +14 -0
- package/dist/adapters/wanfang/search.d.ts.map +1 -1
- package/dist/adapters/wanfang/search.js +56 -7
- package/dist/adapters/wanfang/search.js.map +1 -1
- package/dist/browser/page.d.ts +2 -0
- package/dist/browser/page.d.ts.map +1 -1
- package/dist/browser/page.js +12 -0
- package/dist/browser/page.js.map +1 -1
- package/dist/commands/browser/actions.d.ts.map +1 -1
- package/dist/commands/browser/actions.js +59 -3
- package/dist/commands/browser/actions.js.map +1 -1
- package/dist/commands/scholar.d.ts +77 -5
- package/dist/commands/scholar.d.ts.map +1 -1
- package/dist/commands/scholar.js +2945 -83
- package/dist/commands/scholar.js.map +1 -1
- package/dist/core/command-contract.d.ts.map +1 -1
- package/dist/core/command-contract.js +5 -0
- package/dist/core/command-contract.js.map +1 -1
- package/dist/core/schema-v2.d.ts +1 -0
- package/dist/core/schema-v2.d.ts.map +1 -1
- package/dist/core/schema-v2.js +1 -0
- package/dist/core/schema-v2.js.map +1 -1
- package/dist/discovery/aliases.d.ts.map +1 -1
- package/dist/discovery/aliases.js +208 -0
- package/dist/discovery/aliases.js.map +1 -1
- package/dist/discovery/core-catalog.d.ts +2 -0
- package/dist/discovery/core-catalog.d.ts.map +1 -1
- package/dist/discovery/core-catalog.js +487 -0
- package/dist/discovery/core-catalog.js.map +1 -1
- package/dist/discovery/intents.d.ts.map +1 -1
- package/dist/discovery/intents.js +273 -2
- package/dist/discovery/intents.js.map +1 -1
- package/dist/discovery/loader.d.ts.map +1 -1
- package/dist/discovery/loader.js +3 -0
- package/dist/discovery/loader.js.map +1 -1
- package/dist/engine/capability-policy.d.ts.map +1 -1
- package/dist/engine/capability-policy.js +30 -4
- package/dist/engine/capability-policy.js.map +1 -1
- package/dist/engine/kernel/stages.d.ts.map +1 -1
- package/dist/engine/kernel/stages.js +3 -0
- package/dist/engine/kernel/stages.js.map +1 -1
- package/dist/engine/operation-policy.d.ts +4 -1
- package/dist/engine/operation-policy.d.ts.map +1 -1
- package/dist/engine/operation-policy.js +23 -0
- package/dist/engine/operation-policy.js.map +1 -1
- package/dist/fast-path/manifest.d.ts +3 -0
- package/dist/fast-path/manifest.d.ts.map +1 -1
- package/dist/fast-path/manifest.js.map +1 -1
- package/dist/fast-path/policy.d.ts.map +1 -1
- package/dist/fast-path/policy.js +3 -0
- package/dist/fast-path/policy.js.map +1 -1
- package/dist/manifest-compact.txt +1 -1
- package/dist/manifest.json +6804 -1002
- package/dist/registry.d.ts +2 -0
- package/dist/registry.d.ts.map +1 -1
- package/dist/registry.js +1 -0
- package/dist/registry.js.map +1 -1
- package/dist/types/scholarly.d.ts +19 -4
- package/dist/types/scholarly.d.ts.map +1 -1
- package/dist/types/scholarly.js +4 -4
- package/dist/types.d.ts +8 -0
- package/dist/types.d.ts.map +1 -1
- package/dist/types.js.map +1 -1
- package/package.json +1 -1
- package/server.json +2 -2
- package/skills/unicli/SKILL.md +1 -1
- package/skills/unicli-claude-code/SKILL.md +1 -1
- package/skills/unicli-hermes/SKILL.md +1 -1
- package/src/adapters/acl-anthology/papers.test.ts +111 -0
- package/src/adapters/acl-anthology/papers.ts +379 -71
- package/src/adapters/arxiv/papers.test.ts +46 -0
- package/src/adapters/arxiv/papers.ts +251 -4
- package/src/adapters/baidu-scholar/search.ts +74 -11
- package/src/adapters/biorxiv/preprints.ts +112 -0
- package/src/adapters/cnki/search.ts +357 -0
- package/src/adapters/cvf/papers.ts +260 -27
- package/src/adapters/dblp/publications.test.ts +9 -0
- package/src/adapters/dblp/publications.ts +31 -8
- package/src/adapters/google-scholar/search.ts +165 -17
- package/src/adapters/hf/paper.test.ts +23 -0
- package/src/adapters/hf/paper.ts +89 -5
- package/src/adapters/hf/top.yaml +34 -2
- package/src/adapters/huggingface-papers/daily.yaml +37 -3
- package/src/adapters/huggingface-papers/search.yaml +43 -9
- package/src/adapters/medrxiv/preprints.ts +112 -0
- package/src/adapters/neurips/proceedings.ts +266 -22
- package/src/adapters/openalex/works.test.ts +15 -4
- package/src/adapters/openalex/works.ts +136 -8
- package/src/adapters/openreview/papers.test.ts +31 -0
- package/src/adapters/openreview/papers.ts +407 -29
- package/src/adapters/pmlr/proceedings.ts +102 -12
- package/src/adapters/pubmed/articles.test.ts +88 -1
- package/src/adapters/pubmed/articles.ts +343 -44
- package/src/adapters/rxiv/preprints.test.ts +233 -0
- package/src/adapters/rxiv/preprints.ts +849 -0
- package/src/adapters/scholar-artifacts/pdf-read.ts +277 -0
- package/src/adapters/scholar-artifacts/pdf.ts +133 -0
- package/src/adapters/semantic-scholar/papers.ts +98 -6
- package/src/adapters/unpaywall/works.ts +141 -12
- package/src/adapters/wanfang/search.ts +57 -7
- package/src/adapters/cnki/search.yaml +0 -49
|
@@ -0,0 +1,849 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @owner src::adapters::rxiv::preprints
|
|
3
|
+
* @does Provides shared bioRxiv/medRxiv official API search, mapping, PDF download, and read helpers for site-specific entrypoints.
|
|
4
|
+
* @needs api.biorxiv.org details API, bioRxiv/medRxiv PDF/JATS URLs, src/engine/download.ts, pdftotext.
|
|
5
|
+
* @feeds src/adapters/biorxiv/preprints.ts, src/adapters/medrxiv/preprints.ts, scholarly preprint workflows.
|
|
6
|
+
* @breaks API envelope drift, date-window search exhaustion, Cloudflare denial on PDF/XML assets, missing DOI versions, or pdftotext absence stop the preprint artifact loop.
|
|
7
|
+
* @invariants DOI detail and date-window search are source-first through the official API; read prefers JATS XML, then falls back to PDF text extraction; source-denied assets fail closed.
|
|
8
|
+
* @side-effects HTTPS egress to api.biorxiv.org and source PDF/XML hosts; download/read may write PDFs under the requested output directory; read may execute pdftotext.
|
|
9
|
+
* @perf O(1) for DOI detail, O(limit) for recent mapping, O(max-pages * 30) for bounded search.
|
|
10
|
+
* @concurrency safe - per-command local state only
|
|
11
|
+
* @test src/adapters/rxiv/preprints.test.ts, tests/unit/commands/scholar.test.ts
|
|
12
|
+
* @stability experimental
|
|
13
|
+
* @since 0.225.2
|
|
14
|
+
*/
|
|
15
|
+
|
|
16
|
+
import { execFile } from "node:child_process";
|
|
17
|
+
import { join, resolve } from "node:path";
|
|
18
|
+
import { promisify } from "node:util";
|
|
19
|
+
|
|
20
|
+
import { DOMParser, type Document, type Element } from "@xmldom/xmldom";
|
|
21
|
+
|
|
22
|
+
import { httpDownload, sanitizeFilename } from "../../engine/download.js";
|
|
23
|
+
import { normalizeXmlText, stripHtml } from "../../engine/text-normalize.js";
|
|
24
|
+
import type { AdapterArg } from "../../types.js";
|
|
25
|
+
|
|
26
|
+
const API_BASE = "https://api.biorxiv.org/details";
|
|
27
|
+
const execFileAsync = promisify(execFile);
|
|
28
|
+
/** Columns shown for `recent` listings, in display order. */
export const RXIV_RECENT_COLUMNS = [
  "rank", "id", "title", "authors", "date",
  "version", "category", "doi", "pdf_url", "source_url",
];

/** `recent` columns followed by the search bookkeeping columns. */
export const RXIV_SEARCH_COLUMNS = RXIV_RECENT_COLUMNS.concat([
  "matched_fields",
  "search_scope",
  "search_window",
  "search_scanned_records",
  "search_total_records",
  "search_exhaustive",
]);

/** Columns shown for a single preprint detail record. */
export const RXIV_PAPER_COLUMNS = [
  "id", "title", "authors", "date", "version", "type", "license",
  "category", "abstract", "doi", "jatsxml_url", "pdf_url", "source_url",
];

/** Columns shown after a PDF download. */
export const RXIV_DOWNLOAD_COLUMNS = ["id", "title", "doi", "pdf_url", "path", "_download"];

/** Columns shown after a full-text read. */
export const RXIV_READ_COLUMNS = ["id", "title", "doi", "pdf_url", "path", "text_source", "text"];
|
|
81
|
+
/** Arguments for the `recent` listing: date window, paging offset, page size, category filter. */
export const RXIV_RECENT_ARGS: AdapterArg[] = [
  { name: "from", type: "str" as const, description: "Start date YYYY-MM-DD; defaults to seven UTC days ago" },
  { name: "to", type: "str" as const, description: "End date YYYY-MM-DD; defaults to today UTC" },
  { name: "cursor", type: "int" as const, default: 0 },
  { name: "limit", type: "int" as const, default: 30 },
  { name: "category", type: "str" as const, description: "Optional subject category, e.g. epidemiology or cell_biology" },
];

/** Arguments for `search`: a required positional query plus the date-window and paging options. */
export const RXIV_SEARCH_ARGS: AdapterArg[] = [
  {
    name: "query",
    type: "str" as const,
    required: true,
    positional: true,
    description: "Search text matched against title, abstract, authors, DOI, and category",
  },
  { name: "from", type: "str" as const, description: "Start date YYYY-MM-DD; defaults to seven UTC days ago" },
  { name: "to", type: "str" as const, description: "End date YYYY-MM-DD; defaults to today UTC" },
  { name: "cursor", type: "int" as const, default: 0 },
  { name: "limit", type: "int" as const, default: 20 },
  {
    name: "max-pages",
    type: "int" as const,
    default: 10,
    description: "Maximum official API pages to scan; each page contains up to 30 records",
  },
  { name: "category", type: "str" as const, description: "Optional subject category, e.g. epidemiology or cell_biology" },
];

/** Arguments for the single-preprint lookup: one required positional DOI. */
export const RXIV_PAPER_ARGS: AdapterArg[] = [
  {
    name: "doi",
    type: "str" as const,
    required: true,
    positional: true,
    description: "Preprint DOI",
    "x-unicli-kind": "id",
    "x-unicli-accepts": ["url"],
  },
];

/** Lookup arguments plus the output directory for the downloaded PDF. */
export const RXIV_DOWNLOAD_ARGS: AdapterArg[] = [
  ...RXIV_PAPER_ARGS,
  {
    name: "output",
    type: "str" as const,
    default: "./rxiv-downloads",
    description: "Output directory",
    "x-unicli-kind": "path",
  },
];

/** Download arguments plus the PDF page range and the text-length cap used by `read`. */
export const RXIV_READ_ARGS: AdapterArg[] = [
  ...RXIV_DOWNLOAD_ARGS,
  { name: "first-page", type: "int" as const, default: 1, description: "First PDF page to extract when JATS XML is unavailable" },
  { name: "last-page", type: "int" as const, default: 20, description: "Last PDF page to extract when JATS XML is unavailable" },
  { name: "max-chars", type: "int" as const, default: 40000, description: "Maximum text characters to return" },
];
|
|
176
|
+
/** Capabilities declared for the `recent` listing. */
export const RXIV_RECENT_CAPABILITIES = ["http.fetch", "scholar.venue"] as const;

/** Capabilities declared for `search`. */
export const RXIV_SEARCH_CAPABILITIES = ["http.fetch", "scholar.search"] as const;

/** Capabilities declared for the single-preprint lookup. */
export const RXIV_PAPER_CAPABILITIES = ["http.fetch", "scholar.get", "scholar.pdf"] as const;

/** Capabilities declared for the PDF download. */
export const RXIV_DOWNLOAD_CAPABILITIES = ["http.fetch", "http.download", "scholar.pdf"] as const;

/** Capabilities declared for the full-text read, including subprocess execution. */
export const RXIV_READ_CAPABILITIES = [
  "http.fetch", "http.download", "subprocess.exec", "scholar.fulltext", "scholar.pdf",
] as const;
|
|
201
|
+
|
|
202
|
+
/** The two preprint servers this module serves. */
type RxivSite = "biorxiv" | "medrxiv";

/** Per-site settings supplied by the site-specific entrypoints. */
export interface RxivConfig {
  /** Site identifier; also reported as `source_adapter` on mapped rows. */
  site: RxivSite;
  /** Human-readable site name used in messages and as the `venue` value. */
  label: string;
  /** Server segment placed in the details API path. */
  apiServer: string;
  /** Origin that landing-page and PDF URLs are built on. */
  webOrigin: string;
}

/**
 * One raw record from the API `collection` array. Every field is typed
 * `unknown` and is normalised by the mapping helpers before use.
 */
export interface RxivPreprint {
  title?: unknown;
  authors?: unknown;
  author_corresponding?: unknown;
  author_corresponding_institution?: unknown;
  doi?: unknown;
  date?: unknown;
  version?: unknown;
  type?: unknown;
  license?: unknown;
  category?: unknown;
  jatsxml?: unknown;
  abstract?: unknown;
  published?: unknown;
  server?: unknown;
}

/** Response envelope: status/count messages plus the record collection. */
interface RxivEnvelope {
  messages?: { status?: unknown; count?: unknown; total?: unknown }[];
  collection?: RxivPreprint[];
}

/** An Error carrying optional recovery hints for the caller. */
type RxivActionableError = Error & {
  code?: string;
  suggestion?: string;
  retryable?: boolean;
  alternatives?: string[];
};
|
|
239
|
+
|
|
240
|
+
/**
 * Returns the trimmed text of a string value; any non-string yields "".
 */
function stringField(value: unknown): string {
  if (typeof value !== "string") return "";
  return value.trim();
}
|
|
243
|
+
|
|
244
|
+
/**
 * Reduces an unknown value to trimmed text (non-strings become "") and
 * passes it through normalizeXmlText.
 */
function cleanText(value: unknown): string {
  const trimmed = stringField(value);
  return normalizeXmlText(trimmed);
}
|
|
247
|
+
|
|
248
|
+
/**
 * Builds the error raised when a source PDF or JATS XML asset cannot be used.
 *
 * @param config Site settings; only `label` is read.
 * @param kind Which asset failed.
 * @param doi DOI of the preprint, included in the message.
 * @param detail Low-level failure text; also decides `retryable`.
 * @returns An Error with `code`, `suggestion`, `retryable` and an empty `alternatives` list.
 */
function rxivAssetError(
  config: RxivConfig,
  kind: "PDF" | "JATS XML",
  doi: string,
  detail: string,
): RxivActionableError {
  const message = `${config.label} ${kind} source asset failed for ${doi}: ${detail}.`;
  // Only 5xx responses and timeout/reset failures are marked as worth retrying.
  const looksTransient = /HTTP 5\d\d|timeout|ECONNRESET|ETIMEDOUT/i.test(detail);
  const failure = new Error(message) as RxivActionableError;
  return Object.assign(failure, {
    code: "upstream_error",
    suggestion: `${config.label} source asset host rejected ${kind} access on this network path; use the official API metadata, retry later, or open the source URL manually. Do not assume a cookie/login repair will fix this public asset response.`,
    retryable: looksTransient,
    alternatives: [] as string[],
  });
}
|
|
263
|
+
|
|
264
|
+
/**
 * Strips common DOI decorations from a value: surrounding whitespace, a
 * leading `doi:` prefix, a leading doi.org / dx.doi.org URL prefix, and
 * backslash-escaped slashes. Nullish input yields "".
 */
function bareDoi(value: unknown): string {
  let doi = String(value ?? "").trim();
  doi = doi.replace(/^doi:/i, "");
  doi = doi.replace(/^https?:\/\/(?:dx\.)?doi\.org\//i, "");
  return doi.replace(/\\\//g, "/");
}
|
|
271
|
+
|
|
272
|
+
/**
 * Normalises a DOI-like value and checks it has the `10.<registrant>/<suffix>` shape.
 *
 * @param value Raw DOI, optionally with a `doi:` or doi.org URL prefix.
 * @param site Name placed at the start of the error message.
 * @returns The bare DOI.
 * @throws Error when the normalised value does not look like a DOI.
 */
export function requireRxivDoi(value: unknown, site = "rxiv"): string {
  const candidate = bareDoi(value);
  const looksLikeDoi = /^10\.\S+\/\S+$/i.test(candidate);
  if (looksLikeDoi) return candidate;
  throw new Error(`${site} DOI "${String(value ?? "")}" is not recognised.`);
}
|
|
279
|
+
|
|
280
|
+
/**
 * Validates the row limit.
 *
 * @param value Raw argument; undefined, null and "" select the fallback.
 * @param fallback Value returned when no limit was supplied.
 * @returns An integer between 1 and 30 inclusive, or the fallback.
 * @throws Error when the value is not an integer in [1, 30].
 */
export function requireRxivLimit(value: unknown, fallback = 30): number {
  const notSupplied = value == null || value === "";
  if (notSupplied) return fallback;

  const limit = Number(value);
  const inRange = Number.isInteger(limit) && limit >= 1 && limit <= 30;
  if (inRange) return limit;

  throw new Error(
    `rxiv limit must be an integer in [1, 30]. Got: ${String(value)}`,
  );
}
|
|
290
|
+
|
|
291
|
+
/**
 * Validates the maximum number of API pages a search may scan.
 *
 * @param value Raw argument; undefined, null and "" select the fallback.
 * @param fallback Value returned when no page cap was supplied.
 * @returns An integer between 1 and 20 inclusive, or the fallback.
 * @throws Error when the value is not an integer in [1, 20].
 */
export function requireRxivSearchPages(value: unknown, fallback = 10): number {
  const notSupplied = value == null || value === "";
  if (notSupplied) return fallback;

  const pages = Number(value);
  const inRange = Number.isInteger(pages) && pages >= 1 && pages <= 20;
  if (inRange) return pages;

  throw new Error(
    `rxiv max-pages must be an integer in [1, 20]. Got: ${String(value)}`,
  );
}
|
|
301
|
+
|
|
302
|
+
/**
 * Cleans the search query and rejects an empty result.
 *
 * @param value Raw query argument.
 * @returns The cleaned, non-empty query text.
 * @throws Error when nothing remains after cleaning.
 */
export function requireRxivQuery(value: unknown): string {
  const cleaned = cleanText(value);
  if (cleaned) return cleaned;
  throw new Error("rxiv query cannot be empty.");
}
|
|
307
|
+
|
|
308
|
+
/**
 * Validates the paging cursor (record offset).
 *
 * @param value Raw argument; undefined, null and "" mean offset 0.
 * @returns A non-negative integer offset.
 * @throws Error when the value is not a non-negative integer.
 */
export function requireRxivCursor(value: unknown): number {
  const notSupplied = value == null || value === "";
  if (notSupplied) return 0;

  const offset = Number(value);
  if (Number.isInteger(offset) && offset >= 0) return offset;

  throw new Error("rxiv cursor must be a non-negative integer.");
}
|
|
316
|
+
|
|
317
|
+
/**
 * Validates a date-window bound and returns it trimmed.
 *
 * Besides the YYYY-MM-DD shape, the value must name a day that exists:
 * inputs such as 2024-02-30 or 2024-13-01 are rejected here instead of
 * being placed into the API request path.
 *
 * @param value Raw argument; coerced with String() and trimmed.
 * @param label Argument name used in the error message (for example "from").
 * @returns The trimmed YYYY-MM-DD string.
 * @throws Error when the text is not YYYY-MM-DD or is not a real calendar day.
 */
export function requireRxivDate(value: unknown, label: string): string {
  const date = String(value ?? "").trim();
  if (!/^\d{4}-\d{2}-\d{2}$/.test(date)) {
    throw new Error(`rxiv ${label} must be YYYY-MM-DD.`);
  }
  // Round-trip through a UTC Date: an impossible day either fails to parse
  // or rolls over into a later date, so the ISO text no longer matches.
  const parsed = new Date(`${date}T00:00:00Z`);
  const isRealDay =
    !Number.isNaN(parsed.getTime()) &&
    parsed.toISOString().slice(0, 10) === date;
  if (!isRealDay) {
    throw new Error(
      `rxiv ${label} must be a real calendar date (YYYY-MM-DD). Got: ${date}`,
    );
  }
  return date;
}
|
|
324
|
+
|
|
325
|
+
/**
 * Validates the cap on returned text length.
 *
 * @param value Raw argument; undefined, null and "" select the fallback.
 * @param fallback Value returned when no cap was supplied.
 * @returns An integer between 1000 and 1000000 inclusive, or the fallback.
 * @throws Error when the value is not an integer in that range.
 */
export function requireRxivMaxChars(value: unknown, fallback = 40_000): number {
  const notSupplied = value == null || value === "";
  if (notSupplied) return fallback;

  const cap = Number(value);
  const inRange = Number.isInteger(cap) && cap >= 1_000 && cap <= 1_000_000;
  if (inRange) return cap;

  throw new Error(
    `rxiv max-chars must be an integer in [1000, 1000000]. Got: ${String(value)}`,
  );
}
|
|
335
|
+
|
|
336
|
+
/**
 * Validates the PDF page range used for text extraction.
 *
 * An absent bound (undefined, null or "") selects its default: page 1 for
 * the first page and page 20 for the last. The empty string is treated as
 * absent so this validator agrees with the other `requireRxiv*` helpers;
 * previously "" was coerced to 0 and rejected.
 *
 * @param firstPage Raw first-page argument.
 * @param lastPage Raw last-page argument.
 * @returns The validated `{ firstPage, lastPage }` pair.
 * @throws Error when first-page is not an integer >= 1, or last-page is not
 *   an integer >= first-page.
 */
function requireRxivPageRange(
  firstPage: unknown,
  lastPage: unknown,
): { firstPage: number; lastPage: number } {
  const first = firstPage == null || firstPage === "" ? 1 : Number(firstPage);
  const last = lastPage == null || lastPage === "" ? 20 : Number(lastPage);
  if (!Number.isInteger(first) || first < 1) {
    throw new Error("rxiv first-page must be an integer >= 1.");
  }
  if (!Number.isInteger(last) || last < first) {
    throw new Error("rxiv last-page must be an integer >= first-page.");
  }
  return { firstPage: first, lastPage: last };
}
|
|
350
|
+
|
|
351
|
+
/**
 * Returns the UTC calendar date (YYYY-MM-DD) that lies `days` whole
 * 24-hour periods before the current instant.
 */
function isoDateDaysAgo(days: number): string {
  const millisPerDay = 24 * 60 * 60 * 1000;
  const instant = new Date(Date.now() - days * millisPerDay);
  const [datePart] = [instant.toISOString().slice(0, 10)];
  return datePart;
}
|
|
355
|
+
|
|
356
|
+
/**
 * Percent-encodes each slash-separated segment of a DOI while keeping the
 * slashes themselves.
 */
function encodeDoiPath(doi: string): string {
  const segments = doi.split("/");
  const encoded = segments.map((segment) => encodeURIComponent(segment));
  return encoded.join("/");
}
|
|
359
|
+
|
|
360
|
+
/**
 * Builds the full-text PDF URL for a DOI version on the site's web origin.
 */
function pdfUrl(config: RxivConfig, doi: string, version: string): string {
  const { webOrigin } = config;
  const assetName = `${doi}v${version}.full.pdf`;
  return `${webOrigin}/content/${assetName}`;
}
|
|
363
|
+
|
|
364
|
+
/**
 * Builds the landing-page URL for a DOI version on the site's web origin.
 */
function landingUrl(config: RxivConfig, doi: string, version: string): string {
  const { webOrigin } = config;
  const versionedDoi = `${doi}v${version}`;
  return `${webOrigin}/content/${versionedDoi}`;
}
|
|
367
|
+
|
|
368
|
+
/**
 * Derives the PDF filename for a preprint from its DOI, optional version
 * and optional title.
 *
 * The title is cleaned, reduced to `A-Z a-z 0-9 . _ -` with other runs
 * collapsed to a single hyphen, stripped of leading/trailing hyphens and
 * cut to 80 characters. The assembled name is passed through
 * sanitizeFilename.
 *
 * @param input DOI plus optional version and title.
 * @returns The sanitized filename ending in `.pdf`.
 */
export function rxivArtifactFilename(input: {
  doi: string;
  version?: string;
  title?: unknown;
}): string {
  const { doi, version, title } = input;
  const titleSlug = cleanText(title)
    .replace(/[^A-Za-z0-9._-]+/g, "-")
    .replace(/^-+|-+$/g, "")
    .slice(0, 80);
  const versionTag = version ? `v${version}` : "";
  const slugSuffix = titleSlug ? `-${titleSlug}` : "";
  return sanitizeFilename(`${doi}${versionTag}${slugSuffix}.pdf`);
}
|
|
382
|
+
|
|
383
|
+
/**
 * Converts one raw API record into the flat row shape used by the commands.
 *
 * @param preprint Raw record from the API collection.
 * @param config Site settings (label, site id, web origin).
 * @param rank Optional 1-based position; the `rank` key is omitted when undefined.
 * @returns The mapped row. `landing_url`, `source_url` and `url` all hold
 *   the same versioned landing-page URL.
 * @throws Error when the record's DOI is not recognised.
 */
export function mapRxivPreprint(
  preprint: RxivPreprint,
  config: RxivConfig,
  rank?: number,
): Record<string, unknown> {
  const doi = requireRxivDoi(preprint.doi, config.site);
  // Records without a version are treated as version 1.
  const version = stringField(preprint.version) || "1";
  const canonicalUrl = landingUrl(config, doi, version);
  const postedOn = stringField(preprint.date);
  // The API delivers authors as one semicolon-separated string.
  const authorList = stringField(preprint.authors)
    .split(/\s*;\s*/)
    .map((name) => name.trim())
    .filter(Boolean);
  const rankColumn = rank === undefined ? {} : { rank };

  return {
    ...rankColumn,
    id: doi,
    title: cleanText(preprint.title),
    authors: authorList,
    author_corresponding: cleanText(preprint.author_corresponding),
    author_corresponding_institution: cleanText(
      preprint.author_corresponding_institution,
    ),
    date: postedOn,
    // A missing or non-numeric year prefix (and year 0) becomes undefined.
    year: Number(postedOn.slice(0, 4)) || undefined,
    version,
    type: cleanText(preprint.type),
    license: cleanText(preprint.license),
    category: cleanText(preprint.category),
    venue: config.label,
    abstract: cleanText(preprint.abstract),
    published: cleanText(preprint.published),
    server: cleanText(preprint.server) || config.label,
    doi,
    jatsxml_url: stringField(preprint.jatsxml),
    pdf_url: pdfUrl(config, doi, version),
    landing_url: canonicalUrl,
    source_url: canonicalUrl,
    source_adapter: config.site,
    retrieved_at: new Date().toISOString(),
    url: canonicalUrl,
  };
}
|
|
423
|
+
|
|
424
|
+
/**
 * Returns the envelope's record collection, or an empty list when the
 * `collection` field is not an array.
 */
function envelopeRows(envelope: RxivEnvelope): RxivPreprint[] {
  const { collection } = envelope;
  if (!Array.isArray(collection)) return [];
  return collection;
}
|
|
427
|
+
|
|
428
|
+
/**
 * Fetches one JSON envelope from the API.
 *
 * @param url Fully built request URL.
 * @param label Prefix used in error messages.
 * @returns The parsed envelope; HTTP 404 yields an envelope with an empty collection.
 * @throws Error on HTTP 429, on any other non-OK response, or when the
 *   first message status is present and is neither "ok" nor "no posts found".
 */
async function fetchRxivJson(
  url: string,
  label: string,
): Promise<RxivEnvelope> {
  const headers = {
    Accept: "application/json",
    "User-Agent": "unicli-rxiv/1.0 (https://github.com/olo-dot-io/Uni-CLI)",
  };
  const response = await fetch(url, { headers });

  // A 404 is treated as "nothing found" rather than as a failure.
  if (response.status === 404) return { collection: [] };
  if (response.status === 429) {
    throw new Error(`${label} returned HTTP 429.`);
  }
  if (!response.ok) {
    throw new Error(`${label} returned HTTP ${response.status}.`);
  }

  const envelope = (await response.json()) as RxivEnvelope;
  const apiStatus = stringField(envelope.messages?.[0]?.status).toLowerCase();
  const acceptable =
    apiStatus === "" || apiStatus === "ok" || apiStatus === "no posts found";
  if (!acceptable) {
    throw new Error(`${label} returned status: ${apiStatus}.`);
  }
  return envelope;
}
|
|
449
|
+
|
|
450
|
+
/**
 * Fetches a text asset, requesting XML content types.
 *
 * @param url Asset URL.
 * @param label Prefix used in error messages.
 * @returns The response body as text.
 * @throws Error on HTTP 429 or any other non-OK response.
 */
async function fetchRxivText(url: string, label: string): Promise<string> {
  const headers = {
    Accept: "application/xml,text/xml,*/*",
    "User-Agent": "unicli-rxiv/1.0 (https://github.com/olo-dot-io/Uni-CLI)",
  };
  const response = await fetch(url, { headers });

  if (response.status === 429) {
    throw new Error(`${label} returned HTTP 429.`);
  }
  if (!response.ok) {
    throw new Error(`${label} returned HTTP ${response.status}.`);
  }
  return response.text();
}
|
|
462
|
+
|
|
463
|
+
/**
 * Maps at most `limit` records of an envelope to rows, ranked from 1 in
 * the order the envelope lists them.
 */
function rowsFromEnvelope(
  envelope: RxivEnvelope,
  config: RxivConfig,
  limit: number,
): Array<Record<string, unknown>> {
  const page = envelopeRows(envelope).slice(0, limit);
  return page.map((preprint, position) =>
    mapRxivPreprint(preprint, config, position + 1),
  );
}
|
|
472
|
+
|
|
473
|
+
/**
 * Reads the record total from an envelope.
 *
 * Takes the first message's `total`, then its `count`, then the length of
 * the collection, using the first that is not null/undefined.
 *
 * @returns The total as a finite, non-negative number, or undefined when
 *   no such value is available.
 */
function envelopeTotal(envelope: RxivEnvelope): number | undefined {
  const firstMessage = envelope.messages?.[0];
  const raw =
    firstMessage?.total ?? firstMessage?.count ?? envelope.collection?.length;
  const total = Number(raw);
  if (!Number.isFinite(total) || total < 0) return undefined;
  return total;
}
|
|
481
|
+
|
|
482
|
+
/**
 * Collects the text of the five searchable fields of a record, keyed by
 * field name in the order title, abstract, authors, doi, category.
 * The abstract is passed through stripHtml; the DOI through bareDoi.
 */
function searchableText(preprint: RxivPreprint): Record<string, string> {
  const { title, abstract, authors, doi, category } = preprint;
  return {
    title: cleanText(title),
    abstract: stripHtml(stringField(abstract)),
    authors: cleanText(authors),
    doi: bareDoi(doi),
    category: cleanText(category),
  };
}
|
|
491
|
+
|
|
492
|
+
/**
 * Splits a query into lower-cased tokens made of letters, digits and
 * dots, keeping only tokens of at least two characters.
 */
function queryTokens(query: string): string[] {
  // Matching maximal runs of allowed characters yields the same pieces as
  // splitting on runs of everything else, minus the empty strings that the
  // length filter would discard anyway.
  const runs = query.toLowerCase().match(/[\p{L}\p{N}.]+/gu) ?? [];
  return runs.filter((run) => run.length >= 2);
}
|
|
499
|
+
|
|
500
|
+
/**
 * Lists which searchable fields of a record match a query.
 *
 * A field matches when its lower-cased text contains the whole lower-cased
 * query, or contains every query token. Empty fields never match.
 *
 * @param preprint Raw record to test.
 * @param queryValue Raw query; validated as non-empty.
 * @returns Names of the matching fields, in searchable-field order.
 * @throws Error when the query is empty.
 */
export function rxivSearchMatchedFields(
  preprint: RxivPreprint,
  queryValue: unknown,
): string[] {
  const needle = requireRxivQuery(queryValue).toLowerCase();
  const needleTokens = queryTokens(needle);
  const hits: string[] = [];

  for (const [field, value] of Object.entries(searchableText(preprint))) {
    const haystack = value.toLowerCase();
    if (!haystack) continue;

    const phraseHit = haystack.includes(needle);
    const everyTokenHit =
      needleTokens.length > 0 &&
      needleTokens.every((token) => haystack.includes(token));
    if (phraseHit || everyTokenHit) hits.push(field);
  }
  return hits;
}
|
|
517
|
+
|
|
518
|
+
/**
 * Returns a copy of a mapped row with search bookkeeping added: the
 * search rank (replacing any existing `rank`), the matched fields, the
 * fixed scope label, the `from:to` window and the scan statistics.
 */
function annotateSearchRow(
  row: Record<string, unknown>,
  input: {
    rank: number;
    matchedFields: string[];
    from: string;
    to: string;
    scannedRecords: number;
    totalRecords?: number;
    isExhaustive: boolean;
  },
): Record<string, unknown> {
  const {
    rank,
    matchedFields,
    from,
    to,
    scannedRecords,
    totalRecords,
    isExhaustive,
  } = input;
  const searchWindow = `${from}:${to}`;
  return {
    ...row,
    rank,
    matched_fields: matchedFields,
    search_scope: "official_api_date_window",
    search_window: searchWindow,
    search_scanned_records: scannedRecords,
    search_total_records: totalRecords,
    search_exhaustive: isExhaustive,
  };
}
|
|
541
|
+
|
|
542
|
+
/**
 * Lists preprints posted in a date window, one API page at a time.
 *
 * @param config Site settings (API server segment, label).
 * @param kwargs Command arguments: `from`, `to`, `cursor`, `limit`, `category`.
 *   `from` defaults to seven days ago and `to` to today (both UTC).
 * @returns The mapped rows, at most `limit` of them.
 * @throws Error when an argument is invalid, the API request fails, or the
 *   window yields no rows.
 */
export async function fetchRecentRows(
  config: RxivConfig,
  kwargs: Record<string, unknown>,
): Promise<Array<Record<string, unknown>>> {
  // Validate in argument order so the first bad argument is the one reported.
  const from = requireRxivDate(kwargs.from ?? isoDateDaysAgo(7), "from");
  const to = requireRxivDate(kwargs.to ?? isoDateDaysAgo(0), "to");
  const cursor = requireRxivCursor(kwargs.cursor);
  const limit = requireRxivLimit(kwargs.limit);

  const endpoint = new URL(
    `${API_BASE}/${config.apiServer}/${from}/${to}/${cursor}/json`,
  );
  const category = String(kwargs.category ?? "").trim();
  if (category) endpoint.searchParams.set("category", category);

  const envelope = await fetchRxivJson(
    endpoint.toString(),
    `${config.label} recent`,
  );
  const rows = rowsFromEnvelope(envelope, config, limit);
  if (rows.length > 0) return rows;

  throw new Error(`No ${config.label} preprints found for ${from}:${to}.`);
}
|
|
565
|
+
|
|
566
|
+
/**
 * Search a date window of the official API by scanning its pages client-side.
 *
 * Pages are fetched one after another starting at `cursor`. Each row is
 * tested with `rxivSearchMatchedFields` and kept when at least one field
 * matches. Scanning stops as soon as `limit` matches are collected, the last
 * page of the window has been read, or `max-pages` pages have been fetched.
 *
 * Every returned row is annotated via `annotateSearchRow` with its rank, the
 * matched fields, the window, the number of records actually examined, the
 * total reported by the first page (if any) and whether the scan was
 * exhaustive.
 *
 * @param config - Server settings; `apiServer` and `label` are read here.
 * @param kwargs - Caller arguments: `query`/`q`, `from`, `to`, `cursor`,
 *   `limit`, `max-pages`/`maxPages`, `category`. `20` and `10` are handed to
 *   `requireRxivLimit` / `requireRxivSearchPages` as their second argument
 *   (presumably the defaults — confirm against those helpers).
 * @returns At most `limit` annotated rows, in the order they were found.
 * @throws Error when none of the examined records matched the query.
 */
export async function fetchSearchRows(
  config: RxivConfig,
  kwargs: Record<string, unknown>,
): Promise<Array<Record<string, unknown>>> {
  // Cursor stride between pages; a page shorter than this is treated as the
  // last page of the window.
  // NOTE(review): assumes the API serves 30 records per page — confirm.
  const pageSize = 30;

  const query = requireRxivQuery(kwargs.query ?? kwargs.q);
  const from = requireRxivDate(kwargs.from ?? isoDateDaysAgo(7), "from");
  const to = requireRxivDate(kwargs.to ?? isoDateDaysAgo(0), "to");
  const cursor = requireRxivCursor(kwargs.cursor);
  const limit = requireRxivLimit(kwargs.limit, 20);
  const maxPages = requireRxivSearchPages(
    kwargs["max-pages"] ?? kwargs.maxPages,
    10,
  );
  const category = String(kwargs.category ?? "").trim();
  const matches: Array<{
    preprint: RxivPreprint;
    matchedFields: string[];
  }> = [];
  let scannedRecords = 0;
  let totalRecords: number | undefined;
  let isExhaustive = false;

  for (let pageIndex = 0; pageIndex < maxPages; pageIndex += 1) {
    const pageCursor = cursor + pageIndex * pageSize;
    const url = new URL(
      `${API_BASE}/${config.apiServer}/${from}/${to}/${pageCursor}/json`,
    );
    if (category) url.searchParams.set("category", category);
    const envelope = await fetchRxivJson(
      url.toString(),
      `${config.label} search`,
    );
    const rows = envelopeRows(envelope);
    const total = envelopeTotal(envelope);
    // Only the first page's total is kept; later pages do not overwrite it.
    totalRecords = totalRecords ?? total;

    // Count only the rows that were really examined: the inner loop may stop
    // in the middle of a page once enough matches have been collected.
    let examinedOnPage = 0;
    for (const preprint of rows) {
      examinedOnPage += 1;
      const matchedFields = rxivSearchMatchedFields(preprint, query);
      if (matchedFields.length > 0) matches.push({ preprint, matchedFields });
      if (matches.length >= limit) break;
    }
    scannedRecords += examinedOnPage;

    const knownTotal = totalRecords ?? 0;
    const isLastPage =
      rows.length === 0 ||
      rows.length < pageSize ||
      (knownTotal > 0 && pageCursor + rows.length >= knownTotal);
    // The scan is exhaustive only if the window ended AND no row of the final
    // page was skipped because the match limit was hit early.
    isExhaustive = isLastPage && examinedOnPage === rows.length;
    if (matches.length >= limit || isLastPage) break;
  }

  if (matches.length === 0) {
    throw new Error(
      `No ${config.label} preprints matched "${query}" in official API window ${from}:${to} after scanning ${scannedRecords} record(s).`,
    );
  }

  return matches.slice(0, limit).map(({ preprint, matchedFields }, index) =>
    annotateSearchRow(mapRxivPreprint(preprint, config), {
      rank: index + 1,
      matchedFields,
      from,
      to,
      scannedRecords,
      totalRecords,
      isExhaustive,
    }),
  );
}
|
|
635
|
+
|
|
636
|
+
/**
 * Look up a single preprint by DOI and return it as a mapped row.
 *
 * The DOI is validated with `requireRxivDoi`, path-encoded with
 * `encodeDoiPath`, and requested from the `/na/json` variant of the server's
 * endpoint. Only the first row of the response is used.
 *
 * @param config - Server settings; `site`, `apiServer` and `label` are read.
 * @param doiValue - Untrusted DOI value supplied by the caller.
 * @returns The first response row after `mapRxivPreprint`.
 * @throws Error when the response contains no row for the DOI.
 */
export async function fetchPaperRow(
  config: RxivConfig,
  doiValue: unknown,
): Promise<Record<string, unknown>> {
  const doi = requireRxivDoi(doiValue, config.site);
  const endpoint = `${API_BASE}/${config.apiServer}/${encodeDoiPath(doi)}/na/json`;
  const payload = await fetchRxivJson(endpoint, `${config.label} paper ${doi}`);

  const [firstRow] = envelopeRows(payload);
  if (firstRow) {
    return mapRxivPreprint(firstRow, config);
  }
  throw new Error(`No ${config.label} preprint found for ${doi}.`);
}
|
|
649
|
+
|
|
650
|
+
/**
 * Download the PDF referenced by a mapped preprint row.
 *
 * The file is written below `output` (or `./rxiv-downloads` when `output` is
 * nullish) under the name produced by `rxivArtifactFilename`.
 *
 * @param config - Server settings; `site`, `label` and `webOrigin` are read.
 * @param row - Mapped preprint row; `doi`, `pdf_url`, `version` and `title`
 *   are read.
 * @param output - Target directory, stringified and resolved to an absolute
 *   path.
 * @returns A copy of `row` extended with `path` and the raw `_download`
 *   result.
 * @throws Error when the row has no PDF URL; the error built by
 *   `rxivAssetError` when the download reports status `"failed"`.
 */
export async function downloadRxivPdf(
  config: RxivConfig,
  row: Record<string, unknown>,
  output: unknown,
): Promise<Record<string, unknown>> {
  const doi = requireRxivDoi(row.doi, config.site);
  const pdfUrl = stringField(row.pdf_url);
  if (!pdfUrl) {
    throw new Error(`${config.label} preprint ${doi} has no PDF URL.`);
  }

  const targetDir = resolve(String(output ?? "./rxiv-downloads"));
  const fileName = rxivArtifactFilename({
    doi,
    version: stringField(row.version),
    title: row.title,
  });
  const targetPath = join(targetDir, fileName);

  const requestHeaders = {
    Accept: "application/pdf,*/*",
    Referer: `${config.webOrigin}/`,
    "User-Agent": "unicli-rxiv/1.0 (https://github.com/olo-dot-io/Uni-CLI)",
  };
  const result = await httpDownload(pdfUrl, targetPath, requestHeaders);
  if (result.status === "failed") {
    throw rxivAssetError(config, "PDF", doi, result.error ?? "unknown error");
  }

  return { ...row, path: result.path, _download: result };
}
|
|
677
|
+
|
|
678
|
+
/**
 * Copy the result of `root.getElementsByTagName(tagName)` into a plain array,
 * dropping any `null` slot returned by `item()`.
 *
 * @param root - Document or element to query.
 * @param tagName - Tag name passed to `getElementsByTagName`.
 * @returns The matching elements in collection order.
 */
function elements(root: Document | Element, tagName: string): Element[] {
  const collection = root.getElementsByTagName(tagName);
  // Read the length once so the copy is a snapshot of that size.
  const total = collection.length;
  const found: Element[] = [];
  for (let position = 0; position < total; position += 1) {
    const candidate = collection.item(position);
    if (candidate !== null) {
      found.push(candidate);
    }
  }
  return found;
}
|
|
684
|
+
|
|
685
|
+
/**
 * Return the first element that `elements(root, tagName)` yields.
 *
 * @param root - Document or element to query.
 * @param tagName - Tag name to look for.
 * @returns The first match, or `null` when there is none.
 */
function firstElement(
  root: Document | Element,
  tagName: string,
): Element | null {
  const [head] = elements(root, tagName);
  return head ?? null;
}
|
|
691
|
+
|
|
692
|
+
/**
 * List the immediate children of `root` that are element nodes
 * (`nodeType === 1`) whose `nodeName` equals `tagName` exactly.
 * Deeper descendants are not visited.
 *
 * @param root - Parent element whose `childNodes` are inspected.
 * @param tagName - Node name to match, compared with `===`.
 * @returns Matching children in document order.
 */
function directChildElements(root: Element, tagName: string): Element[] {
  const { childNodes } = root;
  const candidates = Array.from({ length: childNodes.length }, (_, position) =>
    childNodes.item(position),
  );
  return candidates.filter(
    (child) =>
      child != null && child.nodeType === 1 && child.nodeName === tagName,
  ) as Element[];
}
|
|
702
|
+
|
|
703
|
+
/**
 * Return the cleaned text of the first direct child of `root` named
 * `tagName`. A missing child or missing `textContent` is passed to
 * `cleanText` as an empty string.
 *
 * @param root - Parent element.
 * @param tagName - Name of the direct child to read.
 * @returns Whatever `cleanText` produces for that child's text.
 */
function directChildText(root: Element, tagName: string): string {
  const [firstMatch] = directChildElements(root, tagName);
  const rawText = firstMatch?.textContent ?? "";
  return cleanText(rawText);
}
|
|
706
|
+
|
|
707
|
+
/**
 * Render a `<sec>` element as text: an optional `## title` heading, then its
 * direct `<p>` children, then its direct nested `<sec>` children (rendered
 * recursively). Empty pieces are skipped; pieces are separated by a blank
 * line.
 *
 * @param section - The section element to render.
 * @returns The rendered text, or an empty string when nothing is readable.
 */
function sectionText(section: Element): string {
  const parts: string[] = [];

  const heading = directChildText(section, "title");
  if (heading) {
    parts.push(`## ${heading}`);
  }

  for (const paragraph of directChildElements(section, "p")) {
    const content = cleanText(paragraph.textContent ?? "");
    if (content) {
      parts.push(content);
    }
  }

  for (const subsection of directChildElements(section, "sec")) {
    const rendered = sectionText(subsection);
    if (rendered) {
      parts.push(rendered);
    }
  }

  return parts.join("\n\n");
}
|
|
719
|
+
|
|
720
|
+
/**
 * Cap `text` at `maxChars` UTF-16 code units and append a truncation marker
 * when anything was cut.
 *
 * If the cut would fall between the two halves of a surrogate pair, it is
 * moved back by one code unit so the result never ends in a lone high
 * surrogate (which would turn into a replacement character when encoded).
 * Trailing whitespace before the marker is trimmed.
 *
 * @param text - The text to cap.
 * @param maxChars - Maximum number of UTF-16 code units to keep.
 * @returns `text` unchanged with `truncated: false` when it already fits;
 *   otherwise the shortened text followed by
 *   `[truncated at <maxChars> characters]` with `truncated: true`.
 */
function truncateText(
  text: string,
  maxChars: number,
): {
  text: string;
  truncated: boolean;
} {
  if (text.length <= maxChars) return { text, truncated: false };

  let cutIndex = maxChars;
  const unitBeforeCut = text.charCodeAt(cutIndex - 1);
  const unitAfterCut = text.charCodeAt(cutIndex);
  const cutSplitsSurrogatePair =
    unitBeforeCut >= 0xd800 &&
    unitBeforeCut <= 0xdbff &&
    unitAfterCut >= 0xdc00 &&
    unitAfterCut <= 0xdfff;
  // Out-of-range indexes yield NaN above, so the comparisons are simply false.
  if (cutSplitsSurrogatePair) cutIndex -= 1;

  return {
    text: `${text.slice(0, cutIndex).trimEnd()}\n\n[truncated at ${maxChars} characters]`,
    truncated: true,
  };
}
|
|
733
|
+
|
|
734
|
+
/**
 * Build a full-text row from a JATS XML document.
 *
 * The title comes from the first `article-title` element, falling back to
 * `row.title`. The text is the first `abstract` element (under a
 * `## Abstract` heading) followed by the `body`: its direct `<p>` children
 * first, then its direct `<sec>` children rendered with `sectionText`. The
 * combined text is capped with `truncateText`.
 *
 * @param xml - JATS XML source.
 * @param row - Mapped preprint row supplying `id`, `doi`, `version`,
 *   `pdf_url`, `source_url` and the fallback `title`.
 * @param config - Server settings; `label` and `site` are read.
 * @param maxChars - Cap handed to `truncateText`; defaults to 40 000.
 * @returns A row with `text_source: "jats_xml"` and a fresh `retrieved_at`
 *   timestamp.
 * @throws Error when neither a title nor any readable text can be found.
 */
export function mapRxivJatsFullTextRow(
  xml: string,
  row: Record<string, unknown>,
  config: RxivConfig,
  maxChars = 40_000,
): Record<string, unknown> {
  const parsed = new DOMParser().parseFromString(xml, "text/xml");

  const xmlTitle = cleanText(
    firstElement(parsed, "article-title")?.textContent ?? "",
  );
  const title = xmlTitle || cleanText(row.title);
  if (!title) {
    throw new Error(`${config.label} JATS XML did not include a title.`);
  }

  const abstractText = cleanText(
    firstElement(parsed, "abstract")?.textContent ?? "",
  );

  const bodyElement = firstElement(parsed, "body");
  let bodyText = "";
  if (bodyElement) {
    const looseParagraphs = directChildElements(bodyElement, "p").map(
      (paragraph) => cleanText(paragraph.textContent ?? ""),
    );
    const renderedSections = directChildElements(bodyElement, "sec").map(
      (section) => sectionText(section),
    );
    bodyText = [...looseParagraphs, ...renderedSections]
      .filter(Boolean)
      .join("\n\n");
  }

  const chunks: string[] = [];
  if (abstractText) chunks.push(`## Abstract\n\n${abstractText}`);
  if (bodyText) chunks.push(bodyText);
  const fullText = chunks.join("\n\n");
  if (!fullText) {
    throw new Error(`${config.label} JATS XML did not include readable text.`);
  }

  const clipped = truncateText(fullText, maxChars);
  return {
    id: row.id,
    title,
    doi: row.doi,
    version: row.version,
    pdf_url: row.pdf_url,
    source_adapter: config.site,
    source_url: row.source_url,
    text: clipped.text,
    text_truncated: clipped.truncated,
    text_source: "jats_xml",
    retrieved_at: new Date().toISOString(),
  };
}
|
|
781
|
+
|
|
782
|
+
/**
 * Read the text of a preprint, preferring JATS XML and falling back to the
 * PDF.
 *
 * 1. The paper row is fetched by `doi` / `id` / `ref`.
 * 2. If the row has a `jatsxml_url`, the XML is fetched and mapped with
 *    `mapRxivJatsFullTextRow`; on success that row is returned directly.
 *    Any error on this path is remembered as a message, not rethrown.
 * 3. Otherwise the PDF is downloaded and the requested page range is
 *    converted with the external `pdftotext` program.
 *
 * @param config - Server settings; `label` is read here.
 * @param kwargs - Caller arguments: `doi`/`id`/`ref`,
 *   `max-chars`/`maxChars`, `first-page`/`firstPage`,
 *   `last-page`/`lastPage`, `output`.
 * @returns A full-text row with `text_source` `"jats_xml"` or `"pdf"`; the
 *   PDF variant also carries `path` and, when the JATS attempt failed,
 *   `jats_error`.
 * @throws Error when the PDF download yields no path or `pdftotext` returns
 *   no text; errors from the helpers on the PDF path propagate.
 */
export async function readRxivPaper(
  config: RxivConfig,
  kwargs: Record<string, unknown>,
): Promise<Record<string, unknown>> {
  const paper = await fetchPaperRow(
    config,
    kwargs.doi ?? kwargs.id ?? kwargs.ref,
  );
  const maxChars = requireRxivMaxChars(
    kwargs["max-chars"] ?? kwargs.maxChars,
    40_000,
  );

  // Preferred path: structured JATS XML. A failure here is recorded and the
  // PDF path below takes over.
  const jatsUrl = stringField(paper.jatsxml_url);
  let jatsFailure = "";
  if (jatsUrl) {
    try {
      const xml = await fetchRxivText(jatsUrl, `${config.label} JATS XML`);
      return mapRxivJatsFullTextRow(xml, paper, config, maxChars);
    } catch (error) {
      jatsFailure = error instanceof Error ? error.message : String(error);
    }
  }

  // Fallback path: download the PDF and extract the requested pages.
  const pageRange = requireRxivPageRange(
    kwargs["first-page"] ?? kwargs.firstPage,
    kwargs["last-page"] ?? kwargs.lastPage,
  );
  const { firstPage, lastPage } = pageRange;

  const downloaded = await downloadRxivPdf(config, paper, kwargs.output);
  const path = stringField(downloaded.path);
  if (!path) {
    throw new Error(`${config.label} PDF download produced no path.`);
  }

  const pdftotextArgs = [
    "-layout",
    "-enc",
    "UTF-8",
    "-f",
    String(firstPage),
    "-l",
    String(lastPage),
    path,
    "-", // write the extracted text to stdout
  ];
  const { stdout } = await execFileAsync("pdftotext", pdftotextArgs, {
    timeout: 60_000,
    maxBuffer: 10 * 1024 * 1024,
  });

  const extracted = stdout.trim();
  if (!extracted) {
    throw new Error(
      `pdftotext returned no text for ${config.label} ${downloaded.doi} pages ${firstPage}-${lastPage}.`,
    );
  }

  const clipped = truncateText(extracted, maxChars);
  const result: Record<string, unknown> = {
    id: downloaded.id,
    title: downloaded.title,
    doi: downloaded.doi,
    version: downloaded.version,
    pdf_url: downloaded.pdf_url,
    path,
    source_adapter: downloaded.source_adapter,
    source_url: downloaded.source_url,
    text: clipped.text,
    text_truncated: clipped.truncated,
    text_source: "pdf",
  };
  if (jatsFailure) {
    result.jats_error = jatsFailure;
  }
  result.retrieved_at = new Date().toISOString();
  return result;
}
|