jats-xml 0.0.6 → 0.0.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/dist/cjs/cli/parse.js +4 -4
- package/dist/cjs/download.js +37 -40
- package/dist/cjs/resolvers.js +4 -3
- package/dist/cjs/version.js +1 -1
- package/dist/esm/cli/parse.js +4 -4
- package/dist/esm/download.js +37 -40
- package/dist/esm/resolvers.js +4 -3
- package/dist/esm/version.js +1 -1
- package/dist/jats.js +6694 -6278
- package/dist/types/cli/parse.d.ts.map +1 -1
- package/dist/types/download.d.ts +4 -5
- package/dist/types/download.d.ts.map +1 -1
- package/dist/types/resolvers.d.ts +2 -6
- package/dist/types/resolvers.d.ts.map +1 -1
- package/dist/types/types/session.d.ts +8 -0
- package/dist/types/types/session.d.ts.map +1 -1
- package/dist/types/version.d.ts +1 -1
- package/package.json +3 -3
package/README.md
CHANGED
|
@@ -26,7 +26,7 @@ Commands available:
|
|
|
26
26
|
`download`: attempt to find the JATS file and download it locally.
|
|
27
27
|
|
|
28
28
|
```bash
|
|
29
|
-
jats download https://
|
|
29
|
+
jats download https://elifesciences.org/articles/81952 article.jats
|
|
30
30
|
```
|
|
31
31
|
|
|
32
32
|
Note, currently this just downloads the XML, **not** the associated files.
|
package/dist/cjs/cli/parse.js
CHANGED
|
@@ -30,7 +30,7 @@ const utils_1 = require("../utils");
|
|
|
30
30
|
function hasValidExtension(output) {
|
|
31
31
|
return ['.xml', '.jats'].includes((0, path_1.extname)(output).toLowerCase());
|
|
32
32
|
}
|
|
33
|
-
function downloadAndSaveJats(session, urlOrDoi, output) {
|
|
33
|
+
function downloadAndSaveJats(session, urlOrDoi, output, opts = { resolvers: resolvers_1.DEFAULT_RESOLVERS }) {
|
|
34
34
|
return __awaiter(this, void 0, void 0, function* () {
|
|
35
35
|
if (fs_1.default.existsSync(urlOrDoi)) {
|
|
36
36
|
throw new Error(`File "${urlOrDoi}" is local and cannot be downloaded!`);
|
|
@@ -41,12 +41,12 @@ function downloadAndSaveJats(session, urlOrDoi, output) {
|
|
|
41
41
|
if (!hasValidExtension(output)) {
|
|
42
42
|
session.log.warn(`The extension ${(0, path_1.extname)(output)} is not a valid extension for JATS, try using ".xml" or ".jats"`);
|
|
43
43
|
}
|
|
44
|
-
const { data } = yield (0, download_1.downloadJatsFromUrl)(session, urlOrDoi,
|
|
44
|
+
const { data } = yield (0, download_1.downloadJatsFromUrl)(session, urlOrDoi, opts);
|
|
45
45
|
(0, myst_cli_utils_1.writeFileToFolder)(output, data);
|
|
46
46
|
return data;
|
|
47
47
|
});
|
|
48
48
|
}
|
|
49
|
-
function parseJats(session, file) {
|
|
49
|
+
function parseJats(session, file, opts = { resolvers: resolvers_1.DEFAULT_RESOLVERS }) {
|
|
50
50
|
return __awaiter(this, void 0, void 0, function* () {
|
|
51
51
|
const toc = (0, myst_cli_utils_1.tic)();
|
|
52
52
|
if (fs_1.default.existsSync(file)) {
|
|
@@ -54,7 +54,7 @@ function parseJats(session, file) {
|
|
|
54
54
|
const data = fs_1.default.readFileSync(file).toString();
|
|
55
55
|
return new jats_1.Jats(data, { log: session.log });
|
|
56
56
|
}
|
|
57
|
-
const { source, data } = yield (0, download_1.downloadJatsFromUrl)(session, file,
|
|
57
|
+
const { source, data } = yield (0, download_1.downloadJatsFromUrl)(session, file, opts);
|
|
58
58
|
const jats = new jats_1.Jats(data, { source, log: session.log });
|
|
59
59
|
session.log.debug(toc(`Downloaded and parsed JATS file in %s`));
|
|
60
60
|
return jats;
|
package/dist/cjs/download.js
CHANGED
|
@@ -27,28 +27,17 @@ function logAboutJatsFailing(session, jatsUrls) {
|
|
|
27
27
|
session.log.debug((0, fair_principles_1.formatPrinciples)('A*', { chalk: chalk_1.default }));
|
|
28
28
|
session.log.info(`${chalk_1.default.blue('The link may work in a browser.')}\n`);
|
|
29
29
|
}
|
|
30
|
-
function dowloadFromUrl(session, jatsUrl) {
|
|
30
|
+
function dowloadFromUrl(session, jatsUrl, opts) {
|
|
31
|
+
var _a, _b;
|
|
31
32
|
return __awaiter(this, void 0, void 0, function* () {
|
|
32
33
|
const toc = (0, myst_cli_utils_1.tic)();
|
|
33
34
|
session.log.debug(`Fetching JATS from ${jatsUrl}`);
|
|
34
|
-
const resp = yield (0
|
|
35
|
-
headers: [
|
|
36
|
-
['accept', 'application/xml'],
|
|
37
|
-
[
|
|
38
|
-
'user-agent',
|
|
39
|
-
// A bunch of publishers just show the login screen or quickly block you.
|
|
40
|
-
// We don't want to DDOS these publishers, they are the _good ones_ for sharing the XML!!
|
|
41
|
-
// But some block on the second request?!
|
|
42
|
-
// So we can pretend to be a random browser, I guess. How silly. 🤷♂️
|
|
43
|
-
`Mozilla/5.0 (Macintosh; Intel Mac OS X ${Math.floor(Math.random() * 100)})`,
|
|
44
|
-
],
|
|
45
|
-
],
|
|
46
|
-
});
|
|
35
|
+
const resp = yield ((_a = opts === null || opts === void 0 ? void 0 : opts.fetcher) !== null && _a !== void 0 ? _a : defaultFetcher)(jatsUrl, 'xml');
|
|
47
36
|
if (!resp.ok) {
|
|
48
37
|
session.log.debug(`JATS failed to download from "${jatsUrl}"`);
|
|
49
38
|
throw new Error(`STATUS ${resp.status}: ${resp.statusText}`);
|
|
50
39
|
}
|
|
51
|
-
const contentType = resp.headers.get('content-type');
|
|
40
|
+
const contentType = (_b = resp.headers) === null || _b === void 0 ? void 0 : _b.get('content-type');
|
|
52
41
|
if (!((contentType === null || contentType === void 0 ? void 0 : contentType.includes('application/xml')) ||
|
|
53
42
|
(contentType === null || contentType === void 0 ? void 0 : contentType.includes('text/xml')) ||
|
|
54
43
|
(contentType === null || contentType === void 0 ? void 0 : contentType.includes('text/plain')))) {
|
|
@@ -59,6 +48,16 @@ function dowloadFromUrl(session, jatsUrl) {
|
|
|
59
48
|
return data;
|
|
60
49
|
});
|
|
61
50
|
}
|
|
51
|
+
function defaultFetcher(url, kind) {
|
|
52
|
+
switch (kind) {
|
|
53
|
+
case 'json':
|
|
54
|
+
return (0, node_fetch_1.default)(url, { headers: [['Accept', 'application/json']] });
|
|
55
|
+
case 'xml':
|
|
56
|
+
return (0, node_fetch_1.default)(url, { headers: [['Accept', 'application/xml']] });
|
|
57
|
+
default:
|
|
58
|
+
return (0, node_fetch_1.default)(url);
|
|
59
|
+
}
|
|
60
|
+
}
|
|
62
61
|
/**
|
|
63
62
|
* There are 5.8M or so DOIs that have a full XML record:
|
|
64
63
|
*
|
|
@@ -66,26 +65,26 @@ function dowloadFromUrl(session, jatsUrl) {
|
|
|
66
65
|
*
|
|
67
66
|
* This function tries to find the correct URL for the record.
|
|
68
67
|
*/
|
|
69
|
-
function checkIfDoiHasJats(session, urlOrDoi) {
|
|
70
|
-
var _a, _b, _c, _d;
|
|
68
|
+
function checkIfDoiHasJats(session, urlOrDoi, opts) {
|
|
69
|
+
var _a, _b, _c, _d, _e;
|
|
71
70
|
return __awaiter(this, void 0, void 0, function* () {
|
|
72
71
|
if (!doi_utils_1.default.validate(urlOrDoi))
|
|
73
72
|
return;
|
|
74
73
|
const toc = (0, myst_cli_utils_1.tic)();
|
|
75
74
|
const doiUrl = doi_utils_1.default.buildUrl(urlOrDoi);
|
|
76
75
|
session.log.debug(`Attempting to resolving full XML from DOI ${doiUrl}`);
|
|
77
|
-
const resp = yield (0
|
|
76
|
+
const resp = yield ((_a = opts === null || opts === void 0 ? void 0 : opts.fetcher) !== null && _a !== void 0 ? _a : defaultFetcher)(doiUrl, 'json');
|
|
78
77
|
if (!resp.ok) {
|
|
79
78
|
// Silently return -- other functions can try!
|
|
80
79
|
session.log.debug(`DOI failed to resolve: ${doiUrl}`);
|
|
81
80
|
return;
|
|
82
81
|
}
|
|
83
82
|
const data = (yield resp.json());
|
|
84
|
-
session.log.debug(toc(`DOI resolved in %s with ${(
|
|
83
|
+
session.log.debug(toc(`DOI resolved in %s with ${(_c = (_b = data.link) === null || _b === void 0 ? void 0 : _b.length) !== null && _c !== void 0 ? _c : 0} links to content`));
|
|
85
84
|
if (data.link) {
|
|
86
85
|
session.log.debug(['', ...data.link.map((link) => `content-type: ${link['content-type']}, ${link.URL}\n`)].join(' - '));
|
|
87
86
|
}
|
|
88
|
-
const fullXml = (
|
|
87
|
+
const fullXml = (_e = (_d = data.link) === null || _d === void 0 ? void 0 : _d.find((link) => { var _a; return ['text/xml', 'application/xml'].includes((_a = link['content-type']) !== null && _a !== void 0 ? _a : ''); })) === null || _e === void 0 ? void 0 : _e.URL;
|
|
89
88
|
if (fullXml)
|
|
90
89
|
return fullXml;
|
|
91
90
|
session.log.debug(`Could not find XML in DOI record ${doiUrl}`);
|
|
@@ -95,26 +94,24 @@ function checkIfDoiHasJats(session, urlOrDoi) {
|
|
|
95
94
|
/**
|
|
96
95
|
* https://www.ncbi.nlm.nih.gov/pmc/tools/id-converter-api/
|
|
97
96
|
*/
|
|
98
|
-
function convertPMID2PMCID(session, PMID) {
|
|
99
|
-
var _a, _b;
|
|
97
|
+
function convertPMID2PMCID(session, PMID, opts) {
|
|
98
|
+
var _a, _b, _c;
|
|
100
99
|
return __awaiter(this, void 0, void 0, function* () {
|
|
101
100
|
if (PMID.startsWith('https://')) {
|
|
102
101
|
const idPart = new URL(PMID).pathname.slice(1);
|
|
103
102
|
session.log.debug(`Extract ${PMID} to ${idPart}`);
|
|
104
|
-
return convertPMID2PMCID(session, idPart);
|
|
103
|
+
return convertPMID2PMCID(session, idPart, opts);
|
|
105
104
|
}
|
|
106
105
|
const toc = (0, myst_cli_utils_1.tic)();
|
|
107
106
|
const converter = 'https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/';
|
|
108
|
-
const resp = yield (0
|
|
109
|
-
headers: [['Accept', 'application/json']],
|
|
110
|
-
});
|
|
107
|
+
const resp = yield ((_a = opts === null || opts === void 0 ? void 0 : opts.fetcher) !== null && _a !== void 0 ? _a : defaultFetcher)(`${converter}?tool=jats-xml&format=json&ids=${PMID}`, 'json');
|
|
111
108
|
if (!resp.ok) {
|
|
112
109
|
// Silently return -- other functions can try!
|
|
113
110
|
session.log.debug(`Failed to convert PubMedID: ${PMID}`);
|
|
114
111
|
return;
|
|
115
112
|
}
|
|
116
113
|
const data = yield resp.json();
|
|
117
|
-
const PMCID = (
|
|
114
|
+
const PMCID = (_c = (_b = data === null || data === void 0 ? void 0 : data.records) === null || _b === void 0 ? void 0 : _b[0]) === null || _c === void 0 ? void 0 : _c.pmcid;
|
|
118
115
|
session.log.debug(toc(`Used nih.gov to transform ${PMID} to ${PMCID} in %s.`));
|
|
119
116
|
return PMCID;
|
|
120
117
|
});
|
|
@@ -124,8 +121,8 @@ function pubMedCentralJats(PMCID) {
|
|
|
124
121
|
const normalized = PMCID.replace(/^PMC:?/, '');
|
|
125
122
|
return `https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pmc&id=${normalized}`;
|
|
126
123
|
}
|
|
127
|
-
function checkIfPubMedCentralHasJats(session, urlOrDoi) {
|
|
128
|
-
var _a, _b;
|
|
124
|
+
function checkIfPubMedCentralHasJats(session, urlOrDoi, opts) {
|
|
125
|
+
var _a, _b, _c;
|
|
129
126
|
return __awaiter(this, void 0, void 0, function* () {
|
|
130
127
|
if (urlOrDoi.match(/^PMC:?([0-9]+)$/))
|
|
131
128
|
return pubMedCentralJats(urlOrDoi);
|
|
@@ -135,18 +132,18 @@ function checkIfPubMedCentralHasJats(session, urlOrDoi) {
|
|
|
135
132
|
const doiUrl = doi_utils_1.default.buildUrl(urlOrDoi);
|
|
136
133
|
session.log.debug(`Attempting to resolve PMCID using OpenAlex from ${doiUrl}`);
|
|
137
134
|
const openAlexUrl = `https://api.openalex.org/works/${doiUrl}`;
|
|
138
|
-
const resp = yield (0
|
|
135
|
+
const resp = yield ((_a = opts === null || opts === void 0 ? void 0 : opts.fetcher) !== null && _a !== void 0 ? _a : defaultFetcher)(openAlexUrl, 'json');
|
|
139
136
|
if (!resp.ok) {
|
|
140
137
|
// Silently return -- other functions can try!
|
|
141
138
|
session.log.debug(`Failed to lookup on OpenAlex: ${openAlexUrl}`);
|
|
142
139
|
return;
|
|
143
140
|
}
|
|
144
141
|
const data = (yield resp.json());
|
|
145
|
-
const PMID = (
|
|
146
|
-
let PMCID = (
|
|
142
|
+
const PMID = (_b = data === null || data === void 0 ? void 0 : data.ids) === null || _b === void 0 ? void 0 : _b.pmid;
|
|
143
|
+
let PMCID = (_c = data === null || data === void 0 ? void 0 : data.ids) === null || _c === void 0 ? void 0 : _c.pmcid;
|
|
147
144
|
if (!PMCID && !!PMID) {
|
|
148
145
|
session.log.debug(toc(`OpenAlex resolved ${data === null || data === void 0 ? void 0 : data.ids.openalex} in %s. There is no PMCID, but there is a PMID`));
|
|
149
|
-
PMCID = yield convertPMID2PMCID(session, PMID);
|
|
146
|
+
PMCID = yield convertPMID2PMCID(session, PMID, opts);
|
|
150
147
|
if (!PMCID) {
|
|
151
148
|
session.log.debug(toc(`PubMed does not have a record of ${PMID}`));
|
|
152
149
|
return;
|
|
@@ -161,18 +158,18 @@ function checkIfPubMedCentralHasJats(session, urlOrDoi) {
|
|
|
161
158
|
});
|
|
162
159
|
}
|
|
163
160
|
exports.checkIfPubMedCentralHasJats = checkIfPubMedCentralHasJats;
|
|
164
|
-
function downloadJatsFromUrl(session, urlOrDoi,
|
|
161
|
+
function downloadJatsFromUrl(session, urlOrDoi, opts = {}) {
|
|
165
162
|
return __awaiter(this, void 0, void 0, function* () {
|
|
166
163
|
const expectedUrls = (yield Promise.all([
|
|
167
|
-
checkIfPubMedCentralHasJats(session, urlOrDoi),
|
|
168
|
-
checkIfDoiHasJats(session, urlOrDoi),
|
|
164
|
+
checkIfPubMedCentralHasJats(session, urlOrDoi, opts),
|
|
165
|
+
checkIfDoiHasJats(session, urlOrDoi, opts),
|
|
169
166
|
])).filter((u) => !!u);
|
|
170
167
|
if (expectedUrls.length > 0) {
|
|
171
168
|
session.log.debug(['Trying URLs:\n', ...expectedUrls.map((url) => ` ${url}\n`)].join(' - '));
|
|
172
169
|
for (let index = 0; index < expectedUrls.length; index++) {
|
|
173
170
|
const url = expectedUrls[index];
|
|
174
171
|
try {
|
|
175
|
-
const data = yield dowloadFromUrl(session, url);
|
|
172
|
+
const data = yield dowloadFromUrl(session, url, opts);
|
|
176
173
|
if (data)
|
|
177
174
|
return { source: url, data };
|
|
178
175
|
}
|
|
@@ -184,13 +181,13 @@ function downloadJatsFromUrl(session, urlOrDoi, resolvers) {
|
|
|
184
181
|
logAboutJatsFailing(session, expectedUrls);
|
|
185
182
|
}
|
|
186
183
|
if (doi_utils_1.default.validate(urlOrDoi)) {
|
|
187
|
-
const jatsUrl = yield (0, resolvers_1.customResolveJatsUrlFromDoi)(session, urlOrDoi,
|
|
188
|
-
const data = yield dowloadFromUrl(session, jatsUrl);
|
|
184
|
+
const jatsUrl = yield (0, resolvers_1.customResolveJatsUrlFromDoi)(session, urlOrDoi, opts);
|
|
185
|
+
const data = yield dowloadFromUrl(session, jatsUrl, opts);
|
|
189
186
|
return { source: jatsUrl, data };
|
|
190
187
|
}
|
|
191
188
|
if ((0, myst_cli_utils_1.isUrl)(urlOrDoi)) {
|
|
192
189
|
session.log.debug("No resolver matched, and the URL doesn't look like a DOI. We will attempt to download it directly.");
|
|
193
|
-
const data = yield dowloadFromUrl(session, urlOrDoi);
|
|
190
|
+
const data = yield dowloadFromUrl(session, urlOrDoi, opts);
|
|
194
191
|
return { source: urlOrDoi, data };
|
|
195
192
|
}
|
|
196
193
|
throw new Error(`Could not find ${urlOrDoi} locally, and it doesn't look like a URL or DOI`);
|
package/dist/cjs/resolvers.js
CHANGED
|
@@ -47,16 +47,17 @@ exports.DEFAULT_RESOLVERS = [exports.elife, exports.plos, exports.joss];
|
|
|
47
47
|
/**
|
|
48
48
|
* Use the known custom resolvers to pick where the JATS should be downloaded from.
|
|
49
49
|
*/
|
|
50
|
-
function customResolveJatsUrlFromDoi(session, doiString,
|
|
50
|
+
function customResolveJatsUrlFromDoi(session, doiString, opts = { resolvers: exports.DEFAULT_RESOLVERS }) {
|
|
51
|
+
var _a, _b;
|
|
51
52
|
return __awaiter(this, void 0, void 0, function* () {
|
|
52
53
|
if (!doi_utils_1.default.validate(doiString))
|
|
53
54
|
throw new Error(`The doi ${doiString} is not valid`);
|
|
54
55
|
const doiUrl = doi_utils_1.default.buildUrl(doiString);
|
|
55
56
|
session.log.debug(`Resolving DOI ${doiUrl}`);
|
|
56
|
-
const resp = yield (0
|
|
57
|
+
const resp = yield ((_a = opts === null || opts === void 0 ? void 0 : opts.fetcher) !== null && _a !== void 0 ? _a : node_fetch_1.default)(doiUrl);
|
|
57
58
|
const articleUrl = resp.url;
|
|
58
59
|
session.log.debug(`Found resolved URL for DOI at ${articleUrl}`);
|
|
59
|
-
const resolver = resolvers.find((r) => r.test(articleUrl));
|
|
60
|
+
const resolver = (_b = opts === null || opts === void 0 ? void 0 : opts.resolvers) === null || _b === void 0 ? void 0 : _b.find((r) => r.test(articleUrl));
|
|
60
61
|
if (!resolver)
|
|
61
62
|
throw new Error(`Could not resolve JATS for ${articleUrl}, no resolver matched`);
|
|
62
63
|
const jatsUrl = resolver.jatsUrl(articleUrl);
|
package/dist/cjs/version.js
CHANGED
package/dist/esm/cli/parse.js
CHANGED
|
@@ -24,7 +24,7 @@ import { findArticleId, formatDate, toDate } from '../utils';
|
|
|
24
24
|
function hasValidExtension(output) {
|
|
25
25
|
return ['.xml', '.jats'].includes(extname(output).toLowerCase());
|
|
26
26
|
}
|
|
27
|
-
function downloadAndSaveJats(session, urlOrDoi, output) {
|
|
27
|
+
function downloadAndSaveJats(session, urlOrDoi, output, opts = { resolvers: DEFAULT_RESOLVERS }) {
|
|
28
28
|
return __awaiter(this, void 0, void 0, function* () {
|
|
29
29
|
if (fs.existsSync(urlOrDoi)) {
|
|
30
30
|
throw new Error(`File "${urlOrDoi}" is local and cannot be downloaded!`);
|
|
@@ -35,12 +35,12 @@ function downloadAndSaveJats(session, urlOrDoi, output) {
|
|
|
35
35
|
if (!hasValidExtension(output)) {
|
|
36
36
|
session.log.warn(`The extension ${extname(output)} is not a valid extension for JATS, try using ".xml" or ".jats"`);
|
|
37
37
|
}
|
|
38
|
-
const { data } = yield downloadJatsFromUrl(session, urlOrDoi,
|
|
38
|
+
const { data } = yield downloadJatsFromUrl(session, urlOrDoi, opts);
|
|
39
39
|
writeFileToFolder(output, data);
|
|
40
40
|
return data;
|
|
41
41
|
});
|
|
42
42
|
}
|
|
43
|
-
function parseJats(session, file) {
|
|
43
|
+
function parseJats(session, file, opts = { resolvers: DEFAULT_RESOLVERS }) {
|
|
44
44
|
return __awaiter(this, void 0, void 0, function* () {
|
|
45
45
|
const toc = tic();
|
|
46
46
|
if (fs.existsSync(file)) {
|
|
@@ -48,7 +48,7 @@ function parseJats(session, file) {
|
|
|
48
48
|
const data = fs.readFileSync(file).toString();
|
|
49
49
|
return new Jats(data, { log: session.log });
|
|
50
50
|
}
|
|
51
|
-
const { source, data } = yield downloadJatsFromUrl(session, file,
|
|
51
|
+
const { source, data } = yield downloadJatsFromUrl(session, file, opts);
|
|
52
52
|
const jats = new Jats(data, { source, log: session.log });
|
|
53
53
|
session.log.debug(toc(`Downloaded and parsed JATS file in %s`));
|
|
54
54
|
return jats;
|
package/dist/esm/download.js
CHANGED
|
@@ -21,28 +21,17 @@ function logAboutJatsFailing(session, jatsUrls) {
|
|
|
21
21
|
session.log.debug(formatPrinciples('A*', { chalk }));
|
|
22
22
|
session.log.info(`${chalk.blue('The link may work in a browser.')}\n`);
|
|
23
23
|
}
|
|
24
|
-
function dowloadFromUrl(session, jatsUrl) {
|
|
24
|
+
function dowloadFromUrl(session, jatsUrl, opts) {
|
|
25
|
+
var _a, _b;
|
|
25
26
|
return __awaiter(this, void 0, void 0, function* () {
|
|
26
27
|
const toc = tic();
|
|
27
28
|
session.log.debug(`Fetching JATS from ${jatsUrl}`);
|
|
28
|
-
const resp = yield
|
|
29
|
-
headers: [
|
|
30
|
-
['accept', 'application/xml'],
|
|
31
|
-
[
|
|
32
|
-
'user-agent',
|
|
33
|
-
// A bunch of publishers just show the login screen or quickly block you.
|
|
34
|
-
// We don't want to DDOS these publishers, they are the _good ones_ for sharing the XML!!
|
|
35
|
-
// But some block on the second request?!
|
|
36
|
-
// So we can pretend to be a random browser, I guess. How silly. 🤷♂️
|
|
37
|
-
`Mozilla/5.0 (Macintosh; Intel Mac OS X ${Math.floor(Math.random() * 100)})`,
|
|
38
|
-
],
|
|
39
|
-
],
|
|
40
|
-
});
|
|
29
|
+
const resp = yield ((_a = opts === null || opts === void 0 ? void 0 : opts.fetcher) !== null && _a !== void 0 ? _a : defaultFetcher)(jatsUrl, 'xml');
|
|
41
30
|
if (!resp.ok) {
|
|
42
31
|
session.log.debug(`JATS failed to download from "${jatsUrl}"`);
|
|
43
32
|
throw new Error(`STATUS ${resp.status}: ${resp.statusText}`);
|
|
44
33
|
}
|
|
45
|
-
const contentType = resp.headers.get('content-type');
|
|
34
|
+
const contentType = (_b = resp.headers) === null || _b === void 0 ? void 0 : _b.get('content-type');
|
|
46
35
|
if (!((contentType === null || contentType === void 0 ? void 0 : contentType.includes('application/xml')) ||
|
|
47
36
|
(contentType === null || contentType === void 0 ? void 0 : contentType.includes('text/xml')) ||
|
|
48
37
|
(contentType === null || contentType === void 0 ? void 0 : contentType.includes('text/plain')))) {
|
|
@@ -53,6 +42,16 @@ function dowloadFromUrl(session, jatsUrl) {
|
|
|
53
42
|
return data;
|
|
54
43
|
});
|
|
55
44
|
}
|
|
45
|
+
function defaultFetcher(url, kind) {
|
|
46
|
+
switch (kind) {
|
|
47
|
+
case 'json':
|
|
48
|
+
return fetch(url, { headers: [['Accept', 'application/json']] });
|
|
49
|
+
case 'xml':
|
|
50
|
+
return fetch(url, { headers: [['Accept', 'application/xml']] });
|
|
51
|
+
default:
|
|
52
|
+
return fetch(url);
|
|
53
|
+
}
|
|
54
|
+
}
|
|
56
55
|
/**
|
|
57
56
|
* There are 5.8M or so DOIs that have a full XML record:
|
|
58
57
|
*
|
|
@@ -60,26 +59,26 @@ function dowloadFromUrl(session, jatsUrl) {
|
|
|
60
59
|
*
|
|
61
60
|
* This function tries to find the correct URL for the record.
|
|
62
61
|
*/
|
|
63
|
-
function checkIfDoiHasJats(session, urlOrDoi) {
|
|
64
|
-
var _a, _b, _c, _d;
|
|
62
|
+
function checkIfDoiHasJats(session, urlOrDoi, opts) {
|
|
63
|
+
var _a, _b, _c, _d, _e;
|
|
65
64
|
return __awaiter(this, void 0, void 0, function* () {
|
|
66
65
|
if (!doi.validate(urlOrDoi))
|
|
67
66
|
return;
|
|
68
67
|
const toc = tic();
|
|
69
68
|
const doiUrl = doi.buildUrl(urlOrDoi);
|
|
70
69
|
session.log.debug(`Attempting to resolving full XML from DOI ${doiUrl}`);
|
|
71
|
-
const resp = yield
|
|
70
|
+
const resp = yield ((_a = opts === null || opts === void 0 ? void 0 : opts.fetcher) !== null && _a !== void 0 ? _a : defaultFetcher)(doiUrl, 'json');
|
|
72
71
|
if (!resp.ok) {
|
|
73
72
|
// Silently return -- other functions can try!
|
|
74
73
|
session.log.debug(`DOI failed to resolve: ${doiUrl}`);
|
|
75
74
|
return;
|
|
76
75
|
}
|
|
77
76
|
const data = (yield resp.json());
|
|
78
|
-
session.log.debug(toc(`DOI resolved in %s with ${(
|
|
77
|
+
session.log.debug(toc(`DOI resolved in %s with ${(_c = (_b = data.link) === null || _b === void 0 ? void 0 : _b.length) !== null && _c !== void 0 ? _c : 0} links to content`));
|
|
79
78
|
if (data.link) {
|
|
80
79
|
session.log.debug(['', ...data.link.map((link) => `content-type: ${link['content-type']}, ${link.URL}\n`)].join(' - '));
|
|
81
80
|
}
|
|
82
|
-
const fullXml = (
|
|
81
|
+
const fullXml = (_e = (_d = data.link) === null || _d === void 0 ? void 0 : _d.find((link) => { var _a; return ['text/xml', 'application/xml'].includes((_a = link['content-type']) !== null && _a !== void 0 ? _a : ''); })) === null || _e === void 0 ? void 0 : _e.URL;
|
|
83
82
|
if (fullXml)
|
|
84
83
|
return fullXml;
|
|
85
84
|
session.log.debug(`Could not find XML in DOI record ${doiUrl}`);
|
|
@@ -89,26 +88,24 @@ function checkIfDoiHasJats(session, urlOrDoi) {
|
|
|
89
88
|
/**
|
|
90
89
|
* https://www.ncbi.nlm.nih.gov/pmc/tools/id-converter-api/
|
|
91
90
|
*/
|
|
92
|
-
export function convertPMID2PMCID(session, PMID) {
|
|
93
|
-
var _a, _b;
|
|
91
|
+
export function convertPMID2PMCID(session, PMID, opts) {
|
|
92
|
+
var _a, _b, _c;
|
|
94
93
|
return __awaiter(this, void 0, void 0, function* () {
|
|
95
94
|
if (PMID.startsWith('https://')) {
|
|
96
95
|
const idPart = new URL(PMID).pathname.slice(1);
|
|
97
96
|
session.log.debug(`Extract ${PMID} to ${idPart}`);
|
|
98
|
-
return convertPMID2PMCID(session, idPart);
|
|
97
|
+
return convertPMID2PMCID(session, idPart, opts);
|
|
99
98
|
}
|
|
100
99
|
const toc = tic();
|
|
101
100
|
const converter = 'https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/';
|
|
102
|
-
const resp = yield
|
|
103
|
-
headers: [['Accept', 'application/json']],
|
|
104
|
-
});
|
|
101
|
+
const resp = yield ((_a = opts === null || opts === void 0 ? void 0 : opts.fetcher) !== null && _a !== void 0 ? _a : defaultFetcher)(`${converter}?tool=jats-xml&format=json&ids=${PMID}`, 'json');
|
|
105
102
|
if (!resp.ok) {
|
|
106
103
|
// Silently return -- other functions can try!
|
|
107
104
|
session.log.debug(`Failed to convert PubMedID: ${PMID}`);
|
|
108
105
|
return;
|
|
109
106
|
}
|
|
110
107
|
const data = yield resp.json();
|
|
111
|
-
const PMCID = (
|
|
108
|
+
const PMCID = (_c = (_b = data === null || data === void 0 ? void 0 : data.records) === null || _b === void 0 ? void 0 : _b[0]) === null || _c === void 0 ? void 0 : _c.pmcid;
|
|
112
109
|
session.log.debug(toc(`Used nih.gov to transform ${PMID} to ${PMCID} in %s.`));
|
|
113
110
|
return PMCID;
|
|
114
111
|
});
|
|
@@ -117,8 +114,8 @@ function pubMedCentralJats(PMCID) {
|
|
|
117
114
|
const normalized = PMCID.replace(/^PMC:?/, '');
|
|
118
115
|
return `https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pmc&id=${normalized}`;
|
|
119
116
|
}
|
|
120
|
-
export function checkIfPubMedCentralHasJats(session, urlOrDoi) {
|
|
121
|
-
var _a, _b;
|
|
117
|
+
export function checkIfPubMedCentralHasJats(session, urlOrDoi, opts) {
|
|
118
|
+
var _a, _b, _c;
|
|
122
119
|
return __awaiter(this, void 0, void 0, function* () {
|
|
123
120
|
if (urlOrDoi.match(/^PMC:?([0-9]+)$/))
|
|
124
121
|
return pubMedCentralJats(urlOrDoi);
|
|
@@ -128,18 +125,18 @@ export function checkIfPubMedCentralHasJats(session, urlOrDoi) {
|
|
|
128
125
|
const doiUrl = doi.buildUrl(urlOrDoi);
|
|
129
126
|
session.log.debug(`Attempting to resolve PMCID using OpenAlex from ${doiUrl}`);
|
|
130
127
|
const openAlexUrl = `https://api.openalex.org/works/${doiUrl}`;
|
|
131
|
-
const resp = yield
|
|
128
|
+
const resp = yield ((_a = opts === null || opts === void 0 ? void 0 : opts.fetcher) !== null && _a !== void 0 ? _a : defaultFetcher)(openAlexUrl, 'json');
|
|
132
129
|
if (!resp.ok) {
|
|
133
130
|
// Silently return -- other functions can try!
|
|
134
131
|
session.log.debug(`Failed to lookup on OpenAlex: ${openAlexUrl}`);
|
|
135
132
|
return;
|
|
136
133
|
}
|
|
137
134
|
const data = (yield resp.json());
|
|
138
|
-
const PMID = (
|
|
139
|
-
let PMCID = (
|
|
135
|
+
const PMID = (_b = data === null || data === void 0 ? void 0 : data.ids) === null || _b === void 0 ? void 0 : _b.pmid;
|
|
136
|
+
let PMCID = (_c = data === null || data === void 0 ? void 0 : data.ids) === null || _c === void 0 ? void 0 : _c.pmcid;
|
|
140
137
|
if (!PMCID && !!PMID) {
|
|
141
138
|
session.log.debug(toc(`OpenAlex resolved ${data === null || data === void 0 ? void 0 : data.ids.openalex} in %s. There is no PMCID, but there is a PMID`));
|
|
142
|
-
PMCID = yield convertPMID2PMCID(session, PMID);
|
|
139
|
+
PMCID = yield convertPMID2PMCID(session, PMID, opts);
|
|
143
140
|
if (!PMCID) {
|
|
144
141
|
session.log.debug(toc(`PubMed does not have a record of ${PMID}`));
|
|
145
142
|
return;
|
|
@@ -153,18 +150,18 @@ export function checkIfPubMedCentralHasJats(session, urlOrDoi) {
|
|
|
153
150
|
return pubMedCentralJats(PMCID);
|
|
154
151
|
});
|
|
155
152
|
}
|
|
156
|
-
export function downloadJatsFromUrl(session, urlOrDoi,
|
|
153
|
+
export function downloadJatsFromUrl(session, urlOrDoi, opts = {}) {
|
|
157
154
|
return __awaiter(this, void 0, void 0, function* () {
|
|
158
155
|
const expectedUrls = (yield Promise.all([
|
|
159
|
-
checkIfPubMedCentralHasJats(session, urlOrDoi),
|
|
160
|
-
checkIfDoiHasJats(session, urlOrDoi),
|
|
156
|
+
checkIfPubMedCentralHasJats(session, urlOrDoi, opts),
|
|
157
|
+
checkIfDoiHasJats(session, urlOrDoi, opts),
|
|
161
158
|
])).filter((u) => !!u);
|
|
162
159
|
if (expectedUrls.length > 0) {
|
|
163
160
|
session.log.debug(['Trying URLs:\n', ...expectedUrls.map((url) => ` ${url}\n`)].join(' - '));
|
|
164
161
|
for (let index = 0; index < expectedUrls.length; index++) {
|
|
165
162
|
const url = expectedUrls[index];
|
|
166
163
|
try {
|
|
167
|
-
const data = yield dowloadFromUrl(session, url);
|
|
164
|
+
const data = yield dowloadFromUrl(session, url, opts);
|
|
168
165
|
if (data)
|
|
169
166
|
return { source: url, data };
|
|
170
167
|
}
|
|
@@ -176,13 +173,13 @@ export function downloadJatsFromUrl(session, urlOrDoi, resolvers) {
|
|
|
176
173
|
logAboutJatsFailing(session, expectedUrls);
|
|
177
174
|
}
|
|
178
175
|
if (doi.validate(urlOrDoi)) {
|
|
179
|
-
const jatsUrl = yield customResolveJatsUrlFromDoi(session, urlOrDoi,
|
|
180
|
-
const data = yield dowloadFromUrl(session, jatsUrl);
|
|
176
|
+
const jatsUrl = yield customResolveJatsUrlFromDoi(session, urlOrDoi, opts);
|
|
177
|
+
const data = yield dowloadFromUrl(session, jatsUrl, opts);
|
|
181
178
|
return { source: jatsUrl, data };
|
|
182
179
|
}
|
|
183
180
|
if (isUrl(urlOrDoi)) {
|
|
184
181
|
session.log.debug("No resolver matched, and the URL doesn't look like a DOI. We will attempt to download it directly.");
|
|
185
|
-
const data = yield dowloadFromUrl(session, urlOrDoi);
|
|
182
|
+
const data = yield dowloadFromUrl(session, urlOrDoi, opts);
|
|
186
183
|
return { source: urlOrDoi, data };
|
|
187
184
|
}
|
|
188
185
|
throw new Error(`Could not find ${urlOrDoi} locally, and it doesn't look like a URL or DOI`);
|
package/dist/esm/resolvers.js
CHANGED
|
@@ -41,16 +41,17 @@ export const DEFAULT_RESOLVERS = [elife, plos, joss];
|
|
|
41
41
|
/**
|
|
42
42
|
* Use the known custom resolvers to pick where the JATS should be downloaded from.
|
|
43
43
|
*/
|
|
44
|
-
export function customResolveJatsUrlFromDoi(session, doiString,
|
|
44
|
+
export function customResolveJatsUrlFromDoi(session, doiString, opts = { resolvers: DEFAULT_RESOLVERS }) {
|
|
45
|
+
var _a, _b;
|
|
45
46
|
return __awaiter(this, void 0, void 0, function* () {
|
|
46
47
|
if (!doi.validate(doiString))
|
|
47
48
|
throw new Error(`The doi ${doiString} is not valid`);
|
|
48
49
|
const doiUrl = doi.buildUrl(doiString);
|
|
49
50
|
session.log.debug(`Resolving DOI ${doiUrl}`);
|
|
50
|
-
const resp = yield fetch(doiUrl);
|
|
51
|
+
const resp = yield ((_a = opts === null || opts === void 0 ? void 0 : opts.fetcher) !== null && _a !== void 0 ? _a : fetch)(doiUrl);
|
|
51
52
|
const articleUrl = resp.url;
|
|
52
53
|
session.log.debug(`Found resolved URL for DOI at ${articleUrl}`);
|
|
53
|
-
const resolver = resolvers.find((r) => r.test(articleUrl));
|
|
54
|
+
const resolver = (_b = opts === null || opts === void 0 ? void 0 : opts.resolvers) === null || _b === void 0 ? void 0 : _b.find((r) => r.test(articleUrl));
|
|
54
55
|
if (!resolver)
|
|
55
56
|
throw new Error(`Could not resolve JATS for ${articleUrl}, no resolver matched`);
|
|
56
57
|
const jatsUrl = resolver.jatsUrl(articleUrl);
|
package/dist/esm/version.js
CHANGED
|
@@ -1,2 +1,2 @@
|
|
|
1
|
-
const version = '0.0.
|
|
1
|
+
const version = '0.0.8';
|
|
2
2
|
export default version;
|