@fbraza/pi-cite 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +44 -0
- package/package.json +37 -0
- package/src/fulltext.ts +524 -0
- package/src/index.ts +12 -0
- package/src/literature-search.ts +345 -0
- package/src/pubmed.ts +286 -0
- package/src/rendering.ts +311 -0
- package/src/semantic-scholar.ts +199 -0
- package/src/shared.ts +103 -0
- package/src/tool-output.ts +47 -0
- package/src/types.ts +30 -0
|
@@ -0,0 +1,345 @@
|
|
|
1
|
+
import type { ExtensionAPI } from "@earendil-works/pi-coding-agent";
|
|
2
|
+
import { Type, type Static } from "typebox";
|
|
3
|
+
import { searchPubmed } from "./pubmed.ts";
|
|
4
|
+
import {
|
|
5
|
+
compactPapersForDisplay,
|
|
6
|
+
renderLiteratureSearchResult,
|
|
7
|
+
type LiteratureSearchDisplayEvent,
|
|
8
|
+
type LiteratureSearchDisplaySearch,
|
|
9
|
+
} from "./rendering.ts";
|
|
10
|
+
import { searchSemanticScholar } from "./semantic-scholar.ts";
|
|
11
|
+
import { formatPaperText, normalizeDoi, unique } from "./shared.ts";
|
|
12
|
+
import { emitProgress, textResult, type TextToolUpdate } from "./tool-output.ts";
|
|
13
|
+
import type { PaperRecord } from "./types.ts";
|
|
14
|
+
|
|
15
|
+
/**
 * TypeBox schema describing the input accepted by the `literature_search` tool.
 *
 * Only `pubmed_query` is required; every other field is optional and falls
 * back to the defaults applied in `searchLiterature`.
 */
export const LITERATURE_SEARCH_PARAMS = Type.Object({
  // Primary query, sent to PubMed.
  pubmed_query: Type.String({
    description:
      "PubMed-ready query using PubMed syntax such as MeSH [mh], title/abstract [tiab], publication type [pt], substance [nm], and Boolean logic.",
  }),

  // Supplementary query; when blank, one is derived from `pubmed_query`.
  semantic_scholar_query: Type.Optional(
    Type.String({
      description:
        "Optional natural-language Semantic Scholar query for supplementary search. If omitted and Semantic Scholar is configured, a simplified query is derived from pubmed_query.",
    }),
  ),

  // Clamped to 1..200 by `searchLiterature` (and to at most 100 for Semantic Scholar).
  max_results: Type.Optional(
    Type.Number({ description: "Maximum results per provider (default 20)" }),
  ),

  // Date bounds; only the leading four-digit year is forwarded to Semantic Scholar.
  date_from: Type.Optional(
    Type.String({ description: "PubMed publication start date as YYYY/MM/DD" }),
  ),
  date_to: Type.Optional(
    Type.String({ description: "PubMed publication end date as YYYY/MM/DD" }),
  ),

  // Forwarded to the PubMed search only.
  publication_types: Type.Optional(
    Type.Array(Type.String({ description: "PubMed publication type" })),
  ),
  fetch_abstracts: Type.Optional(
    Type.Boolean({ description: "Whether PubMed should fetch abstracts (default true)" }),
  ),
});
|
|
42
|
+
|
|
43
|
+
/** Static TypeScript type derived from the `LITERATURE_SEARCH_PARAMS` schema. */
export type LiteratureSearchParams = Static<typeof LITERATURE_SEARCH_PARAMS>;
|
|
44
|
+
|
|
45
|
+
/**
 * Outcome of one provider within a literature search, discriminated on
 * `searched`.
 */
type ProviderExecution =
  | {
      /** The provider was queried successfully. */
      searched: true;
      /** Number of papers the provider returned. */
      count: number;
      /** Query string reported for the provider. */
      query: string;
      /** Total match count reported by the provider, when available. */
      total?: number;
    }
  | {
      /** The provider was skipped or its search failed. */
      searched: false;
      /** Human-readable explanation of why no results are available. */
      reason: string;
    };
|
|
48
|
+
|
|
49
|
+
/** Structured result returned by `searchLiterature`. */
export type LiteratureSearchResult = {
  /** Number of papers left after de-duplication (`papers.length`). */
  count: number;
  /** De-duplicated, merged paper records from all providers. */
  papers: PaperRecord[];
  /** Per-provider execution summary. */
  providers: {
    pubmed: ProviderExecution;
    semantic_scholar: ProviderExecution;
  };
  /** One display entry per provider query that returned results. */
  searches: LiteratureSearchDisplaySearch[];
  /** Ordered progress events recorded while the search ran. */
  events: LiteratureSearchDisplayEvent[];
};
|
|
59
|
+
|
|
60
|
+
/**
 * Extracts a leading four-digit year from a date string such as `YYYY/MM/DD`.
 *
 * @param value - Date text; may be omitted.
 * @returns The year as a number, or `undefined` when `value` is missing or
 *   does not start with four digits.
 */
function firstYear(value?: string): number | undefined {
  if (value === undefined) return undefined;
  const leadingDigits = /^(\d{4})/.exec(value);
  if (leadingDigits === null) return undefined;
  return Number(leadingDigits[1]);
}
|
|
64
|
+
|
|
65
|
+
/**
 * Turns a PubMed-syntax query into plain keywords for Semantic Scholar.
 *
 * Bracketed field tags, the Boolean words AND/OR/NOT (any case), parentheses
 * and quote characters are replaced by spaces, then whitespace is collapsed.
 *
 * @param query - Query written in PubMed syntax.
 * @returns The simplified query, or the trimmed original when stripping
 *   leaves nothing.
 */
export function simplifyPubmedQueryForSemanticScholar(query: string): string {
  const withoutFieldTags = query.replace(/\[[^\]]+\]/g, " ");
  const withoutOperators = withoutFieldTags.replace(/\b(?:AND|OR|NOT)\b/gi, " ");
  const withoutPunctuation = withoutOperators.replace(/[()"']/g, " ");
  const collapsed = withoutPunctuation.replace(/\s+/g, " ").trim();
  return collapsed.length > 0 ? collapsed : query.trim();
}
|
|
74
|
+
|
|
75
|
+
/**
 * Collects the provider names attached to a paper.
 *
 * Combines the `sources` array with the `;`-separated `source` string, trims
 * each entry, drops blanks and de-duplicates via `unique`.
 */
function sourceList(paper: PaperRecord): string[] {
  const fromArray = paper.sources ?? [];
  const fromJoined = paper.source ? paper.source.split(";") : [];

  const cleaned: string[] = [];
  for (const raw of [...fromArray, ...fromJoined]) {
    const name = raw.trim();
    if (name) cleaned.push(name);
  }
  return unique(cleaned);
}
|
|
81
|
+
|
|
82
|
+
/**
 * Produces a comparison key for a paper title.
 *
 * The title is Unicode-normalized (NFKD) with combining marks removed so that
 * accented letters fold to their base letter, lower-cased, and every run of
 * characters that is neither a letter nor a digit (in any script) is replaced
 * by a single space.
 *
 * Non-Latin letters are kept on purpose: an ASCII-only filter would reduce
 * titles that differ only in non-Latin text to the same key and cause
 * unrelated papers to be merged.
 *
 * @param title - Raw title text.
 * @returns The normalized title; empty when it has no letters or digits.
 */
function normalizedTitle(title: string): string {
  return title
    .normalize("NFKD")
    .replace(/\p{M}+/gu, "")
    .toLowerCase()
    .replace(/[^\p{L}\p{N}]+/gu, " ")
    .trim();
}
|
|
89
|
+
|
|
90
|
+
/**
 * Builds the identity keys used to detect that two records describe the same
 * paper: normalized DOI, PMID, Semantic Scholar id and a title+year key.
 *
 * Placeholder titles ("Untitled" from the PubMed XML parser, "PubMed record"
 * from abstract-less PubMed results) never produce a title+year key.
 * Otherwise every untitled paper from the same year would share a key and be
 * merged into one record.
 *
 * @param paper - Record to derive keys from.
 * @returns De-duplicated list of keys; empty when the paper has no usable
 *   identifier.
 */
function dedupeKeys(paper: PaperRecord): string[] {
  // Only real keys are collected, so no `undefined` entries reach `unique`.
  const keys: string[] = [];

  const doi = normalizeDoi(paper.doi)?.toLowerCase();
  if (doi) keys.push(`doi:${doi}`);
  if (paper.pmid) keys.push(`pmid:${paper.pmid}`);
  if (paper.s2_id) keys.push(`s2:${paper.s2_id}`);

  const title = normalizedTitle(paper.title);
  const isPlaceholderTitle = title === "untitled" || title === "pubmed record";
  if (title && !isPlaceholderTitle && paper.year) {
    keys.push(`title-year:${title}:${paper.year}`);
  }

  return unique(keys);
}
|
|
101
|
+
|
|
102
|
+
/**
 * Merges two records that describe the same paper.
 *
 * Scalar fields prefer `existing` and fall back to `incoming`. List fields
 * (authors, publication types, MeSH terms, sources) are unioned.
 * `external_ids` are combined, with `existing` winning on conflicts.
 *
 * A placeholder title on `existing` ("Untitled" or "PubMed record") is
 * replaced by a real title from `incoming`. "PubMed record" is what
 * `searchPubmed` writes when abstracts are not fetched, so it must not
 * override a real title from the other provider.
 *
 * @param existing - Record already kept in the merged list.
 * @param incoming - Newly seen record for the same paper.
 * @returns A new merged record; neither input is mutated.
 */
function mergePapers(existing: PaperRecord, incoming: PaperRecord): PaperRecord {
  const isPlaceholderTitle = (title: string | undefined): boolean =>
    !title || title === "Untitled" || title === "PubMed record";

  const sources = unique([...sourceList(existing), ...sourceList(incoming)]);
  const title =
    isPlaceholderTitle(existing.title) && !isPlaceholderTitle(incoming.title)
      ? incoming.title
      : existing.title;

  return {
    // Spread order makes `existing` win for any field not listed below.
    ...incoming,
    ...existing,
    doi: normalizeDoi(existing.doi) ?? normalizeDoi(incoming.doi),
    pmid: existing.pmid ?? incoming.pmid,
    s2_id: existing.s2_id ?? incoming.s2_id,
    title,
    abstract: existing.abstract ?? incoming.abstract,
    authors: unique([...(existing.authors ?? []), ...(incoming.authors ?? [])]),
    journal: existing.journal ?? incoming.journal,
    year: existing.year ?? incoming.year,
    publication_types: unique([
      ...(existing.publication_types ?? []),
      ...(incoming.publication_types ?? []),
    ]),
    mesh_terms: unique([...(existing.mesh_terms ?? []), ...(incoming.mesh_terms ?? [])]),
    citation_count: existing.citation_count ?? incoming.citation_count,
    tldr: existing.tldr ?? incoming.tldr,
    open_access_pdf: existing.open_access_pdf ?? incoming.open_access_pdf,
    external_ids: { ...(incoming.external_ids ?? {}), ...(existing.external_ids ?? {}) },
    source: sources.join(";"),
    sources,
  };
}
|
|
128
|
+
|
|
129
|
+
/**
 * Collapses records that refer to the same paper into single merged records.
 *
 * Papers are processed in input order. A paper whose identity keys (see
 * `dedupeKeys`) match an already-kept record is merged into that record via
 * `mergePapers`; otherwise it is appended. The first matching key decides
 * which kept record receives the merge.
 *
 * @param papers - Records from all providers, in priority order.
 * @returns Merged records, in order of first appearance.
 */
export function dedupeLiteraturePapers(papers: PaperRecord[]): PaperRecord[] {
  const records: PaperRecord[] = [];
  const slotByKey = new Map<string, number>();

  for (const candidate of papers) {
    const candidateKeys = dedupeKeys(candidate);

    // Find the slot of the first key that is already known.
    let slot: number | undefined;
    for (const key of candidateKeys) {
      slot = slotByKey.get(key);
      if (slot !== undefined) break;
    }

    if (slot !== undefined) {
      records[slot] = mergePapers(records[slot], candidate);
      // The merged record may carry identifiers neither side had alone.
      for (const key of dedupeKeys(records[slot])) slotByKey.set(key, slot);
    } else {
      const newSlot = records.length;
      const sources = sourceList(candidate);
      records.push({ ...candidate, source: sources.join(";"), sources });
      for (const key of candidateKeys) slotByKey.set(key, newSlot);
    }
  }

  return records;
}
|
|
153
|
+
|
|
154
|
+
/**
 * Runs the combined literature search: PubMed first, then Semantic Scholar
 * as a supplementary provider when `SEMANTIC_SCHOLAR_API_KEY` is set, then
 * de-duplicates the union of both result sets.
 *
 * A PubMed failure rejects the whole search. A Semantic Scholar failure is
 * recorded as a `query_error` event and in `providers.semantic_scholar`, and
 * the search continues with PubMed results only. Cancellation is the
 * exception: if `signal` is aborted, the error is rethrown instead of being
 * reported as a provider failure.
 *
 * @param params - Validated tool parameters.
 * @param signal - Optional abort signal forwarded to both providers.
 * @param onUpdate - Optional progress callback; receives a message plus a
 *   snapshot of the events recorded so far.
 * @returns Merged papers plus per-provider summaries, display searches and
 *   the full event log.
 * @throws Whatever `searchPubmed` throws, or the Semantic Scholar error when
 *   the search was aborted.
 */
export async function searchLiterature(
  params: LiteratureSearchParams,
  signal?: AbortSignal,
  onUpdate?: TextToolUpdate,
): Promise<LiteratureSearchResult> {
  // A non-finite max_results (e.g. NaN) would otherwise survive min/max clamping.
  const requestedMax = params.max_results ?? 20;
  const maxResults = Math.min(
    200,
    Math.max(1, Math.floor(Number.isFinite(requestedMax) ? requestedMax : 20)),
  );

  const events: LiteratureSearchDisplayEvent[] = [{ phase: "start" }];
  const searches: LiteratureSearchDisplaySearch[] = [];
  const emitEvent = (text: string) => {
    // Copy the log so later pushes do not mutate what listeners already received.
    emitProgress(onUpdate, text, { events: [...events] });
  };

  emitEvent("Starting literature search...");

  // --- PubMed (primary provider) ---
  events.push({
    phase: "query_start",
    provider: "pubmed",
    query_index: 1,
    query: params.pubmed_query,
  });
  emitEvent(`Searching PubMed q1: ${params.pubmed_query}`);

  const pubmed = await searchPubmed(
    {
      query: params.pubmed_query,
      max_results: maxResults,
      date_from: params.date_from,
      date_to: params.date_to,
      publication_types: params.publication_types,
      fetch_abstracts: params.fetch_abstracts,
    },
    signal,
    undefined,
  );

  const pubmedDisplayPapers = compactPapersForDisplay(pubmed.papers);
  searches.push({
    provider: "pubmed",
    query_index: 1,
    query: pubmed.query ?? params.pubmed_query,
    count: pubmed.count,
    papers: pubmedDisplayPapers,
  });
  events.push({
    phase: "query_results",
    provider: "pubmed",
    query_index: 1,
    query: pubmed.query ?? params.pubmed_query,
    count: pubmed.count,
    papers: pubmedDisplayPapers,
  });
  emitEvent(`PubMed q1 found ${pubmed.count} candidate papers.`);

  // --- Semantic Scholar (supplementary, only when an API key is configured) ---
  const semanticScholarApiKey = process.env.SEMANTIC_SCHOLAR_API_KEY?.trim();
  let semanticScholar: ProviderExecution = {
    searched: false,
    reason: "SEMANTIC_SCHOLAR_API_KEY not configured",
  };
  let semanticScholarPapers: PaperRecord[] = [];

  if (semanticScholarApiKey) {
    const semanticScholarQuery =
      params.semantic_scholar_query?.trim() ||
      simplifyPubmedQueryForSemanticScholar(params.pubmed_query);

    events.push({
      phase: "query_start",
      provider: "semantic_scholar",
      query_index: 1,
      query: semanticScholarQuery,
    });
    emitEvent(`Searching Semantic Scholar q1: ${semanticScholarQuery}`);

    try {
      const semanticScholarResult = await searchSemanticScholar(
        {
          query: semanticScholarQuery,
          max_results: Math.min(100, maxResults),
          year_from: firstYear(params.date_from),
          year_to: firstYear(params.date_to),
        },
        signal,
        undefined,
      );
      semanticScholarPapers = semanticScholarResult.papers;
      const semanticScholarDisplayPapers = compactPapersForDisplay(
        semanticScholarResult.papers,
      );
      searches.push({
        provider: "semantic_scholar",
        query_index: 1,
        query: semanticScholarQuery,
        count: semanticScholarResult.count,
        papers: semanticScholarDisplayPapers,
      });
      events.push({
        phase: "query_results",
        provider: "semantic_scholar",
        query_index: 1,
        query: semanticScholarQuery,
        count: semanticScholarResult.count,
        papers: semanticScholarDisplayPapers,
      });
      emitEvent(
        `Semantic Scholar q1 found ${semanticScholarResult.count} candidate papers.`,
      );
      semanticScholar = {
        searched: true,
        count: semanticScholarResult.count,
        query: semanticScholarQuery,
      };
    } catch (err) {
      // Cancellation is not a provider failure: propagate it so the caller
      // sees the abort instead of a partial "successful" result.
      if (signal?.aborted) throw err;

      const message = err instanceof Error ? err.message : String(err);
      events.push({
        phase: "query_error",
        provider: "semantic_scholar",
        query_index: 1,
        query: semanticScholarQuery,
        error: message,
      });
      semanticScholar = {
        searched: false,
        reason: `Semantic Scholar search failed: ${message}`,
      };
      emitEvent(`Semantic Scholar q1 failed: ${message}`);
    }
  }

  // --- Merge ---
  events.push({ phase: "dedupe" });
  emitEvent("Deduplicating literature results...");

  // PubMed records come first so they win field conflicts during merging.
  const papers = dedupeLiteraturePapers([
    ...pubmed.papers,
    ...semanticScholarPapers,
  ]);
  events.push({
    phase: "complete",
    count: papers.length,
    papers: compactPapersForDisplay(papers),
  });
  emitEvent(`Literature search complete: ${papers.length} merged papers.`);

  return {
    count: papers.length,
    papers,
    providers: {
      pubmed: {
        searched: true,
        count: pubmed.count,
        query: pubmed.query ?? params.pubmed_query,
        total: pubmed.total,
      },
      semantic_scholar: semanticScholar,
    },
    searches,
    events,
  };
}
|
|
316
|
+
|
|
317
|
+
/**
 * Builds the `literature_search` tool definition: metadata, parameter schema,
 * an `execute` handler that runs `searchLiterature`, and a `renderResult`
 * handler that delegates to `renderLiteratureSearchResult`.
 */
export function createLiteratureSearchTool() {
  const tool = {
    name: "literature_search",
    label: "Literature Search",
    description:
      "Run the literature workflow search: PubMed is always searched first with a PubMed-ready query; Semantic Scholar is searched as supplementary metadata when SEMANTIC_SCHOLAR_API_KEY is configured.",
    parameters: LITERATURE_SEARCH_PARAMS,

    /** Runs the search and returns the formatted paper list plus the structured result. */
    async execute(
      _toolCallId: string,
      params: LiteratureSearchParams,
      signal?: AbortSignal,
      onUpdate?: TextToolUpdate,
    ) {
      const outcome = await searchLiterature(params, signal, onUpdate);
      const text = formatPaperText(outcome.papers);
      return textResult(text, outcome);
    },

    /** Renders a tool result; all arguments are passed through unchanged. */
    renderResult(
      result: Parameters<typeof renderLiteratureSearchResult>[0],
      options: Parameters<typeof renderLiteratureSearchResult>[1],
      theme: Parameters<typeof renderLiteratureSearchResult>[2],
    ) {
      return renderLiteratureSearchResult(result, options, theme);
    },
  };
  return tool;
}
|
|
342
|
+
|
|
343
|
+
/**
 * Registers the `literature_search` tool with the given extension API.
 *
 * @param pi - Extension API whose `registerTool` receives the tool definition.
 */
export function registerLiteratureSearchTool(pi: ExtensionAPI): void {
  const literatureSearchTool = createLiteratureSearchTool();
  pi.registerTool(literatureSearchTool);
}
|
package/src/pubmed.ts
ADDED
|
@@ -0,0 +1,286 @@
|
|
|
1
|
+
import type { ExtensionAPI } from "@earendil-works/pi-coding-agent";
|
|
2
|
+
import { Type, type Static } from "typebox";
|
|
3
|
+
import { renderProviderSearchResult } from "./rendering.ts";
|
|
4
|
+
import { emitProgress, textResult, type TextToolUpdate } from "./tool-output.ts";
|
|
5
|
+
import {
|
|
6
|
+
fetchJson,
|
|
7
|
+
fetchText,
|
|
8
|
+
formatPaperText,
|
|
9
|
+
normalizeDoi,
|
|
10
|
+
pickAll,
|
|
11
|
+
pickOne,
|
|
12
|
+
sleep,
|
|
13
|
+
unique,
|
|
14
|
+
xmlDecode,
|
|
15
|
+
} from "./shared.ts";
|
|
16
|
+
import type { PaperRecord } from "./types.ts";
|
|
17
|
+
|
|
18
|
+
/**
 * TypeBox schema describing the input accepted by the `pubmed_search` tool.
 *
 * Only `query` is required; the remaining fields are optional and fall back
 * to the defaults applied in `searchPubmed`.
 */
export const PUBMED_SEARCH_PARAMS = Type.Object({
  query: Type.String({
    description:
      "PubMed query string (supports field tags like [tiab], [mh], [pt])",
  }),

  max_results: Type.Optional(
    Type.Number({
      description: "Maximum results to return (default 20, max 200)",
    }),
  ),

  // Optional publication-date bounds, appended to the query as a date range.
  date_from: Type.Optional(
    Type.String({ description: "Start date as YYYY/MM/DD" }),
  ),
  date_to: Type.Optional(
    Type.String({ description: "End date as YYYY/MM/DD" }),
  ),

  // Appended to the query as an OR-group of [Publication Type] filters.
  publication_types: Type.Optional(
    Type.Array(Type.String({ description: "PubMed publication type" })),
  ),

  fetch_abstracts: Type.Optional(
    Type.Boolean({ description: "Whether to fetch abstracts (default true)" }),
  ),

  // Sent to ESearch as `sort`; `searchPubmed` uses "relevance" when omitted.
  sort: Type.Optional(
    Type.Union([
      Type.Literal("relevance"),
      Type.Literal("pub_date"),
      Type.Literal("first_author"),
    ]),
  ),

  // Name of an environment variable, not the key itself.
  api_key: Type.Optional(
    Type.String({
      description: "Environment variable name containing an NCBI API key (defaults to NCBI_API_KEY when omitted)",
    }),
  ),
});
|
|
53
|
+
|
|
54
|
+
/** Static TypeScript type derived from the `PUBMED_SEARCH_PARAMS` schema. */
export type PubmedSearchParams = Static<typeof PUBMED_SEARCH_PARAMS>;
|
|
55
|
+
|
|
56
|
+
/** Name of the environment variable read for the NCBI API key when the caller names none. */
export const DEFAULT_NCBI_API_KEY_ENV = "NCBI_API_KEY";
|
|
57
|
+
|
|
58
|
+
/**
 * Reads the NCBI API key from the environment.
 *
 * @param envVarName - Name of the environment variable to read; blank or
 *   omitted means `DEFAULT_NCBI_API_KEY_ENV`.
 * @returns The trimmed key, or `undefined` when the variable is unset or blank.
 */
export function getNcbiApiKey(envVarName?: string): string | undefined {
  const requestedName = envVarName?.trim();
  const variableName = requestedName ? requestedName : DEFAULT_NCBI_API_KEY_ENV;
  const value = process.env[variableName]?.trim();
  return value ? value : undefined;
}
|
|
63
|
+
|
|
64
|
+
/**
 * Adds the `api_key` query parameter to an E-utilities URL when a key is
 * configured in the environment.
 *
 * @param url - URL to modify in place.
 * @param envVarName - Optional name of the environment variable holding the key.
 * @returns `true` when a key was found and set, otherwise `false` (the URL is
 *   left untouched).
 */
export function addNcbiApiKeyParam(url: URL, envVarName?: string): boolean {
  const configuredKey = getNcbiApiKey(envVarName);
  if (configuredKey) {
    url.searchParams.set("api_key", configuredKey);
    return true;
  }
  return false;
}
|
|
70
|
+
|
|
71
|
+
/**
 * Builds the final PubMed search term from a base query and optional filters.
 *
 * The fragments are joined with ` AND `:
 * 1. the trimmed base query (omitted when blank);
 * 2. an OR-group of `"<type>"[Publication Type]` clauses;
 * 3. a `start:end[Date - Publication]` range, where a missing bound defaults
 *    to `1000/01/01` or `3000/12/31`.
 *
 * Blank publication types and blank dates are ignored, and double quotes
 * inside a publication type are removed so they cannot break the quoted
 * phrase.
 *
 * @param query - Base query in PubMed syntax.
 * @param publicationTypes - Publication types to restrict to (any of).
 * @param dateFrom - Inclusive start date as `YYYY/MM/DD`.
 * @param dateTo - Inclusive end date as `YYYY/MM/DD`.
 * @returns The combined search term; empty when every input is blank.
 */
export function normalizePubmedQuery(
  query: string,
  publicationTypes?: string[],
  dateFrom?: string,
  dateTo?: string,
): string {
  const fragments: string[] = [];

  const baseQuery = query.trim();
  if (baseQuery) fragments.push(baseQuery);

  const typeClauses = (publicationTypes ?? [])
    .map((item) => item.replace(/"/g, "").trim())
    .filter((item) => item.length > 0)
    .map((item) => `"${item}"[Publication Type]`);
  if (typeClauses.length > 0) {
    fragments.push(`(${typeClauses.join(" OR ")})`);
  }

  // `||` rather than `??`: an empty string must also fall back to the default bound.
  const start = dateFrom?.trim();
  const end = dateTo?.trim();
  if (start || end) {
    fragments.push(
      `(${start || "1000/01/01"}:${end || "3000/12/31"}[Date - Publication])`,
    );
  }

  return fragments.join(" AND ");
}
|
|
90
|
+
|
|
91
|
+
/**
 * Extracts a `PaperRecord` from the XML of one `<PubmedArticle>` element
 * using regular expressions (no XML parser).
 *
 * Two cases are handled explicitly:
 * - Authors are parsed one `<Author>` element at a time, so a last name is
 *   never paired with the forename of a different author, and an author with
 *   only a last name is kept.
 * - The publication year is searched only inside the relevant date element
 *   (`<PubDate>`, then `<ArticleDate>`, then the `pubmed` `<PubMedPubDate>`).
 *   Within that element a `<MedlineDate>` supplies the year when no `<Year>`
 *   is present, so a `<Year>` from an unrelated element is never picked up.
 *
 * @param articleXml - XML text of a single article.
 * @returns The parsed record with `source` set to `"pubmed"`; the title is
 *   `"Untitled"` when no `<ArticleTitle>` is found.
 */
export function parsePubmedArticle(articleXml: string): PaperRecord {
  const pmid = pickOne(/<PMID[^>]*>(.*?)<\/PMID>/i, articleXml);
  const title =
    pickOne(/<ArticleTitle>([\s\S]*?)<\/ArticleTitle>/i, articleXml) ??
    "Untitled";

  const abstractSections = pickAll(
    /<AbstractText[^>]*>([\s\S]*?)<\/AbstractText>/gi,
    articleXml,
  );
  const abstract = abstractSections.join(" ").trim() || undefined;

  const journal =
    pickOne(/<Title>([\s\S]*?)<\/Title>/i, articleXml) ??
    pickOne(/<ISOAbbreviation>(.*?)<\/ISOAbbreviation>/i, articleXml);

  // Looks for a four-digit year strictly inside the element matched by `container`.
  const yearWithin = (container: RegExp): string | undefined => {
    const inner = container.exec(articleXml)?.[1];
    if (inner === undefined) return undefined;
    return (
      /<Year>\s*(\d{4})\s*<\/Year>/i.exec(inner)?.[1] ??
      /<MedlineDate>[^<]*?(\d{4})/i.exec(inner)?.[1]
    );
  };
  const yearText =
    yearWithin(/<PubDate>([\s\S]*?)<\/PubDate>/i) ??
    yearWithin(/<ArticleDate[^>]*>([\s\S]*?)<\/ArticleDate>/i) ??
    yearWithin(
      /<PubMedPubDate[^>]*PubStatus=\"pubmed\"[^>]*>([\s\S]*?)<\/PubMedPubDate>/i,
    );

  const doi =
    normalizeDoi(
      pickOne(
        /<ELocationID[^>]*EIdType=\"doi\"[^>]*>(.*?)<\/ELocationID>/i,
        articleXml,
      ),
    ) ??
    normalizeDoi(
      pickOne(
        /<ArticleId[^>]*IdType=\"doi\"[^>]*>(.*?)<\/ArticleId>/i,
        articleXml,
      ),
    );

  const publicationTypes = unique(
    pickAll(
      /<PublicationType[^>]*>([\s\S]*?)<\/PublicationType>/gi,
      articleXml,
    ),
  );
  const meshTerms = unique(
    pickAll(/<DescriptorName[^>]*>([\s\S]*?)<\/DescriptorName>/gi, articleXml),
  );

  // `\b` keeps `<Author` from matching `<AuthorList`.
  const authors = unique(
    Array.from(articleXml.matchAll(/<Author\b[^>]*>([\s\S]*?)<\/Author>/gi))
      .map((match) => {
        const authorXml = match[1] ?? "";
        const last = /<LastName>([\s\S]*?)<\/LastName>/i.exec(authorXml)?.[1];
        // Authors without a last name (collective names) are collected separately below.
        if (last === undefined) return "";
        const fore =
          /<ForeName>([\s\S]*?)<\/ForeName>/i.exec(authorXml)?.[1] ??
          /<Initials>([\s\S]*?)<\/Initials>/i.exec(authorXml)?.[1] ??
          "";
        return [xmlDecode(fore), xmlDecode(last)].filter(Boolean).join(" ").trim();
      })
      .filter(Boolean),
  );
  const collectiveAuthors = pickAll(
    /<CollectiveName>([\s\S]*?)<\/CollectiveName>/gi,
    articleXml,
  );

  return {
    pmid,
    doi,
    title,
    abstract,
    authors: unique([...authors, ...collectiveAuthors]),
    journal,
    year: yearText ? Number(yearText) : undefined,
    publication_types: publicationTypes,
    mesh_terms: meshTerms,
    source: "pubmed",
  };
}
|
|
161
|
+
|
|
162
|
+
/**
 * Parses every `<PubmedArticle>` element found in an EFetch XML response.
 *
 * @param xml - Raw XML text.
 * @returns One record per article, in document order; empty when the text
 *   contains no `<PubmedArticle>` element.
 */
export function parsePubmedArticles(xml: string): PaperRecord[] {
  const records: PaperRecord[] = [];
  for (const found of xml.matchAll(/<PubmedArticle>[\s\S]*?<\/PubmedArticle>/gi)) {
    records.push(parsePubmedArticle(found[0]));
  }
  return records;
}
|
|
166
|
+
|
|
167
|
+
/**
 * Fetches a single PubMed record through EFetch and returns its DOI and title.
 *
 * The NCBI API key from the default environment variable is attached when
 * one is configured.
 *
 * @param pmid - PubMed identifier to look up.
 * @param signal - Optional abort signal forwarded to the HTTP request.
 * @returns The DOI and title of the first parsed article; both are
 *   `undefined` when the response contains no article.
 */
export async function lookupPubmedIdentifiers(
  pmid: string,
  signal?: AbortSignal,
): Promise<{ doi?: string; title?: string }> {
  const efetchUrl = new URL(
    "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi",
  );
  const queryParams: Array<[string, string]> = [
    ["db", "pubmed"],
    ["id", pmid],
    ["retmode", "xml"],
  ];
  for (const [key, value] of queryParams) {
    efetchUrl.searchParams.set(key, value);
  }
  addNcbiApiKeyParam(efetchUrl);

  const responseXml = await fetchText(efetchUrl.toString(), signal);
  const [firstArticle] = parsePubmedArticles(responseXml);
  return { doi: firstArticle?.doi, title: firstArticle?.title };
}
|
|
182
|
+
|
|
183
|
+
/** Result returned by `searchPubmed`. */
export type PubmedSearchResult = {
  /** Number of records in `papers`. */
  count: number;
  /** Records returned for the query. */
  papers: PaperRecord[];
  /** Final search term sent to PubMed, including appended filters. */
  query?: string;
  /** Total match count reported by ESearch, when available. */
  total?: number;
};
|
|
189
|
+
|
|
190
|
+
/**
 * Searches PubMed through the NCBI E-utilities.
 *
 * Steps:
 * 1. ESearch resolves the query (with publication-type and date filters
 *    appended) to a list of PMIDs, at most `max_results` (clamped to 1..200).
 * 2. Unless `fetch_abstracts` is `false`, EFetch retrieves the records in
 *    batches of 50, pausing between batches (120 ms with an API key,
 *    350 ms without).
 *
 * Every return path reports the final `query` and the ESearch `total`, so
 * callers see the same metadata whether the search is empty, id-only or
 * fully fetched.
 *
 * @param params - Validated search parameters.
 * @param signal - Optional abort signal forwarded to requests and pauses.
 * @param onUpdate - Optional progress callback.
 * @returns Matching records plus the final query and total match count.
 * @throws Whatever `fetchJson`, `fetchText` or `sleep` throw (network
 *   failure, abort).
 */
export async function searchPubmed(
  params: PubmedSearchParams,
  signal?: AbortSignal,
  onUpdate?: TextToolUpdate,
): Promise<PubmedSearchResult> {
  // A non-finite max_results (e.g. NaN) would otherwise survive min/max clamping.
  const requestedMax = params.max_results ?? 20;
  const maxResults = Math.min(
    200,
    Math.max(1, Math.floor(Number.isFinite(requestedMax) ? requestedMax : 20)),
  );
  const query = normalizePubmedQuery(
    params.query,
    params.publication_types,
    params.date_from,
    params.date_to,
  );

  const esearchUrl = new URL(
    "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi",
  );
  esearchUrl.searchParams.set("db", "pubmed");
  esearchUrl.searchParams.set("retmode", "json");
  esearchUrl.searchParams.set("retmax", String(maxResults));
  esearchUrl.searchParams.set("sort", params.sort ?? "relevance");
  esearchUrl.searchParams.set("term", query);
  const hasApiKey = addNcbiApiKeyParam(esearchUrl, params.api_key);

  emitProgress(onUpdate, `Searching PubMed for: ${params.query}`);
  const esearch = await fetchJson<{
    esearchresult?: { idlist?: string[]; count?: string };
  }>(esearchUrl.toString(), signal);

  const ids = esearch.esearchresult?.idlist ?? [];
  // ESearch reports the total as a string; fall back when it is missing or not numeric.
  const reportedTotal = Number(esearch.esearchresult?.count);
  const totalOr = (fallback: number): number =>
    Number.isFinite(reportedTotal) ? reportedTotal : fallback;

  if (ids.length === 0) {
    return { count: 0, papers: [], query, total: totalOr(0) };
  }

  if (params.fetch_abstracts === false) {
    // Id-only mode: emit placeholder records without calling EFetch.
    const papers = ids.map((pmid) => ({
      pmid,
      title: "PubMed record",
      source: "pubmed",
    }));
    return { count: papers.length, papers, query, total: totalOr(papers.length) };
  }

  const rateLimitMs = hasApiKey ? 120 : 350;
  const batchSize = 50;
  const papers: PaperRecord[] = [];
  for (let start = 0; start < ids.length; start += batchSize) {
    const batch = ids.slice(start, start + batchSize);
    emitProgress(
      onUpdate,
      `Searching PubMed... found ${ids.length} PMIDs, fetching abstracts ${start + 1}-${Math.min(start + batch.length, ids.length)}...`,
    );
    const efetchUrl = new URL(
      "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi",
    );
    efetchUrl.searchParams.set("db", "pubmed");
    efetchUrl.searchParams.set("retmode", "xml");
    efetchUrl.searchParams.set("id", batch.join(","));
    addNcbiApiKeyParam(efetchUrl, params.api_key);
    const xml = await fetchText(efetchUrl.toString(), signal);
    papers.push(...parsePubmedArticles(xml));
    // Pause only when another batch follows.
    if (start + batchSize < ids.length) await sleep(rateLimitMs, signal);
  }

  return {
    count: papers.length,
    papers,
    query,
    total: totalOr(papers.length),
  };
}
|
|
257
|
+
|
|
258
|
+
/**
 * Builds the `pubmed_search` tool definition: metadata, parameter schema, an
 * `execute` handler that runs `searchPubmed`, and a `renderResult` handler
 * that delegates to `renderProviderSearchResult` for the `"pubmed"` provider.
 */
export function createPubmedSearchTool() {
  const tool = {
    name: "pubmed_search",
    label: "PubMed Search",
    description:
      "Search PubMed using typed parameters and return metadata with abstracts when available.",
    parameters: PUBMED_SEARCH_PARAMS,

    /** Runs the search and returns the formatted paper list plus the structured result. */
    async execute(
      _toolCallId: string,
      params: PubmedSearchParams,
      signal?: AbortSignal,
      onUpdate?: TextToolUpdate,
    ) {
      const outcome = await searchPubmed(params, signal, onUpdate);
      const text = formatPaperText(outcome.papers);
      return textResult(text, outcome);
    },

    /** Renders a tool result with the provider fixed to "pubmed". */
    renderResult(
      result: Parameters<typeof renderProviderSearchResult>[1],
      options: Parameters<typeof renderProviderSearchResult>[2],
      theme: Parameters<typeof renderProviderSearchResult>[3],
    ) {
      return renderProviderSearchResult("pubmed", result, options, theme);
    },
  };
  return tool;
}
|
|
283
|
+
|
|
284
|
+
/**
 * Registers the `pubmed_search` tool with the given extension API.
 *
 * @param pi - Extension API whose `registerTool` receives the tool definition.
 */
export function registerPubmedSearchTool(pi: ExtensionAPI): void {
  const pubmedSearchTool = createPubmedSearchTool();
  pi.registerTool(pubmedSearchTool);
}
|