ez-reads 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,356 @@
1
+ import Groq from 'groq-sdk';
2
+
3
// Model identifier passed to every Groq chat-completion request.
const MODEL = 'qwen/qwen3-32b';

// Maximum number of characters of paper text included in a single prompt input.
const MAX_SECTION_CHARS = 3000;

// Value sent as `max_tokens` on every completion request.
const MAX_TOKENS = 4096;

// 25s between calls to stay under 6k TPM
const DELAY_MS = 25 * 1000;

// Shared persona and writing rules; every chunk's system prompt starts with this text.
const DEEP_DIVE = [
  'You are an expert science communicator writing an in-depth technical explainer of a research paper. Your audience is technically curious readers with a general STEM background.',
  '',
  'Rules:',
  '- Write with genuine depth — explain the "why" behind methods and design choices, not just the "what."',
  '- Cover intuition, trade-offs, and how ideas connect to the broader field.',
  '- Use precise analogies to clarify complex ideas, but never sacrifice accuracy for simplicity.',
  '- When you introduce a technical term, define it inline on first use.',
  '- Be thorough but concise — every sentence should add understanding.',
].join('\n');
16
+
17
+ // ---------- per-chunk prompts ----------
18
+ // Chunks map to natural paper sections: abstract, intro, methods, results, discussion
19
+
20
// Prompt chunk: title, authors, and abstract in; headline summary fields and a color theme out.
const CHUNK_ABSTRACT = {
  name: 'abstract & overview',
  system: `${DEEP_DIVE}

Given a paper's title, authors, and abstract, return ONLY valid JSON (no markdown fences, no commentary):
{
"title": "string — the exact paper title",
"authors": ["string — author names"],
"tldr": "string — a precise one-sentence summary capturing the core contribution and its significance",
"analogy": "string — a vivid 2-3 sentence analogy that accurately captures the paper's central idea using a real-world parallel. Make it illuminating, not dumbed-down.",
"abstract_simplified": "string — a thorough 4-6 sentence summary covering: (1) what problem the paper addresses, (2) why it matters, (3) the core approach, (4) the key findings, and (5) what this means for the field",
"color_theme": {
"primary": "string — hex color suited to the paper's field (blue for CS/AI, green for biology, purple for physics, teal for math, red for medicine, etc.)",
"accent": "string — a complementary hex accent color"
}
}`,
  // Builds the user message from whichever of title / authors / abstract are present,
  // separated by blank lines and capped at MAX_SECTION_CHARS characters.
  buildInput(d) {
    const blocks = [
      d.title ? `TITLE: ${d.title}` : null,
      d.authors?.length ? `AUTHORS: ${d.authors.join(', ')}` : null,
      d.abstract ? `ABSTRACT:\n${d.abstract}` : null,
    ].filter((block) => block !== null);
    return blocks.join('\n\n').slice(0, MAX_SECTION_CHARS);
  },
};
44
+
45
// Prompt chunk: introduction-like sections in; a list of key contributions out.
const CHUNK_INTRO = {
  name: 'key contributions',
  system: `${DEEP_DIVE}

Given the introduction and overview sections of a research paper, identify and explain the key contributions in depth. Return ONLY valid JSON (no markdown fences, no commentary):
{
"key_contributions": [
{
"emoji": "string — one relevant emoji",
"title": "string — concise label for this contribution",
"description": "string — 3-5 sentences explaining: what this contribution is, why it is novel compared to prior work, and how it advances the state of the art. Include specific details."
}
]
}
Guidelines: 3-5 contributions. Each description should demonstrate real understanding of the novelty, not just restate the claim.`,
  // Uses sections whose heading matches an introduction-style keyword; when none match,
  // falls back to the abstract (or an empty string). Capped at MAX_SECTION_CHARS characters.
  buildInput(d) {
    const introKeywords = [
      'introduction', 'intro', 'contribution', 'overview', 'background', 'motivation', 'related',
    ];
    const rendered = findSections(d.sections, introKeywords)
      .map(({ heading, text }) => `## ${heading}\n${text}`)
      .join('\n\n');
    const source = rendered || d.abstract || '';
    return source.slice(0, MAX_SECTION_CHARS);
  },
};
69
+
70
// Prompt chunk: methodology-like sections in; an approach summary plus ordered steps out.
const CHUNK_METHODS = {
  name: 'methodology deep-dive',
  system: `${DEEP_DIVE}

Given the methodology sections of a research paper, produce a detailed, structured breakdown of the approach. Return ONLY valid JSON (no markdown fences, no commentary):
{
"methodology": {
"summary": "string — 3-4 sentences explaining the overall approach: what method was chosen, why it was chosen over alternatives, and what makes it distinctive",
"steps": [
{
"emoji": "string — one relevant emoji for this step",
"label": "string — clear step name",
"detail": "string — 3-5 sentences covering: what happens in this step, the intuition behind it, why this particular design choice was made, and how it connects to the next step. Mention specific techniques by name and explain them."
}
]
}
}
Guidelines: 4-7 steps. Focus on conveying the reasoning and intuition behind each choice, not just a mechanical description. Explain trade-offs where relevant.`,
  // Uses sections whose heading matches a methodology keyword; when none match, falls back
  // to the abstract plus all section text. Capped at MAX_SECTION_CHARS characters.
  buildInput(d) {
    const methodKeywords = [
      'method', 'approach', 'model', 'architecture', 'framework', 'design',
      'system', 'technique', 'training', 'implementation', 'algorithm', 'pipeline',
    ];
    const rendered = findSections(d.sections, methodKeywords)
      .map(({ heading, text }) => `## ${heading}\n${text}`)
      .join('\n\n');
    const source = rendered || collectAllText(d);
    return source.slice(0, MAX_SECTION_CHARS);
  },
};
98
+
99
// Prompt chunk: results/experiment-like sections in; findings and headline statistics out.
const CHUNK_RESULTS = {
  name: 'results & analysis',
  system: `${DEEP_DIVE}

Given the results and experiments sections of a research paper, extract key findings with analytical depth AND notable quantitative metrics. Return ONLY valid JSON (no markdown fences, no commentary):
{
"results": [
{
"emoji": "string — one relevant emoji",
"finding": "string — the key result stated clearly and precisely",
"detail": "string — 3-5 sentences providing: the experimental context, how this result compares to baselines or prior work, what it means practically, and any important nuances or caveats"
}
],
"stats": [
{
"value": "string — the specific number or metric (e.g. '94.2%', '3.5x', '1.2B')",
"label": "string — what this number measures",
"context": "string — 1-2 sentences putting this number in perspective: compare to the previous state of the art, explain the magnitude of improvement, or convey what it means in practical terms"
}
]
}
Guidelines: 3-5 results with genuine depth — highlight both strengths and surprising or nuanced findings. 2-4 stats with meaningful comparisons. Avoid cherry-picking only the best numbers; include context about where the approach shines and where it doesn't.`,
  // Uses sections whose heading matches a results keyword; when none match, falls back
  // to the abstract plus all section text. Capped at MAX_SECTION_CHARS characters.
  buildInput(d) {
    const resultKeywords = [
      'result', 'experiment', 'evaluation', 'finding', 'performance',
      'benchmark', 'ablation', 'comparison', 'analysis', 'empirical',
    ];
    const rendered = findSections(d.sections, resultKeywords)
      .map(({ heading, text }) => `## ${heading}\n${text}`)
      .join('\n\n');
    const source = rendered || collectAllText(d);
    return source.slice(0, MAX_SECTION_CHARS);
  },
};
131
+
132
// Prompt chunk: discussion/conclusion-like sections in; significance, limitations, glossary out.
const CHUNK_DISCUSSION = {
  name: 'significance & context',
  system: `${DEEP_DIVE}

Given the discussion, conclusion, and related sections of a research paper, extract the significance, limitations, and key technical terms. Return ONLY valid JSON (no markdown fences, no commentary):
{
"significance": "string — 4-6 sentences on broader impact: how this work changes the field, what new possibilities it opens, potential real-world applications, and how it connects to the bigger picture of the discipline",
"limitations": [
"string — each limitation as 1-2 sentences: state the limitation clearly AND explain why it matters or how it could be addressed in future work"
],
"glossary": [
{
"term": "string — the technical term or concept",
"definition": "string — a thorough 2-3 sentence definition: what it means, why it's important in this context, and (where helpful) how it relates to more familiar concepts"
}
]
}
Guidelines: 3-5 substantive limitations (not just surface-level caveats — explain the impact of each). 5-10 glossary terms covering the most important technical concepts from the paper, with definitions that genuinely teach the reader.`,
  // Uses sections whose heading matches a discussion-style keyword; when none match, falls
  // back to the abstract followed by all collected text. Capped at MAX_SECTION_CHARS characters.
  buildInput(d) {
    const discussionKeywords = [
      'discussion', 'conclusion', 'limitation', 'future', 'related',
      'significance', 'impact', 'broader', 'abstract', 'summary',
    ];
    const rendered = findSections(d.sections, discussionKeywords)
      .map(({ heading, text }) => `## ${heading}\n${text}`)
      .join('\n\n');
    const source = rendered || `${d.abstract || ''}\n\n${collectAllText(d)}`;
    return source.slice(0, MAX_SECTION_CHARS);
  },
};
160
+
161
// Prompt chunk: figure ids and captions in; the most important figures with explanations out.
const CHUNK_FIGURES = {
  name: 'figure selection',
  system: `${DEEP_DIVE}

Given a list of figure captions from a research paper, select the 3-5 most important figures — prioritize architecture diagrams, key result charts, and data visualizations that are essential to understanding the work. Return ONLY valid JSON (no markdown fences, no commentary):
{
"figures": [
{
"id": "string — the figure id exactly as provided",
"caption": "string — the original caption",
"explanation": "string — 2-3 sentences explaining what this figure shows and why it is important for understanding the paper",
"category": "string — one of: Architecture, Results, Data, Method, Comparison, Overview"
}
]
}
Guidelines: Select 3-5 figures maximum. Prefer figures that convey the most information visually — architecture diagrams, performance charts, ablation plots, and key data visualizations. Skip trivial or redundant figures.`,
  // Lists each figure as "[id] caption", separated by blank lines and capped at
  // MAX_SECTION_CHARS characters; returns an empty string when there are no figures.
  buildInput(d) {
    if (!d.figures?.length) {
      return '';
    }
    const captionLines = d.figures.map(({ id, caption }) => `[${id}] ${caption}`);
    return captionLines.join('\n\n').slice(0, MAX_SECTION_CHARS);
  },
};
182
+
183
// Text-based chunks, in the order they are listed here. CHUNK_FIGURES is not part of this list.
const ALL_CHUNKS = [
  CHUNK_ABSTRACT,
  CHUNK_INTRO,
  CHUNK_METHODS,
  CHUNK_RESULTS,
  CHUNK_DISCUSSION,
];
184
+
185
+ // ---------- main export ----------
186
+
187
/**
 * Distill a paper into one summary object by sending each prompt chunk to the Groq
 * chat API in sequence and merging the JSON object each call returns.
 *
 * @param {object} paperData - Paper content handed to every chunk's `buildInput`;
 *   `paperData.figures` (entries with `id` and `url`) additionally enables the figure chunk.
 * @param {(name: string, step: number, total: number) => void} [onProgress] - Called
 *   before each chunk is processed with the chunk name, its 1-based position, and the chunk count.
 * @returns {Promise<object>} The keys of all chunk replies merged into a single object.
 *   When the paper has figures and the model returned a `figures` key, that key is an array
 *   of the selected figures that exist in `paperData.figures`, each with its original `url`.
 * @throws {Error} If the GROQ_API_KEY environment variable is not set. Errors raised by
 *   `callGroq` propagate unchanged.
 */
export async function distillPaper(paperData, onProgress) {
  const apiKey = process.env.GROQ_API_KEY;
  if (!apiKey) {
    throw new Error('GROQ_API_KEY environment variable is required.');
  }

  const client = new Groq({ apiKey });
  const merged = {};
  const hasFigures = paperData.figures?.length > 0;

  // Build the chunk list — add the figures chunk only if the paper has figures.
  const chunks = hasFigures ? [...ALL_CHUNKS, CHUNK_FIGURES] : [...ALL_CHUNKS];

  for (const [index, chunk] of chunks.entries()) {
    if (onProgress) onProgress(chunk.name, index + 1, chunks.length);

    const result = await callGroq(client, chunk.system, chunk.buildInput(paperData));

    // The reply is model output: merge it only when it is a plain JSON object, so an
    // array or primitive cannot inject index keys or be silently dropped by Object.assign.
    if (result !== null && typeof result === 'object' && !Array.isArray(result)) {
      Object.assign(merged, result);
    }

    // Delay between calls to respect the TPM limit (skipped after the last call).
    if (index < chunks.length - 1) {
      await sleep(DELAY_MS);
    }
  }

  // Keep only figures the paper really has, and restore their URLs from paperData
  // (the model only ever saw ids and captions).
  if ('figures' in merged && hasFigures) {
    const figureMap = new Map(paperData.figures.map((f) => [f.id, f]));
    // A malformed (non-array) selection is treated as "nothing selected" instead of crashing.
    const selected = Array.isArray(merged.figures) ? merged.figures : [];
    merged.figures = selected
      .filter((f) => figureMap.has(f?.id))
      .map((f) => ({ ...f, url: figureMap.get(f.id).url }));
  }

  return merged;
}
227
+
228
+ // ---------- helpers ----------
229
+
230
/**
 * Send one system + user prompt pair to the chat-completions API and return the reply
 * parsed as JSON.
 *
 * If the first reply cannot be parsed (even after a truncation repair), the function waits
 * DELAY_MS and asks the model once to fix its own output; a second failure throws the
 * JSON.parse error.
 */
async function callGroq(client, system, userContent) {
  // Parse `raw` as-is; on failure parse the output of repairTruncatedJson instead.
  // Throws when both attempts fail.
  const parseWithRepair = (raw) => {
    try {
      return JSON.parse(raw);
    } catch (_) {
      return JSON.parse(repairTruncatedJson(raw));
    }
  };

  const firstReply = await client.chat.completions.create({
    model: MODEL,
    max_tokens: MAX_TOKENS,
    temperature: 0.4,
    messages: [
      { role: 'system', content: system },
      { role: 'user', content: userContent },
    ],
  });
  const cleaned = cleanModelOutput(firstReply.choices[0]?.message?.content || '');

  try {
    return parseWithRepair(cleaned);
  } catch (_) {
    // Neither the raw nor the repaired text parsed — fall through to the LLM retry.
  }

  // Retry once — ask the model to fix its JSON
  await sleep(DELAY_MS);
  const secondReply = await client.chat.completions.create({
    model: MODEL,
    max_tokens: MAX_TOKENS,
    temperature: 0,
    messages: [
      {
        role: 'user',
        content: `The following was supposed to be valid JSON but failed to parse. Fix it and return ONLY valid JSON, nothing else:\n\n${cleaned}`,
      },
    ],
  });

  return parseWithRepair(cleanModelOutput(secondReply.choices[0]?.message?.content || ''));
}
279
+
280
/**
 * Strip wrapper noise from a raw model reply: complete <think>...</think> blocks, one
 * markdown fence at the very start, one at the very end, and surrounding whitespace.
 */
function cleanModelOutput(text) {
  const withoutReasoning = text.replace(/<think>[\s\S]*?<\/think>/g, '').trim();
  const withoutOpeningFence = withoutReasoning.replace(/^```(?:json)?\s*/i, '');
  const withoutClosingFence = withoutOpeningFence.replace(/\s*```$/i, '');
  return withoutClosingFence.trim();
}
287
+
288
/**
 * Attempt to repair truncated JSON by closing open strings, arrays, and objects.
 * Handles the common case where the LLM response was cut off mid-output.
 *
 * Repairs applied, in order:
 *  - an unterminated string is trimmed back to a sentence/word boundary inside that
 *    string (when the boundary lies in the second half of the text) and closed;
 *  - a dangling backslash or partial \uXXXX escape at the cut is dropped;
 *  - an object key left without a value gets `null`;
 *  - a trailing comma is removed;
 *  - open arrays/objects are closed innermost-first, respecting nesting order.
 *
 * @param {string} text - Possibly truncated JSON text.
 * @returns {string} Best-effort repaired text. It is NOT guaranteed to be valid JSON;
 *   callers still need to JSON.parse it and handle failure.
 */
function repairTruncatedJson(text) {
  let s = text.trim();
  const whitespace = /\s/;

  // Single scan: remember which closers are pending (in nesting order), whether the
  // text ends inside a string, where that string started, and whether the most recent
  // string sits in object-key position.
  const closers = [];
  let inString = false;
  let stringStart = -1;
  let lastStringIsKey = false;
  let lastToken = '';
  for (let i = 0; i < s.length; i++) {
    const ch = s[i];
    if (ch === '\\') { i++; continue; } // skip escaped char
    if (ch === '"') {
      if (inString) {
        inString = false;
        lastToken = '"';
      } else {
        inString = true;
        stringStart = i;
        // Inside an object, a string right after `{` or `,` is a key, not a value.
        lastStringIsKey =
          closers[closers.length - 1] === '}' && (lastToken === '{' || lastToken === ',');
      }
      continue;
    }
    if (inString || whitespace.test(ch)) continue;
    if (ch === '{') closers.push('}');
    else if (ch === '[') closers.push(']');
    else if (ch === '}' || ch === ']') closers.pop();
    lastToken = ch;
  }

  const endsWithString = inString || lastToken === '"';

  if (inString) {
    // Prefer cutting at a clean sentence/word boundary, but only in the second half of
    // the text and never before the opening quote of the string being closed —
    // cutting earlier would destroy structure that was already complete.
    const minCut = Math.max(s.length * 0.5, stringStart);
    const sentenceEnd = s.lastIndexOf('. ');
    const lastSpace = s.lastIndexOf(' ');
    if (sentenceEnd > minCut) {
      s = s.slice(0, sentenceEnd + 1);
    } else if (lastSpace > minCut) {
      s = s.slice(0, lastSpace);
    }

    // An odd run of trailing backslashes (optionally followed by a partial \u escape)
    // means the cut fell inside an escape sequence; drop the incomplete escape so the
    // closing quote is not swallowed by it.
    const dangling = s.match(/(\\+)(u[0-9a-fA-F]{0,3})?$/);
    if (dangling && dangling[1].length % 2 === 1) {
      s = s.slice(0, s.length - 1 - (dangling[2] ? dangling[2].length : 0));
    }

    s += '"';
  }

  if (endsWithString && lastStringIsKey) {
    // The text stops right after (or inside) an object key: supply a value.
    s += ': null';
  } else {
    // Remove a trailing comma, and give a key that already has its colon a value.
    s = s.replace(/,\s*$/, '');
    if (/:\s*$/.test(s)) s += ' null';
  }

  // Close whatever is still open, innermost first.
  return s + closers.reverse().join('');
}
337
+
338
/**
 * Return the sections whose lowercased heading contains at least one of `keywords`.
 * Keywords are compared as given (not lowercased). Input order is preserved; a missing
 * or empty `sections` yields an empty array.
 */
function findSections(sections, keywords) {
  if (!sections?.length) {
    return [];
  }

  const matches = [];
  for (const section of sections) {
    for (const keyword of keywords) {
      if (section.heading.toLowerCase().includes(keyword)) {
        matches.push(section);
        break; // one matching keyword is enough
      }
    }
  }
  return matches;
}
344
+
345
/**
 * Join the abstract (when present) and the text of every section, in order, with a
 * blank line between each piece.
 */
function collectAllText(data) {
  const sectionTexts = data.sections?.length
    ? Array.from(data.sections, (section) => section.text)
    : [];
  const pieces = data.abstract ? [data.abstract, ...sectionTexts] : sectionTexts;
  return pieces.join('\n\n');
}
353
+
354
/** Return a promise that resolves (with no value) after `ms` milliseconds. */
function sleep(ms) {
  return new Promise((resolve) => {
    setTimeout(resolve, ms);
  });
}
@@ -0,0 +1,327 @@
1
+ import * as cheerio from 'cheerio';
2
+
3
/**
 * Detect the input type (ArXiv URL or DOI) and fetch the paper's content.
 *
 * @param {string} input - An ArXiv URL or a DOI / doi.org URL; surrounding whitespace is ignored.
 * @returns {Promise<object>} The result of `fetchArxiv` or `fetchDoi`:
 *   { title, authors, abstract, sections: [{ heading, text }], url, publishedDate, figures }
 * @throws {Error} "Unsupported input…" when `input` is not a string, is blank, or is
 *   neither an ArXiv URL nor a DOI. Errors from the underlying fetchers propagate.
 */
export async function fetchPaper(input) {
  // Non-string input (null, undefined, numbers, …) is reported as unsupported rather
  // than failing with a TypeError from `.trim()`.
  const query = typeof input === 'string' ? input.trim() : '';

  if (query) {
    if (isArxiv(query)) {
      return fetchArxiv(query);
    }

    if (isDoi(query)) {
      return fetchDoi(query);
    }
  }

  throw new Error(
    'Unsupported input. Please provide an ArXiv URL (e.g. https://arxiv.org/abs/2401.00001) or a DOI (e.g. 10.1234/example).'
  );
}
22
+
23
/** True when `input` contains an arxiv.org path under /abs/, /html/ or /pdf/. */
function isArxiv(input) {
  const arxivPath = /arxiv\.org\/(abs|html|pdf)\//;
  return arxivPath.exec(input) !== null;
}
26
+
27
/**
 * True when `input` looks like a DOI reference: a bare DOI ("10.1234/example"), the
 * same with a "doi:" prefix (any letter case, optional space after the colon), or a
 * URL containing "doi.org/10.…".
 *
 * The "doi:" form is accepted because `extractDoi` already pulls the DOI out of such
 * input; without it the common citation notation was rejected as unsupported.
 */
function isDoi(input) {
  return /^(?:doi:\s*)?10\.\d{4,}\//i.test(input) || /doi\.org\/10\.\d{4,}\//.test(input);
}
30
+
31
+ // ---------- ArXiv ----------
32
+
33
/**
 * Extract the ArXiv identifier from an arxiv.org /abs/, /html/ or /pdf/ URL.
 *
 * The identifier is everything after that path segment up to whitespace, `?` or `#`,
 * with trailing slashes and a trailing ".pdf" removed (so a pasted URL ending in "/"
 * does not yield an id that produces "…//" request URLs).
 *
 * @param {string} url
 * @returns {string|null} The id (e.g. "2401.00001v2", "hep-th/9901001"), or null when
 *   the URL does not match or no id remains.
 */
function arxivIdFrom(url) {
  const match = url.match(/arxiv\.org\/(?:abs|html|pdf)\/([^\s?#]+)/);
  if (!match) return null;

  const id = match[1].replace(/\/+$/, '').replace(/\.pdf$/, '');
  return id || null;
}
37
+
38
/**
 * Fetch an ArXiv paper: try the HTML rendering first, fall back to the abstract page,
 * then fill in `publishedDate` from the ArXiv API or, failing that, from the id.
 *
 * Throws when no id can be parsed from `input` or when the abstract-page fallback
 * responds with a non-OK status.
 */
async function fetchArxiv(input) {
  const id = arxivIdFrom(input);
  if (!id) throw new Error('Could not parse ArXiv ID from URL.');

  const requestOptions = { headers: { 'User-Agent': 'ez-reads/1.0' } };
  const htmlUrl = `https://arxiv.org/html/${id}`;
  const absUrl = `https://arxiv.org/abs/${id}`;

  let result;

  // Try HTML version first (richer content); it only counts if it yields sections.
  try {
    const htmlResponse = await fetch(htmlUrl, requestOptions);
    if (htmlResponse.ok) {
      const parsed = parseArxivHtml(await htmlResponse.text(), htmlUrl);
      if (parsed.sections.length > 0) {
        result = { ...parsed, url: absUrl };
      }
    }
  } catch (_) {
    // fall through to abstract page
  }

  // Fallback: abstract page
  if (!result) {
    const absResponse = await fetch(absUrl, requestOptions);
    if (!absResponse.ok) throw new Error(`Failed to fetch ArXiv page: ${absResponse.status}`);
    result = { ...parseArxivAbstract(await absResponse.text()), url: absUrl };
  }

  // If no publishedDate was extracted from the page, try the ArXiv API for the exact date
  if (!result.publishedDate) {
    result.publishedDate = await fetchArxivApiDate(id);
  }

  // Last resort: derive an approximate date from the ArXiv ID (YYMM.NNNNN)
  const yymm = !result.publishedDate && id ? id.match(/^(\d{4})\./)?.[1] : undefined;
  if (yymm) {
    const twoDigitYear = parseInt(yymm.slice(0, 2));
    const month = yymm.slice(2, 4);
    const century = twoDigitYear >= 90 ? 1900 : 2000;
    result.publishedDate = `${century + twoDigitYear}-${month}-01`;
  }

  return result;
}
88
+
89
/**
 * Fetch the exact publication date from the ArXiv Atom API.
 *
 * The request URL is built with URL / searchParams so the id is always percent-encoded
 * and cannot alter the query string.
 *
 * @param {string} arxivId - ArXiv identifier to look up via `id_list`.
 * @returns {Promise<string|null>} ISO date string (YYYY-MM-DD) taken from the first
 *   <published> element, or null on a non-OK response, a missing element, or any error.
 */
async function fetchArxivApiDate(arxivId) {
  try {
    const apiUrl = new URL('https://export.arxiv.org/api/query');
    apiUrl.searchParams.set('id_list', arxivId);
    apiUrl.searchParams.set('max_results', '1');

    const res = await fetch(apiUrl.href, { headers: { 'User-Agent': 'ez-reads/1.0' } });
    if (!res.ok) return null;

    const xml = await res.text();
    // Extract <published> element from Atom feed (e.g. <published>2017-06-12T17:57:34Z</published>)
    const match = xml.match(/<published>(\d{4}-\d{2}-\d{2})T/);
    return match ? match[1] : null;
  } catch (_) {
    // Best-effort lookup: callers fall back to other date sources when this yields null.
    return null;
  }
}
108
+
109
/**
 * Parse an ArXiv HTML page (elements carrying `ltx_*` classes) into
 * { title, authors, abstract, sections, publishedDate, figures }.
 *
 * `baseUrl` is the page URL; relative image sources are resolved against it.
 * Sections without a heading or without paragraph text are dropped, and figures are
 * kept only when they have both an image URL and a caption.
 */
function parseArxivHtml(html, baseUrl) {
  const $ = cheerio.load(html);

  const documentTitle = $('.ltx_title').first().text().replace(/^\s*Title:\s*/i, '').trim();
  const title = documentTitle || $('h1').first().text().trim();

  const authors = [];
  for (const el of $('.ltx_personname').toArray()) {
    const name = $(el).text().trim();
    if (name) authors.push(name);
  }

  const abstractParagraphs = $('.ltx_abstract .ltx_p').map((_, el) => $(el).text().trim()).get();
  const abstract = abstractParagraphs.join(' ') ||
    $('.ltx_abstract').text().replace(/^\s*Abstract[:\s]*/i, '').trim();

  // Extract publication date from the HTML version
  const publishedDate = extractArxivDate($);

  const sections = [];
  for (const sec of $('section.ltx_section, section.ltx_subsection').toArray()) {
    const heading = $(sec).find('.ltx_title').first().text().trim()
      .replace(/^\d+[\.\s]*/, '');

    const paragraphs = [];
    for (const p of $(sec).find('.ltx_para .ltx_p, .ltx_p').toArray()) {
      const $p = $(p);
      const owningSubsection = $p.closest('section.ltx_subsection');
      // A paragraph inside a subsection of this top-level section is collected when
      // that subsection is visited, not here.
      const belongsToNestedSubsection =
        owningSubsection.length > 0 &&
        $p.closest('section.ltx_section')[0] === sec &&
        owningSubsection[0] !== sec;
      if (belongsToNestedSubsection) continue;

      const text = $p.text().trim();
      if (text) paragraphs.push(text);
    }

    if (heading && paragraphs.length > 0) {
      sections.push({ heading, text: paragraphs.join('\n\n') });
    }
  }

  // Extract figures (images with captions)
  const figures = [];
  for (const fig of $('figure.ltx_figure, figure.ltx_table').toArray()) {
    const $fig = $(fig);
    // Fallback ids are numbered by how many figures have been kept so far.
    const id = $fig.attr('id') || `figure-${figures.length + 1}`;

    let url = $fig.find('img').first().attr('src') || '';
    if (url && !url.startsWith('http')) {
      // Resolve relative URLs against the arxiv HTML page URL
      url = new URL(url, `${baseUrl}/`).href;
    }

    const caption = $fig.find('figcaption, .ltx_caption').text().trim();

    if (url && caption) {
      figures.push({ url, caption, id });
    }
  }

  return { title, authors, abstract, sections, publishedDate, figures };
}
173
+
174
/**
 * Parse an ArXiv abstract page into
 * { title, authors, abstract, sections, publishedDate, figures }.
 *
 * Only the abstract is available on this page, so `sections` holds a single
 * "Abstract" section and `figures` is always empty.
 *
 * @param {string} html - HTML of the abstract page.
 * @returns {{title: string, authors: string[], abstract: string,
 *   sections: Array<{heading: string, text: string}>, publishedDate: (string|null), figures: Array}}
 */
function parseArxivAbstract(html) {
  const $ = cheerio.load(html);

  // The "Title:" / "Abstract:" labels are part of the extracted text and may be preceded
  // by whitespace from the markup, so the patterns tolerate leading whitespace (matching
  // the patterns parseArxivHtml uses); otherwise the label would survive into the value.
  const title = $('h1.title').text().replace(/^\s*Title:\s*/i, '').trim();

  const authors = [];
  $('div.authors a').each((_, el) => {
    authors.push($(el).text().trim());
  });

  const abstract = $('blockquote.abstract').text().replace(/^\s*Abstract:\s*/i, '').trim();

  // Extract publication / submission date
  const publishedDate = extractArxivDate($);

  return {
    title,
    authors,
    abstract,
    sections: [{ heading: 'Abstract', text: abstract }],
    publishedDate,
    figures: [],
  };
}
198
+
199
/**
 * Try to extract a publication date from an ArXiv page.
 * Looks at <meta> tags, the dateline div, and submission history, in that order, and
 * returns the first source that yields a valid date.
 *
 * Calendar dates ("2017/06/12", "12 Jun 2017") are converted field-by-field instead of
 * going through local-time Date parsing, so the result does not shift by a day depending
 * on the machine's time zone.
 *
 * @param {Function} $ - Loaded cheerio document (called with CSS selectors).
 * @returns {string|null} ISO date string (YYYY-MM-DD) or null.
 */
function extractArxivDate($) {
  // 1. <meta name="citation_date" content="2017/06/12">
  const citationDate = $('meta[name="citation_date"]').attr('content')
    || $('meta[name="citation_publication_date"]').attr('content')
    || $('meta[name="citation_online_date"]').attr('content');
  if (citationDate) {
    const iso = toIsoCalendarDate(citationDate.replace(/\//g, '-'));
    if (iso) return iso;
  }

  // 2. <meta name="dc.date" ...> or <meta property="article:published_time">
  const dcDate = $('meta[name="dc.date"]').attr('content')
    || $('meta[property="article:published_time"]').attr('content');
  if (dcDate) {
    const iso = toIsoCalendarDate(dcDate);
    if (iso) return iso;
  }

  // 3. Dateline text like "Submitted on 12 Jun 2017" on the abstract page
  const dateline = $('div.dateline').text() || '';
  const subMatch = dateline.match(/(\d{1,2}\s+\w+\s+\d{4})/);
  if (subMatch) {
    const iso = toIsoCalendarDate(subMatch[1]);
    if (iso) return iso;
  }

  // 4. Submission history text
  const historyText = $('div.submission-history').text() || '';
  const histMatch = historyText.match(/\[v1\]\s*\w+,\s*(\d{1,2}\s+\w+\s+\d{4})/);
  if (histMatch) {
    const iso = toIsoCalendarDate(histMatch[1]);
    if (iso) return iso;
  }

  return null;
}

/**
 * Convert a date string to YYYY-MM-DD, or null when it is not a valid date.
 * Date-only forms ("YYYY-M-D", "YYYY/M/D", "D Mon YYYY") are handled without time-zone
 * conversion; anything else (e.g. full timestamps) falls back to Date parsing.
 */
function toIsoCalendarDate(raw) {
  const monthNames = [
    'january', 'february', 'march', 'april', 'may', 'june',
    'july', 'august', 'september', 'october', 'november', 'december',
  ];
  const text = String(raw).trim();

  const numeric = text.match(/^(\d{4})[-/](\d{1,2})[-/](\d{1,2})$/);
  if (numeric) {
    return isoFromDateParts(Number(numeric[1]), Number(numeric[2]), Number(numeric[3]));
  }

  const named = text.match(/^(\d{1,2})\s+([A-Za-z]{3,})\.?\s+(\d{4})$/);
  if (named) {
    const token = named[2].toLowerCase();
    const month = monthNames.findIndex((name) => name.startsWith(token)) + 1;
    if (month > 0) {
      return isoFromDateParts(Number(named[3]), month, Number(named[1]));
    }
  }

  const parsed = new Date(text);
  return Number.isNaN(parsed.getTime()) ? null : parsed.toISOString().slice(0, 10);
}

/** Format year/month/day as YYYY-MM-DD, or return null if they do not form a real date. */
function isoFromDateParts(year, month, day) {
  const date = new Date(Date.UTC(year, month - 1, day));
  const isRealDate =
    date.getUTCFullYear() === year &&
    date.getUTCMonth() === month - 1 &&
    date.getUTCDate() === day;
  return isRealDate ? date.toISOString().slice(0, 10) : null;
}
240
+
241
+ // ---------- DOI ----------
242
+
243
/**
 * Pull the first DOI-shaped token ("10.NNNN/…" up to the next whitespace) out of
 * `input`; when none is found, `input` is returned unchanged.
 */
function extractDoi(input) {
  const [, doi] = input.match(/(10\.\d{4,}\/[^\s]+)/) ?? [];
  return doi ?? input;
}
247
+
248
/**
 * Fetch a paper by DOI: structured metadata via DOI content negotiation plus
 * best-effort scraping of the landing page for section text.
 *
 * Both requests are independent and run concurrently; a failure of either one is
 * tolerated and simply leaves the corresponding fields empty.
 *
 * @param {string} input - A DOI or a string containing one (e.g. a doi.org URL).
 * @returns {Promise<{title: string, authors: string[], abstract: string,
 *   sections: Array<{heading: string, text: string}>, url: string,
 *   publishedDate: (string|null), figures: Array}>}
 */
async function fetchDoi(input) {
  const doi = extractDoi(input);
  const doiUrl = `https://doi.org/${doi}`;

  const [metadata, scrapedSections] = await Promise.all([
    fetchDoiMetadata(doiUrl),
    scrapeDoiSections(doiUrl),
  ]);

  // Some metadata providers deliver the title as a one-element array.
  const rawTitle = Array.isArray(metadata.title) ? metadata.title[0] : metadata.title;
  const title = rawTitle || '';

  const authors = (metadata.author || [])
    // Names without given/family parts (e.g. organizations) carry a single literal name.
    .map((a) => [a.given, a.family].filter(Boolean).join(' ') || a.literal || a.name || '')
    .filter(Boolean);
  const abstract = metadata.abstract || '';

  // Extract publication date from the metadata; missing month/day default to 1.
  let publishedDate = null;
  const issued = metadata.issued || metadata.published || metadata.created;
  const parts = issued?.['date-parts']?.[0]; // [year, month?, day?]
  if (parts?.[0]) {
    const y = String(parts[0]);
    const m = String(parts[1] || 1).padStart(2, '0');
    const d = String(parts[2] || 1).padStart(2, '0');
    publishedDate = `${y}-${m}-${d}`;
  }

  const sections = scrapedSections.length === 0 && abstract
    ? [{ heading: 'Abstract', text: abstract }]
    : scrapedSections;

  return { title, authors, abstract, sections, url: doiUrl, publishedDate, figures: [] };
}

/**
 * Request CSL JSON metadata for a DOI URL via content negotiation.
 * Returns an empty object on a non-OK response, a non-object body, or any error.
 */
async function fetchDoiMetadata(doiUrl) {
  try {
    const res = await fetch(doiUrl, {
      headers: {
        // Registered media type for CSL JSON in DOI content negotiation.
        Accept: 'application/vnd.citationstyles.csl+json',
        'User-Agent': 'ez-reads/1.0',
      },
      redirect: 'follow',
    });
    if (!res.ok) return {};

    const data = await res.json();
    return data !== null && typeof data === 'object' && !Array.isArray(data) ? data : {};
  } catch (_) {
    // Deliberate best-effort: proceed without structured metadata.
    return {};
  }
}

/**
 * Scrape the DOI landing page for body text: each h2/h3 heading becomes a section made
 * of the <p> / div.paragraph siblings that follow it up to the next h2/h3.
 * Returns whatever sections were collected before any failure (possibly none).
 */
async function scrapeDoiSections(doiUrl) {
  const sections = [];
  try {
    const res = await fetch(doiUrl, {
      headers: { 'User-Agent': 'ez-reads/1.0' },
      redirect: 'follow',
    });
    if (!res.ok) return sections;

    const $ = cheerio.load(await res.text());

    // Remove nav, header, footer, scripts, styles, and reference lists
    $('nav, header, footer, script, style, .references, #references').remove();

    $('h2, h3').each((_, heading) => {
      const headText = $(heading).text().trim();
      const paragraphs = [];
      let next = $(heading).next();
      while (next.length && !next.is('h2, h3')) {
        if (next.is('p, div.paragraph')) {
          const t = next.text().trim();
          if (t) paragraphs.push(t);
        }
        next = next.next();
      }
      if (headText && paragraphs.length) {
        sections.push({ heading: headText, text: paragraphs.join('\n\n') });
      }
    });
  } catch (_) {
    // Deliberate best-effort: proceed without body text.
  }
  return sections;
}