@jared-ye/markdown-tex 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1029 @@
1
+ "use strict";
2
+ /**
3
+ * LaTeX → MetaDoc AST parser.
4
+ * Parses document body (between \begin{document} and \end{document} if present).
5
+ * Supports: \section, \subsection, \subsubsection, \textbf, \textit, \texttt,
6
+ * \begin{itemize}, \begin{enumerate}, \begin{verbatim}, \href{}{}, \includegraphics{},
7
+ * \[ ... \], and treats unknown commands as unknown nodes (no crash).
8
+ */
9
+ Object.defineProperty(exports, "__esModule", { value: true });
10
+ exports.sanitizeLatexBody = sanitizeLatexBody;
11
+ exports.parseLatex = parseLatex;
12
+ const nodes_1 = require("../ast/nodes");
13
+ const BEGIN_DOC = '\\begin{document}';
14
+ const END_DOC = '\\end{document}';
15
/**
 * Strip LaTeX line comments (an unescaped `%` through end of line).
 *
 * Each line is handled independently. When a comment is removed the remaining
 * text is right-trimmed, so a line that was only a comment becomes empty.
 * Lines without a comment are returned untouched.
 *
 * A `%` is escaped only when it is preceded by an ODD number of backslashes:
 * `\%` is a literal percent sign, while `\\%` is a line break (`\\`) followed
 * by a real comment.
 *
 * @param {string} body - LaTeX source, lines separated by `\n`.
 * @returns {string} The source with comment portions removed.
 */
function stripLineComments(body) {
    return body
        .split('\n')
        .map((line) => {
            let searchFrom = 0;
            while (searchFrom < line.length) {
                const idx = line.indexOf('%', searchFrom);
                if (idx === -1) {
                    break;
                }
                // Count the run of backslashes immediately before the percent sign.
                let backslashes = 0;
                for (let k = idx - 1; k >= 0 && line[k] === '\\'; k--) {
                    backslashes++;
                }
                if (backslashes % 2 === 1) {
                    // Escaped percent (\%): keep it and look for a later one.
                    searchFrom = idx + 1;
                    continue;
                }
                return line.slice(0, idx).trimEnd();
            }
            return line;
        })
        .join('\n');
}
35
/**
 * Remove LaTeX layout/control constructs that have no Markdown counterpart.
 *
 * The patterns are applied one after another, in the order listed, and every
 * match is replaced with the empty string.
 *
 * @param {string} body - LaTeX source.
 * @returns {string} The source with the control constructs deleted.
 */
function stripControlBlocks(body) {
    const removable = [
        // Whole title page environment, including its content.
        /\\begin\{titlepage\}[\s\S]*?\\end\{titlepage\}/g,
        /\\tableofcontents\b/g,
        /\\newpage\b/g,
        /\\maketitle\b/g,
        // \label{...} with a brace-free argument.
        /\\label\{[^{}]*\}/g,
        /\\(clearpage|cleardoublepage)\b/g,
        // \vspace{...} with a brace-free argument.
        /\\vspace\{[^{}]*\}/g,
        /\\vfill\b/g,
    ];
    return removable.reduce((text, pattern) => text.replace(pattern, ''), body);
}
55
/**
 * Reduce a LaTeX source string to the cleaned document body.
 *
 * Steps: normalize line endings to `\n`; keep only the text after the first
 * `\begin{document}` and before the last `\end{document}` (the whole input is
 * used when there is no `\begin{document}`); remove control blocks; remove
 * line comments; collapse runs of three or more newlines to two; trim.
 *
 * @param {string} latex - Raw LaTeX source.
 * @returns {string} The cleaned body, or `''` for empty or non-string input.
 */
function sanitizeLatexBody(latex) {
    if (typeof latex !== 'string' || latex === '') {
        return '';
    }
    const normalized = latex.replace(/\r\n/g, '\n').replace(/\r/g, '\n');
    const openAt = normalized.indexOf(BEGIN_DOC);
    let body = normalized;
    if (openAt !== -1) {
        const closeAt = normalized.lastIndexOf(END_DOC);
        // Without \end{document} the body runs to the end of the input.
        body = normalized.slice(openAt + BEGIN_DOC.length, closeAt === -1 ? undefined : closeAt);
    }
    const cleaned = stripLineComments(stripControlBlocks(body.trim()));
    return cleaned.replace(/\n{3,}/g, '\n\n').trim();
}
75
/**
 * Parse a LaTeX source string into a document node.
 *
 * The input is sanitized with `sanitizeLatexBody`, split into block nodes by
 * `parseBlocks`, and the blocks are handed to `createDocument`.
 *
 * @param {string} latex - Raw LaTeX source.
 * @returns {*} Whatever `createDocument` returns for the parsed blocks.
 */
function parseLatex(latex) {
    const blockNodes = parseBlocks(sanitizeLatexBody(latex));
    return (0, nodes_1.createDocument)(blockNodes);
}
80
/**
 * Read one balanced `{ ... }` group.
 *
 * A backslash followed by another character is skipped as a pair, so `\{` and
 * `\}` do not affect nesting depth.
 *
 * @param {string} s - Text to scan.
 * @param {number} start - Index that must hold the opening `{`.
 * @returns {{content: string, end: number} | null} The text between the outer
 *   braces and the index just past the closing brace; `null` when `s[start]`
 *   is not `{` or the group is never closed.
 */
function extractBraced(s, start) {
    if (s[start] !== '{') {
        return null;
    }
    let openCount = 1;
    for (let pos = start + 1; pos < s.length; pos++) {
        const ch = s[pos];
        if (ch === '\\' && pos + 1 < s.length) {
            // Skip the escaped character too; the loop increment covers the second step.
            pos++;
            continue;
        }
        if (ch === '{') {
            openCount++;
        }
        else if (ch === '}') {
            openCount--;
            if (openCount === 0) {
                return { content: s.slice(start + 1, pos), end: pos + 1 };
            }
        }
    }
    return null;
}
103
/**
 * Tell whether an already-trimmed line looks like a Markdown pipe-table row:
 * it starts with `|` and contains at least one more `|` after the first.
 *
 * @param {string} trimmed - Line with surrounding whitespace removed.
 * @returns {boolean} `true` when the line has the `| ... |` shape.
 */
function isTableRow(trimmed) {
    if (trimmed[0] !== '|') {
        return false;
    }
    return trimmed.indexOf('|', 1) !== -1;
}
107
/**
 * Split a pipe-table row into trimmed cell strings.
 *
 * The line is trimmed, one leading `|` and one trailing `|` are dropped, and
 * the remainder is split on `|`.
 *
 * @param {string} line - A table row such as `| a | b |`.
 * @returns {string[]} The cell texts, each trimmed.
 */
function splitTableRowLatex(line) {
    const withoutLeading = line.trim().replace(/^\|/, '');
    const inner = withoutLeading.replace(/\|$/, '');
    const cells = [];
    for (const piece of inner.split('|')) {
        cells.push(piece.trim());
    }
    return cells;
}
111
/**
 * Tell whether a trimmed line is a pipe-table separator row such as
 * `|---|:--:|`: it is fenced by `|` on both sides and every cell consists
 * only of `-`, `:` and whitespace with at least one `-`.
 *
 * @param {string} trimmed - Line with surrounding whitespace removed.
 * @returns {boolean} `true` for a separator row.
 */
function isTableSeparatorLatex(trimmed) {
    const fenced = trimmed.startsWith('|') && trimmed.endsWith('|');
    if (!fenced) {
        return false;
    }
    const cells = splitTableRowLatex(trimmed);
    if (cells.length === 0) {
        return false;
    }
    for (const cell of cells) {
        if (!/^[-:\s]+$/.test(cell) || !cell.includes('-')) {
            return false;
        }
    }
    return true;
}
119
/**
 * Turn a run of consecutive pipe-table lines into a single table node.
 *
 * The first line of the run is the header row. A separator row directly
 * after it is skipped; every remaining line becomes a body row.
 *
 * @param {string[]} lines - All lines being parsed.
 * @param {number} start - Index of the candidate first table line.
 * @returns {{node: {type: 'table', headerRow: string[], rows: string[][]}, nextIndex: number} | null}
 *   The table and the index of the first line after it, or `null` when
 *   `lines[start]` is not a table row.
 */
function parseTableBlock(lines, start) {
    if (!isTableRow(lines[start].trim())) {
        return null;
    }
    const tableLines = [];
    for (let k = start; k < lines.length; k++) {
        const candidate = lines[k].trim();
        if (!isTableRow(candidate)) {
            break;
        }
        tableLines.push(candidate);
    }
    if (tableLines.length === 0) {
        return null;
    }
    const headerRow = splitTableRowLatex(tableLines[0]);
    if (headerRow.length === 0) {
        return null;
    }
    const hasSeparator = tableLines.length > 1 && isTableSeparatorLatex(tableLines[1]);
    const rows = tableLines
        .slice(hasSeparator ? 2 : 1)
        .map((rowLine) => splitTableRowLatex(rowLine));
    return {
        node: { type: 'table', headerRow, rows },
        nextIndex: start + tableLines.length
    };
}
146
/**
 * Parse sanitized LaTeX into an array of block nodes, one line at a time.
 *
 * Each non-blank line is tested against the recognisers below in order; the
 * first one that applies consumes one or more lines and appends its node(s).
 * A line that matches nothing becomes a paragraph of inline nodes.
 *
 * Environment recognisers only look at the start of the trimmed line.
 *
 * @param {string} source - LaTeX body, lines separated by `\n`.
 * @returns {Array<object>} Block nodes in document order.
 */
function parseBlocks(source) {
    const blocks = [];
    const lines = source.split('\n');
    let i = 0;
    while (i < lines.length) {
        const line = lines[i];
        const trimmed = line.trim();
        if (trimmed === '') {
            i++;
            continue;
        }
        // Markdown-style pipe table (consecutive lines | ... |)
        const tableResult = parseTableBlock(lines, i);
        if (tableResult) {
            blocks.push(tableResult.node);
            i = tableResult.nextIndex;
            continue;
        }
        // \section{...}, \subsection{...}, \subsubsection{...} (optionally starred)
        const headingMatch = trimmed.match(/^\\(section|subsection|subsubsection)\*?(\s*)\{/);
        if (headingMatch) {
            const braced = extractBraced(trimmed, trimmed.indexOf('{'));
            if (braced) {
                const level = headingMatch[1] === 'section' ? 1 : headingMatch[1] === 'subsection' ? 2 : 3;
                const children = parseInlineLatex(braced.content);
                blocks.push({ type: 'heading', level, children });
            }
            else {
                // Title braces are not balanced on this line: keep the raw line.
                blocks.push({ type: 'unknown', raw: line });
            }
            i++;
            continue;
        }
        // \begin{verbatim} ... \end{verbatim}: lines in between are kept verbatim
        if (trimmed.startsWith('\\begin{verbatim}')) {
            const contentLines = [];
            i++;
            while (i < lines.length && !lines[i].trim().startsWith('\\end{verbatim}')) {
                contentLines.push(lines[i]);
                i++;
            }
            blocks.push({ type: 'code_block', content: contentLines.join('\n') });
            if (i < lines.length)
                i++;
            continue;
        }
        // Stray \end{verbatim} without an opener: drop it.
        if (trimmed.startsWith('\\end{verbatim}')) {
            i++;
            continue;
        }
        // \[ ... \] math block (single line or multi)
        if (trimmed.startsWith('\\[')) {
            const rest = trimmed.slice(2).trim();
            if (rest.endsWith('\\]')) {
                blocks.push({ type: 'math_block', content: rest.slice(0, -2).trim() });
            }
            else {
                const mathLines = [rest];
                i++;
                while (i < lines.length && !lines[i].trim().endsWith('\\]')) {
                    mathLines.push(lines[i]);
                    i++;
                }
                if (i < lines.length) {
                    // Closing line: keep what precedes the \] delimiter.
                    mathLines.push(lines[i].trim().slice(0, -2));
                }
                blocks.push({ type: 'math_block', content: mathLines.join('\n').trim() });
            }
            i++;
            continue;
        }
        // \begin{itemize} ... \end{itemize}
        if (trimmed.startsWith('\\begin{itemize}')) {
            const listResult = parseListEnv(lines, i, false);
            blocks.push(listResult.node);
            i = listResult.nextIndex;
            continue;
        }
        // \begin{enumerate} ... \end{enumerate}
        if (trimmed.startsWith('\\begin{enumerate}')) {
            const listResult = parseListEnv(lines, i, true);
            blocks.push(listResult.node);
            i = listResult.nextIndex;
            continue;
        }
        // \begin{quote} ... \end{quote}
        if (trimmed.startsWith('\\begin{quote}')) {
            const quoteResult = parseQuoteEnv(lines, i);
            blocks.push(quoteResult.node);
            i = quoteResult.nextIndex;
            continue;
        }
        // \begin{abstract} ... \end{abstract} → blockquote (summary)
        if (trimmed.startsWith('\\begin{abstract}')) {
            const abstractResult = parseAbstractEnv(lines, i);
            blocks.push(abstractResult.node);
            i = abstractResult.nextIndex;
            continue;
        }
        // \begin{equation} ... \end{equation} or the starred form.
        // A null result falls through to the unknown-environment fallback below.
        if (trimmed.match(/^\\begin\{equation\*?\}/)) {
            const eqResult = parseEquationEnv(lines, i);
            if (eqResult) {
                blocks.push(eqResult.node);
                i = eqResult.nextIndex;
                continue;
            }
        }
        // \begin{figure}...\end{figure} or \begin{figure*}: extract caption + includegraphics
        if (trimmed.match(/^\\begin\{figure\*?\}/)) {
            const figResult = parseFigureEnv(lines, i);
            if (figResult) {
                blocks.push(...figResult.blocks);
                i = figResult.nextIndex;
                continue;
            }
        }
        // \begin{center}...\end{center}
        if (trimmed.startsWith('\\begin{center}')) {
            const centerResult = parseCenterEnv(lines, i);
            if (centerResult.blocks.length > 0) {
                blocks.push(...centerResult.blocks);
            }
            i = centerResult.nextIndex;
            continue;
        }
        // \begin{longtable}...\end{longtable} or \begin{longtblr}...\end{longtblr}
        if (trimmed.startsWith('\\begin{longtable') || trimmed.startsWith('\\begin{longtblr')) {
            const tblResult = parseLongTableEnv(lines, i);
            if (tblResult) {
                blocks.push(tblResult.node);
                i = tblResult.nextIndex;
                continue;
            }
        }
        // \begin{thebibliography}...\end{thebibliography} → References heading + one paragraph per item
        if (trimmed.startsWith('\\begin{thebibliography}')) {
            const bibResult = parseTheBibliographyEnv(lines, i);
            blocks.push(...bibResult.blocks);
            i = bibResult.nextIndex;
            continue;
        }
        // \begin{subappendices}...\end{subappendices} → parse inner as body
        if (trimmed.startsWith('\\begin{subappendices}')) {
            const innerResult = parseGenericEnv(lines, i, 'subappendices');
            blocks.push(...parseBlocks(innerResult.inner));
            i = innerResult.nextIndex;
            continue;
        }
        // Transparent wrappers (letter, minipage, frame): parse the inner lines as body
        const wrapperMatch = trimmed.match(/^\\begin\{(letter|minipage|frame)\}/);
        if (wrapperMatch) {
            const innerResult = parseGenericEnv(lines, i, wrapperMatch[1]);
            blocks.push(...parseBlocks(innerResult.inner));
            i = innerResult.nextIndex;
            continue;
        }
        // Unknown \begin{xxx} ... \end{xxx}: keep the raw text as one unknown block
        const beginUnknownMatch = trimmed.match(/^\\begin\{([a-zA-Z*]+)\}/);
        if (beginUnknownMatch) {
            const envName = beginUnknownMatch[1];
            const endTag = `\\end{${envName}}`;
            const rawLines = [line];
            let j = i;
            // When the environment is closed on its own opening line there is
            // nothing more to collect; scanning ahead would swallow unrelated
            // lines up to the next matching \end (or the end of the input).
            const closedOnSameLine = trimmed.includes(endTag, beginUnknownMatch[0].length);
            if (!closedOnSameLine) {
                j = i + 1;
                while (j < lines.length) {
                    rawLines.push(lines[j]);
                    if (lines[j].trim().startsWith(endTag))
                        break;
                    j++;
                }
            }
            blocks.push({ type: 'unknown', raw: rawLines.join('\n') });
            i = j + 1;
            continue;
        }
        // \hrulefill
        if (trimmed === '\\hrulefill') {
            blocks.push({ type: 'thematic_break' });
            i++;
            continue;
        }
        // \includegraphics[...]{path} or \includegraphics{path}
        // NOTE(review): the match is not anchored, so any other text on the same
        // line is dropped — confirm whether that is intended.
        const incMatch = trimmed.match(/\\includegraphics(?:\[[^\]]*\])?\{([^{}]+)\}/);
        if (incMatch) {
            blocks.push({ type: 'paragraph', children: [{ type: 'image', url: incMatch[1] }] });
            i++;
            continue;
        }
        // Standalone \href{url}{text} line. Only a line that STARTS with \href is
        // considered; one that merely contains \href is left to the inline parser
        // so the surrounding text is not lost.
        const hrefMatch = trimmed.match(/^\\href\{([^{}]+)\}\{/);
        if (hrefMatch) {
            const url = hrefMatch[1];
            // The match ends on the brace that opens the link text.
            const braced = extractBraced(trimmed, hrefMatch[0].length - 1);
            if (!braced) {
                blocks.push({ type: 'unknown', raw: line });
                i++;
                continue;
            }
            if (braced.end === trimmed.length) {
                const children = parseInlineLatex(braced.content);
                blocks.push({ type: 'paragraph', children: [{ type: 'link', url, children }] });
                i++;
                continue;
            }
            // Text follows the link: fall through to the paragraph case.
        }
        // Paragraph line (may contain inline commands)
        const inlines = parseInlineLatex(trimmed);
        if (inlines.length > 0) {
            blocks.push({ type: 'paragraph', children: inlines });
        }
        i++;
    }
    return blocks;
}
359
/**
 * Parse an `itemize` / `enumerate` environment into a list node.
 *
 * Handled inside the environment:
 * - `\item` and `\item[label]` start a new list item (the label is prefixed
 *   to the item text, followed by a space);
 * - a nested `\begin{itemize}` / `\begin{enumerate}` is parsed recursively
 *   and attached to the most recent item;
 * - `\subitem` becomes an entry of an unordered sub-list under the most
 *   recent item;
 * - blank lines and layout-only commands are skipped;
 * - any other line is appended to the last item's trailing paragraph.
 *
 * @param {string[]} lines - All lines being parsed.
 * @param {number} start - Index of the `\begin{...}` line.
 * @param {boolean} ordered - `true` for `enumerate`, `false` for `itemize`.
 * @returns {{node: {type: 'list', ordered: boolean, items: object[]}, nextIndex: number}}
 *   The list and the index of the first line after it. When the closing tag
 *   is missing, everything up to the end of `lines` is consumed.
 */
function parseListEnv(lines, start, ordered) {
    const envName = ordered ? 'enumerate' : 'itemize';
    const endTag = `\\end{${envName}}`;
    // Commands that only tune list layout and carry no content.
    const layoutOnly = /^\\(?:setlength|vspace|(?:itemsep|parsep|parskip)(?![a-zA-Z]))/;
    const items = [];
    let i = start + 1;
    while (i < lines.length) {
        const trimmed = lines[i].trim();
        if (trimmed.startsWith(endTag)) {
            return { node: { type: 'list', ordered, items }, nextIndex: i + 1 };
        }
        // Nested \begin{itemize} or \begin{enumerate}
        const nestedMatch = trimmed.match(/^\\begin\{(itemize|enumerate)\}/);
        if (nestedMatch) {
            if (items.length > 0) {
                const nested = parseListEnv(lines, i, nestedMatch[1] === 'enumerate');
                items[items.length - 1].children.push(nested.node);
                i = nested.nextIndex;
            }
            else {
                // No item to attach to yet: skip the opener.
                i++;
            }
            continue;
        }
        // \subitem: add as nested list entry under the last item
        if (trimmed.startsWith('\\subitem')) {
            const subContent = trimmed.slice('\\subitem'.length).trim();
            const subPara = { type: 'paragraph', children: parseInlineLatex(subContent) };
            const subListItem = { type: 'list_item', children: [subPara] };
            if (items.length > 0) {
                const last = items[items.length - 1];
                const lastBlock = last.children[last.children.length - 1];
                if (lastBlock.type === 'list') {
                    lastBlock.items.push(subListItem);
                }
                else {
                    last.children.push({ type: 'list', ordered: false, items: [subListItem] });
                }
            }
            else {
                items.push({ type: 'list_item', children: [subPara] });
            }
            i++;
            continue;
        }
        // \item or \item[optional label]. The lookahead stops commands that merely
        // start with "\item" (\itemsep, \itemindent, ...) from being read as items.
        const itemMatch = trimmed.match(/^\\item(?![a-zA-Z])\s*(?:\[([^\]]*)\])?\s*(.*)$/s);
        if (itemMatch) {
            const optionalLabel = itemMatch[1]?.trim();
            const itemContent = itemMatch[2]?.trim() ?? '';
            const children = parseInlineLatex(itemContent);
            const para = { type: 'paragraph', children };
            if (optionalLabel && optionalLabel.length > 0) {
                para.children = [{ type: 'text', value: optionalLabel + ' ' }, ...para.children];
            }
            items.push({ type: 'list_item', children: [para] });
            i++;
            continue;
        }
        if (trimmed === '' || layoutOnly.test(trimmed)) {
            i++;
            continue;
        }
        // Continuation of previous item: append to its trailing paragraph
        if (items.length > 0) {
            const last = items[items.length - 1];
            const lastBlock = last.children[last.children.length - 1];
            if (lastBlock.type === 'paragraph') {
                const extra = parseInlineLatex(trimmed);
                lastBlock.children.push({ type: 'text', value: ' ' }, ...extra);
            }
        }
        i++;
    }
    return { node: { type: 'list', ordered, items }, nextIndex: i };
}
453
/**
 * Parse a `quote` environment into a blockquote node.
 *
 * The lines between the opener and the first line starting with
 * `\end{quote}` are parsed as blocks. If no closing line exists, all
 * remaining lines are used.
 *
 * @param {string[]} lines - All lines being parsed.
 * @param {number} start - Index of the `\begin{quote}` line.
 * @returns {{node: {type: 'blockquote', children: object[]}, nextIndex: number}}
 *   The blockquote and the index of the first line after the environment.
 */
function parseQuoteEnv(lines, start) {
    const firstInner = start + 1;
    let stop = firstInner;
    while (stop < lines.length && !lines[stop].trim().startsWith('\\end{quote}')) {
        stop++;
    }
    const closed = stop < lines.length;
    const children = parseBlocks(lines.slice(firstInner, stop).join('\n'));
    return {
        node: { type: 'blockquote', children },
        // Step over the closing tag only when one was actually found.
        nextIndex: closed ? stop + 1 : stop
    };
}
471
/**
 * Parse an `abstract` environment into a blockquote node.
 *
 * The inner lines (up to the first line starting with `\end{abstract}`, or
 * to the end of input when it is missing) are parsed as blocks.
 *
 * @param {string[]} lines - All lines being parsed.
 * @param {number} start - Index of the `\begin{abstract}` line.
 * @returns {{node: {type: 'blockquote', children: object[]}, nextIndex: number}}
 *   The blockquote and the index of the first line after the environment.
 */
function parseAbstractEnv(lines, start) {
    const { inner, nextIndex } = parseGenericEnv(lines, start, 'abstract');
    const children = parseBlocks(inner);
    return { node: { type: 'blockquote', children }, nextIndex };
}
487
/**
 * Collect the raw lines inside `\begin{envName} ... \end{envName}`.
 *
 * Scanning starts on the line after `start` and stops at the first line
 * whose trimmed text starts with `\end{envName}`.
 *
 * @param {string[]} lines - All lines being parsed.
 * @param {number} start - Index of the `\begin{envName}` line.
 * @param {string} envName - Environment name, without braces.
 * @returns {{inner: string, nextIndex: number}} The inner lines joined with
 *   `\n`, and the index of the line after the closing tag (or the index where
 *   scanning ended when the tag is missing).
 */
function parseGenericEnv(lines, start, envName) {
    const closer = `\\end{${envName}}`;
    const firstInner = start + 1;
    let stop = firstInner;
    while (stop < lines.length && !lines[stop].trim().startsWith(closer)) {
        stop++;
    }
    const inner = lines.slice(firstInner, stop).join('\n');
    const nextIndex = stop < lines.length ? stop + 1 : stop;
    return { inner, nextIndex };
}
502
/**
 * Parse a `thebibliography` environment.
 *
 * Emits a level-2 "References" heading followed by one paragraph per
 * `\bibitem`. Each paragraph starts with an empty `<span id="ref-KEY">`
 * anchor (as an `unknown_inline` node) and a `[KEY] ` text prefix, then the
 * inline-parsed entry text. Lines following a `\bibitem` are joined to it
 * with single spaces. Entries with no text are not emitted.
 *
 * Both `\bibitem{key}` and `\bibitem[label]{key}` are recognised; the
 * optional label is ignored.
 *
 * @param {string[]} lines - All lines being parsed.
 * @param {number} start - Index of the `\begin{thebibliography}` line.
 * @returns {{blocks: object[], nextIndex: number}} The generated blocks and
 *   the index of the first line after the environment.
 */
function parseTheBibliographyEnv(lines, start) {
    const endTag = '\\end{thebibliography}';
    const blocks = [
        { type: 'heading', level: 2, children: [{ type: 'text', value: 'References' }] }
    ];
    let currentKey = null;
    let currentItem = [];
    // Emit the pending entry (if any) and clear it, so it can never be
    // emitted a second time.
    const flushPending = () => {
        if (currentKey !== null && currentItem.length > 0) {
            const text = currentItem.join(' ').trim();
            if (text) {
                blocks.push({
                    type: 'paragraph',
                    children: [
                        { type: 'unknown_inline', raw: `<span id="ref-${currentKey}"></span>` },
                        { type: 'text', value: `[${currentKey}] ` },
                        ...parseInlineLatex(text)
                    ]
                });
            }
        }
        currentKey = null;
        currentItem = [];
    };
    let i = start + 1;
    while (i < lines.length) {
        const trimmed = lines[i].trim();
        if (trimmed.startsWith(endTag)) {
            i++;
            break;
        }
        const bibMatch = trimmed.match(/^\\bibitem\s*(?:\[[^\]]*\])?\s*\{([^}]*)\}\s*(.*)$/s);
        if (bibMatch) {
            flushPending();
            currentKey = bibMatch[1].trim();
            currentItem = [bibMatch[2].trim()];
        }
        else if (currentKey !== null && trimmed !== '') {
            currentItem.push(trimmed);
        }
        i++;
    }
    // Covers both a normal close and an environment that was never closed.
    flushPending();
    return { blocks, nextIndex: i };
}
551
/**
 * Parse an `equation` / `equation*` environment into a math block.
 *
 * The lines between the opener and the first line starting with
 * `\end{equation}` or `\end{equation*}` form the content.
 *
 * @param {string[]} lines - All lines being parsed.
 * @param {number} start - Index of the `\begin{equation}` line.
 * @returns {{node: {type: 'math_block', content: string}, nextIndex: number} | null}
 *   The math block and the index after the closing line; `null` when the
 *   environment is never closed or its trimmed content is empty.
 */
function parseEquationEnv(lines, start) {
    const firstInner = start + 1;
    for (let k = firstInner; k < lines.length; k++) {
        const candidate = lines[k].trim();
        const closes = candidate.startsWith('\\end{equation}') || candidate.startsWith('\\end{equation*}');
        if (!closes) {
            continue;
        }
        const content = lines.slice(firstInner, k).join('\n').trim();
        if (!content) {
            return null;
        }
        return { node: { type: 'math_block', content }, nextIndex: k + 1 };
    }
    return null;
}
566
/**
 * Extract the plain-text caption from a line containing `\caption{...}`.
 *
 * Also accepts the starred form `\caption*{...}` and an optional short
 * caption, `\caption[short]{long}`; in the latter case the long caption is
 * returned. The brace group is located from the regex match itself, so an
 * earlier command that merely starts with `\caption` (such as
 * `\captionsetup{...}`) on the same line is not mistaken for the caption.
 *
 * @param {string} trimmed - A single trimmed source line.
 * @returns {string | null} The caption with LaTeX markup stripped, or `null`
 *   when the line has no `\caption{` or its braces are unbalanced.
 */
function extractCaption(trimmed) {
    const match = /\\caption\*?\s*(?:\[[^\]]*\])?\s*\{/.exec(trimmed);
    if (!match) {
        return null;
    }
    // The match ends on the opening brace of the caption argument.
    const open = match.index + match[0].length - 1;
    const braced = extractBraced(trimmed, open);
    if (!braced) {
        return null;
    }
    return stripLatexForPlainText(braced.content);
}
577
/**
 * Pull the file path out of the first `\includegraphics{path}` or
 * `\includegraphics[options]{path}` on a line.
 *
 * @param {string} trimmed - A single source line.
 * @returns {string | null} The path exactly as written between the braces,
 *   or `null` when the line contains no such command.
 */
function extractIncludegraphicsPath(trimmed) {
    const found = /\\includegraphics(?:\[[^\]]*\])?\{([^{}]+)\}/.exec(trimmed);
    if (found === null) {
        return null;
    }
    return found[1];
}
582
/**
 * Replace the first `\multicolumn{n}{cols}{content}` in `s` with its content.
 *
 * The three arguments are read as balanced brace groups, so a column spec
 * with nested braces (e.g. `m{4cm}`) is handled. `\centering` inside the
 * content is removed and the content is trimmed. Whitespace is allowed
 * between `\multicolumn` and its first argument, but the three groups
 * themselves must be adjacent.
 *
 * @param {string} s - Text that may contain `\multicolumn`.
 * @returns {string} The text with one occurrence replaced, or `s` unchanged
 *   when there is no well-formed occurrence at the first `\multicolumn`.
 */
function replaceOneMulticolumn(s) {
    const command = '\\multicolumn';
    const at = s.indexOf(command);
    if (at === -1) {
        return s;
    }
    const tail = s.slice(at + command.length).trimStart();
    if (!tail.startsWith('{')) {
        return s;
    }
    // Index in `s` where the first argument's brace sits.
    let cursor = s.length - tail.length;
    let group = null;
    for (let argNo = 0; argNo < 3; argNo++) {
        group = extractBraced(s, cursor);
        if (!group) {
            return s;
        }
        cursor = group.end;
    }
    // `group` is now the third argument: the cell content.
    const inner = group.content.replace(/\\centering\s*/g, '').trim();
    return s.slice(0, at) + inner + s.slice(cursor);
}
604
/**
 * Roughly reduce LaTeX markup to plain text (used for captions and cells).
 *
 * In order: `\label{...}` is removed; every `\multicolumn{..}{..}{..}` is
 * replaced by its content; `\textbf{...}`, `\textit{...}` and
 * `\centering{...}` wrappers are unwrapped repeatedly (at most 10 rounds);
 * finally whitespace runs are collapsed to one space and the result is
 * trimmed. Other commands are left as they are.
 *
 * @param {string} s - LaTeX fragment.
 * @returns {string} The simplified text.
 */
function stripLatexForPlainText(s) {
    let text = s.replace(/\\label\{[^{}]*\}/g, '');
    // Unwrap \multicolumn occurrences until none is left or no progress is made.
    for (;;) {
        if (!/\\multicolumn\s*\{/.test(text)) {
            break;
        }
        const replaced = replaceOneMulticolumn(text);
        if (replaced === text) {
            break;
        }
        text = replaced;
    }
    // Each pattern tolerates one level of nested braces in the argument.
    const wrappers = [
        /\\textbf\s*\{([^{}]*(?:\{[^{}]*\}[^{}]*)*)\}/g,
        /\\textit\s*\{([^{}]*(?:\{[^{}]*\}[^{}]*)*)\}/g,
        /\\centering\s*\{([^{}]*(?:\{[^{}]*\}[^{}]*)*)\}/g,
    ];
    let round = 0;
    while (round < 10 && /\\[a-zA-Z]+\s*\{/.test(text)) {
        const before = text;
        for (const wrapper of wrappers) {
            text = text.replace(wrapper, '$1');
        }
        if (text === before) {
            break;
        }
        round++;
    }
    return text.replace(/\s+/g, ' ').trim();
}
623
/**
 * Parse a `figure` / `figure*` environment into an image paragraph.
 *
 * Every inner line is checked for a caption and an `\includegraphics` path;
 * when several are present the last non-empty one of each wins. An image
 * paragraph (with the caption as alt text) is emitted only when a path was
 * found; otherwise no block is produced.
 *
 * @param {string[]} lines - All lines being parsed.
 * @param {number} start - Index of the `\begin{figure}` line.
 * @returns {{blocks: object[], nextIndex: number}} Zero or one paragraph
 *   and the index of the first line after the environment.
 */
function parseFigureEnv(lines, start) {
    const starred = lines[start].trim().includes('figure*}');
    const endTag = starred ? '\\end{figure*}' : '\\end{figure}';
    let caption = null;
    let imageUrl = null;
    let i = start + 1;
    for (; i < lines.length; i++) {
        const trimmed = lines[i].trim();
        if (trimmed.startsWith(endTag)) {
            // Step past the closing tag before leaving the loop.
            i++;
            break;
        }
        caption = extractCaption(trimmed) || caption;
        imageUrl = extractIncludegraphicsPath(trimmed) || imageUrl;
    }
    const blocks = imageUrl
        ? [{
                type: 'paragraph',
                children: [{ type: 'image', url: imageUrl, alt: caption ?? undefined }]
            }]
        : [];
    return { blocks, nextIndex: i };
}
652
/**
 * Parse a `center` environment.
 *
 * When the inner content contains one or more `\includegraphics` commands,
 * one image paragraph (without alt text) is emitted per command and any other
 * inner content is ignored. When it contains none, the inner content is
 * parsed as ordinary blocks instead of being discarded, so centered text,
 * lists and the like survive the conversion.
 *
 * @param {string[]} lines - All lines being parsed.
 * @param {number} start - Index of the `\begin{center}` line.
 * @returns {{blocks: object[], nextIndex: number}} The produced blocks and
 *   the index of the first line after the environment.
 */
function parseCenterEnv(lines, start) {
    const { inner, nextIndex } = parseGenericEnv(lines, start, 'center');
    const blocks = [];
    for (const match of inner.matchAll(/\\includegraphics(?:\[[^\]]*\])?\{([^{}]+)\}/g)) {
        blocks.push({
            type: 'paragraph',
            children: [{ type: 'image', url: match[1].trim(), alt: undefined }]
        });
    }
    if (blocks.length === 0) {
        // No images: centering has no Markdown equivalent, but the content does.
        blocks.push(...parseBlocks(inner));
    }
    return { blocks, nextIndex };
}
668
/**
 * Parse a `longtable` / `longtblr` environment into a table node.
 *
 * Only inner lines containing `&` are treated as rows; everything else
 * (rules such as `\hline` / `\toprule`, `\caption{...}`, `\endfirsthead`,
 * column specs, ...) contains no `&` and is therefore skipped. The first row
 * that yields at least one cell becomes the header; the rest are body rows.
 *
 * @param {string[]} lines - All lines being parsed.
 * @param {number} start - Index of the `\begin{longtable...}` line.
 * @returns {{node: {type: 'table', headerRow: string[], rows: string[][]}, nextIndex: number}}
 *   The table (header is `[]` when no row was found) and the index of the
 *   first line after the environment.
 */
function parseLongTableEnv(lines, start) {
    const opener = lines[start].trim();
    const endTag = opener.startsWith('\\begin{longtblr}') ? '\\end{longtblr}' : '\\end{longtable}';
    const bodyLines = [];
    let i = start + 1;
    for (; i < lines.length; i++) {
        if (lines[i].trim().startsWith(endTag)) {
            // Step past the closing tag before leaving the loop.
            i++;
            break;
        }
        bodyLines.push(lines[i]);
    }
    const parsedRows = bodyLines
        .map((raw) => raw.trim())
        .filter((text) => text.includes('&'))
        .map((text) => parseTableRowLatex(text))
        .filter((cells) => cells.length > 0);
    const [headerRow = [], ...rows] = parsedRows;
    return {
        node: { type: 'table', headerRow, rows },
        nextIndex: i
    };
}
713
/**
 * Remove longtable head/foot markers (`\endfirsthead`, `\endhead`,
 * `\endfoot`, `\endlastfoot`) and the whitespace following them from a row,
 * so they do not leak into cell text. The result is trimmed.
 *
 * @param {string} line - One table row line.
 * @returns {string} The row without the markers.
 */
function stripTableRowControl(line) {
    // Applied one after another, in this order.
    const markers = [
        /\\endfirsthead\s*/g,
        /\\endhead\s*/g,
        /\\endfoot\s*/g,
        /\\endlastfoot\s*/g,
    ];
    let cleaned = line;
    for (const marker of markers) {
        cleaned = cleaned.replace(marker, '');
    }
    return cleaned.trim();
}
722
/**
 * Return the body of the first `\begin{tabular}[pos]{cols} ... \end{tabular}`.
 *
 * The optional `[pos]` argument is skipped and the column spec is read as a
 * balanced brace group, so specs with nested braces (e.g. `@{}l@{}`) work.
 *
 * @param {string} s - Text that may contain a tabular environment.
 * @returns {string | null} The trimmed text between the column spec and the
 *   next `\end{tabular}`, or `null` when any part is missing.
 */
function extractOneTabular(s) {
    const opener = '\\begin{tabular}';
    const at = s.indexOf(opener);
    if (at === -1) {
        return null;
    }
    const afterOpener = at + opener.length;
    const optional = /^\s*\[[^\]]*\]/.exec(s.slice(afterOpener));
    const searchFrom = afterOpener + (optional ? optional[0].length : 0);
    const braceAt = s.indexOf('{', searchFrom);
    if (braceAt === -1) {
        return null;
    }
    const colSpec = extractBraced(s, braceAt);
    if (!colSpec) {
        return null;
    }
    const closeAt = s.indexOf('\\end{tabular}', colSpec.end);
    if (closeAt === -1) {
        return null;
    }
    return s.slice(colSpec.end, closeAt).trim();
}
744
/**
 * Replace the first `\begin{tabular} ... \end{tabular}` in `s` with its
 * body converted by `cellContentToMarkdown`. A body that converts to an empty
 * string is replaced by a single space.
 *
 * @param {string} s - Text that may contain a tabular environment.
 * @returns {string} The text with one environment replaced, or `s` unchanged
 *   when no complete environment is found.
 */
function replaceOneTabular(s) {
    const inner = extractOneTabular(s);
    if (inner === null) {
        return s;
    }
    const closer = '\\end{tabular}';
    const from = s.indexOf('\\begin{tabular}');
    const to = s.indexOf(closer, from);
    if (to === -1) {
        return s;
    }
    const replacement = cellContentToMarkdown(inner) || ' ';
    const before = s.slice(0, from);
    const after = s.slice(to + closer.length);
    return before + replacement + after;
}
757
/**
 * Normalize a few LaTeX escapes inside a table cell, in this order:
 * `\\` (line break) becomes a space, `\_` becomes `_`, and `\~` becomes a
 * space.
 *
 * @param {string} t - Cell text.
 * @returns {string} The cell text with those sequences replaced.
 */
function normalizeTableCellLatex(t) {
    return t
        .replace(/\\\\/g, ' ')
        .replace(/\\_/g, '_')
        .replace(/\\~/g, ' ');
}
765
/**
 * Convert the LaTeX content of one table cell to Markdown-friendly text.
 *
 * Steps, in order: drop `\hline` and head/foot markers; rewrite `\( ... \)`
 * as `$...$` and trim the inside of existing `$...$`; inline nested
 * `tabular` environments; unwrap `\multicolumn`; strip formatting commands;
 * unwrap a single brace group spanning the whole cell; normalize escapes;
 * collapse whitespace and trim.
 *
 * @param {string} cell - Raw cell text.
 * @returns {string} The converted cell text.
 */
function cellContentToMarkdown(cell) {
    // Apply `step` while `probe` still matches and the step keeps changing the text.
    const rewriteWhile = (text, probe, step) => {
        let current = text;
        while (probe.test(current)) {
            const next = step(current);
            if (next === current) {
                break;
            }
            current = next;
        }
        return current;
    };
    const tidyMath = (_, math) => '$' + math.trim() + '$';
    let text = stripTableRowControl(cell.replace(/\\hline/g, ''));
    // Inline math: \( ... \) → $...$, and trim the inside of existing $...$.
    text = text.replace(/\\\(([\s\S]*?)\\\)/g, tidyMath);
    text = text.replace(/\$([^$]+)\$/g, tidyMath);
    // Nested tabular environments are replaced by their converted body.
    text = rewriteWhile(text, /\\begin\{tabular\}/, replaceOneTabular);
    // \multicolumn{1}{m{4cm}}{\centering Symbol} → Symbol
    text = rewriteWhile(text, /\\multicolumn\s*\{/, replaceOneMulticolumn);
    text = stripLatexForPlainText(text);
    // Unwrap one top-level { ... } that spans the whole cell (e.g. longtblr cells).
    const candidate = text.trim();
    if (candidate.startsWith('{')) {
        const group = extractBraced(candidate, 0);
        if (group && group.end === candidate.length) {
            text = group.content;
        }
    }
    return normalizeTableCellLatex(text).replace(/\s+/g, ' ').trim();
}
798
/**
 * Parse one LaTeX table row into converted cell strings.
 *
 * A trailing `\\` row terminator and any head/foot markers are removed, the
 * row is split on unescaped `&`, and each cell is converted with
 * `cellContentToMarkdown`.
 *
 * @param {string} line - One row of a LaTeX table.
 * @returns {string[]} The converted cells, in column order.
 */
function parseTableRowLatex(line) {
    const withoutTerminator = line.replace(/\\\\\s*$/, '').trim();
    const rowText = stripTableRowControl(withoutTerminator);
    return splitByUnescaped(rowText, '&').map((segment) => cellContentToMarkdown(segment.trim()));
}
809
/**
 * Split `str` on every occurrence of the single character `char` that is not
 * immediately preceded by a backslash. The separators are dropped; escaped
 * occurrences stay in the output, backslash included.
 *
 * @param {string} str - Text to split.
 * @param {string} char - One-character separator.
 * @returns {string[]} The pieces; always at least one element.
 */
function splitByUnescaped(str, char) {
    // First pass: positions of the separators that count.
    const cutPoints = [];
    for (let k = 0; k < str.length; k++) {
        const escaped = k > 0 && str[k - 1] === '\\';
        if (str[k] === char && !escaped) {
            cutPoints.push(k);
        }
    }
    // Second pass: slice between consecutive cut points.
    const pieces = [];
    let from = 0;
    for (const cut of cutPoints) {
        pieces.push(str.slice(from, cut));
        from = cut + 1;
    }
    pieces.push(str.slice(from));
    return pieces;
}
821
+ /**
822
+ * Parse a single line of LaTeX into inline nodes.
823
+ * Handles \textbf{}, \textit{}, \texttt{}, \href{url}{text}, \includegraphics{}, $...$
824
+ */
825
+ function parseInlineLatex(line) {
826
+ const out = [];
827
+ let i = 0;
828
+ while (i < line.length) {
829
+ // \textbackslash → single backslash character (for \ in output)
830
+ if (line.slice(i).startsWith('\\textbackslash')) {
831
+ out.push({ type: 'text', value: '\\' });
832
+ i += '\\textbackslash'.length;
833
+ continue;
834
+ }
835
+ // \verb<delim>...<delim> → verbatim text (e.g. \verb|\| → backslash)
836
+ if (line.slice(i).startsWith('\\verb')) {
837
+ const verbStart = i + 5;
838
+ if (verbStart < line.length) {
839
+ const delim = line[verbStart];
840
+ const endIdx = line.indexOf(delim, verbStart + 1);
841
+ if (endIdx !== -1) {
842
+ const verbContent = line.slice(verbStart + 1, endIdx);
843
+ out.push({ type: 'text', value: verbContent });
844
+ i = endIdx + 1;
845
+ continue;
846
+ }
847
+ }
848
+ }
849
+ // Inline math \( ... \)
850
+ if (line.slice(i).startsWith('\\(')) {
851
+ const end = line.indexOf('\\)', i + 2);
852
+ if (end !== -1) {
853
+ out.push({ type: 'math_inline', content: line.slice(i + 2, end).trim() });
854
+ i = end + 2;
855
+ continue;
856
+ }
857
+ }
858
+ // Inline math $ ... $
859
+ if (line[i] === '$' && i + 1 < line.length && line[i + 1] !== '$') {
860
+ const end = line.indexOf('$', i + 1);
861
+ if (end !== -1) {
862
+ out.push({ type: 'math_inline', content: line.slice(i + 1, end).trim() });
863
+ i = end + 1;
864
+ continue;
865
+ }
866
+ }
867
+ // \textbf{...} — trim content so "** word **" becomes "**word**"
868
+ if (line.slice(i).startsWith('\\textbf{')) {
869
+ const open = line.indexOf('{', i);
870
+ const content = extractBraced(line, open);
871
+ if (content) {
872
+ const trimmed = content.content.trim();
873
+ out.push({ type: 'strong', children: parseInlineLatex(trimmed) });
874
+ i = content.end;
875
+ continue;
876
+ }
877
+ }
878
+ // \textit{...} or \emph{...} — trim content for correct *word* spacing
879
+ if (line.slice(i).startsWith('\\textit{') || line.slice(i).startsWith('\\emph{')) {
880
+ const open = line.indexOf('{', i);
881
+ const content = extractBraced(line, open);
882
+ if (content) {
883
+ const trimmed = content.content.trim();
884
+ out.push({ type: 'emphasis', children: parseInlineLatex(trimmed) });
885
+ i = content.end;
886
+ continue;
887
+ }
888
+ }
889
+ // \texttt{...} — parse inner so \textbackslash etc. resolve; then collapse to literal string
890
+ if (line.slice(i).startsWith('\\texttt{')) {
891
+ const open = line.indexOf('{', i);
892
+ const content = extractBraced(line, open);
893
+ if (content) {
894
+ const innerNodes = parseInlineLatex(content.content);
895
+ const literal = innerNodes
896
+ .map((n) => (n.type === 'text' ? n.value : n.type === 'unknown_inline' ? n.raw : ''))
897
+ .join('');
898
+ out.push({ type: 'inline_code', value: literal });
899
+ i = content.end;
900
+ continue;
901
+ }
902
+ }
903
+ // \sout{...} (strikethrough, ulem package)
904
+ if (line.slice(i).startsWith('\\sout{')) {
905
+ const open = line.indexOf('{', i);
906
+ const content = extractBraced(line, open);
907
+ if (content) {
908
+ out.push({ type: 'strikethrough', children: parseInlineLatex(content.content) });
909
+ i = content.end;
910
+ continue;
911
+ }
912
+ }
913
+ // \textasciitilde{}\textasciitilde{}...\textasciitilde{}\textasciitilde{} (old escaped ~~) → strikethrough
914
+ if (line.slice(i).startsWith('\\textasciitilde{}\\textasciitilde{}')) {
915
+ const prefix = '\\textasciitilde{}\\textasciitilde{}';
916
+ let pos = i + prefix.length;
917
+ const innerStart = pos;
918
+ const closePattern = '\\textasciitilde{}\\textasciitilde{}';
919
+ const closeIdx = line.indexOf(closePattern, pos);
920
+ if (closeIdx !== -1) {
921
+ const inner = line.slice(innerStart, closeIdx);
922
+ out.push({ type: 'strikethrough', children: parseInlineLatex(inner) });
923
+ i = closeIdx + closePattern.length;
924
+ continue;
925
+ }
926
+ }
927
+ // \href{url}{text}
928
+ const hrefRe = /\\href\{([^{}]+)\}\{/;
929
+ const hrefMatch = line.slice(i).match(hrefRe);
930
+ if (hrefMatch && line.slice(i).startsWith('\\href{')) {
931
+ const url = hrefMatch[1];
932
+ const openBrace = i + hrefMatch[0].length - 1;
933
+ const content = extractBraced(line, openBrace);
934
+ if (content) {
935
+ out.push({ type: 'link', url, children: parseInlineLatex(content.content) });
936
+ i = content.end;
937
+ continue;
938
+ }
939
+ }
940
+ // \includegraphics[...]{path}
941
+ const incRe = /\\includegraphics(?:\[[^\]]*\])?\{([^{}]+)\}/;
942
+ const incMatch = line.slice(i).match(incRe);
943
+ if (incMatch) {
944
+ out.push({ type: 'image', url: incMatch[1] });
945
+ i += incMatch[0].length;
946
+ continue;
947
+ }
948
+ // Skip known control commands (no visible output)
949
+ if (/^\\(noindent|centering|raggedright|raggedleft|smallskip|bigskip|newline)\b/.test(line.slice(i))) {
950
+ const skip = line.slice(i).match(/^\\[a-zA-Z]+\*?(\[[^\]]*\])?/)?.[0]?.length ?? 0;
951
+ if (skip > 0) {
952
+ i += skip;
953
+ continue;
954
+ }
955
+ }
956
+ // Escaped char \# \% etc.
957
+ if (line[i] === '\\' && i + 1 < line.length) {
958
+ const next = line[i + 1];
959
+ if (/[#%&_{}$]/.test(next)) {
960
+ out.push({ type: 'text', value: next });
961
+ i += 2;
962
+ continue;
963
+ }
964
+ }
965
+ // \ref{...} → placeholder (no cross-ref resolution)
966
+ if (line.slice(i).startsWith('\\ref{')) {
967
+ const open = line.indexOf('{', i);
968
+ const braced = extractBraced(line, open);
969
+ if (braced) {
970
+ out.push({ type: 'text', value: '[ref]' });
971
+ i = braced.end;
972
+ continue;
973
+ }
974
+ }
975
+ // \cite{...} → link to #ref-{key} with text [key] (for bibliography jump)
976
+ if (line.slice(i).startsWith('\\cite{')) {
977
+ const open = line.indexOf('{', i);
978
+ const braced = extractBraced(line, open);
979
+ if (braced) {
980
+ const key = braced.content.split(',')[0].trim();
981
+ out.push({ type: 'link', url: '#ref-' + key, children: [{ type: 'text', value: '[' + key + ']' }] });
982
+ i = braced.end;
983
+ continue;
984
+ }
985
+ }
986
+ // Unknown LaTeX command: fallback as raw (e.g. custom macros)
987
+ if (line[i] === '\\' && i + 1 < line.length) {
988
+ const nameMatch = line.slice(i).match(/^\\([a-zA-Z@]+)\*?(\[[^\]]*\])?/);
989
+ if (nameMatch) {
990
+ let len = nameMatch[0].length;
991
+ const restIdx = i + len;
992
+ if (restIdx < line.length && line[restIdx] === '{') {
993
+ const braced = extractBraced(line, restIdx);
994
+ if (braced) {
995
+ len = braced.end - i;
996
+ }
997
+ }
998
+ out.push({ type: 'unknown_inline', raw: line.slice(i, i + len) });
999
+ i += len;
1000
+ continue;
1001
+ }
1002
+ }
1003
+ // Plain text: stop at next \command, \(, or \)
1004
+ const rest = line.slice(i);
1005
+ const nextLetter = rest.search(/\\[a-zA-Z@]+/);
1006
+ const nextOpenMath = rest.indexOf('\\(');
1007
+ const nextCloseMath = rest.indexOf('\\)');
1008
+ let end = line.length;
1009
+ if (nextLetter !== -1)
1010
+ end = Math.min(end, i + nextLetter);
1011
+ if (nextOpenMath !== -1)
1012
+ end = Math.min(end, i + nextOpenMath);
1013
+ if (nextCloseMath !== -1)
1014
+ end = Math.min(end, i + nextCloseMath);
1015
+ if (end > i) {
1016
+ const raw = line.slice(i, end);
1017
+ const unescaped = raw.replace(/\\([#%&_{}$])/g, '$1');
1018
+ if (unescaped.length > 0) {
1019
+ out.push({ type: 'text', value: unescaped });
1020
+ }
1021
+ i = end;
1022
+ }
1023
+ else {
1024
+ i++;
1025
+ }
1026
+ }
1027
+ return out;
1028
+ }
1029
+ //# sourceMappingURL=latexParser.js.map