@nataliapc/mcp-openmsx 1.2.5 → 1.2.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,745 @@
1
+ #!/usr/bin/env python3
2
+ """Convert a LyX file to Markdown, preserving math and formatting."""
3
+
4
+ import re
5
+ import sys
6
+
7
+
8
def parse_lyx(filepath):
    """Load a LyX document from disk as a list of raw lines.

    The file is decoded as UTF-8; bytes that cannot be decoded are replaced
    rather than raising, so a partially corrupt file can still be converted.
    Each returned line keeps its trailing newline, if it had one.
    """
    with open(filepath, mode="r", encoding="utf-8", errors="replace") as handle:
        contents = list(handle)
    return contents
12
+
13
+
14
class LyxToMarkdown:
    """Convert the body of a LyX document into Markdown text.

    The converter walks the document line by line.  ``self.pos`` is the index
    of the line currently being processed; inset handlers advance it
    themselves while they consume the lines that belong to an inset.  Text is
    accumulated per layout (paragraph) in ``self.current_text`` and emitted to
    ``self.output`` when the layout ends.  Nested layouts (for example the
    ``Plain Layout`` inside a caption) are handled with ``self.layout_stack``.

    Spacing is heuristic: every source line is stripped, and separating
    spaces are re-inserted between fragments.  The control character
    ``\\x01`` is used as a "soft space" after a closing emphasis marker: it
    becomes a real space unless punctuation follows.
    """

    # Markdown delimiter for each inline style that is tracked.
    _STYLE_MARKERS = {"emph": "*", "bold": "**", "italic": "*"}

    # Heading-like layout contexts and the Markdown prefix they are emitted with.
    _HEADING_PREFIX = {
        "title": "#",
        "chapter": "##",
        "section": "###",
        "subsection": "####",
        "subsubsection": "#####",
    }

    # Exact layout names -> internal context.  "Chapter…" and "Plain…" are
    # matched by prefix in _handle_layout_begin.  Starred (unnumbered)
    # sections are headings too, matching what _collect_labels recognises.
    _LAYOUT_CONTEXTS = {
        "Title": "title",
        "Author": "author",
        "Section": "section",
        "Section*": "section",
        "Subsection": "subsection",
        "Subsection*": "subsection",
        "Subsubsection": "subsubsection",
        "Subsubsection*": "subsubsection",
        "Enumerate": "enumerate",
        "Itemize": "itemize",
        "Standard": "standard",
    }

    # No separating space is inserted after text ending in one of these.
    _NO_SPACE_AFTER = (" ", "\n", "(", "[", "*", "\u201c", "\u2018")
    # No separating space is inserted before text starting with one of these.
    _NO_SPACE_BEFORE = ".,;:!?)-]"

    _HEADING_LAYOUT_RE = re.compile(
        r"\\begin_layout (Chapter\*?|Section|Subsection|Subsubsection)"
    )

    def __init__(self):
        self._reset()

    def _reset(self):
        """(Re)initialise all conversion state."""
        self.output = []
        self.lines = []
        self.pos = 0
        self.in_body = False
        self.enumerate_counter = 0
        self.label_map = {}  # label name -> section title / caption text
        self.current_section = ""
        # Stack of (layout_context, accumulated_text, open_styles) saved
        # while a nested layout is being processed.
        self.layout_stack = []
        self.current_text = ""
        self._layout_context = "unknown"
        # Inline styles currently open in the active layout, in opening order.
        self._open_styles = []
        # Nesting level of \begin_deeper ... \end_deeper blocks.
        self._deeper_depth = 0

    # ------------------------------------------------------------------
    # Entry points
    # ------------------------------------------------------------------

    def convert(self, filepath):
        """Convert the LyX file at *filepath* and return the Markdown text."""
        return self.convert_lines(parse_lyx(filepath))

    def convert_lines(self, lines):
        """Convert LyX source given as an iterable of lines.

        State from any previous conversion is discarded first, so one
        converter instance can be reused for several documents.
        """
        self._reset()
        self.lines = list(lines)

        # First pass: map label names to section titles / captions.
        self._collect_labels()

        # Second pass: convert.  Handlers may advance self.pos themselves.
        self.pos = 0
        self.in_body = False
        while self.pos < len(self.lines):
            self._process_line(self.lines[self.pos].rstrip("\n"))
            self.pos += 1

        return "\n".join(self.output)

    # ------------------------------------------------------------------
    # First pass
    # ------------------------------------------------------------------

    def _first_text_line(self, start, stop):
        """Return the first plain-text line in ``lines[start:stop]`` or ""."""
        for idx in range(start, stop):
            candidate = self.lines[idx].strip()
            if candidate and not candidate.startswith(("\\", "<")):
                return candidate
        return ""

    def _collect_labels(self):
        """Map each label name to the most recent heading or float caption."""
        current_heading = ""
        total = len(self.lines)
        for i, raw in enumerate(self.lines):
            line = raw.strip()

            # Heading layouts: the title is the next plain-text line.
            if self._HEADING_LAYOUT_RE.match(line):
                title = self._first_text_line(i + 1, min(i + 5, total))
                if title:
                    current_heading = title

            # Label definitions:  name "sec:foo"
            if line.startswith('name "') and line.endswith('"'):
                label_name = line[6:-1]
                if label_name and current_heading:
                    self.label_map[label_name] = current_heading

            # Floats: labels that follow refer to the caption text.
            if line.startswith("\\begin_inset Float"):
                for j in range(i + 1, min(i + 50, total)):
                    # Prefix match: newer LyX writes "Caption Standard".
                    if self.lines[j].strip().startswith("\\begin_inset Caption"):
                        caption = self._first_text_line(j + 1, min(j + 10, total))
                        if caption:
                            current_heading = caption
                        break

    # ------------------------------------------------------------------
    # Second pass: line dispatch
    # ------------------------------------------------------------------

    def _process_line(self, line):
        """Dispatch one source line (without its trailing newline)."""
        stripped = line.strip()

        if stripped == "\\begin_body":
            self.in_body = True
            return
        if not self.in_body:
            return
        if stripped == "\\end_body":
            self.in_body = False
            return

        if stripped.startswith("\\begin_layout "):
            self._handle_layout_begin(stripped[len("\\begin_layout "):])
            return
        if stripped == "\\end_layout":
            self._handle_layout_end()
            return

        if stripped.startswith("\\begin_inset "):
            self._handle_inset_begin(stripped[len("\\begin_inset "):])
            return
        if stripped == "\\end_inset":
            # Insets that need consuming are consumed by their handler.
            return

        if stripped == "\\begin_deeper":
            self._deeper_depth += 1
            return
        if stripped == "\\end_deeper":
            self._deeper_depth = max(0, self._deeper_depth - 1)
            return

        # Inset parameter lines that carry no content.
        if stripped in ("status open", "status collapsed", "status inlined"):
            return
        if stripped.startswith(("wide ", "sideways ")):
            return

        # Inline styles.  Any value other than the one that switches the
        # style on switches it off (e.g. "\shape smallcaps" ends italics).
        if stripped.startswith("\\emph "):
            self._set_style("emph", stripped.split()[1] == "on")
            return
        if stripped.startswith("\\series "):
            self._set_style("bold", stripped.split()[1] == "bold")
            return
        if stripped.startswith("\\shape "):
            self._set_style("italic", stripped.split()[1] == "italic")
            return

        if stripped == "\\backslash":
            self.current_text += "\\"
            return

        # Every other backslash line is a LyX directive (literal backslashes
        # are written as \backslash), never document text.
        if stripped.startswith("\\"):
            return

        # Regular text; lines starting with "<" are XML-like tags.
        if stripped and not stripped.startswith("<"):
            self._append_text(stripped)

    # ------------------------------------------------------------------
    # Text accumulation helpers
    # ------------------------------------------------------------------

    def _ensure_space(self):
        """Make sure inline content appended next is separated by a space."""
        if self.current_text.endswith("\x01"):
            self.current_text = self.current_text[:-1] + " "
        elif self.current_text and not self.current_text.endswith(
            self._NO_SPACE_AFTER
        ):
            self.current_text += " "

    def _append_text(self, text):
        """Append a non-empty text fragment with heuristic spacing."""
        if text[0] in self._NO_SPACE_BEFORE:
            # Punctuation attaches directly; drop a pending soft space.
            if self.current_text.endswith("\x01"):
                self.current_text = self.current_text[:-1]
        else:
            self._ensure_space()
        self.current_text += text

    def _set_style(self, style, enable):
        """Open or close an inline style, keeping Markdown markers balanced."""
        if enable:
            self._open_style(style)
        else:
            self._close_style(style)

    def _open_style(self, style):
        if style in self._open_styles:
            return
        self._ensure_space()
        self.current_text += self._STYLE_MARKERS[style]
        self._open_styles.append(style)

    def _close_style(self, style):
        # Only close what was opened; a bare "\shape default" must not
        # leave a stray "*" in the output.
        if style not in self._open_styles:
            return
        self._open_styles.remove(style)
        if self.current_text.endswith(" "):
            self.current_text = self.current_text[:-1]
        self.current_text += self._STYLE_MARKERS[style] + "\x01"

    # ------------------------------------------------------------------
    # Layouts
    # ------------------------------------------------------------------

    def _handle_layout_begin(self, layout_type):
        """Start a layout, saving the enclosing layout's state on the stack."""
        self.layout_stack.append(
            (self._layout_context, self.current_text, self._open_styles)
        )

        if layout_type.startswith("Chapter"):
            ctx = "chapter"
        elif layout_type == "Plain" or layout_type.startswith("Plain "):
            ctx = "plain"
        else:
            ctx = self._LAYOUT_CONTEXTS.get(layout_type, "unknown")
        if ctx == "enumerate":
            self.enumerate_counter += 1

        self._layout_context = ctx
        self.current_text = ""
        self._open_styles = []

    def _close_list(self):
        """Terminate a preceding list before a non-list block is emitted."""
        self.enumerate_counter = 0
        # List items are the only entries not followed by a blank line; add
        # one so the next block is not a lazy continuation of the last item.
        if self.output and self.output[-1] != "":
            self.output.append("")

    def _handle_layout_end(self):
        """Finish the current layout, emit it and restore the parent state."""
        # LyX omits the closing font directive when a style runs to the end
        # of the paragraph, so balance the markers here.
        for style in reversed(self._open_styles[:]):
            self._close_style(style)

        text = self._clean_text(self.current_text)
        ctx = self._layout_context

        if self.layout_stack:
            parent_ctx, parent_text, parent_styles = self.layout_stack.pop()
        else:
            parent_ctx, parent_text, parent_styles = "unknown", "", []

        # True for layouts that are neither inside an inset nor nested in a
        # list item via \begin_deeper.
        top_level = not self.layout_stack and self._deeper_depth == 0

        if ctx in self._HEADING_PREFIX:
            if top_level:
                self._close_list()
            self.output.append(f"{self._HEADING_PREFIX[ctx]} {text}")
            self.output.append("")
            if ctx != "title":
                self.current_section = text
                self.enumerate_counter = 0
        elif ctx == "author":
            if top_level:
                self._close_list()
            self.output.append(f"*{text}*")
            self.output.append("")
        elif ctx == "enumerate":
            self.output.append(f"{self.enumerate_counter}. {text}")
        elif ctx == "itemize":
            if top_level:
                # A numbered list after these bullets starts again at 1.
                self.enumerate_counter = 0
            self.output.append(f"- {text}")
        elif ctx == "plain":
            # Plain layouts live inside insets: hand the text to the parent.
            if text.strip():
                parent_text += " " + text
        elif text.strip():
            # "standard" and "unknown" layouts become ordinary paragraphs.
            if top_level:
                self._close_list()
            self.output.append(text)
            self.output.append("")

        self._layout_context = parent_ctx
        self.current_text = parent_text
        self._open_styles = parent_styles

    # ------------------------------------------------------------------
    # Insets
    # ------------------------------------------------------------------

    def _handle_inset_begin(self, inset_spec):
        """Dispatch on the inset type named after ``\\begin_inset``."""
        if inset_spec.startswith("Formula"):
            self._handle_formula(inset_spec)
        elif inset_spec.startswith("Quotes"):
            self._handle_quotes(inset_spec)
        elif inset_spec.startswith("Newline"):
            self.current_text += "  \n"
        elif inset_spec.startswith("Newpage"):
            pass
        elif inset_spec.startswith("CommandInset label"):
            self._handle_label()
        elif inset_spec.startswith("CommandInset ref"):
            self._handle_ref()
        elif inset_spec.startswith("Graphics"):
            self._handle_graphics()
        elif inset_spec.startswith("Float"):
            pass  # container only; nested layouts carry the content
        elif inset_spec.startswith("Caption"):
            self.current_text += "\n\n*Caption:* "
        elif inset_spec.startswith("Tabular"):
            self._handle_tabular()
        elif inset_spec.startswith("Box"):
            self._handle_box()
        elif inset_spec.startswith("Text"):
            pass  # container only
        elif inset_spec.startswith("ERT"):
            self._handle_ert()

    def _read_until_inset_end(self):
        """Consume lines up to the next ``\\end_inset``.

        Returns the stripped, non-blank lines that were consumed.  Leaves
        ``self.pos`` on the ``\\end_inset`` line (or on the last line of the
        file if the inset is unterminated).
        """
        collected = []
        while self.pos + 1 < len(self.lines):
            self.pos += 1
            nline = self.lines[self.pos].strip()
            if nline == "\\end_inset":
                break
            if nline:
                collected.append(nline)
        return collected

    def _skip_nested_inset(self):
        """Skip an inset together with any insets nested inside it."""
        depth = 1
        while self.pos + 1 < len(self.lines):
            self.pos += 1
            nline = self.lines[self.pos].strip()
            if nline.startswith("\\begin_inset"):
                depth += 1
            elif nline == "\\end_inset":
                depth -= 1
                if depth <= 0:
                    return

    def _handle_box(self):
        """Consume Box inset parameters (position, hor_pos, etc.)."""
        # Parameters run until the "status ..." line; the box content itself
        # is handled by the nested layouts.
        while self.pos + 1 < len(self.lines):
            self.pos += 1
            nline = self.lines[self.pos].strip()
            if nline.startswith("status "):
                return
            if nline.startswith("\\begin_layout") or nline == "\\end_inset":
                # Went one line too far: let the main loop see this line.
                self.pos -= 1
                return

    def _handle_formula(self, inset_spec):
        """Handle Formula insets: inline ``$...$`` and display math."""
        head = inset_spec[len("Formula "):].strip()
        parts = ([head] if head else []) + self._read_until_inset_end()

        if head.startswith("$"):
            # Inline math.  Continuation lines are joined with a space so a
            # control word at a line end is not fused with what follows.
            content = " ".join(parts)
            if len(content) >= 2 and content.endswith("$"):
                content = "$" + content[1:-1].strip() + "$"
            self._ensure_space()
            self.current_text += content
        elif head.startswith("\\["):
            # Display math: \[ ... \]
            math_content = "\n".join(parts).strip()
            if math_content.startswith("\\["):
                math_content = math_content[2:]
            if math_content.endswith("\\]"):
                math_content = math_content[:-2]
            self.current_text += "\n\n$$\n" + math_content.strip() + "\n$$\n\n"
        elif head.startswith("\\begin{"):
            # LaTeX environment (align, eqnarray, ...)
            math_content = self._convert_latex_env("\n".join(parts))
            self.current_text += "\n\n$$\n" + math_content.strip() + "\n$$\n\n"
        else:
            # Unknown formula type: pass the source through.
            self.current_text += " ".join(parts)

    def _convert_latex_env(self, content):
        """Strip environment wrappers and labels to get plain display math."""
        content = re.sub(
            r"\\(?:begin|end)\{(?:eqnarray|align|equation)\*?\}", "", content
        )
        content = re.sub(r"\\(?:begin|end)\{gathered\}", "", content)
        content = re.sub(r"\\begin\{array\}\{[^}]*\}", "", content)
        content = re.sub(r"\\end\{array\}", "", content)
        content = re.sub(r"\\label\{[^}]*\}", "", content)
        return content.strip()

    def _handle_quotes(self, inset_spec):
        """Handle quote insets (English-style double and single quotes)."""
        if "eld" in inset_spec:
            self._append_opening_quote("\u201c")
        elif "erd" in inset_spec:
            self._append_closing_quote("\u201d")
        elif "els" in inset_spec:
            self._append_opening_quote("\u2018")
        elif "ers" in inset_spec:
            self._append_closing_quote("\u2019")
        else:
            self.current_text += '"'
        self._read_until_inset_end()

    def _append_opening_quote(self, char):
        self._ensure_space()
        self.current_text += char

    def _append_closing_quote(self, char):
        # A closing quote attaches to the preceding text like punctuation.
        if self.current_text.endswith("\x01"):
            self.current_text = self.current_text[:-1]
        self.current_text += char

    def _handle_label(self):
        """Handle CommandInset label - labels produce no output."""
        self._read_until_inset_end()

    def _handle_ref(self):
        """Handle CommandInset ref - replace the reference by its target text."""
        ref_name = ""
        for nline in self._read_until_inset_end():
            if nline.startswith("reference "):
                ref_name = nline.split('"')[1] if '"' in nline else ""
        # Unresolved references are dropped.
        if ref_name and ref_name in self.label_map:
            self._ensure_space()
            self.current_text += self.label_map[ref_name]

    def _handle_graphics(self):
        """Handle Graphics inset - emit a Markdown image."""
        filename = ""
        for nline in self._read_until_inset_end():
            if nline.startswith("filename "):
                filename = nline[len("filename "):]
        if filename:
            self.current_text += f"\n\n![{filename}]({filename})\n\n"

    def _handle_tabular(self):
        """Handle Tabular inset - parse the lyxtabular format into a table."""
        rows = []
        current_row = []
        cell = ""
        in_cell = False
        depth = 1  # inset nesting; 1 means directly inside the Tabular inset

        while self.pos + 1 < len(self.lines):
            self.pos += 1
            nline = self.lines[self.pos].strip()

            if nline == "\\end_inset":
                depth -= 1
                if depth <= 0:
                    break
                continue

            if nline.startswith("\\begin_inset "):
                if nline.startswith("\\begin_inset Formula"):
                    # The helper consumes the formula's own \end_inset, so
                    # the nesting depth is unchanged.
                    head = nline[len("\\begin_inset Formula"):].strip()
                    parts = ([head] if head else []) + self._read_until_inset_end()
                    if cell and not cell.endswith(" "):
                        cell += " "
                    cell += " ".join(parts)
                elif nline.startswith("\\begin_inset ERT"):
                    # Raw LaTeX (usually \hline) inside tables is dropped.
                    self._skip_nested_inset()
                else:
                    depth += 1
                    if nline.startswith("\\begin_inset Text"):
                        in_cell = True
                        cell = ""
                continue

            if nline.startswith("<row"):
                current_row = []
            elif nline.startswith("</row"):
                if current_row:
                    rows.append(current_row)
                # Clear the row so </lyxtabular> does not append it again.
                current_row = []
            elif nline.startswith("<cell"):
                cell = ""
                in_cell = True
            elif nline.startswith("</cell"):
                current_row.append(cell.strip())
                in_cell = False
            elif nline.startswith("</lyxtabular"):
                # Only a row that was never closed is still pending here.
                if current_row:
                    rows.append(current_row)
                break
            elif in_cell and nline and not nline.startswith(("<", "\\")):
                if cell and not cell.endswith(" "):
                    cell += " "
                cell += nline

        if rows:
            self._emit_markdown_table(rows)

    def _emit_markdown_table(self, rows):
        """Append *rows* to the current text as a Markdown table.

        The first row is used as the header.  Short rows are padded in place
        with empty cells so that every row has the same number of columns.
        """
        if not rows:
            return
        num_cols = max(len(row) for row in rows)
        for row in rows:
            row.extend([""] * (num_cols - len(row)))

        table = ["| " + " | ".join(rows[0]) + " |"]
        table.append("| " + " | ".join(["---"] * num_cols) + " |")
        table.extend("| " + " | ".join(row) + " |" for row in rows[1:])
        self.current_text += "\n\n" + "\n".join(table) + "\n\n"

    def _handle_ert(self):
        """Handle ERT (Evil Red Text = raw LaTeX) insets - skip."""
        self._skip_nested_inset()

    # ------------------------------------------------------------------
    # Clean-up
    # ------------------------------------------------------------------

    def _clean_text(self, text):
        """Normalise whitespace in the accumulated text of one layout."""
        # Leftover soft-space markers become ordinary spaces.
        text = text.replace("\x01", " ").strip()
        # Collapse runs of blanks per line; newlines are kept because
        # display math and tables depend on them.
        text = "\n".join(re.sub(r"[ \t]+", " ", part) for part in text.split("\n"))
        # No space before closing punctuation ...
        text = re.sub(r" ([.,;:!?)\]])", r"\1", text)
        # ... and none after an opening parenthesis/bracket.
        text = re.sub(r"([\[(]) ", r"\1", text)
        return re.sub(r"  +", " ", text)
665
+
666
+
667
def clean_markdown(text):
    """Tidy converter output: blank lines, leftover markers, trailing spaces.

    Returns the cleaned text, stripped of surrounding whitespace and ending
    in exactly one newline.
    """
    # Keep at most one blank (whitespace-only) line in a row.
    kept = []
    for line in text.split("\n"):
        if not line.strip() and kept and not kept[-1].strip():
            continue
        kept.append(line)
    text = "\n".join(kept)

    # Unwrap [?...?] comment markers, leaving only their content.
    text = text.replace("[?", "").replace("?]", "")

    # Squeeze any remaining run of three or more newlines.
    text = re.sub(r"\n{3,}", "\n\n", text)

    # Drop inset-parameter remnants; the parenthesised forms go first so no
    # empty "()" is left behind.
    for remnant in (
        "(LatexCommand label)",
        "(LatexCommand ref)",
        "LatexCommand label",
        "LatexCommand ref",
    ):
        text = text.replace(remnant, "")

    # Remove empty emphasis markers.
    for empty_marker in ("** **", "* *", "****"):
        text = text.replace(empty_marker, "")

    # Headings of level two and deeper need a blank line before them.
    text = re.sub(r"([^\n])\n(#{2,} )", r"\1\n\n\2", text)

    # No trailing whitespace on any line.
    text = "\n".join(line.rstrip() for line in text.split("\n"))

    return text.strip() + "\n"
720
+
721
+
722
def default_output_path(input_file):
    """Return *input_file* with its extension replaced by ``.md``.

    Only an extension on the final path component is replaced, so a dot in
    a directory name (``./v1.2/notes``) is left alone; a file without an
    extension simply gets ``.md`` appended.
    """
    import os  # local import keeps this helper self-contained

    stem, _ext = os.path.splitext(input_file)
    return stem + ".md"


def main():
    """Command-line entry point: ``lyx2md.py <input.lyx> [output.md]``.

    Converts the input file, post-processes the Markdown, and writes it to
    the given output path, or next to the input with a ``.md`` extension.
    Exits with status 1 when no input file is given.
    """
    if len(sys.argv) < 2:
        print("Usage: lyx2md.py <input.lyx> [output.md]")
        sys.exit(1)

    input_file = sys.argv[1]
    if len(sys.argv) > 2:
        output_file = sys.argv[2]
    else:
        output_file = default_output_path(input_file)

    converter = LyxToMarkdown()
    result = converter.convert(input_file)
    result = clean_markdown(result)

    with open(output_file, "w", encoding="utf-8") as f:
        f.write(result)

    print(f"Converted {input_file} -> {output_file}")
    # chr(10) is "\n"; spelled this way to avoid a backslash in the f-string.
    print(f"Output: {len(result)} characters, {result.count(chr(10))} lines")


if __name__ == "__main__":
    main()