regen.mde 0.2.2 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +409 -295
- package/bin/build-corpus-editor.js +5 -3
- package/bin/postinstall.js +259 -187
- package/bin/regen-mdeditor-install.js +1 -1
- package/bin/regen-mdeditor-uninstall.js +1 -1
- package/desktop/BuildCorpusEditor/BuildCorpusBridge.cs +493 -270
- package/desktop/BuildCorpusEditor/EditorForm.cs +853 -540
- package/desktop/BuildCorpusEditor/Program.cs +85 -81
- package/dist/release/regen-mde-0.3.0-win-x64-setup.exe +0 -0
- package/dist/release/{regen.mde-0.2.2-win-x64.zip → regen-mde-0.3.0-win-x64.zip} +0 -0
- package/dist/release/regen-mde-0.7.0-win-x64-setup.exe +0 -0
- package/dist/release/regen-mde-0.7.0-win-x64.zip +0 -0
- package/dist/windows-editor/BuildCorpusEditor.dll +0 -0
- package/dist/windows-editor/BuildCorpusEditor.exe +0 -0
- package/dist/windows-editor/BuildCorpusEditor.pdb +0 -0
- package/dist/windows-editor/wwwroot/assets/index-C_VxJk4k.js +375 -0
- package/dist/windows-editor/wwwroot/assets/index-Wt9zSjIw.css +1 -0
- package/dist/windows-editor/wwwroot/index.html +3 -3
- package/editor-web/index.html +1 -1
- package/editor-web/src/main.jsx +1044 -399
- package/editor-web/src/styles.css +846 -602
- package/installer/install-regen-mde.ps1 +49 -10
- package/installer/regen-mde.nsi +16 -16
- package/package.json +90 -86
- package/pyproject.toml +35 -33
- package/requirements.txt +6 -4
- package/scripts/package-windows-editor.ps1 +8 -8
- package/scripts/release-dual.mjs +105 -0
- package/scripts/run-editor-implementation-plane.ps1 +29 -6
- package/src/build_corpus/docx_exporter.py +1055 -798
- package/src/build_corpus/equations.py +80 -0
- package/src/build_corpus/exporter.py +1488 -1195
- package/src/build_corpus/frontmatter.py +302 -0
- package/src/build_corpus/ppt_exporter.py +543 -532
- package/dist/release/regen.mde-0.2.2-win-x64-setup.exe +0 -0
- package/dist/windows-editor/wwwroot/assets/index-DjJ6xmhy.js +0 -326
- package/dist/windows-editor/wwwroot/assets/index-_dwMNNsm.css +0 -1
|
@@ -1,798 +1,1055 @@
|
|
|
1
|
-
from __future__ import annotations
|
|
2
|
-
|
|
3
|
-
import json
|
|
4
|
-
import os
|
|
5
|
-
import re
|
|
6
|
-
import shutil
|
|
7
|
-
import subprocess
|
|
8
|
-
import tempfile
|
|
9
|
-
from html.parser import HTMLParser
|
|
10
|
-
from contextlib import ExitStack
|
|
11
|
-
from dataclasses import dataclass, field
|
|
12
|
-
from importlib.resources import as_file, files
|
|
13
|
-
from pathlib import Path
|
|
14
|
-
from zipfile import ZipFile
|
|
15
|
-
|
|
16
|
-
from docx import Document
|
|
17
|
-
from docx.enum.style import WD_STYLE_TYPE
|
|
18
|
-
from docx.enum.text import WD_BREAK
|
|
19
|
-
from docx.oxml import OxmlElement
|
|
20
|
-
from docx.oxml.ns import qn
|
|
21
|
-
from docx.shared import Inches, Pt, RGBColor
|
|
22
|
-
from docx.image.exceptions import UnrecognizedImageError
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
def
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
if
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
)
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
|
|
391
|
-
|
|
392
|
-
|
|
393
|
-
|
|
394
|
-
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
|
|
400
|
-
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
|
|
404
|
-
|
|
405
|
-
|
|
406
|
-
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
|
|
415
|
-
|
|
416
|
-
|
|
417
|
-
self.
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
|
|
428
|
-
|
|
429
|
-
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
|
|
438
|
-
|
|
439
|
-
|
|
440
|
-
|
|
441
|
-
|
|
442
|
-
|
|
443
|
-
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
|
|
454
|
-
|
|
455
|
-
|
|
456
|
-
|
|
457
|
-
|
|
458
|
-
|
|
459
|
-
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
|
|
473
|
-
|
|
474
|
-
|
|
475
|
-
|
|
476
|
-
|
|
477
|
-
|
|
478
|
-
|
|
479
|
-
|
|
480
|
-
|
|
481
|
-
|
|
482
|
-
|
|
483
|
-
|
|
484
|
-
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
|
|
488
|
-
|
|
489
|
-
|
|
490
|
-
|
|
491
|
-
|
|
492
|
-
|
|
493
|
-
|
|
494
|
-
|
|
495
|
-
|
|
496
|
-
|
|
497
|
-
|
|
498
|
-
|
|
499
|
-
|
|
500
|
-
|
|
501
|
-
|
|
502
|
-
|
|
503
|
-
|
|
504
|
-
|
|
505
|
-
|
|
506
|
-
|
|
507
|
-
|
|
508
|
-
|
|
509
|
-
|
|
510
|
-
|
|
511
|
-
|
|
512
|
-
|
|
513
|
-
|
|
514
|
-
|
|
515
|
-
|
|
516
|
-
|
|
517
|
-
|
|
518
|
-
|
|
519
|
-
|
|
520
|
-
|
|
521
|
-
|
|
522
|
-
|
|
523
|
-
|
|
524
|
-
|
|
525
|
-
|
|
526
|
-
|
|
527
|
-
|
|
528
|
-
|
|
529
|
-
|
|
530
|
-
|
|
531
|
-
|
|
532
|
-
|
|
533
|
-
|
|
534
|
-
|
|
535
|
-
|
|
536
|
-
|
|
537
|
-
|
|
538
|
-
|
|
539
|
-
|
|
540
|
-
|
|
541
|
-
|
|
542
|
-
|
|
543
|
-
|
|
544
|
-
|
|
545
|
-
|
|
546
|
-
|
|
547
|
-
|
|
548
|
-
|
|
549
|
-
|
|
550
|
-
|
|
551
|
-
|
|
552
|
-
|
|
553
|
-
|
|
554
|
-
|
|
555
|
-
|
|
556
|
-
|
|
557
|
-
|
|
558
|
-
|
|
559
|
-
|
|
560
|
-
|
|
561
|
-
|
|
562
|
-
|
|
563
|
-
|
|
564
|
-
|
|
565
|
-
|
|
566
|
-
|
|
567
|
-
|
|
568
|
-
|
|
569
|
-
|
|
570
|
-
|
|
571
|
-
|
|
572
|
-
|
|
573
|
-
|
|
574
|
-
|
|
575
|
-
|
|
576
|
-
|
|
577
|
-
|
|
578
|
-
|
|
579
|
-
|
|
580
|
-
index
|
|
581
|
-
|
|
582
|
-
|
|
583
|
-
|
|
584
|
-
|
|
585
|
-
|
|
586
|
-
|
|
587
|
-
|
|
588
|
-
|
|
589
|
-
|
|
590
|
-
|
|
591
|
-
|
|
592
|
-
|
|
593
|
-
|
|
594
|
-
|
|
595
|
-
|
|
596
|
-
|
|
597
|
-
|
|
598
|
-
|
|
599
|
-
|
|
600
|
-
if
|
|
601
|
-
|
|
602
|
-
|
|
603
|
-
|
|
604
|
-
|
|
605
|
-
|
|
606
|
-
|
|
607
|
-
|
|
608
|
-
|
|
609
|
-
|
|
610
|
-
|
|
611
|
-
|
|
612
|
-
|
|
613
|
-
|
|
614
|
-
|
|
615
|
-
|
|
616
|
-
|
|
617
|
-
|
|
618
|
-
|
|
619
|
-
|
|
620
|
-
|
|
621
|
-
|
|
622
|
-
|
|
623
|
-
|
|
624
|
-
|
|
625
|
-
|
|
626
|
-
|
|
627
|
-
|
|
628
|
-
|
|
629
|
-
|
|
630
|
-
|
|
631
|
-
|
|
632
|
-
|
|
633
|
-
|
|
634
|
-
|
|
635
|
-
|
|
636
|
-
|
|
637
|
-
|
|
638
|
-
|
|
639
|
-
|
|
640
|
-
|
|
641
|
-
|
|
642
|
-
|
|
643
|
-
|
|
644
|
-
|
|
645
|
-
|
|
646
|
-
|
|
647
|
-
|
|
648
|
-
|
|
649
|
-
|
|
650
|
-
|
|
651
|
-
|
|
652
|
-
|
|
653
|
-
|
|
654
|
-
|
|
655
|
-
|
|
656
|
-
|
|
657
|
-
|
|
658
|
-
|
|
659
|
-
|
|
660
|
-
|
|
661
|
-
|
|
662
|
-
|
|
663
|
-
|
|
664
|
-
if
|
|
665
|
-
|
|
666
|
-
|
|
667
|
-
|
|
668
|
-
|
|
669
|
-
|
|
670
|
-
|
|
671
|
-
|
|
672
|
-
|
|
673
|
-
|
|
674
|
-
|
|
675
|
-
|
|
676
|
-
|
|
677
|
-
|
|
678
|
-
|
|
679
|
-
|
|
680
|
-
|
|
681
|
-
|
|
682
|
-
|
|
683
|
-
|
|
684
|
-
|
|
685
|
-
|
|
686
|
-
|
|
687
|
-
|
|
688
|
-
|
|
689
|
-
|
|
690
|
-
|
|
691
|
-
|
|
692
|
-
|
|
693
|
-
|
|
694
|
-
|
|
695
|
-
|
|
696
|
-
|
|
697
|
-
|
|
698
|
-
|
|
699
|
-
|
|
700
|
-
|
|
701
|
-
|
|
702
|
-
|
|
703
|
-
|
|
704
|
-
|
|
705
|
-
|
|
706
|
-
|
|
707
|
-
|
|
708
|
-
|
|
709
|
-
|
|
710
|
-
|
|
711
|
-
|
|
712
|
-
|
|
713
|
-
|
|
714
|
-
|
|
715
|
-
|
|
716
|
-
|
|
717
|
-
|
|
718
|
-
|
|
719
|
-
|
|
720
|
-
|
|
721
|
-
|
|
722
|
-
|
|
723
|
-
|
|
724
|
-
|
|
725
|
-
|
|
726
|
-
|
|
727
|
-
|
|
728
|
-
|
|
729
|
-
|
|
730
|
-
|
|
731
|
-
|
|
732
|
-
|
|
733
|
-
|
|
734
|
-
|
|
735
|
-
|
|
736
|
-
|
|
737
|
-
if
|
|
738
|
-
|
|
739
|
-
|
|
740
|
-
|
|
741
|
-
|
|
742
|
-
|
|
743
|
-
|
|
744
|
-
|
|
745
|
-
|
|
746
|
-
|
|
747
|
-
|
|
748
|
-
|
|
749
|
-
|
|
750
|
-
|
|
751
|
-
|
|
752
|
-
|
|
753
|
-
|
|
754
|
-
|
|
755
|
-
self.
|
|
756
|
-
|
|
757
|
-
|
|
758
|
-
|
|
759
|
-
|
|
760
|
-
|
|
761
|
-
|
|
762
|
-
|
|
763
|
-
|
|
764
|
-
|
|
765
|
-
|
|
766
|
-
|
|
767
|
-
|
|
768
|
-
|
|
769
|
-
|
|
770
|
-
|
|
771
|
-
|
|
772
|
-
|
|
773
|
-
|
|
774
|
-
|
|
775
|
-
|
|
776
|
-
|
|
777
|
-
|
|
778
|
-
|
|
779
|
-
|
|
780
|
-
|
|
781
|
-
|
|
782
|
-
|
|
783
|
-
|
|
784
|
-
|
|
785
|
-
|
|
786
|
-
|
|
787
|
-
|
|
788
|
-
|
|
789
|
-
|
|
790
|
-
|
|
791
|
-
|
|
792
|
-
|
|
793
|
-
|
|
794
|
-
|
|
795
|
-
|
|
796
|
-
|
|
797
|
-
|
|
798
|
-
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import os
|
|
5
|
+
import re
|
|
6
|
+
import shutil
|
|
7
|
+
import subprocess
|
|
8
|
+
import tempfile
|
|
9
|
+
from html.parser import HTMLParser
|
|
10
|
+
from contextlib import ExitStack
|
|
11
|
+
from dataclasses import dataclass, field
|
|
12
|
+
from importlib.resources import as_file, files
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
from zipfile import ZipFile
|
|
15
|
+
|
|
16
|
+
from docx import Document
|
|
17
|
+
from docx.enum.style import WD_STYLE_TYPE
|
|
18
|
+
from docx.enum.text import WD_BREAK
|
|
19
|
+
from docx.oxml import OxmlElement
|
|
20
|
+
from docx.oxml.ns import qn
|
|
21
|
+
from docx.shared import Inches, Pt, RGBColor
|
|
22
|
+
from docx.image.exceptions import UnrecognizedImageError
|
|
23
|
+
|
|
24
|
+
try:
|
|
25
|
+
from .equations import latex_to_omath, latex_to_omath_para
|
|
26
|
+
except ImportError: # pragma: no cover - script-style invocation
|
|
27
|
+
from build_corpus.equations import latex_to_omath, latex_to_omath_para
|
|
28
|
+
|
|
29
|
+
try:
|
|
30
|
+
from .frontmatter import inject_frontmatter_into_package, strip_mdk_frontmatter
|
|
31
|
+
except ImportError: # pragma: no cover - script-style invocation
|
|
32
|
+
from build_corpus.frontmatter import inject_frontmatter_into_package, strip_mdk_frontmatter
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
# Inline tokenizer: images, links, code spans, display/inline math and
# emphasis. Both math alternatives carry a (?<!\\) lookbehind, so an escaped
# dollar such as the currency amount "\$252.3B" never opens a math span; it
# stays in the plain-text path, where the escape is removed (a text fixup).
INLINE_TOKEN_RE = re.compile(
    r"(!\[[^\]]*\]\([^)]+\)|\[[^\]]+\]\([^)]+\)|`[^`]+`|(?<!\\)\$\$[^$]+\$\$|(?<!\\)\$[^$\n]+\$|\*\*\*.+?\*\*\*|\*\*.+?\*\*|\*.+?\*)"
)

# Image targets that python-docx cannot embed as pictures. Rendering them
# needs an external pipeline (e.g. a headless-browser screenshot), which is
# outside build-corpus's scope.
UNRENDERABLE_IMAGE_EXTS = (".html", ".htm", ".jsx", ".tsx", ".js", ".svg", ".md")

# Metafile formats that can only be embedded after conversion to PNG
# (possible on Windows, or wherever LibreOffice is available).
METAFILE_IMAGE_EXTS = (".emf", ".wmf")

# HTML comment directive naming a Word style package; group 1 is its path.
STYLE_PACKAGE_COMMENT_RE = re.compile(
    r'<!--\s*build-corpus:word-style-package\b[^>]*\bpath="([^"]+)"[^>]*-->',
    re.IGNORECASE,
)
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def count_input_elements(markdown: str) -> dict:
    """Count source-side Markdown elements for input/output reconciliation.

    The text is scanned line by line:

    * A line starting with a triple backtick opens a fenced code block; the
      block is counted once and everything up to the closing fence is skipped.
    * A line that is exactly ``$$`` opens a display equation; it is counted
      once and everything up to the closing ``$$`` line is skipped.
    * A line that both starts and ends with ``$$`` (and is longer than four
      characters) is a single-line display equation, counted once.
    * Any other line is classified as a heading (starts with ``#``) or as a
      table header (the following line matches ``TABLE_SEPARATOR_RE``), and is
      then searched for images, embedded ``$$...$$``, inline ``$...$`` and
      non-image links.

    Returns:
        A dict with the integer keys ``tables``, ``equations``, ``images``,
        ``code_blocks``, ``headings`` and ``links``.
    """
    # Compiled once instead of being re-resolved for every scanned line.
    image_re = re.compile(r"!\[[^\]]*\]\([^)]+\)")
    display_math_re = re.compile(r"(?<!\\)\$\$[^$]+\$\$")
    # Standalone $...$ that is neither escaped nor part of a $$ fence.
    inline_math_re = re.compile(r"(?<!\\)(?<!\$)\$[^$\n]+\$(?!\$)")
    # Links whose opening bracket is not preceded by "!" (i.e. not images).
    link_re = re.compile(r"(?<!!)\[[^\]]+\]\([^)]+\)")

    lines = markdown.splitlines()
    total = len(lines)
    counts = {"tables": 0, "equations": 0, "images": 0,
              "code_blocks": 0, "headings": 0, "links": 0}
    index = 0
    while index < total:
        stripped = lines[index].strip()

        if stripped.startswith("```"):
            counts["code_blocks"] += 1
            index += 1
            while index < total and not lines[index].strip().startswith("```"):
                index += 1
            index += 1  # step over the closing fence (or past the end)
            continue

        if stripped == "$$":
            counts["equations"] += 1
            index += 1
            while index < total and lines[index].strip() != "$$":
                index += 1
            index += 1  # step over the closing "$$" (or past the end)
            continue

        if stripped.startswith("$$") and stripped.endswith("$$") and len(stripped) > 4:
            # Standalone single-line display equation, counted exactly once.
            counts["equations"] += 1
            index += 1
            continue

        if stripped.startswith("#"):
            counts["headings"] += 1
        elif index + 1 < total and TABLE_SEPARATOR_RE.match(lines[index + 1]):
            counts["tables"] += 1

        line_text = lines[index]
        counts["images"] += len(image_re.findall(line_text))
        # Display math embedded in a line that also carries other text.
        counts["equations"] += len(display_math_re.findall(line_text))
        counts["equations"] += len(inline_math_re.findall(line_text))
        counts["links"] += len(link_re.findall(line_text))
        index += 1
    return counts
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def count_text_fixups(markdown: str) -> dict:
    """Tally the backslash escapes that are removed from body prose.

    Fenced code blocks and inline code spans are deleted before counting,
    because escapes inside code are left untouched by the renderer.

    Returns:
        ``{"total": <all escapes>, "currency_unescaped": <escaped dollars>}``.
    """
    # Drop fenced blocks first, then inline code spans.
    prose = re.sub(r"```.*?```", "", markdown, flags=re.DOTALL)
    prose = re.sub(r"`[^`]+`", "", prose)
    escaped_chars = re.findall(r"\\([\\`*_{}\[\]()#+.!|$-])", prose)
    dollars = sum(1 for char in escaped_chars if char == "$")
    return {"total": len(escaped_chars), "currency_unescaped": dollars}
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def image_looks_like_svg(path: Path) -> bool:
    """Sniff a file's first bytes to detect SVG/XML content (even if mislabeled).

    Only the first 512 bytes are read, so large images are not loaded into
    memory just to inspect their header. A leading UTF-8 byte-order mark and
    leading whitespace are ignored, and the comparison is case-insensitive.

    Returns:
        True when the content starts with ``<svg``, or starts with an
        ``<?xml`` prolog and contains ``<svg`` within the sniffed bytes.
        False otherwise, including when the file cannot be read.
    """
    try:
        with path.open("rb") as handle:
            head = handle.read(512)
    except OSError:
        return False
    # A UTF-8 BOM is not whitespace, so lstrip() alone would leave it in place.
    if head.startswith(b"\xef\xbb\xbf"):
        head = head[3:]
    lowered = head.lstrip().lower()
    if lowered.startswith(b"<svg"):
        return True
    return lowered.startswith(b"<?xml") and b"<svg" in lowered
|
|
118
|
+
# http(s) URL in running text; balanced "(...)" groups inside the URL are kept.
BARE_URL_RE = re.compile(r"https?://[^\s<>()]+(?:\([^\s<>()]*\)[^\s<>()]*)*")

# List item: group 1 = indentation, group 2 = bullet or "N." marker,
# group 3 = item text.
LIST_ITEM_RE = re.compile(r"^(\s*)([-*+]|\d+\.)\s+(.*)$")

# Separator row of a pipe table with at least two columns, e.g. "| --- | :-: |"
# is too short, "| --- | :---: |" matches.
TABLE_SEPARATOR_RE = re.compile(r"^\s*\|?(?:\s*:?-{3,}:?\s*\|)+\s*:?-{3,}:?\s*\|?\s*$")

# File name of the Word template shipped in the package's templates folder.
DEFAULT_TEMPLATE_FILENAME = "md-to-word-template.dotx"
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
@dataclass
class WordExportStats:
    """Counters and diagnostics accumulated during one Word export."""

    # Block-level and inline elements emitted into the document.
    paragraphs: int = 0
    headings: int = 0
    lists: int = 0
    tables: int = 0
    code_blocks: int = 0
    blockquotes: int = 0
    links: int = 0

    # Image counters: emitted pictures and failures.
    images: int = 0
    images_failed: int = 0

    # Equation counters: overall, emitted as OMML, and fallbacks.
    equations: int = 0
    equations_omml: int = 0
    equations_fell_back: int = 0

    # Diagnostics; each instance gets its own fresh lists.
    warnings: list[str] = field(default_factory=list)
    issues: list[dict] = field(default_factory=list)
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
def strip_fence(line: str) -> str:
    """Return what follows the three-character fence marker on *line*.

    The line is trimmed, its first three characters are dropped, and the
    remainder is trimmed again.
    """
    trimmed = line.strip()
    remainder = trimmed[3:]
    return remainder.strip()
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
def split_table_row(line: str) -> list[str]:
    """Split one pipe-table row into trimmed cell strings.

    One leading and one trailing ``|`` are removed first. A backslash keeps
    the character after it inside the current cell, so ``\\|`` does not end a
    cell; the backslash itself is left in the cell text.
    """
    row = line.strip()
    if row.startswith("|"):
        row = row[1:]
    if row.endswith("|"):
        row = row[:-1]

    cells: list[str] = []
    pending: list[str] = []
    stream = iter(row)
    for symbol in stream:
        if symbol == "\\":
            # Keep the backslash and swallow the next character verbatim.
            pending.append(symbol)
            escaped = next(stream, None)
            if escaped is not None:
                pending.append(escaped)
        elif symbol == "|":
            cells.append("".join(pending).strip())
            pending = []
        else:
            pending.append(symbol)
    cells.append("".join(pending).strip())
    return cells
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
def markdown_link_parts(token: str) -> tuple[str, str]:
    """Split a Markdown link or image token into ``(label, target)``.

    The target is trimmed of surrounding whitespace. A token that is not of
    the form ``[label](target)`` or ``![label](target)`` is returned
    unchanged together with an empty target.
    """
    parsed = re.fullmatch(r"!?\[([^\]]*)\]\(([^)]+)\)", token)
    if parsed is None:
        return token, ""
    label, target = parsed.groups()
    return label, target.strip()
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
def unescape_markdown_text(text: str) -> str:
    """Remove the backslash in front of escaped Markdown punctuation.

    A backslash followed by any other character is left as it is.
    """
    escape_pattern = r"\\([\\`*_{}\[\]()#+.!|$-])"
    return re.sub(escape_pattern, lambda hit: hit.group(1), text)
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
class HTMLTableParser(HTMLParser):
    """Collect the cell text of an HTML table as a list of rows.

    After feeding markup, ``rows`` holds one list of trimmed cell strings per
    ``<tr>``. Both ``<td>`` and ``<th>`` cells are collected, so header rows
    are no longer dropped. ``<br>`` inside a cell becomes a newline, and text
    outside any cell is ignored.
    """

    # Tags whose text content forms one table cell.
    CELL_TAGS = ("td", "th")

    def __init__(self) -> None:
        super().__init__()
        self.rows: list[list[str]] = []
        # Row/cell currently being filled; None while outside a row/cell.
        self.current_row: list[str] | None = None
        self.current_cell: list[str] | None = None

    def handle_starttag(self, tag: str, attrs) -> None:  # type: ignore[override]
        normalized = tag.lower()
        if normalized == "tr":
            self.current_row = []
        elif normalized in self.CELL_TAGS:
            self.current_cell = []
        elif normalized == "br" and self.current_cell is not None:
            self.current_cell.append("\n")

    def handle_endtag(self, tag: str) -> None:  # type: ignore[override]
        normalized = tag.lower()
        if (
            normalized in self.CELL_TAGS
            and self.current_row is not None
            and self.current_cell is not None
        ):
            self.current_row.append("".join(self.current_cell).strip())
            self.current_cell = None
        elif normalized == "tr" and self.current_row is not None:
            self.rows.append(self.current_row)
            self.current_row = None

    def handle_data(self, data: str) -> None:  # type: ignore[override]
        if self.current_cell is not None:
            self.current_cell.append(data)
|
|
212
|
+
|
|
213
|
+
|
|
214
|
+
def parse_html_table(markup: str) -> list[list[str]]:
    """Parse HTML table markup into rows of cell text.

    Rows in which every cell is empty or whitespace are left out.
    """
    table_parser = HTMLTableParser()
    table_parser.feed(markup)
    table_parser.close()

    populated: list[list[str]] = []
    for row in table_parser.rows:
        if any(cell.strip() for cell in row):
            populated.append(row)
    return populated
|
|
219
|
+
|
|
220
|
+
|
|
221
|
+
def packaged_template_resource():
    """Return the importlib resource handle of the bundled Word template.

    The template is looked up in the ``templates`` sub-package of the current
    package; when the module has no package (script-style invocation),
    ``build_corpus.templates`` is used instead.
    """
    if __package__:
        templates_package = f"{__package__}.templates"
    else:
        templates_package = "build_corpus.templates"
    return files(templates_package).joinpath(DEFAULT_TEMPLATE_FILENAME)
|
|
224
|
+
|
|
225
|
+
|
|
226
|
+
def resolve_default_template_path() -> Path | None:
    """Return a marker path for the bundled template, or None if unavailable.

    The result is ``Path("bundled:<filename>")``, not a real filesystem
    location. NOTE(review): presumably this marker is turned into an actual
    file elsewhere in the exporter - confirm against the caller.

    Returns:
        None when the templates package cannot be imported.
    """
    try:
        packaged_template_resource()
    except ModuleNotFoundError:
        return None
    else:
        return Path(f"bundled:{DEFAULT_TEMPLATE_FILENAME}")
|
|
232
|
+
|
|
233
|
+
|
|
234
|
+
def set_cell_text(cell, text: str) -> list[str]:
    """Split cell text into its non-empty lines, clearing the cell if none.

    HTML line breaks (``<br>``, ``<br/>``, ``<br />``, in any letter case) and
    newlines separate lines; each line is trimmed and blank lines are dropped.

    Despite its name, this function only writes to ``cell`` when there is
    nothing to show (``cell.text`` is set to ``""``). Otherwise the cell is
    left untouched and the caller is expected to render the returned lines.

    Returns:
        The list of non-empty, trimmed lines (empty when the cell is blank).
    """
    normalized = re.sub(r"<br\s*/?>", "\n", text, flags=re.IGNORECASE)
    lines = [segment.strip() for segment in normalized.splitlines()]
    lines = [line for line in lines if line]
    if not lines:
        cell.text = ""
    return lines
|
|
241
|
+
|
|
242
|
+
|
|
243
|
+
def append_text_with_breaks(paragraph, text: str) -> None:
    """Append *text* to *paragraph*, turning newlines into line breaks.

    Markdown escapes are removed first. Every non-empty segment becomes its
    own run, and a line-break run is inserted between consecutive segments.
    """
    segments = unescape_markdown_text(text).split("\n")
    final_position = len(segments) - 1
    for position, segment in enumerate(segments):
        if segment:
            paragraph.add_run(segment)
        if position != final_position:
            paragraph.add_run().add_break(WD_BREAK.LINE)
|
|
251
|
+
|
|
252
|
+
|
|
253
|
+
def split_trailing_url_punctuation(url: str) -> tuple[str, str]:
    """Separate sentence punctuation that trails a URL.

    Returns:
        ``(core, trailing)`` where ``trailing`` is the longest suffix made
        only of ``. , ; : ! ?`` and ``core`` is everything before it.
    """
    core = url.rstrip(".,;:!?")
    return core, url[len(core):]
|
|
260
|
+
|
|
261
|
+
|
|
262
|
+
def convert_windows_metafile_to_png(source: Path) -> Path | None:
|
|
263
|
+
if os.name != "nt":
|
|
264
|
+
return None
|
|
265
|
+
target_dir = Path(tempfile.gettempdir()) / "build-corpus-image-fallbacks"
|
|
266
|
+
target_dir.mkdir(parents=True, exist_ok=True)
|
|
267
|
+
target = target_dir / f"{source.stem}.png"
|
|
268
|
+
source_literal = str(source).replace("'", "''")
|
|
269
|
+
target_literal = str(target).replace("'", "''")
|
|
270
|
+
command = (
|
|
271
|
+
"Add-Type -AssemblyName System.Drawing; "
|
|
272
|
+
f"$img = [System.Drawing.Image]::FromFile('{source_literal}'); "
|
|
273
|
+
f"$bmp = New-Object System.Drawing.Bitmap $img.Width, $img.Height; "
|
|
274
|
+
"$gfx = [System.Drawing.Graphics]::FromImage($bmp); "
|
|
275
|
+
"$gfx.DrawImage($img, 0, 0, $img.Width, $img.Height); "
|
|
276
|
+
f"$bmp.Save('{target_literal}', [System.Drawing.Imaging.ImageFormat]::Png); "
|
|
277
|
+
"$gfx.Dispose(); $bmp.Dispose(); $img.Dispose()"
|
|
278
|
+
)
|
|
279
|
+
result = subprocess.run(
|
|
280
|
+
["powershell", "-NoProfile", "-Command", command],
|
|
281
|
+
capture_output=True,
|
|
282
|
+
text=True,
|
|
283
|
+
)
|
|
284
|
+
if result.returncode != 0 or not target.exists():
|
|
285
|
+
return None
|
|
286
|
+
return target
|
|
287
|
+
|
|
288
|
+
|
|
289
|
+
def set_paragraph_shading(paragraph, fill: str) -> None:
    """Set the background fill colour of *paragraph*.

    An existing ``w:shd`` element in the paragraph properties is reused;
    otherwise a new one is appended.

    Args:
        paragraph: python-docx paragraph to modify.
        fill: value written to the ``w:fill`` attribute.
            NOTE(review): presumably a hex RGB string such as "F3F4F6" -
            confirm against callers.
    """
    paragraph_pr = paragraph._p.get_or_add_pPr()
    shading = paragraph_pr.find(qn("w:shd"))
    if shading is None:
        shading = OxmlElement("w:shd")
        paragraph_pr.append(shading)
    # w:val (the shading pattern) is a required attribute of w:shd; "clear"
    # means no pattern, so only the fill colour shows. An existing value is
    # kept so a pattern already present is not overridden.
    if shading.get(qn("w:val")) is None:
        shading.set(qn("w:val"), "clear")
    shading.set(qn("w:fill"), fill)
|
|
296
|
+
|
|
297
|
+
|
|
298
|
+
def set_paragraph_border(paragraph, color: str) -> None:
    """Give *paragraph* a single left border line in *color*.

    Existing ``w:pBdr`` and ``w:left`` elements are reused when present;
    otherwise they are created and appended.
    """
    properties = paragraph._p.get_or_add_pPr()

    border_group = properties.find(qn("w:pBdr"))
    if border_group is None:
        border_group = OxmlElement("w:pBdr")
        properties.append(border_group)

    left_edge = border_group.find(qn("w:left"))
    if left_edge is None:
        left_edge = OxmlElement("w:left")
        border_group.append(left_edge)

    # Line style, width, spacing from the text and colour of the left border.
    for attribute, value in (
        ("w:val", "single"),
        ("w:sz", "10"),
        ("w:space", "12"),
        ("w:color", color),
    ):
        left_edge.set(qn(attribute), value)
|
|
312
|
+
|
|
313
|
+
|
|
314
|
+
def add_hyperlink(paragraph, text: str, url: str):
    """Append an external hyperlink with one styled run to *paragraph*.

    The run is coloured ``2563EB`` and single-underlined.

    Args:
        paragraph: python-docx paragraph that receives the link.
        text: visible link text.
        url: link target, registered as an external relationship on the
            paragraph's part.

    Returns:
        The created ``w:hyperlink`` element, so callers can append more runs.
    """
    part = paragraph.part
    relationship_id = part.relate_to(
        url,
        "http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink",
        is_external=True,
    )
    hyperlink = OxmlElement("w:hyperlink")
    hyperlink.set(qn("r:id"), relationship_id)

    run = OxmlElement("w:r")
    rpr = OxmlElement("w:rPr")

    color = OxmlElement("w:color")
    color.set(qn("w:val"), "2563EB")
    rpr.append(color)

    underline = OxmlElement("w:u")
    underline.set(qn("w:val"), "single")
    rpr.append(underline)
    run.append(rpr)

    text_node = OxmlElement("w:t")
    text_node.text = text
    # Without xml:space="preserve", leading/trailing whitespace of a w:t
    # element is discarded when the document is opened.
    if text != text.strip():
        text_node.set(qn("xml:space"), "preserve")
    run.append(text_node)
    hyperlink.append(run)
    paragraph._p.append(hyperlink)
    return hyperlink
|
|
342
|
+
|
|
343
|
+
|
|
344
|
+
def append_hyperlink_run(hyperlink, text: str, bold: bool = False, italic: bool = False, code: bool = False) -> None:
    """Append one formatted run to an existing ``w:hyperlink`` element.

    Every run is coloured ``2563EB`` and single-underlined.

    Args:
        hyperlink: the ``w:hyperlink`` element to extend.
        text: text of the run.
        bold: add bold formatting.
        italic: add italic formatting.
        code: render in Consolas at ``w:sz`` 20 (half-points, i.e. 10 pt).
    """
    run = OxmlElement("w:r")
    rpr = OxmlElement("w:rPr")

    color = OxmlElement("w:color")
    color.set(qn("w:val"), "2563EB")
    rpr.append(color)

    underline = OxmlElement("w:u")
    underline.set(qn("w:val"), "single")
    rpr.append(underline)

    if bold:
        rpr.append(OxmlElement("w:b"))
    if italic:
        rpr.append(OxmlElement("w:i"))

    if code:
        fonts = OxmlElement("w:rFonts")
        fonts.set(qn("w:ascii"), "Consolas")
        fonts.set(qn("w:hAnsi"), "Consolas")
        fonts.set(qn("w:cs"), "Consolas")
        rpr.append(fonts)
        size = OxmlElement("w:sz")
        size.set(qn("w:val"), "20")
        rpr.append(size)

    run.append(rpr)
    text_node = OxmlElement("w:t")
    text_node.text = text
    # Segments of a multi-run link often start or end with a space (e.g.
    # "see " followed by a bold word); xml:space="preserve" keeps that
    # whitespace from being discarded when the document is opened.
    if text != text.strip():
        text_node.set(qn("xml:space"), "preserve")
    run.append(text_node)
    hyperlink.append(run)
|
|
377
|
+
|
|
378
|
+
|
|
379
|
+
def set_picture_metadata(run, source_name: str) -> None:
    """Label the picture(s) inside *run* with the source file's name.

    Every ``docPr`` element found below the run gets its ``name``, ``descr``
    and ``title`` attributes set to the final path component of
    *source_name*. If the lookup fails, nothing is changed.
    """
    label = Path(source_name).name
    try:
        targets = run._r.xpath(".//*[local-name()='docPr']")
    except Exception:
        # Best effort: metadata is optional, so a failed lookup is ignored.
        targets = []
    for target in targets:
        for attribute in ("name", "descr", "title"):
            target.set(attribute, label)
|
|
389
|
+
|
|
390
|
+
|
|
391
|
+
class MarkdownToDocxExporter:
|
|
392
|
+
def __init__(
    self,
    input_path: Path,
    output_dir: Path,
    output_docx: Path | None = None,
    report_path: Path | None = None,
    template_path: Path | None = None,
):
    """Configure an export of one Markdown file to a Word document.

    Args:
        input_path: Markdown file to convert.
        output_dir: folder that receives the default outputs.
        output_docx: target document; defaults to ``<input stem>.docx``
            inside *output_dir*.
        report_path: target JSON report; defaults to ``export-report.json``
            inside *output_dir*.
        template_path: optional Word template, passed through
            ``resolve_template_path``.
    """
    self.input_path = input_path
    self.output_dir = output_dir
    self.output_docx = output_docx if output_docx else output_dir / f"{input_path.stem}.docx"
    self.report_path = report_path if report_path else output_dir / "export-report.json"
    # Created before the template is resolved and closed at the end of
    # export(). NOTE(review): presumably resolve_template_path registers
    # bundled-template resources on this stack - confirm.
    self._template_resource_stack = ExitStack()
    self.template_path = self.resolve_template_path(template_path)
    self.use_template_styles = self.template_path is not None
    self.stats = WordExportStats()
|
|
408
|
+
|
|
409
|
+
def export(self) -> dict:
    """Convert the input Markdown to a .docx file and write a JSON report.

    Steps: create the output folder, verify the template exists, render the
    Markdown (without its front matter) into a new document, save it, apply
    the template package and front matter when present, then reconcile the
    source-side element counts against what was emitted.

    Returns:
        The report dict that is also written to ``self.report_path``.

    Raises:
        FileNotFoundError: a template path is set but does not exist.
    """
    try:
        self.output_dir.mkdir(parents=True, exist_ok=True)
        if self.template_path and not self.template_path.exists():
            raise FileNotFoundError(f"Word template not found: {self.template_path}")

        document = Document()
        if not self.template_path:
            # Built-in look is only applied when no template supplies styles.
            self.apply_modern_styles(document)
        self.ensure_custom_styles(document)

        source_text = self.input_path.read_text(encoding="utf-8")
        frontmatter, body = strip_mdk_frontmatter(source_text)
        self.render_markdown(document, body)
        document.save(self.output_docx)

        # Post-processing works on the saved package, not the in-memory doc.
        if self.template_path:
            self.apply_template_package(self.output_docx, self.template_path)
        if frontmatter:
            inject_frontmatter_into_package(self.output_docx, frontmatter)

        source_counts = count_input_elements(body)
        fixups = count_text_fixups(body)
        reconciliation = self.build_reconciliation(source_counts)
        # Fidelity holds only if every element category reconciles.
        fidelity_ok = all(row["ok"] for row in reconciliation.values())

        report = {
            "input": str(self.input_path),
            "output": str(self.output_docx),
            "template": str(self.template_path) if self.template_path else None,
            "fidelity_ok": fidelity_ok,
            "reconciliation": reconciliation,
            "text_fixups": fixups,
            "issues": self.stats.issues,
            "stats": vars(self.stats),
        }
        serialized = json.dumps(report, indent=2, ensure_ascii=False)
        self.report_path.write_text(serialized, encoding="utf-8")
        self._print_digest(reconciliation, fixups, fidelity_ok)
        return report
    finally:
        # Always release template resources, even when the export fails.
        self._template_resource_stack.close()
|
|
449
|
+
|
|
450
|
+
def build_reconciliation(self, input_counts: dict) -> dict:
    """Pair each source-side element count with the count that was emitted.

    Args:
        input_counts: Counts keyed by ``tables``, ``equations``, ``images``,
            ``code_blocks``, ``headings`` and ``links``.

    Returns:
        A dict keyed by element name. Every row carries ``in`` and ``ok``;
        equations also report ``out_omml`` and ``fell_back``, images report
        ``out`` and ``failed``, and the remaining rows report ``out``.
    """
    stats = self.stats
    equations_expected = input_counts["equations"]

    def matched_row(key: str, emitted: int) -> dict:
        """Build a row whose only success criterion is equal counts."""
        expected = input_counts[key]
        return {"in": expected, "out": emitted, "ok": expected == emitted}

    equations_row = {
        "in": equations_expected,
        "out_omml": stats.equations_omml,
        "fell_back": stats.equations_fell_back,
        # Equations only pass when every one became OMML with no fallback.
        "ok": stats.equations_omml == equations_expected and stats.equations_fell_back == 0,
    }
    images_expected = input_counts["images"]
    images_row = {
        "in": images_expected,
        "out": stats.images,
        "failed": stats.images_failed,
        "ok": stats.images_failed == 0 and stats.images == images_expected,
    }
    return {
        "tables": matched_row("tables", stats.tables),
        "equations": equations_row,
        "images": images_row,
        "code_blocks": matched_row("code_blocks", stats.code_blocks),
        "headings": matched_row("headings", stats.headings),
        "links": matched_row("links", stats.links),
    }
|
|
481
|
+
|
|
482
|
+
def _print_digest(self, reconciliation: dict, text_fixups: dict, fidelity_ok: bool) -> None:
|
|
483
|
+
"""Print a one-line human-readable fidelity summary to stdout."""
|
|
484
|
+
parts = []
|
|
485
|
+
for name, row in reconciliation.items():
|
|
486
|
+
out = row.get("out_omml", row.get("out", 0))
|
|
487
|
+
mark = "OK" if row["ok"] else "!!"
|
|
488
|
+
label = f"[{mark}] {name} {out}/{row['in']}"
|
|
489
|
+
if name == "equations" and row["fell_back"]:
|
|
490
|
+
label += f" ({row['fell_back']} fell back)"
|
|
491
|
+
if name == "images" and row.get("failed"):
|
|
492
|
+
label += f" ({row['failed']} failed)"
|
|
493
|
+
parts.append(label)
|
|
494
|
+
if text_fixups["total"]:
|
|
495
|
+
parts.append(f"text_fixups {text_fixups['total']} "
|
|
496
|
+
f"({text_fixups['currency_unescaped']} currency)")
|
|
497
|
+
bad_lines = sorted({i["line"] for i in self.stats.issues if i.get("line")})
|
|
498
|
+
if bad_lines:
|
|
499
|
+
parts.append("issue lines: " + ",".join(str(n) for n in bad_lines))
|
|
500
|
+
print(" ".join(parts) + f" -> fidelity_ok={str(fidelity_ok).lower()}")
|
|
501
|
+
|
|
502
|
+
def resolve_template_path(self, template_path: Path | None) -> Path | None:
    """Choose the Word template to use for this export.

    Preference order: the explicitly supplied path, then a round-trip
    template found next to the input, then the packaged template resource.

    Args:
        template_path: Explicit template location, or ``None`` to auto-detect.

    Returns:
        The chosen template path, or ``None`` when the packaged resource
        cannot be located.
    """
    if template_path is None:
        chosen = self.resolve_roundtrip_template_path()
    else:
        chosen = Path(template_path)
    if chosen is not None:
        return chosen

    try:
        resource = packaged_template_resource()
        # The resource stack owns the extracted file until it is closed.
        return self._template_resource_stack.enter_context(as_file(resource))
    except (FileNotFoundError, ModuleNotFoundError):
        return None
|
|
514
|
+
|
|
515
|
+
def resolve_roundtrip_template_path(self) -> Path | None:
    """Locate a style package saved alongside the input, if any.

    Checks, in order, for ``<input stem>.wordstyle/style-package.docx`` next
    to the input, then for a path captured by ``STYLE_PACKAGE_COMMENT_RE``
    within the first 4096 characters of the input. A relative captured path
    is resolved against the input's directory.

    Returns:
        The existing style-package path, or ``None`` when nothing usable is
        found or the head of the input cannot be read.
    """
    sidecar_template = self.input_path.with_suffix(".wordstyle") / "style-package.docx"
    if sidecar_template.exists():
        return sidecar_template

    try:
        # Only the head is inspected, so avoid loading the whole document.
        with self.input_path.open(encoding="utf-8") as handle:
            head = handle.read(4096)
    except (OSError, UnicodeDecodeError):
        # Best-effort lookup: an unreadable or undecodable head yields no hint.
        return None

    match = STYLE_PACKAGE_COMMENT_RE.search(head)
    if not match:
        return None
    candidate = Path(match.group(1))
    if not candidate.is_absolute():
        candidate = self.input_path.parent / candidate
    return candidate if candidate.exists() else None
|
|
530
|
+
|
|
531
|
+
def apply_modern_styles(self, document: Document) -> None:
    """Apply the built-in look: page margins plus Normal and Heading 1-4 fonts.

    Finishes by calling ``ensure_custom_styles`` on the same document.

    Args:
        document: The python-docx document to restyle in place.
    """
    page = document.sections[0]
    margins = (
        ("top_margin", 0.8),
        ("bottom_margin", 0.8),
        ("left_margin", 0.9),
        ("right_margin", 0.9),
    )
    for side, inches in margins:
        setattr(page, side, Inches(inches))

    body = document.styles["Normal"]
    body_font = body.font
    body_font.name = "Aptos"
    body_font.size = Pt(11)
    body_font.color.rgb = RGBColor(31, 41, 55)
    body_spacing = body.paragraph_format
    body_spacing.space_after = Pt(8)
    body_spacing.line_spacing = 1.15

    heading_sizes = {1: 22, 2: 17, 3: 14, 4: 12}
    for level, size in heading_sizes.items():
        heading = document.styles[f"Heading {level}"]
        heading_font = heading.font
        heading_font.name = "Aptos Display"
        heading_font.size = Pt(size)
        heading_font.bold = True
        heading_font.color.rgb = RGBColor(15, 23, 42)
        heading_spacing = heading.paragraph_format
        # Top-level headings get slightly more room above them.
        heading_spacing.space_before = Pt(10 if level != 1 else 12)
        heading_spacing.space_after = Pt(4)

    self.ensure_custom_styles(document)
|
|
555
|
+
|
|
556
|
+
def ensure_custom_styles(self, document: Document) -> None:
    """Create the "BuildCorpus Code" and "BuildCorpus Quote" styles if missing.

    Does nothing when ``self.use_template_styles`` is set.

    Args:
        document: The python-docx document whose style table is extended.
    """
    if self.use_template_styles:
        return

    styles = document.styles

    if "BuildCorpus Code" not in styles:
        code_style = styles.add_style("BuildCorpus Code", WD_STYLE_TYPE.PARAGRAPH)
        code_style.base_style = styles["Normal"]
        code_style.font.name = "Consolas"
        code_style.font.size = Pt(10)
        code_format = code_style.paragraph_format
        code_format.left_indent = Inches(0.2)
        code_format.right_indent = Inches(0.2)
        code_format.space_before = Pt(4)
        code_format.space_after = Pt(6)

    if "BuildCorpus Quote" not in styles:
        quote_style = styles.add_style("BuildCorpus Quote", WD_STYLE_TYPE.PARAGRAPH)
        quote_style.base_style = styles["Normal"]
        quote_style.font.italic = True
        quote_style.font.color.rgb = RGBColor(71, 85, 105)
        quote_format = quote_style.paragraph_format
        quote_format.left_indent = Inches(0.35)
        quote_format.space_after = Pt(6)
|
|
576
|
+
|
|
577
|
+
def render_markdown(self, document: Document, markdown: str) -> None:
    """Walk the Markdown source line by line and emit block-level content.

    Each pass classifies the current line (fenced code, ``$$`` math, heading,
    pipe table, HTML table, list, blockquote, horizontal rule) and hands it
    to the matching ``add_*`` method. Anything else is gathered into a
    paragraph. ``self.current_line`` holds the 1-based number of the line
    being classified.

    Args:
        document: Target python-docx document.
        markdown: Markdown text to render.
    """
    lines = markdown.splitlines()
    total = len(lines)

    def next_line_is_table_separator(position: int) -> bool:
        """Return True when the line after ``position`` is a separator row."""
        following = position + 1
        return following < total and bool(TABLE_SEPARATOR_RE.match(lines[following]))

    index = 0
    while index < total:
        line = lines[index]
        self.current_line = index + 1
        stripped = line.strip()

        if not stripped:
            index += 1
            continue

        # Single-line HTML comments are dropped from the output.
        if stripped.startswith("<!--") and stripped.endswith("-->"):
            index += 1
            continue

        if stripped.startswith("```"):
            info = strip_fence(line)
            buffer: list[str] = []
            index += 1
            while index < total and not lines[index].strip().startswith("```"):
                buffer.append(lines[index])
                index += 1
            # Skip the closing fence when there is one; an unterminated
            # block simply runs to the end of the input.
            if index < total:
                index += 1
            self.add_code_block(document, "\n".join(buffer), info)
            continue

        if stripped == "$$":
            buffer = []
            index += 1
            while index < total and lines[index].strip() != "$$":
                buffer.append(lines[index])
                index += 1
            if index < total:
                index += 1
            self.add_equation_block(document, "\n".join(buffer).strip())
            continue

        # Display math written on a single line: $$ ... $$
        if stripped.startswith("$$") and stripped.endswith("$$") and len(stripped) > 4:
            self.add_equation_block(document, stripped[2:-2].strip())
            index += 1
            continue

        if stripped.startswith("#"):
            level = len(stripped) - len(stripped.lstrip("#"))
            text = stripped[level:].strip()
            self.add_heading(document, level, text)
            index += 1
            continue

        if next_line_is_table_separator(index):
            table_lines = [line, lines[index + 1]]
            index += 2
            # A line containing "|" is never blank, so no blank-line check
            # is needed to end the table.
            while index < total and "|" in lines[index]:
                table_lines.append(lines[index])
                index += 1
            self.add_table(document, table_lines)
            continue

        if stripped.lower() == "<table>":
            table_lines = [line]
            index += 1
            while index < total:
                table_lines.append(lines[index])
                if lines[index].strip().lower() == "</table>":
                    index += 1
                    break
                index += 1
            self.add_html_table(document, "\n".join(table_lines))
            continue

        if LIST_ITEM_RE.match(line):
            index = self.add_list(document, lines, index)
            continue

        if stripped.startswith(">"):
            quote_lines: list[str] = []
            while index < total and lines[index].strip().startswith(">"):
                quote_lines.append(lines[index].strip()[1:].strip())
                index += 1
            self.add_blockquote(document, " ".join(quote_lines))
            continue

        # A horizontal rule is emitted as an empty paragraph.
        if re.fullmatch(r"[-*_]{3,}", stripped):
            document.add_paragraph("")
            index += 1
            continue

        # Plain paragraph: gather lines until a blank line or the start of
        # another block.
        # NOTE(review): a single trailing space counts as a hard break here;
        # Markdown conventionally uses two -- confirm the intended literal.
        paragraph_lines = [line.rstrip()]
        paragraph_breaks = [line.endswith((" ", "\\"))]
        index += 1
        while index < total:
            candidate = lines[index]
            candidate_stripped = candidate.strip()
            if not candidate_stripped:
                break
            if candidate_stripped.startswith(("```", "#", ">")):
                break
            if LIST_ITEM_RE.match(candidate):
                break
            if next_line_is_table_separator(index):
                break
            paragraph_lines.append(candidate.rstrip())
            paragraph_breaks.append(candidate.endswith((" ", "\\")))
            index += 1
        self.add_paragraph(document, self.combine_paragraph_lines(paragraph_lines, paragraph_breaks))
|
|
687
|
+
|
|
688
|
+
@staticmethod
|
|
689
|
+
def combine_paragraph_lines(lines: list[str], breaks: list[bool]) -> str:
    """Join paragraph lines into one string.

    Args:
        lines: The paragraph's lines in order.
        breaks: ``breaks[i]`` tells whether ``lines[i]`` ends with a hard
            break; a newline then precedes the next line instead of a space.

    Returns:
        The joined text, or an empty string when ``lines`` is empty.
    """
    if not lines:
        return ""
    pieces = [lines[0]]
    for offset, text in enumerate(lines[1:]):
        # The separator is decided by the line *before* ``text``.
        pieces.append("\n" if breaks[offset] else " ")
        pieces.append(text)
    return "".join(pieces)
|
|
697
|
+
|
|
698
|
+
@staticmethod
|
|
699
|
+
def apply_template_package(output_docx: Path, template_path: Path) -> None:
    """Replace style-related parts of ``output_docx`` with the template's copies.

    The output package is rewritten into a temporary archive. Any part listed
    in ``transplant_parts`` that the template also contains is taken from the
    template, and every other member is copied unchanged. Listed parts that
    exist only in the template are appended. The rewritten archive then
    replaces ``output_docx``.

    Each member keeps the compression method it had in its source archive, so
    the patched file is not inflated to an uncompressed package.

    Args:
        output_docx: The .docx file to patch in place.
        template_path: The .docx file that supplies the replacement parts.
    """
    transplant_parts = {
        "word/styles.xml",
        "word/stylesWithEffects.xml",
        "word/numbering.xml",
        "word/fontTable.xml",
        "word/settings.xml",
        "word/webSettings.xml",
        "word/theme/theme1.xml",
    }
    with tempfile.TemporaryDirectory(prefix="build-corpus-template-") as tmp:
        patched = Path(tmp) / output_docx.name
        with ZipFile(output_docx) as out_zip, ZipFile(template_path) as template_zip, ZipFile(patched, "w") as patched_zip:
            template_names = set(template_zip.namelist())
            output_names = set(out_zip.namelist())

            def copy_member(source, name: str) -> None:
                """Copy one member, preserving its source compression method."""
                patched_zip.writestr(
                    name,
                    source.read(name),
                    compress_type=source.getinfo(name).compress_type,
                )

            for name in out_zip.namelist():
                from_template = name in transplant_parts and name in template_names
                copy_member(template_zip if from_template else out_zip, name)

            # Sorted so the appended parts land in a deterministic order.
            for name in sorted(transplant_parts - output_names):
                if name in template_names:
                    copy_member(template_zip, name)
        # Move while the temporary directory still exists.
        shutil.move(str(patched), output_docx)
|
|
724
|
+
|
|
725
|
+
def add_heading(self, document: Document, level: int, text: str) -> None:
    """Append a heading paragraph and count it.

    Args:
        document: Target python-docx document.
        level: Heading depth; values above 6 use the "Heading 6" style.
        text: Heading text, rendered through ``render_inline``.
    """
    capped_level = level if level < 6 else 6
    heading = document.add_paragraph(style=f"Heading {capped_level}")
    self.render_inline(heading, text)
    self.stats.headings += 1
|
|
729
|
+
|
|
730
|
+
def add_paragraph(self, document: Document, text: str) -> None:
    """Append a "Normal"-styled paragraph and count it.

    Args:
        document: Target python-docx document.
        text: Paragraph text, rendered through ``render_inline``.
    """
    body = document.add_paragraph(style="Normal")
    self.render_inline(body, text)
    self.stats.paragraphs = self.stats.paragraphs + 1
|
|
734
|
+
|
|
735
|
+
def add_code_block(self, document: Document, code: str, info: str) -> None:
    """Append a shaded, monospaced code paragraph and count it.

    Args:
        document: Target python-docx document.
        code: The code text, placed in a single run.
        info: Fence info string; when non-empty it is written as a bold,
            coloured label line above the code.
    """
    style_name = "Normal" if self.use_template_styles else "BuildCorpus Code"
    block = document.add_paragraph(style=style_name)
    if info:
        caption = block.add_run(f"{info}\n")
        caption.bold = True
        caption.font.color.rgb = RGBColor(37, 99, 235)
    body = block.add_run(code)
    body_font = body.font
    body_font.name = "Consolas"
    body_font.size = Pt(10)
    set_paragraph_shading(block, "F8FAFC")
    self.stats.code_blocks += 1
|
|
746
|
+
|
|
747
|
+
def add_equation_block(self, document: Document, equation: str) -> None:
    """Append a display equation paragraph.

    The LaTeX is converted with ``latex_to_omath_para``. On success the
    source is kept as a hidden run and the OMML element is appended to the
    paragraph; otherwise the LaTeX is written as Cambria Math text and a
    fallback is recorded.

    Args:
        document: Target python-docx document.
        equation: LaTeX source of the equation.
    """
    block = document.add_paragraph(style="Normal")
    layout = block.paragraph_format
    layout.left_indent = Inches(0.3)
    layout.right_indent = Inches(0.3)
    layout.space_before = Pt(4)
    layout.space_after = Pt(8)

    omath = latex_to_omath_para(equation)
    if omath is None:
        fallback = block.add_run(equation)
        fallback.font.name = "Cambria Math"
        fallback.font.size = Pt(11)
        self._record_equation_fallback(equation)
    else:
        self.add_hidden_math_source(block, equation)
        block._p.append(omath)
        self.stats.equations_omml += 1
    # NOTE(review): counted for both outcomes here -- confirm this total is
    # not meant to cover the fallback branch only.
    self.stats.equations += 1
|
|
764
|
+
|
|
765
|
+
def _add_inline_equation(self, paragraph, latex: str) -> None:
    """Append an inline equation to ``paragraph``.

    Uses ``latex_to_omath`` for the conversion. When that returns ``None``
    the LaTeX is written as Cambria Math text and a fallback is recorded.

    Args:
        paragraph: The python-docx paragraph to extend.
        latex: LaTeX source of the equation, without delimiters.
    """
    omath = latex_to_omath(latex)
    if omath is None:
        fallback = paragraph.add_run(latex)
        fallback.font.name = "Cambria Math"
        fallback.font.size = Pt(11)
        self._record_equation_fallback(latex)
    else:
        self.add_hidden_math_source(paragraph, latex)
        paragraph._p.append(omath)
        self.stats.equations_omml += 1
    # NOTE(review): counted for both outcomes here -- confirm this total is
    # not meant to cover the fallback branch only.
    self.stats.equations += 1
|
|
778
|
+
|
|
779
|
+
@staticmethod
|
|
780
|
+
def add_hidden_math_source(paragraph, latex: str) -> None:
    """Store the LaTeX source in ``paragraph`` as a hidden Cambria Math run.

    The run is hidden through python-docx's ``Font.hidden`` property, which
    writes the ``w:vanish`` run property for us instead of appending the raw
    XML element by hand.

    Args:
        paragraph: The python-docx paragraph that receives the hidden run.
        latex: LaTeX source to preserve.
    """
    run = paragraph.add_run(latex)
    run.font.name = "Cambria Math"
    run.font.hidden = True
|
|
785
|
+
|
|
786
|
+
def _record_equation_fallback(self, latex: str) -> None:
|
|
787
|
+
"""Record a LaTeX fragment that could not be converted to OMML."""
|
|
788
|
+
self.stats.equations_fell_back += 1
|
|
789
|
+
line = getattr(self, "current_line", 0)
|
|
790
|
+
message = f"Equation kept as text (LaTeX not parsed): {latex[:60]}"
|
|
791
|
+
self.stats.warnings.append(message)
|
|
792
|
+
self.stats.issues.append({
|
|
793
|
+
"type": "equation",
|
|
794
|
+
"line": line,
|
|
795
|
+
"source": latex[:120],
|
|
796
|
+
"reason": "latex-parse-failed",
|
|
797
|
+
})
|
|
798
|
+
|
|
799
|
+
def _record_image_failure(self, target: str, reason: str, message: str) -> None:
|
|
800
|
+
"""Record an image that could not be embedded, with a specific reason."""
|
|
801
|
+
self.stats.images_failed += 1
|
|
802
|
+
self.stats.warnings.append(f"{message}: {target}")
|
|
803
|
+
self.stats.issues.append({
|
|
804
|
+
"type": "image",
|
|
805
|
+
"line": getattr(self, "current_line", 0),
|
|
806
|
+
"target": target,
|
|
807
|
+
"reason": reason,
|
|
808
|
+
})
|
|
809
|
+
|
|
810
|
+
def add_blockquote(self, document: Document, text: str) -> None:
    """Append a bordered blockquote paragraph and count it.

    Uses the "Quote" style when template styles are in use, otherwise
    "BuildCorpus Quote".

    Args:
        document: Target python-docx document.
        text: Quote text, rendered through ``render_inline``.
    """
    if self.use_template_styles:
        style_name = "Quote"
    else:
        style_name = "BuildCorpus Quote"
    quote = document.add_paragraph(style=style_name)
    self.render_inline(quote, text)
    set_paragraph_border(quote, "CBD5E1")
    self.stats.blockquotes += 1
|
|
815
|
+
|
|
816
|
+
def add_list(self, document: Document, lines: list[str], start: int) -> int:
    """Render consecutive list items beginning at ``lines[start]``.

    Each item becomes one paragraph. Indented lines that follow an item are
    folded into it until a blank line, another block start, a new list item,
    or a non-indented line is reached.

    Args:
        document: Target python-docx document.
        lines: All source lines.
        start: Index of the first list item.

    Returns:
        Index of the first line that was not consumed.
    """
    total = len(lines)
    position = start
    while position < total:
        item = LIST_ITEM_RE.match(lines[position])
        if item is None:
            break
        indent, marker, body = item.groups()
        ordered = marker.endswith(".")

        # Gather continuation lines belonging to this item.
        body_lines = [body]
        cursor = position + 1
        while cursor < total:
            candidate = lines[cursor]
            trimmed = candidate.strip()
            if not trimmed or trimmed.startswith(("```", "#", ">")):
                break
            if LIST_ITEM_RE.match(candidate):
                break
            if cursor + 1 < total and TABLE_SEPARATOR_RE.match(lines[cursor + 1]):
                break
            # Continuation lines must be indented.
            if not candidate[:1].isspace():
                break
            body_lines.append(candidate.rstrip())
            cursor += 1

        style_name = self.list_style_name(document, ordered, indent)
        entry = document.add_paragraph(style=style_name)
        if style_name in {"List Bullet", "List Number"}:
            # NOTE(review): replacing a tab with one space leaves the length
            # unchanged -- confirm the intended tab width.
            depth = len(indent.replace("\t", " ")) // 2
            entry.paragraph_format.left_indent = Inches(0.25 + depth * 0.18)
        self.render_inline(entry, "\n".join(body_lines))
        self.stats.lists += 1
        position = cursor
    return position
|
|
849
|
+
|
|
850
|
+
def list_style_name(self, document: Document, ordered: bool, indent: str) -> str:
    """Pick the list paragraph style for an item.

    The nesting level is derived from the indent length and clamped to 1-3.
    Levels 2 and 3 map to e.g. "List Bullet 2", falling back to the base
    style when the document has no such style.

    Args:
        document: Document whose style table is consulted.
        ordered: True for numbered lists, False for bulleted ones.
        indent: The item's leading whitespace.

    Returns:
        The style name to apply.
    """
    # NOTE(review): replacing a tab with one space leaves the length
    # unchanged -- confirm the intended tab width.
    depth = len(indent.replace("\t", " ")) // 2 + 1
    depth = max(1, min(depth, 3))
    base = "List Number" if ordered else "List Bullet"
    nested = f"{base} {depth}" if depth > 1 else base
    if nested in document.styles:
        return nested
    return base
|
|
855
|
+
|
|
856
|
+
def add_table(self, document: Document, table_lines: list[str]) -> None:
    """Render a pipe table from its raw source lines.

    Separator rows are dropped and every other line is split into cells
    before being handed to ``add_table_rows``.

    Args:
        document: Target python-docx document.
        table_lines: The table's source lines, separator row included.
    """
    rows = []
    for raw in table_lines:
        if TABLE_SEPARATOR_RE.match(raw):
            continue
        rows.append(split_table_row(raw))
    self.add_table_rows(document, rows)
|
|
859
|
+
|
|
860
|
+
def add_html_table(self, document: Document, table_markup: str) -> None:
    """Render an HTML ``<table>`` fragment.

    Args:
        document: Target python-docx document.
        table_markup: The table's HTML source, parsed by ``parse_html_table``.
    """
    rows = parse_html_table(table_markup)
    self.add_table_rows(document, rows)
|
|
862
|
+
|
|
863
|
+
def add_table_rows(self, document: Document, rows: list[list[str]]) -> None:
    """Append a table built from ``rows`` and count it.

    The table is as wide as the longest row, and shorter rows are padded
    with empty cells. Nothing is added or counted when ``rows`` is empty.

    Args:
        document: Target python-docx document.
        rows: Cell texts, one inner list per table row.
    """
    if not rows:
        return
    column_count = max(map(len, rows))
    table = document.add_table(rows=len(rows), cols=column_count)
    preferred = "Light List Accent 1"
    table.style = preferred if preferred in document.styles else "Table Grid"
    for row_number, cells in enumerate(rows):
        padded = list(cells) + [""] * (column_count - len(cells))
        for column_number, value in enumerate(padded):
            self.render_table_cell(table.cell(row_number, column_number), value)
    self.stats.tables += 1
|
|
874
|
+
|
|
875
|
+
def render_table_cell(self, cell, text: str) -> None:
    """Fill a table cell, one paragraph per line of cell text.

    ``set_cell_text`` supplies the lines. The first reuses the cell's
    existing paragraph and each further line gets a new one. A cell with no
    lines is set to empty text.

    Args:
        cell: The python-docx table cell to fill.
        text: Raw cell text.
    """
    cell_lines = set_cell_text(cell, text)
    if not cell_lines:
        cell.text = ""
        return

    opening = cell.paragraphs[0]
    # Clear the existing paragraph before rendering inline content into it.
    opening.text = ""
    self.render_inline(opening, cell_lines[0])
    for extra in cell_lines[1:]:
        self.render_inline(cell.add_paragraph(""), extra)
|
|
886
|
+
|
|
887
|
+
def render_inline(self, paragraph, text: str) -> None:
    """Render inline Markdown into ``paragraph``.

    Spans matched by ``INLINE_TOKEN_RE`` go to ``render_inline_token``, and
    the text between and after them goes to ``render_plain_text``.

    Args:
        paragraph: The python-docx paragraph to extend.
        text: Inline Markdown source.
    """
    position = 0
    for found in INLINE_TOKEN_RE.finditer(text):
        begin, finish = found.span()
        if begin > position:
            self.render_plain_text(paragraph, text[position:begin])
        self.render_inline_token(paragraph, found.group(0))
        position = finish
    remainder = text[position:]
    if remainder:
        self.render_plain_text(paragraph, remainder)
|
|
897
|
+
|
|
898
|
+
def render_plain_text(self, paragraph, text: str) -> None:
    """Render text that holds no inline tokens, turning bare URLs into links.

    Each ``BARE_URL_RE`` match is split into the URL proper and trailing
    punctuation. The URL becomes a hyperlink labelled with itself, and the
    punctuation is appended as ordinary text.

    Args:
        paragraph: The python-docx paragraph to extend.
        text: Plain text, possibly containing bare URLs.
    """
    position = 0
    for found in BARE_URL_RE.finditer(text):
        begin, finish = found.span()
        if begin > position:
            append_text_with_breaks(paragraph, text[position:begin])
        link_target, trailing = split_trailing_url_punctuation(found.group(0))
        if link_target:
            add_hyperlink(paragraph, link_target, link_target)
        if trailing:
            append_text_with_breaks(paragraph, trailing)
        position = finish
    remainder = text[position:]
    if remainder:
        append_text_with_breaks(paragraph, remainder)
|
|
912
|
+
|
|
913
|
+
def render_inline_token(self, paragraph, token: str) -> None:
    """Render one inline Markdown token into ``paragraph``.

    Handles, in order: images, links, inline code, ``$$``/``$`` math,
    bold-italic, bold and italic. Any other token is added as a plain run.

    Images that cannot be embedded are replaced by a bracketed placeholder
    run and recorded through ``_record_image_failure``.

    Args:
        paragraph: The python-docx paragraph to extend.
        token: The matched token text, delimiters included.
    """
    if token.startswith("!["):
        alt, target = markdown_link_parts(token)
        # Remote and data: targets are never local files, so handle them
        # before the target is resolved as a filesystem path.
        if target.startswith(("http://", "https://", "data:")):
            paragraph.add_run(f"[image: {alt or target}]")
            self._record_image_failure(target, "skipped-remote",
                                       "Skipped non-local image target")
            return
        image_path = (self.input_path.parent / target).resolve()
        ext = image_path.suffix.lower()
        if ext == ".svg" or (ext in UNRENDERABLE_IMAGE_EXTS and image_looks_like_svg(image_path)):
            paragraph.add_run(f"[unsupported image: {target}]")
            self._record_image_failure(target, "svg-needs-rasterization",
                                       "SVG cannot be embedded by python-docx; rasterize to PNG via the render pipeline")
            return
        if ext in UNRENDERABLE_IMAGE_EXTS:
            # HTML/JSX can never be embedded as a picture — flag for the
            # external render pipeline. We deliberately do NOT rasterize here.
            paragraph.add_run(f"[unsupported image: {target}]")
            self._record_image_failure(target, "unsupported-format",
                                       f"Image format cannot be embedded ({ext}); route to render pipeline")
            return
        if not image_path.exists():
            paragraph.add_run(f"[missing image: {target}]")
            self._record_image_failure(target, "missing-file", "Missing image asset")
            return
        run = paragraph.add_run()
        try:
            run.add_picture(str(image_path), width=Inches(5.8))
            set_picture_metadata(run, target)
            self.stats.images += 1
        except UnrecognizedImageError:
            # python-docx rejected the file; try a metafile conversion
            # before giving up with a specific reason.
            converted = convert_windows_metafile_to_png(image_path)
            if converted is not None:
                run.add_picture(str(converted), width=Inches(5.8))
                set_picture_metadata(run, target)
                self.stats.images += 1
                self.stats.warnings.append(f"Converted unsupported image to PNG: {target}")
            elif image_looks_like_svg(image_path):
                # SVG content mislabeled with a raster extension (e.g. .png that
                # is really SVG+PNG-fallback the extractor flattened wrong).
                paragraph.add_run(f"[unsupported image: {target}]")
                self._record_image_failure(target, "mislabeled-svg",
                                           f"{target} is SVG content with a {ext or 'raster'} extension; rasterize to PNG and repoint")
            elif ext in METAFILE_IMAGE_EXTS:
                paragraph.add_run(f"[unsupported image: {target}]")
                self._record_image_failure(target, "unsupported-on-platform",
                                           f"{ext} needs metafile->PNG conversion (install LibreOffice / run on Windows)")
            else:
                paragraph.add_run(f"[unsupported image: {target}]")
                self._record_image_failure(target, "unsupported-format",
                                           "Image format not recognized by python-docx")
        return

    if token.startswith("["):
        label, target = markdown_link_parts(token)
        hyperlink = add_hyperlink(paragraph, "", target)
        self.render_hyperlink_label(hyperlink, label)
        self.stats.links += 1
        return

    if token.startswith("`"):
        run = paragraph.add_run(token[1:-1])
        run.font.name = "Consolas"
        run.font.size = Pt(10)
        return

    # "$$" must be tested before "$" so the longer delimiter wins.
    if token.startswith("$$") and token.endswith("$$"):
        self._add_inline_equation(paragraph, token[2:-2])
        return

    if token.startswith("$") and token.endswith("$"):
        self._add_inline_equation(paragraph, token[1:-1])
        return

    # Emphasis markers are tested longest-first for the same reason.
    if token.startswith("***") and token.endswith("***"):
        run = paragraph.add_run(token[3:-3])
        run.bold = True
        run.italic = True
        return

    if token.startswith("**") and token.endswith("**"):
        run = paragraph.add_run(token[2:-2])
        run.bold = True
        return

    if token.startswith("*") and token.endswith("*"):
        run = paragraph.add_run(token[1:-1])
        run.italic = True
        return

    paragraph.add_run(token)
|
|
1005
|
+
|
|
1006
|
+
def render_hyperlink_label(self, hyperlink, text: str) -> None:
    """Render a link label into ``hyperlink``.

    Spans matched by ``INLINE_TOKEN_RE`` go to ``render_hyperlink_token``.
    The text between and after them is unescaped and appended as plain
    hyperlink runs.

    Args:
        hyperlink: The hyperlink object returned by ``add_hyperlink``.
        text: The label's Markdown source.
    """
    position = 0
    for found in INLINE_TOKEN_RE.finditer(text):
        begin, finish = found.span()
        if begin > position:
            append_hyperlink_run(hyperlink, unescape_markdown_text(text[position:begin]))
        self.render_hyperlink_token(hyperlink, found.group(0))
        position = finish
    remainder = text[position:]
    if remainder:
        append_hyperlink_run(hyperlink, unescape_markdown_text(remainder))
|
|
1016
|
+
|
|
1017
|
+
def render_hyperlink_token(self, hyperlink, token: str) -> None:
    """Append one formatted run to ``hyperlink`` for an inline token.

    Recognises inline code, bold-italic, bold and italic delimiters. Any
    other token is appended unformatted, delimiters included.

    Args:
        hyperlink: The hyperlink object to extend.
        token: The matched token text, delimiters included.
    """
    # Ordered so that longer emphasis markers are tried before shorter ones.
    delimiter_formats = (
        ("`", {"code": True}),
        ("***", {"bold": True, "italic": True}),
        ("**", {"bold": True}),
        ("*", {"italic": True}),
    )
    for delimiter, formatting in delimiter_formats:
        if token.startswith(delimiter) and token.endswith(delimiter):
            width = len(delimiter)
            inner = unescape_markdown_text(token[width:-width])
            append_hyperlink_run(hyperlink, inner, **formatting)
            return
    append_hyperlink_run(hyperlink, unescape_markdown_text(token))
|
|
1031
|
+
|
|
1032
|
+
|
|
1033
|
+
def export_markdown_to_docx(
    input_path: Path,
    output_root: Path,
    out_same_dir: bool,
    template_path: Path | None = None,
) -> dict:
    """Export one Markdown file to .docx through ``MarkdownToDocxExporter``.

    Args:
        input_path: The Markdown file to convert.
        output_root: Root under which a per-document directory named after
            the input's stem is used when ``out_same_dir`` is false.
        out_same_dir: When true, the .docx and ``<stem>.export-report.json``
            are written next to the input. Otherwise the exporter receives
            ``None`` for both paths.
        template_path: Optional Word template passed through to the exporter.

    Returns:
        The report dictionary returned by the exporter's ``export``.
    """
    if not out_same_dir:
        destination = {
            "output_dir": output_root / input_path.stem,
            "output_docx": None,
            "report_path": None,
        }
    else:
        destination = {
            "output_dir": input_path.parent,
            "output_docx": input_path.with_suffix(".docx"),
            "report_path": input_path.with_name(f"{input_path.stem}.export-report.json"),
        }

    exporter = MarkdownToDocxExporter(
        input_path=input_path,
        template_path=template_path,
        **destination,
    )
    return exporter.export()
|