@fiduswriter/document 0.1.0-alpha.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (110) hide show
  1. package/LICENSE +661 -0
  2. package/README.md +16 -0
  3. package/jest.config.js +23 -0
  4. package/package.json +59 -0
  5. package/schema.json +1 -0
  6. package/scripts/export-schema.js +16 -0
  7. package/src/bibliography/common.js +92 -0
  8. package/src/bibliography/csl_bib.js +139 -0
  9. package/src/citations/citeproc_sys.js +42 -0
  10. package/src/citations/format.js +194 -0
  11. package/src/common/blob.js +10 -0
  12. package/src/common/file.js +25 -0
  13. package/src/common/index.js +12 -0
  14. package/src/common/network.js +79 -0
  15. package/src/common/text.js +44 -0
  16. package/src/editor/e2ee/encryptor.js +228 -0
  17. package/src/exporter/docx/citations.js +177 -0
  18. package/src/exporter/docx/comments.js +165 -0
  19. package/src/exporter/docx/footnotes.js +240 -0
  20. package/src/exporter/docx/images.js +101 -0
  21. package/src/exporter/docx/index.js +185 -0
  22. package/src/exporter/docx/lists.js +260 -0
  23. package/src/exporter/docx/math.js +46 -0
  24. package/src/exporter/docx/metadata.js +289 -0
  25. package/src/exporter/docx/rels.js +193 -0
  26. package/src/exporter/docx/render.js +941 -0
  27. package/src/exporter/docx/richtext.js +1182 -0
  28. package/src/exporter/docx/tables.js +112 -0
  29. package/src/exporter/docx/tools.js +50 -0
  30. package/src/exporter/epub/index.js +142 -0
  31. package/src/exporter/epub/templates.js +140 -0
  32. package/src/exporter/epub/tools.js +96 -0
  33. package/src/exporter/html/citations.js +121 -0
  34. package/src/exporter/html/convert.js +813 -0
  35. package/src/exporter/html/index.js +192 -0
  36. package/src/exporter/html/templates.js +34 -0
  37. package/src/exporter/html/tools.js +50 -0
  38. package/src/exporter/jats/bibliography.js +183 -0
  39. package/src/exporter/jats/citations.js +109 -0
  40. package/src/exporter/jats/convert.js +871 -0
  41. package/src/exporter/jats/index.js +92 -0
  42. package/src/exporter/jats/templates.js +35 -0
  43. package/src/exporter/jats/text.js +72 -0
  44. package/src/exporter/latex/convert.js +934 -0
  45. package/src/exporter/latex/escape_latex.js +21 -0
  46. package/src/exporter/latex/index.js +74 -0
  47. package/src/exporter/latex/readme.js +22 -0
  48. package/src/exporter/native/shrink.js +132 -0
  49. package/src/exporter/odt/citations.js +101 -0
  50. package/src/exporter/odt/footnotes.js +147 -0
  51. package/src/exporter/odt/images.js +115 -0
  52. package/src/exporter/odt/index.js +156 -0
  53. package/src/exporter/odt/math.js +57 -0
  54. package/src/exporter/odt/metadata.js +251 -0
  55. package/src/exporter/odt/render.js +806 -0
  56. package/src/exporter/odt/richtext.js +865 -0
  57. package/src/exporter/odt/styles.js +387 -0
  58. package/src/exporter/odt/track.js +68 -0
  59. package/src/exporter/pandoc/citations.js +98 -0
  60. package/src/exporter/pandoc/convert.js +1017 -0
  61. package/src/exporter/pandoc/index.js +92 -0
  62. package/src/exporter/pandoc/readme.js +8 -0
  63. package/src/exporter/pandoc/tools.js +51 -0
  64. package/src/exporter/print/index.js +177 -0
  65. package/src/exporter/tools/doc_content.js +144 -0
  66. package/src/exporter/tools/file.js +9 -0
  67. package/src/exporter/tools/json.js +73 -0
  68. package/src/exporter/tools/svg.js +29 -0
  69. package/src/exporter/tools/xml.js +531 -0
  70. package/src/exporter/tools/xml_zip.js +95 -0
  71. package/src/exporter/tools/zip.js +90 -0
  72. package/src/exporter/tools/zotero_csl.js +93 -0
  73. package/src/importer/citations.js +129 -0
  74. package/src/importer/docx/citations.js +123 -0
  75. package/src/importer/docx/convert.js +1427 -0
  76. package/src/importer/docx/helpers.js +9 -0
  77. package/src/importer/docx/omml2mathml.js +1448 -0
  78. package/src/importer/docx/parse.js +735 -0
  79. package/src/importer/native/get_images.js +76 -0
  80. package/src/importer/native/update.js +29 -0
  81. package/src/importer/odt/citations.js +87 -0
  82. package/src/importer/odt/convert.js +1855 -0
  83. package/src/importer/pandoc/convert.js +884 -0
  84. package/src/importer/pandoc/helpers.js +84 -0
  85. package/src/importer/zip_analyzer.js +102 -0
  86. package/src/index.js +1 -0
  87. package/src/mathlive/opf_includes.js +24 -0
  88. package/src/schema/common/annotate.js +76 -0
  89. package/src/schema/common/base.js +118 -0
  90. package/src/schema/common/citation.js +62 -0
  91. package/src/schema/common/equation.js +31 -0
  92. package/src/schema/common/figure.js +190 -0
  93. package/src/schema/common/heading.js +43 -0
  94. package/src/schema/common/index.js +40 -0
  95. package/src/schema/common/list.js +95 -0
  96. package/src/schema/common/reference.js +100 -0
  97. package/src/schema/common/table.js +103 -0
  98. package/src/schema/common/track.js +190 -0
  99. package/src/schema/const.js +58 -0
  100. package/src/schema/convert.js +1272 -0
  101. package/src/schema/document/content.js +187 -0
  102. package/src/schema/document/index.js +117 -0
  103. package/src/schema/document/structure.js +452 -0
  104. package/src/schema/export.js +21 -0
  105. package/src/schema/footnotes.js +126 -0
  106. package/src/schema/footnotes_convert.js +31 -0
  107. package/src/schema/i18n.js +595 -0
  108. package/src/schema/index.js +5 -0
  109. package/src/schema/mini_json.js +61 -0
  110. package/src/schema/text.js +22 -0
@@ -0,0 +1,735 @@
1
+ import {xmlDOM} from "../../exporter/tools/xml.js"
2
+ import {randomCommentId} from "../../schema/common/index.js"
3
+
4
/** Fallback used when the archive has no word/styles.xml: a styles part that declares no styles. */
const DEFAULT_STYLES_XML = `<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<w:styles xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
</w:styles>`

/** Lower-case substrings of font names that mark a style as "code" in isCodeStyle(). */
const MONOSPACE_FONT_PATTERNS = [
    "courier",
    "consolas",
    "monaco",
    "menlo",
    "lucida console",
    "liberation mono",
    "dejavu sans mono",
    "bitstream vera sans mono",
    "source code pro",
    "fira code",
    "ubuntu mono",
    "droid sans mono",
    "monospace"
]

/** `w:val` values that switch an on/off (toggle) property off. */
const TOGGLE_OFF_VALUES = ["0", "false", "off"]

/** `w:type` values of footnote/endnote entries that are layout separators, not notes. */
const NOTE_SEPARATOR_TYPES = [
    "separator",
    "continuationSeparator",
    "continuationNotice"
]

/**
 * Evaluate an on/off property element such as `w:b`.
 * The property is on when the element exists and its `w:val` is not one of
 * the "off" values; a missing `w:val` counts as on.
 *
 * @param {object|null|undefined} node - result of `query()` for the property element
 * @returns {boolean}
 */
function isToggleOn(node) {
    if (!node) {
        return false
    }
    const val = node.getAttribute("w:val")
    return !(
        typeof val === "string" && TOGGLE_OFF_VALUES.includes(val.toLowerCase())
    )
}

/**
 * True when at least one of the given ids is a non-empty string matching `pattern`.
 *
 * @param {RegExp} pattern - must not carry the `g`/`y` flag (it is reused)
 * @param {...(string|null|undefined)} ids
 * @returns {boolean}
 */
function anyIdMatches(pattern, ...ids) {
    return ids.some(id => Boolean(id) && pattern.test(id))
}

/**
 * Reads the parts of a DOCX archive and collects them into plain lookup
 * tables (styles, numbering, comments, notes, relationships, images) plus the
 * parsed core/custom/main document DOMs.
 *
 * Every `parse*` method is best-effort: a missing part is skipped and a
 * failure is reported with `console.warn` instead of being thrown.
 */
export class DocxParser {
    /**
     * @param {object} zip - archive object exposing `file(path)` (whose result
     *   has `async(type)`) and a `files` map keyed by path; presumably a JSZip
     *   instance — confirm against the caller.
     */
    constructor(zip) {
        this.zip = zip
        this.styles = {}
        this.numbering = {}
        this.comments = {}
        // comment id -> upper-cased w14:paraId of the comment's last paragraph;
        // only used to thread replies in parseCommentsExtended().
        this.commentParaIds = {}
        this.footnotes = {}
        this.endnotes = {}
        this.relationships = {}
        this.images = {}

        this.coreDoc = null
        this.customDoc = null
        this.document = null
    }

    /**
     * Run all parsers one after another. The order matters for comments:
     * parseCommentsExtended() works on what parseComments() collected.
     *
     * @returns {Promise<void>}
     */
    async init() {
        await this.parseStyles()
        await this.parseNumbering()
        await this.parseComments()
        await this.parseCommentsExtended()
        await this.parseFootnotes()
        await this.parseEndnotes()
        await this.parseRelationships()
        await this.parseImages()
        await this.parseCoreDoc()
        await this.parseCustomDoc()
        return this.parseDocument()
    }

    /**
     * Read one archive entry as a string.
     *
     * @param {string} path - entry path inside the archive
     * @returns {Promise<string|undefined>} `undefined` when the entry does not exist
     */
    async readZipText(path) {
        return this.zip.file(path)?.async("string")
    }

    /**
     * Fill `this.styles` from word/styles.xml (or from an empty default part),
     * keyed by style id.
     *
     * @returns {Promise<void>}
     */
    async parseStyles() {
        try {
            const content = await this.readZipText("word/styles.xml")
            const stylesDoc = xmlDOM(content || DEFAULT_STYLES_XML)

            stylesDoc.queryAll("w:style").forEach(style => {
                const id = style.getAttribute("w:styleId")
                const basedOn = style.query("w:basedOn")?.getAttribute("w:val")

                this.styles[id] = {
                    id,
                    type: style.getAttribute("w:type"),
                    name: style.query("w:name")?.getAttribute("w:val"),
                    isHeading: anyIdMatches(/heading\d+/i, id, basedOn),
                    isCaption: anyIdMatches(/caption/i, id, basedOn),
                    // First run of digits in the id, e.g. "Heading2" -> 2.
                    level: id
                        ? Number.parseInt(id.match(/\d+/)?.[0] ?? "0", 10)
                        : 0,
                    basedOn,
                    paragraphProps: this.extractParagraphProperties(style),
                    runProps: this.extractRunProperties(style)
                }
            })
        } catch (err) {
            console.warn("Could not parse styles", err)
        }
    }

    /**
     * Decide whether a style, or any style it is based on, looks like a code
     * style: by id (`code`, `html`, `pre`), by name containing "code", or by a
     * monospace font family.
     *
     * @param {string} styleId
     * @returns {boolean} false when the chain reaches an unknown style
     */
    isCodeStyle(styleId) {
        let current = styleId
        const visited = new Set() // guards against cyclic basedOn chains
        while (current && !visited.has(current)) {
            visited.add(current)
            const style = this.styles[current]
            if (!style) {
                return false
            }
            const name = style.name?.toLowerCase() || ""
            if (
                /^code(\s|$)/i.test(style.id) ||
                name.includes("code") ||
                /^html(\s|$)/i.test(style.id) ||
                /^pre(\s|$)/i.test(style.id)
            ) {
                return true
            }
            if (style.runProps?.fontFamily) {
                const fontFamily = style.runProps.fontFamily.toLowerCase()
                if (
                    MONOSPACE_FONT_PATTERNS.some(pattern =>
                        fontFamily.includes(pattern)
                    )
                ) {
                    return true
                }
            }
            current = style.basedOn
        }
        return false
    }

    /**
     * Collect paragraph-level properties from the first `w:pPr` below `style`.
     *
     * @param {object} style - DOM node to search
     * @returns {object} `{}` when there is no `w:pPr`
     */
    extractParagraphProperties(style) {
        const pPr = style.query("w:pPr")
        if (!pPr) {
            return {}
        }

        return {
            indent: this.extractIndentation(pPr),
            alignment: pPr.query("w:jc")?.getAttribute("w:val"),
            numbering: this.extractNumbering(pPr),
            keepNext: isToggleOn(pPr.query("w:keepNext"))
        }
    }

    /**
     * Read the `w:ind` values as integers, exactly as stored in the file.
     * `w:start`/`w:end` are accepted as alternatives to `w:left`/`w:right`.
     *
     * @param {object} pPr - paragraph properties node
     * @returns {object} `{}` when there is no `w:ind`, else
     *   `{left, right, hanging, firstLine}` with 0 for missing attributes
     */
    extractIndentation(pPr) {
        const ind = pPr.query("w:ind")
        if (!ind) {
            return {}
        }
        const readInt = (...names) => {
            const name = names.find(attr => ind.getAttribute(attr))
            return Number.parseInt(name ? ind.getAttribute(name) : "0", 10)
        }

        return {
            left: readInt("w:left", "w:start"),
            right: readInt("w:right", "w:end"),
            hanging: readInt("w:hanging"),
            firstLine: readInt("w:firstLine")
        }
    }

    /**
     * Read the list reference (`w:numPr`) of a paragraph properties node.
     *
     * @param {object} pPr - paragraph properties node
     * @returns {{id: (string|undefined), level: number}|null} null without `w:numPr`
     */
    extractNumbering(pPr) {
        const numPr = pPr.query("w:numPr")
        if (!numPr) {
            return null
        }

        return {
            id: numPr.query("w:numId")?.getAttribute("w:val"),
            level: Number.parseInt(
                numPr.query("w:ilvl")?.getAttribute("w:val") || "0",
                10
            )
        }
    }

    /**
     * Collect character formatting found below `rPr`.
     * Toggle properties honour an explicit off value (`w:val="0"`, "false",
     * "off"); `fontSize` is the `w:sz` value divided by two.
     *
     * @param {object|null|undefined} rPr - node to search (a `w:rPr`, or a
     *   style node as passed by parseStyles())
     * @returns {object} `{}` when `rPr` is missing
     */
    extractRunProperties(rPr) {
        if (!rPr) {
            return {}
        }
        const valueOf = tag => rPr.query(tag)?.getAttribute("w:val")

        return {
            bold: isToggleOn(rPr.query("w:b")),
            italic: isToggleOn(rPr.query("w:i")),
            underline: valueOf("w:u") || false,
            strike: isToggleOn(rPr.query("w:strike")),
            smallCaps: isToggleOn(rPr.query("w:smallCaps")),
            vertAlign: valueOf("w:vertAlign") || false,
            fontSize: Number.parseInt(valueOf("w:sz") || "0", 10) / 2,
            color: valueOf("w:color") || false,
            fontFamily: rPr.query("w:rFonts")?.getAttribute("w:ascii") || false
        }
    }

    /**
     * Fill `this.numbering` from word/numbering.xml: one entry per `w:num`,
     * carrying the levels of the abstract definition it points to and its
     * level overrides.
     *
     * @returns {Promise<void>}
     */
    async parseNumbering() {
        try {
            const content = await this.readZipText("word/numbering.xml")
            if (!content) {
                return
            }
            const numberingDoc = xmlDOM(content)

            // Abstract definitions first; instances below refer to them by id.
            const abstractNumbering = {}
            numberingDoc.queryAll("w:abstractNum").forEach(abstractNum => {
                const id = abstractNum.getAttribute("w:abstractNumId")
                abstractNumbering[id] = abstractNum
                    .queryAll("w:lvl")
                    .map(lvl => ({
                        level: lvl.getAttribute("w:ilvl"),
                        format: lvl.query("w:numFmt")?.getAttribute("w:val"),
                        text: lvl.query("w:lvlText")?.getAttribute("w:val"),
                        start: Number.parseInt(
                            lvl.query("w:start")?.getAttribute("w:val") || "1",
                            10
                        )
                    }))
            })

            numberingDoc.queryAll("w:num").forEach(num => {
                const numId = num.getAttribute("w:numId")
                const abstractNumId = num
                    .query("w:abstractNumId")
                    ?.getAttribute("w:val")

                this.numbering[numId] = {
                    abstractId: abstractNumId,
                    levels: abstractNumbering[abstractNumId] || [],
                    overrides: this.extractNumberingOverrides(num)
                }
            })
        } catch (err) {
            console.warn("Could not parse numbering", err)
        }
    }

    /**
     * List the `w:lvlOverride` entries of a numbering instance.
     *
     * @param {object} num - `w:num` node
     * @returns {Array<{level: (string|null), start: number}>} `start` is 1
     *   when the override has no `w:startOverride`
     */
    extractNumberingOverrides(num) {
        return num.queryAll("w:lvlOverride").map(override => ({
            level: override.getAttribute("w:ilvl"),
            start: Number.parseInt(
                override.query("w:startOverride")?.getAttribute("w:val") || "1",
                10
            )
        }))
    }

    /**
     * Fill `this.comments` from word/comments.xml, keyed by `w:id`, and
     * remember each comment's last-paragraph `w14:paraId` for threading.
     *
     * @returns {Promise<void>}
     */
    async parseComments() {
        try {
            const content = await this.readZipText("word/comments.xml")
            if (!content) {
                return
            }

            xmlDOM(content)
                .queryAll("w:comment")
                .forEach(comment => {
                    const id = comment.getAttribute("w:id")
                    const dateStr = comment.getAttribute("w:date")
                    const timestamp = dateStr
                        ? new Date(dateStr).getTime()
                        : Number.NaN
                    // `gettext` is an optional global translation hook; fall
                    // back to the untranslated label so that one anonymous
                    // comment cannot abort parsing of all comments.
                    const username =
                        comment.getAttribute("w:author") ||
                        (typeof gettext === "function"
                            ? gettext("Unknown")
                            : "Unknown")

                    this.comments[id] = {
                        user: 0,
                        username,
                        // Missing or unparsable dates become "now".
                        date: Number.isNaN(timestamp) ? Date.now() : timestamp,
                        comment: this.extractCommentContent(comment),
                        answers: [],
                        resolved: false,
                        isMajor: false
                    }

                    const paragraphs = comment.queryAll("w:p")
                    const lastParaId =
                        paragraphs[paragraphs.length - 1]?.getAttribute(
                            "w14:paraId"
                        )
                    if (lastParaId) {
                        this.commentParaIds[id] = lastParaId.toUpperCase()
                    }
                })
        } catch (err) {
            console.warn("Could not parse comments", err)
        }
    }

    /**
     * Apply word/commentsExtended.xml to `this.comments`: set `resolved` on
     * thread starters and move replies into their parent's `answers`.
     *
     * Entries are matched to comments by paragraph id when every entry's
     * `w15:paraId` is known from parseComments(); otherwise the positional
     * heuristic (main comments first, replies after) is used.
     *
     * @returns {Promise<void>}
     */
    async parseCommentsExtended() {
        try {
            const content = await this.readZipText("word/commentsExtended.xml")
            if (!content) {
                return
            }

            const entries = xmlDOM(content)
                .queryAll("w15:commentEx")
                .map(entry => ({
                    paraId: entry.getAttribute("w15:paraId"),
                    parentParaId: entry.getAttribute("w15:paraIdParent"),
                    done: entry.getAttribute("w15:done") === "1"
                }))
                .filter(entry => entry.paraId)
            if (!entries.length) {
                return
            }

            const commentIdByParaId = new Map(
                Object.entries(this.commentParaIds).map(
                    ([commentId, paraId]) => [paraId, commentId]
                )
            )
            const canMatchByParaId = entries.every(entry =>
                commentIdByParaId.has(entry.paraId.toUpperCase())
            )

            if (canMatchByParaId) {
                this.applyCommentThreadsByParaId(entries, commentIdByParaId)
            } else {
                this.applyCommentThreadsByPosition(entries)
            }
        } catch (err) {
            console.warn("Could not parse comments extended", err)
        }
    }

    /**
     * Thread comments using the paragraph ids recorded in the extended entries.
     * A reply to a reply is attached to the first comment of its thread.
     *
     * @param {Array<{paraId: string, parentParaId: (string|null), done: boolean}>} entries
     * @param {Map<string, string>} commentIdByParaId - upper-cased paraId -> comment id
     */
    applyCommentThreadsByParaId(entries, commentIdByParaId) {
        const parentOf = new Map() // reply comment id -> parent comment id

        entries.forEach(entry => {
            const commentId = commentIdByParaId.get(entry.paraId.toUpperCase())
            const parentId = entry.parentParaId
                ? commentIdByParaId.get(entry.parentParaId.toUpperCase())
                : undefined

            if (parentId !== undefined && parentId !== commentId) {
                parentOf.set(commentId, parentId)
            } else if (this.comments[commentId]) {
                // Thread starter (or reply whose parent is unknown).
                this.comments[commentId].resolved = entry.done
            }
        })

        // Attach in ascending id order so answers keep the file's sequence.
        const replyIds = Array.from(parentOf.keys()).sort(
            (a, b) => Number(a) - Number(b)
        )
        replyIds.forEach(replyId => {
            let rootId = parentOf.get(replyId)
            const seen = new Set([replyId])
            while (parentOf.has(rootId) && !seen.has(rootId)) {
                seen.add(rootId)
                rootId = parentOf.get(rootId)
            }
            if (parentOf.has(rootId)) {
                return // cyclic parent references: leave the comment alone
            }
            this.attachAnswer(rootId, replyId)
        })
    }

    /**
     * Positional fallback: after sorting comment ids numerically, the first
     * N comments (N = number of entries without parent) are thread starters
     * and take their `done` flag by index; each remaining comment that has a
     * corresponding reply entry is attached to the last thread starter.
     *
     * @param {Array<{paraId: string, parentParaId: (string|null), done: boolean}>} entries
     */
    applyCommentThreadsByPosition(entries) {
        const mainEntries = entries.filter(entry => !entry.parentParaId)
        const answerEntries = entries.filter(entry => entry.parentParaId)
        const commentIds = Object.keys(this.comments)
            .map(Number)
            .sort((a, b) => a - b)
            .map(String)
        const parentCommentIds = []

        commentIds.forEach((commentId, index) => {
            if (index < mainEntries.length) {
                this.comments[commentId].resolved = mainEntries[index].done
                parentCommentIds.push(commentId)
                return
            }
            if (!answerEntries[index - mainEntries.length]) {
                return
            }
            this.attachAnswer(
                parentCommentIds[parentCommentIds.length - 1],
                commentId
            )
        })
    }

    /**
     * Move the comment `answerId` into `answers` of the comment `parentId`
     * and remove it from the top level. Does nothing when either is missing.
     *
     * @param {string|undefined} parentId
     * @param {string} answerId
     */
    attachAnswer(parentId, answerId) {
        const parent = this.comments[parentId]
        const answer = this.comments[answerId]
        if (!parent || !answer) {
            return
        }
        parent.answers.push({
            id: randomCommentId(),
            user: 0,
            username: answer.username,
            date: answer.date,
            answer: answer.comment
        })
        delete this.comments[answerId]
    }

    /**
     * Convert the paragraphs of a comment into paragraph nodes.
     *
     * @param {object} comment - `w:comment` node
     * @returns {Array<object>} see extractBlockContent()
     */
    extractCommentContent(comment) {
        return this.extractBlockContent(comment)
    }

    /**
     * Shared reader for footnotes and endnotes. Skips separator entries,
     * recognised by id "0"/"-1" or by a separator `w:type`.
     *
     * @param {string} path - archive path of the notes part
     * @param {string} tagName - `w:footnote` or `w:endnote`
     * @param {object} target - map that receives `{id, content}` keyed by id
     * @returns {Promise<void>}
     */
    async parseNotesPart(path, tagName, target) {
        const content = await this.readZipText(path)
        if (!content) {
            return
        }

        xmlDOM(content)
            .queryAll(tagName)
            .forEach(note => {
                const id = note.getAttribute("w:id")
                const isSeparator =
                    id === "0" ||
                    id === "-1" ||
                    NOTE_SEPARATOR_TYPES.includes(note.getAttribute("w:type"))
                if (isSeparator) {
                    return
                }
                target[id] = {
                    id,
                    content: this.extractBlockContent(note)
                }
            })
    }

    /**
     * Fill `this.footnotes` from word/footnotes.xml.
     *
     * @returns {Promise<void>}
     */
    async parseFootnotes() {
        try {
            await this.parseNotesPart(
                "word/footnotes.xml",
                "w:footnote",
                this.footnotes
            )
        } catch (err) {
            console.warn("Could not parse footnotes", err)
        }
    }

    /**
     * Fill `this.endnotes` from word/endnotes.xml.
     *
     * @returns {Promise<void>}
     */
    async parseEndnotes() {
        try {
            await this.parseNotesPart(
                "word/endnotes.xml",
                "w:endnote",
                this.endnotes
            )
        } catch (err) {
            console.warn("Could not parse endnotes", err)
        }
    }

    /**
     * Fill `this.relationships` from word/_rels/document.xml.rels, keyed by
     * relationship id.
     *
     * @returns {Promise<void>}
     */
    async parseRelationships() {
        try {
            const content = await this.readZipText(
                "word/_rels/document.xml.rels"
            )
            if (!content) {
                return
            }

            xmlDOM(content)
                .queryAll("Relationship")
                .forEach(rel => {
                    const id = rel.getAttribute("Id")
                    this.relationships[id] = {
                        id,
                        type: rel.getAttribute("Type"),
                        target: rel.getAttribute("Target")
                    }
                })
        } catch (err) {
            console.warn("Could not parse relationships", err)
        }
    }

    /**
     * Load every file below word/media/ into `this.images`, keyed by its
     * base file name. A file that cannot be read is skipped with a warning.
     *
     * @returns {Promise<void>}
     */
    async parseImages() {
        // Paths ending in "/" are directory entries, not files.
        const imagePaths = Object.keys(this.zip.files).filter(
            path => path.startsWith("word/media/") && !path.endsWith("/")
        )

        for (const path of imagePaths) {
            try {
                const blob = await this.zip.file(path).async("blob")
                const filename = path.split("/").pop()
                this.images[filename] = this.addMimeType(blob, filename)
            } catch (err) {
                console.warn(`Could not parse image ${path}`, err)
            }
        }
    }

    /**
     * Wrap a blob in a `File` whose MIME type is derived from the file name.
     *
     * @param {Blob} blob
     * @param {string} filename
     * @returns {File}
     */
    addMimeType(blob, filename) {
        return new File([blob], filename, {
            type: this.getImageFileType(filename)
        })
    }

    /**
     * Map a file name's extension (case-insensitive) to an image MIME type.
     *
     * @param {string} filename
     * @returns {string} "image/png" for unknown extensions
     */
    getImageFileType(filename) {
        const ext = filename.split(".").pop().toLowerCase()
        switch (ext) {
            case "avif":
            case "avifs":
                return "image/avif"
            case "png":
                return "image/png"
            case "jpg":
            case "jpeg":
                return "image/jpeg"
            case "gif":
                return "image/gif"
            case "svg":
                return "image/svg+xml"
            case "webp":
                return "image/webp"
            default:
                return "image/png" // Default fallback
        }
    }

    /**
     * Convert every `w:p` found below `node` into a paragraph node.
     *
     * @param {object} node
     * @returns {Array<{type: string, content: Array<object>}>}
     */
    extractBlockContent(node) {
        return node.queryAll("w:p").map(p => ({
            type: "paragraph",
            content: this.extractParagraphContent(p)
        }))
    }

    /**
     * Convert the runs of a paragraph into text nodes with marks.
     * Only the first `w:t` of each run is read; runs without text are skipped.
     *
     * @param {object} p - `w:p` node
     * @returns {Array<{type: string, text: string, marks: Array<object>}>}
     */
    extractParagraphContent(p) {
        const content = []
        p.queryAll("w:r").forEach(r => {
            const text = r.query("w:t")?.textContent
            if (!text) {
                return
            }

            const rPr = r.query("w:rPr")
            const formatting = rPr ? this.extractRunProperties(rPr) : {}

            content.push({
                type: "text",
                text,
                marks: this.createMarksFromFormatting(formatting)
            })
        })
        return content
    }

    /**
     * Translate run formatting into marks. Only bold, italic and underline
     * are represented; an underline value of "none" means not underlined.
     *
     * @param {object} formatting - result of extractRunProperties()
     * @returns {Array<{type: string}>}
     */
    createMarksFromFormatting(formatting) {
        const marks = []
        if (formatting.bold) {
            marks.push({type: "strong"})
        }
        if (formatting.italic) {
            marks.push({type: "em"})
        }
        if (formatting.underline && formatting.underline !== "none") {
            marks.push({type: "underline"})
        }
        return marks
    }

    /**
     * Parse docProps/core.xml into `this.coreDoc` (stays null when missing).
     *
     * @returns {Promise<void>}
     */
    async parseCoreDoc() {
        try {
            const content = await this.readZipText("docProps/core.xml")
            if (!content) {
                return
            }
            this.coreDoc = xmlDOM(content)
        } catch (err) {
            console.warn("Could not parse core doc", err)
        }
    }

    /**
     * Parse docProps/custom.xml into `this.customDoc` (stays null when missing).
     *
     * @returns {Promise<void>}
     */
    async parseCustomDoc() {
        try {
            const content = await this.readZipText("docProps/custom.xml")
            if (!content) {
                return
            }
            this.customDoc = xmlDOM(content)
        } catch (err) {
            console.warn("Could not parse custom doc", err)
        }
    }

    /**
     * Parse word/document.xml into `this.document` (stays null when missing).
     *
     * @returns {Promise<void>}
     */
    async parseDocument() {
        try {
            const content = await this.readZipText("word/document.xml")
            if (!content) {
                return
            }
            this.document = xmlDOM(content)
        } catch (err) {
            console.warn("Could not parse document", err)
        }
    }
}