@fiduswriter/document 0.1.0-alpha.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +661 -0
- package/README.md +16 -0
- package/jest.config.js +23 -0
- package/package.json +59 -0
- package/schema.json +1 -0
- package/scripts/export-schema.js +16 -0
- package/src/bibliography/common.js +92 -0
- package/src/bibliography/csl_bib.js +139 -0
- package/src/citations/citeproc_sys.js +42 -0
- package/src/citations/format.js +194 -0
- package/src/common/blob.js +10 -0
- package/src/common/file.js +25 -0
- package/src/common/index.js +12 -0
- package/src/common/network.js +79 -0
- package/src/common/text.js +44 -0
- package/src/editor/e2ee/encryptor.js +228 -0
- package/src/exporter/docx/citations.js +177 -0
- package/src/exporter/docx/comments.js +165 -0
- package/src/exporter/docx/footnotes.js +240 -0
- package/src/exporter/docx/images.js +101 -0
- package/src/exporter/docx/index.js +185 -0
- package/src/exporter/docx/lists.js +260 -0
- package/src/exporter/docx/math.js +46 -0
- package/src/exporter/docx/metadata.js +289 -0
- package/src/exporter/docx/rels.js +193 -0
- package/src/exporter/docx/render.js +941 -0
- package/src/exporter/docx/richtext.js +1182 -0
- package/src/exporter/docx/tables.js +112 -0
- package/src/exporter/docx/tools.js +50 -0
- package/src/exporter/epub/index.js +142 -0
- package/src/exporter/epub/templates.js +140 -0
- package/src/exporter/epub/tools.js +96 -0
- package/src/exporter/html/citations.js +121 -0
- package/src/exporter/html/convert.js +813 -0
- package/src/exporter/html/index.js +192 -0
- package/src/exporter/html/templates.js +34 -0
- package/src/exporter/html/tools.js +50 -0
- package/src/exporter/jats/bibliography.js +183 -0
- package/src/exporter/jats/citations.js +109 -0
- package/src/exporter/jats/convert.js +871 -0
- package/src/exporter/jats/index.js +92 -0
- package/src/exporter/jats/templates.js +35 -0
- package/src/exporter/jats/text.js +72 -0
- package/src/exporter/latex/convert.js +934 -0
- package/src/exporter/latex/escape_latex.js +21 -0
- package/src/exporter/latex/index.js +74 -0
- package/src/exporter/latex/readme.js +22 -0
- package/src/exporter/native/shrink.js +132 -0
- package/src/exporter/odt/citations.js +101 -0
- package/src/exporter/odt/footnotes.js +147 -0
- package/src/exporter/odt/images.js +115 -0
- package/src/exporter/odt/index.js +156 -0
- package/src/exporter/odt/math.js +57 -0
- package/src/exporter/odt/metadata.js +251 -0
- package/src/exporter/odt/render.js +806 -0
- package/src/exporter/odt/richtext.js +865 -0
- package/src/exporter/odt/styles.js +387 -0
- package/src/exporter/odt/track.js +68 -0
- package/src/exporter/pandoc/citations.js +98 -0
- package/src/exporter/pandoc/convert.js +1017 -0
- package/src/exporter/pandoc/index.js +92 -0
- package/src/exporter/pandoc/readme.js +8 -0
- package/src/exporter/pandoc/tools.js +51 -0
- package/src/exporter/print/index.js +177 -0
- package/src/exporter/tools/doc_content.js +144 -0
- package/src/exporter/tools/file.js +9 -0
- package/src/exporter/tools/json.js +73 -0
- package/src/exporter/tools/svg.js +29 -0
- package/src/exporter/tools/xml.js +531 -0
- package/src/exporter/tools/xml_zip.js +95 -0
- package/src/exporter/tools/zip.js +90 -0
- package/src/exporter/tools/zotero_csl.js +93 -0
- package/src/importer/citations.js +129 -0
- package/src/importer/docx/citations.js +123 -0
- package/src/importer/docx/convert.js +1427 -0
- package/src/importer/docx/helpers.js +9 -0
- package/src/importer/docx/omml2mathml.js +1448 -0
- package/src/importer/docx/parse.js +735 -0
- package/src/importer/native/get_images.js +76 -0
- package/src/importer/native/update.js +29 -0
- package/src/importer/odt/citations.js +87 -0
- package/src/importer/odt/convert.js +1855 -0
- package/src/importer/pandoc/convert.js +884 -0
- package/src/importer/pandoc/helpers.js +84 -0
- package/src/importer/zip_analyzer.js +102 -0
- package/src/index.js +1 -0
- package/src/mathlive/opf_includes.js +24 -0
- package/src/schema/common/annotate.js +76 -0
- package/src/schema/common/base.js +118 -0
- package/src/schema/common/citation.js +62 -0
- package/src/schema/common/equation.js +31 -0
- package/src/schema/common/figure.js +190 -0
- package/src/schema/common/heading.js +43 -0
- package/src/schema/common/index.js +40 -0
- package/src/schema/common/list.js +95 -0
- package/src/schema/common/reference.js +100 -0
- package/src/schema/common/table.js +103 -0
- package/src/schema/common/track.js +190 -0
- package/src/schema/const.js +58 -0
- package/src/schema/convert.js +1272 -0
- package/src/schema/document/content.js +187 -0
- package/src/schema/document/index.js +117 -0
- package/src/schema/document/structure.js +452 -0
- package/src/schema/export.js +21 -0
- package/src/schema/footnotes.js +126 -0
- package/src/schema/footnotes_convert.js +31 -0
- package/src/schema/i18n.js +595 -0
- package/src/schema/index.js +5 -0
- package/src/schema/mini_json.js +61 -0
- package/src/schema/text.js +22 -0
|
@@ -0,0 +1,735 @@
|
|
|
1
|
+
import {xmlDOM} from "../../exporter/tools/xml.js"
|
|
2
|
+
import {randomCommentId} from "../../schema/common/index.js"
|
|
3
|
+
|
|
4
|
+
const DEFAULT_STYLES_XML = `<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<w:styles xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
</w:styles>`

// Lower-case font-name fragments that isCodeStyle() treats as monospace.
const MONOSPACE_FONT_PATTERNS = [
    "courier",
    "consolas",
    "monaco",
    "menlo",
    "lucida console",
    "liberation mono",
    "dejavu sans mono",
    "bitstream vera sans mono",
    "source code pro",
    "fira code",
    "ubuntu mono",
    "droid sans mono",
    "monospace"
]

// `w:val` values that explicitly switch an on/off property off.
const TOGGLE_OFF_VALUES = ["0", "false", "off"]

/**
 * Parse a decimal integer attribute value.
 *
 * @param {string|null|undefined} value - Raw attribute value.
 * @param {number} [fallback=0] - Returned when `value` is missing or not numeric.
 * @returns {number} The parsed integer, or `fallback`.
 */
function parseIntAttr(value, fallback = 0) {
    const parsed = Number.parseInt(value, 10)
    return Number.isNaN(parsed) ? fallback : parsed
}

/**
 * Tell whether an on/off property element is present and not switched off.
 *
 * A bare element (no `w:val`) counts as on; `w:val` of "0", "false" or "off"
 * counts as off.
 *
 * @param {object} parent - Node exposing `query(selector)`.
 * @param {string} selector - Element name such as "w:b".
 * @returns {boolean} True when the property is in effect.
 */
function isToggleOn(parent, selector) {
    const element = parent.query(selector)
    if (!element) {
        return false
    }
    return !TOGGLE_OFF_VALUES.includes(element.getAttribute("w:val"))
}

/**
 * Label used for comments without an author.
 *
 * NOTE(review): `gettext` is neither imported nor defined in this file; it is
 * presumably a global provided by the host application. Fall back to the
 * untranslated string when it is absent instead of throwing.
 *
 * @returns {string} The (possibly translated) label.
 */
function unknownAuthorLabel() {
    return typeof gettext === "function" ? gettext("Unknown") : "Unknown"
}

/**
 * Reads the parts of a DOCX package (styles, numbering, comments, notes,
 * relationships, images, properties and the main document) from a zip object
 * and stores the results on the instance.
 *
 * The zip object must provide `file(path)` returning an entry with
 * `async(type)`, and a `files` map keyed by path (presumably JSZip).
 * Call `init()` once after construction to populate all fields.
 */
export class DocxParser {
    /**
     * @param {object} zip - Opened zip archive of the DOCX file.
     */
    constructor(zip) {
        this.zip = zip
        this.styles = {}
        this.numbering = {}
        this.comments = {}
        this.footnotes = {}
        this.endnotes = {}
        this.relationships = {}
        this.images = {}

        this.coreDoc = null
        this.customDoc = null
        this.document = null
    }

    /**
     * Run all parse steps one after another.
     *
     * The order matters for comments: parseCommentsExtended() works on the
     * entries created by parseComments().
     *
     * @returns {Promise<void>} Resolves when every part has been processed.
     */
    async init() {
        await this.parseStyles()
        await this.parseNumbering()
        await this.parseComments()
        await this.parseCommentsExtended()
        await this.parseFootnotes()
        await this.parseEndnotes()
        await this.parseRelationships()
        await this.parseImages()
        await this.parseCoreDoc()
        await this.parseCustomDoc()
        await this.parseDocument()
    }

    /**
     * Read a zip entry as text.
     *
     * @param {string} path - Path inside the archive.
     * @returns {Promise<string>|undefined} Promise for the text, or undefined
     *     when the entry does not exist.
     */
    readZipText(path) {
        return this.zip.file(path)?.async("string")
    }

    /**
     * Read a zip entry and parse it with xmlDOM.
     *
     * @param {string} path - Path inside the archive.
     * @param {string} label - Name used in the warning when parsing fails.
     * @returns {Promise<object|null>} The parsed document, or null when the
     *     entry is missing, empty or could not be parsed.
     */
    async loadXmlPart(path, label) {
        try {
            const content = await this.readZipText(path)
            return content ? xmlDOM(content) : null
        } catch (err) {
            console.warn(`Could not parse ${label}`, err)
            return null
        }
    }

    /**
     * Fill `this.styles` from word/styles.xml, keyed by style id.
     * An empty default stylesheet is used when the part is missing.
     */
    async parseStyles() {
        try {
            const content = await this.readZipText("word/styles.xml")
            const stylesDoc = xmlDOM(content || DEFAULT_STYLES_XML)

            stylesDoc.queryAll("w:style").forEach(style => {
                const id = style.getAttribute("w:styleId")
                const type = style.getAttribute("w:type")
                const name = style.query("w:name")?.getAttribute("w:val")
                const basedOn = style.query("w:basedOn")?.getAttribute("w:val")

                this.styles[id] = {
                    id,
                    type,
                    name,
                    isHeading:
                        (id && /heading\d+/i.test(id)) ||
                        (basedOn && /heading\d+/i.test(basedOn)),
                    isCaption:
                        (id && /caption/i.test(id)) ||
                        (basedOn && /caption/i.test(basedOn)),
                    // First run of digits in the style id, 0 when there is none.
                    level: parseIntAttr(id?.match(/\d+/)?.[0], 0),
                    basedOn,
                    paragraphProps: this.extractParagraphProperties(style),
                    // The whole style element is passed here, so run properties
                    // are looked up relative to it rather than to a w:rPr node.
                    runProps: this.extractRunProperties(style)
                }
            })
        } catch (err) {
            console.warn("Could not parse styles", err)
        }
    }

    /**
     * Decide whether a style, or any style it is based on, looks like a code
     * style: by id ("code", "html", "pre"), by name containing "code", or by a
     * monospace font family.
     *
     * @param {string} styleId - Id of the style to start from.
     * @returns {boolean} True when a code-like style is found in the chain.
     */
    isCodeStyle(styleId) {
        // `visited` stops the walk if basedOn references form a cycle.
        const visited = new Set()
        let current = styleId
        while (current && !visited.has(current)) {
            visited.add(current)
            const style = this.styles[current]
            if (!style) {
                return false
            }
            const name = style.name?.toLowerCase() || ""
            if (
                name.includes("code") ||
                /^(code|html|pre)(\s|$)/i.test(style.id)
            ) {
                return true
            }
            const fontFamily = style.runProps?.fontFamily
            if (fontFamily) {
                const lowerFont = fontFamily.toLowerCase()
                if (
                    MONOSPACE_FONT_PATTERNS.some(pattern =>
                        lowerFont.includes(pattern)
                    )
                ) {
                    return true
                }
            }
            current = style.basedOn
        }
        return false
    }

    /**
     * Extract paragraph properties from the `w:pPr` found under a node.
     *
     * @param {object} style - Node that may contain a `w:pPr`.
     * @returns {object} `{indent, alignment, numbering, keepNext}` or `{}`
     *     when there is no `w:pPr`.
     */
    extractParagraphProperties(style) {
        const pPr = style.query("w:pPr")
        if (!pPr) {
            return {}
        }

        return {
            indent: this.extractIndentation(pPr),
            alignment: pPr.query("w:jc")?.getAttribute("w:val"),
            numbering: this.extractNumbering(pPr),
            keepNext: isToggleOn(pPr, "w:keepNext")
        }
    }

    /**
     * Extract indentation values from `w:ind`.
     *
     * @param {object} pPr - Paragraph properties node.
     * @returns {object} `{left, right, hanging, firstLine}` as integers
     *     (0 when absent or not numeric), or `{}` when there is no `w:ind`.
     */
    extractIndentation(pPr) {
        const ind = pPr.query("w:ind")
        if (!ind) {
            return {}
        }

        return {
            left: parseIntAttr(
                ind.getAttribute("w:left") || ind.getAttribute("w:start")
            ),
            right: parseIntAttr(
                ind.getAttribute("w:right") || ind.getAttribute("w:end")
            ),
            hanging: parseIntAttr(ind.getAttribute("w:hanging")),
            firstLine: parseIntAttr(ind.getAttribute("w:firstLine"))
        }
    }

    /**
     * Extract the list reference from `w:numPr`.
     *
     * @param {object} pPr - Paragraph properties node.
     * @returns {{id: string, level: number}|null} Numbering id and level, or
     *     null when the paragraph has no `w:numPr`.
     */
    extractNumbering(pPr) {
        const numPr = pPr.query("w:numPr")
        if (!numPr) {
            return null
        }

        return {
            id: numPr.query("w:numId")?.getAttribute("w:val"),
            level: parseIntAttr(numPr.query("w:ilvl")?.getAttribute("w:val"))
        }
    }

    /**
     * Extract character formatting from a run-properties node.
     *
     * On/off properties honour an explicit off value in `w:val`.
     *
     * @param {object|null} rPr - Node to read from; falsy yields `{}`.
     * @returns {object} `{bold, italic, underline, strike, smallCaps,
     *     vertAlign, fontSize, color, fontFamily}`. `fontSize` is the `w:sz`
     *     value divided by two; missing string values are `false`.
     */
    extractRunProperties(rPr) {
        if (!rPr) {
            return {}
        }

        return {
            bold: isToggleOn(rPr, "w:b"),
            italic: isToggleOn(rPr, "w:i"),
            underline: rPr.query("w:u")?.getAttribute("w:val") || false,
            strike: isToggleOn(rPr, "w:strike"),
            smallCaps: isToggleOn(rPr, "w:smallCaps"),
            vertAlign: rPr.query("w:vertAlign")?.getAttribute("w:val") || false,
            fontSize:
                parseIntAttr(rPr.query("w:sz")?.getAttribute("w:val")) / 2,
            color: rPr.query("w:color")?.getAttribute("w:val") || false,
            fontFamily: rPr.query("w:rFonts")?.getAttribute("w:ascii") || false
        }
    }

    /**
     * Fill `this.numbering` from word/numbering.xml, keyed by `w:numId`.
     * Each entry holds the abstract id, the level definitions of that
     * abstract numbering and any level overrides.
     */
    async parseNumbering() {
        try {
            const content = await this.readZipText("word/numbering.xml")
            if (!content) {
                return
            }
            const numberingDoc = xmlDOM(content)

            // Abstract numbering definitions, keyed by abstractNumId.
            const abstractNumbering = {}
            numberingDoc.queryAll("w:abstractNum").forEach(abstractNum => {
                const id = abstractNum.getAttribute("w:abstractNumId")
                abstractNumbering[id] = abstractNum
                    .queryAll("w:lvl")
                    .map(lvl => ({
                        level: lvl.getAttribute("w:ilvl"),
                        format: lvl.query("w:numFmt")?.getAttribute("w:val"),
                        text: lvl.query("w:lvlText")?.getAttribute("w:val"),
                        start: parseIntAttr(
                            lvl.query("w:start")?.getAttribute("w:val"),
                            1
                        )
                    }))
            })

            // Numbering instances that point at an abstract definition.
            numberingDoc.queryAll("w:num").forEach(num => {
                const numId = num.getAttribute("w:numId")
                const abstractNumId = num
                    .query("w:abstractNumId")
                    ?.getAttribute("w:val")

                this.numbering[numId] = {
                    abstractId: abstractNumId,
                    levels: abstractNumbering[abstractNumId] || [],
                    overrides: this.extractNumberingOverrides(num)
                }
            })
        } catch (err) {
            console.warn("Could not parse numbering", err)
        }
    }

    /**
     * Collect the `w:lvlOverride` entries of a numbering instance.
     *
     * @param {object} num - `w:num` node.
     * @returns {Array<{level: string, start: number}>} One entry per override;
     *     `start` defaults to 1.
     */
    extractNumberingOverrides(num) {
        return num.queryAll("w:lvlOverride").map(override => ({
            level: override.getAttribute("w:ilvl"),
            start: parseIntAttr(
                override.query("w:startOverride")?.getAttribute("w:val"),
                1
            )
        }))
    }

    /**
     * Fill `this.comments` from word/comments.xml, keyed by `w:id`.
     * Every comment starts unresolved and without answers.
     */
    async parseComments() {
        try {
            const content = await this.readZipText("word/comments.xml")
            if (!content) {
                return
            }
            const commentsDoc = xmlDOM(content)

            commentsDoc.queryAll("w:comment").forEach(comment => {
                const id = comment.getAttribute("w:id")
                const dateStr = comment.getAttribute("w:date")
                // An unparsable date would give NaN; use "now" in that case,
                // just like for a missing date.
                const timestamp = dateStr ? new Date(dateStr).getTime() : NaN
                this.comments[id] = {
                    user: 0,
                    username:
                        comment.getAttribute("w:author") ||
                        unknownAuthorLabel(),
                    date: Number.isNaN(timestamp) ? Date.now() : timestamp,
                    comment: this.extractCommentContent(comment),
                    answers: [],
                    resolved: false,
                    isMajor: false
                }
            })
        } catch (err) {
            console.warn("Could not parse comments", err)
        }
    }

    /**
     * Read word/commentsExtended.xml and use it to mark comments as resolved
     * and to fold reply comments into their parent's `answers`.
     */
    async parseCommentsExtended() {
        try {
            const content = await this.readZipText("word/commentsExtended.xml")
            if (!content) {
                return
            }
            const extendedEntries =
                xmlDOM(content).queryAll("w15:commentEx")

            if (!extendedEntries.length) {
                return
            }

            // Entries without a parent are main comments, the rest are answers.
            const mainEntries = []
            const answerEntries = []

            extendedEntries.forEach(entry => {
                const paraId = entry.getAttribute("w15:paraId")
                if (!paraId) {
                    return
                }
                const done = entry.getAttribute("w15:done") === "1"
                const paraIdParent = entry.getAttribute("w15:paraIdParent")
                if (paraIdParent) {
                    answerEntries.push({
                        paraId,
                        parentParaId: paraIdParent,
                        done
                    })
                } else {
                    mainEntries.push({paraId, done})
                }
            })

            this.linkCommentThreads(mainEntries, answerEntries)
        } catch (err) {
            console.warn("Could not parse comments extended", err)
        }
    }

    /**
     * Apply extended-comment data to `this.comments`.
     *
     * Comments are matched to entries by position: with comment ids sorted
     * numerically, the first `mainEntries.length` comments are taken to be the
     * main comments (in `mainEntries` order) and the following ones the
     * answers (in `answerEntries` order). Each answer is attached to the main
     * comment whose entry has the answer's `parentParaId`; when no such entry
     * exists the last main comment is used. Attached answers are removed from
     * the top level of `this.comments`.
     *
     * @param {Array<{paraId: string, done: boolean}>} mainEntries
     * @param {Array<{paraId: string, parentParaId: string, done: boolean}>} answerEntries
     */
    linkCommentThreads(mainEntries, answerEntries) {
        const commentIds = Object.keys(this.comments)
            .map(Number)
            .sort((a, b) => a - b)
            .map(String)

        const parentCommentIds = []
        const parentIdByParaId = new Map()

        commentIds.forEach((commentId, index) => {
            const comment = this.comments[commentId]
            if (!comment) {
                // Non-numeric ids do not survive the Number round trip.
                return
            }

            if (index < mainEntries.length) {
                const entry = mainEntries[index]
                comment.resolved = entry.done
                parentCommentIds.push(commentId)
                parentIdByParaId.set(entry.paraId, commentId)
                return
            }

            const answerEntry = answerEntries[index - mainEntries.length]
            if (!answerEntry) {
                return
            }

            const parentId =
                parentIdByParaId.get(answerEntry.parentParaId) ??
                parentCommentIds[parentCommentIds.length - 1]
            const parent = parentId ? this.comments[parentId] : null
            if (!parent) {
                return
            }

            parent.answers.push({
                id: randomCommentId(),
                user: 0,
                username: comment.username,
                date: comment.date,
                answer: comment.comment
            })
            // The answer now lives under its parent only.
            delete this.comments[commentId]
        })
    }

    /**
     * Convert the paragraphs of a comment into paragraph nodes.
     *
     * @param {object} comment - `w:comment` node.
     * @returns {Array<object>} One paragraph node per `w:p`.
     */
    extractCommentContent(comment) {
        return this.extractBlockContent(comment)
    }

    /**
     * Shared implementation for footnotes and endnotes: read the part, skip
     * the entries with id "0" and "-1" (separators) and store the rest.
     *
     * @param {string} path - Path of the notes part inside the archive.
     * @param {string} selector - Element name of a single note.
     * @param {object} store - Object to fill, keyed by note id.
     * @param {string} label - Name used in the warning when parsing fails.
     */
    async parseNotes(path, selector, store, label) {
        try {
            const content = await this.readZipText(path)
            if (!content) {
                return
            }

            xmlDOM(content)
                .queryAll(selector)
                .forEach(note => {
                    const id = note.getAttribute("w:id")
                    if (id === "0" || id === "-1") {
                        return // Skip separator notes
                    }
                    store[id] = {
                        id,
                        content: this.extractBlockContent(note)
                    }
                })
        } catch (err) {
            console.warn(`Could not parse ${label}`, err)
        }
    }

    /**
     * Fill `this.footnotes` from word/footnotes.xml.
     */
    async parseFootnotes() {
        await this.parseNotes(
            "word/footnotes.xml",
            "w:footnote",
            this.footnotes,
            "footnotes"
        )
    }

    /**
     * Fill `this.endnotes` from word/endnotes.xml.
     */
    async parseEndnotes() {
        await this.parseNotes(
            "word/endnotes.xml",
            "w:endnote",
            this.endnotes,
            "endnotes"
        )
    }

    /**
     * Fill `this.relationships` from word/_rels/document.xml.rels, keyed by
     * relationship id.
     */
    async parseRelationships() {
        try {
            const content = await this.readZipText(
                "word/_rels/document.xml.rels"
            )
            if (!content) {
                return
            }

            xmlDOM(content)
                .queryAll("Relationship")
                .forEach(rel => {
                    const id = rel.getAttribute("Id")
                    this.relationships[id] = {
                        id,
                        type: rel.getAttribute("Type"),
                        target: rel.getAttribute("Target")
                    }
                })
        } catch (err) {
            console.warn("Could not parse relationships", err)
        }
    }

    /**
     * Fill `this.images` with a File per entry under word/media/, keyed by
     * file name. A failing image is logged and skipped.
     */
    async parseImages() {
        const imagePaths = Object.keys(this.zip.files).filter(
            path =>
                path.startsWith("word/media/") &&
                // Directory entries carry no data (presumably JSZip's `dir`
                // flag; entries without the flag are treated as files).
                !this.zip.files[path]?.dir
        )

        for (const path of imagePaths) {
            try {
                const blob = await this.zip.file(path).async("blob")
                const filename = path.split("/").pop()
                this.images[filename] = this.addMimeType(blob, filename)
            } catch (err) {
                console.warn(`Could not parse image ${path}`, err)
            }
        }
    }

    /**
     * Wrap image data in a File whose type is derived from the file name.
     *
     * @param {Blob} blob - Raw image data.
     * @param {string} filename - Name used for the File and the type lookup.
     * @returns {File} The typed file.
     */
    addMimeType(blob, filename) {
        return new File([blob], filename, {
            type: this.getImageFileType(filename)
        })
    }

    /**
     * Map a file name to an image MIME type by its extension
     * (case-insensitive).
     *
     * @param {string} filename - File name.
     * @returns {string} MIME type; "image/png" for unknown extensions.
     */
    getImageFileType(filename) {
        const ext = filename.split(".").pop().toLowerCase()
        switch (ext) {
            case "avif":
            case "avifs":
                return "image/avif"
            case "png":
                return "image/png"
            case "jpg":
            case "jpeg":
                return "image/jpeg"
            case "gif":
                return "image/gif"
            case "svg":
                return "image/svg+xml"
            case "webp":
                return "image/webp"
            default:
                return "image/png" // Default fallback
        }
    }

    /**
     * Convert every `w:p` found under a node into a paragraph node.
     *
     * @param {object} node - Container such as a footnote or comment.
     * @returns {Array<{type: string, content: Array<object>}>} Paragraphs.
     */
    extractBlockContent(node) {
        return node.queryAll("w:p").map(p => ({
            type: "paragraph",
            content: this.extractParagraphContent(p)
        }))
    }

    /**
     * Convert the runs of a paragraph into text nodes.
     *
     * Only the text of the `w:t` returned by `query` is read for each run;
     * runs without text are skipped, so breaks, drawings, fields and
     * equations do not appear in the result.
     *
     * @param {object} p - `w:p` node.
     * @returns {Array<{type: string, text: string, marks: Array<object>}>}
     */
    extractParagraphContent(p) {
        const content = []
        p.queryAll("w:r").forEach(r => {
            const text = r.query("w:t")?.textContent
            if (!text) {
                return
            }

            const rPr = r.query("w:rPr")
            const formatting = rPr ? this.extractRunProperties(rPr) : {}

            content.push({
                type: "text",
                text,
                marks: this.createMarksFromFormatting(formatting)
            })
        })
        return content
    }

    /**
     * Turn formatting flags into marks.
     *
     * @param {object} formatting - Result of extractRunProperties().
     * @returns {Array<{type: string}>} "strong", "em" and "underline" marks,
     *     in that order, for the flags that are set. An underline value of
     *     "none" does not produce a mark.
     */
    createMarksFromFormatting(formatting) {
        const marks = []
        if (formatting.bold) {
            marks.push({type: "strong"})
        }
        if (formatting.italic) {
            marks.push({type: "em"})
        }
        if (formatting.underline && formatting.underline !== "none") {
            marks.push({type: "underline"})
        }
        return marks
    }

    /**
     * Parse docProps/core.xml into `this.coreDoc` when the part exists.
     */
    async parseCoreDoc() {
        const doc = await this.loadXmlPart("docProps/core.xml", "core doc")
        if (doc) {
            this.coreDoc = doc
        }
    }

    /**
     * Parse docProps/custom.xml into `this.customDoc` when the part exists.
     */
    async parseCustomDoc() {
        const doc = await this.loadXmlPart("docProps/custom.xml", "custom doc")
        if (doc) {
            this.customDoc = doc
        }
    }

    /**
     * Parse word/document.xml into `this.document` when the part exists.
     */
    async parseDocument() {
        const doc = await this.loadXmlPart("word/document.xml", "document")
        if (doc) {
            this.document = doc
        }
    }
}
|