@usejunior/docx-core 0.9.1 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/.tsbuildinfo +1 -1
- package/dist/atomizer.d.ts +15 -1
- package/dist/atomizer.d.ts.map +1 -1
- package/dist/atomizer.js +37 -1
- package/dist/atomizer.js.map +1 -1
- package/dist/baselines/atomizer/documentReconstructor.d.ts.map +1 -1
- package/dist/baselines/atomizer/documentReconstructor.js +218 -90
- package/dist/baselines/atomizer/documentReconstructor.js.map +1 -1
- package/dist/baselines/atomizer/formattingFidelity.d.ts +99 -0
- package/dist/baselines/atomizer/formattingFidelity.d.ts.map +1 -0
- package/dist/baselines/atomizer/formattingFidelity.js +449 -0
- package/dist/baselines/atomizer/formattingFidelity.js.map +1 -0
- package/dist/baselines/atomizer/inPlaceModifier-bookmarks.d.ts +37 -0
- package/dist/baselines/atomizer/inPlaceModifier-bookmarks.d.ts.map +1 -0
- package/dist/baselines/atomizer/inPlaceModifier-bookmarks.js +189 -0
- package/dist/baselines/atomizer/inPlaceModifier-bookmarks.js.map +1 -0
- package/dist/baselines/atomizer/inPlaceModifier-containers.d.ts +74 -0
- package/dist/baselines/atomizer/inPlaceModifier-containers.d.ts.map +1 -0
- package/dist/baselines/atomizer/inPlaceModifier-containers.js +171 -0
- package/dist/baselines/atomizer/inPlaceModifier-containers.js.map +1 -0
- package/dist/baselines/atomizer/inPlaceModifier-deletion.d.ts +88 -0
- package/dist/baselines/atomizer/inPlaceModifier-deletion.d.ts.map +1 -0
- package/dist/baselines/atomizer/inPlaceModifier-deletion.js +326 -0
- package/dist/baselines/atomizer/inPlaceModifier-deletion.js.map +1 -0
- package/dist/baselines/atomizer/inPlaceModifier-postprocess.d.ts +85 -0
- package/dist/baselines/atomizer/inPlaceModifier-postprocess.d.ts.map +1 -0
- package/dist/baselines/atomizer/inPlaceModifier-postprocess.js +402 -0
- package/dist/baselines/atomizer/inPlaceModifier-postprocess.js.map +1 -0
- package/dist/baselines/atomizer/inPlaceModifier-presplit.d.ts +39 -0
- package/dist/baselines/atomizer/inPlaceModifier-presplit.d.ts.map +1 -0
- package/dist/baselines/atomizer/inPlaceModifier-presplit.js +265 -0
- package/dist/baselines/atomizer/inPlaceModifier-presplit.js.map +1 -0
- package/dist/baselines/atomizer/inPlaceModifier-shared.d.ts +62 -0
- package/dist/baselines/atomizer/inPlaceModifier-shared.d.ts.map +1 -0
- package/dist/baselines/atomizer/inPlaceModifier-shared.js +139 -0
- package/dist/baselines/atomizer/inPlaceModifier-shared.js.map +1 -0
- package/dist/baselines/atomizer/inPlaceModifier-wrappers.d.ts +189 -0
- package/dist/baselines/atomizer/inPlaceModifier-wrappers.d.ts.map +1 -0
- package/dist/baselines/atomizer/inPlaceModifier-wrappers.js +427 -0
- package/dist/baselines/atomizer/inPlaceModifier-wrappers.js.map +1 -0
- package/dist/baselines/atomizer/inPlaceModifier.d.ts +6 -290
- package/dist/baselines/atomizer/inPlaceModifier.d.ts.map +1 -1
- package/dist/baselines/atomizer/inPlaceModifier.js +23 -1828
- package/dist/baselines/atomizer/inPlaceModifier.js.map +1 -1
- package/dist/baselines/atomizer/pipeline.d.ts +76 -1
- package/dist/baselines/atomizer/pipeline.d.ts.map +1 -1
- package/dist/baselines/atomizer/pipeline.js +204 -27
- package/dist/baselines/atomizer/pipeline.js.map +1 -1
- package/dist/baselines/atomizer/trackChangesAcceptorAst.d.ts.map +1 -1
- package/dist/baselines/atomizer/trackChangesAcceptorAst.js +56 -160
- package/dist/baselines/atomizer/trackChangesAcceptorAst.js.map +1 -1
- package/dist/compare-types.d.ts +151 -0
- package/dist/compare-types.d.ts.map +1 -0
- package/dist/compare-types.js +2 -0
- package/dist/compare-types.js.map +1 -0
- package/dist/core-types.d.ts +5 -1
- package/dist/core-types.d.ts.map +1 -1
- package/dist/core-types.js +5 -1
- package/dist/core-types.js.map +1 -1
- package/dist/footnotes.d.ts +8 -3
- package/dist/footnotes.d.ts.map +1 -1
- package/dist/footnotes.js +8 -3
- package/dist/footnotes.js.map +1 -1
- package/dist/index.d.ts +6 -150
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +6 -0
- package/dist/index.js.map +1 -1
- package/dist/integration/libreoffice-oracle.d.ts +41 -0
- package/dist/integration/libreoffice-oracle.d.ts.map +1 -0
- package/dist/integration/libreoffice-oracle.js +282 -0
- package/dist/integration/libreoffice-oracle.js.map +1 -0
- package/dist/primitives/accept_changes.d.ts +2 -2
- package/dist/primitives/accept_changes.d.ts.map +1 -1
- package/dist/primitives/accept_changes.js +24 -79
- package/dist/primitives/accept_changes.js.map +1 -1
- package/dist/primitives/comments.d.ts +12 -3
- package/dist/primitives/comments.d.ts.map +1 -1
- package/dist/primitives/comments.js +374 -97
- package/dist/primitives/comments.js.map +1 -1
- package/dist/primitives/content_fingerprint.d.ts +29 -0
- package/dist/primitives/content_fingerprint.d.ts.map +1 -0
- package/dist/primitives/content_fingerprint.js +63 -0
- package/dist/primitives/content_fingerprint.js.map +1 -0
- package/dist/primitives/document.d.ts +56 -15
- package/dist/primitives/document.d.ts.map +1 -1
- package/dist/primitives/document.js +303 -32
- package/dist/primitives/document.js.map +1 -1
- package/dist/primitives/document_view-comments.d.ts +18 -0
- package/dist/primitives/document_view-comments.d.ts.map +1 -0
- package/dist/primitives/document_view-comments.js +159 -0
- package/dist/primitives/document_view-comments.js.map +1 -0
- package/dist/primitives/document_view-headings.d.ts +45 -0
- package/dist/primitives/document_view-headings.d.ts.map +1 -0
- package/dist/primitives/document_view-headings.js +247 -0
- package/dist/primitives/document_view-headings.js.map +1 -0
- package/dist/primitives/document_view-styles.d.ts +11 -0
- package/dist/primitives/document_view-styles.d.ts.map +1 -0
- package/dist/primitives/document_view-styles.js +104 -0
- package/dist/primitives/document_view-styles.js.map +1 -0
- package/dist/primitives/document_view-toon.d.ts +37 -0
- package/dist/primitives/document_view-toon.d.ts.map +1 -0
- package/dist/primitives/document_view-toon.js +199 -0
- package/dist/primitives/document_view-toon.js.map +1 -0
- package/dist/primitives/document_view-types.d.ts +137 -0
- package/dist/primitives/document_view-types.d.ts.map +1 -0
- package/dist/primitives/document_view-types.js +2 -0
- package/dist/primitives/document_view-types.js.map +1 -0
- package/dist/primitives/document_view.d.ts +8 -106
- package/dist/primitives/document_view.d.ts.map +1 -1
- package/dist/primitives/document_view.js +134 -301
- package/dist/primitives/document_view.js.map +1 -1
- package/dist/primitives/dom-helpers.d.ts +9 -0
- package/dist/primitives/dom-helpers.d.ts.map +1 -1
- package/dist/primitives/dom-helpers.js +10 -1
- package/dist/primitives/dom-helpers.js.map +1 -1
- package/dist/primitives/footnotes.d.ts +4 -3
- package/dist/primitives/footnotes.d.ts.map +1 -1
- package/dist/primitives/footnotes.js +232 -44
- package/dist/primitives/footnotes.js.map +1 -1
- package/dist/primitives/formatting_tags.d.ts +6 -0
- package/dist/primitives/formatting_tags.d.ts.map +1 -1
- package/dist/primitives/formatting_tags.js +6 -1
- package/dist/primitives/formatting_tags.js.map +1 -1
- package/dist/primitives/index.d.ts +6 -0
- package/dist/primitives/index.d.ts.map +1 -1
- package/dist/primitives/index.js +5 -0
- package/dist/primitives/index.js.map +1 -1
- package/dist/primitives/layout.d.ts +4 -3
- package/dist/primitives/layout.d.ts.map +1 -1
- package/dist/primitives/layout.js +32 -3
- package/dist/primitives/layout.js.map +1 -1
- package/dist/primitives/merge_runs.d.ts +21 -3
- package/dist/primitives/merge_runs.d.ts.map +1 -1
- package/dist/primitives/merge_runs.js +32 -10
- package/dist/primitives/merge_runs.js.map +1 -1
- package/dist/primitives/namespaces.d.ts +6 -0
- package/dist/primitives/namespaces.d.ts.map +1 -1
- package/dist/primitives/namespaces.js +9 -0
- package/dist/primitives/namespaces.js.map +1 -1
- package/dist/primitives/reject_changes.d.ts +2 -2
- package/dist/primitives/reject_changes.d.ts.map +1 -1
- package/dist/primitives/reject_changes.js +24 -81
- package/dist/primitives/reject_changes.js.map +1 -1
- package/dist/primitives/semantic_tags.d.ts +7 -0
- package/dist/primitives/semantic_tags.d.ts.map +1 -1
- package/dist/primitives/semantic_tags.js +21 -3
- package/dist/primitives/semantic_tags.js.map +1 -1
- package/dist/primitives/serialize_html.d.ts +36 -0
- package/dist/primitives/serialize_html.d.ts.map +1 -0
- package/dist/primitives/serialize_html.js +393 -0
- package/dist/primitives/serialize_html.js.map +1 -0
- package/dist/primitives/serialize_markdown.d.ts +16 -0
- package/dist/primitives/serialize_markdown.d.ts.map +1 -0
- package/dist/primitives/serialize_markdown.js +300 -0
- package/dist/primitives/serialize_markdown.js.map +1 -0
- package/dist/primitives/serialize_plaintext.d.ts +15 -0
- package/dist/primitives/serialize_plaintext.d.ts.map +1 -0
- package/dist/primitives/serialize_plaintext.js +154 -0
- package/dist/primitives/serialize_plaintext.js.map +1 -0
- package/dist/primitives/styles.js +22 -22
- package/dist/primitives/styles.js.map +1 -1
- package/dist/primitives/tables.d.ts.map +1 -1
- package/dist/primitives/tables.js +13 -3
- package/dist/primitives/tables.js.map +1 -1
- package/dist/primitives/text.d.ts +2 -1
- package/dist/primitives/text.d.ts.map +1 -1
- package/dist/primitives/text.js +116 -12
- package/dist/primitives/text.js.map +1 -1
- package/dist/primitives/track-changes-emitter.d.ts +139 -0
- package/dist/primitives/track-changes-emitter.d.ts.map +1 -0
- package/dist/primitives/track-changes-emitter.js +241 -0
- package/dist/primitives/track-changes-emitter.js.map +1 -0
- package/dist/primitives/xml-helpers.d.ts +29 -0
- package/dist/primitives/xml-helpers.d.ts.map +1 -0
- package/dist/primitives/xml-helpers.js +35 -0
- package/dist/primitives/xml-helpers.js.map +1 -0
- package/dist/shared/ooxml/namespaces.d.ts +4 -1
- package/dist/shared/ooxml/namespaces.d.ts.map +1 -1
- package/dist/shared/ooxml/namespaces.js +4 -1
- package/dist/shared/ooxml/namespaces.js.map +1 -1
- package/package.json +7 -6
|
@@ -0,0 +1,300 @@
|
|
|
1
|
+
// DOCX → Markdown serializer.
|
|
2
|
+
//
|
|
3
|
+
// This is a *serializer over the existing structured document model* — it does no OOXML
|
|
4
|
+
// parsing. `DocxDocument.buildDocumentView({ showFormatting: true })` already yields a
|
|
5
|
+
// `DocumentViewNode[]` carrying headings, list metadata, grid-aware table context, injected
|
|
6
|
+
// `[^n]` footnote markers, and an HTML-shaped inline-tag string (`tagged_text`). This module
|
|
7
|
+
// turns that model into GitHub-Flavored Markdown.
|
|
8
|
+
//
|
|
9
|
+
// Markdown is intentionally *lossy*: there is no round-trip guarantee. Constructs without a
|
|
10
|
+
// Markdown equivalent (highlighting, font runs, merged/nested table cells, layout) are
|
|
11
|
+
// downgraded as documented below rather than preserved.
|
|
12
|
+
//
|
|
13
|
+
// The inline tokenizer (`inlineTagsToMarkdown`) is the reusable core; the planned HTML
|
|
14
|
+
// emitter (#304) renders the same tokens, so neither serializer reasons about the tag
|
|
15
|
+
// grammar independently and drifts from the emitter in `formatting_tags.ts`.
|
|
16
|
+
import { tokenizeToonInline } from './document_view.js';
|
|
17
|
+
import { LabelType } from './list_labels.js';
|
|
18
|
+
/** Footnote markers already injected into `tagged_text`, e.g. `[^1]`, `[^12]`. */
|
|
19
|
+
const FOOTNOTE_MARKER_RE = /\[\^\d+\]/g;
|
|
20
|
+
/**
 * Backslash-escape the characters that would start an *inline* Markdown construct
 * (backslash, backtick, `*`, `_`, `[`, `]`, `<`, `|`) so they render literally.
 *
 * Footnote markers matched by `FOOTNOTE_MARKER_RE` (e.g. `[^3]`) are copied through
 * verbatim: escaping their brackets would detach them from the `[^n]: …` definitions
 * appended at the end of the document. Block-level triggers (`#`, `-`, `>`, …) are not
 * handled here.
 *
 * @param {string} text Raw text of one inline span.
 * @returns {string} The text with inline-significant characters escaped.
 */
function escapeInlineText(text) {
    // `$&` re-inserts the matched character after the backslash.
    const protect = (chunk) => chunk.replace(/[\\`*_[\]<|]/g, '\\$&');
    const pieces = [];
    let cursor = 0;
    for (const { 0: marker, index = 0 } of text.matchAll(FOOTNOTE_MARKER_RE)) {
        // Escape the plain text before the marker, then keep the marker untouched.
        pieces.push(protect(text.slice(cursor, index)), marker);
        cursor = index + marker.length;
    }
    // Whatever follows the last marker (or the whole string when there is none).
    pieces.push(protect(text.slice(cursor)));
    return pieces.join('');
}
|
|
44
|
+
/**
 * Escape block-level Markdown triggers at the start of each line so ordinary paragraph
 * text that merely *looks* like a heading, block quote, list item, thematic break or
 * setext underline is rendered literally.
 *
 * Triggers handled (after optional leading spaces/tabs):
 *   - `#`–`######`, `-`, `+`, `*`, `N.` / `N)` when followed by a space, a tab or the end
 *     of the line (an empty heading / empty list item is still a block);
 *   - `>` in every position — a block quote marker does not need a trailing space;
 *   - a line made only of `--…` or `==…`, which would otherwise become a thematic break
 *     or turn the previous line into a setext heading.
 *
 * Every line of the input is checked, not just the first, because a line break inside a
 * paragraph can start a block just as the first line can. The emphasis delimiters emitted
 * elsewhere (`**`, `*` directly followed by text) never match these forms, so generated
 * Markdown is left intact.
 *
 * @param {string} line Markdown for one paragraph; may contain embedded newlines.
 * @returns {string} The same text with leading block triggers backslash-escaped.
 */
function escapeLeadingBlockSyntax(line) {
    // Built per call: the `g` flag makes a shared regex object stateful.
    const leadingTrigger = /^([ \t]*)(#{1,6}(?=[ \t]|$)|>|[-+*](?=[ \t]|$)|\d+[.)](?=[ \t]|$)|-{2,}(?=[ \t]*$)|=+(?=[ \t]*$))/gm;
    return line.replace(leadingTrigger, (_m, ws, trig) => {
        if (/^\d/.test(trig)) {
            // Ordered-list trigger: escape the delimiter (the `.` or `)`), keep the digits.
            return `${ws}${trig.slice(0, -1)}\\${trig.slice(-1)}`;
        }
        // Everything else: a backslash before the first trigger character is enough.
        return `${ws}\\${trig}`;
    });
}
|
|
59
|
+
/**
 * Convert an inline-tagged string (`tagged_text`) into inline Markdown.
 *
 * Mapping:
 *   - text tokens are passed through `escapeInlineText`;
 *   - `<b>` / `<i>` become `**` / `*`, emitted only where the active emphasis state
 *     actually changes between two pieces of output;
 *   - `<u>` / `</u>` are passed through as raw HTML;
 *   - `<a href="…">…</a>` becomes `[…](url)`; a destination containing whitespace,
 *     parentheses or angle brackets is written in the `<…>` form so it cannot terminate
 *     the link early;
 *   - any other tag (e.g. highlight, font) is dropped while its inner text is kept.
 *
 * A closing `</a>` with no open link is ignored; an `<a>` that is never closed is closed
 * at the end of the string.
 *
 * @param {string} text Inline-tagged text.
 * @returns {string} Inline Markdown.
 */
export function inlineTagsToMarkdown(text) {
    const DELIM = { b: '**', i: '*' };
    const EMPHASIS_OPS = new Map([
        ['<b>', { kind: 'b', dir: 1 }],
        ['</b>', { kind: 'b', dir: -1 }],
        ['<i>', { kind: 'i', dir: 1 }],
        ['</i>', { kind: 'i', dir: -1 }],
    ]);
    // A bare link destination may not contain whitespace or unbalanced parentheses; the
    // angle-bracket form may, as long as `<`/`>` are escaped and there is no line break.
    const toLinkDestination = (href) => {
        const url = href.replace(/[\r\n]+/g, '');
        return /[\s()<>]/.test(url) ? `<${url.replace(/[<>]/g, '\\$&')}>` : url;
    };

    // ── Pass 1: tokens → ops (literal Markdown pieces and emphasis toggles) ──
    const ops = [];
    const linkUrls = []; // stack of hrefs for currently open <a> tags
    for (const token of tokenizeToonInline(text)) {
        if (token.kind === 'text') {
            ops.push({ t: 'md', v: escapeInlineText(token.value) });
            continue;
        }
        const tag = token.value;
        const emphasis = EMPHASIS_OPS.get(tag);
        if (emphasis) {
            ops.push({ t: 'emph', kind: emphasis.kind, dir: emphasis.dir });
        }
        else if (tag === '<u>' || tag === '</u>') {
            ops.push({ t: 'md', v: tag }); // raw HTML passthrough
        }
        else if (tag.startsWith('<a ')) {
            linkUrls.push(/href="([^"]*)"/.exec(tag)?.[1] ?? '');
            ops.push({ t: 'md', v: '[' });
        }
        else if (tag === '</a>' && linkUrls.length > 0) {
            ops.push({ t: 'md', v: `](${toLinkDestination(linkUrls.pop())})` });
        }
        // Every other tag (and an unmatched </a>) emits nothing; inner text is kept.
    }
    // Defensive: an unbalanced <a> (no closing tag) would leave a dangling "["; close it.
    while (linkUrls.length > 0) {
        ops.push({ t: 'md', v: `](${toLinkDestination(linkUrls.pop())})` });
    }

    // ── Pass 2: emit, realizing emphasis changes lazily ──
    // Toggles only update `desired`; delimiters are written just before the next literal
    // piece, so boundary noise such as `</b><b>` or an empty `<b></b>` produces no output.
    let out = '';
    const activeStack = []; // emphasis kinds currently open, in open order
    const desired = new Set(); // target emphasis state after the toggles seen so far
    const reconcile = () => {
        // Close from the top until every still-open kind is wanted, remembering wanted
        // kinds that had to be closed because they sat above an unwanted one.
        const reopen = [];
        while (activeStack.length > 0 && !activeStack.every((k) => desired.has(k))) {
            const k = activeStack.pop();
            out += DELIM[k];
            if (desired.has(k)) {
                reopen.push(k);
            }
        }
        // Open what is wanted but not open: reopened kinds first (original open order),
        // then brand-new kinds in a stable order (b before i).
        const active = new Set(activeStack);
        const toOpen = [...reopen.reverse(), ...['b', 'i'].filter((k) => desired.has(k))];
        for (const k of toOpen) {
            if (active.has(k)) {
                continue;
            }
            active.add(k);
            activeStack.push(k);
            out += DELIM[k];
        }
    };
    for (const op of ops) {
        if (op.t === 'emph') {
            if (op.dir === 1) {
                desired.add(op.kind);
            }
            else {
                desired.delete(op.kind);
            }
            continue;
        }
        reconcile(); // realize pending state changes before emitting literal text/Markdown
        out += op.v;
    }
    desired.clear();
    reconcile(); // close any still-open emphasis at end of string
    return out;
}
|
|
139
|
+
/**
 * Decide whether a node should be rendered as a `#` heading.
 *
 * True only when the node's `heading.source` is `'word_style'` and `heading.level` is a
 * number; nodes without heading info are never structural.
 *
 * @param node A document-view node.
 * @returns {boolean}
 */
function isStructuralHeading(node) {
    const heading = node.heading;
    if (heading?.source !== 'word_style') {
        return false;
    }
    return typeof heading.level === 'number';
}
|
|
143
|
+
/**
 * Render one list node as a single Markdown list line.
 *
 * - Indentation is two spaces per `list_level` (negative levels are treated as 0).
 * - A node whose `label_type` is `LabelType.NUMBER` and which is auto-numbered becomes an
 *   ordered item (`1. …`), leaving the actual numbering to the Markdown renderer.
 * - Any other node becomes a bullet that keeps its literal label (`(a)`, `Article IV`, …)
 *   in front of the text; with no label it is a plain bullet.
 *
 * The item content is escaped so it stays literal: the label goes through
 * `escapeInlineText`, and the start of the content goes through
 * `escapeLeadingBlockSyntax`. Without that, a label such as `1.` or text such as `# x`
 * would be parsed as a nested list or heading inside the item.
 *
 * @param node A document-view node with `list_metadata` and `tagged_text`.
 * @returns {string} One Markdown list line without trailing whitespace.
 */
function renderListItem(node) {
    const lm = node.list_metadata;
    const indent = '  '.repeat(Math.max(0, lm.list_level));
    const text = inlineTagsToMarkdown(node.tagged_text).trim();
    if (lm.label_type === LabelType.NUMBER && lm.is_auto_numbered) {
        return `${indent}1. ${escapeLeadingBlockSyntax(text)}`.trimEnd();
    }
    const label = escapeInlineText(lm.label_string?.trim() ?? '');
    // Only the start of the item content can open a nested block, so escape it as a unit.
    const content = label ? `${label} ${text}` : text;
    return `${indent}- ${escapeLeadingBlockSyntax(content)}`.trimEnd();
}
|
|
161
|
+
/**
 * Turn the nodes of one table into the lines of a GFM table.
 *
 * The output is deliberately lossy, because GFM tables have no merged or nested cells:
 *   - the column count is the largest `total_cols` / `col_index + 1` seen, and any grid
 *     position without content is emitted as an empty cell so every row has that width;
 *   - several nodes landing in the same (row, column) are joined with `<br>`, as are
 *     newlines inside a single node's text (a raw newline would end the table row).
 *
 * Nodes without `table_context` are ignored.
 *
 * @param group Nodes belonging to the same table.
 * @returns {string[]} Header line, delimiter line, then body lines; `[]` when there is
 *   nothing to render.
 */
function renderTable(group) {
    const cellNodes = group.filter((n) => n.table_context);
    const totalCols = cellNodes.reduce(
        (widest, { table_context: tc }) => Math.max(widest, tc.total_cols, tc.col_index + 1),
        0,
    );
    if (totalCols <= 0) {
        return [];
    }

    // row_index → (col_index → rendered fragments), filled in document order.
    const grid = new Map();
    const flaggedHeaderRows = new Set();
    for (const { table_context: tc, tagged_text } of cellNodes) {
        let rowCells = grid.get(tc.row_index);
        if (rowCells === undefined) {
            rowCells = new Map();
            grid.set(tc.row_index, rowCells);
        }
        const fragments = rowCells.get(tc.col_index) ?? [];
        // Newlines inside a cell would split the row, so they become `<br>`.
        const rendered = inlineTagsToMarkdown(tagged_text).replace(/\s*\n+\s*/g, '<br>').trim();
        if (rendered) {
            fragments.push(rendered);
        }
        rowCells.set(tc.col_index, fragments);
        if (tc.is_header_row) {
            flaggedHeaderRows.add(tc.row_index);
        }
    }

    const orderedRows = [...grid.keys()].sort((a, b) => a - b);
    if (orderedRows.length === 0) {
        return [];
    }

    const formatRow = (rowIndex) => {
        const rowCells = grid.get(rowIndex) ?? new Map();
        const cells = [];
        for (let col = 0; col < totalCols; col += 1) {
            cells.push((rowCells.get(col) ?? []).join('<br>'));
        }
        return `| ${cells.join(' | ')} |`;
    };

    // GFM needs exactly one header row: the first row flagged as a header, else the
    // first row of the table.
    const headerRowIndex = orderedRows.find((ri) => flaggedHeaderRows.has(ri)) ?? orderedRows[0];
    const delimiter = `| ${Array.from({ length: totalCols }, () => '---').join(' | ')} |`;
    const bodyLines = orderedRows.filter((ri) => ri !== headerRowIndex).map(formatRow);
    return [formatRow(headerRowIndex), delimiter, ...bodyLines];
}
|
|
226
|
+
/**
 * Serialize a structured document view to GitHub-Flavored Markdown.
 *
 * Nodes are rendered in order as one of: a table (a run of consecutive nodes sharing a
 * `table_context.table_id`), a structural heading, a list item, or a paragraph. Blocks
 * are separated by a blank line, runs of blank lines are collapsed, and the result always
 * ends with exactly one newline.
 *
 * @param nodes Block nodes from `buildDocumentView({ showFormatting: true }).nodes`.
 * @param footnotes Footnotes from `DocxDocument.getFootnotes()` (already sorted by
 *   `displayNumber`); those with a positive `displayNumber` are appended as `[^n]: …`
 *   definitions.
 * @param _opts Currently unused.
 * @returns {string} The Markdown document.
 */
export function serializeToMarkdown(nodes, footnotes = [], _opts = {}) {
    const blocks = [];
    let inList = false;
    // List items are emitted on consecutive lines; the blank line goes after the last one.
    const closeList = () => {
        if (inList) {
            blocks.push('');
            inList = false;
        }
    };
    for (let i = 0; i < nodes.length; i++) {
        const node = nodes[i];
        // ── Tables: consume the whole run of same-table_id nodes at once ──
        if (node.table_context) {
            closeList();
            const tableId = node.table_context.table_id;
            // Start past the current node so the scan always advances, even if the id
            // does not compare equal to itself (NaN).
            let end = i + 1;
            while (end < nodes.length && nodes[end].table_context?.table_id === tableId) {
                end++;
            }
            const tableLines = renderTable(nodes.slice(i, end));
            if (tableLines.length > 0) {
                blocks.push(tableLines.join('\n'), '');
            }
            i = end - 1; // for-loop will re-increment
            continue;
        }
        // ── Structural (Word-styled) headings ──
        if (isStructuralHeading(node)) {
            closeList();
            const level = Math.min(6, Math.max(1, node.heading.level));
            // An ATX heading is a single line: fold any line break into a space so the
            // tail of the title does not spill into a separate paragraph.
            const title = inlineTagsToMarkdown(node.tagged_text).replace(/\s*\n+\s*/g, ' ').trim();
            blocks.push(`${'#'.repeat(level)} ${title}`.trimEnd(), '');
            continue;
        }
        // ── List items ──
        if (node.list_metadata?.list_level >= 0) {
            inList = true;
            blocks.push(renderListItem(node));
            continue;
        }
        // ── Normal paragraphs (non-structural headings land here and keep whatever
        //    inline formatting their tags carry rather than gaining a `#`). ──
        closeList();
        const text = escapeLeadingBlockSyntax(inlineTagsToMarkdown(node.tagged_text));
        if (text.trim() === '') {
            blocks.push('');
        }
        else {
            blocks.push(text, '');
        }
    }
    closeList();
    // ── Footnote definitions ──
    const defs = footnotes.filter((fn) => fn.displayNumber > 0);
    if (defs.length > 0) {
        blocks.push('');
        for (const fn of defs) {
            // A definition must stay on one line, so collapse all whitespace runs.
            const body = escapeInlineText(fn.text.replace(/\s+/g, ' ').trim());
            blocks.push(`[^${fn.displayNumber}]: ${body}`);
        }
    }
    return `${blocks.join('\n').replace(/\n{3,}/g, '\n\n').trim()}\n`;
}
|
|
300
|
+
//# sourceMappingURL=serialize_markdown.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"serialize_markdown.js","sourceRoot":"","sources":["../../src/primitives/serialize_markdown.ts"],"names":[],"mappings":"AAAA,8BAA8B;AAC9B,EAAE;AACF,wFAAwF;AACxF,uFAAuF;AACvF,4FAA4F;AAC5F,6FAA6F;AAC7F,kDAAkD;AAClD,EAAE;AACF,4FAA4F;AAC5F,uFAAuF;AACvF,wDAAwD;AACxD,EAAE;AACF,uFAAuF;AACvF,sFAAsF;AACtF,6EAA6E;AAE7E,OAAO,EAAE,kBAAkB,EAAE,MAAM,oBAAoB,CAAC;AAExD,OAAO,EAAE,SAAS,EAAE,MAAM,kBAAkB,CAAC;AAG7C,kFAAkF;AAClF,MAAM,kBAAkB,GAAG,YAAY,CAAC;AAExC;;;;;;;;;;GAUG;AACH,SAAS,gBAAgB,CAAC,IAAY;IACpC,MAAM,UAAU,GAAG,CAAC,CAAS,EAAU,EAAE,CAAC,CAAC,CAAC,OAAO,CAAC,eAAe,EAAE,CAAC,CAAC,EAAE,EAAE,CAAC,KAAK,CAAC,EAAE,CAAC,CAAC;IAEtF,IAAI,GAAG,GAAG,EAAE,CAAC;IACb,IAAI,SAAS,GAAG,CAAC,CAAC;IAClB,KAAK,MAAM,KAAK,IAAI,IAAI,CAAC,QAAQ,CAAC,kBAAkB,CAAC,EAAE,CAAC;QACtD,MAAM,GAAG,GAAG,KAAK,CAAC,KAAK,IAAI,CAAC,CAAC;QAC7B,GAAG,IAAI,UAAU,CAAC,IAAI,CAAC,KAAK,CAAC,SAAS,EAAE,GAAG,CAAC,CAAC,CAAC;QAC9C,GAAG,IAAI,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,sCAAsC;QACvD,SAAS,GAAG,GAAG,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC;IACpC,CAAC;IACD,GAAG,IAAI,UAAU,CAAC,IAAI,CAAC,KAAK,CAAC,SAAS,CAAC,CAAC,CAAC;IACzC,OAAO,GAAG,CAAC;AACb,CAAC;AAED;;;;;GAKG;AACH,SAAS,wBAAwB,CAAC,IAAY;IAC5C,OAAO,IAAI,CAAC,OAAO,CAAC,oDAAoD,EAAE,CAAC,EAAE,EAAE,EAAU,EAAE,IAAY,EAAE,EAAE;QACzG,IAAI,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC;YACrB,+EAA+E;YAC/E,OAAO,GAAG,EAAE,GAAG,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,KAAK,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;QACxD,CAAC;QACD,OAAO,GAAG,EAAE,KAAK,IAAI,CAAC,CAAC,CAAC,GAAG,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,CAAC;IAC7C,CAAC,CAAC,CAAC;AACL,CAAC;AAoBD,MAAM,UAAU,oBAAoB,CAAC,IAAY;IAC/C,MAAM,GAAG,GAAe,EAAE,CAAC;IAC3B,MAAM,QAAQ,GAAa,EAAE,CAAC,CAAC,0DAA0D;IAEzF,KAAK,MAAM,KAAK,IAAI,kBAAkB,CAAC,IAAI,CAAC,EAAE,CAAC;QAC7C,IAAI,KAAK,CAAC,IAAI,KAAK,MAAM,EAAE,CAAC;YAC1B,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC,EAAE,IAAI,EAAE,CAAC,EAAE,gBAAgB,CAAC,KAAK,CAAC,KAAK,CAAC,EAAE,CAAC,CAAC;YACxD,SAAS;QACX,CAAC;QACD,MAAM,GAAG,GAAG,KAAK,CAAC,KAAK,CAAC;QACxB,IAAI,GAAG,KAAK,KAAK;YAAE,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC,EAAE,MAAM
,EAAE,IAAI,EAAE,GAAG,EAAE,GAAG,EAAE,CAAC,EAAE,CAAC,CAAC;aACzD,IAAI,GAAG,KAAK,MAAM;YAAE,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC,EAAE,MAAM,EAAE,IAAI,EAAE,GAAG,EAAE,GAAG,EAAE,CAAC,CAAC,EAAE,CAAC,CAAC;aAChE,IAAI,GAAG,KAAK,KAAK;YAAE,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC,EAAE,MAAM,EAAE,IAAI,EAAE,GAAG,EAAE,GAAG,EAAE,CAAC,EAAE,CAAC,CAAC;aAC9D,IAAI,GAAG,KAAK,MAAM;YAAE,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC,EAAE,MAAM,EAAE,IAAI,EAAE,GAAG,EAAE,GAAG,EAAE,CAAC,CAAC,EAAE,CAAC,CAAC;aAChE,IAAI,GAAG,KAAK,KAAK,IAAI,GAAG,KAAK,MAAM;YAAE,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC,EAAE,IAAI,EAAE,CAAC,EAAE,GAAG,EAAE,CAAC,CAAC,CAAC,uBAAuB;aAC3F,IAAI,GAAG,CAAC,UAAU,CAAC,KAAK,CAAC,EAAE,CAAC;YAC/B,QAAQ,CAAC,IAAI,CAAC,gBAAgB,CAAC,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC;YACrD,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC,EAAE,IAAI,EAAE,CAAC,EAAE,GAAG,EAAE,CAAC,CAAC;QAChC,CAAC;aAAM,IAAI,GAAG,KAAK,MAAM,EAAE,CAAC;YAC1B,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC,EAAE,IAAI,EAAE,CAAC,EAAE,KAAK,QAAQ,CAAC,GAAG,EAAE,IAAI,EAAE,GAAG,EAAE,CAAC,CAAC;QACzD,CAAC;QACD,uFAAuF;IACzF,CAAC;IAED,sFAAsF;IACtF,OAAO,QAAQ,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QAC3B,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC,EAAE,IAAI,EAAE,CAAC,EAAE,KAAK,QAAQ,CAAC,GAAG,EAAE,GAAG,EAAE,CAAC,CAAC;IACnD,CAAC;IAED,mFAAmF;IACnF,mFAAmF;IACnF,qFAAqF;IACrF,qFAAqF;IACrF,uFAAuF;IACvF,yFAAyF;IACzF,6EAA6E;IAC7E,MAAM,KAAK,GAA8B,EAAE,CAAC,EAAE,IAAI,EAAE,CAAC,EAAE,GAAG,EAAE,CAAC;IAC7D,IAAI,GAAG,GAAG,EAAE,CAAC;IACb,MAAM,WAAW,GAAkB,EAAE,CAAC,CAAC,+CAA+C;IACtF,MAAM,OAAO,GAAG,IAAI,GAAG,EAAa,CAAC,CAAC,2CAA2C;IAEjF,MAAM,SAAS,GAAG,GAAS,EAAE;QAC3B,mFAAmF;QACnF,wFAAwF;QACxF,MAAM,MAAM,GAAkB,EAAE,CAAC;QACjC,OAAO,WAAW,CAAC,MAAM,GAAG,CAAC,IAAI,CAAC,WAAW,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;YAC3E,MAAM,CAAC,GAAG,WAAW,CAAC,GAAG,EAAG,CAAC;YAC7B,GAAG,IAAI,KAAK,CAAC,CAAC,CAAC,CAAC;YAChB,IAAI,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC;gBAAE,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QACrC,CAAC;QACD,qFAAqF;QACrF,uFAAuF;QACvF,MAAM,MAAM,GAAG,IAAI,GAAG,CAAC,WAAW,CAAC,CAAC;QACpC,MAAM,MAAM,GAAG,CAAC,GAAG,MAAM,CAAC,OAAO,E
AAE,EAAE,GAAI,CAAC,GAAG,EAAE,GAAG,CAAW,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;QAC7F,KAAK,MAAM,CAAC,IAAI,MAAM,EAAE,CAAC;YACvB,IAAI,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC;gBAAE,SAAS;YAC5B,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC;YACd,WAAW,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;YACpB,GAAG,IAAI,KAAK,CAAC,CAAC,CAAC,CAAC;QAClB,CAAC;IACH,CAAC,CAAC;IAEF,KAAK,MAAM,EAAE,IAAI,GAAG,EAAE,CAAC;QACrB,IAAI,EAAE,CAAC,CAAC,KAAK,MAAM,EAAE,CAAC;YACpB,IAAI,EAAE,CAAC,GAAG,KAAK,CAAC;gBAAE,OAAO,CAAC,GAAG,CAAC,EAAE,CAAC,IAAI,CAAC,CAAC;;gBAClC,OAAO,CAAC,MAAM,CAAC,EAAE,CAAC,IAAI,CAAC,CAAC;YAC7B,SAAS;QACX,CAAC;QACD,SAAS,EAAE,CAAC,CAAC,sEAAsE;QACnF,GAAG,IAAI,EAAE,CAAC,CAAC,CAAC;IACd,CAAC;IACD,OAAO,CAAC,KAAK,EAAE,CAAC;IAChB,SAAS,EAAE,CAAC,CAAC,iDAAiD;IAC9D,OAAO,GAAG,CAAC;AACb,CAAC;AAED,6FAA6F;AAC7F,SAAS,mBAAmB,CAAC,IAAsB;IACjD,OAAO,IAAI,CAAC,OAAO,EAAE,MAAM,KAAK,YAAY,IAAI,OAAO,IAAI,CAAC,OAAO,CAAC,KAAK,KAAK,QAAQ,CAAC;AACzF,CAAC;AAED,SAAS,cAAc,CAAC,IAAsB;IAC5C,MAAM,EAAE,GAAG,IAAI,CAAC,aAAa,CAAC;IAC9B,MAAM,KAAK,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,EAAE,CAAC,UAAU,CAAC,CAAC;IACzC,MAAM,MAAM,GAAG,IAAI,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;IAClC,MAAM,IAAI,GAAG,oBAAoB,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC,IAAI,EAAE,CAAC;IAC3D,MAAM,KAAK,GAAG,EAAE,CAAC,YAAY,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;IAE5C,uFAAuF;IACvF,sFAAsF;IACtF,0FAA0F;IAC1F,oBAAoB;IACpB,IAAI,EAAE,CAAC,UAAU,KAAK,SAAS,CAAC,MAAM,IAAI,EAAE,CAAC,gBAAgB,EAAE,CAAC;QAC9D,OAAO,GAAG,MAAM,MAAM,IAAI,EAAE,CAAC,OAAO,EAAE,CAAC;IACzC,CAAC;IACD,IAAI,KAAK,EAAE,CAAC;QACV,OAAO,GAAG,MAAM,KAAK,KAAK,IAAI,IAAI,EAAE,CAAC,OAAO,EAAE,CAAC;IACjD,CAAC;IACD,OAAO,GAAG,MAAM,KAAK,IAAI,EAAE,CAAC,OAAO,EAAE,CAAC;AACxC,CAAC;AAED;;;;;;;;GAQG;AACH,SAAS,WAAW,CAAC,KAAyB;IAC5C,IAAI,SAAS,GAAG,CAAC,CAAC;IAClB,KAAK,MAAM,CAAC,IAAI,KAAK,EAAE,CAAC;QACtB,MAAM,EAAE,GAAG,CAAC,CAAC,aAAa,CAAC;QAC3B,IAAI,CAAC,EAAE;YAAE,SAAS;QAClB,SAAS,GAAG,IAAI,CAAC,GAAG,CAAC,SAAS,EAAE,EAAE,CAAC,UAAU,EAAE,EAAE,CAAC,SAAS,GAAG,CAAC,CAAC,CAAC;IACnE,CAAC;IACD,IAAI,SAAS,IAAI,CAAC;QAAE,OAAO,EAAE,CAAC;IAE9B,MAAM,IAAI,GAAG,I
AAI,GAAG,EAAiC,CAAC;IACtD,MAAM,QAAQ,GAAa,EAAE,CAAC;IAC9B,MAAM,UAAU,GAAG,IAAI,GAAG,EAAU,CAAC;IAErC,KAAK,MAAM,CAAC,IAAI,KAAK,EAAE,CAAC;QACtB,MAAM,EAAE,GAAG,CAAC,CAAC,aAAa,CAAC;QAC3B,IAAI,CAAC,EAAE;YAAE,SAAS;QAClB,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,SAAS,CAAC,EAAE,CAAC;YAC5B,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,SAAS,EAAE,IAAI,GAAG,EAAE,CAAC,CAAC;YAClC,QAAQ,CAAC,IAAI,CAAC,EAAE,CAAC,SAAS,CAAC,CAAC;QAC9B,CAAC;QACD,MAAM,OAAO,GAAG,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,SAAS,CAAE,CAAC;QACxC,oFAAoF;QACpF,oEAAoE;QACpE,MAAM,QAAQ,GAAG,oBAAoB,CAAC,CAAC,CAAC,WAAW,CAAC,CAAC,OAAO,CAAC,YAAY,EAAE,MAAM,CAAC,CAAC,IAAI,EAAE,CAAC;QAC1F,MAAM,KAAK,GAAG,OAAO,CAAC,GAAG,CAAC,EAAE,CAAC,SAAS,CAAC,IAAI,EAAE,CAAC;QAC9C,IAAI,QAAQ;YAAE,KAAK,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC;QACnC,OAAO,CAAC,GAAG,CAAC,EAAE,CAAC,SAAS,EAAE,KAAK,CAAC,CAAC;QACjC,IAAI,EAAE,CAAC,aAAa;YAAE,UAAU,CAAC,GAAG,CAAC,EAAE,CAAC,SAAS,CAAC,CAAC;IACrD,CAAC;IAED,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;IAC/B,IAAI,QAAQ,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,EAAE,CAAC;IAErC,MAAM,QAAQ,GAAG,CAAC,QAAgB,EAAY,EAAE;QAC9C,MAAM,OAAO,GAAG,IAAI,CAAC,GAAG,CAAC,QAAQ,CAAC,IAAI,IAAI,GAAG,EAAoB,CAAC;QAClE,MAAM,KAAK,GAAa,EAAE,CAAC;QAC3B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,SAAS,EAAE,CAAC,EAAE,EAAE,CAAC;YACnC,KAAK,CAAC,IAAI,CAAC,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,CAAC;QAClD,CAAC;QACD,OAAO,KAAK,CAAC;IACf,CAAC,CAAC;IAEF,sFAAsF;IACtF,iEAAiE;IACjE,MAAM,cAAc,GAAG,QAAQ,CAAC,IAAI,CAAC,CAAC,EAAE,EAAE,EAAE,CAAC,UAAU,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC,IAAI,QAAQ,CAAC,CAAC,CAAE,CAAC;IAEjF,MAAM,KAAK,GAAa,EAAE,CAAC;IAC3B,KAAK,CAAC,IAAI,CAAC,KAAK,QAAQ,CAAC,cAAc,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;IAC1D,KAAK,CAAC,IAAI,CAAC,KAAK,KAAK,CAAC,IAAI,CAAC,EAAE,MAAM,EAAE,SAAS,EAAE,EAAE,GAAG,EAAE,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;IAChF,KAAK,MAAM,EAAE,IAAI,QAAQ,EAAE,CAAC;QAC1B,IAAI,EAAE,KAAK,cAAc;YAAE,SAAS;QACpC,KAAK,CAAC,IAAI,CAAC,KAAK,QAAQ,CAAC,EAAE,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;I
AChD,CAAC;IACD,OAAO,KAAK,CAAC;AACf,CAAC;AAOD;;;;;;GAMG;AACH,MAAM,UAAU,mBAAmB,CACjC,KAAyB,EACzB,YAAwB,EAAE,EAC1B,QAAkC,EAAE;IAEpC,MAAM,MAAM,GAAa,EAAE,CAAC;IAC5B,IAAI,MAAM,GAAG,KAAK,CAAC;IAEnB,MAAM,SAAS,GAAG,GAAS,EAAE;QAC3B,IAAI,MAAM,EAAE,CAAC;YACX,MAAM,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;YAChB,MAAM,GAAG,KAAK,CAAC;QACjB,CAAC;IACH,CAAC,CAAC;IAEF,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACtC,MAAM,IAAI,GAAG,KAAK,CAAC,CAAC,CAAE,CAAC;QAEvB,qEAAqE;QACrE,IAAI,IAAI,CAAC,aAAa,EAAE,CAAC;YACvB,SAAS,EAAE,CAAC;YACZ,MAAM,OAAO,GAAG,IAAI,CAAC,aAAa,CAAC,QAAQ,CAAC;YAC5C,MAAM,KAAK,GAAuB,EAAE,CAAC;YACrC,OAAO,CAAC,GAAG,KAAK,CAAC,MAAM,IAAI,KAAK,CAAC,CAAC,CAAE,CAAC,aAAa,EAAE,QAAQ,KAAK,OAAO,EAAE,CAAC;gBACzE,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAE,CAAC,CAAC;gBACtB,CAAC,EAAE,CAAC;YACN,CAAC;YACD,CAAC,EAAE,CAAC,CAAC,6BAA6B;YAClC,MAAM,UAAU,GAAG,WAAW,CAAC,KAAK,CAAC,CAAC;YACtC,IAAI,UAAU,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBAC1B,MAAM,CAAC,IAAI,CAAC,UAAU,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC;gBACnC,MAAM,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;YAClB,CAAC;YACD,SAAS;QACX,CAAC;QAED,0CAA0C;QAC1C,IAAI,mBAAmB,CAAC,IAAI,CAAC,EAAE,CAAC;YAC9B,SAAS,EAAE,CAAC;YACZ,MAAM,KAAK,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,IAAI,CAAC,OAAQ,CAAC,KAAe,CAAC,CAAC,CAAC;YACtE,MAAM,IAAI,GAAG,oBAAoB,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC,IAAI,EAAE,CAAC;YAC3D,MAAM,CAAC,IAAI,CAAC,GAAG,GAAG,CAAC,MAAM,CAAC,KAAK,CAAC,IAAI,IAAI,EAAE,CAAC,OAAO,EAAE,CAAC,CAAC;YACtD,MAAM,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;YAChB,SAAS;QACX,CAAC;QAED,mBAAmB;QACnB,IAAI,IAAI,CAAC,aAAa,CAAC,UAAU,IAAI,CAAC,EAAE,CAAC;YACvC,MAAM,GAAG,IAAI,CAAC;YACd,MAAM,CAAC,IAAI,CAAC,cAAc,CAAC,IAAI,CAAC,CAAC,CAAC;YAClC,SAAS;QACX,CAAC;QAED,sFAAsF;QACtF,yFAAyF;QACzF,SAAS,EAAE,CAAC;QACZ,MAAM,IAAI,GAAG,wBAAwB,CAAC,oBAAoB,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC,CAAC;QAC9E,IAAI,IAAI,CAAC,IAAI,EAAE,KAAK,EAAE,EAAE,CAAC;YACvB,MAAM,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;QAClB,CAAC;aAAM,CAAC;YACN,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YAClB,MAAM,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;QAClB,CAAC;IA
CH,CAAC;IAED,SAAS,EAAE,CAAC;IAEZ,6BAA6B;IAC7B,MAAM,IAAI,GAAG,SAAS,CAAC,MAAM,CAAC,CAAC,EAAE,EAAE,EAAE,CAAC,EAAE,CAAC,aAAa,GAAG,CAAC,CAAC,CAAC;IAC5D,IAAI,IAAI,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QACpB,MAAM,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;QAChB,KAAK,MAAM,EAAE,IAAI,IAAI,EAAE,CAAC;YACtB,MAAM,IAAI,GAAG,gBAAgB,CAAC,EAAE,CAAC,IAAI,CAAC,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC;YACnE,MAAM,CAAC,IAAI,CAAC,KAAK,EAAE,CAAC,aAAa,MAAM,IAAI,EAAE,CAAC,CAAC;QACjD,CAAC;IACH,CAAC;IAED,OAAO,GAAG,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,OAAO,CAAC,SAAS,EAAE,MAAM,CAAC,CAAC,IAAI,EAAE,IAAI,CAAC;AACpE,CAAC"}
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
import type { DocumentViewNode } from './document_view.js';
|
|
2
|
+
import type { Footnote } from './footnotes.js';
|
|
3
|
+
/**
 * Options bag accepted as the third argument of `serializeToPlainText`.
 * It currently declares no usable settings; see `_reserved` below.
 */
export interface SerializePlainTextOptions {
|
|
4
|
+
/**
 * Reserved for future knobs (footnote policy, table layout). Currently unused.
 * Declared as an optional `never`, so callers cannot supply a real value for it;
 * the member only keeps the options type from being an empty interface.
 */
|
|
5
|
+
readonly _reserved?: never;
|
|
6
|
+
}
|
|
7
|
+
/**
|
|
8
|
+
* Serialize a structured document view to plain text.
|
|
9
|
+
*
|
|
10
|
+
* @param nodes Block nodes from `buildDocumentView({ showFormatting: true }).nodes`.
|
|
11
|
+
* @param footnotes Footnotes from `DocxDocument.getFootnotes()` (already sorted by
|
|
12
|
+
* `displayNumber`); appended as `[^n] …` definitions.
* @param _opts Optional options bag. `SerializePlainTextOptions` declares only a
* reserved member, so there is nothing to configure yet.
* @returns The rendered plain text as a single string.
|
|
13
|
+
*/
|
|
14
|
+
export declare function serializeToPlainText(nodes: DocumentViewNode[], footnotes?: Footnote[], _opts?: SerializePlainTextOptions): string;
|
|
15
|
+
//# sourceMappingURL=serialize_plaintext.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"serialize_plaintext.d.ts","sourceRoot":"","sources":["../../src/primitives/serialize_plaintext.ts"],"names":[],"mappings":"AAoBA,OAAO,KAAK,EAAE,gBAAgB,EAAE,MAAM,oBAAoB,CAAC;AAE3D,OAAO,KAAK,EAAE,QAAQ,EAAE,MAAM,gBAAgB,CAAC;AA2E/C,MAAM,WAAW,yBAAyB;IACxC,mFAAmF;IACnF,QAAQ,CAAC,SAAS,CAAC,EAAE,KAAK,CAAC;CAC5B;AAED;;;;;;GAMG;AACH,wBAAgB,oBAAoB,CAClC,KAAK,EAAE,gBAAgB,EAAE,EACzB,SAAS,GAAE,QAAQ,EAAO,EAC1B,KAAK,GAAE,yBAA8B,GACpC,MAAM,CA4DR"}
|
|
@@ -0,0 +1,154 @@
|
|
|
1
|
+
// DOCX → plain text serializer.
|
|
2
|
+
//
|
|
3
|
+
// The thinnest member of the export family. Like `serialize_markdown.ts`, this is a
|
|
4
|
+
// *serializer over the existing structured document model* — it does no OOXML parsing.
|
|
5
|
+
// `DocxDocument.buildDocumentView({ showFormatting: true })` already yields a
|
|
6
|
+
// `DocumentViewNode[]` carrying headings, list metadata, grid-aware table context, injected
|
|
7
|
+
// `[^n]` footnote markers, and an HTML-shaped inline-tag string (`tagged_text`). This module
|
|
8
|
+
// turns that model into plain text with no markup.
|
|
9
|
+
//
|
|
10
|
+
// Where the Markdown emitter *maps* inline tags to Markdown syntax, the plain-text emitter
|
|
11
|
+
// *strips* them (via `stripAllInlineTags`) and keeps only sensible block separators:
|
|
12
|
+
// - a blank line between block-level paragraphs (including headings),
|
|
13
|
+
// - simple `- ` list bullets (preserving literal legal labels like `Section 2.1`),
|
|
14
|
+
// - tab-separated table cells, one row per line,
|
|
15
|
+
// - injected `[^n]` footnote markers kept inline, definitions appended at the end.
|
|
16
|
+
//
|
|
17
|
+
// Plain text is intentionally *lossy*: all formatting (bold/italic/underline, highlight,
|
|
18
|
+
// fonts, links, merged/nested table cells, layout) is discarded — that is the whole point of
|
|
19
|
+
// a "just give me the text" rendering.
|
|
20
|
+
import { stripAllInlineTags } from './semantic_tags.js';
|
|
21
|
+
/**
 * Reduce a single `tagged_text` value to bare text.
 *
 * This is a thin delegate: the actual tag removal is done by `stripAllInlineTags`
 * (imported from `./semantic_tags.js`), and its result is returned unchanged.
 *
 * @param text The tagged inline string of one document-view node.
 * @returns Whatever `stripAllInlineTags` produces for `text`.
 */
function toPlainInline(text) {
    const withoutTags = stripAllInlineTags(text);
    return withoutTags;
}
|
|
25
|
+
/**
 * Turn one list node into a single `- ` bullet line.
 *
 * The line is built as `<indent>- <label> <text>`:
 *  - `<indent>` repeats the indent unit once per list level (negative levels clamp to 0);
 *  - `<label>` is the trimmed `list_metadata.label_string` and is emitted only when it is
 *    non-empty, so literal labels such as `Section 2.1` or `(a)` survive in the output;
 *  - `<text>` is the node's `tagged_text` with inline tags stripped and then trimmed.
 * Trailing whitespace is removed, so an item without text ends right after the bullet
 * or the label.
 *
 * @param node A document-view node whose `list_metadata` describes a list item.
 * @returns The rendered bullet line (no trailing newline).
 */
function renderListItem(node) {
    const meta = node.list_metadata;
    const depth = Math.max(0, meta.list_level);
    const pad = ' '.repeat(depth);
    const body = toPlainInline(node.tagged_text).trim();
    const marker = meta.label_string?.trim() ?? '';
    // With a label the bullet reads "- <label> <text>", otherwise just "- <text>".
    const pieces = marker ? [marker, body] : [body];
    return `${pad}- ${pieces.join(' ')}`.trimEnd();
}
|
|
42
|
+
/**
 * Render a run of nodes sharing a `table_context.table_id` as tab-separated rows.
 *
 * Lossy by design (plain text has no table model):
 *  - Every emitted row has the same number of fields. The column count is the largest of
 *    the declared `total_cols` values and the highest `col_index + 1` seen; grid positions
 *    without a node (e.g. gaps left by horizontally merged cells) become empty fields, so
 *    a row `X<gap>Z` renders as `X\t\tZ`.
 *  - Several nodes mapping to the same cell (multi-paragraph cells) are joined with a space.
 *  - Tabs, carriage returns and newlines inside a cell, together with the whitespace around
 *    them, collapse to a single space. A raw newline would split the row and a raw tab
 *    would add a spurious field, either of which breaks the fixed column count.
 *
 * @param group Nodes belonging to one table; nodes without `table_context` are ignored.
 * @returns One string per table row, ordered by ascending `row_index`; an empty array
 *          when no column count can be derived from the group.
 */
function renderTable(group) {
    // Pass 1: derive the grid width.
    let totalCols = 0;
    for (const { table_context: tc } of group) {
        if (!tc) {
            continue;
        }
        totalCols = Math.max(totalCols, tc.total_cols, tc.col_index + 1);
    }
    if (totalCols <= 0) {
        return [];
    }
    // Pass 2: bucket cell text by row index, then by column index.
    const rows = new Map();
    for (const { table_context: tc, tagged_text: tagged } of group) {
        if (!tc) {
            continue;
        }
        let cellMap = rows.get(tc.row_index);
        if (!cellMap) {
            cellMap = new Map();
            rows.set(tc.row_index, cellMap);
        }
        // Fold field/record separators (and adjacent whitespace) into one space so the
        // cell can never introduce an extra column or an extra line.
        const cellText = toPlainInline(tagged).replace(/\s*[\t\n\r]+\s*/g, ' ').trim();
        const parts = cellMap.get(tc.col_index) ?? [];
        if (cellText) {
            parts.push(cellText);
        }
        cellMap.set(tc.col_index, parts);
    }
    // Rows may arrive out of order; emit them by ascending row index.
    const rowOrder = [...rows.keys()].sort((a, b) => a - b);
    const lines = [];
    for (const ri of rowOrder) {
        const cellMap = rows.get(ri);
        const cells = [];
        for (let c = 0; c < totalCols; c++) {
            cells.push((cellMap.get(c) ?? []).join(' '));
        }
        lines.push(cells.join('\t'));
    }
    return lines;
}
|
|
91
|
+
/**
 * Serialize a structured document view to plain text.
 *
 * Nodes are visited in order and rendered by kind:
 *  - a run of consecutive nodes with the same `table_context.table_id` becomes one block of
 *    tab-separated rows (via `renderTable`) followed by a blank line;
 *  - a node whose `list_metadata.list_level` is `>= 0` becomes one bullet line (via
 *    `renderListItem`) with no blank line added after it;
 *  - any other node (heading or paragraph) becomes its tag-stripped, trimmed text followed
 *    by a blank line; an empty paragraph contributes only the blank line.
 * Footnotes with a positive `displayNumber` are appended as `[^n] …` lines, with internal
 * whitespace collapsed to single spaces.
 *
 * @param nodes Block nodes from `buildDocumentView({ showFormatting: true }).nodes`.
 * @param footnotes Footnotes from `DocxDocument.getFootnotes()` (already sorted by
 *   `displayNumber`); appended as `[^n] …` definitions.
 * @param _opts Reserved options bag; not read by this function.
 * @returns The rendered text, ending in exactly one newline.
 */
export function serializeToPlainText(nodes, footnotes = [], _opts = {}) {
    const out = [];
    let idx = 0;
    while (idx < nodes.length) {
        const current = nodes[idx];
        // Tables: gather every consecutive node of the same table, then render them together.
        if (current.table_context) {
            const wantedId = current.table_context.table_id;
            const members = [];
            for (; idx < nodes.length && nodes[idx].table_context?.table_id === wantedId; idx += 1) {
                members.push(nodes[idx]);
            }
            const rowLines = renderTable(members);
            if (rowLines.length > 0) {
                out.push(rowLines.join('\n'), '');
            }
            continue;
        }
        idx += 1;
        // List items: one bullet line each, no blank line of their own.
        if (current.list_metadata.list_level >= 0) {
            out.push(renderListItem(current));
            continue;
        }
        // Headings and ordinary paragraphs render the same way: text, then a blank line.
        const paragraph = toPlainInline(current.tagged_text).trim();
        if (paragraph !== '') {
            out.push(paragraph);
        }
        out.push('');
    }
    // Footnote definitions, separated from the body by a blank line.
    const numbered = footnotes.filter((note) => note.displayNumber > 0);
    if (numbered.length > 0) {
        out.push('');
        for (const note of numbered) {
            const body = note.text.replace(/\s+/g, ' ').trim();
            out.push(`[^${note.displayNumber}] ${body}`.trimEnd());
        }
    }
    // Strip only blank *lines* at the edges. A full trim() would also remove a leading or
    // trailing tab, which is a real (empty) field when the document starts or ends with a
    // table row whose boundary cell is empty.
    const joined = out.join('\n');
    const collapsed = joined.replace(/\n{3,}/g, '\n\n');
    const rendered = collapsed.replace(/^\n+/, '').replace(/\n+$/, '');
    return `${rendered}\n`;
}
|
|
154
|
+
//# sourceMappingURL=serialize_plaintext.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"serialize_plaintext.js","sourceRoot":"","sources":["../../src/primitives/serialize_plaintext.ts"],"names":[],"mappings":"AAAA,gCAAgC;AAChC,EAAE;AACF,oFAAoF;AACpF,uFAAuF;AACvF,8EAA8E;AAC9E,4FAA4F;AAC5F,6FAA6F;AAC7F,mDAAmD;AACnD,EAAE;AACF,2FAA2F;AAC3F,qFAAqF;AACrF,wEAAwE;AACxE,qFAAqF;AACrF,mDAAmD;AACnD,qFAAqF;AACrF,EAAE;AACF,yFAAyF;AACzF,6FAA6F;AAC7F,uCAAuC;AAGvC,OAAO,EAAE,kBAAkB,EAAE,MAAM,oBAAoB,CAAC;AAGxD,gGAAgG;AAChG,SAAS,aAAa,CAAC,IAAY;IACjC,OAAO,kBAAkB,CAAC,IAAI,CAAC,CAAC;AAClC,CAAC;AAED;;;;;GAKG;AACH,SAAS,cAAc,CAAC,IAAsB;IAC5C,MAAM,EAAE,GAAG,IAAI,CAAC,aAAa,CAAC;IAC9B,MAAM,KAAK,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,EAAE,CAAC,UAAU,CAAC,CAAC;IACzC,MAAM,MAAM,GAAG,IAAI,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;IAClC,MAAM,IAAI,GAAG,aAAa,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC,IAAI,EAAE,CAAC;IACpD,MAAM,KAAK,GAAG,EAAE,CAAC,YAAY,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;IAC5C,IAAI,KAAK,EAAE,CAAC;QACV,OAAO,GAAG,MAAM,KAAK,KAAK,IAAI,IAAI,EAAE,CAAC,OAAO,EAAE,CAAC;IACjD,CAAC;IACD,OAAO,GAAG,MAAM,KAAK,IAAI,EAAE,CAAC,OAAO,EAAE,CAAC;AACxC,CAAC;AAED;;;;;;;;;GASG;AACH,SAAS,WAAW,CAAC,KAAyB;IAC5C,IAAI,SAAS,GAAG,CAAC,CAAC;IAClB,KAAK,MAAM,CAAC,IAAI,KAAK,EAAE,CAAC;QACtB,MAAM,EAAE,GAAG,CAAC,CAAC,aAAa,CAAC;QAC3B,IAAI,CAAC,EAAE;YAAE,SAAS;QAClB,SAAS,GAAG,IAAI,CAAC,GAAG,CAAC,SAAS,EAAE,EAAE,CAAC,UAAU,EAAE,EAAE,CAAC,SAAS,GAAG,CAAC,CAAC,CAAC;IACnE,CAAC;IACD,IAAI,SAAS,IAAI,CAAC;QAAE,OAAO,EAAE,CAAC;IAE9B,MAAM,IAAI,GAAG,IAAI,GAAG,EAAiC,CAAC;IACtD,MAAM,QAAQ,GAAa,EAAE,CAAC;IAE9B,KAAK,MAAM,CAAC,IAAI,KAAK,EAAE,CAAC;QACtB,MAAM,EAAE,GAAG,CAAC,CAAC,aAAa,CAAC;QAC3B,IAAI,CAAC,EAAE;YAAE,SAAS;QAClB,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,SAAS,CAAC,EAAE,CAAC;YAC5B,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,SAAS,EAAE,IAAI,GAAG,EAAE,CAAC,CAAC;YAClC,QAAQ,CAAC,IAAI,CAAC,EAAE,CAAC,SAAS,CAAC,CAAC;QAC9B,CAAC;QACD,MAAM,OAAO,GAAG,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,SAAS,CAAE,CAAC;QACxC,MAAM,QAAQ,GAAG,aAAa,CAAC,CAAC,CAAC,WAAW,CAAC,CAAC,OAAO,CAAC,YAAY,EAAE,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC;QAChF,MAAM,KAAK,GAAG,OAAO,CAAC,GAAG,CAAC,EAAE,CAAC,SAAS,CAAC,IAAI,EAA
E,CAAC;QAC9C,IAAI,QAAQ;YAAE,KAAK,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC;QACnC,OAAO,CAAC,GAAG,CAAC,EAAE,CAAC,SAAS,EAAE,KAAK,CAAC,CAAC;IACnC,CAAC;IAED,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;IAE/B,MAAM,KAAK,GAAa,EAAE,CAAC;IAC3B,KAAK,MAAM,EAAE,IAAI,QAAQ,EAAE,CAAC;QAC1B,MAAM,OAAO,GAAG,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,IAAI,IAAI,GAAG,EAAoB,CAAC;QAC5D,MAAM,KAAK,GAAa,EAAE,CAAC;QAC3B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,SAAS,EAAE,CAAC,EAAE,EAAE,CAAC;YACnC,KAAK,CAAC,IAAI,CAAC,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC;QAC/C,CAAC;QACD,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC;IAC/B,CAAC;IACD,OAAO,KAAK,CAAC;AACf,CAAC;AAOD;;;;;;GAMG;AACH,MAAM,UAAU,oBAAoB,CAClC,KAAyB,EACzB,YAAwB,EAAE,EAC1B,QAAmC,EAAE;IAErC,MAAM,MAAM,GAAa,EAAE,CAAC;IAE5B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACtC,MAAM,IAAI,GAAG,KAAK,CAAC,CAAC,CAAE,CAAC;QAEvB,qEAAqE;QACrE,IAAI,IAAI,CAAC,aAAa,EAAE,CAAC;YACvB,MAAM,OAAO,GAAG,IAAI,CAAC,aAAa,CAAC,QAAQ,CAAC;YAC5C,MAAM,KAAK,GAAuB,EAAE,CAAC;YACrC,OAAO,CAAC,GAAG,KAAK,CAAC,MAAM,IAAI,KAAK,CAAC,CAAC,CAAE,CAAC,aAAa,EAAE,QAAQ,KAAK,OAAO,EAAE,CAAC;gBACzE,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAE,CAAC,CAAC;gBACtB,CAAC,EAAE,CAAC;YACN,CAAC;YACD,CAAC,EAAE,CAAC,CAAC,6BAA6B;YAClC,MAAM,UAAU,GAAG,WAAW,CAAC,KAAK,CAAC,CAAC;YACtC,IAAI,UAAU,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBAC1B,MAAM,CAAC,IAAI,CAAC,UAAU,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC;gBACnC,MAAM,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;YAClB,CAAC;YACD,SAAS;QACX,CAAC;QAED,kEAAkE;QAClE,IAAI,IAAI,CAAC,aAAa,CAAC,UAAU,IAAI,CAAC,EAAE,CAAC;YACvC,MAAM,CAAC,IAAI,CAAC,cAAc,CAAC,IAAI,CAAC,CAAC,CAAC;YAClC,SAAS;QACX,CAAC;QAED,oFAAoF;QACpF,+EAA+E;QAC/E,MAAM,IAAI,GAAG,aAAa,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC,IAAI,EAAE,CAAC;QACpD,IAAI,IAAI,KAAK,EAAE,EAAE,CAAC;YAChB,MAAM,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;QAClB,CAAC;aAAM,CAAC;YACN,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YAClB,MAAM,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;QAClB,CAAC;IACH,CAAC;IAED,6BAA6B;IAC7B,MAAM,
IAAI,GAAG,SAAS,CAAC,MAAM,CAAC,CAAC,EAAE,EAAE,EAAE,CAAC,EAAE,CAAC,aAAa,GAAG,CAAC,CAAC,CAAC;IAC5D,IAAI,IAAI,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QACpB,MAAM,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;QAChB,KAAK,MAAM,EAAE,IAAI,IAAI,EAAE,CAAC;YACtB,MAAM,IAAI,GAAG,EAAE,CAAC,IAAI,CAAC,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC;YACjD,MAAM,CAAC,IAAI,CAAC,KAAK,EAAE,CAAC,aAAa,KAAK,IAAI,EAAE,CAAC,OAAO,EAAE,CAAC,CAAC;QAC1D,CAAC;IACH,CAAC;IAED,2FAA2F;IAC3F,0FAA0F;IAC1F,4FAA4F;IAC5F,oDAAoD;IACpD,MAAM,QAAQ,GAAG,MAAM;SACpB,IAAI,CAAC,IAAI,CAAC;SACV,OAAO,CAAC,SAAS,EAAE,MAAM,CAAC;SAC1B,OAAO,CAAC,MAAM,EAAE,EAAE,CAAC;SACnB,OAAO,CAAC,MAAM,EAAE,EAAE,CAAC,CAAC;IACvB,OAAO,GAAG,QAAQ,IAAI,CAAC;AACzB,CAAC"}
|
|
@@ -1,9 +1,9 @@
|
|
|
1
1
|
import { OOXML, W } from './namespaces.js';
|
|
2
|
+
import { getAttributeSafe, getFirstChild } from './xml-helpers.js';
|
|
2
3
|
function getWAttr(el, localName) {
|
|
3
|
-
//
|
|
4
|
-
//
|
|
5
|
-
|
|
6
|
-
return el.getAttributeNS(OOXML.W_NS, localName) || el.getAttribute(`w:${localName}`) || el.getAttribute(localName) || null;
|
|
4
|
+
// Preserve legacy truthy fallback for empty strings from namespace-bound reads
|
|
5
|
+
// when attributes were written without a real namespace binding.
|
|
6
|
+
return getAttributeSafe(el, OOXML.W_NS, localName, 'w', { emptyIsMissing: true });
|
|
7
7
|
}
|
|
8
8
|
export function parseStylesXml(stylesDoc) {
|
|
9
9
|
const byId = new Map();
|
|
@@ -14,10 +14,10 @@ export function parseStylesXml(stylesDoc) {
|
|
|
14
14
|
const id = getWAttr(st, 'styleId');
|
|
15
15
|
if (!id)
|
|
16
16
|
continue;
|
|
17
|
-
const nameEl = st
|
|
18
|
-
const basedOnEl = st
|
|
19
|
-
const pPr = st
|
|
20
|
-
const rPr = st
|
|
17
|
+
const nameEl = getFirstChild(st, OOXML.W_NS, W.name);
|
|
18
|
+
const basedOnEl = getFirstChild(st, OOXML.W_NS, W.basedOn);
|
|
19
|
+
const pPr = getFirstChild(st, OOXML.W_NS, W.pPr);
|
|
20
|
+
const rPr = getFirstChild(st, OOXML.W_NS, W.rPr);
|
|
21
21
|
const name = nameEl ? (getWAttr(nameEl, 'val') ?? id) : id;
|
|
22
22
|
const basedOn = basedOnEl ? (getWAttr(basedOnEl, 'val') ?? null) : null;
|
|
23
23
|
byId.set(id, {
|
|
@@ -85,15 +85,15 @@ function firstNonNull(vals) {
|
|
|
85
85
|
return null;
|
|
86
86
|
}
|
|
87
87
|
export function extractParagraphFormatting(pPr, styles) {
|
|
88
|
-
const pStyleEl = pPr ? pPr
|
|
88
|
+
const pStyleEl = pPr ? getFirstChild(pPr, OOXML.W_NS, W.pStyle) : null;
|
|
89
89
|
const styleId = pStyleEl ? (getWAttr(pStyleEl, 'val') ?? null) : null;
|
|
90
90
|
const chain = resolveStyleChain(styles, styleId);
|
|
91
91
|
const styleName = (styleId && styles.byId.get(styleId)?.name) || styleId || '';
|
|
92
92
|
// Resolve alignment and indents: direct pPr overrides style chain.
|
|
93
|
-
const directJc = pPr ? pPr
|
|
94
|
-
const directInd = pPr ? pPr
|
|
95
|
-
const styleJc = firstNonNull(chain.map((s) => (s.pPr ? s.pPr
|
|
96
|
-
const styleInd = firstNonNull(chain.map((s) => (s.pPr ? s.pPr
|
|
93
|
+
const directJc = pPr ? getFirstChild(pPr, OOXML.W_NS, W.jc) : null;
|
|
94
|
+
const directInd = pPr ? getFirstChild(pPr, OOXML.W_NS, W.ind) : null;
|
|
95
|
+
const styleJc = firstNonNull(chain.map((s) => (s.pPr ? getFirstChild(s.pPr, OOXML.W_NS, W.jc) : null)));
|
|
96
|
+
const styleInd = firstNonNull(chain.map((s) => (s.pPr ? getFirstChild(s.pPr, OOXML.W_NS, W.ind) : null)));
|
|
97
97
|
const alignment = parseAlignment(directJc ?? styleJc);
|
|
98
98
|
const ind = parseIndentPt(directInd ?? styleInd);
|
|
99
99
|
return {
|
|
@@ -107,7 +107,7 @@ export function extractParagraphFormatting(pPr, styles) {
|
|
|
107
107
|
function parseBoolProp(parent, tagLocal) {
|
|
108
108
|
if (!parent)
|
|
109
109
|
return null;
|
|
110
|
-
const el = parent
|
|
110
|
+
const el = getFirstChild(parent, OOXML.W_NS, tagLocal);
|
|
111
111
|
if (!el)
|
|
112
112
|
return null;
|
|
113
113
|
// <w:b/> implies true. <w:b w:val="0"/> implies false.
|
|
@@ -119,7 +119,7 @@ function parseBoolProp(parent, tagLocal) {
|
|
|
119
119
|
function parseUnderline(parent) {
|
|
120
120
|
if (!parent)
|
|
121
121
|
return null;
|
|
122
|
-
const el = parent
|
|
122
|
+
const el = getFirstChild(parent, OOXML.W_NS, W.u);
|
|
123
123
|
if (!el)
|
|
124
124
|
return null;
|
|
125
125
|
const v = getWAttr(el, 'val');
|
|
@@ -130,7 +130,7 @@ function parseUnderline(parent) {
|
|
|
130
130
|
function parseFontName(parent) {
|
|
131
131
|
if (!parent)
|
|
132
132
|
return null;
|
|
133
|
-
const el = parent
|
|
133
|
+
const el = getFirstChild(parent, OOXML.W_NS, W.rFonts);
|
|
134
134
|
if (!el)
|
|
135
135
|
return null;
|
|
136
136
|
return getWAttr(el, 'ascii') ?? getWAttr(el, 'hAnsi') ?? getWAttr(el, 'cs') ?? getWAttr(el, 'val') ?? null;
|
|
@@ -138,7 +138,7 @@ function parseFontName(parent) {
|
|
|
138
138
|
function parseFontSizePt(parent) {
|
|
139
139
|
if (!parent)
|
|
140
140
|
return null;
|
|
141
|
-
const el = parent
|
|
141
|
+
const el = getFirstChild(parent, OOXML.W_NS, W.sz);
|
|
142
142
|
if (!el)
|
|
143
143
|
return null;
|
|
144
144
|
const valStr = getWAttr(el, 'val') || el.getAttribute('val');
|
|
@@ -153,7 +153,7 @@ function parseFontSizePt(parent) {
|
|
|
153
153
|
function parseColorHex(parent) {
|
|
154
154
|
if (!parent)
|
|
155
155
|
return null;
|
|
156
|
-
const el = parent
|
|
156
|
+
const el = getFirstChild(parent, OOXML.W_NS, W.color);
|
|
157
157
|
if (!el)
|
|
158
158
|
return null;
|
|
159
159
|
const v = getWAttr(el, 'val') || el.getAttribute('val');
|
|
@@ -164,7 +164,7 @@ function parseColorHex(parent) {
|
|
|
164
164
|
function parseHighlightVal(parent) {
|
|
165
165
|
if (!parent)
|
|
166
166
|
return null;
|
|
167
|
-
const el = parent
|
|
167
|
+
const el = getFirstChild(parent, OOXML.W_NS, W.highlight);
|
|
168
168
|
if (!el)
|
|
169
169
|
return null;
|
|
170
170
|
const v = getWAttr(el, 'val');
|
|
@@ -175,10 +175,10 @@ function parseHighlightVal(parent) {
|
|
|
175
175
|
export function extractEffectiveRunFormatting(params) {
|
|
176
176
|
const { run, paragraphPPr, paragraphStyleId, styles } = params;
|
|
177
177
|
const isRun = run.localName === W.r || run.localName === 'r';
|
|
178
|
-
const rPr = isRun ? run
|
|
179
|
-
const pRPr = paragraphPPr ? paragraphPPr
|
|
178
|
+
const rPr = isRun ? getFirstChild(run, OOXML.W_NS, W.rPr) : null;
|
|
179
|
+
const pRPr = paragraphPPr ? getFirstChild(paragraphPPr, OOXML.W_NS, W.rPr) : null;
|
|
180
180
|
// Resolve w:rStyle character style chain (e.g. "Strong" → bold via style definition).
|
|
181
|
-
const rStyleEl = rPr
|
|
181
|
+
const rStyleEl = rPr ? getFirstChild(rPr, OOXML.W_NS, W.rStyle) : null;
|
|
182
182
|
const rStyleId = rStyleEl ? (getWAttr(rStyleEl, 'val') ?? null) : null;
|
|
183
183
|
const rStyleChain = resolveStyleChain(styles, rStyleId);
|
|
184
184
|
const rStyleRPr = firstNonNull(rStyleChain.map((s) => s.rPr));
|