@usejunior/docx-core 0.0.1 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +86 -28
- package/dist/.tsbuildinfo +1 -0
- package/dist/atomizer.d.ts +218 -0
- package/dist/atomizer.d.ts.map +1 -0
- package/dist/atomizer.js +856 -0
- package/dist/atomizer.js.map +1 -0
- package/dist/baselines/atomizer/atomLcs.d.ts +96 -0
- package/dist/baselines/atomizer/atomLcs.d.ts.map +1 -0
- package/dist/baselines/atomizer/atomLcs.js +347 -0
- package/dist/baselines/atomizer/atomLcs.js.map +1 -0
- package/dist/baselines/atomizer/debug.d.ts +41 -0
- package/dist/baselines/atomizer/debug.d.ts.map +1 -0
- package/dist/baselines/atomizer/debug.js +85 -0
- package/dist/baselines/atomizer/debug.js.map +1 -0
- package/dist/baselines/atomizer/documentReconstructor.d.ts +64 -0
- package/dist/baselines/atomizer/documentReconstructor.d.ts.map +1 -0
- package/dist/baselines/atomizer/documentReconstructor.js +939 -0
- package/dist/baselines/atomizer/documentReconstructor.js.map +1 -0
- package/dist/baselines/atomizer/hierarchicalLcs.d.ts +111 -0
- package/dist/baselines/atomizer/hierarchicalLcs.d.ts.map +1 -0
- package/dist/baselines/atomizer/hierarchicalLcs.js +469 -0
- package/dist/baselines/atomizer/hierarchicalLcs.js.map +1 -0
- package/dist/baselines/atomizer/inPlaceModifier.d.ts +183 -0
- package/dist/baselines/atomizer/inPlaceModifier.d.ts.map +1 -0
- package/dist/baselines/atomizer/inPlaceModifier.js +1600 -0
- package/dist/baselines/atomizer/inPlaceModifier.js.map +1 -0
- package/dist/baselines/atomizer/numberingIntegration.d.ts +59 -0
- package/dist/baselines/atomizer/numberingIntegration.d.ts.map +1 -0
- package/dist/baselines/atomizer/numberingIntegration.js +209 -0
- package/dist/baselines/atomizer/numberingIntegration.js.map +1 -0
- package/dist/baselines/atomizer/pipeline.d.ts +65 -0
- package/dist/baselines/atomizer/pipeline.d.ts.map +1 -0
- package/dist/baselines/atomizer/pipeline.js +510 -0
- package/dist/baselines/atomizer/pipeline.js.map +1 -0
- package/dist/baselines/atomizer/premergeRuns.d.ts +26 -0
- package/dist/baselines/atomizer/premergeRuns.d.ts.map +1 -0
- package/dist/baselines/atomizer/premergeRuns.js +150 -0
- package/dist/baselines/atomizer/premergeRuns.js.map +1 -0
- package/dist/baselines/atomizer/trackChangesAcceptor.d.ts +63 -0
- package/dist/baselines/atomizer/trackChangesAcceptor.d.ts.map +1 -0
- package/dist/baselines/atomizer/trackChangesAcceptor.js +254 -0
- package/dist/baselines/atomizer/trackChangesAcceptor.js.map +1 -0
- package/dist/baselines/atomizer/trackChangesAcceptorAst.d.ts +64 -0
- package/dist/baselines/atomizer/trackChangesAcceptorAst.d.ts.map +1 -0
- package/dist/baselines/atomizer/trackChangesAcceptorAst.js +586 -0
- package/dist/baselines/atomizer/trackChangesAcceptorAst.js.map +1 -0
- package/dist/baselines/atomizer/xmlToWmlElement.d.ts +65 -0
- package/dist/baselines/atomizer/xmlToWmlElement.d.ts.map +1 -0
- package/dist/baselines/atomizer/xmlToWmlElement.js +95 -0
- package/dist/baselines/atomizer/xmlToWmlElement.js.map +1 -0
- package/dist/baselines/diffmatch/documentBuilder.d.ts +44 -0
- package/dist/baselines/diffmatch/documentBuilder.d.ts.map +1 -0
- package/dist/baselines/diffmatch/documentBuilder.js +227 -0
- package/dist/baselines/diffmatch/documentBuilder.js.map +1 -0
- package/dist/baselines/diffmatch/paragraphAlignment.d.ts +75 -0
- package/dist/baselines/diffmatch/paragraphAlignment.d.ts.map +1 -0
- package/dist/baselines/diffmatch/paragraphAlignment.js +206 -0
- package/dist/baselines/diffmatch/paragraphAlignment.js.map +1 -0
- package/dist/baselines/diffmatch/pipeline.d.ts +33 -0
- package/dist/baselines/diffmatch/pipeline.d.ts.map +1 -0
- package/dist/baselines/diffmatch/pipeline.js +84 -0
- package/dist/baselines/diffmatch/pipeline.js.map +1 -0
- package/dist/baselines/diffmatch/runDiff.d.ts +53 -0
- package/dist/baselines/diffmatch/runDiff.d.ts.map +1 -0
- package/dist/baselines/diffmatch/runDiff.js +253 -0
- package/dist/baselines/diffmatch/runDiff.js.map +1 -0
- package/dist/baselines/diffmatch/trackChangesRenderer.d.ts +64 -0
- package/dist/baselines/diffmatch/trackChangesRenderer.d.ts.map +1 -0
- package/dist/baselines/diffmatch/trackChangesRenderer.js +178 -0
- package/dist/baselines/diffmatch/trackChangesRenderer.js.map +1 -0
- package/dist/baselines/diffmatch/xmlParser.d.ts +45 -0
- package/dist/baselines/diffmatch/xmlParser.d.ts.map +1 -0
- package/dist/baselines/diffmatch/xmlParser.js +344 -0
- package/dist/baselines/diffmatch/xmlParser.js.map +1 -0
- package/dist/baselines/wmlcomparer/DocxodusWasm.d.ts +51 -0
- package/dist/baselines/wmlcomparer/DocxodusWasm.d.ts.map +1 -0
- package/dist/baselines/wmlcomparer/DocxodusWasm.js +83 -0
- package/dist/baselines/wmlcomparer/DocxodusWasm.js.map +1 -0
- package/dist/baselines/wmlcomparer/DotnetCli.d.ts +40 -0
- package/dist/baselines/wmlcomparer/DotnetCli.d.ts.map +1 -0
- package/dist/baselines/wmlcomparer/DotnetCli.js +135 -0
- package/dist/baselines/wmlcomparer/DotnetCli.js.map +1 -0
- package/dist/benchmark/metrics.d.ts +72 -0
- package/dist/benchmark/metrics.d.ts.map +1 -0
- package/dist/benchmark/metrics.js +45 -0
- package/dist/benchmark/metrics.js.map +1 -0
- package/dist/benchmark/reporter.d.ts +23 -0
- package/dist/benchmark/reporter.d.ts.map +1 -0
- package/dist/benchmark/reporter.js +147 -0
- package/dist/benchmark/reporter.js.map +1 -0
- package/dist/benchmark/runner.d.ts +30 -0
- package/dist/benchmark/runner.d.ts.map +1 -0
- package/dist/benchmark/runner.js +233 -0
- package/dist/benchmark/runner.js.map +1 -0
- package/dist/cli/compare-two.d.ts +28 -0
- package/dist/cli/compare-two.d.ts.map +1 -0
- package/dist/cli/compare-two.js +110 -0
- package/dist/cli/compare-two.js.map +1 -0
- package/dist/cli/index.d.ts +3 -0
- package/dist/cli/index.d.ts.map +1 -0
- package/dist/cli/index.js +21 -0
- package/dist/cli/index.js.map +1 -0
- package/dist/core-types.d.ts +296 -0
- package/dist/core-types.d.ts.map +1 -0
- package/dist/core-types.js +122 -0
- package/dist/core-types.js.map +1 -0
- package/dist/footnotes.d.ts +144 -0
- package/dist/footnotes.d.ts.map +1 -0
- package/dist/footnotes.js +291 -0
- package/dist/footnotes.js.map +1 -0
- package/dist/format-detection.d.ts +120 -0
- package/dist/format-detection.d.ts.map +1 -0
- package/dist/format-detection.js +338 -0
- package/dist/format-detection.js.map +1 -0
- package/dist/index.d.ts +177 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +55 -0
- package/dist/index.js.map +1 -0
- package/dist/integration/output-artifacts.d.ts +6 -0
- package/dist/integration/output-artifacts.d.ts.map +1 -0
- package/dist/integration/output-artifacts.js +30 -0
- package/dist/integration/output-artifacts.js.map +1 -0
- package/dist/move-detection.d.ts +211 -0
- package/dist/move-detection.d.ts.map +1 -0
- package/dist/move-detection.js +391 -0
- package/dist/move-detection.js.map +1 -0
- package/dist/numbering.d.ts +136 -0
- package/dist/numbering.d.ts.map +1 -0
- package/dist/numbering.js +446 -0
- package/dist/numbering.js.map +1 -0
- package/dist/primitives/accept_changes.d.ts +30 -0
- package/dist/primitives/accept_changes.d.ts.map +1 -0
- package/dist/primitives/accept_changes.js +241 -0
- package/dist/primitives/accept_changes.js.map +1 -0
- package/dist/primitives/bookmarks.d.ts +12 -0
- package/dist/primitives/bookmarks.d.ts.map +1 -0
- package/dist/primitives/bookmarks.js +248 -0
- package/dist/primitives/bookmarks.js.map +1 -0
- package/dist/primitives/comments.d.ts +88 -0
- package/dist/primitives/comments.d.ts.map +1 -0
- package/dist/primitives/comments.js +703 -0
- package/dist/primitives/comments.js.map +1 -0
- package/dist/primitives/document.d.ts +168 -0
- package/dist/primitives/document.d.ts.map +1 -0
- package/dist/primitives/document.js +532 -0
- package/dist/primitives/document.js.map +1 -0
- package/dist/primitives/document_view.d.ts +93 -0
- package/dist/primitives/document_view.d.ts.map +1 -0
- package/dist/primitives/document_view.js +722 -0
- package/dist/primitives/document_view.js.map +1 -0
- package/dist/primitives/dom-helpers.d.ts +94 -0
- package/dist/primitives/dom-helpers.d.ts.map +1 -0
- package/dist/primitives/dom-helpers.js +219 -0
- package/dist/primitives/dom-helpers.js.map +1 -0
- package/dist/primitives/errors.d.ts +7 -0
- package/dist/primitives/errors.d.ts.map +1 -0
- package/dist/primitives/errors.js +10 -0
- package/dist/primitives/errors.js.map +1 -0
- package/dist/primitives/extract_revisions.d.ts +50 -0
- package/dist/primitives/extract_revisions.d.ts.map +1 -0
- package/dist/primitives/extract_revisions.js +340 -0
- package/dist/primitives/extract_revisions.js.map +1 -0
- package/dist/primitives/footnotes.d.ts +37 -0
- package/dist/primitives/footnotes.d.ts.map +1 -0
- package/dist/primitives/footnotes.js +552 -0
- package/dist/primitives/footnotes.js.map +1 -0
- package/dist/primitives/formatting_tags.d.ts +30 -0
- package/dist/primitives/formatting_tags.d.ts.map +1 -0
- package/dist/primitives/formatting_tags.js +217 -0
- package/dist/primitives/formatting_tags.js.map +1 -0
- package/dist/primitives/index.d.ts +26 -0
- package/dist/primitives/index.d.ts.map +1 -0
- package/dist/primitives/index.js +26 -0
- package/dist/primitives/index.js.map +1 -0
- package/dist/primitives/layout.d.ts +53 -0
- package/dist/primitives/layout.d.ts.map +1 -0
- package/dist/primitives/layout.js +178 -0
- package/dist/primitives/layout.js.map +1 -0
- package/dist/primitives/list_labels.d.ts +19 -0
- package/dist/primitives/list_labels.d.ts.map +1 -0
- package/dist/primitives/list_labels.js +57 -0
- package/dist/primitives/list_labels.js.map +1 -0
- package/dist/primitives/matching.d.ts +17 -0
- package/dist/primitives/matching.d.ts.map +1 -0
- package/dist/primitives/matching.js +144 -0
- package/dist/primitives/matching.js.map +1 -0
- package/dist/primitives/merge_runs.d.ts +23 -0
- package/dist/primitives/merge_runs.d.ts.map +1 -0
- package/dist/primitives/merge_runs.js +195 -0
- package/dist/primitives/merge_runs.js.map +1 -0
- package/dist/primitives/namespaces.d.ts +90 -0
- package/dist/primitives/namespaces.d.ts.map +1 -0
- package/dist/primitives/namespaces.js +107 -0
- package/dist/primitives/namespaces.js.map +1 -0
- package/dist/primitives/numbering.d.ts +27 -0
- package/dist/primitives/numbering.d.ts.map +1 -0
- package/dist/primitives/numbering.js +182 -0
- package/dist/primitives/numbering.js.map +1 -0
- package/dist/primitives/prevent_double_elevation.d.ts +18 -0
- package/dist/primitives/prevent_double_elevation.d.ts.map +1 -0
- package/dist/primitives/prevent_double_elevation.js +190 -0
- package/dist/primitives/prevent_double_elevation.js.map +1 -0
- package/dist/primitives/reject_changes.d.ts +27 -0
- package/dist/primitives/reject_changes.d.ts.map +1 -0
- package/dist/primitives/reject_changes.js +371 -0
- package/dist/primitives/reject_changes.js.map +1 -0
- package/dist/primitives/relationships.d.ts +7 -0
- package/dist/primitives/relationships.d.ts.map +1 -0
- package/dist/primitives/relationships.js +24 -0
- package/dist/primitives/relationships.js.map +1 -0
- package/dist/primitives/semantic_tags.d.ts +32 -0
- package/dist/primitives/semantic_tags.d.ts.map +1 -0
- package/dist/primitives/semantic_tags.js +139 -0
- package/dist/primitives/semantic_tags.js.map +1 -0
- package/dist/primitives/simplify_redlines.d.ts +19 -0
- package/dist/primitives/simplify_redlines.d.ts.map +1 -0
- package/dist/primitives/simplify_redlines.js +94 -0
- package/dist/primitives/simplify_redlines.js.map +1 -0
- package/dist/primitives/styles.d.ts +36 -0
- package/dist/primitives/styles.d.ts.map +1 -0
- package/dist/primitives/styles.js +190 -0
- package/dist/primitives/styles.js.map +1 -0
- package/dist/primitives/text.d.ts +27 -0
- package/dist/primitives/text.d.ts.map +1 -0
- package/dist/primitives/text.js +416 -0
- package/dist/primitives/text.js.map +1 -0
- package/dist/primitives/validate_document.d.ts +24 -0
- package/dist/primitives/validate_document.d.ts.map +1 -0
- package/dist/primitives/validate_document.js +147 -0
- package/dist/primitives/validate_document.js.map +1 -0
- package/dist/primitives/xml.d.ts +5 -0
- package/dist/primitives/xml.d.ts.map +1 -0
- package/dist/primitives/xml.js +19 -0
- package/dist/primitives/xml.js.map +1 -0
- package/dist/primitives/zip.d.ts +25 -0
- package/dist/primitives/zip.d.ts.map +1 -0
- package/dist/primitives/zip.js +78 -0
- package/dist/primitives/zip.js.map +1 -0
- package/dist/shared/docx/DocxArchive.d.ts +94 -0
- package/dist/shared/docx/DocxArchive.d.ts.map +1 -0
- package/dist/shared/docx/DocxArchive.js +169 -0
- package/dist/shared/docx/DocxArchive.js.map +1 -0
- package/dist/shared/ooxml/namespaces.d.ts +149 -0
- package/dist/shared/ooxml/namespaces.d.ts.map +1 -0
- package/dist/shared/ooxml/namespaces.js +224 -0
- package/dist/shared/ooxml/namespaces.js.map +1 -0
- package/dist/shared/ooxml/types.d.ts +136 -0
- package/dist/shared/ooxml/types.d.ts.map +1 -0
- package/dist/shared/ooxml/types.js +7 -0
- package/dist/shared/ooxml/types.js.map +1 -0
- package/package.json +63 -6
|
@@ -0,0 +1,722 @@
|
|
|
1
|
+
import { OOXML, W } from './namespaces.js';
|
|
2
|
+
import { getParagraphText, getParagraphRuns } from './text.js';
|
|
3
|
+
import { extractListLabel, stripListLabel, LabelType } from './list_labels.js';
|
|
4
|
+
import { parseNumberingXml, computeListLabelForParagraph } from './numbering.js';
|
|
5
|
+
import { parseStylesXml, extractParagraphFormatting, extractEffectiveRunFormatting } from './styles.js';
|
|
6
|
+
import { emitDefinitionTagsFromString, detectDefinitionSpans, HIGHLIGHT_TAG } from './semantic_tags.js';
|
|
7
|
+
import { computeModalBaseline, emitFormattingTags, mergeAdjacentTags } from './formatting_tags.js';
|
|
8
|
+
import { isReservedFootnote } from './footnotes.js';
|
|
9
|
+
/** Longest trimmed paragraph text (in characters) that extractHeaderInfo evaluates as a header in its entirety. */
const SHORT_HEADER_MAX_LENGTH = 50;
/** Longest header text (in characters) that extractHeaderInfo accepts from a HEADER_PATTERN match. */
const MAX_HEADER_TEXT_LENGTH = 60;
/** Number of leading clean_text characters that discoverStyles keeps as a style's example_text. */
const STYLE_EXAMPLE_TEXT_PREVIEW_LENGTH = 50;
|
|
12
|
+
/**
 * Read an attribute from an element, trying three spellings in order:
 * the namespace-qualified lookup under OOXML.W_NS, the literal `w:`-prefixed
 * name, and finally the bare local name.
 *
 * @param {Element} el - Element to read from.
 * @param {string} localName - Attribute name without prefix (e.g. 'val').
 * @returns {string|null|undefined} The first lookup result that is not
 *   null/undefined; otherwise whatever the bare-name lookup returned.
 */
function getWAttr(el, localName) {
    const namespaced = el.getAttributeNS(OOXML.W_NS, localName);
    if (namespaced !== null && namespaced !== undefined) {
        return namespaced;
    }
    const prefixed = el.getAttribute(`w:${localName}`);
    if (prefixed !== null && prefixed !== undefined) {
        return prefixed;
    }
    return el.getAttribute(localName);
}
|
|
15
|
+
/**
 * Return the highlight colour declared on a run, if any.
 *
 * Looks up the first w:rPr element under the run, then the first w:highlight
 * element under that, and reads its `val` attribute through getWAttr.
 *
 * @param {Element} run - A w:r element.
 * @returns {string|null} The highlight value, or null when there is no
 *   w:rPr, no w:highlight, no value, or the value is 'none'.
 */
function runHighlightVal(run) {
    const runProps = run.getElementsByTagNameNS(OOXML.W_NS, W.rPr).item(0);
    const highlightEl = runProps
        ? runProps.getElementsByTagNameNS(OOXML.W_NS, W.highlight).item(0)
        : null;
    if (!highlightEl) {
        return null;
    }
    const colour = getWAttr(highlightEl, 'val');
    // 'none' is an explicit "no highlight" value, so it is reported as absent.
    return colour && colour !== 'none' ? colour : null;
}
|
|
27
|
+
/**
 * Render a paragraph's run text with highlighted stretches wrapped in
 * `<HIGHLIGHT_TAG>` … `</HIGHLIGHT_TAG>`.
 *
 * Consecutive highlighted runs share one tag pair; a tag is emitted only when
 * the highlighted/not-highlighted state flips between neighbouring runs, and
 * a still-open tag is closed after the last run.
 *
 * @param {Element} p - Paragraph element passed to getParagraphRuns.
 * @returns {string} Tagged text, or '' when the paragraph yields no runs.
 */
function emitHighlightTagsFromParagraph(p) {
    const textRuns = getParagraphRuns(p);
    if (textRuns.length === 0) {
        return '';
    }
    const pieces = [];
    let tagOpen = false;
    textRuns.forEach(({ r, text }) => {
        const highlighted = Boolean(runHighlightVal(r));
        if (highlighted !== tagOpen) {
            pieces.push(highlighted ? `<${HIGHLIGHT_TAG}>` : `</${HIGHLIGHT_TAG}>`);
            tagOpen = highlighted;
        }
        pieces.push(text);
    });
    if (tagOpen) {
        pieces.push(`</${HIGHLIGHT_TAG}>`);
    }
    return pieces.join('');
}
|
|
49
|
+
/**
 * Serialise a style fingerprint into a string suitable as a Map key.
 *
 * Fields are joined with '|' in the order list_level, left_indent_pt,
 * first_line_indent_pt, style_name, alignment. Both indents are rounded to
 * one decimal place, so fingerprints differing only beyond that share a key.
 *
 * @param {object} fp - Fingerprint with the five fields named above.
 * @returns {string} The lookup key.
 */
function fingerprintKey(fp) {
    const leftIndent = fp.left_indent_pt.toFixed(1);
    const firstLineIndent = fp.first_line_indent_pt.toFixed(1);
    return `${fp.list_level}|${leftIndent}|${firstLineIndent}|${fp.style_name}|${fp.alignment}`;
}
|
|
53
|
+
// Pattern-based header detection fallback (ported from Python ingestor._extract_header_info).
// Group 1: text starting with a capital letter and containing none of . ! ? :
// Group 2: an optional '.' or ':' terminator. The match must be followed by whitespace or end of input.
const HEADER_PATTERN = /^([A-Z][^.!?:]*(?:\s+[A-Z][^.!?:]*)*)([.:]?)(?:\s|$)/;
/**
 * Guess a run-in header from a paragraph's plain text.
 *
 * Text that is empty, shorter than two characters, or does not begin with an
 * ASCII capital yields no header. Trimmed text of at most
 * SHORT_HEADER_MAX_LENGTH characters is judged as a whole: a trailing '.' or
 * ':' is removed and reported, otherwise the text counts as a bare title only
 * if it has five words or fewer. Longer text is matched against
 * HEADER_PATTERN and accepted only when something follows the match and the
 * header is at most MAX_HEADER_TEXT_LENGTH characters.
 *
 * @param {string} cleanText - Paragraph text to inspect.
 * @returns {{header_text: (string|null), header_style: (string|null)}}
 *   header_style is 'title_with_period', 'title_with_colon', 'title_bare',
 *   or null when no header was found.
 */
function extractHeaderInfo(cleanText) {
    const noHeader = () => ({ header_text: null, header_style: null });
    const found = (header_text, header_style) => ({ header_text, header_style });
    const styleForTerminator = (ch) => {
        switch (ch) {
            case '.':
                return 'title_with_period';
            case ':':
                return 'title_with_colon';
            default:
                return null;
        }
    };
    const plausible = Boolean(cleanText) && cleanText.length >= 2 && /^[A-Z]/.test(cleanText);
    if (!plausible) {
        return noHeader();
    }
    const body = cleanText.trim();
    if (body.length <= SHORT_HEADER_MAX_LENGTH) {
        // Short paragraph: the whole text is the header candidate.
        const terminatedStyle = styleForTerminator(body[body.length - 1]);
        if (terminatedStyle) {
            return found(body.slice(0, -1), terminatedStyle);
        }
        return body.split(/\s+/).length <= 5 ? found(body, 'title_bare') : noHeader();
    }
    const match = HEADER_PATTERN.exec(body);
    if (!match) {
        return noHeader();
    }
    const [whole, rawHeader = '', terminator = ''] = match;
    const candidate = rawHeader.trim();
    const rest = body.slice(whole.length);
    // A header must be followed by body text and must not be overly long.
    if (rest === '' || candidate.length > MAX_HEADER_TEXT_LENGTH) {
        return noHeader();
    }
    return found(candidate, styleForTerminator(terminator) ?? 'title_bare');
}
|
|
85
|
+
/**
 * Detect a run-in header formed by the leading bold or underlined runs of a
 * paragraph.
 *
 * Starting at the first run, text from each run whose effective formatting
 * is bold or underlined is accumulated; the scan stops at the first run that
 * is neither. The accumulated text counts as a header only if, once trimmed,
 * it is non-empty and ends with '.', ':' or '-'.
 *
 * @param {object} params
 * @param {Element} params.paragraph - Paragraph element to inspect.
 * @param {*} params.paragraphPPr - Forwarded to extractEffectiveRunFormatting.
 * @param {*} params.paragraphStyleId - Forwarded to extractEffectiveRunFormatting.
 * @param {*} params.styles - Forwarded to extractEffectiveRunFormatting.
 * @returns {{raw_text: string, formatting: {bold: *, italic: *, underline: *}, headerCharCount: number}|null}
 *   raw_text is the trimmed header (terminator included); formatting comes
 *   from the first header run; headerCharCount is the untrimmed length of the
 *   accumulated text. Null when no header is detected.
 */
function detectRunInHeader(params) {
    const { paragraph, paragraphPPr, paragraphStyleId, styles } = params;
    // Use visible runs only (field code text stripped in getParagraphRuns()).
    const textRuns = getParagraphRuns(paragraph);
    if (textRuns.length === 0) {
        return null;
    }
    // One entry per run element, in first-seen order (Set keeps insertion order).
    const runElements = [...new Set(textRuns.map((tr) => tr.r))];
    let accumulated = '';
    let firstFormatting = null;
    for (const runEl of runElements) {
        const fmt = extractEffectiveRunFormatting({ run: runEl, paragraphPPr, paragraphStyleId, styles });
        if (!(fmt.bold || fmt.underline)) {
            break;
        }
        for (const textEl of Array.from(runEl.getElementsByTagNameNS(OOXML.W_NS, W.t))) {
            accumulated += textEl.textContent ?? '';
        }
        if (firstFormatting === null) {
            firstFormatting = { bold: fmt.bold, italic: fmt.italic, underline: fmt.underline };
        }
    }
    const headerText = accumulated.trim();
    if (headerText === '' || !'.:-'.includes(headerText[headerText.length - 1])) {
        return null;
    }
    if (firstFormatting === null) {
        return null;
    }
    // The character count equals the total length of all appended w:t strings.
    return { raw_text: headerText, formatting: firstFormatting, headerCharCount: accumulated.length };
}
|
|
128
|
+
/**
 * Choose a semantic identifier and display name for a group of nodes that
 * share a style fingerprint.
 *
 * For list paragraphs (fp.list_level >= 0) the name depends on the level and
 * on the first truthy list_metadata.label_type found among the nodes. For
 * non-list paragraphs it depends on the left indent and on keywords in the
 * lower-cased style name.
 *
 * @param {object} params
 * @param {object} params.fp - Fingerprint; reads list_level, style_name and
 *   left_indent_pt.
 * @param {Array<object>} params.nodes - Nodes in the group; each must carry
 *   a list_metadata object.
 * @returns {{base_id: string, display_name: string}}
 */
function inferSemanticName(params) {
    const { fp, nodes } = params;
    const named = (base_id, display_name) => ({ base_id, display_name });
    // First label_type present in the group, if any.
    const labelled = nodes.find((node) => node.list_metadata.label_type);
    const labelType = labelled ? labelled.list_metadata.label_type : null;
    // Return the result paired with the first candidate type equal to labelType.
    const pick = (candidates, fallback) => {
        const hit = candidates.find(([type]) => labelType === type);
        return hit ? hit[1] : fallback;
    };
    const level = fp.list_level;
    if (level >= 0) {
        if (level === 0) {
            return pick([
                [LabelType.ARTICLE, named('article', 'Article Heading')],
                [LabelType.SECTION, named('section', 'Section Heading')],
                [LabelType.ROMAN, named('roman_section', 'Roman Numeral Section')],
            ], named('top_level', 'Top-Level List Item'));
        }
        if (level === 1) {
            return pick([
                [LabelType.LETTER, named('subsection', 'Subsection (a)/(A)')],
                [LabelType.NUMBER, named('subsection_number', 'Numbered Subsection')],
                [LabelType.ROMAN, named('subsection_roman', 'Roman Subsection')],
            ], named('level_1', `Level ${level} List Item`));
        }
        return pick([
            [LabelType.ROMAN, named(`level_${level}_roman`, `Level ${level} Roman`)],
            [LabelType.LETTER, named(`level_${level}_letter`, `Level ${level} Letter`)],
        ], named(`level_${level}`, `Level ${level} List Item`));
    }
    // Non-list.
    const normalizedStyle = fp.style_name.toLowerCase().replace(/\s+/g, '_');
    const mentions = (...keywords) => keywords.some((keyword) => normalizedStyle.includes(keyword));
    // Any positive left indent takes precedence over style-name keywords.
    if (fp.left_indent_pt > 0) {
        return named('indent_block', 'Indented Block');
    }
    if (mentions('heading', 'title')) {
        return named('heading', 'Heading');
    }
    if (mentions('quote', 'block')) {
        return named('block_quote', 'Block Quote');
    }
    return named('body', 'Body Text');
}
|
|
174
|
+
/**
 * Group nodes by style fingerprint and assign each group a semantic style.
 *
 * Nodes are bucketed by fingerprintKey(node.style_fingerprint) in
 * first-appearance order. Each bucket is named through inferSemanticName;
 * when a base id has already been used, later buckets get `_1`, `_2`, …
 * appended. The example node of a style is the element in the middle of the
 * bucket (index floor(length / 2)), in input order.
 *
 * @param {Array<object>} nodes - Nodes carrying style_fingerprint, id and
 *   clean_text.
 * @returns {{styles: Map<string, object>, fingerprint_to_style: Map<string, string>}}
 *   styles maps style id to its info record; fingerprint_to_style maps a
 *   fingerprint key to the style id.
 */
export function discoverStyles(nodes) {
    const buckets = new Map();
    for (const node of nodes) {
        const key = fingerprintKey(node.style_fingerprint);
        if (!buckets.has(key)) {
            // The first node seen for a key supplies the bucket's fingerprint.
            buckets.set(key, { fp: node.style_fingerprint, nodes: [] });
        }
        buckets.get(key).nodes.push(node);
    }
    const timesReused = new Map();
    const styles = new Map();
    const fingerprintToStyle = new Map();
    buckets.forEach((bucket, key) => {
        const { base_id, display_name } = inferSemanticName({ fp: bucket.fp, nodes: bucket.nodes });
        // 0 for the first bucket with this base id, then 1, 2, … for repeats.
        const ordinal = timesReused.has(base_id) ? timesReused.get(base_id) + 1 : 0;
        timesReused.set(base_id, ordinal);
        const styleId = ordinal === 0 ? base_id : `${base_id}_${ordinal}`;
        const sample = bucket.nodes[Math.floor(bucket.nodes.length / 2)];
        styles.set(styleId, {
            style_id: styleId,
            display_name,
            fingerprint: bucket.fp,
            example_node_id: sample.id,
            example_text: sample.clean_text.slice(0, STYLE_EXAMPLE_TEXT_PREVIEW_LENGTH),
            count: bucket.nodes.length,
            dominant_alignment: bucket.fp.alignment,
        });
        fingerprintToStyle.set(key, styleId);
    });
    return { styles, fingerprint_to_style: fingerprintToStyle };
}
|
|
212
|
+
/**
 * Remove a leading header from paragraph text.
 * Mirrors Python TOONRenderer header stripping.
 *
 * The trimmed, lower-cased header is compared against the lower-cased start
 * of the text. On a match the header plus at most one immediately following
 * ':', '.', '-' or ';' is removed, along with leading whitespace of the
 * remainder. If that fails, an exact case-sensitive prefix match on the
 * untrimmed header is tried, which strips any run of those punctuation
 * characters after it.
 *
 * @param {object} params
 * @param {string|null|undefined} params.header - Header to strip.
 * @param {string} params.text - Text that may start with the header.
 * @returns {string} The text without the header, or unchanged if it does not
 *   start with it or the header is falsy.
 */
function headerStripFromText(params) {
    const { header, text } = params;
    if (!header) {
        return text;
    }
    const needle = header.trim().toLowerCase();
    const haystack = text.toLowerCase();
    // '' comes last so that a trailing punctuation mark is consumed when present.
    const terminator = [':', '.', '-', ';', ''].find((mark) => haystack.startsWith(`${needle}${mark}`));
    if (terminator !== undefined) {
        return text.slice(needle.length + terminator.length).trimStart();
    }
    if (!text.startsWith(header)) {
        return text;
    }
    return text.slice(header.length).replace(/^[.:\-;]+/, '').trimStart();
}
|
|
232
|
+
/**
 * Render nodes as pipe-separated rows under a `#SCHEMA` header line.
 *
 * Each row has the columns id, list_label, header, style and text. When a
 * node has a header, the header is stripped from the start of its
 * tagged_text via headerStripFromText. If nothing remains after stripping,
 * the header is moved into the text column and the header column is left
 * empty.
 *
 * @param {Iterable<object>} nodes - Nodes with id, list_label, header, style
 *   and tagged_text.
 * @returns {string} Schema line followed by one line per node, joined by '\n'.
 */
export function renderToon(nodes) {
    const rows = Array.from(nodes, (node) => {
        let headerCell = node.header;
        let textCell = node.header
            ? headerStripFromText({ header: node.header, text: node.tagged_text })
            : node.tagged_text;
        if (headerCell && !textCell) {
            // Header-only paragraph: show it as text rather than as a header.
            textCell = headerCell;
            headerCell = '';
        }
        return `${node.id} | ${node.list_label} | ${headerCell} | ${node.style} | ${textCell}`;
    });
    return ['#SCHEMA id | list_label | header | style | text', ...rows].join('\n');
}
|
|
247
|
+
/**
 * Build a document view from parsed document parts.
 *
 * NOTE(review): as written this is a placeholder. It parses the styles and
 * numbering parts, locates the w:body and walks its paragraphs, but discards
 * all of that and always returns an empty node list with empty style maps.
 * The parse calls are kept so that any error they raise still surfaces.
 *
 * @param {object} params
 * @param {Document} params.documentXml - Parsed main document part.
 * @param {*} params.stylesXml - Passed to parseStylesXml.
 * @param {*} params.numberingXml - Passed to parseNumberingXml.
 * @param {{include_semantic_tags?: boolean}} [params.opts] - Read but unused.
 * @returns {{nodes: Array, styles: {styles: Map, fingerprint_to_style: Map}}}
 */
export function buildDocumentView(params) {
    const { documentXml, stylesXml, numberingXml, opts } = params;
    const emptyView = () => ({
        nodes: [],
        styles: { styles: new Map(), fingerprint_to_style: new Map() },
    });
    // Evaluated for parity with the intended pipeline; results are not used yet.
    void (opts?.include_semantic_tags ?? true);
    void parseStylesXml(stylesXml);
    void parseNumberingXml(numberingXml);
    const body = documentXml.getElementsByTagNameNS(OOXML.W_NS, W.body).item(0);
    if (!body) {
        return emptyView();
    }
    Array.from(body.getElementsByTagNameNS(OOXML.W_NS, W.p)).forEach((paragraph) => {
        void paragraph.previousSibling;
    });
    return emptyView();
}
|
|
268
|
+
// ── Helpers for building AnnotatedRun arrays ─────────────────────────
|
|
269
|
+
/**
 * Look up the hyperlink target of a run.
 *
 * A run has a target only when its direct parent's local name is
 * W.hyperlink and that parent carries a relationship id (`r:id`, read
 * either namespace-qualified or by its prefixed name) present in relsMap.
 *
 * @param {Element} runEl - The w:r element.
 * @param {Map<string, string>|null|undefined} relsMap - Relationship id to
 *   target map.
 * @returns {string|null} The mapped target, or null if the map is missing or
 *   empty, the parent is not a hyperlink, or the id is absent or unmapped.
 */
function resolveRunHyperlinkUrl(runEl, relsMap) {
    const haveRels = Boolean(relsMap) && relsMap.size !== 0;
    const link = haveRels ? runEl.parentNode : null;
    if (!link || link.localName !== W.hyperlink) {
        return null;
    }
    // r:id attribute can be namespaced or prefixed.
    const relId = link.getAttributeNS(OOXML.R_NS, 'id') ?? link.getAttribute('r:id');
    return relId ? (relsMap.get(relId) ?? null) : null;
}
|
|
287
|
+
/**
 * Build AnnotatedRun[] for a single paragraph. All runs are included;
 * `isHeaderRun` is set to false initially (caller marks header runs separately).
 *
 * getParagraphRuns may return several consecutive TextRun entries for the
 * same w:r element; those are merged into a single annotated run. Merging
 * only happens when the immediately preceding entry came from the same
 * element, so text is never appended to an entry built from a different run
 * (which would give it that run's formatting and hyperlink).
 *
 * @param {object} params
 * @param {Element} params.p - Paragraph element.
 * @param {*} params.paragraphPPr - Forwarded to extractEffectiveRunFormatting.
 * @param {*} params.paragraphStyleId - Forwarded to extractEffectiveRunFormatting.
 * @param {*} params.stylesModel - Forwarded as `styles` to extractEffectiveRunFormatting.
 * @param {Map<string, string>|null|undefined} params.relsMap - Relationship
 *   map used to resolve hyperlink targets.
 * @returns {Array<{text: string, formatting: *, hyperlinkUrl: (string|null), charCount: number, isHeaderRun: boolean}>}
 */
function buildAnnotatedRuns(params) {
    const { p, paragraphPPr, paragraphStyleId, stylesModel, relsMap } = params;
    const annotated = [];
    // Run element that produced the last entry in `annotated`.
    let previousRunEl = null;
    for (const tr of getParagraphRuns(p)) {
        if (annotated.length > 0 && tr.r === previousRunEl) {
            // Same w:r as the entry just emitted: extend it instead of adding a new one.
            const current = annotated[annotated.length - 1];
            current.text += tr.text;
            current.charCount += tr.text.length;
            continue;
        }
        previousRunEl = tr.r;
        annotated.push({
            text: tr.text,
            formatting: extractEffectiveRunFormatting({
                run: tr.r,
                paragraphPPr,
                paragraphStyleId,
                styles: stylesModel,
            }),
            hyperlinkUrl: resolveRunHyperlinkUrl(tr.r, relsMap),
            charCount: tr.text.length,
            isHeaderRun: false,
        });
    }
    return annotated;
}
|
|
324
|
+
// ── Footnote marker helpers (view-only) ─────────────────────────────
|
|
325
|
+
/**
 * Build a map from footnote ID → display number by scanning documentXml
 * for w:footnoteReference elements in DOM order (skipping reserved IDs).
 *
 * Reserved IDs are collected from footnotesXml: every w:footnote for which
 * isReservedFootnote returns true. Display numbers start at 1 and are handed
 * out in order of first reference; repeated references to the same ID keep
 * the first number. References without an id attribute are ignored.
 *
 * @param {Document} documentXml - Parsed main document part.
 * @param {Document|null} footnotesXml - Parsed footnotes part, or null when
 *   there is none (then no ID is treated as reserved).
 * @returns {Map<number, number>} Footnote ID to display number.
 */
function buildFootnoteDisplayMap(documentXml, footnotesXml) {
    // Index-based walk over a NodeList-like object (length re-read each step).
    const eachItem = (list, visit) => {
        for (let idx = 0; idx < list.length; idx += 1) {
            visit(list.item(idx));
        }
    };
    const reserved = new Set();
    if (footnotesXml) {
        const roots = footnotesXml.getElementsByTagNameNS(OOXML.W_NS, W.footnotes);
        // Fall back to the document element when no w:footnotes element is found.
        const scope = roots.length > 0 ? roots.item(0) : footnotesXml.documentElement;
        eachItem(scope.getElementsByTagNameNS(OOXML.W_NS, W.footnote), (footnoteEl) => {
            if (!isReservedFootnote(footnoteEl)) {
                return;
            }
            const rawId = getWAttr(footnoteEl, 'id');
            if (rawId) {
                reserved.add(parseInt(rawId, 10));
            }
        });
    }
    const displayNumbers = new Map();
    eachItem(documentXml.getElementsByTagNameNS(OOXML.W_NS, W.footnoteReference), (refEl) => {
        const rawId = getWAttr(refEl, 'id');
        if (!rawId) {
            return;
        }
        const id = parseInt(rawId, 10);
        if (reserved.has(id) || displayNumbers.has(id)) {
            return;
        }
        // Next number is one more than the count assigned so far.
        displayNumbers.set(id, displayNumbers.size + 1);
    });
    return displayNumbers;
}
|
|
361
|
+
/**
 * Compute footnote marker insertion points for a paragraph.
 * Returns an array of { offset, marker } sorted by offset descending
 * for safe right-to-left insertion into the text string. Markers sharing an
 * offset are returned in reverse document order, so that inserting them one
 * after another at that offset leaves them in document order in the text.
 *
 * Self-contained: only inspects the paragraph DOM for w:footnoteReference
 * elements. Does NOT modify getParagraphRuns or getParagraphText.
 *
 * Offsets count visible characters: w:t text plus one per w:tab / w:br.
 * Content of a run that follows a field `begin` and precedes its `separate`
 * is skipped, footnote references included. Every reference in a run is
 * placed at the end of that run's visible text.
 *
 * @param {Element} p - Paragraph element.
 * @param {Map<number, number>} displayMap - Footnote ID to display number;
 *   references whose ID is not in the map produce no marker.
 * @returns {Array<{offset: number, marker: string}>}
 */
function getFootnoteMarkersForParagraph(p, displayMap) {
    if (displayMap.size === 0) {
        return [];
    }
    const markers = [];
    let visibleOffset = 0;
    // Track field state to skip field codes (same as getParagraphRuns)
    let fieldState = 0; // 0=outside, 1=in_code, 2=in_result
    for (const r of Array.from(p.getElementsByTagNameNS(OOXML.W_NS, W.r))) {
        let runVisibleLen = 0;
        // All footnote IDs referenced by this run, in document order.
        const footnoteIds = [];
        for (const child of Array.from(r.childNodes)) {
            if (child.nodeType !== 1 || child.namespaceURI !== OOXML.W_NS) {
                continue;
            }
            if (child.localName === W.fldChar) {
                const typ = getWAttr(child, 'fldCharType') ?? '';
                if (typ === 'begin') {
                    fieldState = 1;
                }
                else if (typ === 'separate') {
                    fieldState = 2;
                }
                else if (typ === 'end') {
                    fieldState = 0;
                }
                continue;
            }
            if (fieldState === 1) {
                continue; // skip field code
            }
            if (child.localName === W.t) {
                runVisibleLen += (child.textContent ?? '').length;
            }
            else if (child.localName === W.tab || child.localName === W.br) {
                runVisibleLen += 1;
            }
            else if (child.localName === W.footnoteReference) {
                const idStr = getWAttr(child, 'id');
                if (idStr) {
                    footnoteIds.push(parseInt(idStr, 10));
                }
            }
        }
        // The footnote reference position is at the end of this run's visible text
        const offset = visibleOffset + runVisibleLen;
        for (const footnoteId of footnoteIds) {
            // Rejects negative and NaN IDs.
            if (!(footnoteId >= 0)) {
                continue;
            }
            const displayNum = displayMap.get(footnoteId);
            if (displayNum != null) {
                markers.push({ offset, marker: `[^${displayNum}]` });
            }
        }
        visibleOffset += runVisibleLen;
    }
    // Markers were collected in document order, so offsets are non-decreasing.
    // Reversing yields descending offsets AND reverse document order for ties;
    // a stable descending sort would keep ties in document order, which
    // right-to-left insertion would then emit backwards ("[^2][^1]").
    markers.reverse();
    return markers;
}
|
|
433
|
+
/**
 * Inject footnote markers into a text string at the given offsets.
 *
 * Markers are applied from the highest offset to the lowest so that an
 * insertion never shifts the position of a marker that is still pending.
 * Callers are expected to pass markers sorted descending by offset; the
 * function nevertheless orders a private copy itself, so an unsorted list no
 * longer produces shifted insertions. The sort is stable: markers that share
 * an offset are applied in the order given, i.e. the one listed LAST ends up
 * leftmost in the output.
 *
 * @param {string} text - Text to receive the markers.
 * @param {Array<{offset: number, marker: string}>} markers - Insertion points
 *   (UTF-16 code-unit indices into `text`) and the strings to insert. Not mutated.
 * @returns {string} `text` with every marker inserted. Offsets beyond the end
 *   append; negative offsets prepend.
 */
function injectFootnoteMarkers(text, markers) {
    if (markers.length === 0)
        return text;
    // Copy before sorting so the caller's array is left untouched.
    const ordered = [...markers].sort((a, b) => b.offset - a.offset);
    let result = text;
    for (const { offset, marker } of ordered) {
        // Clamp to [0, length]. Without the lower bound a negative offset would
        // be read by slice() as "from the end" and land inside the text.
        const pos = Math.max(0, Math.min(offset, result.length));
        result = `${result.slice(0, pos)}${marker}${result.slice(pos)}`;
    }
    return result;
}
|
|
448
|
+
/**
 * Translate footnote marker offsets from paragraph-text coordinates into
 * indices of the (possibly label-stripped, trimmed and tagged) view string.
 *
 * Three adjustments are applied to every offset, in this order:
 *  1. CR/LF characters that precede it in `paragraphText` are subtracted,
 *     because the view text is built with those characters removed.
 *  2. The length of the leading portion of the paragraph that is absent from
 *     `plainText` (leading whitespace, a stripped manual list label) is
 *     subtracted. This is only done when `plainText` is verifiably a suffix of
 *     the paragraph text; otherwise no prefix correction is made.
 *  3. The remaining visible offset is mapped past any `<...>` tags by walking
 *     `taggedText` in lock-step with `plainText`. If the two cannot be aligned
 *     (e.g. the emitter escaped or rewrote characters) the visible offset is
 *     used as-is, which is the pre-existing behaviour.
 *
 * NOTE(review): this assumes marker offsets are expressed in the same
 * coordinates as the string returned by getParagraphText — confirm.
 *
 * @param {Array<{offset: number, marker: string}>} markers - Markers in the
 *   order they will be applied; order is preserved and the input is not mutated.
 * @param {string} paragraphText - Paragraph text before CR/LF removal and trim.
 * @param {string} plainText - Tag-free text that `taggedText` renders.
 * @param {string} taggedText - View string the markers will be inserted into.
 * @returns {Array<{offset: number, marker: string}>} Markers with `offset`
 *   rewritten as an index into `taggedText`.
 */
function alignFootnoteMarkersToViewText(markers, paragraphText, plainText, taggedText) {
    if (typeof paragraphText !== 'string' || typeof plainText !== 'string' || typeof taggedText !== 'string')
        return markers;
    // How many leading visible characters of the paragraph are missing from plainText.
    const anchor = paragraphText.replace(/[\r\n]/g, '').trimEnd();
    const body = plainText.trimEnd();
    const droppedPrefixLen = anchor.endsWith(body) ? anchor.length - body.length : 0;
    const lineBreaksBefore = (end) => {
        const limit = Math.min(end, paragraphText.length);
        let count = 0;
        for (let i = 0; i < limit; i++) {
            const ch = paragraphText[i];
            if (ch === '\r' || ch === '\n')
                count += 1;
        }
        return count;
    };
    // positions[v] = index in taggedText reached after consuming v visible
    // characters. null means "identical strings" or "could not align"; in both
    // cases the visible offset is used directly.
    const buildPositions = () => {
        if (plainText === taggedText)
            return null;
        const positions = [0];
        let cursor = 0;
        for (let j = 0; j < plainText.length; j++) {
            while (cursor < taggedText.length && taggedText[cursor] !== plainText[j]) {
                // Anything that is not the expected character must be a tag.
                if (taggedText[cursor] !== '<')
                    return null;
                const close = taggedText.indexOf('>', cursor + 1);
                if (close === -1)
                    return null;
                cursor = close + 1;
            }
            if (cursor >= taggedText.length)
                return null;
            cursor += 1;
            positions.push(cursor);
        }
        return positions;
    };
    const positions = buildPositions();
    return markers.map(({ offset, marker }) => {
        const visible = Math.max(0, offset - lineBreaksBefore(offset) - droppedPrefixLen);
        let index;
        if (visible >= plainText.length) {
            // At or past the end of the visible text: place after any trailing tags.
            index = taggedText.length;
        }
        else if (positions) {
            index = positions[visible];
        }
        else {
            index = visible;
        }
        return { offset: index, marker };
    });
}
/**
 * Build the per-paragraph node list (plus discovered styles) for the document view.
 *
 * For every paragraph with non-empty visible text this resolves the list label
 * (auto-numbered via numPr, otherwise detected from the text), a run-in or
 * fallback header, the tagged view text, a style fingerprint and the body run
 * formatting, then injects `[^N]` footnote markers into the view text.
 *
 * @param {object} params
 * @param {Array<{id: *, p: *}>} params.paragraphs - Paragraph elements with their ids.
 * @param {*} params.stylesXml - Passed to parseStylesXml.
 * @param {*} params.numberingXml - Passed to parseNumberingXml.
 * @param {*} params.relsMap - Passed to buildAnnotatedRuns (formatting mode only).
 * @param {boolean} [params.include_semantic_tags=true] - Emit definition/highlight tags.
 * @param {boolean} [params.show_formatting=false] - Emit inline formatting tags.
 * @param {*} [params.documentXml] - When truthy, footnote display numbers are
 *   built from it and markers are injected; otherwise no markers are produced.
 * @param {*} [params.footnotesXml] - Passed to buildFootnoteDisplayMap (null when absent).
 * @returns {{nodes: Array<object>, styles: *}} The nodes and the result of discoverStyles.
 */
export function buildNodesForDocumentView(params) {
    const { paragraphs, stylesXml, numberingXml, relsMap } = params;
    const includeSemantic = params.include_semantic_tags ?? true;
    const showFormatting = params.show_formatting ?? false;
    // Build footnote display number map if documentXml is provided
    const footnoteDisplayMap = params.documentXml
        ? buildFootnoteDisplayMap(params.documentXml, params.footnotesXml ?? null)
        : new Map();
    const stylesModel = parseStylesXml(stylesXml);
    const numberingModel = parseNumberingXml(numberingXml);
    const counters = new Map();
    // ── Pass 1 (formatting mode): pre-compute annotated runs per paragraph ──
    // We also collect all non-header, non-heading-style body runs for a
    // document-wide FormattingBaseline.
    const paraAnnotatedRuns = new Map();
    const allBodyRuns = [];
    if (showFormatting) {
        for (const { p } of paragraphs) {
            const paraPPr = p.getElementsByTagNameNS(OOXML.W_NS, W.pPr).item(0);
            const paraFmt = extractParagraphFormatting(paraPPr ?? null, stylesModel);
            const runs = buildAnnotatedRuns({
                p,
                paragraphPPr: paraPPr ?? null,
                paragraphStyleId: paraFmt.styleId,
                stylesModel,
                relsMap,
            });
            // Mark run-in header prefix runs so baseline suppression ignores them.
            try {
                const hdr = detectRunInHeader({
                    paragraph: p,
                    paragraphPPr: paraPPr ?? null,
                    paragraphStyleId: paraFmt.styleId,
                    styles: stylesModel,
                });
                if (hdr && hdr.headerCharCount > 0) {
                    let seen = 0;
                    for (const r of runs) {
                        if (seen >= hdr.headerCharCount)
                            break;
                        r.isHeaderRun = true;
                        seen += r.charCount;
                    }
                }
            }
            catch {
                // Ignore header-detection errors for baseline precomputation.
            }
            paraAnnotatedRuns.set(p, runs);
            // Skip heading-style paragraphs from baseline computation.
            const styleName = (paraFmt.styleName ?? '').toLowerCase();
            const isHeadingStyle = styleName.includes('heading') || styleName.includes('title');
            if (!isHeadingStyle) {
                for (const r of runs) {
                    if (r.charCount > 0)
                        allBodyRuns.push(r);
                }
            }
        }
    }
    const docBaseline = showFormatting
        ? computeModalBaseline(allBodyRuns)
        : { bold: false, italic: false, underline: false, suppressed: false };
    // ── Pass 2: main loop ──
    const nodes = [];
    for (const { id, p } of paragraphs) {
        const paraPPr = p.getElementsByTagNameNS(OOXML.W_NS, W.pPr).item(0);
        const paraFmt = extractParagraphFormatting(paraPPr ?? null, stylesModel);
        // Visible clean text (field codes stripped). The untrimmed string is
        // kept because footnote offsets are measured against it.
        const paragraphText = getParagraphText(p);
        const fullText = paragraphText.replace(/\r/g, '').replace(/\n/g, '').trim();
        if (!fullText)
            continue;
        // Numbering (auto-numbered) info from numPr.
        let numId = null;
        let ilvl = null;
        const numPr = paraPPr ? paraPPr.getElementsByTagNameNS(OOXML.W_NS, W.numPr).item(0) : null;
        if (numPr) {
            const numIdEl = numPr.getElementsByTagNameNS(OOXML.W_NS, W.numId).item(0);
            const ilvlEl = numPr.getElementsByTagNameNS(OOXML.W_NS, W.ilvl).item(0);
            const numIdVal = numIdEl ? getWAttr(numIdEl, 'val') : null;
            const ilvlVal = ilvlEl ? getWAttr(ilvlEl, 'val') : null;
            if (numIdVal)
                numId = numIdVal;
            if (ilvlVal != null) {
                const v = Number.parseInt(ilvlVal, 10);
                if (!Number.isNaN(v))
                    ilvl = v;
            }
        }
        let labelString = '';
        let labelType = null;
        let cleanTextNoLabel = fullText;
        let isAutoNumbered = false;
        let listLevel = -1;
        let manualLabelMatchEnd = 0;
        if (numId && ilvl != null) {
            isAutoNumbered = true;
            listLevel = ilvl;
            labelString = computeListLabelForParagraph(numberingModel, counters, { numId, ilvl }) || '';
            if (labelString) {
                const cls = extractListLabel(labelString);
                labelType = cls.label_type;
            }
        }
        else {
            // Manual label detection from visible text.
            const stripped = stripListLabel(fullText);
            cleanTextNoLabel = stripped.stripped_text;
            if (stripped.result.label) {
                labelString = stripped.result.label;
                labelType = stripped.result.label_type;
                listLevel = 0;
                manualLabelMatchEnd = stripped.result.match_end;
            }
        }
        // Run-in header detection (formatting-based) first.
        let headerText = null;
        let headerStyle = null;
        let headerFormatting = null;
        let headerCharCount = 0;
        try {
            const hdr = detectRunInHeader({ paragraph: p, paragraphPPr: paraPPr ?? null, paragraphStyleId: paraFmt.styleId, styles: stylesModel });
            if (hdr) {
                headerText = hdr.raw_text.replace(/[.:\-]+$/g, '');
                headerStyle = 'run_in_header';
                headerFormatting = hdr.formatting;
                headerCharCount = hdr.headerCharCount;
            }
        }
        catch {
            // Best effort: fall through to the text-based header fallback below.
        }
        if (!headerText) {
            const fallback = extractHeaderInfo(cleanTextNoLabel);
            headerText = fallback.header_text;
            headerStyle = fallback.header_style;
        }
        // ── Tag emission ──
        let tagged = cleanTextNoLabel;
        // Tag-free text that `tagged` renders; used to place footnote markers.
        let viewPlainText = cleanTextNoLabel;
        if (showFormatting) {
            // Formatting tags mode: emit inline <b>/<i>/<u>/<highlighting>/<a> tags.
            const annotatedRuns = paraAnnotatedRuns.get(p) ?? [];
            // Mark header-prefix runs as isHeaderRun.
            if (headerCharCount > 0) {
                let charsSeen = 0;
                for (const ar of annotatedRuns) {
                    if (charsSeen >= headerCharCount)
                        break;
                    ar.isHeaderRun = true;
                    charsSeen += ar.charCount;
                }
            }
            // Handle manual label: skip runs whose text falls within the label portion.
            let bodyRuns;
            if (manualLabelMatchEnd > 0) {
                // Skip characters in the label portion.
                bodyRuns = [];
                let charsSeen = 0;
                for (const ar of annotatedRuns) {
                    const runEnd = charsSeen + ar.charCount;
                    if (runEnd <= manualLabelMatchEnd) {
                        // Entire run is within the label — skip it.
                        charsSeen = runEnd;
                        continue;
                    }
                    if (charsSeen < manualLabelMatchEnd) {
                        // Run spans the label boundary — take only the body portion.
                        const bodyStart = manualLabelMatchEnd - charsSeen;
                        bodyRuns.push({
                            ...ar,
                            text: ar.text.slice(bodyStart),
                            charCount: ar.charCount - bodyStart,
                        });
                        charsSeen = runEnd;
                        continue;
                    }
                    bodyRuns.push(ar);
                    charsSeen = runEnd;
                }
                // Also trim leading whitespace from the first body run (matching stripListLabel behavior).
                if (bodyRuns.length > 0) {
                    const first = bodyRuns[0];
                    const trimmed = first.text.replace(/^\s+/, '');
                    if (trimmed.length < first.text.length) {
                        bodyRuns[0] = { ...first, text: trimmed, charCount: trimmed.length };
                    }
                }
            }
            else {
                bodyRuns = annotatedRuns;
            }
            // Emit formatting tags from run-level metadata.
            // Detect definition spans from the exact body run text used for emission so
            // span offsets stay aligned after list-label stripping/slicing.
            const bodyPlainText = bodyRuns.map((r) => r.text).join('');
            const defSpans = includeSemantic ? detectDefinitionSpans(bodyPlainText) : undefined;
            tagged = emitFormattingTags({ runs: bodyRuns, baseline: docBaseline, definitionSpans: defSpans });
            tagged = mergeAdjacentTags(tagged);
            viewPlainText = bodyPlainText;
        }
        else if (includeSemantic) {
            // Legacy path: emit only highlight + definition tags (no formatting tags).
            const highlightTagged = emitHighlightTagsFromParagraph(p).replace(/\r/g, '').replace(/\n/g, '').trim();
            const semanticBase = cleanTextNoLabel === fullText ? highlightTagged : cleanTextNoLabel;
            tagged = emitDefinitionTagsFromString(semanticBase);
        }
        const fp = {
            list_level: listLevel,
            left_indent_pt: Math.round(paraFmt.leftIndentPt * 10) / 10,
            first_line_indent_pt: Math.round(paraFmt.firstLineIndentPt * 10) / 10,
            style_name: paraFmt.styleName,
            alignment: paraFmt.alignment,
        };
        // Body run formatting: pick the first visible run after any header prefix.
        let bodyFmt = null;
        try {
            const trs = getParagraphRuns(p);
            const seenRun = new Set();
            for (const tr of trs) {
                if (seenRun.has(tr.r))
                    continue;
                seenRun.add(tr.r);
                const fmt = extractEffectiveRunFormatting({ run: tr.r, paragraphPPr: paraPPr ?? null, paragraphStyleId: paraFmt.styleId, styles: stylesModel });
                // Skip header-style runs (bold/underline) at the very beginning; we want body.
                const isHeaderStyle = fmt.bold || fmt.underline;
                if (isHeaderStyle && (headerText || '').length > 0)
                    continue;
                bodyFmt = fmt;
                break;
            }
        }
        catch {
            bodyFmt = null;
        }
        // Inject footnote [^N] markers into view text (view-only, not shared text primitives).
        // Offsets come back in paragraph-text coordinates, while `tagged` has had
        // CR/LF removed, been trimmed, possibly lost a list label and may contain
        // inline tags — so they are re-aligned to `tagged` before insertion.
        const fnMarkers = getFootnoteMarkersForParagraph(p, footnoteDisplayMap);
        if (fnMarkers.length > 0) {
            const alignedMarkers = alignFootnoteMarkersToViewText(fnMarkers, paragraphText, viewPlainText, tagged);
            tagged = injectFootnoteMarkers(tagged, alignedMarkers);
        }
        nodes.push({
            id,
            list_label: labelString,
            header: headerText ?? '',
            style: '', // filled after style discovery
            text: tagged, // filled after header stripping at render time
            clean_text: cleanTextNoLabel,
            tagged_text: tagged,
            list_metadata: {
                list_level: listLevel,
                label_type: labelType,
                label_string: labelString,
                header_text: headerText,
                header_style: headerStyle,
                header_formatting: headerFormatting,
                is_auto_numbered: isAutoNumbered,
            },
            style_fingerprint: fp,
            paragraph_style_id: paraFmt.styleId,
            paragraph_style_name: paraFmt.styleName,
            paragraph_alignment: paraFmt.alignment,
            paragraph_indents_pt: { left: fp.left_indent_pt, first_line: fp.first_line_indent_pt },
            numbering: { num_id: numId, ilvl, is_auto_numbered: isAutoNumbered },
            header_formatting: headerFormatting,
            body_run_formatting: bodyFmt,
        });
    }
    const styles = discoverStyles(nodes);
    for (const n of nodes) {
        const sid = styles.fingerprint_to_style.get(fingerprintKey(n.style_fingerprint));
        // Fall back to a level-based or generic style id when discovery has no match.
        n.style = sid ?? (n.style_fingerprint.list_level >= 0 ? `level_${n.style_fingerprint.list_level}` : 'body');
        n.text = n.tagged_text;
    }
    return { nodes, styles };
}
|
|
722
|
+
//# sourceMappingURL=document_view.js.map
|