hwp-convert 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +185 -0
- package/LICENSE +25 -0
- package/NOTICE +23 -0
- package/README.md +338 -0
- package/dist/browser/hwp-convert.browser.mjs +20677 -0
- package/dist/browser/hwp-convert.browser.mjs.map +7 -0
- package/dist/cli.d.ts +2 -0
- package/dist/cli.js +267 -0
- package/dist/index.d.ts +5 -0
- package/dist/index.js +5 -0
- package/dist/lib/errors.d.ts +9 -0
- package/dist/lib/errors.js +18 -0
- package/dist/lib/hwp/binData.d.ts +15 -0
- package/dist/lib/hwp/binData.js +64 -0
- package/dist/lib/hwp/bodyText.d.ts +31 -0
- package/dist/lib/hwp/bodyText.js +208 -0
- package/dist/lib/hwp/byteReader.d.ts +40 -0
- package/dist/lib/hwp/byteReader.js +116 -0
- package/dist/lib/hwp/cfbReader.d.ts +44 -0
- package/dist/lib/hwp/cfbReader.js +134 -0
- package/dist/lib/hwp/control.d.ts +17 -0
- package/dist/lib/hwp/control.js +290 -0
- package/dist/lib/hwp/converter.d.ts +22 -0
- package/dist/lib/hwp/converter.js +41 -0
- package/dist/lib/hwp/docInfo.d.ts +26 -0
- package/dist/lib/hwp/docInfo.js +396 -0
- package/dist/lib/hwp/fileHeader.d.ts +42 -0
- package/dist/lib/hwp/fileHeader.js +66 -0
- package/dist/lib/hwp/htmlReader.d.ts +17 -0
- package/dist/lib/hwp/htmlReader.js +602 -0
- package/dist/lib/hwp/hwpxBuilder.d.ts +19 -0
- package/dist/lib/hwp/hwpxBuilder.js +633 -0
- package/dist/lib/hwp/index.d.ts +68 -0
- package/dist/lib/hwp/index.js +149 -0
- package/dist/lib/hwp/mdReader.d.ts +16 -0
- package/dist/lib/hwp/mdReader.js +485 -0
- package/dist/lib/hwp/mdWriter.d.ts +23 -0
- package/dist/lib/hwp/mdWriter.js +182 -0
- package/dist/lib/hwp/owpml.d.ts +33 -0
- package/dist/lib/hwp/owpml.js +86 -0
- package/dist/lib/hwp/record.d.ts +24 -0
- package/dist/lib/hwp/record.js +59 -0
- package/dist/lib/hwp/tags.d.ts +115 -0
- package/dist/lib/hwp/tags.js +217 -0
- package/dist/lib/hwp/types.d.ts +214 -0
- package/dist/lib/hwp/types.js +5 -0
- package/dist/lib/hwpxReader.d.ts +60 -0
- package/dist/lib/hwpxReader.js +1104 -0
- package/dist/lib/types.d.ts +47 -0
- package/dist/lib/types.js +1 -0
- package/dist/lib/writer.d.ts +19 -0
- package/dist/lib/writer.js +149 -0
- package/package.json +94 -0
|
@@ -0,0 +1,1104 @@
|
|
|
1
|
+
import { XMLParser } from "fast-xml-parser";
|
|
2
|
+
import JSZip from "jszip";
|
|
3
|
+
import { HwpxEncryptedDocumentError, HwpxNotLoadedError } from "./errors.js";
|
|
4
|
+
const DECODER_UTF8 = new TextDecoder("utf-8");
|
|
5
|
+
const DECODER_UTF16LE = new TextDecoder("utf-16le");
|
|
6
|
+
const DECODER_UTF16BE = new TextDecoder("utf-16be");
|
|
7
|
+
/**
 * Guess the text encoding of a byte buffer.
 *
 * A byte-order mark decides the answer when present. Otherwise NUL bytes in
 * the first 1024 bytes are counted by index parity: zeros that sit mostly on
 * odd indexes select UTF-16LE, zeros mostly on even indexes select UTF-16BE,
 * and anything else is treated as UTF-8.
 *
 * @param {Uint8Array} bytes - Raw bytes to inspect.
 * @returns {"utf-8" | "utf-16le" | "utf-16be"} The detected encoding label.
 */
function detectTextEncoding(bytes) {
    const startsWith = (...signature) =>
        bytes.length >= signature.length &&
        signature.every((expected, offset) => bytes[offset] === expected);

    if (startsWith(0xef, 0xbb, 0xbf)) {
        return "utf-8";
    }
    if (startsWith(0xff, 0xfe)) {
        return "utf-16le";
    }
    if (startsWith(0xfe, 0xff)) {
        return "utf-16be";
    }

    // No BOM: tally NUL bytes by index parity ([even, odd]).
    const limit = Math.min(bytes.length, 1024);
    const nulCounts = [0, 0];
    for (let pos = 0; pos < limit; pos += 1) {
        if (bytes[pos] === 0) {
            nulCounts[pos % 2] += 1;
        }
    }
    const [evenNuls, oddNuls] = nulCounts;

    if (oddNuls > evenNuls * 2) {
        return "utf-16le"; // LE layout: xx 00 xx 00
    }
    if (evenNuls > oddNuls * 2) {
        return "utf-16be"; // BE layout: 00 xx 00 xx
    }
    return "utf-8";
}
|
|
26
|
+
/**
 * Decode a byte buffer to a string using the encoding reported by
 * detectTextEncoding, dropping a leading byte-order mark when it matches
 * that encoding.
 *
 * @param {Uint8Array} bytes - Raw bytes to decode.
 * @returns {string} The decoded text.
 */
function decodeBytesSmart(bytes) {
    const encoding = detectTextEncoding(bytes);

    let decoder = DECODER_UTF16BE;
    let bom = [];
    if (encoding === "utf-8") {
        decoder = DECODER_UTF8;
        bom = [0xef, 0xbb, 0xbf];
    }
    else if (encoding === "utf-16le") {
        decoder = DECODER_UTF16LE;
        bom = [0xff, 0xfe];
    }
    else if (encoding === "utf-16be") {
        bom = [0xfe, 0xff];
    }

    const hasBom = bom.length > 0 &&
        bytes.length >= bom.length &&
        bom.every((expected, offset) => bytes[offset] === expected);
    const payload = hasBom ? bytes.subarray(bom.length) : bytes;
    return decoder.decode(payload);
}
|
|
44
|
+
/**
 * Normalize a nullish value to `undefined`; any other value is returned
 * unchanged.
 *
 * @param {*} value - Value to normalize.
 * @returns {*} `undefined` for null/undefined input, otherwise `value`.
 */
function getOrEmpty(value) {
    if (value === null || value === undefined) {
        return undefined;
    }
    return value;
}
|
|
47
|
+
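/**
 * NOTE: this comment describes pushTextNode (defined further below), not escapeMd.
 * Text nodes produced by fast-xml-parser may appear not only as a string but also as a
 * number/boolean, an object ({#text}), or an array. Every case is normalized to a
 * string and appended to `pieces`.
 */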
|
|
51
|
+
/**
 * Backslash-escape characters that would otherwise be read as Markdown
 * syntax: backslash, `*`, `_`, backtick and `~` anywhere in the text, plus
 * `#` or `>` when it is the first character of a line.
 *
 * @param {string} text - Plain text to escape.
 * @returns {string} Text safe to embed in Markdown output.
 */
function escapeMd(text) {
    // Single pass over inline markers; a backslash becomes two backslashes.
    const inlineEscaped = text.replace(/[\\*_`~]/g, "\\$&");
    // Line-leading heading / block-quote markers.
    return inlineEscaped.replace(/^[#>]/gm, "\\$&");
}
|
|
57
|
+
/**
 * Append the textual content of a parsed XML text node to `pieces`.
 *
 * Strings are appended as-is, numbers and booleans are stringified, arrays
 * are walked element by element, and objects contribute their `#text`
 * member (recursively). null/undefined and every other kind of value are
 * ignored.
 *
 * @param {*} t - Parsed text node in any of the shapes above.
 * @param {string[]} pieces - Accumulator that receives the text fragments.
 * @returns {void}
 */
function pushTextNode(t, pieces) {
    if (t === undefined || t === null) {
        return;
    }
    if (Array.isArray(t)) {
        t.forEach((entry) => pushTextNode(entry, pieces));
        return;
    }
    switch (typeof t) {
        case "string":
            pieces.push(t);
            break;
        case "number":
        case "boolean":
            pieces.push(String(t));
            break;
        case "object": {
            const nested = t["#text"];
            if (nested !== undefined) {
                pushTextNode(nested, pieces);
            }
            break;
        }
        default:
            break;
    }
}
|
|
79
|
+
export class HwpxReader {
|
|
80
|
+
zip = null;
|
|
81
|
+
files = {};
|
|
82
|
+
encryptedCache = null;
|
|
83
|
+
characterProperties = new Map();
|
|
84
|
+
fontFaces = new Map();
|
|
85
|
+
async loadFromArrayBuffer(buffer) {
|
|
86
|
+
const zip = await JSZip.loadAsync(buffer);
|
|
87
|
+
this.zip = zip;
|
|
88
|
+
this.files = {};
|
|
89
|
+
const entries = Object.keys(zip.files);
|
|
90
|
+
await Promise.all(entries.map(async (name) => {
|
|
91
|
+
const file = zip.file(name);
|
|
92
|
+
if (!file)
|
|
93
|
+
return;
|
|
94
|
+
// store raw bytes for flexible processing (images, xml, etc.)
|
|
95
|
+
this.files[name] = new Uint8Array(await file.async("uint8array"));
|
|
96
|
+
}));
|
|
97
|
+
// Validate mimetype (per spec: application/owpml). 다양한 변형을 수용하고, 불일치 시에도 진행.
|
|
98
|
+
const mime = this.getTextFile("mimetype")?.trim();
|
|
99
|
+
if (mime && !this.isLikelyHwpxMime(mime)) {
|
|
100
|
+
// 엄격 차단 대신 경고성 에러로 유지하려면 throw를 피한다.
|
|
101
|
+
// throw new InvalidHwpxFormatError();
|
|
102
|
+
}
|
|
103
|
+
// Try to locate content via META-INF/container.xml if present (not mandatory but helpful)
|
|
104
|
+
const containerXml = this.getTextFile("META-INF/container.xml");
|
|
105
|
+
if (containerXml) {
|
|
106
|
+
const cx = this.parseXml(containerXml);
|
|
107
|
+
// not strictly necessary now; reserved for future rootfile discovery
|
|
108
|
+
void cx;
|
|
109
|
+
}
|
|
110
|
+
// Parse styles from header.xml
|
|
111
|
+
this.parseStyleDefinitions();
|
|
112
|
+
}
|
|
113
|
+
isLikelyHwpxMime(m) {
|
|
114
|
+
const s = m.toLowerCase();
|
|
115
|
+
// 허용: application/hwp+zip(한컴 표준), application/owpml, hwpx/owpml/hwp+zip 포함 케이스
|
|
116
|
+
return (s === "application/hwp+zip" ||
|
|
117
|
+
s === "application/owpml" ||
|
|
118
|
+
s.includes("hwp+zip") ||
|
|
119
|
+
s.includes("owpml") ||
|
|
120
|
+
s.includes("hwpx"));
|
|
121
|
+
}
|
|
122
|
+
getTextFile(path) {
|
|
123
|
+
const bytes = this.files[path];
|
|
124
|
+
if (!bytes)
|
|
125
|
+
return null;
|
|
126
|
+
return decodeBytesSmart(bytes);
|
|
127
|
+
}
|
|
128
|
+
findFilePathIgnoreCase(targetPath) {
|
|
129
|
+
const lower = targetPath.toLowerCase();
|
|
130
|
+
for (const key of Object.keys(this.files)) {
|
|
131
|
+
if (key.toLowerCase() === lower)
|
|
132
|
+
return key;
|
|
133
|
+
}
|
|
134
|
+
return null;
|
|
135
|
+
}
|
|
136
|
+
parseXml(xml) {
|
|
137
|
+
try {
|
|
138
|
+
const parser = new XMLParser({
|
|
139
|
+
ignoreAttributes: false,
|
|
140
|
+
attributeNamePrefix: "@",
|
|
141
|
+
// 텍스트 내부 공백 보존 — `<hp:t>이것은 </hp:t>` 같은 run 사이 공백이 사라지지 않도록
|
|
142
|
+
trimValues: false,
|
|
143
|
+
removeNSPrefix: true,
|
|
144
|
+
// 텍스트 노드 자동 타입 변환 끄기 — "1", "true" 등이 number/boolean 으로 변환되는 것을 방지
|
|
145
|
+
parseTagValue: false,
|
|
146
|
+
parseAttributeValue: false,
|
|
147
|
+
});
|
|
148
|
+
const obj = parser.parse(xml);
|
|
149
|
+
return obj;
|
|
150
|
+
}
|
|
151
|
+
catch (_err) {
|
|
152
|
+
return null;
|
|
153
|
+
}
|
|
154
|
+
}
|
|
155
|
+
summarizePackage() {
|
|
156
|
+
const hasEncryptionInfo = this.detectEncryption();
|
|
157
|
+
const contentsFiles = Object.keys(this.files).filter((p) => p.startsWith("Contents/")).sort();
|
|
158
|
+
const contentHpf = this.getTextFile("Contents/content.hpf");
|
|
159
|
+
let manifest;
|
|
160
|
+
let spine;
|
|
161
|
+
if (contentHpf) {
|
|
162
|
+
const xml = this.parseXml(contentHpf);
|
|
163
|
+
const pkg = xml?.package ?? xml?.opf?.package;
|
|
164
|
+
const man = pkg?.manifest?.item;
|
|
165
|
+
if (man) {
|
|
166
|
+
const items = Array.isArray(man) ? man : [man];
|
|
167
|
+
manifest = items.map((it) => ({
|
|
168
|
+
id: it?.["@id"],
|
|
169
|
+
href: it?.["@href"],
|
|
170
|
+
mediaType: it?.["@media-type"] ?? it?.["@mediaType"],
|
|
171
|
+
}));
|
|
172
|
+
}
|
|
173
|
+
const sp = pkg?.spine?.itemref ?? pkg?.spine?.itemRef;
|
|
174
|
+
if (sp) {
|
|
175
|
+
const refs = Array.isArray(sp) ? sp : [sp];
|
|
176
|
+
spine = refs.map((r) => r?.["@idref"] ?? r?.["@idRef"]).filter(Boolean);
|
|
177
|
+
}
|
|
178
|
+
}
|
|
179
|
+
return { hasEncryptionInfo, contentsFiles, manifest, spine };
|
|
180
|
+
}
|
|
181
|
+
getSectionPathsBySpine() {
|
|
182
|
+
const contentHpf = this.getTextFile("Contents/content.hpf");
|
|
183
|
+
if (!contentHpf)
|
|
184
|
+
return null;
|
|
185
|
+
const xml = this.parseXml(contentHpf);
|
|
186
|
+
const pkg = xml?.package ?? xml?.opf?.package;
|
|
187
|
+
const man = pkg?.manifest?.item;
|
|
188
|
+
const map = new Map(); // id -> href
|
|
189
|
+
if (man) {
|
|
190
|
+
const items = Array.isArray(man) ? man : [man];
|
|
191
|
+
for (const it of items) {
|
|
192
|
+
const id = it?.["@id"];
|
|
193
|
+
const href = it?.["@href"];
|
|
194
|
+
if (id && href && /Contents\/section\d+\.xml$/i.test(href))
|
|
195
|
+
map.set(id, href);
|
|
196
|
+
}
|
|
197
|
+
}
|
|
198
|
+
const sp = pkg?.spine?.itemref ?? pkg?.spine?.itemRef;
|
|
199
|
+
const refs = sp ? (Array.isArray(sp) ? sp : [sp]) : [];
|
|
200
|
+
const paths = [];
|
|
201
|
+
for (const r of refs) {
|
|
202
|
+
const id = r?.["@idref"] ?? r?.["@idRef"];
|
|
203
|
+
const href = id ? map.get(id) : undefined;
|
|
204
|
+
if (href && this.files[href])
|
|
205
|
+
paths.push(href);
|
|
206
|
+
}
|
|
207
|
+
return paths.length ? paths : null;
|
|
208
|
+
}
|
|
209
|
+
detectEncryption() {
|
|
210
|
+
if (this.encryptedCache !== null)
|
|
211
|
+
return this.encryptedCache;
|
|
212
|
+
const manifestXml = this.getTextFile("META-INF/manifest.xml");
|
|
213
|
+
if (!manifestXml) {
|
|
214
|
+
this.encryptedCache = false;
|
|
215
|
+
return false;
|
|
216
|
+
}
|
|
217
|
+
const obj = this.parseXml(manifestXml);
|
|
218
|
+
const has = this.containsEncryptionMarker(obj);
|
|
219
|
+
this.encryptedCache = !!has;
|
|
220
|
+
return this.encryptedCache;
|
|
221
|
+
}
|
|
222
|
+
containsEncryptionMarker(node) {
|
|
223
|
+
if (!node)
|
|
224
|
+
return false;
|
|
225
|
+
if (typeof node === "string") {
|
|
226
|
+
return /encrypt|cipher/i.test(node);
|
|
227
|
+
}
|
|
228
|
+
if (Array.isArray(node)) {
|
|
229
|
+
for (const item of node) {
|
|
230
|
+
if (this.containsEncryptionMarker(item))
|
|
231
|
+
return true;
|
|
232
|
+
}
|
|
233
|
+
return false;
|
|
234
|
+
}
|
|
235
|
+
if (typeof node === "object") {
|
|
236
|
+
for (const [k, v] of Object.entries(node)) {
|
|
237
|
+
if (/encrypt|cipher/i.test(k))
|
|
238
|
+
return true;
|
|
239
|
+
if (typeof v === "string" && /encrypt|cipher/i.test(v))
|
|
240
|
+
return true;
|
|
241
|
+
if (this.containsEncryptionMarker(v))
|
|
242
|
+
return true;
|
|
243
|
+
}
|
|
244
|
+
}
|
|
245
|
+
return false;
|
|
246
|
+
}
|
|
247
|
+
readMetadata() {
|
|
248
|
+
const contentHpf = this.getTextFile("Contents/content.hpf");
|
|
249
|
+
const metadata = {};
|
|
250
|
+
if (contentHpf) {
|
|
251
|
+
const xml = this.parseXml(contentHpf);
|
|
252
|
+
// OPF-like: package > metadata
|
|
253
|
+
const md = xml?.package?.metadata;
|
|
254
|
+
if (md) {
|
|
255
|
+
metadata.title = getOrEmpty(md["dc:title"] ?? md.title);
|
|
256
|
+
metadata.creator = getOrEmpty(md["dc:creator"] ?? md.creator);
|
|
257
|
+
metadata.created = getOrEmpty(md["dcterms:created"] ?? md.created);
|
|
258
|
+
metadata.modified = getOrEmpty(md["dcterms:modified"] ?? md.modified);
|
|
259
|
+
}
|
|
260
|
+
}
|
|
261
|
+
const versionXml = this.getTextFile("version.xml");
|
|
262
|
+
if (versionXml) {
|
|
263
|
+
const v = this.parseXml(versionXml);
|
|
264
|
+
const ver = v?.Version?.OWPMLVersion ?? v?.version?.owpmlVersion;
|
|
265
|
+
if (typeof ver === "string") {
|
|
266
|
+
metadata.version = ver;
|
|
267
|
+
}
|
|
268
|
+
}
|
|
269
|
+
const settingsXml = this.getTextFile("settings.xml");
|
|
270
|
+
if (settingsXml) {
|
|
271
|
+
const s = this.parseXml(settingsXml);
|
|
272
|
+
// 표준 예시: ha:HWPApplicationSetting > ha:CaretPosition(listIDRef, paraIDRef, pos)
|
|
273
|
+
const app = s?.HWPApplicationSetting ?? s?.Settings ?? s?.settings;
|
|
274
|
+
const caret = app?.CaretPosition ?? app?.caretPosition;
|
|
275
|
+
if (caret && (caret["@listIDRef"] || caret["@paraIDRef"] || caret["@pos"])) {
|
|
276
|
+
const listId = caret["@listIDRef"] ?? "0";
|
|
277
|
+
const paraId = caret["@paraIDRef"] ?? "0";
|
|
278
|
+
const pos = caret["@pos"] ?? "0";
|
|
279
|
+
metadata.caretPosition = `${listId}:${paraId}:${pos}`;
|
|
280
|
+
}
|
|
281
|
+
}
|
|
282
|
+
return metadata;
|
|
283
|
+
}
|
|
284
|
+
async getDocumentInfo() {
|
|
285
|
+
if (!this.zip)
|
|
286
|
+
throw new HwpxNotLoadedError();
|
|
287
|
+
const summary = this.summarizePackage();
|
|
288
|
+
const metadata = this.readMetadata();
|
|
289
|
+
return { metadata, summary };
|
|
290
|
+
}
|
|
291
|
+
async extractText(options) {
|
|
292
|
+
if (!this.zip)
|
|
293
|
+
throw new HwpxNotLoadedError();
|
|
294
|
+
const summary = this.summarizePackage();
|
|
295
|
+
if (summary.hasEncryptionInfo) {
|
|
296
|
+
throw new HwpxEncryptedDocumentError();
|
|
297
|
+
}
|
|
298
|
+
// HWPX 본문: Contents/section*.xml 에서 hp:t 텍스트를 추출
|
|
299
|
+
const joiner = options?.joinParagraphs ?? "\n";
|
|
300
|
+
let sectionPaths = this.getSectionPathsBySpine() ?? Object.keys(this.files)
|
|
301
|
+
.filter((p) => /^contents\/section\d+\.xml$/.test(p.toLowerCase()))
|
|
302
|
+
.sort((a, b) => {
|
|
303
|
+
const na = Number(a.match(/section(\d+)\.xml/)?.[1] ?? 0);
|
|
304
|
+
const nb = Number(b.match(/section(\d+)\.xml/)?.[1] ?? 0);
|
|
305
|
+
return na - nb;
|
|
306
|
+
});
|
|
307
|
+
// Fallback: 탐색에 실패하면 Contents/*.xml 중 루트가 section 인 파일을 수색
|
|
308
|
+
if (sectionPaths.length === 0) {
|
|
309
|
+
const candidates = Object.keys(this.files).filter((p) => p.startsWith("Contents/") && p.toLowerCase().endsWith(".xml"));
|
|
310
|
+
for (const p of candidates) {
|
|
311
|
+
const xmlText = this.getTextFile(p);
|
|
312
|
+
if (!xmlText)
|
|
313
|
+
continue;
|
|
314
|
+
const xml = this.parseXml(xmlText);
|
|
315
|
+
if (xml && (xml.sec || xml.section || xml["hp:section"])) {
|
|
316
|
+
sectionPaths.push(p);
|
|
317
|
+
}
|
|
318
|
+
}
|
|
319
|
+
sectionPaths.sort((a, b) => {
|
|
320
|
+
const na = Number(a.match(/section(\d+)\.xml/)?.[1] ?? 0);
|
|
321
|
+
const nb = Number(b.match(/section(\d+)\.xml/)?.[1] ?? 0);
|
|
322
|
+
return na - nb;
|
|
323
|
+
});
|
|
324
|
+
}
|
|
325
|
+
const paragraphs = [];
|
|
326
|
+
for (const path of sectionPaths) {
|
|
327
|
+
const xmlText = this.getTextFile(path);
|
|
328
|
+
if (!xmlText)
|
|
329
|
+
continue;
|
|
330
|
+
const xml = this.parseXml(xmlText);
|
|
331
|
+
// 구조 참고: sec > p* > run* > t, 네임스페이스 제거됨
|
|
332
|
+
const section = xml?.sec ?? xml?.section ?? xml?.["hp:section"];
|
|
333
|
+
if (!section) {
|
|
334
|
+
const segs = [];
|
|
335
|
+
this.collectAllText(xml, segs);
|
|
336
|
+
if (segs.length)
|
|
337
|
+
paragraphs.push(segs.join(""));
|
|
338
|
+
continue;
|
|
339
|
+
}
|
|
340
|
+
const ps = section?.p ?? section?.["hp:p"];
|
|
341
|
+
if (!ps) {
|
|
342
|
+
const segs = [];
|
|
343
|
+
this.collectAllText(section, segs);
|
|
344
|
+
if (segs.length)
|
|
345
|
+
paragraphs.push(segs.join(""));
|
|
346
|
+
continue;
|
|
347
|
+
}
|
|
348
|
+
const paras = Array.isArray(ps) ? ps : [ps];
|
|
349
|
+
for (const p of paras) {
|
|
350
|
+
paragraphs.push(this.extractParagraphText(p));
|
|
351
|
+
}
|
|
352
|
+
}
|
|
353
|
+
const combined = paragraphs.join(joiner);
|
|
354
|
+
if (combined.trim().length > 0)
|
|
355
|
+
return combined;
|
|
356
|
+
// Fallback: Preview text
|
|
357
|
+
const prvPath = this.findFilePathIgnoreCase("Preview/PrvText.txt") ||
|
|
358
|
+
this.findFilePathIgnoreCase("preview/prvtext.txt");
|
|
359
|
+
if (prvPath) {
|
|
360
|
+
const prv = this.getTextFile(prvPath);
|
|
361
|
+
if (prv && prv.trim().length > 0)
|
|
362
|
+
return prv;
|
|
363
|
+
}
|
|
364
|
+
return combined;
|
|
365
|
+
}
|
|
366
|
+
/**
|
|
367
|
+
* 한 문단(<hp:p>)에서 텍스트를 추출. 표/이미지 등 인라인 컨트롤이 있으면 셀/내부 문단을 재귀 탐색.
|
|
368
|
+
*
|
|
369
|
+
* 표는 셀 단위로 텍스트를 모은 후 같은 행 내 셀은 공백으로, 행 사이는 줄바꿈으로 결합한다.
|
|
370
|
+
*/
|
|
371
|
+
extractParagraphText(p) {
|
|
372
|
+
const runs = p?.run ?? p?.["hp:run"];
|
|
373
|
+
if (!runs)
|
|
374
|
+
return "";
|
|
375
|
+
const runArr = Array.isArray(runs) ? runs : [runs];
|
|
376
|
+
const pieces = [];
|
|
377
|
+
for (const run of runArr) {
|
|
378
|
+
// 섹션/컬럼 설정 같은 메타 컨트롤은 텍스트 없음 — secPr/ctrl 안에 든 자식까지 무시
|
|
379
|
+
if (run?.secPr || run?.ctrl)
|
|
380
|
+
continue;
|
|
381
|
+
// 직접 텍스트
|
|
382
|
+
const t = run?.t ?? run?.["hp:t"];
|
|
383
|
+
pushTextNode(t, pieces);
|
|
384
|
+
// 표
|
|
385
|
+
const tbl = run?.tbl ?? run?.["hp:tbl"];
|
|
386
|
+
if (tbl) {
|
|
387
|
+
const tbls = Array.isArray(tbl) ? tbl : [tbl];
|
|
388
|
+
for (const tb of tbls) {
|
|
389
|
+
pieces.push(this.extractTableText(tb));
|
|
390
|
+
}
|
|
391
|
+
}
|
|
392
|
+
}
|
|
393
|
+
return pieces.join("");
|
|
394
|
+
}
|
|
395
|
+
extractTableText(tbl) {
|
|
396
|
+
const trs = tbl?.tr ?? tbl?.["hp:tr"];
|
|
397
|
+
if (!trs)
|
|
398
|
+
return "";
|
|
399
|
+
const trArr = Array.isArray(trs) ? trs : [trs];
|
|
400
|
+
const rowTexts = [];
|
|
401
|
+
for (const tr of trArr) {
|
|
402
|
+
const tcs = tr?.tc ?? tr?.["hp:tc"];
|
|
403
|
+
if (!tcs)
|
|
404
|
+
continue;
|
|
405
|
+
const tcArr = Array.isArray(tcs) ? tcs : [tcs];
|
|
406
|
+
const cellTexts = [];
|
|
407
|
+
for (const tc of tcArr) {
|
|
408
|
+
cellTexts.push(this.extractCellText(tc));
|
|
409
|
+
}
|
|
410
|
+
rowTexts.push(cellTexts.join(" "));
|
|
411
|
+
}
|
|
412
|
+
return rowTexts.join("\n");
|
|
413
|
+
}
|
|
414
|
+
extractCellText(tc) {
|
|
415
|
+
const sub = tc?.subList ?? tc?.["hp:subList"];
|
|
416
|
+
if (!sub)
|
|
417
|
+
return "";
|
|
418
|
+
const ps = sub?.p ?? sub?.["hp:p"];
|
|
419
|
+
if (!ps)
|
|
420
|
+
return "";
|
|
421
|
+
const paras = Array.isArray(ps) ? ps : [ps];
|
|
422
|
+
return paras.map((q) => this.extractParagraphText(q)).join("\n");
|
|
423
|
+
}
|
|
424
|
+
/**
|
|
425
|
+
* 문서 전체를 Markdown 으로 변환.
|
|
426
|
+
* 표는 마크다운 표 (셀 병합은 평탄화), 이미지는 ``.
|
|
427
|
+
*/
|
|
428
|
+
async extractMarkdown(options) {
|
|
429
|
+
if (!this.zip)
|
|
430
|
+
throw new HwpxNotLoadedError();
|
|
431
|
+
const summary = this.summarizePackage();
|
|
432
|
+
if (summary.hasEncryptionInfo) {
|
|
433
|
+
throw new HwpxEncryptedDocumentError();
|
|
434
|
+
}
|
|
435
|
+
let sectionPaths = this.getSectionPathsBySpine() ?? Object.keys(this.files)
|
|
436
|
+
.filter((p) => /^contents\/section\d+\.xml$/.test(p.toLowerCase()))
|
|
437
|
+
.sort();
|
|
438
|
+
if (sectionPaths.length === 0) {
|
|
439
|
+
const candidates = Object.keys(this.files).filter((p) => p.startsWith("Contents/") && p.toLowerCase().endsWith(".xml"));
|
|
440
|
+
for (const p of candidates) {
|
|
441
|
+
const xmlText = this.getTextFile(p);
|
|
442
|
+
if (!xmlText)
|
|
443
|
+
continue;
|
|
444
|
+
const xml = this.parseXml(xmlText);
|
|
445
|
+
if (xml && (xml.sec || xml.section || xml["hp:section"]))
|
|
446
|
+
sectionPaths.push(p);
|
|
447
|
+
}
|
|
448
|
+
}
|
|
449
|
+
const blocks = [];
|
|
450
|
+
for (const path of sectionPaths) {
|
|
451
|
+
const xmlText = this.getTextFile(path);
|
|
452
|
+
if (!xmlText)
|
|
453
|
+
continue;
|
|
454
|
+
const xml = this.parseXml(xmlText);
|
|
455
|
+
const section = xml?.sec ?? xml?.section ?? xml?.["hp:section"];
|
|
456
|
+
if (!section)
|
|
457
|
+
continue;
|
|
458
|
+
const ps = section?.p ?? section?.["hp:p"];
|
|
459
|
+
if (!ps)
|
|
460
|
+
continue;
|
|
461
|
+
const paras = Array.isArray(ps) ? ps : [ps];
|
|
462
|
+
for (const p of paras) {
|
|
463
|
+
const md = this.extractParagraphMarkdown(p, options);
|
|
464
|
+
if (md.trim().length > 0)
|
|
465
|
+
blocks.push(md);
|
|
466
|
+
}
|
|
467
|
+
}
|
|
468
|
+
return blocks.join("\n\n").trim() + "\n";
|
|
469
|
+
}
|
|
470
|
+
extractParagraphMarkdown(p, options) {
|
|
471
|
+
const runs = p?.run ?? p?.["hp:run"];
|
|
472
|
+
if (!runs)
|
|
473
|
+
return "";
|
|
474
|
+
const runArr = Array.isArray(runs) ? runs : [runs];
|
|
475
|
+
const parts = [];
|
|
476
|
+
let textBuf = "";
|
|
477
|
+
for (const run of runArr) {
|
|
478
|
+
if (run?.secPr || run?.ctrl)
|
|
479
|
+
continue;
|
|
480
|
+
// 텍스트 + charPrIDRef → 굵게/기울임 적용
|
|
481
|
+
const t = run?.t ?? run?.["hp:t"];
|
|
482
|
+
const pieces = [];
|
|
483
|
+
pushTextNode(t, pieces);
|
|
484
|
+
let raw = pieces.join("");
|
|
485
|
+
if (raw.length > 0) {
|
|
486
|
+
const charPrId = run?.["@charPrIDRef"];
|
|
487
|
+
if (charPrId !== undefined && this.characterProperties.has(String(charPrId))) {
|
|
488
|
+
const cs = this.characterProperties.get(String(charPrId));
|
|
489
|
+
let s = escapeMd(raw);
|
|
490
|
+
if (cs?.bold)
|
|
491
|
+
s = `**${s}**`;
|
|
492
|
+
if (cs?.italic)
|
|
493
|
+
s = `*${s}*`;
|
|
494
|
+
textBuf += s;
|
|
495
|
+
}
|
|
496
|
+
else {
|
|
497
|
+
textBuf += escapeMd(raw);
|
|
498
|
+
}
|
|
499
|
+
}
|
|
500
|
+
// 표
|
|
501
|
+
const tbl = run?.tbl ?? run?.["hp:tbl"];
|
|
502
|
+
if (tbl) {
|
|
503
|
+
if (textBuf) {
|
|
504
|
+
parts.push(textBuf);
|
|
505
|
+
textBuf = "";
|
|
506
|
+
}
|
|
507
|
+
const tbls = Array.isArray(tbl) ? tbl : [tbl];
|
|
508
|
+
for (const tb of tbls)
|
|
509
|
+
parts.push(this.extractTableMarkdown(tb, options));
|
|
510
|
+
}
|
|
511
|
+
// 그림
|
|
512
|
+
const pic = run?.pic ?? run?.["hp:pic"];
|
|
513
|
+
if (pic) {
|
|
514
|
+
if (textBuf) {
|
|
515
|
+
parts.push(textBuf);
|
|
516
|
+
textBuf = "";
|
|
517
|
+
}
|
|
518
|
+
const href = pic?.["@href"];
|
|
519
|
+
const img = pic?.img ?? pic?.["hc:img"];
|
|
520
|
+
const ref = img?.["@binaryItemIDRef"];
|
|
521
|
+
const path = typeof href === "string" ? href : ref ? `BinData/${ref}` : "";
|
|
522
|
+
if (path) {
|
|
523
|
+
if (options?.embedImages) {
|
|
524
|
+
const data = this.files[path];
|
|
525
|
+
if (data) {
|
|
526
|
+
const ext = path.split(".").pop()?.toLowerCase() ?? "";
|
|
527
|
+
const mime = ext === "png" ? "image/png" : ext === "jpg" || ext === "jpeg" ? "image/jpeg" : ext === "gif" ? "image/gif" : "application/octet-stream";
|
|
528
|
+
parts.push(`})`);
|
|
529
|
+
}
|
|
530
|
+
else {
|
|
531
|
+
parts.push(``);
|
|
532
|
+
}
|
|
533
|
+
}
|
|
534
|
+
else if (options?.imageSrcResolver) {
|
|
535
|
+
parts.push(`})`);
|
|
536
|
+
}
|
|
537
|
+
else {
|
|
538
|
+
parts.push(``);
|
|
539
|
+
}
|
|
540
|
+
}
|
|
541
|
+
}
|
|
542
|
+
}
|
|
543
|
+
if (textBuf)
|
|
544
|
+
parts.push(textBuf);
|
|
545
|
+
return parts.join("\n\n");
|
|
546
|
+
}
|
|
547
|
+
extractTableMarkdown(tbl, options) {
|
|
548
|
+
const trs = tbl?.tr ?? tbl?.["hp:tr"];
|
|
549
|
+
if (!trs)
|
|
550
|
+
return "";
|
|
551
|
+
const trArr = Array.isArray(trs) ? trs : [trs];
|
|
552
|
+
// 행/셀 텍스트 모음 (병합은 무시 — 마크다운 표 한계)
|
|
553
|
+
const rows = [];
|
|
554
|
+
let maxCols = 0;
|
|
555
|
+
for (const tr of trArr) {
|
|
556
|
+
const tcs = tr?.tc ?? tr?.["hp:tc"];
|
|
557
|
+
if (!tcs)
|
|
558
|
+
continue;
|
|
559
|
+
const tcArr = Array.isArray(tcs) ? tcs : [tcs];
|
|
560
|
+
const cellTexts = [];
|
|
561
|
+
for (const tc of tcArr) {
|
|
562
|
+
const sub = tc?.subList ?? tc?.["hp:subList"];
|
|
563
|
+
if (!sub) {
|
|
564
|
+
cellTexts.push("");
|
|
565
|
+
continue;
|
|
566
|
+
}
|
|
567
|
+
const cps = sub?.p ?? sub?.["hp:p"];
|
|
568
|
+
if (!cps) {
|
|
569
|
+
cellTexts.push("");
|
|
570
|
+
continue;
|
|
571
|
+
}
|
|
572
|
+
const cellParas = Array.isArray(cps) ? cps : [cps];
|
|
573
|
+
const inner = cellParas
|
|
574
|
+
.map((q) => this.extractParagraphMarkdown(q, options))
|
|
575
|
+
.join(" ")
|
|
576
|
+
.replace(/\n+/g, " ")
|
|
577
|
+
.replace(/\|/g, "\\|");
|
|
578
|
+
cellTexts.push(inner);
|
|
579
|
+
}
|
|
580
|
+
if (cellTexts.length > maxCols)
|
|
581
|
+
maxCols = cellTexts.length;
|
|
582
|
+
rows.push(cellTexts);
|
|
583
|
+
}
|
|
584
|
+
if (rows.length === 0)
|
|
585
|
+
return "";
|
|
586
|
+
// 모든 행의 셀 수를 maxCols 로 패딩
|
|
587
|
+
for (const r of rows) {
|
|
588
|
+
while (r.length < maxCols)
|
|
589
|
+
r.push("");
|
|
590
|
+
}
|
|
591
|
+
const fmt = (cells) => `| ${cells.map((c) => c || " ").join(" | ")} |`;
|
|
592
|
+
const lines = [];
|
|
593
|
+
lines.push(fmt(rows[0]));
|
|
594
|
+
lines.push(fmt(new Array(maxCols).fill("---")));
|
|
595
|
+
for (let i = 1; i < rows.length; i++)
|
|
596
|
+
lines.push(fmt(rows[i]));
|
|
597
|
+
return lines.join("\n");
|
|
598
|
+
}
|
|
599
|
+
// 아주 단순한 텍스트 템플릿 치환: {{key}} → value (문단 텍스트에만 적용)
|
|
600
|
+
applyTemplateToText(raw, data) {
|
|
601
|
+
return raw.replace(/\{\{\s*([\w.]+)\s*\}\}/g, (_m, key) => {
|
|
602
|
+
const value = key.split('.').reduce((acc, k) => (acc && acc[k] !== undefined ? acc[k] : undefined), data);
|
|
603
|
+
return value === undefined || value === null ? '' : String(value);
|
|
604
|
+
});
|
|
605
|
+
}
|
|
606
|
+
async extractHtml(options) {
|
|
607
|
+
if (!this.zip)
|
|
608
|
+
throw new HwpxNotLoadedError();
|
|
609
|
+
const summary = this.summarizePackage();
|
|
610
|
+
if (summary.hasEncryptionInfo) {
|
|
611
|
+
throw new HwpxEncryptedDocumentError();
|
|
612
|
+
}
|
|
613
|
+
const paragraphTag = options?.paragraphTag ?? "p";
|
|
614
|
+
const enableImages = options?.renderImages ?? true;
|
|
615
|
+
const enableTables = options?.renderTables ?? true;
|
|
616
|
+
const enableStyles = options?.renderStyles ?? true;
|
|
617
|
+
let sectionPaths = this.getSectionPathsBySpine() ?? Object.keys(this.files)
|
|
618
|
+
.filter((p) => /^contents\/section\d+\.xml$/.test(p.toLowerCase()))
|
|
619
|
+
.sort((a, b) => {
|
|
620
|
+
const na = Number(a.match(/section(\d+)\.xml/)?.[1] ?? 0);
|
|
621
|
+
const nb = Number(b.match(/section(\d+)\.xml/)?.[1] ?? 0);
|
|
622
|
+
return na - nb;
|
|
623
|
+
});
|
|
624
|
+
if (sectionPaths.length === 0) {
|
|
625
|
+
const candidates = Object.keys(this.files).filter((p) => p.startsWith("Contents/") && p.toLowerCase().endsWith(".xml"));
|
|
626
|
+
for (const p of candidates) {
|
|
627
|
+
const xmlText = this.getTextFile(p);
|
|
628
|
+
if (!xmlText)
|
|
629
|
+
continue;
|
|
630
|
+
const xml = this.parseXml(xmlText);
|
|
631
|
+
if (xml && (xml.sec || xml.section || xml["hp:section"])) {
|
|
632
|
+
sectionPaths.push(p);
|
|
633
|
+
}
|
|
634
|
+
}
|
|
635
|
+
sectionPaths.sort((a, b) => {
|
|
636
|
+
const na = Number(a.match(/section(\d+)\.xml/)?.[1] ?? 0);
|
|
637
|
+
const nb = Number(b.match(/section(\d+)\.xml/)?.[1] ?? 0);
|
|
638
|
+
return na - nb;
|
|
639
|
+
});
|
|
640
|
+
}
|
|
641
|
+
const tableClass = options?.tableClassName ?? "hwpx-table";
|
|
642
|
+
const pieces = [];
|
|
643
|
+
for (const path of sectionPaths) {
|
|
644
|
+
const xmlText = this.getTextFile(path);
|
|
645
|
+
if (!xmlText)
|
|
646
|
+
continue;
|
|
647
|
+
const xml = this.parseXml(xmlText);
|
|
648
|
+
const section = xml?.sec ?? xml?.section ?? xml?.["hp:section"];
|
|
649
|
+
if (!section)
|
|
650
|
+
continue;
|
|
651
|
+
// paragraphs (표가 paragraph 안의 run 에 포함되어 있을 수 있으므로 분리 추출)
|
|
652
|
+
const ps = section?.p ?? section?.["hp:p"];
|
|
653
|
+
if (ps) {
|
|
654
|
+
const paras = Array.isArray(ps) ? ps : [ps];
|
|
655
|
+
for (const p of paras) {
|
|
656
|
+
// 텍스트와 이미지 등 인라인 컨텐츠
|
|
657
|
+
const inner = this.renderNodeToHtml(p, { enableImages, enableStyles }, options);
|
|
658
|
+
const alignStyle = this.getAlignStyle(p);
|
|
659
|
+
const styleAttr = alignStyle ? ` style="${alignStyle}"` : "";
|
|
660
|
+
pieces.push(`<${paragraphTag}${styleAttr}>${inner}</${paragraphTag}>`);
|
|
661
|
+
// paragraph 내 표는 <p> 형제 요소로 출력 (HTML 에서 <p> 안에 <table> 불가)
|
|
662
|
+
if (enableTables) {
|
|
663
|
+
for (const tbl of this.collectTablesInParagraph(p)) {
|
|
664
|
+
pieces.push(this.renderTableHtml(tbl, tableClass, options));
|
|
665
|
+
}
|
|
666
|
+
}
|
|
667
|
+
}
|
|
668
|
+
}
|
|
669
|
+
// section 직속 tables (구식 HWPX)
|
|
670
|
+
const tbls = section?.tbl ?? section?.["hp:tbl"];
|
|
671
|
+
if (tbls && enableTables) {
|
|
672
|
+
const tables = Array.isArray(tbls) ? tbls : [tbls];
|
|
673
|
+
for (const tbl of tables) {
|
|
674
|
+
pieces.push(this.renderTableHtml(tbl, tableClass, options));
|
|
675
|
+
}
|
|
676
|
+
}
|
|
677
|
+
}
|
|
678
|
+
let html = pieces.join("");
|
|
679
|
+
if (html.trim().length > 0)
|
|
680
|
+
return html;
|
|
681
|
+
// Fallback: Preview text
|
|
682
|
+
const prvPath = this.findFilePathIgnoreCase("Preview/PrvText.txt") ||
|
|
683
|
+
this.findFilePathIgnoreCase("preview/prvtext.txt");
|
|
684
|
+
if (prvPath) {
|
|
685
|
+
const prv = this.getTextFile(prvPath);
|
|
686
|
+
if (prv && prv.trim().length > 0) {
|
|
687
|
+
const escaped = this.escapeHtml(prv);
|
|
688
|
+
html = `<p>${escaped.replace(/\n+/g, '</p><p>')}</p>`;
|
|
689
|
+
}
|
|
690
|
+
}
|
|
691
|
+
return html;
|
|
692
|
+
}
|
|
693
|
+
collectTablesInParagraph(p) {
|
|
694
|
+
const out = [];
|
|
695
|
+
const runs = p?.run ?? p?.["hp:run"];
|
|
696
|
+
if (!runs)
|
|
697
|
+
return out;
|
|
698
|
+
const runArr = Array.isArray(runs) ? runs : [runs];
|
|
699
|
+
for (const run of runArr) {
|
|
700
|
+
if (run?.secPr || run?.ctrl)
|
|
701
|
+
continue;
|
|
702
|
+
const tbl = run?.tbl ?? run?.["hp:tbl"];
|
|
703
|
+
if (!tbl)
|
|
704
|
+
continue;
|
|
705
|
+
if (Array.isArray(tbl))
|
|
706
|
+
out.push(...tbl);
|
|
707
|
+
else
|
|
708
|
+
out.push(tbl);
|
|
709
|
+
}
|
|
710
|
+
return out;
|
|
711
|
+
}
|
|
712
|
+
renderTableHtml(tbl, tableClass, options) {
|
|
713
|
+
const trs = tbl?.tr ?? tbl?.["hp:tr"];
|
|
714
|
+
const rows = trs ? (Array.isArray(trs) ? trs : [trs]) : [];
|
|
715
|
+
const enableImages = options?.renderImages ?? true;
|
|
716
|
+
const enableStyles = options?.renderStyles ?? true;
|
|
717
|
+
const rowHtml = [];
|
|
718
|
+
rows.forEach((tr, rowIndex) => {
|
|
719
|
+
const tcs = tr?.tc ?? tr?.["hp:tc"];
|
|
720
|
+
const cells = tcs ? (Array.isArray(tcs) ? tcs : [tcs]) : [];
|
|
721
|
+
const cellHtml = [];
|
|
722
|
+
for (const tc of cells) {
|
|
723
|
+
// 셀 안 paragraph: <hp:tc><hp:subList><hp:p>...</hp:p></hp:subList></hp:tc>
|
|
724
|
+
// 또는 직접 <hp:tc><hp:p>... 둘 다 지원
|
|
725
|
+
const inner = this.renderCellContentHtml(tc, { enableImages, enableStyles }, options);
|
|
726
|
+
// 구형(속성 @colSpan) + 신형(자식 <hp:cellSpan colSpan rowSpan>) 둘 다 지원
|
|
727
|
+
const cellSpan = tc?.cellSpan ?? tc?.["hp:cellSpan"];
|
|
728
|
+
const colSpan = tc?.["@colSpan"] ?? tc?.["@colspan"] ?? tc?.["@gridSpan"] ?? cellSpan?.["@colSpan"];
|
|
729
|
+
const rowSpan = tc?.["@rowSpan"] ?? tc?.["@rowspan"] ?? cellSpan?.["@rowSpan"];
|
|
730
|
+
const alignStyle = this.getAlignStyle(tc);
|
|
731
|
+
const attrs = [];
|
|
732
|
+
if (colSpan && String(colSpan) !== "1")
|
|
733
|
+
attrs.push(` colspan="${String(colSpan)}"`);
|
|
734
|
+
if (rowSpan && String(rowSpan) !== "1")
|
|
735
|
+
attrs.push(` rowspan="${String(rowSpan)}"`);
|
|
736
|
+
if (alignStyle)
|
|
737
|
+
attrs.push(` style="${alignStyle}"`);
|
|
738
|
+
const isHeader = options?.tableHeaderFirstRow && rowIndex === 0;
|
|
739
|
+
const tag = isHeader ? "th" : "td";
|
|
740
|
+
cellHtml.push(`<${tag}${attrs.join("")}>${inner}</${tag}>`);
|
|
741
|
+
}
|
|
742
|
+
rowHtml.push(`<tr>${cellHtml.join("")}</tr>`);
|
|
743
|
+
});
|
|
744
|
+
return `<table class="${tableClass}">${rowHtml.join("")}</table>`;
|
|
745
|
+
}
|
|
746
|
+
renderCellContentHtml(tc, flags, options) {
|
|
747
|
+
// subList 우선 (현대 HWPX), 없으면 tc 자체를 노드로 처리
|
|
748
|
+
const sub = tc?.subList ?? tc?.["hp:subList"];
|
|
749
|
+
if (sub)
|
|
750
|
+
return this.renderNodeToHtml(sub, flags, options);
|
|
751
|
+
return this.renderNodeToHtml(tc, flags, options);
|
|
752
|
+
}
|
|
753
|
+
getAlignStyle(node) {
|
|
754
|
+
const a = node?.["@align"] ?? node?.["@textAlign"] ?? node?.paraPr?.["@align"] ?? node?.cellPr?.["@align"];
|
|
755
|
+
if (typeof a !== "string")
|
|
756
|
+
return "";
|
|
757
|
+
const v = a.toLowerCase();
|
|
758
|
+
if (v === "center" || v === "right" || v === "left" || v === "justify") {
|
|
759
|
+
return `text-align:${v}`;
|
|
760
|
+
}
|
|
761
|
+
return "";
|
|
762
|
+
}
|
|
763
|
+
renderNodeToHtml(node, flags, options) {
|
|
764
|
+
if (!node)
|
|
765
|
+
return "";
|
|
766
|
+
// paragraph aggregation
|
|
767
|
+
const ps = node?.["hp:p"] ?? node?.p;
|
|
768
|
+
if (ps) {
|
|
769
|
+
const paras = Array.isArray(ps) ? ps : [ps];
|
|
770
|
+
return paras.map((p) => this.renderNodeToHtml(p, flags, options)).join("\n");
|
|
771
|
+
}
|
|
772
|
+
// runs
|
|
773
|
+
const runs = node?.["hp:run"] ?? node?.run;
|
|
774
|
+
const runArr = runs ? (Array.isArray(runs) ? runs : [runs]) : [];
|
|
775
|
+
if (runArr.length > 0) {
|
|
776
|
+
return runArr.map((run) => this.renderRunToHtml(run, flags, options)).join("");
|
|
777
|
+
}
|
|
778
|
+
// direct text
|
|
779
|
+
if (typeof node === "string")
|
|
780
|
+
return this.escapeHtml(node);
|
|
781
|
+
if (typeof node?.["#text"] === "string")
|
|
782
|
+
return this.escapeHtml(node["#text"]);
|
|
783
|
+
return "";
|
|
784
|
+
}
|
|
785
|
+
collectAllText(node, out) {
|
|
786
|
+
if (node == null)
|
|
787
|
+
return;
|
|
788
|
+
// 설정 관련 노드들은 건너뛰기
|
|
789
|
+
if (typeof node === "object" && (node.secPr || node.ctrl || node.linesegarray)) {
|
|
790
|
+
return;
|
|
791
|
+
}
|
|
792
|
+
if (typeof node === "string") {
|
|
793
|
+
out.push(node);
|
|
794
|
+
return;
|
|
795
|
+
}
|
|
796
|
+
if (typeof node === "object") {
|
|
797
|
+
const text = node["#text"];
|
|
798
|
+
if (typeof text === "string")
|
|
799
|
+
out.push(text);
|
|
800
|
+
// 't' 속성이 있으면 직접 추출
|
|
801
|
+
const t = node.t;
|
|
802
|
+
if (typeof t === "string") {
|
|
803
|
+
out.push(t);
|
|
804
|
+
return; // t가 있으면 더 이상 탐색하지 않음
|
|
805
|
+
}
|
|
806
|
+
for (const [k, v] of Object.entries(node)) {
|
|
807
|
+
if (k === "#text" || k === "t")
|
|
808
|
+
continue;
|
|
809
|
+
// 설정 관련 키들은 건너뛰기
|
|
810
|
+
if (k === "secPr" || k === "ctrl" || k === "linesegarray")
|
|
811
|
+
continue;
|
|
812
|
+
this.collectAllText(v, out);
|
|
813
|
+
}
|
|
814
|
+
}
|
|
815
|
+
}
|
|
816
|
+
renderRunToHtml(run, flags, options) {
|
|
817
|
+
// 섹션 설정이나 컨트롤 정보가 있는 run은 건너뛰기
|
|
818
|
+
if (run?.secPr || run?.ctrl)
|
|
819
|
+
return "";
|
|
820
|
+
// Text
|
|
821
|
+
const t = run?.["hp:t"] ?? run?.t;
|
|
822
|
+
const text = typeof t === "string" ? t : typeof t?.["#text"] === "string" ? t["#text"] : "";
|
|
823
|
+
let html = this.escapeHtml(text);
|
|
824
|
+
// Image (simplified): hp:picture or hp:img-like reference to BinData
|
|
825
|
+
if (flags.enableImages) {
|
|
826
|
+
const binRef = this.findBinRefInRun(run);
|
|
827
|
+
if (typeof binRef === "string") {
|
|
828
|
+
// Resolve binaryItemIDRef through manifest if needed
|
|
829
|
+
const binPath = this.resolveBinaryPath(binRef);
|
|
830
|
+
if (binPath) {
|
|
831
|
+
let src;
|
|
832
|
+
if (options?.embedImages) {
|
|
833
|
+
const data = this.files[binPath];
|
|
834
|
+
if (data) {
|
|
835
|
+
const mime = this.detectMimeType(binPath);
|
|
836
|
+
const b64 = this.toBase64(data);
|
|
837
|
+
src = `data:${mime};base64,${b64}`;
|
|
838
|
+
}
|
|
839
|
+
else {
|
|
840
|
+
src = binPath;
|
|
841
|
+
}
|
|
842
|
+
}
|
|
843
|
+
else if (options?.imageSrcResolver) {
|
|
844
|
+
src = options.imageSrcResolver(binPath);
|
|
845
|
+
}
|
|
846
|
+
else {
|
|
847
|
+
src = binPath;
|
|
848
|
+
}
|
|
849
|
+
html += `<img src="${this.escapeHtml(src)}" alt="" />`;
|
|
850
|
+
}
|
|
851
|
+
}
|
|
852
|
+
}
|
|
853
|
+
// Styles: Resolve charPrIDRef to actual character properties
|
|
854
|
+
if (flags.enableStyles) {
|
|
855
|
+
const charPrId = run?.["@charPrIDRef"];
|
|
856
|
+
if (charPrId && this.characterProperties.has(charPrId)) {
|
|
857
|
+
const charProps = this.characterProperties.get(charPrId);
|
|
858
|
+
let open = "";
|
|
859
|
+
let close = "";
|
|
860
|
+
const styleParts = [];
|
|
861
|
+
// Apply formatting
|
|
862
|
+
if (charProps?.bold) {
|
|
863
|
+
open += "<strong>";
|
|
864
|
+
close = "</strong>" + close;
|
|
865
|
+
}
|
|
866
|
+
if (charProps?.italic) {
|
|
867
|
+
open += "<em>";
|
|
868
|
+
close = "</em>" + close;
|
|
869
|
+
}
|
|
870
|
+
// Handle underline
|
|
871
|
+
if (charProps?.underline && charProps.underline?.["@type"] !== "NONE") {
|
|
872
|
+
styleParts.push("text-decoration:underline");
|
|
873
|
+
}
|
|
874
|
+
// Handle color
|
|
875
|
+
if (charProps?.textColor && charProps.textColor !== "#000000") {
|
|
876
|
+
styleParts.push(`color:${this.normalizeColor(charProps.textColor)}`);
|
|
877
|
+
}
|
|
878
|
+
// Handle font size (convert HWPUNIT to points)
|
|
879
|
+
if (charProps?.height) {
|
|
880
|
+
const sizeInPt = this.convertHwpUnitToPoints(charProps.height);
|
|
881
|
+
styleParts.push(`font-size:${sizeInPt}pt`);
|
|
882
|
+
}
|
|
883
|
+
// Handle background color
|
|
884
|
+
if (charProps?.shadeColor && charProps.shadeColor !== "none" && charProps.shadeColor !== "#FFFFFF") {
|
|
885
|
+
styleParts.push(`background-color:${this.normalizeColor(charProps.shadeColor)}`);
|
|
886
|
+
}
|
|
887
|
+
const styleAttr = styleParts.length ? ` style="${styleParts.join(";")}"` : "";
|
|
888
|
+
if (open || styleAttr) {
|
|
889
|
+
html = `${open}<span${styleAttr}>${html}</span>${close}`;
|
|
890
|
+
}
|
|
891
|
+
}
|
|
892
|
+
}
|
|
893
|
+
return html;
|
|
894
|
+
}
|
|
895
|
+
findBinRefInRun(run) {
|
|
896
|
+
// common patterns - note: XML parser removes namespaces, so hp:pic becomes 'pic', hc:img becomes 'img'
|
|
897
|
+
const pic = run?.["hp:picture"] ?? run?.picture ?? run?.pic;
|
|
898
|
+
const draw = run?.["hp:draw"] ?? run?.draw;
|
|
899
|
+
const img = run?.["hp:img"] ?? run?.img;
|
|
900
|
+
const hcImg = run?.["hc:img"] ?? run?.["hp:hc:img"];
|
|
901
|
+
const tryExtract = (node) => {
|
|
902
|
+
if (!node)
|
|
903
|
+
return undefined;
|
|
904
|
+
// Check for binaryItemIDRef attribute (used by hc:img)
|
|
905
|
+
const binaryRef = node?.["@binaryItemIDRef"];
|
|
906
|
+
if (typeof binaryRef === "string")
|
|
907
|
+
return binaryRef;
|
|
908
|
+
// For picture elements, the img may be nested inside (hc:img becomes nested img)
|
|
909
|
+
const nestedImg = node?.img;
|
|
910
|
+
if (nestedImg && typeof nestedImg?.["@binaryItemIDRef"] === "string") {
|
|
911
|
+
return nestedImg["@binaryItemIDRef"];
|
|
912
|
+
}
|
|
913
|
+
// Check for traditional hp:binItem reference
|
|
914
|
+
const ref = node?.["hp:binItem"]?.["@ref"] ?? node?.binItem?.["@ref"] ?? node?.["@ref"];
|
|
915
|
+
if (typeof ref === "string")
|
|
916
|
+
return ref;
|
|
917
|
+
return undefined;
|
|
918
|
+
};
|
|
919
|
+
return tryExtract(pic) || tryExtract(draw) || tryExtract(img) || tryExtract(hcImg);
|
|
920
|
+
}
|
|
921
|
+
resolveBinaryPath(binRef) {
|
|
922
|
+
// First, try direct path (legacy format)
|
|
923
|
+
const directPath = `BinData/${binRef}`;
|
|
924
|
+
if (this.files[directPath]) {
|
|
925
|
+
return directPath;
|
|
926
|
+
}
|
|
927
|
+
// Try to resolve through manifest
|
|
928
|
+
try {
|
|
929
|
+
const summary = this.summarizePackage();
|
|
930
|
+
if (summary.manifest) {
|
|
931
|
+
const manifestItem = summary.manifest.find(item => item.id === binRef);
|
|
932
|
+
if (manifestItem?.href) {
|
|
933
|
+
// The href might include the full path or relative path
|
|
934
|
+
const resolvedPath = manifestItem.href.startsWith('BinData/')
|
|
935
|
+
? manifestItem.href
|
|
936
|
+
: `BinData/${manifestItem.href}`;
|
|
937
|
+
if (this.files[resolvedPath]) {
|
|
938
|
+
return resolvedPath;
|
|
939
|
+
}
|
|
940
|
+
// Try the href as-is
|
|
941
|
+
if (this.files[manifestItem.href]) {
|
|
942
|
+
return manifestItem.href;
|
|
943
|
+
}
|
|
944
|
+
}
|
|
945
|
+
}
|
|
946
|
+
}
|
|
947
|
+
catch (e) {
|
|
948
|
+
// Fall back if manifest parsing fails
|
|
949
|
+
}
|
|
950
|
+
// Fallback: return the direct path even if file doesn't exist
|
|
951
|
+
return directPath;
|
|
952
|
+
}
|
|
953
|
+
normalizeColor(c) {
|
|
954
|
+
const s = c.trim();
|
|
955
|
+
if (/^#?[0-9a-fA-F]{6}$/.test(s))
|
|
956
|
+
return s.startsWith('#') ? s : `#${s}`;
|
|
957
|
+
return s; // fallback as-is
|
|
958
|
+
}
|
|
959
|
+
normalizeSize(sz) {
|
|
960
|
+
const n = typeof sz === 'number' ? sz : Number(sz);
|
|
961
|
+
if (!isNaN(n))
|
|
962
|
+
return `${n}pt`;
|
|
963
|
+
return String(sz);
|
|
964
|
+
}
|
|
965
|
+
convertHwpUnitToPoints(hwpUnit) {
|
|
966
|
+
// HWPUNIT is approximately 1/100th of a point
|
|
967
|
+
// 1000 HWPUNIT = 10 points
|
|
968
|
+
const units = typeof hwpUnit === 'number' ? hwpUnit : parseInt(String(hwpUnit), 10);
|
|
969
|
+
return Math.round((units / 100) * 10) / 10; // Round to 1 decimal place
|
|
970
|
+
}
|
|
971
|
+
parseStyleDefinitions() {
|
|
972
|
+
// Clear existing definitions
|
|
973
|
+
this.characterProperties.clear();
|
|
974
|
+
this.fontFaces.clear();
|
|
975
|
+
// Find and parse header.xml
|
|
976
|
+
const headerXml = this.getTextFile("Contents/header.xml");
|
|
977
|
+
if (!headerXml)
|
|
978
|
+
return;
|
|
979
|
+
try {
|
|
980
|
+
const header = this.parseXml(headerXml);
|
|
981
|
+
const root = header?.head ?? header;
|
|
982
|
+
if (!root)
|
|
983
|
+
return;
|
|
984
|
+
// Character properties are in head/refList/charProperties
|
|
985
|
+
const refList = root?.refList;
|
|
986
|
+
if (!refList)
|
|
987
|
+
return;
|
|
988
|
+
// Parse font faces
|
|
989
|
+
const fontfaces = refList?.fontfaces;
|
|
990
|
+
if (fontfaces?.fontface) {
|
|
991
|
+
const fonts = Array.isArray(fontfaces.fontface) ? fontfaces.fontface : [fontfaces.fontface];
|
|
992
|
+
for (const font of fonts) {
|
|
993
|
+
const id = font?.["@id"];
|
|
994
|
+
if (id) {
|
|
995
|
+
this.fontFaces.set(id, font);
|
|
996
|
+
}
|
|
997
|
+
}
|
|
998
|
+
}
|
|
999
|
+
// Parse character properties from refList
|
|
1000
|
+
const charProperties = refList?.charProperties;
|
|
1001
|
+
if (charProperties?.charPr) {
|
|
1002
|
+
const charPrs = Array.isArray(charProperties.charPr) ? charProperties.charPr : [charProperties.charPr];
|
|
1003
|
+
for (const charPr of charPrs) {
|
|
1004
|
+
const id = charPr?.["@id"];
|
|
1005
|
+
if (id) {
|
|
1006
|
+
this.characterProperties.set(id, this.processCharacterProperties(charPr));
|
|
1007
|
+
}
|
|
1008
|
+
}
|
|
1009
|
+
}
|
|
1010
|
+
}
|
|
1011
|
+
catch {
|
|
1012
|
+
// Silent fail - styles are optional
|
|
1013
|
+
}
|
|
1014
|
+
}
|
|
1015
|
+
processCharacterProperties(charPr) {
|
|
1016
|
+
// Bold is indicated by presence of <hh:bold/> element (after namespace removal, becomes 'bold')
|
|
1017
|
+
const hasBold = charPr?.bold !== undefined;
|
|
1018
|
+
const hasItalic = charPr?.italic !== undefined;
|
|
1019
|
+
return {
|
|
1020
|
+
height: charPr?.["@height"], // Font size in HWPUNIT
|
|
1021
|
+
textColor: charPr?.["@textColor"], // Text color
|
|
1022
|
+
shadeColor: charPr?.["@shadeColor"], // Background color
|
|
1023
|
+
bold: hasBold, // Bold formatting (element presence)
|
|
1024
|
+
italic: hasItalic, // Italic formatting (element presence)
|
|
1025
|
+
underline: charPr?.underline, // Underline info
|
|
1026
|
+
strikeout: charPr?.strikeout, // Strikeout info
|
|
1027
|
+
fontRef: charPr?.fontRef, // Font reference
|
|
1028
|
+
raw: charPr // Keep original for debugging
|
|
1029
|
+
};
|
|
1030
|
+
}
|
|
1031
|
+
detectMimeType(path) {
|
|
1032
|
+
const lower = path.toLowerCase();
|
|
1033
|
+
if (lower.endsWith(".png"))
|
|
1034
|
+
return "image/png";
|
|
1035
|
+
if (lower.endsWith(".jpg") || lower.endsWith(".jpeg"))
|
|
1036
|
+
return "image/jpeg";
|
|
1037
|
+
if (lower.endsWith(".gif"))
|
|
1038
|
+
return "image/gif";
|
|
1039
|
+
if (lower.endsWith(".bmp"))
|
|
1040
|
+
return "image/bmp";
|
|
1041
|
+
if (lower.endsWith(".webp"))
|
|
1042
|
+
return "image/webp";
|
|
1043
|
+
return "application/octet-stream";
|
|
1044
|
+
}
|
|
1045
|
+
toBase64(bytes) {
|
|
1046
|
+
if (typeof Buffer !== "undefined") {
|
|
1047
|
+
return Buffer.from(bytes).toString("base64");
|
|
1048
|
+
}
|
|
1049
|
+
let binary = "";
|
|
1050
|
+
for (let i = 0; i < bytes.length; i++)
|
|
1051
|
+
binary += String.fromCharCode(bytes[i]);
|
|
1052
|
+
// btoa may not exist in Node, handled by Buffer path above
|
|
1053
|
+
// @ts-ignore
|
|
1054
|
+
return btoa(binary);
|
|
1055
|
+
}
|
|
1056
|
+
extractTextFromNode(node) {
|
|
1057
|
+
if (!node)
|
|
1058
|
+
return "";
|
|
1059
|
+
// hp:p → hp:run → hp:t
|
|
1060
|
+
const ps = node?.["hp:p"] ?? node?.p;
|
|
1061
|
+
if (ps) {
|
|
1062
|
+
const paras = Array.isArray(ps) ? ps : [ps];
|
|
1063
|
+
return paras.map((p) => this.extractTextFromNode(p)).join("\n");
|
|
1064
|
+
}
|
|
1065
|
+
const runs = node?.["hp:run"] ?? node?.run;
|
|
1066
|
+
const runArr = runs ? (Array.isArray(runs) ? runs : [runs]) : [];
|
|
1067
|
+
const textPieces = [];
|
|
1068
|
+
for (const run of runArr) {
|
|
1069
|
+
// 섹션 설정이나 컨트롤 정보가 있는 run은 건너뛰기
|
|
1070
|
+
if (run?.secPr || run?.ctrl)
|
|
1071
|
+
continue;
|
|
1072
|
+
const t = run?.["hp:t"] ?? run?.t;
|
|
1073
|
+
if (t === undefined || t === null)
|
|
1074
|
+
continue;
|
|
1075
|
+
if (typeof t === "string")
|
|
1076
|
+
textPieces.push(t);
|
|
1077
|
+
else if (typeof t?.["#text"] === "string")
|
|
1078
|
+
textPieces.push(t["#text"]);
|
|
1079
|
+
}
|
|
1080
|
+
if (textPieces.length > 0)
|
|
1081
|
+
return textPieces.join("");
|
|
1082
|
+
// direct text
|
|
1083
|
+
if (typeof node === "string")
|
|
1084
|
+
return node;
|
|
1085
|
+
if (typeof node?.["#text"] === "string")
|
|
1086
|
+
return node["#text"];
|
|
1087
|
+
return "";
|
|
1088
|
+
}
|
|
1089
|
+
escapeHtml(text) {
|
|
1090
|
+
return text
|
|
1091
|
+
.replace(/&/g, "&")
|
|
1092
|
+
.replace(/</g, "<")
|
|
1093
|
+
.replace(/>/g, ">");
|
|
1094
|
+
}
|
|
1095
|
+
async listImages() {
|
|
1096
|
+
if (!this.zip)
|
|
1097
|
+
throw new HwpxNotLoadedError();
|
|
1098
|
+
// 이미지: BinData/ 내 파일들 (원 규격상 다양한 바이너리 포함)
|
|
1099
|
+
return Object.keys(this.files)
|
|
1100
|
+
.filter((p) => p.startsWith("BinData/") && !p.endsWith("/"))
|
|
1101
|
+
.sort();
|
|
1102
|
+
}
|
|
1103
|
+
}
|
|
1104
|
+
// The reader class is this module's default export.
export default HwpxReader;
|