hwp-convert 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53) hide show
  1. package/CHANGELOG.md +185 -0
  2. package/LICENSE +25 -0
  3. package/NOTICE +23 -0
  4. package/README.md +338 -0
  5. package/dist/browser/hwp-convert.browser.mjs +20677 -0
  6. package/dist/browser/hwp-convert.browser.mjs.map +7 -0
  7. package/dist/cli.d.ts +2 -0
  8. package/dist/cli.js +267 -0
  9. package/dist/index.d.ts +5 -0
  10. package/dist/index.js +5 -0
  11. package/dist/lib/errors.d.ts +9 -0
  12. package/dist/lib/errors.js +18 -0
  13. package/dist/lib/hwp/binData.d.ts +15 -0
  14. package/dist/lib/hwp/binData.js +64 -0
  15. package/dist/lib/hwp/bodyText.d.ts +31 -0
  16. package/dist/lib/hwp/bodyText.js +208 -0
  17. package/dist/lib/hwp/byteReader.d.ts +40 -0
  18. package/dist/lib/hwp/byteReader.js +116 -0
  19. package/dist/lib/hwp/cfbReader.d.ts +44 -0
  20. package/dist/lib/hwp/cfbReader.js +134 -0
  21. package/dist/lib/hwp/control.d.ts +17 -0
  22. package/dist/lib/hwp/control.js +290 -0
  23. package/dist/lib/hwp/converter.d.ts +22 -0
  24. package/dist/lib/hwp/converter.js +41 -0
  25. package/dist/lib/hwp/docInfo.d.ts +26 -0
  26. package/dist/lib/hwp/docInfo.js +396 -0
  27. package/dist/lib/hwp/fileHeader.d.ts +42 -0
  28. package/dist/lib/hwp/fileHeader.js +66 -0
  29. package/dist/lib/hwp/htmlReader.d.ts +17 -0
  30. package/dist/lib/hwp/htmlReader.js +602 -0
  31. package/dist/lib/hwp/hwpxBuilder.d.ts +19 -0
  32. package/dist/lib/hwp/hwpxBuilder.js +633 -0
  33. package/dist/lib/hwp/index.d.ts +68 -0
  34. package/dist/lib/hwp/index.js +149 -0
  35. package/dist/lib/hwp/mdReader.d.ts +16 -0
  36. package/dist/lib/hwp/mdReader.js +485 -0
  37. package/dist/lib/hwp/mdWriter.d.ts +23 -0
  38. package/dist/lib/hwp/mdWriter.js +182 -0
  39. package/dist/lib/hwp/owpml.d.ts +33 -0
  40. package/dist/lib/hwp/owpml.js +86 -0
  41. package/dist/lib/hwp/record.d.ts +24 -0
  42. package/dist/lib/hwp/record.js +59 -0
  43. package/dist/lib/hwp/tags.d.ts +115 -0
  44. package/dist/lib/hwp/tags.js +217 -0
  45. package/dist/lib/hwp/types.d.ts +214 -0
  46. package/dist/lib/hwp/types.js +5 -0
  47. package/dist/lib/hwpxReader.d.ts +60 -0
  48. package/dist/lib/hwpxReader.js +1104 -0
  49. package/dist/lib/types.d.ts +47 -0
  50. package/dist/lib/types.js +1 -0
  51. package/dist/lib/writer.d.ts +19 -0
  52. package/dist/lib/writer.js +149 -0
  53. package/package.json +94 -0
package/dist/cli.d.ts ADDED
@@ -0,0 +1,2 @@
1
+ #!/usr/bin/env node
2
+ export {};
package/dist/cli.js ADDED
@@ -0,0 +1,267 @@
1
+ #!/usr/bin/env node
2
+ import { readFile } from "node:fs/promises";
3
+ import { readFileSync } from "node:fs";
4
+ import { fileURLToPath } from "node:url";
5
+ import { resolve as resolvePath, dirname, isAbsolute } from "node:path";
6
+ import HwpxReader from "./lib/hwpxReader.js";
7
+ import HwpxWriter from "./lib/writer.js";
8
+ import { hwpToHwpx, hwpToText, hwpToMarkdown, markdownToHwpx, htmlToHwpx, } from "./lib/hwp/index.js";
9
/**
 * Filesystem-backed image resolver for the Node CLI.
 *
 * Resolves `file://` URLs and absolute/relative paths found in Markdown/HTML
 * input, reads the file, and returns its bytes so they can be embedded as
 * BinData. Data URIs and remote (http/https) URLs yield `null`, leaving them
 * to the core converter to handle or skip.
 *
 * @param baseDir Directory that relative image paths are resolved against.
 * @returns A function `(src) => { data, extension } | null`. It returns `null`
 *          when the source is not a local file, the file is empty, or reading
 *          fails for any reason (best-effort: a missing image must not abort
 *          the conversion).
 */
function fsImageResolver(baseDir) {
    return (src) => {
        try {
            if (/^data:/i.test(src) || /^https?:\/\//i.test(src))
                return null;
            const p = /^file:\/\//i.test(src)
                ? fileURLToPath(src)
                : isAbsolute(src)
                    ? src
                    : resolvePath(baseDir, src);
            const data = new Uint8Array(readFileSync(p));
            if (data.length === 0)
                return null;
            // Take the extension from the final path segment only. Splitting the
            // whole path on "." would pick up dots in directory names when the
            // file itself has no extension (e.g. "/tmp/v1.2/logo").
            const fileName = p.split(/[\\/]/).pop() || "";
            const dot = fileName.lastIndexOf(".");
            const hasExtension = dot > 0 && dot < fileName.length - 1;
            // Fall back to "png" when the file name carries no extension.
            const ext = hasExtension ? fileName.slice(dot + 1).toLowerCase() : "png";
            return { data, extension: ext };
        }
        catch {
            // Unreadable or missing image: report "not resolved" instead of failing.
            return null;
        }
    };
}
35
/** Prints `usage` to stderr and exits with code 1 when a required CLI argument is missing. */
function requireArg(value, usage) {
    if (!value) {
        console.error(usage);
        process.exit(1);
    }
    return value;
}
/** Reads a file and returns its contents as a Uint8Array backed by its own ArrayBuffer. */
async function readInputBytes(path) {
    return new Uint8Array(toArrayBuffer(await readFile(path)));
}
/** Reads an HWPX file and returns an HwpxReader with that document loaded. */
async function openHwpxReader(path) {
    const reader = new HwpxReader();
    await reader.loadFromArrayBuffer(toArrayBuffer(await readFile(path)));
    return reader;
}
/** Writes `data` to `outPath` (optionally with a text encoding) and reports the path on stdout. */
async function writeOutput(outPath, data, encoding) {
    const { writeFile } = await import("node:fs/promises");
    await writeFile(outPath, data, encoding);
    console.log(`Wrote ${outPath}`);
}
/** Extracts the document text, applies the template data, and wraps each line in an HTML-escaped <p>. */
async function renderTemplateHtml(reader, json) {
    const rawText = await reader.extractText({});
    return reader
        .applyTemplateToText(rawText, json)
        .split(/\n+/)
        .map((line) => `<p>${line.replace(/&/g, "&amp;").replace(/</g, "&lt;").replace(/>/g, "&gt;")}</p>`)
        .join("");
}
/**
 * CLI entry point. Dispatches on `process.argv[2]` (the command); the next
 * arguments are the input path and, depending on the command, an output path,
 * a data file, or data/output folders.
 *
 * Every command validates its arguments before touching the filesystem, and a
 * file is only read by the command that needs it. In particular the folder
 * based `batch` / `batch:tpl` commands never try to read their input folder as
 * a document, and an unknown command is reported without reading anything.
 *
 * Errors from reading, parsing or writing propagate as promise rejections to
 * the caller.
 */
async function main() {
    const [command, inputPath, maybeOut] = process.argv.slice(2);
    if (!command || !inputPath) {
        console.error("Usage:\n" +
            "  hwpxjs inspect <file.hwpx>\n" +
            "  hwpxjs txt <file.hwpx|file.hwp>\n" +
            "  hwpxjs html <file.hwpx>\n" +
            "  hwpxjs md <file.hwpx|file.hwp>          # Markdown 추출\n" +
            "  hwpxjs html:tpl <file.hwpx> <data.json>\n" +
            "  hwpxjs batch <inFolder> <outFolder>\n" +
            "  hwpxjs batch:tpl <inFolder> <dataFolder> <outFolder>\n" +
            "  hwpxjs write:txt <textfile> <out.hwpx>\n" +
            "  hwpxjs md:hwpx <file.md> <out.hwpx>     # Markdown → HWPX\n" +
            "  hwpxjs html:hwpx <file.html> <out.hwpx># HTML → HWPX\n" +
            "  hwpxjs convert:hwp <file.hwp> <out.hwpx>\n" +
            "  hwpxjs hwp:txt <file.hwp>\n" +
            "  hwpxjs hwp:md <file.hwp>");
        process.exit(1);
    }
    // `txt` and `md` auto-detect binary .hwp input by file extension.
    const inputIsHwp = /\.hwp$/i.test(inputPath);
    switch (command) {
        case "hwp:txt": {
            console.log(await hwpToText(await readInputBytes(inputPath)));
            return;
        }
        case "hwp:md": {
            console.log(await hwpToMarkdown(await readInputBytes(inputPath)));
            return;
        }
        case "txt": {
            if (inputIsHwp) {
                console.log(await hwpToText(await readInputBytes(inputPath)));
                return;
            }
            const reader = await openHwpxReader(inputPath);
            console.log(await reader.extractText());
            return;
        }
        case "md": {
            if (inputIsHwp) {
                console.log(await hwpToMarkdown(await readInputBytes(inputPath)));
                return;
            }
            const reader = await openHwpxReader(inputPath);
            console.log(await reader.extractMarkdown());
            return;
        }
        case "inspect": {
            const reader = await openHwpxReader(inputPath);
            console.log(JSON.stringify(await reader.getDocumentInfo(), null, 2));
            return;
        }
        case "html": {
            const reader = await openHwpxReader(inputPath);
            console.log(await reader.extractHtml({ embedImages: true }));
            return;
        }
        case "html:tpl": {
            const dataPath = requireArg(maybeOut, "Usage: hwpxjs html:tpl <in.hwpx> <data.json>");
            const reader = await openHwpxReader(inputPath);
            const json = JSON.parse(await readFile(dataPath, "utf-8"));
            console.log(await renderTemplateHtml(reader, json));
            return;
        }
        case "md:hwpx": {
            const outPath = requireArg(maybeOut, "Usage: hwpxjs md:hwpx <input.md> <output.hwpx>");
            const md = await readFile(inputPath, "utf-8");
            const bytes = await markdownToHwpx(md, {
                imageResolver: fsImageResolver(dirname(resolvePath(inputPath))),
            });
            await writeOutput(outPath, bytes);
            return;
        }
        case "html:hwpx": {
            const outPath = requireArg(maybeOut, "Usage: hwpxjs html:hwpx <input.html> <output.hwpx>");
            const html = await readFile(inputPath, "utf-8");
            const bytes = await htmlToHwpx(html, {
                imageResolver: fsImageResolver(dirname(resolvePath(inputPath))),
            });
            await writeOutput(outPath, bytes);
            return;
        }
        case "convert:hwp": {
            const outPath = requireArg(maybeOut, "Usage: hwpxjs convert:hwp <input.hwp> <output.hwpx>");
            const bytes = await hwpToHwpx(await readInputBytes(inputPath));
            await writeOutput(outPath, bytes);
            return;
        }
        case "write:txt": {
            const outPath = requireArg(maybeOut, "Usage: hwpxjs write:txt <textfile> <out.hwpx>");
            const text = await readFile(inputPath, "utf-8");
            const bytes = await new HwpxWriter().createFromPlainText(text);
            await writeOutput(outPath, bytes);
            return;
        }
        case "batch": {
            const inDir = inputPath;
            const outDir = requireArg(maybeOut, "Usage: hwpxjs batch <inFolder> <outFolder>");
            const { readdir, mkdir } = await import("node:fs/promises");
            const { join } = await import("node:path");
            await mkdir(outDir, { recursive: true });
            for (const name of await readdir(inDir)) {
                if (!name.toLowerCase().endsWith(".hwpx"))
                    continue;
                const reader = await openHwpxReader(join(inDir, name));
                const html = await reader.extractHtml({ embedImages: true });
                await writeOutput(join(outDir, name.replace(/\.hwpx$/i, ".html")), html, "utf-8");
            }
            return;
        }
        case "batch:tpl": {
            const inDir = inputPath;
            const dataDir = maybeOut;
            // argv: [node, script, command, inFolder, dataFolder, outFolder]
            const outDir = process.argv[5];
            if (!dataDir || !outDir) {
                console.error("Usage: hwpxjs batch:tpl <inFolder> <dataFolder> <outFolder>");
                process.exit(1);
            }
            const { readdir, mkdir } = await import("node:fs/promises");
            const { join } = await import("node:path");
            await mkdir(outDir, { recursive: true });
            for (const name of await readdir(inDir)) {
                if (!name.toLowerCase().endsWith(".hwpx"))
                    continue;
                const reader = await openHwpxReader(join(inDir, name));
                // Strip the extension case-insensitively so "DOC.HWPX" pairs with
                // "DOC.json", matching the case-insensitive file filter above.
                const stem = name.replace(/\.hwpx$/i, "");
                let json = {};
                try {
                    json = JSON.parse(await readFile(join(dataDir, `${stem}.json`), "utf-8"));
                }
                catch {
                    try {
                        json = JSON.parse(await readFile(join(dataDir, "default.json"), "utf-8"));
                    }
                    catch {
                        // Best-effort: with neither a per-file nor a default data
                        // file, the template is rendered with empty data.
                    }
                }
                const html = await renderTemplateHtml(reader, json);
                await writeOutput(join(outDir, `${stem}.html`), html, "utf-8");
            }
            return;
        }
        default: {
            console.error("Unknown command:", command);
            process.exit(1);
        }
    }
}
261
/**
 * Copies exactly the bytes visible through a Buffer / typed-array view into a
 * new ArrayBuffer.
 *
 * The view's `byteOffset` and `byteLength` are honoured, so a view over part
 * of a larger backing buffer yields only its own bytes.
 *
 * @param buf View exposing `buffer`, `byteOffset` and `byteLength`.
 * @returns The result of slicing the backing buffer to the view's byte range.
 */
function toArrayBuffer(buf) {
    const { buffer, byteOffset, byteLength } = buf;
    const endOffset = byteOffset + byteLength;
    return buffer.slice(byteOffset, endOffset);
}
264
// Run the CLI. Any rejection from main() is reported on stderr (the error's
// message when it has one, otherwise the raw value) and the process exits
// with status 1.
main().catch((err) => {
    const detail = err?.message || err;
    console.error(detail);
    process.exit(1);
});
@@ -0,0 +1,5 @@
1
+ export * from "./lib/hwpxReader.js";
2
+ export * from "./lib/types.js";
3
+ export * from "./lib/errors.js";
4
+ export * from "./lib/writer.js";
5
+ export * from "./lib/hwp/index.js";
package/dist/index.js ADDED
@@ -0,0 +1,5 @@
1
+ export * from "./lib/hwpxReader.js";
2
+ export * from "./lib/types.js";
3
+ export * from "./lib/errors.js";
4
+ export * from "./lib/writer.js";
5
+ export * from "./lib/hwp/index.js";
@@ -0,0 +1,9 @@
1
+ /** Error reporting that no HWPX document is loaded; its fixed message tells the caller to call loadFromArrayBuffer first. */ export declare class HwpxNotLoadedError extends Error {
2
+ constructor();
3
+ }
4
+ /** Error for encrypted HWPX documents; the default message says encrypted documents are currently unsupported. */ export declare class HwpxEncryptedDocumentError extends Error {
5
+ constructor(message?: string);
6
+ }
7
+ /** Error for input that is not a valid HWPX document; the default message cites the expected mimetype application/hwp+zip. */ export declare class InvalidHwpxFormatError extends Error {
8
+ constructor(message?: string);
9
+ }
@@ -0,0 +1,18 @@
1
// Fixed message (Korean): "HWPX is not loaded. Call loadFromArrayBuffer first."
const HWPX_NOT_LOADED_MESSAGE = "HWPX가 로드되지 않았습니다. loadFromArrayBuffer를 먼저 호출하세요.";
/**
 * Error signalling that no HWPX document has been loaded.
 * Takes no arguments; the message is fixed and `name` is "HwpxNotLoadedError".
 */
export class HwpxNotLoadedError extends Error {
    constructor() {
        super(HWPX_NOT_LOADED_MESSAGE);
        this.name = "HwpxNotLoadedError";
    }
}
7
// Default message (Korean): "Encrypted HWPX documents are currently not supported."
const HWPX_ENCRYPTED_DEFAULT_MESSAGE = "암호화된 HWPX 문서는 현재 지원하지 않습니다.";
/**
 * Error for encrypted HWPX documents.
 *
 * @param message Optional custom message; the default is used when omitted.
 */
export class HwpxEncryptedDocumentError extends Error {
    constructor(message = HWPX_ENCRYPTED_DEFAULT_MESSAGE) {
        super(message);
        this.name = "HwpxEncryptedDocumentError";
    }
}
13
// Default message (Korean): "Not a valid HWPX (mimetype: application/hwp+zip) document."
const INVALID_HWPX_DEFAULT_MESSAGE = "유효한 HWPX(mimetype: application/hwp+zip) 문서가 아닙니다.";
/**
 * Error for input that is not a valid HWPX document.
 *
 * @param message Optional custom message; the default is used when omitted.
 */
export class InvalidHwpxFormatError extends Error {
    constructor(message = INVALID_HWPX_DEFAULT_MESSAGE) {
        super(message);
        this.name = "InvalidHwpxFormatError";
    }
}
@@ -0,0 +1,15 @@
1
+ /**
2
+ * BinData (CFB Storage) 에서 임베디드 이미지/OLE 데이터 추출.
3
+ *
4
+ * /BinData/BIN0001.png, BIN0002.jpg, ... 패턴.
5
+ * DocInfo의 BIN_DATA 레코드와 storageId 로 연결됨.
6
+ *
7
+ * 원작: rhwp/src/parser/bin_data.rs (MIT, Edward Kim)
8
+ */
9
+ import type { HwpCfbReader } from "./cfbReader.js";
10
+ import type { HwpBinDataRef } from "./types.js";
11
+ /** Reads the embedded BinData stream for each non-link ref and returns the bytes and extension keyed by the ref's storageId; refs whose stream cannot be read are omitted. */ export declare function loadBinDataContent(cfb: HwpCfbReader, refs: HwpBinDataRef[]): Map<number, {
12
+ data: Uint8Array;
13
+ extension: string;
14
+ }>;
15
+ /** Maps a file extension (case-insensitive) to an image MIME type; unrecognised extensions yield "application/octet-stream". */ export declare function detectImageMime(extension: string): string;
@@ -0,0 +1,64 @@
1
+ /**
2
+ * BinData (CFB Storage) 에서 임베디드 이미지/OLE 데이터 추출.
3
+ *
4
+ * /BinData/BIN0001.png, BIN0002.jpg, ... 패턴.
5
+ * DocInfo의 BIN_DATA 레코드와 storageId 로 연결됨.
6
+ *
7
+ * 원작: rhwp/src/parser/bin_data.rs (MIT, Edward Kim)
8
+ */
9
+ const CFB_MAGIC = [0xd0, 0xcf, 0x11, 0xe0, 0xa1, 0xb1, 0x1a, 0xe1];
10
+ export function loadBinDataContent(cfb, refs) {
11
+ const out = new Map();
12
+ for (const ref of refs) {
13
+ if (ref.type === "link")
14
+ continue;
15
+ const isStorage = ref.type === "storage";
16
+ const ext = ref.extension ?? (isStorage ? "OLE" : "dat");
17
+ // 파일명: BIN{XXXX}.{ext} (4자리 hex, 대문자/소문자 둘 다 시도)
18
+ const idHex = ref.storageId.toString(16).padStart(4, "0");
19
+ const candidates = [
20
+ `/BinData/BIN${idHex.toUpperCase()}.${ext}`,
21
+ `/BinData/BIN${idHex.toLowerCase()}.${ext}`,
22
+ ];
23
+ let bytes = null;
24
+ for (const path of candidates) {
25
+ bytes = cfb.readBinData(path);
26
+ if (bytes)
27
+ break;
28
+ }
29
+ if (!bytes)
30
+ continue;
31
+ // OLE Storage 의 경우 선두 4바이트 size prefix 가 붙는 경우가 있어 정리
32
+ if (isStorage && bytes.byteLength > 12) {
33
+ const headIsCfb = bytes[0] === CFB_MAGIC[0] &&
34
+ bytes[1] === CFB_MAGIC[1] &&
35
+ bytes[2] === CFB_MAGIC[2] &&
36
+ bytes[3] === CFB_MAGIC[3];
37
+ const cfbAt4 = bytes[4] === CFB_MAGIC[0] &&
38
+ bytes[5] === CFB_MAGIC[1] &&
39
+ bytes[6] === CFB_MAGIC[2] &&
40
+ bytes[7] === CFB_MAGIC[3];
41
+ if (!headIsCfb && cfbAt4) {
42
+ bytes = bytes.subarray(4);
43
+ }
44
+ }
45
+ out.set(ref.storageId, { data: new Uint8Array(bytes), extension: ext });
46
+ }
47
+ return out;
48
+ }
49
/**
 * Maps an image file extension to its MIME type.
 *
 * @param extension File extension without the dot; matched case-insensitively.
 * @returns The MIME type for png, jpg/jpeg, gif, bmp, webp or svg, and
 *          "application/octet-stream" for anything else.
 */
export function detectImageMime(extension) {
    switch (extension.toLowerCase()) {
        case "png":
            return "image/png";
        case "jpg":
        case "jpeg":
            return "image/jpeg";
        case "gif":
            return "image/gif";
        case "bmp":
            return "image/bmp";
        case "webp":
            return "image/webp";
        case "svg":
            return "image/svg+xml";
        default:
            return "application/octet-stream";
    }
}
@@ -0,0 +1,31 @@
1
+ /**
2
+ * BodyText 섹션 파싱 — 계층 파싱.
3
+ *
4
+ * 레코드 트리:
5
+ * PARA_HEADER (level=0)
6
+ * PARA_TEXT (level=1)
7
+ * PARA_CHAR_SHAPE (level=1)
8
+ * PARA_LINE_SEG (level=1)
9
+ * CTRL_HEADER (level=1) ← 표/그림/머리말 등
10
+ * TABLE (level=2)
11
+ * LIST_HEADER (level=2) ← 셀
12
+ * PARA_HEADER (level=3)
13
+ * PARA_TEXT (level=4)
14
+ *
15
+ * 외곽 파라그래프와 셀 안 파라그래프는 분리되어 보존된다.
16
+ *
17
+ * 원작: rhwp/src/parser/body_text.rs (MIT, Edward Kim)
18
+ */
19
+ import { type Record } from "./record.js";
20
+ import type { HwpParagraph, HwpSection } from "./types.js";
21
+ /** Error class exported by the BodyText parser module (presumably raised on BodyText parsing failures — no throw site is visible here). */ export declare class BodyTextError extends Error {
22
+ constructor(msg: string);
23
+ }
24
+ /** Parses the raw record bytes of one BodyText section into a section holding its top-level paragraphs. */ export declare function parseBodyTextSection(data: Uint8Array): HwpSection;
25
+ /**
26
+ * Finds the PARA_HEADER records at baseLevel in the given record sequence and converts them into a paragraph list.
27
+ *
28
+ * @param records Ordered record sequence (a subtree or the whole stream)
29
+ * @param baseLevel Level of the PARA_HEADER records to extract (usually the enclosing container's level + 1; 0 at the top of a section)
30
+ */
31
+ export declare function parseParagraphList(records: Record[], baseLevel: number): HwpParagraph[];
@@ -0,0 +1,208 @@
1
+ /**
2
+ * BodyText 섹션 파싱 — 계층 파싱.
3
+ *
4
+ * 레코드 트리:
5
+ * PARA_HEADER (level=0)
6
+ * PARA_TEXT (level=1)
7
+ * PARA_CHAR_SHAPE (level=1)
8
+ * PARA_LINE_SEG (level=1)
9
+ * CTRL_HEADER (level=1) ← 표/그림/머리말 등
10
+ * TABLE (level=2)
11
+ * LIST_HEADER (level=2) ← 셀
12
+ * PARA_HEADER (level=3)
13
+ * PARA_TEXT (level=4)
14
+ *
15
+ * 외곽 파라그래프와 셀 안 파라그래프는 분리되어 보존된다.
16
+ *
17
+ * 원작: rhwp/src/parser/body_text.rs (MIT, Edward Kim)
18
+ */
19
+ import { ByteReader } from "./byteReader.js";
20
+ import { readAllRecords } from "./record.js";
21
+ import { HWPTAG_PARA_HEADER, HWPTAG_PARA_TEXT, HWPTAG_PARA_CHAR_SHAPE, HWPTAG_CTRL_HEADER, } from "./tags.js";
22
+ import { parseCtrlHeader } from "./control.js";
23
/**
 * Error class of the BodyText parser module.
 *
 * @param msg Error message, passed through to `Error`.
 */
export class BodyTextError extends Error {
    constructor(msg) {
        super(msg);
        const errorName = "BodyTextError";
        this.name = errorName;
    }
}
29
/**
 * Parses the raw bytes of one BodyText section.
 *
 * The level of the first record is taken as the section's top level (0 when
 * there are no records), and the PARA_HEADER records at that level become the
 * section's paragraphs.
 *
 * @param data Section record stream.
 * @returns `{ paragraphs }` with the section's top-level paragraphs.
 */
export function parseBodyTextSection(data) {
    const records = readAllRecords(data);
    const topLevel = records.length === 0 ? 0 : records[0].level;
    const paragraphs = parseParagraphList(records, topLevel);
    return { paragraphs };
}
35
/**
 * Converts the PARA_HEADER records found at `baseLevel` into paragraphs.
 *
 * Each matching header owns the records that follow it up to (not including)
 * the next record whose level is `baseLevel` or lower; that slice is handed to
 * `buildParagraph`. Records at `baseLevel` that are not PARA_HEADER are skipped.
 *
 * @param records   Ordered record sequence (a subtree or the whole stream).
 * @param baseLevel Level of the PARA_HEADER records to extract (usually the
 *                  enclosing container's level + 1; 0 at the top of a section).
 * @returns The paragraphs in record order.
 */
export function parseParagraphList(records, baseLevel) {
    const result = [];
    let idx = 0;
    while (idx < records.length) {
        const head = records[idx];
        if (head.tagId !== HWPTAG_PARA_HEADER || head.level !== baseLevel) {
            idx += 1;
            continue;
        }
        // The paragraph's subtree ends where the level drops back to baseLevel or below.
        let stop = idx + 1;
        while (stop < records.length && records[stop].level > baseLevel) {
            stop += 1;
        }
        result.push(buildParagraph(records.slice(idx, stop)));
        idx = stop;
    }
    return result;
}
59
/**
 * Builds one paragraph from its record subtree.
 *
 * `records[0]` is the PARA_HEADER; only its direct children (level = header
 * level + 1) are interpreted here:
 *  - PARA_TEXT        → paragraph text (a later record replaces an earlier one)
 *  - PARA_CHAR_SHAPE  → character-shape change list (same replacement rule)
 *  - CTRL_HEADER      → a control, parsed together with all deeper records
 *                       that follow it until the level returns to child level
 *
 * @param records The paragraph's records, header first.
 * @returns `{ paraShapeId, styleId, text, runs, controls }`.
 */
function buildParagraph(records) {
    const [header] = records;
    const { paraShapeId, styleId } = parseParaHeader(header.data);
    const childLevel = header.level + 1;
    let text = "";
    let charShapeChanges = [];
    const controls = [];
    records.forEach((rec, idx) => {
        // Skip the header itself and anything that is not a direct child.
        if (idx === 0 || rec.level !== childLevel)
            return;
        if (rec.tagId === HWPTAG_PARA_TEXT) {
            text = parseParaText(rec.data);
        }
        else if (rec.tagId === HWPTAG_PARA_CHAR_SHAPE) {
            charShapeChanges = parseParaCharShape(rec.data);
        }
        else if (rec.tagId === HWPTAG_CTRL_HEADER) {
            // The control's descendants are the records deeper than child level
            // that immediately follow it.
            let stop = idx + 1;
            while (stop < records.length && records[stop].level > childLevel) {
                stop += 1;
            }
            controls.push(parseCtrlHeader(rec, records.slice(idx + 1, stop), parseParagraphList));
        }
    });
    return {
        paraShapeId,
        styleId,
        text,
        runs: buildRuns(text, charShapeChanges),
        controls,
    };
}
103
/**
 * Reads the leading fields of a PARA_HEADER record, in order: character count
 * (u32, top bit masked off), control mask (u32), paragraph shape id (u16) and
 * style id (u8). A field for which too few bytes remain is reported as 0.
 *
 * @param data PARA_HEADER payload.
 * @returns `{ charCount, controlMask, paraShapeId, styleId }`.
 */
function parseParaHeader(data) {
    const reader = new ByteReader(data);
    // Read a field only when enough bytes remain; otherwise default to 0.
    const take = (size, read) => (reader.remaining() >= size ? read() : 0);
    const rawCount = take(4, () => reader.readU32());
    const controlMask = take(4, () => reader.readU32());
    const paraShapeId = take(2, () => reader.readU16());
    const styleId = take(1, () => reader.readU8());
    return { charCount: rawCount & 0x7fffffff, controlMask, paraShapeId, styleId };
}
112
+ /**
113
+ * PARA_TEXT 디코딩 (텍스트만; 컨트롤 위치는 buildParagraph 의 CTRL_HEADER 처리에서 별도 추적).
114
+ *
115
+ * 컨트롤 문자 분류 (HWP 5.0 표 6):
116
+ * - 1 word (2바이트): 0, 10 (LF), 13 (para break — 종료), 24~31
117
+ * - 8 word (16바이트): 1~8, 11~12, 14~23
118
+ * - 9 (탭): 8 word
119
+ */
120
+ function parseParaText(data) {
121
+ let text = "";
122
+ let pos = 0;
123
+ const end = data.byteLength;
124
+ while (pos + 1 < end) {
125
+ const ch = data[pos] | (data[pos + 1] << 8);
126
+ if (ch === 0) {
127
+ pos += 2;
128
+ }
129
+ else if (ch === 0x09) {
130
+ text += "\t";
131
+ pos += 16;
132
+ }
133
+ else if (ch === 0x0a) {
134
+ text += "\n";
135
+ pos += 2;
136
+ }
137
+ else if (ch === 0x0d) {
138
+ break;
139
+ }
140
+ else if (isExtendedCtrl(ch)) {
141
+ pos += 16;
142
+ }
143
+ else if (ch < 0x20) {
144
+ switch (ch) {
145
+ case 0x18:
146
+ text += " ";
147
+ break;
148
+ case 0x19:
149
+ text += " ";
150
+ break;
151
+ case 0x1e:
152
+ text += "-";
153
+ break;
154
+ case 0x1f:
155
+ text += " ";
156
+ break;
157
+ default:
158
+ break;
159
+ }
160
+ pos += 2;
161
+ }
162
+ else {
163
+ if (ch >= 0xd800 && ch <= 0xdbff && pos + 3 < end) {
164
+ const low = data[pos + 2] | (data[pos + 3] << 8);
165
+ if (low >= 0xdc00 && low <= 0xdfff) {
166
+ text += String.fromCharCode(ch, low);
167
+ pos += 4;
168
+ continue;
169
+ }
170
+ }
171
+ text += String.fromCharCode(ch);
172
+ pos += 2;
173
+ }
174
+ }
175
+ return text;
176
+ }
177
+ function isExtendedCtrl(ch) {
178
+ return ((ch >= 1 && ch <= 8) ||
179
+ ch === 11 ||
180
+ ch === 12 ||
181
+ (ch >= 14 && ch <= 23));
182
+ }
183
/**
 * Decodes a PARA_CHAR_SHAPE record: a sequence of (u32 charPos, u32
 * charShapeId) pairs. Trailing bytes that do not form a complete 8-byte pair
 * are ignored.
 *
 * @param data PARA_CHAR_SHAPE payload.
 * @returns The pairs in record order as `{ charPos, charShapeId }`.
 */
function parseParaCharShape(data) {
    const reader = new ByteReader(data);
    const changes = [];
    for (; reader.remaining() >= 8;) {
        // Properties are evaluated left to right: position first, then shape id.
        changes.push({ charPos: reader.readU32(), charShapeId: reader.readU32() });
    }
    return changes;
}
193
/**
 * Splits paragraph text into runs of uniform character shape.
 *
 * The changes are sorted by position; each one starts a run that extends to
 * the next change (or to the end of the text). The runs always concatenate
 * back to `text`:
 *  - the first run starts at index 0 even if the earliest change position is
 *    greater than 0, so leading text is never dropped;
 *  - positions are clamped to the text length, so changes past the end of the
 *    text produce no empty runs.
 *
 * NOTE(review): `charPos` is used directly as an index into the decoded text.
 * It presumably counts code units of the raw PARA_TEXT stream (where a control
 * occupies several words), so paragraphs containing controls may need an
 * offset mapping — confirm against the caller.
 *
 * @param text    Decoded paragraph text.
 * @param changes `{ charPos, charShapeId }` entries in any order; not mutated.
 * @returns Runs as `{ charShapeId, text }`. Empty text yields `[]`; with no
 *          changes the whole text is one run with charShapeId 0.
 */
function buildRuns(text, changes) {
    if (text.length === 0)
        return [];
    if (changes.length === 0)
        return [{ charShapeId: 0, text }];
    const sorted = [...changes].sort((a, b) => a.charPos - b.charPos);
    const clamp = (pos) => Math.min(pos, text.length);
    const runs = [];
    for (let i = 0; i < sorted.length; i++) {
        // The earliest shape also covers any text before its stated position.
        const start = i === 0 ? 0 : clamp(sorted[i].charPos);
        const stop = i + 1 < sorted.length ? clamp(sorted[i + 1].charPos) : text.length;
        if (stop > start) {
            runs.push({ charShapeId: sorted[i].charShapeId, text: text.slice(start, stop) });
        }
    }
    return runs;
}
@@ -0,0 +1,40 @@
1
+ /**
2
+ * 바이너리 데이터 읽기 유틸리티 (커서 기반).
3
+ * HWP 5.0 스트림은 모두 little-endian, 문자열은 UTF-16LE.
4
+ *
5
+ * 원작: rhwp/src/parser/byte_reader.rs (MIT, Copyright (c) 2025-2026 Edward Kim)
6
+ */
7
+ /** Cursor-based binary reader; per the module header, HWP 5.0 streams are little-endian and strings are UTF-16LE. */ export declare class ByteReader {
8
+ private view;
9
+ private offset;
10
+ private readonly end;
11
+ constructor(data: Uint8Array, offset?: number, length?: number);
12
+ /** Current position (byte offset relative to the start point) */
13
+ position(): number;
14
+ /** Number of bytes remaining */
15
+ remaining(): number;
16
+ isEmpty(): boolean;
17
+ setPosition(pos: number): void;
18
+ skip(n: number): void;
19
+ readU8(): number;
20
+ readU16(): number;
21
+ readU32(): number;
22
+ readI8(): number;
23
+ readI16(): number;
24
+ readI32(): number;
25
+ /** i64 (BigInt). Rarely appears in HWP, but provided for compatibility. */
26
+ readI64(): bigint;
27
+ /** Returns a sub-view (Uint8Array) of the given byte length without copying */
28
+ readBytes(len: number): Uint8Array;
29
+ /** Everything that remains */
30
+ readRemaining(): Uint8Array;
31
+ /**
32
+ * HWP string: [u16 charCount] + [UTF-16LE bytes * charCount].
33
+ */
34
+ readHwpString(): string;
35
+ /** UTF-16LE string of the given character count */
36
+ readUtf16(charCount: number): string;
37
+ /** ColorRef (0x00BBGGRR) — the u32 as-is */
38
+ readColorRef(): number;
39
+ private ensure;
40
+ }