hwp-convert 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +185 -0
- package/LICENSE +25 -0
- package/NOTICE +23 -0
- package/README.md +338 -0
- package/dist/browser/hwp-convert.browser.mjs +20677 -0
- package/dist/browser/hwp-convert.browser.mjs.map +7 -0
- package/dist/cli.d.ts +2 -0
- package/dist/cli.js +267 -0
- package/dist/index.d.ts +5 -0
- package/dist/index.js +5 -0
- package/dist/lib/errors.d.ts +9 -0
- package/dist/lib/errors.js +18 -0
- package/dist/lib/hwp/binData.d.ts +15 -0
- package/dist/lib/hwp/binData.js +64 -0
- package/dist/lib/hwp/bodyText.d.ts +31 -0
- package/dist/lib/hwp/bodyText.js +208 -0
- package/dist/lib/hwp/byteReader.d.ts +40 -0
- package/dist/lib/hwp/byteReader.js +116 -0
- package/dist/lib/hwp/cfbReader.d.ts +44 -0
- package/dist/lib/hwp/cfbReader.js +134 -0
- package/dist/lib/hwp/control.d.ts +17 -0
- package/dist/lib/hwp/control.js +290 -0
- package/dist/lib/hwp/converter.d.ts +22 -0
- package/dist/lib/hwp/converter.js +41 -0
- package/dist/lib/hwp/docInfo.d.ts +26 -0
- package/dist/lib/hwp/docInfo.js +396 -0
- package/dist/lib/hwp/fileHeader.d.ts +42 -0
- package/dist/lib/hwp/fileHeader.js +66 -0
- package/dist/lib/hwp/htmlReader.d.ts +17 -0
- package/dist/lib/hwp/htmlReader.js +602 -0
- package/dist/lib/hwp/hwpxBuilder.d.ts +19 -0
- package/dist/lib/hwp/hwpxBuilder.js +633 -0
- package/dist/lib/hwp/index.d.ts +68 -0
- package/dist/lib/hwp/index.js +149 -0
- package/dist/lib/hwp/mdReader.d.ts +16 -0
- package/dist/lib/hwp/mdReader.js +485 -0
- package/dist/lib/hwp/mdWriter.d.ts +23 -0
- package/dist/lib/hwp/mdWriter.js +182 -0
- package/dist/lib/hwp/owpml.d.ts +33 -0
- package/dist/lib/hwp/owpml.js +86 -0
- package/dist/lib/hwp/record.d.ts +24 -0
- package/dist/lib/hwp/record.js +59 -0
- package/dist/lib/hwp/tags.d.ts +115 -0
- package/dist/lib/hwp/tags.js +217 -0
- package/dist/lib/hwp/types.d.ts +214 -0
- package/dist/lib/hwp/types.js +5 -0
- package/dist/lib/hwpxReader.d.ts +60 -0
- package/dist/lib/hwpxReader.js +1104 -0
- package/dist/lib/types.d.ts +47 -0
- package/dist/lib/types.js +1 -0
- package/dist/lib/writer.d.ts +19 -0
- package/dist/lib/writer.js +149 -0
- package/package.json +94 -0
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* 바이너리 데이터 읽기 유틸리티 (커서 기반).
|
|
3
|
+
* HWP 5.0 스트림은 모두 little-endian, 문자열은 UTF-16LE.
|
|
4
|
+
*
|
|
5
|
+
* 원작: rhwp/src/parser/byte_reader.rs (MIT, Copyright (c) 2025-2026 Edward Kim)
|
|
6
|
+
*/
|
|
7
|
+
/** Shared decoder: HWP 5.0 strings are stored as UTF-16LE. */
const UTF16_LE = new TextDecoder("utf-16le");
/**
 * Cursor-based reader for little-endian binary data.
 *
 * All multi-byte reads are little-endian. Every read first checks that enough
 * bytes remain in the readable window and throws a RangeError otherwise.
 */
export class ByteReader {
    view;
    offset;
    end;
    /**
     * @param data   Byte array to read; its `buffer`, `byteOffset` and `byteLength` define the view.
     * @param offset Initial cursor position, relative to the start of `data`.
     * @param length Optional size of the readable window starting at `offset`.
     *               When omitted the window runs to the end of `data`.
     */
    constructor(data, offset = 0, length) {
        this.view = new DataView(data.buffer, data.byteOffset, data.byteLength);
        this.offset = offset;
        // Clamp the window to the data so remaining()/ensure() never promise
        // bytes that the underlying DataView cannot deliver.
        this.end = length === undefined
            ? data.byteLength
            : Math.min(data.byteLength, offset + length);
    }
    /** Current cursor position (byte offset relative to the start of the data). */
    position() {
        return this.offset;
    }
    /** Number of bytes left between the cursor and the end of the window (never negative). */
    remaining() {
        return Math.max(0, this.end - this.offset);
    }
    /** True when no bytes are left to read. */
    isEmpty() {
        return this.remaining() === 0;
    }
    /**
     * Moves the cursor to an absolute position.
     * @throws RangeError when `pos` is negative or beyond the end of the window.
     */
    setPosition(pos) {
        if (pos < 0 || pos > this.end) {
            throw new RangeError(`setPosition out of range: ${pos} (end=${this.end})`);
        }
        this.offset = pos;
    }
    /**
     * Advances the cursor by `n` bytes.
     * @throws RangeError when the move would leave the window in either direction.
     */
    skip(n) {
        const target = this.offset + n;
        if (target > this.end) {
            throw new RangeError(`skip exceeds end: pos=${this.offset}+${n} > ${this.end}`);
        }
        // A negative count must not push the cursor in front of the data.
        if (target < 0) {
            throw new RangeError(`skip before start: pos=${this.offset}+${n} < 0`);
        }
        this.offset = target;
    }
    /** Reads an unsigned 8-bit integer. */
    readU8() {
        this.ensure(1);
        return this.view.getUint8(this.offset++);
    }
    /** Reads an unsigned 16-bit little-endian integer. */
    readU16() {
        this.ensure(2);
        const v = this.view.getUint16(this.offset, true);
        this.offset += 2;
        return v;
    }
    /** Reads an unsigned 32-bit little-endian integer. */
    readU32() {
        this.ensure(4);
        const v = this.view.getUint32(this.offset, true);
        this.offset += 4;
        return v;
    }
    /** Reads a signed 8-bit integer. */
    readI8() {
        this.ensure(1);
        return this.view.getInt8(this.offset++);
    }
    /** Reads a signed 16-bit little-endian integer. */
    readI16() {
        this.ensure(2);
        const v = this.view.getInt16(this.offset, true);
        this.offset += 2;
        return v;
    }
    /** Reads a signed 32-bit little-endian integer. */
    readI32() {
        this.ensure(4);
        const v = this.view.getInt32(this.offset, true);
        this.offset += 4;
        return v;
    }
    /** Reads a signed 64-bit little-endian integer as a BigInt. */
    readI64() {
        this.ensure(8);
        const v = this.view.getBigInt64(this.offset, true);
        this.offset += 8;
        return v;
    }
    /**
     * Reads `len` bytes and returns them as a new Uint8Array.
     * The result is a copy, so it stays valid independently of the source buffer.
     */
    readBytes(len) {
        this.ensure(len);
        const window = new Uint8Array(this.view.buffer, this.view.byteOffset + this.offset, len);
        this.offset += len;
        return window.slice();
    }
    /** Reads everything from the cursor to the end of the window. */
    readRemaining() {
        return this.readBytes(this.remaining());
    }
    /**
     * Reads an HWP string: a u16 character count followed by that many UTF-16LE code units.
     */
    readHwpString() {
        const charCount = this.readU16();
        if (charCount === 0)
            return "";
        return this.readUtf16(charCount);
    }
    /** Reads `charCount` UTF-16LE code units (2 bytes each) and decodes them to a string. */
    readUtf16(charCount) {
        const byteLen = charCount * 2;
        this.ensure(byteLen);
        const slice = new Uint8Array(this.view.buffer, this.view.byteOffset + this.offset, byteLen);
        this.offset += byteLen;
        return UTF16_LE.decode(slice);
    }
    /** Reads a ColorRef (0x00BBGGRR) and returns the raw u32 value. */
    readColorRef() {
        return this.readU32();
    }
    /**
     * Verifies that `n` more bytes can be read from the current position.
     * @throws RangeError when `n` is negative/NaN or exceeds the remaining bytes.
     */
    ensure(n) {
        // `!(n >= 0)` also rejects NaN, which would slip through a plain `>` comparison.
        if (!(n >= 0)) {
            throw new RangeError(`ByteReader: invalid byte count: ${n}`);
        }
        if (this.offset + n > this.end) {
            throw new RangeError(`ByteReader: not enough bytes. need=${n}, have=${this.end - this.offset}, pos=${this.offset}`);
        }
    }
}
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* HWP 5.0 CFB(OLE2) 컨테이너 리더.
|
|
3
|
+
* SheetJS 'cfb' 패키지를 감싸 HWP 친화적 API 제공.
|
|
4
|
+
*
|
|
5
|
+
* HWP CFB 구조:
|
|
6
|
+
* /FileHeader (256B, 비압축)
|
|
7
|
+
* /DocInfo (압축 가능 — 비밀 stream에서 raw deflate)
|
|
8
|
+
* /BodyText/Section{N} (압축 가능)
|
|
9
|
+
* /ViewText/Section{N} (배포용 문서; 암호화)
|
|
10
|
+
* /BinData/BIN{XXXX}.{ext} (이미지/임베디드)
|
|
11
|
+
* /PrvImage, /PrvText (미리보기)
|
|
12
|
+
* /Scripts/..., /DocOptions/...
|
|
13
|
+
*
|
|
14
|
+
* 원작 참고: rhwp/src/parser/cfb_reader.rs (MIT, Edward Kim)
|
|
15
|
+
*/
|
|
16
|
+
/**
 * Error type thrown by {@link HwpCfbReader} for container-level failures:
 * an unparseable CFB file, a missing required stream, or a failed inflate.
 */
export declare class CfbError extends Error {
    /** @param msg Human-readable description of the failure. */
    constructor(msg: string);
}
|
|
19
|
+
/** Reader for the CFB (OLE2) container of an HWP 5.0 file. */
export declare class HwpCfbReader {
    private container;
    /**
     * Parses `data` as a CFB container.
     * @throws CfbError when the container cannot be parsed.
     */
    constructor(data: Uint8Array);
    /**
     * Returns the raw bytes of the stream at `path` (still compressed/encrypted),
     * or `null` when no stream entry is found there.
     */
    readStreamRaw(path: string): Uint8Array | null;
    /**
     * Inflates raw-deflate data (no zlib header). Empty input is returned unchanged.
     * @throws CfbError when inflating fails.
     */
    static decompress(data: Uint8Array): Uint8Array;
    /**
     * Returns the `/FileHeader` stream as stored.
     * @throws CfbError when the stream is missing.
     */
    readFileHeader(): Uint8Array;
    /**
     * Returns the `/DocInfo` stream, inflated when `compressed` is true.
     * @throws CfbError when the stream is missing.
     */
    readDocInfo(compressed: boolean): Uint8Array;
    /**
     * Returns `/BodyText/Section{index}`, or `/ViewText/Section{index}` when
     * `distribution` is true. Distribution sections are returned raw; other
     * sections are inflated when `compressed` is true. `null` when absent.
     */
    readBodySection(index: number, compressed: boolean, distribution: boolean): Uint8Array | null;
    /** Number of `Section{N}` entries under BodyText (or ViewText when `distribution` is true). */
    sectionCount(distribution?: boolean): number;
    /**
     * Lists every `BinData/BIN{XXXX}.{ext}` entry. `storageId` is the hexadecimal
     * XXXX parsed as a number and `extension` is lower-cased.
     */
    listBinData(): Array<{
        name: string;
        storageId: number;
        extension: string;
    }>;
    /**
     * Returns the data of a BinData stream, inflating it when possible and
     * falling back to the raw bytes otherwise. `null` when the stream is absent.
     */
    readBinData(path: string): Uint8Array | null;
    /** Debug helper: full paths of every stream entry in the container. */
    listStreams(): string[];
}
|
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* HWP 5.0 CFB(OLE2) 컨테이너 리더.
|
|
3
|
+
* SheetJS 'cfb' 패키지를 감싸 HWP 친화적 API 제공.
|
|
4
|
+
*
|
|
5
|
+
* HWP CFB 구조:
|
|
6
|
+
* /FileHeader (256B, 비압축)
|
|
7
|
+
* /DocInfo (압축 가능 — 비밀 stream에서 raw deflate)
|
|
8
|
+
* /BodyText/Section{N} (압축 가능)
|
|
9
|
+
* /ViewText/Section{N} (배포용 문서; 암호화)
|
|
10
|
+
* /BinData/BIN{XXXX}.{ext} (이미지/임베디드)
|
|
11
|
+
* /PrvImage, /PrvText (미리보기)
|
|
12
|
+
* /Scripts/..., /DocOptions/...
|
|
13
|
+
*
|
|
14
|
+
* 원작 참고: rhwp/src/parser/cfb_reader.rs (MIT, Edward Kim)
|
|
15
|
+
*/
|
|
16
|
+
import * as CFB from "cfb";
|
|
17
|
+
import { inflateRaw } from "pako";
|
|
18
|
+
/**
 * Error raised for CFB container problems (parse failure, missing stream,
 * failed inflate). Carries the name "CfbError" so it can be told apart from
 * generic errors in logs and `catch` blocks.
 */
export class CfbError extends Error {
    /** @param msg Description of what went wrong. */
    constructor(msg) {
        super(msg);
        // Override the inherited "Error" name on the instance.
        this.name = "CfbError";
    }
}
|
|
24
|
+
/**
 * Reader for the CFB (OLE2) container of an HWP 5.0 file.
 * Wraps the 'cfb' package and exposes the streams HWP parsing needs.
 */
export class HwpCfbReader {
    container;
    /**
     * Parses `data` as a CFB container.
     * @throws CfbError when the 'cfb' parser rejects the input.
     */
    constructor(data) {
        try {
            this.container = CFB.read(data, { type: "buffer" });
        }
        catch (e) {
            throw new CfbError(`CFB 파싱 실패: ${e instanceof Error ? e.message : String(e)}`);
        }
    }
    /**
     * Returns the raw bytes of the stream at `path` (still compressed/encrypted).
     * A leading "/" is added when missing. Returns `null` when nothing is found
     * or the entry is not a stream.
     */
    readStreamRaw(path) {
        const norm = path.startsWith("/") ? path : `/${path}`;
        const entry = CFB.find(this.container, norm);
        if (!entry || entry.type !== 2 /* stream */)
            return null;
        // An entry without content (presumably an empty stream) is treated as
        // zero bytes instead of letting Uint8Array.from throw a TypeError.
        return toUint8Array(entry.content ?? []);
    }
    /**
     * Inflates raw-deflate data (no zlib header). Empty input is returned as-is.
     * @throws CfbError when inflating fails or yields no output.
     */
    static decompress(data) {
        if (data.byteLength === 0)
            return data;
        let inflated;
        try {
            inflated = inflateRaw(data);
        }
        catch (e) {
            throw new CfbError(`deflate 해제 실패: ${e instanceof Error ? e.message : String(e)}`);
        }
        // pako can finish without throwing yet produce no result for
        // truncated input; surface that as an error rather than returning
        // undefined to callers that expect bytes.
        if (inflated == null) {
            throw new CfbError("deflate 해제 실패: 결과 없음 (손상되었거나 잘린 데이터)");
        }
        return inflated;
    }
    /**
     * Returns the `/FileHeader` stream as stored.
     * @throws CfbError when the stream is missing.
     */
    readFileHeader() {
        const data = this.readStreamRaw("/FileHeader");
        if (!data)
            throw new CfbError("/FileHeader 스트림이 없습니다");
        return data;
    }
    /**
     * Returns the `/DocInfo` stream, inflated when `compressed` is true.
     * @throws CfbError when the stream is missing or cannot be inflated.
     */
    readDocInfo(compressed) {
        const raw = this.readStreamRaw("/DocInfo");
        if (!raw)
            throw new CfbError("/DocInfo 스트림이 없습니다");
        return compressed ? HwpCfbReader.decompress(raw) : raw;
    }
    /**
     * Returns `/BodyText/Section{index}`, or `/ViewText/Section{index}` when
     * `distribution` is true. Returns `null` when the stream does not exist.
     */
    readBodySection(index, compressed, distribution) {
        const folder = distribution ? "ViewText" : "BodyText";
        const raw = this.readStreamRaw(`/${folder}/Section${index}`);
        if (!raw)
            return null;
        if (distribution) {
            // ViewText is encrypted and not decoded here; the caller gets the raw bytes.
            return raw;
        }
        return compressed ? HwpCfbReader.decompress(raw) : raw;
    }
    /** Counts the `Section{N}` entries under BodyText (or ViewText when `distribution` is true). */
    sectionCount(distribution = false) {
        const folder = distribution ? "ViewText" : "BodyText";
        const re = new RegExp(`(?:^|/)${folder}/Section\\d+$`);
        let n = 0;
        for (const path of this.container.FullPaths) {
            if (re.test(path))
                n++;
        }
        return n;
    }
    /**
     * Lists every `BinData/BIN{XXXX}.{ext}` entry.
     * `name` is the container path as listed, `storageId` the hexadecimal XXXX
     * parsed as a number, and `extension` the lower-cased suffix.
     */
    listBinData() {
        const result = [];
        const re = /(?:^|\/)BinData\/BIN([0-9A-Fa-f]{4})\.([^/]+)$/;
        for (const path of this.container.FullPaths) {
            const m = re.exec(path);
            if (!m)
                continue;
            result.push({
                name: path,
                storageId: parseInt(m[1], 16),
                extension: m[2].toLowerCase(),
            });
        }
        return result;
    }
    /**
     * Returns the data of a BinData stream, or `null` when it does not exist.
     * Inflating is attempted first; if that fails the raw bytes are returned,
     * since the stream may have been stored uncompressed.
     */
    readBinData(path) {
        const raw = this.readStreamRaw(path);
        if (!raw)
            return null;
        try {
            return HwpCfbReader.decompress(raw);
        }
        catch {
            // Deliberate best-effort: not inflatable means "use as stored".
            return raw;
        }
    }
    /** Debug helper: full paths of every stream entry (type 2) in the container. */
    listStreams() {
        const out = [];
        for (let i = 0; i < this.container.FileIndex.length; i++) {
            if (this.container.FileIndex[i].type === 2)
                out.push(this.container.FullPaths[i]);
        }
        return out;
    }
}
|
|
129
|
+
/**
 * Normalizes stream content to a Uint8Array.
 * A Uint8Array is passed through untouched (same instance); anything else is
 * treated as an array-like of byte values and copied into a new Uint8Array.
 */
function toUint8Array(content) {
    return content instanceof Uint8Array ? content : Uint8Array.from(content);
}
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* 컨트롤 파싱 (CTRL_HEADER 의 ctrl_id 별 분기).
|
|
3
|
+
*
|
|
4
|
+
* 1차 포팅 범위: 표(tbl) / 그림(gso+pic) / 머리말(head) / 꼬리말(foot) / 각주(fn)
|
|
5
|
+
*
|
|
6
|
+
* 원작: rhwp/src/parser/control.rs (MIT, Edward Kim)
|
|
7
|
+
*/
|
|
8
|
+
import type { Record } from "./record.js";
|
|
9
|
+
import type { HwpControl, HwpParagraph } from "./types.js";
|
|
10
|
+
/**
 * Converts a CTRL_HEADER record and its child records into an HwpControl.
 *
 * @param ctrlHeader The CTRL_HEADER record itself.
 * @param children Records below the CTRL_HEADER (level > ctrlHeader.level),
 *                 including all descendants of its subtree.
 * @param parseParagraphList Callback used to parse nested paragraphs
 *                 (table cells, headers, and so on) from a record list.
 */
export declare function parseCtrlHeader(
    ctrlHeader: Record,
    children: Record[],
    parseParagraphList: (records: Record[], baseLevel: number) => HwpParagraph[],
): HwpControl;
|
|
@@ -0,0 +1,290 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* 컨트롤 파싱 (CTRL_HEADER 의 ctrl_id 별 분기).
|
|
3
|
+
*
|
|
4
|
+
* 1차 포팅 범위: 표(tbl) / 그림(gso+pic) / 머리말(head) / 꼬리말(foot) / 각주(fn)
|
|
5
|
+
*
|
|
6
|
+
* 원작: rhwp/src/parser/control.rs (MIT, Edward Kim)
|
|
7
|
+
*/
|
|
8
|
+
import { ByteReader } from "./byteReader.js";
|
|
9
|
+
import { CTRL_TABLE, CTRL_GEN_SHAPE, CTRL_HEADER, CTRL_FOOTER, CTRL_FOOTNOTE, CTRL_EQUATION, HWPTAG_TABLE, HWPTAG_LIST_HEADER, HWPTAG_SHAPE_COMPONENT_PICTURE, HWPTAG_SHAPE_COMPONENT_LINE, HWPTAG_SHAPE_COMPONENT_RECTANGLE, HWPTAG_SHAPE_COMPONENT_ELLIPSE, HWPTAG_SHAPE_COMPONENT_ARC, HWPTAG_SHAPE_COMPONENT_POLYGON, HWPTAG_SHAPE_COMPONENT_CURVE, HWPTAG_EQEDIT, ctrlIdToString, isFieldCtrlId, } from "./tags.js";
|
|
10
|
+
/**
 * Converts a CTRL_HEADER record and its child records into an HwpControl,
 * dispatching on the control id stored in the record's first four bytes.
 *
 * @param ctrlHeader The CTRL_HEADER record itself.
 * @param children Records below the CTRL_HEADER (level > ctrlHeader.level),
 *                 including all descendants of its subtree.
 * @param parseParagraphList Callback used to parse nested paragraphs
 *                 (table cells, headers, and so on) from a record list.
 * @returns The parsed control; `kind: "unknown"` when the id is not handled
 *          or the record is too short to hold one.
 */
export function parseCtrlHeader(ctrlHeader, children, parseParagraphList) {
    // Too short to contain a control id at all.
    if (ctrlHeader.data.byteLength < 4) {
        return { kind: "unknown", ctrlId: "" };
    }
    // The id is the first four bytes read as a little-endian u32. A tag such as
    // "secd" is stored as [0x64, 0x63, 0x65, 0x73], so the LE read yields the
    // same integer as packing the four characters big-endian.
    const ctrlIdRaw = new ByteReader(ctrlHeader.data).readU32();
    const ctrlData = ctrlHeader.data.subarray(4);
    if (ctrlIdRaw === CTRL_TABLE) {
        return parseTableControl(ctrlHeader, children, parseParagraphList);
    }
    if (ctrlIdRaw === CTRL_GEN_SHAPE) {
        return parseGsoControl(ctrlHeader, children);
    }
    // Header, footer and footnote share one shape: a paragraph list.
    const listKind = ctrlIdRaw === CTRL_HEADER
        ? "header"
        : ctrlIdRaw === CTRL_FOOTER
            ? "footer"
            : ctrlIdRaw === CTRL_FOOTNOTE
                ? "footnote"
                : undefined;
    if (listKind !== undefined) {
        return {
            kind: listKind,
            paragraphs: collectListHeaderParagraphs(ctrlHeader, children, parseParagraphList),
        };
    }
    if (ctrlIdRaw === CTRL_EQUATION) {
        return parseEquationControl(ctrlHeader, children);
    }
    if (isFieldCtrlId(ctrlIdRaw)) {
        return parseFieldControl(ctrlIdRaw, ctrlData);
    }
    return { kind: "unknown", ctrlId: ctrlIdToString(ctrlIdRaw) };
}
|
|
55
|
+
// ============================================================
|
|
56
|
+
// 표
|
|
57
|
+
// ============================================================
|
|
58
|
+
/**
 * Builds a table control from the records below a table CTRL_HEADER.
 *
 * Only records exactly one level below the header are inspected:
 *  - HWPTAG_TABLE supplies the row/column counts;
 *  - each HWPTAG_LIST_HEADER after it starts a cell.
 * A LIST_HEADER seen before the TABLE record is a caption and is ignored.
 *
 * @returns `{ kind: "table", rowCount, colCount, cells }`.
 */
function parseTableControl(ctrlHeader, children, parseParagraphList) {
    const cellLevel = ctrlHeader.level + 1;
    const table = { kind: "table", rowCount: 0, colCount: 0, cells: [] };
    let sawTableRecord = false;
    for (const [idx, rec] of children.entries()) {
        if (rec.level !== cellLevel)
            continue;
        if (rec.tagId === HWPTAG_TABLE) {
            sawTableRecord = true;
            const dims = parseTableMeta(rec.data);
            table.rowCount = dims.rowCount;
            table.colCount = dims.colCount;
            continue;
        }
        if (rec.tagId !== HWPTAG_LIST_HEADER || !sawTableRecord)
            continue;
        const { col, row, colSpan, rowSpan } = parseCellMeta(rec.data);
        // A cell's paragraphs sit at the same level as its LIST_HEADER, so the
        // cell extends until the next LIST_HEADER/TABLE at that level, or until
        // a record that leaves the table subtree.
        let stop = idx + 1;
        while (stop < children.length) {
            const next = children[stop];
            if (next.level < cellLevel)
                break;
            const startsSibling = next.tagId === HWPTAG_LIST_HEADER || next.tagId === HWPTAG_TABLE;
            if (next.level === cellLevel && startsSibling)
                break;
            stop++;
        }
        const paragraphs = parseParagraphList(children.slice(idx + 1, stop), cellLevel);
        table.cells.push({ col, row, colSpan, rowSpan, paragraphs });
    }
    return table;
}
|
|
111
|
+
/**
 * Reads the row and column counts from a TABLE record:
 * u32 attr, then u16 rowCount, then u16 colCount.
 * Returns zero counts when the record is shorter than those 8 bytes.
 */
function parseTableMeta(data) {
    const reader = new ByteReader(data);
    if (reader.remaining() >= 8) {
        reader.skip(4); // attr is not needed
        const rowCount = reader.readU16();
        const colCount = reader.readU16();
        return { rowCount, colCount };
    }
    return { rowCount: 0, colCount: 0 };
}
|
|
120
|
+
/**
 * Reads a cell's position and span from its LIST_HEADER record.
 *
 * Layout: the common LIST_HEADER prefix (u16 paragraph count, u32 list
 * attributes, u16 width reference) followed by col, row, colSpan, rowSpan as
 * u16 each. A span of 0 is reported as 1. When the record is too short the
 * cell defaults to column 0, row 0 with spans of 1.
 */
function parseCellMeta(data) {
    const LIST_HEADER_BYTES = 2 + 4 + 2;
    const CELL_FIELD_BYTES = 4 * 2;
    const fallback = { col: 0, row: 0, colSpan: 1, rowSpan: 1 };
    const reader = new ByteReader(data);
    if (reader.remaining() < LIST_HEADER_BYTES)
        return fallback;
    reader.skip(LIST_HEADER_BYTES);
    if (reader.remaining() < CELL_FIELD_BYTES)
        return fallback;
    const col = reader.readU16();
    const row = reader.readU16();
    const colSpan = reader.readU16();
    const rowSpan = reader.readU16();
    return { col, row, colSpan: colSpan || 1, rowSpan: rowSpan || 1 };
}
|
|
142
|
+
// ============================================================
|
|
143
|
+
// 그리기 개체 (gso) — 1차 포팅: PICTURE 만 추출
|
|
144
|
+
// ============================================================
|
|
145
|
+
/**
 * Classifies a drawing-object ("gso ") control from its child records.
 *
 * Pictures win: the first SHAPE_COMPONENT_PICTURE within three levels of the
 * header that yields a bin-data id produces a picture control. Otherwise the
 * first line/rectangle/ellipse/arc/polygon/curve record within three levels
 * decides the shape type. With neither, the control is reported as unknown.
 */
function parseGsoControl(ctrlHeader, children) {
    const maxLevel = ctrlHeader.level + 3;
    // Pass 1: look for a picture.
    for (const rec of children) {
        if (rec.tagId !== HWPTAG_SHAPE_COMPONENT_PICTURE || !(rec.level <= maxLevel))
            continue;
        const binDataId = parsePictureBinDataId(rec.data);
        if (binDataId !== undefined) {
            return { kind: "picture", binDataId };
        }
    }
    // Pass 2: fall back to plain shapes. Only the line carries coordinates.
    const plainShapes = [
        [HWPTAG_SHAPE_COMPONENT_RECTANGLE, "rectangle"],
        [HWPTAG_SHAPE_COMPONENT_ELLIPSE, "ellipse"],
        [HWPTAG_SHAPE_COMPONENT_ARC, "arc"],
        [HWPTAG_SHAPE_COMPONENT_POLYGON, "polygon"],
        [HWPTAG_SHAPE_COMPONENT_CURVE, "curve"],
    ];
    for (const rec of children) {
        if (rec.level > maxLevel)
            continue;
        if (rec.tagId === HWPTAG_SHAPE_COMPONENT_LINE) {
            return parseLineShape(rec.data);
        }
        const match = plainShapes.find(([tag]) => tag === rec.tagId);
        if (match) {
            return { kind: "shape", shapeType: match[1] };
        }
    }
    return { kind: "unknown", ctrlId: "gso " };
}
|
|
177
|
+
/**
 * Parses a line shape record: x1, y1, x2, y2 as consecutive i32 values
 * (further attributes after them are ignored). When the record holds fewer
 * than 16 bytes the shape is returned without coordinates.
 */
function parseLineShape(data) {
    const shape = { kind: "shape", shapeType: "line" };
    if (data.byteLength >= 16) {
        const reader = new ByteReader(data);
        shape.x1 = reader.readI32();
        shape.y1 = reader.readI32();
        shape.x2 = reader.readI32();
        shape.y2 = reader.readI32();
    }
    return shape;
}
|
|
188
|
+
// ============================================================
|
|
189
|
+
// 수식
|
|
190
|
+
// ============================================================
|
|
191
|
+
/**
 * Builds an equation control from the first EQEDIT record among `children`.
 * The script text is taken from that record; when no EQEDIT record exists the
 * script is the empty string. `ctrlHeader` is accepted for signature parity
 * with the other control parsers and is not read.
 */
function parseEquationControl(ctrlHeader, children) {
    const eqRecord = children.find((rec) => rec.tagId === HWPTAG_EQEDIT);
    const script = eqRecord === undefined ? "" : extractEquationScript(eqRecord.data);
    return { kind: "equation", script };
}
|
|
202
|
+
/**
 * Extracts the equation script text from an EQEDIT record.
 *
 * Layout used (HWP 5.0 equation object properties):
 *   u32 attr | u16 scriptLen | WCHAR script[scriptLen] | further fields...
 * The script length follows the attribute word directly. The previous
 * implementation skipped 11 extra bytes first, which did not even match the
 * 9 bytes of fields its own comment listed, and so read the length from inside
 * the script text.
 *
 * @param data Raw payload of the EQEDIT record.
 * @returns The script, or "" when the record is too short, the length is 0 or
 *          implausibly large (> 10000 characters), or the text is truncated.
 */
function extractEquationScript(data) {
    const ATTR_BYTES = 4;
    const LENGTH_BYTES = 2;
    const MAX_SCRIPT_CHARS = 10000;
    const textStart = ATTR_BYTES + LENGTH_BYTES;
    if (data.byteLength < textStart)
        return "";
    const view = new DataView(data.buffer, data.byteOffset, data.byteLength);
    const strLen = view.getUint16(ATTR_BYTES, true);
    if (strLen === 0 || strLen > MAX_SCRIPT_CHARS)
        return "";
    const byteLen = strLen * 2; // UTF-16: two bytes per code unit
    if (data.byteLength - textStart < byteLen)
        return "";
    return new TextDecoder("utf-16le").decode(data.subarray(textStart, textStart + byteLen));
}
|
|
230
|
+
/**
 * Extracts bin_data_id from a SHAPE_COMPONENT_PICTURE record.
 *
 * The id is a little-endian u16 located after these fields:
 *   border_color  u32        (4)
 *   border_width  i32        (4)
 *   border_attr   u32        (4)
 *   border_x[4]   i32 each   (16)
 *   border_y[4]   i32 each   (16)
 *   crop          4 x i32    (16)
 *   padding       4 x i16    (8)
 *   brightness    i8         (1)
 *   contrast      i8         (1)
 *   effect        u8         (1)
 * which places it at byte offset 71.
 *
 * @returns The id, or `undefined` when the record is too short to contain it.
 */
function parsePictureBinDataId(data) {
    const precedingFieldSizes = [4, 4, 4, 16, 16, 16, 8, 1, 1, 1];
    const idOffset = precedingFieldSizes.reduce((total, size) => total + size, 0); // 71
    if (data.byteLength < idOffset + 2) {
        return undefined;
    }
    return new DataView(data.buffer, data.byteOffset, data.byteLength).getUint16(idOffset, true);
}
|
|
252
|
+
// ============================================================
|
|
253
|
+
// LIST_HEADER 기반 컨트롤 (head/foot/fn)
|
|
254
|
+
// ============================================================
|
|
255
|
+
/**
 * Collects the paragraphs of a LIST_HEADER-based control (header, footer,
 * footnote).
 *
 * The first LIST_HEADER one level below the control header starts the list.
 * Two record layouts are handled:
 *  - paragraphs nested one level below the LIST_HEADER (the original
 *    assumption; behaviour is unchanged for this layout);
 *  - paragraphs at the same level as the LIST_HEADER, which is how this file's
 *    table parser treats cell paragraphs. Before this fix that layout produced
 *    an empty list, because the scan stopped at the very first record.
 *
 * @param ctrlHeader The control's CTRL_HEADER record.
 * @param children Records below the CTRL_HEADER, descendants included.
 * @param parseParagraphList Callback that parses paragraphs whose headers are
 *        at the level given as its second argument.
 * @returns The parsed paragraphs, or [] when no LIST_HEADER is present.
 */
function collectListHeaderParagraphs(ctrlHeader, children, parseParagraphList) {
    const listLevel = ctrlHeader.level + 1;
    const lhIdx = children.findIndex((r) => r.tagId === HWPTAG_LIST_HEADER && r.level === listLevel);
    if (lhIdx < 0)
        return [];
    // Decide the layout from the record right after the LIST_HEADER.
    const first = children[lhIdx + 1];
    const siblingLayout = first !== undefined
        && first.level === listLevel
        && first.tagId !== HWPTAG_LIST_HEADER;
    const subtree = [];
    for (let j = lhIdx + 1; j < children.length; j++) {
        const rec = children[j];
        if (siblingLayout) {
            // Same rule as table cells: stop when leaving the control's
            // subtree or when another list starts at the list level.
            if (rec.level < listLevel)
                break;
            if (rec.level === listLevel && rec.tagId === HWPTAG_LIST_HEADER)
                break;
        }
        else if (rec.level <= listLevel) {
            break;
        }
        subtree.push(rec);
    }
    return parseParagraphList(subtree, siblingLayout ? listLevel : listLevel + 1);
}
|
|
269
|
+
// ============================================================
|
|
270
|
+
// 필드 컨트롤
|
|
271
|
+
// ============================================================
|
|
272
|
+
/**
 * Parses a field control.
 *
 * `ctrlData` is the CTRL_HEADER payload after the 4-byte control id:
 * u32 properties, u8 extra, u16 command length, then the command as UTF-16LE.
 * The command is only read when it is non-empty and fully present; otherwise
 * it is left undefined.
 *
 * @returns `{ kind: "field", ctrlId }` when the payload is shorter than the
 *          7-byte fixed part, otherwise `{ kind: "field", ctrlId, command }`.
 */
function parseFieldControl(ctrlIdRaw, ctrlData) {
    const field = { kind: "field", ctrlId: ctrlIdToString(ctrlIdRaw) };
    if (ctrlData.byteLength < 7)
        return field;
    const reader = new ByteReader(ctrlData);
    reader.skip(4 + 1); // properties (u32) + extra (u8)
    const commandLen = reader.readU16();
    let command;
    const commandFits = commandLen > 0 && reader.remaining() >= commandLen * 2;
    if (commandFits) {
        try {
            command = reader.readUtf16(commandLen);
        }
        catch {
            command = undefined;
        }
    }
    return { ...field, command };
}
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* HwpDocument IR → HWPX/Text 변환기.
|
|
3
|
+
*
|
|
4
|
+
* - hwpToText: 모든 섹션의 문단 텍스트를 평탄화 (표 셀, 헤더/푸터 등 포함).
|
|
5
|
+
* - hwpToHwpx: hwpxBuilder 로 풍부한 HWPX 패키지 생성 (표/이미지 포함).
|
|
6
|
+
*/
|
|
7
|
+
import { hwpDocumentToMarkdown, type MarkdownWriteOptions } from "./mdWriter.js";
|
|
8
|
+
import { markdownToHwpDocument } from "./mdReader.js";
|
|
9
|
+
import { htmlToHwpDocument } from "./htmlReader.js";
|
|
10
|
+
import type { HwpDocument } from "./types.js";
|
|
11
|
+
export { hwpDocumentToMarkdown, markdownToHwpDocument, htmlToHwpDocument };
|
|
12
|
+
export type { MarkdownWriteOptions };
|
|
13
|
+
/** Options for {@link hwpDocumentToText}. */
export interface HwpToTextOptions {
    /** Text placed between paragraphs of one section. Defaults to "\n". */
    paragraphSeparator?: string;
    /** Text placed between sections. Defaults to "\n\n". */
    sectionSeparator?: string;
}
|
|
17
|
+
/**
 * Flattens a document to plain text: every section's paragraphs, including
 * text nested in tables, headers, footers, footnotes and equations.
 */
export declare function hwpDocumentToText(
    doc: HwpDocument,
    options?: HwpToTextOptions,
): string;
|
|
18
|
+
/**
 * Options for {@link hwpDocumentToHwpx}; they are handed to the HWPX builder
 * unchanged.
 */
export interface HwpToHwpxOptions {
    /** Presumably the document title written to the package metadata — confirm in hwpxBuilder. */
    title?: string;
    /** Presumably the author/creator written to the package metadata — confirm in hwpxBuilder. */
    creator?: string;
}
|
|
22
|
+
/** Builds an HWPX package from the document and resolves with its bytes. */
export declare function hwpDocumentToHwpx(
    doc: HwpDocument,
    options?: HwpToHwpxOptions,
): Promise<Uint8Array>;
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* HwpDocument IR → HWPX/Text 변환기.
|
|
3
|
+
*
|
|
4
|
+
* - hwpToText: 모든 섹션의 문단 텍스트를 평탄화 (표 셀, 헤더/푸터 등 포함).
|
|
5
|
+
* - hwpToHwpx: hwpxBuilder 로 풍부한 HWPX 패키지 생성 (표/이미지 포함).
|
|
6
|
+
*/
|
|
7
|
+
import { buildHwpxFromDocument } from "./hwpxBuilder.js";
|
|
8
|
+
import { hwpDocumentToMarkdown } from "./mdWriter.js";
|
|
9
|
+
import { markdownToHwpDocument } from "./mdReader.js";
|
|
10
|
+
import { htmlToHwpDocument } from "./htmlReader.js";
|
|
11
|
+
export { hwpDocumentToMarkdown, markdownToHwpDocument, htmlToHwpDocument };
|
|
12
|
+
/**
 * Flattens a document to plain text.
 *
 * Each paragraph is flattened (nested control text included), paragraphs of a
 * section are joined with `options.paragraphSeparator` (default "\n") and the
 * sections with `options.sectionSeparator` (default "\n\n").
 */
export function hwpDocumentToText(doc, options) {
    const paragraphSeparator = options?.paragraphSeparator ?? "\n";
    const sectionSeparator = options?.sectionSeparator ?? "\n\n";
    const renderSection = (section) => {
        const lines = section.paragraphs.map((paragraph) => flattenParagraphText(paragraph));
        return lines.join(paragraphSeparator);
    };
    const sectionTexts = doc.sections.map((section) => renderSection(section));
    return sectionTexts.join(sectionSeparator);
}
|
|
19
|
+
/**
 * Flattens one paragraph to text, recursing into its controls.
 *
 * Pieces are emitted in order and joined with "\n": the paragraph's own text
 * (when non-empty), then per control: all cell paragraphs of a table, the
 * paragraphs of a header/footer/footnote, or a non-empty equation script.
 * Other control kinds contribute nothing.
 */
function flattenParagraphText(p) {
    const flattenAll = (paragraphs) => paragraphs.map((inner) => flattenParagraphText(inner)).join("\n");
    const pieces = p.text.length > 0 ? [p.text] : [];
    for (const control of p.controls) {
        switch (control.kind) {
            case "table":
                pieces.push(control.cells.map((cell) => flattenAll(cell.paragraphs)).join("\n"));
                break;
            case "header":
            case "footer":
            case "footnote":
                pieces.push(flattenAll(control.paragraphs));
                break;
            case "equation":
                if (control.script.length > 0) {
                    pieces.push(control.script);
                }
                break;
            default:
                break;
        }
    }
    return pieces.join("\n");
}
|
|
39
|
+
/**
 * Builds an HWPX package from the document by delegating to
 * `buildHwpxFromDocument`, passing `options` through unchanged.
 */
export async function hwpDocumentToHwpx(doc, options) {
    const hwpxPackage = await buildHwpxFromDocument(doc, options);
    return hwpxPackage;
}
|