docpler 0.1.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,352 @@
1
+ # This file is automatically @generated by Cargo.
2
+ # It is not intended for manual editing.
3
+ version = 4
4
+
5
+ [[package]]
6
+ name = "adler2"
7
+ version = "2.0.1"
8
+ source = "registry+https://github.com/rust-lang/crates.io-index"
9
+ checksum = "320119579fcad9c21884f5c4861d16174d0e06250625266f50fe6898340abefa"
10
+
11
+ [[package]]
12
+ name = "autocfg"
13
+ version = "1.5.0"
14
+ source = "registry+https://github.com/rust-lang/crates.io-index"
15
+ checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8"
16
+
17
+ [[package]]
18
+ name = "bumpalo"
19
+ version = "3.20.2"
20
+ source = "registry+https://github.com/rust-lang/crates.io-index"
21
+ checksum = "5d20789868f4b01b2f2caec9f5c4e0213b41e3e5702a50157d699ae31ced2fcb"
22
+
23
+ [[package]]
24
+ name = "byteorder"
25
+ version = "1.5.0"
26
+ source = "registry+https://github.com/rust-lang/crates.io-index"
27
+ checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b"
28
+
29
+ [[package]]
30
+ name = "cfb"
31
+ version = "0.10.0"
32
+ source = "registry+https://github.com/rust-lang/crates.io-index"
33
+ checksum = "d8a4f8e55be323b378facfcf1f06aa97f6ec17cf4ac84fb17325093aaf62da41"
34
+ dependencies = [
35
+ "byteorder",
36
+ "fnv",
37
+ "uuid",
38
+ ]
39
+
40
+ [[package]]
41
+ name = "cfg-if"
42
+ version = "1.0.4"
43
+ source = "registry+https://github.com/rust-lang/crates.io-index"
44
+ checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801"
45
+
46
+ [[package]]
47
+ name = "crc32fast"
48
+ version = "1.5.0"
49
+ source = "registry+https://github.com/rust-lang/crates.io-index"
50
+ checksum = "9481c1c90cbf2ac953f07c8d4a58aa3945c425b7185c9154d67a65e4230da511"
51
+ dependencies = [
52
+ "cfg-if",
53
+ ]
54
+
55
+ [[package]]
56
+ name = "docpler-bindings"
57
+ version = "0.1.0"
58
+ dependencies = [
59
+ "docpler-core",
60
+ "pyo3",
61
+ ]
62
+
63
+ [[package]]
64
+ name = "docpler-core"
65
+ version = "0.1.0"
66
+ dependencies = [
67
+ "cfb",
68
+ "flate2",
69
+ "thiserror",
70
+ ]
71
+
72
+ [[package]]
73
+ name = "docpler-premium"
74
+ version = "0.1.0"
75
+ dependencies = [
76
+ "docpler-core",
77
+ ]
78
+
79
+ [[package]]
80
+ name = "flate2"
81
+ version = "1.1.9"
82
+ source = "registry+https://github.com/rust-lang/crates.io-index"
83
+ checksum = "843fba2746e448b37e26a819579957415c8cef339bf08564fe8b7ddbd959573c"
84
+ dependencies = [
85
+ "crc32fast",
86
+ "miniz_oxide",
87
+ ]
88
+
89
+ [[package]]
90
+ name = "fnv"
91
+ version = "1.0.7"
92
+ source = "registry+https://github.com/rust-lang/crates.io-index"
93
+ checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1"
94
+
95
+ [[package]]
96
+ name = "heck"
97
+ version = "0.5.0"
98
+ source = "registry+https://github.com/rust-lang/crates.io-index"
99
+ checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
100
+
101
+ [[package]]
102
+ name = "indoc"
103
+ version = "2.0.7"
104
+ source = "registry+https://github.com/rust-lang/crates.io-index"
105
+ checksum = "79cf5c93f93228cf8efb3ba362535fb11199ac548a09ce117c9b1adc3030d706"
106
+ dependencies = [
107
+ "rustversion",
108
+ ]
109
+
110
+ [[package]]
111
+ name = "js-sys"
112
+ version = "0.3.92"
113
+ source = "registry+https://github.com/rust-lang/crates.io-index"
114
+ checksum = "cc4c90f45aa2e6eacbe8645f77fdea542ac97a494bcd117a67df9ff4d611f995"
115
+ dependencies = [
116
+ "once_cell",
117
+ "wasm-bindgen",
118
+ ]
119
+
120
+ [[package]]
121
+ name = "libc"
122
+ version = "0.2.183"
123
+ source = "registry+https://github.com/rust-lang/crates.io-index"
124
+ checksum = "b5b646652bf6661599e1da8901b3b9522896f01e736bad5f723fe7a3a27f899d"
125
+
126
+ [[package]]
127
+ name = "memoffset"
128
+ version = "0.9.1"
129
+ source = "registry+https://github.com/rust-lang/crates.io-index"
130
+ checksum = "488016bfae457b036d996092f6cb448677611ce4449e970ceaf42695203f218a"
131
+ dependencies = [
132
+ "autocfg",
133
+ ]
134
+
135
+ [[package]]
136
+ name = "miniz_oxide"
137
+ version = "0.8.9"
138
+ source = "registry+https://github.com/rust-lang/crates.io-index"
139
+ checksum = "1fa76a2c86f704bdb222d66965fb3d63269ce38518b83cb0575fca855ebb6316"
140
+ dependencies = [
141
+ "adler2",
142
+ "simd-adler32",
143
+ ]
144
+
145
+ [[package]]
146
+ name = "once_cell"
147
+ version = "1.21.4"
148
+ source = "registry+https://github.com/rust-lang/crates.io-index"
149
+ checksum = "9f7c3e4beb33f85d45ae3e3a1792185706c8e16d043238c593331cc7cd313b50"
150
+
151
+ [[package]]
152
+ name = "portable-atomic"
153
+ version = "1.13.1"
154
+ source = "registry+https://github.com/rust-lang/crates.io-index"
155
+ checksum = "c33a9471896f1c69cecef8d20cbe2f7accd12527ce60845ff44c153bb2a21b49"
156
+
157
+ [[package]]
158
+ name = "proc-macro2"
159
+ version = "1.0.106"
160
+ source = "registry+https://github.com/rust-lang/crates.io-index"
161
+ checksum = "8fd00f0bb2e90d81d1044c2b32617f68fcb9fa3bb7640c23e9c748e53fb30934"
162
+ dependencies = [
163
+ "unicode-ident",
164
+ ]
165
+
166
+ [[package]]
167
+ name = "pyo3"
168
+ version = "0.22.6"
169
+ source = "registry+https://github.com/rust-lang/crates.io-index"
170
+ checksum = "f402062616ab18202ae8319da13fa4279883a2b8a9d9f83f20dbade813ce1884"
171
+ dependencies = [
172
+ "cfg-if",
173
+ "indoc",
174
+ "libc",
175
+ "memoffset",
176
+ "once_cell",
177
+ "portable-atomic",
178
+ "pyo3-build-config",
179
+ "pyo3-ffi",
180
+ "pyo3-macros",
181
+ "unindent",
182
+ ]
183
+
184
+ [[package]]
185
+ name = "pyo3-build-config"
186
+ version = "0.22.6"
187
+ source = "registry+https://github.com/rust-lang/crates.io-index"
188
+ checksum = "b14b5775b5ff446dd1056212d778012cbe8a0fbffd368029fd9e25b514479c38"
189
+ dependencies = [
190
+ "once_cell",
191
+ "target-lexicon",
192
+ ]
193
+
194
+ [[package]]
195
+ name = "pyo3-ffi"
196
+ version = "0.22.6"
197
+ source = "registry+https://github.com/rust-lang/crates.io-index"
198
+ checksum = "9ab5bcf04a2cdcbb50c7d6105de943f543f9ed92af55818fd17b660390fc8636"
199
+ dependencies = [
200
+ "libc",
201
+ "pyo3-build-config",
202
+ ]
203
+
204
+ [[package]]
205
+ name = "pyo3-macros"
206
+ version = "0.22.6"
207
+ source = "registry+https://github.com/rust-lang/crates.io-index"
208
+ checksum = "0fd24d897903a9e6d80b968368a34e1525aeb719d568dba8b3d4bfa5dc67d453"
209
+ dependencies = [
210
+ "proc-macro2",
211
+ "pyo3-macros-backend",
212
+ "quote",
213
+ "syn",
214
+ ]
215
+
216
+ [[package]]
217
+ name = "pyo3-macros-backend"
218
+ version = "0.22.6"
219
+ source = "registry+https://github.com/rust-lang/crates.io-index"
220
+ checksum = "36c011a03ba1e50152b4b394b479826cad97e7a21eb52df179cd91ac411cbfbe"
221
+ dependencies = [
222
+ "heck",
223
+ "proc-macro2",
224
+ "pyo3-build-config",
225
+ "quote",
226
+ "syn",
227
+ ]
228
+
229
+ [[package]]
230
+ name = "quote"
231
+ version = "1.0.45"
232
+ source = "registry+https://github.com/rust-lang/crates.io-index"
233
+ checksum = "41f2619966050689382d2b44f664f4bc593e129785a36d6ee376ddf37259b924"
234
+ dependencies = [
235
+ "proc-macro2",
236
+ ]
237
+
238
+ [[package]]
239
+ name = "rustversion"
240
+ version = "1.0.22"
241
+ source = "registry+https://github.com/rust-lang/crates.io-index"
242
+ checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d"
243
+
244
+ [[package]]
245
+ name = "simd-adler32"
246
+ version = "0.3.9"
247
+ source = "registry+https://github.com/rust-lang/crates.io-index"
248
+ checksum = "703d5c7ef118737c72f1af64ad2f6f8c5e1921f818cdcb97b8fe6fc69bf66214"
249
+
250
+ [[package]]
251
+ name = "syn"
252
+ version = "2.0.117"
253
+ source = "registry+https://github.com/rust-lang/crates.io-index"
254
+ checksum = "e665b8803e7b1d2a727f4023456bbbbe74da67099c585258af0ad9c5013b9b99"
255
+ dependencies = [
256
+ "proc-macro2",
257
+ "quote",
258
+ "unicode-ident",
259
+ ]
260
+
261
+ [[package]]
262
+ name = "target-lexicon"
263
+ version = "0.12.16"
264
+ source = "registry+https://github.com/rust-lang/crates.io-index"
265
+ checksum = "61c41af27dd6d1e27b1b16b489db798443478cef1f06a660c96db617ba5de3b1"
266
+
267
+ [[package]]
268
+ name = "thiserror"
269
+ version = "1.0.69"
270
+ source = "registry+https://github.com/rust-lang/crates.io-index"
271
+ checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52"
272
+ dependencies = [
273
+ "thiserror-impl",
274
+ ]
275
+
276
+ [[package]]
277
+ name = "thiserror-impl"
278
+ version = "1.0.69"
279
+ source = "registry+https://github.com/rust-lang/crates.io-index"
280
+ checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1"
281
+ dependencies = [
282
+ "proc-macro2",
283
+ "quote",
284
+ "syn",
285
+ ]
286
+
287
+ [[package]]
288
+ name = "unicode-ident"
289
+ version = "1.0.24"
290
+ source = "registry+https://github.com/rust-lang/crates.io-index"
291
+ checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75"
292
+
293
+ [[package]]
294
+ name = "unindent"
295
+ version = "0.2.4"
296
+ source = "registry+https://github.com/rust-lang/crates.io-index"
297
+ checksum = "7264e107f553ccae879d21fbea1d6724ac785e8c3bfc762137959b5802826ef3"
298
+
299
+ [[package]]
300
+ name = "uuid"
301
+ version = "1.23.0"
302
+ source = "registry+https://github.com/rust-lang/crates.io-index"
303
+ checksum = "5ac8b6f42ead25368cf5b098aeb3dc8a1a2c05a3eee8a9a1a68c640edbfc79d9"
304
+ dependencies = [
305
+ "js-sys",
306
+ "wasm-bindgen",
307
+ ]
308
+
309
+ [[package]]
310
+ name = "wasm-bindgen"
311
+ version = "0.2.115"
312
+ source = "registry+https://github.com/rust-lang/crates.io-index"
313
+ checksum = "6523d69017b7633e396a89c5efab138161ed5aafcbc8d3e5c5a42ae38f50495a"
314
+ dependencies = [
315
+ "cfg-if",
316
+ "once_cell",
317
+ "rustversion",
318
+ "wasm-bindgen-macro",
319
+ "wasm-bindgen-shared",
320
+ ]
321
+
322
+ [[package]]
323
+ name = "wasm-bindgen-macro"
324
+ version = "0.2.115"
325
+ source = "registry+https://github.com/rust-lang/crates.io-index"
326
+ checksum = "4e3a6c758eb2f701ed3d052ff5737f5bfe6614326ea7f3bbac7156192dc32e67"
327
+ dependencies = [
328
+ "quote",
329
+ "wasm-bindgen-macro-support",
330
+ ]
331
+
332
+ [[package]]
333
+ name = "wasm-bindgen-macro-support"
334
+ version = "0.2.115"
335
+ source = "registry+https://github.com/rust-lang/crates.io-index"
336
+ checksum = "921de2737904886b52bcbb237301552d05969a6f9c40d261eb0533c8b055fedf"
337
+ dependencies = [
338
+ "bumpalo",
339
+ "proc-macro2",
340
+ "quote",
341
+ "syn",
342
+ "wasm-bindgen-shared",
343
+ ]
344
+
345
+ [[package]]
346
+ name = "wasm-bindgen-shared"
347
+ version = "0.2.115"
348
+ source = "registry+https://github.com/rust-lang/crates.io-index"
349
+ checksum = "a93e946af942b58934c604527337bad9ae33ba1d5c6900bbb41c2c07c2364a93"
350
+ dependencies = [
351
+ "unicode-ident",
352
+ ]
@@ -0,0 +1,9 @@
1
+ [workspace]
2
+ members = ["crates/docpler-core", "crates/docpler-bindings"]
3
+ resolver = "2"
4
+
5
+ [profile.release]
6
+ opt-level = 3
7
+ strip = true
8
+ lto = true
9
+ codegen-units = 1
docpler-0.1.5/PKG-INFO ADDED
@@ -0,0 +1,62 @@
1
+ Metadata-Version: 2.4
2
+ Name: docpler
3
+ Version: 0.1.5
4
+ Requires-Dist: markitdown>=0.0.1 ; extra == 'markitdown'
5
+ Provides-Extra: markitdown
6
+ Summary: Document processing tool - converts HWP (and more) to Markdown
7
+ License: MIT
8
+ Requires-Python: >=3.9
9
+ Description-Content-Type: text/markdown; charset=UTF-8; variant=GFM
10
+
11
+ # docpler
12
+
13
+ HWP(한글 워드프로세서) 등 문서 파일을 Markdown으로 변환하는 Python 패키지.
14
+ Rust 코어 기반으로 빠르고 정확한 파싱을 제공합니다.
15
+
16
+ ## 지원 포맷
17
+
18
+ | 포맷 | 읽기 | 출력 |
19
+ |------|------|------|
20
+ | HWP 5.0 | ✅ | Markdown |
21
+
22
+ ## 설치
23
+
24
+ ```bash
25
+ pip install docpler
26
+ ```
27
+
28
+ ## 사용법
29
+
30
+ ### 기본 사용
31
+
32
+ ```python
33
+ from docpler.hwp import convert
34
+
35
+ markdown = convert("document.hwp")
36
+ print(markdown)
37
+ ```
38
+
39
+ ### markitdown 플러그인
40
+
41
+ ```python
42
+ from markitdown import MarkItDown
43
+
44
+ md = MarkItDown(enable_plugins=True)
45
+ result = md.convert("document.hwp")
46
+ print(result.text_content)
47
+ ```
48
+
49
+ ## 라이선스
50
+
51
+ 이 프로젝트는 MIT 라이선스로 제공되며, 현재 버전의 사용에 별도의 제약은 없습니다.
52
+
53
+ - **Python 래퍼 코드**: 오픈소스 (MIT)
54
+ - **Rust 코어 엔진**: 컴파일된 바이너리로 배포되며, 소스 코드는 비공개입니다.
55
+
56
+ ## HWP 포맷 관련 고지
57
+
58
+ 본 제품은 한글과컴퓨터의 한글 문서 파일(.hwp) 공개 문서를 참고하여 개발하였습니다.
59
+
60
+ HWP 파일 포맷 공개 문서의 저작권은 (주)한글과컴퓨터에 있으며,
61
+ 공개 문서의 전문은 [한글과컴퓨터 공식 페이지](https://www.hancom.com/etc/hwpDownload.do)에서 확인할 수 있습니다.
62
+
@@ -0,0 +1,51 @@
1
+ # docpler
2
+
3
+ HWP(한글 워드프로세서) 등 문서 파일을 Markdown으로 변환하는 Python 패키지.
4
+ Rust 코어 기반으로 빠르고 정확한 파싱을 제공합니다.
5
+
6
+ ## 지원 포맷
7
+
8
+ | 포맷 | 읽기 | 출력 |
9
+ |------|------|------|
10
+ | HWP 5.0 | ✅ | Markdown |
11
+
12
+ ## 설치
13
+
14
+ ```bash
15
+ pip install docpler
16
+ ```
17
+
18
+ ## 사용법
19
+
20
+ ### 기본 사용
21
+
22
+ ```python
23
+ from docpler.hwp import convert
24
+
25
+ markdown = convert("document.hwp")
26
+ print(markdown)
27
+ ```
28
+
29
+ ### markitdown 플러그인
30
+
31
+ ```python
32
+ from markitdown import MarkItDown
33
+
34
+ md = MarkItDown(enable_plugins=True)
35
+ result = md.convert("document.hwp")
36
+ print(result.text_content)
37
+ ```
38
+
39
+ ## 라이선스
40
+
41
+ 이 프로젝트는 MIT 라이선스로 제공되며, 현재 버전의 사용에 별도의 제약은 없습니다.
42
+
43
+ - **Python 래퍼 코드**: 오픈소스 (MIT)
44
+ - **Rust 코어 엔진**: 컴파일된 바이너리로 배포되며, 소스 코드는 비공개입니다.
45
+
46
+ ## HWP 포맷 관련 고지
47
+
48
+ 본 제품은 한글과컴퓨터의 한글 문서 파일(.hwp) 공개 문서를 참고하여 개발하였습니다.
49
+
50
+ HWP 파일 포맷 공개 문서의 저작권은 (주)한글과컴퓨터에 있으며,
51
+ 공개 문서의 전문은 [한글과컴퓨터 공식 페이지](https://www.hancom.com/etc/hwpDownload.do)에서 확인할 수 있습니다.
@@ -0,0 +1,13 @@
1
+ [package]
2
+ name = "docpler-bindings"
3
+ version = "0.1.0"
4
+ edition = "2021"
5
+ description = "Python (PyO3) bindings for docpler"
6
+
7
+ [lib]
8
+ name = "_docpler"
9
+ crate-type = ["cdylib", "rlib"]
10
+
11
+ [dependencies]
12
+ docpler-core = { path = "../docpler-core" }
13
+ pyo3 = { version = "0.22", features = ["extension-module"] }
@@ -0,0 +1,23 @@
1
+ use pyo3::prelude::*;
2
+
3
+ /// HWP 파일을 Markdown 문자열로 변환한다.
4
+ #[pyfunction]
5
+ fn hwp_to_markdown(path: &str) -> PyResult<String> {
6
+ let doc = docpler_core::hwp::parse(path)
7
+ .map_err(|e| pyo3::exceptions::PyRuntimeError::new_err(e.to_string()))?;
8
+ Ok(docpler_core::markdown::render(&doc))
9
+ }
10
+
11
+ /// 디버그용: 섹션 레코드의 (tag_id, level, size) 목록을 반환한다.
12
+ #[pyfunction]
13
+ fn hwp_debug_records(path: &str) -> PyResult<Vec<(u16, u16, usize)>> {
14
+ docpler_core::hwp::debug_records(path)
15
+ .map_err(|e| pyo3::exceptions::PyRuntimeError::new_err(e.to_string()))
16
+ }
17
+
18
+ #[pymodule]
19
+ fn _docpler(m: &Bound<'_, PyModule>) -> PyResult<()> { // initializer of the `_docpler` extension module
20
+ m.add_function(wrap_pyfunction!(hwp_to_markdown, m)?)?; // register hwp_to_markdown on the module
21
+ m.add_function(wrap_pyfunction!(hwp_debug_records, m)?)?; // register hwp_debug_records on the module
22
+ Ok(())
23
+ }
@@ -0,0 +1,10 @@
1
+ [package]
2
+ name = "docpler-core"
3
+ version = "0.1.0"
4
+ edition = "2021"
5
+ description = "Core document parsing library (HWP and more)"
6
+
7
+ [dependencies]
8
+ cfb = "0.10"
9
+ flate2 = "1.0"
10
+ thiserror = "1.0"
@@ -0,0 +1,21 @@
1
+ use thiserror::Error;
2
+
3
+ #[derive(Error, Debug)]
4
+ pub enum DocplerError { // every failure the core library reports; Display text comes from #[error]
5
+ #[error("IO error: {0}")]
6
+ Io(#[from] std::io::Error), // wraps std::io::Error; #[from] lets `?` convert it automatically
7
+
8
+ #[error("Invalid HWP signature")]
9
+ InvalidSignature, // the FileHeader stream does not start with the HWP signature bytes
10
+
11
+ #[error("Password-protected HWP files are not supported")]
12
+ EncryptedFile, // the FileHeader property flags mark the document as encrypted
13
+
14
+ #[error("Compound file error: {0}")]
15
+ Cfb(String), // message text of an error from the compound-file (cfb) layer
16
+
17
+ #[error("Parse error: {0}")]
18
+ Parse(String), // malformed or undecodable content; the String describes the problem
19
+ }
20
+
21
+ pub type Result<T> = std::result::Result<T, DocplerError>; // crate-wide Result alias fixed to DocplerError
@@ -0,0 +1,591 @@
1
+ use std::io::Read;
2
+
3
+ use cfb::CompoundFile;
4
+ use flate2::read::DeflateDecoder;
5
+
6
+ use crate::error::{DocplerError, Result};
7
+ use crate::reader::ByteReader;
8
+
9
+ // ─── HWP file signature ──────────────────────────────────────────────────────
10
+ const HWP_SIGNATURE: &[u8] = b"HWP Document File";
11
+
12
+ // ─── Record tag IDs (HWP 5.0 specification) ─────────────────────────────────
13
+ const TAG_PARA_TEXT: u16 = 0x43; // paragraph text
14
+ const TAG_CTRL_HEADER: u16 = 0x47; // control header
15
+ const TAG_LIST_HEADER: u16 = 0x48; // list header (table cell)
16
+ const TAG_TABLE: u16 = 0x4D; // table information
17
+ const TAG_EQEDIT: u16 = 0x58; // equation information
18
+
19
+ // ─── Control type IDs ────────────────────────────────────────────────────────
20
+ // Java: CtrlID.make(a,b,c,d) = (a<<24)|(b<<16)|(c<<8)|d → stored little-endian in the file
21
+ const CTRL_TABLE: u32 = 0x74626C20; // make('t','b','l',' ')
22
+ const CTRL_EQUATION: u32 = 0x65716564; // make('e','q','e','d')
23
+ const CTRL_GSO: u32 = 0x67736F20; // make('g','s','o',' ') - drawing object (including text boxes)
24
+
25
+ // ─── Drawing-object tags ─────────────────────────────────────────────────────
26
+ const TAG_SHAPE_COMPONENT: u16 = 0x4C; // common object properties
27
+
28
+ // ─── 문서 모델 ───────────────────────────────────────────────────────────────
29
+
30
+ pub struct Document { // parsed document: the blocks extracted from every body section, in order
31
+ pub blocks: Vec<Block>,
32
+ }
33
+
34
+ pub enum Block {
35
+ Paragraph(String), // text of one paragraph (also used for text-box contents)
36
+ Table(Vec<Vec<String>>), // rows of cell text; rows may have different lengths
37
+ /// HWP EQN script (the Hangul equation editor format)
38
+ Equation(String),
39
+ }
40
+
41
+ // ─── 내부 레코드 ─────────────────────────────────────────────────────────────
42
+
43
+ struct Record {
44
+ tag_id: u16, // record header bits 0-9
45
+ /// HWP record header bits[10-19]: tree depth
46
+ level: u16,
47
+ data: Vec<u8>, // record payload copied out of the section stream
48
+ }
49
+
50
+ // ─── 공개 API ────────────────────────────────────────────────────────────────
51
+
52
+ /// 디버그용: Section0의 레코드 목록을 (tag_id, level, size)로 반환한다.
53
+ pub fn debug_records(path: &str) -> Result<Vec<(u16, u16, usize)>> {
54
+ let file = std::fs::File::open(path)?;
55
+ let mut cfb = CompoundFile::open(file)
56
+ .map_err(|e| DocplerError::Cfb(e.to_string()))?;
57
+ let compressed = read_file_header(&mut cfb)?;
58
+ let data = read_stream(&mut cfb, "/BodyText/Section0", compressed)?;
59
+ let records = parse_records(&data);
60
+ Ok(records
61
+ .iter()
62
+ .map(|r| (r.tag_id, r.level, r.data.len()))
63
+ .collect())
64
+ }
65
+
66
+ pub fn parse(path: &str) -> Result<Document> {
67
+ let file = std::fs::File::open(path)?;
68
+ let mut cfb = CompoundFile::open(file)
69
+ .map_err(|e| DocplerError::Cfb(e.to_string()))?;
70
+
71
+ let compressed = read_file_header(&mut cfb)?;
72
+
73
+ let mut blocks = Vec::new();
74
+ for section_idx in 0.. {
75
+ let stream_path = format!("/BodyText/Section{}", section_idx);
76
+ let data = match read_stream(&mut cfb, &stream_path, compressed) {
77
+ Ok(d) => d,
78
+ Err(_) => break,
79
+ };
80
+ let records = parse_records(&data);
81
+ blocks.extend(process_records(&records));
82
+ }
83
+
84
+ Ok(Document { blocks })
85
+ }
86
+
87
+ // ─── FileHeader 파싱 ─────────────────────────────────────────────────────────
88
+
89
+ fn read_file_header(cfb: &mut CompoundFile<std::fs::File>) -> Result<bool> {
90
+ let mut data = Vec::new();
91
+ cfb.open_stream("/FileHeader")
92
+ .map_err(|e| DocplerError::Cfb(e.to_string()))?
93
+ .read_to_end(&mut data)?;
94
+
95
+ if !data.starts_with(HWP_SIGNATURE) {
96
+ return Err(DocplerError::InvalidSignature);
97
+ }
98
+ if data.len() < 40 {
99
+ return Err(DocplerError::Parse("FileHeader too short".into()));
100
+ }
101
+
102
+ // offset 36: 속성 플래그 DWORD bit0=압축, bit1=암호화
103
+ let props = u32::from_le_bytes(data[36..40].try_into().unwrap());
104
+ let compressed = (props & 0x01) != 0;
105
+ let encrypted = (props & 0x02) != 0;
106
+
107
+ if encrypted {
108
+ return Err(DocplerError::EncryptedFile);
109
+ }
110
+
111
+ Ok(compressed)
112
+ }
113
+
114
+ // ─── 스트림 읽기 ─────────────────────────────────────────────────────────────
115
+
116
+ fn read_stream(
117
+ cfb: &mut CompoundFile<std::fs::File>,
118
+ path: &str,
119
+ compressed: bool,
120
+ ) -> Result<Vec<u8>> {
121
+ let mut raw = Vec::new();
122
+ cfb.open_stream(path)
123
+ .map_err(|e| DocplerError::Cfb(e.to_string()))?
124
+ .read_to_end(&mut raw)?;
125
+
126
+ if compressed {
127
+ // HWP는 raw deflate 압축 (zlib 헤더 없음, Java Inflater nowrap=true 동일)
128
+ let mut decoder = DeflateDecoder::new(raw.as_slice());
129
+ let mut out = Vec::new();
130
+ decoder
131
+ .read_to_end(&mut out)
132
+ .map_err(|e| DocplerError::Parse(format!("압축 해제 실패: {}", e)))?;
133
+ Ok(out)
134
+ } else {
135
+ Ok(raw)
136
+ }
137
+ }
138
+
139
+ // ─── 레코드 파싱 ─────────────────────────────────────────────────────────────
140
+
141
+ /// 섹션 스트림을 Record 목록으로 변환.
142
+ ///
143
+ /// 레코드 헤더 형식 (4바이트 DWORD):
144
+ /// bits 0- 9 : TagID (10비트)
145
+ /// bits 10-19 : Level (10비트)
146
+ /// bits 20-31 : Size (12비트, 0xFFF이면 다음 DWORD가 실제 크기)
147
+ fn parse_records(data: &[u8]) -> Vec<Record> {
148
+ let mut reader = ByteReader::new(data);
149
+ let mut records = Vec::new();
150
+
151
+ while reader.remaining() >= 4 {
152
+ let header = reader.read_u32();
153
+ let tag_id = (header & 0x3FF) as u16;
154
+ let level = ((header >> 10) & 0x3FF) as u16;
155
+ let mut size = (header >> 20) as usize;
156
+
157
+ if size == 0xFFF {
158
+ if reader.remaining() < 4 {
159
+ break;
160
+ }
161
+ size = reader.read_u32() as usize;
162
+ }
163
+
164
+ if reader.remaining() < size {
165
+ break;
166
+ }
167
+
168
+ let record_data = reader.read_bytes(size).to_vec();
169
+ records.push(Record { tag_id, level, data: record_data });
170
+ }
171
+
172
+ records
173
+ }
174
+
175
+ // ─── 레코드 처리 ─────────────────────────────────────────────────────────────
176
+
177
+ /// 레코드 목록을 Block 목록으로 변환.
178
+ ///
179
+ /// HWP 5.0 레코드 레벨 구조 (섹션 기준):
180
+ /// level 0 : PARA_HEADER, PARA_TEXT, PARA_CHAR_SHAPE 등 (최상위 문단)
181
+ /// level 1 : CTRL_HEADER, CTRL_DATA, TABLE, LIST_HEADER(셀) 등
182
+ /// level 2 : 셀 내부 문단 텍스트
183
+ fn process_records(records: &[Record]) -> Vec<Block> {
184
+ let mut blocks = Vec::new();
185
+ let mut i = 0;
186
+
187
+ while i < records.len() {
188
+ let rec = &records[i];
189
+
190
+ match rec.tag_id {
191
+ TAG_PARA_TEXT if rec.level == 1 => {
192
+ let text = extract_para_text(&rec.data);
193
+ if !text.trim().is_empty() {
194
+ blocks.push(Block::Paragraph(text));
195
+ }
196
+ i += 1;
197
+ }
198
+ TAG_CTRL_HEADER => {
199
+ let ctrl_level = rec.level;
200
+ let ctrl_id = first_u32(&rec.data);
201
+ i += 1;
202
+
203
+ if ctrl_id == CTRL_TABLE {
204
+ let (table_opt, consumed) = parse_table_block(records, i, ctrl_level);
205
+ if let Some(table) = table_opt {
206
+ blocks.push(Block::Table(table));
207
+ }
208
+ i += consumed;
209
+ } else if ctrl_id == CTRL_EQUATION {
210
+ let (eq_opt, consumed) = parse_equation_block(records, i, ctrl_level);
211
+ if let Some(script) = eq_opt {
212
+ blocks.push(Block::Equation(script));
213
+ }
214
+ i += consumed;
215
+ } else if ctrl_id == CTRL_GSO {
216
+ let (text_opt, consumed) = parse_gso_block(records, i, ctrl_level);
217
+ if let Some(text) = text_opt {
218
+ blocks.push(Block::Paragraph(text));
219
+ }
220
+ i += consumed;
221
+ } else {
222
+ // 이 컨트롤에 속한 레코드(level > ctrl_level) 모두 건너뜀
223
+ while i < records.len() && records[i].level > ctrl_level {
224
+ i += 1;
225
+ }
226
+ }
227
+ }
228
+ _ => {
229
+ i += 1;
230
+ }
231
+ }
232
+ }
233
+
234
+ blocks
235
+ }
236
+
237
+ // ─── 표 파싱 ─────────────────────────────────────────────────────────────────
238
+
239
+ /// CTRL_HEADER(표) 이후 레코드들에서 표 데이터를 추출.
240
+ ///
241
+ /// 실제 레벨 구조 (ctrl_level=1 기준):
242
+ /// level 2 (=ctrl_level+1): TABLE, LIST_HEADER(셀), PARA_HEADER(셀 문단)
243
+ /// level 3 (=ctrl_level+2): 셀 내부 PARA_TEXT
244
+ ///
245
+ /// TABLE 레코드 데이터 레이아웃:
246
+ /// props(4) + rowCount(2) + colCount(2) + cellSpacing(2) + margins(4×2=8)
247
+ /// + cellCountOfRow[rowCount](2 each) → cellCountOfRow 시작 offset = 18
248
+ fn parse_table_block(
249
+ records: &[Record],
250
+ start: usize,
251
+ ctrl_level: u16,
252
+ ) -> (Option<Vec<Vec<String>>>, usize) {
253
+ // TABLE과 셀 LIST_HEADER는 ctrl_level+1에 위치
254
+ let table_level = ctrl_level + 1;
255
+ let mut i = start;
256
+
257
+ // TABLE 레코드를 찾을 때까지 CTRL_DATA·캡션 LIST_HEADER를 건너뜀
258
+ while i < records.len() {
259
+ let rec = &records[i];
260
+ if rec.level < table_level {
261
+ return (None, i - start);
262
+ }
263
+ if rec.tag_id == TAG_TABLE && rec.level == table_level {
264
+ break;
265
+ }
266
+ if rec.tag_id == TAG_LIST_HEADER && rec.level == table_level {
267
+ // 캡션 LIST_HEADER → 자식(level > table_level)까지 건너뜀
268
+ i += 1;
269
+ while i < records.len() && records[i].level > table_level {
270
+ i += 1;
271
+ }
272
+ } else {
273
+ i += 1;
274
+ }
275
+ }
276
+
277
+ if i >= records.len() || records[i].tag_id != TAG_TABLE {
278
+ return (None, i - start);
279
+ }
280
+
281
+ // TABLE 레코드 데이터 파싱
282
+ let tdata = &records[i].data;
283
+ if tdata.len() < 18 {
284
+ return (None, i - start + 1);
285
+ }
286
+
287
+ let row_count = u16::from_le_bytes([tdata[4], tdata[5]]) as usize;
288
+ let mut cell_counts: Vec<usize> = Vec::with_capacity(row_count);
289
+ for j in 0..row_count {
290
+ let off = 18 + j * 2;
291
+ if off + 2 <= tdata.len() {
292
+ cell_counts.push(u16::from_le_bytes([tdata[off], tdata[off + 1]]) as usize);
293
+ } else {
294
+ cell_counts.push(0);
295
+ }
296
+ }
297
+
298
+ i += 1; // TABLE 소비
299
+
300
+ // 셀 파싱:
301
+ // 각 셀 = LIST_HEADER(table_level) 다음에 오는 레코드들
302
+ // 다음 LIST_HEADER(table_level) 또는 level < table_level이 나올 때까지가 이 셀의 내용
303
+ // (PARA_HEADER도 table_level이므로 TAG로만 구분)
304
+ let mut rows: Vec<Vec<String>> = Vec::new();
305
+
306
+ for row_idx in 0..row_count {
307
+ let col_count = cell_counts.get(row_idx).copied().unwrap_or(0);
308
+ let mut row: Vec<String> = Vec::with_capacity(col_count);
309
+
310
+ for _ in 0..col_count {
311
+ if i < records.len()
312
+ && records[i].tag_id == TAG_LIST_HEADER
313
+ && records[i].level == table_level
314
+ {
315
+ i += 1; // LIST_HEADER 소비
316
+
317
+ let mut cell_text = String::new();
318
+ // 다음 LIST_HEADER(table_level) 또는 table_level 미만까지가 이 셀
319
+ while i < records.len() {
320
+ let rec = &records[i];
321
+ if rec.level < table_level {
322
+ break;
323
+ }
324
+ if rec.tag_id == TAG_LIST_HEADER && rec.level == table_level {
325
+ break; // 다음 셀 시작
326
+ }
327
+ if rec.tag_id == TAG_PARA_TEXT {
328
+ let t = extract_para_text(&rec.data);
329
+ let t = t.trim().to_string();
330
+ if !t.is_empty() {
331
+ if !cell_text.is_empty() {
332
+ cell_text.push(' ');
333
+ }
334
+ cell_text.push_str(&t);
335
+ }
336
+ }
337
+ i += 1;
338
+ }
339
+ row.push(cell_text);
340
+ } else {
341
+ row.push(String::new());
342
+ }
343
+ }
344
+
345
+ rows.push(row);
346
+ }
347
+
348
+ // 이 표 컨트롤의 나머지 레코드 건너뜀
349
+ while i < records.len() && records[i].level >= table_level {
350
+ i += 1;
351
+ }
352
+
353
+ if rows.is_empty() {
354
+ (None, i - start)
355
+ } else {
356
+ (Some(rows), i - start)
357
+ }
358
+ }
359
+
360
+ // ─── 수식 파싱 ───────────────────────────────────────────────────────────────
361
+
362
+ /// CTRL_HEADER(수식) 이후 레코드들에서 HWP EQN 스크립트를 추출.
363
+ ///
364
+ /// 구조 (ctrl_level=1 기준):
365
+ /// [LIST_HEADER: level 2, 캡션 (선택)]
366
+ /// EQEDIT: level 2 → 수식 스크립트 포함
367
+ ///
368
+ /// EQEDIT 데이터 레이아웃:
369
+ /// offset 0 (4바이트): property
370
+ /// offset 4 (2바이트): script wchar 길이
371
+ /// offset 6 (length×2 바이트): HWP EQN 스크립트 (UTF-16LE)
372
+ fn parse_equation_block(
373
+ records: &[Record],
374
+ start: usize,
375
+ ctrl_level: u16,
376
+ ) -> (Option<String>, usize) {
377
+ let child_level = ctrl_level + 1;
378
+ let mut i = start;
379
+
380
+ // EQEDIT 레코드 탐색 (캡션 LIST_HEADER 건너뜀)
381
+ while i < records.len() {
382
+ let rec = &records[i];
383
+ if rec.level < child_level {
384
+ return (None, i - start);
385
+ }
386
+ if rec.tag_id == TAG_EQEDIT && rec.level == child_level {
387
+ break;
388
+ }
389
+ i += 1;
390
+ }
391
+
392
+ if i >= records.len() || records[i].tag_id != TAG_EQEDIT {
393
+ return (None, i - start);
394
+ }
395
+
396
+ let script = extract_eqedit_script(&records[i].data);
397
+ i += 1;
398
+
399
+ // 나머지 컨트롤 레코드 건너뜀
400
+ while i < records.len() && records[i].level > ctrl_level {
401
+ i += 1;
402
+ }
403
+
404
+ (script, i - start)
405
+ }
406
+
407
/// Extract the HWP EQN script from an EQEDIT record payload.
///
/// Payload layout: property (4 bytes), script length in UTF-16 code units
/// (2 bytes, little-endian), then the script as UTF-16LE.
///
/// Returns `None` when the payload is shorter than 6 bytes, the declared
/// length is zero, the declared script runs past the end of the payload, or
/// no decodable character remains.
///
/// Unpaired surrogates are skipped instead of rejecting the whole script,
/// which matches how `extract_para_text` treats invalid code units.
fn extract_eqedit_script(data: &[u8]) -> Option<String> {
    const SCRIPT_OFFSET: usize = 6;

    // Bytes 4..6 hold the length; bytes 0..4 (property) are not needed.
    let len_field = data.get(4..SCRIPT_OFFSET)?;
    let len_wchars = u16::from_le_bytes([len_field[0], len_field[1]]) as usize;
    if len_wchars == 0 {
        return None;
    }

    let raw = data.get(SCRIPT_OFFSET..SCRIPT_OFFSET + len_wchars * 2)?;
    let units = raw
        .chunks_exact(2)
        .map(|pair| u16::from_le_bytes([pair[0], pair[1]]));

    // Lenient UTF-16LE → String: keep every valid scalar, drop lone surrogates.
    let script: String = char::decode_utf16(units)
        .filter_map(|unit| unit.ok())
        .collect();

    if script.is_empty() {
        None
    } else {
        Some(script)
    }
}
429
+
430
+ // ─── 글상자(GSO) 파싱 ───────────────────────────────────────────────────────
431
+
432
+ /// CTRL_HEADER(gso) 이후 레코드들에서 글상자 텍스트를 추출.
433
+ ///
434
+ /// 구조 (ctrl_level=1 기준):
435
+ /// [LIST_HEADER: level 2, 캡션 (선택)]
436
+ /// [CTRL_DATA: level 2 (선택)]
437
+ /// SHAPE_COMPONENT: level 2 → 첫 4바이트 = gsoId
438
+ /// [CTRL_DATA: level 2 (선택)]
439
+ /// LIST_HEADER: level 2 → 글상자 본문 (텍스트 박스)
440
+ /// PARA_HEADER: level 2
441
+ /// PARA_TEXT: level 3
442
+ /// SHAPE_COMPONENT_RECTANGLE: level 2
443
+ ///
444
+ /// 전략: SHAPE_COMPONENT를 찾은 뒤, 이후 첫 LIST_HEADER의 하위 PARA_TEXT를 수집.
445
+ fn parse_gso_block(
446
+ records: &[Record],
447
+ start: usize,
448
+ ctrl_level: u16,
449
+ ) -> (Option<String>, usize) {
450
+ let child_level = ctrl_level + 1;
451
+ let mut i = start;
452
+
453
+ // SHAPE_COMPONENT 찾기 (캡션 LIST_HEADER, CTRL_DATA 건너뜀)
454
+ while i < records.len() {
455
+ let rec = &records[i];
456
+ if rec.level <= ctrl_level {
457
+ return (None, i - start);
458
+ }
459
+ if rec.tag_id == TAG_SHAPE_COMPONENT && rec.level == child_level {
460
+ break;
461
+ }
462
+ i += 1;
463
+ }
464
+
465
+ if i >= records.len() {
466
+ return (None, i - start);
467
+ }
468
+ i += 1; // SHAPE_COMPONENT 소비
469
+
470
+ // SHAPE_COMPONENT 이후 첫 LIST_HEADER(글상자 본문) 찾기
471
+ let mut found_list = false;
472
+ while i < records.len() {
473
+ let rec = &records[i];
474
+ if rec.level <= ctrl_level {
475
+ break;
476
+ }
477
+ if rec.tag_id == TAG_LIST_HEADER && rec.level == child_level {
478
+ found_list = true;
479
+ i += 1; // LIST_HEADER 소비
480
+ break;
481
+ }
482
+ i += 1;
483
+ }
484
+
485
+ if !found_list {
486
+ // GSO 블록 나머지 건너뜀
487
+ while i < records.len() && records[i].level > ctrl_level {
488
+ i += 1;
489
+ }
490
+ return (None, i - start);
491
+ }
492
+
493
+ // LIST_HEADER 이후 PARA_TEXT 수집.
494
+ // PARA_HEADER는 child_level(=2)에, PARA_TEXT는 child_level+1(=3)에 위치.
495
+ // SHAPE_COMPONENT_RECTANGLE(child_level)이 나오면 본문 종료.
496
+ let mut texts: Vec<String> = Vec::new();
497
+ while i < records.len() && records[i].level > ctrl_level {
498
+ let rec = &records[i];
499
+ if rec.tag_id == TAG_PARA_TEXT {
500
+ let t = extract_para_text(&rec.data);
501
+ let t = t.trim().to_string();
502
+ if !t.is_empty() {
503
+ texts.push(t);
504
+ }
505
+ }
506
+ i += 1;
507
+ }
508
+
509
+ // GSO 블록 나머지 건너뜀 (혹시 남은 경우)
510
+ while i < records.len() && records[i].level > ctrl_level {
511
+ i += 1;
512
+ }
513
+
514
+ if texts.is_empty() {
515
+ (None, i - start)
516
+ } else {
517
+ (Some(texts.join("\n")), i - start)
518
+ }
519
+ }
520
+
521
+ // ─── 텍스트 추출 ─────────────────────────────────────────────────────────────
522
+
523
/// Read the first four bytes of `data` as a little-endian `u32`.
///
/// Returns 0 when `data` holds fewer than four bytes.
fn first_u32(data: &[u8]) -> u32 {
    match data.get(..4) {
        Some(prefix) => u32::from_le_bytes([prefix[0], prefix[1], prefix[2], prefix[3]]),
        None => 0,
    }
}
531
+
532
/// Extract the visible text from a PARA_TEXT record payload.
///
/// The payload is a sequence of UTF-16LE code units. Codes above 31 are
/// ordinary characters (surrogate pairs are combined; unpaired surrogates are
/// skipped). Codes 0–31 are HWP control codes:
///
///   1,2,3,11,12,14–18,21–23 → extended control: 16 bytes in total, skipped
///   4–8,19,20               → inline control:   16 bytes in total, skipped
///   9                       → tab (inline, 16 bytes), emitted as '\t'
///   10                      → line break, emitted as '\n'
///   24                      → hyphen, emitted as '-'
///   30, 31                  → non-breaking / fixed-width blank, emitted as ' '
///   0, 13, 25–29            → 2-byte control with no visible text, skipped
///
/// Emitting the whitespace-like controls keeps neighbouring words apart;
/// dropping them fused text such as "A<tab>B" into "AB".
///
/// A 16-byte control that is cut off by the end of the payload ends the
/// extraction; the text gathered so far is returned.
pub fn extract_para_text(data: &[u8]) -> String {
    // code(2) + payload(12) + repeated code(2)
    const WIDE_CONTROL_LEN: usize = 16;

    let mut text = String::new();
    let mut i = 0;

    while i + 2 <= data.len() {
        let code = u16::from_le_bytes([data[i], data[i + 1]]) as u32;

        match code {
            // ── 2-byte controls that stand for visible separators ──
            10 => {
                text.push('\n');
                i += 2;
            }
            24 => {
                text.push('-');
                i += 2;
            }
            30 | 31 => {
                text.push(' ');
                i += 2;
            }
            // ── 2-byte controls without visible text (13 = paragraph end) ──
            0 | 13 | 25..=29 => {
                i += 2;
            }
            // ── tab: an inline control, so its 16 bytes must all be present ──
            9 => {
                if i + WIDE_CONTROL_LEN > data.len() {
                    break;
                }
                text.push('\t');
                i += WIDE_CONTROL_LEN;
            }
            // ── remaining extended / inline controls: skip all 16 bytes ──
            1..=8 | 11 | 12 | 14..=23 => {
                if i + WIDE_CONTROL_LEN > data.len() {
                    break;
                }
                i += WIDE_CONTROL_LEN;
            }
            // ── high surrogate: combine with a following low surrogate ──
            0xD800..=0xDBFF => {
                let mut advance = 2;
                if i + 4 <= data.len() {
                    let low = u16::from_le_bytes([data[i + 2], data[i + 3]]) as u32;
                    if (0xDC00..0xE000).contains(&low) {
                        let cp = 0x10000 + ((code - 0xD800) << 10) + (low - 0xDC00);
                        if let Some(ch) = char::from_u32(cp) {
                            text.push(ch);
                        }
                        advance = 4;
                    }
                }
                // An unpaired high surrogate is dropped; the unit after it is
                // re-examined on the next iteration.
                i += advance;
            }
            // ── isolated low surrogate: skip ──
            0xDC00..=0xDFFF => {
                i += 2;
            }
            // ── ordinary BMP character ──
            _ => {
                if let Some(ch) = char::from_u32(code) {
                    text.push(ch);
                }
                i += 2;
            }
        }
    }

    text
}
@@ -0,0 +1,4 @@
1
+ pub mod error;
2
+ pub mod hwp;
3
+ pub mod markdown;
4
+ pub mod reader;
@@ -0,0 +1,70 @@
1
+ use crate::hwp::{Block, Document};
2
+
3
+ pub fn render(doc: &Document) -> String {
4
+ let mut out = String::new();
5
+
6
+ for block in &doc.blocks {
7
+ match block {
8
+ Block::Paragraph(text) => {
9
+ let text = text.trim();
10
+ if !text.is_empty() {
11
+ out.push_str(text);
12
+ out.push_str("\n\n");
13
+ }
14
+ }
15
+ Block::Table(rows) => {
16
+ if !rows.is_empty() {
17
+ render_table(&mut out, rows);
18
+ out.push('\n');
19
+ }
20
+ }
21
+ Block::Equation(script) => {
22
+ // HWP EQN 포맷 스크립트를 코드 블록으로 출력.
23
+ // TODO: HWP EQN → LaTeX 변환
24
+ out.push_str("$$\n");
25
+ out.push_str(script.trim());
26
+ out.push_str("\n$$\n\n");
27
+ }
28
+ }
29
+ }
30
+
31
+ out.trim_end().to_string()
32
+ }
33
+
34
/// Appends `rows` to `out` as a Markdown pipe table.
///
/// The first row becomes the header and is followed by a `---` separator row.
/// Every row is padded with empty cells up to the width of the widest row.
/// Inside cells, `|` is escaped as `\|` and newlines are replaced by spaces.
/// Nothing is written when there are no rows or all rows are empty.
fn render_table(out: &mut String, rows: &[Vec<String>]) {
    let width = rows.iter().fold(0, |widest, row| widest.max(row.len()));
    if width == 0 {
        return;
    }

    // Formats one table line, including the trailing newline.
    let line_for = |row: &Vec<String>| -> String {
        let mut line = String::from("|");
        for idx in 0..width {
            if let Some(text) = row.get(idx) {
                line.push_str(&text.replace('|', "\\|").replace('\n', " "));
            }
            line.push('|');
        }
        line.push('\n');
        line
    };

    for (idx, row) in rows.iter().enumerate() {
        out.push_str(&line_for(row));
        if idx == 0 {
            // Separator between the header row and the data rows.
            out.push('|');
            out.push_str(&"---|".repeat(width));
            out.push('\n');
        }
    }
}
@@ -0,0 +1,37 @@
1
/// Cursor over a byte slice that reads little-endian integers and raw byte
/// runs. Used for parsing HWP record streams.
///
/// Reads never panic: when the slice runs out, the cursor is moved to the end
/// and a fallback value (zero / a shorter slice) is returned.
pub struct ByteReader<'a> {
    // Underlying buffer being read.
    data: &'a [u8],
    // Offset of the next unread byte; never exceeds `data.len()`.
    pos: usize,
}

impl<'a> ByteReader<'a> {
    /// Creates a reader positioned at the start of `data`.
    pub fn new(data: &'a [u8]) -> Self {
        Self { data, pos: 0 }
    }

    /// Number of bytes that have not been consumed yet.
    pub fn remaining(&self) -> usize {
        self.data.len().saturating_sub(self.pos)
    }

    /// Reads a little-endian `u32` and advances by four bytes.
    ///
    /// If fewer than four bytes remain, the cursor moves to the end of the
    /// data and `0` is returned.
    pub fn read_u32(&mut self) -> u32 {
        // `read_bytes` already clamps to the end of the buffer, so a short
        // read consumes whatever is left and falls through to the `0` case.
        match <[u8; 4]>::try_from(self.read_bytes(4)) {
            Ok(raw) => u32::from_le_bytes(raw),
            Err(_) => 0,
        }
    }

    /// Reads `n` bytes and returns them as a slice of the underlying data.
    ///
    /// When fewer than `n` bytes remain, only the available bytes are
    /// returned and the cursor moves to the end of the data.
    pub fn read_bytes(&mut self, n: usize) -> &'a [u8] {
        let start = self.pos;
        // `n` typically comes from an untrusted length field; a saturating
        // add keeps an oversized value from overflowing `usize` (which would
        // panic in debug builds or produce an inverted range in release).
        let end = start.saturating_add(n).min(self.data.len());
        self.pos = end;
        &self.data[start..end]
    }
}
@@ -0,0 +1,25 @@
1
+ [build-system]
2
+ requires = ["maturin>=1.7,<2.0"]
3
+ build-backend = "maturin"
4
+
5
+ [project]
6
+ name = "docpler"
7
+ version = "0.1.5"
8
+ description = "Document processing tool - converts HWP (and more) to Markdown"
9
+ requires-python = ">=3.9"
10
+ license = { text = "MIT" }
11
+ readme = "PYPI_README.md"
12
+ dependencies = []
13
+
14
+ [project.optional-dependencies]
15
+ markitdown = ["markitdown>=0.0.1"]
16
+
17
+ # markitdown 플러그인으로 자동 등록
18
+ [project.entry-points."markitdown.plugin"]
19
+ docpler = "docpler.hwp:HwpConverterPlugin"
20
+
21
+ [tool.maturin]
22
+ manifest-path = "crates/docpler-bindings/Cargo.toml"
23
+ python-source = "python"
24
+ module-name = "docpler._docpler"
25
+ features = ["pyo3/extension-module"]
@@ -0,0 +1,4 @@
1
+ """
2
+ docpler - Document processing tool.
3
+ Converts HWP (and more formats) to Markdown.
4
+ """
@@ -0,0 +1,57 @@
1
+ """
2
+ HWP (한글 워드프로세서) → Markdown 변환기.
3
+ markitdown 플러그인 및 독립 사용 모두 지원.
4
+ """
5
+ from pathlib import Path
6
+ from typing import Any, BinaryIO, Optional
7
+
8
+
9
# File extensions (lower-case, including the leading dot) the converter accepts.
ACCEPTED_EXTENSIONS = frozenset((".hwp",))

# MIME types the converter accepts.
ACCEPTED_MIME_TYPES = frozenset(
    (
        "application/x-hwp",
        "application/haansofthwp",
        "application/vnd.hancom.hwp",
    )
)
15
+
16
+
17
def convert(path: "str | Path") -> str:
    """Convert an HWP file to a Markdown string.

    Args:
        path: Location of the HWP file. It is passed to the native
            extension as ``str(path)``.

    Returns:
        The Markdown text produced by the native ``hwp_to_markdown`` function.
    """
    # The annotation above is quoted on purpose: an unquoted ``str | Path`` is
    # evaluated when the function is defined and raises TypeError on
    # Python 3.9, which the package declares as supported.
    # The compiled extension is imported on first use rather than at module
    # import time.
    from ._docpler import hwp_to_markdown

    return hwp_to_markdown(str(path))
21
+
22
+
23
class HwpConverter:
    """markitdown converter that turns HWP files into Markdown."""

    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: Any,
        **kwargs,
    ) -> bool:
        """Return True when the stream looks like an HWP document.

        The decision is based only on ``stream_info.extension`` and
        ``stream_info.mimetype``; ``file_stream`` is not read. Both values are
        compared case-insensitively, and missing attributes count as empty.
        """
        ext = (getattr(stream_info, "extension", None) or "").lower()
        # MIME types are case-insensitive, so normalise before the lookup.
        mime = (getattr(stream_info, "mimetype", None) or "").strip().lower()
        return ext in ACCEPTED_EXTENSIONS or mime in ACCEPTED_MIME_TYPES

    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: Any,
        **kwargs,
    ):
        """Convert the document to a ``DocumentConverterResult``.

        When ``stream_info.local_path`` is set, that file is converted
        directly. Otherwise the contents of ``file_stream`` are copied to a
        temporary ``.hwp`` file (the native converter takes a path), converted,
        and the temporary file is removed. Returns ``None`` only when neither
        a local path nor a stream is available.
        """
        from markitdown import DocumentConverterResult

        local_path = getattr(stream_info, "local_path", None)
        if local_path:
            return DocumentConverterResult(markdown=convert(local_path))

        if file_stream is None:
            return None

        import os
        import shutil
        import tempfile

        # mkstemp + explicit unlink (instead of NamedTemporaryFile) so the
        # file is closed before the native code opens it by path.
        fd, tmp_path = tempfile.mkstemp(suffix=".hwp")
        try:
            with os.fdopen(fd, "wb") as tmp_file:
                shutil.copyfileobj(file_stream, tmp_file)
            markdown_text = convert(tmp_path)
        finally:
            os.unlink(tmp_path)

        return DocumentConverterResult(markdown=markdown_text)
50
+
51
+
52
class HwpConverterPlugin:
    """Entry point object for the markitdown plugin mechanism."""

    @staticmethod
    def register_converters(markitdown_instance, **kwargs):
        """Register a new ``HwpConverter`` on ``markitdown_instance``.

        Extra keyword arguments are accepted and ignored.
        """
        hwp_converter = HwpConverter()
        markitdown_instance.register_converter(hwp_converter)