kordoc 2.3.0 → 2.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +8 -1
- package/dist/{chunk-OEJJPCMM.js → chunk-KSBPABBQ.js} +117 -9
- package/dist/chunk-KSBPABBQ.js.map +1 -0
- package/dist/{chunk-Z7UPTVMX.js → chunk-VJPDY4YT.js} +2 -2
- package/dist/{chunk-Z7UPTVMX.js.map → chunk-VJPDY4YT.js.map} +1 -1
- package/dist/{chunk-ZNJPRRIA.cjs → chunk-VLSATRNQ.cjs} +2 -2
- package/dist/{chunk-ZNJPRRIA.cjs.map → chunk-VLSATRNQ.cjs.map} +1 -1
- package/dist/{chunk-JFTFC2BB.js → chunk-XG5CQUSC.js} +2 -2
- package/dist/{chunk-JFTFC2BB.js.map → chunk-XG5CQUSC.js.map} +1 -1
- package/dist/cli.js +4 -4
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +239 -131
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +2 -0
- package/dist/index.d.ts +2 -0
- package/dist/index.js +116 -8
- package/dist/index.js.map +1 -1
- package/dist/mcp.js +3 -3
- package/dist/{parser-4LKJXBPP.js → parser-4275GJRB.js} +2 -2
- package/dist/{parser-KBQZB3QY.cjs → parser-STAOZMUC.cjs} +15 -15
- package/dist/{parser-KBQZB3QY.cjs.map → parser-STAOZMUC.cjs.map} +1 -1
- package/dist/{parser-25LF2S2J.js → parser-XRUZEFZT.js} +2 -2
- package/dist/{watch-GXRBLW3Y.js → watch-BFLNFJBE.js} +3 -3
- package/package.json +1 -1
- package/dist/chunk-OEJJPCMM.js.map +0 -1
- /package/dist/{parser-4LKJXBPP.js.map → parser-4275GJRB.js.map} +0 -0
- /package/dist/{parser-25LF2S2J.js.map → parser-XRUZEFZT.js.map} +0 -0
- /package/dist/{watch-GXRBLW3Y.js.map → watch-BFLNFJBE.js.map} +0 -0
package/README.md
CHANGED
|
@@ -28,13 +28,20 @@ HWP, HWPX, PDF, XLSX, DOCX — 관공서에서 쏟아지는 모든 문서를 파
|
|
|
28
28
|
|
|
29
29
|
---
|
|
30
30
|
|
|
31
|
-
## v2.3.0 변경사항
|
|
31
|
+
## v2.4.0 변경사항
|
|
32
|
+
|
|
33
|
+
- **🔓 HWPX DRM 배포용 문서 자동 추출** — 공공기관 배포용 DRM이 걸린 HWPX 파일을 한컴 오피스 COM API로 자동 텍스트 추출. `manifest.xml`에서 암호화 감지 → `HWPFrame.HwpObject`의 `GetPageText`로 페이지별 추출 → Markdown 변환. Windows + 한컴 오피스 설치 환경에서 별도 설정 없이 동작.
|
|
34
|
+
|
|
35
|
+
<details>
|
|
36
|
+
<summary>v2.3.0 변경사항</summary>
|
|
32
37
|
|
|
33
38
|
- **📄 HWPML 2.x 파서 추가** — XML 기반 한컴 문서(`.hwp` XML 방식) 파싱 지원. `npx kordoc <file.hwp>`에서 `지원하지 않는 파일 형식` 오류가 나던 XML 기반 공문서를 이제 Markdown으로 변환할 수 있습니다. HWP 5.x 바이너리와 자동 구분(XML 시그니처 감지).
|
|
34
39
|
- **🧩 중첩 테이블 마커** — HWPX/HWP5에서 셀 내부 중첩 테이블이 있던 위치에 `[중첩 테이블 #N]` 마커 삽입. 큰 중첩 테이블(≥3행 + ≥2열)은 별도 블록으로 분리, 작은 것은 셀 내 평탄화. HWP5는 기존에 내용이 완전히 손실되던 것을 마커로 복구.
|
|
35
40
|
- **🖼️ HWPX 이미지 추출 버그 수정** — `binaryItemIDRef`가 확장자 없이(`"image1"`) 저장된 HWPX에서 이미지 추출이 실패하던 문제 해결. ZIP 내 파일명 regex 매칭으로 복원.
|
|
36
41
|
- **📄 PDF 머리글/바닥글 감지 개선** — 텍스트 반복 패턴 + y좌표 클러스터링 하이브리드. 페이지마다 달라지는 동적 머리글(챕터명 등)도 위치 기반으로 감지. 감지 영역 10% → 12%로 확장.
|
|
37
42
|
|
|
43
|
+
</details>
|
|
44
|
+
|
|
38
45
|
<details>
|
|
39
46
|
<summary>v2.2.4 변경사항</summary>
|
|
40
47
|
|
|
package/dist/{chunk-OEJJPCMM.js → chunk-KSBPABBQ.js}
CHANGED
|
@@ -20,7 +20,7 @@ import {
|
|
|
20
20
|
sanitizeHref,
|
|
21
21
|
stripDtd,
|
|
22
22
|
toArrayBuffer
|
|
23
|
-
} from "./chunk-Z7UPTVMX.js";
|
|
23
|
+
} from "./chunk-VJPDY4YT.js";
|
|
24
24
|
import {
|
|
25
25
|
parsePageRange
|
|
26
26
|
} from "./chunk-MOL7MDBG.js";
|
|
@@ -29,6 +29,100 @@ import {
|
|
|
29
29
|
import JSZip from "jszip";
|
|
30
30
|
import { inflateRawSync } from "zlib";
|
|
31
31
|
import { DOMParser } from "@xmldom/xmldom";
|
|
32
|
+
|
|
33
|
+
// src/hwpx/com-fallback.ts
|
|
34
|
+
import { execFileSync } from "child_process";
|
|
35
|
+
import { platform } from "os";
|
|
36
|
+
function isComFallbackAvailable() {
|
|
37
|
+
return platform() === "win32";
|
|
38
|
+
}
|
|
39
|
+
function isEncryptedHwpx(manifestXml) {
|
|
40
|
+
return manifestXml.includes("encryption-data");
|
|
41
|
+
}
|
|
42
|
+
function extractTextViaCom(filePath) {
|
|
43
|
+
if (!isComFallbackAvailable()) {
|
|
44
|
+
throw new Error("COM fallback\uC740 Windows\uC5D0\uC11C\uB9CC \uC0AC\uC6A9 \uAC00\uB2A5\uD569\uB2C8\uB2E4");
|
|
45
|
+
}
|
|
46
|
+
const escaped = filePath.replace(/'/g, "''");
|
|
47
|
+
const ps1 = `
|
|
48
|
+
[Console]::OutputEncoding = [System.Text.Encoding]::UTF8
|
|
49
|
+
$ErrorActionPreference = 'Stop'
|
|
50
|
+
try {
|
|
51
|
+
$hwp = New-Object -ComObject HWPFrame.HwpObject
|
|
52
|
+
$hwp.RegisterModule('FilePathCheckerModule', 'FilePathCheckerModuleExample') | Out-Null
|
|
53
|
+
$hwp.Open('${escaped}', '', '') | Out-Null
|
|
54
|
+
$pc = $hwp.PageCount
|
|
55
|
+
$result = @{ pageCount = $pc; pages = @() }
|
|
56
|
+
for ($p = 1; $p -le $pc; $p++) {
|
|
57
|
+
$t = $hwp.GetPageText($p, 0)
|
|
58
|
+
$result.pages += @($t)
|
|
59
|
+
}
|
|
60
|
+
$hwp.Clear(1)
|
|
61
|
+
[System.Runtime.InteropServices.Marshal]::ReleaseComObject($hwp) | Out-Null
|
|
62
|
+
$result | ConvertTo-Json -Depth 3 -Compress
|
|
63
|
+
} catch {
|
|
64
|
+
@{ error = $_.Exception.Message } | ConvertTo-Json -Compress
|
|
65
|
+
}
|
|
66
|
+
`;
|
|
67
|
+
const stdout = execFileSync("powershell", [
|
|
68
|
+
"-NoProfile",
|
|
69
|
+
"-NonInteractive",
|
|
70
|
+
"-ExecutionPolicy",
|
|
71
|
+
"Bypass",
|
|
72
|
+
"-Command",
|
|
73
|
+
ps1
|
|
74
|
+
], {
|
|
75
|
+
encoding: "utf-8",
|
|
76
|
+
timeout: 12e4,
|
|
77
|
+
// 2분 타임아웃
|
|
78
|
+
windowsHide: true,
|
|
79
|
+
maxBuffer: 50 * 1024 * 1024
|
|
80
|
+
// 50MB
|
|
81
|
+
});
|
|
82
|
+
const trimmed = stdout.trim();
|
|
83
|
+
const jsonStart = trimmed.indexOf("{");
|
|
84
|
+
if (jsonStart < 0) throw new Error(`COM \uCD9C\uB825\uC5D0 JSON\uC774 \uC5C6\uC2B5\uB2C8\uB2E4: ${trimmed.slice(0, 200)}`);
|
|
85
|
+
const json = JSON.parse(trimmed.slice(jsonStart));
|
|
86
|
+
if (json.error) {
|
|
87
|
+
throw new Error(`COM \uD14D\uC2A4\uD2B8 \uCD94\uCD9C \uC2E4\uD328: ${json.error}`);
|
|
88
|
+
}
|
|
89
|
+
const warnings = [];
|
|
90
|
+
const pages = Array.isArray(json.pages) ? json.pages : [];
|
|
91
|
+
const pageCount = json.pageCount ?? pages.length;
|
|
92
|
+
if (pages.length === 0) {
|
|
93
|
+
warnings.push({ message: "COM\uC73C\uB85C \uD14D\uC2A4\uD2B8\uB97C \uCD94\uCD9C\uD558\uC9C0 \uBABB\uD588\uC2B5\uB2C8\uB2E4", code: "COM_EMPTY" });
|
|
94
|
+
}
|
|
95
|
+
return { pages, pageCount, warnings };
|
|
96
|
+
}
|
|
97
|
+
function comResultToParseResult(pages, pageCount, warnings) {
|
|
98
|
+
const blocks = [];
|
|
99
|
+
const lines = [];
|
|
100
|
+
for (let i = 0; i < pages.length; i++) {
|
|
101
|
+
const text = (pages[i] ?? "").trim();
|
|
102
|
+
if (!text) continue;
|
|
103
|
+
const paragraphs = text.split(/\n/);
|
|
104
|
+
for (const para of paragraphs) {
|
|
105
|
+
const trimmed = para.trim();
|
|
106
|
+
if (!trimmed) continue;
|
|
107
|
+
blocks.push({ type: "paragraph", text: trimmed, pageNumber: i + 1 });
|
|
108
|
+
lines.push(trimmed);
|
|
109
|
+
}
|
|
110
|
+
}
|
|
111
|
+
const markdown = lines.join("\n\n");
|
|
112
|
+
const metadata = { pageCount };
|
|
113
|
+
warnings.push({
|
|
114
|
+
message: "DRM \uBB38\uC11C: \uD55C\uCEF4 COM API\uB85C \uD14D\uC2A4\uD2B8 \uCD94\uCD9C (\uC11C\uC2DD/\uD45C \uC815\uBCF4 \uC81C\uD55C\uC801)",
|
|
115
|
+
code: "DRM_COM_FALLBACK"
|
|
116
|
+
});
|
|
117
|
+
return {
|
|
118
|
+
markdown,
|
|
119
|
+
blocks,
|
|
120
|
+
metadata,
|
|
121
|
+
warnings: warnings.length > 0 ? warnings : void 0
|
|
122
|
+
};
|
|
123
|
+
}
|
|
124
|
+
|
|
125
|
+
// src/hwpx/parser.ts
|
|
32
126
|
var MAX_DECOMPRESS_SIZE = 100 * 1024 * 1024;
|
|
33
127
|
var MAX_ZIP_ENTRIES = 500;
|
|
34
128
|
function clampSpan(val, max) {
|
|
@@ -133,6 +227,19 @@ async function parseHwpxDocument(buffer, options) {
|
|
|
133
227
|
if (actualEntryCount > MAX_ZIP_ENTRIES) {
|
|
134
228
|
throw new KordocError("ZIP \uC5D4\uD2B8\uB9AC \uC218 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
|
|
135
229
|
}
|
|
230
|
+
const manifestFile = zip.file("META-INF/manifest.xml");
|
|
231
|
+
if (manifestFile) {
|
|
232
|
+
const manifestXml = await manifestFile.async("text");
|
|
233
|
+
if (isEncryptedHwpx(manifestXml)) {
|
|
234
|
+
if (isComFallbackAvailable() && options?.filePath) {
|
|
235
|
+
const { pages, pageCount, warnings: warnings2 } = extractTextViaCom(options.filePath);
|
|
236
|
+
if (pages.some((p) => p && p.trim().length > 0)) {
|
|
237
|
+
return comResultToParseResult(pages, pageCount, warnings2);
|
|
238
|
+
}
|
|
239
|
+
}
|
|
240
|
+
throw new KordocError("DRM \uC554\uD638\uD654\uB41C HWPX \uD30C\uC77C\uC785\uB2C8\uB2E4. Windows + \uD55C\uCEF4 \uC624\uD53C\uC2A4 \uC124\uCE58 \uC2DC \uC790\uB3D9 \uCD94\uCD9C\uB429\uB2C8\uB2E4.");
|
|
241
|
+
}
|
|
242
|
+
}
|
|
136
243
|
const decompressed = { total: 0 };
|
|
137
244
|
const metadata = {};
|
|
138
245
|
await extractHwpxMetadata(zip, metadata, decompressed);
|
|
@@ -4543,6 +4650,7 @@ function countSections(body) {
|
|
|
4543
4650
|
// src/index.ts
|
|
4544
4651
|
async function parse(input, options) {
|
|
4545
4652
|
let buffer;
|
|
4653
|
+
const opts = typeof input === "string" && !options?.filePath ? { ...options, filePath: input } : options;
|
|
4546
4654
|
if (typeof input === "string") {
|
|
4547
4655
|
try {
|
|
4548
4656
|
const buf = await readFile(input);
|
|
@@ -4563,16 +4671,16 @@ async function parse(input, options) {
|
|
|
4563
4671
|
switch (format) {
|
|
4564
4672
|
case "hwpx": {
|
|
4565
4673
|
const zipFormat = await detectZipFormat(buffer);
|
|
4566
|
-
if (zipFormat === "xlsx") return parseXlsx(buffer, options);
|
|
4567
|
-
if (zipFormat === "docx") return parseDocx(buffer, options);
|
|
4568
|
-
return parseHwpx(buffer, options);
|
|
4674
|
+
if (zipFormat === "xlsx") return parseXlsx(buffer, opts);
|
|
4675
|
+
if (zipFormat === "docx") return parseDocx(buffer, opts);
|
|
4676
|
+
return parseHwpx(buffer, opts);
|
|
4569
4677
|
}
|
|
4570
4678
|
case "hwp":
|
|
4571
|
-
return parseHwp(buffer, options);
|
|
4679
|
+
return parseHwp(buffer, opts);
|
|
4572
4680
|
case "hwpml":
|
|
4573
|
-
return parseHwpml(buffer, options);
|
|
4681
|
+
return parseHwpml(buffer, opts);
|
|
4574
4682
|
case "pdf":
|
|
4575
|
-
return parsePdf(buffer, options);
|
|
4683
|
+
return parsePdf(buffer, opts);
|
|
4576
4684
|
default:
|
|
4577
4685
|
return { success: false, fileType: "unknown", error: "\uC9C0\uC6D0\uD558\uC9C0 \uC54A\uB294 \uD30C\uC77C \uD615\uC2DD\uC785\uB2C8\uB2E4.", code: "UNSUPPORTED_FORMAT" };
|
|
4578
4686
|
}
|
|
@@ -4596,7 +4704,7 @@ async function parseHwp(buffer, options) {
|
|
|
4596
4704
|
async function parsePdf(buffer, options) {
|
|
4597
4705
|
let parsePdfDocument;
|
|
4598
4706
|
try {
|
|
4599
|
-
const mod = await import("./parser-4LKJXBPP.js");
|
|
4707
|
+
const mod = await import("./parser-4275GJRB.js");
|
|
4600
4708
|
parsePdfDocument = mod.parsePdfDocument;
|
|
4601
4709
|
} catch {
|
|
4602
4710
|
return {
|
|
@@ -4826,4 +4934,4 @@ export {
|
|
|
4826
4934
|
compare,
|
|
4827
4935
|
parse
|
|
4828
4936
|
};
|
|
4829
|
-
//# sourceMappingURL=chunk-OEJJPCMM.js.map
|
|
4937
|
+
//# sourceMappingURL=chunk-KSBPABBQ.js.map
|