kordoc 2.3.0 → 2.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -28,13 +28,20 @@ HWP, HWPX, PDF, XLSX, DOCX — 관공서에서 쏟아지는 모든 문서를 파
28
28
 
29
29
  ---
30
30
 
31
- ## v2.3.0 변경사항
31
+ ## v2.4.0 변경사항
32
+
33
+ - **🔓 HWPX DRM 배포용 문서 자동 추출** — 공공기관 배포용 DRM이 걸린 HWPX 파일을 한컴 오피스 COM API로 자동 텍스트 추출. `manifest.xml`에서 암호화 감지 → `HWPFrame.HwpObject`의 `GetPageText`로 페이지별 추출 → Markdown 변환. Windows + 한컴 오피스 설치 환경에서 별도 설정 없이 동작.
34
+
35
+ <details>
36
+ <summary>v2.3.0 변경사항</summary>
32
37
 
33
38
  - **📄 HWPML 2.x 파서 추가** — XML 기반 한컴 문서(`.hwp` XML 방식) 파싱 지원. `npx kordoc <file.hwp>`에서 `지원하지 않는 파일 형식` 오류가 나던 XML 기반 공문서를 이제 Markdown으로 변환할 수 있습니다. HWP 5.x 바이너리와 자동 구분(XML 시그니처 감지).
34
39
  - **🧩 중첩 테이블 마커** — HWPX/HWP5에서 셀 내부 중첩 테이블이 있던 위치에 `[중첩 테이블 #N]` 마커 삽입. 큰 중첩 테이블(≥3행 + ≥2열)은 별도 블록으로 분리, 작은 것은 셀 내 평탄화. HWP5는 기존에 내용이 완전히 손실되던 것을 마커로 복구.
35
40
  - **🖼️ HWPX 이미지 추출 버그 수정** — `binaryItemIDRef`가 확장자 없이(`"image1"`) 저장된 HWPX에서 이미지 추출이 실패하던 문제 해결. ZIP 내 파일명 regex 매칭으로 복원.
36
41
  - **📄 PDF 머리글/바닥글 감지 개선** — 텍스트 반복 패턴 + y좌표 클러스터링 하이브리드. 페이지마다 달라지는 동적 머리글(챕터명 등)도 위치 기반으로 감지. 감지 영역 10% → 12%로 확장.
37
42
 
43
+ </details>
44
+
38
45
  <details>
39
46
  <summary>v2.2.4 변경사항</summary>
40
47
 
@@ -20,7 +20,7 @@ import {
20
20
  sanitizeHref,
21
21
  stripDtd,
22
22
  toArrayBuffer
23
- } from "./chunk-Z7UPTVMX.js";
23
+ } from "./chunk-VJPDY4YT.js";
24
24
  import {
25
25
  parsePageRange
26
26
  } from "./chunk-MOL7MDBG.js";
@@ -29,6 +29,100 @@ import {
29
29
  import JSZip from "jszip";
30
30
  import { inflateRawSync } from "zlib";
31
31
  import { DOMParser } from "@xmldom/xmldom";
32
+
33
+ // src/hwpx/com-fallback.ts
34
+ import { execFileSync } from "child_process";
35
+ import { platform } from "os";
36
/**
 * Reports whether the Hancom COM fallback can be attempted on this host.
 *
 * Only the operating system is checked here; whether Hancom Office is
 * actually installed is not probed by this function.
 *
 * @returns {boolean} `true` when the current platform is Windows ("win32").
 */
function isComFallbackAvailable() {
  const currentPlatform = platform();
  const isWindows = currentPlatform === "win32";
  return isWindows;
}
39
/**
 * Detects whether an HWPX manifest declares encrypted (DRM-protected) entries.
 *
 * Detection is a plain substring search for "encryption-data" in the manifest
 * text; the XML is not parsed.
 *
 * @param {string} manifestXml - Text content of META-INF/manifest.xml.
 * @returns {boolean} `true` when the marker is present; `false` otherwise,
 *   including when the argument is missing or not a string.
 */
function isEncryptedHwpx(manifestXml) {
  // Guard first: a missing or unreadable manifest means "not encrypted",
  // not a TypeError from calling .includes on a non-string.
  if (typeof manifestXml !== "string") {
    return false;
  }
  return manifestXml.includes("encryption-data");
}
42
/**
 * Extracts per-page plain text from an HWPX file by driving Hancom Office
 * through its COM automation object (`HWPFrame.HwpObject`) from PowerShell.
 *
 * The file path is handed to PowerShell through an environment variable and
 * is never spliced into the script text. Doubling ASCII apostrophes is not a
 * sufficient escape for a PowerShell single-quoted literal, because PowerShell
 * also accepts typographic single quotes as string delimiters.
 *
 * @param {string} filePath - Path of the document to open in Hancom Office.
 * @returns {{ pages: Array, pageCount: number, warnings: Array<{ message: string, code: string }> }}
 *   The page texts as emitted by the script, the reported page count
 *   (falling back to `pages.length`), and any warnings raised here.
 * @throws {Error} When not running on Windows, when the script output holds
 *   no JSON or unparsable JSON, or when the script reports an error.
 * @throws {TypeError} When `filePath` is not a non-empty string.
 */
function extractTextViaCom(filePath) {
  if (!isComFallbackAvailable()) {
    throw new Error("COM fallback\uC740 Windows\uC5D0\uC11C\uB9CC \uC0AC\uC6A9 \uAC00\uB2A5\uD569\uB2C8\uB2E4");
  }
  if (typeof filePath !== "string" || filePath.length === 0) {
    throw new TypeError("COM fallback: filePath\uAC00 \uD544\uC694\uD569\uB2C8\uB2E4");
  }

  // The script contains no interpolated values. Cleanup lives in `finally`
  // so the COM object is released even when Open/GetPageText throws, and the
  // JSON document is always the last thing written to stdout.
  const ps1 = `
[Console]::OutputEncoding = [System.Text.Encoding]::UTF8
$ErrorActionPreference = 'Stop'
$hwp = $null
$out = $null
try {
  $hwp = New-Object -ComObject HWPFrame.HwpObject
  $hwp.RegisterModule('FilePathCheckerModule', 'FilePathCheckerModuleExample') | Out-Null
  $hwp.Open($env:KORDOC_COM_INPUT_PATH, '', '') | Out-Null
  $pc = $hwp.PageCount
  $result = @{ pageCount = $pc; pages = @() }
  for ($p = 1; $p -le $pc; $p++) {
    $t = $hwp.GetPageText($p, 0)
    $result.pages += @($t)
  }
  $out = $result
} catch {
  $out = @{ error = $_.Exception.Message }
} finally {
  if ($null -ne $hwp) {
    try { $hwp.Clear(1) | Out-Null } catch { }
    try { [System.Runtime.InteropServices.Marshal]::ReleaseComObject($hwp) | Out-Null } catch { }
  }
}
$out | ConvertTo-Json -Depth 3 -Compress
`;

  const stdout = execFileSync(
    "powershell",
    ["-NoProfile", "-NonInteractive", "-ExecutionPolicy", "Bypass", "-Command", ps1],
    {
      encoding: "utf-8",
      // 2-minute timeout
      timeout: 12e4,
      windowsHide: true,
      // 50MB
      maxBuffer: 50 * 1024 * 1024,
      // Untrusted path travels as data, not as script source.
      env: { ...process.env, KORDOC_COM_INPUT_PATH: filePath }
    }
  );

  // Anything printed before the first "{" is skipped.
  const trimmed = stdout.trim();
  const jsonStart = trimmed.indexOf("{");
  if (jsonStart < 0) throw new Error(`COM \uCD9C\uB825\uC5D0 JSON\uC774 \uC5C6\uC2B5\uB2C8\uB2E4: ${trimmed.slice(0, 200)}`);

  let json;
  try {
    json = JSON.parse(trimmed.slice(jsonStart));
  } catch (cause) {
    // Surface the raw output instead of a context-free SyntaxError.
    throw new Error(`COM \uCD9C\uB825 JSON \uD30C\uC2F1 \uC2E4\uD328: ${trimmed.slice(0, 200)}`, { cause });
  }
  if (json.error) {
    throw new Error(`COM \uD14D\uC2A4\uD2B8 \uCD94\uCD9C \uC2E4\uD328: ${json.error}`);
  }

  const warnings = [];
  const rawPages = json.pages;
  let pages = [];
  if (Array.isArray(rawPages)) {
    pages = rawPages;
  } else if (typeof rawPages === "string") {
    // Defensive: keep a lone page if the serializer emitted a scalar.
    pages = [rawPages];
  }
  const pageCount = json.pageCount ?? pages.length;
  if (pages.length === 0) {
    warnings.push({ message: "COM\uC73C\uB85C \uD14D\uC2A4\uD2B8\uB97C \uCD94\uCD9C\uD558\uC9C0 \uBABB\uD588\uC2B5\uB2C8\uB2E4", code: "COM_EMPTY" });
  }
  return { pages, pageCount, warnings };
}
97
/**
 * Converts page texts obtained through the COM fallback into a parse result.
 *
 * Each non-blank line of each page becomes one paragraph block tagged with
 * its 1-based page number; the markdown is those lines joined by blank lines.
 * A `DRM_COM_FALLBACK` warning is always appended to the returned warnings.
 *
 * @param {Array<string|null|undefined>} pages - Text of each page, in order.
 *   Entries that are not strings are treated as empty pages.
 * @param {number} pageCount - Page count to record in the metadata.
 * @param {Array<{ message: string, code: string }>} [warnings] - Warnings
 *   gathered so far. The array is copied, never mutated.
 * @returns {{ markdown: string, blocks: Array<{ type: string, text: string, pageNumber: number }>, metadata: { pageCount: number }, warnings: Array<{ message: string, code: string }> }}
 */
function comResultToParseResult(pages, pageCount, warnings) {
  const blocks = [];
  const pageList = Array.isArray(pages) ? pages : [];

  for (const [pageIndex, pageText] of pageList.entries()) {
    if (typeof pageText !== "string") continue;
    // Accept LF, CRLF and bare CR as line breaks.
    for (const rawLine of pageText.split(/\r\n?|\n/)) {
      const text = rawLine.trim();
      if (!text) continue;
      blocks.push({ type: "paragraph", text, pageNumber: pageIndex + 1 });
    }
  }

  const markdown = blocks.map((block) => block.text).join("\n\n");
  const metadata = { pageCount };

  // Build a fresh list so the caller's array is left untouched; the fallback
  // notice is unconditional, so the result always carries at least one warning.
  const allWarnings = [
    ...(Array.isArray(warnings) ? warnings : []),
    {
      message: "DRM \uBB38\uC11C: \uD55C\uCEF4 COM API\uB85C \uD14D\uC2A4\uD2B8 \uCD94\uCD9C (\uC11C\uC2DD/\uD45C \uC815\uBCF4 \uC81C\uD55C\uC801)",
      code: "DRM_COM_FALLBACK"
    }
  ];

  return {
    markdown,
    blocks,
    metadata,
    warnings: allWarnings
  };
}
124
+
125
+ // src/hwpx/parser.ts
32
126
  var MAX_DECOMPRESS_SIZE = 100 * 1024 * 1024;
33
127
  var MAX_ZIP_ENTRIES = 500;
34
128
  function clampSpan(val, max) {
@@ -133,6 +227,19 @@ async function parseHwpxDocument(buffer, options) {
133
227
  if (actualEntryCount > MAX_ZIP_ENTRIES) {
134
228
  throw new KordocError("ZIP \uC5D4\uD2B8\uB9AC \uC218 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
135
229
  }
230
+ const manifestFile = zip.file("META-INF/manifest.xml");
231
+ if (manifestFile) {
232
+ const manifestXml = await manifestFile.async("text");
233
+ if (isEncryptedHwpx(manifestXml)) {
234
+ if (isComFallbackAvailable() && options?.filePath) {
235
+ const { pages, pageCount, warnings: warnings2 } = extractTextViaCom(options.filePath);
236
+ if (pages.some((p) => p && p.trim().length > 0)) {
237
+ return comResultToParseResult(pages, pageCount, warnings2);
238
+ }
239
+ }
240
+ throw new KordocError("DRM \uC554\uD638\uD654\uB41C HWPX \uD30C\uC77C\uC785\uB2C8\uB2E4. Windows + \uD55C\uCEF4 \uC624\uD53C\uC2A4 \uC124\uCE58 \uC2DC \uC790\uB3D9 \uCD94\uCD9C\uB429\uB2C8\uB2E4.");
241
+ }
242
+ }
136
243
  const decompressed = { total: 0 };
137
244
  const metadata = {};
138
245
  await extractHwpxMetadata(zip, metadata, decompressed);
@@ -4543,6 +4650,7 @@ function countSections(body) {
4543
4650
  // src/index.ts
4544
4651
  async function parse(input, options) {
4545
4652
  let buffer;
4653
+ const opts = typeof input === "string" && !options?.filePath ? { ...options, filePath: input } : options;
4546
4654
  if (typeof input === "string") {
4547
4655
  try {
4548
4656
  const buf = await readFile(input);
@@ -4563,16 +4671,16 @@ async function parse(input, options) {
4563
4671
  switch (format) {
4564
4672
  case "hwpx": {
4565
4673
  const zipFormat = await detectZipFormat(buffer);
4566
- if (zipFormat === "xlsx") return parseXlsx(buffer, options);
4567
- if (zipFormat === "docx") return parseDocx(buffer, options);
4568
- return parseHwpx(buffer, options);
4674
+ if (zipFormat === "xlsx") return parseXlsx(buffer, opts);
4675
+ if (zipFormat === "docx") return parseDocx(buffer, opts);
4676
+ return parseHwpx(buffer, opts);
4569
4677
  }
4570
4678
  case "hwp":
4571
- return parseHwp(buffer, options);
4679
+ return parseHwp(buffer, opts);
4572
4680
  case "hwpml":
4573
- return parseHwpml(buffer, options);
4681
+ return parseHwpml(buffer, opts);
4574
4682
  case "pdf":
4575
- return parsePdf(buffer, options);
4683
+ return parsePdf(buffer, opts);
4576
4684
  default:
4577
4685
  return { success: false, fileType: "unknown", error: "\uC9C0\uC6D0\uD558\uC9C0 \uC54A\uB294 \uD30C\uC77C \uD615\uC2DD\uC785\uB2C8\uB2E4.", code: "UNSUPPORTED_FORMAT" };
4578
4686
  }
@@ -4596,7 +4704,7 @@ async function parseHwp(buffer, options) {
4596
4704
  async function parsePdf(buffer, options) {
4597
4705
  let parsePdfDocument;
4598
4706
  try {
4599
- const mod = await import("./parser-4LKJXBPP.js");
4707
+ const mod = await import("./parser-4275GJRB.js");
4600
4708
  parsePdfDocument = mod.parsePdfDocument;
4601
4709
  } catch {
4602
4710
  return {
@@ -4826,4 +4934,4 @@ export {
4826
4934
  compare,
4827
4935
  parse
4828
4936
  };
4829
- //# sourceMappingURL=chunk-OEJJPCMM.js.map
4937
+ //# sourceMappingURL=chunk-KSBPABBQ.js.map