kordoc 2.0.1 → 2.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1 @@
1
+ {"version":3,"sources":["../src/utils.ts"],"sourcesContent":["/** kordoc 공용 유틸리티 */\r\n\r\n/** 빌드 타임에 tsup define으로 주입되는 버전 */\r\ndeclare const __KORDOC_VERSION__: string\r\nexport const VERSION: string = typeof __KORDOC_VERSION__ !== \"undefined\" ? __KORDOC_VERSION__ : \"0.0.0-dev\"\r\n\r\n/**\r\n * Node.js Buffer → ArrayBuffer 변환\r\n * pool Buffer의 공유 ArrayBuffer 문제를 안전하게 처리.\r\n * offset=0이고 전체 ArrayBuffer를 차지하면 복사 없이 직접 반환.\r\n */\r\nexport function toArrayBuffer(buf: Buffer): ArrayBuffer {\r\n if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {\r\n return buf.buffer as ArrayBuffer\r\n }\r\n return buf.buffer.slice(buf.byteOffset, buf.byteOffset + buf.byteLength) as ArrayBuffer\r\n}\r\n\r\n/**\r\n * kordoc 내부 에러 클래스 — 사용자에게 노출해도 안전한 메시지만 포함.\r\n * MCP 에러 정제에서 instanceof로 판별하여 allowlist 패턴 매칭 없이 안전하게 통과.\r\n */\r\nexport class KordocError extends Error {\r\n constructor(message: string) {\r\n super(message)\r\n this.name = \"KordocError\"\r\n }\r\n}\r\n\r\n/**\r\n * 에러 메시지 정제 — KordocError는 그대로, 나머지는 일반 메시지로 대체.\r\n * 파일시스템 경로, 스택 트레이스 등 내부 정보 노출 방지.\r\n */\r\nexport function sanitizeError(err: unknown): string {\r\n if (err instanceof KordocError) return err.message\r\n return \"문서 처리 중 오류가 발생했습니다\"\r\n}\r\n\r\n/**\r\n * ZIP 엔트리 경로의 경로 순회 여부 판별.\r\n * 백슬래시 정규화, .., 절대경로, Windows 드라이브 문자 모두 차단.\r\n */\r\nexport function isPathTraversal(name: string): boolean {\r\n if (name.includes(\"\\x00\")) return true\r\n const normalized = name.replace(/\\\\/g, \"/\")\r\n return normalized.includes(\"..\") || normalized.startsWith(\"/\") || /^[A-Za-z]:/.test(normalized)\r\n}\r\n\r\n// ─── ZIP 안전 로딩 (ZIP bomb 방지) ────────────────────\r\n\r\n/**\r\n * ZIP bomb 사전 검사 — Central Directory에서 비압축 합계와 엔트리 수 확인.\r\n * HWPX/XLSX/DOCX 등 모든 ZIP 기반 포맷에서 공통 사용.\r\n */\r\nexport function precheckZipSize(\r\n buffer: ArrayBuffer,\r\n maxUncompressedSize = 100 * 1024 * 1024,\r\n maxEntries = 500,\r\n): { totalUncompressed: number; entryCount: number } {\r\n try {\r\n const data = new DataView(buffer)\r\n const len = buffer.byteLength\r\n // EOCD 시그니처 역방향 스캔\r\n let eocdOffset = -1\r\n for (let i = len - 22; i >= Math.max(0, len - 65557); i--) {\r\n if (data.getUint32(i, true) === 0x06054b50) { eocdOffset = i; break }\r\n }\r\n if (eocdOffset < 0) return { totalUncompressed: 0, entryCount: 0 }\r\n\r\n const entryCount = data.getUint16(eocdOffset + 10, true)\r\n if (entryCount > maxEntries) {\r\n throw new KordocError(`ZIP 엔트리 수 초과: ${entryCount} (최대 ${maxEntries})`)\r\n }\r\n\r\n const cdSize = data.getUint32(eocdOffset + 12, true)\r\n const cdOffset = data.getUint32(eocdOffset + 16, true)\r\n if (cdOffset + cdSize > len) return { totalUncompressed: 0, entryCount }\r\n\r\n let totalUncompressed = 0\r\n let pos = cdOffset\r\n for (let i = 0; i < entryCount && pos + 46 <= cdOffset + cdSize; i++) {\r\n if (data.getUint32(pos, true) !== 0x02014b50) break\r\n totalUncompressed += data.getUint32(pos + 24, true)\r\n const nameLen = data.getUint16(pos + 28, true)\r\n const extraLen = data.getUint16(pos + 30, true)\r\n const commentLen = data.getUint16(pos + 32, true)\r\n pos += 46 + nameLen + extraLen + commentLen\r\n }\r\n\r\n if (totalUncompressed > maxUncompressedSize) {\r\n throw new KordocError(`ZIP 비압축 크기 초과: ${(totalUncompressed / 1024 / 1024).toFixed(1)}MB (최대 ${maxUncompressedSize / 1024 / 1024}MB)`)\r\n }\r\n\r\n return { totalUncompressed, entryCount }\r\n } catch (err) {\r\n if (err instanceof KordocError) throw err\r\n return { totalUncompressed: 0, entryCount: 0 }\r\n }\r\n}\r\n\r\n/** 하이퍼링크 URL 살균 — javascript: 등 XSS 위험 스킴 차단 */\r\nconst SAFE_HREF_RE = /^(?:https?:|mailto:|tel:|#)/i\r\nexport function sanitizeHref(href: string): string | null {\r\n const trimmed = href.trim()\r\n if (!trimmed || !SAFE_HREF_RE.test(trimmed)) return null\r\n return trimmed\r\n}\r\n\r\n// ─── 에러 분류 ──────────────────────────────────────\r\n\r\nimport type { ErrorCode } from \"./types.js\"\r\n\r\n/** 에러를 구조화된 ErrorCode로 분류 — KordocError 메시지 패턴 매칭 */\r\nexport function classifyError(err: unknown): ErrorCode {\r\n if (!(err instanceof Error)) return \"PARSE_ERROR\"\r\n const msg = err.message\r\n if (msg.includes(\"암호화\")) return \"ENCRYPTED\"\r\n if (msg.includes(\"DRM\")) return \"DRM_PROTECTED\"\r\n if (msg.includes(\"ZIP bomb\") || msg.includes(\"ZIP 비압축 크기 초과\") || msg.includes(\"ZIP 엔트리 수 초과\")) return \"ZIP_BOMB\"\r\n if (msg.includes(\"bomb\") || msg.includes(\"크기 초과\") || msg.includes(\"압축 해제\")) return \"DECOMPRESSION_BOMB\"\r\n if (msg.includes(\"이미지 기반\")) return \"IMAGE_BASED_PDF\"\r\n if (msg.includes(\"섹션\") && (msg.includes(\"찾을 수 없\") || msg.includes(\"없음\"))) return \"NO_SECTIONS\"\r\n if (msg.includes(\"시그니처\") || msg.includes(\"복구할 수 없\")) return \"CORRUPTED\"\r\n return \"PARSE_ERROR\"\r\n}\r\n"],"mappings":";;;AAIO,IAAM,UAAkB,OAA4C,UAAqB;AAOzF,SAAS,cAAc,KAA0B;AACtD,MAAI,IAAI,eAAe,KAAK,IAAI,eAAe,IAAI,OAAO,YAAY;AACpE,WAAO,IAAI;AAAA,EACb;AACA,SAAO,IAAI,OAAO,MAAM,IAAI,YAAY,IAAI,aAAa,IAAI,UAAU;AACzE;AAMO,IAAM,cAAN,cAA0B,MAAM;AAAA,EACrC,YAAY,SAAiB;AAC3B,UAAM,OAAO;AACb,SAAK,OAAO;AAAA,EACd;AACF;AAMO,SAAS,cAAc,KAAsB;AAClD,MAAI,eAAe,YAAa,QAAO,IAAI;AAC3C,SAAO;AACT;AAMO,SAAS,gBAAgB,MAAuB;AACrD,MAAI,KAAK,SAAS,IAAM,EAAG,QAAO;AAClC,QAAM,aAAa,KAAK,QAAQ,OAAO,GAAG;AAC1C,SAAO,WAAW,SAAS,IAAI,KAAK,WAAW,WAAW,GAAG,KAAK,aAAa,KAAK,UAAU;AAChG;AAQO,SAAS,gBACd,QACA,sBAAsB,MAAM,OAAO,MACnC,aAAa,KACsC;AACnD,MAAI;AACF,UAAM,OAAO,IAAI,SAAS,MAAM;AAChC,UAAM,MAAM,OAAO;AAEnB,QAAI,aAAa;AACjB,aAAS,IAAI,MAAM,IAAI,KAAK,KAAK,IAAI,GAAG,MAAM,KAAK,GAAG,KAAK;AACzD,UAAI,KAAK,UAAU,GAAG,IAAI,MAAM,WAAY;AAAE,qBAAa;AAAG;AAAA,MAAM;AAAA,IACtE;AACA,QAAI,aAAa,EAAG,QAAO,EAAE,mBAAmB,GAAG,YAAY,EAAE;AAEjE,UAAM,aAAa,KAAK,UAAU,aAAa,IAAI,IAAI;AACvD,QAAI,aAAa,YAAY;AAC3B,YAAM,IAAI,YAAY,+CAAiB,UAAU,kBAAQ,UAAU,GAAG;AAAA,IACxE;AAEA,UAAM,SAAS,KAAK,UAAU,aAAa,IAAI,IAAI;AACnD,UAAM,WAAW,KAAK,UAAU,aAAa,IAAI,IAAI;AACrD,QAAI,WAAW,SAAS,IAAK,QAAO,EAAE,mBAAmB,GAAG,WAAW;AAEvE,QAAI,oBAAoB;AACxB,QAAI,MAAM;AACV,aAAS,IAAI,GAAG,IAAI,cAAc,MAAM,MAAM,WAAW,QAAQ,KAAK;AACpE,UAAI,KAAK,UAAU,KAAK,IAAI,MAAM,SAAY;AAC9C,2BAAqB,KAAK,UAAU,MAAM,IAAI,IAAI;AAClD,YAAM,UAAU,KAAK,UAAU,MAAM,IAAI,IAAI;AAC7C,YAAM,WAAW,KAAK,UAAU,MAAM,IAAI,IAAI;AAC9C,YAAM,aAAa,KAAK,UAAU,MAAM,IAAI,IAAI;AAChD,aAAO,KAAK,UAAU,WAAW;AAAA,IACnC;AAEA,QAAI,oBAAoB,qBAAqB;AAC3C,YAAM,IAAI,YAAY,sDAAmB,oBAAoB,OAAO,MAAM,QAAQ,CAAC,CAAC,oBAAU,sBAAsB,OAAO,IAAI,KAAK;AAAA,IACtI;AAEA,WAAO,EAAE,mBAAmB,WAAW;AAAA,EACzC,SAAS,KAAK;AACZ,QAAI,eAAe,YAAa,OAAM;AACtC,WAAO,EAAE,mBAAmB,GAAG,YAAY,EAAE;AAAA,EAC/C;AACF;AAGA,IAAM,eAAe;AACd,SAAS,aAAa,MAA6B;AACxD,QAAM,UAAU,KAAK,KAAK;AAC1B,MAAI,CAAC,WAAW,CAAC,aAAa,KAAK,OAAO,EAAG,QAAO;AACpD,SAAO;AACT;AAOO,SAAS,cAAc,KAAyB;AACrD,MAAI,EAAE,eAAe,OAAQ,QAAO;AACpC,QAAM,MAAM,IAAI;AAChB,MAAI,IAAI,SAAS,oBAAK,EAAG,QAAO;AAChC,MAAI,IAAI,SAAS,KAAK,EAAG,QAAO;AAChC,MAAI,IAAI,SAAS,UAAU,KAAK,IAAI,SAAS,kDAAe,KAAK,IAAI,SAAS,4CAAc,EAAG,QAAO;AACtG,MAAI,IAAI,SAAS,MAAM,KAAK,IAAI,SAAS,2BAAO,KAAK,IAAI,SAAS,2BAAO,EAAG,QAAO;AACnF,MAAI,IAAI,SAAS,iCAAQ,EAAG,QAAO;AACnC,MAAI,IAAI,SAAS,cAAI,MAAM,IAAI,SAAS,4BAAQ,KAAK,IAAI,SAAS,cAAI,GAAI,QAAO;AACjF,MAAI,IAAI,SAAS,0BAAM,KAAK,IAAI,SAAS,kCAAS,EAAG,QAAO;AAC5D,SAAO;AACT;","names":[]}
@@ -32,4 +32,4 @@ function parsePageRange(spec, maxPages) {
32
32
  export {
33
33
  parsePageRange
34
34
  };
35
- //# sourceMappingURL=chunk-MOL7MDBG.js.map
35
+ //# sourceMappingURL=chunk-3TBUDJDE.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"sources":["../src/page-range.ts"],"sourcesContent":["/** 페이지/섹션 범위 파싱 유틸리티 */\r\n\r\n/**\r\n * 페이지 범위 지정을 1-based Set<number>로 변환.\r\n *\r\n * @param spec - [1,2,3] 또는 \"1-3\" 또는 \"1,3,5-7\"\r\n * @param maxPages - 최대 페이지 수 (클램핑 상한)\r\n * @returns 1-based 페이지 번호 Set\r\n */\r\nexport function parsePageRange(spec: number[] | string, maxPages: number): Set<number> {\r\n const result = new Set<number>()\r\n if (maxPages <= 0) return result\r\n\r\n if (Array.isArray(spec)) {\r\n for (const n of spec) {\r\n const page = Math.round(n)\r\n if (page >= 1 && page <= maxPages) result.add(page)\r\n }\r\n return result\r\n }\r\n\r\n if (typeof spec !== \"string\" || spec.trim() === \"\") return result\r\n\r\n const parts = spec.split(\",\")\r\n for (const part of parts) {\r\n const trimmed = part.trim()\r\n if (!trimmed) continue\r\n\r\n const rangeMatch = trimmed.match(/^(\\d+)\\s*-\\s*(\\d+)$/)\r\n if (rangeMatch) {\r\n const start = Math.max(1, parseInt(rangeMatch[1], 10))\r\n const end = Math.min(maxPages, parseInt(rangeMatch[2], 10))\r\n for (let i = start; i <= end; i++) result.add(i)\r\n } else {\r\n const page = parseInt(trimmed, 10)\r\n if (!isNaN(page) && page >= 1 && page <= maxPages) result.add(page)\r\n }\r\n }\r\n\r\n return result\r\n}\r\n"],"mappings":";;;AASO,SAAS,eAAe,MAAyB,UAA+B;AACrF,QAAM,SAAS,oBAAI,IAAY;AAC/B,MAAI,YAAY,EAAG,QAAO;AAE1B,MAAI,MAAM,QAAQ,IAAI,GAAG;AACvB,eAAW,KAAK,MAAM;AACpB,YAAM,OAAO,KAAK,MAAM,CAAC;AACzB,UAAI,QAAQ,KAAK,QAAQ,SAAU,QAAO,IAAI,IAAI;AAAA,IACpD;AACA,WAAO;AAAA,EACT;AAEA,MAAI,OAAO,SAAS,YAAY,KAAK,KAAK,MAAM,GAAI,QAAO;AAE3D,QAAM,QAAQ,KAAK,MAAM,GAAG;AAC5B,aAAW,QAAQ,OAAO;AACxB,UAAM,UAAU,KAAK,KAAK;AAC1B,QAAI,CAAC,QAAS;AAEd,UAAM,aAAa,QAAQ,MAAM,qBAAqB;AACtD,QAAI,YAAY;AACd,YAAM,QAAQ,KAAK,IAAI,GAAG,SAAS,WAAW,CAAC,GAAG,EAAE,CAAC;AACrD,YAAM,MAAM,KAAK,IAAI,UAAU,SAAS,WAAW,CAAC,GAAG,EAAE,CAAC;AAC1D,eAAS,IAAI,OAAO,KAAK,KAAK,IAAK,QAAO,IAAI,CAAC;AAAA,IACjD,OAAO;AACL,YAAM,OAAO,SAAS,SAAS,EAAE;AACjC,UAAI,CAAC,MAAM,IAAI,KAAK,QAAQ,KAAK,QAAQ,SAAU,QAAO,IAAI,IAAI;AAAA,IACpE;AAAA,EACF;AAEA,SAAO;AACT;","names":[]}
@@ -6,10 +6,10 @@ import {
6
6
  precheckZipSize,
7
7
  sanitizeHref,
8
8
  toArrayBuffer
9
- } from "./chunk-L4OFASDS.js";
9
+ } from "./chunk-25TXW6EP.js";
10
10
  import {
11
11
  parsePageRange
12
- } from "./chunk-MOL7MDBG.js";
12
+ } from "./chunk-3TBUDJDE.js";
13
13
 
14
14
  // src/detect.ts
15
15
  import JSZip from "jszip";
@@ -163,6 +163,47 @@ function sanitizeText(text) {
163
163
  }
164
164
  return result;
165
165
  }
166
+ function flattenLayoutTables(blocks) {
167
+ const result = [];
168
+ for (const block of blocks) {
169
+ if (block.type !== "table" || !block.table) {
170
+ result.push(block);
171
+ continue;
172
+ }
173
+ const { rows: numRows, cols: numCols, cells } = block.table;
174
+ if (numRows === 1 && numCols === 1) {
175
+ result.push(block);
176
+ continue;
177
+ }
178
+ if (numRows <= 3) {
179
+ let totalNewlines = 0;
180
+ let totalTextLen = 0;
181
+ for (let r = 0; r < numRows; r++) {
182
+ for (let c = 0; c < numCols; c++) {
183
+ const t = cells[r]?.[c]?.text || "";
184
+ totalNewlines += (t.match(/\n/g) || []).length;
185
+ totalTextLen += t.length;
186
+ }
187
+ }
188
+ if (totalNewlines > 5 || numRows <= 2 && totalTextLen > 300) {
189
+ for (let r = 0; r < numRows; r++) {
190
+ for (let c = 0; c < numCols; c++) {
191
+ const cellText = cells[r]?.[c]?.text?.trim();
192
+ if (!cellText) continue;
193
+ for (const line of cellText.split("\n")) {
194
+ const trimmed = line.trim();
195
+ if (!trimmed) continue;
196
+ result.push({ type: "paragraph", text: trimmed, pageNumber: block.pageNumber });
197
+ }
198
+ }
199
+ }
200
+ continue;
201
+ }
202
+ }
203
+ result.push(block);
204
+ }
205
+ return result;
206
+ }
166
207
  function blocksToMarkdown(blocks) {
167
208
  const lines = [];
168
209
  for (let i = 0; i < blocks.length; i++) {
@@ -224,8 +265,11 @@ function blocksToMarkdown(blocks) {
224
265
  if (lines.length > 0 && lines[lines.length - 1] !== "") {
225
266
  lines.push("");
226
267
  }
227
- lines.push(tableToMarkdown(block.table));
228
- lines.push("");
268
+ const tableMd = tableToMarkdown(block.table);
269
+ if (tableMd) {
270
+ lines.push(tableMd);
271
+ lines.push("");
272
+ }
229
273
  }
230
274
  }
231
275
  return lines.join("\n").trim();
@@ -235,6 +279,7 @@ function tableToMarkdown(table) {
235
279
  const { cells, rows: numRows, cols: numCols } = table;
236
280
  if (numRows === 1 && numCols === 1) {
237
281
  const content = sanitizeText(cells[0][0].text);
282
+ if (!content) return "";
238
283
  return content.split(/\n/).map((line) => {
239
284
  const trimmed = line.trim();
240
285
  if (!trimmed) return "";
@@ -271,9 +316,9 @@ function tableToMarkdown(table) {
271
316
  const row = display[r];
272
317
  const isEmptyPlaceholder = row.every((cell) => cell === "");
273
318
  if (isEmptyPlaceholder) continue;
274
- const hasSkippedCols = row.some((cell, c) => cell === "" && skip.has(`${r},${c}`));
275
319
  const nonEmptyCols = row.filter((cell) => cell !== "");
276
- if (!hasSkippedCols && nonEmptyCols.length === 1 && row[0] !== "" && row.slice(1).every((c) => c === "")) {
320
+ const hasSkipInRow = row.some((_, c) => skip.has(`${r},${c}`));
321
+ if (!hasSkipInRow && nonEmptyCols.length === 1 && row[0] !== "" && row.slice(1).every((c) => c === "")) {
277
322
  pendingFirstCol = row[0];
278
323
  continue;
279
324
  }
@@ -705,7 +750,8 @@ function detectHwpxHeadings(blocks, styleMap) {
705
750
  else if (ratio >= HEADING_RATIO_H2) level = 2;
706
751
  else if (ratio >= HEADING_RATIO_H3) level = 3;
707
752
  }
708
- if (/^제\d+[조장절편]/.test(text) && text.length <= 50) {
753
+ const compactText = text.replace(/\s+/g, "");
754
+ if (/^제\d+[조장절편]/.test(compactText) && text.length <= 50) {
709
755
  if (level === 0) level = 3;
710
756
  }
711
757
  if (level > 0) {
@@ -757,9 +803,14 @@ function walkSection(node, blocks, tableCtx, tableStack, styleMap, warnings, sec
757
803
  if (newTable.rows.length > 0) {
758
804
  if (tableStack.length > 0) {
759
805
  const parentTable = tableStack.pop();
760
- const nestedText = convertTableToText(newTable.rows);
761
- if (parentTable.cell) {
762
- parentTable.cell.text += (parentTable.cell.text ? "\n" : "") + nestedText;
806
+ const nestedCols = Math.max(...newTable.rows.map((r) => r.length));
807
+ if (newTable.rows.length >= 3 && nestedCols >= 2) {
808
+ blocks.push({ type: "table", table: buildTable(newTable.rows), pageNumber: sectionNum });
809
+ } else {
810
+ const nestedText = convertTableToText(newTable.rows);
811
+ if (parentTable.cell) {
812
+ parentTable.cell.text += (parentTable.cell.text ? "\n" : "") + nestedText;
813
+ }
763
814
  }
764
815
  tableCtx = parentTable;
765
816
  } else {
@@ -859,9 +910,14 @@ function walkParagraphChildren(node, blocks, tableCtx, tableStack, styleMap, war
859
910
  if (newTable.rows.length > 0) {
860
911
  if (tableStack.length > 0) {
861
912
  const parentTable = tableStack.pop();
862
- const nestedText = convertTableToText(newTable.rows);
863
- if (parentTable.cell) {
864
- parentTable.cell.text += (parentTable.cell.text ? "\n" : "") + nestedText;
913
+ const nestedCols = Math.max(...newTable.rows.map((r) => r.length));
914
+ if (newTable.rows.length >= 3 && nestedCols >= 2) {
915
+ blocks.push({ type: "table", table: buildTable(newTable.rows), pageNumber: sectionNum });
916
+ } else {
917
+ const nestedText = convertTableToText(newTable.rows);
918
+ if (parentTable.cell) {
919
+ parentTable.cell.text += (parentTable.cell.text ? "\n" : "") + nestedText;
920
+ }
865
921
  }
866
922
  tableCtx = parentTable;
867
923
  } else {
@@ -872,13 +928,20 @@ function walkParagraphChildren(node, blocks, tableCtx, tableStack, styleMap, war
872
928
  tableCtx = tableStack.length > 0 ? tableStack.pop() : null;
873
929
  }
874
930
  } else if (localTag === "pic" || localTag === "shape" || localTag === "drawingObject") {
875
- const imgRef = extractImageRef(el);
876
- if (imgRef) {
877
- blocks.push({ type: "image", text: imgRef, pageNumber: sectionNum });
878
- } else if (warnings && sectionNum) {
879
- warnings.push({ page: sectionNum, message: `\uC2A4\uD0B5\uB41C \uC694\uC18C: ${localTag}`, code: "SKIPPED_IMAGE" });
931
+ const drawTextChild = findDescendant(el, "drawText");
932
+ if (drawTextChild) {
933
+ extractDrawTextBlocks(drawTextChild, blocks, styleMap, sectionNum);
934
+ } else {
935
+ const imgRef = extractImageRef(el);
936
+ if (imgRef) {
937
+ blocks.push({ type: "image", text: imgRef, pageNumber: sectionNum });
938
+ } else if (warnings && sectionNum) {
939
+ warnings.push({ page: sectionNum, message: `\uC2A4\uD0B5\uB41C \uC694\uC18C: ${localTag}`, code: "SKIPPED_IMAGE" });
940
+ }
880
941
  }
881
- } else if (localTag === "r" || localTag === "run" || localTag === "ctrl") {
942
+ } else if (localTag === "drawText") {
943
+ extractDrawTextBlocks(el, blocks, styleMap, sectionNum);
944
+ } else if (localTag === "r" || localTag === "run" || localTag === "ctrl" || localTag === "rect" || localTag === "ellipse" || localTag === "polygon" || localTag === "line" || localTag === "arc" || localTag === "curve" || localTag === "connectLine" || localTag === "container") {
882
945
  walkChildren(el, d + 1);
883
946
  }
884
947
  }
@@ -886,6 +949,40 @@ function walkParagraphChildren(node, blocks, tableCtx, tableStack, styleMap, war
886
949
  walkChildren(node, depth);
887
950
  return tableCtx;
888
951
  }
952
+ function findDescendant(node, targetTag, depth = 0) {
953
+ if (depth > 5) return null;
954
+ const children = node.childNodes;
955
+ if (!children) return null;
956
+ for (let i = 0; i < children.length; i++) {
957
+ const child = children[i];
958
+ if (child.nodeType !== 1) continue;
959
+ const tag = (child.tagName || child.localName || "").replace(/^[^:]+:/, "");
960
+ if (tag === targetTag) return child;
961
+ const found = findDescendant(child, targetTag, depth + 1);
962
+ if (found) return found;
963
+ }
964
+ return null;
965
+ }
966
+ function extractDrawTextBlocks(drawTextNode, blocks, styleMap, sectionNum) {
967
+ const children = drawTextNode.childNodes;
968
+ if (!children) return;
969
+ for (let i = 0; i < children.length; i++) {
970
+ const child = children[i];
971
+ if (child.nodeType !== 1) continue;
972
+ const tag = (child.tagName || child.localName || "").replace(/^[^:]+:/, "");
973
+ if (tag === "subList" || tag === "p" || tag === "para") {
974
+ if (tag === "subList") {
975
+ extractDrawTextBlocks(child, blocks, styleMap, sectionNum);
976
+ } else {
977
+ const info = extractParagraphInfo(child, styleMap);
978
+ const text = info.text.trim();
979
+ if (text) {
980
+ blocks.push({ type: "paragraph", text, style: info.style ?? void 0, pageNumber: sectionNum });
981
+ }
982
+ }
983
+ }
984
+ }
985
+ }
889
986
  function extractParagraphInfo(para, styleMap) {
890
987
  let text = "";
891
988
  let href;
@@ -904,11 +1001,18 @@ function extractParagraphInfo(para, styleMap) {
904
1001
  const tag = (child.tagName || child.localName || "").replace(/^[^:]+:/, "");
905
1002
  switch (tag) {
906
1003
  case "t":
907
- text += child.textContent || "";
1004
+ walk(child);
908
1005
  break;
909
- case "tab":
910
- text += " ";
1006
+ // 자식 순회 (tab 등 하위 요소 처리)
1007
+ case "tab": {
1008
+ const leader = child.getAttribute("leader");
1009
+ if (leader && leader !== "0") {
1010
+ text += "";
1011
+ } else {
1012
+ text += " ";
1013
+ }
911
1014
  break;
1015
+ }
912
1016
  case "br":
913
1017
  if ((child.getAttribute("type") || "line") === "line") text += "\n";
914
1018
  break;
@@ -975,6 +1079,8 @@ function extractParagraphInfo(para, styleMap) {
975
1079
  }
976
1080
  };
977
1081
  walk(para);
1082
+ const leaderIdx = text.indexOf("");
1083
+ if (leaderIdx >= 0) text = text.substring(0, leaderIdx);
978
1084
  let cleanText = text.replace(/[ \t]+/g, " ").trim();
979
1085
  if (/^그림입니다\.?\s*원본\s*그림의\s*(이름|크기)/.test(cleanText)) cleanText = "";
980
1086
  cleanText = cleanText.replace(/그림입니다\.?\s*원본\s*그림의\s*(이름|크기)[^\n]*(\n[^\n]*원본\s*그림의\s*(이름|크기)[^\n]*)*/g, "").trim();
@@ -1013,8 +1119,9 @@ var TAG_CHAR_SHAPE = 68;
1013
1119
  var TAG_CTRL_HEADER = 71;
1014
1120
  var TAG_LIST_HEADER = 72;
1015
1121
  var TAG_TABLE = 77;
1016
- var TAG_DOC_CHAR_SHAPE = 55;
1017
- var TAG_DOC_STYLE = 58;
1122
+ var TAG_DOC_CHAR_SHAPE = 21;
1123
+ var TAG_DOC_PARA_SHAPE = 25;
1124
+ var TAG_DOC_STYLE = 26;
1018
1125
  var CHAR_LINE = 0;
1019
1126
  var CHAR_SECTION_BREAK = 10;
1020
1127
  var CHAR_PARA = 13;
@@ -1070,8 +1177,14 @@ function parseFileHeader(data) {
1070
1177
  }
1071
1178
  function parseDocInfo(records) {
1072
1179
  const charShapes = [];
1180
+ const paraShapes = [];
1073
1181
  const styles = [];
1074
1182
  for (const rec of records) {
1183
+ if (rec.tagId === TAG_DOC_PARA_SHAPE && rec.data.length >= 4) {
1184
+ const flags = rec.data.readUInt32LE(0);
1185
+ const outlineLevel = flags >> 25 & 7;
1186
+ paraShapes.push({ outlineLevel });
1187
+ }
1075
1188
  if (rec.tagId === TAG_DOC_CHAR_SHAPE && rec.data.length >= 18) {
1076
1189
  if (rec.data.length >= 50) {
1077
1190
  const fontSize = rec.data.readUInt32LE(42);
@@ -1111,7 +1224,7 @@ function parseDocInfo(records) {
1111
1224
  }
1112
1225
  }
1113
1226
  }
1114
- return { charShapes, styles };
1227
+ return { charShapes, paraShapes, styles };
1115
1228
  }
1116
1229
  function extractText(data) {
1117
1230
  let result = "";
@@ -2121,12 +2234,13 @@ function parseHwp5Document(buffer, options) {
2121
2234
  }
2122
2235
  }
2123
2236
  const images = cfb ? extractHwp5Images(cfb, blocks, compressed, warnings) : extractHwp5ImagesLenient(lenientCfb, blocks, compressed, warnings);
2237
+ const flatBlocks = flattenLayoutTables(blocks);
2124
2238
  if (docInfo) {
2125
- detectHwp5Headings(blocks, docInfo);
2239
+ detectHwp5Headings(flatBlocks, docInfo);
2126
2240
  }
2127
- const outline = blocks.filter((b) => b.type === "heading" && b.level && b.text).map((b) => ({ level: b.level, text: b.text, pageNumber: b.pageNumber }));
2128
- const markdown = blocksToMarkdown(blocks);
2129
- return { markdown, blocks, metadata, outline: outline.length > 0 ? outline : void 0, warnings: warnings.length > 0 ? warnings : void 0, images: images.length > 0 ? images : void 0 };
2241
+ const outline = flatBlocks.filter((b) => b.type === "heading" && b.level && b.text).map((b) => ({ level: b.level, text: b.text, pageNumber: b.pageNumber }));
2242
+ const markdown = blocksToMarkdown(flatBlocks);
2243
+ return { markdown, blocks: flatBlocks, metadata, outline: outline.length > 0 ? outline : void 0, warnings: warnings.length > 0 ? warnings : void 0, images: images.length > 0 ? images : void 0 };
2130
2244
  }
2131
2245
  function parseDocInfoStream(cfb, compressed) {
2132
2246
  try {
@@ -2177,16 +2291,21 @@ function detectHwp5Headings(blocks, docInfo) {
2177
2291
  }
2178
2292
  if (baseFontSize <= 0) return;
2179
2293
  for (const block of blocks) {
2180
- if (block.type !== "paragraph" || !block.text || !block.style?.fontSize) continue;
2294
+ if (block.type === "heading") continue;
2295
+ if (block.type !== "paragraph" || !block.text) continue;
2181
2296
  const text = block.text.trim();
2182
2297
  if (text.length === 0 || text.length > 200) continue;
2183
2298
  if (/^\d+$/.test(text)) continue;
2184
- const ratio = block.style.fontSize / baseFontSize;
2185
2299
  let level = 0;
2186
- if (ratio >= HEADING_RATIO_H1) level = 1;
2187
- else if (ratio >= HEADING_RATIO_H2) level = 2;
2188
- else if (ratio >= HEADING_RATIO_H3) level = 3;
2189
- if (/^제\d+[조장절편]/.test(text) && text.length <= 50) {
2300
+ if (block.style?.fontSize && baseFontSize > 0) {
2301
+ const ratio = block.style.fontSize / baseFontSize;
2302
+ if (ratio >= HEADING_RATIO_H1) level = 1;
2303
+ else if (ratio >= HEADING_RATIO_H2) level = 2;
2304
+ else if (ratio >= HEADING_RATIO_H3) level = 3;
2305
+ }
2306
+ if (/^제\d+[장절편]\s/.test(text) && text.length <= 50) {
2307
+ if (level === 0) level = 2;
2308
+ } else if (/^제\d+(조의?\d*)\s*[\((]/.test(text) && text.length <= 80) {
2190
2309
  if (level === 0) level = 3;
2191
2310
  }
2192
2311
  if (level > 0) {
@@ -2432,13 +2551,20 @@ function parseSection(records, docInfo, warnings, sectionNum) {
2432
2551
  while (i < records.length) {
2433
2552
  const rec = records[i];
2434
2553
  if (rec.tagId === TAG_PARA_HEADER && rec.level === 0) {
2435
- const { paragraph, tables, nextIdx, charShapeIds } = parseParagraphWithTables(records, i);
2554
+ const { paragraph, tables, nextIdx, charShapeIds, paraShapeId } = parseParagraphWithTables(records, i);
2436
2555
  if (paragraph) {
2437
2556
  const block = { type: "paragraph", text: paragraph, pageNumber: sectionNum };
2438
2557
  if (docInfo && charShapeIds.length > 0) {
2439
2558
  const style = resolveCharStyle(charShapeIds, docInfo);
2440
2559
  if (style) block.style = style;
2441
2560
  }
2561
+ if (docInfo && paraShapeId >= 0 && paraShapeId < docInfo.paraShapes.length) {
2562
+ const ol = docInfo.paraShapes[paraShapeId].outlineLevel;
2563
+ if (ol >= 1 && ol <= 6) {
2564
+ block.type = "heading";
2565
+ block.level = ol;
2566
+ }
2567
+ }
2442
2568
  blocks.push(block);
2443
2569
  }
2444
2570
  for (const t of tables) blocks.push({ type: "table", table: t, pageNumber: sectionNum });
@@ -2458,7 +2584,10 @@ function parseSection(records, docInfo, warnings, sectionNum) {
2458
2584
  if (binId >= 0) {
2459
2585
  blocks.push({ type: "image", text: String(binId), pageNumber: sectionNum });
2460
2586
  } else {
2461
- warnings.push({ page: sectionNum, message: `\uC2A4\uD0B5\uB41C \uC81C\uC5B4 \uC694\uC18C: ${ctrlId.trim()}`, code: "SKIPPED_IMAGE" });
2587
+ const boxText = extractTextBoxText(records, i);
2588
+ if (boxText) {
2589
+ blocks.push({ type: "paragraph", text: boxText, pageNumber: sectionNum });
2590
+ }
2462
2591
  }
2463
2592
  } else if (ctrlId === " elo" || ctrlId === "ole ") {
2464
2593
  warnings.push({ page: sectionNum, message: `\uC2A4\uD0B5\uB41C \uC81C\uC5B4 \uC694\uC18C: ${ctrlId.trim()}`, code: "SKIPPED_IMAGE" });
@@ -2497,6 +2626,19 @@ function extractNoteText(records, ctrlIdx) {
2497
2626
  }
2498
2627
  return texts.length > 0 ? texts.join(" ") : null;
2499
2628
  }
2629
+ function extractTextBoxText(records, ctrlIdx) {
2630
+ const ctrlLevel = records[ctrlIdx].level;
2631
+ const texts = [];
2632
+ for (let j = ctrlIdx + 1; j < records.length && j < ctrlIdx + 200; j++) {
2633
+ const r = records[j];
2634
+ if (r.level <= ctrlLevel) break;
2635
+ if (r.tagId === TAG_PARA_TEXT) {
2636
+ const t = extractText(r.data).trim();
2637
+ if (t) texts.push(t);
2638
+ }
2639
+ }
2640
+ return texts.length > 0 ? texts.join("\n") : null;
2641
+ }
2500
2642
  function extractHyperlinkUrl(data) {
2501
2643
  try {
2502
2644
  const httpSig = Buffer.from("http", "utf16le");
@@ -2542,6 +2684,8 @@ function parseParagraphWithTables(records, startIdx) {
2542
2684
  let text = "";
2543
2685
  const tables = [];
2544
2686
  const charShapeIds = [];
2687
+ const paraHeaderData = records[startIdx].data;
2688
+ const paraShapeId = paraHeaderData.length >= 10 ? paraHeaderData.readUInt16LE(8) : -1;
2545
2689
  let i = startIdx + 1;
2546
2690
  while (i < records.length) {
2547
2691
  const rec = records[i];
@@ -2566,7 +2710,7 @@ function parseParagraphWithTables(records, startIdx) {
2566
2710
  i++;
2567
2711
  }
2568
2712
  const trimmed = text.trim();
2569
- return { paragraph: trimmed || null, tables, nextIdx: i, charShapeIds };
2713
+ return { paragraph: trimmed || null, tables, nextIdx: i, charShapeIds, paraShapeId };
2570
2714
  }
2571
2715
  function parseTableBlock(records, startIdx) {
2572
2716
  const tableLevel = records[startIdx].level;
@@ -3384,7 +3528,7 @@ async function parsePdfDocument(buffer, options) {
3384
3528
  if (totalChars / Math.max(parsedPageCount, 1) < 10) {
3385
3529
  if (options?.ocr) {
3386
3530
  try {
3387
- const { ocrPages } = await import("./provider-A4FHJSID.js");
3531
+ const { ocrPages } = await import("./provider-EU3CG724.js");
3388
3532
  const ocrBlocks = await ocrPages(doc, options.ocr, pageFilter, effectivePageCount);
3389
3533
  if (ocrBlocks.length > 0) {
3390
3534
  const ocrMarkdown = ocrBlocks.map((b) => b.text || "").filter(Boolean).join("\n\n");
@@ -4482,7 +4626,7 @@ async function parseXlsxDocument(buffer, options) {
4482
4626
  }
4483
4627
  let pageFilter = null;
4484
4628
  if (options?.pages) {
4485
- const { parsePageRange: parsePageRange2 } = await import("./page-range-737B4EZW.js");
4629
+ const { parsePageRange: parsePageRange2 } = await import("./page-range-OF5I4PQY.js");
4486
4630
  pageFilter = parsePageRange2(options.pages, sheets.length);
4487
4631
  }
4488
4632
  const blocks = [];
@@ -5365,4 +5509,4 @@ export {
5365
5509
  extractFormFields,
5366
5510
  parse
5367
5511
  };
5368
- //# sourceMappingURL=chunk-JJ65GKUH.js.map
5512
+ //# sourceMappingURL=chunk-4UH6ABAY.js.map