kordoc 2.0.2 → 2.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -21
- package/README.md +291 -291
- package/dist/{chunk-EVWOJ4T5.js → chunk-25TXW6EP.js} +2 -2
- package/dist/chunk-25TXW6EP.js.map +1 -0
- package/dist/{chunk-MOL7MDBG.js → chunk-3TBUDJDE.js} +1 -1
- package/dist/chunk-3TBUDJDE.js.map +1 -0
- package/dist/{chunk-XJYM2AUA.js → chunk-4UH6ABAY.js} +83 -20
- package/dist/chunk-4UH6ABAY.js.map +1 -0
- package/dist/cli.js +5 -5
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +79 -16
- package/dist/index.cjs.map +1 -1
- package/dist/index.js +79 -16
- package/dist/index.js.map +1 -1
- package/dist/mcp.js +3 -3
- package/dist/mcp.js.map +1 -1
- package/dist/page-range-OF5I4PQY.js +8 -0
- package/dist/{provider-A4FHJSID.js → provider-EU3CG724.js} +1 -1
- package/dist/provider-EU3CG724.js.map +1 -0
- package/dist/{utils-6JEIFBCJ.js → utils-BTZ4WSYX.js} +2 -2
- package/dist/{watch-BCPDLGOE.js → watch-QD3PDNXQ.js} +4 -4
- package/dist/watch-QD3PDNXQ.js.map +1 -0
- package/package.json +1 -1
- package/dist/chunk-EVWOJ4T5.js.map +0 -1
- package/dist/chunk-MOL7MDBG.js.map +0 -1
- package/dist/chunk-XJYM2AUA.js.map +0 -1
- package/dist/page-range-737B4EZW.js +0 -8
- package/dist/provider-A4FHJSID.js.map +0 -1
- package/dist/watch-BCPDLGOE.js.map +0 -1
- /package/dist/{page-range-737B4EZW.js.map → page-range-OF5I4PQY.js.map} +0 -0
- /package/dist/{utils-6JEIFBCJ.js.map → utils-BTZ4WSYX.js.map} +0 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"sources":["../src/utils.ts"],"sourcesContent":["/** kordoc 공용 유틸리티 */\r\n\r\n/** 빌드 타임에 tsup define으로 주입되는 버전 */\r\ndeclare const __KORDOC_VERSION__: string\r\nexport const VERSION: string = typeof __KORDOC_VERSION__ !== \"undefined\" ? __KORDOC_VERSION__ : \"0.0.0-dev\"\r\n\r\n/**\r\n * Node.js Buffer → ArrayBuffer 변환\r\n * pool Buffer의 공유 ArrayBuffer 문제를 안전하게 처리.\r\n * offset=0이고 전체 ArrayBuffer를 차지하면 복사 없이 직접 반환.\r\n */\r\nexport function toArrayBuffer(buf: Buffer): ArrayBuffer {\r\n if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {\r\n return buf.buffer as ArrayBuffer\r\n }\r\n return buf.buffer.slice(buf.byteOffset, buf.byteOffset + buf.byteLength) as ArrayBuffer\r\n}\r\n\r\n/**\r\n * kordoc 내부 에러 클래스 — 사용자에게 노출해도 안전한 메시지만 포함.\r\n * MCP 에러 정제에서 instanceof로 판별하여 allowlist 패턴 매칭 없이 안전하게 통과.\r\n */\r\nexport class KordocError extends Error {\r\n constructor(message: string) {\r\n super(message)\r\n this.name = \"KordocError\"\r\n }\r\n}\r\n\r\n/**\r\n * 에러 메시지 정제 — KordocError는 그대로, 나머지는 일반 메시지로 대체.\r\n * 파일시스템 경로, 스택 트레이스 등 내부 정보 노출 방지.\r\n */\r\nexport function sanitizeError(err: unknown): string {\r\n if (err instanceof KordocError) return err.message\r\n return \"문서 처리 중 오류가 발생했습니다\"\r\n}\r\n\r\n/**\r\n * ZIP 엔트리 경로의 경로 순회 여부 판별.\r\n * 백슬래시 정규화, .., 절대경로, Windows 드라이브 문자 모두 차단.\r\n */\r\nexport function isPathTraversal(name: string): boolean {\r\n if (name.includes(\"\\x00\")) return true\r\n const normalized = name.replace(/\\\\/g, \"/\")\r\n return normalized.includes(\"..\") || normalized.startsWith(\"/\") || /^[A-Za-z]:/.test(normalized)\r\n}\r\n\r\n// ─── ZIP 안전 로딩 (ZIP bomb 방지) ────────────────────\r\n\r\n/**\r\n * ZIP bomb 사전 검사 — Central Directory에서 비압축 합계와 엔트리 수 확인.\r\n * HWPX/XLSX/DOCX 등 모든 ZIP 기반 포맷에서 공통 사용.\r\n */\r\nexport function precheckZipSize(\r\n buffer: ArrayBuffer,\r\n maxUncompressedSize = 100 * 1024 * 1024,\r\n maxEntries = 500,\r\n): { totalUncompressed: number; entryCount: number } {\r\n try {\r\n 
const data = new DataView(buffer)\r\n const len = buffer.byteLength\r\n // EOCD 시그니처 역방향 스캔\r\n let eocdOffset = -1\r\n for (let i = len - 22; i >= Math.max(0, len - 65557); i--) {\r\n if (data.getUint32(i, true) === 0x06054b50) { eocdOffset = i; break }\r\n }\r\n if (eocdOffset < 0) return { totalUncompressed: 0, entryCount: 0 }\r\n\r\n const entryCount = data.getUint16(eocdOffset + 10, true)\r\n if (entryCount > maxEntries) {\r\n throw new KordocError(`ZIP 엔트리 수 초과: ${entryCount} (최대 ${maxEntries})`)\r\n }\r\n\r\n const cdSize = data.getUint32(eocdOffset + 12, true)\r\n const cdOffset = data.getUint32(eocdOffset + 16, true)\r\n if (cdOffset + cdSize > len) return { totalUncompressed: 0, entryCount }\r\n\r\n let totalUncompressed = 0\r\n let pos = cdOffset\r\n for (let i = 0; i < entryCount && pos + 46 <= cdOffset + cdSize; i++) {\r\n if (data.getUint32(pos, true) !== 0x02014b50) break\r\n totalUncompressed += data.getUint32(pos + 24, true)\r\n const nameLen = data.getUint16(pos + 28, true)\r\n const extraLen = data.getUint16(pos + 30, true)\r\n const commentLen = data.getUint16(pos + 32, true)\r\n pos += 46 + nameLen + extraLen + commentLen\r\n }\r\n\r\n if (totalUncompressed > maxUncompressedSize) {\r\n throw new KordocError(`ZIP 비압축 크기 초과: ${(totalUncompressed / 1024 / 1024).toFixed(1)}MB (최대 ${maxUncompressedSize / 1024 / 1024}MB)`)\r\n }\r\n\r\n return { totalUncompressed, entryCount }\r\n } catch (err) {\r\n if (err instanceof KordocError) throw err\r\n return { totalUncompressed: 0, entryCount: 0 }\r\n }\r\n}\r\n\r\n/** 하이퍼링크 URL 살균 — javascript: 등 XSS 위험 스킴 차단 */\r\nconst SAFE_HREF_RE = /^(?:https?:|mailto:|tel:|#)/i\r\nexport function sanitizeHref(href: string): string | null {\r\n const trimmed = href.trim()\r\n if (!trimmed || !SAFE_HREF_RE.test(trimmed)) return null\r\n return trimmed\r\n}\r\n\r\n// ─── 에러 분류 ──────────────────────────────────────\r\n\r\nimport type { ErrorCode } from \"./types.js\"\r\n\r\n/** 에러를 구조화된 ErrorCode로 분류 — KordocError 메시지 
패턴 매칭 */\r\nexport function classifyError(err: unknown): ErrorCode {\r\n if (!(err instanceof Error)) return \"PARSE_ERROR\"\r\n const msg = err.message\r\n if (msg.includes(\"암호화\")) return \"ENCRYPTED\"\r\n if (msg.includes(\"DRM\")) return \"DRM_PROTECTED\"\r\n if (msg.includes(\"ZIP bomb\") || msg.includes(\"ZIP 비압축 크기 초과\") || msg.includes(\"ZIP 엔트리 수 초과\")) return \"ZIP_BOMB\"\r\n if (msg.includes(\"bomb\") || msg.includes(\"크기 초과\") || msg.includes(\"압축 해제\")) return \"DECOMPRESSION_BOMB\"\r\n if (msg.includes(\"이미지 기반\")) return \"IMAGE_BASED_PDF\"\r\n if (msg.includes(\"섹션\") && (msg.includes(\"찾을 수 없\") || msg.includes(\"없음\"))) return \"NO_SECTIONS\"\r\n if (msg.includes(\"시그니처\") || msg.includes(\"복구할 수 없\")) return \"CORRUPTED\"\r\n return \"PARSE_ERROR\"\r\n}\r\n"],"mappings":";;;AAIO,IAAM,UAAkB,OAA4C,UAAqB;AAOzF,SAAS,cAAc,KAA0B;AACtD,MAAI,IAAI,eAAe,KAAK,IAAI,eAAe,IAAI,OAAO,YAAY;AACpE,WAAO,IAAI;AAAA,EACb;AACA,SAAO,IAAI,OAAO,MAAM,IAAI,YAAY,IAAI,aAAa,IAAI,UAAU;AACzE;AAMO,IAAM,cAAN,cAA0B,MAAM;AAAA,EACrC,YAAY,SAAiB;AAC3B,UAAM,OAAO;AACb,SAAK,OAAO;AAAA,EACd;AACF;AAMO,SAAS,cAAc,KAAsB;AAClD,MAAI,eAAe,YAAa,QAAO,IAAI;AAC3C,SAAO;AACT;AAMO,SAAS,gBAAgB,MAAuB;AACrD,MAAI,KAAK,SAAS,IAAM,EAAG,QAAO;AAClC,QAAM,aAAa,KAAK,QAAQ,OAAO,GAAG;AAC1C,SAAO,WAAW,SAAS,IAAI,KAAK,WAAW,WAAW,GAAG,KAAK,aAAa,KAAK,UAAU;AAChG;AAQO,SAAS,gBACd,QACA,sBAAsB,MAAM,OAAO,MACnC,aAAa,KACsC;AACnD,MAAI;AACF,UAAM,OAAO,IAAI,SAAS,MAAM;AAChC,UAAM,MAAM,OAAO;AAEnB,QAAI,aAAa;AACjB,aAAS,IAAI,MAAM,IAAI,KAAK,KAAK,IAAI,GAAG,MAAM,KAAK,GAAG,KAAK;AACzD,UAAI,KAAK,UAAU,GAAG,IAAI,MAAM,WAAY;AAAE,qBAAa;AAAG;AAAA,MAAM;AAAA,IACtE;AACA,QAAI,aAAa,EAAG,QAAO,EAAE,mBAAmB,GAAG,YAAY,EAAE;AAEjE,UAAM,aAAa,KAAK,UAAU,aAAa,IAAI,IAAI;AACvD,QAAI,aAAa,YAAY;AAC3B,YAAM,IAAI,YAAY,+CAAiB,UAAU,kBAAQ,UAAU,GAAG;AAAA,IACxE;AAEA,UAAM,SAAS,KAAK,UAAU,aAAa,IAAI,IAAI;AACnD,UAAM,WAAW,KAAK,UAAU,aAAa,IAAI,IAAI;AACrD,QAAI,WAAW,SAAS,IAAK,QAAO,EAAE,mBAAmB,GAAG,WAAW;AAEvE,QAAI,oBAAoB;AACxB,QAAI,MAAM;AACV,aAAS,IAAI,GAAG,IAAI,cAAc,MAAM,MAAM,WAAW,QAAQ,KAAK;AACp
E,UAAI,KAAK,UAAU,KAAK,IAAI,MAAM,SAAY;AAC9C,2BAAqB,KAAK,UAAU,MAAM,IAAI,IAAI;AAClD,YAAM,UAAU,KAAK,UAAU,MAAM,IAAI,IAAI;AAC7C,YAAM,WAAW,KAAK,UAAU,MAAM,IAAI,IAAI;AAC9C,YAAM,aAAa,KAAK,UAAU,MAAM,IAAI,IAAI;AAChD,aAAO,KAAK,UAAU,WAAW;AAAA,IACnC;AAEA,QAAI,oBAAoB,qBAAqB;AAC3C,YAAM,IAAI,YAAY,sDAAmB,oBAAoB,OAAO,MAAM,QAAQ,CAAC,CAAC,oBAAU,sBAAsB,OAAO,IAAI,KAAK;AAAA,IACtI;AAEA,WAAO,EAAE,mBAAmB,WAAW;AAAA,EACzC,SAAS,KAAK;AACZ,QAAI,eAAe,YAAa,OAAM;AACtC,WAAO,EAAE,mBAAmB,GAAG,YAAY,EAAE;AAAA,EAC/C;AACF;AAGA,IAAM,eAAe;AACd,SAAS,aAAa,MAA6B;AACxD,QAAM,UAAU,KAAK,KAAK;AAC1B,MAAI,CAAC,WAAW,CAAC,aAAa,KAAK,OAAO,EAAG,QAAO;AACpD,SAAO;AACT;AAOO,SAAS,cAAc,KAAyB;AACrD,MAAI,EAAE,eAAe,OAAQ,QAAO;AACpC,QAAM,MAAM,IAAI;AAChB,MAAI,IAAI,SAAS,oBAAK,EAAG,QAAO;AAChC,MAAI,IAAI,SAAS,KAAK,EAAG,QAAO;AAChC,MAAI,IAAI,SAAS,UAAU,KAAK,IAAI,SAAS,kDAAe,KAAK,IAAI,SAAS,4CAAc,EAAG,QAAO;AACtG,MAAI,IAAI,SAAS,MAAM,KAAK,IAAI,SAAS,2BAAO,KAAK,IAAI,SAAS,2BAAO,EAAG,QAAO;AACnF,MAAI,IAAI,SAAS,iCAAQ,EAAG,QAAO;AACnC,MAAI,IAAI,SAAS,cAAI,MAAM,IAAI,SAAS,4BAAQ,KAAK,IAAI,SAAS,cAAI,GAAI,QAAO;AACjF,MAAI,IAAI,SAAS,0BAAM,KAAK,IAAI,SAAS,kCAAS,EAAG,QAAO;AAC5D,SAAO;AACT;","names":[]}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"sources":["../src/page-range.ts"],"sourcesContent":["/** 페이지/섹션 범위 파싱 유틸리티 */\r\n\r\n/**\r\n * 페이지 범위 지정을 1-based Set<number>로 변환.\r\n *\r\n * @param spec - [1,2,3] 또는 \"1-3\" 또는 \"1,3,5-7\"\r\n * @param maxPages - 최대 페이지 수 (클램핑 상한)\r\n * @returns 1-based 페이지 번호 Set\r\n */\r\nexport function parsePageRange(spec: number[] | string, maxPages: number): Set<number> {\r\n const result = new Set<number>()\r\n if (maxPages <= 0) return result\r\n\r\n if (Array.isArray(spec)) {\r\n for (const n of spec) {\r\n const page = Math.round(n)\r\n if (page >= 1 && page <= maxPages) result.add(page)\r\n }\r\n return result\r\n }\r\n\r\n if (typeof spec !== \"string\" || spec.trim() === \"\") return result\r\n\r\n const parts = spec.split(\",\")\r\n for (const part of parts) {\r\n const trimmed = part.trim()\r\n if (!trimmed) continue\r\n\r\n const rangeMatch = trimmed.match(/^(\\d+)\\s*-\\s*(\\d+)$/)\r\n if (rangeMatch) {\r\n const start = Math.max(1, parseInt(rangeMatch[1], 10))\r\n const end = Math.min(maxPages, parseInt(rangeMatch[2], 10))\r\n for (let i = start; i <= end; i++) result.add(i)\r\n } else {\r\n const page = parseInt(trimmed, 10)\r\n if (!isNaN(page) && page >= 1 && page <= maxPages) result.add(page)\r\n }\r\n }\r\n\r\n return 
result\r\n}\r\n"],"mappings":";;;AASO,SAAS,eAAe,MAAyB,UAA+B;AACrF,QAAM,SAAS,oBAAI,IAAY;AAC/B,MAAI,YAAY,EAAG,QAAO;AAE1B,MAAI,MAAM,QAAQ,IAAI,GAAG;AACvB,eAAW,KAAK,MAAM;AACpB,YAAM,OAAO,KAAK,MAAM,CAAC;AACzB,UAAI,QAAQ,KAAK,QAAQ,SAAU,QAAO,IAAI,IAAI;AAAA,IACpD;AACA,WAAO;AAAA,EACT;AAEA,MAAI,OAAO,SAAS,YAAY,KAAK,KAAK,MAAM,GAAI,QAAO;AAE3D,QAAM,QAAQ,KAAK,MAAM,GAAG;AAC5B,aAAW,QAAQ,OAAO;AACxB,UAAM,UAAU,KAAK,KAAK;AAC1B,QAAI,CAAC,QAAS;AAEd,UAAM,aAAa,QAAQ,MAAM,qBAAqB;AACtD,QAAI,YAAY;AACd,YAAM,QAAQ,KAAK,IAAI,GAAG,SAAS,WAAW,CAAC,GAAG,EAAE,CAAC;AACrD,YAAM,MAAM,KAAK,IAAI,UAAU,SAAS,WAAW,CAAC,GAAG,EAAE,CAAC;AAC1D,eAAS,IAAI,OAAO,KAAK,KAAK,IAAK,QAAO,IAAI,CAAC;AAAA,IACjD,OAAO;AACL,YAAM,OAAO,SAAS,SAAS,EAAE;AACjC,UAAI,CAAC,MAAM,IAAI,KAAK,QAAQ,KAAK,QAAQ,SAAU,QAAO,IAAI,IAAI;AAAA,IACpE;AAAA,EACF;AAEA,SAAO;AACT;","names":[]}
|
|
@@ -6,10 +6,10 @@ import {
|
|
|
6
6
|
precheckZipSize,
|
|
7
7
|
sanitizeHref,
|
|
8
8
|
toArrayBuffer
|
|
9
|
-
} from "./chunk-EVWOJ4T5.js";
|
|
9
|
+
} from "./chunk-25TXW6EP.js";
|
|
10
10
|
import {
|
|
11
11
|
parsePageRange
|
|
12
|
-
} from "./chunk-MOL7MDBG.js";
|
|
12
|
+
} from "./chunk-3TBUDJDE.js";
|
|
13
13
|
|
|
14
14
|
// src/detect.ts
|
|
15
15
|
import JSZip from "jszip";
|
|
@@ -163,6 +163,47 @@ function sanitizeText(text) {
|
|
|
163
163
|
}
|
|
164
164
|
return result;
|
|
165
165
|
}
|
|
166
|
+
function flattenLayoutTables(blocks) {
|
|
167
|
+
const result = [];
|
|
168
|
+
for (const block of blocks) {
|
|
169
|
+
if (block.type !== "table" || !block.table) {
|
|
170
|
+
result.push(block);
|
|
171
|
+
continue;
|
|
172
|
+
}
|
|
173
|
+
const { rows: numRows, cols: numCols, cells } = block.table;
|
|
174
|
+
if (numRows === 1 && numCols === 1) {
|
|
175
|
+
result.push(block);
|
|
176
|
+
continue;
|
|
177
|
+
}
|
|
178
|
+
if (numRows <= 3) {
|
|
179
|
+
let totalNewlines = 0;
|
|
180
|
+
let totalTextLen = 0;
|
|
181
|
+
for (let r = 0; r < numRows; r++) {
|
|
182
|
+
for (let c = 0; c < numCols; c++) {
|
|
183
|
+
const t = cells[r]?.[c]?.text || "";
|
|
184
|
+
totalNewlines += (t.match(/\n/g) || []).length;
|
|
185
|
+
totalTextLen += t.length;
|
|
186
|
+
}
|
|
187
|
+
}
|
|
188
|
+
if (totalNewlines > 5 || numRows <= 2 && totalTextLen > 300) {
|
|
189
|
+
for (let r = 0; r < numRows; r++) {
|
|
190
|
+
for (let c = 0; c < numCols; c++) {
|
|
191
|
+
const cellText = cells[r]?.[c]?.text?.trim();
|
|
192
|
+
if (!cellText) continue;
|
|
193
|
+
for (const line of cellText.split("\n")) {
|
|
194
|
+
const trimmed = line.trim();
|
|
195
|
+
if (!trimmed) continue;
|
|
196
|
+
result.push({ type: "paragraph", text: trimmed, pageNumber: block.pageNumber });
|
|
197
|
+
}
|
|
198
|
+
}
|
|
199
|
+
}
|
|
200
|
+
continue;
|
|
201
|
+
}
|
|
202
|
+
}
|
|
203
|
+
result.push(block);
|
|
204
|
+
}
|
|
205
|
+
return result;
|
|
206
|
+
}
|
|
166
207
|
function blocksToMarkdown(blocks) {
|
|
167
208
|
const lines = [];
|
|
168
209
|
for (let i = 0; i < blocks.length; i++) {
|
|
@@ -1078,8 +1119,9 @@ var TAG_CHAR_SHAPE = 68;
|
|
|
1078
1119
|
var TAG_CTRL_HEADER = 71;
|
|
1079
1120
|
var TAG_LIST_HEADER = 72;
|
|
1080
1121
|
var TAG_TABLE = 77;
|
|
1081
|
-
var TAG_DOC_CHAR_SHAPE =
|
|
1082
|
-
var
|
|
1122
|
+
var TAG_DOC_CHAR_SHAPE = 21;
|
|
1123
|
+
var TAG_DOC_PARA_SHAPE = 25;
|
|
1124
|
+
var TAG_DOC_STYLE = 26;
|
|
1083
1125
|
var CHAR_LINE = 0;
|
|
1084
1126
|
var CHAR_SECTION_BREAK = 10;
|
|
1085
1127
|
var CHAR_PARA = 13;
|
|
@@ -1135,8 +1177,14 @@ function parseFileHeader(data) {
|
|
|
1135
1177
|
}
|
|
1136
1178
|
function parseDocInfo(records) {
|
|
1137
1179
|
const charShapes = [];
|
|
1180
|
+
const paraShapes = [];
|
|
1138
1181
|
const styles = [];
|
|
1139
1182
|
for (const rec of records) {
|
|
1183
|
+
if (rec.tagId === TAG_DOC_PARA_SHAPE && rec.data.length >= 4) {
|
|
1184
|
+
const flags = rec.data.readUInt32LE(0);
|
|
1185
|
+
const outlineLevel = flags >> 25 & 7;
|
|
1186
|
+
paraShapes.push({ outlineLevel });
|
|
1187
|
+
}
|
|
1140
1188
|
if (rec.tagId === TAG_DOC_CHAR_SHAPE && rec.data.length >= 18) {
|
|
1141
1189
|
if (rec.data.length >= 50) {
|
|
1142
1190
|
const fontSize = rec.data.readUInt32LE(42);
|
|
@@ -1176,7 +1224,7 @@ function parseDocInfo(records) {
|
|
|
1176
1224
|
}
|
|
1177
1225
|
}
|
|
1178
1226
|
}
|
|
1179
|
-
return { charShapes, styles };
|
|
1227
|
+
return { charShapes, paraShapes, styles };
|
|
1180
1228
|
}
|
|
1181
1229
|
function extractText(data) {
|
|
1182
1230
|
let result = "";
|
|
@@ -2186,12 +2234,13 @@ function parseHwp5Document(buffer, options) {
|
|
|
2186
2234
|
}
|
|
2187
2235
|
}
|
|
2188
2236
|
const images = cfb ? extractHwp5Images(cfb, blocks, compressed, warnings) : extractHwp5ImagesLenient(lenientCfb, blocks, compressed, warnings);
|
|
2237
|
+
const flatBlocks = flattenLayoutTables(blocks);
|
|
2189
2238
|
if (docInfo) {
|
|
2190
|
-
detectHwp5Headings(blocks, docInfo);
|
|
2239
|
+
detectHwp5Headings(flatBlocks, docInfo);
|
|
2191
2240
|
}
|
|
2192
|
-
const outline =
|
|
2193
|
-
const markdown = blocksToMarkdown(blocks);
|
|
2194
|
-
return { markdown, blocks, metadata, outline: outline.length > 0 ? outline : void 0, warnings: warnings.length > 0 ? warnings : void 0, images: images.length > 0 ? images : void 0 };
|
|
2241
|
+
const outline = flatBlocks.filter((b) => b.type === "heading" && b.level && b.text).map((b) => ({ level: b.level, text: b.text, pageNumber: b.pageNumber }));
|
|
2242
|
+
const markdown = blocksToMarkdown(flatBlocks);
|
|
2243
|
+
return { markdown, blocks: flatBlocks, metadata, outline: outline.length > 0 ? outline : void 0, warnings: warnings.length > 0 ? warnings : void 0, images: images.length > 0 ? images : void 0 };
|
|
2195
2244
|
}
|
|
2196
2245
|
function parseDocInfoStream(cfb, compressed) {
|
|
2197
2246
|
try {
|
|
@@ -2242,16 +2291,21 @@ function detectHwp5Headings(blocks, docInfo) {
|
|
|
2242
2291
|
}
|
|
2243
2292
|
if (baseFontSize <= 0) return;
|
|
2244
2293
|
for (const block of blocks) {
|
|
2245
|
-
if (block.type
|
|
2294
|
+
if (block.type === "heading") continue;
|
|
2295
|
+
if (block.type !== "paragraph" || !block.text) continue;
|
|
2246
2296
|
const text = block.text.trim();
|
|
2247
2297
|
if (text.length === 0 || text.length > 200) continue;
|
|
2248
2298
|
if (/^\d+$/.test(text)) continue;
|
|
2249
|
-
const ratio = block.style.fontSize / baseFontSize;
|
|
2250
2299
|
let level = 0;
|
|
2251
|
-
if (
|
|
2252
|
-
|
|
2253
|
-
|
|
2254
|
-
|
|
2300
|
+
if (block.style?.fontSize && baseFontSize > 0) {
|
|
2301
|
+
const ratio = block.style.fontSize / baseFontSize;
|
|
2302
|
+
if (ratio >= HEADING_RATIO_H1) level = 1;
|
|
2303
|
+
else if (ratio >= HEADING_RATIO_H2) level = 2;
|
|
2304
|
+
else if (ratio >= HEADING_RATIO_H3) level = 3;
|
|
2305
|
+
}
|
|
2306
|
+
if (/^제\d+[장절편]\s/.test(text) && text.length <= 50) {
|
|
2307
|
+
if (level === 0) level = 2;
|
|
2308
|
+
} else if (/^제\d+(조의?\d*)\s*[\((]/.test(text) && text.length <= 80) {
|
|
2255
2309
|
if (level === 0) level = 3;
|
|
2256
2310
|
}
|
|
2257
2311
|
if (level > 0) {
|
|
@@ -2497,13 +2551,20 @@ function parseSection(records, docInfo, warnings, sectionNum) {
|
|
|
2497
2551
|
while (i < records.length) {
|
|
2498
2552
|
const rec = records[i];
|
|
2499
2553
|
if (rec.tagId === TAG_PARA_HEADER && rec.level === 0) {
|
|
2500
|
-
const { paragraph, tables, nextIdx, charShapeIds } = parseParagraphWithTables(records, i);
|
|
2554
|
+
const { paragraph, tables, nextIdx, charShapeIds, paraShapeId } = parseParagraphWithTables(records, i);
|
|
2501
2555
|
if (paragraph) {
|
|
2502
2556
|
const block = { type: "paragraph", text: paragraph, pageNumber: sectionNum };
|
|
2503
2557
|
if (docInfo && charShapeIds.length > 0) {
|
|
2504
2558
|
const style = resolveCharStyle(charShapeIds, docInfo);
|
|
2505
2559
|
if (style) block.style = style;
|
|
2506
2560
|
}
|
|
2561
|
+
if (docInfo && paraShapeId >= 0 && paraShapeId < docInfo.paraShapes.length) {
|
|
2562
|
+
const ol = docInfo.paraShapes[paraShapeId].outlineLevel;
|
|
2563
|
+
if (ol >= 1 && ol <= 6) {
|
|
2564
|
+
block.type = "heading";
|
|
2565
|
+
block.level = ol;
|
|
2566
|
+
}
|
|
2567
|
+
}
|
|
2507
2568
|
blocks.push(block);
|
|
2508
2569
|
}
|
|
2509
2570
|
for (const t of tables) blocks.push({ type: "table", table: t, pageNumber: sectionNum });
|
|
@@ -2623,6 +2684,8 @@ function parseParagraphWithTables(records, startIdx) {
|
|
|
2623
2684
|
let text = "";
|
|
2624
2685
|
const tables = [];
|
|
2625
2686
|
const charShapeIds = [];
|
|
2687
|
+
const paraHeaderData = records[startIdx].data;
|
|
2688
|
+
const paraShapeId = paraHeaderData.length >= 10 ? paraHeaderData.readUInt16LE(8) : -1;
|
|
2626
2689
|
let i = startIdx + 1;
|
|
2627
2690
|
while (i < records.length) {
|
|
2628
2691
|
const rec = records[i];
|
|
@@ -2647,7 +2710,7 @@ function parseParagraphWithTables(records, startIdx) {
|
|
|
2647
2710
|
i++;
|
|
2648
2711
|
}
|
|
2649
2712
|
const trimmed = text.trim();
|
|
2650
|
-
return { paragraph: trimmed || null, tables, nextIdx: i, charShapeIds };
|
|
2713
|
+
return { paragraph: trimmed || null, tables, nextIdx: i, charShapeIds, paraShapeId };
|
|
2651
2714
|
}
|
|
2652
2715
|
function parseTableBlock(records, startIdx) {
|
|
2653
2716
|
const tableLevel = records[startIdx].level;
|
|
@@ -3465,7 +3528,7 @@ async function parsePdfDocument(buffer, options) {
|
|
|
3465
3528
|
if (totalChars / Math.max(parsedPageCount, 1) < 10) {
|
|
3466
3529
|
if (options?.ocr) {
|
|
3467
3530
|
try {
|
|
3468
|
-
const { ocrPages } = await import("./provider-A4FHJSID.js");
|
|
3531
|
+
const { ocrPages } = await import("./provider-EU3CG724.js");
|
|
3469
3532
|
const ocrBlocks = await ocrPages(doc, options.ocr, pageFilter, effectivePageCount);
|
|
3470
3533
|
if (ocrBlocks.length > 0) {
|
|
3471
3534
|
const ocrMarkdown = ocrBlocks.map((b) => b.text || "").filter(Boolean).join("\n\n");
|
|
@@ -4563,7 +4626,7 @@ async function parseXlsxDocument(buffer, options) {
|
|
|
4563
4626
|
}
|
|
4564
4627
|
let pageFilter = null;
|
|
4565
4628
|
if (options?.pages) {
|
|
4566
|
-
const { parsePageRange: parsePageRange2 } = await import("./page-range-737B4EZW.js");
|
|
4629
|
+
const { parsePageRange: parsePageRange2 } = await import("./page-range-OF5I4PQY.js");
|
|
4567
4630
|
pageFilter = parsePageRange2(options.pages, sheets.length);
|
|
4568
4631
|
}
|
|
4569
4632
|
const blocks = [];
|
|
@@ -5446,4 +5509,4 @@ export {
|
|
|
5446
5509
|
extractFormFields,
|
|
5447
5510
|
parse
|
|
5448
5511
|
};
|
|
5449
|
-
//# sourceMappingURL=chunk-XJYM2AUA.js.map
|
|
5512
|
+
//# sourceMappingURL=chunk-4UH6ABAY.js.map
|