kordoc 2.0.1 → 2.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -21
- package/README.md +291 -291
- package/dist/{chunk-L4OFASDS.js → chunk-25TXW6EP.js} +2 -2
- package/dist/chunk-25TXW6EP.js.map +1 -0
- package/dist/{chunk-MOL7MDBG.js → chunk-3TBUDJDE.js} +1 -1
- package/dist/chunk-3TBUDJDE.js.map +1 -0
- package/dist/{chunk-JJ65GKUH.js → chunk-4UH6ABAY.js} +185 -41
- package/dist/chunk-4UH6ABAY.js.map +1 -0
- package/dist/cli.js +5 -5
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +181 -37
- package/dist/index.cjs.map +1 -1
- package/dist/index.js +181 -37
- package/dist/index.js.map +1 -1
- package/dist/mcp.js +3 -3
- package/dist/mcp.js.map +1 -1
- package/dist/page-range-OF5I4PQY.js +8 -0
- package/dist/{provider-A4FHJSID.js → provider-EU3CG724.js} +1 -1
- package/dist/provider-EU3CG724.js.map +1 -0
- package/dist/{utils-4HVKHULU.js → utils-BTZ4WSYX.js} +2 -2
- package/dist/{watch-RNZ3KESY.js → watch-QD3PDNXQ.js} +4 -4
- package/dist/watch-QD3PDNXQ.js.map +1 -0
- package/package.json +1 -1
- package/dist/chunk-JJ65GKUH.js.map +0 -1
- package/dist/chunk-L4OFASDS.js.map +0 -1
- package/dist/chunk-MOL7MDBG.js.map +0 -1
- package/dist/page-range-737B4EZW.js +0 -8
- package/dist/provider-A4FHJSID.js.map +0 -1
- package/dist/watch-RNZ3KESY.js.map +0 -1
- /package/dist/{page-range-737B4EZW.js.map → page-range-OF5I4PQY.js.map} +0 -0
- /package/dist/{utils-4HVKHULU.js.map → utils-BTZ4WSYX.js.map} +0 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"sources":["../src/utils.ts"],"sourcesContent":["/** kordoc 공용 유틸리티 */\r\n\r\n/** 빌드 타임에 tsup define으로 주입되는 버전 */\r\ndeclare const __KORDOC_VERSION__: string\r\nexport const VERSION: string = typeof __KORDOC_VERSION__ !== \"undefined\" ? __KORDOC_VERSION__ : \"0.0.0-dev\"\r\n\r\n/**\r\n * Node.js Buffer → ArrayBuffer 변환\r\n * pool Buffer의 공유 ArrayBuffer 문제를 안전하게 처리.\r\n * offset=0이고 전체 ArrayBuffer를 차지하면 복사 없이 직접 반환.\r\n */\r\nexport function toArrayBuffer(buf: Buffer): ArrayBuffer {\r\n if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {\r\n return buf.buffer as ArrayBuffer\r\n }\r\n return buf.buffer.slice(buf.byteOffset, buf.byteOffset + buf.byteLength) as ArrayBuffer\r\n}\r\n\r\n/**\r\n * kordoc 내부 에러 클래스 — 사용자에게 노출해도 안전한 메시지만 포함.\r\n * MCP 에러 정제에서 instanceof로 판별하여 allowlist 패턴 매칭 없이 안전하게 통과.\r\n */\r\nexport class KordocError extends Error {\r\n constructor(message: string) {\r\n super(message)\r\n this.name = \"KordocError\"\r\n }\r\n}\r\n\r\n/**\r\n * 에러 메시지 정제 — KordocError는 그대로, 나머지는 일반 메시지로 대체.\r\n * 파일시스템 경로, 스택 트레이스 등 내부 정보 노출 방지.\r\n */\r\nexport function sanitizeError(err: unknown): string {\r\n if (err instanceof KordocError) return err.message\r\n return \"문서 처리 중 오류가 발생했습니다\"\r\n}\r\n\r\n/**\r\n * ZIP 엔트리 경로의 경로 순회 여부 판별.\r\n * 백슬래시 정규화, .., 절대경로, Windows 드라이브 문자 모두 차단.\r\n */\r\nexport function isPathTraversal(name: string): boolean {\r\n if (name.includes(\"\\x00\")) return true\r\n const normalized = name.replace(/\\\\/g, \"/\")\r\n return normalized.includes(\"..\") || normalized.startsWith(\"/\") || /^[A-Za-z]:/.test(normalized)\r\n}\r\n\r\n// ─── ZIP 안전 로딩 (ZIP bomb 방지) ────────────────────\r\n\r\n/**\r\n * ZIP bomb 사전 검사 — Central Directory에서 비압축 합계와 엔트리 수 확인.\r\n * HWPX/XLSX/DOCX 등 모든 ZIP 기반 포맷에서 공통 사용.\r\n */\r\nexport function precheckZipSize(\r\n buffer: ArrayBuffer,\r\n maxUncompressedSize = 100 * 1024 * 1024,\r\n maxEntries = 500,\r\n): { totalUncompressed: number; entryCount: number } {\r\n try {\r\n const data = new DataView(buffer)\r\n const len = buffer.byteLength\r\n // EOCD 시그니처 역방향 스캔\r\n let eocdOffset = -1\r\n for (let i = len - 22; i >= Math.max(0, len - 65557); i--) {\r\n if (data.getUint32(i, true) === 0x06054b50) { eocdOffset = i; break }\r\n }\r\n if (eocdOffset < 0) return { totalUncompressed: 0, entryCount: 0 }\r\n\r\n const entryCount = data.getUint16(eocdOffset + 10, true)\r\n if (entryCount > maxEntries) {\r\n throw new KordocError(`ZIP 엔트리 수 초과: ${entryCount} (최대 ${maxEntries})`)\r\n }\r\n\r\n const cdSize = data.getUint32(eocdOffset + 12, true)\r\n const cdOffset = data.getUint32(eocdOffset + 16, true)\r\n if (cdOffset + cdSize > len) return { totalUncompressed: 0, entryCount }\r\n\r\n let totalUncompressed = 0\r\n let pos = cdOffset\r\n for (let i = 0; i < entryCount && pos + 46 <= cdOffset + cdSize; i++) {\r\n if (data.getUint32(pos, true) !== 0x02014b50) break\r\n totalUncompressed += data.getUint32(pos + 24, true)\r\n const nameLen = data.getUint16(pos + 28, true)\r\n const extraLen = data.getUint16(pos + 30, true)\r\n const commentLen = data.getUint16(pos + 32, true)\r\n pos += 46 + nameLen + extraLen + commentLen\r\n }\r\n\r\n if (totalUncompressed > maxUncompressedSize) {\r\n throw new KordocError(`ZIP 비압축 크기 초과: ${(totalUncompressed / 1024 / 1024).toFixed(1)}MB (최대 ${maxUncompressedSize / 1024 / 1024}MB)`)\r\n }\r\n\r\n return { totalUncompressed, entryCount }\r\n } catch (err) {\r\n if (err instanceof KordocError) throw err\r\n return { totalUncompressed: 0, entryCount: 0 }\r\n }\r\n}\r\n\r\n/** 하이퍼링크 URL 살균 — javascript: 등 XSS 위험 스킴 차단 */\r\nconst SAFE_HREF_RE = /^(?:https?:|mailto:|tel:|#)/i\r\nexport function sanitizeHref(href: string): string | null {\r\n const trimmed = href.trim()\r\n if (!trimmed || !SAFE_HREF_RE.test(trimmed)) return null\r\n return trimmed\r\n}\r\n\r\n// ─── 에러 분류 ──────────────────────────────────────\r\n\r\nimport type { ErrorCode } from \"./types.js\"\r\n\r\n/** 에러를 구조화된 ErrorCode로 분류 — KordocError 메시지 패턴 매칭 */\r\nexport function classifyError(err: unknown): ErrorCode {\r\n if (!(err instanceof Error)) return \"PARSE_ERROR\"\r\n const msg = err.message\r\n if (msg.includes(\"암호화\")) return \"ENCRYPTED\"\r\n if (msg.includes(\"DRM\")) return \"DRM_PROTECTED\"\r\n if (msg.includes(\"ZIP bomb\") || msg.includes(\"ZIP 비압축 크기 초과\") || msg.includes(\"ZIP 엔트리 수 초과\")) return \"ZIP_BOMB\"\r\n if (msg.includes(\"bomb\") || msg.includes(\"크기 초과\") || msg.includes(\"압축 해제\")) return \"DECOMPRESSION_BOMB\"\r\n if (msg.includes(\"이미지 기반\")) return \"IMAGE_BASED_PDF\"\r\n if (msg.includes(\"섹션\") && (msg.includes(\"찾을 수 없\") || msg.includes(\"없음\"))) return \"NO_SECTIONS\"\r\n if (msg.includes(\"시그니처\") || msg.includes(\"복구할 수 없\")) return \"CORRUPTED\"\r\n return \"PARSE_ERROR\"\r\n}\r\n"],"mappings":";;;AAIO,IAAM,UAAkB,OAA4C,UAAqB;AAOzF,SAAS,cAAc,KAA0B;AACtD,MAAI,IAAI,eAAe,KAAK,IAAI,eAAe,IAAI,OAAO,YAAY;AACpE,WAAO,IAAI;AAAA,EACb;AACA,SAAO,IAAI,OAAO,MAAM,IAAI,YAAY,IAAI,aAAa,IAAI,UAAU;AACzE;AAMO,IAAM,cAAN,cAA0B,MAAM;AAAA,EACrC,YAAY,SAAiB;AAC3B,UAAM,OAAO;AACb,SAAK,OAAO;AAAA,EACd;AACF;AAMO,SAAS,cAAc,KAAsB;AAClD,MAAI,eAAe,YAAa,QAAO,IAAI;AAC3C,SAAO;AACT;AAMO,SAAS,gBAAgB,MAAuB;AACrD,MAAI,KAAK,SAAS,IAAM,EAAG,QAAO;AAClC,QAAM,aAAa,KAAK,QAAQ,OAAO,GAAG;AAC1C,SAAO,WAAW,SAAS,IAAI,KAAK,WAAW,WAAW,GAAG,KAAK,aAAa,KAAK,UAAU;AAChG;AAQO,SAAS,gBACd,QACA,sBAAsB,MAAM,OAAO,MACnC,aAAa,KACsC;AACnD,MAAI;AACF,UAAM,OAAO,IAAI,SAAS,MAAM;AAChC,UAAM,MAAM,OAAO;AAEnB,QAAI,aAAa;AACjB,aAAS,IAAI,MAAM,IAAI,KAAK,KAAK,IAAI,GAAG,MAAM,KAAK,GAAG,KAAK;AACzD,UAAI,KAAK,UAAU,GAAG,IAAI,MAAM,WAAY;AAAE,qBAAa;AAAG;AAAA,MAAM;AAAA,IACtE;AACA,QAAI,aAAa,EAAG,QAAO,EAAE,mBAAmB,GAAG,YAAY,EAAE;AAEjE,UAAM,aAAa,KAAK,UAAU,aAAa,IAAI,IAAI;AACvD,QAAI,aAAa,YAAY;AAC3B,YAAM,IAAI,YAAY,+CAAiB,UAAU,kBAAQ,UAAU,GAAG;AAAA,IACxE;AAEA,UAAM,SAAS,KAAK,UAAU,aAAa,IAAI,IAAI;AACnD,UAAM,WAAW,KAAK,UAAU,aAAa,IAAI,IAAI;AACrD,QAAI,WAAW,SAAS,IAAK,QAAO,EAAE,mBAAmB,GAAG,WAAW;AAEvE,QAAI,oBAAoB;AACxB,QAAI,MAAM;AACV,aAAS,IAAI,GAAG,IAAI,cAAc,MAAM,MAAM,WAAW,QAAQ,KAAK;AACpE,UAAI,KAAK,UAAU,KAAK,IAAI,MAAM,SAAY;AAC9C,2BAAqB,KAAK,UAAU,MAAM,IAAI,IAAI;AAClD,YAAM,UAAU,KAAK,UAAU,MAAM,IAAI,IAAI;AAC7C,YAAM,WAAW,KAAK,UAAU,MAAM,IAAI,IAAI;AAC9C,YAAM,aAAa,KAAK,UAAU,MAAM,IAAI,IAAI;AAChD,aAAO,KAAK,UAAU,WAAW;AAAA,IACnC;AAEA,QAAI,oBAAoB,qBAAqB;AAC3C,YAAM,IAAI,YAAY,sDAAmB,oBAAoB,OAAO,MAAM,QAAQ,CAAC,CAAC,oBAAU,sBAAsB,OAAO,IAAI,KAAK;AAAA,IACtI;AAEA,WAAO,EAAE,mBAAmB,WAAW;AAAA,EACzC,SAAS,KAAK;AACZ,QAAI,eAAe,YAAa,OAAM;AACtC,WAAO,EAAE,mBAAmB,GAAG,YAAY,EAAE;AAAA,EAC/C;AACF;AAGA,IAAM,eAAe;AACd,SAAS,aAAa,MAA6B;AACxD,QAAM,UAAU,KAAK,KAAK;AAC1B,MAAI,CAAC,WAAW,CAAC,aAAa,KAAK,OAAO,EAAG,QAAO;AACpD,SAAO;AACT;AAOO,SAAS,cAAc,KAAyB;AACrD,MAAI,EAAE,eAAe,OAAQ,QAAO;AACpC,QAAM,MAAM,IAAI;AAChB,MAAI,IAAI,SAAS,oBAAK,EAAG,QAAO;AAChC,MAAI,IAAI,SAAS,KAAK,EAAG,QAAO;AAChC,MAAI,IAAI,SAAS,UAAU,KAAK,IAAI,SAAS,kDAAe,KAAK,IAAI,SAAS,4CAAc,EAAG,QAAO;AACtG,MAAI,IAAI,SAAS,MAAM,KAAK,IAAI,SAAS,2BAAO,KAAK,IAAI,SAAS,2BAAO,EAAG,QAAO;AACnF,MAAI,IAAI,SAAS,iCAAQ,EAAG,QAAO;AACnC,MAAI,IAAI,SAAS,cAAI,MAAM,IAAI,SAAS,4BAAQ,KAAK,IAAI,SAAS,cAAI,GAAI,QAAO;AACjF,MAAI,IAAI,SAAS,0BAAM,KAAK,IAAI,SAAS,kCAAS,EAAG,QAAO;AAC5D,SAAO;AACT;","names":[]}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"sources":["../src/page-range.ts"],"sourcesContent":["/** 페이지/섹션 범위 파싱 유틸리티 */\r\n\r\n/**\r\n * 페이지 범위 지정을 1-based Set<number>로 변환.\r\n *\r\n * @param spec - [1,2,3] 또는 \"1-3\" 또는 \"1,3,5-7\"\r\n * @param maxPages - 최대 페이지 수 (클램핑 상한)\r\n * @returns 1-based 페이지 번호 Set\r\n */\r\nexport function parsePageRange(spec: number[] | string, maxPages: number): Set<number> {\r\n const result = new Set<number>()\r\n if (maxPages <= 0) return result\r\n\r\n if (Array.isArray(spec)) {\r\n for (const n of spec) {\r\n const page = Math.round(n)\r\n if (page >= 1 && page <= maxPages) result.add(page)\r\n }\r\n return result\r\n }\r\n\r\n if (typeof spec !== \"string\" || spec.trim() === \"\") return result\r\n\r\n const parts = spec.split(\",\")\r\n for (const part of parts) {\r\n const trimmed = part.trim()\r\n if (!trimmed) continue\r\n\r\n const rangeMatch = trimmed.match(/^(\\d+)\\s*-\\s*(\\d+)$/)\r\n if (rangeMatch) {\r\n const start = Math.max(1, parseInt(rangeMatch[1], 10))\r\n const end = Math.min(maxPages, parseInt(rangeMatch[2], 10))\r\n for (let i = start; i <= end; i++) result.add(i)\r\n } else {\r\n const page = parseInt(trimmed, 10)\r\n if (!isNaN(page) && page >= 1 && page <= maxPages) result.add(page)\r\n }\r\n }\r\n\r\n return result\r\n}\r\n"],"mappings":";;;AASO,SAAS,eAAe,MAAyB,UAA+B;AACrF,QAAM,SAAS,oBAAI,IAAY;AAC/B,MAAI,YAAY,EAAG,QAAO;AAE1B,MAAI,MAAM,QAAQ,IAAI,GAAG;AACvB,eAAW,KAAK,MAAM;AACpB,YAAM,OAAO,KAAK,MAAM,CAAC;AACzB,UAAI,QAAQ,KAAK,QAAQ,SAAU,QAAO,IAAI,IAAI;AAAA,IACpD;AACA,WAAO;AAAA,EACT;AAEA,MAAI,OAAO,SAAS,YAAY,KAAK,KAAK,MAAM,GAAI,QAAO;AAE3D,QAAM,QAAQ,KAAK,MAAM,GAAG;AAC5B,aAAW,QAAQ,OAAO;AACxB,UAAM,UAAU,KAAK,KAAK;AAC1B,QAAI,CAAC,QAAS;AAEd,UAAM,aAAa,QAAQ,MAAM,qBAAqB;AACtD,QAAI,YAAY;AACd,YAAM,QAAQ,KAAK,IAAI,GAAG,SAAS,WAAW,CAAC,GAAG,EAAE,CAAC;AACrD,YAAM,MAAM,KAAK,IAAI,UAAU,SAAS,WAAW,CAAC,GAAG,EAAE,CAAC;AAC1D,eAAS,IAAI,OAAO,KAAK,KAAK,IAAK,QAAO,IAAI,CAAC;AAAA,IACjD,OAAO;AACL,YAAM,OAAO,SAAS,SAAS,EAAE;AACjC,UAAI,CAAC,MAAM,IAAI,KAAK,QAAQ,KAAK,QAAQ,SAAU,QAAO,IAAI,IAAI;AAAA,IACpE;AAAA,EACF;AAEA,SAAO;AACT;","names":[]}
|
|
@@ -6,10 +6,10 @@ import {
|
|
|
6
6
|
precheckZipSize,
|
|
7
7
|
sanitizeHref,
|
|
8
8
|
toArrayBuffer
|
|
9
|
-
} from "./chunk-
|
|
9
|
+
} from "./chunk-25TXW6EP.js";
|
|
10
10
|
import {
|
|
11
11
|
parsePageRange
|
|
12
|
-
} from "./chunk-
|
|
12
|
+
} from "./chunk-3TBUDJDE.js";
|
|
13
13
|
|
|
14
14
|
// src/detect.ts
|
|
15
15
|
import JSZip from "jszip";
|
|
@@ -163,6 +163,47 @@ function sanitizeText(text) {
|
|
|
163
163
|
}
|
|
164
164
|
return result;
|
|
165
165
|
}
|
|
166
|
+
function flattenLayoutTables(blocks) {
|
|
167
|
+
const result = [];
|
|
168
|
+
for (const block of blocks) {
|
|
169
|
+
if (block.type !== "table" || !block.table) {
|
|
170
|
+
result.push(block);
|
|
171
|
+
continue;
|
|
172
|
+
}
|
|
173
|
+
const { rows: numRows, cols: numCols, cells } = block.table;
|
|
174
|
+
if (numRows === 1 && numCols === 1) {
|
|
175
|
+
result.push(block);
|
|
176
|
+
continue;
|
|
177
|
+
}
|
|
178
|
+
if (numRows <= 3) {
|
|
179
|
+
let totalNewlines = 0;
|
|
180
|
+
let totalTextLen = 0;
|
|
181
|
+
for (let r = 0; r < numRows; r++) {
|
|
182
|
+
for (let c = 0; c < numCols; c++) {
|
|
183
|
+
const t = cells[r]?.[c]?.text || "";
|
|
184
|
+
totalNewlines += (t.match(/\n/g) || []).length;
|
|
185
|
+
totalTextLen += t.length;
|
|
186
|
+
}
|
|
187
|
+
}
|
|
188
|
+
if (totalNewlines > 5 || numRows <= 2 && totalTextLen > 300) {
|
|
189
|
+
for (let r = 0; r < numRows; r++) {
|
|
190
|
+
for (let c = 0; c < numCols; c++) {
|
|
191
|
+
const cellText = cells[r]?.[c]?.text?.trim();
|
|
192
|
+
if (!cellText) continue;
|
|
193
|
+
for (const line of cellText.split("\n")) {
|
|
194
|
+
const trimmed = line.trim();
|
|
195
|
+
if (!trimmed) continue;
|
|
196
|
+
result.push({ type: "paragraph", text: trimmed, pageNumber: block.pageNumber });
|
|
197
|
+
}
|
|
198
|
+
}
|
|
199
|
+
}
|
|
200
|
+
continue;
|
|
201
|
+
}
|
|
202
|
+
}
|
|
203
|
+
result.push(block);
|
|
204
|
+
}
|
|
205
|
+
return result;
|
|
206
|
+
}
|
|
166
207
|
function blocksToMarkdown(blocks) {
|
|
167
208
|
const lines = [];
|
|
168
209
|
for (let i = 0; i < blocks.length; i++) {
|
|
@@ -224,8 +265,11 @@ function blocksToMarkdown(blocks) {
|
|
|
224
265
|
if (lines.length > 0 && lines[lines.length - 1] !== "") {
|
|
225
266
|
lines.push("");
|
|
226
267
|
}
|
|
227
|
-
|
|
228
|
-
|
|
268
|
+
const tableMd = tableToMarkdown(block.table);
|
|
269
|
+
if (tableMd) {
|
|
270
|
+
lines.push(tableMd);
|
|
271
|
+
lines.push("");
|
|
272
|
+
}
|
|
229
273
|
}
|
|
230
274
|
}
|
|
231
275
|
return lines.join("\n").trim();
|
|
@@ -235,6 +279,7 @@ function tableToMarkdown(table) {
|
|
|
235
279
|
const { cells, rows: numRows, cols: numCols } = table;
|
|
236
280
|
if (numRows === 1 && numCols === 1) {
|
|
237
281
|
const content = sanitizeText(cells[0][0].text);
|
|
282
|
+
if (!content) return "";
|
|
238
283
|
return content.split(/\n/).map((line) => {
|
|
239
284
|
const trimmed = line.trim();
|
|
240
285
|
if (!trimmed) return "";
|
|
@@ -271,9 +316,9 @@ function tableToMarkdown(table) {
|
|
|
271
316
|
const row = display[r];
|
|
272
317
|
const isEmptyPlaceholder = row.every((cell) => cell === "");
|
|
273
318
|
if (isEmptyPlaceholder) continue;
|
|
274
|
-
const hasSkippedCols = row.some((cell, c) => cell === "" && skip.has(`${r},${c}`));
|
|
275
319
|
const nonEmptyCols = row.filter((cell) => cell !== "");
|
|
276
|
-
|
|
320
|
+
const hasSkipInRow = row.some((_, c) => skip.has(`${r},${c}`));
|
|
321
|
+
if (!hasSkipInRow && nonEmptyCols.length === 1 && row[0] !== "" && row.slice(1).every((c) => c === "")) {
|
|
277
322
|
pendingFirstCol = row[0];
|
|
278
323
|
continue;
|
|
279
324
|
}
|
|
@@ -705,7 +750,8 @@ function detectHwpxHeadings(blocks, styleMap) {
|
|
|
705
750
|
else if (ratio >= HEADING_RATIO_H2) level = 2;
|
|
706
751
|
else if (ratio >= HEADING_RATIO_H3) level = 3;
|
|
707
752
|
}
|
|
708
|
-
|
|
753
|
+
const compactText = text.replace(/\s+/g, "");
|
|
754
|
+
if (/^제\d+[조장절편]/.test(compactText) && text.length <= 50) {
|
|
709
755
|
if (level === 0) level = 3;
|
|
710
756
|
}
|
|
711
757
|
if (level > 0) {
|
|
@@ -757,9 +803,14 @@ function walkSection(node, blocks, tableCtx, tableStack, styleMap, warnings, sec
|
|
|
757
803
|
if (newTable.rows.length > 0) {
|
|
758
804
|
if (tableStack.length > 0) {
|
|
759
805
|
const parentTable = tableStack.pop();
|
|
760
|
-
const
|
|
761
|
-
if (
|
|
762
|
-
|
|
806
|
+
const nestedCols = Math.max(...newTable.rows.map((r) => r.length));
|
|
807
|
+
if (newTable.rows.length >= 3 && nestedCols >= 2) {
|
|
808
|
+
blocks.push({ type: "table", table: buildTable(newTable.rows), pageNumber: sectionNum });
|
|
809
|
+
} else {
|
|
810
|
+
const nestedText = convertTableToText(newTable.rows);
|
|
811
|
+
if (parentTable.cell) {
|
|
812
|
+
parentTable.cell.text += (parentTable.cell.text ? "\n" : "") + nestedText;
|
|
813
|
+
}
|
|
763
814
|
}
|
|
764
815
|
tableCtx = parentTable;
|
|
765
816
|
} else {
|
|
@@ -859,9 +910,14 @@ function walkParagraphChildren(node, blocks, tableCtx, tableStack, styleMap, war
|
|
|
859
910
|
if (newTable.rows.length > 0) {
|
|
860
911
|
if (tableStack.length > 0) {
|
|
861
912
|
const parentTable = tableStack.pop();
|
|
862
|
-
const
|
|
863
|
-
if (
|
|
864
|
-
|
|
913
|
+
const nestedCols = Math.max(...newTable.rows.map((r) => r.length));
|
|
914
|
+
if (newTable.rows.length >= 3 && nestedCols >= 2) {
|
|
915
|
+
blocks.push({ type: "table", table: buildTable(newTable.rows), pageNumber: sectionNum });
|
|
916
|
+
} else {
|
|
917
|
+
const nestedText = convertTableToText(newTable.rows);
|
|
918
|
+
if (parentTable.cell) {
|
|
919
|
+
parentTable.cell.text += (parentTable.cell.text ? "\n" : "") + nestedText;
|
|
920
|
+
}
|
|
865
921
|
}
|
|
866
922
|
tableCtx = parentTable;
|
|
867
923
|
} else {
|
|
@@ -872,13 +928,20 @@ function walkParagraphChildren(node, blocks, tableCtx, tableStack, styleMap, war
|
|
|
872
928
|
tableCtx = tableStack.length > 0 ? tableStack.pop() : null;
|
|
873
929
|
}
|
|
874
930
|
} else if (localTag === "pic" || localTag === "shape" || localTag === "drawingObject") {
|
|
875
|
-
const
|
|
876
|
-
if (
|
|
877
|
-
|
|
878
|
-
} else
|
|
879
|
-
|
|
931
|
+
const drawTextChild = findDescendant(el, "drawText");
|
|
932
|
+
if (drawTextChild) {
|
|
933
|
+
extractDrawTextBlocks(drawTextChild, blocks, styleMap, sectionNum);
|
|
934
|
+
} else {
|
|
935
|
+
const imgRef = extractImageRef(el);
|
|
936
|
+
if (imgRef) {
|
|
937
|
+
blocks.push({ type: "image", text: imgRef, pageNumber: sectionNum });
|
|
938
|
+
} else if (warnings && sectionNum) {
|
|
939
|
+
warnings.push({ page: sectionNum, message: `\uC2A4\uD0B5\uB41C \uC694\uC18C: ${localTag}`, code: "SKIPPED_IMAGE" });
|
|
940
|
+
}
|
|
880
941
|
}
|
|
881
|
-
} else if (localTag === "
|
|
942
|
+
} else if (localTag === "drawText") {
|
|
943
|
+
extractDrawTextBlocks(el, blocks, styleMap, sectionNum);
|
|
944
|
+
} else if (localTag === "r" || localTag === "run" || localTag === "ctrl" || localTag === "rect" || localTag === "ellipse" || localTag === "polygon" || localTag === "line" || localTag === "arc" || localTag === "curve" || localTag === "connectLine" || localTag === "container") {
|
|
882
945
|
walkChildren(el, d + 1);
|
|
883
946
|
}
|
|
884
947
|
}
|
|
@@ -886,6 +949,40 @@ function walkParagraphChildren(node, blocks, tableCtx, tableStack, styleMap, war
|
|
|
886
949
|
walkChildren(node, depth);
|
|
887
950
|
return tableCtx;
|
|
888
951
|
}
|
|
952
|
+
function findDescendant(node, targetTag, depth = 0) {
|
|
953
|
+
if (depth > 5) return null;
|
|
954
|
+
const children = node.childNodes;
|
|
955
|
+
if (!children) return null;
|
|
956
|
+
for (let i = 0; i < children.length; i++) {
|
|
957
|
+
const child = children[i];
|
|
958
|
+
if (child.nodeType !== 1) continue;
|
|
959
|
+
const tag = (child.tagName || child.localName || "").replace(/^[^:]+:/, "");
|
|
960
|
+
if (tag === targetTag) return child;
|
|
961
|
+
const found = findDescendant(child, targetTag, depth + 1);
|
|
962
|
+
if (found) return found;
|
|
963
|
+
}
|
|
964
|
+
return null;
|
|
965
|
+
}
|
|
966
|
+
function extractDrawTextBlocks(drawTextNode, blocks, styleMap, sectionNum) {
|
|
967
|
+
const children = drawTextNode.childNodes;
|
|
968
|
+
if (!children) return;
|
|
969
|
+
for (let i = 0; i < children.length; i++) {
|
|
970
|
+
const child = children[i];
|
|
971
|
+
if (child.nodeType !== 1) continue;
|
|
972
|
+
const tag = (child.tagName || child.localName || "").replace(/^[^:]+:/, "");
|
|
973
|
+
if (tag === "subList" || tag === "p" || tag === "para") {
|
|
974
|
+
if (tag === "subList") {
|
|
975
|
+
extractDrawTextBlocks(child, blocks, styleMap, sectionNum);
|
|
976
|
+
} else {
|
|
977
|
+
const info = extractParagraphInfo(child, styleMap);
|
|
978
|
+
const text = info.text.trim();
|
|
979
|
+
if (text) {
|
|
980
|
+
blocks.push({ type: "paragraph", text, style: info.style ?? void 0, pageNumber: sectionNum });
|
|
981
|
+
}
|
|
982
|
+
}
|
|
983
|
+
}
|
|
984
|
+
}
|
|
985
|
+
}
|
|
889
986
|
function extractParagraphInfo(para, styleMap) {
|
|
890
987
|
let text = "";
|
|
891
988
|
let href;
|
|
@@ -904,11 +1001,18 @@ function extractParagraphInfo(para, styleMap) {
|
|
|
904
1001
|
const tag = (child.tagName || child.localName || "").replace(/^[^:]+:/, "");
|
|
905
1002
|
switch (tag) {
|
|
906
1003
|
case "t":
|
|
907
|
-
|
|
1004
|
+
walk(child);
|
|
908
1005
|
break;
|
|
909
|
-
|
|
910
|
-
|
|
1006
|
+
// 자식 순회 (tab 등 하위 요소 처리)
|
|
1007
|
+
case "tab": {
|
|
1008
|
+
const leader = child.getAttribute("leader");
|
|
1009
|
+
if (leader && leader !== "0") {
|
|
1010
|
+
text += "";
|
|
1011
|
+
} else {
|
|
1012
|
+
text += " ";
|
|
1013
|
+
}
|
|
911
1014
|
break;
|
|
1015
|
+
}
|
|
912
1016
|
case "br":
|
|
913
1017
|
if ((child.getAttribute("type") || "line") === "line") text += "\n";
|
|
914
1018
|
break;
|
|
@@ -975,6 +1079,8 @@ function extractParagraphInfo(para, styleMap) {
|
|
|
975
1079
|
}
|
|
976
1080
|
};
|
|
977
1081
|
walk(para);
|
|
1082
|
+
const leaderIdx = text.indexOf("");
|
|
1083
|
+
if (leaderIdx >= 0) text = text.substring(0, leaderIdx);
|
|
978
1084
|
let cleanText = text.replace(/[ \t]+/g, " ").trim();
|
|
979
1085
|
if (/^그림입니다\.?\s*원본\s*그림의\s*(이름|크기)/.test(cleanText)) cleanText = "";
|
|
980
1086
|
cleanText = cleanText.replace(/그림입니다\.?\s*원본\s*그림의\s*(이름|크기)[^\n]*(\n[^\n]*원본\s*그림의\s*(이름|크기)[^\n]*)*/g, "").trim();
|
|
@@ -1013,8 +1119,9 @@ var TAG_CHAR_SHAPE = 68;
|
|
|
1013
1119
|
var TAG_CTRL_HEADER = 71;
|
|
1014
1120
|
var TAG_LIST_HEADER = 72;
|
|
1015
1121
|
var TAG_TABLE = 77;
|
|
1016
|
-
var TAG_DOC_CHAR_SHAPE =
|
|
1017
|
-
var
|
|
1122
|
+
var TAG_DOC_CHAR_SHAPE = 21;
|
|
1123
|
+
var TAG_DOC_PARA_SHAPE = 25;
|
|
1124
|
+
var TAG_DOC_STYLE = 26;
|
|
1018
1125
|
var CHAR_LINE = 0;
|
|
1019
1126
|
var CHAR_SECTION_BREAK = 10;
|
|
1020
1127
|
var CHAR_PARA = 13;
|
|
@@ -1070,8 +1177,14 @@ function parseFileHeader(data) {
|
|
|
1070
1177
|
}
|
|
1071
1178
|
function parseDocInfo(records) {
|
|
1072
1179
|
const charShapes = [];
|
|
1180
|
+
const paraShapes = [];
|
|
1073
1181
|
const styles = [];
|
|
1074
1182
|
for (const rec of records) {
|
|
1183
|
+
if (rec.tagId === TAG_DOC_PARA_SHAPE && rec.data.length >= 4) {
|
|
1184
|
+
const flags = rec.data.readUInt32LE(0);
|
|
1185
|
+
const outlineLevel = flags >> 25 & 7;
|
|
1186
|
+
paraShapes.push({ outlineLevel });
|
|
1187
|
+
}
|
|
1075
1188
|
if (rec.tagId === TAG_DOC_CHAR_SHAPE && rec.data.length >= 18) {
|
|
1076
1189
|
if (rec.data.length >= 50) {
|
|
1077
1190
|
const fontSize = rec.data.readUInt32LE(42);
|
|
@@ -1111,7 +1224,7 @@ function parseDocInfo(records) {
|
|
|
1111
1224
|
}
|
|
1112
1225
|
}
|
|
1113
1226
|
}
|
|
1114
|
-
return { charShapes, styles };
|
|
1227
|
+
return { charShapes, paraShapes, styles };
|
|
1115
1228
|
}
|
|
1116
1229
|
function extractText(data) {
|
|
1117
1230
|
let result = "";
|
|
@@ -2121,12 +2234,13 @@ function parseHwp5Document(buffer, options) {
|
|
|
2121
2234
|
}
|
|
2122
2235
|
}
|
|
2123
2236
|
const images = cfb ? extractHwp5Images(cfb, blocks, compressed, warnings) : extractHwp5ImagesLenient(lenientCfb, blocks, compressed, warnings);
|
|
2237
|
+
const flatBlocks = flattenLayoutTables(blocks);
|
|
2124
2238
|
if (docInfo) {
|
|
2125
|
-
detectHwp5Headings(
|
|
2239
|
+
detectHwp5Headings(flatBlocks, docInfo);
|
|
2126
2240
|
}
|
|
2127
|
-
const outline =
|
|
2128
|
-
const markdown = blocksToMarkdown(
|
|
2129
|
-
return { markdown, blocks, metadata, outline: outline.length > 0 ? outline : void 0, warnings: warnings.length > 0 ? warnings : void 0, images: images.length > 0 ? images : void 0 };
|
|
2241
|
+
const outline = flatBlocks.filter((b) => b.type === "heading" && b.level && b.text).map((b) => ({ level: b.level, text: b.text, pageNumber: b.pageNumber }));
|
|
2242
|
+
const markdown = blocksToMarkdown(flatBlocks);
|
|
2243
|
+
return { markdown, blocks: flatBlocks, metadata, outline: outline.length > 0 ? outline : void 0, warnings: warnings.length > 0 ? warnings : void 0, images: images.length > 0 ? images : void 0 };
|
|
2130
2244
|
}
|
|
2131
2245
|
function parseDocInfoStream(cfb, compressed) {
|
|
2132
2246
|
try {
|
|
@@ -2177,16 +2291,21 @@ function detectHwp5Headings(blocks, docInfo) {
|
|
|
2177
2291
|
}
|
|
2178
2292
|
if (baseFontSize <= 0) return;
|
|
2179
2293
|
for (const block of blocks) {
|
|
2180
|
-
if (block.type
|
|
2294
|
+
if (block.type === "heading") continue;
|
|
2295
|
+
if (block.type !== "paragraph" || !block.text) continue;
|
|
2181
2296
|
const text = block.text.trim();
|
|
2182
2297
|
if (text.length === 0 || text.length > 200) continue;
|
|
2183
2298
|
if (/^\d+$/.test(text)) continue;
|
|
2184
|
-
const ratio = block.style.fontSize / baseFontSize;
|
|
2185
2299
|
let level = 0;
|
|
2186
|
-
if (
|
|
2187
|
-
|
|
2188
|
-
|
|
2189
|
-
|
|
2300
|
+
if (block.style?.fontSize && baseFontSize > 0) {
|
|
2301
|
+
const ratio = block.style.fontSize / baseFontSize;
|
|
2302
|
+
if (ratio >= HEADING_RATIO_H1) level = 1;
|
|
2303
|
+
else if (ratio >= HEADING_RATIO_H2) level = 2;
|
|
2304
|
+
else if (ratio >= HEADING_RATIO_H3) level = 3;
|
|
2305
|
+
}
|
|
2306
|
+
if (/^제\d+[장절편]\s/.test(text) && text.length <= 50) {
|
|
2307
|
+
if (level === 0) level = 2;
|
|
2308
|
+
} else if (/^제\d+(조의?\d*)\s*[\((]/.test(text) && text.length <= 80) {
|
|
2190
2309
|
if (level === 0) level = 3;
|
|
2191
2310
|
}
|
|
2192
2311
|
if (level > 0) {
|
|
@@ -2432,13 +2551,20 @@ function parseSection(records, docInfo, warnings, sectionNum) {
|
|
|
2432
2551
|
while (i < records.length) {
|
|
2433
2552
|
const rec = records[i];
|
|
2434
2553
|
if (rec.tagId === TAG_PARA_HEADER && rec.level === 0) {
|
|
2435
|
-
const { paragraph, tables, nextIdx, charShapeIds } = parseParagraphWithTables(records, i);
|
|
2554
|
+
const { paragraph, tables, nextIdx, charShapeIds, paraShapeId } = parseParagraphWithTables(records, i);
|
|
2436
2555
|
if (paragraph) {
|
|
2437
2556
|
const block = { type: "paragraph", text: paragraph, pageNumber: sectionNum };
|
|
2438
2557
|
if (docInfo && charShapeIds.length > 0) {
|
|
2439
2558
|
const style = resolveCharStyle(charShapeIds, docInfo);
|
|
2440
2559
|
if (style) block.style = style;
|
|
2441
2560
|
}
|
|
2561
|
+
if (docInfo && paraShapeId >= 0 && paraShapeId < docInfo.paraShapes.length) {
|
|
2562
|
+
const ol = docInfo.paraShapes[paraShapeId].outlineLevel;
|
|
2563
|
+
if (ol >= 1 && ol <= 6) {
|
|
2564
|
+
block.type = "heading";
|
|
2565
|
+
block.level = ol;
|
|
2566
|
+
}
|
|
2567
|
+
}
|
|
2442
2568
|
blocks.push(block);
|
|
2443
2569
|
}
|
|
2444
2570
|
for (const t of tables) blocks.push({ type: "table", table: t, pageNumber: sectionNum });
|
|
@@ -2458,7 +2584,10 @@ function parseSection(records, docInfo, warnings, sectionNum) {
|
|
|
2458
2584
|
if (binId >= 0) {
|
|
2459
2585
|
blocks.push({ type: "image", text: String(binId), pageNumber: sectionNum });
|
|
2460
2586
|
} else {
|
|
2461
|
-
|
|
2587
|
+
const boxText = extractTextBoxText(records, i);
|
|
2588
|
+
if (boxText) {
|
|
2589
|
+
blocks.push({ type: "paragraph", text: boxText, pageNumber: sectionNum });
|
|
2590
|
+
}
|
|
2462
2591
|
}
|
|
2463
2592
|
} else if (ctrlId === " elo" || ctrlId === "ole ") {
|
|
2464
2593
|
warnings.push({ page: sectionNum, message: `\uC2A4\uD0B5\uB41C \uC81C\uC5B4 \uC694\uC18C: ${ctrlId.trim()}`, code: "SKIPPED_IMAGE" });
|
|
@@ -2497,6 +2626,19 @@ function extractNoteText(records, ctrlIdx) {
|
|
|
2497
2626
|
}
|
|
2498
2627
|
return texts.length > 0 ? texts.join(" ") : null;
|
|
2499
2628
|
}
|
|
2629
|
+
function extractTextBoxText(records, ctrlIdx) {
|
|
2630
|
+
const ctrlLevel = records[ctrlIdx].level;
|
|
2631
|
+
const texts = [];
|
|
2632
|
+
for (let j = ctrlIdx + 1; j < records.length && j < ctrlIdx + 200; j++) {
|
|
2633
|
+
const r = records[j];
|
|
2634
|
+
if (r.level <= ctrlLevel) break;
|
|
2635
|
+
if (r.tagId === TAG_PARA_TEXT) {
|
|
2636
|
+
const t = extractText(r.data).trim();
|
|
2637
|
+
if (t) texts.push(t);
|
|
2638
|
+
}
|
|
2639
|
+
}
|
|
2640
|
+
return texts.length > 0 ? texts.join("\n") : null;
|
|
2641
|
+
}
|
|
2500
2642
|
function extractHyperlinkUrl(data) {
|
|
2501
2643
|
try {
|
|
2502
2644
|
const httpSig = Buffer.from("http", "utf16le");
|
|
@@ -2542,6 +2684,8 @@ function parseParagraphWithTables(records, startIdx) {
|
|
|
2542
2684
|
let text = "";
|
|
2543
2685
|
const tables = [];
|
|
2544
2686
|
const charShapeIds = [];
|
|
2687
|
+
const paraHeaderData = records[startIdx].data;
|
|
2688
|
+
const paraShapeId = paraHeaderData.length >= 10 ? paraHeaderData.readUInt16LE(8) : -1;
|
|
2545
2689
|
let i = startIdx + 1;
|
|
2546
2690
|
while (i < records.length) {
|
|
2547
2691
|
const rec = records[i];
|
|
@@ -2566,7 +2710,7 @@ function parseParagraphWithTables(records, startIdx) {
|
|
|
2566
2710
|
i++;
|
|
2567
2711
|
}
|
|
2568
2712
|
const trimmed = text.trim();
|
|
2569
|
-
return { paragraph: trimmed || null, tables, nextIdx: i, charShapeIds };
|
|
2713
|
+
return { paragraph: trimmed || null, tables, nextIdx: i, charShapeIds, paraShapeId };
|
|
2570
2714
|
}
|
|
2571
2715
|
function parseTableBlock(records, startIdx) {
|
|
2572
2716
|
const tableLevel = records[startIdx].level;
|
|
@@ -3384,7 +3528,7 @@ async function parsePdfDocument(buffer, options) {
|
|
|
3384
3528
|
if (totalChars / Math.max(parsedPageCount, 1) < 10) {
|
|
3385
3529
|
if (options?.ocr) {
|
|
3386
3530
|
try {
|
|
3387
|
-
const { ocrPages } = await import("./provider-
|
|
3531
|
+
const { ocrPages } = await import("./provider-EU3CG724.js");
|
|
3388
3532
|
const ocrBlocks = await ocrPages(doc, options.ocr, pageFilter, effectivePageCount);
|
|
3389
3533
|
if (ocrBlocks.length > 0) {
|
|
3390
3534
|
const ocrMarkdown = ocrBlocks.map((b) => b.text || "").filter(Boolean).join("\n\n");
|
|
@@ -4482,7 +4626,7 @@ async function parseXlsxDocument(buffer, options) {
|
|
|
4482
4626
|
}
|
|
4483
4627
|
let pageFilter = null;
|
|
4484
4628
|
if (options?.pages) {
|
|
4485
|
-
const { parsePageRange: parsePageRange2 } = await import("./page-range-
|
|
4629
|
+
const { parsePageRange: parsePageRange2 } = await import("./page-range-OF5I4PQY.js");
|
|
4486
4630
|
pageFilter = parsePageRange2(options.pages, sheets.length);
|
|
4487
4631
|
}
|
|
4488
4632
|
const blocks = [];
|
|
@@ -5365,4 +5509,4 @@ export {
|
|
|
5365
5509
|
extractFormFields,
|
|
5366
5510
|
parse
|
|
5367
5511
|
};
|
|
5368
|
-
//# sourceMappingURL=chunk-
|
|
5512
|
+
//# sourceMappingURL=chunk-4UH6ABAY.js.map
|