kordoc 2.0.1 → 2.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -21
- package/README.md +291 -291
- package/dist/{chunk-L4OFASDS.js → chunk-25TXW6EP.js} +2 -2
- package/dist/chunk-25TXW6EP.js.map +1 -0
- package/dist/{chunk-MOL7MDBG.js → chunk-3TBUDJDE.js} +1 -1
- package/dist/chunk-3TBUDJDE.js.map +1 -0
- package/dist/{chunk-JJ65GKUH.js → chunk-4UH6ABAY.js} +185 -41
- package/dist/chunk-4UH6ABAY.js.map +1 -0
- package/dist/cli.js +5 -5
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +181 -37
- package/dist/index.cjs.map +1 -1
- package/dist/index.js +181 -37
- package/dist/index.js.map +1 -1
- package/dist/mcp.js +3 -3
- package/dist/mcp.js.map +1 -1
- package/dist/page-range-OF5I4PQY.js +8 -0
- package/dist/{provider-A4FHJSID.js → provider-EU3CG724.js} +1 -1
- package/dist/provider-EU3CG724.js.map +1 -0
- package/dist/{utils-4HVKHULU.js → utils-BTZ4WSYX.js} +2 -2
- package/dist/{watch-RNZ3KESY.js → watch-QD3PDNXQ.js} +4 -4
- package/dist/watch-QD3PDNXQ.js.map +1 -0
- package/package.json +1 -1
- package/dist/chunk-JJ65GKUH.js.map +0 -1
- package/dist/chunk-L4OFASDS.js.map +0 -1
- package/dist/chunk-MOL7MDBG.js.map +0 -1
- package/dist/page-range-737B4EZW.js +0 -8
- package/dist/provider-A4FHJSID.js.map +0 -1
- package/dist/watch-RNZ3KESY.js.map +0 -1
- /package/dist/{page-range-737B4EZW.js.map → page-range-OF5I4PQY.js.map} +0 -0
- /package/dist/{utils-4HVKHULU.js.map → utils-BTZ4WSYX.js.map} +0 -0
package/dist/cli.js
CHANGED
|
@@ -2,12 +2,12 @@
|
|
|
2
2
|
import {
|
|
3
3
|
detectFormat,
|
|
4
4
|
parse
|
|
5
|
-
} from "./chunk-
|
|
5
|
+
} from "./chunk-4UH6ABAY.js";
|
|
6
6
|
import {
|
|
7
7
|
VERSION,
|
|
8
8
|
toArrayBuffer
|
|
9
|
-
} from "./chunk-
|
|
10
|
-
import "./chunk-
|
|
9
|
+
} from "./chunk-25TXW6EP.js";
|
|
10
|
+
import "./chunk-3TBUDJDE.js";
|
|
11
11
|
|
|
12
12
|
// src/cli.ts
|
|
13
13
|
import { readFileSync, writeFileSync, mkdirSync, statSync } from "fs";
|
|
@@ -92,7 +92,7 @@ program.name("kordoc").description("\uBAA8\uB450 \uD30C\uC2F1\uD574\uBC84\uB9AC\
|
|
|
92
92
|
process.stdout.write(output + "\n");
|
|
93
93
|
}
|
|
94
94
|
} catch (err) {
|
|
95
|
-
const { sanitizeError } = await import("./utils-
|
|
95
|
+
const { sanitizeError } = await import("./utils-BTZ4WSYX.js");
|
|
96
96
|
process.stderr.write(`
|
|
97
97
|
[kordoc] ERROR: ${fileName} \u2014 ${sanitizeError(err)}
|
|
98
98
|
`);
|
|
@@ -101,7 +101,7 @@ program.name("kordoc").description("\uBAA8\uB450 \uD30C\uC2F1\uD574\uBC84\uB9AC\
|
|
|
101
101
|
}
|
|
102
102
|
});
|
|
103
103
|
program.command("watch <dir>").description("\uB514\uB809\uD1A0\uB9AC \uAC10\uC2DC \u2014 \uC0C8 \uBB38\uC11C \uC790\uB3D9 \uBCC0\uD658").option("--webhook <url>", "\uACB0\uACFC \uC804\uC1A1 \uC6F9\uD6C5 URL").option("-d, --out-dir <dir>", "\uBCC0\uD658 \uACB0\uACFC \uCD9C\uB825 \uB514\uB809\uD1A0\uB9AC").option("-p, --pages <range>", "\uD398\uC774\uC9C0/\uC139\uC158 \uBC94\uC704").option("--format <type>", "\uCD9C\uB825 \uD615\uC2DD: markdown \uB610\uB294 json", "markdown").option("--silent", "\uC9C4\uD589 \uBA54\uC2DC\uC9C0 \uC228\uAE30\uAE30").action(async (dir, opts) => {
|
|
104
|
-
const { watchDirectory } = await import("./watch-
|
|
104
|
+
const { watchDirectory } = await import("./watch-QD3PDNXQ.js");
|
|
105
105
|
await watchDirectory({
|
|
106
106
|
dir,
|
|
107
107
|
outDir: opts.outDir,
|
package/dist/cli.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"sources":["../src/cli.ts"],"sourcesContent":["/** kordoc CLI — 모두 파싱해버리겠다 */\n\nimport { readFileSync, writeFileSync, mkdirSync, statSync } from \"fs\"\nimport { basename, resolve } from \"path\"\nimport { Command } from \"commander\"\nimport { parse, detectFormat } from \"./index.js\"\nimport type { ParseOptions } from \"./types.js\"\nimport { VERSION, toArrayBuffer } from \"./utils.js\"\n\nconst program = new Command()\n\nprogram\n .name(\"kordoc\")\n .description(\"모두 파싱해버리겠다 — HWP, HWPX, PDF, XLSX, DOCX → Markdown\")\n .version(VERSION)\n .argument(\"<files...>\", \"변환할 파일 경로 (HWP, HWPX, PDF, XLSX, DOCX)\")\n .option(\"-o, --output <path>\", \"출력 파일 경로 (단일 파일 시)\")\n .option(\"-d, --out-dir <dir>\", \"출력 디렉토리 (다중 파일 시)\")\n .option(\"-p, --pages <range>\", \"페이지/섹션 범위 (예: 1-3, 1,3,5)\")\n .option(\"--format <type>\", \"출력 형식: markdown (기본) 또는 json\", \"markdown\")\n .option(\"--no-header-footer\", \"PDF 머리글/바닥글 자동 제거\")\n .option(\"--silent\", \"진행 메시지 숨기기\")\n .action(async (files: string[], opts) => {\n const validFormats = [\"markdown\", \"json\"]\n if (!validFormats.includes(opts.format)) {\n process.stderr.write(`[kordoc] 지원하지 않는 형식: ${opts.format} (markdown 또는 json)\\n`)\n process.exit(1)\n }\n for (let fi = 0; fi < files.length; fi++) {\n const filePath = files[fi]\n const absPath = resolve(filePath)\n const fileName = basename(absPath)\n const filePrefix = files.length > 1 ? 
`[${fi + 1}/${files.length}] ` : \"\"\n\n try {\n const fileSize = statSync(absPath).size\n if (fileSize > 500 * 1024 * 1024) {\n process.stderr.write(`\\n[kordoc] SKIP: ${fileName} — 파일이 너무 큽니다 (${(fileSize / 1024 / 1024).toFixed(1)}MB)\\n`)\n process.exitCode = 1\n continue\n }\n const buffer = readFileSync(absPath)\n const arrayBuffer = toArrayBuffer(buffer)\n const format = detectFormat(arrayBuffer)\n\n if (!opts.silent) {\n process.stderr.write(`[kordoc] ${filePrefix}${fileName} (${format}) ...`)\n }\n\n const parseOptions: ParseOptions = {}\n if (opts.pages) parseOptions.pages = opts.pages as string\n if (opts.headerFooter === false) parseOptions.removeHeaderFooter = false\n if (!opts.silent) {\n parseOptions.onProgress = (current: number, total: number) => {\n process.stderr.write(`\\r[kordoc] ${filePrefix}${fileName} (${format}) [${current}/${total}]`)\n }\n }\n const result = await parse(arrayBuffer, parseOptions)\n\n if (!result.success) {\n process.stderr.write(` FAIL\\n`)\n process.stderr.write(` → ${result.error}\\n`)\n process.exitCode = 1\n continue\n }\n\n if (!opts.silent) process.stderr.write(` OK\\n`)\n\n let markdown = result.markdown\n // --out-dir 시 이미지 참조 경로에 images/ 접두사 추가\n if (opts.outDir && result.images?.length) {\n markdown = markdown.replace(/!\\[image\\]\\(image_/g, \"\n }\n const output = opts.format === \"json\"\n ? 
JSON.stringify(result, null, 2)\n : markdown\n\n // 이미지 저장 (--out-dir 또는 --output 시)\n const saveImages = (dir: string) => {\n if (!result.images?.length) return\n const imgDir = resolve(dir, \"images\")\n mkdirSync(imgDir, { recursive: true })\n for (const img of result.images) {\n writeFileSync(resolve(imgDir, img.filename), img.data)\n }\n if (!opts.silent) process.stderr.write(` → ${result.images.length}개 이미지 → ${imgDir}\\n`)\n }\n\n if (opts.output && files.length === 1) {\n writeFileSync(opts.output, output, \"utf-8\")\n if (!opts.silent) process.stderr.write(` → ${opts.output}\\n`)\n saveImages(resolve(opts.output, \"..\"))\n } else if (opts.outDir) {\n mkdirSync(opts.outDir, { recursive: true })\n const outExt = opts.format === \"json\" ? \".json\" : \".md\"\n const outPath = resolve(opts.outDir, fileName.replace(/\\.[^.]+$/, outExt))\n writeFileSync(outPath, output, \"utf-8\")\n if (!opts.silent) process.stderr.write(` → ${outPath}\\n`)\n saveImages(opts.outDir)\n } else {\n process.stdout.write(output + \"\\n\")\n }\n } catch (err) {\n const { sanitizeError } = await import(\"./utils.js\")\n process.stderr.write(`\\n[kordoc] ERROR: ${fileName} — ${sanitizeError(err)}\\n`)\n process.exitCode = 1\n }\n }\n })\n\nprogram\n .command(\"watch <dir>\")\n .description(\"디렉토리 감시 — 새 문서 자동 변환\")\n .option(\"--webhook <url>\", \"결과 전송 웹훅 URL\")\n .option(\"-d, --out-dir <dir>\", \"변환 결과 출력 디렉토리\")\n .option(\"-p, --pages <range>\", \"페이지/섹션 범위\")\n .option(\"--format <type>\", \"출력 형식: markdown 또는 json\", \"markdown\")\n .option(\"--silent\", \"진행 메시지 숨기기\")\n .action(async (dir: string, opts) => {\n const { watchDirectory } = await import(\"./watch.js\")\n await watchDirectory({\n dir,\n outDir: opts.outDir,\n webhook: opts.webhook,\n format: opts.format,\n pages: opts.pages,\n silent: opts.silent,\n })\n 
})\n\nprogram.parse()\n"],"mappings":";;;;;;;;;;;;AAEA,SAAS,cAAc,eAAe,WAAW,gBAAgB;AACjE,SAAS,UAAU,eAAe;AAClC,SAAS,eAAe;AAKxB,IAAM,UAAU,IAAI,QAAQ;AAE5B,QACG,KAAK,QAAQ,EACb,YAAY,2GAAoD,EAChE,QAAQ,OAAO,EACf,SAAS,cAAc,2EAAwC,EAC/D,OAAO,uBAAuB,2EAAoB,EAClD,OAAO,uBAAuB,0EAAmB,EACjD,OAAO,uBAAuB,mEAA2B,EACzD,OAAO,mBAAmB,wEAAgC,UAAU,EACpE,OAAO,sBAAsB,qEAAmB,EAChD,OAAO,YAAY,oDAAY,EAC/B,OAAO,OAAO,OAAiB,SAAS;AACvC,QAAM,eAAe,CAAC,YAAY,MAAM;AACxC,MAAI,CAAC,aAAa,SAAS,KAAK,MAAM,GAAG;AACvC,YAAQ,OAAO,MAAM,gEAAwB,KAAK,MAAM;AAAA,CAAuB;AAC/E,YAAQ,KAAK,CAAC;AAAA,EAChB;AACA,WAAS,KAAK,GAAG,KAAK,MAAM,QAAQ,MAAM;AACxC,UAAM,WAAW,MAAM,EAAE;AACzB,UAAM,UAAU,QAAQ,QAAQ;AAChC,UAAM,WAAW,SAAS,OAAO;AACjC,UAAM,aAAa,MAAM,SAAS,IAAI,IAAI,KAAK,CAAC,IAAI,MAAM,MAAM,OAAO;AAEvE,QAAI;AACF,YAAM,WAAW,SAAS,OAAO,EAAE;AACnC,UAAI,WAAW,MAAM,OAAO,MAAM;AAChC,gBAAQ,OAAO,MAAM;AAAA,iBAAoB,QAAQ,gEAAmB,WAAW,OAAO,MAAM,QAAQ,CAAC,CAAC;AAAA,CAAO;AAC7G,gBAAQ,WAAW;AACnB;AAAA,MACF;AACA,YAAM,SAAS,aAAa,OAAO;AACnC,YAAM,cAAc,cAAc,MAAM;AACxC,YAAM,SAAS,aAAa,WAAW;AAEvC,UAAI,CAAC,KAAK,QAAQ;AAChB,gBAAQ,OAAO,MAAM,YAAY,UAAU,GAAG,QAAQ,KAAK,MAAM,OAAO;AAAA,MAC1E;AAEA,YAAM,eAA6B,CAAC;AACpC,UAAI,KAAK,MAAO,cAAa,QAAQ,KAAK;AAC1C,UAAI,KAAK,iBAAiB,MAAO,cAAa,qBAAqB;AACnE,UAAI,CAAC,KAAK,QAAQ;AAChB,qBAAa,aAAa,CAAC,SAAiB,UAAkB;AAC5D,kBAAQ,OAAO,MAAM,cAAc,UAAU,GAAG,QAAQ,KAAK,MAAM,MAAM,OAAO,IAAI,KAAK,GAAG;AAAA,QAC9F;AAAA,MACF;AACA,YAAM,SAAS,MAAM,MAAM,aAAa,YAAY;AAEpD,UAAI,CAAC,OAAO,SAAS;AACnB,gBAAQ,OAAO,MAAM;AAAA,CAAS;AAC9B,gBAAQ,OAAO,MAAM,YAAO,OAAO,KAAK;AAAA,CAAI;AAC5C,gBAAQ,WAAW;AACnB;AAAA,MACF;AAEA,UAAI,CAAC,KAAK,OAAQ,SAAQ,OAAO,MAAM;AAAA,CAAO;AAE9C,UAAI,WAAW,OAAO;AAEtB,UAAI,KAAK,UAAU,OAAO,QAAQ,QAAQ;AACxC,mBAAW,SAAS,QAAQ,uBAAuB,wBAAwB;AAAA,MAC7E;AACA,YAAM,SAAS,KAAK,WAAW,SAC3B,KAAK,UAAU,QAAQ,MAAM,CAAC,IAC9B;AAGJ,YAAM,aAAa,CAAC,QAAgB;AAClC,YAAI,CAAC,OAAO,QAAQ,OAAQ;AAC5B,cAAM,SAAS,QAAQ,KAAK,QAAQ;AACpC,kBAAU,QAAQ,EAAE,WAAW,KAAK,CAAC;AACrC,mBAAW,OAAO,OAAO,QAAQ;AAC/B,wBAAc,QAAQ,QAAQ,IAAI,QAAQ,GAAG,IAAI,IAAI;AAAA,QACvD;AACA,YAAI,CAAC,KAAK,OAAQ,SAAQ,OAAO,MAAM,YAAO,OA
AO,OAAO,MAAM,oCAAW,MAAM;AAAA,CAAI;AAAA,MACzF;AAEA,UAAI,KAAK,UAAU,MAAM,WAAW,GAAG;AACrC,sBAAc,KAAK,QAAQ,QAAQ,OAAO;AAC1C,YAAI,CAAC,KAAK,OAAQ,SAAQ,OAAO,MAAM,YAAO,KAAK,MAAM;AAAA,CAAI;AAC7D,mBAAW,QAAQ,KAAK,QAAQ,IAAI,CAAC;AAAA,MACvC,WAAW,KAAK,QAAQ;AACtB,kBAAU,KAAK,QAAQ,EAAE,WAAW,KAAK,CAAC;AAC1C,cAAM,SAAS,KAAK,WAAW,SAAS,UAAU;AAClD,cAAM,UAAU,QAAQ,KAAK,QAAQ,SAAS,QAAQ,YAAY,MAAM,CAAC;AACzE,sBAAc,SAAS,QAAQ,OAAO;AACtC,YAAI,CAAC,KAAK,OAAQ,SAAQ,OAAO,MAAM,YAAO,OAAO;AAAA,CAAI;AACzD,mBAAW,KAAK,MAAM;AAAA,MACxB,OAAO;AACL,gBAAQ,OAAO,MAAM,SAAS,IAAI;AAAA,MACpC;AAAA,IACF,SAAS,KAAK;AACZ,YAAM,EAAE,cAAc,IAAI,MAAM,OAAO,qBAAY;AACnD,cAAQ,OAAO,MAAM;AAAA,kBAAqB,QAAQ,WAAM,cAAc,GAAG,CAAC;AAAA,CAAI;AAC9E,cAAQ,WAAW;AAAA,IACrB;AAAA,EACF;AACF,CAAC;AAEH,QACG,QAAQ,aAAa,EACrB,YAAY,4FAAsB,EAClC,OAAO,mBAAmB,4CAAc,EACxC,OAAO,uBAAuB,iEAAe,EAC7C,OAAO,uBAAuB,8CAAW,EACzC,OAAO,mBAAmB,yDAA2B,UAAU,EAC/D,OAAO,YAAY,oDAAY,EAC/B,OAAO,OAAO,KAAa,SAAS;AACnC,QAAM,EAAE,eAAe,IAAI,MAAM,OAAO,qBAAY;AACpD,QAAM,eAAe;AAAA,IACnB;AAAA,IACA,QAAQ,KAAK;AAAA,IACb,SAAS,KAAK;AAAA,IACd,QAAQ,KAAK;AAAA,IACb,OAAO,KAAK;AAAA,IACZ,QAAQ,KAAK;AAAA,EACf,CAAC;AACH,CAAC;AAEH,QAAQ,MAAM;","names":[]}
|
|
1
|
+
{"version":3,"sources":["../src/cli.ts"],"sourcesContent":["/** kordoc CLI — 모두 파싱해버리겠다 */\r\n\r\nimport { readFileSync, writeFileSync, mkdirSync, statSync } from \"fs\"\r\nimport { basename, resolve } from \"path\"\r\nimport { Command } from \"commander\"\r\nimport { parse, detectFormat } from \"./index.js\"\r\nimport type { ParseOptions } from \"./types.js\"\r\nimport { VERSION, toArrayBuffer } from \"./utils.js\"\r\n\r\nconst program = new Command()\r\n\r\nprogram\r\n .name(\"kordoc\")\r\n .description(\"모두 파싱해버리겠다 — HWP, HWPX, PDF, XLSX, DOCX → Markdown\")\r\n .version(VERSION)\r\n .argument(\"<files...>\", \"변환할 파일 경로 (HWP, HWPX, PDF, XLSX, DOCX)\")\r\n .option(\"-o, --output <path>\", \"출력 파일 경로 (단일 파일 시)\")\r\n .option(\"-d, --out-dir <dir>\", \"출력 디렉토리 (다중 파일 시)\")\r\n .option(\"-p, --pages <range>\", \"페이지/섹션 범위 (예: 1-3, 1,3,5)\")\r\n .option(\"--format <type>\", \"출력 형식: markdown (기본) 또는 json\", \"markdown\")\r\n .option(\"--no-header-footer\", \"PDF 머리글/바닥글 자동 제거\")\r\n .option(\"--silent\", \"진행 메시지 숨기기\")\r\n .action(async (files: string[], opts) => {\r\n const validFormats = [\"markdown\", \"json\"]\r\n if (!validFormats.includes(opts.format)) {\r\n process.stderr.write(`[kordoc] 지원하지 않는 형식: ${opts.format} (markdown 또는 json)\\n`)\r\n process.exit(1)\r\n }\r\n for (let fi = 0; fi < files.length; fi++) {\r\n const filePath = files[fi]\r\n const absPath = resolve(filePath)\r\n const fileName = basename(absPath)\r\n const filePrefix = files.length > 1 ? 
`[${fi + 1}/${files.length}] ` : \"\"\r\n\r\n try {\r\n const fileSize = statSync(absPath).size\r\n if (fileSize > 500 * 1024 * 1024) {\r\n process.stderr.write(`\\n[kordoc] SKIP: ${fileName} — 파일이 너무 큽니다 (${(fileSize / 1024 / 1024).toFixed(1)}MB)\\n`)\r\n process.exitCode = 1\r\n continue\r\n }\r\n const buffer = readFileSync(absPath)\r\n const arrayBuffer = toArrayBuffer(buffer)\r\n const format = detectFormat(arrayBuffer)\r\n\r\n if (!opts.silent) {\r\n process.stderr.write(`[kordoc] ${filePrefix}${fileName} (${format}) ...`)\r\n }\r\n\r\n const parseOptions: ParseOptions = {}\r\n if (opts.pages) parseOptions.pages = opts.pages as string\r\n if (opts.headerFooter === false) parseOptions.removeHeaderFooter = false\r\n if (!opts.silent) {\r\n parseOptions.onProgress = (current: number, total: number) => {\r\n process.stderr.write(`\\r[kordoc] ${filePrefix}${fileName} (${format}) [${current}/${total}]`)\r\n }\r\n }\r\n const result = await parse(arrayBuffer, parseOptions)\r\n\r\n if (!result.success) {\r\n process.stderr.write(` FAIL\\n`)\r\n process.stderr.write(` → ${result.error}\\n`)\r\n process.exitCode = 1\r\n continue\r\n }\r\n\r\n if (!opts.silent) process.stderr.write(` OK\\n`)\r\n\r\n let markdown = result.markdown\r\n // --out-dir 시 이미지 참조 경로에 images/ 접두사 추가\r\n if (opts.outDir && result.images?.length) {\r\n markdown = markdown.replace(/!\\[image\\]\\(image_/g, \"\r\n }\r\n const output = opts.format === \"json\"\r\n ? 
JSON.stringify(result, null, 2)\r\n : markdown\r\n\r\n // 이미지 저장 (--out-dir 또는 --output 시)\r\n const saveImages = (dir: string) => {\r\n if (!result.images?.length) return\r\n const imgDir = resolve(dir, \"images\")\r\n mkdirSync(imgDir, { recursive: true })\r\n for (const img of result.images) {\r\n writeFileSync(resolve(imgDir, img.filename), img.data)\r\n }\r\n if (!opts.silent) process.stderr.write(` → ${result.images.length}개 이미지 → ${imgDir}\\n`)\r\n }\r\n\r\n if (opts.output && files.length === 1) {\r\n writeFileSync(opts.output, output, \"utf-8\")\r\n if (!opts.silent) process.stderr.write(` → ${opts.output}\\n`)\r\n saveImages(resolve(opts.output, \"..\"))\r\n } else if (opts.outDir) {\r\n mkdirSync(opts.outDir, { recursive: true })\r\n const outExt = opts.format === \"json\" ? \".json\" : \".md\"\r\n const outPath = resolve(opts.outDir, fileName.replace(/\\.[^.]+$/, outExt))\r\n writeFileSync(outPath, output, \"utf-8\")\r\n if (!opts.silent) process.stderr.write(` → ${outPath}\\n`)\r\n saveImages(opts.outDir)\r\n } else {\r\n process.stdout.write(output + \"\\n\")\r\n }\r\n } catch (err) {\r\n const { sanitizeError } = await import(\"./utils.js\")\r\n process.stderr.write(`\\n[kordoc] ERROR: ${fileName} — ${sanitizeError(err)}\\n`)\r\n process.exitCode = 1\r\n }\r\n }\r\n })\r\n\r\nprogram\r\n .command(\"watch <dir>\")\r\n .description(\"디렉토리 감시 — 새 문서 자동 변환\")\r\n .option(\"--webhook <url>\", \"결과 전송 웹훅 URL\")\r\n .option(\"-d, --out-dir <dir>\", \"변환 결과 출력 디렉토리\")\r\n .option(\"-p, --pages <range>\", \"페이지/섹션 범위\")\r\n .option(\"--format <type>\", \"출력 형식: markdown 또는 json\", \"markdown\")\r\n .option(\"--silent\", \"진행 메시지 숨기기\")\r\n .action(async (dir: string, opts) => {\r\n const { watchDirectory } = await import(\"./watch.js\")\r\n await watchDirectory({\r\n dir,\r\n outDir: opts.outDir,\r\n webhook: opts.webhook,\r\n format: opts.format,\r\n pages: opts.pages,\r\n silent: opts.silent,\r\n })\r\n 
})\r\n\r\nprogram.parse()\r\n"],"mappings":";;;;;;;;;;;;AAEA,SAAS,cAAc,eAAe,WAAW,gBAAgB;AACjE,SAAS,UAAU,eAAe;AAClC,SAAS,eAAe;AAKxB,IAAM,UAAU,IAAI,QAAQ;AAE5B,QACG,KAAK,QAAQ,EACb,YAAY,2GAAoD,EAChE,QAAQ,OAAO,EACf,SAAS,cAAc,2EAAwC,EAC/D,OAAO,uBAAuB,2EAAoB,EAClD,OAAO,uBAAuB,0EAAmB,EACjD,OAAO,uBAAuB,mEAA2B,EACzD,OAAO,mBAAmB,wEAAgC,UAAU,EACpE,OAAO,sBAAsB,qEAAmB,EAChD,OAAO,YAAY,oDAAY,EAC/B,OAAO,OAAO,OAAiB,SAAS;AACvC,QAAM,eAAe,CAAC,YAAY,MAAM;AACxC,MAAI,CAAC,aAAa,SAAS,KAAK,MAAM,GAAG;AACvC,YAAQ,OAAO,MAAM,gEAAwB,KAAK,MAAM;AAAA,CAAuB;AAC/E,YAAQ,KAAK,CAAC;AAAA,EAChB;AACA,WAAS,KAAK,GAAG,KAAK,MAAM,QAAQ,MAAM;AACxC,UAAM,WAAW,MAAM,EAAE;AACzB,UAAM,UAAU,QAAQ,QAAQ;AAChC,UAAM,WAAW,SAAS,OAAO;AACjC,UAAM,aAAa,MAAM,SAAS,IAAI,IAAI,KAAK,CAAC,IAAI,MAAM,MAAM,OAAO;AAEvE,QAAI;AACF,YAAM,WAAW,SAAS,OAAO,EAAE;AACnC,UAAI,WAAW,MAAM,OAAO,MAAM;AAChC,gBAAQ,OAAO,MAAM;AAAA,iBAAoB,QAAQ,gEAAmB,WAAW,OAAO,MAAM,QAAQ,CAAC,CAAC;AAAA,CAAO;AAC7G,gBAAQ,WAAW;AACnB;AAAA,MACF;AACA,YAAM,SAAS,aAAa,OAAO;AACnC,YAAM,cAAc,cAAc,MAAM;AACxC,YAAM,SAAS,aAAa,WAAW;AAEvC,UAAI,CAAC,KAAK,QAAQ;AAChB,gBAAQ,OAAO,MAAM,YAAY,UAAU,GAAG,QAAQ,KAAK,MAAM,OAAO;AAAA,MAC1E;AAEA,YAAM,eAA6B,CAAC;AACpC,UAAI,KAAK,MAAO,cAAa,QAAQ,KAAK;AAC1C,UAAI,KAAK,iBAAiB,MAAO,cAAa,qBAAqB;AACnE,UAAI,CAAC,KAAK,QAAQ;AAChB,qBAAa,aAAa,CAAC,SAAiB,UAAkB;AAC5D,kBAAQ,OAAO,MAAM,cAAc,UAAU,GAAG,QAAQ,KAAK,MAAM,MAAM,OAAO,IAAI,KAAK,GAAG;AAAA,QAC9F;AAAA,MACF;AACA,YAAM,SAAS,MAAM,MAAM,aAAa,YAAY;AAEpD,UAAI,CAAC,OAAO,SAAS;AACnB,gBAAQ,OAAO,MAAM;AAAA,CAAS;AAC9B,gBAAQ,OAAO,MAAM,YAAO,OAAO,KAAK;AAAA,CAAI;AAC5C,gBAAQ,WAAW;AACnB;AAAA,MACF;AAEA,UAAI,CAAC,KAAK,OAAQ,SAAQ,OAAO,MAAM;AAAA,CAAO;AAE9C,UAAI,WAAW,OAAO;AAEtB,UAAI,KAAK,UAAU,OAAO,QAAQ,QAAQ;AACxC,mBAAW,SAAS,QAAQ,uBAAuB,wBAAwB;AAAA,MAC7E;AACA,YAAM,SAAS,KAAK,WAAW,SAC3B,KAAK,UAAU,QAAQ,MAAM,CAAC,IAC9B;AAGJ,YAAM,aAAa,CAAC,QAAgB;AAClC,YAAI,CAAC,OAAO,QAAQ,OAAQ;AAC5B,cAAM,SAAS,QAAQ,KAAK,QAAQ;AACpC,kBAAU,QAAQ,EAAE,WAAW,KAAK,CAAC;AACrC,mBAAW,OAAO,OAAO,QAAQ;AAC/B,wBAAc,QAAQ,QAAQ,IAAI,QAAQ,GAAG,IAAI,IAAI;AAAA,QACvD;AACA,YAAI,CAAC,KAAK,OAAQ,SAAQ,OAAO,MAAM,Y
AAO,OAAO,OAAO,MAAM,oCAAW,MAAM;AAAA,CAAI;AAAA,MACzF;AAEA,UAAI,KAAK,UAAU,MAAM,WAAW,GAAG;AACrC,sBAAc,KAAK,QAAQ,QAAQ,OAAO;AAC1C,YAAI,CAAC,KAAK,OAAQ,SAAQ,OAAO,MAAM,YAAO,KAAK,MAAM;AAAA,CAAI;AAC7D,mBAAW,QAAQ,KAAK,QAAQ,IAAI,CAAC;AAAA,MACvC,WAAW,KAAK,QAAQ;AACtB,kBAAU,KAAK,QAAQ,EAAE,WAAW,KAAK,CAAC;AAC1C,cAAM,SAAS,KAAK,WAAW,SAAS,UAAU;AAClD,cAAM,UAAU,QAAQ,KAAK,QAAQ,SAAS,QAAQ,YAAY,MAAM,CAAC;AACzE,sBAAc,SAAS,QAAQ,OAAO;AACtC,YAAI,CAAC,KAAK,OAAQ,SAAQ,OAAO,MAAM,YAAO,OAAO;AAAA,CAAI;AACzD,mBAAW,KAAK,MAAM;AAAA,MACxB,OAAO;AACL,gBAAQ,OAAO,MAAM,SAAS,IAAI;AAAA,MACpC;AAAA,IACF,SAAS,KAAK;AACZ,YAAM,EAAE,cAAc,IAAI,MAAM,OAAO,qBAAY;AACnD,cAAQ,OAAO,MAAM;AAAA,kBAAqB,QAAQ,WAAM,cAAc,GAAG,CAAC;AAAA,CAAI;AAC9E,cAAQ,WAAW;AAAA,IACrB;AAAA,EACF;AACF,CAAC;AAEH,QACG,QAAQ,aAAa,EACrB,YAAY,4FAAsB,EAClC,OAAO,mBAAmB,4CAAc,EACxC,OAAO,uBAAuB,iEAAe,EAC7C,OAAO,uBAAuB,8CAAW,EACzC,OAAO,mBAAmB,yDAA2B,UAAU,EAC/D,OAAO,YAAY,oDAAY,EAC/B,OAAO,OAAO,KAAa,SAAS;AACnC,QAAM,EAAE,eAAe,IAAI,MAAM,OAAO,qBAAY;AACpD,QAAM,eAAe;AAAA,IACnB;AAAA,IACA,QAAQ,KAAK;AAAA,IACb,SAAS,KAAK;AAAA,IACd,QAAQ,KAAK;AAAA,IACb,OAAO,KAAK;AAAA,IACZ,QAAQ,KAAK;AAAA,EACf,CAAC;AACH,CAAC;AAEH,QAAQ,MAAM;","names":[]}
|
package/dist/index.cjs
CHANGED
|
@@ -182,7 +182,7 @@ var import_zlib = require("zlib");
|
|
|
182
182
|
var import_xmldom = require("@xmldom/xmldom");
|
|
183
183
|
|
|
184
184
|
// src/utils.ts
|
|
185
|
-
var VERSION = true ? "2.0.
|
|
185
|
+
var VERSION = true ? "2.0.3" : "0.0.0-dev";
|
|
186
186
|
function toArrayBuffer(buf) {
|
|
187
187
|
if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
|
|
188
188
|
return buf.buffer;
|
|
@@ -371,6 +371,47 @@ function sanitizeText(text) {
|
|
|
371
371
|
}
|
|
372
372
|
return result;
|
|
373
373
|
}
|
|
374
|
+
function flattenLayoutTables(blocks) {
|
|
375
|
+
const result = [];
|
|
376
|
+
for (const block of blocks) {
|
|
377
|
+
if (block.type !== "table" || !block.table) {
|
|
378
|
+
result.push(block);
|
|
379
|
+
continue;
|
|
380
|
+
}
|
|
381
|
+
const { rows: numRows, cols: numCols, cells } = block.table;
|
|
382
|
+
if (numRows === 1 && numCols === 1) {
|
|
383
|
+
result.push(block);
|
|
384
|
+
continue;
|
|
385
|
+
}
|
|
386
|
+
if (numRows <= 3) {
|
|
387
|
+
let totalNewlines = 0;
|
|
388
|
+
let totalTextLen = 0;
|
|
389
|
+
for (let r = 0; r < numRows; r++) {
|
|
390
|
+
for (let c = 0; c < numCols; c++) {
|
|
391
|
+
const t = cells[r]?.[c]?.text || "";
|
|
392
|
+
totalNewlines += (t.match(/\n/g) || []).length;
|
|
393
|
+
totalTextLen += t.length;
|
|
394
|
+
}
|
|
395
|
+
}
|
|
396
|
+
if (totalNewlines > 5 || numRows <= 2 && totalTextLen > 300) {
|
|
397
|
+
for (let r = 0; r < numRows; r++) {
|
|
398
|
+
for (let c = 0; c < numCols; c++) {
|
|
399
|
+
const cellText = cells[r]?.[c]?.text?.trim();
|
|
400
|
+
if (!cellText) continue;
|
|
401
|
+
for (const line of cellText.split("\n")) {
|
|
402
|
+
const trimmed = line.trim();
|
|
403
|
+
if (!trimmed) continue;
|
|
404
|
+
result.push({ type: "paragraph", text: trimmed, pageNumber: block.pageNumber });
|
|
405
|
+
}
|
|
406
|
+
}
|
|
407
|
+
}
|
|
408
|
+
continue;
|
|
409
|
+
}
|
|
410
|
+
}
|
|
411
|
+
result.push(block);
|
|
412
|
+
}
|
|
413
|
+
return result;
|
|
414
|
+
}
|
|
374
415
|
function blocksToMarkdown(blocks) {
|
|
375
416
|
const lines = [];
|
|
376
417
|
for (let i = 0; i < blocks.length; i++) {
|
|
@@ -432,8 +473,11 @@ function blocksToMarkdown(blocks) {
|
|
|
432
473
|
if (lines.length > 0 && lines[lines.length - 1] !== "") {
|
|
433
474
|
lines.push("");
|
|
434
475
|
}
|
|
435
|
-
|
|
436
|
-
|
|
476
|
+
const tableMd = tableToMarkdown(block.table);
|
|
477
|
+
if (tableMd) {
|
|
478
|
+
lines.push(tableMd);
|
|
479
|
+
lines.push("");
|
|
480
|
+
}
|
|
437
481
|
}
|
|
438
482
|
}
|
|
439
483
|
return lines.join("\n").trim();
|
|
@@ -443,6 +487,7 @@ function tableToMarkdown(table) {
|
|
|
443
487
|
const { cells, rows: numRows, cols: numCols } = table;
|
|
444
488
|
if (numRows === 1 && numCols === 1) {
|
|
445
489
|
const content = sanitizeText(cells[0][0].text);
|
|
490
|
+
if (!content) return "";
|
|
446
491
|
return content.split(/\n/).map((line) => {
|
|
447
492
|
const trimmed = line.trim();
|
|
448
493
|
if (!trimmed) return "";
|
|
@@ -479,9 +524,9 @@ function tableToMarkdown(table) {
|
|
|
479
524
|
const row = display[r];
|
|
480
525
|
const isEmptyPlaceholder = row.every((cell) => cell === "");
|
|
481
526
|
if (isEmptyPlaceholder) continue;
|
|
482
|
-
const hasSkippedCols = row.some((cell, c) => cell === "" && skip.has(`${r},${c}`));
|
|
483
527
|
const nonEmptyCols = row.filter((cell) => cell !== "");
|
|
484
|
-
|
|
528
|
+
const hasSkipInRow = row.some((_, c) => skip.has(`${r},${c}`));
|
|
529
|
+
if (!hasSkipInRow && nonEmptyCols.length === 1 && row[0] !== "" && row.slice(1).every((c) => c === "")) {
|
|
485
530
|
pendingFirstCol = row[0];
|
|
486
531
|
continue;
|
|
487
532
|
}
|
|
@@ -896,7 +941,8 @@ function detectHwpxHeadings(blocks, styleMap) {
|
|
|
896
941
|
else if (ratio >= HEADING_RATIO_H2) level = 2;
|
|
897
942
|
else if (ratio >= HEADING_RATIO_H3) level = 3;
|
|
898
943
|
}
|
|
899
|
-
|
|
944
|
+
const compactText = text.replace(/\s+/g, "");
|
|
945
|
+
if (/^제\d+[조장절편]/.test(compactText) && text.length <= 50) {
|
|
900
946
|
if (level === 0) level = 3;
|
|
901
947
|
}
|
|
902
948
|
if (level > 0) {
|
|
@@ -948,9 +994,14 @@ function walkSection(node, blocks, tableCtx, tableStack, styleMap, warnings, sec
|
|
|
948
994
|
if (newTable.rows.length > 0) {
|
|
949
995
|
if (tableStack.length > 0) {
|
|
950
996
|
const parentTable = tableStack.pop();
|
|
951
|
-
const
|
|
952
|
-
if (
|
|
953
|
-
|
|
997
|
+
const nestedCols = Math.max(...newTable.rows.map((r) => r.length));
|
|
998
|
+
if (newTable.rows.length >= 3 && nestedCols >= 2) {
|
|
999
|
+
blocks.push({ type: "table", table: buildTable(newTable.rows), pageNumber: sectionNum });
|
|
1000
|
+
} else {
|
|
1001
|
+
const nestedText = convertTableToText(newTable.rows);
|
|
1002
|
+
if (parentTable.cell) {
|
|
1003
|
+
parentTable.cell.text += (parentTable.cell.text ? "\n" : "") + nestedText;
|
|
1004
|
+
}
|
|
954
1005
|
}
|
|
955
1006
|
tableCtx = parentTable;
|
|
956
1007
|
} else {
|
|
@@ -1050,9 +1101,14 @@ function walkParagraphChildren(node, blocks, tableCtx, tableStack, styleMap, war
|
|
|
1050
1101
|
if (newTable.rows.length > 0) {
|
|
1051
1102
|
if (tableStack.length > 0) {
|
|
1052
1103
|
const parentTable = tableStack.pop();
|
|
1053
|
-
const
|
|
1054
|
-
if (
|
|
1055
|
-
|
|
1104
|
+
const nestedCols = Math.max(...newTable.rows.map((r) => r.length));
|
|
1105
|
+
if (newTable.rows.length >= 3 && nestedCols >= 2) {
|
|
1106
|
+
blocks.push({ type: "table", table: buildTable(newTable.rows), pageNumber: sectionNum });
|
|
1107
|
+
} else {
|
|
1108
|
+
const nestedText = convertTableToText(newTable.rows);
|
|
1109
|
+
if (parentTable.cell) {
|
|
1110
|
+
parentTable.cell.text += (parentTable.cell.text ? "\n" : "") + nestedText;
|
|
1111
|
+
}
|
|
1056
1112
|
}
|
|
1057
1113
|
tableCtx = parentTable;
|
|
1058
1114
|
} else {
|
|
@@ -1063,13 +1119,20 @@ function walkParagraphChildren(node, blocks, tableCtx, tableStack, styleMap, war
|
|
|
1063
1119
|
tableCtx = tableStack.length > 0 ? tableStack.pop() : null;
|
|
1064
1120
|
}
|
|
1065
1121
|
} else if (localTag === "pic" || localTag === "shape" || localTag === "drawingObject") {
|
|
1066
|
-
const
|
|
1067
|
-
if (
|
|
1068
|
-
|
|
1069
|
-
} else
|
|
1070
|
-
|
|
1122
|
+
const drawTextChild = findDescendant(el, "drawText");
|
|
1123
|
+
if (drawTextChild) {
|
|
1124
|
+
extractDrawTextBlocks(drawTextChild, blocks, styleMap, sectionNum);
|
|
1125
|
+
} else {
|
|
1126
|
+
const imgRef = extractImageRef(el);
|
|
1127
|
+
if (imgRef) {
|
|
1128
|
+
blocks.push({ type: "image", text: imgRef, pageNumber: sectionNum });
|
|
1129
|
+
} else if (warnings && sectionNum) {
|
|
1130
|
+
warnings.push({ page: sectionNum, message: `\uC2A4\uD0B5\uB41C \uC694\uC18C: ${localTag}`, code: "SKIPPED_IMAGE" });
|
|
1131
|
+
}
|
|
1071
1132
|
}
|
|
1072
|
-
} else if (localTag === "
|
|
1133
|
+
} else if (localTag === "drawText") {
|
|
1134
|
+
extractDrawTextBlocks(el, blocks, styleMap, sectionNum);
|
|
1135
|
+
} else if (localTag === "r" || localTag === "run" || localTag === "ctrl" || localTag === "rect" || localTag === "ellipse" || localTag === "polygon" || localTag === "line" || localTag === "arc" || localTag === "curve" || localTag === "connectLine" || localTag === "container") {
|
|
1073
1136
|
walkChildren(el, d + 1);
|
|
1074
1137
|
}
|
|
1075
1138
|
}
|
|
@@ -1077,6 +1140,40 @@ function walkParagraphChildren(node, blocks, tableCtx, tableStack, styleMap, war
|
|
|
1077
1140
|
walkChildren(node, depth);
|
|
1078
1141
|
return tableCtx;
|
|
1079
1142
|
}
|
|
1143
|
+
function findDescendant(node, targetTag, depth = 0) {
|
|
1144
|
+
if (depth > 5) return null;
|
|
1145
|
+
const children = node.childNodes;
|
|
1146
|
+
if (!children) return null;
|
|
1147
|
+
for (let i = 0; i < children.length; i++) {
|
|
1148
|
+
const child = children[i];
|
|
1149
|
+
if (child.nodeType !== 1) continue;
|
|
1150
|
+
const tag = (child.tagName || child.localName || "").replace(/^[^:]+:/, "");
|
|
1151
|
+
if (tag === targetTag) return child;
|
|
1152
|
+
const found = findDescendant(child, targetTag, depth + 1);
|
|
1153
|
+
if (found) return found;
|
|
1154
|
+
}
|
|
1155
|
+
return null;
|
|
1156
|
+
}
|
|
1157
|
+
function extractDrawTextBlocks(drawTextNode, blocks, styleMap, sectionNum) {
|
|
1158
|
+
const children = drawTextNode.childNodes;
|
|
1159
|
+
if (!children) return;
|
|
1160
|
+
for (let i = 0; i < children.length; i++) {
|
|
1161
|
+
const child = children[i];
|
|
1162
|
+
if (child.nodeType !== 1) continue;
|
|
1163
|
+
const tag = (child.tagName || child.localName || "").replace(/^[^:]+:/, "");
|
|
1164
|
+
if (tag === "subList" || tag === "p" || tag === "para") {
|
|
1165
|
+
if (tag === "subList") {
|
|
1166
|
+
extractDrawTextBlocks(child, blocks, styleMap, sectionNum);
|
|
1167
|
+
} else {
|
|
1168
|
+
const info = extractParagraphInfo(child, styleMap);
|
|
1169
|
+
const text = info.text.trim();
|
|
1170
|
+
if (text) {
|
|
1171
|
+
blocks.push({ type: "paragraph", text, style: info.style ?? void 0, pageNumber: sectionNum });
|
|
1172
|
+
}
|
|
1173
|
+
}
|
|
1174
|
+
}
|
|
1175
|
+
}
|
|
1176
|
+
}
|
|
1080
1177
|
function extractParagraphInfo(para, styleMap) {
|
|
1081
1178
|
let text = "";
|
|
1082
1179
|
let href;
|
|
@@ -1095,11 +1192,18 @@ function extractParagraphInfo(para, styleMap) {
|
|
|
1095
1192
|
const tag = (child.tagName || child.localName || "").replace(/^[^:]+:/, "");
|
|
1096
1193
|
switch (tag) {
|
|
1097
1194
|
case "t":
|
|
1098
|
-
|
|
1195
|
+
walk(child);
|
|
1099
1196
|
break;
|
|
1100
|
-
|
|
1101
|
-
|
|
1197
|
+
// 자식 순회 (tab 등 하위 요소 처리)
|
|
1198
|
+
case "tab": {
|
|
1199
|
+
const leader = child.getAttribute("leader");
|
|
1200
|
+
if (leader && leader !== "0") {
|
|
1201
|
+
text += "";
|
|
1202
|
+
} else {
|
|
1203
|
+
text += " ";
|
|
1204
|
+
}
|
|
1102
1205
|
break;
|
|
1206
|
+
}
|
|
1103
1207
|
case "br":
|
|
1104
1208
|
if ((child.getAttribute("type") || "line") === "line") text += "\n";
|
|
1105
1209
|
break;
|
|
@@ -1166,6 +1270,8 @@ function extractParagraphInfo(para, styleMap) {
|
|
|
1166
1270
|
}
|
|
1167
1271
|
};
|
|
1168
1272
|
walk(para);
|
|
1273
|
+
const leaderIdx = text.indexOf("");
|
|
1274
|
+
if (leaderIdx >= 0) text = text.substring(0, leaderIdx);
|
|
1169
1275
|
let cleanText = text.replace(/[ \t]+/g, " ").trim();
|
|
1170
1276
|
if (/^그림입니다\.?\s*원본\s*그림의\s*(이름|크기)/.test(cleanText)) cleanText = "";
|
|
1171
1277
|
cleanText = cleanText.replace(/그림입니다\.?\s*원본\s*그림의\s*(이름|크기)[^\n]*(\n[^\n]*원본\s*그림의\s*(이름|크기)[^\n]*)*/g, "").trim();
|
|
@@ -1204,8 +1310,9 @@ var TAG_CHAR_SHAPE = 68;
|
|
|
1204
1310
|
var TAG_CTRL_HEADER = 71;
|
|
1205
1311
|
var TAG_LIST_HEADER = 72;
|
|
1206
1312
|
var TAG_TABLE = 77;
|
|
1207
|
-
var TAG_DOC_CHAR_SHAPE =
|
|
1208
|
-
var
|
|
1313
|
+
var TAG_DOC_CHAR_SHAPE = 21;
|
|
1314
|
+
var TAG_DOC_PARA_SHAPE = 25;
|
|
1315
|
+
var TAG_DOC_STYLE = 26;
|
|
1209
1316
|
var CHAR_LINE = 0;
|
|
1210
1317
|
var CHAR_SECTION_BREAK = 10;
|
|
1211
1318
|
var CHAR_PARA = 13;
|
|
@@ -1261,8 +1368,14 @@ function parseFileHeader(data) {
|
|
|
1261
1368
|
}
|
|
1262
1369
|
function parseDocInfo(records) {
|
|
1263
1370
|
const charShapes = [];
|
|
1371
|
+
const paraShapes = [];
|
|
1264
1372
|
const styles = [];
|
|
1265
1373
|
for (const rec of records) {
|
|
1374
|
+
if (rec.tagId === TAG_DOC_PARA_SHAPE && rec.data.length >= 4) {
|
|
1375
|
+
const flags = rec.data.readUInt32LE(0);
|
|
1376
|
+
const outlineLevel = flags >> 25 & 7;
|
|
1377
|
+
paraShapes.push({ outlineLevel });
|
|
1378
|
+
}
|
|
1266
1379
|
if (rec.tagId === TAG_DOC_CHAR_SHAPE && rec.data.length >= 18) {
|
|
1267
1380
|
if (rec.data.length >= 50) {
|
|
1268
1381
|
const fontSize = rec.data.readUInt32LE(42);
|
|
@@ -1302,7 +1415,7 @@ function parseDocInfo(records) {
|
|
|
1302
1415
|
}
|
|
1303
1416
|
}
|
|
1304
1417
|
}
|
|
1305
|
-
return { charShapes, styles };
|
|
1418
|
+
return { charShapes, paraShapes, styles };
|
|
1306
1419
|
}
|
|
1307
1420
|
function extractText(data) {
|
|
1308
1421
|
let result = "";
|
|
@@ -2314,12 +2427,13 @@ function parseHwp5Document(buffer, options) {
|
|
|
2314
2427
|
}
|
|
2315
2428
|
}
|
|
2316
2429
|
const images = cfb ? extractHwp5Images(cfb, blocks, compressed, warnings) : extractHwp5ImagesLenient(lenientCfb, blocks, compressed, warnings);
|
|
2430
|
+
const flatBlocks = flattenLayoutTables(blocks);
|
|
2317
2431
|
if (docInfo) {
|
|
2318
|
-
detectHwp5Headings(
|
|
2432
|
+
detectHwp5Headings(flatBlocks, docInfo);
|
|
2319
2433
|
}
|
|
2320
|
-
const outline =
|
|
2321
|
-
const markdown = blocksToMarkdown(
|
|
2322
|
-
return { markdown, blocks, metadata, outline: outline.length > 0 ? outline : void 0, warnings: warnings.length > 0 ? warnings : void 0, images: images.length > 0 ? images : void 0 };
|
|
2434
|
+
const outline = flatBlocks.filter((b) => b.type === "heading" && b.level && b.text).map((b) => ({ level: b.level, text: b.text, pageNumber: b.pageNumber }));
|
|
2435
|
+
const markdown = blocksToMarkdown(flatBlocks);
|
|
2436
|
+
return { markdown, blocks: flatBlocks, metadata, outline: outline.length > 0 ? outline : void 0, warnings: warnings.length > 0 ? warnings : void 0, images: images.length > 0 ? images : void 0 };
|
|
2323
2437
|
}
|
|
2324
2438
|
function parseDocInfoStream(cfb, compressed) {
|
|
2325
2439
|
try {
|
|
@@ -2370,16 +2484,21 @@ function detectHwp5Headings(blocks, docInfo) {
|
|
|
2370
2484
|
}
|
|
2371
2485
|
if (baseFontSize <= 0) return;
|
|
2372
2486
|
for (const block of blocks) {
|
|
2373
|
-
if (block.type
|
|
2487
|
+
if (block.type === "heading") continue;
|
|
2488
|
+
if (block.type !== "paragraph" || !block.text) continue;
|
|
2374
2489
|
const text = block.text.trim();
|
|
2375
2490
|
if (text.length === 0 || text.length > 200) continue;
|
|
2376
2491
|
if (/^\d+$/.test(text)) continue;
|
|
2377
|
-
const ratio = block.style.fontSize / baseFontSize;
|
|
2378
2492
|
let level = 0;
|
|
2379
|
-
if (
|
|
2380
|
-
|
|
2381
|
-
|
|
2382
|
-
|
|
2493
|
+
if (block.style?.fontSize && baseFontSize > 0) {
|
|
2494
|
+
const ratio = block.style.fontSize / baseFontSize;
|
|
2495
|
+
if (ratio >= HEADING_RATIO_H1) level = 1;
|
|
2496
|
+
else if (ratio >= HEADING_RATIO_H2) level = 2;
|
|
2497
|
+
else if (ratio >= HEADING_RATIO_H3) level = 3;
|
|
2498
|
+
}
|
|
2499
|
+
if (/^제\d+[장절편]\s/.test(text) && text.length <= 50) {
|
|
2500
|
+
if (level === 0) level = 2;
|
|
2501
|
+
} else if (/^제\d+(조의?\d*)\s*[\((]/.test(text) && text.length <= 80) {
|
|
2383
2502
|
if (level === 0) level = 3;
|
|
2384
2503
|
}
|
|
2385
2504
|
if (level > 0) {
|
|
@@ -2611,13 +2730,20 @@ function parseSection(records, docInfo, warnings, sectionNum) {
|
|
|
2611
2730
|
while (i < records.length) {
|
|
2612
2731
|
const rec = records[i];
|
|
2613
2732
|
if (rec.tagId === TAG_PARA_HEADER && rec.level === 0) {
|
|
2614
|
-
const { paragraph, tables, nextIdx, charShapeIds } = parseParagraphWithTables(records, i);
|
|
2733
|
+
const { paragraph, tables, nextIdx, charShapeIds, paraShapeId } = parseParagraphWithTables(records, i);
|
|
2615
2734
|
if (paragraph) {
|
|
2616
2735
|
const block = { type: "paragraph", text: paragraph, pageNumber: sectionNum };
|
|
2617
2736
|
if (docInfo && charShapeIds.length > 0) {
|
|
2618
2737
|
const style = resolveCharStyle(charShapeIds, docInfo);
|
|
2619
2738
|
if (style) block.style = style;
|
|
2620
2739
|
}
|
|
2740
|
+
if (docInfo && paraShapeId >= 0 && paraShapeId < docInfo.paraShapes.length) {
|
|
2741
|
+
const ol = docInfo.paraShapes[paraShapeId].outlineLevel;
|
|
2742
|
+
if (ol >= 1 && ol <= 6) {
|
|
2743
|
+
block.type = "heading";
|
|
2744
|
+
block.level = ol;
|
|
2745
|
+
}
|
|
2746
|
+
}
|
|
2621
2747
|
blocks.push(block);
|
|
2622
2748
|
}
|
|
2623
2749
|
for (const t of tables) blocks.push({ type: "table", table: t, pageNumber: sectionNum });
|
|
@@ -2637,7 +2763,10 @@ function parseSection(records, docInfo, warnings, sectionNum) {
|
|
|
2637
2763
|
if (binId >= 0) {
|
|
2638
2764
|
blocks.push({ type: "image", text: String(binId), pageNumber: sectionNum });
|
|
2639
2765
|
} else {
|
|
2640
|
-
|
|
2766
|
+
const boxText = extractTextBoxText(records, i);
|
|
2767
|
+
if (boxText) {
|
|
2768
|
+
blocks.push({ type: "paragraph", text: boxText, pageNumber: sectionNum });
|
|
2769
|
+
}
|
|
2641
2770
|
}
|
|
2642
2771
|
} else if (ctrlId === " elo" || ctrlId === "ole ") {
|
|
2643
2772
|
warnings.push({ page: sectionNum, message: `\uC2A4\uD0B5\uB41C \uC81C\uC5B4 \uC694\uC18C: ${ctrlId.trim()}`, code: "SKIPPED_IMAGE" });
|
|
@@ -2676,6 +2805,19 @@ function extractNoteText(records, ctrlIdx) {
|
|
|
2676
2805
|
}
|
|
2677
2806
|
return texts.length > 0 ? texts.join(" ") : null;
|
|
2678
2807
|
}
|
|
2808
|
+
/**
 * Gathers the text of the records nested beneath a control record.
 *
 * Starting right after `records[ctrlIdx]`, the scan walks forward while the
 * records sit at a deeper `level` than the control record itself, and stops
 * at the first record whose level is equal or shallower. Every
 * `TAG_PARA_TEXT` record met on the way is decoded with `extractText`,
 * trimmed, and kept when non-empty.
 *
 * @param records Flat list of records; each one exposes `level`, `tagId` and `data`.
 * @param ctrlIdx Index of the control record whose nested text is wanted.
 * @returns The collected pieces joined with a newline, or `null` when no
 *          non-empty text was found.
 */
function extractTextBoxText(records, ctrlIdx) {
  const parentLevel = records[ctrlIdx].level;
  // The scan is bounded: it never reaches index ctrlIdx + 200, nor the end of the list.
  const scanEnd = Math.min(records.length, ctrlIdx + 200);
  const pieces = [];
  for (let pos = ctrlIdx + 1; pos < scanEnd; pos += 1) {
    const record = records[pos];
    // A record at the control's own level (or above) is no longer nested under it.
    if (record.level <= parentLevel) {
      break;
    }
    if (record.tagId !== TAG_PARA_TEXT) {
      continue;
    }
    const content = extractText(record.data).trim();
    if (content) {
      pieces.push(content);
    }
  }
  if (pieces.length === 0) {
    return null;
  }
  return pieces.join("\n");
}
|
|
2679
2821
|
function extractHyperlinkUrl(data) {
|
|
2680
2822
|
try {
|
|
2681
2823
|
const httpSig = Buffer.from("http", "utf16le");
|
|
@@ -2721,6 +2863,8 @@ function parseParagraphWithTables(records, startIdx) {
|
|
|
2721
2863
|
let text = "";
|
|
2722
2864
|
const tables = [];
|
|
2723
2865
|
const charShapeIds = [];
|
|
2866
|
+
const paraHeaderData = records[startIdx].data;
|
|
2867
|
+
const paraShapeId = paraHeaderData.length >= 10 ? paraHeaderData.readUInt16LE(8) : -1;
|
|
2724
2868
|
let i = startIdx + 1;
|
|
2725
2869
|
while (i < records.length) {
|
|
2726
2870
|
const rec = records[i];
|
|
@@ -2745,7 +2889,7 @@ function parseParagraphWithTables(records, startIdx) {
|
|
|
2745
2889
|
i++;
|
|
2746
2890
|
}
|
|
2747
2891
|
const trimmed = text.trim();
|
|
2748
|
-
return { paragraph: trimmed || null, tables, nextIdx: i, charShapeIds };
|
|
2892
|
+
return { paragraph: trimmed || null, tables, nextIdx: i, charShapeIds, paraShapeId };
|
|
2749
2893
|
}
|
|
2750
2894
|
function parseTableBlock(records, startIdx) {
|
|
2751
2895
|
const tableLevel = records[startIdx].level;
|