kordoc 2.7.2 → 2.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +20 -1
- package/dist/{chunk-4NWDJGAU.js → chunk-M24KMDAR.js} +53 -26
- package/dist/chunk-M24KMDAR.js.map +1 -0
- package/dist/{chunk-Y476BOHI.cjs → chunk-QB7CS534.cjs} +2 -2
- package/dist/{chunk-Y476BOHI.cjs.map → chunk-QB7CS534.cjs.map} +1 -1
- package/dist/{chunk-LB7E2KDF.js → chunk-RXZLTACX.js} +2 -2
- package/dist/{chunk-4SK2PDMQ.js.map → chunk-RXZLTACX.js.map} +1 -1
- package/dist/{chunk-4SK2PDMQ.js → chunk-SJ5TPMBT.js} +2 -2
- package/dist/{chunk-LB7E2KDF.js.map → chunk-SJ5TPMBT.js.map} +1 -1
- package/dist/cli.js +3 -3
- package/dist/index.cjs +162 -135
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +56 -2
- package/dist/index.d.ts +56 -2
- package/dist/index.js +52 -25
- package/dist/index.js.map +1 -1
- package/dist/mcp.js +3 -3
- package/dist/{parser-7OFQ67QL.cjs → parser-EL5YETUA.cjs} +158 -18
- package/dist/parser-EL5YETUA.cjs.map +1 -0
- package/dist/{parser-DJCMY3OO.js → parser-OMPBVEFU.js} +145 -5
- package/dist/parser-OMPBVEFU.js.map +1 -0
- package/dist/{parser-QMMQ7Y7R.js → parser-XBYGROQB.js} +145 -5
- package/dist/parser-XBYGROQB.js.map +1 -0
- package/dist/{watch-FVMVIZ5Q.js → watch-ULLLK7ID.js} +3 -3
- package/package.json +1 -1
- package/dist/chunk-4NWDJGAU.js.map +0 -1
- package/dist/parser-7OFQ67QL.cjs.map +0 -1
- package/dist/parser-DJCMY3OO.js.map +0 -1
- package/dist/parser-QMMQ7Y7R.js.map +0 -1
- /package/dist/{watch-FVMVIZ5Q.js.map → watch-ULLLK7ID.js.map} +0 -0
package/README.md
CHANGED
|
@@ -7,7 +7,7 @@
|
|
|
7
7
|
|
|
8
8
|
> *대한민국에서 둘째가라면 서러울 문서지옥. 거기서 7년 버틴 공무원이 만들었습니다.*
|
|
9
9
|
|
|
10
|
-
HWP 3.x/5.x, HWPX, PDF, XLS, XLSX, DOCX — 관공서에서 쏟아지는 모든 문서를 파싱하고, 비교하고, 분석하고, 생성합니다.
|
|
10
|
+
HWP 3.x/5.x, HWPX, HWPML, PDF, XLS, XLSX, DOCX — 관공서에서 쏟아지는 모든 문서를 파싱하고, 비교하고, 분석하고, 생성합니다.
|
|
11
11
|
|
|
12
12
|
[English](./README-EN.md)
|
|
13
13
|
|
|
@@ -312,6 +312,25 @@ const result = await parse(buffer, {
|
|
|
312
312
|
})
|
|
313
313
|
```
|
|
314
314
|
|
|
315
|
+
### PDF 텍스트 품질 신호 (v2.9.0+)
|
|
316
|
+
|
|
317
|
+
PDF는 텍스트층이 있어도 ToUnicode/CMap이 깨졌거나 NUL 등 제어문자가 섞이는 경우가 많다. `parsePdf` 결과는 페이지별 품질 신호를 함께 반환한다.
|
|
318
|
+
|
|
319
|
+
```typescript
|
|
320
|
+
const r = await parsePdf(buffer)
|
|
321
|
+
if (r.success && r.qualitySummary?.needsOcr) {
|
|
322
|
+
// OCR 큐로 라우팅 (kordoc은 OCR을 기본 탑재하지 않음)
|
|
323
|
+
await routeToOcr(buffer, r.qualitySummary.ocrCandidatePages)
|
|
324
|
+
}
|
|
325
|
+
|
|
326
|
+
// 페이지 단위 신호
|
|
327
|
+
for (const p of r.pageQuality ?? []) {
|
|
328
|
+
if (p.needsOcr) console.log(`p${p.page} 검토 필요: ${p.ocrReason}`)
|
|
329
|
+
}
|
|
330
|
+
```
|
|
331
|
+
|
|
332
|
+
신호 키: `textChars`, `hangulRatio`, `controlCharRatio`, `replacementCharRatio`, `puaRatio` / `needsOcr` (페이지·문서 단위) / `ocrReason` (`low_text` | `high_pua` | `high_control` | `high_replacement`).
|
|
333
|
+
|
|
315
334
|
## CLI
|
|
316
335
|
|
|
317
336
|
```bash
|
|
@@ -22,7 +22,7 @@ import {
|
|
|
22
22
|
sanitizeHref,
|
|
23
23
|
stripDtd,
|
|
24
24
|
toArrayBuffer
|
|
25
|
-
} from "./chunk-
|
|
25
|
+
} from "./chunk-SJ5TPMBT.js";
|
|
26
26
|
import {
|
|
27
27
|
parsePageRange
|
|
28
28
|
} from "./chunk-MOL7MDBG.js";
|
|
@@ -3928,6 +3928,8 @@ var CHAR_H1 = 5;
|
|
|
3928
3928
|
var CHAR_H2 = 6;
|
|
3929
3929
|
var CHAR_H3 = 7;
|
|
3930
3930
|
var CHAR_H4 = 8;
|
|
3931
|
+
var CHAR_TABLE_HEADER = 9;
|
|
3932
|
+
var CHAR_QUOTE = 10;
|
|
3931
3933
|
var PARA_NORMAL = 0;
|
|
3932
3934
|
var PARA_H1 = 1;
|
|
3933
3935
|
var PARA_H2 = 2;
|
|
@@ -3936,14 +3938,30 @@ var PARA_H4 = 4;
|
|
|
3936
3938
|
var PARA_CODE = 5;
|
|
3937
3939
|
var PARA_QUOTE = 6;
|
|
3938
3940
|
var PARA_LIST = 7;
|
|
3939
|
-
|
|
3941
|
+
var DEFAULT_TEXT_COLOR = "#000000";
|
|
3942
|
+
function resolveTheme(theme) {
|
|
3943
|
+
return {
|
|
3944
|
+
h1: theme?.headingColors?.[1] ?? DEFAULT_TEXT_COLOR,
|
|
3945
|
+
h2: theme?.headingColors?.[2] ?? DEFAULT_TEXT_COLOR,
|
|
3946
|
+
h3: theme?.headingColors?.[3] ?? DEFAULT_TEXT_COLOR,
|
|
3947
|
+
h4: theme?.headingColors?.[4] ?? theme?.headingColors?.[3] ?? DEFAULT_TEXT_COLOR,
|
|
3948
|
+
body: theme?.bodyColor ?? DEFAULT_TEXT_COLOR,
|
|
3949
|
+
quote: theme?.quoteColor ?? DEFAULT_TEXT_COLOR,
|
|
3950
|
+
/** quoteColor가 명시되었는지 — blockquote charPr 분기에 사용 (baseline 호환) */
|
|
3951
|
+
hasQuoteOption: theme?.quoteColor !== void 0,
|
|
3952
|
+
tableHeader: theme?.tableHeaderColor ?? theme?.bodyColor ?? DEFAULT_TEXT_COLOR,
|
|
3953
|
+
tableHeaderBold: !!theme?.tableHeaderBold
|
|
3954
|
+
};
|
|
3955
|
+
}
|
|
3956
|
+
async function markdownToHwpx(markdown, options) {
|
|
3957
|
+
const theme = resolveTheme(options?.theme);
|
|
3940
3958
|
const blocks = parseMarkdownToBlocks(markdown);
|
|
3941
|
-
const sectionXml = blocksToSectionXml(blocks);
|
|
3959
|
+
const sectionXml = blocksToSectionXml(blocks, theme);
|
|
3942
3960
|
const zip = new JSZip3();
|
|
3943
3961
|
zip.file("mimetype", "application/hwp+zip", { compression: "STORE" });
|
|
3944
3962
|
zip.file("META-INF/container.xml", generateContainerXml());
|
|
3945
3963
|
zip.file("Contents/content.hpf", generateManifest());
|
|
3946
|
-
zip.file("Contents/header.xml", generateHeaderXml());
|
|
3964
|
+
zip.file("Contents/header.xml", generateHeaderXml(theme));
|
|
3947
3965
|
zip.file("Contents/section0.xml", sectionXml);
|
|
3948
3966
|
zip.file("Preview/PrvText.txt", buildPrvText(blocks));
|
|
3949
3967
|
return await zip.generateAsync({ type: "arraybuffer" });
|
|
@@ -4123,11 +4141,11 @@ function generateManifest() {
|
|
|
4123
4141
|
</opf:spine>
|
|
4124
4142
|
</opf:package>`;
|
|
4125
4143
|
}
|
|
4126
|
-
function charPr(id, height, bold, italic, fontId = 0) {
|
|
4144
|
+
function charPr(id, height, bold, italic, fontId = 0, textColor = DEFAULT_TEXT_COLOR) {
|
|
4127
4145
|
const boldAttr = bold ? ` bold="1"` : "";
|
|
4128
4146
|
const italicAttr = italic ? ` italic="1"` : "";
|
|
4129
4147
|
const effFont = bold ? 2 : fontId;
|
|
4130
|
-
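return `  <hh:charPr id="${id}" height="${height}" textColor="#000000" shadeColor="none" useFontSpace="0" useKerning="0" symMark="NONE" borderFillIDRef="0"${boldAttr}${italicAttr}>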
|
|
4148
|
+
return ` <hh:charPr id="${id}" height="${height}" textColor="${textColor}" shadeColor="none" useFontSpace="0" useKerning="0" symMark="NONE" borderFillIDRef="0"${boldAttr}${italicAttr}>
|
|
4131
4149
|
<hh:fontRef hangul="${effFont}" latin="${effFont}" hanja="${effFont}" japanese="${effFont}" other="${effFont}" symbol="${effFont}" user="${effFont}"/>
|
|
4132
4150
|
<hh:ratio hangul="100" latin="100" hanja="100" japanese="100" other="100" symbol="100" user="100"/>
|
|
4133
4151
|
<hh:spacing hangul="0" latin="0" hanja="0" japanese="0" other="0" symbol="0" user="0"/>
|
|
@@ -4147,7 +4165,7 @@ function paraPr(id, opts = {}) {
|
|
|
4147
4165
|
<hh:border borderFillIDRef="0" offsetLeft="0" offsetRight="0" offsetTop="0" offsetBottom="0" connect="0" ignoreMargin="0"/>
|
|
4148
4166
|
</hh:paraPr>`;
|
|
4149
4167
|
}
|
|
4150
|
-
function generateHeaderXml() {
|
|
4168
|
+
function generateHeaderXml(theme) {
|
|
4151
4169
|
return `<?xml version="1.0" encoding="UTF-8" standalone="yes" ?>
|
|
4152
4170
|
<hh:head xmlns:hh="${NS_HEAD}" xmlns:hp="${NS_PARA}" version="1.4" secCnt="1">
|
|
4153
4171
|
<hh:beginNum page="1" footnote="1" endnote="1" pic="1" tbl="1" equation="1"/>
|
|
@@ -4223,16 +4241,18 @@ function generateHeaderXml() {
|
|
|
4223
4241
|
<hh:fillInfo/>
|
|
4224
4242
|
</hh:borderFill>
|
|
4225
4243
|
</hh:borderFills>
|
|
4226
|
-
<hh:charProperties itemCnt="9">
|
|
4227
|
-
${charPr(0, 1e3, false, false)}
|
|
4228
|
-
${charPr(1, 1e3, true, false)}
|
|
4229
|
-
${charPr(2, 1e3, false, true)}
|
|
4230
|
-
${charPr(3, 1e3, true, true)}
|
|
4244
|
+
<hh:charProperties itemCnt="11">
|
|
4245
|
+
${charPr(0, 1e3, false, false, 0, theme.body)}
|
|
4246
|
+
${charPr(1, 1e3, true, false, 0, theme.body)}
|
|
4247
|
+
${charPr(2, 1e3, false, true, 0, theme.body)}
|
|
4248
|
+
${charPr(3, 1e3, true, true, 0, theme.body)}
|
|
4231
4249
|
${charPr(4, 900, false, false, 1)}
|
|
4232
|
-
${charPr(5, 1800, true, false, 1)}
|
|
4233
|
-
${charPr(6, 1400, true, false, 1)}
|
|
4234
|
-
${charPr(7, 1200, true, false, 1)}
|
|
4235
|
-
${charPr(8, 1100, true, false, 1)}
|
|
4250
|
+
${charPr(5, 1800, true, false, 1, theme.h1)}
|
|
4251
|
+
${charPr(6, 1400, true, false, 1, theme.h2)}
|
|
4252
|
+
${charPr(7, 1200, true, false, 1, theme.h3)}
|
|
4253
|
+
${charPr(8, 1100, true, false, 1, theme.h4)}
|
|
4254
|
+
${charPr(CHAR_TABLE_HEADER, 1e3, theme.tableHeaderBold, false, 0, theme.tableHeader)}
|
|
4255
|
+
${charPr(CHAR_QUOTE, 1e3, false, true, 0, theme.quote)}
|
|
4236
4256
|
</hh:charProperties>
|
|
4237
4257
|
<hh:tabProperties itemCnt="0"/>
|
|
4238
4258
|
<hh:numberings itemCnt="0"/>
|
|
@@ -4262,7 +4282,7 @@ var tableIdCounter = TABLE_ID_BASE;
|
|
|
4262
4282
|
function nextTableId() {
|
|
4263
4283
|
return ++tableIdCounter;
|
|
4264
4284
|
}
|
|
4265
|
-
function generateTable(rows) {
|
|
4285
|
+
function generateTable(rows, theme) {
|
|
4266
4286
|
const rowCnt = rows.length;
|
|
4267
4287
|
const colCnt = Math.max(...rows.map((r) => r.length), 1);
|
|
4268
4288
|
const cellW = Math.floor(44e3 / colCnt);
|
|
@@ -4270,12 +4290,15 @@ function generateTable(rows) {
|
|
|
4270
4290
|
const tblW = cellW * colCnt;
|
|
4271
4291
|
const tblH = cellH * rowCnt;
|
|
4272
4292
|
const tblId = nextTableId();
|
|
4293
|
+
const useHeaderStyle = theme.tableHeader !== theme.body || theme.tableHeaderBold;
|
|
4273
4294
|
const trElements = rows.map((row, rowIdx) => {
|
|
4274
4295
|
const cells = row.length < colCnt ? [...row, ...Array(colCnt - row.length).fill("")] : row;
|
|
4296
|
+
const isHeaderRow = rowIdx === 0;
|
|
4297
|
+
const headerCharPr = isHeaderRow && useHeaderStyle ? CHAR_TABLE_HEADER : CHAR_NORMAL;
|
|
4275
4298
|
const tdElements = cells.map((cell, colIdx) => {
|
|
4276
|
-
const runs = generateRuns(cell);
|
|
4299
|
+
const runs = generateRuns(cell, headerCharPr);
|
|
4277
4300
|
const p = `<hp:p paraPrIDRef="0" styleIDRef="0">${runs}</hp:p>`;
|
|
4278
|
-
return `<hp:tc name="" header="${
|
|
4301
|
+
return `<hp:tc name="" header="${isHeaderRow ? 1 : 0}" hasMargin="0" protect="0" editable="1" dirty="0" borderFillIDRef="1"><hp:subList id="" textDirection="HORIZONTAL" lineWrap="BREAK" vertAlign="TOP" linkListIDRef="0" linkListNextIDRef="0" textWidth="0" textHeight="0" hasTextRef="0" hasNumRef="0">${p}</hp:subList><hp:cellAddr colAddr="${colIdx}" rowAddr="${rowIdx}"/><hp:cellSpan colSpan="1" rowSpan="1"/><hp:cellSz width="${cellW}" height="${cellH}"/><hp:cellMargin left="141" right="141" top="141" bottom="141"/></hp:tc>`;
|
|
4279
4302
|
}).join("");
|
|
4280
4303
|
return `<hp:tr>${tdElements}</hp:tr>`;
|
|
4281
4304
|
}).join("");
|
|
@@ -4283,7 +4306,7 @@ function generateTable(rows) {
|
|
|
4283
4306
|
const tbl = `<hp:tbl id="${tblId}" zOrder="0" numberingType="TABLE" pageBreak="CELL" repeatHeader="0" rowCnt="${rowCnt}" colCnt="${colCnt}" cellSpacing="0" borderFillIDRef="1" noShading="0">${tblInner}</hp:tbl>`;
|
|
4284
4307
|
return `<hp:p paraPrIDRef="0" styleIDRef="0"><hp:run charPrIDRef="0">${tbl}</hp:run></hp:p>`;
|
|
4285
4308
|
}
|
|
4286
|
-
function blocksToSectionXml(blocks) {
|
|
4309
|
+
function blocksToSectionXml(blocks, theme) {
|
|
4287
4310
|
const paraXmls = [];
|
|
4288
4311
|
let isFirst = true;
|
|
4289
4312
|
const orderedCounters = {};
|
|
@@ -4312,7 +4335,11 @@ function blocksToSectionXml(blocks) {
|
|
|
4312
4335
|
break;
|
|
4313
4336
|
}
|
|
4314
4337
|
case "blockquote":
|
|
4315
|
-
xml = generateParagraph(
|
|
4338
|
+
xml = generateParagraph(
|
|
4339
|
+
block.text || "",
|
|
4340
|
+
PARA_QUOTE,
|
|
4341
|
+
theme.hasQuoteOption ? CHAR_QUOTE : CHAR_NORMAL
|
|
4342
|
+
);
|
|
4316
4343
|
break;
|
|
4317
4344
|
case "list_item": {
|
|
4318
4345
|
const indent = block.indent || 0;
|
|
@@ -4345,7 +4372,7 @@ function blocksToSectionXml(blocks) {
|
|
|
4345
4372
|
paraXmls.push(`<hp:p paraPrIDRef="0" styleIDRef="0">${secRun}</hp:p>`);
|
|
4346
4373
|
isFirst = false;
|
|
4347
4374
|
}
|
|
4348
|
-
xml = generateTable(block.rows);
|
|
4375
|
+
xml = generateTable(block.rows, theme);
|
|
4349
4376
|
}
|
|
4350
4377
|
break;
|
|
4351
4378
|
}
|
|
@@ -18714,7 +18741,7 @@ async function parseHwp(buffer, options) {
|
|
|
18714
18741
|
async function parsePdf(buffer, options) {
|
|
18715
18742
|
let parsePdfDocument;
|
|
18716
18743
|
try {
|
|
18717
|
-
const mod = await import("./parser-
|
|
18744
|
+
const mod = await import("./parser-XBYGROQB.js");
|
|
18718
18745
|
parsePdfDocument = mod.parsePdfDocument;
|
|
18719
18746
|
} catch {
|
|
18720
18747
|
return {
|
|
@@ -18725,8 +18752,8 @@ async function parsePdf(buffer, options) {
|
|
|
18725
18752
|
};
|
|
18726
18753
|
}
|
|
18727
18754
|
try {
|
|
18728
|
-
const { markdown, blocks, metadata, outline, warnings, isImageBased } = await parsePdfDocument(buffer, options);
|
|
18729
|
-
return { success: true, fileType: "pdf", markdown, blocks, metadata, outline, warnings, isImageBased };
|
|
18755
|
+
const { markdown, blocks, metadata, outline, warnings, isImageBased, pageQuality, qualitySummary } = await parsePdfDocument(buffer, options);
|
|
18756
|
+
return { success: true, fileType: "pdf", markdown, blocks, metadata, outline, warnings, isImageBased, pageQuality, qualitySummary };
|
|
18730
18757
|
} catch (err) {
|
|
18731
18758
|
const isImageBased = err instanceof Error && "isImageBased" in err ? true : void 0;
|
|
18732
18759
|
return { success: false, fileType: "pdf", error: err instanceof Error ? err.message : "PDF \uD30C\uC2F1 \uC2E4\uD328", code: classifyError(err), isImageBased };
|
|
@@ -18952,4 +18979,4 @@ export {
|
|
|
18952
18979
|
compare,
|
|
18953
18980
|
parse
|
|
18954
18981
|
};
|
|
18955
|
-
//# sourceMappingURL=chunk-4NWDJGAU.js.map
|
|
18982
|
+
//# sourceMappingURL=chunk-M24KMDAR.js.map
|