kordoc 2.7.2 → 2.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -7,7 +7,7 @@
7
7
 
8
8
  > *대한민국에서 둘째가라면 서러울 문서지옥. 거기서 7년 버틴 공무원이 만들었습니다.*
9
9
 
10
- HWP 3.x/5.x, HWPX, PDF, XLS, XLSX, DOCX — 관공서에서 쏟아지는 모든 문서를 파싱하고, 비교하고, 분석하고, 생성합니다.
10
+ HWP 3.x/5.x, HWPX, HWPML, PDF, XLS, XLSX, DOCX — 관공서에서 쏟아지는 모든 문서를 파싱하고, 비교하고, 분석하고, 생성합니다.
11
11
 
12
12
  [English](./README-EN.md)
13
13
 
@@ -312,6 +312,25 @@ const result = await parse(buffer, {
312
312
  })
313
313
  ```
314
314
 
315
+ ### PDF 텍스트 품질 신호 (v2.9.0+)
316
+
317
+ PDF는 텍스트층이 있어도 ToUnicode/CMap이 깨졌거나 NUL 등 제어문자가 섞이는 경우가 많다. `parsePdf` 결과는 페이지별 품질 신호를 함께 반환한다.
318
+
319
+ ```typescript
320
+ const r = await parsePdf(buffer)
321
+ if (r.success && r.qualitySummary?.needsOcr) {
322
+ // OCR 큐로 라우팅 (kordoc은 OCR을 기본 탑재하지 않음)
323
+ await routeToOcr(buffer, r.qualitySummary.ocrCandidatePages)
324
+ }
325
+
326
+ // 페이지 단위 신호
327
+ for (const p of r.pageQuality ?? []) {
328
+ if (p.needsOcr) console.log(`p${p.page} 검토 필요: ${p.ocrReason}`)
329
+ }
330
+ ```
331
+
332
+ 신호 키: `textChars`, `hangulRatio`, `controlCharRatio`, `replacementCharRatio`, `puaRatio` / `needsOcr` (페이지·문서 단위) / `ocrReason` (`low_text` | `high_pua` | `high_control` | `high_replacement`).
333
+
315
334
  ## CLI
316
335
 
317
336
  ```bash
@@ -22,7 +22,7 @@ import {
22
22
  sanitizeHref,
23
23
  stripDtd,
24
24
  toArrayBuffer
25
- } from "./chunk-4SK2PDMQ.js";
25
+ } from "./chunk-SJ5TPMBT.js";
26
26
  import {
27
27
  parsePageRange
28
28
  } from "./chunk-MOL7MDBG.js";
@@ -3928,6 +3928,8 @@ var CHAR_H1 = 5;
3928
3928
  var CHAR_H2 = 6;
3929
3929
  var CHAR_H3 = 7;
3930
3930
  var CHAR_H4 = 8;
3931
+ var CHAR_TABLE_HEADER = 9;
3932
+ var CHAR_QUOTE = 10;
3931
3933
  var PARA_NORMAL = 0;
3932
3934
  var PARA_H1 = 1;
3933
3935
  var PARA_H2 = 2;
@@ -3936,14 +3938,30 @@ var PARA_H4 = 4;
3936
3938
  var PARA_CODE = 5;
3937
3939
  var PARA_QUOTE = 6;
3938
3940
  var PARA_LIST = 7;
3939
- async function markdownToHwpx(markdown) {
3941
var DEFAULT_TEXT_COLOR = "#000000";

/**
 * Escapes a theme colour so it can be placed inside a double-quoted XML
 * attribute value without terminating the attribute or opening markup.
 *
 * Ordinary colour strings such as "#1A2B3C" contain none of the escaped
 * characters and are therefore returned unchanged.
 *
 * @param {unknown} value - Colour supplied by the caller (coerced to string).
 * @returns {string} The value with `& < > " '` replaced by XML entities.
 */
function escapeThemeColorAttr(value) {
  return String(value)
    .replace(/&/g, "&amp;") // must run first so later entities are not re-escaped
    .replace(/</g, "&lt;")
    .replace(/>/g, "&gt;")
    .replace(/"/g, "&quot;")
    .replace(/'/g, "&apos;");
}

/**
 * Normalises the optional `theme` option into a fully populated colour table.
 *
 * Fallback chain (first non-nullish value wins):
 * - h1..h3: `headingColors[n]`, then the default text colour
 * - h4: `headingColors[4]`, then `headingColors[3]`, then the default
 * - body / quote: `bodyColor` / `quoteColor`, then the default
 * - tableHeader: `tableHeaderColor`, then `bodyColor`, then the default
 *
 * Every caller-supplied colour is XML-attribute-escaped here because the
 * resolved values are interpolated directly into `textColor="..."`.
 *
 * @param {object} [theme] - User theme; may be `undefined` or `null`.
 * @returns {{h1: string, h2: string, h3: string, h4: string, body: string,
 *   quote: string, hasQuoteOption: boolean, tableHeader: string,
 *   tableHeaderBold: boolean}} Resolved theme.
 */
function resolveTheme(theme) {
  // Returns the first non-nullish candidate (escaped), else the default colour.
  const pick = (...candidates) => {
    const chosen = candidates.find((candidate) => candidate != null);
    return chosen == null ? DEFAULT_TEXT_COLOR : escapeThemeColorAttr(chosen);
  };
  const headings = theme?.headingColors;
  return {
    h1: pick(headings?.[1]),
    h2: pick(headings?.[2]),
    h3: pick(headings?.[3]),
    h4: pick(headings?.[4], headings?.[3]),
    body: pick(theme?.bodyColor),
    quote: pick(theme?.quoteColor),
    /**
     * Whether quoteColor was explicitly provided; drives the blockquote
     * charPr choice (baseline compatibility). `null` counts as "not
     * provided", matching the `??`-style fallback used for the colour itself.
     */
    hasQuoteOption: theme?.quoteColor != null,
    tableHeader: pick(theme?.tableHeaderColor, theme?.bodyColor),
    tableHeaderBold: !!theme?.tableHeaderBold
  };
}
3956
+ async function markdownToHwpx(markdown, options) {
3957
+ const theme = resolveTheme(options?.theme);
3940
3958
  const blocks = parseMarkdownToBlocks(markdown);
3941
- const sectionXml = blocksToSectionXml(blocks);
3959
+ const sectionXml = blocksToSectionXml(blocks, theme);
3942
3960
  const zip = new JSZip3();
3943
3961
  zip.file("mimetype", "application/hwp+zip", { compression: "STORE" });
3944
3962
  zip.file("META-INF/container.xml", generateContainerXml());
3945
3963
  zip.file("Contents/content.hpf", generateManifest());
3946
- zip.file("Contents/header.xml", generateHeaderXml());
3964
+ zip.file("Contents/header.xml", generateHeaderXml(theme));
3947
3965
  zip.file("Contents/section0.xml", sectionXml);
3948
3966
  zip.file("Preview/PrvText.txt", buildPrvText(blocks));
3949
3967
  return await zip.generateAsync({ type: "arraybuffer" });
@@ -4123,11 +4141,11 @@ function generateManifest() {
4123
4141
  </opf:spine>
4124
4142
  </opf:package>`;
4125
4143
  }
4126
- function charPr(id, height, bold, italic, fontId = 0) {
4144
+ function charPr(id, height, bold, italic, fontId = 0, textColor = DEFAULT_TEXT_COLOR) {
4127
4145
  const boldAttr = bold ? ` bold="1"` : "";
4128
4146
  const italicAttr = italic ? ` italic="1"` : "";
4129
4147
  const effFont = bold ? 2 : fontId;
4130
- return ` <hh:charPr id="${id}" height="${height}" textColor="#000000" shadeColor="none" useFontSpace="0" useKerning="0" symMark="NONE" borderFillIDRef="0"${boldAttr}${italicAttr}>
4148
+ return ` <hh:charPr id="${id}" height="${height}" textColor="${textColor}" shadeColor="none" useFontSpace="0" useKerning="0" symMark="NONE" borderFillIDRef="0"${boldAttr}${italicAttr}>
4131
4149
  <hh:fontRef hangul="${effFont}" latin="${effFont}" hanja="${effFont}" japanese="${effFont}" other="${effFont}" symbol="${effFont}" user="${effFont}"/>
4132
4150
  <hh:ratio hangul="100" latin="100" hanja="100" japanese="100" other="100" symbol="100" user="100"/>
4133
4151
  <hh:spacing hangul="0" latin="0" hanja="0" japanese="0" other="0" symbol="0" user="0"/>
@@ -4147,7 +4165,7 @@ function paraPr(id, opts = {}) {
4147
4165
  <hh:border borderFillIDRef="0" offsetLeft="0" offsetRight="0" offsetTop="0" offsetBottom="0" connect="0" ignoreMargin="0"/>
4148
4166
  </hh:paraPr>`;
4149
4167
  }
4150
- function generateHeaderXml() {
4168
+ function generateHeaderXml(theme) {
4151
4169
  return `<?xml version="1.0" encoding="UTF-8" standalone="yes" ?>
4152
4170
  <hh:head xmlns:hh="${NS_HEAD}" xmlns:hp="${NS_PARA}" version="1.4" secCnt="1">
4153
4171
  <hh:beginNum page="1" footnote="1" endnote="1" pic="1" tbl="1" equation="1"/>
@@ -4223,16 +4241,18 @@ function generateHeaderXml() {
4223
4241
  <hh:fillInfo/>
4224
4242
  </hh:borderFill>
4225
4243
  </hh:borderFills>
4226
- <hh:charProperties itemCnt="9">
4227
- ${charPr(0, 1e3, false, false)}
4228
- ${charPr(1, 1e3, true, false)}
4229
- ${charPr(2, 1e3, false, true)}
4230
- ${charPr(3, 1e3, true, true)}
4244
+ <hh:charProperties itemCnt="11">
4245
+ ${charPr(0, 1e3, false, false, 0, theme.body)}
4246
+ ${charPr(1, 1e3, true, false, 0, theme.body)}
4247
+ ${charPr(2, 1e3, false, true, 0, theme.body)}
4248
+ ${charPr(3, 1e3, true, true, 0, theme.body)}
4231
4249
  ${charPr(4, 900, false, false, 1)}
4232
- ${charPr(5, 1800, true, false, 1)}
4233
- ${charPr(6, 1400, true, false, 1)}
4234
- ${charPr(7, 1200, true, false, 1)}
4235
- ${charPr(8, 1100, true, false, 1)}
4250
+ ${charPr(5, 1800, true, false, 1, theme.h1)}
4251
+ ${charPr(6, 1400, true, false, 1, theme.h2)}
4252
+ ${charPr(7, 1200, true, false, 1, theme.h3)}
4253
+ ${charPr(8, 1100, true, false, 1, theme.h4)}
4254
+ ${charPr(CHAR_TABLE_HEADER, 1e3, theme.tableHeaderBold, false, 0, theme.tableHeader)}
4255
+ ${charPr(CHAR_QUOTE, 1e3, false, true, 0, theme.quote)}
4236
4256
  </hh:charProperties>
4237
4257
  <hh:tabProperties itemCnt="0"/>
4238
4258
  <hh:numberings itemCnt="0"/>
@@ -4262,7 +4282,7 @@ var tableIdCounter = TABLE_ID_BASE;
4262
4282
  function nextTableId() {
4263
4283
  return ++tableIdCounter;
4264
4284
  }
4265
- function generateTable(rows) {
4285
+ function generateTable(rows, theme) {
4266
4286
  const rowCnt = rows.length;
4267
4287
  const colCnt = Math.max(...rows.map((r) => r.length), 1);
4268
4288
  const cellW = Math.floor(44e3 / colCnt);
@@ -4270,12 +4290,15 @@ function generateTable(rows) {
4270
4290
  const tblW = cellW * colCnt;
4271
4291
  const tblH = cellH * rowCnt;
4272
4292
  const tblId = nextTableId();
4293
+ const useHeaderStyle = theme.tableHeader !== theme.body || theme.tableHeaderBold;
4273
4294
  const trElements = rows.map((row, rowIdx) => {
4274
4295
  const cells = row.length < colCnt ? [...row, ...Array(colCnt - row.length).fill("")] : row;
4296
+ const isHeaderRow = rowIdx === 0;
4297
+ const headerCharPr = isHeaderRow && useHeaderStyle ? CHAR_TABLE_HEADER : CHAR_NORMAL;
4275
4298
  const tdElements = cells.map((cell, colIdx) => {
4276
- const runs = generateRuns(cell);
4299
+ const runs = generateRuns(cell, headerCharPr);
4277
4300
  const p = `<hp:p paraPrIDRef="0" styleIDRef="0">${runs}</hp:p>`;
4278
- return `<hp:tc name="" header="${rowIdx === 0 ? 1 : 0}" hasMargin="0" protect="0" editable="1" dirty="0" borderFillIDRef="1"><hp:subList id="" textDirection="HORIZONTAL" lineWrap="BREAK" vertAlign="TOP" linkListIDRef="0" linkListNextIDRef="0" textWidth="0" textHeight="0" hasTextRef="0" hasNumRef="0">${p}</hp:subList><hp:cellAddr colAddr="${colIdx}" rowAddr="${rowIdx}"/><hp:cellSpan colSpan="1" rowSpan="1"/><hp:cellSz width="${cellW}" height="${cellH}"/><hp:cellMargin left="141" right="141" top="141" bottom="141"/></hp:tc>`;
4301
+ return `<hp:tc name="" header="${isHeaderRow ? 1 : 0}" hasMargin="0" protect="0" editable="1" dirty="0" borderFillIDRef="1"><hp:subList id="" textDirection="HORIZONTAL" lineWrap="BREAK" vertAlign="TOP" linkListIDRef="0" linkListNextIDRef="0" textWidth="0" textHeight="0" hasTextRef="0" hasNumRef="0">${p}</hp:subList><hp:cellAddr colAddr="${colIdx}" rowAddr="${rowIdx}"/><hp:cellSpan colSpan="1" rowSpan="1"/><hp:cellSz width="${cellW}" height="${cellH}"/><hp:cellMargin left="141" right="141" top="141" bottom="141"/></hp:tc>`;
4279
4302
  }).join("");
4280
4303
  return `<hp:tr>${tdElements}</hp:tr>`;
4281
4304
  }).join("");
@@ -4283,7 +4306,7 @@ function generateTable(rows) {
4283
4306
  const tbl = `<hp:tbl id="${tblId}" zOrder="0" numberingType="TABLE" pageBreak="CELL" repeatHeader="0" rowCnt="${rowCnt}" colCnt="${colCnt}" cellSpacing="0" borderFillIDRef="1" noShading="0">${tblInner}</hp:tbl>`;
4284
4307
  return `<hp:p paraPrIDRef="0" styleIDRef="0"><hp:run charPrIDRef="0">${tbl}</hp:run></hp:p>`;
4285
4308
  }
4286
- function blocksToSectionXml(blocks) {
4309
+ function blocksToSectionXml(blocks, theme) {
4287
4310
  const paraXmls = [];
4288
4311
  let isFirst = true;
4289
4312
  const orderedCounters = {};
@@ -4312,7 +4335,11 @@ function blocksToSectionXml(blocks) {
4312
4335
  break;
4313
4336
  }
4314
4337
  case "blockquote":
4315
- xml = generateParagraph(block.text || "", PARA_QUOTE);
4338
+ xml = generateParagraph(
4339
+ block.text || "",
4340
+ PARA_QUOTE,
4341
+ theme.hasQuoteOption ? CHAR_QUOTE : CHAR_NORMAL
4342
+ );
4316
4343
  break;
4317
4344
  case "list_item": {
4318
4345
  const indent = block.indent || 0;
@@ -4345,7 +4372,7 @@ function blocksToSectionXml(blocks) {
4345
4372
  paraXmls.push(`<hp:p paraPrIDRef="0" styleIDRef="0">${secRun}</hp:p>`);
4346
4373
  isFirst = false;
4347
4374
  }
4348
- xml = generateTable(block.rows);
4375
+ xml = generateTable(block.rows, theme);
4349
4376
  }
4350
4377
  break;
4351
4378
  }
@@ -18714,7 +18741,7 @@ async function parseHwp(buffer, options) {
18714
18741
  async function parsePdf(buffer, options) {
18715
18742
  let parsePdfDocument;
18716
18743
  try {
18717
- const mod = await import("./parser-QMMQ7Y7R.js");
18744
+ const mod = await import("./parser-XBYGROQB.js");
18718
18745
  parsePdfDocument = mod.parsePdfDocument;
18719
18746
  } catch {
18720
18747
  return {
@@ -18725,8 +18752,8 @@ async function parsePdf(buffer, options) {
18725
18752
  };
18726
18753
  }
18727
18754
  try {
18728
- const { markdown, blocks, metadata, outline, warnings, isImageBased } = await parsePdfDocument(buffer, options);
18729
- return { success: true, fileType: "pdf", markdown, blocks, metadata, outline, warnings, isImageBased };
18755
+ const { markdown, blocks, metadata, outline, warnings, isImageBased, pageQuality, qualitySummary } = await parsePdfDocument(buffer, options);
18756
+ return { success: true, fileType: "pdf", markdown, blocks, metadata, outline, warnings, isImageBased, pageQuality, qualitySummary };
18730
18757
  } catch (err) {
18731
18758
  const isImageBased = err instanceof Error && "isImageBased" in err ? true : void 0;
18732
18759
  return { success: false, fileType: "pdf", error: err instanceof Error ? err.message : "PDF \uD30C\uC2F1 \uC2E4\uD328", code: classifyError(err), isImageBased };
@@ -18952,4 +18979,4 @@ export {
18952
18979
  compare,
18953
18980
  parse
18954
18981
  };
18955
- //# sourceMappingURL=chunk-4NWDJGAU.js.map
18982
+ //# sourceMappingURL=chunk-M24KMDAR.js.map