kordoc 2.2.6 → 2.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33) hide show
  1. package/README.md +22 -3
  2. package/dist/{chunk-RF6UJXR3.js → chunk-KSBPABBQ.js} +482 -78
  3. package/dist/chunk-KSBPABBQ.js.map +1 -0
  4. package/dist/{chunk-5Y2Q3BRW.js → chunk-M3E3C5GS.js} +8 -1
  5. package/dist/chunk-M3E3C5GS.js.map +1 -0
  6. package/dist/{chunk-FCQEF2ZM.js → chunk-VJPDY4YT.js} +2 -2
  7. package/dist/{chunk-NL5XLN5R.js.map → chunk-VJPDY4YT.js.map} +1 -1
  8. package/dist/{chunk-HXUCZ2IL.cjs → chunk-VLSATRNQ.cjs} +2 -2
  9. package/dist/{chunk-HXUCZ2IL.cjs.map → chunk-VLSATRNQ.cjs.map} +1 -1
  10. package/dist/{chunk-NL5XLN5R.js → chunk-XG5CQUSC.js} +2 -2
  11. package/dist/{chunk-FCQEF2ZM.js.map → chunk-XG5CQUSC.js.map} +1 -1
  12. package/dist/cli.js +5 -5
  13. package/dist/cli.js.map +1 -1
  14. package/dist/{detect-GYK3HKD5.js → detect-I7YIS4Q6.js} +4 -2
  15. package/dist/index.cjs +608 -197
  16. package/dist/index.cjs.map +1 -1
  17. package/dist/index.d.cts +6 -2
  18. package/dist/index.d.ts +6 -2
  19. package/dist/index.js +500 -89
  20. package/dist/index.js.map +1 -1
  21. package/dist/mcp.js +5 -5
  22. package/dist/{parser-AMP7MAOH.js → parser-4275GJRB.js} +45 -42
  23. package/dist/{parser-AMP7MAOH.js.map → parser-4275GJRB.js.map} +1 -1
  24. package/dist/{parser-KOWPTDJU.cjs → parser-STAOZMUC.cjs} +61 -58
  25. package/dist/{parser-KOWPTDJU.cjs.map → parser-STAOZMUC.cjs.map} +1 -1
  26. package/dist/{parser-43IAQ5KE.js → parser-XRUZEFZT.js} +45 -42
  27. package/dist/{parser-43IAQ5KE.js.map → parser-XRUZEFZT.js.map} +1 -1
  28. package/dist/{watch-IUQXOXW3.js → watch-BFLNFJBE.js} +4 -4
  29. package/package.json +2 -2
  30. package/dist/chunk-5Y2Q3BRW.js.map +0 -1
  31. package/dist/chunk-RF6UJXR3.js.map +0 -1
  32. package/dist/{detect-GYK3HKD5.js.map → detect-I7YIS4Q6.js.map} +0 -0
  33. package/dist/{watch-IUQXOXW3.js.map → watch-BFLNFJBE.js.map} +0 -0
package/dist/mcp.js CHANGED
@@ -8,18 +8,18 @@ import {
8
8
  fillHwpx,
9
9
  markdownToHwpx,
10
10
  parse
11
- } from "./chunk-RF6UJXR3.js";
11
+ } from "./chunk-KSBPABBQ.js";
12
12
  import {
13
13
  detectFormat,
14
14
  detectZipFormat
15
- } from "./chunk-5Y2Q3BRW.js";
15
+ } from "./chunk-M3E3C5GS.js";
16
16
  import {
17
17
  KordocError,
18
18
  VERSION,
19
19
  blocksToMarkdown,
20
20
  sanitizeError,
21
21
  toArrayBuffer
22
- } from "./chunk-FCQEF2ZM.js";
22
+ } from "./chunk-VJPDY4YT.js";
23
23
  import "./chunk-MOL7MDBG.js";
24
24
 
25
25
  // src/mcp.ts
@@ -178,7 +178,7 @@ server.tool(
178
178
  let metadata;
179
179
  let effectiveFormat = format;
180
180
  if (format === "hwpx") {
181
- const { detectZipFormat: detectZipFormat2 } = await import("./detect-GYK3HKD5.js");
181
+ const { detectZipFormat: detectZipFormat2 } = await import("./detect-I7YIS4Q6.js");
182
182
  const zipFormat = await detectZipFormat2(buffer);
183
183
  if (zipFormat === "xlsx" || zipFormat === "docx") effectiveFormat = zipFormat;
184
184
  }
@@ -191,7 +191,7 @@ server.tool(
191
191
  break;
192
192
  case "pdf":
193
193
  try {
194
- const { extractPdfMetadataOnly } = await import("./parser-AMP7MAOH.js");
194
+ const { extractPdfMetadataOnly } = await import("./parser-4275GJRB.js");
195
195
  metadata = await extractPdfMetadataOnly(buffer);
196
196
  } catch {
197
197
  metadata = void 0;
package/dist/{parser-AMP7MAOH.js → parser-4275GJRB.js} CHANGED
@@ -7,7 +7,7 @@ import {
7
7
  blocksToMarkdown,
8
8
  safeMax,
9
9
  safeMin
10
- } from "./chunk-FCQEF2ZM.js";
10
+ } from "./chunk-VJPDY4YT.js";
11
11
  import {
12
12
  parsePageRange
13
13
  } from "./chunk-MOL7MDBG.js";
@@ -2187,10 +2187,11 @@ function detectSpecialKoreanTables(blocks) {
2187
2187
  return result;
2188
2188
  }
2189
2189
  function removeHeaderFooterBlocks(blocks, pageHeights, warnings) {
2190
- const ZONE_RATIO = 0.1;
2190
+ const ZONE_RATIO = 0.12;
2191
2191
  const MIN_REPEAT = 3;
2192
- const headerTexts = /* @__PURE__ */ new Map();
2193
- const footerTexts = /* @__PURE__ */ new Map();
2192
+ const Y_BUCKET = 5;
2193
+ const topEntries = [];
2194
+ const bottomEntries = [];
2194
2195
  for (let bi = 0; bi < blocks.length; bi++) {
2195
2196
  const b = blocks[bi];
2196
2197
  if (!b.bbox || !b.pageNumber || !b.text?.trim()) continue;
@@ -2198,49 +2199,51 @@ function removeHeaderFooterBlocks(blocks, pageHeights, warnings) {
2198
2199
  if (!ph) continue;
2199
2200
  const blockTop = ph - (b.bbox.y + b.bbox.height);
2200
2201
  const blockBottom = ph - b.bbox.y;
2201
- if (blockBottom <= ph * ZONE_RATIO) {
2202
- const arr = footerTexts.get(b.pageNumber) || [];
2203
- arr.push(b.text.trim());
2204
- footerTexts.set(b.pageNumber, arr);
2205
- } else if (blockTop >= ph * (1 - ZONE_RATIO)) {
2206
- const arr = headerTexts.get(b.pageNumber) || [];
2207
- arr.push(b.text.trim());
2208
- headerTexts.set(b.pageNumber, arr);
2209
- }
2210
- }
2211
- const repeatedPatterns = /* @__PURE__ */ new Set();
2212
- for (const textsMap of [headerTexts, footerTexts]) {
2202
+ const entry = { blockIdx: bi, page: b.pageNumber, y: b.bbox.y, text: b.text.trim() };
2203
+ if (blockBottom <= ph * ZONE_RATIO) bottomEntries.push(entry);
2204
+ else if (blockTop >= ph * (1 - ZONE_RATIO)) topEntries.push(entry);
2205
+ }
2206
+ const removeSet = /* @__PURE__ */ new Set();
2207
+ for (const entries of [topEntries, bottomEntries]) {
2208
+ if (entries.length === 0) continue;
2213
2209
  const patternCount = /* @__PURE__ */ new Map();
2214
- for (const [, texts] of textsMap) {
2215
- for (const t of texts) {
2216
- const normalized = t.replace(/\d+/g, "#");
2217
- patternCount.set(normalized, (patternCount.get(normalized) || 0) + 1);
2210
+ const patternPages = /* @__PURE__ */ new Map();
2211
+ for (const e of entries) {
2212
+ const norm = e.text.replace(/\d+/g, "#");
2213
+ patternCount.set(norm, (patternCount.get(norm) || 0) + 1);
2214
+ const pages = patternPages.get(norm) || /* @__PURE__ */ new Set();
2215
+ pages.add(e.page);
2216
+ patternPages.set(norm, pages);
2217
+ }
2218
+ const repeatedPatterns = /* @__PURE__ */ new Set();
2219
+ for (const [p, count] of patternCount) {
2220
+ if (count >= MIN_REPEAT && (patternPages.get(p)?.size ?? 0) >= MIN_REPEAT) {
2221
+ repeatedPatterns.add(p);
2218
2222
  }
2219
2223
  }
2220
- for (const [pattern, count] of patternCount) {
2221
- if (count >= MIN_REPEAT) repeatedPatterns.add(pattern);
2222
- }
2223
- }
2224
- if (repeatedPatterns.size === 0) return [];
2225
- const removeIndices = [];
2226
- for (let bi = 0; bi < blocks.length; bi++) {
2227
- const b = blocks[bi];
2228
- if (!b.bbox || !b.pageNumber || !b.text?.trim()) continue;
2229
- const ph = pageHeights.get(b.bbox.page) || pageHeights.get(b.pageNumber);
2230
- if (!ph) continue;
2231
- const blockTop = ph - (b.bbox.y + b.bbox.height);
2232
- const blockBottom = ph - b.bbox.y;
2233
- const inZone = blockBottom <= ph * ZONE_RATIO || blockTop >= ph * (1 - ZONE_RATIO);
2234
- if (!inZone) continue;
2235
- const normalized = b.text.trim().replace(/\d+/g, "#");
2236
- if (repeatedPatterns.has(normalized)) {
2237
- removeIndices.push(bi);
2224
+ const bucketPages = /* @__PURE__ */ new Map();
2225
+ for (const e of entries) {
2226
+ const bucket = Math.round(e.y / Y_BUCKET);
2227
+ const pages = bucketPages.get(bucket) || /* @__PURE__ */ new Set();
2228
+ pages.add(e.page);
2229
+ bucketPages.set(bucket, pages);
2230
+ }
2231
+ const repeatedBuckets = /* @__PURE__ */ new Set();
2232
+ for (const [b, pages] of bucketPages) {
2233
+ if (pages.size >= MIN_REPEAT) repeatedBuckets.add(b);
2234
+ }
2235
+ for (const e of entries) {
2236
+ const norm = e.text.replace(/\d+/g, "#");
2237
+ const bucket = Math.round(e.y / Y_BUCKET);
2238
+ if (repeatedPatterns.has(norm) || repeatedBuckets.has(bucket)) {
2239
+ removeSet.add(e.blockIdx);
2240
+ }
2238
2241
  }
2239
2242
  }
2240
- if (removeIndices.length > 0) {
2241
- warnings.push({ message: `${removeIndices.length}\uAC1C \uBA38\uB9AC\uAE00/\uBC14\uB2E5\uAE00 \uC694\uC18C \uC81C\uAC70\uB428`, code: "HIDDEN_TEXT_FILTERED" });
2243
+ if (removeSet.size > 0) {
2244
+ warnings.push({ message: `${removeSet.size}\uAC1C \uBA38\uB9AC\uAE00/\uBC14\uB2E5\uAE00 \uC694\uC18C \uC81C\uAC70\uB428`, code: "HIDDEN_TEXT_FILTERED" });
2242
2245
  }
2243
- return removeIndices;
2246
+ return [...removeSet].sort((a, b) => a - b);
2244
2247
  }
2245
2248
  function mergeKoreanLines(text) {
2246
2249
  if (!text) return "";
@@ -2276,4 +2279,4 @@ export {
2276
2279
  extractPdfMetadataOnly,
2277
2280
  parsePdfDocument
2278
2281
  };
2279
- //# sourceMappingURL=parser-AMP7MAOH.js.map
2282
+ //# sourceMappingURL=parser-4275GJRB.js.map