kordoc 2.2.5 → 2.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. package/README.md +16 -4
  2. package/dist/{chunk-UU2O6D3R.js → chunk-JFTFC2BB.js} +2 -2
  3. package/dist/{chunk-JH5XLWJQ.js.map → chunk-JFTFC2BB.js.map} +1 -1
  4. package/dist/{chunk-5Y2Q3BRW.js → chunk-M3E3C5GS.js} +8 -1
  5. package/dist/chunk-M3E3C5GS.js.map +1 -0
  6. package/dist/{chunk-RQWICKON.js → chunk-OEJJPCMM.js} +369 -73
  7. package/dist/chunk-OEJJPCMM.js.map +1 -0
  8. package/dist/{chunk-JH5XLWJQ.js → chunk-Z7UPTVMX.js} +2 -2
  9. package/dist/{chunk-UU2O6D3R.js.map → chunk-Z7UPTVMX.js.map} +1 -1
  10. package/dist/{chunk-OJ4QR33V.cjs → chunk-ZNJPRRIA.cjs} +2 -2
  11. package/dist/{chunk-OJ4QR33V.cjs.map → chunk-ZNJPRRIA.cjs.map} +1 -1
  12. package/dist/cli.js +7 -4
  13. package/dist/cli.js.map +1 -1
  14. package/dist/{detect-GYK3HKD5.js → detect-I7YIS4Q6.js} +4 -2
  15. package/dist/index.cjs +463 -160
  16. package/dist/index.cjs.map +1 -1
  17. package/dist/index.d.cts +4 -2
  18. package/dist/index.d.ts +4 -2
  19. package/dist/index.js +387 -84
  20. package/dist/index.js.map +1 -1
  21. package/dist/mcp.js +5 -5
  22. package/dist/{parser-OIRWPKIQ.js → parser-25LF2S2J.js} +45 -42
  23. package/dist/{parser-OIRWPKIQ.js.map → parser-25LF2S2J.js.map} +1 -1
  24. package/dist/{parser-PXD73E4H.js → parser-4LKJXBPP.js} +45 -42
  25. package/dist/{parser-PXD73E4H.js.map → parser-4LKJXBPP.js.map} +1 -1
  26. package/dist/{parser-CYBX5MP4.cjs → parser-KBQZB3QY.cjs} +61 -58
  27. package/dist/{parser-CYBX5MP4.cjs.map → parser-KBQZB3QY.cjs.map} +1 -1
  28. package/dist/{watch-NSBABJ4A.js → watch-GXRBLW3Y.js} +4 -4
  29. package/package.json +2 -2
  30. package/dist/chunk-5Y2Q3BRW.js.map +0 -1
  31. package/dist/chunk-RQWICKON.js.map +0 -1
  32. package/dist/{detect-GYK3HKD5.js.map → detect-I7YIS4Q6.js.map} +0 -0
  33. package/dist/{watch-NSBABJ4A.js.map → watch-GXRBLW3Y.js.map} +0 -0
package/dist/mcp.js CHANGED
@@ -8,18 +8,18 @@ import {
8
8
  fillHwpx,
9
9
  markdownToHwpx,
10
10
  parse
11
- } from "./chunk-RQWICKON.js";
11
+ } from "./chunk-OEJJPCMM.js";
12
12
  import {
13
13
  detectFormat,
14
14
  detectZipFormat
15
- } from "./chunk-5Y2Q3BRW.js";
15
+ } from "./chunk-M3E3C5GS.js";
16
16
  import {
17
17
  KordocError,
18
18
  VERSION,
19
19
  blocksToMarkdown,
20
20
  sanitizeError,
21
21
  toArrayBuffer
22
- } from "./chunk-JH5XLWJQ.js";
22
+ } from "./chunk-Z7UPTVMX.js";
23
23
  import "./chunk-MOL7MDBG.js";
24
24
 
25
25
  // src/mcp.ts
@@ -178,7 +178,7 @@ server.tool(
178
178
  let metadata;
179
179
  let effectiveFormat = format;
180
180
  if (format === "hwpx") {
181
- const { detectZipFormat: detectZipFormat2 } = await import("./detect-GYK3HKD5.js");
181
+ const { detectZipFormat: detectZipFormat2 } = await import("./detect-I7YIS4Q6.js");
182
182
  const zipFormat = await detectZipFormat2(buffer);
183
183
  if (zipFormat === "xlsx" || zipFormat === "docx") effectiveFormat = zipFormat;
184
184
  }
@@ -191,7 +191,7 @@ server.tool(
191
191
  break;
192
192
  case "pdf":
193
193
  try {
194
- const { extractPdfMetadataOnly } = await import("./parser-PXD73E4H.js");
194
+ const { extractPdfMetadataOnly } = await import("./parser-4LKJXBPP.js");
195
195
  metadata = await extractPdfMetadataOnly(buffer);
196
196
  } catch {
197
197
  metadata = void 0;
package/dist/{parser-OIRWPKIQ.js → parser-25LF2S2J.js} CHANGED
@@ -6,7 +6,7 @@ import {
6
6
  blocksToMarkdown,
7
7
  safeMax,
8
8
  safeMin
9
- } from "./chunk-UU2O6D3R.js";
9
+ } from "./chunk-JFTFC2BB.js";
10
10
  import {
11
11
  parsePageRange
12
12
  } from "./chunk-SBVRCJFH.js";
@@ -2186,10 +2186,11 @@ function detectSpecialKoreanTables(blocks) {
2186
2186
  return result;
2187
2187
  }
2188
2188
  function removeHeaderFooterBlocks(blocks, pageHeights, warnings) {
2189
- const ZONE_RATIO = 0.1;
2189
+ const ZONE_RATIO = 0.12;
2190
2190
  const MIN_REPEAT = 3;
2191
- const headerTexts = /* @__PURE__ */ new Map();
2192
- const footerTexts = /* @__PURE__ */ new Map();
2191
+ const Y_BUCKET = 5;
2192
+ const topEntries = [];
2193
+ const bottomEntries = [];
2193
2194
  for (let bi = 0; bi < blocks.length; bi++) {
2194
2195
  const b = blocks[bi];
2195
2196
  if (!b.bbox || !b.pageNumber || !b.text?.trim()) continue;
@@ -2197,49 +2198,51 @@ function removeHeaderFooterBlocks(blocks, pageHeights, warnings) {
2197
2198
  if (!ph) continue;
2198
2199
  const blockTop = ph - (b.bbox.y + b.bbox.height);
2199
2200
  const blockBottom = ph - b.bbox.y;
2200
- if (blockBottom <= ph * ZONE_RATIO) {
2201
- const arr = footerTexts.get(b.pageNumber) || [];
2202
- arr.push(b.text.trim());
2203
- footerTexts.set(b.pageNumber, arr);
2204
- } else if (blockTop >= ph * (1 - ZONE_RATIO)) {
2205
- const arr = headerTexts.get(b.pageNumber) || [];
2206
- arr.push(b.text.trim());
2207
- headerTexts.set(b.pageNumber, arr);
2208
- }
2209
- }
2210
- const repeatedPatterns = /* @__PURE__ */ new Set();
2211
- for (const textsMap of [headerTexts, footerTexts]) {
2201
+ const entry = { blockIdx: bi, page: b.pageNumber, y: b.bbox.y, text: b.text.trim() };
2202
+ if (blockBottom <= ph * ZONE_RATIO) bottomEntries.push(entry);
2203
+ else if (blockTop >= ph * (1 - ZONE_RATIO)) topEntries.push(entry);
2204
+ }
2205
+ const removeSet = /* @__PURE__ */ new Set();
2206
+ for (const entries of [topEntries, bottomEntries]) {
2207
+ if (entries.length === 0) continue;
2212
2208
  const patternCount = /* @__PURE__ */ new Map();
2213
- for (const [, texts] of textsMap) {
2214
- for (const t of texts) {
2215
- const normalized = t.replace(/\d+/g, "#");
2216
- patternCount.set(normalized, (patternCount.get(normalized) || 0) + 1);
2209
+ const patternPages = /* @__PURE__ */ new Map();
2210
+ for (const e of entries) {
2211
+ const norm = e.text.replace(/\d+/g, "#");
2212
+ patternCount.set(norm, (patternCount.get(norm) || 0) + 1);
2213
+ const pages = patternPages.get(norm) || /* @__PURE__ */ new Set();
2214
+ pages.add(e.page);
2215
+ patternPages.set(norm, pages);
2216
+ }
2217
+ const repeatedPatterns = /* @__PURE__ */ new Set();
2218
+ for (const [p, count] of patternCount) {
2219
+ if (count >= MIN_REPEAT && (patternPages.get(p)?.size ?? 0) >= MIN_REPEAT) {
2220
+ repeatedPatterns.add(p);
2217
2221
  }
2218
2222
  }
2219
- for (const [pattern, count] of patternCount) {
2220
- if (count >= MIN_REPEAT) repeatedPatterns.add(pattern);
2221
- }
2222
- }
2223
- if (repeatedPatterns.size === 0) return [];
2224
- const removeIndices = [];
2225
- for (let bi = 0; bi < blocks.length; bi++) {
2226
- const b = blocks[bi];
2227
- if (!b.bbox || !b.pageNumber || !b.text?.trim()) continue;
2228
- const ph = pageHeights.get(b.bbox.page) || pageHeights.get(b.pageNumber);
2229
- if (!ph) continue;
2230
- const blockTop = ph - (b.bbox.y + b.bbox.height);
2231
- const blockBottom = ph - b.bbox.y;
2232
- const inZone = blockBottom <= ph * ZONE_RATIO || blockTop >= ph * (1 - ZONE_RATIO);
2233
- if (!inZone) continue;
2234
- const normalized = b.text.trim().replace(/\d+/g, "#");
2235
- if (repeatedPatterns.has(normalized)) {
2236
- removeIndices.push(bi);
2223
+ const bucketPages = /* @__PURE__ */ new Map();
2224
+ for (const e of entries) {
2225
+ const bucket = Math.round(e.y / Y_BUCKET);
2226
+ const pages = bucketPages.get(bucket) || /* @__PURE__ */ new Set();
2227
+ pages.add(e.page);
2228
+ bucketPages.set(bucket, pages);
2229
+ }
2230
+ const repeatedBuckets = /* @__PURE__ */ new Set();
2231
+ for (const [b, pages] of bucketPages) {
2232
+ if (pages.size >= MIN_REPEAT) repeatedBuckets.add(b);
2233
+ }
2234
+ for (const e of entries) {
2235
+ const norm = e.text.replace(/\d+/g, "#");
2236
+ const bucket = Math.round(e.y / Y_BUCKET);
2237
+ if (repeatedPatterns.has(norm) || repeatedBuckets.has(bucket)) {
2238
+ removeSet.add(e.blockIdx);
2239
+ }
2237
2240
  }
2238
2241
  }
2239
- if (removeIndices.length > 0) {
2240
- warnings.push({ message: `${removeIndices.length}\uAC1C \uBA38\uB9AC\uAE00/\uBC14\uB2E5\uAE00 \uC694\uC18C \uC81C\uAC70\uB428`, code: "HIDDEN_TEXT_FILTERED" });
2242
+ if (removeSet.size > 0) {
2243
+ warnings.push({ message: `${removeSet.size}\uAC1C \uBA38\uB9AC\uAE00/\uBC14\uB2E5\uAE00 \uC694\uC18C \uC81C\uAC70\uB428`, code: "HIDDEN_TEXT_FILTERED" });
2241
2244
  }
2242
- return removeIndices;
2245
+ return [...removeSet].sort((a, b) => a - b);
2243
2246
  }
2244
2247
  function mergeKoreanLines(text) {
2245
2248
  if (!text) return "";
@@ -2275,4 +2278,4 @@ export {
2275
2278
  extractPdfMetadataOnly,
2276
2279
  parsePdfDocument
2277
2280
  };
2278
- //# sourceMappingURL=parser-OIRWPKIQ.js.map
2281
+ //# sourceMappingURL=parser-25LF2S2J.js.map