kordoc 2.2.6 → 2.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +22 -3
- package/dist/{chunk-RF6UJXR3.js → chunk-KSBPABBQ.js} +482 -78
- package/dist/chunk-KSBPABBQ.js.map +1 -0
- package/dist/{chunk-5Y2Q3BRW.js → chunk-M3E3C5GS.js} +8 -1
- package/dist/chunk-M3E3C5GS.js.map +1 -0
- package/dist/{chunk-FCQEF2ZM.js → chunk-VJPDY4YT.js} +2 -2
- package/dist/{chunk-NL5XLN5R.js.map → chunk-VJPDY4YT.js.map} +1 -1
- package/dist/{chunk-HXUCZ2IL.cjs → chunk-VLSATRNQ.cjs} +2 -2
- package/dist/{chunk-HXUCZ2IL.cjs.map → chunk-VLSATRNQ.cjs.map} +1 -1
- package/dist/{chunk-NL5XLN5R.js → chunk-XG5CQUSC.js} +2 -2
- package/dist/{chunk-FCQEF2ZM.js.map → chunk-XG5CQUSC.js.map} +1 -1
- package/dist/cli.js +5 -5
- package/dist/cli.js.map +1 -1
- package/dist/{detect-GYK3HKD5.js → detect-I7YIS4Q6.js} +4 -2
- package/dist/index.cjs +608 -197
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +6 -2
- package/dist/index.d.ts +6 -2
- package/dist/index.js +500 -89
- package/dist/index.js.map +1 -1
- package/dist/mcp.js +5 -5
- package/dist/{parser-AMP7MAOH.js → parser-4275GJRB.js} +45 -42
- package/dist/{parser-AMP7MAOH.js.map → parser-4275GJRB.js.map} +1 -1
- package/dist/{parser-KOWPTDJU.cjs → parser-STAOZMUC.cjs} +61 -58
- package/dist/{parser-KOWPTDJU.cjs.map → parser-STAOZMUC.cjs.map} +1 -1
- package/dist/{parser-43IAQ5KE.js → parser-XRUZEFZT.js} +45 -42
- package/dist/{parser-43IAQ5KE.js.map → parser-XRUZEFZT.js.map} +1 -1
- package/dist/{watch-IUQXOXW3.js → watch-BFLNFJBE.js} +4 -4
- package/package.json +2 -2
- package/dist/chunk-5Y2Q3BRW.js.map +0 -1
- package/dist/chunk-RF6UJXR3.js.map +0 -1
- /package/dist/{detect-GYK3HKD5.js.map → detect-I7YIS4Q6.js.map} +0 -0
- /package/dist/{watch-IUQXOXW3.js.map → watch-BFLNFJBE.js.map} +0 -0
package/dist/mcp.js
CHANGED
|
@@ -8,18 +8,18 @@ import {
|
|
|
8
8
|
fillHwpx,
|
|
9
9
|
markdownToHwpx,
|
|
10
10
|
parse
|
|
11
|
-
} from "./chunk-
|
|
11
|
+
} from "./chunk-KSBPABBQ.js";
|
|
12
12
|
import {
|
|
13
13
|
detectFormat,
|
|
14
14
|
detectZipFormat
|
|
15
|
-
} from "./chunk-
|
|
15
|
+
} from "./chunk-M3E3C5GS.js";
|
|
16
16
|
import {
|
|
17
17
|
KordocError,
|
|
18
18
|
VERSION,
|
|
19
19
|
blocksToMarkdown,
|
|
20
20
|
sanitizeError,
|
|
21
21
|
toArrayBuffer
|
|
22
|
-
} from "./chunk-
|
|
22
|
+
} from "./chunk-VJPDY4YT.js";
|
|
23
23
|
import "./chunk-MOL7MDBG.js";
|
|
24
24
|
|
|
25
25
|
// src/mcp.ts
|
|
@@ -178,7 +178,7 @@ server.tool(
|
|
|
178
178
|
let metadata;
|
|
179
179
|
let effectiveFormat = format;
|
|
180
180
|
if (format === "hwpx") {
|
|
181
|
-
const { detectZipFormat: detectZipFormat2 } = await import("./detect-
|
|
181
|
+
const { detectZipFormat: detectZipFormat2 } = await import("./detect-I7YIS4Q6.js");
|
|
182
182
|
const zipFormat = await detectZipFormat2(buffer);
|
|
183
183
|
if (zipFormat === "xlsx" || zipFormat === "docx") effectiveFormat = zipFormat;
|
|
184
184
|
}
|
|
@@ -191,7 +191,7 @@ server.tool(
|
|
|
191
191
|
break;
|
|
192
192
|
case "pdf":
|
|
193
193
|
try {
|
|
194
|
-
const { extractPdfMetadataOnly } = await import("./parser-
|
|
194
|
+
const { extractPdfMetadataOnly } = await import("./parser-4275GJRB.js");
|
|
195
195
|
metadata = await extractPdfMetadataOnly(buffer);
|
|
196
196
|
} catch {
|
|
197
197
|
metadata = void 0;
|
|
@@ -7,7 +7,7 @@ import {
|
|
|
7
7
|
blocksToMarkdown,
|
|
8
8
|
safeMax,
|
|
9
9
|
safeMin
|
|
10
|
-
} from "./chunk-
|
|
10
|
+
} from "./chunk-VJPDY4YT.js";
|
|
11
11
|
import {
|
|
12
12
|
parsePageRange
|
|
13
13
|
} from "./chunk-MOL7MDBG.js";
|
|
@@ -2187,10 +2187,11 @@ function detectSpecialKoreanTables(blocks) {
|
|
|
2187
2187
|
return result;
|
|
2188
2188
|
}
|
|
2189
2189
|
function removeHeaderFooterBlocks(blocks, pageHeights, warnings) {
|
|
2190
|
-
const ZONE_RATIO = 0.
|
|
2190
|
+
const ZONE_RATIO = 0.12;
|
|
2191
2191
|
const MIN_REPEAT = 3;
|
|
2192
|
-
const
|
|
2193
|
-
const
|
|
2192
|
+
const Y_BUCKET = 5;
|
|
2193
|
+
const topEntries = [];
|
|
2194
|
+
const bottomEntries = [];
|
|
2194
2195
|
for (let bi = 0; bi < blocks.length; bi++) {
|
|
2195
2196
|
const b = blocks[bi];
|
|
2196
2197
|
if (!b.bbox || !b.pageNumber || !b.text?.trim()) continue;
|
|
@@ -2198,49 +2199,51 @@ function removeHeaderFooterBlocks(blocks, pageHeights, warnings) {
|
|
|
2198
2199
|
if (!ph) continue;
|
|
2199
2200
|
const blockTop = ph - (b.bbox.y + b.bbox.height);
|
|
2200
2201
|
const blockBottom = ph - b.bbox.y;
|
|
2201
|
-
|
|
2202
|
-
|
|
2203
|
-
|
|
2204
|
-
|
|
2205
|
-
|
|
2206
|
-
|
|
2207
|
-
|
|
2208
|
-
headerTexts.set(b.pageNumber, arr);
|
|
2209
|
-
}
|
|
2210
|
-
}
|
|
2211
|
-
const repeatedPatterns = /* @__PURE__ */ new Set();
|
|
2212
|
-
for (const textsMap of [headerTexts, footerTexts]) {
|
|
2202
|
+
const entry = { blockIdx: bi, page: b.pageNumber, y: b.bbox.y, text: b.text.trim() };
|
|
2203
|
+
if (blockBottom <= ph * ZONE_RATIO) bottomEntries.push(entry);
|
|
2204
|
+
else if (blockTop >= ph * (1 - ZONE_RATIO)) topEntries.push(entry);
|
|
2205
|
+
}
|
|
2206
|
+
const removeSet = /* @__PURE__ */ new Set();
|
|
2207
|
+
for (const entries of [topEntries, bottomEntries]) {
|
|
2208
|
+
if (entries.length === 0) continue;
|
|
2213
2209
|
const patternCount = /* @__PURE__ */ new Map();
|
|
2214
|
-
|
|
2215
|
-
|
|
2216
|
-
|
|
2217
|
-
|
|
2210
|
+
const patternPages = /* @__PURE__ */ new Map();
|
|
2211
|
+
for (const e of entries) {
|
|
2212
|
+
const norm = e.text.replace(/\d+/g, "#");
|
|
2213
|
+
patternCount.set(norm, (patternCount.get(norm) || 0) + 1);
|
|
2214
|
+
const pages = patternPages.get(norm) || /* @__PURE__ */ new Set();
|
|
2215
|
+
pages.add(e.page);
|
|
2216
|
+
patternPages.set(norm, pages);
|
|
2217
|
+
}
|
|
2218
|
+
const repeatedPatterns = /* @__PURE__ */ new Set();
|
|
2219
|
+
for (const [p, count] of patternCount) {
|
|
2220
|
+
if (count >= MIN_REPEAT && (patternPages.get(p)?.size ?? 0) >= MIN_REPEAT) {
|
|
2221
|
+
repeatedPatterns.add(p);
|
|
2218
2222
|
}
|
|
2219
2223
|
}
|
|
2220
|
-
|
|
2221
|
-
|
|
2222
|
-
|
|
2223
|
-
|
|
2224
|
-
|
|
2225
|
-
|
|
2226
|
-
|
|
2227
|
-
const
|
|
2228
|
-
|
|
2229
|
-
|
|
2230
|
-
|
|
2231
|
-
const
|
|
2232
|
-
|
|
2233
|
-
|
|
2234
|
-
|
|
2235
|
-
|
|
2236
|
-
|
|
2237
|
-
removeIndices.push(bi);
|
|
2224
|
+
const bucketPages = /* @__PURE__ */ new Map();
|
|
2225
|
+
for (const e of entries) {
|
|
2226
|
+
const bucket = Math.round(e.y / Y_BUCKET);
|
|
2227
|
+
const pages = bucketPages.get(bucket) || /* @__PURE__ */ new Set();
|
|
2228
|
+
pages.add(e.page);
|
|
2229
|
+
bucketPages.set(bucket, pages);
|
|
2230
|
+
}
|
|
2231
|
+
const repeatedBuckets = /* @__PURE__ */ new Set();
|
|
2232
|
+
for (const [b, pages] of bucketPages) {
|
|
2233
|
+
if (pages.size >= MIN_REPEAT) repeatedBuckets.add(b);
|
|
2234
|
+
}
|
|
2235
|
+
for (const e of entries) {
|
|
2236
|
+
const norm = e.text.replace(/\d+/g, "#");
|
|
2237
|
+
const bucket = Math.round(e.y / Y_BUCKET);
|
|
2238
|
+
if (repeatedPatterns.has(norm) || repeatedBuckets.has(bucket)) {
|
|
2239
|
+
removeSet.add(e.blockIdx);
|
|
2240
|
+
}
|
|
2238
2241
|
}
|
|
2239
2242
|
}
|
|
2240
|
-
if (
|
|
2241
|
-
warnings.push({ message: `${
|
|
2243
|
+
if (removeSet.size > 0) {
|
|
2244
|
+
warnings.push({ message: `${removeSet.size}\uAC1C \uBA38\uB9AC\uAE00/\uBC14\uB2E5\uAE00 \uC694\uC18C \uC81C\uAC70\uB428`, code: "HIDDEN_TEXT_FILTERED" });
|
|
2242
2245
|
}
|
|
2243
|
-
return
|
|
2246
|
+
return [...removeSet].sort((a, b) => a - b);
|
|
2244
2247
|
}
|
|
2245
2248
|
function mergeKoreanLines(text) {
|
|
2246
2249
|
if (!text) return "";
|
|
@@ -2276,4 +2279,4 @@ export {
|
|
|
2276
2279
|
extractPdfMetadataOnly,
|
|
2277
2280
|
parsePdfDocument
|
|
2278
2281
|
};
|
|
2279
|
-
//# sourceMappingURL=parser-
|
|
2282
|
+
//# sourceMappingURL=parser-4275GJRB.js.map
|