kordoc 2.2.6 → 2.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +22 -3
- package/dist/{chunk-RF6UJXR3.js → chunk-KSBPABBQ.js} +482 -78
- package/dist/chunk-KSBPABBQ.js.map +1 -0
- package/dist/{chunk-5Y2Q3BRW.js → chunk-M3E3C5GS.js} +8 -1
- package/dist/chunk-M3E3C5GS.js.map +1 -0
- package/dist/{chunk-FCQEF2ZM.js → chunk-VJPDY4YT.js} +2 -2
- package/dist/{chunk-NL5XLN5R.js.map → chunk-VJPDY4YT.js.map} +1 -1
- package/dist/{chunk-HXUCZ2IL.cjs → chunk-VLSATRNQ.cjs} +2 -2
- package/dist/{chunk-HXUCZ2IL.cjs.map → chunk-VLSATRNQ.cjs.map} +1 -1
- package/dist/{chunk-NL5XLN5R.js → chunk-XG5CQUSC.js} +2 -2
- package/dist/{chunk-FCQEF2ZM.js.map → chunk-XG5CQUSC.js.map} +1 -1
- package/dist/cli.js +5 -5
- package/dist/cli.js.map +1 -1
- package/dist/{detect-GYK3HKD5.js → detect-I7YIS4Q6.js} +4 -2
- package/dist/index.cjs +608 -197
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +6 -2
- package/dist/index.d.ts +6 -2
- package/dist/index.js +500 -89
- package/dist/index.js.map +1 -1
- package/dist/mcp.js +5 -5
- package/dist/{parser-AMP7MAOH.js → parser-4275GJRB.js} +45 -42
- package/dist/{parser-AMP7MAOH.js.map → parser-4275GJRB.js.map} +1 -1
- package/dist/{parser-KOWPTDJU.cjs → parser-STAOZMUC.cjs} +61 -58
- package/dist/{parser-KOWPTDJU.cjs.map → parser-STAOZMUC.cjs.map} +1 -1
- package/dist/{parser-43IAQ5KE.js → parser-XRUZEFZT.js} +45 -42
- package/dist/{parser-43IAQ5KE.js.map → parser-XRUZEFZT.js.map} +1 -1
- package/dist/{watch-IUQXOXW3.js → watch-BFLNFJBE.js} +4 -4
- package/package.json +2 -2
- package/dist/chunk-5Y2Q3BRW.js.map +0 -1
- package/dist/chunk-RF6UJXR3.js.map +0 -1
- /package/dist/{detect-GYK3HKD5.js.map → detect-I7YIS4Q6.js.map} +0 -0
- /package/dist/{watch-IUQXOXW3.js.map → watch-BFLNFJBE.js.map} +0 -0
|
@@ -6,7 +6,7 @@ import {
|
|
|
6
6
|
blocksToMarkdown,
|
|
7
7
|
safeMax,
|
|
8
8
|
safeMin
|
|
9
|
-
} from "./chunk-
|
|
9
|
+
} from "./chunk-XG5CQUSC.js";
|
|
10
10
|
import {
|
|
11
11
|
parsePageRange
|
|
12
12
|
} from "./chunk-SBVRCJFH.js";
|
|
@@ -2186,10 +2186,11 @@ function detectSpecialKoreanTables(blocks) {
|
|
|
2186
2186
|
return result;
|
|
2187
2187
|
}
|
|
2188
2188
|
function removeHeaderFooterBlocks(blocks, pageHeights, warnings) {
|
|
2189
|
-
const ZONE_RATIO = 0.
|
|
2189
|
+
const ZONE_RATIO = 0.12;
|
|
2190
2190
|
const MIN_REPEAT = 3;
|
|
2191
|
-
const
|
|
2192
|
-
const
|
|
2191
|
+
const Y_BUCKET = 5;
|
|
2192
|
+
const topEntries = [];
|
|
2193
|
+
const bottomEntries = [];
|
|
2193
2194
|
for (let bi = 0; bi < blocks.length; bi++) {
|
|
2194
2195
|
const b = blocks[bi];
|
|
2195
2196
|
if (!b.bbox || !b.pageNumber || !b.text?.trim()) continue;
|
|
@@ -2197,49 +2198,51 @@ function removeHeaderFooterBlocks(blocks, pageHeights, warnings) {
|
|
|
2197
2198
|
if (!ph) continue;
|
|
2198
2199
|
const blockTop = ph - (b.bbox.y + b.bbox.height);
|
|
2199
2200
|
const blockBottom = ph - b.bbox.y;
|
|
2200
|
-
|
|
2201
|
-
|
|
2202
|
-
|
|
2203
|
-
|
|
2204
|
-
|
|
2205
|
-
|
|
2206
|
-
|
|
2207
|
-
headerTexts.set(b.pageNumber, arr);
|
|
2208
|
-
}
|
|
2209
|
-
}
|
|
2210
|
-
const repeatedPatterns = /* @__PURE__ */ new Set();
|
|
2211
|
-
for (const textsMap of [headerTexts, footerTexts]) {
|
|
2201
|
+
const entry = { blockIdx: bi, page: b.pageNumber, y: b.bbox.y, text: b.text.trim() };
|
|
2202
|
+
if (blockBottom <= ph * ZONE_RATIO) bottomEntries.push(entry);
|
|
2203
|
+
else if (blockTop >= ph * (1 - ZONE_RATIO)) topEntries.push(entry);
|
|
2204
|
+
}
|
|
2205
|
+
const removeSet = /* @__PURE__ */ new Set();
|
|
2206
|
+
for (const entries of [topEntries, bottomEntries]) {
|
|
2207
|
+
if (entries.length === 0) continue;
|
|
2212
2208
|
const patternCount = /* @__PURE__ */ new Map();
|
|
2213
|
-
|
|
2214
|
-
|
|
2215
|
-
|
|
2216
|
-
|
|
2209
|
+
const patternPages = /* @__PURE__ */ new Map();
|
|
2210
|
+
for (const e of entries) {
|
|
2211
|
+
const norm = e.text.replace(/\d+/g, "#");
|
|
2212
|
+
patternCount.set(norm, (patternCount.get(norm) || 0) + 1);
|
|
2213
|
+
const pages = patternPages.get(norm) || /* @__PURE__ */ new Set();
|
|
2214
|
+
pages.add(e.page);
|
|
2215
|
+
patternPages.set(norm, pages);
|
|
2216
|
+
}
|
|
2217
|
+
const repeatedPatterns = /* @__PURE__ */ new Set();
|
|
2218
|
+
for (const [p, count] of patternCount) {
|
|
2219
|
+
if (count >= MIN_REPEAT && (patternPages.get(p)?.size ?? 0) >= MIN_REPEAT) {
|
|
2220
|
+
repeatedPatterns.add(p);
|
|
2217
2221
|
}
|
|
2218
2222
|
}
|
|
2219
|
-
|
|
2220
|
-
|
|
2221
|
-
|
|
2222
|
-
|
|
2223
|
-
|
|
2224
|
-
|
|
2225
|
-
|
|
2226
|
-
const
|
|
2227
|
-
|
|
2228
|
-
|
|
2229
|
-
|
|
2230
|
-
const
|
|
2231
|
-
|
|
2232
|
-
|
|
2233
|
-
|
|
2234
|
-
|
|
2235
|
-
|
|
2236
|
-
removeIndices.push(bi);
|
|
2223
|
+
const bucketPages = /* @__PURE__ */ new Map();
|
|
2224
|
+
for (const e of entries) {
|
|
2225
|
+
const bucket = Math.round(e.y / Y_BUCKET);
|
|
2226
|
+
const pages = bucketPages.get(bucket) || /* @__PURE__ */ new Set();
|
|
2227
|
+
pages.add(e.page);
|
|
2228
|
+
bucketPages.set(bucket, pages);
|
|
2229
|
+
}
|
|
2230
|
+
const repeatedBuckets = /* @__PURE__ */ new Set();
|
|
2231
|
+
for (const [b, pages] of bucketPages) {
|
|
2232
|
+
if (pages.size >= MIN_REPEAT) repeatedBuckets.add(b);
|
|
2233
|
+
}
|
|
2234
|
+
for (const e of entries) {
|
|
2235
|
+
const norm = e.text.replace(/\d+/g, "#");
|
|
2236
|
+
const bucket = Math.round(e.y / Y_BUCKET);
|
|
2237
|
+
if (repeatedPatterns.has(norm) || repeatedBuckets.has(bucket)) {
|
|
2238
|
+
removeSet.add(e.blockIdx);
|
|
2239
|
+
}
|
|
2237
2240
|
}
|
|
2238
2241
|
}
|
|
2239
|
-
if (
|
|
2240
|
-
warnings.push({ message: `${
|
|
2242
|
+
if (removeSet.size > 0) {
|
|
2243
|
+
warnings.push({ message: `${removeSet.size}\uAC1C \uBA38\uB9AC\uAE00/\uBC14\uB2E5\uAE00 \uC694\uC18C \uC81C\uAC70\uB428`, code: "HIDDEN_TEXT_FILTERED" });
|
|
2241
2244
|
}
|
|
2242
|
-
return
|
|
2245
|
+
return [...removeSet].sort((a, b) => a - b);
|
|
2243
2246
|
}
|
|
2244
2247
|
function mergeKoreanLines(text) {
|
|
2245
2248
|
if (!text) return "";
|
|
@@ -2275,4 +2278,4 @@ export {
|
|
|
2275
2278
|
extractPdfMetadataOnly,
|
|
2276
2279
|
parsePdfDocument
|
|
2277
2280
|
};
|
|
2278
|
-
//# sourceMappingURL=parser-
|
|
2281
|
+
//# sourceMappingURL=parser-XRUZEFZT.js.map
|