kordoc 1.1.2 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-LHETZ3IN.js → chunk-KCGDEP7Q.js} +271 -132
- package/dist/chunk-KCGDEP7Q.js.map +1 -0
- package/dist/cli.js +1 -1
- package/dist/index.cjs +270 -132
- package/dist/index.cjs.map +1 -1
- package/dist/index.js +270 -131
- package/dist/index.js.map +1 -1
- package/dist/mcp.js +1 -1
- package/package.json +1 -1
- package/dist/chunk-LHETZ3IN.js.map +0 -1
|
@@ -25,7 +25,7 @@ function detectFormat(buffer) {
|
|
|
25
25
|
}
|
|
26
26
|
|
|
27
27
|
// src/utils.ts
|
|
28
|
-
var VERSION = true ? "1.1.2" : "0.0.0-dev";
|
|
28
|
+
var VERSION = true ? "1.3.0" : "0.0.0-dev";
|
|
29
29
|
function toArrayBuffer(buf) {
|
|
30
30
|
if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
|
|
31
31
|
return buf.buffer;
|
|
@@ -739,143 +739,316 @@ function arrangeCells(rows, cols, cells) {
|
|
|
739
739
|
return grid.map((row) => row.map((c) => c || { text: "", colSpan: 1, rowSpan: 1 }));
|
|
740
740
|
}
|
|
741
741
|
|
|
742
|
+
// src/pdf/polyfill.ts
|
|
743
|
+
import * as pdfjsWorker from "pdfjs-dist/legacy/build/pdf.worker.mjs";
|
|
744
|
+
var g = globalThis;
// Install a minimal DOMMatrix stand-in only when the runtime does not define one.
// NOTE(review): presumably needed so pdfjs-dist can load outside a browser — confirm.
if (typeof g.DOMMatrix === "undefined") {
  g.DOMMatrix = class DOMMatrix {
    /**
     * @param {number[]} [init] Matrix values; when falsy, `m` is [1, 0, 0, 1, 0, 0].
     */
    constructor(init) {
      this.m = init ? init : [1, 0, 0, 1, 0, 0];
    }
  };
}
// Same idea for Path2D: an empty class is installed only when the global is missing.
if (typeof g.Path2D === "undefined") {
  g.Path2D = class Path2D {};
}
// Expose the statically imported worker module on the global object.
g.pdfjsWorker = pdfjsWorker;
|
|
758
|
+
|
|
742
759
|
// src/pdf/parser.ts
|
|
743
|
-
import {
|
|
744
|
-
|
|
760
|
+
import { getDocument, GlobalWorkerOptions } from "pdfjs-dist/legacy/build/pdf.mjs";
|
|
761
|
+
GlobalWorkerOptions.workerSrc = "";
|
|
745
762
|
// Upper bound on how many pages of a single PDF are read.
var MAX_PAGES = 5000;
// Ceiling (100 MiB) for the running text-size estimate accumulated during extraction.
var MAX_TOTAL_TEXT = 100 * 1024 * 1024;
|
|
747
|
-
var pdfjsModule = null;
|
|
748
|
-
async function loadPdfjs() {
|
|
749
|
-
if (pdfjsModule) return pdfjsModule;
|
|
750
|
-
try {
|
|
751
|
-
const mod = await import("pdfjs-dist/legacy/build/pdf.mjs");
|
|
752
|
-
const req = createRequire2(import.meta.url);
|
|
753
|
-
const workerPath = req.resolve("pdfjs-dist/legacy/build/pdf.worker.mjs");
|
|
754
|
-
mod.GlobalWorkerOptions.workerSrc = pathToFileURL(workerPath).href;
|
|
755
|
-
pdfjsModule = mod;
|
|
756
|
-
return mod;
|
|
757
|
-
} catch (err) {
|
|
758
|
-
const msg = err instanceof Error ? err.message : String(err);
|
|
759
|
-
if (msg.includes("Cannot find") || msg.includes("MODULE_NOT_FOUND")) {
|
|
760
|
-
return null;
|
|
761
|
-
}
|
|
762
|
-
throw new KordocError(`pdfjs-dist \uB85C\uB529 \uC2E4\uD328: ${msg}`);
|
|
763
|
-
}
|
|
764
|
-
}
|
|
765
764
|
/**
 * Extract the text of a PDF and return it as markdown.
 *
 * Reads at most MAX_PAGES pages, converts each page's text items with
 * extractPageContent, and post-processes the joined text with cleanPdfText.
 *
 * @param {ArrayBuffer|ArrayLike<number>} buffer PDF bytes; anything `new Uint8Array(...)` accepts.
 * @returns {Promise<object>} `{ success: true, fileType, markdown, pageCount }` on success, or
 *   `{ success: false, fileType, pageCount, error }` (plus `isImageBased: true` when the
 *   document averages fewer than 10 non-whitespace characters per page).
 * @throws {KordocError} When the running text-size estimate exceeds MAX_TOTAL_TEXT.
 */
async function parsePdfDocument(buffer) {
  const loadingTask = getDocument({
    data: new Uint8Array(buffer),
    useSystemFonts: true,
    disableFontFace: true,
    isEvalSupported: false
  });
  const doc = await loadingTask.promise;
  try {
    const pageCount = doc.numPages;
    if (pageCount === 0) {
      return { success: false, fileType: "pdf", pageCount: 0, error: "PDF\uC5D0 \uD398\uC774\uC9C0\uAC00 \uC5C6\uC2B5\uB2C8\uB2E4." };
    }
    const effectivePageCount = Math.min(pageCount, MAX_PAGES);
    const pageTexts = [];
    let totalChars = 0;
    let totalTextBytes = 0;
    // pdf.js page numbers are 1-based.
    for (let pageNo = 1; pageNo <= effectivePageCount; pageNo += 1) {
      const page = await doc.getPage(pageNo);
      const { items } = await page.getTextContent();
      const pageText = extractPageContent(items);
      totalChars += pageText.replace(/\s/g, "").length;
      // Size estimate: two bytes per UTF-16 code unit.
      totalTextBytes += pageText.length * 2;
      if (totalTextBytes > MAX_TOTAL_TEXT) {
        throw new KordocError("\uD14D\uC2A4\uD2B8 \uCD94\uCD9C \uD06C\uAE30 \uCD08\uACFC");
      }
      pageTexts.push(pageText);
    }
    // Almost no extractable text per page: report the document as image-based.
    if (totalChars / effectivePageCount < 10) {
      return { success: false, fileType: "pdf", pageCount, isImageBased: true, error: `\uC774\uBBF8\uC9C0 \uAE30\uBC18 PDF (${pageCount}\uD398\uC774\uC9C0, ${totalChars}\uC790)` };
    }
    const nonEmptyPages = pageTexts.filter((t) => t.trim());
    const markdown = cleanPdfText(nonEmptyPages.join("\n\n"));
    return { success: true, fileType: "pdf", markdown, pageCount: effectivePageCount };
  } finally {
    // Best-effort cleanup: a failure while destroying the document is deliberately ignored.
    await doc.destroy().catch(() => {
    });
  }
}
|
|
827
|
-
function
|
|
798
|
+
/**
 * Turn one page's raw text items into text.
 *
 * Items are normalized and grouped into lines by y position. When column detection
 * reports at least three columns the page goes through extractWithColumns; otherwise
 * each line is merged with mergeLineSimple and the lines are joined with newlines.
 *
 * @param {object[]} rawItems Text items as returned by `page.getTextContent().items`.
 * @returns {string} Page text; an empty string when no usable items remain.
 */
function extractPageContent(rawItems) {
  const items = normalizeItems(rawItems);
  if (items.length === 0) {
    return "";
  }
  const yLines = groupByY(items);
  const columns = detectColumns(yLines);
  const isTabular = Boolean(columns) && columns.length >= 3;
  if (isTabular) {
    return extractWithColumns(yLines, columns);
  }
  return yLines.map((line) => mergeLineSimple(line)).join("\n");
}
|
|
808
|
+
/**
 * Reduce raw text items to `{ text, x, y, w, h }` records with integer geometry.
 *
 * Entries without a string `str`, or whose `str` is only whitespace, are dropped.
 * `x`/`y` are read from `transform[4]`/`transform[5]`; `w`/`h` from `width`/`height`.
 * The result is ordered by descending `y`, then ascending `x`.
 *
 * @param {object[]} rawItems Items carrying `str`, `transform`, `width` and `height`.
 * @returns {{text: string, x: number, y: number, w: number, h: number}[]} Sorted records.
 */
function normalizeItems(rawItems) {
  const normalized = rawItems.flatMap((raw) => {
    if (typeof raw.str !== "string" || raw.str.trim() === "") {
      return [];
    }
    return [{
      text: raw.str.trim(),
      x: Math.round(raw.transform[4]),
      y: Math.round(raw.transform[5]),
      w: Math.round(raw.width),
      h: Math.round(raw.height)
    }];
  });
  normalized.sort((a, b) => {
    const byY = b.y - a.y;
    if (byY) return byY;
    return a.x - b.x;
  });
  return normalized;
}
|
|
817
|
+
/**
 * Split items into lines by vertical position.
 *
 * Walks the items in the order given. A new line starts whenever an item's `y`
 * differs by more than 3 from the `y` of the item that opened the current line.
 *
 * @param {{y: number}[]} items Items, expected to arrive already ordered by `y`.
 * @returns {object[][]} Lines, each a non-empty array of the original item objects.
 */
function groupByY(items) {
  if (items.length === 0) return [];
  let anchorY = items[0].y;
  let current = [items[0]];
  const lines = [current];
  for (const item of items.slice(1)) {
    if (Math.abs(item.y - anchorY) > 3) {
      // Too far from the line's first item: open a new line anchored here.
      current = [];
      lines.push(current);
      anchorY = item.y;
    }
    current.push(item);
  }
  return lines;
}
|
|
851
|
-
function
|
|
833
|
+
/**
 * Decide whether a line looks like short fragments spread evenly across it
 * rather than cells of a table.
 *
 * True when the line has at least four items, the widest horizontal gap between
 * x-neighbours is under 40, and the average text length is under 5 characters.
 *
 * @param {{text: string, x: number, w: number}[]} items Items of a single line.
 * @returns {boolean} Whether the line matches the pattern.
 */
function isProseSpread(items) {
  if (items.length < 4) return false;
  const byX = [...items].sort((a, b) => a.x - b.x);
  // Accumulate instead of Math.max(...gaps): spreading a very long array into
  // call arguments can throw a RangeError.
  let maxGap = -Infinity;
  for (let i = 1; i < byX.length; i += 1) {
    const left = byX[i - 1];
    maxGap = Math.max(maxGap, byX[i].x - (left.x + left.w));
  }
  let totalLen = 0;
  for (const item of items) {
    totalLen += item.text.length;
  }
  return maxGap < 40 && totalLen / items.length < 5;
}
|
|
844
|
+
/**
 * Infer table column positions from the x coordinates of a page's items.
 *
 * Steps: bail out on empty or narrow (< 100 wide) content; ignore everything from the
 * first short line (at most two items) containing the exact text "비고" onwards; cluster
 * item x positions (tolerance 22), skipping lines flagged by isProseSpread; keep clusters
 * seen at least three times; merge clusters whose left edges are closer than 30.
 *
 * @param {{text: string, x: number, w: number}[][]} yLines Lines of items.
 * @returns {number[]|null} Left edges of the columns in ascending order, or null when
 *   fewer than three columns are found.
 */
function detectColumns(yLines) {
  const allItems = yLines.flat();
  if (allItems.length === 0) return null;

  // Single pass for the horizontal extent. Math.max(...array) / Math.min(...array)
  // can throw a RangeError when a page has a very large number of items.
  let leftMost = Infinity;
  let rightMost = -Infinity;
  for (const item of allItems) {
    leftMost = Math.min(leftMost, item.x);
    rightMost = Math.max(rightMost, item.x + item.w);
  }
  if (rightMost - leftMost < 100) return null;

  // "비고" (Korean, roughly "remarks") on a short line marks the end of the table area.
  const bigoLineIdx = yLines.findIndex(
    (line) => line.length <= 2 && line.some((item) => item.text === "\uBE44\uACE0")
  );
  const tableYLines = bigoLineIdx >= 0 ? yLines.slice(0, bigoLineIdx) : yLines;

  const CLUSTER_TOL = 22;
  const xClusters = [];
  for (const line of tableYLines) {
    if (isProseSpread(line)) continue;
    for (const item of line) {
      const hit = xClusters.find((c) => Math.abs(item.x - c.center) <= CLUSTER_TOL);
      if (hit) {
        // Running (rounded) mean of the member x positions.
        hit.center = Math.round((hit.center * hit.count + item.x) / (hit.count + 1));
        hit.minX = Math.min(hit.minX, item.x);
        hit.count++;
      } else {
        xClusters.push({ center: item.x, count: 1, minX: item.x });
      }
    }
  }

  const peaks = xClusters.filter((c) => c.count >= 3).sort((a, b) => a.minX - b.minX);
  if (peaks.length < 3) return null;

  const MERGE_TOL = 30;
  const merged = [peaks[0]];
  for (let i = 1; i < peaks.length; i++) {
    const prev = merged[merged.length - 1];
    if (peaks[i].minX - prev.minX < MERGE_TOL) {
      // Neighbouring peaks collapse into one; the more frequent one supplies the center.
      if (peaks[i].count > prev.count) {
        prev.center = peaks[i].center;
      }
      prev.count += peaks[i].count;
      prev.minX = Math.min(prev.minX, peaks[i].minX);
    } else {
      merged.push({ ...peaks[i] });
    }
  }

  const columns = merged.filter((c) => c.count >= 3).map((c) => c.minX);
  return columns.length >= 3 ? columns : null;
}
|
|
896
|
+
/**
 * Map an x position to a column index.
 *
 * Scans the columns from last to first and picks the first one whose left edge,
 * loosened by 10, is at or before `x`. Falls back to column 0.
 *
 * @param {number} x Horizontal position of an item.
 * @param {number[]} columns Column left edges, ascending.
 * @returns {number} Index into `columns` (0 when nothing matches or `columns` is empty).
 */
function findColumn(x, columns) {
  let idx = columns.length - 1;
  while (idx >= 0 && !(x >= columns[idx] - 10)) {
    idx -= 1;
  }
  return idx >= 0 ? idx : 0;
}
|
|
902
|
+
/**
 * Render a page that contains a column layout: text before the table, the table
 * region as grid tables, and the trailing "비고" section as plain lines.
 *
 * - The table region ends at the first short line (at most two items) containing the
 *   exact text "비고", or at the end of the page.
 * - It starts at the first line whose items fall into at least three distinct columns.
 * - Inside the region, consecutive lines that touch the column range and are not
 *   flagged by isProseSpread are handed to buildGridTable together; any other line
 *   is emitted as plain text and splits the table.
 *
 * @param {{text: string, x: number, w: number}[][]} yLines Lines of items, top to bottom.
 * @param {number[]} columns Column left edges, ascending.
 * @returns {string} The page text, pieces joined with newlines.
 */
function extractWithColumns(yLines, columns) {
  const output = [];
  const leftEdge = columns[0] - 20;
  const rightEdge = columns[columns.length - 1] + 200;

  const bigoIdx = yLines.findIndex(
    (line) => line.length <= 2 && line.some((item) => item.text === "\uBE44\uACE0")
  );
  const tableEnd = bigoIdx >= 0 ? bigoIdx : yLines.length;

  let tableStart = -1;
  for (let i = 0; i < tableEnd; i += 1) {
    const usedCols = new Set(yLines[i].map((item) => findColumn(item.x, columns)));
    if (usedCols.size >= 3) {
      tableStart = i;
      break;
    }
  }

  // Everything before the table (or the whole region when no table row exists) is plain text.
  const preambleEnd = tableStart >= 0 ? tableStart : tableEnd;
  for (const line of yLines.slice(0, preambleEnd)) {
    output.push(mergeLineSimple(line));
  }

  if (tableStart >= 0) {
    let pending = [];
    const flushTable = () => {
      if (pending.length > 0) {
        output.push(buildGridTable(pending, columns));
        pending = [];
      }
    };
    for (const line of yLines.slice(tableStart, tableEnd)) {
      const inRange = line.some((item) => item.x >= leftEdge && item.x <= rightEdge);
      if (inRange && !isProseSpread(line)) {
        pending.push(line);
        continue;
      }
      flushTable();
      output.push(mergeLineSimple(line));
    }
    flushTable();
  }

  if (bigoIdx >= 0) {
    // A blank line separates the table from the trailing section.
    output.push("");
    for (const line of yLines.slice(bigoIdx)) {
      output.push(mergeLineSimple(line));
    }
  }
  return output.join("\n");
}
|
|
953
|
+
/**
 * Convert consecutive table lines into a markdown table.
 *
 * 1. Each line becomes a row by dropping every item into its column (findColumn).
 * 2. Visual lines are merged into logical rows: a line starts a new row when its first
 *    cell has at least 3 characters, when its second cell is filled, or when both it and
 *    the previous row have content in the "data" columns (index >= max(2, numCols / 2)).
 *    A line whose only content is a first cell of at most 2 characters is always
 *    treated as a continuation.
 * 3. Leading rows without digits in the data columns are folded into one header row.
 * 4. With fewer than two rows, the content is returned as plain text instead of a table.
 *
 * Pipe characters inside cells are escaped as `\|` so they cannot be read as column
 * separators in the generated markdown.
 *
 * @param {{text: string, x: number}[][]} lines Lines belonging to one table.
 * @param {number[]} columns Column left edges, ascending.
 * @returns {string} Markdown table, or plain text when fewer than two rows result.
 */
function buildGridTable(lines, columns) {
  const numCols = columns.length;
  const yRows = lines.map((items) => {
    const row = Array(numCols).fill("");
    for (const item of items) {
      const col = findColumn(item.x, columns);
      row[col] = row[col] ? row[col] + " " + item.text : item.text;
    }
    return row;
  });

  const dataColStart = Math.max(2, Math.floor(numCols / 2));
  const hasDataCells = (row) => row.slice(dataColStart).some((c) => c !== "");

  const merged = [];
  for (const row of yRows) {
    if (row.every((c) => c === "")) continue;
    if (merged.length === 0) {
      merged.push([...row]);
      continue;
    }
    const prev = merged[merged.length - 1];
    const filledCount = row.filter((c) => c).length;

    let isNewRow = false;
    if (row[0] && row[0].length >= 3) {
      isNewRow = true;
    }
    if (!isNewRow && numCols > 1 && row[1]) {
      isNewRow = true;
    }
    if (!isNewRow && hasDataCells(row) && hasDataCells(prev)) {
      isNewRow = true;
    }
    // A lone, very short first cell is a wrapped label, not a row of its own.
    if (isNewRow && filledCount === 1 && row[0] && row[0].length <= 2) {
      isNewRow = false;
    }

    if (isNewRow) {
      merged.push([...row]);
      continue;
    }
    for (let c = 0; c < numCols; c++) {
      if (row[c]) {
        prev[c] = prev[c] ? prev[c] + " " + row[c] : row[c];
      }
    }
  }

  if (merged.length < 2) {
    return merged.map((r) => r.filter((c) => c).join(" ")).join("\n");
  }

  // Header rows: everything before the first row with a digit in the data columns.
  let headerEnd = 0;
  for (let r = 0; r < merged.length; r++) {
    const hasDataValues = merged[r].slice(dataColStart).some((c) => c && /\d/.test(c));
    if (hasDataValues) break;
    headerEnd = r + 1;
  }
  if (headerEnd > 1) {
    const headerRow = Array(numCols).fill("");
    for (let r = 0; r < headerEnd; r++) {
      for (let c = 0; c < numCols; c++) {
        if (merged[r][c]) {
          headerRow[c] = headerRow[c] ? headerRow[c] + " " + merged[r][c] : merged[r][c];
        }
      }
    }
    merged.splice(0, headerEnd, headerRow);
  }

  const escapeCell = (cell) => String(cell ?? "").replace(/\|/g, "\\|");
  const renderRow = (cells) => "| " + cells.map(escapeCell).join(" | ") + " |";
  const md = [
    renderRow(merged[0]),
    "| " + merged[0].map(() => "---").join(" | ") + " |"
  ];
  for (let r = 1; r < merged.length; r++) {
    md.push(renderRow(merged[r]));
  }
  return md.join("\n");
}
|
|
1029
|
+
/**
 * Join the items of one line into a string, left to right.
 *
 * Items are ordered by `x` on a copy, so the caller's array is not reordered.
 * A separator is inserted between neighbours when the horizontal gap between the end of
 * one item and the start of the next exceeds 3; smaller gaps are joined directly.
 *
 * @param {{text: string, x: number, w: number}[]} items Items of a single line.
 * @returns {string} The merged line ("" for an empty line or an item without text).
 */
function mergeLineSimple(items) {
  if (items.length <= 1) return items[0]?.text || "";
  const ordered = [...items].sort((a, b) => a.x - b.x);
  let line = ordered[0].text;
  for (const [idx, item] of ordered.entries()) {
    if (idx === 0) continue;
    const left = ordered[idx - 1];
    const gap = item.x - (left.x + left.w);
    // NOTE(review): both branches append the same single space as this text is shown;
    // the wide-gap separator may originally have been wider — confirm against the package.
    if (gap > 15) line += " ";
    else if (gap > 3) line += " ";
    line += item.text;
  }
  return line;
}
|
|
863
1041
|
/**
 * Post-process extracted PDF text.
 *
 * Removes lines that consist only of a dash-wrapped number (e.g. "- 3 -") or an
 * "n / m" counter, passes the text through mergeKoreanLines, collapses runs of three or
 * more newlines to two, and trims the result.
 *
 * @param {string} text Extracted text.
 * @returns {string} Cleaned text.
 */
function cleanPdfText(text) {
  const withoutPageMarks = text
    .replace(/^[\s]*[-–—]\s*\d+\s*[-–—][\s]*$/gm, "")
    .replace(/^\s*\d+\s*\/\s*\d+\s*$/gm, "");
  const mergedLines = mergeKoreanLines(withoutPageMarks);
  return mergedLines.replace(/\n{3,}/g, "\n\n").trim();
}
|
|
867
1046
|
/**
 * Tell whether a line begins (after leading whitespace) with a list or section marker.
 *
 * @param {string} line A single line of text.
 * @returns {boolean} True when any of the marker patterns matches the start of the line.
 */
function startsWithMarker(line) {
  const head = line.trimStart();
  const markerPatterns = [
    // One Hangul syllable or consonant jamo followed by "." or ")".
    /^[가-힣ㄱ-ㅎ][.)]/,
    // Digits followed by "." or ")".
    /^\d+[.)]/,
    // Hangul and/or digits in parentheses.
    /^\([가-힣ㄱ-ㅎ\d]+\)/,
    // A bullet symbol followed by whitespace.
    /^[○●※▶▷◆◇■□★☆\-·]\s/,
    // "제" + digits + one of 조/항/호/장/절 (e.g. "제3조").
    /^제\d+[조항호장절]/
  ];
  return markerPatterns.some((pattern) => pattern.test(head));
}
|
|
876
1050
|
/**
 * Tell whether a whole line is a short section header.
 *
 * Matches "제" + digits + one of 조/항/호/장/절, optionally followed directly by a
 * parenthesised title, then at most seven whitespace-separated words and nothing else.
 *
 * @param {string} line A single line of text; surrounding whitespace is ignored.
 * @returns {boolean} True when the trimmed line matches the header pattern.
 */
function isStandaloneHeader(line) {
  const candidate = line.trim();
  const headerPattern = /^제\d+[조항호장절](\([^)]*\))?(\s+\S+){0,7}$/;
  return headerPattern.test(candidate);
}
|
|
880
1053
|
function mergeKoreanLines(text) {
|
|
881
1054
|
if (!text) return "";
|
|
@@ -885,8 +1058,7 @@ function mergeKoreanLines(text) {
|
|
|
885
1058
|
for (let i = 1; i < lines.length; i++) {
|
|
886
1059
|
const prev = result[result.length - 1];
|
|
887
1060
|
const curr = lines[i];
|
|
888
|
-
|
|
889
|
-
if (shouldMerge) {
|
|
1061
|
+
if (/[가-힣·,\-]$/.test(prev) && /^[가-힣(]/.test(curr) && !startsWithMarker(curr) && !isStandaloneHeader(prev)) {
|
|
890
1062
|
result[result.length - 1] = prev + " " + curr;
|
|
891
1063
|
} else {
|
|
892
1064
|
result.push(curr);
|
|
@@ -894,39 +1066,6 @@ function mergeKoreanLines(text) {
|
|
|
894
1066
|
}
|
|
895
1067
|
return result.join("\n");
|
|
896
1068
|
}
|
|
897
|
-
function reconstructTables(text) {
|
|
898
|
-
const lines = text.split("\n");
|
|
899
|
-
const result = [];
|
|
900
|
-
let tableBuffer = [];
|
|
901
|
-
for (const line of lines) {
|
|
902
|
-
if (line.includes(" ")) {
|
|
903
|
-
tableBuffer.push(line.split(" ").map((c) => c.trim()));
|
|
904
|
-
} else {
|
|
905
|
-
if (tableBuffer.length >= 2) result.push(formatAsMarkdownTable(tableBuffer));
|
|
906
|
-
else if (tableBuffer.length === 1) result.push(tableBuffer[0].join(" | "));
|
|
907
|
-
tableBuffer = [];
|
|
908
|
-
result.push(line);
|
|
909
|
-
}
|
|
910
|
-
}
|
|
911
|
-
if (tableBuffer.length >= 2) result.push(formatAsMarkdownTable(tableBuffer));
|
|
912
|
-
else if (tableBuffer.length === 1) result.push(tableBuffer[0].join(" | "));
|
|
913
|
-
return result.join("\n");
|
|
914
|
-
}
|
|
915
|
-
function formatAsMarkdownTable(rows) {
|
|
916
|
-
const maxCols = Math.max(...rows.map((r) => r.length));
|
|
917
|
-
const normalized = rows.map((r) => {
|
|
918
|
-
const copy = [...r];
|
|
919
|
-
while (copy.length < maxCols) copy.push("");
|
|
920
|
-
return copy;
|
|
921
|
-
});
|
|
922
|
-
const lines = [];
|
|
923
|
-
lines.push("| " + normalized[0].join(" | ") + " |");
|
|
924
|
-
lines.push("| " + normalized[0].map(() => "---").join(" | ") + " |");
|
|
925
|
-
for (let i = 1; i < normalized.length; i++) {
|
|
926
|
-
lines.push("| " + normalized[i].join(" | ") + " |");
|
|
927
|
-
}
|
|
928
|
-
return lines.join("\n");
|
|
929
|
-
}
|
|
930
1069
|
|
|
931
1070
|
// src/index.ts
|
|
932
1071
|
async function parse(buffer) {
|
|
@@ -977,4 +1116,4 @@ export {
|
|
|
977
1116
|
sanitizeError,
|
|
978
1117
|
parse
|
|
979
1118
|
};
|
|
980
|
-
//# sourceMappingURL=chunk-LHETZ3IN.js.map
|
|
1119
|
+
//# sourceMappingURL=chunk-KCGDEP7Q.js.map
|