kordoc 1.1.1 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +2 -2
- package/dist/{chunk-HJHA6H3F.js → chunk-4BKNDXGU.js} +253 -111
- package/dist/chunk-4BKNDXGU.js.map +1 -0
- package/dist/cli.js +1 -1
- package/dist/index.cjs +252 -110
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +1 -23
- package/dist/index.d.ts +1 -23
- package/dist/index.js +252 -110
- package/dist/index.js.map +1 -1
- package/dist/mcp.js +1 -1
- package/package.json +1 -1
- package/dist/chunk-HJHA6H3F.js.map +0 -1
package/README.md
CHANGED
|
@@ -136,10 +136,10 @@ interface ParseResult {
|
|
|
136
136
|
### Types
|
|
137
137
|
|
|
138
138
|
```typescript
|
|
139
|
-
import type { ParseResult,
|
|
139
|
+
import type { ParseResult, ParseSuccess, ParseFailure, FileType } from "kordoc"
|
|
140
140
|
```
|
|
141
141
|
|
|
142
|
-
> Internal utilities (`KordocError`, `sanitizeError`, `isPathTraversal`, `buildTable`, `blocksToMarkdown
|
|
142
|
+
> Internal types (`IRBlock`, `IRTable`, `IRCell`, `CellContext`) and utilities (`KordocError`, `sanitizeError`, `isPathTraversal`, `buildTable`, `blocksToMarkdown`) are not part of the public API.
|
|
143
143
|
|
|
144
144
|
## Requirements
|
|
145
145
|
|
|
@@ -25,7 +25,7 @@ function detectFormat(buffer) {
|
|
|
25
25
|
}
|
|
26
26
|
|
|
27
27
|
// src/utils.ts
|
|
28
|
-
var VERSION = true ? "1.1.1" : "0.0.0-dev";
|
|
28
|
+
var VERSION = true ? "1.2.0" : "0.0.0-dev";
|
|
29
29
|
function toArrayBuffer(buf) {
|
|
30
30
|
if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
|
|
31
31
|
return buf.buffer;
|
|
@@ -756,126 +756,302 @@ async function loadPdfjs() {
|
|
|
756
756
|
return mod;
|
|
757
757
|
} catch (err) {
|
|
758
758
|
const msg = err instanceof Error ? err.message : String(err);
|
|
759
|
-
if (msg.includes("Cannot find") || msg.includes("MODULE_NOT_FOUND"))
|
|
760
|
-
return null;
|
|
761
|
-
}
|
|
759
|
+
if (msg.includes("Cannot find") || msg.includes("MODULE_NOT_FOUND")) return null;
|
|
762
760
|
throw new KordocError(`pdfjs-dist \uB85C\uB529 \uC2E4\uD328: ${msg}`);
|
|
763
761
|
}
|
|
764
762
|
}
|
|
765
763
|
async function parsePdfDocument(buffer) {
|
|
766
764
|
const pdfjs = await loadPdfjs();
|
|
767
765
|
if (!pdfjs) {
|
|
768
|
-
return {
|
|
769
|
-
success: false,
|
|
770
|
-
fileType: "pdf",
|
|
771
|
-
pageCount: 0,
|
|
772
|
-
error: "pdfjs-dist\uAC00 \uC124\uCE58\uB418\uC9C0 \uC54A\uC558\uC2B5\uB2C8\uB2E4. npm install pdfjs-dist"
|
|
773
|
-
};
|
|
766
|
+
return { success: false, fileType: "pdf", pageCount: 0, error: "pdfjs-dist\uAC00 \uC124\uCE58\uB418\uC9C0 \uC54A\uC558\uC2B5\uB2C8\uB2E4. npm install pdfjs-dist" };
|
|
774
767
|
}
|
|
775
|
-
const data = new Uint8Array(buffer);
|
|
776
768
|
const doc = await pdfjs.getDocument({
|
|
777
|
-
data,
|
|
769
|
+
data: new Uint8Array(buffer),
|
|
778
770
|
useSystemFonts: true,
|
|
779
771
|
disableFontFace: true,
|
|
780
772
|
isEvalSupported: false
|
|
781
773
|
}).promise;
|
|
782
774
|
try {
|
|
783
775
|
const pageCount = doc.numPages;
|
|
784
|
-
if (pageCount === 0) {
|
|
785
|
-
return { success: false, fileType: "pdf", pageCount: 0, error: "PDF\uC5D0 \uD398\uC774\uC9C0\uAC00 \uC5C6\uC2B5\uB2C8\uB2E4." };
|
|
786
|
-
}
|
|
776
|
+
if (pageCount === 0) return { success: false, fileType: "pdf", pageCount: 0, error: "PDF\uC5D0 \uD398\uC774\uC9C0\uAC00 \uC5C6\uC2B5\uB2C8\uB2E4." };
|
|
787
777
|
const pageTexts = [];
|
|
788
778
|
let totalChars = 0;
|
|
789
779
|
let totalTextBytes = 0;
|
|
790
780
|
const effectivePageCount = Math.min(pageCount, MAX_PAGES);
|
|
791
781
|
for (let i = 1; i <= effectivePageCount; i++) {
|
|
792
782
|
const page = await doc.getPage(i);
|
|
793
|
-
const
|
|
794
|
-
const
|
|
795
|
-
const pageText = lines.join("\n");
|
|
783
|
+
const tc = await page.getTextContent();
|
|
784
|
+
const pageText = extractPageContent(tc.items);
|
|
796
785
|
totalChars += pageText.replace(/\s/g, "").length;
|
|
797
786
|
totalTextBytes += pageText.length * 2;
|
|
798
|
-
if (totalTextBytes > MAX_TOTAL_TEXT) throw new KordocError(
|
|
787
|
+
if (totalTextBytes > MAX_TOTAL_TEXT) throw new KordocError("\uD14D\uC2A4\uD2B8 \uCD94\uCD9C \uD06C\uAE30 \uCD08\uACFC");
|
|
799
788
|
pageTexts.push(pageText);
|
|
800
789
|
}
|
|
801
|
-
|
|
802
|
-
|
|
803
|
-
return {
|
|
804
|
-
success: false,
|
|
805
|
-
fileType: "pdf",
|
|
806
|
-
pageCount,
|
|
807
|
-
isImageBased: true,
|
|
808
|
-
error: `\uC774\uBBF8\uC9C0 \uAE30\uBC18 PDF\uB85C \uCD94\uC815\uB429\uB2C8\uB2E4 (${pageCount}\uD398\uC774\uC9C0, \uCD94\uCD9C \uD14D\uC2A4\uD2B8 ${totalChars}\uC790).`
|
|
809
|
-
};
|
|
810
|
-
}
|
|
811
|
-
let markdown = "";
|
|
812
|
-
for (let i = 0; i < pageTexts.length; i++) {
|
|
813
|
-
const cleaned = cleanPdfText(pageTexts[i]);
|
|
814
|
-
if (cleaned.trim()) {
|
|
815
|
-
if (i > 0 && markdown) markdown += "\n\n";
|
|
816
|
-
markdown += cleaned;
|
|
817
|
-
}
|
|
790
|
+
if (totalChars / effectivePageCount < 10) {
|
|
791
|
+
return { success: false, fileType: "pdf", pageCount, isImageBased: true, error: `\uC774\uBBF8\uC9C0 \uAE30\uBC18 PDF (${pageCount}\uD398\uC774\uC9C0, ${totalChars}\uC790)` };
|
|
818
792
|
}
|
|
819
|
-
markdown =
|
|
820
|
-
|
|
821
|
-
return { success: true, fileType: "pdf", markdown, pageCount: effectivePageCount
|
|
793
|
+
let markdown = pageTexts.filter((t) => t.trim()).join("\n\n");
|
|
794
|
+
markdown = cleanPdfText(markdown);
|
|
795
|
+
return { success: true, fileType: "pdf", markdown, pageCount: effectivePageCount };
|
|
822
796
|
} finally {
|
|
823
797
|
await doc.destroy().catch(() => {
|
|
824
798
|
});
|
|
825
799
|
}
|
|
826
800
|
}
|
|
827
|
-
function
|
|
801
|
+
function extractPageContent(rawItems) {
|
|
802
|
+
const items = normalizeItems(rawItems);
|
|
803
|
+
if (items.length === 0) return "";
|
|
804
|
+
const yLines = groupByY(items);
|
|
805
|
+
const columns = detectColumns(yLines);
|
|
806
|
+
if (columns && columns.length >= 3) {
|
|
807
|
+
return extractWithColumns(yLines, columns);
|
|
808
|
+
}
|
|
809
|
+
return yLines.map((line) => mergeLineSimple(line)).join("\n");
|
|
810
|
+
}
|
|
811
|
+
function normalizeItems(rawItems) {
|
|
812
|
+
return rawItems.filter((i) => typeof i.str === "string" && i.str.trim() !== "").map((i) => ({
|
|
813
|
+
text: i.str.trim(),
|
|
814
|
+
x: Math.round(i.transform[4]),
|
|
815
|
+
y: Math.round(i.transform[5]),
|
|
816
|
+
w: Math.round(i.width),
|
|
817
|
+
h: Math.round(i.height)
|
|
818
|
+
})).sort((a, b) => b.y - a.y || a.x - b.x);
|
|
819
|
+
}
|
|
820
|
+
function groupByY(items) {
|
|
828
821
|
if (items.length === 0) return [];
|
|
829
|
-
const textItems = items.filter((item) => typeof item.str === "string" && item.str.trim() !== "");
|
|
830
|
-
if (textItems.length === 0) return [];
|
|
831
|
-
textItems.sort((a, b) => {
|
|
832
|
-
const yDiff = b.transform[5] - a.transform[5];
|
|
833
|
-
if (Math.abs(yDiff) < 2) return a.transform[4] - b.transform[4];
|
|
834
|
-
return yDiff;
|
|
835
|
-
});
|
|
836
822
|
const lines = [];
|
|
837
|
-
let
|
|
838
|
-
let
|
|
839
|
-
for (
|
|
840
|
-
|
|
841
|
-
|
|
842
|
-
|
|
843
|
-
|
|
844
|
-
|
|
845
|
-
|
|
846
|
-
|
|
847
|
-
|
|
848
|
-
if (currentLine.length > 0) lines.push(mergeLineItems(currentLine));
|
|
823
|
+
let curY = items[0].y;
|
|
824
|
+
let curLine = [items[0]];
|
|
825
|
+
for (let i = 1; i < items.length; i++) {
|
|
826
|
+
if (Math.abs(items[i].y - curY) > 3) {
|
|
827
|
+
lines.push(curLine);
|
|
828
|
+
curLine = [];
|
|
829
|
+
curY = items[i].y;
|
|
830
|
+
}
|
|
831
|
+
curLine.push(items[i]);
|
|
832
|
+
}
|
|
833
|
+
if (curLine.length > 0) lines.push(curLine);
|
|
849
834
|
return lines;
|
|
850
835
|
}
|
|
851
|
-
function
|
|
836
|
+
function isProseSpread(items) {
|
|
837
|
+
if (items.length < 4) return false;
|
|
838
|
+
const sorted = [...items].sort((a, b) => a.x - b.x);
|
|
839
|
+
const gaps = [];
|
|
840
|
+
for (let i = 1; i < sorted.length; i++) {
|
|
841
|
+
gaps.push(sorted[i].x - (sorted[i - 1].x + sorted[i - 1].w));
|
|
842
|
+
}
|
|
843
|
+
const maxGap = Math.max(...gaps);
|
|
844
|
+
const avgLen = items.reduce((s, i) => s + i.text.length, 0) / items.length;
|
|
845
|
+
return maxGap < 40 && avgLen < 5;
|
|
846
|
+
}
|
|
847
|
+
function detectColumns(yLines) {
|
|
848
|
+
const allItems = yLines.flat();
|
|
849
|
+
if (allItems.length === 0) return null;
|
|
850
|
+
const pageWidth = Math.max(...allItems.map((i) => i.x + i.w)) - Math.min(...allItems.map((i) => i.x));
|
|
851
|
+
if (pageWidth < 100) return null;
|
|
852
|
+
let bigoLineIdx = -1;
|
|
853
|
+
for (let i = 0; i < yLines.length; i++) {
|
|
854
|
+
if (yLines[i].length <= 2 && yLines[i].some((item) => item.text === "\uBE44\uACE0")) {
|
|
855
|
+
bigoLineIdx = i;
|
|
856
|
+
break;
|
|
857
|
+
}
|
|
858
|
+
}
|
|
859
|
+
const tableYLines = bigoLineIdx >= 0 ? yLines.slice(0, bigoLineIdx) : yLines;
|
|
860
|
+
const CLUSTER_TOL = 22;
|
|
861
|
+
const xClusters = [];
|
|
862
|
+
for (const line of tableYLines) {
|
|
863
|
+
if (isProseSpread(line)) continue;
|
|
864
|
+
for (const item of line) {
|
|
865
|
+
let found = false;
|
|
866
|
+
for (const c of xClusters) {
|
|
867
|
+
if (Math.abs(item.x - c.center) <= CLUSTER_TOL) {
|
|
868
|
+
c.center = Math.round((c.center * c.count + item.x) / (c.count + 1));
|
|
869
|
+
c.minX = Math.min(c.minX, item.x);
|
|
870
|
+
c.count++;
|
|
871
|
+
found = true;
|
|
872
|
+
break;
|
|
873
|
+
}
|
|
874
|
+
}
|
|
875
|
+
if (!found) {
|
|
876
|
+
xClusters.push({ center: item.x, count: 1, minX: item.x });
|
|
877
|
+
}
|
|
878
|
+
}
|
|
879
|
+
}
|
|
880
|
+
const peaks = xClusters.filter((c) => c.count >= 3).sort((a, b) => a.minX - b.minX);
|
|
881
|
+
if (peaks.length < 3) return null;
|
|
882
|
+
const MERGE_TOL = 30;
|
|
883
|
+
const merged = [peaks[0]];
|
|
884
|
+
for (let i = 1; i < peaks.length; i++) {
|
|
885
|
+
const prev = merged[merged.length - 1];
|
|
886
|
+
if (peaks[i].minX - prev.minX < MERGE_TOL) {
|
|
887
|
+
if (peaks[i].count > prev.count) {
|
|
888
|
+
prev.center = peaks[i].center;
|
|
889
|
+
}
|
|
890
|
+
prev.count += peaks[i].count;
|
|
891
|
+
prev.minX = Math.min(prev.minX, peaks[i].minX);
|
|
892
|
+
} else {
|
|
893
|
+
merged.push({ ...peaks[i] });
|
|
894
|
+
}
|
|
895
|
+
}
|
|
896
|
+
const columns = merged.filter((c) => c.count >= 3).map((c) => c.minX);
|
|
897
|
+
return columns.length >= 3 ? columns : null;
|
|
898
|
+
}
|
|
899
|
+
function findColumn(x, columns) {
|
|
900
|
+
for (let i = columns.length - 1; i >= 0; i--) {
|
|
901
|
+
if (x >= columns[i] - 10) return i;
|
|
902
|
+
}
|
|
903
|
+
return 0;
|
|
904
|
+
}
|
|
905
|
+
function extractWithColumns(yLines, columns) {
|
|
906
|
+
const result = [];
|
|
907
|
+
const colMin = columns[0];
|
|
908
|
+
const colMax = columns[columns.length - 1];
|
|
909
|
+
let bigoIdx = -1;
|
|
910
|
+
for (let i = 0; i < yLines.length; i++) {
|
|
911
|
+
if (yLines[i].length <= 2 && yLines[i].some((item) => item.text === "\uBE44\uACE0")) {
|
|
912
|
+
bigoIdx = i;
|
|
913
|
+
break;
|
|
914
|
+
}
|
|
915
|
+
}
|
|
916
|
+
let tableStart = -1;
|
|
917
|
+
for (let i = 0; i < (bigoIdx >= 0 ? bigoIdx : yLines.length); i++) {
|
|
918
|
+
const usedCols = new Set(yLines[i].map((item) => findColumn(item.x, columns)));
|
|
919
|
+
if (usedCols.size >= 3) {
|
|
920
|
+
tableStart = i;
|
|
921
|
+
break;
|
|
922
|
+
}
|
|
923
|
+
}
|
|
924
|
+
const tableEnd = bigoIdx >= 0 ? bigoIdx : yLines.length;
|
|
925
|
+
for (let i = 0; i < (tableStart >= 0 ? tableStart : tableEnd); i++) {
|
|
926
|
+
result.push(mergeLineSimple(yLines[i]));
|
|
927
|
+
}
|
|
928
|
+
if (tableStart >= 0) {
|
|
929
|
+
const tableLines = yLines.slice(tableStart, tableEnd);
|
|
930
|
+
const gridLines = [];
|
|
931
|
+
for (const line of tableLines) {
|
|
932
|
+
const inRange = line.some(
|
|
933
|
+
(item) => item.x >= colMin - 20 && item.x <= colMax + 200
|
|
934
|
+
);
|
|
935
|
+
if (inRange && !isProseSpread(line)) {
|
|
936
|
+
gridLines.push(line);
|
|
937
|
+
} else {
|
|
938
|
+
if (gridLines.length > 0) {
|
|
939
|
+
result.push(buildGridTable(gridLines.splice(0), columns));
|
|
940
|
+
}
|
|
941
|
+
result.push(mergeLineSimple(line));
|
|
942
|
+
}
|
|
943
|
+
}
|
|
944
|
+
if (gridLines.length > 0) {
|
|
945
|
+
result.push(buildGridTable(gridLines, columns));
|
|
946
|
+
}
|
|
947
|
+
}
|
|
948
|
+
if (bigoIdx >= 0) {
|
|
949
|
+
result.push("");
|
|
950
|
+
for (let i = bigoIdx; i < yLines.length; i++) {
|
|
951
|
+
result.push(mergeLineSimple(yLines[i]));
|
|
952
|
+
}
|
|
953
|
+
}
|
|
954
|
+
return result.join("\n");
|
|
955
|
+
}
|
|
956
|
+
function buildGridTable(lines, columns) {
|
|
957
|
+
const numCols = columns.length;
|
|
958
|
+
const yRows = lines.map((items) => {
|
|
959
|
+
const row = Array(numCols).fill("");
|
|
960
|
+
for (const item of items) {
|
|
961
|
+
const col = findColumn(item.x, columns);
|
|
962
|
+
row[col] = row[col] ? row[col] + " " + item.text : item.text;
|
|
963
|
+
}
|
|
964
|
+
return row;
|
|
965
|
+
});
|
|
966
|
+
const dataColStart = Math.max(2, Math.floor(numCols / 2));
|
|
967
|
+
const merged = [];
|
|
968
|
+
for (const row of yRows) {
|
|
969
|
+
if (row.every((c) => c === "")) continue;
|
|
970
|
+
if (merged.length === 0) {
|
|
971
|
+
merged.push([...row]);
|
|
972
|
+
continue;
|
|
973
|
+
}
|
|
974
|
+
const prev = merged[merged.length - 1];
|
|
975
|
+
const filledCols = row.map((c, i) => c ? i : -1).filter((i) => i >= 0);
|
|
976
|
+
const filledCount = filledCols.length;
|
|
977
|
+
let isNewRow = false;
|
|
978
|
+
if (row[0] && row[0].length >= 3) {
|
|
979
|
+
isNewRow = true;
|
|
980
|
+
}
|
|
981
|
+
if (!isNewRow && numCols > 1 && row[1]) {
|
|
982
|
+
isNewRow = true;
|
|
983
|
+
}
|
|
984
|
+
if (!isNewRow) {
|
|
985
|
+
const hasData = row.slice(dataColStart).some((c) => c !== "");
|
|
986
|
+
const prevHasData = prev.slice(dataColStart).some((c) => c !== "");
|
|
987
|
+
if (hasData && prevHasData) {
|
|
988
|
+
isNewRow = true;
|
|
989
|
+
}
|
|
990
|
+
}
|
|
991
|
+
if (isNewRow && filledCount === 1 && row[0] && row[0].length <= 2) {
|
|
992
|
+
isNewRow = false;
|
|
993
|
+
}
|
|
994
|
+
if (isNewRow) {
|
|
995
|
+
merged.push([...row]);
|
|
996
|
+
} else {
|
|
997
|
+
for (let c = 0; c < numCols; c++) {
|
|
998
|
+
if (row[c]) {
|
|
999
|
+
prev[c] = prev[c] ? prev[c] + " " + row[c] : row[c];
|
|
1000
|
+
}
|
|
1001
|
+
}
|
|
1002
|
+
}
|
|
1003
|
+
}
|
|
1004
|
+
if (merged.length < 2) {
|
|
1005
|
+
return merged.map((r) => r.filter((c) => c).join(" ")).join("\n");
|
|
1006
|
+
}
|
|
1007
|
+
let headerEnd = 0;
|
|
1008
|
+
for (let r = 0; r < merged.length; r++) {
|
|
1009
|
+
const hasDataValues = merged[r].slice(dataColStart).some((c) => c && /\d/.test(c));
|
|
1010
|
+
if (hasDataValues) break;
|
|
1011
|
+
headerEnd = r + 1;
|
|
1012
|
+
}
|
|
1013
|
+
if (headerEnd > 1) {
|
|
1014
|
+
const headerRow = Array(numCols).fill("");
|
|
1015
|
+
for (let r = 0; r < headerEnd; r++) {
|
|
1016
|
+
for (let c = 0; c < numCols; c++) {
|
|
1017
|
+
if (merged[r][c]) {
|
|
1018
|
+
headerRow[c] = headerRow[c] ? headerRow[c] + " " + merged[r][c] : merged[r][c];
|
|
1019
|
+
}
|
|
1020
|
+
}
|
|
1021
|
+
}
|
|
1022
|
+
merged.splice(0, headerEnd, headerRow);
|
|
1023
|
+
}
|
|
1024
|
+
const md = [];
|
|
1025
|
+
md.push("| " + merged[0].join(" | ") + " |");
|
|
1026
|
+
md.push("| " + merged[0].map(() => "---").join(" | ") + " |");
|
|
1027
|
+
for (let r = 1; r < merged.length; r++) {
|
|
1028
|
+
md.push("| " + merged[r].join(" | ") + " |");
|
|
1029
|
+
}
|
|
1030
|
+
return md.join("\n");
|
|
1031
|
+
}
|
|
1032
|
+
function mergeLineSimple(items) {
|
|
852
1033
|
if (items.length <= 1) return items[0]?.text || "";
|
|
853
|
-
items.sort((a, b) => a.x - b.x);
|
|
854
|
-
let result =
|
|
855
|
-
for (let i = 1; i <
|
|
856
|
-
const gap =
|
|
1034
|
+
const sorted = [...items].sort((a, b) => a.x - b.x);
|
|
1035
|
+
let result = sorted[0].text;
|
|
1036
|
+
for (let i = 1; i < sorted.length; i++) {
|
|
1037
|
+
const gap = sorted[i].x - (sorted[i - 1].x + sorted[i - 1].w);
|
|
857
1038
|
if (gap > 15) result += " ";
|
|
858
1039
|
else if (gap > 3) result += " ";
|
|
859
|
-
result +=
|
|
1040
|
+
result += sorted[i].text;
|
|
860
1041
|
}
|
|
861
1042
|
return result;
|
|
862
1043
|
}
|
|
863
1044
|
function cleanPdfText(text) {
|
|
864
|
-
|
|
865
|
-
|
|
1045
|
+
return mergeKoreanLines(
|
|
1046
|
+
text.replace(/^[\s]*[-–—]\s*\d+\s*[-–—][\s]*$/gm, "").replace(/^\s*\d+\s*\/\s*\d+\s*$/gm, "")
|
|
1047
|
+
).replace(/\n{3,}/g, "\n\n").trim();
|
|
866
1048
|
}
|
|
867
1049
|
function startsWithMarker(line) {
|
|
868
1050
|
const t = line.trimStart();
|
|
869
|
-
|
|
870
|
-
if (/^\d+[.)]/.test(t)) return true;
|
|
871
|
-
if (/^\([가-힣ㄱ-ㅎ\d]+\)/.test(t)) return true;
|
|
872
|
-
if (/^[○●※▶▷◆◇■□★☆\-·]\s/.test(t)) return true;
|
|
873
|
-
if (/^제\d+[조항호장절]/.test(t)) return true;
|
|
874
|
-
return false;
|
|
1051
|
+
return /^[가-힣ㄱ-ㅎ][.)]/.test(t) || /^\d+[.)]/.test(t) || /^\([가-힣ㄱ-ㅎ\d]+\)/.test(t) || /^[○●※▶▷◆◇■□★☆\-·]\s/.test(t) || /^제\d+[조항호장절]/.test(t);
|
|
875
1052
|
}
|
|
876
1053
|
function isStandaloneHeader(line) {
|
|
877
|
-
|
|
878
|
-
return /^제\d+[조항호장절](\([^)]*\))?(\s+\S+){0,4}$/.test(t);
|
|
1054
|
+
return /^제\d+[조항호장절](\([^)]*\))?(\s+\S+){0,7}$/.test(line.trim());
|
|
879
1055
|
}
|
|
880
1056
|
function mergeKoreanLines(text) {
|
|
881
1057
|
if (!text) return "";
|
|
@@ -885,8 +1061,7 @@ function mergeKoreanLines(text) {
|
|
|
885
1061
|
for (let i = 1; i < lines.length; i++) {
|
|
886
1062
|
const prev = result[result.length - 1];
|
|
887
1063
|
const curr = lines[i];
|
|
888
|
-
|
|
889
|
-
if (shouldMerge) {
|
|
1064
|
+
if (/[가-힣·,\-]$/.test(prev) && /^[가-힣(]/.test(curr) && !startsWithMarker(curr) && !isStandaloneHeader(prev)) {
|
|
890
1065
|
result[result.length - 1] = prev + " " + curr;
|
|
891
1066
|
} else {
|
|
892
1067
|
result.push(curr);
|
|
@@ -894,39 +1069,6 @@ function mergeKoreanLines(text) {
|
|
|
894
1069
|
}
|
|
895
1070
|
return result.join("\n");
|
|
896
1071
|
}
|
|
897
|
-
function reconstructTables(text) {
|
|
898
|
-
const lines = text.split("\n");
|
|
899
|
-
const result = [];
|
|
900
|
-
let tableBuffer = [];
|
|
901
|
-
for (const line of lines) {
|
|
902
|
-
if (line.includes(" ")) {
|
|
903
|
-
tableBuffer.push(line.split(" ").map((c) => c.trim()));
|
|
904
|
-
} else {
|
|
905
|
-
if (tableBuffer.length >= 2) result.push(formatAsMarkdownTable(tableBuffer));
|
|
906
|
-
else if (tableBuffer.length === 1) result.push(tableBuffer[0].join(" | "));
|
|
907
|
-
tableBuffer = [];
|
|
908
|
-
result.push(line);
|
|
909
|
-
}
|
|
910
|
-
}
|
|
911
|
-
if (tableBuffer.length >= 2) result.push(formatAsMarkdownTable(tableBuffer));
|
|
912
|
-
else if (tableBuffer.length === 1) result.push(tableBuffer[0].join(" | "));
|
|
913
|
-
return result.join("\n");
|
|
914
|
-
}
|
|
915
|
-
function formatAsMarkdownTable(rows) {
|
|
916
|
-
const maxCols = Math.max(...rows.map((r) => r.length));
|
|
917
|
-
const normalized = rows.map((r) => {
|
|
918
|
-
const copy = [...r];
|
|
919
|
-
while (copy.length < maxCols) copy.push("");
|
|
920
|
-
return copy;
|
|
921
|
-
});
|
|
922
|
-
const lines = [];
|
|
923
|
-
lines.push("| " + normalized[0].join(" | ") + " |");
|
|
924
|
-
lines.push("| " + normalized[0].map(() => "---").join(" | ") + " |");
|
|
925
|
-
for (let i = 1; i < normalized.length; i++) {
|
|
926
|
-
lines.push("| " + normalized[i].join(" | ") + " |");
|
|
927
|
-
}
|
|
928
|
-
return lines.join("\n");
|
|
929
|
-
}
|
|
930
1072
|
|
|
931
1073
|
// src/index.ts
|
|
932
1074
|
async function parse(buffer) {
|
|
@@ -977,4 +1119,4 @@ export {
|
|
|
977
1119
|
sanitizeError,
|
|
978
1120
|
parse
|
|
979
1121
|
};
|
|
980
|
-
//# sourceMappingURL=chunk-HJHA6H3F.js.map
|
|
1122
|
+
//# sourceMappingURL=chunk-4BKNDXGU.js.map
|