kordoc 1.1.2 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -25,7 +25,7 @@ function detectFormat(buffer) {
25
25
  }
26
26
 
27
27
  // src/utils.ts
28
- var VERSION = true ? "1.1.2" : "0.0.0-dev";
28
+ var VERSION = true ? "1.3.0" : "0.0.0-dev";
29
29
  function toArrayBuffer(buf) {
30
30
  if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
31
31
  return buf.buffer;
@@ -739,143 +739,316 @@ function arrangeCells(rows, cols, cells) {
739
739
  return grid.map((row) => row.map((c) => c || { text: "", colSpan: 1, rowSpan: 1 }));
740
740
  }
741
741
 
742
+ // src/pdf/polyfill.ts
743
+ import * as pdfjsWorker from "pdfjs-dist/legacy/build/pdf.worker.mjs";
744
// Install minimal stand-ins for DOMMatrix and Path2D, but only when the
// runtime does not already define them.
var g = globalThis;
{
  const stubFactories = {
    // Holds the matrix passed to the constructor, or the identity
    // [1, 0, 0, 1, 0, 0] when none is given. No matrix operations are provided.
    DOMMatrix: () => class DOMMatrix {
      constructor(init) {
        this.m = init ? init : [1, 0, 0, 1, 0, 0];
      }
    },
    // Empty placeholder class.
    Path2D: () => class Path2D {
    }
  };
  for (const [globalName, createStub] of Object.entries(stubFactories)) {
    if (typeof g[globalName] === "undefined") {
      g[globalName] = createStub();
    }
  }
}
// Expose the imported worker module on the global object.
// NOTE(review): presumably this lets pdf.js run its worker code in-process
// (the parser sets workerSrc to "") — confirm against the pdf.js docs.
g.pdfjsWorker = pdfjsWorker;
758
+
742
759
  // src/pdf/parser.ts
743
- import { createRequire as createRequire2 } from "module";
744
- import { pathToFileURL } from "url";
760
+ import { getDocument, GlobalWorkerOptions } from "pdfjs-dist/legacy/build/pdf.mjs";
761
+ GlobalWorkerOptions.workerSrc = "";
745
762
  var MAX_PAGES = 5e3;
746
763
  var MAX_TOTAL_TEXT = 100 * 1024 * 1024;
747
- var pdfjsModule = null;
748
- async function loadPdfjs() {
749
- if (pdfjsModule) return pdfjsModule;
750
- try {
751
- const mod = await import("pdfjs-dist/legacy/build/pdf.mjs");
752
- const req = createRequire2(import.meta.url);
753
- const workerPath = req.resolve("pdfjs-dist/legacy/build/pdf.worker.mjs");
754
- mod.GlobalWorkerOptions.workerSrc = pathToFileURL(workerPath).href;
755
- pdfjsModule = mod;
756
- return mod;
757
- } catch (err) {
758
- const msg = err instanceof Error ? err.message : String(err);
759
- if (msg.includes("Cannot find") || msg.includes("MODULE_NOT_FOUND")) {
760
- return null;
761
- }
762
- throw new KordocError(`pdfjs-dist \uB85C\uB529 \uC2E4\uD328: ${msg}`);
763
- }
764
- }
765
764
  async function parsePdfDocument(buffer) {
766
- const pdfjs = await loadPdfjs();
767
- if (!pdfjs) {
768
- return {
769
- success: false,
770
- fileType: "pdf",
771
- pageCount: 0,
772
- error: "pdfjs-dist\uAC00 \uC124\uCE58\uB418\uC9C0 \uC54A\uC558\uC2B5\uB2C8\uB2E4. npm install pdfjs-dist"
773
- };
774
- }
775
- const data = new Uint8Array(buffer);
776
- const doc = await pdfjs.getDocument({
777
- data,
765
+ const doc = await getDocument({
766
+ data: new Uint8Array(buffer),
778
767
  useSystemFonts: true,
779
768
  disableFontFace: true,
780
769
  isEvalSupported: false
781
770
  }).promise;
782
771
  try {
783
772
  const pageCount = doc.numPages;
784
- if (pageCount === 0) {
785
- return { success: false, fileType: "pdf", pageCount: 0, error: "PDF\uC5D0 \uD398\uC774\uC9C0\uAC00 \uC5C6\uC2B5\uB2C8\uB2E4." };
786
- }
773
+ if (pageCount === 0) return { success: false, fileType: "pdf", pageCount: 0, error: "PDF\uC5D0 \uD398\uC774\uC9C0\uAC00 \uC5C6\uC2B5\uB2C8\uB2E4." };
787
774
  const pageTexts = [];
788
775
  let totalChars = 0;
789
776
  let totalTextBytes = 0;
790
777
  const effectivePageCount = Math.min(pageCount, MAX_PAGES);
791
778
  for (let i = 1; i <= effectivePageCount; i++) {
792
779
  const page = await doc.getPage(i);
793
- const textContent = await page.getTextContent();
794
- const lines = groupTextItemsByLine(textContent.items);
795
- const pageText = lines.join("\n");
780
+ const tc = await page.getTextContent();
781
+ const pageText = extractPageContent(tc.items);
796
782
  totalChars += pageText.replace(/\s/g, "").length;
797
783
  totalTextBytes += pageText.length * 2;
798
- if (totalTextBytes > MAX_TOTAL_TEXT) throw new KordocError(`\uD14D\uC2A4\uD2B8 \uCD94\uCD9C \uD06C\uAE30 \uCD08\uACFC (${MAX_TOTAL_TEXT / 1024 / 1024}MB \uC81C\uD55C)`);
784
+ if (totalTextBytes > MAX_TOTAL_TEXT) throw new KordocError("\uD14D\uC2A4\uD2B8 \uCD94\uCD9C \uD06C\uAE30 \uCD08\uACFC");
799
785
  pageTexts.push(pageText);
800
786
  }
801
- const avgCharsPerPage = totalChars / effectivePageCount;
802
- if (avgCharsPerPage < 10) {
803
- return {
804
- success: false,
805
- fileType: "pdf",
806
- pageCount,
807
- isImageBased: true,
808
- error: `\uC774\uBBF8\uC9C0 \uAE30\uBC18 PDF\uB85C \uCD94\uC815\uB429\uB2C8\uB2E4 (${pageCount}\uD398\uC774\uC9C0, \uCD94\uCD9C \uD14D\uC2A4\uD2B8 ${totalChars}\uC790).`
809
- };
810
- }
811
- let markdown = "";
812
- for (let i = 0; i < pageTexts.length; i++) {
813
- const cleaned = cleanPdfText(pageTexts[i]);
814
- if (cleaned.trim()) {
815
- if (i > 0 && markdown) markdown += "\n\n";
816
- markdown += cleaned;
817
- }
787
+ if (totalChars / effectivePageCount < 10) {
788
+ return { success: false, fileType: "pdf", pageCount, isImageBased: true, error: `\uC774\uBBF8\uC9C0 \uAE30\uBC18 PDF (${pageCount}\uD398\uC774\uC9C0, ${totalChars}\uC790)` };
818
789
  }
819
- markdown = reconstructTables(markdown);
820
- const truncated = pageCount > MAX_PAGES;
821
- return { success: true, fileType: "pdf", markdown, pageCount: effectivePageCount, isImageBased: false, ...truncated && { warning: `PDF\uAC00 ${pageCount}\uD398\uC774\uC9C0\uC774\uC9C0\uB9CC ${MAX_PAGES}\uD398\uC774\uC9C0\uAE4C\uC9C0\uB9CC \uCC98\uB9AC\uD588\uC2B5\uB2C8\uB2E4` } };
790
+ let markdown = pageTexts.filter((t) => t.trim()).join("\n\n");
791
+ markdown = cleanPdfText(markdown);
792
+ return { success: true, fileType: "pdf", markdown, pageCount: effectivePageCount };
822
793
  } finally {
823
794
  await doc.destroy().catch(() => {
824
795
  });
825
796
  }
826
797
  }
827
- function groupTextItemsByLine(items) {
798
/**
 * Convert one page's raw text items into text.
 *
 * Items are normalized and grouped into lines by y position. When column
 * detection yields at least three columns the page goes through the
 * column-aware extractor; otherwise each line is merged left-to-right and the
 * lines are joined with newlines.
 *
 * @param {Array<object>} rawItems - Text items as returned by getTextContent().
 * @returns {string} Page text; "" when no usable items remain.
 */
function extractPageContent(rawItems) {
  const items = normalizeItems(rawItems);
  if (!items.length) {
    return "";
  }
  const yLines = groupByY(items);
  const columns = detectColumns(yLines);
  const isTabular = Boolean(columns) && columns.length >= 3;
  return isTabular
    ? extractWithColumns(yLines, columns)
    : yLines.map((line) => mergeLineSimple(line)).join("\n");
}
808
/**
 * Reduce raw text items to `{ text, x, y, w, h }` records.
 *
 * Entries whose `str` is not a string, or is blank after trimming, are
 * dropped. Position (`transform[4]`, `transform[5]`) and size (`width`,
 * `height`) are rounded to integers. The result is ordered by descending y,
 * then ascending x.
 *
 * @param {Array<object>} rawItems - Raw items with `str`, `transform`, `width`, `height`.
 * @returns {Array<{text: string, x: number, y: number, w: number, h: number}>}
 */
function normalizeItems(rawItems) {
  const normalized = [];
  for (const raw of rawItems) {
    if (typeof raw.str !== "string") continue;
    const text = raw.str.trim();
    if (text === "") continue;
    normalized.push({
      text,
      x: Math.round(raw.transform[4]),
      y: Math.round(raw.transform[5]),
      w: Math.round(raw.width),
      h: Math.round(raw.height)
    });
  }
  normalized.sort((a, b) => {
    const byRow = b.y - a.y;
    return byRow || a.x - b.x;
  });
  return normalized;
}
817
/**
 * Split items into consecutive lines by y position.
 *
 * Items are consumed in the order given. A new line starts whenever an item's
 * y differs by more than 3 from the y of the item that opened the current
 * line (the anchor does not drift within a line).
 *
 * @param {Array<{y: number}>} items - Items in the order they should be scanned.
 * @returns {Array<Array<object>>} Lines, each a non-empty array of items.
 */
function groupByY(items) {
  const lines = [];
  let anchorY;
  let bucket = null;
  for (const item of items) {
    const startsNewLine = bucket === null || Math.abs(item.y - anchorY) > 3;
    if (startsNewLine) {
      if (bucket !== null) lines.push(bucket);
      bucket = [];
      anchorY = item.y;
    }
    bucket.push(item);
  }
  if (bucket !== null) lines.push(bucket);
  return lines;
}
851
- function mergeLineItems(items) {
833
/**
 * Test whether a line consists of many short, closely spaced fragments.
 *
 * True only when the line has at least four items, the widest horizontal gap
 * between x-adjacent items is under 40, and the mean text length is under 5.
 *
 * @param {Array<{text: string, x: number, w: number}>} items - One line of items.
 * @returns {boolean}
 */
function isProseSpread(items) {
  if (items.length < 4) {
    return false;
  }
  const byX = items.slice().sort((left, right) => left.x - right.x);
  let widestGap = -Infinity;
  for (let idx = 1; idx < byX.length; idx++) {
    const before = byX[idx - 1];
    widestGap = Math.max(widestGap, byX[idx].x - (before.x + before.w));
  }
  let totalLength = 0;
  for (const item of items) {
    totalLength += item.text.length;
  }
  const meanLength = totalLength / items.length;
  return widestGap < 40 && meanLength < 5;
}
844
/**
 * Infer table column positions from the x coordinates of a page's items.
 *
 * Steps:
 *  1. Give up when the page has no items or the horizontal extent of all
 *     items is under 100.
 *  2. Only lines before the first "비고" (Korean: remarks) line are
 *     considered; such a line has at most two items, one of them exactly "비고".
 *  3. Item x positions (skipping lines flagged by isProseSpread) are clustered
 *     with a tolerance of 22 around a running mean centre.
 *  4. Clusters with at least 3 members are sorted by their minimum x, and
 *     neighbours whose minimum x values are less than 30 apart are merged.
 *
 * @param {Array<Array<{text: string, x: number, w: number}>>} yLines - Page lines.
 * @returns {number[] | null} Ascending column left edges (at least three), or
 *   null when no table-like layout is detected.
 */
function detectColumns(yLines) {
  const allItems = yLines.flat();
  if (allItems.length === 0) return null;

  // Single pass instead of Math.max(...array): spreading every item of a very
  // dense page into call arguments can exceed the engine's argument limit and
  // throw a RangeError.
  let leftEdge = Infinity;
  let rightEdge = -Infinity;
  for (const item of allItems) {
    leftEdge = Math.min(leftEdge, item.x);
    rightEdge = Math.max(rightEdge, item.x + item.w);
  }
  const pageWidth = rightEdge - leftEdge;
  if (pageWidth < 100) return null;

  const bigoLineIdx = yLines.findIndex(
    (line) => line.length <= 2 && line.some((item) => item.text === "\uBE44\uACE0")
  );
  const tableYLines = bigoLineIdx >= 0 ? yLines.slice(0, bigoLineIdx) : yLines;

  const CLUSTER_TOL = 22;
  const xClusters = [];
  for (const line of tableYLines) {
    if (isProseSpread(line)) continue;
    for (const item of line) {
      // First cluster within tolerance wins, matching scan order.
      const cluster = xClusters.find((c) => Math.abs(item.x - c.center) <= CLUSTER_TOL);
      if (cluster) {
        cluster.center = Math.round((cluster.center * cluster.count + item.x) / (cluster.count + 1));
        cluster.minX = Math.min(cluster.minX, item.x);
        cluster.count++;
      } else {
        xClusters.push({ center: item.x, count: 1, minX: item.x });
      }
    }
  }

  const peaks = xClusters.filter((c) => c.count >= 3).sort((a, b) => a.minX - b.minX);
  if (peaks.length < 3) return null;

  const MERGE_TOL = 30;
  const merged = [{ ...peaks[0] }];
  for (let i = 1; i < peaks.length; i++) {
    const prev = merged[merged.length - 1];
    if (peaks[i].minX - prev.minX < MERGE_TOL) {
      // Keep the centre of whichever side has more members.
      if (peaks[i].count > prev.count) {
        prev.center = peaks[i].center;
      }
      prev.count += peaks[i].count;
      prev.minX = Math.min(prev.minX, peaks[i].minX);
    } else {
      merged.push({ ...peaks[i] });
    }
  }

  const columns = merged.filter((c) => c.count >= 3).map((c) => c.minX);
  return columns.length >= 3 ? columns : null;
}
896
/**
 * Map an x coordinate to a column index.
 *
 * Returns the right-most column whose left edge, relaxed by 10 to the left,
 * is at or before `x`. Falls back to column 0 when none qualifies.
 *
 * @param {number} x - Item x coordinate.
 * @param {number[]} columns - Ascending column left edges.
 * @returns {number} Column index.
 */
function findColumn(x, columns) {
  let idx = columns.length - 1;
  // Index 0 is the fallback, so only indices above it need testing.
  while (idx > 0) {
    if (columns[idx] - 10 <= x) return idx;
    idx--;
  }
  return 0;
}
902
/**
 * Render a page that contains a column-aligned table.
 *
 * The page is split into up to three regions:
 *  - leading lines before the first line that touches at least 3 distinct
 *    columns, emitted as plain merged lines;
 *  - the table region, running up to the first "비고" (Korean: remarks) line
 *    or the end of the page. Runs of lines that overlap the column range and
 *    are not flagged by isProseSpread are rendered by buildGridTable; other
 *    lines are emitted as plain text between tables;
 *  - the "비고" line and everything after it, emitted as plain lines after an
 *    empty line.
 *
 * @param {Array<Array<object>>} yLines - Page lines, top to bottom.
 * @param {number[]} columns - Column left edges from detectColumns.
 * @returns {string} Page text with lines joined by "\n".
 */
function extractWithColumns(yLines, columns) {
  const isRemarksLine = (line) => line.length <= 2 && line.some((item) => item.text === "\uBE44\uACE0");
  const bigoIdx = yLines.findIndex(isRemarksLine);
  const tableEnd = bigoIdx >= 0 ? bigoIdx : yLines.length;
  const beforeRemarks = yLines.slice(0, tableEnd);

  const tableStart = beforeRemarks.findIndex((line) => {
    const usedCols = new Set(line.map((item) => findColumn(item.x, columns)));
    return usedCols.size >= 3;
  });

  const preambleEnd = tableStart >= 0 ? tableStart : tableEnd;
  const out = beforeRemarks.slice(0, preambleEnd).map((line) => mergeLineSimple(line));

  if (tableStart >= 0) {
    const leftBound = columns[0] - 20;
    const rightBound = columns[columns.length - 1] + 200;
    let pending = [];
    const flushTable = () => {
      if (pending.length > 0) {
        out.push(buildGridTable(pending, columns));
        pending = [];
      }
    };
    for (const line of beforeRemarks.slice(tableStart)) {
      const touchesGrid = line.some((item) => item.x >= leftBound && item.x <= rightBound);
      if (touchesGrid && !isProseSpread(line)) {
        pending.push(line);
      } else {
        flushTable();
        out.push(mergeLineSimple(line));
      }
    }
    flushTable();
  }

  if (bigoIdx >= 0) {
    out.push("");
    for (const line of yLines.slice(bigoIdx)) {
      out.push(mergeLineSimple(line));
    }
  }
  return out.join("\n");
}
953
/**
 * Render a run of table lines as a Markdown table.
 *
 * Steps:
 *  1. Each line becomes a row of `columns.length` cells. Items are placed by
 *     findColumn in left-to-right order; several items in one cell are joined
 *     with a space.
 *  2. Rows are merged into logical rows. A row starts a new logical row when
 *     its first cell has at least 3 characters, or its second cell is
 *     non-empty, or both it and the previous logical row have content in the
 *     "data" columns (index >= max(2, floor(numCols / 2))). Otherwise its
 *     cells are appended to the previous logical row, which handles wrapped
 *     cell text.
 *  3. Leading rows with no digit in any data column are treated as header
 *     rows and collapsed into a single header row.
 *  4. The table is emitted with `|` inside cells escaped as `\|`, so cell
 *     text cannot break the table structure.
 *
 * When fewer than two logical rows remain, no table is emitted: the non-empty
 * cells of each row are joined with spaces and returned as plain text
 * (unescaped).
 *
 * @param {Array<Array<{text: string, x: number}>>} lines - Lines of the table.
 * @param {number[]} columns - Ascending column left edges.
 * @returns {string} Markdown table, or plain text for degenerate input.
 */
function buildGridTable(lines, columns) {
  const numCols = columns.length;
  const appendText = (existing, addition) => existing ? existing + " " + addition : addition;

  const yRows = lines.map((items) => {
    const row = Array(numCols).fill("");
    // A line may hold items whose y differs slightly, so the incoming order is
    // not guaranteed to be left-to-right; order by x as mergeLineSimple does.
    const leftToRight = [...items].sort((a, b) => a.x - b.x);
    for (const item of leftToRight) {
      const col = findColumn(item.x, columns);
      row[col] = appendText(row[col], item.text);
    }
    return row;
  });

  const dataColStart = Math.max(2, Math.floor(numCols / 2));
  const hasDataCells = (row) => row.slice(dataColStart).some((c) => c !== "");

  const merged = [];
  for (const row of yRows) {
    if (row.every((c) => c === "")) continue;
    if (merged.length === 0) {
      merged.push([...row]);
      continue;
    }
    const prev = merged[merged.length - 1];
    const filledCount = row.filter((c) => c).length;

    let isNewRow = false;
    if (row[0] && row[0].length >= 3) {
      isNewRow = true;
    }
    if (!isNewRow && numCols > 1 && row[1]) {
      isNewRow = true;
    }
    if (!isNewRow && hasDataCells(row) && hasDataCells(prev)) {
      isNewRow = true;
    }
    // A lone, very short first-column fragment is a continuation of the
    // previous row.
    if (isNewRow && filledCount === 1 && row[0] && row[0].length <= 2) {
      isNewRow = false;
    }

    if (isNewRow) {
      merged.push([...row]);
    } else {
      for (let c = 0; c < numCols; c++) {
        if (row[c]) {
          prev[c] = appendText(prev[c], row[c]);
        }
      }
    }
  }

  if (merged.length < 2) {
    return merged.map((r) => r.filter((c) => c).join(" ")).join("\n");
  }

  // Header rows are the leading rows without any digit in the data columns.
  let headerEnd = 0;
  for (let r = 0; r < merged.length; r++) {
    const hasDataValues = merged[r].slice(dataColStart).some((c) => c && /\d/.test(c));
    if (hasDataValues) break;
    headerEnd = r + 1;
  }
  if (headerEnd > 1) {
    const headerRow = Array(numCols).fill("");
    for (let r = 0; r < headerEnd; r++) {
      for (let c = 0; c < numCols; c++) {
        if (merged[r][c]) {
          headerRow[c] = appendText(headerRow[c], merged[r][c]);
        }
      }
    }
    merged.splice(0, headerEnd, headerRow);
  }

  // An unescaped "|" in a cell would be read as a column delimiter.
  const escapeCell = (cell) => cell.replace(/\|/g, "\\|");
  const toTableLine = (cells) => "| " + cells.map(escapeCell).join(" | ") + " |";

  const md = [];
  md.push(toTableLine(merged[0]));
  md.push("| " + merged[0].map(() => "---").join(" | ") + " |");
  for (let r = 1; r < merged.length; r++) {
    md.push(toTableLine(merged[r]));
  }
  return md.join("\n");
}
1029
/**
 * Join the items of one line into a string, left to right.
 *
 * Items are ordered by x on a copy; the caller's array is not reordered.
 * Between x-adjacent items, a separator is inserted when the horizontal gap
 * (next x minus previous x + width) exceeds 3; smaller gaps are treated as
 * parts of the same word and joined directly.
 *
 * @param {Array<{text: string, x: number, w: number}>} items - One line of items.
 * @returns {string} Merged text; "" for an empty line.
 */
function mergeLineSimple(items) {
  if (items.length <= 1) return items[0]?.text || "";
  const ordered = items.slice().sort((left, right) => left.x - right.x);
  return ordered.reduce((line, item, idx) => {
    if (idx === 0) return item.text;
    const before = ordered[idx - 1];
    const gap = item.x - (before.x + before.w);
    let separator = "";
    // NOTE(review): both tiers append the same string here; the two-tier
    // shape is kept as-is — confirm whether the wide-gap separator was meant
    // to differ.
    if (gap > 15) separator = " ";
    else if (gap > 3) separator = " ";
    return line + separator + item.text;
  }, "");
}
863
1041
  function cleanPdfText(text) {
864
- const stripped = text.replace(/^[\s]*[-–—]\s*\d+\s*[-–—][\s]*$/gm, "").replace(/^\s*\d+\s*\/\s*\d+\s*$/gm, "");
865
- return mergeKoreanLines(stripped).replace(/\n{3,}/g, "\n\n").trim();
1042
+ return mergeKoreanLines(
1043
+ text.replace(/^[\s]*[-–—]\s*\d+\s*[-–—][\s]*$/gm, "").replace(/^\s*\d+\s*\/\s*\d+\s*$/gm, "")
1044
+ ).replace(/\n{3,}/g, "\n\n").trim();
866
1045
  }
867
1046
  function startsWithMarker(line) {
868
1047
  const t = line.trimStart();
869
- if (/^[가-힣ㄱ-ㅎ][.)]/.test(t)) return true;
870
- if (/^\d+[.)]/.test(t)) return true;
871
- if (/^\([가-힣ㄱ-ㅎ\d]+\)/.test(t)) return true;
872
- if (/^[○●※▶▷◆◇■□★☆\-·]\s/.test(t)) return true;
873
- if (/^제\d+[조항호장절]/.test(t)) return true;
874
- return false;
1048
+ return /^[가-힣ㄱ-ㅎ][.)]/.test(t) || /^\d+[.)]/.test(t) || /^\([가-힣ㄱ-ㅎ\d]+\)/.test(t) || /^[○●※▶▷◆◇■□★☆\-·]\s/.test(t) || /^제\d+[조항호장절]/.test(t);
875
1049
  }
876
1050
  function isStandaloneHeader(line) {
877
- const t = line.trim();
878
- return /^제\d+[조항호장절](\([^)]*\))?(\s+\S+){0,7}$/.test(t);
1051
+ return /^제\d+[조항호장절](\([^)]*\))?(\s+\S+){0,7}$/.test(line.trim());
879
1052
  }
880
1053
  function mergeKoreanLines(text) {
881
1054
  if (!text) return "";
@@ -885,8 +1058,7 @@ function mergeKoreanLines(text) {
885
1058
  for (let i = 1; i < lines.length; i++) {
886
1059
  const prev = result[result.length - 1];
887
1060
  const curr = lines[i];
888
- const shouldMerge = /[가-힣·,\-]$/.test(prev) && /^[가-힣(]/.test(curr) && !startsWithMarker(curr) && !isStandaloneHeader(prev);
889
- if (shouldMerge) {
1061
+ if (/[가-힣·,\-]$/.test(prev) && /^[가-힣(]/.test(curr) && !startsWithMarker(curr) && !isStandaloneHeader(prev)) {
890
1062
  result[result.length - 1] = prev + " " + curr;
891
1063
  } else {
892
1064
  result.push(curr);
@@ -894,39 +1066,6 @@ function mergeKoreanLines(text) {
894
1066
  }
895
1067
  return result.join("\n");
896
1068
  }
897
- function reconstructTables(text) {
898
- const lines = text.split("\n");
899
- const result = [];
900
- let tableBuffer = [];
901
- for (const line of lines) {
902
- if (line.includes(" ")) {
903
- tableBuffer.push(line.split(" ").map((c) => c.trim()));
904
- } else {
905
- if (tableBuffer.length >= 2) result.push(formatAsMarkdownTable(tableBuffer));
906
- else if (tableBuffer.length === 1) result.push(tableBuffer[0].join(" | "));
907
- tableBuffer = [];
908
- result.push(line);
909
- }
910
- }
911
- if (tableBuffer.length >= 2) result.push(formatAsMarkdownTable(tableBuffer));
912
- else if (tableBuffer.length === 1) result.push(tableBuffer[0].join(" | "));
913
- return result.join("\n");
914
- }
915
- function formatAsMarkdownTable(rows) {
916
- const maxCols = Math.max(...rows.map((r) => r.length));
917
- const normalized = rows.map((r) => {
918
- const copy = [...r];
919
- while (copy.length < maxCols) copy.push("");
920
- return copy;
921
- });
922
- const lines = [];
923
- lines.push("| " + normalized[0].join(" | ") + " |");
924
- lines.push("| " + normalized[0].map(() => "---").join(" | ") + " |");
925
- for (let i = 1; i < normalized.length; i++) {
926
- lines.push("| " + normalized[i].join(" | ") + " |");
927
- }
928
- return lines.join("\n");
929
- }
930
1069
 
931
1070
  // src/index.ts
932
1071
  async function parse(buffer) {
@@ -977,4 +1116,4 @@ export {
977
1116
  sanitizeError,
978
1117
  parse
979
1118
  };
980
- //# sourceMappingURL=chunk-LHETZ3IN.js.map
1119
+ //# sourceMappingURL=chunk-KCGDEP7Q.js.map