kordoc 1.1.1 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -136,10 +136,10 @@ interface ParseResult {
136
136
  ### Types
137
137
 
138
138
  ```typescript
139
- import type { ParseResult, IRBlock, IRTable, IRCell, CellContext } from "kordoc"
139
+ import type { ParseResult, ParseSuccess, ParseFailure, FileType } from "kordoc"
140
140
  ```
141
141
 
142
- > Internal utilities (`KordocError`, `sanitizeError`, `isPathTraversal`, `buildTable`, `blocksToMarkdown`, `convertTableToText`) are not part of the public API.
142
+ > Internal types (`IRBlock`, `IRTable`, `IRCell`, `CellContext`) and utilities (`KordocError`, `sanitizeError`, `isPathTraversal`, `buildTable`, `blocksToMarkdown`) are not part of the public API.
143
143
 
144
144
  ## Requirements
145
145
 
@@ -25,7 +25,7 @@ function detectFormat(buffer) {
25
25
  }
26
26
 
27
27
  // src/utils.ts
28
- var VERSION = true ? "1.1.1" : "0.0.0-dev";
28
+ var VERSION = true ? "1.2.0" : "0.0.0-dev";
29
29
  function toArrayBuffer(buf) {
30
30
  if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
31
31
  return buf.buffer;
@@ -756,126 +756,302 @@ async function loadPdfjs() {
756
756
  return mod;
757
757
  } catch (err) {
758
758
  const msg = err instanceof Error ? err.message : String(err);
759
- if (msg.includes("Cannot find") || msg.includes("MODULE_NOT_FOUND")) {
760
- return null;
761
- }
759
+ if (msg.includes("Cannot find") || msg.includes("MODULE_NOT_FOUND")) return null;
762
760
  throw new KordocError(`pdfjs-dist \uB85C\uB529 \uC2E4\uD328: ${msg}`);
763
761
  }
764
762
  }
765
763
  async function parsePdfDocument(buffer) {
766
764
  const pdfjs = await loadPdfjs();
767
765
  if (!pdfjs) {
768
- return {
769
- success: false,
770
- fileType: "pdf",
771
- pageCount: 0,
772
- error: "pdfjs-dist\uAC00 \uC124\uCE58\uB418\uC9C0 \uC54A\uC558\uC2B5\uB2C8\uB2E4. npm install pdfjs-dist"
773
- };
766
+ return { success: false, fileType: "pdf", pageCount: 0, error: "pdfjs-dist\uAC00 \uC124\uCE58\uB418\uC9C0 \uC54A\uC558\uC2B5\uB2C8\uB2E4. npm install pdfjs-dist" };
774
767
  }
775
- const data = new Uint8Array(buffer);
776
768
  const doc = await pdfjs.getDocument({
777
- data,
769
+ data: new Uint8Array(buffer),
778
770
  useSystemFonts: true,
779
771
  disableFontFace: true,
780
772
  isEvalSupported: false
781
773
  }).promise;
782
774
  try {
783
775
  const pageCount = doc.numPages;
784
- if (pageCount === 0) {
785
- return { success: false, fileType: "pdf", pageCount: 0, error: "PDF\uC5D0 \uD398\uC774\uC9C0\uAC00 \uC5C6\uC2B5\uB2C8\uB2E4." };
786
- }
776
+ if (pageCount === 0) return { success: false, fileType: "pdf", pageCount: 0, error: "PDF\uC5D0 \uD398\uC774\uC9C0\uAC00 \uC5C6\uC2B5\uB2C8\uB2E4." };
787
777
  const pageTexts = [];
788
778
  let totalChars = 0;
789
779
  let totalTextBytes = 0;
790
780
  const effectivePageCount = Math.min(pageCount, MAX_PAGES);
791
781
  for (let i = 1; i <= effectivePageCount; i++) {
792
782
  const page = await doc.getPage(i);
793
- const textContent = await page.getTextContent();
794
- const lines = groupTextItemsByLine(textContent.items);
795
- const pageText = lines.join("\n");
783
+ const tc = await page.getTextContent();
784
+ const pageText = extractPageContent(tc.items);
796
785
  totalChars += pageText.replace(/\s/g, "").length;
797
786
  totalTextBytes += pageText.length * 2;
798
- if (totalTextBytes > MAX_TOTAL_TEXT) throw new KordocError(`\uD14D\uC2A4\uD2B8 \uCD94\uCD9C \uD06C\uAE30 \uCD08\uACFC (${MAX_TOTAL_TEXT / 1024 / 1024}MB \uC81C\uD55C)`);
787
+ if (totalTextBytes > MAX_TOTAL_TEXT) throw new KordocError("\uD14D\uC2A4\uD2B8 \uCD94\uCD9C \uD06C\uAE30 \uCD08\uACFC");
799
788
  pageTexts.push(pageText);
800
789
  }
801
- const avgCharsPerPage = totalChars / effectivePageCount;
802
- if (avgCharsPerPage < 10) {
803
- return {
804
- success: false,
805
- fileType: "pdf",
806
- pageCount,
807
- isImageBased: true,
808
- error: `\uC774\uBBF8\uC9C0 \uAE30\uBC18 PDF\uB85C \uCD94\uC815\uB429\uB2C8\uB2E4 (${pageCount}\uD398\uC774\uC9C0, \uCD94\uCD9C \uD14D\uC2A4\uD2B8 ${totalChars}\uC790).`
809
- };
810
- }
811
- let markdown = "";
812
- for (let i = 0; i < pageTexts.length; i++) {
813
- const cleaned = cleanPdfText(pageTexts[i]);
814
- if (cleaned.trim()) {
815
- if (i > 0 && markdown) markdown += "\n\n";
816
- markdown += cleaned;
817
- }
790
+ if (totalChars / effectivePageCount < 10) {
791
+ return { success: false, fileType: "pdf", pageCount, isImageBased: true, error: `\uC774\uBBF8\uC9C0 \uAE30\uBC18 PDF (${pageCount}\uD398\uC774\uC9C0, ${totalChars}\uC790)` };
818
792
  }
819
- markdown = reconstructTables(markdown);
820
- const truncated = pageCount > MAX_PAGES;
821
- return { success: true, fileType: "pdf", markdown, pageCount: effectivePageCount, isImageBased: false, ...truncated && { warning: `PDF\uAC00 ${pageCount}\uD398\uC774\uC9C0\uC774\uC9C0\uB9CC ${MAX_PAGES}\uD398\uC774\uC9C0\uAE4C\uC9C0\uB9CC \uCC98\uB9AC\uD588\uC2B5\uB2C8\uB2E4` } };
793
+ let markdown = pageTexts.filter((t) => t.trim()).join("\n\n");
794
+ markdown = cleanPdfText(markdown);
795
+ return { success: true, fileType: "pdf", markdown, pageCount: effectivePageCount };
822
796
  } finally {
823
797
  await doc.destroy().catch(() => {
824
798
  });
825
799
  }
826
800
  }
827
- function groupTextItemsByLine(items) {
801
/**
 * Turns the text items of one PDF page into plain text / Markdown.
 *
 * The raw items are normalized and grouped into visual lines. When column
 * detection reports three or more columns the page is rendered through the
 * column-aware path; otherwise each line is flattened on its own and the
 * results are joined with newlines.
 *
 * @param {Array<object>} rawItems - Text items, in the shape `normalizeItems` accepts.
 * @returns {string} The extracted page text, or "" when no usable item remains.
 */
function extractPageContent(rawItems) {
  const normalized = normalizeItems(rawItems);
  if (normalized.length === 0) {
    return "";
  }

  const visualLines = groupByY(normalized);
  const columnEdges = detectColumns(visualLines);
  const isTabular = Boolean(columnEdges) && columnEdges.length >= 3;

  if (!isTabular) {
    const flattened = [];
    for (const visualLine of visualLines) {
      flattened.push(mergeLineSimple(visualLine));
    }
    return flattened.join("\n");
  }
  return extractWithColumns(visualLines, columnEdges);
}
811
/**
 * Reduces raw text items to compact `{ text, x, y, w, h }` records.
 *
 * Items whose `str` is not a string, or is blank after trimming, are dropped.
 * Position (`transform[4]`, `transform[5]`) and size (`width`, `height`) are
 * rounded to integers. The result is ordered by descending `y`, then by
 * ascending `x` for equal `y`.
 *
 * @param {Array<object>} rawItems - Items exposing `str`, `transform`, `width` and `height`.
 * @returns {Array<{text: string, x: number, y: number, w: number, h: number}>}
 *   A new, sorted array; `rawItems` itself is not reordered.
 */
function normalizeItems(rawItems) {
  const normalized = [];
  for (const raw of rawItems) {
    if (typeof raw.str !== "string") continue;
    const text = raw.str.trim();
    if (text === "") continue;
    normalized.push({
      text,
      x: Math.round(raw.transform[4]),
      y: Math.round(raw.transform[5]),
      w: Math.round(raw.width),
      h: Math.round(raw.height)
    });
  }
  normalized.sort((p, q) => {
    const byRow = q.y - p.y;
    return byRow || p.x - q.x;
  });
  return normalized;
}
820
+ function groupByY(items) {
828
821
  if (items.length === 0) return [];
829
- const textItems = items.filter((item) => typeof item.str === "string" && item.str.trim() !== "");
830
- if (textItems.length === 0) return [];
831
- textItems.sort((a, b) => {
832
- const yDiff = b.transform[5] - a.transform[5];
833
- if (Math.abs(yDiff) < 2) return a.transform[4] - b.transform[4];
834
- return yDiff;
835
- });
836
822
  const lines = [];
837
- let currentY = textItems[0].transform[5];
838
- let currentLine = [];
839
- for (const item of textItems) {
840
- const y = item.transform[5];
841
- if (Math.abs(currentY - y) > Math.max(item.height * 0.5, 2)) {
842
- if (currentLine.length > 0) lines.push(mergeLineItems(currentLine));
843
- currentLine = [];
844
- currentY = y;
845
- }
846
- currentLine.push({ text: item.str, x: item.transform[4], width: item.width });
847
- }
848
- if (currentLine.length > 0) lines.push(mergeLineItems(currentLine));
823
+ let curY = items[0].y;
824
+ let curLine = [items[0]];
825
+ for (let i = 1; i < items.length; i++) {
826
+ if (Math.abs(items[i].y - curY) > 3) {
827
+ lines.push(curLine);
828
+ curLine = [];
829
+ curY = items[i].y;
830
+ }
831
+ curLine.push(items[i]);
832
+ }
833
+ if (curLine.length > 0) lines.push(curLine);
849
834
  return lines;
850
835
  }
851
- function mergeLineItems(items) {
836
/**
 * Heuristic: does this visual line look like prose whose words were emitted as
 * many short, closely spaced fragments (rather than a row of table cells)?
 *
 * A line qualifies when it has at least four items, the largest horizontal gap
 * between neighbouring items (sorted by `x`) is below 40, and the average text
 * length per item is below 5.
 *
 * The largest gap is tracked with a running maximum instead of
 * `Math.max(...gaps)`: spreading one argument per item can exceed the engine's
 * argument limit and throw a RangeError on very long lines.
 *
 * @param {Array<{text: string, x: number, w: number}>} items - Items of one visual line.
 * @returns {boolean} `true` when the line matches the prose pattern.
 */
function isProseSpread(items) {
  if (items.length < 4) return false;

  // Sort a copy so the caller's line keeps its original order.
  const sorted = [...items].sort((a, b) => a.x - b.x);

  let maxGap = -Infinity;
  for (let i = 1; i < sorted.length; i++) {
    const gap = sorted[i].x - (sorted[i - 1].x + sorted[i - 1].w);
    // Math.max (not `>`) keeps NaN propagation identical to Math.max(...gaps).
    maxGap = Math.max(maxGap, gap);
  }

  const totalLen = items.reduce((sum, item) => sum + item.text.length, 0);
  const avgLen = totalLen / items.length;
  return maxGap < 40 && avgLen < 5;
}
847
/**
 * Infers the left edges of table columns from the x positions of text items.
 *
 * Only lines before the first "remarks" marker line (a line of at most two
 * items, one of which is exactly "비고") are scanned, and lines that
 * `isProseSpread` classifies as prose are ignored. Item x positions are
 * clustered, clusters with at least three members are kept, clusters whose
 * left edges are closer than 30 are merged, and the smallest x of each
 * surviving cluster is reported.
 *
 * The horizontal extent is computed in a single pass rather than with
 * `Math.max(...)`/`Math.min(...)` over a spread array: one argument per item
 * can exceed the engine's argument limit and throw a RangeError on pages with
 * a very large number of text items.
 *
 * @param {Array<Array<{text: string, x: number, w: number}>>} yLines - Visual lines of items.
 * @returns {number[] | null} Three or more column left edges in ascending
 *   order, or `null` when there are no items, the content spans less than 100
 *   units horizontally, or fewer than three columns are found.
 */
function detectColumns(yLines) {
  const allItems = yLines.flat();
  if (allItems.length === 0) return null;

  let leftEdge = Infinity;
  let rightEdge = -Infinity;
  for (const item of allItems) {
    leftEdge = Math.min(leftEdge, item.x);
    rightEdge = Math.max(rightEdge, item.x + item.w);
  }
  const pageWidth = rightEdge - leftEdge;
  if (pageWidth < 100) return null;

  // Everything from the first "비고" marker line onward is excluded from clustering.
  let bigoLineIdx = -1;
  for (let i = 0; i < yLines.length; i++) {
    if (yLines[i].length <= 2 && yLines[i].some((item) => item.text === "\uBE44\uACE0")) {
      bigoLineIdx = i;
      break;
    }
  }
  const tableYLines = bigoLineIdx >= 0 ? yLines.slice(0, bigoLineIdx) : yLines;

  // Greedy clustering: an item joins the first cluster whose running mean is within CLUSTER_TOL.
  const CLUSTER_TOL = 22;
  const xClusters = [];
  for (const line of tableYLines) {
    if (isProseSpread(line)) continue;
    for (const item of line) {
      let found = false;
      for (const c of xClusters) {
        if (Math.abs(item.x - c.center) <= CLUSTER_TOL) {
          c.center = Math.round((c.center * c.count + item.x) / (c.count + 1));
          c.minX = Math.min(c.minX, item.x);
          c.count++;
          found = true;
          break;
        }
      }
      if (!found) {
        xClusters.push({ center: item.x, count: 1, minX: item.x });
      }
    }
  }

  const peaks = xClusters.filter((c) => c.count >= 3).sort((a, b) => a.minX - b.minX);
  if (peaks.length < 3) return null;

  // Fold peaks whose left edges are closer than MERGE_TOL into their left neighbour.
  const MERGE_TOL = 30;
  const merged = [peaks[0]];
  for (let i = 1; i < peaks.length; i++) {
    const prev = merged[merged.length - 1];
    if (peaks[i].minX - prev.minX < MERGE_TOL) {
      if (peaks[i].count > prev.count) {
        prev.center = peaks[i].center;
      }
      prev.count += peaks[i].count;
      prev.minX = Math.min(prev.minX, peaks[i].minX);
    } else {
      merged.push({ ...peaks[i] });
    }
  }

  const columns = merged.filter((c) => c.count >= 3).map((c) => c.minX);
  return columns.length >= 3 ? columns : null;
}
899
/**
 * Maps an x position to a column index.
 *
 * Columns are probed from last to first; the first one whose left edge, minus
 * a 10-unit allowance, is at or before `x` is returned. Index 0 is the
 * fallback when no column matches (including an empty `columns` array).
 *
 * @param {number} x - Horizontal position of an item.
 * @param {number[]} columns - Column left edges, ascending.
 * @returns {number} Index into `columns`.
 */
function findColumn(x, columns) {
  let idx = columns.length;
  while (idx-- > 0) {
    const leftEdge = columns[idx] - 10;
    if (x >= leftEdge) {
      return idx;
    }
  }
  return 0;
}
905
/**
 * Renders a page that contains a detected column grid.
 *
 * Layout of the output, joined with newlines:
 *  1. lines before the table, flattened with `mergeLineSimple`;
 *  2. the table region, where consecutive grid lines are handed to
 *     `buildGridTable` and any interrupting line is flattened;
 *  3. if a "비고" marker line exists, an empty line followed by that line and
 *     everything after it, flattened.
 *
 * The table starts at the first line (before the marker) whose items fall into
 * three or more distinct columns. Without such a line, all lines before the
 * marker are treated as plain text.
 *
 * @param {Array<Array<{text: string, x: number, w: number}>>} yLines - Visual lines of items.
 * @param {number[]} columns - Column left edges, ascending.
 * @returns {string} The rendered page text.
 */
function extractWithColumns(yLines, columns) {
  const leftmost = columns[0];
  const rightmost = columns[columns.length - 1];

  const remarkIdx = yLines.findIndex(
    (line) => line.length <= 2 && line.some((item) => item.text === "\uBE44\uACE0")
  );
  const tableEnd = remarkIdx >= 0 ? remarkIdx : yLines.length;

  const tableStart = yLines.slice(0, tableEnd).findIndex((line) => {
    const touched = new Set(line.map((item) => findColumn(item.x, columns)));
    return touched.size >= 3;
  });

  const proseEnd = tableStart >= 0 ? tableStart : tableEnd;
  const out = yLines.slice(0, proseEnd).map((line) => mergeLineSimple(line));

  if (tableStart >= 0) {
    let pending = [];
    const flushPending = () => {
      if (pending.length === 0) return;
      out.push(buildGridTable(pending, columns));
      pending = [];
    };

    for (const line of yLines.slice(tableStart, tableEnd)) {
      const withinGrid = line.some(
        (item) => item.x >= leftmost - 20 && item.x <= rightmost + 200
      );
      if (withinGrid && !isProseSpread(line)) {
        pending.push(line);
        continue;
      }
      // A non-grid line closes the table collected so far.
      flushPending();
      out.push(mergeLineSimple(line));
    }
    flushPending();
  }

  if (remarkIdx >= 0) {
    out.push("");
    for (const line of yLines.slice(remarkIdx)) {
      out.push(mergeLineSimple(line));
    }
  }
  return out.join("\n");
}
956
/**
 * Renders a run of visual lines as a Markdown pipe table.
 *
 * Steps:
 *  1. Each line becomes a row of `columns.length` cells; items are placed via
 *     `findColumn` and texts sharing a cell are joined with a space.
 *  2. Rows are merged: a row is folded into the previous one unless it looks
 *     like the start of a new logical row (first cell of 3+ characters, a
 *     filled second cell, or data cells present in both it and the previous
 *     row). A row holding only a first cell of at most 2 characters is always
 *     folded.
 *  3. Leading rows without any digit in their data cells are treated as header
 *     rows and collapsed into one.
 *  4. The first remaining row is emitted as the table header.
 *
 * If fewer than two rows remain after merging, the content is returned as
 * plain text (non-empty cells joined with spaces) instead of a table.
 *
 * Pipe characters inside cell text are escaped as `\|` when table rows are
 * emitted; unescaped they would be read as cell delimiters and corrupt the
 * table. The plain-text fallback is left unescaped.
 *
 * @param {Array<Array<{text: string, x: number}>>} lines - Visual lines belonging to the grid.
 * @param {number[]} columns - Column left edges, ascending.
 * @returns {string} A Markdown table, or plain text when there is only one row.
 */
function buildGridTable(lines, columns) {
  const numCols = columns.length;
  const escapeCell = (cell) => cell.replace(/\|/g, "\\|");
  const formatRow = (row) => "| " + row.map(escapeCell).join(" | ") + " |";

  const yRows = lines.map((items) => {
    const row = Array(numCols).fill("");
    for (const item of items) {
      const col = findColumn(item.x, columns);
      row[col] = row[col] ? row[col] + " " + item.text : item.text;
    }
    return row;
  });

  // Cells from this index onward are considered "data" (as opposed to label) cells.
  const dataColStart = Math.max(2, Math.floor(numCols / 2));
  const merged = [];
  for (const row of yRows) {
    if (row.every((c) => c === "")) continue;
    if (merged.length === 0) {
      merged.push([...row]);
      continue;
    }
    const prev = merged[merged.length - 1];
    const filledCount = row.filter((c) => c).length;

    let isNewRow = false;
    if (row[0] && row[0].length >= 3) {
      isNewRow = true;
    }
    if (!isNewRow && numCols > 1 && row[1]) {
      isNewRow = true;
    }
    if (!isNewRow) {
      const hasData = row.slice(dataColStart).some((c) => c !== "");
      const prevHasData = prev.slice(dataColStart).some((c) => c !== "");
      if (hasData && prevHasData) {
        isNewRow = true;
      }
    }
    // A lone, very short first cell is a wrapped label fragment, not a new row.
    if (isNewRow && filledCount === 1 && row[0] && row[0].length <= 2) {
      isNewRow = false;
    }

    if (isNewRow) {
      merged.push([...row]);
    } else {
      for (let c = 0; c < numCols; c++) {
        if (row[c]) {
          prev[c] = prev[c] ? prev[c] + " " + row[c] : row[c];
        }
      }
    }
  }

  if (merged.length < 2) {
    return merged.map((r) => r.filter((c) => c).join(" ")).join("\n");
  }

  // Header rows are the leading rows whose data cells contain no digit.
  let headerEnd = 0;
  for (let r = 0; r < merged.length; r++) {
    const hasDataValues = merged[r].slice(dataColStart).some((c) => c && /\d/.test(c));
    if (hasDataValues) break;
    headerEnd = r + 1;
  }
  if (headerEnd > 1) {
    const headerRow = Array(numCols).fill("");
    for (let r = 0; r < headerEnd; r++) {
      for (let c = 0; c < numCols; c++) {
        if (merged[r][c]) {
          headerRow[c] = headerRow[c] ? headerRow[c] + " " + merged[r][c] : merged[r][c];
        }
      }
    }
    merged.splice(0, headerEnd, headerRow);
  }

  const md = [];
  md.push(formatRow(merged[0]));
  md.push("| " + merged[0].map(() => "---").join(" | ") + " |");
  for (let r = 1; r < merged.length; r++) {
    md.push(formatRow(merged[r]));
  }
  return md.join("\n");
}
1032
+ function mergeLineSimple(items) {
852
1033
  if (items.length <= 1) return items[0]?.text || "";
853
- items.sort((a, b) => a.x - b.x);
854
- let result = items[0].text;
855
- for (let i = 1; i < items.length; i++) {
856
- const gap = items[i].x - (items[i - 1].x + items[i - 1].width);
1034
+ const sorted = [...items].sort((a, b) => a.x - b.x);
1035
+ let result = sorted[0].text;
1036
+ for (let i = 1; i < sorted.length; i++) {
1037
+ const gap = sorted[i].x - (sorted[i - 1].x + sorted[i - 1].w);
857
1038
  if (gap > 15) result += " ";
858
1039
  else if (gap > 3) result += " ";
859
- result += items[i].text;
1040
+ result += sorted[i].text;
860
1041
  }
861
1042
  return result;
862
1043
  }
863
1044
  function cleanPdfText(text) {
864
- const stripped = text.replace(/^[\s]*[-–—]\s*\d+\s*[-–—][\s]*$/gm, "").replace(/^\s*\d+\s*\/\s*\d+\s*$/gm, "");
865
- return mergeKoreanLines(stripped).replace(/\n{3,}/g, "\n\n").trim();
1045
+ return mergeKoreanLines(
1046
+ text.replace(/^[\s]*[-–—]\s*\d+\s*[-–—][\s]*$/gm, "").replace(/^\s*\d+\s*\/\s*\d+\s*$/gm, "")
1047
+ ).replace(/\n{3,}/g, "\n\n").trim();
866
1048
  }
867
1049
  function startsWithMarker(line) {
868
1050
  const t = line.trimStart();
869
- if (/^[가-힣ㄱ-ㅎ][.)]/.test(t)) return true;
870
- if (/^\d+[.)]/.test(t)) return true;
871
- if (/^\([가-힣ㄱ-ㅎ\d]+\)/.test(t)) return true;
872
- if (/^[○●※▶▷◆◇■□★☆\-·]\s/.test(t)) return true;
873
- if (/^제\d+[조항호장절]/.test(t)) return true;
874
- return false;
1051
+ return /^[가-힣ㄱ-ㅎ][.)]/.test(t) || /^\d+[.)]/.test(t) || /^\([가-힣ㄱ-ㅎ\d]+\)/.test(t) || /^[○●※▶▷◆◇■□★☆\-·]\s/.test(t) || /^제\d+[조항호장절]/.test(t);
875
1052
  }
876
1053
  function isStandaloneHeader(line) {
877
- const t = line.trim();
878
- return /^제\d+[조항호장절](\([^)]*\))?(\s+\S+){0,4}$/.test(t);
1054
+ return /^제\d+[조항호장절](\([^)]*\))?(\s+\S+){0,7}$/.test(line.trim());
879
1055
  }
880
1056
  function mergeKoreanLines(text) {
881
1057
  if (!text) return "";
@@ -885,8 +1061,7 @@ function mergeKoreanLines(text) {
885
1061
  for (let i = 1; i < lines.length; i++) {
886
1062
  const prev = result[result.length - 1];
887
1063
  const curr = lines[i];
888
- const shouldMerge = /[가-힣·,\-]$/.test(prev) && /^[가-힣(]/.test(curr) && !startsWithMarker(curr) && !isStandaloneHeader(prev);
889
- if (shouldMerge) {
1064
+ if (/[가-힣·,\-]$/.test(prev) && /^[가-힣(]/.test(curr) && !startsWithMarker(curr) && !isStandaloneHeader(prev)) {
890
1065
  result[result.length - 1] = prev + " " + curr;
891
1066
  } else {
892
1067
  result.push(curr);
@@ -894,39 +1069,6 @@ function mergeKoreanLines(text) {
894
1069
  }
895
1070
  return result.join("\n");
896
1071
  }
897
- function reconstructTables(text) {
898
- const lines = text.split("\n");
899
- const result = [];
900
- let tableBuffer = [];
901
- for (const line of lines) {
902
- if (line.includes(" ")) {
903
- tableBuffer.push(line.split(" ").map((c) => c.trim()));
904
- } else {
905
- if (tableBuffer.length >= 2) result.push(formatAsMarkdownTable(tableBuffer));
906
- else if (tableBuffer.length === 1) result.push(tableBuffer[0].join(" | "));
907
- tableBuffer = [];
908
- result.push(line);
909
- }
910
- }
911
- if (tableBuffer.length >= 2) result.push(formatAsMarkdownTable(tableBuffer));
912
- else if (tableBuffer.length === 1) result.push(tableBuffer[0].join(" | "));
913
- return result.join("\n");
914
- }
915
- function formatAsMarkdownTable(rows) {
916
- const maxCols = Math.max(...rows.map((r) => r.length));
917
- const normalized = rows.map((r) => {
918
- const copy = [...r];
919
- while (copy.length < maxCols) copy.push("");
920
- return copy;
921
- });
922
- const lines = [];
923
- lines.push("| " + normalized[0].join(" | ") + " |");
924
- lines.push("| " + normalized[0].map(() => "---").join(" | ") + " |");
925
- for (let i = 1; i < normalized.length; i++) {
926
- lines.push("| " + normalized[i].join(" | ") + " |");
927
- }
928
- return lines.join("\n");
929
- }
930
1072
 
931
1073
  // src/index.ts
932
1074
  async function parse(buffer) {
@@ -977,4 +1119,4 @@ export {
977
1119
  sanitizeError,
978
1120
  parse
979
1121
  };
980
- //# sourceMappingURL=chunk-HJHA6H3F.js.map
1122
+ //# sourceMappingURL=chunk-4BKNDXGU.js.map