aurochs 0.6.3 → 0.6.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -456,7 +456,7 @@ function buildDiagnostics(args) {
456
456
  if (decoded.includes("�")) {
457
457
  replacementCharMapCount += 1;
458
458
  }
459
- if (containsPrivateUseCharacter(decoded)) {
459
+ if (containsPrivateUseCharacter$1(decoded)) {
460
460
  privateUseCharMapCount += 1;
461
461
  }
462
462
  }
@@ -469,7 +469,7 @@ function buildDiagnostics(args) {
469
469
  sourceCodeLengthHistogram: histogram
470
470
  };
471
471
  }
472
- function containsPrivateUseCharacter(text) {
472
+ function containsPrivateUseCharacter$1(text) {
473
473
  return Array.from(text).some((char) => {
474
474
  const codePoint = char.codePointAt(0);
475
475
  if (codePoint === void 0) {
@@ -1728,6 +1728,401 @@ function decodeCIDFallback(cid, ordering) {
1728
1728
  }
1729
1729
  return null;
1730
1730
  }
1731
/**
 * Lookup table indexed by byte value (0-255); each entry is that value as a
 * two-character, zero-padded, uppercase hexadecimal string.
 */
const BYTE_TO_HEX = Array.from(new Array(256).keys(), (byteValue) => {
  const hex = byteValue.toString(16).toUpperCase();
  return hex.length < 2 ? `0${hex}` : hex;
});
1735
/**
 * Builds a string whose UTF-16 code units are the given numeric values.
 *
 * The values are converted in fixed-size chunks. Spreading the whole input
 * into a single `String.fromCharCode(...)` call passes every element as a
 * separate argument and throws a RangeError once the engine's argument/stack
 * limit is exceeded, which happens for large inputs.
 *
 * @param {ArrayLike<number> | Iterable<number>} bytes - Code unit values.
 * @returns {string} The concatenated characters ("" for empty input).
 */
function bytesToString(bytes) {
  const CHUNK_SIZE = 8192;
  // Arrays and typed arrays are used as-is; any other iterable is materialized
  // so that `length` and `slice` are available.
  const codes = Array.isArray(bytes) || ArrayBuffer.isView(bytes) ? bytes : Array.from(bytes);
  if (codes.length <= CHUNK_SIZE) {
    return String.fromCharCode(...codes);
  }
  const pieces = [];
  for (let offset = 0; offset < codes.length; offset += CHUNK_SIZE) {
    pieces.push(String.fromCharCode(...codes.slice(offset, offset + CHUNK_SIZE)));
  }
  return pieces.join("");
}
1738
/**
 * Converts a string into a byte array by keeping only the low 8 bits of each
 * UTF-16 code unit.
 *
 * @param {string} rawText - Source string.
 * @returns {Uint8Array} One byte per code unit of `rawText`.
 */
function rawTextToBytes$1(rawText) {
  return Uint8Array.from(
    { length: rawText.length },
    (_, index) => rawText.charCodeAt(index) & 255
  );
}
1745
/**
 * Renders `length` bytes of `bytes`, starting at `start`, as a hex string
 * using the BYTE_TO_HEX table.
 *
 * Positions that hold no value (for example past the end of `bytes`) are
 * rendered as byte 0.
 *
 * @param {ArrayLike<number>} bytes - Source bytes.
 * @param {number} start - Index of the first byte to render.
 * @param {number} length - Number of bytes to render.
 * @returns {string} Concatenated table entries for the slice.
 */
function byteSliceToHex(bytes, start, length) {
  const hexPairs = new Array(length);
  let offset = 0;
  while (offset < length) {
    const byteValue = bytes[start + offset] ?? 0;
    hexPairs[offset] = BYTE_TO_HEX[byteValue];
    offset += 1;
  }
  return hexPairs.join("");
}
1752
/**
 * Determines which source-code byte lengths to try, longest first.
 *
 * When `preferred` contains at least one positive integer, the de-duplicated
 * positive integers from it are returned. Otherwise the lengths are inferred
 * from the keys of `byteMapping` as half of each key's string length,
 * keeping only positive integer results.
 *
 * @param {Map<string, *>} byteMapping - Map whose keys are hex strings.
 * @param {number[] | undefined} preferred - Optional explicit byte lengths.
 * @returns {number[]} Unique positive integer lengths in descending order.
 */
function resolveSourceCodeByteLengths(byteMapping, preferred) {
  const isPositiveInteger = (value) => Number.isInteger(value) && value > 0;
  const longestFirst = (left, right) => right - left;

  if (preferred && preferred.length > 0) {
    const explicit = Array.from(new Set(preferred.filter((value) => isPositiveInteger(value))));
    explicit.sort(longestFirst);
    if (explicit.length > 0) {
      return explicit;
    }
  }

  const inferred = new Set(Array.from(byteMapping.keys(), (sourceHex) => sourceHex.length / 2));
  return Array.from(inferred)
    .filter((value) => isPositiveInteger(value))
    .sort(longestFirst);
}
1765
/**
 * Reports whether any code point of `text` lies in one of the ranges
 * U+E000-U+F8FF, U+F0000-U+FFFFD or U+100000-U+10FFFD.
 *
 * @param {string} text - Text to inspect.
 * @returns {boolean} True when at least one code point is in those ranges.
 */
function containsPrivateUseCharacter(text) {
  const PRIVATE_USE_RANGES = [
    [0xe000, 0xf8ff],
    [0xf0000, 0xffffd],
    [0x100000, 0x10fffd]
  ];
  for (const symbol of Array.from(text)) {
    const codePoint = symbol.codePointAt(0);
    if (codePoint === undefined) {
      continue;
    }
    if (PRIVATE_USE_RANGES.some(([first, last]) => codePoint >= first && codePoint <= last)) {
      return true;
    }
  }
  return false;
}
1774
/**
 * Resolves a fallback string for a character code.
 *
 * The optional `cidCodeToUnicodeFallbackMap` is consulted first; any entry
 * other than `undefined` is returned as-is. Otherwise the result of
 * `decodeCIDFallback(code, ordering ?? null)` is returned.
 *
 * @param {{code: number, ordering?: string, cidCodeToUnicodeFallbackMap?: Map<number, string>}} args
 * @returns {*} The map entry, or whatever `decodeCIDFallback` returns.
 */
function decodeCidFallback(args) {
  const perCodeEntry = args.cidCodeToUnicodeFallbackMap?.get(args.code);
  return perCodeEntry !== undefined
    ? perCodeEntry
    : decodeCIDFallback(args.code, args.ordering ?? null);
}
1782
/**
 * Returns `fontInfo.ordering` when it is truthy and not the literal string
 * "Identity"; otherwise returns `undefined`.
 *
 * @param {{ordering?: string}} fontInfo - Font record carrying `ordering`.
 * @returns {string | undefined} The usable ordering, if any.
 */
function resolveCidFallbackOrdering(fontInfo) {
  const { ordering } = fontInfo;
  const isUsable = Boolean(ordering) && ordering !== "Identity";
  return isUsable ? ordering : undefined;
}
1789
/**
 * Reports whether at least half of the entries in
 * `fontInfo.toUnicodeByteMapping` are flagged by `fontInfo.toUnicodeDiagnostics`
 * as replacement-character mappings, or at least half as private-use mappings.
 *
 * Returns false when the mapping is missing or empty, or when there are no
 * diagnostics.
 *
 * @param {{toUnicodeByteMapping?: Map<string, string>, toUnicodeDiagnostics?: {replacementCharMapCount: number, privateUseCharMapCount: number}}} fontInfo
 * @returns {boolean} True when either ratio is >= 0.5.
 */
function isToUnicodeSeverelyCorrupted(fontInfo) {
  const { toUnicodeByteMapping: byteMapping, toUnicodeDiagnostics: diagnostics } = fontInfo;
  const entryCount = byteMapping ? byteMapping.size : 0;
  if (entryCount === 0 || !diagnostics) {
    return false;
  }
  const suspectCounts = [diagnostics.replacementCharMapCount, diagnostics.privateUseCharMapCount];
  return suspectCounts.some((count) => count / entryCount >= 0.5);
}
1800
/**
 * Scores a single character code for how "text-like" it is.
 *
 * - space (32) scores 2
 * - tab, line feed and carriage return (9, 10, 13) score 1
 * - any other code below 32, and DEL (127), score -3
 * - ASCII digits, ASCII letters and the listed punctuation score 1
 * - everything else scores 0
 *
 * @param {number} c - Character code.
 * @returns {number} The score for `c`.
 */
function scoreCharacter(c) {
  if (c === 32) {
    return 2;
  }
  if (c === 9 || c === 10 || c === 13) {
    return 1;
  }
  if (c < 32 || c === 127) {
    return -3;
  }
  const isDigit = c >= 48 && c <= 57;
  const isLetter = (c >= 65 && c <= 90) || (c >= 97 && c <= 122);
  const isListedPunctuation = () => `=()[]{}.,;:'"_-+/\\<>`.includes(String.fromCharCode(c));
  return isDigit || isLetter || isListedPunctuation() ? 1 : 0;
}
1821
/**
 * Averages `scoreCharacter` over a string.
 *
 * Each iterated symbol contributes the score of its first UTF-16 code unit;
 * the sum is divided by `s.length`. An empty string scores 0.
 *
 * @param {string} s - Text to score.
 * @returns {number} Sum of per-symbol scores divided by `s.length`.
 */
function scoreAsciiQuality(s) {
  if (s.length === 0) {
    return 0;
  }
  let total = 0;
  for (const symbol of Array.from(s)) {
    total += scoreCharacter(symbol.charCodeAt(0));
  }
  return total / s.length;
}
1828
/**
 * Tries to recover readable text from a raw string that contains NUL
 * characters, returning the raw string unchanged when it has none.
 *
 * Candidate readings are built from the low 8 bits of each code unit:
 *   1. the bytes with every zero byte removed;
 *   2. when there are at least 6 bytes, an even count, and at least 70% of
 *      the (high, low) byte pairs have high <= 1 and at least 70% have a low
 *      byte in 3..126: the low byte of every pair;
 *   3. each of the above with 3 subtracted from every byte (bytes below 3
 *      become 0).
 * Every candidate and the raw text are sanitized (NULs dropped, control codes
 * 1-8, 11, 12 and 14-31 replaced by a space). The sanitized raw text is kept
 * unless a candidate beats the current best `scoreAsciiQuality` by more than
 * 0.1; candidates are tried in the order listed above.
 *
 * @param {string} rawText - Raw text to normalize.
 * @returns {string} `rawText` itself, or the best-scoring sanitized reading.
 */
function maybeNormalizeSingleByteRawText(rawText) {
  if (!rawText.includes("\0")) {
    return rawText;
  }

  const codes = Array.from({ length: rawText.length }, (_, index) => rawText.charCodeAt(index) & 255);
  const candidates = [bytesToString(codes.filter((code) => code !== 0))];

  if (codes.length >= 6 && codes.length % 2 === 0) {
    const pairCount = codes.length / 2;
    const lowBytes = [];
    let nearZeroHighs = 0;
    let asciiLows = 0;
    for (let index = 0; index < codes.length; index += 2) {
      const high = codes[index];
      const low = codes[index + 1];
      if (high <= 1) {
        nearZeroHighs += 1;
      }
      if (low >= 3 && low <= 126) {
        asciiLows += 1;
      }
      lowBytes.push(low);
    }
    if (nearZeroHighs / pairCount >= 0.7 && asciiLows / pairCount >= 0.7) {
      candidates.push(bytesToString(lowBytes));
    }
  }

  // Same candidates with every byte lowered by 3 (clamped to 0 below 3).
  const shiftDownByThree = (candidate) =>
    bytesToString(
      Array.from(candidate, (symbol) => {
        const byte = symbol.charCodeAt(0) & 255;
        return byte >= 3 ? byte - 3 : 0;
      })
    );
  const shifted = candidates.map((candidate) => shiftDownByThree(candidate));

  // Drop NULs, then blank out the remaining forbidden control codes.
  const sanitizeXmlText = (text) =>
    text.replace(/\0/g, "").replace(/[\x01-\x08\x0B\x0C\x0E-\x1F]/g, " ");

  const sanitizedRaw = sanitizeXmlText(rawText);
  let best = sanitizedRaw;
  let bestScore = scoreAsciiQuality(sanitizedRaw);
  for (const option of [...candidates, ...shifted]) {
    const cleaned = sanitizeXmlText(option);
    const score = scoreAsciiQuality(cleaned);
    if (score > bestScore + 0.1) {
      best = cleaned;
      bestScore = score;
    }
  }
  return best;
}
1874
/**
 * Cleans decoded text: NUL characters are removed, and control codes 1-8,
 * 11, 12, 14-31 and 127 are each replaced by a single space. Tab, line feed
 * and carriage return (9, 10, 13) are kept.
 *
 * @param {string} decoded - Text to clean.
 * @returns {string} The cleaned text.
 */
function sanitizeDecodedText(decoded) {
  const kept = [];
  for (const symbol of Array.from(decoded)) {
    const code = symbol.charCodeAt(0);
    if (code === 0) {
      continue;
    }
    const isForbiddenControl =
      code < 32 ? code !== 9 && code !== 10 && code !== 13 : code === 127;
    kept.push(isForbiddenControl ? " " : symbol);
  }
  return kept.join("");
}
1882
/**
 * Looks up the font record for a font name.
 *
 * Resolution order:
 *   1. exact match on the name with one leading "/" removed;
 *   2. when the name contains "+" after its first character, exact match on
 *      the part following the first "+";
 *   3. the first map entry whose key contains the name or is contained in it.
 *
 * Empty strings are excluded from step 3: `String.prototype.includes("")` is
 * always true, so an empty name or an empty key would otherwise match the
 * first entry in the map regardless of content.
 *
 * @param {string} fontName - Font name, optionally prefixed with "/".
 * @param {Map<string, *>} mappings - Font records keyed by font name.
 * @returns {* | undefined} The matching record, or undefined when none matches.
 */
function findFontInfo(fontName, mappings) {
  const cleanName = fontName.startsWith("/") ? fontName.slice(1) : fontName;
  const exactMatch = mappings.get(cleanName);
  if (exactMatch) {
    return exactMatch;
  }

  const plusIndex = cleanName.indexOf("+");
  if (plusIndex > 0) {
    const baseMatch = mappings.get(cleanName.slice(plusIndex + 1));
    if (baseMatch) {
      return baseMatch;
    }
  }

  if (cleanName.length === 0) {
    return undefined;
  }
  for (const [key, value] of mappings.entries()) {
    if (key.length === 0) {
      continue;
    }
    if (cleanName.includes(key) || key.includes(cleanName)) {
      return value;
    }
  }
  return undefined;
}
1903
/**
 * Decodes raw text for a named font.
 *
 * The font record is resolved with `findFontInfo`. When one is found the text
 * is decoded by `decodeTextWithFontInfo`; otherwise the raw text is only
 * passed through `sanitizeDecodedText`.
 *
 * @param {string} rawText - Raw text to decode.
 * @param {string} fontName - Name of the font the text is set in.
 * @param {Map<string, *>} mappings - Font records keyed by font name.
 * @returns {string} The decoded (or merely sanitized) text.
 */
function decodeText(rawText, fontName, mappings) {
  const fontInfo = findFontInfo(fontName, mappings);
  return fontInfo
    ? decodeTextWithFontInfo(rawText, fontInfo)
    : sanitizeDecodedText(rawText);
}
1910
/**
 * Decodes raw text using a resolved font record and sanitizes the result.
 *
 * The first applicable strategy is used:
 *   1. a non-empty `toUnicodeByteMapping` -> `decodeByToUnicodeByteMapping`;
 *      the four replacement/private-use/CID-fallback switches are all enabled
 *      only when `isToUnicodeSeverelyCorrupted` is true and `codeByteWidth`
 *      is 2;
 *   2. `codeByteWidth === 2` -> `decodeTwoByteText`, with CID fallback allowed
 *      only when `mapping` is empty;
 *   3. a non-empty `mapping` -> `decodeSingleByteTextWithFallback`;
 *   4. a non-empty `encodingMap` -> `decodeSingleByteText` on the output of
 *      `maybeNormalizeSingleByteRawText`, using a copy of the map;
 *   5. otherwise the raw text itself.
 *
 * @param {string} rawText - Raw text to decode.
 * @param {object} fontInfo - Font record (reads `mapping`, `codeByteWidth`,
 *   `encodingMap`, `toUnicodeByteMapping`, `toUnicodeSourceCodeByteLengths`,
 *   `cidCodeToUnicodeFallbackMap`).
 * @returns {string} The result of `sanitizeDecodedText` on the decoded text.
 */
function decodeTextWithFontInfo(rawText, fontInfo) {
  const byteMapping = fontInfo.toUnicodeByteMapping;
  const legacyMapping = fontInfo.mapping;
  const width = fontInfo.codeByteWidth;
  const singleByteEncoding = fontInfo.encodingMap;
  const fallbackMap = fontInfo.cidCodeToUnicodeFallbackMap;

  const ordering = resolveCidFallbackOrdering(fontInfo);
  const severeCidMode = isToUnicodeSeverelyCorrupted(fontInfo) && width === 2;

  let decoded;
  if (byteMapping && byteMapping.size > 0) {
    decoded = decodeByToUnicodeByteMapping({
      rawText,
      byteMapping,
      sourceCodeByteLengths: fontInfo.toUnicodeSourceCodeByteLengths,
      legacyMapping,
      codeByteWidth: width,
      ordering,
      cidCodeToUnicodeFallbackMap: fallbackMap,
      treatReplacementAsMissing: severeCidMode,
      treatPrivateUseAsMissing: severeCidMode,
      allowCidFallbackOnBadToUnicode: severeCidMode,
      allowCidFallbackOnMiss: severeCidMode
    });
  } else if (width === 2) {
    decoded = decodeTwoByteText({
      rawText,
      mapping: legacyMapping,
      ordering,
      cidCodeToUnicodeFallbackMap: fallbackMap,
      allowCidFallback: legacyMapping.size === 0
    });
  } else if (legacyMapping.size > 0) {
    decoded = decodeSingleByteTextWithFallback(rawText, legacyMapping, singleByteEncoding);
  } else if (singleByteEncoding && singleByteEncoding.size > 0) {
    // The decoder receives its own copy of the encoding map.
    decoded = decodeSingleByteText(
      maybeNormalizeSingleByteRawText(rawText),
      new Map(singleByteEncoding)
    );
  } else {
    decoded = rawText;
  }
  return sanitizeDecodedText(decoded);
}
1959
/**
 * Decodes raw text by matching byte sequences against a hex-keyed mapping,
 * with legacy-mapping and CID fallbacks.
 *
 * The raw text is turned into bytes and scanned left to right. At each
 * position the candidate byte lengths (from `resolveSourceCodeByteLengths`,
 * longest first) are tried against `byteMapping`. If no candidate lengths
 * exist, `rawText` is returned unchanged.
 *
 * On a hit:
 *   - the mapped text is emitted unless it is rejected by
 *     `treatReplacementAsMissing` / `treatPrivateUseAsMissing`;
 *   - a rejected 2-byte hit on a 2-byte font may be replaced by the CID
 *     fallback when `allowCidFallbackOnBadToUnicode` is set;
 *   - otherwise "�" is emitted and the matched bytes are consumed.
 * On a miss:
 *   - for 2-byte fonts with two bytes available, `legacyMapping` is tried on
 *     the big-endian code, then the CID fallback where the flags (or two
 *     empty mappings) allow it;
 *   - for 1-byte fonts, `legacyMapping` is tried on the single byte;
 *   - otherwise "�" is emitted and the scan advances by the smallest
 *     candidate length (at least 1, at most the remaining bytes).
 *
 * @param {object} args
 * @param {string} args.rawText - Raw text to decode.
 * @param {Map<string, string>} args.byteMapping - Hex source code -> text.
 * @param {number[]} [args.sourceCodeByteLengths] - Preferred byte lengths.
 * @param {Map<number, string>} args.legacyMapping - Numeric code -> text.
 * @param {number} args.codeByteWidth - Bytes per character code of the font.
 * @param {string} [args.ordering] - Passed to `decodeCidFallback`.
 * @param {Map<number, string>} [args.cidCodeToUnicodeFallbackMap] - Passed to `decodeCidFallback`.
 * @param {boolean} [args.treatReplacementAsMissing=false]
 * @param {boolean} [args.treatPrivateUseAsMissing=false]
 * @param {boolean} [args.allowCidFallbackOnBadToUnicode=false]
 * @param {boolean} [args.allowCidFallbackOnMiss=false]
 * @returns {string} The decoded text, or `rawText` when no byte lengths apply.
 */
function decodeByToUnicodeByteMapping(args) {
  const {
    rawText,
    byteMapping,
    sourceCodeByteLengths,
    legacyMapping,
    codeByteWidth,
    ordering,
    cidCodeToUnicodeFallbackMap,
    treatReplacementAsMissing = false,
    treatPrivateUseAsMissing = false,
    allowCidFallbackOnBadToUnicode = false,
    allowCidFallbackOnMiss = false
  } = args;

  const bytes = rawTextToBytes$1(rawText);
  const candidateLengths = resolveSourceCodeByteLengths(byteMapping, sourceCodeByteLengths);
  if (candidateLengths.length === 0) {
    return rawText;
  }
  // Candidate lengths are sorted longest first, so the last one is the shortest.
  const missAdvance = candidateLengths[candidateLengths.length - 1] ?? 1;

  const isRejected = (text) =>
    (treatReplacementAsMissing && text.includes("�")) ||
    (treatPrivateUseAsMissing && containsPrivateUseCharacter(text));
  const lookupCidFallback = (code) =>
    decodeCidFallback({ code, ordering, cidCodeToUnicodeFallbackMap });
  const twoByteCodeAt = (offset) => bytes[offset] << 8 | bytes[offset + 1];
  const findLongestMatch = (offset) => {
    for (const width of candidateLengths) {
      if (offset + width > bytes.length) {
        continue;
      }
      const text = byteMapping.get(byteSliceToHex(bytes, offset, width));
      if (text !== undefined) {
        return { text, width };
      }
    }
    return null;
  };

  const output = [];
  let cursor = 0;
  while (cursor < bytes.length) {
    const match = findLongestMatch(cursor);
    if (match !== null && match.width > 0) {
      if (!isRejected(match.text)) {
        output.push(match.text);
        cursor += match.width;
        continue;
      }
      const mayUseCid =
        codeByteWidth === 2 && match.width === 2 && allowCidFallbackOnBadToUnicode;
      const cidText = mayUseCid ? lookupCidFallback(twoByteCodeAt(cursor)) : null;
      if (cidText) {
        output.push(cidText);
        cursor += 2;
      } else {
        output.push("�");
        cursor += match.width;
      }
      continue;
    }

    if (codeByteWidth === 2 && cursor + 1 < bytes.length) {
      const code = twoByteCodeAt(cursor);
      const legacyText = legacyMapping.get(code);
      if (legacyText !== undefined) {
        if (!isRejected(legacyText)) {
          output.push(legacyText);
          cursor += 2;
          continue;
        }
        if (allowCidFallbackOnBadToUnicode) {
          const cidText = lookupCidFallback(code);
          if (cidText) {
            output.push(cidText);
            cursor += 2;
            continue;
          }
        }
      }
      if (allowCidFallbackOnMiss || (byteMapping.size === 0 && legacyMapping.size === 0)) {
        const cidText = lookupCidFallback(code);
        if (cidText) {
          output.push(cidText);
          cursor += 2;
          continue;
        }
      }
    }

    if (codeByteWidth === 1) {
      const legacyText = legacyMapping.get(bytes[cursor]);
      if (legacyText !== undefined) {
        output.push(legacyText);
        cursor += 1;
        continue;
      }
    }

    output.push("�");
    cursor += Math.min(Math.max(missAdvance, 1), bytes.length - cursor);
  }
  return output.join("");
}
2072
/**
 * Decodes raw text two code units at a time through a numeric mapping.
 *
 * Each pair forms the code `(first << 8) | second`; a missing second unit at
 * the end of the text counts as 0. A truthy mapped value is emitted unless it
 * is rejected by `treatReplacementAsMissing` / `treatPrivateUseAsMissing`.
 * Otherwise, when `allowCidFallback` is set, a truthy `decodeCidFallback`
 * result is emitted; failing that, "�" is emitted.
 *
 * @param {object} args
 * @param {string} args.rawText - Raw text to decode.
 * @param {Map<number, string>} args.mapping - Numeric code -> text.
 * @param {string} [args.ordering] - Passed to `decodeCidFallback`.
 * @param {Map<number, string>} [args.cidCodeToUnicodeFallbackMap] - Passed to `decodeCidFallback`.
 * @param {boolean} [args.allowCidFallback=false]
 * @param {boolean} [args.treatReplacementAsMissing=false]
 * @param {boolean} [args.treatPrivateUseAsMissing=false]
 * @returns {string} The decoded text.
 */
function decodeTwoByteText(args) {
  const {
    rawText,
    mapping,
    ordering,
    cidCodeToUnicodeFallbackMap,
    allowCidFallback = false,
    treatReplacementAsMissing = false,
    treatPrivateUseAsMissing = false
  } = args;

  const isUsable = (text) => {
    if (!text) {
      return false;
    }
    if (treatReplacementAsMissing && text.includes("�")) {
      return false;
    }
    return !(treatPrivateUseAsMissing && containsPrivateUseCharacter(text));
  };

  const decodePairAt = (offset) => {
    const highByte = rawText.charCodeAt(offset);
    const lowByte = offset + 1 < rawText.length ? rawText.charCodeAt(offset + 1) : 0;
    const code = highByte << 8 | lowByte;
    const mapped = mapping.get(code);
    if (isUsable(mapped)) {
      return mapped;
    }
    if (allowCidFallback) {
      const cidText = decodeCidFallback({ code, ordering, cidCodeToUnicodeFallbackMap });
      if (cidText) {
        return cidText;
      }
    }
    return "�";
  };

  const pieces = [];
  let offset = 0;
  while (offset < rawText.length) {
    pieces.push(decodePairAt(offset));
    offset += 2;
  }
  return pieces.join("");
}
2109
/**
 * Decodes text symbol by symbol through a numeric mapping, leaving a symbol
 * unchanged when the mapping has no entry (null/undefined) for its first
 * UTF-16 code unit.
 *
 * @param {string} rawText - Raw text to decode.
 * @param {Map<number, string>} mapping - Character code -> text.
 * @returns {string} The decoded text.
 */
function decodeSingleByteText(rawText, mapping) {
  const pieces = [];
  for (const symbol of Array.from(rawText)) {
    const replacement = mapping.get(symbol.charCodeAt(0));
    pieces.push(replacement ?? symbol);
  }
  return pieces.join("");
}
2115
/**
 * Decodes text symbol by symbol, preferring `toUnicode` and falling back to
 * `encodingMap`.
 *
 * For each symbol's first UTF-16 code unit: a truthy `toUnicode` entry wins;
 * otherwise the `encodingMap` entry is used when it is not null/undefined;
 * otherwise the symbol is kept unchanged.
 *
 * @param {string} rawText - Raw text to decode.
 * @param {Map<number, string>} toUnicode - Primary character code -> text map.
 * @param {Map<number, string>} [encodingMap] - Optional secondary map.
 * @returns {string} The decoded text.
 */
function decodeSingleByteTextWithFallback(rawText, toUnicode, encodingMap) {
  const decodeSymbol = (symbol) => {
    const code = symbol.charCodeAt(0);
    return toUnicode.get(code) || (encodingMap?.get(code) ?? symbol);
  };
  const pieces = [];
  for (const symbol of Array.from(rawText)) {
    pieces.push(decodeSymbol(symbol));
  }
  return pieces.join("");
}
1731
2126
  const WINANSI_ENCODING = /* @__PURE__ */ new Map([
1732
2127
  // 0x20-0x7E: Standard ASCII printable characters
1733
2128
  [32, " "],
@@ -5852,9 +6247,12 @@ function createTextRun(text, textState, gfxState) {
5852
6247
  const endUserY = startUserY + advanceUserY;
5853
6248
  const endPos = transformPoint({ x: endUserX, y: endUserY }, ctm);
5854
6249
  const effectiveFontSize = calculateEffectiveFontSize(currentFontSize, textMatrix, ctm);
6250
+ const decodedText = currentFontInfo ? decodeTextWithFontInfo(text, currentFontInfo) : text;
5855
6251
  const run = {
5856
- text,
6252
+ text: decodedText,
6253
+ rawText: text,
5857
6254
  rawBytes: rawTextToBytes(text),
6255
+ codeByteWidth: currentCodeByteWidth,
5858
6256
  textMatrix,
5859
6257
  x: startPos.x,
5860
6258
  y: startPos.y,
@@ -22071,7 +22469,7 @@ async function getPdfPageDimensions(data, pageNumber = 1) {
22071
22469
  return pages[pageNumber - 1].getSize();
22072
22470
  }
22073
22471
  export {
22074
- applyGraphicsSoftMaskToPdfImage as A,
22472
+ decodeText as A,
22075
22473
  DEFAULT_FILL_COLOR as D,
22076
22474
  IDENTITY_MATRIX$2 as I,
22077
22475
  translationMatrix as a,
@@ -22096,9 +22494,9 @@ export {
22096
22494
  transformPoint as t,
22097
22495
  getPdfPageCount as u,
22098
22496
  getPdfPageDimensions as v,
22099
- decodeCIDFallback as w,
22100
- DEFAULT_FONT_METRICS as x,
22101
- calculateTextDisplacement as y,
22102
- rasterizeSoftMaskedFillPath as z
22497
+ DEFAULT_FONT_METRICS as w,
22498
+ calculateTextDisplacement as x,
22499
+ rasterizeSoftMaskedFillPath as y,
22500
+ applyGraphicsSoftMaskToPdfImage as z
22103
22501
  };
22104
- //# sourceMappingURL=pdf-parser-Ciztl2kx.js.map
22502
+ //# sourceMappingURL=pdf-parser-NK9_pSDg.js.map