@cj-tech-master/excelts 8.0.0 → 8.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (106) hide show
  1. package/README.md +14 -1
  2. package/README_zh.md +6 -0
  3. package/dist/browser/modules/archive/zip/stream.d.ts +4 -0
  4. package/dist/browser/modules/archive/zip/stream.js +53 -0
  5. package/dist/browser/modules/pdf/core/crypto.d.ts +65 -0
  6. package/dist/browser/modules/pdf/core/crypto.js +637 -0
  7. package/dist/browser/modules/pdf/core/encryption.d.ts +23 -20
  8. package/dist/browser/modules/pdf/core/encryption.js +88 -261
  9. package/dist/browser/modules/pdf/core/pdf-writer.d.ts +6 -4
  10. package/dist/browser/modules/pdf/core/pdf-writer.js +19 -10
  11. package/dist/browser/modules/pdf/index.d.ts +23 -2
  12. package/dist/browser/modules/pdf/index.js +21 -3
  13. package/dist/browser/modules/pdf/reader/annotation-extractor.d.ts +63 -0
  14. package/dist/browser/modules/pdf/reader/annotation-extractor.js +155 -0
  15. package/dist/browser/modules/pdf/reader/cmap-parser.d.ts +70 -0
  16. package/dist/browser/modules/pdf/reader/cmap-parser.js +321 -0
  17. package/dist/browser/modules/pdf/reader/content-interpreter.d.ts +57 -0
  18. package/dist/browser/modules/pdf/reader/content-interpreter.js +715 -0
  19. package/dist/browser/modules/pdf/reader/font-decoder.d.ts +58 -0
  20. package/dist/browser/modules/pdf/reader/font-decoder.js +1513 -0
  21. package/dist/browser/modules/pdf/reader/form-extractor.d.ts +48 -0
  22. package/dist/browser/modules/pdf/reader/form-extractor.js +355 -0
  23. package/dist/browser/modules/pdf/reader/image-extractor.d.ts +55 -0
  24. package/dist/browser/modules/pdf/reader/image-extractor.js +220 -0
  25. package/dist/browser/modules/pdf/reader/metadata-reader.d.ts +56 -0
  26. package/dist/browser/modules/pdf/reader/metadata-reader.js +275 -0
  27. package/dist/browser/modules/pdf/reader/pdf-decrypt.d.ts +26 -0
  28. package/dist/browser/modules/pdf/reader/pdf-decrypt.js +443 -0
  29. package/dist/browser/modules/pdf/reader/pdf-document.d.ts +191 -0
  30. package/dist/browser/modules/pdf/reader/pdf-document.js +818 -0
  31. package/dist/browser/modules/pdf/reader/pdf-parser.d.ts +65 -0
  32. package/dist/browser/modules/pdf/reader/pdf-parser.js +285 -0
  33. package/dist/browser/modules/pdf/reader/pdf-reader.d.ts +143 -0
  34. package/dist/browser/modules/pdf/reader/pdf-reader.js +200 -0
  35. package/dist/browser/modules/pdf/reader/pdf-tokenizer.d.ts +101 -0
  36. package/dist/browser/modules/pdf/reader/pdf-tokenizer.js +543 -0
  37. package/dist/browser/modules/pdf/reader/reader-utils.d.ts +15 -0
  38. package/dist/browser/modules/pdf/reader/reader-utils.js +27 -0
  39. package/dist/browser/modules/pdf/reader/stream-filters.d.ts +20 -0
  40. package/dist/browser/modules/pdf/reader/stream-filters.js +456 -0
  41. package/dist/browser/modules/pdf/reader/text-reconstruction.d.ts +44 -0
  42. package/dist/browser/modules/pdf/reader/text-reconstruction.js +463 -0
  43. package/dist/cjs/modules/archive/zip/stream.js +53 -0
  44. package/dist/cjs/modules/pdf/core/crypto.js +649 -0
  45. package/dist/cjs/modules/pdf/core/encryption.js +88 -263
  46. package/dist/cjs/modules/pdf/core/pdf-writer.js +19 -10
  47. package/dist/cjs/modules/pdf/index.js +23 -4
  48. package/dist/cjs/modules/pdf/reader/annotation-extractor.js +158 -0
  49. package/dist/cjs/modules/pdf/reader/cmap-parser.js +326 -0
  50. package/dist/cjs/modules/pdf/reader/content-interpreter.js +718 -0
  51. package/dist/cjs/modules/pdf/reader/font-decoder.js +1518 -0
  52. package/dist/cjs/modules/pdf/reader/form-extractor.js +358 -0
  53. package/dist/cjs/modules/pdf/reader/image-extractor.js +223 -0
  54. package/dist/cjs/modules/pdf/reader/metadata-reader.js +278 -0
  55. package/dist/cjs/modules/pdf/reader/pdf-decrypt.js +447 -0
  56. package/dist/cjs/modules/pdf/reader/pdf-document.js +822 -0
  57. package/dist/cjs/modules/pdf/reader/pdf-parser.js +301 -0
  58. package/dist/cjs/modules/pdf/reader/pdf-reader.js +203 -0
  59. package/dist/cjs/modules/pdf/reader/pdf-tokenizer.js +517 -0
  60. package/dist/cjs/modules/pdf/reader/reader-utils.js +30 -0
  61. package/dist/cjs/modules/pdf/reader/stream-filters.js +459 -0
  62. package/dist/cjs/modules/pdf/reader/text-reconstruction.js +467 -0
  63. package/dist/esm/modules/archive/zip/stream.js +53 -0
  64. package/dist/esm/modules/pdf/core/crypto.js +637 -0
  65. package/dist/esm/modules/pdf/core/encryption.js +88 -261
  66. package/dist/esm/modules/pdf/core/pdf-writer.js +19 -10
  67. package/dist/esm/modules/pdf/index.js +21 -3
  68. package/dist/esm/modules/pdf/reader/annotation-extractor.js +155 -0
  69. package/dist/esm/modules/pdf/reader/cmap-parser.js +321 -0
  70. package/dist/esm/modules/pdf/reader/content-interpreter.js +715 -0
  71. package/dist/esm/modules/pdf/reader/font-decoder.js +1513 -0
  72. package/dist/esm/modules/pdf/reader/form-extractor.js +355 -0
  73. package/dist/esm/modules/pdf/reader/image-extractor.js +220 -0
  74. package/dist/esm/modules/pdf/reader/metadata-reader.js +275 -0
  75. package/dist/esm/modules/pdf/reader/pdf-decrypt.js +443 -0
  76. package/dist/esm/modules/pdf/reader/pdf-document.js +818 -0
  77. package/dist/esm/modules/pdf/reader/pdf-parser.js +285 -0
  78. package/dist/esm/modules/pdf/reader/pdf-reader.js +200 -0
  79. package/dist/esm/modules/pdf/reader/pdf-tokenizer.js +543 -0
  80. package/dist/esm/modules/pdf/reader/reader-utils.js +27 -0
  81. package/dist/esm/modules/pdf/reader/stream-filters.js +456 -0
  82. package/dist/esm/modules/pdf/reader/text-reconstruction.js +463 -0
  83. package/dist/iife/excelts.iife.js +703 -267
  84. package/dist/iife/excelts.iife.js.map +1 -1
  85. package/dist/iife/excelts.iife.min.js +35 -35
  86. package/dist/types/modules/archive/zip/stream.d.ts +4 -0
  87. package/dist/types/modules/pdf/core/crypto.d.ts +65 -0
  88. package/dist/types/modules/pdf/core/encryption.d.ts +23 -20
  89. package/dist/types/modules/pdf/core/pdf-writer.d.ts +6 -4
  90. package/dist/types/modules/pdf/index.d.ts +23 -2
  91. package/dist/types/modules/pdf/reader/annotation-extractor.d.ts +63 -0
  92. package/dist/types/modules/pdf/reader/cmap-parser.d.ts +70 -0
  93. package/dist/types/modules/pdf/reader/content-interpreter.d.ts +57 -0
  94. package/dist/types/modules/pdf/reader/font-decoder.d.ts +58 -0
  95. package/dist/types/modules/pdf/reader/form-extractor.d.ts +48 -0
  96. package/dist/types/modules/pdf/reader/image-extractor.d.ts +55 -0
  97. package/dist/types/modules/pdf/reader/metadata-reader.d.ts +56 -0
  98. package/dist/types/modules/pdf/reader/pdf-decrypt.d.ts +26 -0
  99. package/dist/types/modules/pdf/reader/pdf-document.d.ts +191 -0
  100. package/dist/types/modules/pdf/reader/pdf-parser.d.ts +65 -0
  101. package/dist/types/modules/pdf/reader/pdf-reader.d.ts +143 -0
  102. package/dist/types/modules/pdf/reader/pdf-tokenizer.d.ts +101 -0
  103. package/dist/types/modules/pdf/reader/reader-utils.d.ts +15 -0
  104. package/dist/types/modules/pdf/reader/stream-filters.d.ts +20 -0
  105. package/dist/types/modules/pdf/reader/text-reconstruction.d.ts +44 -0
  106. package/package.json +1 -1
@@ -0,0 +1,63 @@
1
+ /**
2
+ * PDF annotation extractor.
3
+ *
4
+ * Extracts annotations from a PDF page's `/Annots` array.
5
+ * Supports all standard annotation subtypes defined in PDF Reference 1.7, §12.5.
6
+ *
7
+ * Common annotation types:
8
+ * - **Link** — Hyperlinks (URI, GoTo, GoToR)
9
+ * - **Text** — Sticky notes / comments
10
+ * - **FreeText** — Inline text annotations
11
+ * - **Highlight / Underline / StrikeOut / Squiggly** — Text markup
12
+ * - **Stamp** — Rubber stamp annotations
13
+ * - **Popup** — Associated popup windows (skipped by the extractor; auxiliary to a parent annotation)
14
+ * - **Widget** — Form field widgets (handled separately by form-extractor)
15
+ *
16
+ * @see PDF Reference 1.7, §12.5 - Annotations
17
+ */
18
+ import type { PdfDocument } from "./pdf-document.js";
19
+ import type { PdfDictValue } from "./pdf-parser.js";
/**
 * A rectangle in PDF coordinate space, stored as the four numbers
 * `[x1, y1, x2, y2]`.
 */
export interface PdfRect {
    /** x1 — left edge */
    x1: number;
    /** y1 — bottom edge */
    y1: number;
    /** x2 — right edge */
    x2: number;
    /** y2 — top edge */
    y2: number;
}
/**
 * One annotation read from a page's `/Annots` array.
 */
export interface PdfAnnotation {
    /** Annotation subtype (e.g. "Link", "Text", "Highlight", "FreeText", "Stamp") */
    subtype: string;
    /** Bounding rectangle in page coordinates (points) */
    rect: PdfRect;
    /** Text content (/Contents entry) */
    contents: string;
    /** Author / title (/T entry) */
    author: string;
    /** Subject (/Subj entry) */
    subject: string;
    /** Modification date (/M entry) — raw PDF date string, not parsed */
    modifiedDate: string;
    /**
     * For Link annotations: the target of a URI action (/URI entry) or of a
     * GoToR action (/F entry). Empty string when neither applies.
     */
    uri: string;
    /**
     * For Link annotations: the named destination, taken from a GoTo action's
     * /D entry or from the annotation's own /Dest entry. Empty string otherwise.
     */
    destination: string;
    /** Annotation flags (/F entry); 0 when the entry is absent */
    flags: number;
    /**
     * Color (/C entry). The number of components selects the color space:
     * 0 = no color, 1 = gray, 3 = RGB, 4 = CMYK; each component is in [0,1].
     * Non-numeric components are reported as 0.
     */
    color: number[];
}
/**
 * Extract the annotations listed in a page's `/Annots` array.
 *
 * Not every entry is returned:
 * - Widget annotations (form fields) are skipped — the form extractor handles them.
 * - Popup annotations are skipped — they are auxiliary to another annotation.
 * - Entries that fail to parse, or that have no usable `/Rect`, are omitted.
 *
 * @param pageDict - The page dictionary
 * @param doc - The PDF document, used to resolve indirect references
 * @returns The extracted annotations; empty when the page has no `/Annots` array
 */
export declare function extractAnnotationsFromPage(pageDict: PdfDictValue, doc: PdfDocument): PdfAnnotation[];
@@ -0,0 +1,155 @@
1
+ /**
2
+ * PDF annotation extractor.
3
+ *
4
+ * Extracts annotations from a PDF page's `/Annots` array.
5
+ * Supports all standard annotation subtypes defined in PDF Reference 1.7, §12.5.
6
+ *
7
+ * Common annotation types:
8
+ * - **Link** — Hyperlinks (URI, GoTo, GoToR)
9
+ * - **Text** — Sticky notes / comments
10
+ * - **FreeText** — Inline text annotations
11
+ * - **Highlight / Underline / StrikeOut / Squiggly** — Text markup
12
+ * - **Stamp** — Rubber stamp annotations
13
+ * - **Popup** — Associated popup windows (skipped by the extractor; auxiliary to a parent annotation)
14
+ * - **Widget** — Form field widgets (handled separately by form-extractor)
15
+ *
16
+ * @see PDF Reference 1.7, §12.5 - Annotations
17
+ */
18
+ import { isPdfArray, dictGetName, dictGetNumber, decodePdfStringBytes } from "./pdf-parser.js";
19
+ import { getDictStringValue } from "./reader-utils.js";
20
+ // =============================================================================
21
+ // Public API
22
+ // =============================================================================
23
+ /**
24
+ * Extract annotations from a PDF page.
25
+ *
26
+ * Skips Widget annotations (form fields) — those are handled by the form extractor.
27
+ *
28
+ * @param pageDict - The page dictionary
29
+ * @param doc - The PDF document for resolving references
30
+ * @returns Array of extracted annotations
31
+ */
32
+ export function extractAnnotationsFromPage(pageDict, doc) {
33
+ const annotsObj = pageDict.get("Annots");
34
+ if (!annotsObj) {
35
+ return [];
36
+ }
37
+ // Resolve the Annots array (may be an indirect reference)
38
+ const annotsResolved = doc.deref(annotsObj);
39
+ if (!isPdfArray(annotsResolved)) {
40
+ return [];
41
+ }
42
+ const annotations = [];
43
+ for (const annotRef of annotsResolved) {
44
+ try {
45
+ const annotDict = doc.derefDict(annotRef);
46
+ if (!annotDict) {
47
+ continue;
48
+ }
49
+ const subtype = dictGetName(annotDict, "Subtype") ?? "";
50
+ // Skip Widget annotations — handled by form-extractor
51
+ if (subtype === "Widget") {
52
+ continue;
53
+ }
54
+ // Skip Popup annotations — they are auxiliary
55
+ if (subtype === "Popup") {
56
+ continue;
57
+ }
58
+ const annotation = parseAnnotation(annotDict, subtype, doc);
59
+ if (annotation) {
60
+ annotations.push(annotation);
61
+ }
62
+ }
63
+ catch {
64
+ // Skip malformed annotations
65
+ }
66
+ }
67
+ return annotations;
68
+ }
69
+ // =============================================================================
70
+ // Parsing
71
+ // =============================================================================
72
+ function parseAnnotation(dict, subtype, doc) {
73
+ const rect = parseRect(dict.get("Rect"), doc);
74
+ if (!rect) {
75
+ return null;
76
+ }
77
+ const contents = getDictStringValue(dict, "Contents", doc);
78
+ const author = getDictStringValue(dict, "T", doc);
79
+ const subject = getDictStringValue(dict, "Subj", doc);
80
+ const modifiedDate = getDictStringValue(dict, "M", doc);
81
+ const flags = dictGetNumber(dict, "F") ?? 0;
82
+ const color = parseColorArray(dict.get("C"), doc);
83
+ // Extract link-specific fields
84
+ let uri = "";
85
+ let destination = "";
86
+ if (subtype === "Link") {
87
+ const actionObj = doc.derefDict(dict.get("A"));
88
+ if (actionObj) {
89
+ const actionType = dictGetName(actionObj, "S");
90
+ if (actionType === "URI") {
91
+ uri = getDictStringValue(actionObj, "URI", doc);
92
+ }
93
+ else if (actionType === "GoTo") {
94
+ const dest = actionObj.get("D");
95
+ if (typeof dest === "string") {
96
+ destination = dest;
97
+ }
98
+ }
99
+ else if (actionType === "GoToR") {
100
+ uri = getDictStringValue(actionObj, "F", doc);
101
+ }
102
+ }
103
+ // Check /Dest directly (older PDFs use this instead of /A)
104
+ if (!uri && !destination) {
105
+ const destObj = dict.get("Dest");
106
+ if (destObj) {
107
+ const resolved = doc.deref(destObj);
108
+ if (typeof resolved === "string") {
109
+ destination = resolved;
110
+ }
111
+ else if (resolved instanceof Uint8Array) {
112
+ destination = decodePdfStringBytes(resolved);
113
+ }
114
+ }
115
+ }
116
+ }
117
+ return {
118
+ subtype,
119
+ rect,
120
+ contents,
121
+ author,
122
+ subject,
123
+ modifiedDate,
124
+ uri,
125
+ destination,
126
+ flags,
127
+ color
128
+ };
129
+ }
130
+ function parseRect(obj, doc) {
131
+ if (!obj) {
132
+ return null;
133
+ }
134
+ const resolved = doc.deref(obj);
135
+ if (!isPdfArray(resolved) || resolved.length < 4) {
136
+ return null;
137
+ }
138
+ const nums = resolved.map(v => (typeof v === "number" ? v : 0));
139
+ return {
140
+ x1: nums[0],
141
+ y1: nums[1],
142
+ x2: nums[2],
143
+ y2: nums[3]
144
+ };
145
+ }
146
+ function parseColorArray(obj, doc) {
147
+ if (!obj) {
148
+ return [];
149
+ }
150
+ const resolved = doc.deref(obj);
151
+ if (!isPdfArray(resolved)) {
152
+ return [];
153
+ }
154
+ return resolved.map(v => (typeof v === "number" ? v : 0));
155
+ }
@@ -0,0 +1,70 @@
1
+ /**
2
+ * CMap parser for PDF text extraction.
3
+ *
4
+ * Parses /ToUnicode CMap programs to build character code → Unicode mappings.
5
+ * This is essential for extracting text from PDFs that use CIDFonts or
6
+ * custom encodings.
7
+ *
8
+ * Supports:
9
+ * - beginbfchar / endbfchar (single character mappings)
10
+ * - beginbfrange / endbfrange (range mappings, including array form)
11
+ * - begincodespacerange / endcodespacerange
12
+ * - Multi-byte character codes (1-4 bytes)
13
+ * - UTF-16BE encoded target strings (including surrogate pairs)
14
+ *
15
+ * @see PDF Reference 1.7, §5.9 - ToUnicode CMaps
16
+ * @see Adobe Technical Note #5411 - CMap Resources
17
+ */
/**
 * Character-code → Unicode table built from a parsed CMap program.
 */
export declare class CMap {
    private codeSpaceRanges;
    private bfChars;
    private bfRanges;
    /** Largest code width, in bytes, among the codespace ranges added so far (1 until one is added) */
    bytesPerCode: number;
    constructor();
    /**
     * Resolve a character code to its Unicode string.
     * Single-code (bfchar) entries are consulted first; otherwise the sorted
     * bfrange list is binary-searched. Returns undefined when nothing matches.
     */
    lookup(code: number): string | undefined;
    /**
     * Register a codespace range whose codes are `bytes` bytes wide.
     */
    addCodeSpaceRange(low: number, high: number, bytes: number): void;
    /**
     * Register a single code → string mapping.
     */
    addBfChar(code: number, unicode: string): void;
    /**
     * Register a range mapping, given either as a base string or as one
     * string per code in the range.
     */
    addBfRange(low: number, high: number, mapping: string | string[]): void;
    /**
     * Order the registered ranges by their low bound so that lookup can
     * binary-search them. Call once all ranges have been added.
     */
    sortRanges(): void;
    /**
     * Work out how many bytes a code starting with `firstByte` occupies,
     * according to the codespace ranges. If several ranges accept the byte
     * (for instance a 1-byte range 0x00-0xFF and a 2-byte range with an
     * overlapping lead byte), the longest one wins, following the PDF spec's
     * greedy matching rule. With no matching range, bytesPerCode is returned.
     */
    getCodeLength(firstByte: number): number;
    /**
     * True when no bfchar and no bfrange mapping has been added.
     */
    get isEmpty(): boolean;
    /**
     * True when at least one codespace range has been added.
     */
    get hasCodeSpaceRanges(): boolean;
}
/**
 * Build a CMap from the bytes of a CMap program, usually the contents of a
 * /ToUnicode stream.
 */
export declare function parseCMap(data: Uint8Array): CMap;
@@ -0,0 +1,321 @@
1
+ /**
2
+ * CMap parser for PDF text extraction.
3
+ *
4
+ * Parses /ToUnicode CMap programs to build character code → Unicode mappings.
5
+ * This is essential for extracting text from PDFs that use CIDFonts or
6
+ * custom encodings.
7
+ *
8
+ * Supports:
9
+ * - beginbfchar / endbfchar (single character mappings)
10
+ * - beginbfrange / endbfrange (range mappings, including array form)
11
+ * - begincodespacerange / endcodespacerange
12
+ * - Multi-byte character codes (1-4 bytes)
13
+ * - UTF-16BE encoded target strings (including surrogate pairs)
14
+ *
15
+ * @see PDF Reference 1.7, §5.9 - ToUnicode CMaps
16
+ * @see Adobe Technical Note #5411 - CMap Resources
17
+ */
18
+ import { PdfTokenizer, TokenType } from "./pdf-tokenizer.js";
19
+ // =============================================================================
20
+ // Public API
21
+ // =============================================================================
22
+ /**
23
+ * A parsed CMap that maps character codes to Unicode strings.
24
+ */
25
+ export class CMap {
26
+ constructor() {
27
+ this.codeSpaceRanges = [];
28
+ this.bfChars = new Map();
29
+ this.bfRanges = [];
30
+ this.bytesPerCode = 1;
31
+ }
32
+ /**
33
+ * Look up the Unicode string for a character code.
34
+ * Uses binary search over sorted bfRanges for efficient lookup.
35
+ */
36
+ lookup(code) {
37
+ // Check bfchar mappings first (exact match)
38
+ const charMapping = this.bfChars.get(code);
39
+ if (charMapping !== undefined) {
40
+ return charMapping;
41
+ }
42
+ // Check bfrange mappings using binary search
43
+ const ranges = this.bfRanges;
44
+ let lo = 0;
45
+ let hi = ranges.length - 1;
46
+ while (lo <= hi) {
47
+ const mid = (lo + hi) >>> 1;
48
+ const range = ranges[mid];
49
+ if (code < range.low) {
50
+ hi = mid - 1;
51
+ }
52
+ else if (code > range.high) {
53
+ lo = mid + 1;
54
+ }
55
+ else {
56
+ // code is within this range
57
+ if (typeof range.mapping === "string") {
58
+ // Single base string — offset the code point
59
+ const offset = code - range.low;
60
+ const baseCode = stringToCodePoint(range.mapping);
61
+ return String.fromCodePoint(baseCode + offset);
62
+ }
63
+ // Array mapping
64
+ const index = code - range.low;
65
+ if (index < range.mapping.length) {
66
+ return range.mapping[index];
67
+ }
68
+ return undefined;
69
+ }
70
+ }
71
+ return undefined;
72
+ }
73
+ /**
74
+ * Add a code space range.
75
+ */
76
+ addCodeSpaceRange(low, high, bytes) {
77
+ this.codeSpaceRanges.push({ low, high, bytes });
78
+ if (bytes > this.bytesPerCode) {
79
+ this.bytesPerCode = bytes;
80
+ }
81
+ }
82
+ /**
83
+ * Add a bfchar mapping.
84
+ */
85
+ addBfChar(code, unicode) {
86
+ this.bfChars.set(code, unicode);
87
+ }
88
+ /**
89
+ * Add a bfrange mapping.
90
+ */
91
+ addBfRange(low, high, mapping) {
92
+ this.bfRanges.push({ low, high, mapping });
93
+ }
94
+ /**
95
+ * Sort bfRanges by low value for binary search.
96
+ * Should be called after all ranges have been added.
97
+ */
98
+ sortRanges() {
99
+ this.bfRanges.sort((a, b) => a.low - b.low);
100
+ }
101
+ /**
102
+ * Determine the code length (in bytes) for a given first byte,
103
+ * using the codespace ranges. When multiple ranges match (e.g. a 1-byte
104
+ * range covering 0x00-0xFF and a 2-byte range whose first byte overlaps),
105
+ * returns the longest match per the PDF spec's greedy matching rule.
106
+ * Falls back to bytesPerCode if no range matches.
107
+ */
108
+ getCodeLength(firstByte) {
109
+ let bestLen = 0;
110
+ for (const range of this.codeSpaceRanges) {
111
+ if (range.bytes === 1) {
112
+ if (firstByte >= (range.low & 0xff) && firstByte <= (range.high & 0xff)) {
113
+ if (bestLen < 1) {
114
+ bestLen = 1;
115
+ }
116
+ }
117
+ }
118
+ else if (range.bytes === 2) {
119
+ const highByteLow = (range.low >> 8) & 0xff;
120
+ const highByteHigh = (range.high >> 8) & 0xff;
121
+ if (firstByte >= highByteLow && firstByte <= highByteHigh) {
122
+ if (bestLen < 2) {
123
+ bestLen = 2;
124
+ }
125
+ }
126
+ }
127
+ else {
128
+ // For multi-byte ranges (3+ bytes), check the high byte
129
+ const hiLow = range.low >>> ((range.bytes - 1) * 8);
130
+ const hiHigh = range.high >>> ((range.bytes - 1) * 8);
131
+ if (firstByte >= hiLow && firstByte <= hiHigh) {
132
+ if (range.bytes > bestLen) {
133
+ bestLen = range.bytes;
134
+ }
135
+ }
136
+ }
137
+ }
138
+ return bestLen > 0 ? bestLen : this.bytesPerCode; // fallback
139
+ }
140
+ /**
141
+ * Check if this CMap has any mappings.
142
+ */
143
+ get isEmpty() {
144
+ return this.bfChars.size === 0 && this.bfRanges.length === 0;
145
+ }
146
+ /**
147
+ * Check if this CMap has codespace ranges defined.
148
+ */
149
+ get hasCodeSpaceRanges() {
150
+ return this.codeSpaceRanges.length > 0;
151
+ }
152
+ }
153
+ // =============================================================================
154
+ // CMap Parser
155
+ // =============================================================================
/**
 * Tokenize a CMap program (typically a /ToUnicode stream) and collect its
 * codespace, bfchar and bfrange sections into a CMap. All other tokens are
 * ignored.
 */
export function parseCMap(data) {
    const result = new CMap();
    const lexer = new PdfTokenizer(data);
    for (let tok = lexer.next(); tok.type !== TokenType.EOF; tok = lexer.next()) {
        if (tok.type !== TokenType.Keyword) {
            continue;
        }
        // Each section parser consumes tokens up to its own end keyword.
        switch (tok.strValue) {
            case "begincodespacerange":
                parseCodeSpaceRange(lexer, result);
                break;
            case "beginbfchar":
                parseBfChar(lexer, result);
                break;
            case "beginbfrange":
                parseBfRange(lexer, result);
                break;
        }
    }
    // Ranges must be ordered before lookup can binary-search them
    result.sortRanges();
    return result;
}
184
+ /**
185
+ * Parse codespacerange section.
186
+ */
187
+ function parseCodeSpaceRange(tokenizer, cmap) {
188
+ while (true) {
189
+ const token = tokenizer.next();
190
+ if (token.type === TokenType.EOF) {
191
+ break;
192
+ }
193
+ if (token.type === TokenType.Keyword && token.strValue === "endcodespacerange") {
194
+ break;
195
+ }
196
+ // Expect two hex strings: low high
197
+ if (token.type === TokenType.HexString) {
198
+ const lowBytes = token.rawBytes;
199
+ const highToken = tokenizer.next();
200
+ if (highToken.type === TokenType.HexString) {
201
+ const highBytes = highToken.rawBytes;
202
+ const low = bytesToInt(lowBytes);
203
+ const high = bytesToInt(highBytes);
204
+ cmap.addCodeSpaceRange(low, high, lowBytes.length);
205
+ }
206
+ }
207
+ }
208
+ }
209
+ /**
210
+ * Parse bfchar section.
211
+ * Format: <srcCode> <dstString>
212
+ */
213
+ function parseBfChar(tokenizer, cmap) {
214
+ while (true) {
215
+ const token = tokenizer.next();
216
+ if (token.type === TokenType.EOF) {
217
+ break;
218
+ }
219
+ if (token.type === TokenType.Keyword && token.strValue === "endbfchar") {
220
+ break;
221
+ }
222
+ if (token.type === TokenType.HexString) {
223
+ const code = bytesToInt(token.rawBytes);
224
+ const target = tokenizer.next();
225
+ if (target.type === TokenType.HexString) {
226
+ const unicode = decodeUtf16BE(target.rawBytes);
227
+ cmap.addBfChar(code, unicode);
228
+ }
229
+ }
230
+ }
231
+ }
232
+ /**
233
+ * Parse bfrange section.
234
+ * Formats:
235
+ * <low> <high> <dstString> — sequential mapping
236
+ * <low> <high> [<str1> <str2> ...] — array mapping
237
+ */
238
+ function parseBfRange(tokenizer, cmap) {
239
+ while (true) {
240
+ const token = tokenizer.next();
241
+ if (token.type === TokenType.EOF) {
242
+ break;
243
+ }
244
+ if (token.type === TokenType.Keyword && token.strValue === "endbfrange") {
245
+ break;
246
+ }
247
+ if (token.type === TokenType.HexString) {
248
+ const low = bytesToInt(token.rawBytes);
249
+ const highToken = tokenizer.next();
250
+ if (highToken.type !== TokenType.HexString) {
251
+ continue;
252
+ }
253
+ const high = bytesToInt(highToken.rawBytes);
254
+ const mappingToken = tokenizer.next();
255
+ if (mappingToken.type === TokenType.HexString) {
256
+ // Sequential mapping from base string
257
+ const unicode = decodeUtf16BE(mappingToken.rawBytes);
258
+ cmap.addBfRange(low, high, unicode);
259
+ }
260
+ else if (mappingToken.type === TokenType.ArrayBegin) {
261
+ // Array of individual mappings
262
+ const mappings = [];
263
+ while (true) {
264
+ const elem = tokenizer.next();
265
+ if (elem.type === TokenType.ArrayEnd || elem.type === TokenType.EOF) {
266
+ break;
267
+ }
268
+ if (elem.type === TokenType.HexString) {
269
+ mappings.push(decodeUtf16BE(elem.rawBytes));
270
+ }
271
+ }
272
+ cmap.addBfRange(low, high, mappings);
273
+ }
274
+ }
275
+ }
276
+ }
277
+ // =============================================================================
278
+ // Helpers
279
+ // =============================================================================
/**
 * Interpret a byte sequence as an unsigned big-endian integer.
 * Accumulates with multiply-and-add rather than bit shifts, because shifts
 * work on 32-bit values and would overflow for large codes.
 */
function bytesToInt(bytes) {
    let value = 0;
    let index = 0;
    while (index < bytes.length) {
        value = value * 256 + bytes[index];
        index += 1;
    }
    return value;
}
/**
 * Turn UTF-16BE bytes into a JavaScript string.
 *
 * A lone single byte is taken as a direct character code. A high surrogate
 * followed by a low surrogate is combined into one code point; any other
 * 16-bit unit is emitted as is. A trailing odd byte (when more than one byte
 * is present) is ignored.
 */
function decodeUtf16BE(bytes) {
    // Single-byte code: treat as direct character code
    if (bytes.length === 1) {
        return String.fromCharCode(bytes[0]);
    }
    const pieces = [];
    let pos = 0;
    while (pos + 1 < bytes.length) {
        const unit = (bytes[pos] << 8) | bytes[pos + 1];
        const isHighSurrogate = unit >= 0xd800 && unit <= 0xdbff;
        if (isHighSurrogate && pos + 3 < bytes.length) {
            const next = (bytes[pos + 2] << 8) | bytes[pos + 3];
            if (next >= 0xdc00 && next <= 0xdfff) {
                // Valid surrogate pair: consume both units at once
                pieces.push(String.fromCodePoint(0x10000 + ((unit - 0xd800) << 10) + (next - 0xdc00)));
                pos += 4;
                continue;
            }
        }
        pieces.push(String.fromCharCode(unit));
        pos += 2;
    }
    return pieces.join("");
}
/**
 * Return the code point at the start of a string, or 0 for an empty string.
 */
function stringToCodePoint(str) {
    const first = str.codePointAt(0);
    if (first === undefined) {
        return 0;
    }
    return first;
}
@@ -0,0 +1,57 @@
1
+ /**
2
+ * PDF content stream interpreter for text extraction.
3
+ *
4
+ * Implements a full PDF graphics state machine that processes content stream
5
+ * operators to extract positioned text fragments. These fragments are then
6
+ * assembled into readable text by the text reconstruction module.
7
+ *
8
+ * Supported operator categories:
9
+ * - Text state: Tf, Tc, Tw, Tz, TL, Ts, Tr
10
+ * - Text positioning: Td, TD, Tm, T*
11
+ * - Text showing: Tj, TJ, ', "
12
+ * - Text objects: BT, ET
13
+ * - Graphics state: q, Q, cm, gs, i, M, ri, W, W*
14
+ * - Color: CS, cs, SC, sc, SCN, scn
15
+ * - Marked content: BDC, BMC, EMC, MP, DP
16
+ * - Type3 glyph: d0, d1
17
+ * - Shading: sh
18
+ * - Inline images: BI/ID/EI
19
+ * - XObject invocation: Do (for form XObjects containing text)
20
+ *
21
+ * @see PDF Reference 1.7, Chapter 5 - Text
22
+ * @see PDF Reference 1.7, Chapter 4 - Graphics
23
+ */
24
+ import type { PdfDocument } from "./pdf-document.js";
25
+ import type { PdfDictValue } from "./pdf-parser.js";
/**
 * One run of text taken from a PDF page, together with where it sits on the
 * page (in page coordinates) and the text state it was shown with.
 */
export interface TextFragment {
    /** The text of this fragment */
    text: string;
    /** Horizontal position in page coordinates (points; origin at bottom-left) */
    x: number;
    /** Vertical position in page coordinates */
    y: number;
    /** Size of the font, in points */
    fontSize: number;
    /** Name of the font */
    fontName: string;
    /** How wide the text is, in points */
    width: number;
    /** Spacing applied between characters */
    charSpacing: number;
    /** Spacing applied between words */
    wordSpacing: number;
    /** Horizontal scaling factor, where 100 means unscaled */
    horizontalScaling: number;
    /** True for vertical text (WMode=1) */
    isVertical: boolean;
    /** True for right-to-left text (Arabic, Hebrew, etc.) */
    isRtl: boolean;
}
/**
 * Read a page's content stream(s) and return the text fragments found there.
 */
export declare function extractTextFromPage(pageDict: PdfDictValue, doc: PdfDocument): TextFragment[];