@cj-tech-master/excelts 8.0.0 → 8.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (106)
  1. package/README.md +14 -1
  2. package/README_zh.md +6 -0
  3. package/dist/browser/modules/archive/zip/stream.d.ts +4 -0
  4. package/dist/browser/modules/archive/zip/stream.js +53 -0
  5. package/dist/browser/modules/pdf/core/crypto.d.ts +65 -0
  6. package/dist/browser/modules/pdf/core/crypto.js +637 -0
  7. package/dist/browser/modules/pdf/core/encryption.d.ts +23 -20
  8. package/dist/browser/modules/pdf/core/encryption.js +88 -261
  9. package/dist/browser/modules/pdf/core/pdf-writer.d.ts +6 -4
  10. package/dist/browser/modules/pdf/core/pdf-writer.js +19 -10
  11. package/dist/browser/modules/pdf/index.d.ts +23 -2
  12. package/dist/browser/modules/pdf/index.js +21 -3
  13. package/dist/browser/modules/pdf/reader/annotation-extractor.d.ts +63 -0
  14. package/dist/browser/modules/pdf/reader/annotation-extractor.js +155 -0
  15. package/dist/browser/modules/pdf/reader/cmap-parser.d.ts +70 -0
  16. package/dist/browser/modules/pdf/reader/cmap-parser.js +321 -0
  17. package/dist/browser/modules/pdf/reader/content-interpreter.d.ts +57 -0
  18. package/dist/browser/modules/pdf/reader/content-interpreter.js +715 -0
  19. package/dist/browser/modules/pdf/reader/font-decoder.d.ts +58 -0
  20. package/dist/browser/modules/pdf/reader/font-decoder.js +1513 -0
  21. package/dist/browser/modules/pdf/reader/form-extractor.d.ts +48 -0
  22. package/dist/browser/modules/pdf/reader/form-extractor.js +355 -0
  23. package/dist/browser/modules/pdf/reader/image-extractor.d.ts +55 -0
  24. package/dist/browser/modules/pdf/reader/image-extractor.js +220 -0
  25. package/dist/browser/modules/pdf/reader/metadata-reader.d.ts +56 -0
  26. package/dist/browser/modules/pdf/reader/metadata-reader.js +275 -0
  27. package/dist/browser/modules/pdf/reader/pdf-decrypt.d.ts +26 -0
  28. package/dist/browser/modules/pdf/reader/pdf-decrypt.js +443 -0
  29. package/dist/browser/modules/pdf/reader/pdf-document.d.ts +191 -0
  30. package/dist/browser/modules/pdf/reader/pdf-document.js +818 -0
  31. package/dist/browser/modules/pdf/reader/pdf-parser.d.ts +65 -0
  32. package/dist/browser/modules/pdf/reader/pdf-parser.js +285 -0
  33. package/dist/browser/modules/pdf/reader/pdf-reader.d.ts +143 -0
  34. package/dist/browser/modules/pdf/reader/pdf-reader.js +200 -0
  35. package/dist/browser/modules/pdf/reader/pdf-tokenizer.d.ts +101 -0
  36. package/dist/browser/modules/pdf/reader/pdf-tokenizer.js +543 -0
  37. package/dist/browser/modules/pdf/reader/reader-utils.d.ts +15 -0
  38. package/dist/browser/modules/pdf/reader/reader-utils.js +27 -0
  39. package/dist/browser/modules/pdf/reader/stream-filters.d.ts +20 -0
  40. package/dist/browser/modules/pdf/reader/stream-filters.js +456 -0
  41. package/dist/browser/modules/pdf/reader/text-reconstruction.d.ts +44 -0
  42. package/dist/browser/modules/pdf/reader/text-reconstruction.js +463 -0
  43. package/dist/cjs/modules/archive/zip/stream.js +53 -0
  44. package/dist/cjs/modules/pdf/core/crypto.js +649 -0
  45. package/dist/cjs/modules/pdf/core/encryption.js +88 -263
  46. package/dist/cjs/modules/pdf/core/pdf-writer.js +19 -10
  47. package/dist/cjs/modules/pdf/index.js +23 -4
  48. package/dist/cjs/modules/pdf/reader/annotation-extractor.js +158 -0
  49. package/dist/cjs/modules/pdf/reader/cmap-parser.js +326 -0
  50. package/dist/cjs/modules/pdf/reader/content-interpreter.js +718 -0
  51. package/dist/cjs/modules/pdf/reader/font-decoder.js +1518 -0
  52. package/dist/cjs/modules/pdf/reader/form-extractor.js +358 -0
  53. package/dist/cjs/modules/pdf/reader/image-extractor.js +223 -0
  54. package/dist/cjs/modules/pdf/reader/metadata-reader.js +278 -0
  55. package/dist/cjs/modules/pdf/reader/pdf-decrypt.js +447 -0
  56. package/dist/cjs/modules/pdf/reader/pdf-document.js +822 -0
  57. package/dist/cjs/modules/pdf/reader/pdf-parser.js +301 -0
  58. package/dist/cjs/modules/pdf/reader/pdf-reader.js +203 -0
  59. package/dist/cjs/modules/pdf/reader/pdf-tokenizer.js +517 -0
  60. package/dist/cjs/modules/pdf/reader/reader-utils.js +30 -0
  61. package/dist/cjs/modules/pdf/reader/stream-filters.js +459 -0
  62. package/dist/cjs/modules/pdf/reader/text-reconstruction.js +467 -0
  63. package/dist/esm/modules/archive/zip/stream.js +53 -0
  64. package/dist/esm/modules/pdf/core/crypto.js +637 -0
  65. package/dist/esm/modules/pdf/core/encryption.js +88 -261
  66. package/dist/esm/modules/pdf/core/pdf-writer.js +19 -10
  67. package/dist/esm/modules/pdf/index.js +21 -3
  68. package/dist/esm/modules/pdf/reader/annotation-extractor.js +155 -0
  69. package/dist/esm/modules/pdf/reader/cmap-parser.js +321 -0
  70. package/dist/esm/modules/pdf/reader/content-interpreter.js +715 -0
  71. package/dist/esm/modules/pdf/reader/font-decoder.js +1513 -0
  72. package/dist/esm/modules/pdf/reader/form-extractor.js +355 -0
  73. package/dist/esm/modules/pdf/reader/image-extractor.js +220 -0
  74. package/dist/esm/modules/pdf/reader/metadata-reader.js +275 -0
  75. package/dist/esm/modules/pdf/reader/pdf-decrypt.js +443 -0
  76. package/dist/esm/modules/pdf/reader/pdf-document.js +818 -0
  77. package/dist/esm/modules/pdf/reader/pdf-parser.js +285 -0
  78. package/dist/esm/modules/pdf/reader/pdf-reader.js +200 -0
  79. package/dist/esm/modules/pdf/reader/pdf-tokenizer.js +543 -0
  80. package/dist/esm/modules/pdf/reader/reader-utils.js +27 -0
  81. package/dist/esm/modules/pdf/reader/stream-filters.js +456 -0
  82. package/dist/esm/modules/pdf/reader/text-reconstruction.js +463 -0
  83. package/dist/iife/excelts.iife.js +703 -267
  84. package/dist/iife/excelts.iife.js.map +1 -1
  85. package/dist/iife/excelts.iife.min.js +35 -35
  86. package/dist/types/modules/archive/zip/stream.d.ts +4 -0
  87. package/dist/types/modules/pdf/core/crypto.d.ts +65 -0
  88. package/dist/types/modules/pdf/core/encryption.d.ts +23 -20
  89. package/dist/types/modules/pdf/core/pdf-writer.d.ts +6 -4
  90. package/dist/types/modules/pdf/index.d.ts +23 -2
  91. package/dist/types/modules/pdf/reader/annotation-extractor.d.ts +63 -0
  92. package/dist/types/modules/pdf/reader/cmap-parser.d.ts +70 -0
  93. package/dist/types/modules/pdf/reader/content-interpreter.d.ts +57 -0
  94. package/dist/types/modules/pdf/reader/font-decoder.d.ts +58 -0
  95. package/dist/types/modules/pdf/reader/form-extractor.d.ts +48 -0
  96. package/dist/types/modules/pdf/reader/image-extractor.d.ts +55 -0
  97. package/dist/types/modules/pdf/reader/metadata-reader.d.ts +56 -0
  98. package/dist/types/modules/pdf/reader/pdf-decrypt.d.ts +26 -0
  99. package/dist/types/modules/pdf/reader/pdf-document.d.ts +191 -0
  100. package/dist/types/modules/pdf/reader/pdf-parser.d.ts +65 -0
  101. package/dist/types/modules/pdf/reader/pdf-reader.d.ts +143 -0
  102. package/dist/types/modules/pdf/reader/pdf-tokenizer.d.ts +101 -0
  103. package/dist/types/modules/pdf/reader/reader-utils.d.ts +15 -0
  104. package/dist/types/modules/pdf/reader/stream-filters.d.ts +20 -0
  105. package/dist/types/modules/pdf/reader/text-reconstruction.d.ts +44 -0
  106. package/package.json +1 -1
@@ -0,0 +1,278 @@
1
+ "use strict";
2
+ /**
3
+ * PDF metadata reader.
4
+ *
5
+ * Extracts document metadata from:
6
+ * 1. Info Dictionary (traditional metadata)
7
+ * - Title, Author, Subject, Keywords, Creator, Producer
8
+ * - CreationDate, ModDate
9
+ *
10
+ * 2. XMP Metadata Stream (XML-based, more comprehensive)
11
+ * - All of the above plus:
12
+ * - Dublin Core metadata, custom properties
13
+ *
14
+ * @see PDF Reference 1.7, §10.2 - Metadata
15
+ * @see XMP Specification Part 1
16
+ */
17
+ Object.defineProperty(exports, "__esModule", { value: true });
18
+ exports.extractMetadata = extractMetadata;
19
+ const pdf_parser_1 = require("./pdf-parser");
20
+ // =============================================================================
21
+ // Public API
22
+ // =============================================================================
23
/**
 * Build the metadata record for a parsed PDF document.
 *
 * The record starts with empty defaults, is filled from the trailer's Info
 * dictionary, then from the catalog's XMP stream, and finally receives the
 * page count and the size of the first page.
 *
 * @param doc - Parsed document. This function reads `doc.trailer` and calls
 *   `doc.getPages()`; the helper extractors read further members.
 * @returns The populated metadata record.
 */
function extractMetadata(doc) {
    // Evaluated in the same order as the fields appear in the record.
    const pdfVersion = extractPdfVersion(doc);
    const encrypted = doc.trailer.has("Encrypt");
    const metadata = {
        title: "",
        author: "",
        subject: "",
        keywords: "",
        creator: "",
        producer: "",
        creationDate: null,
        modDate: null,
        pdfVersion,
        pageCount: 0,
        encrypted,
        pageSize: null,
        xmpXml: null,
        custom: {}
    };
    // Info dictionary first, then the XMP metadata stream.
    extractInfoDict(doc, metadata);
    extractXmpMetadata(doc, metadata);
    try {
        const pageList = doc.getPages();
        const { length: total } = pageList;
        metadata.pageCount = total;
        if (total > 0) {
            metadata.pageSize = extractPageSize(pageList[0], doc);
        }
    }
    catch {
        // Best effort: a broken page tree leaves pageCount at 0 and pageSize at null.
    }
    return metadata;
}
60
+ // =============================================================================
61
+ // PDF Version
62
+ // =============================================================================
63
/**
 * Determine the document's PDF version string.
 *
 * When `doc.data` begins with the bytes "%PDF-", the characters that follow
 * are returned, stopping at LF, CR, a space, or byte index 15, whichever
 * comes first. Otherwise the catalog's "Version" entry is returned if it is
 * a string. If neither source yields a value the result is "1.0".
 *
 * @param doc - Parsed document exposing `data` (indexable bytes) and `getCatalog()`.
 * @returns The version text, e.g. "1.7".
 */
function extractPdfVersion(doc) {
    const HEADER_MAGIC = [0x25, 0x50, 0x44, 0x46, 0x2d]; // "%PDF-"
    const STOP_BYTES = [0x0a, 0x0d, 0x20]; // LF, CR, space
    const bytes = doc.data;
    if (HEADER_MAGIC.every((expected, offset) => bytes[offset] === expected)) {
        const limit = Math.min(bytes.length, 15);
        const chars = [];
        for (let pos = HEADER_MAGIC.length; pos < limit && !STOP_BYTES.includes(bytes[pos]); pos++) {
            chars.push(String.fromCharCode(bytes[pos]));
        }
        return chars.join("");
    }
    // No header at the start of the data: fall back to the catalog entry.
    try {
        const declared = doc.getCatalog().get("Version");
        if (typeof declared === "string") {
            return declared;
        }
    }
    catch {
        // Catalog unavailable; use the default below.
    }
    return "1.0";
}
94
+ // =============================================================================
95
+ // Info Dictionary
96
+ // =============================================================================
97
/**
 * Copy the trailer's Info dictionary entries into `metadata`.
 *
 * Does nothing when the trailer has no "Info" reference or the reference
 * does not resolve to a dictionary. Every value is turned into text first:
 * `Uint8Array` values go through `decodePdfStringBytes`, anything else
 * through `String()` (null/undefined become ""). The eight standard keys
 * set their dedicated fields; every other key is stored in `metadata.custom`.
 *
 * @param doc - Parsed document exposing `trailer` and `derefDict()`.
 * @param metadata - Record to update in place.
 */
function extractInfoDict(doc, metadata) {
    const lookupRef = pdf_parser_1.dictGetRef;
    const ref = lookupRef(doc.trailer, "Info");
    const info = ref ? doc.derefDict(ref) : null;
    if (!info) {
        return;
    }
    // One setter per standard Info key; keys not listed here are custom.
    const setters = new Map([
        ["Title", (text) => { metadata.title = text; }],
        ["Author", (text) => { metadata.author = text; }],
        ["Subject", (text) => { metadata.subject = text; }],
        ["Keywords", (text) => { metadata.keywords = text; }],
        ["Creator", (text) => { metadata.creator = text; }],
        ["Producer", (text) => { metadata.producer = text; }],
        ["CreationDate", (text) => { metadata.creationDate = parsePdfDate(text); }],
        ["ModDate", (text) => { metadata.modDate = parsePdfDate(text); }]
    ]);
    for (const [name, raw] of info) {
        let text;
        if (raw instanceof Uint8Array) {
            const decodeBytes = pdf_parser_1.decodePdfStringBytes;
            text = decodeBytes(raw);
        }
        else {
            text = String(raw ?? "");
        }
        const assign = setters.get(name);
        if (assign) {
            assign(text);
        }
        else {
            metadata.custom[name] = text;
        }
    }
}
151
+ // =============================================================================
152
+ // XMP Metadata
153
+ // =============================================================================
154
/**
 * Read the catalog's XMP metadata stream and use it to fill gaps in `metadata`.
 *
 * The decoded packet text is always stored in `metadata.xmpXml`. Each of the
 * standard fields is taken from XMP only when it is still empty/unset, so
 * values already present (for example from the Info dictionary) win.
 *
 * Date properties are accepted only when they parse to a valid `Date`; an
 * unparseable value leaves the field as it was instead of storing an
 * "Invalid Date" object.
 *
 * Any error raised while locating, decoding or scanning the stream is
 * swallowed on purpose: XMP is optional and must never make metadata
 * extraction fail.
 *
 * @param doc - Parsed document exposing `getCatalog()`,
 *   `derefStreamWithObjNum()` and `getStreamData()`.
 * @param metadata - Record to update in place.
 */
function extractXmpMetadata(doc, metadata) {
    // metadata property -> XMP property, in the order they are filled.
    const textFields = [
        ["title", "dc:title"],
        ["author", "dc:creator"],
        ["subject", "dc:description"],
        ["keywords", "pdf:Keywords"],
        ["creator", "xmp:CreatorTool"],
        ["producer", "pdf:Producer"]
    ];
    const dateFields = [
        ["creationDate", "xmp:CreateDate"],
        ["modDate", "xmp:ModifyDate"]
    ];
    try {
        const catalog = doc.getCatalog();
        const metadataRef = catalog.get("Metadata");
        if (!metadataRef) {
            return;
        }
        const result = doc.derefStreamWithObjNum(metadataRef);
        if (!result) {
            return;
        }
        const data = doc.getStreamData(result.stream, result.objNum, result.gen);
        const xml = new TextDecoder("utf-8").decode(data);
        metadata.xmpXml = xml;
        for (const [prop, xmpName] of textFields) {
            if (!metadata[prop]) {
                metadata[prop] = extractXmpField(xml, xmpName) ?? "";
            }
        }
        for (const [prop, xmpName] of dateFields) {
            if (metadata[prop]) {
                continue;
            }
            const dateStr = extractXmpField(xml, xmpName);
            if (!dateStr) {
                continue;
            }
            const parsed = new Date(dateStr);
            // Reject unparseable text rather than exposing an Invalid Date.
            if (!Number.isNaN(parsed.getTime())) {
                metadata[prop] = parsed;
            }
        }
    }
    catch {
        // Best effort: ignore XMP errors.
    }
}
204
/**
 * Extract the value of one XMP property from the raw packet text.
 *
 * This is lightweight regex scanning, not a real XML parser. Three
 * serializations are recognised, tried in this order:
 *   1. Simple element:     <field>value</field>
 *   2. rdf:Alt/Bag/Seq:    <field>...<rdf:li>value</rdf:li>...</field>
 *                          (the first rdf:li that has text wins)
 *   3. Property attribute: <rdf:Description field="value" ... />
 *
 * The element search is confined to the text between the field's own opening
 * and closing tags, and self-closing tags are skipped, so an empty property
 * can never pick up the value of whichever property happens to follow it.
 *
 * @param xml - XMP packet as text.
 * @param field - Qualified property name, e.g. "dc:title".
 * @returns The trimmed, entity-decoded value, or null when no value is found.
 */
function extractXmpField(xml, field) {
    // Escape regex metacharacters so the name is always matched literally.
    const name = field.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
    // `(?:\s[^>]*?)?` keeps "dc:title" from matching "<dc:titles>";
    // `(?<!/)>` rejects self-closing tags, which have no content to read.
    const elementRegex = new RegExp(`<${name}(?:\\s[^>]*?)?(?<!/)>([\\s\\S]*?)</${name}\\s*>`, "i");
    const elementMatch = elementRegex.exec(xml);
    if (elementMatch) {
        const inner = elementMatch[1];
        if (!inner.includes("<")) {
            // Simple element: the content is the value itself.
            const text = inner.trim();
            if (text) {
                return decodeXmlEntities(text);
            }
        }
        else {
            // Array container: take the first list item that carries text.
            const itemMatch = /<rdf:li(?:\s[^>]*?)?(?<!\/)>([^<]+)<\/rdf:li\s*>/i.exec(inner);
            if (itemMatch) {
                return decodeXmlEntities(itemMatch[1].trim());
            }
        }
    }
    // Simple properties may also be written as attributes of rdf:Description.
    const attributeRegex = new RegExp(`\\s${name}\\s*=\\s*(?:"([^"]*)"|'([^']*)')`, "i");
    const attributeMatch = attributeRegex.exec(xml);
    if (attributeMatch) {
        const text = (attributeMatch[1] ?? attributeMatch[2]).trim();
        if (text) {
            return decodeXmlEntities(text);
        }
    }
    return null;
}
/**
 * Decode XML entity and character references in a text value.
 *
 * Handles the five predefined entities (&amp; &lt; &gt; &quot; &apos;) and
 * numeric references (&#65; / &#x41;). Decoding happens in a single pass, so
 * text produced by one replacement is never decoded again: "&amp;lt;" becomes
 * "&lt;", not "<". Unknown entities and numeric references above U+10FFFF
 * are left untouched.
 *
 * @param text - Text that may contain references.
 * @returns The decoded text.
 */
function decodeXmlEntities(text) {
    const named = { amp: "&", lt: "<", gt: ">", quot: '"', apos: "'" };
    return text.replace(/&(amp|lt|gt|quot|apos|#[0-9]+|#x[0-9a-fA-F]+);/g, (reference, body) => {
        if (!body.startsWith("#")) {
            return named[body];
        }
        const codePoint = body[1] === "x"
            ? Number.parseInt(body.slice(2), 16)
            : Number.parseInt(body.slice(1), 10);
        // String.fromCodePoint throws above U+10FFFF; keep such input verbatim.
        return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : reference;
    });
}
231
+ // =============================================================================
232
+ // Page Size
233
+ // =============================================================================
234
/**
 * Obtain the size information for a page.
 *
 * Delegates to `doc.resolvePageBox(pageDict)` and returns its result unchanged.
 *
 * @param pageDict - Page dictionary to measure.
 * @param doc - Parsed document that provides `resolvePageBox()`.
 * @returns Whatever `doc.resolvePageBox` returns for the page.
 */
function extractPageSize(pageDict, doc) {
    const box = doc.resolvePageBox(pageDict);
    return box;
}
237
+ // =============================================================================
238
+ // PDF Date Parsing
239
+ // =============================================================================
240
/**
 * Parse a PDF date string into a Date object.
 *
 * Expected format: D:YYYYMMDDHHmmSSOHH'mm, where O is "+", "-" or "Z".
 * The "D:" prefix is optional and trailing fields may be missing: month and
 * day default to 1, time fields to 0, and a missing zone means UTC.
 *
 * The apostrophe between the zone's hours and minutes is optional, so the
 * widely produced variant "+0530" is read as +05:30 rather than +05:00.
 *
 * @param dateStr - PDF date text.
 * @returns The corresponding instant, or null when the input is empty, has
 *   no numeric year, or does not yield a valid date.
 */
function parsePdfDate(dateStr) {
    if (!dateStr) {
        return null;
    }
    const s = dateStr.startsWith("D:") ? dateStr.substring(2) : dateStr;
    const year = Number.parseInt(s.substring(0, 4), 10);
    if (Number.isNaN(year)) {
        return null;
    }
    // Read a two-character numeric field; absent, non-numeric or zero -> fallback.
    const twoDigits = (start, fallback) => Number.parseInt(s.substring(start, start + 2), 10) || fallback;
    const month = twoDigits(4, 1);
    const day = twoDigits(6, 1);
    const hour = twoDigits(8, 0);
    const minute = twoDigits(10, 0);
    const second = twoDigits(12, 0);
    // Zone offset of local time relative to UTC, in minutes ("Z" or none -> 0).
    const tzChar = s.charAt(14);
    let offsetMinutes = 0;
    if (tzChar === "+" || tzChar === "-") {
        const tzHour = twoDigits(15, 0);
        // Skip the separator only when it is actually there.
        const tzMin = twoDigits(s.charAt(17) === "'" ? 18 : 17, 0);
        offsetMinutes = (tzHour * 60 + tzMin) * (tzChar === "-" ? -1 : 1);
    }
    // The fields are local time in the stated zone: UTC = local - offset.
    const localAsUtc = Date.UTC(year, month - 1, day, hour, minute, second);
    const date = new Date(localAsUtc - offsetMinutes * 60000);
    return Number.isNaN(date.getTime()) ? null : date;
}