@cj-tech-master/excelts 8.0.0 → 8.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (106)
  1. package/README.md +14 -1
  2. package/README_zh.md +6 -0
  3. package/dist/browser/modules/archive/zip/stream.d.ts +4 -0
  4. package/dist/browser/modules/archive/zip/stream.js +53 -0
  5. package/dist/browser/modules/pdf/core/crypto.d.ts +65 -0
  6. package/dist/browser/modules/pdf/core/crypto.js +637 -0
  7. package/dist/browser/modules/pdf/core/encryption.d.ts +23 -20
  8. package/dist/browser/modules/pdf/core/encryption.js +88 -261
  9. package/dist/browser/modules/pdf/core/pdf-writer.d.ts +6 -4
  10. package/dist/browser/modules/pdf/core/pdf-writer.js +19 -10
  11. package/dist/browser/modules/pdf/index.d.ts +23 -2
  12. package/dist/browser/modules/pdf/index.js +21 -3
  13. package/dist/browser/modules/pdf/reader/annotation-extractor.d.ts +63 -0
  14. package/dist/browser/modules/pdf/reader/annotation-extractor.js +155 -0
  15. package/dist/browser/modules/pdf/reader/cmap-parser.d.ts +70 -0
  16. package/dist/browser/modules/pdf/reader/cmap-parser.js +321 -0
  17. package/dist/browser/modules/pdf/reader/content-interpreter.d.ts +57 -0
  18. package/dist/browser/modules/pdf/reader/content-interpreter.js +715 -0
  19. package/dist/browser/modules/pdf/reader/font-decoder.d.ts +58 -0
  20. package/dist/browser/modules/pdf/reader/font-decoder.js +1513 -0
  21. package/dist/browser/modules/pdf/reader/form-extractor.d.ts +48 -0
  22. package/dist/browser/modules/pdf/reader/form-extractor.js +355 -0
  23. package/dist/browser/modules/pdf/reader/image-extractor.d.ts +55 -0
  24. package/dist/browser/modules/pdf/reader/image-extractor.js +220 -0
  25. package/dist/browser/modules/pdf/reader/metadata-reader.d.ts +56 -0
  26. package/dist/browser/modules/pdf/reader/metadata-reader.js +275 -0
  27. package/dist/browser/modules/pdf/reader/pdf-decrypt.d.ts +26 -0
  28. package/dist/browser/modules/pdf/reader/pdf-decrypt.js +443 -0
  29. package/dist/browser/modules/pdf/reader/pdf-document.d.ts +191 -0
  30. package/dist/browser/modules/pdf/reader/pdf-document.js +818 -0
  31. package/dist/browser/modules/pdf/reader/pdf-parser.d.ts +65 -0
  32. package/dist/browser/modules/pdf/reader/pdf-parser.js +285 -0
  33. package/dist/browser/modules/pdf/reader/pdf-reader.d.ts +143 -0
  34. package/dist/browser/modules/pdf/reader/pdf-reader.js +200 -0
  35. package/dist/browser/modules/pdf/reader/pdf-tokenizer.d.ts +101 -0
  36. package/dist/browser/modules/pdf/reader/pdf-tokenizer.js +543 -0
  37. package/dist/browser/modules/pdf/reader/reader-utils.d.ts +15 -0
  38. package/dist/browser/modules/pdf/reader/reader-utils.js +27 -0
  39. package/dist/browser/modules/pdf/reader/stream-filters.d.ts +20 -0
  40. package/dist/browser/modules/pdf/reader/stream-filters.js +456 -0
  41. package/dist/browser/modules/pdf/reader/text-reconstruction.d.ts +44 -0
  42. package/dist/browser/modules/pdf/reader/text-reconstruction.js +463 -0
  43. package/dist/cjs/modules/archive/zip/stream.js +53 -0
  44. package/dist/cjs/modules/pdf/core/crypto.js +649 -0
  45. package/dist/cjs/modules/pdf/core/encryption.js +88 -263
  46. package/dist/cjs/modules/pdf/core/pdf-writer.js +19 -10
  47. package/dist/cjs/modules/pdf/index.js +23 -4
  48. package/dist/cjs/modules/pdf/reader/annotation-extractor.js +158 -0
  49. package/dist/cjs/modules/pdf/reader/cmap-parser.js +326 -0
  50. package/dist/cjs/modules/pdf/reader/content-interpreter.js +718 -0
  51. package/dist/cjs/modules/pdf/reader/font-decoder.js +1518 -0
  52. package/dist/cjs/modules/pdf/reader/form-extractor.js +358 -0
  53. package/dist/cjs/modules/pdf/reader/image-extractor.js +223 -0
  54. package/dist/cjs/modules/pdf/reader/metadata-reader.js +278 -0
  55. package/dist/cjs/modules/pdf/reader/pdf-decrypt.js +447 -0
  56. package/dist/cjs/modules/pdf/reader/pdf-document.js +822 -0
  57. package/dist/cjs/modules/pdf/reader/pdf-parser.js +301 -0
  58. package/dist/cjs/modules/pdf/reader/pdf-reader.js +203 -0
  59. package/dist/cjs/modules/pdf/reader/pdf-tokenizer.js +517 -0
  60. package/dist/cjs/modules/pdf/reader/reader-utils.js +30 -0
  61. package/dist/cjs/modules/pdf/reader/stream-filters.js +459 -0
  62. package/dist/cjs/modules/pdf/reader/text-reconstruction.js +467 -0
  63. package/dist/esm/modules/archive/zip/stream.js +53 -0
  64. package/dist/esm/modules/pdf/core/crypto.js +637 -0
  65. package/dist/esm/modules/pdf/core/encryption.js +88 -261
  66. package/dist/esm/modules/pdf/core/pdf-writer.js +19 -10
  67. package/dist/esm/modules/pdf/index.js +21 -3
  68. package/dist/esm/modules/pdf/reader/annotation-extractor.js +155 -0
  69. package/dist/esm/modules/pdf/reader/cmap-parser.js +321 -0
  70. package/dist/esm/modules/pdf/reader/content-interpreter.js +715 -0
  71. package/dist/esm/modules/pdf/reader/font-decoder.js +1513 -0
  72. package/dist/esm/modules/pdf/reader/form-extractor.js +355 -0
  73. package/dist/esm/modules/pdf/reader/image-extractor.js +220 -0
  74. package/dist/esm/modules/pdf/reader/metadata-reader.js +275 -0
  75. package/dist/esm/modules/pdf/reader/pdf-decrypt.js +443 -0
  76. package/dist/esm/modules/pdf/reader/pdf-document.js +818 -0
  77. package/dist/esm/modules/pdf/reader/pdf-parser.js +285 -0
  78. package/dist/esm/modules/pdf/reader/pdf-reader.js +200 -0
  79. package/dist/esm/modules/pdf/reader/pdf-tokenizer.js +543 -0
  80. package/dist/esm/modules/pdf/reader/reader-utils.js +27 -0
  81. package/dist/esm/modules/pdf/reader/stream-filters.js +456 -0
  82. package/dist/esm/modules/pdf/reader/text-reconstruction.js +463 -0
  83. package/dist/iife/excelts.iife.js +703 -267
  84. package/dist/iife/excelts.iife.js.map +1 -1
  85. package/dist/iife/excelts.iife.min.js +35 -35
  86. package/dist/types/modules/archive/zip/stream.d.ts +4 -0
  87. package/dist/types/modules/pdf/core/crypto.d.ts +65 -0
  88. package/dist/types/modules/pdf/core/encryption.d.ts +23 -20
  89. package/dist/types/modules/pdf/core/pdf-writer.d.ts +6 -4
  90. package/dist/types/modules/pdf/index.d.ts +23 -2
  91. package/dist/types/modules/pdf/reader/annotation-extractor.d.ts +63 -0
  92. package/dist/types/modules/pdf/reader/cmap-parser.d.ts +70 -0
  93. package/dist/types/modules/pdf/reader/content-interpreter.d.ts +57 -0
  94. package/dist/types/modules/pdf/reader/font-decoder.d.ts +58 -0
  95. package/dist/types/modules/pdf/reader/form-extractor.d.ts +48 -0
  96. package/dist/types/modules/pdf/reader/image-extractor.d.ts +55 -0
  97. package/dist/types/modules/pdf/reader/metadata-reader.d.ts +56 -0
  98. package/dist/types/modules/pdf/reader/pdf-decrypt.d.ts +26 -0
  99. package/dist/types/modules/pdf/reader/pdf-document.d.ts +191 -0
  100. package/dist/types/modules/pdf/reader/pdf-parser.d.ts +65 -0
  101. package/dist/types/modules/pdf/reader/pdf-reader.d.ts +143 -0
  102. package/dist/types/modules/pdf/reader/pdf-tokenizer.d.ts +101 -0
  103. package/dist/types/modules/pdf/reader/reader-utils.d.ts +15 -0
  104. package/dist/types/modules/pdf/reader/stream-filters.d.ts +20 -0
  105. package/dist/types/modules/pdf/reader/text-reconstruction.d.ts +44 -0
  106. package/package.json +1 -1
@@ -0,0 +1,275 @@
1
+ /**
2
+ * PDF metadata reader.
3
+ *
4
+ * Extracts document metadata from:
5
+ * 1. Info Dictionary (traditional metadata)
6
+ * - Title, Author, Subject, Keywords, Creator, Producer
7
+ * - CreationDate, ModDate
8
+ *
9
+ * 2. XMP Metadata Stream (XML-based, more comprehensive)
10
+ * - All of the above plus:
11
+ * - Dublin Core metadata, custom properties
12
+ *
13
+ * @see PDF Reference 1.7, §10.2 - Metadata
14
+ * @see XMP Specification Part 1
15
+ */
16
+ import { dictGetRef, decodePdfStringBytes } from "./pdf-parser.js";
17
+ // =============================================================================
18
+ // Public API
19
+ // =============================================================================
20
/**
 * Build the metadata summary for a PDF document.
 *
 * The result starts from empty defaults, is filled from the Info dictionary,
 * then from the XMP metadata stream, and finally receives the page count and
 * the box of the first page.
 *
 * @param doc - Document object; this function reads `doc.trailer` and calls
 *   `doc.getPages()`, and hands `doc` to the helper extractors.
 * @returns The populated metadata object.
 */
export function extractMetadata(doc) {
    const result = {
        title: "",
        author: "",
        subject: "",
        keywords: "",
        creator: "",
        producer: "",
        creationDate: null,
        modDate: null,
        pdfVersion: extractPdfVersion(doc),
        pageCount: 0,
        encrypted: doc.trailer.has("Encrypt"),
        pageSize: null,
        xmpXml: null,
        custom: {}
    };
    // Info dictionary first, XMP stream second.
    extractInfoDict(doc, result);
    extractXmpMetadata(doc, result);
    try {
        const pageList = doc.getPages();
        result.pageCount = pageList.length;
        if (pageList.length > 0) {
            // Only the first page is inspected for the reported page size.
            result.pageSize = extractPageSize(pageList[0], doc);
        }
    }
    catch {
        // Best effort: a broken page tree leaves pageCount/pageSize at whatever
        // was assigned before the failure.
    }
    return result;
}
57
+ // =============================================================================
58
+ // PDF Version
59
+ // =============================================================================
60
/**
 * Determine the PDF version of a document.
 *
 * Two sources are consulted:
 *  - the file header (`%PDF-X.Y` at byte offset 0), and
 *  - the `Version` entry of the document catalog.
 *
 * Per ISO 32000-1 §7.5.2 the catalog entry overrides the header when it names
 * a later version (incrementally updated files keep their original header).
 *
 * @param doc - Document object exposing `data` (the raw file bytes) and
 *   `getCatalog()`.
 * @returns The version string; `"1.0"` when neither source yields one.
 */
function extractPdfVersion(doc) {
    // Split "major.minor" into numbers; null when the text has no such prefix.
    const parseVersion = (text) => {
        const match = /^(\d+)\.(\d+)/.exec(text);
        return match ? [Number(match[1]), Number(match[2])] : null;
    };
    const data = doc.data;
    let headerVersion = null;
    // "%PDF-" signature
    if (data[0] === 0x25 &&
        data[1] === 0x50 &&
        data[2] === 0x44 &&
        data[3] === 0x46 &&
        data[4] === 0x2d) {
        headerVersion = "";
        // Read at most 10 bytes after the signature, stopping at LF, CR or space.
        const limit = Math.min(data.length, 15);
        for (let i = 5; i < limit; i++) {
            const b = data[i];
            if (b === 0x0a || b === 0x0d || b === 0x20) {
                break;
            }
            headerVersion += String.fromCharCode(b);
        }
    }
    let catalogVersion = null;
    try {
        const value = doc.getCatalog().get("Version");
        // NOTE(review): relies on name objects surfacing as plain strings — confirm against the parser.
        if (typeof value === "string") {
            catalogVersion = value;
        }
    }
    catch {
        // Best effort: an unreadable catalog simply contributes no version.
    }
    if (headerVersion === null) {
        return catalogVersion ?? "1.0";
    }
    if (catalogVersion !== null) {
        const fromHeader = parseVersion(headerVersion);
        const fromCatalog = parseVersion(catalogVersion);
        // Only override when both parse and the catalog version is strictly later.
        if (fromHeader &&
            fromCatalog &&
            (fromCatalog[0] > fromHeader[0] ||
                (fromCatalog[0] === fromHeader[0] && fromCatalog[1] > fromHeader[1]))) {
            return catalogVersion;
        }
    }
    return headerVersion;
}
91
+ // =============================================================================
92
+ // Info Dictionary
93
+ // =============================================================================
94
/**
 * Copy entries of the trailer's Info dictionary into `metadata`.
 *
 * Standard text entries are stored as strings, `CreationDate`/`ModDate` are
 * run through `parsePdfDate`, and every other entry lands in
 * `metadata.custom` under its original key.
 *
 * @param doc - Document object; `doc.trailer` and `doc.derefDict()` are used.
 * @param metadata - Metadata object, mutated in place.
 */
function extractInfoDict(doc, metadata) {
    const ref = dictGetRef(doc.trailer, "Info");
    const info = ref ? doc.derefDict(ref) : null;
    if (!info) {
        return;
    }
    // Info key -> metadata property holding the decoded text.
    const textTargets = new Map([
        ["Title", "title"],
        ["Author", "author"],
        ["Subject", "subject"],
        ["Keywords", "keywords"],
        ["Creator", "creator"],
        ["Producer", "producer"]
    ]);
    // Info key -> metadata property holding a parsed date.
    const dateTargets = new Map([
        ["CreationDate", "creationDate"],
        ["ModDate", "modDate"]
    ]);
    for (const [name, raw] of info) {
        // Byte strings are decoded; anything else is stringified (nullish -> "").
        const text = raw instanceof Uint8Array ? decodePdfStringBytes(raw) : String(raw ?? "");
        if (textTargets.has(name)) {
            metadata[textTargets.get(name)] = text;
        }
        else if (dateTargets.has(name)) {
            metadata[dateTargets.get(name)] = parsePdfDate(text);
        }
        else {
            metadata.custom[name] = text;
        }
    }
}
148
+ // =============================================================================
149
+ // XMP Metadata
150
+ // =============================================================================
151
/**
 * Read the catalog's XMP metadata stream and use it to fill gaps in `metadata`.
 *
 * The decoded XML is always stored in `metadata.xmpXml`. Individual fields are
 * only taken from XMP when the corresponding property is still empty/null, so
 * values already present on `metadata` are never overwritten.
 *
 * Dates that cannot be parsed are skipped, leaving the property `null`
 * instead of storing an Invalid Date object.
 *
 * Any error raised while locating, decoding or scanning the stream is
 * swallowed: XMP is treated as optional, best-effort input.
 *
 * @param doc - Document object; `getCatalog()`, `derefStreamWithObjNum()` and
 *   `getStreamData()` are used.
 * @param metadata - Metadata object, mutated in place.
 */
function extractXmpMetadata(doc, metadata) {
    try {
        const metadataRef = doc.getCatalog().get("Metadata");
        if (!metadataRef) {
            return;
        }
        const result = doc.derefStreamWithObjNum(metadataRef);
        if (!result) {
            return;
        }
        const data = doc.getStreamData(result.stream, result.objNum, result.gen);
        const xml = new TextDecoder("utf-8").decode(data);
        metadata.xmpXml = xml;
        // metadata property -> XMP element supplying its fallback text.
        const textFallbacks = [
            ["title", "dc:title"],
            ["author", "dc:creator"],
            ["subject", "dc:description"],
            ["keywords", "pdf:Keywords"],
            ["creator", "xmp:CreatorTool"],
            ["producer", "pdf:Producer"]
        ];
        for (const [property, field] of textFallbacks) {
            if (!metadata[property]) {
                metadata[property] = extractXmpField(xml, field) ?? "";
            }
        }
        // metadata property -> XMP element supplying its fallback date.
        const dateFallbacks = [
            ["creationDate", "xmp:CreateDate"],
            ["modDate", "xmp:ModifyDate"]
        ];
        for (const [property, field] of dateFallbacks) {
            if (metadata[property]) {
                continue;
            }
            const dateStr = extractXmpField(xml, field);
            if (!dateStr) {
                continue;
            }
            const parsed = new Date(dateStr);
            // An unparseable string yields an Invalid Date; keep null in that case.
            if (!Number.isNaN(parsed.getTime())) {
                metadata[property] = parsed;
            }
        }
    }
    catch {
        // Ignore XMP errors
    }
}
201
/**
 * Extract a field value from XMP XML using regular expressions.
 *
 * Two element shapes are recognised, in this order:
 *  1. Simple element: `<field>value</field>`.
 *  2. Container: `<field>…<rdf:li>value</rdf:li>…</field>` (rdf:Alt/Bag/Seq);
 *     the first non-empty `rdf:li` inside the element is used.
 *
 * The container search is confined to the text between the field's own
 * opening and closing tags, so an empty or self-closing element never picks
 * up an `rdf:li` that belongs to a following field. Tag names are matched
 * exactly (`dc:title` does not match `dc:titleX`) and case-insensitively.
 *
 * @param xml - XMP packet as text.
 * @param field - Qualified element name, e.g. `"dc:title"`.
 * @returns The trimmed, entity-decoded value, or `null` when not found.
 */
function extractXmpField(xml, field) {
    // The name is interpolated into a pattern; escape regex metacharacters.
    const name = field.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
    // Opening tag with optional attributes; the final [^>/] rejects "<field/>" and "<field />".
    const openTag = `<${name}(?:\\s(?:[^>]*[^>/])?)?>`;
    const closeTag = `</${name}\\s*>`;
    // Try simple element: <field>value</field>
    const simpleMatch = new RegExp(`${openTag}([^<]+)${closeTag}`, "i").exec(xml);
    if (simpleMatch) {
        return decodeXmlEntities(simpleMatch[1].trim());
    }
    // Try container: look for <rdf:li> only inside each <field>…</field> span.
    const elementRegex = new RegExp(`${openTag}([\\s\\S]*?)${closeTag}`, "gi");
    const itemRegex = /<rdf:li(?:\s(?:[^>]*[^>\/])?)?>([^<]+)<\/rdf:li\s*>/i;
    for (const element of xml.matchAll(elementRegex)) {
        const item = itemRegex.exec(element[1]);
        if (item) {
            return decodeXmlEntities(item[1].trim());
        }
    }
    return null;
}
220
/**
 * Decode XML character and entity references in a text value.
 *
 * Handles the five predefined entities (`&amp;`, `&lt;`, `&gt;`, `&quot;`,
 * `&apos;`) and numeric references (`&#65;`, `&#x41;`). Decoding is done in a
 * single pass so already-decoded output is never decoded a second time
 * (`&amp;lt;` becomes `&lt;`, not `<`). Unknown entities and numeric
 * references outside the valid code point range are left untouched.
 *
 * @param text - Raw element text.
 * @returns The decoded text.
 */
function decodeXmlEntities(text) {
    const named = { amp: "&", lt: "<", gt: ">", quot: '"', apos: "'" };
    return text.replace(/&(#x[0-9a-fA-F]+|#[0-9]+|amp|lt|gt|quot|apos);/g, (whole, body) => {
        if (body[0] !== "#") {
            return named[body];
        }
        const codePoint = body[1] === "x"
            ? Number.parseInt(body.slice(2), 16)
            : Number.parseInt(body.slice(1), 10);
        const isSurrogate = codePoint >= 0xd800 && codePoint <= 0xdfff;
        // Keep the reference verbatim when it does not name a usable code point.
        if (codePoint === 0 || codePoint > 0x10ffff || isSurrogate) {
            return whole;
        }
        return String.fromCodePoint(codePoint);
    });
}
228
+ // =============================================================================
229
+ // Page Size
230
+ // =============================================================================
231
/**
 * Resolve the size box of a page by delegating to the document.
 *
 * @param pageDict - Page dictionary to resolve.
 * @param doc - Document object providing `resolvePageBox()`.
 * @returns Whatever `doc.resolvePageBox(pageDict)` returns.
 */
function extractPageSize(pageDict, doc) {
    const box = doc.resolvePageBox(pageDict);
    return box;
}
234
+ // =============================================================================
235
+ // PDF Date Parsing
236
+ // =============================================================================
237
/**
 * Parse a PDF date string to a Date object.
 *
 * Format: `D:YYYYMMDDHHmmSSOHH'mm'` where the `D:` prefix and everything after
 * the year are optional. Missing month/day default to 1, missing time fields
 * to 0. `O` is `+`, `-` or `Z`; a `+`/`-` offset is subtracted from the given
 * local time to obtain UTC.
 *
 * The timezone minutes are accepted both with the standard apostrophe
 * (`+05'30'`) and without it (`+0530`). Years are taken literally, so years
 * 0–99 are not remapped to 1900–1999.
 *
 * @param dateStr - Date string from a PDF; may be empty.
 * @returns The parsed Date, or `null` when the input is empty, has no numeric
 *   year, or produces an invalid date.
 */
function parsePdfDate(dateStr) {
    if (!dateStr) {
        return null;
    }
    // Remove leading "D:" if present
    const s = dateStr.startsWith("D:") ? dateStr.substring(2) : dateStr;
    // Read a fixed-position numeric field; NaN and 0 both fall back.
    const readField = (start, end, fallback) => Number.parseInt(s.substring(start, end), 10) || fallback;
    const year = Number.parseInt(s.substring(0, 4), 10);
    if (Number.isNaN(year)) {
        return null;
    }
    const month = readField(4, 6, 1);
    const day = readField(6, 8, 1);
    const hour = readField(8, 10, 0);
    const minute = readField(10, 12, 0);
    const second = readField(12, 14, 0);
    // Parse timezone
    const tzChar = s.charAt(14);
    let offsetMinutes = 0;
    if (tzChar === "+" || tzChar === "-") {
        const tzHour = readField(15, 17, 0);
        // Minutes follow the hour directly ("+0530") or after one separator ("+05'30'").
        const minuteStart = /\d/.test(s.charAt(17)) ? 17 : 18;
        const tzMin = readField(minuteStart, minuteStart + 2, 0);
        offsetMinutes = (tzHour * 60 + tzMin) * (tzChar === "-" ? -1 : 1);
    }
    // Build in UTC via setters: unlike Date.UTC they do not map years 0-99 to 19xx.
    const date = new Date(0);
    date.setUTCFullYear(year, month - 1, day);
    // Subtracting the offset converts the stated local time to UTC.
    date.setUTCHours(hour, minute - offsetMinutes, second, 0);
    return Number.isNaN(date.getTime()) ? null : date;
}