@cj-tech-master/excelts 8.0.0 → 8.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +14 -1
- package/README_zh.md +6 -0
- package/dist/browser/modules/archive/zip/stream.d.ts +4 -0
- package/dist/browser/modules/archive/zip/stream.js +53 -0
- package/dist/browser/modules/pdf/core/crypto.d.ts +65 -0
- package/dist/browser/modules/pdf/core/crypto.js +637 -0
- package/dist/browser/modules/pdf/core/encryption.d.ts +23 -20
- package/dist/browser/modules/pdf/core/encryption.js +88 -261
- package/dist/browser/modules/pdf/core/pdf-writer.d.ts +6 -4
- package/dist/browser/modules/pdf/core/pdf-writer.js +19 -10
- package/dist/browser/modules/pdf/index.d.ts +23 -2
- package/dist/browser/modules/pdf/index.js +21 -3
- package/dist/browser/modules/pdf/reader/annotation-extractor.d.ts +63 -0
- package/dist/browser/modules/pdf/reader/annotation-extractor.js +155 -0
- package/dist/browser/modules/pdf/reader/cmap-parser.d.ts +70 -0
- package/dist/browser/modules/pdf/reader/cmap-parser.js +321 -0
- package/dist/browser/modules/pdf/reader/content-interpreter.d.ts +57 -0
- package/dist/browser/modules/pdf/reader/content-interpreter.js +715 -0
- package/dist/browser/modules/pdf/reader/font-decoder.d.ts +58 -0
- package/dist/browser/modules/pdf/reader/font-decoder.js +1513 -0
- package/dist/browser/modules/pdf/reader/form-extractor.d.ts +48 -0
- package/dist/browser/modules/pdf/reader/form-extractor.js +355 -0
- package/dist/browser/modules/pdf/reader/image-extractor.d.ts +55 -0
- package/dist/browser/modules/pdf/reader/image-extractor.js +220 -0
- package/dist/browser/modules/pdf/reader/metadata-reader.d.ts +56 -0
- package/dist/browser/modules/pdf/reader/metadata-reader.js +275 -0
- package/dist/browser/modules/pdf/reader/pdf-decrypt.d.ts +26 -0
- package/dist/browser/modules/pdf/reader/pdf-decrypt.js +443 -0
- package/dist/browser/modules/pdf/reader/pdf-document.d.ts +191 -0
- package/dist/browser/modules/pdf/reader/pdf-document.js +818 -0
- package/dist/browser/modules/pdf/reader/pdf-parser.d.ts +65 -0
- package/dist/browser/modules/pdf/reader/pdf-parser.js +285 -0
- package/dist/browser/modules/pdf/reader/pdf-reader.d.ts +143 -0
- package/dist/browser/modules/pdf/reader/pdf-reader.js +200 -0
- package/dist/browser/modules/pdf/reader/pdf-tokenizer.d.ts +101 -0
- package/dist/browser/modules/pdf/reader/pdf-tokenizer.js +543 -0
- package/dist/browser/modules/pdf/reader/reader-utils.d.ts +15 -0
- package/dist/browser/modules/pdf/reader/reader-utils.js +27 -0
- package/dist/browser/modules/pdf/reader/stream-filters.d.ts +20 -0
- package/dist/browser/modules/pdf/reader/stream-filters.js +456 -0
- package/dist/browser/modules/pdf/reader/text-reconstruction.d.ts +44 -0
- package/dist/browser/modules/pdf/reader/text-reconstruction.js +463 -0
- package/dist/cjs/modules/archive/zip/stream.js +53 -0
- package/dist/cjs/modules/pdf/core/crypto.js +649 -0
- package/dist/cjs/modules/pdf/core/encryption.js +88 -263
- package/dist/cjs/modules/pdf/core/pdf-writer.js +19 -10
- package/dist/cjs/modules/pdf/index.js +23 -4
- package/dist/cjs/modules/pdf/reader/annotation-extractor.js +158 -0
- package/dist/cjs/modules/pdf/reader/cmap-parser.js +326 -0
- package/dist/cjs/modules/pdf/reader/content-interpreter.js +718 -0
- package/dist/cjs/modules/pdf/reader/font-decoder.js +1518 -0
- package/dist/cjs/modules/pdf/reader/form-extractor.js +358 -0
- package/dist/cjs/modules/pdf/reader/image-extractor.js +223 -0
- package/dist/cjs/modules/pdf/reader/metadata-reader.js +278 -0
- package/dist/cjs/modules/pdf/reader/pdf-decrypt.js +447 -0
- package/dist/cjs/modules/pdf/reader/pdf-document.js +822 -0
- package/dist/cjs/modules/pdf/reader/pdf-parser.js +301 -0
- package/dist/cjs/modules/pdf/reader/pdf-reader.js +203 -0
- package/dist/cjs/modules/pdf/reader/pdf-tokenizer.js +517 -0
- package/dist/cjs/modules/pdf/reader/reader-utils.js +30 -0
- package/dist/cjs/modules/pdf/reader/stream-filters.js +459 -0
- package/dist/cjs/modules/pdf/reader/text-reconstruction.js +467 -0
- package/dist/esm/modules/archive/zip/stream.js +53 -0
- package/dist/esm/modules/pdf/core/crypto.js +637 -0
- package/dist/esm/modules/pdf/core/encryption.js +88 -261
- package/dist/esm/modules/pdf/core/pdf-writer.js +19 -10
- package/dist/esm/modules/pdf/index.js +21 -3
- package/dist/esm/modules/pdf/reader/annotation-extractor.js +155 -0
- package/dist/esm/modules/pdf/reader/cmap-parser.js +321 -0
- package/dist/esm/modules/pdf/reader/content-interpreter.js +715 -0
- package/dist/esm/modules/pdf/reader/font-decoder.js +1513 -0
- package/dist/esm/modules/pdf/reader/form-extractor.js +355 -0
- package/dist/esm/modules/pdf/reader/image-extractor.js +220 -0
- package/dist/esm/modules/pdf/reader/metadata-reader.js +275 -0
- package/dist/esm/modules/pdf/reader/pdf-decrypt.js +443 -0
- package/dist/esm/modules/pdf/reader/pdf-document.js +818 -0
- package/dist/esm/modules/pdf/reader/pdf-parser.js +285 -0
- package/dist/esm/modules/pdf/reader/pdf-reader.js +200 -0
- package/dist/esm/modules/pdf/reader/pdf-tokenizer.js +543 -0
- package/dist/esm/modules/pdf/reader/reader-utils.js +27 -0
- package/dist/esm/modules/pdf/reader/stream-filters.js +456 -0
- package/dist/esm/modules/pdf/reader/text-reconstruction.js +463 -0
- package/dist/iife/excelts.iife.js +703 -267
- package/dist/iife/excelts.iife.js.map +1 -1
- package/dist/iife/excelts.iife.min.js +35 -35
- package/dist/types/modules/archive/zip/stream.d.ts +4 -0
- package/dist/types/modules/pdf/core/crypto.d.ts +65 -0
- package/dist/types/modules/pdf/core/encryption.d.ts +23 -20
- package/dist/types/modules/pdf/core/pdf-writer.d.ts +6 -4
- package/dist/types/modules/pdf/index.d.ts +23 -2
- package/dist/types/modules/pdf/reader/annotation-extractor.d.ts +63 -0
- package/dist/types/modules/pdf/reader/cmap-parser.d.ts +70 -0
- package/dist/types/modules/pdf/reader/content-interpreter.d.ts +57 -0
- package/dist/types/modules/pdf/reader/font-decoder.d.ts +58 -0
- package/dist/types/modules/pdf/reader/form-extractor.d.ts +48 -0
- package/dist/types/modules/pdf/reader/image-extractor.d.ts +55 -0
- package/dist/types/modules/pdf/reader/metadata-reader.d.ts +56 -0
- package/dist/types/modules/pdf/reader/pdf-decrypt.d.ts +26 -0
- package/dist/types/modules/pdf/reader/pdf-document.d.ts +191 -0
- package/dist/types/modules/pdf/reader/pdf-parser.d.ts +65 -0
- package/dist/types/modules/pdf/reader/pdf-reader.d.ts +143 -0
- package/dist/types/modules/pdf/reader/pdf-tokenizer.d.ts +101 -0
- package/dist/types/modules/pdf/reader/reader-utils.d.ts +15 -0
- package/dist/types/modules/pdf/reader/stream-filters.d.ts +20 -0
- package/dist/types/modules/pdf/reader/text-reconstruction.d.ts +44 -0
- package/package.json +1 -1
|
@@ -0,0 +1,278 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* PDF metadata reader.
|
|
4
|
+
*
|
|
5
|
+
* Extracts document metadata from:
|
|
6
|
+
* 1. Info Dictionary (traditional metadata)
|
|
7
|
+
* - Title, Author, Subject, Keywords, Creator, Producer
|
|
8
|
+
* - CreationDate, ModDate
|
|
9
|
+
*
|
|
10
|
+
* 2. XMP Metadata Stream (XML-based, more comprehensive)
|
|
11
|
+
* - All of the above plus:
|
|
12
|
+
* - Dublin Core metadata, custom properties
|
|
13
|
+
*
|
|
14
|
+
* @see PDF Reference 1.7, §10.2 - Metadata
|
|
15
|
+
* @see XMP Specification Part 1
|
|
16
|
+
*/
|
|
17
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
18
|
+
exports.extractMetadata = extractMetadata;
|
|
19
|
+
const pdf_parser_1 = require("./pdf-parser");
|
|
20
|
+
// =============================================================================
|
|
21
|
+
// Public API
|
|
22
|
+
// =============================================================================
|
|
23
|
+
/**
 * Build the metadata summary for a parsed PDF document.
 *
 * Starts from empty defaults, lets the Info dictionary fill in what it can,
 * lets the XMP stream supply anything still missing, and finally records the
 * page count together with the box of the first page.
 *
 * @param {object} doc - Parsed document. This function reads `doc.trailer`
 *   and calls `doc.getPages()`; the helpers it delegates to use further
 *   members of `doc`.
 * @returns {object} The populated metadata record.
 */
function extractMetadata(doc) {
    const pdfVersion = extractPdfVersion(doc);
    const encrypted = doc.trailer.has("Encrypt");
    const result = {
        title: "",
        author: "",
        subject: "",
        keywords: "",
        creator: "",
        producer: "",
        creationDate: null,
        modDate: null,
        pdfVersion,
        pageCount: 0,
        encrypted,
        pageSize: null,
        xmpXml: null,
        custom: {}
    };
    // The Info dictionary runs first; the XMP pass only fills fields it left empty.
    extractInfoDict(doc, result);
    extractXmpMetadata(doc, result);
    try {
        const pageList = doc.getPages();
        const total = pageList.length;
        result.pageCount = total;
        if (total > 0) {
            result.pageSize = extractPageSize(pageList[0], doc);
        }
    }
    catch {
        // Best effort: a broken page tree must not discard the metadata gathered so far.
    }
    return result;
}
|
|
60
|
+
// =============================================================================
|
|
61
|
+
// PDF Version
|
|
62
|
+
// =============================================================================
|
|
63
|
+
/**
 * Determine the PDF version of a document.
 *
 * Two sources are consulted: the `%PDF-X.Y` header at the start of the file
 * and the catalog's `Version` entry. When both are present the later of the
 * two wins, because an incremental update may raise the version through the
 * catalog without rewriting the header. When only one is usable it is
 * returned; when neither is, `"1.0"` is returned.
 *
 * @param {object} doc - Parsed document; `doc.data` is read as the raw file
 *   bytes and `doc.getCatalog()` is called for the catalog dictionary.
 * @returns {string} The version string, e.g. `"1.7"`.
 */
function extractPdfVersion(doc) {
    const headerVersion = readPdfHeaderVersion(doc.data);
    let catalogVersion = "";
    try {
        const candidate = doc.getCatalog().get("Version");
        // NOTE(review): assumes the parser hands name objects back as strings — confirm.
        if (typeof candidate === "string") {
            catalogVersion = candidate;
        }
    }
    catch {
        // A broken catalog must not prevent reporting the header version.
    }
    if (headerVersion && catalogVersion) {
        return comparePdfVersions(catalogVersion, headerVersion) > 0 ? catalogVersion : headerVersion;
    }
    return headerVersion || catalogVersion || "1.0";
}
/** Read the version text that follows a leading "%PDF-" marker; "" when the marker is absent. */
function readPdfHeaderVersion(data) {
    const magic = [0x25, 0x50, 0x44, 0x46, 0x2d]; // "%PDF-"
    if (!data || !magic.every((byte, index) => data[index] === byte)) {
        return "";
    }
    let version = "";
    // The version is short; cap the scan so a header without a line break cannot run on.
    const end = Math.min(data.length, 15);
    for (let i = magic.length; i < end; i++) {
        const b = data[i];
        if (b === 0x0a || b === 0x0d || b === 0x20) {
            break;
        }
        version += String.fromCharCode(b);
    }
    return version;
}
/** Compare two "major.minor" version strings numerically; an unparsable one sorts lowest. */
function comparePdfVersions(a, b) {
    const parse = (text) => {
        const match = /(\d+)\.(\d+)/.exec(text);
        return match ? [Number(match[1]), Number(match[2])] : null;
    };
    const left = parse(a);
    const right = parse(b);
    if (!left || !right) {
        return left ? 1 : right ? -1 : 0;
    }
    return left[0] - right[0] || left[1] - right[1];
}
|
|
94
|
+
// =============================================================================
|
|
95
|
+
// Info Dictionary
|
|
96
|
+
// =============================================================================
|
|
97
|
+
/**
 * Copy entries of the trailer's Info dictionary into `metadata`.
 *
 * The six standard text entries map onto same-named lowercase fields, the two
 * date entries are run through `parsePdfDate`, and every other key is stored
 * under `metadata.custom`. Does nothing when the trailer has no Info
 * reference or the reference does not resolve to a dictionary.
 *
 * @param {object} doc - Parsed document (`doc.trailer`, `doc.derefDict`).
 * @param {object} metadata - Record to mutate in place.
 * @returns {void}
 */
function extractInfoDict(doc, metadata) {
    const infoRef = (0, pdf_parser_1.dictGetRef)(doc.trailer, "Info");
    const infoDict = infoRef ? doc.derefDict(infoRef) : null;
    if (!infoDict) {
        return;
    }
    // Info key -> metadata property, for plain text entries.
    const textTargets = new Map([
        ["Title", "title"],
        ["Author", "author"],
        ["Subject", "subject"],
        ["Keywords", "keywords"],
        ["Creator", "creator"],
        ["Producer", "producer"]
    ]);
    // Info key -> metadata property, for entries holding a PDF date string.
    const dateTargets = new Map([
        ["CreationDate", "creationDate"],
        ["ModDate", "modDate"]
    ]);
    for (const entry of infoDict) {
        const [key, value] = entry;
        // Byte strings go through the PDF string decoder; anything else is stringified.
        const text = value instanceof Uint8Array
            ? (0, pdf_parser_1.decodePdfStringBytes)(value)
            : String(value ?? "");
        if (textTargets.has(key)) {
            metadata[textTargets.get(key)] = text;
        }
        else if (dateTargets.has(key)) {
            metadata[dateTargets.get(key)] = parsePdfDate(text);
        }
        else {
            metadata.custom[key] = text;
        }
    }
}
|
|
151
|
+
// =============================================================================
|
|
152
|
+
// XMP Metadata
|
|
153
|
+
// =============================================================================
|
|
154
|
+
/**
 * Fill gaps in `metadata` from the catalog's XMP metadata stream.
 *
 * The decoded XML is always stored in `metadata.xmpXml`. Each text or date
 * field is only taken from XMP when it is still empty, so values already set
 * (from the Info dictionary) are kept. Date values that JavaScript cannot
 * parse are ignored, so the date fields hold either `null` or a valid `Date`
 * and never an "Invalid Date" object.
 *
 * Any error (missing catalog, undecodable stream, ...) is swallowed on
 * purpose: XMP is an optional, best-effort source.
 *
 * @param {object} doc - Parsed document (`getCatalog`,
 *   `derefStreamWithObjNum`, `getStreamData`).
 * @param {object} metadata - Record to mutate in place.
 * @returns {void}
 */
function extractXmpMetadata(doc, metadata) {
    try {
        const metadataRef = doc.getCatalog().get("Metadata");
        if (!metadataRef) {
            return;
        }
        const result = doc.derefStreamWithObjNum(metadataRef);
        if (!result) {
            return;
        }
        const data = doc.getStreamData(result.stream, result.objNum, result.gen);
        const xml = new TextDecoder("utf-8").decode(data);
        metadata.xmpXml = xml;
        // metadata property -> XMP property used as fallback.
        const textFallbacks = [
            ["title", "dc:title"],
            ["author", "dc:creator"],
            ["subject", "dc:description"],
            ["keywords", "pdf:Keywords"],
            ["creator", "xmp:CreatorTool"],
            ["producer", "pdf:Producer"]
        ];
        for (const [prop, field] of textFallbacks) {
            if (!metadata[prop]) {
                metadata[prop] = extractXmpField(xml, field) ?? "";
            }
        }
        const dateFallbacks = [
            ["creationDate", "xmp:CreateDate"],
            ["modDate", "xmp:ModifyDate"]
        ];
        for (const [prop, field] of dateFallbacks) {
            if (metadata[prop]) {
                continue;
            }
            const dateStr = extractXmpField(xml, field);
            if (!dateStr) {
                continue;
            }
            const parsed = new Date(dateStr);
            // Reject unparsable values instead of exposing an Invalid Date to callers.
            if (!Number.isNaN(parsed.getTime())) {
                metadata[prop] = parsed;
            }
        }
    }
    catch {
        // Ignore XMP errors: the stream is optional and must not break extraction.
    }
}
|
|
204
|
+
/**
 * Extract a property value from XMP XML using regular expressions.
 *
 * Three serializations are recognised, tried in this order:
 *   1. element with text content:  <field>value</field>
 *   2. element wrapping an rdf:Alt / rdf:Bag / rdf:Seq container, in which
 *      case the first non-empty <rdf:li> inside that element is used
 *   3. property attribute:         <rdf:Description field="value" ...>
 *
 * The container search is confined to the element's own content, so an empty
 * property never picks up an <rdf:li> that belongs to a later property.
 *
 * @param {string} xml - The XMP packet text.
 * @param {string} field - Qualified property name, e.g. "dc:title".
 * @returns {string|null} Trimmed, entity-decoded value, or null when the
 *   property is not found in any supported form.
 */
function extractXmpField(xml, field) {
    // The name is spliced into patterns below; escape it so it always matches literally.
    const name = field.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
    // `(?:\s[^>]*)?` keeps "dc:title" from matching "dc:titles"; `(?<!/)` skips self-closing tags.
    const elementRegex = new RegExp(`<${name}(?:\\s[^>]*)?(?<!/)>([\\s\\S]*?)</${name}\\s*>`, "i");
    const elementMatch = elementRegex.exec(xml);
    if (elementMatch) {
        const inner = elementMatch[1];
        if (inner.length > 0 && !inner.includes("<")) {
            return decodeXmlEntities(inner.trim());
        }
        const itemMatch = /<rdf:li(?:\s[^>]*)?>([^<]+)<\/rdf:li\s*>/i.exec(inner);
        if (itemMatch) {
            return decodeXmlEntities(itemMatch[1].trim());
        }
    }
    // Compact RDF form: simple properties written as attributes of rdf:Description.
    const attributeRegex = new RegExp(`\\s${name}\\s*=\\s*(?:"([^"]*)"|'([^']*)')`, "i");
    const attributeMatch = attributeRegex.exec(xml);
    if (attributeMatch) {
        return decodeXmlEntities((attributeMatch[1] ?? attributeMatch[2]).trim());
    }
    return null;
}
|
|
223
|
+
/**
 * Decode XML character data: the five predefined entities (&amp;amp; &amp;lt;
 * &amp;gt; &amp;quot; &amp;apos;) and decimal / hexadecimal character
 * references (&amp;#65; / &amp;#x41;).
 *
 * Decoding happens in a single pass, so text produced by one replacement is
 * never decoded a second time ("&amp;amp;lt;" becomes "&amp;lt;", not "<").
 * Unknown entities and out-of-range character references are left untouched.
 *
 * @param {string} text - Raw text taken from an XML document.
 * @returns {string} The text with references replaced by their characters.
 */
function decodeXmlEntities(text) {
    const named = { amp: "&", lt: "<", gt: ">", quot: '"', apos: "'" };
    return text.replace(/&(?:(amp|lt|gt|quot|apos)|#(\d+)|#x([0-9a-fA-F]+));/g, (match, name, dec, hex) => {
        if (name !== undefined) {
            return named[name];
        }
        const codePoint = dec !== undefined ? Number.parseInt(dec, 10) : Number.parseInt(hex, 16);
        // String.fromCodePoint throws beyond U+10FFFF; keep such references verbatim.
        if (!(codePoint <= 0x10ffff)) {
            return match;
        }
        return String.fromCodePoint(codePoint);
    });
}
|
|
231
|
+
// =============================================================================
|
|
232
|
+
// Page Size
|
|
233
|
+
// =============================================================================
|
|
234
|
+
/**
 * Look up the size box of a page by delegating to the document.
 *
 * @param {object} pageDict - Page dictionary to measure.
 * @param {object} doc - Document whose `resolvePageBox` method is called.
 * @returns {*} Whatever `doc.resolvePageBox(pageDict)` returns.
 */
function extractPageSize(pageDict, doc) {
    const box = doc.resolvePageBox(pageDict);
    return box;
}
|
|
237
|
+
// =============================================================================
|
|
238
|
+
// PDF Date Parsing
|
|
239
|
+
// =============================================================================
|
|
240
|
+
/**
 * Parse a PDF date string to a Date object.
 * Format: D:YYYYMMDDHHmmSSOHH'mm'
 *
 * Only the year is required; a missing (or zero) month or day falls back to
 * 1 and a missing time component to 0. `O` is "+", "-" or "Z". The offset
 * minutes are accepted with the standard apostrophe separator ("+05'30'"),
 * with a colon ("+05:30") or with no separator at all ("+0530"), since
 * producers are not consistent about it.
 *
 * @param {string} dateStr - PDF date string, with or without the "D:" prefix.
 * @returns {Date|null} The instant in UTC, or null when the input is empty,
 *   not a string, has no numeric year, or yields an invalid date.
 */
function parsePdfDate(dateStr) {
    if (typeof dateStr !== "string" || dateStr === "") {
        return null;
    }
    // Remove leading "D:" if present
    const s = dateStr.startsWith("D:") ? dateStr.substring(2) : dateStr;
    const year = Number.parseInt(s.substring(0, 4), 10);
    if (Number.isNaN(year)) {
        return null;
    }
    // Reads a fixed-width numeric field; absent, non-numeric or zero fields use the fallback.
    const readField = (start, end, fallback) => Number.parseInt(s.substring(start, end), 10) || fallback;
    const month = readField(4, 6, 1);
    const day = readField(6, 8, 1);
    const hour = readField(8, 10, 0);
    const minute = readField(10, 12, 0);
    const second = readField(12, 14, 0);
    // Parse timezone: a sign, two hour digits, an optional separator, two minute digits.
    const tzChar = s.charAt(14);
    let offsetMinutes = 0;
    if (tzChar === "+" || tzChar === "-") {
        const tzHour = readField(15, 17, 0);
        const separator = s.charAt(17);
        const minuteStart = separator === "'" || separator === ":" ? 18 : 17;
        const tzMin = readField(minuteStart, minuteStart + 2, 0);
        offsetMinutes = (tzHour * 60 + tzMin) * (tzChar === "-" ? -1 : 1);
    }
    // The fields are local to the stated offset; subtracting the offset gives UTC.
    const utcMillis = Date.UTC(year, month - 1, day, hour, minute, second) - offsetMinutes * 60000;
    const date = new Date(utcMillis);
    return Number.isNaN(date.getTime()) ? null : date;
}
|