@shuji-bonji/pdf-reader-mcp 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +44 -0
- package/LICENSE +21 -0
- package/README.ja.md +190 -0
- package/README.md +206 -0
- package/dist/constants.d.ts +22 -0
- package/dist/constants.d.ts.map +1 -0
- package/dist/constants.js +23 -0
- package/dist/constants.js.map +1 -0
- package/dist/index.d.ts +9 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +35 -0
- package/dist/index.js.map +1 -0
- package/dist/schemas/common.d.ts +14 -0
- package/dist/schemas/common.d.ts.map +1 -0
- package/dist/schemas/common.js +26 -0
- package/dist/schemas/common.js.map +1 -0
- package/dist/schemas/tier1.d.ts +104 -0
- package/dist/schemas/tier1.d.ts.map +1 -0
- package/dist/schemas/tier1.js +77 -0
- package/dist/schemas/tier1.js.map +1 -0
- package/dist/schemas/tier2.d.ts +68 -0
- package/dist/schemas/tier2.d.ts.map +1 -0
- package/dist/schemas/tier2.js +42 -0
- package/dist/schemas/tier2.js.map +1 -0
- package/dist/schemas/tier3.d.ts +44 -0
- package/dist/schemas/tier3.d.ts.map +1 -0
- package/dist/schemas/tier3.js +28 -0
- package/dist/schemas/tier3.js.map +1 -0
- package/dist/services/pdfjs-service.d.ts +65 -0
- package/dist/services/pdfjs-service.d.ts.map +1 -0
- package/dist/services/pdfjs-service.js +520 -0
- package/dist/services/pdfjs-service.js.map +1 -0
- package/dist/services/pdflib-service.d.ts +35 -0
- package/dist/services/pdflib-service.d.ts.map +1 -0
- package/dist/services/pdflib-service.js +318 -0
- package/dist/services/pdflib-service.js.map +1 -0
- package/dist/services/url-fetcher.d.ts +8 -0
- package/dist/services/url-fetcher.d.ts.map +1 -0
- package/dist/services/url-fetcher.js +40 -0
- package/dist/services/url-fetcher.js.map +1 -0
- package/dist/services/validation-service.d.ts +49 -0
- package/dist/services/validation-service.d.ts.map +1 -0
- package/dist/services/validation-service.js +670 -0
- package/dist/services/validation-service.js.map +1 -0
- package/dist/tools/index.d.ts +10 -0
- package/dist/tools/index.d.ts.map +1 -0
- package/dist/tools/index.js +46 -0
- package/dist/tools/index.js.map +1 -0
- package/dist/tools/tier1/get-metadata.d.ts +6 -0
- package/dist/tools/tier1/get-metadata.d.ts.map +1 -0
- package/dist/tools/tier1/get-metadata.js +49 -0
- package/dist/tools/tier1/get-metadata.js.map +1 -0
- package/dist/tools/tier1/get-page-count.d.ts +6 -0
- package/dist/tools/tier1/get-page-count.d.ts.map +1 -0
- package/dist/tools/tier1/get-page-count.js +50 -0
- package/dist/tools/tier1/get-page-count.js.map +1 -0
- package/dist/tools/tier1/read-images.d.ts +6 -0
- package/dist/tools/tier1/read-images.d.ts.map +1 -0
- package/dist/tools/tier1/read-images.js +79 -0
- package/dist/tools/tier1/read-images.js.map +1 -0
- package/dist/tools/tier1/read-text.d.ts +6 -0
- package/dist/tools/tier1/read-text.d.ts.map +1 -0
- package/dist/tools/tier1/read-text.js +57 -0
- package/dist/tools/tier1/read-text.js.map +1 -0
- package/dist/tools/tier1/read-url.d.ts +6 -0
- package/dist/tools/tier1/read-url.d.ts.map +1 -0
- package/dist/tools/tier1/read-url.js +64 -0
- package/dist/tools/tier1/read-url.js.map +1 -0
- package/dist/tools/tier1/search-text.d.ts +6 -0
- package/dist/tools/tier1/search-text.d.ts.map +1 -0
- package/dist/tools/tier1/search-text.js +62 -0
- package/dist/tools/tier1/search-text.js.map +1 -0
- package/dist/tools/tier1/summarize.d.ts +6 -0
- package/dist/tools/tier1/summarize.d.ts.map +1 -0
- package/dist/tools/tier1/summarize.js +70 -0
- package/dist/tools/tier1/summarize.js.map +1 -0
- package/dist/tools/tier2/inspect-annotations.d.ts +6 -0
- package/dist/tools/tier2/inspect-annotations.d.ts.map +1 -0
- package/dist/tools/tier2/inspect-annotations.js +47 -0
- package/dist/tools/tier2/inspect-annotations.js.map +1 -0
- package/dist/tools/tier2/inspect-fonts.d.ts +6 -0
- package/dist/tools/tier2/inspect-fonts.d.ts.map +1 -0
- package/dist/tools/tier2/inspect-fonts.js +54 -0
- package/dist/tools/tier2/inspect-fonts.js.map +1 -0
- package/dist/tools/tier2/inspect-signatures.d.ts +6 -0
- package/dist/tools/tier2/inspect-signatures.d.ts.map +1 -0
- package/dist/tools/tier2/inspect-signatures.js +48 -0
- package/dist/tools/tier2/inspect-signatures.js.map +1 -0
- package/dist/tools/tier2/inspect-structure.d.ts +6 -0
- package/dist/tools/tier2/inspect-structure.d.ts.map +1 -0
- package/dist/tools/tier2/inspect-structure.js +46 -0
- package/dist/tools/tier2/inspect-structure.js.map +1 -0
- package/dist/tools/tier2/inspect-tags.d.ts +6 -0
- package/dist/tools/tier2/inspect-tags.d.ts.map +1 -0
- package/dist/tools/tier2/inspect-tags.js +46 -0
- package/dist/tools/tier2/inspect-tags.js.map +1 -0
- package/dist/tools/tier3/compare-structure.d.ts +6 -0
- package/dist/tools/tier3/compare-structure.d.ts.map +1 -0
- package/dist/tools/tier3/compare-structure.js +47 -0
- package/dist/tools/tier3/compare-structure.js.map +1 -0
- package/dist/tools/tier3/validate-metadata.d.ts +6 -0
- package/dist/tools/tier3/validate-metadata.d.ts.map +1 -0
- package/dist/tools/tier3/validate-metadata.js +57 -0
- package/dist/tools/tier3/validate-metadata.js.map +1 -0
- package/dist/tools/tier3/validate-tagged.d.ts +6 -0
- package/dist/tools/tier3/validate-tagged.d.ts.map +1 -0
- package/dist/tools/tier3/validate-tagged.js +56 -0
- package/dist/tools/tier3/validate-tagged.js.map +1 -0
- package/dist/types.d.ts +226 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/types.js +5 -0
- package/dist/types.js.map +1 -0
- package/dist/utils/batch-processor.d.ts +60 -0
- package/dist/utils/batch-processor.d.ts.map +1 -0
- package/dist/utils/batch-processor.js +72 -0
- package/dist/utils/batch-processor.js.map +1 -0
- package/dist/utils/error-handler.d.ts +23 -0
- package/dist/utils/error-handler.d.ts.map +1 -0
- package/dist/utils/error-handler.js +76 -0
- package/dist/utils/error-handler.js.map +1 -0
- package/dist/utils/formatter.d.ts +64 -0
- package/dist/utils/formatter.d.ts.map +1 -0
- package/dist/utils/formatter.js +379 -0
- package/dist/utils/formatter.js.map +1 -0
- package/dist/utils/pdf-helpers.d.ts +22 -0
- package/dist/utils/pdf-helpers.d.ts.map +1 -0
- package/dist/utils/pdf-helpers.js +68 -0
- package/dist/utils/pdf-helpers.js.map +1 -0
- package/package.json +78 -0
|
@@ -0,0 +1,670 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Validation & comparison service for Tier 3 tools.
|
|
3
|
+
*
|
|
4
|
+
* Provides PDF/UA tag validation, metadata conformance checks,
|
|
5
|
+
* and structural comparison between two PDFs.
|
|
6
|
+
*/
|
|
7
|
+
import { basename } from 'node:path';
|
|
8
|
+
import { formatFileSize } from '../utils/formatter.js';
|
|
9
|
+
import { getMetadata, loadDocument, analyzeTagsFromDoc, countImagesFromDoc, } from './pdfjs-service.js';
|
|
10
|
+
import { analyzeStructure, analyzeFontsWithPdfLib } from './pdflib-service.js';
|
|
11
|
+
// ─── validate_tagged ─────────────────────────────────────
|
|
12
|
+
/**
 * Run the tagged-structure (PDF/UA oriented) checks against one PDF.
 *
 * The document is opened once via loadDocument; tag analysis and image
 * counting share that handle, which is destroyed before any check is
 * evaluated.
 *
 * Checks, in the order they are reported (one issue entry per check):
 * - TAG-001: document is marked as tagged (validation stops here if not)
 * - TAG-002: a structure tree root tag was reported
 * - TAG-003: a Document role is present
 * - TAG-004: the H1-H6 levels in use start at H1 and skip no level
 *            (a document using only generic H tags passes)
 * - TAG-005: Figure tag count compared with the detected image count
 * - TAG-006: P (paragraph) tags are present
 * - TAG-007: at least two structure elements exist
 * - TAG-008: when Table tags exist, TR and TH tags exist as well
 *
 * @param filePath - Location of the PDF; passed unchanged to loadDocument.
 * @returns An object with isTagged, totalChecks, passed, failed, warnings,
 *   issues (severity/code/message and optional details) and summary.
 */
export async function validateTagged(filePath) {
    const issues = [];
    const tally = { info: 0, warning: 0, error: 0 };
    let totalChecks = 0;
    // Every check produces exactly one issue entry, so recording an entry
    // also advances the check counter and the matching severity counter.
    const report = (severity, code, message, details) => {
        totalChecks += 1;
        tally[severity] += 1;
        issues.push(details === undefined
            ? { severity, code, message }
            : { severity, code, message, details });
    };
    // Load the document only once and share it between tag analysis and
    // image counting, which run in parallel.
    const pdf = await loadDocument(filePath);
    let analysisResults;
    try {
        analysisResults = await Promise.all([
            analyzeTagsFromDoc(pdf),
            countImagesFromDoc(pdf),
        ]);
    }
    finally {
        await pdf.destroy();
    }
    const [tagsAnalysis, imageCount] = analysisResults;
    // TAG-001: without the tagged flag nothing else can be validated.
    if (!tagsAnalysis.isTagged) {
        report('error', 'TAG-001', 'Document is not tagged', 'The PDF does not have the Marked flag set in the MarkInfo dictionary. Tagged PDF is required for PDF/UA compliance.');
        return {
            isTagged: false,
            totalChecks,
            passed: tally.info,
            failed: tally.error,
            warnings: tally.warning,
            issues,
            summary: 'Document is not tagged. PDF/UA validation cannot proceed.',
        };
    }
    report('info', 'TAG-001', 'Document is marked as tagged');
    const roles = tagsAnalysis.roleCounts;
    // TAG-002: structure tree root.
    if (tagsAnalysis.rootTag) {
        report('info', 'TAG-002', 'Structure tree root exists');
    }
    else {
        report('error', 'TAG-002', 'No structure tree root found', 'The document is marked as tagged but has no StructTreeRoot.');
    }
    // TAG-003: Document-level root tag.
    if (roles.Document !== undefined) {
        report('info', 'TAG-003', 'Document root tag present');
    }
    else {
        report('warning', 'TAG-003', 'No Document root tag found', 'PDF/UA recommends a single Document tag as the root of the structure tree.');
    }
    // TAG-004: heading hierarchy. Collect the numbered levels that occur.
    const headingLevels = [1, 2, 3, 4, 5, 6].filter((level) => roles[`H${level}`]);
    const usesGenericH = roles.H !== undefined;
    if (headingLevels.length > 0) {
        const levelList = headingLevels.map((level) => `H${level}`).join(', ');
        // The levels are ascending and distinct, so "starts at H1 with no
        // skipped level" means the n-th entry is exactly n.
        const gapFree = headingLevels.every((level, position) => level === position + 1);
        if (gapFree) {
            report('info', 'TAG-004', `Heading hierarchy is sequential: ${levelList}`);
        }
        else {
            report('warning', 'TAG-004', `Heading hierarchy has gaps: ${levelList}`, 'Heading levels should be sequential without skipping levels (e.g., H1 → H2 → H3).');
        }
    }
    else if (usesGenericH) {
        report('info', 'TAG-004', 'Generic H tags used for headings');
    }
    else {
        report('warning', 'TAG-004', 'No heading tags found', 'Document has no heading tags (H1-H6 or H). Headings are recommended for document navigation.');
    }
    // TAG-005: Figure tags versus detected images.
    const figureCount = roles.Figure ?? 0;
    if (!(imageCount > 0)) {
        report('info', 'TAG-005', 'No images detected; Figure tag check not applicable');
    }
    else if (figureCount === 0) {
        report('error', 'TAG-005', `Document has ${imageCount} image(s) but no Figure tags`, 'Images must be tagged as Figure with appropriate alt text for PDF/UA compliance.');
    }
    else if (figureCount < imageCount) {
        report('warning', 'TAG-005', `${imageCount} image(s) found but only ${figureCount} Figure tag(s)`, 'Some images may not be properly tagged. Decorative images should be marked as artifacts.');
    }
    else {
        report('info', 'TAG-005', `${figureCount} Figure tag(s) for ${imageCount} image(s)`);
    }
    // TAG-006: paragraph tags.
    if (roles.P !== undefined) {
        report('info', 'TAG-006', `${roles.P} Paragraph tag(s) found`);
    }
    else {
        report('warning', 'TAG-006', 'No P (Paragraph) tags found', 'Text content should be wrapped in P tags for proper reading order.');
    }
    // TAG-007: minimum number of structure elements.
    const elementTotal = tagsAnalysis.totalElements;
    if (elementTotal < 2) {
        report('warning', 'TAG-007', `Only ${elementTotal} structure element(s) found`, 'A properly tagged document should have multiple structure elements reflecting its content.');
    }
    else {
        report('info', 'TAG-007', `${elementTotal} structure elements found`);
    }
    // TAG-008: table structure (only meaningful when Table tags exist).
    const tableCount = roles.Table ?? 0;
    const trCount = roles.TR ?? 0;
    const tdCount = roles.TD ?? 0;
    const thCount = roles.TH ?? 0;
    if (tableCount === 0 || !(tableCount > 0)) {
        report('info', 'TAG-008', 'No Table tags found; table check not applicable');
    }
    else if (trCount === 0) {
        report('warning', 'TAG-008', `${tableCount} Table tag(s) but no TR (Table Row) tags`, 'Tables should contain TR, TH, and TD tags for proper structure.');
    }
    else if (thCount === 0) {
        report('warning', 'TAG-008', `Table has ${trCount} row(s) but no TH (Table Header) tags`, 'PDF/UA requires tables to have header cells (TH) for accessibility.');
    }
    else {
        report('info', 'TAG-008', `Table structure: ${tableCount} table(s), ${trCount} row(s), ${thCount} header(s), ${tdCount} cell(s)`);
    }
    // The severity tallies equal the number of issue entries of each kind.
    const passed = tally.info;
    const errorCount = tally.error;
    const warnCount = tally.warning;
    let summary = `${passed}/${totalChecks} checks passed, ${errorCount} error(s), ${warnCount} warning(s). Critical issues must be resolved for PDF/UA compliance.`;
    if (errorCount === 0) {
        summary = warnCount === 0
            ? `All ${totalChecks} checks passed. Document appears well-structured for PDF/UA compliance.`
            : `${passed}/${totalChecks} checks passed with ${warnCount} warning(s). Review warnings for full PDF/UA compliance.`;
    }
    return {
        isTagged: true,
        totalChecks,
        passed,
        failed: errorCount,
        warnings: warnCount,
        issues,
        summary,
    };
}
|
|
306
|
+
// ─── validate_metadata ───────────────────────────────────
|
|
307
|
+
/**
 * Validate the metadata of one PDF and report one issue entry per check.
 *
 * Checks, in the order they are reported:
 * - META-001: Title is present (error when missing)
 * - META-002: Author is present
 * - META-003: CreationDate is present and passes isValidPdfDate
 * - META-004: ModificationDate is present and passes isValidPdfDate
 * - META-005: Producer is present
 * - META-006: PDF version was detected (versions below 1.4 are annotated)
 * - META-007: document is flagged as tagged
 * - META-008: Subject is present
 * - META-009: Keywords are present
 * - META-010: document is not encrypted
 *
 * @param filePath - Location of the PDF; passed unchanged to getMetadata.
 * @returns An object with totalChecks, passed, failed, warnings, issues
 *   (severity/code/message and optional details), a metadata presence
 *   overview and a summary sentence.
 */
export async function validateMetadata(filePath) {
    const issues = [];
    const tally = { info: 0, warning: 0, error: 0 };
    let totalChecks = 0;
    // Every check produces exactly one issue entry, so recording an entry
    // also advances the check counter and the matching severity counter.
    const report = (severity, code, message, details) => {
        totalChecks += 1;
        tally[severity] += 1;
        issues.push(details === undefined
            ? { severity, code, message }
            : { severity, code, message, details });
    };
    // Presence check for a text field: quoted value when present, otherwise
    // the given severity with an explanatory message.
    const checkPresence = (code, label, value, severity, missingMessage, missingDetails) => {
        if (value) {
            report('info', code, `${label}: "${value}"`);
        }
        else {
            report(severity, code, missingMessage, missingDetails);
        }
    };
    // Presence + format check for a date field. Both dates go through the
    // same validation so that a malformed modification date is flagged the
    // same way a malformed creation date is.
    const checkDate = (code, label, value, missingDetails) => {
        if (!value) {
            report('warning', code, `${label} is missing`, missingDetails);
        }
        else if (isValidPdfDate(value)) {
            report('info', code, `${label}: ${value}`);
        }
        else {
            report('warning', code, `${label} format may be non-standard: ${value}`, "PDF date format should follow D:YYYYMMDDHHmmSSOHH'mm'.");
        }
    };
    const meta = await getMetadata(filePath);
    // Check 1: Title (the only metadata check that can produce an error).
    checkPresence('META-001', 'Title', meta.title, 'error', 'Title is missing', 'A document title is required for PDF/UA and PDF/A compliance. It is used by assistive technology and search engines.');
    // Check 2: Author
    checkPresence('META-002', 'Author', meta.author, 'warning', 'Author is missing', 'Author metadata is recommended for document identification.');
    // Check 3: CreationDate
    checkDate('META-003', 'Creation date', meta.creationDate, 'Creation date metadata is recommended.');
    // Check 4: ModificationDate
    checkDate('META-004', 'Modification date', meta.modificationDate, 'Modification date metadata is recommended for document tracking.');
    // Check 5: Producer
    checkPresence('META-005', 'Producer', meta.producer, 'warning', 'Producer is missing', 'Producer identifies the application that created the PDF. Useful for debugging rendering issues.');
    // Check 6: PDF version
    if (!meta.pdfVersion) {
        report('warning', 'META-006', 'PDF version not detected', 'The PDF version could not be determined from the file header.');
    }
    else {
        // An unparsable version yields NaN, which compares false and thus
        // falls through to the plain message.
        const version = Number.parseFloat(meta.pdfVersion);
        const versionMessage = version < 1.4
            ? `PDF version: ${meta.pdfVersion} (pre-1.4; limited feature support)`
            : `PDF version: ${meta.pdfVersion}`;
        report('info', 'META-006', versionMessage);
    }
    // Check 7: Tagged flag
    if (meta.isTagged) {
        report('info', 'META-007', 'Document is tagged');
    }
    else {
        report('warning', 'META-007', 'Document is not tagged', 'Tagged PDF is required for PDF/UA compliance and recommended for accessibility.');
    }
    // Check 8: Subject
    checkPresence('META-008', 'Subject', meta.subject, 'warning', 'Subject is missing', 'Subject metadata helps describe the document purpose.');
    // Check 9: Keywords
    checkPresence('META-009', 'Keywords', meta.keywords, 'warning', 'Keywords are missing', 'Keywords metadata aids document discoverability.');
    // Check 10: Encryption and accessibility
    if (meta.isEncrypted) {
        report('warning', 'META-010', 'Document is encrypted', 'Encryption may restrict assistive technology access. Ensure accessibility permissions are enabled.');
    }
    else {
        report('info', 'META-010', 'Document is not encrypted');
    }
    // The severity tallies equal the number of issue entries of each kind.
    const passed = tally.info;
    const failed = tally.error;
    const warnings = tally.warning;
    let summary;
    if (failed === 0 && warnings === 0) {
        summary = `All ${totalChecks} metadata checks passed.`;
    }
    else if (failed === 0) {
        summary = `${passed}/${totalChecks} checks passed with ${warnings} warning(s).`;
    }
    else {
        summary = `${passed}/${totalChecks} checks passed, ${failed} error(s), ${warnings} warning(s).`;
    }
    return {
        totalChecks,
        passed,
        failed,
        warnings,
        issues,
        metadata: {
            hasTitle: !!meta.title,
            hasAuthor: !!meta.author,
            hasSubject: !!meta.subject,
            hasKeywords: !!meta.keywords,
            hasCreator: !!meta.creator,
            hasProducer: !!meta.producer,
            hasCreationDate: !!meta.creationDate,
            hasModificationDate: !!meta.modificationDate,
            pdfVersion: meta.pdfVersion,
            isTagged: meta.isTagged,
        },
        summary,
    };
}
|
|
572
|
+
// ─── compare_structure ───────────────────────────────────
|
|
573
|
+
/**
 * Compare two PDF documents property by property.
 *
 * Structure, font and metadata analyses for both files are started
 * together and awaited as a group. The properties compared, in report
 * order, are:
 * - Page Count
 * - PDF Version
 * - Encrypted
 * - Tagged
 * - Total Objects
 * - Stream Count
 * - Page 1 Dimensions (pt) (first media box sample, 'N/A' if absent)
 * - File Size
 * - Catalog Entries
 * - Has Signatures
 * - Total Fonts
 *
 * Font names are additionally split into those unique to each file and
 * those present in both.
 *
 * @param {string} filePath1 - Path of the first PDF.
 * @param {string} filePath2 - Path of the second PDF.
 * @returns An object with file1/file2 (base names), diffs, fontComparison
 *   ({ onlyInFile1, onlyInFile2, inBoth }) and a summary sentence.
 */
export async function compareStructure(filePath1, filePath2) {
    // Kick off all six analyses at once.
    const [structA, structB, fontsA, fontsB, metaA, metaB] = await Promise.all([
        analyzeStructure(filePath1),
        analyzeStructure(filePath2),
        analyzeFontsWithPdfLib(filePath1),
        analyzeFontsWithPdfLib(filePath2),
        getMetadata(filePath1),
        getMetadata(filePath2),
    ]);
    // Renders a media box sample as "width x height", or 'N/A' when missing.
    const describeBox = (box) => (box ? `${box.width} x ${box.height}` : 'N/A');
    // [property label, value for file 1, value for file 2]
    const scalarRows = [
        ['Page Count', String(structA.pageTree.totalPages), String(structB.pageTree.totalPages)],
        ['PDF Version', structA.pdfVersion ?? 'Unknown', structB.pdfVersion ?? 'Unknown'],
        ['Encrypted', String(structA.isEncrypted), String(structB.isEncrypted)],
        ['Tagged', String(metaA.isTagged), String(metaB.isTagged)],
        ['Total Objects', String(structA.objectStats.totalObjects), String(structB.objectStats.totalObjects)],
        ['Stream Count', String(structA.objectStats.streamCount), String(structB.objectStats.streamCount)],
        [
            'Page 1 Dimensions (pt)',
            describeBox(structA.pageTree.mediaBoxSamples[0]),
            describeBox(structB.pageTree.mediaBoxSamples[0]),
        ],
        ['File Size', formatFileSize(metaA.fileSize), formatFileSize(metaB.fileSize)],
        ['Catalog Entries', String(structA.catalog.length), String(structB.catalog.length)],
        ['Has Signatures', String(metaA.hasSignatures), String(metaB.hasSignatures)],
    ];
    const diffs = [];
    for (const [property, first, second] of scalarRows) {
        addDiff(diffs, property, first, second);
    }
    // Font comparison: set difference / intersection on the font map keys.
    const namesA = new Set(fontsA.fontMap.keys());
    const namesB = new Set(fontsB.fontMap.keys());
    const onlyInFile1 = [];
    const inBoth = [];
    for (const fontName of namesA) {
        (namesB.has(fontName) ? inBoth : onlyInFile1).push(fontName);
    }
    const onlyInFile2 = [...namesB].filter((fontName) => !namesA.has(fontName));
    addDiff(diffs, 'Total Fonts', String(namesA.size), String(namesB.size));
    // Summary
    let matchCount = 0;
    let diffCount = 0;
    for (const { status } of diffs) {
        if (status === 'match') {
            matchCount += 1;
        }
        else if (status === 'differ') {
            diffCount += 1;
        }
    }
    const summary = diffCount === 0
        ? `All ${matchCount} properties match between the two PDFs.`
        : `${diffCount} difference(s) found out of ${diffs.length} properties compared.`;
    return {
        file1: basename(filePath1),
        file2: basename(filePath2),
        diffs,
        fontComparison: { onlyInFile1, onlyInFile2, inBoth },
        summary,
    };
}
|
|
640
|
+
// ─── Internal helpers ────────────────────────────────────
|
|
641
|
+
/**
 * Append one comparison row to `diffs`.
 *
 * The row's status is 'match' when both values are strictly equal (===)
 * and 'differ' otherwise.
 *
 * @param {Array} diffs - Accumulator that receives the new row (mutated).
 * @param property - Label of the compared property.
 * @param val1 - Value taken from the first file.
 * @param val2 - Value taken from the second file.
 */
function addDiff(diffs, property, val1, val2) {
    let status = 'differ';
    if (val1 === val2) {
        status = 'match';
    }
    const row = { property, file1Value: val1, file2Value: val2, status };
    diffs.push(row);
}
|
|
649
|
+
/**
 * Check whether a date string looks like a valid PDF date.
 *
 * Per ISO 32000-1 §7.9.4, the PDF date format is:
 *   D:YYYYMMDDHHmmSSOHH'mm'
 * where YYYY is required and each following part is optional (a part can
 * only appear when all parts before it are present).
 * O is the timezone indicator: +, -, or Z.
 *
 * ISO 8601 dates (YYYY-MM-DD, optionally followed by THH:mm[:ss] and a
 * Z or ±HH[:]mm offset) are accepted as well.
 *
 * @param {string} dateStr - Candidate date string.
 * @returns {boolean} true when the string matches either format.
 */
function isValidPdfDate(dateStr) {
    // PDF date: D:YYYY with optional MM DD HH mm SS and timezone.
    // The hyphen in the timezone class is escaped on purpose: an unescaped
    // "+-Z" is a character RANGE from '+' to 'Z' and would accept digits,
    // uppercase letters and punctuation as a "timezone indicator".
    const pdfDatePattern = /^D:\d{4}(\d{2}(\d{2}(\d{2}(\d{2}(\d{2}([+\-Z](\d{2}'\d{2}'?)?)?)?)?)?)?)?$/;
    // ISO 8601 dates: YYYY-MM-DD with optional time and offset.
    const isoDatePattern = /^\d{4}-\d{2}-\d{2}(T\d{2}:\d{2}(:\d{2})?([+-]\d{2}:?\d{2}|Z)?)?$/;
    return pdfDatePattern.test(dateStr) || isoDatePattern.test(dateStr);
}
|
|
670
|
+
//# sourceMappingURL=validation-service.js.map
|