@geotechcli/core 0.4.22 → 0.4.24
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/agents/brain.d.ts.map +1 -1
- package/dist/agents/brain.js +2 -1
- package/dist/agents/brain.js.map +1 -1
- package/dist/agents/data-tools.js +759 -0
- package/dist/agents/data-tools.js.map +1 -1
- package/dist/agents/swarm.d.ts.map +1 -1
- package/dist/agents/swarm.js +22 -2
- package/dist/agents/swarm.js.map +1 -1
- package/dist/agents/tool-runtime.d.ts +7 -0
- package/dist/agents/tool-runtime.d.ts.map +1 -0
- package/dist/agents/tool-runtime.js +9 -0
- package/dist/agents/tool-runtime.js.map +1 -0
- package/dist/config/index.d.ts +4 -4
- package/dist/config/index.js +1 -1
- package/dist/config/index.js.map +1 -1
- package/dist/geo/coordinates.d.ts +40 -0
- package/dist/geo/coordinates.d.ts.map +1 -0
- package/dist/geo/coordinates.js +461 -0
- package/dist/geo/coordinates.js.map +1 -0
- package/dist/geo/index.d.ts +1 -0
- package/dist/geo/index.d.ts.map +1 -1
- package/dist/geo/index.js +1 -0
- package/dist/geo/index.js.map +1 -1
- package/dist/index.d.ts +3 -2
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +3 -2
- package/dist/index.js.map +1 -1
- package/dist/ingest/ags.d.ts +3 -0
- package/dist/ingest/ags.d.ts.map +1 -1
- package/dist/ingest/ags.js +98 -9
- package/dist/ingest/ags.js.map +1 -1
- package/dist/ingest/cpt.d.ts +4 -0
- package/dist/ingest/cpt.d.ts.map +1 -1
- package/dist/ingest/cpt.js +87 -25
- package/dist/ingest/cpt.js.map +1 -1
- package/dist/ingest/document-inputs.d.ts +37 -0
- package/dist/ingest/document-inputs.d.ts.map +1 -0
- package/dist/ingest/document-inputs.js +197 -0
- package/dist/ingest/document-inputs.js.map +1 -0
- package/dist/ingest/geotech-document.d.ts +118 -0
- package/dist/ingest/geotech-document.d.ts.map +1 -0
- package/dist/ingest/geotech-document.js +1006 -0
- package/dist/ingest/geotech-document.js.map +1 -0
- package/dist/ingest/geotech-extract.d.ts +86 -0
- package/dist/ingest/geotech-extract.d.ts.map +1 -0
- package/dist/ingest/geotech-extract.js +652 -0
- package/dist/ingest/geotech-extract.js.map +1 -0
- package/dist/ingest/geotech-schemas.d.ts +248 -0
- package/dist/ingest/geotech-schemas.d.ts.map +1 -0
- package/dist/ingest/geotech-schemas.js +150 -0
- package/dist/ingest/geotech-schemas.js.map +1 -0
- package/dist/ingest/index.d.ts +8 -0
- package/dist/ingest/index.d.ts.map +1 -1
- package/dist/ingest/index.js +8 -0
- package/dist/ingest/index.js.map +1 -1
- package/dist/ingest/ingest-job-child.d.ts +2 -0
- package/dist/ingest/ingest-job-child.d.ts.map +1 -0
- package/dist/ingest/ingest-job-child.js +45 -0
- package/dist/ingest/ingest-job-child.js.map +1 -0
- package/dist/ingest/job-store.d.ts +117 -0
- package/dist/ingest/job-store.d.ts.map +1 -0
- package/dist/ingest/job-store.js +541 -0
- package/dist/ingest/job-store.js.map +1 -0
- package/dist/ingest/job-worker.d.ts +24 -0
- package/dist/ingest/job-worker.d.ts.map +1 -0
- package/dist/ingest/job-worker.js +1129 -0
- package/dist/ingest/job-worker.js.map +1 -0
- package/dist/ingest/pdf.d.ts +102 -0
- package/dist/ingest/pdf.d.ts.map +1 -0
- package/dist/ingest/pdf.js +1544 -0
- package/dist/ingest/pdf.js.map +1 -0
- package/dist/ingest/review-store.d.ts +215 -0
- package/dist/ingest/review-store.d.ts.map +1 -0
- package/dist/ingest/review-store.js +1995 -0
- package/dist/ingest/review-store.js.map +1 -0
- package/dist/llm/capabilities.d.ts +8 -0
- package/dist/llm/capabilities.d.ts.map +1 -0
- package/dist/llm/capabilities.js +73 -0
- package/dist/llm/capabilities.js.map +1 -0
- package/dist/llm/index.d.ts +3 -2
- package/dist/llm/index.d.ts.map +1 -1
- package/dist/llm/index.js +2 -1
- package/dist/llm/index.js.map +1 -1
- package/dist/llm/providers/anthropic.d.ts +6 -0
- package/dist/llm/providers/anthropic.d.ts.map +1 -1
- package/dist/llm/providers/anthropic.js +10 -1
- package/dist/llm/providers/anthropic.js.map +1 -1
- package/dist/llm/providers/hosted-beta.d.ts +6 -0
- package/dist/llm/providers/hosted-beta.d.ts.map +1 -1
- package/dist/llm/providers/hosted-beta.js +40 -10
- package/dist/llm/providers/hosted-beta.js.map +1 -1
- package/dist/llm/providers/huggingface.d.ts +6 -0
- package/dist/llm/providers/huggingface.d.ts.map +1 -1
- package/dist/llm/providers/huggingface.js +21 -1
- package/dist/llm/providers/huggingface.js.map +1 -1
- package/dist/llm/providers/openai-compatible.d.ts +6 -0
- package/dist/llm/providers/openai-compatible.d.ts.map +1 -1
- package/dist/llm/providers/openai-compatible.js +21 -1
- package/dist/llm/providers/openai-compatible.js.map +1 -1
- package/dist/llm/providers/zhipu.d.ts +6 -0
- package/dist/llm/providers/zhipu.d.ts.map +1 -1
- package/dist/llm/providers/zhipu.js +15 -1
- package/dist/llm/providers/zhipu.js.map +1 -1
- package/dist/llm/router.d.ts +7 -0
- package/dist/llm/router.d.ts.map +1 -1
- package/dist/llm/router.js +33 -13
- package/dist/llm/router.js.map +1 -1
- package/dist/llm/types.d.ts +22 -4
- package/dist/llm/types.d.ts.map +1 -1
- package/dist/llm/types.js.map +1 -1
- package/dist/meta/metadata.json +1 -1
- package/dist/report/html.d.ts +3 -0
- package/dist/report/html.d.ts.map +1 -0
- package/dist/report/html.js +626 -0
- package/dist/report/html.js.map +1 -0
- package/dist/report/index.d.ts +2 -0
- package/dist/report/index.d.ts.map +1 -1
- package/dist/report/index.js +2 -0
- package/dist/report/index.js.map +1 -1
- package/dist/report/ingest-dossier.d.ts +81 -0
- package/dist/report/ingest-dossier.d.ts.map +1 -0
- package/dist/report/ingest-dossier.js +324 -0
- package/dist/report/ingest-dossier.js.map +1 -0
- package/dist/storage/index.d.ts +5 -0
- package/dist/storage/index.d.ts.map +1 -1
- package/dist/storage/index.js +12 -6
- package/dist/storage/index.js.map +1 -1
- package/dist/vision/geotech-document.d.ts +46 -0
- package/dist/vision/geotech-document.d.ts.map +1 -0
- package/dist/vision/geotech-document.js +576 -0
- package/dist/vision/geotech-document.js.map +1 -0
- package/dist/vision/index.d.ts +31 -0
- package/dist/vision/index.d.ts.map +1 -1
- package/dist/vision/index.js +659 -27
- package/dist/vision/index.js.map +1 -1
- package/dist/vision/ocr.d.ts +29 -0
- package/dist/vision/ocr.d.ts.map +1 -0
- package/dist/vision/ocr.js +287 -0
- package/dist/vision/ocr.js.map +1 -0
- package/dist/vision/preprocess.d.ts +26 -0
- package/dist/vision/preprocess.d.ts.map +1 -0
- package/dist/vision/preprocess.js +194 -0
- package/dist/vision/preprocess.js.map +1 -0
- package/package.json +5 -1
|
@@ -0,0 +1,1544 @@
|
|
|
1
|
+
import { readFileSync } from 'node:fs';
|
|
2
|
+
import { resolve } from 'node:path';
|
|
3
|
+
import { inflateSync } from 'node:zlib';
|
|
4
|
+
import { encodeRawRasterToPng, preprocessVisionImageBuffer, renderPdfPageToImageBuffer, } from '../vision/preprocess.js';
|
|
5
|
+
// Tokens that open and close a PDF dictionary.
const DICTIONARY_START = '<<';
const DICTIONARY_END = '>>';
// Content-stream operator tokens treated as vector graphics. In PDF these are
// path construction (m l c v y h re), path painting (S s f F f* B B* b b*),
// clipping / end-path (W W* n) and shading (sh).
const VECTOR_GRAPHICS_OPERATORS = new Set([
    'm', 'l', 'c', 'v', 'y', 'h', 're',
    'S', 's', 'f', 'F', 'f*', 'B', 'B*', 'b', 'b*',
    'W', 'W*', 'n', 'sh',
]);
// Content-stream operator tokens this module knows about: text objects and
// text state/showing, XObject painting (Do), path operators, graphics state
// (q Q cm) and basic colour operators (rg RG g G).
// NOTE(review): 'sh' is in VECTOR_GRAPHICS_OPERATORS but not in this set;
// confirm against the content-stream analyser whether that is intentional.
const CONTENT_STREAM_OPERATORS = new Set([
    'BT', 'ET', 'Tf', 'Tm', 'Td', 'TD', 'T*', 'Tc', 'Tw', 'Tz', 'TL', 'Tr', 'Ts',
    'Tj', 'TJ', '\'', '"',
    'Do',
    'm', 'l', 'c', 'v', 'y', 'h', 're',
    'S', 's', 'f', 'F', 'f*', 'B', 'B*', 'b', 'b*',
    'W', 'W*', 'n', 'q', 'Q', 'cm', 'rg', 'RG', 'g', 'G',
]);
// Lower-case English and geotechnical vocabulary; presumably consulted by the
// text-quality scoring defined later in this file (not visible here).
const TEXT_QUALITY_DICTIONARY = new Set([
    'and', 'at', 'borehole', 'classification', 'clay', 'cohesion', 'content',
    'depth', 'description', 'easting', 'elevation', 'engineering', 'fill',
    'foundation', 'friction', 'geology', 'geotechnical', 'ground', 'groundwater',
    'investigation', 'layer', 'limit', 'lithology', 'moisture', 'northing',
    'parameter', 'permeability', 'plasticity', 'report', 'rock', 'sample',
    'sand', 'silt', 'soil', 'spt', 'strength', 'table', 'test', 'unit',
    'water', 'weight',
]);
|
|
29
|
+
/**
 * Inspects a PDF and returns a per-page and document-level summary.
 *
 * @param {string | ArrayLike<number> | ArrayBuffer} input - A filesystem path
 *   (read synchronously) or the PDF bytes, which are copied into a new Buffer.
 * @returns {object} A `pdf-document-inspection` record containing the page
 *   inspections, aggregated capabilities, degradation notes, metadata and
 *   de-duplicated warnings.
 */
export function inspectPdfDocument(input) {
    const buffer = typeof input === 'string' ? readFileSync(resolve(input)) : Buffer.from(input);
    const pdfVersion = readPdfVersion(buffer);
    const isEncrypted = hasPdfEncryptMarker(buffer);
    // Encryption is only reported; parsing still proceeds on a best-effort basis.
    const warnings = isEncrypted
        ? ['Encrypted PDF markers were detected. This MVP parser does not support decryption.']
        : [];
    const objects = parsePdfObjects(buffer);
    const metadata = {
        parser: 'lightweight-page-inspector',
        byteLength: buffer.byteLength,
        pdfVersion,
        isEncrypted,
        objectCount: objects.size,
    };
    const pageObjects = collectPageObjects(objects);
    const totalPages = pageObjects.length;
    const pages = pageObjects.map((pageObject, index) => inspectPage(pageObject, index + 1, totalPages, buffer, objects));
    // Page-level warnings are folded into the document-level list.
    for (const { warnings: pageWarnings } of pages) {
        warnings.push(...pageWarnings);
    }
    const capabilities = summarizeDocumentCapabilities(pages);
    const degradation = summarizeDocumentDegradation(pages, warnings, metadata.isEncrypted);
    return {
        kind: 'pdf-document-inspection',
        totalPages,
        pages,
        capabilities,
        degradation,
        gracefulDegradationNotes: degradation.notes,
        metadata,
        warnings: uniqueStrings(warnings),
    };
}
|
|
65
|
+
/**
 * Extracts at most one embedded raster image per page.
 *
 * Pages are processed one after another, in page order; pages for which no
 * image could be extracted are omitted from the result.
 *
 * @param {string | ArrayLike<number> | ArrayBuffer} input - Path to a PDF or its bytes.
 * @returns {Promise<object[]>} The extracted page images.
 */
export async function extractPrimaryPdfPageImages(input) {
    const buffer = typeof input === 'string' ? readFileSync(resolve(input)) : Buffer.from(input);
    const objects = parsePdfObjects(buffer);
    const pageObjects = collectPageObjects(objects);
    const totalPages = pageObjects.length;
    const images = [];
    for (let index = 0; index < totalPages; index += 1) {
        // Page numbers are 1-based.
        const pageImage = await extractPrimaryPageImage(pageObjects[index], index + 1, totalPages, buffer, objects);
        if (pageImage) {
            images.push(pageImage);
        }
    }
    return images;
}
|
|
81
|
+
/**
 * Renders one PDF page to an image using the OCR-optimised preprocessing policy.
 *
 * @param {string | ArrayLike<number> | ArrayBuffer} input - Path to a PDF or its bytes.
 * @param {number} pageNumber - Page to render; forwarded unchanged to the renderer.
 * @param {{ scale?: number }} [options] - Optional render scale.
 * @returns {Promise<object | null>} A `page-render` image record, or `null`
 *   when the renderer produced nothing.
 */
export async function renderPdfPageImage(input, pageNumber, options) {
    const buffer = typeof input === 'string'
        ? readFileSync(resolve(input))
        : Buffer.from(input);
    const renderedPage = await renderPdfPageToImageBuffer(buffer, pageNumber, {
        scale: options?.scale,
        preprocessPolicy: 'ocr-optimized',
    });
    if (!renderedPage) {
        return null;
    }
    // Only the page count is needed here. Counting page objects directly yields
    // the same number inspectPdfDocument() reports, without decoding and
    // analysing every content stream of every page.
    const totalPages = collectPageObjects(parsePdfObjects(buffer)).length;
    return {
        pageNumber,
        totalPages,
        objectRef: `page:${pageNumber}`,
        mimeType: renderedPage.mimeType,
        width: renderedPage.width,
        height: renderedPage.height,
        byteLength: renderedPage.buffer.length,
        source: 'page-render',
        warnings: renderedPage.warnings,
        data: renderedPage.buffer,
    };
}
|
|
105
|
+
/**
 * Builds the inspection record for a single page.
 *
 * Decodes each referenced content stream, analyses it for text, vector and
 * image operators, scores the extracted text, classifies the page and derives
 * degradation notes and capabilities from those findings.
 *
 * @param {object} pageObject - Parsed page object (needs `ref` and `dictionaryEntries`).
 * @param {number} pageNumber - 1-based page number.
 * @param {number} totalPages - Number of pages in the document.
 * @param {Buffer} buffer - Raw PDF bytes.
 * @param {Map<string, object>} objects - Parsed PDF objects keyed by reference key.
 * @returns {object} The page inspection record.
 */
function inspectPage(pageObject, pageNumber, totalPages, buffer, objects) {
    const pageAssembly = assemblePage(pageObject, objects);
    const contentStreamRefs = parseSingleOrArrayRefs(pageObject.dictionaryEntries.get('Contents'));
    const contentStreamCount = contentStreamRefs.length;
    const warnings = [];
    const allFilters = [];
    const decodedAnalyses = [];
    for (const ref of contentStreamRefs) {
        const streamObject = objects.get(pdfRefKey(ref));
        if (!streamObject?.streamSpec) {
            warnings.push(`Content stream ${pdfRefLabel(ref)} could not be resolved.`);
            continue;
        }
        const decoded = decodeObjectStream(streamObject, objects, buffer);
        // Filters are recorded even when decoding fails.
        allFilters.push(...decoded.filters);
        if (decoded.warning) {
            warnings.push(decoded.warning);
            continue;
        }
        decodedAnalyses.push(analyzeContentStream(decoded.content));
    }
    // Each successfully decoded stream contributes exactly one analysis.
    const decodedContentStreamCount = decodedAnalyses.length;
    const undecodedContentStreamCount = Math.max(contentStreamCount - decodedContentStreamCount, 0);

    const extractedText = decodedAnalyses
        .flatMap(({ extractedText: streamText }) => (streamText.length > 0 ? [streamText] : []))
        .join('\n')
        .trim();
    const normalizedText = normalizeExtractedText(extractedText);
    const textQuality = assessPdfTextQuality(normalizedText);

    const hasTextOperators = decodedAnalyses.some((analysis) => analysis.hasTextOperators);
    const hasVectorGraphics = decodedAnalyses.some((analysis) => analysis.hasVectorGraphics);
    const hasInlineImages = decodedAnalyses.reduce((seen, analysis) => seen || analysis.hasInlineImages, false);
    const paintedXObjects = new Set();
    for (const analysis of decodedAnalyses) {
        for (const name of analysis.paintedXObjectNames) {
            paintedXObjects.add(name);
        }
    }
    // A painted XObject only counts as raster when the page resources declare it as an image.
    const paintedRasterImages = [...paintedXObjects].some((name) => pageAssembly.imageXObjectNames.has(name));
    const hasRasterImages = hasInlineImages || paintedRasterImages;

    const characterCount = normalizedText.length;
    const wordCount = normalizedText.length === 0 ? 0 : normalizedText.split(/\s+/).filter(Boolean).length;
    // Splitting an empty string yields one blank line, which the filter discards.
    const lineCount = extractedText.split(/\r?\n/).filter((line) => line.trim().length > 0).length;

    const classification = classifyPage({
        normalizedText,
        textQuality,
        hasTextOperators,
        hasRasterImages,
        hasVectorGraphics,
        contentStreamCount,
        decodedContentStreamCount,
    });
    const degradationNotes = buildPageDegradationNotes({
        classification,
        normalizedText,
        textQuality,
        hasTextOperators,
        hasRasterImages,
        contentStreamCount,
        decodedContentStreamCount,
        warnings,
    });
    const degradation = {
        level: inferDegradationLevel(degradationNotes, textQuality.accepted && normalizedText.length > 0),
        notes: degradationNotes,
    };
    // Rejected text is treated as absent for capability and content-hint purposes.
    const acceptedText = textQuality.accepted ? normalizedText : '';
    const capabilities = {
        nativeTextExtraction: inferPageTextCapability({
            normalizedText: acceptedText,
            hasTextOperators,
            contentStreamCount,
            decodedContentStreamCount,
        }),
        pageRendering: 'available',
        ocr: 'unavailable',
    };
    const contentHints = extractContentHints(acceptedText);

    let textSource;
    if (normalizedText.length === 0) {
        textSource = 'none';
    }
    else if (textQuality.accepted) {
        textSource = 'native-text';
    }
    else {
        textSource = 'native-text-low-quality';
    }
    const qualityWarnings = textQuality.reasons.map((reason) => `Native text quality: ${reason}`);
    const normalizedArtifact = {
        pageNumber,
        classification,
        rotation: pageAssembly.rotation,
        nativeText: textQuality.accepted ? normalizedText : null,
        textQuality,
        textSource,
        renderedImageAvailable: true,
        headingHints: contentHints.headings,
        tablesDetected: contentHints.tablesDetected,
        figuresDetected: contentHints.figuresDetected,
        warnings: uniqueStrings([...warnings, ...qualityWarnings]),
        confidence: Math.round(textQuality.score * 100),
    };
    return {
        pageNumber,
        totalPages,
        classification,
        extractedText,
        normalizedText,
        normalizedArtifact,
        gracefulDegradationNotes: degradation.notes,
        degradation,
        capabilities,
        metadata: {
            objectRef: pdfRefLabel(pageObject.ref),
            width: pageAssembly.width,
            height: pageAssembly.height,
            rotation: pageAssembly.rotation,
            characterCount,
            wordCount,
            lineCount,
            contentStreamCount,
            decodedContentStreamCount,
            undecodedContentStreamCount,
            contentFilters: uniqueStrings(allFilters),
            fontNames: pageAssembly.fontNames,
            hasTextOperators,
            hasRasterImages,
            hasVectorGraphics,
        },
        warnings: uniqueStrings(warnings),
    };
}
|
|
235
|
+
/**
 * Aggregates per-page native-text capability into a document-level value.
 *
 * - `available`: every page reports `available`.
 * - `partial`: at least one page reports `available` or `partial`.
 * - `unavailable`: otherwise, including a document with no pages.
 *
 * @param {object[]} pages - Page inspection records.
 * @returns {{ nativeTextExtraction: string, pageRendering: string, ocr: string }}
 */
function summarizeDocumentCapabilities(pages) {
    const levels = new Set(pages.map((page) => page.capabilities.nativeTextExtraction));
    const allPagesAvailable = levels.size === 1 && levels.has('available');
    const somePagesHaveText = levels.has('available') || levels.has('partial');
    let nativeTextExtraction;
    if (allPagesAvailable) {
        nativeTextExtraction = 'available';
    }
    else if (somePagesHaveText) {
        nativeTextExtraction = 'partial';
    }
    else {
        nativeTextExtraction = 'unavailable';
    }
    return {
        nativeTextExtraction,
        pageRendering: 'available',
        ocr: 'unavailable',
    };
}
|
|
253
|
+
/**
 * Produces document-level degradation notes and an overall degradation level.
 *
 * @param {object[]} pages - Page inspection records.
 * @param {string[]} warnings - Warnings gathered so far for the document.
 * @param {boolean} isEncrypted - Whether encryption markers were detected.
 * @returns {{ level: string, notes: string[] }}
 */
function summarizeDocumentDegradation(pages, warnings, isEncrypted) {
    const countPagesClassifiedAs = (classification) => pages.filter((page) => page.classification === classification).length;
    const notes = [];
    if (isEncrypted) {
        notes.push('Encrypted content is not supported by this lightweight PDF inspector.');
    }
    const imageOnlyCount = countPagesClassifiedAs('image-only');
    if (imageOnlyCount > 0) {
        notes.push(`${imageOnlyCount} image-only page(s) should be rerouted through raster/OCR fallback because they do not expose usable native text.`);
    }
    const unreadableCount = countPagesClassifiedAs('text-unreadable');
    if (unreadableCount > 0) {
        notes.push(`${unreadableCount} page(s) have unreadable or rejected native text and should use raster/OCR fallback.`);
    }
    const hasUnsupportedFilter = warnings.some((warning) => warning.includes('Unsupported filter pipeline'));
    if (hasUnsupportedFilter) {
        notes.push('One or more content streams use filters that this MVP parser does not decode.');
    }
    const uniqueNotes = uniqueStrings(notes);
    return {
        level: inferDocumentDegradationLevel(pages, uniqueNotes),
        notes: uniqueNotes,
    };
}
|
|
275
|
+
/**
 * Maps document notes and pages to a degradation level.
 *
 * @param {object[]} pages - Page inspection records.
 * @param {string[]} notes - Document-level degradation notes.
 * @returns {'none' | 'partial' | 'full'} `none` without notes; otherwise
 *   `partial` when at least one page kept non-empty native text, else `full`.
 */
function inferDocumentDegradationLevel(pages, notes) {
    if (notes.length === 0) {
        return 'none';
    }
    if (pages.length === 0) {
        return 'full';
    }
    let pagesWithText = 0;
    for (const page of pages) {
        if (page.normalizedArtifact.nativeText?.length) {
            pagesWithText += 1;
        }
    }
    return pagesWithText > 0 ? 'partial' : 'full';
}
|
|
283
|
+
/**
 * Rates how well native text can be extracted from a page.
 *
 * @param {{ normalizedText: string, hasTextOperators: boolean,
 *   contentStreamCount: number, decodedContentStreamCount: number }} input
 * @returns {'available' | 'partial' | 'unavailable'} `available` when text
 *   exists and every content stream was decoded; `partial` when there is
 *   text, text operators, or at least one decoded stream; else `unavailable`.
 */
function inferPageTextCapability(input) {
    const hasText = input.normalizedText.length > 0;
    const allStreamsDecoded = input.decodedContentStreamCount === input.contentStreamCount;
    if (hasText && allStreamsDecoded) {
        return 'available';
    }
    const hasAnySignal = hasText || input.hasTextOperators || input.decodedContentStreamCount > 0;
    return hasAnySignal ? 'partial' : 'unavailable';
}
|
|
292
|
+
/**
 * Collects human-readable notes describing why a page's native text is
 * missing, degraded or incomplete.
 *
 * NOTE(review): the `unknown` + zero-content-stream rule may never fire for
 * classifications produced by classifyPage(), which reports `empty` when a
 * page has no content streams — confirm the intended classification.
 *
 * @param {object} input - Classification, text findings, stream counts and warnings for the page.
 * @returns {string[]} Notes passed through uniqueStrings().
 */
function buildPageDegradationNotes(input) {
    const undecodedStreamCount = input.contentStreamCount - input.decodedContentStreamCount;
    // Each rule pairs a condition with the note it emits, in output order.
    const rules = [
        [
            input.classification === 'image-only' && input.hasRasterImages,
            'No usable native text layer was extracted from this page; raster/OCR fallback should be used.',
        ],
        [
            input.classification === 'text-unreadable' && input.hasTextOperators,
            'Text drawing operators were detected, but the lightweight parser could not decode the page text reliably.',
        ],
        [
            input.normalizedText.length > 0 && !input.textQuality.accepted,
            'The native PDF text layer looked degraded or garbled and should be rerouted through OCR or page rendering fallback.',
        ],
        [
            input.contentStreamCount > input.decodedContentStreamCount,
            `${undecodedStreamCount} content stream(s) could not be decoded by the lightweight parser.`,
        ],
        [
            input.classification === 'unknown' && input.contentStreamCount === 0,
            'The page does not expose a standard content stream that this MVP parser can inspect.',
        ],
    ];
    const notes = rules.filter(([applies]) => applies).map(([, note]) => note);
    // One note is pushed per matching warning; uniqueStrings() handles repeats.
    input.warnings
        .filter((warning) => warning.includes('Unsupported filter pipeline'))
        .forEach(() => {
            notes.push('The page uses an unsupported content-stream filter pipeline.');
        });
    return uniqueStrings(notes);
}
|
|
316
|
+
/**
 * Maps page degradation notes to a level.
 *
 * @param {string[]} notes - Degradation notes for the page.
 * @param {boolean} hasExtractedText - Whether usable text was extracted.
 * @returns {'none' | 'partial' | 'full'} `none` without notes; otherwise
 *   `partial` if text was extracted, else `full`.
 */
function inferDegradationLevel(notes, hasExtractedText) {
    if (notes.length !== 0) {
        return hasExtractedText ? 'partial' : 'full';
    }
    return 'none';
}
|
|
321
|
+
/**
 * Classifies a page from its text and graphics findings.
 *
 * Precedence: accepted text (`mixed` when images or vector graphics are also
 * present, otherwise `digital-text`), then `image-only`, `graphics-only`,
 * `text-unreadable`. A page with content streams of which none decoded is
 * `unknown`; everything else is `empty`.
 *
 * @param {object} input - Text, quality, operator flags and stream counts.
 * @returns {string} The page classification.
 */
function classifyPage(input) {
    const { hasRasterImages, hasVectorGraphics } = input;
    if (input.normalizedText.length > 0 && input.textQuality.accepted) {
        return hasRasterImages || hasVectorGraphics ? 'mixed' : 'digital-text';
    }
    if (hasRasterImages) {
        return 'image-only';
    }
    if (hasVectorGraphics) {
        return 'graphics-only';
    }
    if (input.hasTextOperators) {
        return 'text-unreadable';
    }
    const streamsExistButNoneDecoded = input.contentStreamCount !== 0 && input.decodedContentStreamCount === 0;
    return streamsExistButNoneDecoded ? 'unknown' : 'empty';
}
|
|
338
|
+
/**
 * Gathers page attributes (geometry, rotation, fonts, image XObjects),
 * following inheritance through parent objects where the lookup helpers do.
 *
 * @param {object} pageObject - Parsed page object.
 * @param {Map<string, object>} objects - Parsed PDF objects keyed by reference key.
 * @returns {object} Page assembly; width and height are `null` when no
 *   four-number MediaBox was found, rotation defaults to 0.
 */
function assemblePage(pageObject, objects) {
    const mediaBox = resolveInheritedNumberArray(pageObject, 'MediaBox', objects);
    let width = null;
    let height = null;
    if (mediaBox && mediaBox.length >= 4) {
        // Box values are read as [x0, y0, x1, y1].
        width = mediaBox[2] - mediaBox[0];
        height = mediaBox[3] - mediaBox[1];
    }
    const rotation = resolveInheritedNumber(pageObject, 'Rotate', objects) ?? 0;
    const resourceEntries = resolveInheritedDictionary(pageObject, 'Resources', objects);
    let fontNames = [];
    let imageXObjectNames = new Set();
    let imageXObjects = [];
    if (resourceEntries) {
        fontNames = extractFontNames(resourceEntries, objects);
        imageXObjectNames = extractRasterImageXObjectNames(resourceEntries, objects);
        imageXObjects = extractRasterImageXObjects(resourceEntries, objects);
    }
    return {
        pageObject,
        width,
        height,
        rotation,
        fontNames,
        imageXObjectNames,
        imageXObjects,
    };
}
|
|
356
|
+
/**
 * Lists the fonts declared in a resource dictionary.
 *
 * @param {Map<string, *>} resourceEntries - Entries of the page's Resources dictionary.
 * @param {Map<string, object>} objects - Parsed PDF objects keyed by reference key.
 * @returns {string[]} Each font's BaseFont name, or its resource alias when
 *   no BaseFont name can be parsed; passed through uniqueStrings().
 */
function extractFontNames(resourceEntries, objects) {
    const fontEntries = resolveDictionaryFromRaw(resourceEntries.get('Font'), objects);
    if (!fontEntries) {
        return [];
    }
    const names = Array.from(fontEntries.entries(), ([alias, rawValue]) => {
        const fontDictionary = resolveDictionaryFromRaw(rawValue, objects);
        return parsePdfName(fontDictionary?.get('BaseFont')) ?? alias;
    });
    return uniqueStrings(names);
}
|
|
369
|
+
/**
 * Finds the resource aliases of XObjects whose Subtype is `Image`.
 *
 * @param {Map<string, *>} resourceEntries - Entries of the page's Resources dictionary.
 * @param {Map<string, object>} objects - Parsed PDF objects keyed by reference key.
 * @returns {Set<string>} Aliases of image XObjects (empty when there is no XObject dictionary).
 */
function extractRasterImageXObjectNames(resourceEntries, objects) {
    const xObjectEntries = resolveDictionaryFromRaw(resourceEntries.get('XObject'), objects);
    if (!xObjectEntries) {
        return new Set();
    }
    const isImage = (rawValue) => {
        const xObjectDictionary = resolveDictionaryFromRaw(rawValue, objects);
        return parsePdfName(xObjectDictionary?.get('Subtype')) === 'Image';
    };
    const imageAliases = [...xObjectEntries.entries()]
        .filter(([, rawValue]) => isImage(rawValue))
        .map(([alias]) => alias);
    return new Set(imageAliases);
}
|
|
383
|
+
/**
 * Describes every indirectly referenced image XObject in a resource dictionary.
 *
 * Entries that are not references, that reference a missing object, or whose
 * Subtype is not `Image` are skipped.
 *
 * @param {Map<string, *>} resourceEntries - Entries of the page's Resources dictionary.
 * @param {Map<string, object>} objects - Parsed PDF objects keyed by reference key.
 * @returns {object[]} Image descriptors (alias, ref, object, dimensions,
 *   bits per component, colour space and filter names).
 */
function extractRasterImageXObjects(resourceEntries, objects) {
    const xObjectEntries = resolveDictionaryFromRaw(resourceEntries.get('XObject'), objects);
    if (!xObjectEntries) {
        return [];
    }
    // Reads a numeric entry under its full key, falling back to the abbreviated key.
    const readNumber = (dictionary, fullKey, shortKey) => parseDirectNumber(dictionary?.get(fullKey)) ?? parseDirectNumber(dictionary?.get(shortKey));
    const images = [];
    for (const [alias, rawValue] of xObjectEntries.entries()) {
        const ref = parsePdfRef(rawValue);
        const object = ref ? objects.get(pdfRefKey(ref)) : undefined;
        if (!ref || !object) {
            continue;
        }
        const imageDictionary = resolveDictionaryFromRaw(rawValue, objects);
        if (parsePdfName(imageDictionary?.get('Subtype')) !== 'Image') {
            continue;
        }
        const rawColorSpace = imageDictionary?.get('ColorSpace');
        images.push({
            alias,
            ref,
            object,
            width: readNumber(imageDictionary, 'Width', 'W'),
            height: readNumber(imageDictionary, 'Height', 'H'),
            bitsPerComponent: readNumber(imageDictionary, 'BitsPerComponent', 'BPC'),
            colorSpace: parsePdfName(rawColorSpace) ?? parseColorSpaceName(rawColorSpace),
            filters: parseFilterNames(imageDictionary?.get('Filter')),
        });
    }
    return images;
}
|
|
413
|
+
/**
 * Picks and decodes the largest usable image XObject of a page.
 *
 * When the content streams paint any XObjects, only painted images are
 * candidates; otherwise every declared image is. Candidates are tried from
 * the largest pixel area down, and the first one that decodes is returned.
 *
 * @param {object} pageObject - Parsed page object.
 * @param {number} pageNumber - 1-based page number.
 * @param {number} totalPages - Number of pages in the document.
 * @param {Buffer} buffer - Raw PDF bytes.
 * @param {Map<string, object>} objects - Parsed PDF objects keyed by reference key.
 * @returns {Promise<object | null>} An `xobject-image` record, or `null` if no image decoded.
 */
async function extractPrimaryPageImage(pageObject, pageNumber, totalPages, buffer, objects) {
    const { imageXObjects } = assemblePage(pageObject, objects);
    if (imageXObjects.length === 0) {
        return null;
    }
    const paintedAliases = collectPaintedXObjectAliases(pageObject, buffer, objects);
    const candidates = paintedAliases.size > 0
        ? imageXObjects.filter((image) => paintedAliases.has(image.alias))
        : [...imageXObjects];
    // Unknown dimensions count as zero area, so they sort last.
    const pixelArea = (image) => (image.width ?? 0) * (image.height ?? 0);
    candidates.sort((left, right) => pixelArea(right) - pixelArea(left));
    for (const candidate of candidates) {
        const decoded = await decodeRasterImageObject(candidate, buffer, objects);
        if (decoded) {
            return {
                pageNumber,
                totalPages,
                objectRef: pdfRefLabel(candidate.ref),
                mimeType: decoded.mimeType,
                width: candidate.width,
                height: candidate.height,
                byteLength: decoded.data.length,
                source: 'xobject-image',
                warnings: decoded.warnings,
                data: decoded.data,
            };
        }
    }
    return null;
}
|
|
447
|
+
/**
 * Collects the names of XObjects painted by a page's content streams.
 *
 * Streams that cannot be resolved or decoded are ignored silently.
 *
 * @param {object} pageObject - Parsed page object.
 * @param {Buffer} buffer - Raw PDF bytes.
 * @param {Map<string, object>} objects - Parsed PDF objects keyed by reference key.
 * @returns {Set<string>} Painted XObject names.
 */
function collectPaintedXObjectAliases(pageObject, buffer, objects) {
    const aliases = new Set();
    const contentStreamRefs = parseSingleOrArrayRefs(pageObject.dictionaryEntries.get('Contents'));
    for (const ref of contentStreamRefs) {
        const streamObject = objects.get(pdfRefKey(ref));
        if (!streamObject?.streamSpec) {
            continue;
        }
        const decoded = decodeObjectStream(streamObject, objects, buffer);
        if (!decoded.warning) {
            const { paintedXObjectNames } = analyzeContentStream(decoded.content);
            for (const name of paintedXObjectNames) {
                aliases.add(name);
            }
        }
    }
    return aliases;
}
|
|
466
|
+
/**
 * Converts an image XObject into a preprocessed image buffer.
 *
 * Supported encodings:
 * - a single `DCTDecode` / `JPXDecode` filter: the stream bytes are handed to
 *   the preprocessor as JPEG / JPEG 2000;
 * - no filter, or a single `FlateDecode` filter without a predictor: the
 *   8-bit samples are encoded to PNG and then preprocessed.
 *
 * Everything else yields `null`. Warnings gathered on a failing path are
 * dropped with it, because only a successful result carries `warnings`.
 *
 * NOTE(review): DeviceCMYK samples are forwarded as 4 channels; confirm that
 * encodeRawRasterToPng does not interpret 4-channel input as RGBA.
 *
 * @param {object} image - Descriptor produced by extractRasterImageXObjects().
 * @param {Buffer} buffer - Raw PDF bytes.
 * @param {Map<string, object>} objects - Parsed PDF objects keyed by reference key.
 * @returns {Promise<{ mimeType: string, data: Buffer, warnings: string[] } | null>}
 */
async function decodeRasterImageObject(image, buffer, objects) {
    if (!image.object.streamSpec) {
        return null;
    }
    const length = resolveStreamLength(image.object.streamSpec.lengthRaw, objects);
    const rawStream = sliceObjectStreamBytes(image.object.streamSpec, length, buffer);
    const filters = image.filters;
    const warnings = [];
    const soleFilter = filters.length === 1 ? filters[0] : null;
    if (soleFilter === 'DCTDecode' || soleFilter === 'JPXDecode') {
        // Already a self-contained image format; no raster reconstruction needed.
        const sourceMimeType = soleFilter === 'DCTDecode' ? 'image/jpeg' : 'image/jp2';
        const preprocessed = await preprocessVisionImageBuffer(rawStream, sourceMimeType);
        warnings.push(...preprocessed.warnings);
        return {
            mimeType: preprocessed.mimeType,
            data: preprocessed.buffer,
            warnings,
        };
    }
    if (filters.length > 1) {
        warnings.push(`Unsupported image filter pipeline on ${pdfRefLabel(image.ref)}: ${filters.join(', ')}.`);
        return null;
    }
    if (soleFilter === 'FlateDecode') {
        // A Predictor above 1 means the inflated bytes are TIFF/PNG-predicted
        // rows, not plain samples. They are longer than width*height*channels,
        // so the length check below would pass and the PNG would come out
        // corrupted. Reject them instead of emitting a corrupted image.
        const decodeParms = resolveDictionaryFromRaw(image.object.dictionaryEntries?.get('DecodeParms'), objects);
        const predictor = parseDirectNumber(decodeParms?.get('Predictor')) ?? 1;
        if (predictor > 1) {
            warnings.push(`Unsupported predictor ${predictor} on FlateDecode image ${pdfRefLabel(image.ref)}.`);
            return null;
        }
    }
    let rawRaster = null;
    if (soleFilter === 'FlateDecode') {
        rawRaster = tryInflateRaster(rawStream, image.ref, warnings);
    }
    else if (filters.length === 0) {
        rawRaster = rawStream;
    }
    if (!rawRaster) {
        // Inflate failed, or the single filter is one this parser does not handle.
        return null;
    }
    const channels = inferColorChannels(image.colorSpace);
    if (!image.width || !image.height || !channels || image.bitsPerComponent !== 8) {
        warnings.push(`Unsupported raw image encoding on ${pdfRefLabel(image.ref)}. Width/height/channels/bits-per-component were insufficient for PNG conversion.`);
        return null;
    }
    const expectedBytes = image.width * image.height * channels;
    if (rawRaster.length < expectedBytes) {
        warnings.push(`Raw image data on ${pdfRefLabel(image.ref)} was shorter than expected (${rawRaster.length}/${expectedBytes} bytes).`);
        return null;
    }
    try {
        // Trailing bytes beyond the expected sample count are ignored.
        const pngBuffer = await encodeRawRasterToPng(rawRaster.subarray(0, expectedBytes), {
            width: image.width,
            height: image.height,
            channels,
        });
        const preprocessed = await preprocessVisionImageBuffer(pngBuffer, 'image/png');
        warnings.push(...preprocessed.warnings);
        return {
            mimeType: preprocessed.mimeType,
            data: preprocessed.buffer,
            warnings,
        };
    }
    catch (error) {
        warnings.push(`Failed to convert raw image data on ${pdfRefLabel(image.ref)} to PNG: ${error instanceof Error ? error.message : String(error)}`);
        return null;
    }
}
|
|
533
|
+
/**
 * Inflates zlib-compressed image data, recording a warning on failure.
 *
 * @param {Buffer} rawStream - Compressed stream bytes.
 * @param {object} ref - Reference of the image object, used in the warning text.
 * @param {string[]} warnings - Receives a message when inflation fails.
 * @returns {Buffer | null} The inflated bytes, or `null` if inflation threw.
 */
function tryInflateRaster(rawStream, ref, warnings) {
    let inflated = null;
    try {
        inflated = inflateSync(rawStream);
    }
    catch {
        warnings.push(`Failed to inflate raster image data for ${pdfRefLabel(ref)}.`);
    }
    return inflated;
}
|
|
542
|
+
/**
 * Maps a device color space name to its number of color components.
 *
 * @param colorSpace Color space name (e.g. 'DeviceRGB').
 * @returns 1 for DeviceGray, 3 for DeviceRGB, 4 for DeviceCMYK, otherwise null.
 */
function inferColorChannels(colorSpace) {
    if (colorSpace === 'DeviceGray') {
        return 1;
    }
    if (colorSpace === 'DeviceRGB') {
        return 3;
    }
    if (colorSpace === 'DeviceCMYK') {
        return 4;
    }
    return null;
}
|
|
554
|
+
/**
 * Looks up `key` on the page (or an ancestor reached via /Parent) and
 * resolves the raw value to a dictionary-entry map.
 *
 * @returns Whatever `resolveDictionaryFromRaw` yields for the inherited value.
 */
function resolveInheritedDictionary(pageObject, key, objects) {
    return resolveDictionaryFromRaw(resolveInheritedRawValue(pageObject, key, objects), objects);
}
|
|
558
|
+
/**
 * Looks up `key` on the page (or an ancestor) and parses it as a number array.
 *
 * @returns The parsed array, or null when the key is missing/empty or the
 *          value is not an array.
 */
function resolveInheritedNumberArray(pageObject, key, objects) {
    const inherited = resolveInheritedRawValue(pageObject, key, objects);
    return inherited ? parseNumberArray(inherited) : null;
}
|
|
564
|
+
/**
 * Looks up `key` on the page (or an ancestor) and parses it as a direct number.
 *
 * @returns The numeric value, or null when the key is missing/empty or the
 *          value is not a plain number.
 */
function resolveInheritedNumber(pageObject, key, objects) {
    const inherited = resolveInheritedRawValue(pageObject, key, objects);
    return inherited ? parseDirectNumber(inherited) : null;
}
|
|
570
|
+
/**
 * Finds the raw value of `key` on a page object, walking up the /Parent chain
 * until an object defines it.
 *
 * A malformed PDF can contain a cyclic /Parent chain; already-visited objects
 * are tracked so the walk terminates instead of looping forever.
 *
 * @param pageObject Starting object (must expose `dictionaryEntries`), or a falsy value.
 * @param key Dictionary key to look up, without the leading slash.
 * @param objects Map of parsed objects keyed by `pdfRefKey`.
 * @returns The first raw value found, or undefined when no ancestor defines the key.
 */
function resolveInheritedRawValue(pageObject, key, objects) {
    const visited = new Set();
    let current = pageObject;
    while (current && !visited.has(current)) {
        visited.add(current);
        const rawValue = current.dictionaryEntries.get(key);
        if (rawValue !== undefined) {
            return rawValue;
        }
        const parentRef = parsePdfRef(current.dictionaryEntries.get('Parent'));
        current = parentRef ? objects.get(pdfRefKey(parentRef)) : undefined;
    }
    return undefined;
}
|
|
581
|
+
/**
 * Extracts and decodes the stream attached to a parsed PDF object.
 *
 * Only FlateDecode filters are applied; any other filter name aborts with a
 * warning. Filters are processed in declaration order.
 *
 * @param object Parsed object exposing `streamSpec`, `dictionaryEntries` and `ref`.
 * @param objects Map of parsed objects, used to resolve an indirect /Length.
 * @param buffer Raw file bytes the stream offsets refer to.
 * @returns `{ filters, content, warning }` where `content` is the decoded
 *          stream as a latin1 string ('' on failure) and `warning` is null on success.
 */
function decodeObjectStream(object, objects, buffer) {
    const { streamSpec } = object;
    if (!streamSpec) {
        return {
            filters: [],
            content: '',
            warning: `Object ${pdfRefLabel(object.ref)} does not expose a stream.`,
        };
    }
    const filters = parseFilterNames(object.dictionaryEntries.get('Filter'));
    const failWith = (warning) => ({ filters, content: '', warning });
    const declaredLength = resolveStreamLength(streamSpec.lengthRaw, objects);
    let payload = sliceObjectStreamBytes(streamSpec, declaredLength, buffer);
    for (const filterName of filters) {
        if (filterName !== 'FlateDecode') {
            return failWith(`Unsupported filter pipeline on ${pdfRefLabel(object.ref)}: ${filters.join(', ')}.`);
        }
        try {
            payload = inflateSync(payload);
        }
        catch {
            return failWith(`Failed to inflate FlateDecode content for ${pdfRefLabel(object.ref)}.`);
        }
    }
    return {
        filters,
        content: payload.toString('latin1'),
        warning: null,
    };
}
|
|
620
|
+
/**
 * Returns the raw bytes of a stream.
 *
 * When a non-negative length is known, exactly that many bytes (clipped to the
 * buffer end) are taken from `startOffset`. Otherwise the bytes up to
 * `fallbackEndOffset` are used, with trailing PDF whitespace removed.
 *
 * @param streamSpec `{ startOffset, fallbackEndOffset }` offsets into `buffer`.
 * @param resolvedLength Declared stream length, or null when unknown.
 * @param buffer Raw file bytes.
 */
function sliceObjectStreamBytes(streamSpec, resolvedLength, buffer) {
    const { startOffset } = streamSpec;
    const hasUsableLength = resolvedLength !== null && resolvedLength >= 0;
    if (hasUsableLength) {
        return buffer.subarray(startOffset, Math.min(startOffset + resolvedLength, buffer.length));
    }
    const candidate = buffer.subarray(startOffset, Math.min(streamSpec.fallbackEndOffset, buffer.length));
    // Drop the end-of-line bytes that sit between the data and `endstream`.
    let keep = candidate.length;
    while (keep > 0 && isPdfWhitespace(candidate[keep - 1])) {
        keep -= 1;
    }
    return candidate.subarray(0, keep);
}
|
|
631
|
+
/**
 * Resolves a stream's /Length entry to a number.
 *
 * The raw value may be a direct number or an indirect reference to an object
 * whose body is a number.
 *
 * @param lengthRaw Raw /Length value, possibly undefined.
 * @param objects Map of parsed objects keyed by `pdfRefKey`.
 * @returns The numeric length, or null when it cannot be determined.
 */
function resolveStreamLength(lengthRaw, objects) {
    if (!lengthRaw) {
        return null;
    }
    const inlineLength = parseDirectNumber(lengthRaw);
    if (inlineLength !== null) {
        return inlineLength;
    }
    const lengthRef = parsePdfRef(lengthRaw);
    const target = lengthRef ? objects.get(pdfRefKey(lengthRef)) : undefined;
    return target ? parseDirectNumber(target.body.trim()) : null;
}
|
|
645
|
+
/**
 * Scans a decoded page content stream and summarises what it draws.
 *
 * Operands are buffered until an operator token arrives; the operator is then
 * interpreted and the operand buffer is cleared.
 *
 * @param content Decoded content stream text.
 * @returns `{ extractedText, normalizedText, hasTextOperators,
 *          hasVectorGraphics, hasInlineImages, paintedXObjectNames }`.
 */
function analyzeContentStream(content) {
    const tokens = tokenizeContentStream(content);
    const pendingOperands = [];
    const textParts = [];
    const paintedXObjectNames = new Set();
    const hasInlineImages = /\bBI\b[\s\S]*?\bID\b/.test(content);
    let hasTextOperators = false;
    let hasVectorGraphics = false;
    let nextTextJoinMode = 'space';
    // Shared bookkeeping for every text-showing operator.
    const showText = (text, joinMode) => {
        hasTextOperators = true;
        appendTextPart(textParts, text, joinMode);
        nextTextJoinMode = 'space';
    };
    for (const token of tokens) {
        if (token.kind !== 'operator') {
            pendingOperands.push(token);
            continue;
        }
        const lastOperand = pendingOperands[pendingOperands.length - 1];
        if (VECTOR_GRAPHICS_OPERATORS.has(token.raw)) {
            hasVectorGraphics = true;
        }
        switch (token.raw) {
            case 'Do': {
                const nameToken = findLastNameOperand(pendingOperands);
                if (nameToken) {
                    paintedXObjectNames.add(nameToken.raw.slice(1));
                }
                break;
            }
            case 'BT':
                // A new text object after earlier text starts on a new line.
                if (textParts.length > 0) {
                    nextTextJoinMode = 'newline';
                }
                break;
            case 'Tj':
                showText(decodeTextOperand(lastOperand), nextTextJoinMode);
                break;
            case 'TJ':
                showText(decodeTextArrayOperand(lastOperand), nextTextJoinMode);
                break;
            case '\'':
            case '"':
                // Both operators are always joined with a newline here.
                showText(decodeTextOperand(lastOperand), 'newline');
                break;
            default:
                break;
        }
        pendingOperands.length = 0;
    }
    const extractedText = textParts.join('').trim();
    return {
        extractedText,
        normalizedText: normalizeExtractedText(extractedText),
        hasTextOperators,
        hasVectorGraphics,
        hasInlineImages,
        paintedXObjectNames: [...paintedXObjectNames],
    };
}
|
|
705
|
+
/**
 * Appends a normalised text fragment to `parts`, preceded by a separator when
 * `parts` already has content. Fragments that normalise to '' are ignored.
 *
 * @param parts Mutable list of text pieces and separators.
 * @param value Raw fragment text.
 * @param joinMode 'newline' to separate with '\n'; anything else uses a space.
 */
function appendTextPart(parts, value, joinMode) {
    const fragment = normalizeFragment(value);
    if (fragment.length === 0) {
        return;
    }
    if (parts.length === 0) {
        parts.push(fragment);
        return;
    }
    const separator = joinMode === 'newline' ? '\n' : ' ';
    parts.push(separator, fragment);
}
|
|
714
|
+
/**
 * Cleans a single text fragment: removes NUL characters, turns runs of other
 * control characters (except tab, LF, CR) into one space, collapses runs of
 * spaces/tabs, and trims the result.
 */
function normalizeFragment(value) {
    const withoutNuls = value.replace(/\u0000/g, '');
    const withoutControls = withoutNuls.replace(/[\u0001-\u0008\u000B\u000C\u000E-\u001F]+/g, ' ');
    const collapsed = withoutControls.replace(/[ \t]+/g, ' ');
    return collapsed.trim();
}
|
|
721
|
+
/**
 * Normalises extracted page text: CR becomes LF, trailing spaces/tabs before a
 * newline are removed, three or more newlines collapse to two, runs of two or
 * more spaces/tabs collapse to one space, and the result is trimmed.
 */
function normalizeExtractedText(value) {
    const unixNewlines = value.replace(/\r/g, '\n');
    const withoutTrailingBlanks = unixNewlines.replace(/[ \t]+\n/g, '\n');
    const limitedBlankLines = withoutTrailingBlanks.replace(/\n{3,}/g, '\n\n');
    const singleSpaced = limitedBlankLines.replace(/[ \t]{2,}/g, ' ');
    return singleSpaced.trim();
}
|
|
729
|
+
/**
 * Strips leading and trailing characters that are not ASCII letters or digits
 * from a token. Returns '' when the token contains no such characters.
 */
function normalizeTextQualityToken(token) {
    const withoutLeading = token.replace(/^[^A-Za-z0-9]+/, '');
    return withoutLeading.replace(/[^A-Za-z0-9]+$/, '');
}
|
|
734
|
+
/**
 * Scores how plausible a token looks as real text, from 0 (nothing usable)
 * to 1 (ordinary word-like token).
 *
 * The token is first stripped of surrounding punctuation; numeric/measurement
 * shapes score 0.85, digit-only compound tokens 0.75, long vowel-less letter
 * runs 0.2, other word-like tokens 1.
 */
function scoreTextTokenShape(token) {
    const core = normalizeTextQualityToken(token);
    if (!core) {
        return 0;
    }
    const looksNumeric = /^\d+(?:\.\d+)?(?:[-/]\d+(?:\.\d+)?)*(?:m|mm|cm|kpa|mpa|%)?$/i.test(core);
    if (looksNumeric) {
        return 0.85;
    }
    const looksWordLike = /^[A-Za-z0-9]+(?:[-/.][A-Za-z0-9]+)*$/.test(core);
    if (!looksWordLike) {
        if (/^[().,:;/%-]+$/.test(core)) {
            return 0.15;
        }
        return /[A-Za-z0-9]/.test(core) ? 0.45 : 0.05;
    }
    const letters = core.replace(/[^A-Za-z]/g, '');
    if (letters.length === 0) {
        return 0.75;
    }
    // Five or more letters without a vowel (and not a short all-caps
    // abbreviation) is treated as likely garbage.
    const isVowellessRun = letters.length >= 5
        && !/[AEIOUYaeiouy]/.test(letters)
        && !/^[A-Z]{2,5}$/.test(letters);
    return isVowellessRun ? 0.2 : 1;
}
|
|
756
|
+
/**
 * Flags tokens that look garbled: empty after stripping punctuation,
 * containing U+FFFD, made up of at least half noise symbols (when 3+ chars
 * long), or a long vowel-less letter run that is not a short all-caps
 * abbreviation.
 */
function isSuspiciousTextQualityToken(token) {
    const core = normalizeTextQualityToken(token);
    if (!core || core.includes('\uFFFD')) {
        return true;
    }
    const chars = [...core];
    let symbolCount = 0;
    for (const char of chars) {
        if (/[@#$%^&*_~`|<>{}\[\]\\]/.test(char)) {
            symbolCount += 1;
        }
    }
    if (chars.length >= 3 && symbolCount >= Math.ceil(chars.length / 2)) {
        return true;
    }
    const letters = core.replace(/[^A-Za-z]/g, '');
    if (letters.length < 5) {
        return false;
    }
    if (/[AEIOUYaeiouy]/.test(letters)) {
        return false;
    }
    return !/^[A-Z]{2,5}$/.test(letters);
}
|
|
774
|
+
/**
 * Judges whether text extracted from a PDF's native text layer is trustworthy.
 *
 * Computes character-level ratios (printable, replacement, symbol noise,
 * half/full-width range noise) and token-level ratios (suspicious tokens,
 * dictionary hits, average shape score), combines them into a 0..1 score, and
 * applies acceptance thresholds.
 *
 * @param value Extracted text.
 * @returns `{ accepted, score, printableRatio, replacementRatio,
 *          symbolNoiseRatio, suspiciousTokenRatio, dictionaryCoverageRatio,
 *          averageTokenShapeScore, reasons }`; `reasons` is empty when accepted.
 */
export function assessPdfTextQuality(value) {
    const normalized = normalizeExtractedText(value);
    if (!normalized) {
        return {
            accepted: false,
            score: 0,
            printableRatio: 0,
            replacementRatio: 0,
            symbolNoiseRatio: 0,
            suspiciousTokenRatio: 0,
            dictionaryCoverageRatio: 0,
            averageTokenShapeScore: 0,
            reasons: ['No native text was extracted from the PDF page.'],
        };
    }
    const safeRatio = (count, total) => (total === 0 ? 0 : count / total);
    const countWhere = (items, predicate) => items.filter(predicate).length;

    // Character-level statistics over non-whitespace characters.
    const visibleChars = [...normalized].filter((char) => !/\s/.test(char));
    const visibleTotal = visibleChars.length;
    const printableRatio = safeRatio(countWhere(visibleChars, (char) => !/[\u0000-\u001F\u007F]/.test(char)), visibleTotal);
    const replacementRatio = safeRatio(countWhere(visibleChars, (char) => char === '\uFFFD'), visibleTotal);
    const symbolNoiseRatio = safeRatio(countWhere(visibleChars, (char) => /[@#$%^&*_~`|<>{}\[\]\\]/.test(char)), visibleTotal);
    const mojibakeNoiseRatio = safeRatio(countWhere(visibleChars, (char) => /[\uFF61-\uFFEF]/.test(char)), visibleTotal);

    // Token-level statistics.
    const tokens = normalized.split(/\s+/).filter(Boolean);
    const suspiciousTokenRatio = safeRatio(countWhere(tokens, isSuspiciousTextQualityToken), tokens.length);
    const normalizedTokens = tokens.map(normalizeTextQualityToken).filter(Boolean);
    const tokenTotal = normalizedTokens.length;
    const dictionaryCoverageRatio = safeRatio(countWhere(normalizedTokens, (token) => TEXT_QUALITY_DICTIONARY.has(token.toLowerCase())), tokenTotal);
    let shapeScoreSum = 0;
    for (const token of normalizedTokens) {
        shapeScoreSum += scoreTextTokenShape(token);
    }
    const averageTokenShapeScore = safeRatio(shapeScoreSum, tokenTotal);

    const lacksRecognizableWordsOnLongPage = tokenTotal >= 12
        && dictionaryCoverageRatio === 0
        && (symbolNoiseRatio >= 0.12 || suspiciousTokenRatio >= 0.18 || averageTokenShapeScore < 0.78);
    const weightedSum = (printableRatio * 0.22)
        + ((1 - replacementRatio) * 0.18)
        + ((1 - symbolNoiseRatio) * 0.18)
        + ((1 - suspiciousTokenRatio) * 0.22)
        + (averageTokenShapeScore * 0.15)
        + (Math.min(dictionaryCoverageRatio, 0.25) * 0.2);
    const score = Math.max(0, Math.min(1, weightedSum));

    const mostlyGarbledTokens = suspiciousTokenRatio >= 0.6 && averageTokenShapeScore < 0.55;
    const unrecognizableWords = dictionaryCoverageRatio < 0.05 && averageTokenShapeScore < 0.5 && tokenTotal >= 4;
    const accepted = printableRatio >= 0.85
        && replacementRatio < 0.08
        && symbolNoiseRatio < 0.3
        && mojibakeNoiseRatio < 0.08
        && score >= 0.62
        && !mostlyGarbledTokens
        && !unrecognizableWords
        && !lacksRecognizableWordsOnLongPage;

    // Reason thresholds are separate from (and in places stricter than) the
    // acceptance thresholds above; they are only reported on rejection.
    const reasonChecks = [
        [printableRatio < 0.85, 'Printable character coverage was too low.'],
        [replacementRatio >= 0.02, 'Replacement characters were detected in the native text layer.'],
        [symbolNoiseRatio >= 0.18, 'Symbol-heavy noise was too high for trusted native text.'],
        [mojibakeNoiseRatio >= 0.08, 'Encoding noise was too high for trusted native text.'],
        [suspiciousTokenRatio >= 0.45, 'Too many tokens looked garbled or non-linguistic.'],
        [lacksRecognizableWordsOnLongPage, 'Recognizable word coverage was too low to trust this longer native text layer.'],
        [
            dictionaryCoverageRatio < 0.08 && averageTokenShapeScore < 0.55 && tokenTotal >= 4,
            'Recognizable word coverage was too low to trust the native text layer.',
        ],
    ];
    const reasons = reasonChecks
        .filter(([applies]) => applies)
        .map(([, message]) => message);
    if (!accepted && reasons.length === 0) {
        reasons.push('Native text fell below the PDF text-quality threshold.');
    }
    return {
        accepted,
        score,
        printableRatio,
        replacementRatio,
        symbolNoiseRatio,
        suspiciousTokenRatio,
        dictionaryCoverageRatio,
        averageTokenShapeScore,
        reasons: accepted ? [] : uniqueStrings(reasons),
    };
}
|
|
861
|
+
/**
 * Derives coarse layout hints from page text.
 *
 * @param text Page text.
 * @returns `{ headings, tablesDetected, figuresDetected }`: up to three
 *          heading candidates (keyword-led short lines plus the first line),
 *          whether at least two lines carry two or more numbers (or the text
 *          mentions both "depth" and "spt"), and whether figure-like keywords occur.
 */
function extractContentHints(text) {
    const lines = [];
    for (const rawLine of text.split(/\r?\n/)) {
        const line = rawLine.trim();
        if (line.length > 0) {
            lines.push(line);
        }
    }
    const headingCandidates = lines.filter((line) => line.length <= 80
        && /^(BH[-\s]?\d+|borehole|project|report|page|sheet|appendix|table|figure)/i.test(line));
    headingCandidates.push(lines[0] ?? '');
    const headings = uniqueStrings(headingCandidates).slice(0, 3);

    let numericLineCount = 0;
    for (const line of lines) {
        const hasNumber = /\b\d+(?:\.\d+)?\b/.test(line);
        if (hasNumber && /\b\d+(?:\.\d+)?\b.*\b\d+(?:\.\d+)?\b/.test(line)) {
            numericLineCount += 1;
        }
    }
    const mentionsDepthAndSpt = /\bdepth\b/i.test(text) && /\bspt\b/i.test(text);
    return {
        headings,
        tablesDetected: numericLineCount >= 2 || mentionsDepthAndSpt,
        figuresDetected: /\b(figure|fig\.|plate|photo|sketch|section)\b/i.test(text),
    };
}
|
|
880
|
+
/**
 * Returns the text carried by a content-stream operand token.
 *
 * String tokens yield their pre-decoded text, array tokens are decoded via
 * `decodeTextArrayOperand`, and anything else (including a missing token)
 * yields ''.
 */
function decodeTextOperand(token) {
    switch (token?.kind) {
        case 'literal-string':
        case 'hex-string':
            return token.decodedText ?? '';
        case 'array':
            return decodeTextArrayOperand(token);
        default:
            return '';
    }
}
|
|
891
|
+
/**
 * Decodes the string elements of an array operand and concatenates them.
 * Non-string elements (e.g. numbers) are skipped.
 *
 * @param token Token expected to have `kind === 'array'` and a bracketed `raw`.
 * @returns The concatenated text, or '' when the token is missing or not an array.
 */
function decodeTextArrayOperand(token) {
    if (token?.kind !== 'array') {
        return '';
    }
    const source = token.raw;
    // Stop before the closing bracket.
    const stop = source.length - 1;
    let text = '';
    let cursor = 1;
    while (cursor < stop) {
        cursor = skipPdfWhitespaceAndComments(source, cursor);
        if (cursor >= stop) {
            break;
        }
        const element = readPdfValueRaw(source, cursor);
        if (!element) {
            break;
        }
        const elementRaw = element.raw;
        if (elementRaw.startsWith('(')) {
            text += decodeLiteralString(elementRaw);
        }
        else if (elementRaw.startsWith('<') && !elementRaw.startsWith(DICTIONARY_START)) {
            text += decodeHexString(elementRaw);
        }
        cursor = element.nextIndex;
    }
    return text;
}
|
|
914
|
+
/**
 * Returns the last token whose kind is 'name', or undefined when there is none.
 */
function findLastNameOperand(tokens) {
    let position = tokens.length;
    while (position > 0) {
        position -= 1;
        const candidate = tokens[position];
        if (candidate.kind === 'name') {
            return candidate;
        }
    }
    return undefined;
}
|
|
922
|
+
/**
 * Splits a content stream into classified tokens.
 *
 * Each token is `{ kind, raw }` with kind one of 'literal-string',
 * 'hex-string', 'array', 'name', 'operator' or 'other'; string tokens also
 * carry `decodedText`. Tokenising stops at the first unreadable value.
 */
function tokenizeContentStream(content) {
    const tokens = [];
    const total = content.length;
    let cursor = 0;
    while (cursor < total) {
        cursor = skipPdfWhitespaceAndComments(content, cursor);
        if (cursor >= total) {
            break;
        }
        const value = readPdfValueRaw(content, cursor);
        if (!value) {
            break;
        }
        const { raw } = value;
        let token;
        if (raw.startsWith('(')) {
            token = { kind: 'literal-string', raw, decodedText: decodeLiteralString(raw) };
        }
        else if (raw.startsWith('<') && !raw.startsWith(DICTIONARY_START)) {
            token = { kind: 'hex-string', raw, decodedText: decodeHexString(raw) };
        }
        else if (raw.startsWith('[')) {
            token = { kind: 'array', raw };
        }
        else if (raw.startsWith('/')) {
            token = { kind: 'name', raw };
        }
        else {
            token = { kind: CONTENT_STREAM_OPERATORS.has(raw) ? 'operator' : 'other', raw };
        }
        tokens.push(token);
        cursor = value.nextIndex;
    }
    return tokens;
}
|
|
963
|
+
/**
 * Decodes a parenthesised PDF literal string.
 *
 * Handles backslash escapes (\n \r \t \b \f), one-to-three digit octal
 * codes, and backslash-newline line continuations; any other escaped
 * character stands for itself. The unescaped bytes are then passed to
 * `decodePdfByteString`.
 *
 * @param raw The literal including its surrounding parentheses.
 */
function decodeLiteralString(raw) {
    const namedEscapes = new Map([
        ['n', '\n'],
        ['r', '\r'],
        ['t', '\t'],
        ['b', '\b'],
        ['f', '\f'],
    ]);
    const isOctalDigit = (char) => Boolean(char) && /[0-7]/.test(char);
    // Skip the opening parenthesis and stop before the closing one.
    const stop = raw.length - 1;
    let unescaped = '';
    let cursor = 1;
    while (cursor < stop) {
        const char = raw[cursor];
        cursor += 1;
        if (char !== '\\') {
            unescaped += char;
            continue;
        }
        const escaped = raw[cursor];
        if (escaped === undefined) {
            break;
        }
        cursor += 1;
        if (isOctalDigit(escaped)) {
            let octal = escaped;
            while (octal.length < 3 && isOctalDigit(raw[cursor])) {
                octal += raw[cursor];
                cursor += 1;
            }
            unescaped += String.fromCharCode(parseInt(octal, 8));
            continue;
        }
        if (escaped === '\n') {
            // Line continuation: contributes nothing.
            continue;
        }
        if (escaped === '\r') {
            // Line continuation; also swallow the LF of a CRLF pair.
            if (raw[cursor] === '\n') {
                cursor += 1;
            }
            continue;
        }
        unescaped += namedEscapes.get(escaped) ?? escaped;
    }
    return decodePdfByteString(Buffer.from(unescaped, 'latin1'));
}
|
|
1017
|
+
/**
 * Decodes an angle-bracketed PDF hex string.
 *
 * Whitespace inside the brackets is ignored and an odd number of digits is
 * padded with a trailing '0' before the bytes are passed to
 * `decodePdfByteString`.
 *
 * @param raw The hex string including its surrounding angle brackets.
 */
function decodeHexString(raw) {
    const digits = raw.slice(1, -1).replace(/\s+/g, '');
    const evenDigits = digits.length % 2 === 1 ? `${digits}0` : digits;
    return decodePdfByteString(Buffer.from(evenDigits, 'hex'));
}
|
|
1024
|
+
/**
 * Converts string bytes to text.
 *
 * A UTF-16 byte-order mark selects the matching UTF-16 decoder (BOM removed).
 * Without a BOM, inputs of four or more bytes are decoded as UTF-16 when the
 * zero-byte heuristic matches (big-endian checked first); everything else is
 * read as latin1.
 */
function decodePdfByteString(bytes) {
    if (bytes.length >= 2) {
        const first = bytes[0];
        const second = bytes[1];
        if (first === 0xfe && second === 0xff) {
            return decodeUtf16BigEndian(bytes.subarray(2));
        }
        if (first === 0xff && second === 0xfe) {
            return decodeUtf16LittleEndian(bytes.subarray(2));
        }
    }
    if (bytes.length >= 4) {
        if (looksLikeUtf16BigEndian(bytes)) {
            return decodeUtf16BigEndian(bytes);
        }
        if (looksLikeUtf16LittleEndian(bytes)) {
            return decodeUtf16LittleEndian(bytes);
        }
    }
    return bytes.toString('latin1');
}
|
|
1039
|
+
/**
 * Decodes bytes as big-endian UTF-16 code units (high byte first).
 * A trailing unpaired byte is ignored.
 */
function decodeUtf16BigEndian(bytes) {
    const unitCount = Math.floor(bytes.length / 2);
    let text = '';
    for (let unit = 0; unit < unitCount; unit += 1) {
        const high = bytes[unit * 2];
        const low = bytes[(unit * 2) + 1];
        text += String.fromCharCode((high << 8) | low);
    }
    return text;
}
|
|
1046
|
+
/**
 * Decodes bytes as little-endian UTF-16 code units (low byte first).
 * A trailing unpaired byte is ignored.
 */
function decodeUtf16LittleEndian(bytes) {
    const unitCount = Math.floor(bytes.length / 2);
    let text = '';
    for (let unit = 0; unit < unitCount; unit += 1) {
        const low = bytes[unit * 2];
        const high = bytes[(unit * 2) + 1];
        text += String.fromCharCode(low | (high << 8));
    }
    return text;
}
|
|
1053
|
+
/**
 * Heuristic: true when the number of zero bytes at even offsets is at least
 * a quarter of the byte length (rounded down).
 */
function looksLikeUtf16BigEndian(bytes) {
    const threshold = Math.floor(bytes.length / 4);
    let zeroHighBytes = 0;
    for (let offset = 0; offset < bytes.length; offset += 2) {
        zeroHighBytes += bytes[offset] === 0x00 ? 1 : 0;
    }
    return zeroHighBytes >= threshold;
}
|
|
1061
|
+
/**
 * Heuristic: true when the number of zero bytes at odd offsets is at least
 * a quarter of the byte length (rounded down).
 */
function looksLikeUtf16LittleEndian(bytes) {
    const threshold = Math.floor(bytes.length / 4);
    let zeroHighBytes = 0;
    for (let offset = 1; offset < bytes.length; offset += 2) {
        zeroHighBytes += bytes[offset] === 0x00 ? 1 : 0;
    }
    return zeroHighBytes >= threshold;
}
|
|
1069
|
+
/**
 * Scans the file for `N G obj ... endobj` sections and indexes them.
 *
 * The buffer is read as latin1 so string indices equal byte offsets. Each
 * entry records the trimmed body, the dictionary source and parsed entries
 * (when the body or the text before the stream keyword starts with a
 * dictionary), and absolute stream offsets when a stream is present. A later
 * object with the same key replaces an earlier one.
 *
 * @param buffer Raw PDF bytes.
 * @returns Map from `pdfRefKey(ref)` to the parsed object record.
 */
function parsePdfObjects(buffer) {
    const source = buffer.toString('latin1');
    const objects = new Map();
    for (const match of source.matchAll(/(\d+)\s+(\d+)\s+obj\b([\s\S]*?)endobj/g)) {
        const [fullMatch, objectNumberText, generationText, rawBody = ''] = match;
        const ref = {
            objectNumber: Number(objectNumberText),
            generationNumber: Number(generationText),
        };
        const body = rawBody.trim();
        const streamMarker = findStreamMarker(rawBody);
        // With a stream, only the text before the `stream` keyword can be the dictionary.
        const dictionaryText = streamMarker
            ? rawBody.slice(0, streamMarker.markerIndex).trim()
            : body;
        const hasDictionary = dictionaryText.startsWith(DICTIONARY_START);
        const dictionarySource = hasDictionary ? dictionaryText : null;
        const dictionaryEntries = hasDictionary ? parseDictionaryEntries(dictionaryText) : new Map();
        let streamSpec = null;
        if (streamMarker) {
            const bodyAbsoluteStart = (match.index ?? 0) + Math.max(fullMatch.indexOf(rawBody), 0);
            streamSpec = {
                startOffset: bodyAbsoluteStart + streamMarker.contentStartIndex,
                fallbackEndOffset: bodyAbsoluteStart + streamMarker.endstreamIndex,
                lengthRaw: dictionaryEntries.get('Length'),
            };
        }
        objects.set(pdfRefKey(ref), {
            ref,
            body,
            dictionarySource,
            dictionaryEntries,
            streamSpec,
        });
    }
    return objects;
}
|
|
1113
|
+
/**
 * Locates the `stream` / `endstream` keywords inside an object body.
 *
 * @param rawBody Untrimmed object body text.
 * @returns `{ markerIndex, contentStartIndex, endstreamIndex }` relative to
 *          `rawBody`, or null when either keyword is missing. The content
 *          start skips one end-of-line (CRLF, LF or CR) after `stream`.
 */
function findStreamMarker(rawBody) {
    const keyword = 'stream';
    const markerIndex = rawBody.indexOf(keyword);
    const endstreamIndex = markerIndex < 0 ? -1 : rawBody.indexOf('endstream', markerIndex);
    if (endstreamIndex < 0) {
        return null;
    }
    const afterKeyword = markerIndex + keyword.length;
    let eolLength = 0;
    if (rawBody.startsWith('\r\n', afterKeyword)) {
        eolLength = 2;
    }
    else {
        const nextChar = rawBody[afterKeyword];
        if (nextChar === '\n' || nextChar === '\r') {
            eolLength = 1;
        }
    }
    return {
        markerIndex,
        contentStartIndex: afterKeyword + eolLength,
        endstreamIndex,
    };
}
|
|
1133
|
+
/**
 * Returns the document's page objects.
 *
 * When a /Catalog with a /Pages reference exists, the page tree is walked
 * from that root. Otherwise every object typed /Page is returned, sorted by
 * object number.
 */
function collectPageObjects(objects) {
    const allObjects = [...objects.values()];
    const typeOf = (object) => parsePdfName(object.dictionaryEntries.get('Type'));
    const catalog = allObjects.find((object) => typeOf(object) === 'Catalog');
    const rootPagesRef = catalog ? parsePdfRef(catalog.dictionaryEntries.get('Pages')) : null;
    if (rootPagesRef) {
        return walkPageTree(rootPagesRef, objects, new Set());
    }
    const looseByObjectNumber = (left, right) => left.ref.objectNumber - right.ref.objectNumber;
    return allObjects
        .filter((object) => typeOf(object) === 'Page')
        .sort(looseByObjectNumber);
}
|
|
1143
|
+
/**
 * Depth-first walk of the page tree, collecting /Page leaves in /Kids order.
 *
 * @param ref Reference of the node to visit.
 * @param objects Map of parsed objects keyed by `pdfRefKey`.
 * @param visited Set of already-visited keys; guards against cyclic trees.
 * @returns Page objects found under this node (empty for unknown, revisited
 *          or non-page-tree nodes).
 */
function walkPageTree(ref, objects, visited) {
    const key = pdfRefKey(ref);
    if (visited.has(key)) {
        return [];
    }
    visited.add(key);
    const node = objects.get(key);
    if (!node) {
        return [];
    }
    const nodeType = parsePdfName(node.dictionaryEntries.get('Type'));
    if (nodeType === 'Page') {
        return [node];
    }
    if (nodeType !== 'Pages') {
        return [];
    }
    const pages = [];
    for (const kidRef of parseSingleOrArrayRefs(node.dictionaryEntries.get('Kids'))) {
        for (const page of walkPageTree(kidRef, objects, visited)) {
            pages.push(page);
        }
    }
    return pages;
}
|
|
1161
|
+
/**
 * Resolves a raw value to dictionary entries.
 *
 * An inline dictionary is parsed directly; an indirect reference is followed
 * to its object, whose parsed entries (or dictionary-looking body) are used.
 *
 * @returns A Map of entries, or null when the value is empty, is neither a
 *          dictionary nor a reference, or the target is missing/not a dictionary.
 */
function resolveDictionaryFromRaw(rawValue, objects) {
    if (!rawValue) {
        return null;
    }
    const text = rawValue.trim();
    if (text.startsWith(DICTIONARY_START)) {
        return parseDictionaryEntries(text);
    }
    const ref = parsePdfRef(text);
    const target = ref ? objects.get(pdfRefKey(ref)) : undefined;
    if (!target) {
        return null;
    }
    if (target.dictionarySource) {
        return target.dictionaryEntries;
    }
    return target.body.startsWith(DICTIONARY_START)
        ? parseDictionaryEntries(target.body)
        : null;
}
|
|
1182
|
+
/**
 * Parses a raw value that is either one indirect reference or an array of
 * them. Array elements that are not references are skipped.
 *
 * @returns The list of references (empty for missing or unsupported input).
 */
function parseSingleOrArrayRefs(rawValue) {
    if (!rawValue) {
        return [];
    }
    const single = parsePdfRef(rawValue);
    if (single) {
        return [single];
    }
    const arrayText = rawValue.trim();
    if (!arrayText.startsWith('[')) {
        return [];
    }
    const refs = [];
    // Stop before the closing bracket.
    const stop = arrayText.length - 1;
    let cursor = 1;
    while (cursor < stop) {
        cursor = skipPdfWhitespaceAndComments(arrayText, cursor);
        if (cursor >= stop) {
            break;
        }
        const element = readPdfValueRaw(arrayText, cursor);
        if (!element) {
            break;
        }
        const elementRef = parsePdfRef(element.raw);
        if (elementRef) {
            refs.push(elementRef);
        }
        cursor = element.nextIndex;
    }
    return refs;
}
|
|
1207
|
+
/**
 * Extracts a color space name from a raw /ColorSpace value.
 *
 * A direct name is returned as-is (without the slash); for an array, the
 * first name found inside it is returned.
 *
 * @returns The name, or null when none can be determined.
 */
function parseColorSpaceName(rawValue) {
    if (!rawValue) {
        return null;
    }
    const simpleName = parsePdfName(rawValue);
    if (simpleName) {
        return simpleName;
    }
    const arrayText = rawValue.trim();
    if (!arrayText.startsWith('[')) {
        return null;
    }
    const firstName = /\/([A-Za-z0-9._-]+)/.exec(arrayText);
    return firstName?.[1] ?? null;
}
|
|
1219
|
+
/**
 * Parses a bracketed array and collects its numeric elements.
 * Elements that are not direct numbers are skipped.
 *
 * @param rawValue Raw array text (must be a string).
 * @returns The numbers found, or null when the value is not an array.
 */
function parseNumberArray(rawValue) {
    const arrayText = rawValue.trim();
    if (!arrayText.startsWith('[')) {
        return null;
    }
    const numbers = [];
    // Stop before the closing bracket.
    const stop = arrayText.length - 1;
    let cursor = 1;
    while (cursor < stop) {
        cursor = skipPdfWhitespaceAndComments(arrayText, cursor);
        if (cursor >= stop) {
            break;
        }
        const element = readPdfValueRaw(arrayText, cursor);
        if (!element) {
            break;
        }
        const parsed = parseDirectNumber(element.raw);
        if (parsed !== null) {
            numbers.push(parsed);
        }
        cursor = element.nextIndex;
    }
    return numbers;
}
|
|
1239
|
+
/**
 * Parses a raw /Filter value that is either one name or an array of names.
 * Array elements that are not names are skipped.
 *
 * @returns Filter names without the leading slash (empty for missing or
 *          unsupported input).
 */
function parseFilterNames(rawValue) {
    if (!rawValue) {
        return [];
    }
    const single = parsePdfName(rawValue);
    if (single) {
        return [single];
    }
    const arrayText = rawValue.trim();
    if (!arrayText.startsWith('[')) {
        return [];
    }
    const names = [];
    // Stop before the closing bracket.
    const stop = arrayText.length - 1;
    let cursor = 1;
    while (cursor < stop) {
        cursor = skipPdfWhitespaceAndComments(arrayText, cursor);
        if (cursor >= stop) {
            break;
        }
        const element = readPdfValueRaw(arrayText, cursor);
        if (!element) {
            break;
        }
        const filterName = parsePdfName(element.raw);
        if (filterName) {
            names.push(filterName);
        }
        cursor = element.nextIndex;
    }
    return names;
}
|
|
1264
|
+
/**
 * Parses an indirect reference of the form `<object> <generation> R`.
 *
 * @param rawValue Raw text, possibly surrounded by whitespace.
 * @returns `{ objectNumber, generationNumber }`, or null when the text is
 *          empty or is not exactly a reference.
 */
function parsePdfRef(rawValue) {
    if (!rawValue) {
        return null;
    }
    const parts = /^(\d+)\s+(\d+)\s+R$/.exec(rawValue.trim());
    if (parts === null) {
        return null;
    }
    const [, objectText, generationText] = parts;
    return {
        objectNumber: Number(objectText),
        generationNumber: Number(generationText),
    };
}
|
|
1275
|
+
/**
 * Returns the text after the leading slash of a name value.
 *
 * @param rawValue Raw text, possibly surrounded by whitespace.
 * @returns The name without its slash, or null when the value is empty or
 *          does not start with '/'.
 */
function parsePdfName(rawValue) {
    const text = rawValue ? rawValue.trim() : '';
    return text.startsWith('/') ? text.slice(1) : null;
}
|
|
1283
|
+
/**
 * Parses a direct PDF numeric object (integer or real).
 *
 * Accepts every real-number spelling the PDF syntax allows, including a
 * missing integer part (`.5`, `-.002`) and a missing fraction (`4.`).
 * Previously those forms were rejected, so callers silently dropped them
 * (e.g. box coordinates written as `612.`).
 *
 * @param rawValue Raw text, possibly surrounded by whitespace.
 * @returns The numeric value, or null when the text is empty or not a number.
 */
function parseDirectNumber(rawValue) {
    if (!rawValue) {
        return null;
    }
    const trimmed = rawValue.trim();
    // Optional sign, then either digits with an optional fraction, or a bare fraction.
    if (!/^[+-]?(?:\d+\.?\d*|\.\d+)$/.test(trimmed)) {
        return null;
    }
    return Number(trimmed);
}
|
|
1291
|
+
/**
 * Parses the top-level key/value pairs of a dictionary.
 *
 * Parsing starts after the first dictionary opener and stops at the matching
 * closer at this level. Values are stored as trimmed raw text; a key whose
 * value cannot be read is stored as '' and ends parsing. Characters that are
 * not the start of a name are skipped one at a time.
 *
 * @param dictionarySource Text containing the dictionary.
 * @returns Map from key (without slash) to raw value text.
 */
function parseDictionaryEntries(dictionarySource) {
    const entries = new Map();
    const openerIndex = dictionarySource.indexOf(DICTIONARY_START);
    if (openerIndex < 0) {
        return entries;
    }
    const total = dictionarySource.length;
    let cursor = openerIndex + DICTIONARY_START.length;
    while (cursor < total) {
        cursor = skipPdfWhitespaceAndComments(dictionarySource, cursor);
        if (cursor >= total || dictionarySource.startsWith(DICTIONARY_END, cursor)) {
            break;
        }
        if (dictionarySource[cursor] !== '/') {
            cursor += 1;
            continue;
        }
        const nameToken = readPdfNameToken(dictionarySource, cursor);
        const entryKey = nameToken.raw.slice(1);
        const valueStart = skipPdfWhitespaceAndComments(dictionarySource, nameToken.nextIndex);
        const valueToken = readPdfValueRaw(dictionarySource, valueStart);
        entries.set(entryKey, valueToken ? valueToken.raw.trim() : '');
        if (!valueToken) {
            break;
        }
        cursor = valueToken.nextIndex;
    }
    return entries;
}
|
|
1320
|
+
/**
 * Reads the raw text of one PDF value starting at (or after whitespace and
 * comments following) `startIndex`.
 *
 * Dispatches on the first significant character: literal strings, arrays,
 * names, dictionaries and hex strings are delegated to their dedicated
 * readers; a lone `'` or `"` is returned as a one-character token. Anything
 * else is read as a bare word, and a numeric word followed by an unsigned
 * integer and `R` is returned as a single indirect-reference token.
 *
 * @param {string} source - Text to read from.
 * @param {number} startIndex - Position to start reading at.
 * @returns {{ raw: string, nextIndex: number } | null} The token text and the
 *   index just past it, or `null` when nothing could be read.
 */
function readPdfValueRaw(source, startIndex) {
    const begin = skipPdfWhitespaceAndComments(source, startIndex);
    if (begin >= source.length) {
        return null;
    }
    const lead = source[begin];
    switch (lead) {
        case '(':
            return readEnclosedToken(source, begin, '(', ')');
        case '<':
            return source.startsWith(DICTIONARY_START, begin)
                ? readNestedDictionaryToken(source, begin)
                : readHexStringToken(source, begin);
        case '[':
            return readEnclosedToken(source, begin, '[', ']');
        case '/':
            return readPdfNameToken(source, begin);
        case '\'':
        case '"':
            return { raw: lead, nextIndex: begin + 1 };
        default:
            break;
    }
    const headWord = readPdfWord(source, begin);
    if (!headWord) {
        return null;
    }
    if (!/^[+-]?\d+(?:\.\d+)?$/.test(headWord.raw)) {
        return headWord;
    }
    // A number may be the first part of "<object> <generation> R"; look ahead for the rest.
    const generationWord = readPdfWord(source, skipPdfWhitespaceAndComments(source, headWord.nextIndex));
    if (!generationWord || !/^\d+$/.test(generationWord.raw)) {
        return headWord;
    }
    const keywordWord = readPdfWord(source, skipPdfWhitespaceAndComments(source, generationWord.nextIndex));
    if (keywordWord?.raw !== 'R') {
        return headWord;
    }
    const referenceEnd = keywordWord.nextIndex;
    return {
        raw: source.slice(begin, referenceEnd),
        nextIndex: referenceEnd,
    };
}
|
|
1362
|
+
/**
 * Reads a name token whose first character (expected to be `/`) sits at
 * `startIndex`, extending it up to the next PDF delimiter or whitespace
 * character.
 *
 * @param {string} source - Text to read from.
 * @param {number} startIndex - Index of the token's first character.
 * @returns {{ raw: string, nextIndex: number }} The token text (including its
 *   first character) and the index just past it.
 */
function readPdfNameToken(source, startIndex) {
    // The first character is taken unconditionally; scanning begins after it.
    let end = startIndex + 1;
    for (; end < source.length; end += 1) {
        const current = source[end];
        if (isPdfDelimiter(current) || isPdfWhitespaceChar(current)) {
            break;
        }
    }
    return { raw: source.slice(startIndex, end), nextIndex: end };
}
|
|
1372
|
+
/**
 * Reads a hex string token that starts at `startIndex` and runs through the
 * next `>` character.
 *
 * @param {string} source - Text to read from.
 * @param {number} startIndex - Index of the token's opening character.
 * @returns {{ raw: string, nextIndex: number }} The token text including the
 *   closing `>`, and the index just past it. When no `>` follows, the token
 *   extends to the end of `source`.
 */
function readHexStringToken(source, startIndex) {
    const closerAt = source.indexOf('>', startIndex + 1);
    // An unterminated string swallows the remainder of the input.
    const nextIndex = closerAt === -1 ? source.length : closerAt + 1;
    return { raw: source.slice(startIndex, nextIndex), nextIndex };
}
|
|
1383
|
+
/**
 * Reads a complete dictionary token, including any dictionaries nested inside
 * it, beginning at `startIndex`.
 *
 * Literal strings, hex strings and arrays are skipped as whole tokens so that
 * dictionary markers inside them do not affect the nesting count.
 *
 * @param {string} source - Text to read from.
 * @param {number} startIndex - Index at which the dictionary begins.
 * @returns {{ raw: string, nextIndex: number }} The dictionary text through its
 *   balancing DICTIONARY_END marker and the index just past it. When the
 *   dictionary never closes, the token extends to the end of `source`.
 */
function readNestedDictionaryToken(source, startIndex) {
    let nesting = 0;
    let cursor = startIndex;
    while (cursor < source.length) {
        if (source.startsWith(DICTIONARY_START, cursor)) {
            nesting += 1;
            cursor += 2;
            continue;
        }
        if (source.startsWith(DICTIONARY_END, cursor)) {
            nesting -= 1;
            cursor += 2;
            if (nesting === 0) {
                return { raw: source.slice(startIndex, cursor), nextIndex: cursor };
            }
            continue;
        }
        const current = source[cursor];
        if (current === '(') {
            cursor = readEnclosedToken(source, cursor, '(', ')').nextIndex;
        }
        else if (current === '<') {
            // A dictionary opener was already handled above, so this is a hex string.
            cursor = readHexStringToken(source, cursor).nextIndex;
        }
        else if (current === '[') {
            cursor = readEnclosedToken(source, cursor, '[', ']').nextIndex;
        }
        else {
            cursor += 1;
        }
    }
    return { raw: source.slice(startIndex), nextIndex: source.length };
}
|
|
1425
|
+
/**
 * Reads a token delimited by a balanced `openChar` / `closeChar` pair, such as
 * a literal string `( ... )` or an array `[ ... ]`.
 *
 * Nested occurrences of the same pair are tracked with a depth counter. In
 * literal strings a backslash escapes the following character. In arrays,
 * embedded literal strings, hex strings and dictionaries are skipped as whole
 * tokens so bracket characters inside them are not counted.
 *
 * @param {string} source - Text to read from.
 * @param {number} startIndex - Index of the opening character.
 * @param {string} openChar - Opening delimiter (`(` or `[`).
 * @param {string} closeChar - Matching closing delimiter.
 * @returns {{ raw: string, nextIndex: number }} The token text including both
 *   delimiters and the index just past it. When the token never closes, it
 *   extends to the end of `source`.
 */
function readEnclosedToken(source, startIndex, openChar, closeChar) {
    const isLiteralString = openChar === '(';
    const isArray = openChar === '[';
    let index = startIndex + 1;
    let depth = 1;
    while (index < source.length) {
        const char = source[index];
        if (isLiteralString && char === '\\') {
            // Skip the backslash together with the character it escapes.
            index += 2;
            continue;
        }
        if (char === openChar) {
            // Covers nested arrays too, so no separate '[' branch is needed below.
            depth += 1;
            index += 1;
            continue;
        }
        if (char === closeChar) {
            depth -= 1;
            index += 1;
            if (depth === 0) {
                return {
                    raw: source.slice(startIndex, index),
                    nextIndex: index,
                };
            }
            continue;
        }
        if (isArray && char === '(') {
            const literal = readEnclosedToken(source, index, '(', ')');
            index = literal.nextIndex;
            continue;
        }
        if (isArray && char === '<') {
            const inner = source.startsWith(DICTIONARY_START, index)
                ? readNestedDictionaryToken(source, index)
                : readHexStringToken(source, index);
            index = inner.nextIndex;
            continue;
        }
        index += 1;
    }
    // Unterminated token: consume the rest of the input.
    return {
        raw: source.slice(startIndex),
        nextIndex: source.length,
    };
}
|
|
1478
|
+
/**
 * Reads a run of characters starting at `startIndex` that contains neither PDF
 * delimiters nor PDF whitespace.
 *
 * @param {string} source - Text to read from.
 * @param {number} startIndex - Index at which the word should begin.
 * @returns {{ raw: string, nextIndex: number } | null} The word and the index
 *   just past it, or `null` when no character could be consumed.
 */
function readPdfWord(source, startIndex) {
    let end = startIndex;
    for (; end < source.length; end += 1) {
        const current = source[end];
        if (isPdfDelimiter(current) || isPdfWhitespaceChar(current)) {
            break;
        }
    }
    return end === startIndex
        ? null
        : { raw: source.slice(startIndex, end), nextIndex: end };
}
|
|
1490
|
+
/**
 * Advances past any mix of PDF whitespace and `%` comments.
 *
 * A comment runs from `%` up to (not including) the next `\n` or `\r`; the
 * line break itself is then consumed as whitespace.
 *
 * @param {string} source - Text being scanned.
 * @param {number} startIndex - Position to start from.
 * @returns {number} Index of the first character that is neither whitespace
 *   nor part of a comment, or an index at/after the end of `source`.
 */
function skipPdfWhitespaceAndComments(source, startIndex) {
    const limit = source.length;
    let cursor = startIndex;
    for (;;) {
        while (cursor < limit && isPdfWhitespaceChar(source[cursor])) {
            cursor += 1;
        }
        if (cursor >= limit || source[cursor] !== '%') {
            return cursor;
        }
        // Inside a comment: step over '%' and everything up to the line break.
        do {
            cursor += 1;
        } while (cursor < limit && source[cursor] !== '\n' && source[cursor] !== '\r');
    }
}
|
|
1508
|
+
/**
 * Tells whether a numeric byte value is one of the six whitespace codes this
 * parser recognises: 0x00, 0x09, 0x0a, 0x0c, 0x0d or 0x20.
 *
 * @param {number} value - Byte value to test.
 * @returns {boolean} `true` for a whitespace byte, `false` otherwise.
 */
function isPdfWhitespace(value) {
    switch (value) {
        case 0x00:
        case 0x09:
        case 0x0a:
        case 0x0c:
        case 0x0d:
        case 0x20:
            return true;
        default:
            return false;
    }
}
|
|
1511
|
+
/**
 * Tells whether a one-character string is one of the whitespace characters
 * this parser recognises: NUL, tab, line feed, form feed, carriage return or
 * space.
 *
 * @param {string} char - Character to test.
 * @returns {boolean} `true` for a whitespace character, `false` otherwise.
 */
function isPdfWhitespaceChar(char) {
    switch (char) {
        case '\u0000':
        case '\t':
        case '\n':
        case '\f':
        case '\r':
        case ' ':
            return true;
        default:
            return false;
    }
}
|
|
1514
|
+
/**
 * Tells whether a one-character string is one of the delimiter characters this
 * parser recognises: `(`, `)`, `<`, `>`, `[`, `]`, `{`, `}`, `/` or `%`.
 *
 * @param {string} char - Character to test.
 * @returns {boolean} `true` for a delimiter, `false` otherwise.
 */
function isPdfDelimiter(char) {
    switch (char) {
        case '(':
        case ')':
        case '<':
        case '>':
        case '[':
        case ']':
        case '{':
        case '}':
        case '/':
        case '%':
            return true;
        default:
            return false;
    }
}
|
|
1526
|
+
/**
 * Extracts the `major.minor` version from a `%PDF-x.y` header located within
 * the first 32 bytes of the buffer.
 *
 * @param {Buffer} buffer - File contents.
 * @returns {string | null} The version text (e.g. `"1.7"`), or `null` when no
 *   header is found in that leading window.
 */
function readPdfVersion(buffer) {
    const windowLength = Math.min(buffer.length, 32);
    // latin1 maps each byte to one character, so binary bytes cannot break decoding.
    const headerText = buffer.subarray(0, windowLength).toString('latin1');
    const found = /%PDF-(\d\.\d)/.exec(headerText);
    return found === null ? null : found[1];
}
|
|
1531
|
+
/**
 * Heuristically detects whether a PDF declares encryption by looking for an
 * `/Encrypt` key near the start or the end of the file.
 *
 * The first 8192 bytes are checked, and for larger files the last 8192 bytes
 * are checked too: the `/Encrypt` entry belongs to the file trailer, which in
 * a conventionally laid-out file sits at the end, so scanning only the head
 * would miss it. Bytes between the two windows are not inspected.
 *
 * @param {Buffer} buffer - File contents.
 * @returns {boolean} `true` when `/Encrypt` (as a whole key, so not
 *   `/EncryptMetadata`) appears in either window.
 */
function hasPdfEncryptMarker(buffer) {
    const windowSize = 8192;
    const marker = /\/Encrypt\b/;
    const head = buffer.subarray(0, Math.min(buffer.length, windowSize)).toString('latin1');
    if (marker.test(head)) {
        return true;
    }
    if (buffer.length <= windowSize) {
        // The head window already covered the entire file.
        return false;
    }
    const tail = buffer.subarray(buffer.length - windowSize).toString('latin1');
    return marker.test(tail);
}
|
|
1535
|
+
/**
 * Builds a string key for an indirect reference in the form
 * `"<objectNumber> <generationNumber>"`.
 *
 * @param {{ objectNumber: number, generationNumber: number }} ref - Reference to format.
 * @returns {string} The two numbers separated by a single space.
 */
function pdfRefKey(ref) {
    const { objectNumber, generationNumber } = ref;
    return `${objectNumber} ${generationNumber}`;
}
|
|
1538
|
+
/**
 * Formats an indirect reference the way it is written in PDF syntax:
 * `"<objectNumber> <generationNumber> R"`.
 *
 * @param {{ objectNumber: number, generationNumber: number }} ref - Reference to format.
 * @returns {string} The reference label.
 */
function pdfRefLabel(ref) {
    const { objectNumber, generationNumber } = ref;
    return `${objectNumber} ${generationNumber} R`;
}
|
|
1541
|
+
/**
 * Returns the distinct non-empty entries of `values`, keeping the order in
 * which each one first appears.
 *
 * @param {string[]} values - Candidate strings.
 * @returns {string[]} A new array without empty strings or duplicates.
 */
function uniqueStrings(values) {
    const seen = new Set();
    values.forEach((entry) => {
        if (entry.length > 0) {
            seen.add(entry);
        }
    });
    return Array.from(seen);
}
|
|
1544
|
+
//# sourceMappingURL=pdf.js.map
|