scribe.js-ocr 0.7.4 → 0.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build-deno-compile.sh +30 -0
- package/cli/cli.js +46 -18
- package/cli/detectPDFType.js +1 -2
- package/cli/extract.js +14 -7
- package/cli/main.js +39 -39
- package/cli/require.js +1 -1
- package/cli/scribe.js +12 -11
- package/fonts/Dingbats.woff +0 -0
- package/fonts/all/URWGothicBook-Bold.woff +0 -0
- package/fonts/all/URWGothicBook-BoldItalic.woff +0 -0
- package/fonts/all/URWGothicBook-Italic.woff +0 -0
- package/fonts/all/URWGothicBook-Regular.woff +0 -0
- package/fonts/latin/URWGothicBook-Bold.woff +0 -0
- package/fonts/latin/URWGothicBook-BoldItalic.woff +0 -0
- package/fonts/latin/URWGothicBook-Italic.woff +0 -0
- package/fonts/latin/URWGothicBook-Regular.woff +0 -0
- package/js/canvasAdapter.js +4 -1
- package/js/clear.js +7 -8
- package/js/containers/app.js +2 -0
- package/js/containers/dataContainer.js +1 -4
- package/js/containers/fontContainer.js +59 -44
- package/js/containers/imageContainer.js +13 -35
- package/js/coordinates.js +3 -3
- package/js/debug.js +2 -2
- package/js/export/export.js +103 -18
- package/js/export/exportDebugCsv.js +4 -3
- package/js/export/pdf/writePdf.js +389 -0
- package/js/export/{writePdfFonts.js → pdf/writePdfFonts.js} +16 -12
- package/js/export/pdf/writePdfImages.js +218 -0
- package/js/export/{writePdf.js → pdf/writePdfText.js} +28 -315
- package/js/export/writeDocx.js +12 -5
- package/js/export/writeHocr.js +11 -10
- package/js/export/writeHtml.js +208 -48
- package/js/export/writeTabular.js +31 -20
- package/js/export/writeText.js +12 -10
- package/js/fontContainerMain.js +101 -50
- package/js/fontEval.js +18 -14
- package/js/fontStatistics.js +90 -90
- package/js/generalWorkerMain.js +52 -6
- package/js/global.d.ts +178 -6
- package/js/import/convertDocTextract.js +447 -0
- package/js/import/convertPageAbbyy.js +10 -4
- package/js/import/convertPageBlocks.js +4 -4
- package/js/import/convertPageGoogleVision.js +204 -0
- package/js/import/convertPageHocr.js +3 -3
- package/js/import/convertPageShared.js +1 -0
- package/js/import/convertPageStext.js +18 -10
- package/js/import/convertPageText.js +289 -0
- package/js/import/import.js +133 -125
- package/js/import/importOCR.js +98 -46
- package/js/import/nodeAdapter.js +2 -2
- package/js/modifyOCR.js +6 -5
- package/js/nudge.js +3 -3
- package/js/objects/{fontMetricsObjects.js → charMetricsObjects.js} +12 -12
- package/js/objects/imageObjects.js +3 -2
- package/js/objects/layoutObjects.js +37 -0
- package/js/objects/ocrObjects.js +51 -3
- package/js/recognizeConvert.js +74 -23
- package/js/utils/fontUtils.js +32 -1
- package/js/utils/imageUtils.js +99 -0
- package/js/utils/miscUtils.js +158 -9
- package/js/utils/reflowPars.js +4 -0
- package/js/worker/compareOCRModule.js +20 -18
- package/js/worker/generalWorker.js +12 -6
- package/js/worker/optimizeFontModule.js +19 -19
- package/mupdf/libmupdf.js +3 -3
- package/mupdf/libmupdf.wasm +0 -0
- package/mupdf/mupdf-async.js +1 -1
- package/mupdf/mupdf-worker.js +9 -4
- package/package.json +7 -4
- package/scribe.js +5 -5
- package/tess/tesseract.esm.min.js +1 -1
- package/tess/tesseract.min.js +1 -1
- package/tess/worker.min.js +1 -1
|
@@ -0,0 +1,389 @@
|
|
|
1
|
+
import { FontCont } from '../../containers/fontContainer.js';
|
|
2
|
+
|
|
3
|
+
import { createEmbeddedFontType0, createEmbeddedFontType1 } from './writePdfFonts.js';
|
|
4
|
+
import { createEmbeddedImages, createImageResourceDict, drawImageCommands } from './writePdfImages.js';
|
|
5
|
+
|
|
6
|
+
import { opt } from '../../containers/app.js';
|
|
7
|
+
import { ocrPageToPDFStream } from './writePdfText.js';
|
|
8
|
+
import { getDistinctCharsFont, subsetFont } from '../../utils/fontUtils.js';
|
|
9
|
+
|
|
10
|
+
// Creates 3 PDF objects necessary to embed font.
|
|
11
|
+
// These are (1) the font dictionary, (2) the font descriptor, and (3) the font file,
|
|
12
|
+
// which will be located at objects firstObjIndex, firstObjIndex + 1, and firstObjIndex + 2 (respectively).
|
|
13
|
+
|
|
14
|
+
/**
 * Create a PDF from an array of ocrPage objects.
 *
 * The file is assembled as a single string in this order: header, catalog (object 1), page tree (object 2),
 * font objects, image objects, page objects, cross-reference table, trailer.
 * Object numbers are reserved for every known font up front, but font objects are only written for fonts
 * that at least one page reports as used; the unused reservations become free cross-reference entries.
 *
 * @param {Object} params
 * @param {Array<OcrPage>} params.ocrArr - OCR data, indexed by page number.
 * @param {PageMetrics[]} params.pageMetricsArr - Page metrics, indexed by page number.
 *    `dims` and `angle` are read for every exported page.
 * @param {number} [params.minpage=0] - First page to export (inclusive).
 * @param {number} [params.maxpage=-1] - Last page to export (inclusive). `-1` selects the last index of `ocrArr`.
 * @param {("ebook"|"eval"|"proof"|"invis")} [params.textMode="ebook"] - Passed through to the page text writer.
 * @param {boolean} [params.rotateText=false] - Passed through to the page text writer.
 * @param {boolean} [params.rotateBackground=false] - Passed through to the page writer.
 * @param {dims} [params.dimsLimit] - Output page dimensions. When the width is below 1, each page uses its own input dimensions.
 * @param {number} [params.confThreshHigh=85] - Passed through to the page text writer.
 * @param {number} [params.confThreshMed=75] - Passed through to the page text writer.
 * @param {number} [params.proofOpacity=0.8] - Opacity written to the `/GSO1` graphics state of each page.
 * @param {Array<ImageWrapper>} [params.images=[]] - Array of images to include in PDF
 * @param {boolean} [params.includeImages=false] - Whether to include images in the PDF
 * @returns {Promise<string>} The PDF file contents.
 * @throws {Error} If no fonts are loaded, or if the resolved last page is negative.
 *
 * A valid PDF will be created if an empty array is provided for `ocrArr`, as long as `maxpage` is set manually.
 */
export async function writePdf({
  ocrArr,
  pageMetricsArr,
  minpage = 0,
  maxpage = -1,
  textMode = 'ebook',
  rotateText = false,
  rotateBackground = false,
  dimsLimit = { width: -1, height: -1 },
  confThreshHigh = 85,
  confThreshMed = 75,
  proofOpacity = 0.8,
  images = [],
  includeImages = false,
}) {
  if (!FontCont.raw) throw new Error('No fonts loaded.');

  const lastPage = maxpage === -1 ? ocrArr.length - 1 : maxpage;

  // This can happen if (1) `ocrArr` is length 0 and (2) `maxpage` is left as the default (-1).
  if (lastPage < 0) throw new Error('PDF with negative page count requested.');

  // Number of consecutive object numbers reserved for each font (a composite "Type 0" font uses all of them).
  const FONT_OBJ_COUNT = 6;

  // NOTE(review): the offsets below are computed from string length; the `+ 2` presumably accounts for the
  // two non-ASCII characters in the header comment each taking 2 bytes once the string is encoded as UTF-8.
  const BYTE_OFFSET_ADJUST = 2;

  let fontI = 0;
  // Objects 1 and 2 are the catalog and the page tree.
  let objectI = 3;
  /** @type {Object<string, PdfFontFamily>} */
  const pdfFonts = {};
  /** @type {{familyKey: string, key: string}[]} */
  const pdfFontRefs = [];
  /** @type {Array<?string[]>} */
  const pdfFontObjStrArr = [];
  /** @type {Set<PdfFontInfo>} */
  const pdfFontsUsed = new Set();

  /**
   * Reserves a font index and a run of object numbers for a single font style.
   * The font objects themselves are created later, and only if the font is used.
   *
   * Every font is currently registered as a composite (type 0) font. Whether a simple (type 1) font could be
   * used should depend on both (1) if this is a standard 14 font and (2) if characters outside of the
   * Windows-1252 range are used. If the latter is true, then a composite font is needed, even if the font is
   * a standard 14 font.
   *
   * TODO: We currently have no mechanism for resolving name conflicts between fonts in the base and overlay document.
   * As a workaround, we use the names `/FO[n]` rather than the more common `/F[n]`.
   * However, this likely will cause issues if this application is used to create visible text, and then the resulting PDF is uploaded.
   * This would move the fonts from the overlay document to the base document, and the names would conflict.
   *
   * @param {string} familyKey
   * @param {string} key
   * @param {*} opentype - Opentype.js font object.
   */
  const reserveFont = (familyKey, key, opentype) => {
    pdfFonts[familyKey][key] = {
      type: 0, index: fontI, name: `/FO${String(fontI)}`, objN: objectI, opentype,
    };
    pdfFontRefs.push({ familyKey, key });
    pdfFontObjStrArr.push(null);
    objectI += FONT_OBJ_COUNT;
    fontI++;
  };

  /**
   *
   * @param {string} familyKey
   * @param {FontContainerFamily} familyObj
   */
  const addFontFamilyRef = (familyKey, familyObj) => {
    pdfFonts[familyKey] = {};
    for (const [key, value] of Object.entries(familyObj)) {
      reserveFont(familyKey, key, value.opentype);
    }
  };

  // Create reference to all fonts.
  // Only the fonts that are actually used will be included in the final PDF.
  for (const familyKeyI of Object.keys(FontCont.raw)) {
    const optFamily = FontCont.useOptFamily(familyKeyI) ? FontCont.opt?.[familyKeyI] : undefined;
    const rawFamily = FontCont.raw[familyKeyI];
    // Prefer the optimized version of each style when enabled and available.
    addFontFamilyRef(familyKeyI, {
      normal: optFamily?.normal || rawFamily.normal,
      italic: optFamily?.italic || rawFamily.italic,
      bold: optFamily?.bold || rawFamily.bold,
      boldItalic: optFamily?.boldItalic || rawFamily.boldItalic,
    });
  }

  if (FontCont.doc) {
    for (const familyKeyI of Object.keys(FontCont.doc)) {
      addFontFamilyRef(familyKeyI, FontCont.doc[familyKeyI]);
    }
  }

  if (FontCont.supp.chi_sim) {
    const charArr = getDistinctCharsFont(ocrArr, FontCont.supp.chi_sim.family);

    if (charArr.length > 0) {
      // Only the characters that occur in the document are kept in the exported font.
      const fontExport = await subsetFont(FontCont.supp.chi_sim.opentype, charArr);

      pdfFonts.NotoSansSC = {};
      reserveFont('NotoSansSC', 'normal', fontExport);
    }
  }

  // Add images [WIP]
  /** @type {Array<string>} */
  const pdfImageObjStrArr = [];
  /** @type {Array<number>} */
  const imageObjIndices = [];

  if (includeImages && images && images.length > 0) {
    const imageObjects = createEmbeddedImages(images, objectI);
    imageObjects.forEach((imageObjStr, i) => {
      pdfImageObjStrArr.push(imageObjStr);
      imageObjIndices.push(objectI + i);
    });
    objectI += imageObjects.length;
  }

  /** @type {Array<string>} */
  const pdfPageObjStrArr = [];

  // Add pages
  /** @type {Array<number>} */
  const pageIndexArr = [];
  for (let i = minpage; i <= lastPage; i++) {
    const angle = pageMetricsArr[i].angle || 0;
    const { dims } = pageMetricsArr[i];

    // eslint-disable-next-line no-await-in-loop
    const { pdfObj, pdfFontsUsed: pdfFontsUsedI } = (await ocrPageToPDF({
      pageObj: ocrArr[i],
      inputDims: dims,
      outputDims: dimsLimit,
      firstObjIndex: objectI,
      parentIndex: 2,
      proofOpacity,
      pdfFonts,
      textMode,
      angle,
      rotateText,
      rotateBackground,
      confThreshHigh,
      confThreshMed,
      imageObjIndices,
      includeImages,
    }));

    for (const font of pdfFontsUsedI) {
      pdfFontsUsed.add(font);
    }

    pdfPageObjStrArr.push(...pdfObj);

    // This assumes the "page" is always the first object returned by `ocrPageToPDF`.
    pageIndexArr.push(objectI);

    objectI += pdfObj.length;

    opt.progressHandler({ n: i, type: 'export', info: { } });
  }

  // Create font objects for fonts that are used
  for (const pdfFont of pdfFontsUsed) {
    // NOTE(review): no font objects are created for the subsetted NotoSansSC font here, so its reserved
    // object numbers stay free even when a page references it — confirm this is intended.
    if (pdfFont.opentype?.names?.postScriptName?.en === 'NotoSansSC-Regular') continue;
    pdfFontObjStrArr[pdfFont.index] = pdfFont.type === 1
      ? createEmbeddedFontType1(pdfFont.opentype, pdfFont.objN)
      : createEmbeddedFontType0({ font: pdfFont.opentype, firstObjIndex: pdfFont.objN });
  }

  let pdfOut = '%PDF-1.7\n%µ¶n\n';

  // One entry per object number, starting at object 1 (object 0 is added when the table is written).
  /** @type {{type: string, offset: number}[]} */
  const xrefArr = [];

  /** @param {string} objStr */
  const writeObj = (objStr) => {
    xrefArr.push({ type: 'obj', offset: pdfOut.length + BYTE_OFFSET_ADJUST });
    pdfOut += objStr;
  };

  /** @param {number} count */
  const writeFree = (count) => {
    for (let i = 0; i < count; i++) xrefArr.push({ type: 'free', offset: 0 });
  };

  writeObj('1 0 obj\n<</Type /Catalog\n/Pages 2 0 R>>\nendobj\n\n');

  const kidsStr = pageIndexArr.map((pageObjN) => `${String(pageObjN)} 0 R\n`).join('');
  writeObj(`2 0 obj\n<</Type /Pages\n/Kids [${kidsStr}]\n/Count ${String(pageIndexArr.length)}>>\nendobj\n\n`);

  for (let i = 0; i < pdfFontRefs.length; i++) {
    const fontObjStrArr = pdfFontObjStrArr[i] || [];
    fontObjStrArr.forEach(writeObj);
    // Keep the table aligned with the object numbers reserved for this font.
    writeFree(FONT_OBJ_COUNT - fontObjStrArr.length);
  }

  pdfImageObjStrArr.forEach(writeObj);
  pdfPageObjStrArr.forEach(writeObj);

  // The 0th object always exists, and contains no meaningful data.
  const objCount = xrefArr.length + 1;

  const xrefOffset = pdfOut.length + BYTE_OFFSET_ADJUST;

  // Each cross-reference entry must be exactly 20 bytes long including its end-of-line marker,
  // so the single-character line feed is preceded by a space.
  const FREE_ENTRY = '0000000000 65535 f \n';

  let xrefStr = `xref\n0 ${objCount}\n`;
  xrefStr += FREE_ENTRY;
  for (const entry of xrefArr) {
    xrefStr += entry.type === 'obj'
      ? `${String(entry.offset).padStart(10, '0')} 00000 n \n`
      : FREE_ENTRY;
  }

  xrefStr += [
    'trailer',
    '<< /Root 1 0 R',
    `/Size ${objCount}`,
    '>>',
    'startxref',
    String(xrefOffset),
    '%%EOF',
  ].join('\n');

  pdfOut += xrefStr;

  return pdfOut;
}
|
|
285
|
+
|
|
286
|
+
/**
 * Generates PDF objects for a single page.
 *
 * When the page has neither text content nor image content, only the page object is returned,
 * with an empty inline resource dictionary.
 * Otherwise three objects are returned, in this order:
 * the page object (`firstObjIndex`), the resource dictionary (`firstObjIndex + 1`),
 * and the content stream (`firstObjIndex + 2`), which holds the image drawing commands followed by the text.
 *
 * @param {Object} params - Parameters object
 * @param {OcrPage} params.pageObj - OCR data for the page. May be missing for a page without text.
 * @param {dims} params.inputDims - Dimensions used when `outputDims.width` is below 1.
 * @param {dims} params.outputDims - Requested page dimensions.
 * @param {number} params.firstObjIndex - Object number assigned to the page object.
 * @param {number} params.parentIndex - Object number of the page tree node.
 * @param {number} params.proofOpacity - Opacity written to the `/GSO1` graphics state.
 * @param {Object<string, PdfFontFamily>} params.pdfFonts
 * @param {("ebook"|"eval"|"proof"|"invis")} params.textMode -
 * @param {number} params.angle
 * @param {boolean} [params.rotateText=false]
 * @param {boolean} [params.rotateBackground=false]
 * @param {number} [params.confThreshHigh=85]
 * @param {number} [params.confThreshMed=75]
 * @param {Array<number>} [params.imageObjIndices=[]] - Array of image object indices
 * @param {boolean} [params.includeImages=false] - Whether to include images
 * @returns {Promise<{pdfObj: string[], pdfFontsUsed: Set<PdfFontInfo>}>} The serialized objects
 *    (page object first) and the fonts referenced by the page text.
 */
async function ocrPageToPDF({
  pageObj,
  inputDims,
  outputDims,
  firstObjIndex,
  parentIndex,
  proofOpacity,
  pdfFonts,
  textMode,
  angle,
  rotateText = false,
  rotateBackground = false,
  confThreshHigh = 85,
  confThreshMed = 75,
  imageObjIndices = [],
  includeImages = false,
}) {
  // A width below 1 means "no explicit output size": keep the dimensions of the input page.
  const pageDims = outputDims.width < 1 ? inputDims : outputDims;

  const hasText = Boolean(pageObj) && pageObj.lines.length > 0;
  const hasImages = includeImages && imageObjIndices.length > 0;

  const resourceObjIndex = firstObjIndex + 1;
  const contentObjIndex = firstObjIndex + 2;

  let pageObjStr = `${String(firstObjIndex)} 0 obj\n<</Type/Page/MediaBox[0 0 ${String(pageDims.width)} ${String(pageDims.height)}]`;

  if (!hasText && !hasImages) {
    pageObjStr += '/Resources<<>>';
    pageObjStr += `/Parent ${parentIndex} 0 R>>\nendobj\n\n`;
    return { pdfObj: [pageObjStr], pdfFontsUsed: /** @type {Set<PdfFontInfo>} */ (new Set()) };
  }

  pageObjStr += `/Contents ${String(contentObjIndex)} 0 R`;

  let imageContentObjStr = '';
  if (hasImages) {
    // Angles at or below 0.05 are treated as "not rotated".
    const rotation = rotateBackground && Math.abs(angle ?? 0) > 0.05 ? angle : 0;
    // NOTE(review): every page draws `/Im0`, while the resource dictionary lists all images —
    // confirm how pages are meant to map to images before relying on multi-page image output.
    imageContentObjStr = drawImageCommands(0, 0, 0, pageDims.width, pageDims.height, rotation);
  }

  // A page can carry an image without having any OCR data; the text writer is skipped in that case.
  const { textContentObjStr, pdfFontsUsed } = pageObj
    ? await ocrPageToPDFStream(pageObj, pageDims, pdfFonts, textMode, angle,
      rotateText, rotateBackground, confThreshHigh, confThreshMed)
    : { textContentObjStr: '', pdfFontsUsed: /** @type {Set<PdfFontInfo>} */ (new Set()) };

  let pdfFontsStr = '';
  for (const font of pdfFontsUsed) {
    pdfFontsStr += `${String(font.name)} ${String(font.objN)} 0 R\n`;
  }

  let resourceDictObjStr = `${String(resourceObjIndex)} 0 obj\n<<`;

  resourceDictObjStr += `/Font<<${pdfFontsStr}>>`;

  if (hasImages) {
    resourceDictObjStr += createImageResourceDict(imageObjIndices);
  }

  // Use `GSO` prefix to avoid conflicts with other graphics states, which are normally named `/GS[n]` by convention.
  resourceDictObjStr += '/ExtGState<<';
  resourceDictObjStr += '/GSO0 <</ca 0.0>>';
  resourceDictObjStr += `/GSO1 <</ca ${proofOpacity}>>`;
  resourceDictObjStr += '>>';

  resourceDictObjStr += '>>\nendobj\n\n';

  pageObjStr += `/Resources ${String(resourceObjIndex)} 0 R/Parent ${parentIndex} 0 R>>\nendobj\n\n`;

  // The newline written before `endstream` is not counted in `/Length`.
  const streamLength = imageContentObjStr.length + textContentObjStr.length;
  const pageContentObjStr = `${String(contentObjIndex)} 0 obj\n<</Length ${String(streamLength)} >>\nstream\n${imageContentObjStr}${textContentObjStr}\nendstream\nendobj\n\n`;

  return {
    pdfObj: [pageObjStr, resourceDictObjStr, pageContentObjStr], pdfFontsUsed,
  };
}
|
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
// Function for converting from bufferArray to hex (string)
|
|
2
2
|
// Taken from https://stackoverflow.com/questions/40031688/javascript-arraybuffer-to-hex
|
|
3
3
|
|
|
4
|
-
import { win1252Chars } from '
|
|
5
|
-
import { determineSansSerif } from '
|
|
4
|
+
import { win1252Chars } from '../../../fonts/encoding.js';
|
|
5
|
+
import { determineSansSerif } from '../../utils/miscUtils.js';
|
|
6
6
|
|
|
7
7
|
/** @type {Array<string>} */
|
|
8
8
|
const byteToHex = [];
|
|
@@ -15,17 +15,18 @@ for (let n = 0; n <= 0xff; ++n) {
|
|
|
15
15
|
/**
 * Converts an ArrayBuffer to a hexadecimal string.
 * The output is broken into rows of 32 bytes (64 hex digits) separated by a line feed,
 * with no trailing line feed.
 *
 * @param {ArrayBufferLike} arrayBuffer - The ArrayBuffer to be converted.
 * @returns {string} The hexadecimal representation of the ArrayBuffer.
 */
export function hex(arrayBuffer) {
  const bytes = new Uint8Array(arrayBuffer);
  const BYTES_PER_ROW = 32;

  const rows = [];
  for (let rowStart = 0; rowStart < bytes.length; rowStart += BYTES_PER_ROW) {
    const rowEnd = Math.min(rowStart + BYTES_PER_ROW, bytes.length);
    let row = '';
    for (let k = rowStart; k < rowEnd; k++) {
      row += byteToHex[bytes[k]];
    }
    rows.push(row);
  }

  return rows.join('\n');
}
|
|
30
31
|
|
|
31
32
|
/**
|
|
@@ -248,16 +249,19 @@ export function createEmbeddedFontType1(font, firstObjIndex, italic = false, isS
|
|
|
248
249
|
* Converts a Opentype.js font object into an array of strings for adding to a PDF.
|
|
249
250
|
* The font is represented as a composite "Type 0" font.
|
|
250
251
|
*
|
|
251
|
-
* @param {
|
|
252
|
-
* @param {
|
|
253
|
-
* @param {
|
|
252
|
+
* @param {Object} options - Configuration object
|
|
253
|
+
* @param {opentype.Font} options.font - Opentype.js font object
|
|
254
|
+
* @param {number} options.firstObjIndex - Index for the first PDF object
|
|
255
|
+
* @param {boolean} [options.italic=false] - Whether the font is italic.
|
|
254
256
|
*
|
|
255
257
|
* This function does not produce "toUnicode" or "Widths" objects,
|
|
256
258
|
* so any PDF it creates directly will lack usable copy/paste.
|
|
257
259
|
* However, both of these objects will be created from the embedded file
|
|
258
260
|
* when the result is run through mupdf.
|
|
259
261
|
*/
|
|
260
|
-
export function createEmbeddedFontType0(
|
|
262
|
+
export function createEmbeddedFontType0({
|
|
263
|
+
font, firstObjIndex, italic = false,
|
|
264
|
+
}) {
|
|
261
265
|
// Start 1st object: Font Dictionary
|
|
262
266
|
let fontDictObjStr = `${String(firstObjIndex)} 0 obj\n<</Type/Font/Subtype/Type0`;
|
|
263
267
|
|
|
@@ -0,0 +1,218 @@
|
|
|
1
|
+
/* eslint-disable no-bitwise */
|
|
2
|
+
import { imageUtils } from '../../objects/imageObjects.js';
|
|
3
|
+
import { base64ToBytes, getPngIHDRInfo } from '../../utils/imageUtils.js';
|
|
4
|
+
import { hex } from './writePdfFonts.js';
|
|
5
|
+
|
|
6
|
+
/**
 * Extracts the concatenated data from all IDAT chunks of a PNG file.
 *
 * Chunks are scanned in order until an IEND chunk or the end of the data is reached.
 * CRC values are skipped, not verified.
 * If no IDAT chunk is found, a warning is logged and the input bytes are returned unchanged.
 *
 * @param {Uint8Array} pngBytes - The raw bytes of the PNG file.
 * @returns {Uint8Array} The concatenated zlib-compressed image data.
 * @throws {Error} If the data does not start with the PNG signature.
 */
function extractPngIdatData(pngBytes) {
  // PNG signature
  const signature = [137, 80, 78, 71, 13, 10, 26, 10];
  if (signature.some((byte, i) => pngBytes[i] !== byte)) {
    throw new Error('Invalid PNG file signature');
  }

  // Every chunk starts with a 4-byte length followed by a 4-byte type.
  const CHUNK_HEADER_SIZE = 8;
  const CRC_SIZE = 4;

  /** @type {Uint8Array[]} */
  const idatChunks = [];
  let offset = signature.length;

  // Stop as soon as a complete chunk header no longer fits in the remaining data.
  while (offset + CHUNK_HEADER_SIZE <= pngBytes.length) {
    // Read chunk length directly from bytes (big-endian).
    // `>>> 0` keeps the value unsigned; a negative length would move the offset backwards
    // and could make this loop run forever on malformed input.
    const length = ((pngBytes[offset] << 24)
      | (pngBytes[offset + 1] << 16)
      | (pngBytes[offset + 2] << 8)
      | pngBytes[offset + 3]) >>> 0;

    const type = String.fromCharCode(
      pngBytes[offset + 4],
      pngBytes[offset + 5],
      pngBytes[offset + 6],
      pngBytes[offset + 7],
    );

    const dataStart = offset + CHUNK_HEADER_SIZE;

    if (type === 'IDAT') {
      // `subarray` clamps to the end of the data, so a truncated final chunk yields what is available.
      idatChunks.push(pngBytes.subarray(dataStart, dataStart + length));
    } else if (type === 'IEND') {
      break;
    }

    offset = dataStart + length + CRC_SIZE; // Skip data and CRC
  }

  if (idatChunks.length === 0) {
    console.warn('No IDAT chunks found in PNG image.');
    return pngBytes; // Fallback if no IDAT chunks are found
  }

  const totalLength = idatChunks.reduce((acc, chunk) => acc + chunk.length, 0);
  const concatenated = new Uint8Array(totalLength);
  let currentOffset = 0;
  for (const chunk of idatChunks) {
    concatenated.set(chunk, currentOffset);
    currentOffset += chunk.length;
  }

  return concatenated;
}
|
|
63
|
+
|
|
64
|
+
/**
 * Reads the number of colour components from the frame header (SOFn segment) of a JPEG file.
 *
 * @param {Uint8Array} bytes - The raw bytes of the JPEG file.
 * @returns {?number} The component count, or `null` if no frame header could be found.
 */
function readJpegComponentCount(bytes) {
  // A JPEG file starts with the SOI marker (FF D8).
  if (bytes[0] !== 0xff || bytes[1] !== 0xd8) return null;

  let pos = 2;
  while (pos + 4 <= bytes.length) {
    const marker = bytes[pos + 1];
    if (bytes[pos] !== 0xff || marker === 0xff) {
      // Not at a marker (or at a fill byte): advance one byte and try again.
      pos += 1;
    } else if (marker === 0x01 || (marker >= 0xd0 && marker <= 0xd8)) {
      // TEM, RSTn and SOI carry no length field.
      pos += 2;
    } else if (marker === 0xd9 || marker === 0xda) {
      // End of image, or start of scan data, reached without a frame header.
      return null;
    } else if (marker >= 0xc0 && marker <= 0xcf && marker !== 0xc4 && marker !== 0xc8 && marker !== 0xcc) {
      // SOFn layout: marker (2), length (2), precision (1), height (2), width (2), component count (1).
      return pos + 9 < bytes.length ? bytes[pos + 9] : null;
    } else {
      // The segment length includes its own two bytes but not the marker.
      pos += 2 + ((bytes[pos + 2] << 8) | bytes[pos + 3]);
    }
  }

  return null;
}

/**
 * Creates PDF XObject for an .jpeg image
 *
 * The JPEG file is embedded unchanged (hex-encoded) and decoded by the PDF reader.
 * Single-component (grayscale) files are declared as `/DeviceGray`; everything else is declared as `/DeviceRGB`.
 * NOTE(review): 4-component (CMYK) JPEGs are still declared as `/DeviceRGB` — not handled here.
 *
 * @param {number} objIndex - PDF object index
 * @param {ArrayBufferLike} imageData - Raw image data
 * @param {number} width - Image width
 * @param {number} height - Image height
 * @returns {string} PDF XObject string
 */
const createImageXObjectJpeg = (objIndex, imageData, width, height) => {
  const imageBytes = new Uint8Array(imageData);

  // For JPEG, we can use the raw JPEG data directly
  const imageHexStr = hex(imageBytes.buffer);

  // The colour space must have as many components as the JPEG data.
  const colorSpace = readJpegComponentCount(imageBytes) === 1 ? '/DeviceGray' : '/DeviceRGB';

  return [
    `${String(objIndex)} 0 obj\n`,
    '<</Type /XObject\n',
    '/Subtype /Image\n',
    `/Width ${String(width)}\n`,
    `/Height ${String(height)}\n`,
    `/ColorSpace ${colorSpace}\n`,
    '/BitsPerComponent 8\n',
    '/Filter [ /ASCIIHexDecode /DCTDecode ]\n',
    `/Length ${String(imageHexStr.length)}\n`,
    '>>\nstream\n',
    `${imageHexStr}\n`,
    'endstream\nendobj\n\n',
  ].join('');
};
|
|
92
|
+
|
|
93
|
+
/**
 * Creates PDF XObject for an .png image
 *
 * The compressed IDAT data is embedded as-is (hex-encoded) and decoded by the PDF reader using
 * `/FlateDecode` with a PNG predictor, so the predictor parameters must describe the PNG rows exactly.
 *
 * Missing palette support (colorType 3): such images fall back to the RGB defaults.
 * NOTE(review): for color types with alpha (4 and 6) the predictor is given the alpha channel (`/Colors` 2 or 4),
 * while the declared colour space has 1 or 3 components — these images likely do not render correctly; confirm.
 *
 * @param {number} objIndex - PDF object index
 * @param {ArrayBufferLike} imageData - Raw image data
 * @returns {string} PDF XObject string
 */
const createImageXObjectPng = (objIndex, imageData) => {
  const imageBytes = new Uint8Array(imageData);

  // For PNG, extract IDAT data and get header info
  const imageDataOutput = extractPngIdatData(imageBytes);
  const imageHexStr = hex(imageDataOutput.buffer);
  const ihdr = getPngIHDRInfo(imageBytes);

  const predictor = 15;

  // Samples per pixel and colour space for each supported PNG color type.
  const layoutByColorType = {
    0: { colors: 1, colorSpace: '/DeviceGray' },
    2: { colors: 3, colorSpace: '/DeviceRGB' },
    4: { colors: 2, colorSpace: '/DeviceGray' },
    6: { colors: 4, colorSpace: '/DeviceRGB' },
  };
  const { colors, colorSpace } = layoutByColorType[ihdr.colorType] ?? { colors: 3, colorSpace: '/DeviceRGB' };

  // The predictor's `/BitsPerComponent` defaults to 8, so it has to be stated explicitly;
  // otherwise rows of 1-, 2-, 4- or 16-bit images are decoded with the wrong length.
  const decodeParms = `/Predictor ${predictor} `
    + `/Colors ${colors} `
    + `/Columns ${String(ihdr.width)} `
    + `/BitsPerComponent ${ihdr.bitDepth} `;

  return [
    `${String(objIndex)} 0 obj\n`,
    '<</Type /XObject\n',
    '/Subtype /Image\n',
    // The first filter (`/ASCIIHexDecode`) takes no parameters, hence the leading `null`.
    `/DecodeParms [ null <<${decodeParms} >> ]\n`,
    `/Width ${String(ihdr.width)}\n`,
    `/Height ${String(ihdr.height)}\n`,
    `/ColorSpace ${colorSpace}\n`,
    `/BitsPerComponent ${ihdr.bitDepth}\n`,
    '/Filter [ /ASCIIHexDecode /FlateDecode ]\n',
    `/Length ${String(imageHexStr.length)}\n`,
    '>>\nstream\n',
    `${imageHexStr}\n`,
    'endstream\nendobj\n\n',
  ].join('');
};
|
|
147
|
+
|
|
148
|
+
/**
 * Creates PDF objects for multiple images
 *
 * Images are numbered consecutively: the image at position `k` becomes object `firstObjIndex + k`.
 * Images whose `format` is `'jpeg'` are written as JPEG XObjects; all others are written as PNG XObjects.
 *
 * @param {ImageWrapper[]} images - Array of image data
 * @param {number} firstObjIndex - Starting object index
 * @returns {string[]} One serialized XObject per image, in input order.
 */
export function createEmbeddedImages(images, firstObjIndex) {
  return images.map((image, position) => {
    const objIndex = firstObjIndex + position;
    const dims = imageUtils.getDims(image);
    const { buffer } = base64ToBytes(image.src);

    if (image.format === 'jpeg') {
      // The JPEG writer needs the dimensions; the PNG writer reads them from the file header.
      return createImageXObjectJpeg(objIndex, buffer, dims.width, dims.height);
    }
    return createImageXObjectPng(objIndex, buffer);
  });
}
|
|
172
|
+
|
|
173
|
+
/**
 * Creates a resource dictionary entry for images
 *
 * Each image is named `/Im[n]`, where `n` is its position in `imageObjIndices`.
 *
 * @param {Array<number>} imageObjIndices - Array of image object indices
 * @returns {string} Resource dictionary XObject entries, or an empty string when there are no images.
 */
export function createImageResourceDict(imageObjIndices) {
  if (imageObjIndices.length === 0) return '';

  const entries = imageObjIndices.map((objIndex, position) => `/Im${String(position)} ${String(objIndex)} 0 R\n`);

  return `/XObject<<${entries.join('')}>>`;
}
|
|
189
|
+
|
|
190
|
+
/**
 * Generates PDF drawing commands to place an image on a page with optional rotation
 *
 * The image is scaled to `width` x `height` and rotated about the centre of the
 * rectangle that starts at (`x`, `y`).
 *
 * @param {number} imageIndex - Index of the image (for /Im naming)
 * @param {number} x - X position
 * @param {number} y - Y position
 * @param {number} width - Display width
 * @param {number} height - Display height
 * @param {number} rotation - Rotation angle in degrees (default: 0)
 * @returns {string} PDF drawing commands
 */
export function drawImageCommands(imageIndex, x, y, width, height, rotation = 0) {
  const radians = (rotation * Math.PI) / 180;
  const cosR = Math.cos(radians);
  const sinR = Math.sin(radians);

  // Scale and rotation part of the transformation matrix.
  const a = width * cosR;
  const b = width * sinR;
  const c = -height * sinR;
  const d = height * cosR;

  // Translation that keeps the centre of the rectangle fixed under the rotation.
  const e = (x + width / 2) - (a + c) / 2;
  const f = (y + height / 2) - (b + d) / 2;

  const matrix = [a, b, c, d, e, f].join(' ');

  return `q\n${matrix} cm\n/Im${imageIndex} Do\nQ\n`;
}
|