scribe.js-ocr 0.7.4 → 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74) hide show
  1. package/build-deno-compile.sh +30 -0
  2. package/cli/cli.js +46 -18
  3. package/cli/detectPDFType.js +1 -2
  4. package/cli/extract.js +14 -7
  5. package/cli/main.js +39 -39
  6. package/cli/require.js +1 -1
  7. package/cli/scribe.js +12 -11
  8. package/fonts/Dingbats.woff +0 -0
  9. package/fonts/all/URWGothicBook-Bold.woff +0 -0
  10. package/fonts/all/URWGothicBook-BoldItalic.woff +0 -0
  11. package/fonts/all/URWGothicBook-Italic.woff +0 -0
  12. package/fonts/all/URWGothicBook-Regular.woff +0 -0
  13. package/fonts/latin/URWGothicBook-Bold.woff +0 -0
  14. package/fonts/latin/URWGothicBook-BoldItalic.woff +0 -0
  15. package/fonts/latin/URWGothicBook-Italic.woff +0 -0
  16. package/fonts/latin/URWGothicBook-Regular.woff +0 -0
  17. package/js/canvasAdapter.js +4 -1
  18. package/js/clear.js +7 -8
  19. package/js/containers/app.js +2 -0
  20. package/js/containers/dataContainer.js +1 -4
  21. package/js/containers/fontContainer.js +59 -44
  22. package/js/containers/imageContainer.js +13 -35
  23. package/js/coordinates.js +3 -3
  24. package/js/debug.js +2 -2
  25. package/js/export/export.js +103 -18
  26. package/js/export/exportDebugCsv.js +4 -3
  27. package/js/export/pdf/writePdf.js +389 -0
  28. package/js/export/{writePdfFonts.js → pdf/writePdfFonts.js} +16 -12
  29. package/js/export/pdf/writePdfImages.js +218 -0
  30. package/js/export/{writePdf.js → pdf/writePdfText.js} +28 -315
  31. package/js/export/writeDocx.js +12 -5
  32. package/js/export/writeHocr.js +11 -10
  33. package/js/export/writeHtml.js +208 -48
  34. package/js/export/writeTabular.js +31 -20
  35. package/js/export/writeText.js +12 -10
  36. package/js/fontContainerMain.js +101 -50
  37. package/js/fontEval.js +18 -14
  38. package/js/fontStatistics.js +90 -90
  39. package/js/generalWorkerMain.js +52 -6
  40. package/js/global.d.ts +178 -6
  41. package/js/import/convertDocTextract.js +447 -0
  42. package/js/import/convertPageAbbyy.js +10 -4
  43. package/js/import/convertPageBlocks.js +4 -4
  44. package/js/import/convertPageGoogleVision.js +204 -0
  45. package/js/import/convertPageHocr.js +3 -3
  46. package/js/import/convertPageShared.js +1 -0
  47. package/js/import/convertPageStext.js +18 -10
  48. package/js/import/convertPageText.js +289 -0
  49. package/js/import/import.js +133 -125
  50. package/js/import/importOCR.js +98 -46
  51. package/js/import/nodeAdapter.js +2 -2
  52. package/js/modifyOCR.js +6 -5
  53. package/js/nudge.js +3 -3
  54. package/js/objects/{fontMetricsObjects.js → charMetricsObjects.js} +12 -12
  55. package/js/objects/imageObjects.js +3 -2
  56. package/js/objects/layoutObjects.js +37 -0
  57. package/js/objects/ocrObjects.js +51 -3
  58. package/js/recognizeConvert.js +74 -23
  59. package/js/utils/fontUtils.js +32 -1
  60. package/js/utils/imageUtils.js +99 -0
  61. package/js/utils/miscUtils.js +158 -9
  62. package/js/utils/reflowPars.js +4 -0
  63. package/js/worker/compareOCRModule.js +20 -18
  64. package/js/worker/generalWorker.js +12 -6
  65. package/js/worker/optimizeFontModule.js +19 -19
  66. package/mupdf/libmupdf.js +3 -3
  67. package/mupdf/libmupdf.wasm +0 -0
  68. package/mupdf/mupdf-async.js +1 -1
  69. package/mupdf/mupdf-worker.js +9 -4
  70. package/package.json +7 -4
  71. package/scribe.js +5 -5
  72. package/tess/tesseract.esm.min.js +1 -1
  73. package/tess/tesseract.min.js +1 -1
  74. package/tess/worker.min.js +1 -1
@@ -0,0 +1,389 @@
1
+ import { FontCont } from '../../containers/fontContainer.js';
2
+
3
+ import { createEmbeddedFontType0, createEmbeddedFontType1 } from './writePdfFonts.js';
4
+ import { createEmbeddedImages, createImageResourceDict, drawImageCommands } from './writePdfImages.js';
5
+
6
+ import { opt } from '../../containers/app.js';
7
+ import { ocrPageToPDFStream } from './writePdfText.js';
8
+ import { getDistinctCharsFont, subsetFont } from '../../utils/fontUtils.js';
9
+
10
+ // Creates 3 PDF objects necessary to embed font.
11
+ // These are (1) the font dictionary, (2) the font descriptor, and (3) the font file,
12
+ // which will be located at objects firstObjIndex, firstObjIndex + 1, and firstObjIndex + 2 (respectively).
13
+
14
/**
 * Create a PDF from an array of ocrPage objects.
 *
 * The output is assembled as one string in this order: header, catalog (object 1),
 * page tree (object 2), font objects, image objects, per-page objects, cross-reference
 * table and trailer. Object numbers are reserved for every known font up front, but only
 * fonts reported as used by at least one page are actually written; the reserved numbers
 * of unused fonts are listed as free entries in the cross-reference table.
 *
 * @param {Object} params
 * @param {Array<OcrPage>} params.ocrArr - OCR pages; `ocrArr[i]` is paired with `pageMetricsArr[i]`.
 * @param {PageMetrics[]} params.pageMetricsArr - Per-page metrics; `angle` and `dims` are read for every exported page.
 * @param {number} [params.minpage=0] - Index of the first page to export (inclusive).
 * @param {number} [params.maxpage=-1] - Index of the last page to export (inclusive). `-1` means `ocrArr.length - 1`.
 * @param {("ebook"|"eval"|"proof"|"invis")} [params.textMode="ebook"] - Forwarded to the text stream writer.
 * @param {boolean} [params.rotateText=false] - Forwarded to the text stream writer.
 * @param {boolean} [params.rotateBackground=false] - Forwarded to the page writer; also decides whether the background image is rotated.
 * @param {dims} [params.dimsLimit] - Output page dimensions. A width below 1 makes each page use its own dimensions.
 * @param {number} [params.confThreshHigh=85] - Forwarded to the text stream writer.
 * @param {number} [params.confThreshMed=75] - Forwarded to the text stream writer.
 * @param {number} [params.proofOpacity=0.8] - Written as the `/ca` value of the `/GSO1` graphics state of every page.
 * @param {Array<ImageWrapper>} [params.images=[]] - Array of images to include in PDF
 * @param {boolean} [params.includeImages=false] - Whether to include images in the PDF
 * @returns {Promise<string>} The complete PDF file as a string.
 * @throws {Error} If no fonts are loaded, or if the resolved `maxpage` is negative.
 *
 * A valid PDF will be created if an empty array is provided for `ocrArr`, as long as `maxpage` is set manually.
 */
export async function writePdf({
  ocrArr,
  pageMetricsArr,
  minpage = 0,
  maxpage = -1,
  textMode = 'ebook',
  rotateText = false,
  rotateBackground = false,
  dimsLimit = { width: -1, height: -1 },
  confThreshHigh = 85,
  confThreshMed = 75,
  proofOpacity = 0.8,
  images = [],
  includeImages = false,
}) {
  if (!FontCont.raw) throw new Error('No fonts loaded.');

  if (maxpage === -1) {
    maxpage = ocrArr.length - 1;
  }

  // This can happen if (1) `ocrArr` is length 0 and (2) `maxpage` is left as the default (-1).
  if (maxpage < 0) throw new Error('PDF with negative page count requested.');

  // Number of object numbers reserved for every composite (Type 0) font.
  // The cross-reference table and the object count below both rely on this value.
  const objectsPerFont = 6;

  let fontI = 0;
  // Objects 1 and 2 are the catalog and the page tree, so everything else is numbered from 3.
  let objectI = 3;
  /** @type {Object<string, PdfFontFamily>} */
  const pdfFonts = {};
  /** @type {{familyKey: string, key: string}[]} */
  const pdfFontRefs = [];
  // One slot per entry of `pdfFontRefs`; stays `null` until the font turns out to be used.
  /** @type {string[][]} */
  const pdfFontObjStrArr = [];
  /** @type {Set<PdfFontInfo>} */
  const pdfFontsUsed = new Set();

  /**
   * Registers every style of a font family in `pdfFonts` and reserves object numbers for it.
   * No PDF objects are generated here.
   *
   * @param {string} familyKey
   * @param {FontContainerFamily} familyObj
   */
  const addFontFamilyRef = async (familyKey, familyObj) => {
    pdfFonts[familyKey] = {};
    for (const [key, value] of Object.entries(familyObj)) {
      // This should include both (1) if this is a standard 14 font and (2) if characters outside of the Windows-1252 range are used.
      // If the latter is true, then a composite font is needed, even if the font is a standard 14 font.
      // TODO: We currently have no mechanism for resolving name conflicts between fonts in the base and overlay document.
      // As a workaround, we use the names `/FO[n]` rather than the more common `/F[n]`.
      // However, this likely will cause issues if this application is used to create visible text, and then the resulting PDF is uploaded.
      // This would move the fonts from the overlay document to the base document, and the names would conflict.
      const isStandardFont = false;
      if (isStandardFont) {
        // NOTE(review): this branch is currently unreachable. It reserves 3 objects, while the
        // cross-reference code below assumes 6 per font, so both must change together.
        pdfFonts[familyKey][key] = {
          type: 1, index: fontI, name: `/FO${String(fontI)}`, objN: objectI, opentype: value.opentype,
        };
        pdfFontRefs.push({ familyKey, key });
        pdfFontObjStrArr.push(null);
        objectI += 3;
      } else {
        pdfFonts[familyKey][key] = {
          type: 0, index: fontI, name: `/FO${String(fontI)}`, objN: objectI, opentype: value.opentype,
        };
        pdfFontRefs.push({ familyKey, key });
        pdfFontObjStrArr.push(null);
        objectI += objectsPerFont;
      }
      fontI++;
    }
  };

  // Create reference to all fonts.
  // Only the fonts that are actually used will be included in the final PDF.
  for (const familyKeyI of Object.keys(FontCont.raw)) {
    // For each style, the optimized font is preferred when enabled and available; otherwise the raw font is used.
    const useOpt = FontCont.useOptFamily(familyKeyI);
    const familyObjI = {
      normal: useOpt && FontCont.opt?.[familyKeyI]?.normal ? FontCont.opt[familyKeyI].normal : FontCont.raw[familyKeyI].normal,
      italic: useOpt && FontCont.opt?.[familyKeyI]?.italic ? FontCont.opt[familyKeyI].italic : FontCont.raw[familyKeyI].italic,
      bold: useOpt && FontCont.opt?.[familyKeyI]?.bold ? FontCont.opt[familyKeyI].bold : FontCont.raw[familyKeyI].bold,
      boldItalic: useOpt && FontCont.opt?.[familyKeyI]?.boldItalic ? FontCont.opt[familyKeyI].boldItalic : FontCont.raw[familyKeyI].boldItalic,
    };
    await addFontFamilyRef(familyKeyI, familyObjI);
  }

  if (FontCont.doc) {
    for (const familyKeyI of Object.keys(FontCont.doc)) {
      await addFontFamilyRef(familyKeyI, FontCont.doc[familyKeyI]);
    }
  }

  if (FontCont.supp.chi_sim) {
    const charArr = getDistinctCharsFont(ocrArr, FontCont.supp.chi_sim.family);

    // The supplementary font is only registered when its characters actually occur, and only as a subset.
    if (charArr.length > 0) {
      const fontExport = await subsetFont(FontCont.supp.chi_sim.opentype, charArr);

      pdfFonts.NotoSansSC = {};
      pdfFonts.NotoSansSC.normal = {
        type: 0, index: fontI, name: `/FO${String(fontI)}`, objN: objectI, opentype: fontExport,
      };
      pdfFontRefs.push({ familyKey: 'NotoSansSC', key: 'normal' });
      pdfFontObjStrArr.push(null);
      objectI += objectsPerFont;
      fontI++;
    }
  }

  // Add images [WIP]
  /** @type {Array<string>} */
  const pdfImageObjStrArr = [];
  const imageObjIndices = [];

  if (includeImages && images && images.length > 0) {
    const imageObjects = createEmbeddedImages(images, objectI);
    for (let i = 0; i < imageObjects.length; i++) {
      pdfImageObjStrArr.push(imageObjects[i]);
      imageObjIndices.push(objectI + i);
    }
    objectI += imageObjects.length;
  }

  /** @type {Array<string>} */
  const pdfPageObjStrArr = [];

  // Add pages
  const pageIndexArr = [];
  for (let i = minpage; i <= maxpage; i++) {
    const angle = pageMetricsArr[i].angle || 0;
    const { dims } = pageMetricsArr[i];

    // eslint-disable-next-line no-await-in-loop
    const { pdfObj, pdfFontsUsed: pdfFontsUsedI } = (await ocrPageToPDF({
      pageObj: ocrArr[i],
      inputDims: dims,
      outputDims: dimsLimit,
      firstObjIndex: objectI,
      parentIndex: 2,
      proofOpacity,
      pdfFonts,
      textMode,
      angle,
      rotateText,
      rotateBackground,
      confThreshHigh,
      confThreshMed,
      imageObjIndices,
      includeImages,
    }));

    for (const font of pdfFontsUsedI) {
      pdfFontsUsed.add(font);
    }

    for (let j = 0; j < pdfObj.length; j++) {
      pdfPageObjStrArr.push(pdfObj[j]);
    }

    // This assumes the "page" is always the first object returned by `ocrPageToPDF`.
    pageIndexArr.push(objectI);

    objectI += pdfObj.length;

    opt.progressHandler({ n: i, type: 'export', info: { } });
  }

  // Create font objects for fonts that are used
  for (const pdfFont of pdfFontsUsed) {
    // NOTE(review): fonts whose PostScript name is 'NotoSansSC-Regular' never get font objects here,
    // so their reserved object numbers stay free. Confirm this is intended.
    if (pdfFont.opentype?.names?.postScriptName?.en === 'NotoSansSC-Regular') continue;
    const isStandardFont = false;
    if (isStandardFont) {
      pdfFontObjStrArr[pdfFont.index] = createEmbeddedFontType1(pdfFont.opentype, pdfFont.objN);
    } else {
      pdfFontObjStrArr[pdfFont.index] = createEmbeddedFontType0({ font: pdfFont.opentype, firstObjIndex: pdfFont.objN });
    }
  }

  /** @type {Array<string>} */
  const pdfObjStrArr = [];

  let pdfOut = '%PDF-1.7\n%µ¶n\n';

  pdfObjStrArr.push('1 0 obj\n<</Type /Catalog\n/Pages 2 0 R>>\nendobj\n\n');

  let pagesObjStr = '2 0 obj\n<</Type /Pages\n/Kids [';
  for (let i = 0; i < (maxpage - minpage + 1); i++) {
    pagesObjStr += `${String(pageIndexArr[i])} 0 R\n`;
  }
  pagesObjStr += `]\n/Count ${String(maxpage - minpage + 1)}>>\nendobj\n\n`;

  pdfObjStrArr.push(pagesObjStr);

  // Offsets are `pdfOut.length + 2` throughout.
  // NOTE(review): the `+ 2` presumably compensates for the two non-ASCII characters in the header
  // comment, which take one extra byte each once the string is encoded as UTF-8. This only holds
  // if every other character written is ASCII - confirm against the caller that encodes the string.
  /** @type {{type: string, offset: number}[]} */
  const xrefArr = [];
  for (let i = 0; i < pdfObjStrArr.length; i++) {
    xrefArr.push({ type: 'obj', offset: pdfOut.length + 2 });
    pdfOut += pdfObjStrArr[i];
  }

  for (let i = 0; i < pdfFontRefs.length; i++) {
    if (pdfFontObjStrArr[i]) {
      for (let j = 0; j < pdfFontObjStrArr[i].length; j++) {
        xrefArr.push({ type: 'obj', offset: pdfOut.length + 2 });
        pdfOut += pdfFontObjStrArr[i][j];
      }
    } else {
      // Unused font: its reserved object numbers are listed as free so later numbering stays aligned.
      for (let j = 0; j < objectsPerFont; j++) {
        xrefArr.push({ type: 'free', offset: 0 });
      }
    }
  }

  for (let i = 0; i < pdfImageObjStrArr.length; i++) {
    xrefArr.push({ type: 'obj', offset: pdfOut.length + 2 });
    pdfOut += pdfImageObjStrArr[i];
  }

  for (let i = 0; i < pdfPageObjStrArr.length; i++) {
    xrefArr.push({ type: 'obj', offset: pdfOut.length + 2 });
    pdfOut += pdfPageObjStrArr[i];
  }

  // The 0th object always exists, and contains no meaningful data.
  const objCount = pdfObjStrArr.length + pdfFontRefs.length * objectsPerFont + pdfImageObjStrArr.length + pdfPageObjStrArr.length + 1;

  const xrefOffset = pdfOut.length + 2;

  // Every cross-reference entry must be exactly 20 bytes including its end-of-line sequence,
  // so a single-character line feed has to be preceded by a space.
  const xrefFreeEntry = '0000000000 65535 f \n';

  let xrefStr = `xref\n0 ${objCount}\n`;

  xrefStr += xrefFreeEntry;

  for (let i = 0; i < xrefArr.length; i++) {
    if (xrefArr[i].type === 'obj') {
      xrefStr += `${String(xrefArr[i].offset).padStart(10, '0')} 00000 n \n`;
    } else {
      xrefStr += xrefFreeEntry;
    }
  }

  xrefStr += `trailer\n<< /Root 1 0 R\n/Size ${objCount}\n>>\nstartxref\n${xrefOffset}\n%%EOF`;

  pdfOut += xrefStr;

  return pdfOut;
}
285
+
286
/**
 * Generates the PDF objects for a single page.
 *
 * When the page has neither text nor images, a single page object with an empty resource
 * dictionary is returned. Otherwise three objects are returned, numbered consecutively from
 * `firstObjIndex`: the page object, its resource dictionary, and its content stream
 * (image drawing commands followed by the text stream).
 *
 * @param {Object} params - Parameters object
 * @param {OcrPage} params.pageObj - OCR data for the page; may be missing for pages without text.
 * @param {dims} params.inputDims - Dimensions used when `outputDims.width` is below 1.
 * @param {dims} params.outputDims - Requested page dimensions, written to `/MediaBox`.
 * @param {number} params.firstObjIndex - Object number given to the page object.
 * @param {number} params.parentIndex - Object number of the parent page tree.
 * @param {number} params.proofOpacity - Written as the `/ca` value of the `/GSO1` graphics state.
 * @param {Object<string, PdfFontFamily>} params.pdfFonts - Forwarded to the text stream writer.
 * @param {("ebook"|"eval"|"proof"|"invis")} params.textMode - Forwarded to the text stream writer.
 * @param {number} params.angle - Page angle; used as the image rotation when `rotateBackground` is set.
 * @param {boolean} [params.rotateText=false]
 * @param {boolean} [params.rotateBackground=false]
 * @param {number} [params.confThreshHigh=85]
 * @param {number} [params.confThreshMed=75]
 * @param {Array<number>} [params.imageObjIndices=[]] - Array of image object indices
 * @param {boolean} [params.includeImages=false] - Whether to include images
 * @returns {Promise<{pdfObj: string[], pdfFontsUsed: Set<PdfFontInfo>}>}
 */
async function ocrPageToPDF({
  pageObj,
  inputDims,
  outputDims,
  firstObjIndex,
  parentIndex,
  proofOpacity,
  pdfFonts,
  textMode,
  angle,
  rotateText = false,
  rotateBackground = false,
  confThreshHigh = 85,
  confThreshMed = 75,
  imageObjIndices = [],
  includeImages = false,
}) {
  // A width below 1 means "no explicit output size": fall back to the page's own dimensions.
  const pageDims = outputDims.width < 1 ? inputDims : outputDims;

  const hasText = Boolean(pageObj) && pageObj.lines.length !== 0;
  const hasImages = includeImages && imageObjIndices.length > 0;

  const resourceObjIndex = firstObjIndex + 1;
  const contentObjIndex = firstObjIndex + 2;
  const parentRefStr = `/Parent ${parentIndex} 0 R>>\nendobj\n\n`;

  const pageHeaderStr = `${String(firstObjIndex)} 0 obj\n<</Type/Page/MediaBox[0 0 ${String(pageDims.width)} ${String(pageDims.height)}]`;

  // Blank page: only the page object itself is emitted.
  if (!hasText && !hasImages) {
    return {
      pdfObj: [`${pageHeaderStr}/Resources<<>>${parentRefStr}`],
      pdfFontsUsed: /** @type {Set<PdfFontInfo>} */ (new Set()),
    };
  }

  // NOTE(review): the image index is always 0, so every page draws `/Im0` regardless of which
  // page it is - confirm whether each page should select its own image.
  let imageContentObjStr = '';
  if (hasImages) {
    // Angles at or below 0.05 in magnitude are treated as "no rotation".
    const rotation = rotateBackground && Math.abs(angle ?? 0) > 0.05 ? angle : 0;
    imageContentObjStr += drawImageCommands(0, 0, 0, pageDims.width, pageDims.height, rotation);
  }

  // NOTE(review): this is reached with a missing `pageObj` when a page has images but no OCR data;
  // verify that `ocrPageToPDFStream` accepts that.
  const { textContentObjStr, pdfFontsUsed } = await ocrPageToPDFStream(pageObj, pageDims, pdfFonts, textMode, angle,
    rotateText, rotateBackground, confThreshHigh, confThreshMed);

  const pdfFontsStr = Array.from(pdfFontsUsed, (font) => `${String(font.name)} ${String(font.objN)} 0 R\n`).join('');

  const resourceParts = [
    `${String(resourceObjIndex)} 0 obj\n<<`,
    `/Font<<${pdfFontsStr}>>`,
  ];

  if (hasImages) {
    resourceParts.push(createImageResourceDict(imageObjIndices));
  }

  // Use `GSO` prefix to avoid conflicts with other graphics states, which are normally named `/GS[n]` by convention.
  resourceParts.push(
    '/ExtGState<<',
    '/GSO0 <</ca 0.0>>',
    `/GSO1 <</ca ${proofOpacity}>>`,
    '>>',
    '>>\nendobj\n\n',
  );
  const resourceDictObjStr = resourceParts.join('');

  const pageObjStr = `${pageHeaderStr}/Contents ${String(contentObjIndex)} 0 R/Resources ${String(resourceObjIndex)} 0 R${parentRefStr}`;

  const streamLength = imageContentObjStr.length + textContentObjStr.length;
  const pageContentObjStr = `${String(contentObjIndex)} 0 obj\n<</Length ${String(streamLength)} >>\nstream\n${imageContentObjStr}${textContentObjStr}\nendstream\nendobj\n\n`;

  return {
    pdfObj: [pageObjStr, resourceDictObjStr, pageContentObjStr], pdfFontsUsed,
  };
}
@@ -1,8 +1,8 @@
1
1
  // Function for converting from bufferArray to hex (string)
2
2
  // Taken from https://stackoverflow.com/questions/40031688/javascript-arraybuffer-to-hex
3
3
 
4
- import { win1252Chars } from '../../fonts/encoding.js';
5
- import { determineSansSerif } from '../utils/miscUtils.js';
4
+ import { win1252Chars } from '../../../fonts/encoding.js';
5
+ import { determineSansSerif } from '../../utils/miscUtils.js';
6
6
 
7
7
  /** @type {Array<string>} */
8
8
  const byteToHex = [];
@@ -15,17 +15,18 @@ for (let n = 0; n <= 0xff; ++n) {
15
15
  /**
16
16
  * Converts an ArrayBuffer to a hexadecimal string.
17
17
  *
18
- * @param {ArrayBuffer} arrayBuffer - The ArrayBuffer to be converted.
18
+ * @param {ArrayBufferLike} arrayBuffer - The ArrayBuffer to be converted.
19
19
  * @returns {string} The hexadecimal representation of the ArrayBuffer.
20
20
  */
21
21
  export function hex(arrayBuffer) {
22
22
  const buff = new Uint8Array(arrayBuffer);
23
- /** @type {Array<string>} */
24
- const hexOctets = []; // new Array(buff.length) is even faster (preallocates necessary array size), then use hexOctets[i] instead of .push()
25
-
26
- for (let i = 0; i < buff.length; ++i) hexOctets.push(byteToHex[buff[i]]);
23
+ let hexOctets = '';
24
+ for (let i = 0; i < buff.length; ++i) {
25
+ if (i % 32 === 0 && i !== 0) hexOctets += '\n';
26
+ hexOctets += byteToHex[buff[i]];
27
+ }
27
28
 
28
- return hexOctets.join('');
29
+ return hexOctets;
29
30
  }
30
31
 
31
32
  /**
@@ -248,16 +249,19 @@ export function createEmbeddedFontType1(font, firstObjIndex, italic = false, isS
248
249
  * Converts a Opentype.js font object into an array of strings for adding to a PDF.
249
250
  * The font is represented as a composite "Type 0" font.
250
251
  *
251
- * @param {opentype.Font} font - Opentype.js font object
252
- * @param {number} firstObjIndex - Index for the first PDF object
253
- * @param {boolean} [italic=false] - Whether the font is italic.
252
+ * @param {Object} options - Configuration object
253
+ * @param {opentype.Font} options.font - Opentype.js font object
254
+ * @param {number} options.firstObjIndex - Index for the first PDF object
255
+ * @param {boolean} [options.italic=false] - Whether the font is italic.
254
256
  *
255
257
  * This function does not produce "toUnicode" or "Widths" objects,
256
258
  * so any PDF it creates directly will lack usable copy/paste.
257
259
  * However, both of these objects will be created from the embedded file
258
260
  * when the result is run through mupdf.
259
261
  */
260
- export function createEmbeddedFontType0(font, firstObjIndex, italic = false) {
262
+ export function createEmbeddedFontType0({
263
+ font, firstObjIndex, italic = false,
264
+ }) {
261
265
  // Start 1st object: Font Dictionary
262
266
  let fontDictObjStr = `${String(firstObjIndex)} 0 obj\n<</Type/Font/Subtype/Type0`;
263
267
 
@@ -0,0 +1,218 @@
1
+ /* eslint-disable no-bitwise */
2
+ import { imageUtils } from '../../objects/imageObjects.js';
3
+ import { base64ToBytes, getPngIHDRInfo } from '../../utils/imageUtils.js';
4
+ import { hex } from './writePdfFonts.js';
5
+
6
/**
 * Extracts the concatenated data from all IDAT chunks of a PNG file.
 *
 * Chunks are walked in file order until an IEND chunk or the end of the data is reached.
 * Chunk CRCs are skipped, not verified. IDAT data that runs past the end of the input is
 * included as far as it is available.
 *
 * @param {Uint8Array} pngBytes - The raw bytes of the PNG file.
 * @returns {Uint8Array} The concatenated zlib-compressed image data. If the file contains no
 *    IDAT chunk, a warning is logged and `pngBytes` itself is returned.
 * @throws {Error} If the data does not start with the PNG signature.
 */
function extractPngIdatData(pngBytes) {
  // PNG signature
  const signature = [137, 80, 78, 71, 13, 10, 26, 10];
  for (let i = 0; i < signature.length; i++) {
    if (pngBytes[i] !== signature[i]) {
      throw new Error('Invalid PNG file signature');
    }
  }

  let offset = signature.length;
  const idatChunks = [];

  // Every chunk is: 4-byte big-endian length, 4-byte type, `length` bytes of data, 4-byte CRC.
  while (offset < pngBytes.length) {
    // `>>> 0` reinterprets the result as unsigned. Without it a length with the high bit set
    // becomes negative, which can move `offset` backwards and loop forever on malformed input.
    const length = ((pngBytes[offset] << 24)
      | (pngBytes[offset + 1] << 16)
      | (pngBytes[offset + 2] << 8)
      | pngBytes[offset + 3]) >>> 0;
    offset += 4;

    const type = String.fromCharCode(
      pngBytes[offset],
      pngBytes[offset + 1],
      pngBytes[offset + 2],
      pngBytes[offset + 3],
    );
    offset += 4;

    if (type === 'IDAT') {
      idatChunks.push(pngBytes.subarray(offset, offset + length));
    } else if (type === 'IEND') {
      break;
    }

    offset += length + 4; // Skip data and CRC
  }

  if (idatChunks.length === 0) {
    console.warn('No IDAT chunks found in PNG image.');
    return pngBytes; // Fallback if no IDAT chunks are found
  }

  const totalLength = idatChunks.reduce((acc, chunk) => acc + chunk.length, 0);
  const concatenated = new Uint8Array(totalLength);
  let currentOffset = 0;
  for (const chunk of idatChunks) {
    concatenated.set(chunk, currentOffset);
    currentOffset += chunk.length;
  }

  return concatenated;
}
63
+
64
/**
 * Reads the number of color components from the first start-of-frame (SOF) segment of a JPEG.
 * @param {Uint8Array} jpegBytes - Raw bytes of the JPEG file.
 * @returns {number} The component count, or 0 if no frame header could be located.
 */
const getJpegComponentCount = (jpegBytes) => {
  // A JPEG must begin with the start-of-image marker (FF D8).
  if (jpegBytes[0] !== 0xff || jpegBytes[1] !== 0xd8) return 0;

  let offset = 2;
  while (offset + 4 <= jpegBytes.length) {
    if (jpegBytes[offset] !== 0xff) return 0;
    const marker = jpegBytes[offset + 1];

    if (marker === 0xff) {
      // Fill byte preceding a marker.
      offset += 1;
    } else if (marker === 0x01 || (marker >= 0xd0 && marker <= 0xd7)) {
      // Standalone markers carry no length field.
      offset += 2;
    } else if (marker === 0xd9 || marker === 0xda) {
      // End of image or start of scan reached without seeing a frame header.
      return 0;
    } else if (marker >= 0xc0 && marker <= 0xcf && marker !== 0xc4 && marker !== 0xc8 && marker !== 0xcc) {
      // SOF layout: marker(2) length(2) precision(1) height(2) width(2) components(1).
      return offset + 9 < jpegBytes.length ? jpegBytes[offset + 9] : 0;
    } else {
      const segmentLength = (jpegBytes[offset + 2] << 8) | jpegBytes[offset + 3];
      // The length field includes its own two bytes; anything smaller is corrupt.
      if (segmentLength < 2) return 0;
      offset += 2 + segmentLength;
    }
  }

  return 0;
};

/**
 * Creates PDF XObject for an .jpeg image
 *
 * The JPEG data is embedded unchanged (hex-encoded) and decoded by the PDF reader through
 * `/DCTDecode`. The color space is chosen from the number of components declared in the JPEG
 * frame header: 1 gives `/DeviceGray`, 4 gives `/DeviceCMYK`, anything else (including an
 * unreadable header) gives `/DeviceRGB`.
 * NOTE(review): 4-component JPEGs written by Adobe tools often store inverted values and may
 * additionally need a `/Decode` array; that case is not handled here.
 *
 * @param {number} objIndex - PDF object index
 * @param {ArrayBufferLike} imageData - Raw image data
 * @param {number} width - Image width
 * @param {number} height - Image height
 * @returns {string} PDF XObject string
 */
const createImageXObjectJpeg = (objIndex, imageData, width, height) => {
  const imageBytes = new Uint8Array(imageData);
  let objStr = `${String(objIndex)} 0 obj\n`;
  objStr += '<</Type /XObject\n';
  objStr += '/Subtype /Image\n';

  // For JPEG, we can use the raw JPEG data directly
  const imageHexStr = hex(imageBytes.buffer);

  // The declared color space has to match the component count of the JPEG data.
  const componentCount = getJpegComponentCount(imageBytes);
  let colorSpace = '/DeviceRGB';
  if (componentCount === 1) {
    colorSpace = '/DeviceGray';
  } else if (componentCount === 4) {
    colorSpace = '/DeviceCMYK';
  }

  objStr += `/Width ${String(width)}\n`;
  objStr += `/Height ${String(height)}\n`;
  objStr += `/ColorSpace ${colorSpace}\n`;
  objStr += '/BitsPerComponent 8\n';
  objStr += '/Filter [ /ASCIIHexDecode /DCTDecode ]\n';
  objStr += `/Length ${String(imageHexStr.length)}\n`;
  objStr += '>>\nstream\n';
  objStr += `${imageHexStr}\n`;
  objStr += 'endstream\nendobj\n\n';
  return objStr;
};
92
+
93
/**
 * Creates PDF XObject for an .png image
 *
 * The compressed IDAT data of the PNG is embedded as-is (hex-encoded) and decoded by the PDF
 * reader through `/FlateDecode` with a PNG predictor. Width, height and bit depth are taken
 * from the PNG header.
 *
 * Known limitations: palette images (color type 3) are not supported and are described as
 * 3-component RGB. For color types 4 and 6 the alpha samples are counted in `/Colors` but
 * are not separated from the color samples.
 * NOTE(review): how readers render those alpha images has not been verified here.
 *
 * @param {number} objIndex - PDF object index
 * @param {ArrayBufferLike} imageData - Raw image data
 * @returns {string} PDF XObject string
 */
const createImageXObjectPng = (objIndex, imageData) => {
  const imageBytes = new Uint8Array(imageData);
  let objStr = `${String(objIndex)} 0 obj\n`;
  objStr += '<</Type /XObject\n';
  objStr += '/Subtype /Image\n';

  // For PNG, extract IDAT data and get header info
  const imageDataOutput = extractPngIdatData(imageBytes);
  const imageHexStr = hex(imageDataOutput.buffer);
  const idhr = getPngIHDRInfo(imageBytes);

  // Predictor 15 tells the reader that every row carries its own PNG filter-type byte.
  const predictor = 15;
  let colors = 3;
  let colorSpace = '/DeviceRGB';

  // Determine color space and number of color components based on PNG color type
  // Missing palette support (colorType 3)
  if (idhr.colorType === 0) {
    colors = 1;
    colorSpace = '/DeviceGray';
  } else if (idhr.colorType === 2) {
    colors = 3;
    colorSpace = '/DeviceRGB';
  } else if (idhr.colorType === 4) {
    colors = 2;
    colorSpace = '/DeviceGray';
  } else if (idhr.colorType === 6) {
    colors = 4;
    colorSpace = '/DeviceRGB';
  }

  // The first filter (/ASCIIHexDecode) takes no parameters, hence the leading `null`.
  objStr += '/DecodeParms [ null <<';
  objStr += `/Predictor ${predictor} `;
  objStr += `/Colors ${colors} `;
  objStr += `/Columns ${String(idhr.width)} `;
  // The predictor assumes 8 bits per component unless told otherwise, so the actual bit depth
  // must be stated for 1/2/4/16-bit images to be un-filtered correctly.
  objStr += `/BitsPerComponent ${idhr.bitDepth} `;
  objStr += ' >> ]\n';
  objStr += `/Width ${String(idhr.width)}\n`;
  objStr += `/Height ${String(idhr.height)}\n`;
  objStr += `/ColorSpace ${colorSpace}\n`;
  objStr += `/BitsPerComponent ${idhr.bitDepth}\n`;
  objStr += '/Filter [ /ASCIIHexDecode /FlateDecode ]\n';
  objStr += `/Length ${String(imageHexStr.length)}\n`;
  objStr += '>>\nstream\n';
  objStr += `${imageHexStr}\n`;
  objStr += 'endstream\nendobj\n\n';

  return objStr;
};
147
+
148
/**
 * Builds one PDF image XObject per input image.
 *
 * Image `k` is assigned object number `firstObjIndex + k`. Images whose `format` is
 * `'jpeg'` are written as JPEG objects; every other format is written as PNG.
 *
 * @param {ImageWrapper[]} images - Array of image data
 * @param {number} firstObjIndex - Starting object index
 * @returns {string[]} The serialized XObjects, in the same order as `images`.
 */
export function createEmbeddedImages(images, firstObjIndex) {
  return images.map((image, position) => {
    const objNumber = firstObjIndex + position;
    const { width, height } = imageUtils.getDims(image);
    const { buffer } = base64ToBytes(image.src);

    return image.format === 'jpeg'
      ? createImageXObjectJpeg(objNumber, buffer, width, height)
      : createImageXObjectPng(objNumber, buffer);
  });
}
172
+
173
/**
 * Builds the `/XObject` entry of a page resource dictionary.
 *
 * The image at position `k` of the input is exposed under the name `/Im<k>`.
 *
 * @param {Array<number>} imageObjIndices - Array of image object indices
 * @returns {string} Resource dictionary XObject entries, or an empty string when there are no images.
 */
export function createImageResourceDict(imageObjIndices) {
  if (imageObjIndices.length === 0) return '';

  const entries = imageObjIndices.map((objNumber, position) => `/Im${String(position)} ${String(objNumber)} 0 R\n`);

  return `/XObject<<${entries.join('')}>>`;
}
189
+
190
/**
 * Generates the content-stream commands that paint an image XObject into a rectangle,
 * optionally rotated about the rectangle's center.
 *
 * The commands save the graphics state, set a transformation matrix that maps the image's
 * unit square onto the (rotated) rectangle, paint `/Im<imageIndex>`, and restore the state.
 *
 * @param {number} imageIndex - Index of the image (for /Im naming)
 * @param {number} x - X position
 * @param {number} y - Y position
 * @param {number} width - Display width
 * @param {number} height - Display height
 * @param {number} rotation - Rotation angle in degrees (default: 0)
 * @returns {string} PDF drawing commands
 */
export function drawImageCommands(imageIndex, x, y, width, height, rotation = 0) {
  const radians = (rotation * Math.PI) / 180;
  const cosTheta = Math.cos(radians);
  const sinTheta = Math.sin(radians);

  // Linear part of the matrix: scale the unit square to width x height, then rotate.
  const a = width * cosTheta;
  const b = width * sinTheta;
  const c = -height * sinTheta;
  const d = height * cosTheta;

  // Translation that keeps the middle of the image on the middle of the target rectangle.
  const e = (x + width / 2) - (a + c) / 2;
  const f = (y + height / 2) - (b + d) / 2;

  return [
    'q',
    `${[a, b, c, d, e, f].join(' ')} cm`,
    `/Im${imageIndex} Do`,
    'Q',
    '',
  ].join('\n');
}