@embedpdf/engines 1.0.4 → 1.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -1,4 +1,4 @@
1
- import { SearchTarget, PdfImage, PdfEngine, Logger, Task, PdfErrorReason, PdfFileUrl, PdfUrlOptions, PdfDocumentObject, PdfFile, PdfFileLoader, PdfSignatureObject, PdfBookmarkObject, PdfPageObject, Rotation, PdfRenderOptions, PdfTask, Rect, PdfAnnotationObject, PdfAnnotationTransformation, PdfTextRectObject, PdfAttachmentObject, PdfWidgetAnnoObject, FormFieldValue, PdfPageFlattenFlag, PdfPageFlattenResult, PdfInkListObject, PdfStampAnnoObjectContents, Position, PdfPageGeometry, PdfGlyphObject, MatchFlag, SearchAllPagesResult, PdfEngineMethodName, PdfEngineMethodArgs, TaskReturn, PdfEngineMethodReturnType, PdfMetadataObject, PdfBookmarksObject } from '@embedpdf/models';
1
+ import { SearchTarget, PdfImage, ImageConversionTypes, PdfEngine, Logger, Task, PdfErrorReason, PdfFileUrl, PdfUrlOptions, PdfDocumentObject, PdfFile, PdfFileLoader, PdfSignatureObject, PdfBookmarkObject, PdfPageObject, Rotation, PdfRenderOptions, PdfTask, Rect, PdfAnnotationObject, PdfAnnotationTransformation, PdfTextRectObject, PdfAttachmentObject, PdfWidgetAnnoObject, FormFieldValue, PdfPageFlattenFlag, PdfPageFlattenResult, PageTextSlice, PdfInkListObject, PdfStampAnnoObjectContents, Position, PdfPageGeometry, PdfGlyphObject, MatchFlag, SearchAllPagesResult, PdfEngineMethodName, PdfEngineMethodArgs, TaskReturn, PdfEngineMethodReturnType, PdfMetadataObject, PdfBookmarksObject } from '@embedpdf/models';
2
2
  import { WrappedPdfiumModule, PdfiumRuntimeMethods, PdfiumModule } from '@embedpdf/pdfium';
3
3
 
4
4
  /**
@@ -61,7 +61,7 @@ declare enum PdfiumErrorCode {
61
61
  * In browser: uses OffscreenCanvas
62
62
  * In Node.js: can use Sharp or other image processing libraries
63
63
  */
64
- type ImageDataConverter<T = Blob> = (imageData: PdfImage) => Promise<T>;
64
+ type ImageDataConverter<T = Blob> = (imageData: PdfImage, imageType?: ImageConversionTypes) => Promise<T>;
65
65
  declare const browserImageDataToBlobConverter: ImageDataConverter<Blob>;
66
66
  /**
67
67
  * Pdf engine that is based on pdfium wasm
@@ -183,13 +183,13 @@ declare class PdfiumEngine<T = Blob> implements PdfEngine<T> {
183
183
  *
184
184
  * @public
185
185
  */
186
- renderPage(doc: PdfDocumentObject, page: PdfPageObject, scaleFactor?: number, rotation?: Rotation, dpr?: number, options?: PdfRenderOptions): PdfTask<T>;
186
+ renderPage(doc: PdfDocumentObject, page: PdfPageObject, scaleFactor?: number, rotation?: Rotation, dpr?: number, options?: PdfRenderOptions, imageType?: ImageConversionTypes): PdfTask<T>;
187
187
  /**
188
188
  * {@inheritDoc @embedpdf/models!PdfEngine.renderPageRect}
189
189
  *
190
190
  * @public
191
191
  */
192
- renderPageRect(doc: PdfDocumentObject, page: PdfPageObject, scaleFactor: number, rotation: Rotation, dpr: number, rect: Rect, options: PdfRenderOptions): PdfTask<T>;
192
+ renderPageRect(doc: PdfDocumentObject, page: PdfPageObject, scaleFactor: number, rotation: Rotation, dpr: number, rect: Rect, options: PdfRenderOptions, imageType?: ImageConversionTypes): PdfTask<T>;
193
193
  /**
194
194
  * {@inheritDoc @embedpdf/models!PdfEngine.getAllAnnotations}
195
195
  *
@@ -269,6 +269,12 @@ declare class PdfiumEngine<T = Blob> implements PdfEngine<T> {
269
269
  * @public
270
270
  */
271
271
  extractText(doc: PdfDocumentObject, pageIndexes: number[]): Task<any, PdfErrorReason> | Task<string, PdfErrorReason>;
272
+ /**
273
+ * {@inheritDoc @embedpdf/models!PdfEngine.getTextSlices}
274
+ *
275
+ * @public
276
+ */
277
+ getTextSlices(doc: PdfDocumentObject, slices: PageTextSlice[]): PdfTask<string[]>;
272
278
  /**
273
279
  * {@inheritDoc @embedpdf/models!PdfEngine.merge}
274
280
  *
@@ -1334,13 +1340,13 @@ declare class WebWorkerEngine implements PdfEngine {
1334
1340
  *
1335
1341
  * @public
1336
1342
  */
1337
- renderPage(doc: PdfDocumentObject, page: PdfPageObject, scaleFactor: number, rotation: Rotation, dpr: number, options: PdfRenderOptions): WorkerTask<Blob>;
1343
+ renderPage(doc: PdfDocumentObject, page: PdfPageObject, scaleFactor: number, rotation: Rotation, dpr: number, options: PdfRenderOptions, imageType?: ImageConversionTypes): WorkerTask<Blob>;
1338
1344
  /**
1339
1345
  * {@inheritDoc @embedpdf/models!PdfEngine.renderPageRect}
1340
1346
  *
1341
1347
  * @public
1342
1348
  */
1343
- renderPageRect(doc: PdfDocumentObject, page: PdfPageObject, scaleFactor: number, rotation: Rotation, dpr: number, rect: Rect, options: PdfRenderOptions): WorkerTask<Blob>;
1349
+ renderPageRect(doc: PdfDocumentObject, page: PdfPageObject, scaleFactor: number, rotation: Rotation, dpr: number, rect: Rect, options: PdfRenderOptions, imageType?: ImageConversionTypes): WorkerTask<Blob>;
1344
1350
  /**
1345
1351
  * {@inheritDoc @embedpdf/models!PdfEngine.getAllAnnotations}
1346
1352
  *
@@ -1431,6 +1437,12 @@ declare class WebWorkerEngine implements PdfEngine {
1431
1437
  * @public
1432
1438
  */
1433
1439
  extractText(doc: PdfDocumentObject, pageIndexes: number[]): WorkerTask<string>;
1440
+ /**
1441
+ * {@inheritDoc @embedpdf/models!PdfEngine.getTextSlices}
1442
+ *
1443
+ * @public
1444
+ */
1445
+ getTextSlices(doc: PdfDocumentObject, slices: PageTextSlice[]): WorkerTask<string[]>;
1434
1446
  /**
1435
1447
  * {@inheritDoc @embedpdf/models!PdfEngine.getPageGlyphs}
1436
1448
  *
package/dist/index.js CHANGED
@@ -1,4 +1,4 @@
1
- import { NoopLogger, PdfTaskHelper, PdfErrorCode, Task, Rotation, PdfAnnotationSubtype, PdfPageObjectType, PdfAnnotationObjectStatus, quadToRect, PDF_FORM_FIELD_TYPE, toIntRect, transformRect, toIntSize, transformSize, PdfActionType, PdfZoomMode, AppearanceMode, MatchFlag, swap, PdfPageFlattenResult } from '@embedpdf/models';
1
+ import { NoopLogger, PdfTaskHelper, PdfErrorCode, Task, Rotation, PdfAnnotationSubtype, stripPdfUnwantedMarkers, PdfPageObjectType, PdfAnnotationObjectStatus, quadToRect, PDF_FORM_FIELD_TYPE, toIntRect, transformRect, toIntSize, transformSize, PdfActionType, PdfZoomMode, AppearanceMode, MatchFlag, swap, PdfPageFlattenResult } from '@embedpdf/models';
2
2
  import { init } from '@embedpdf/pdfium';
3
3
 
4
4
  /**
@@ -265,7 +265,7 @@ var PdfiumErrorCode;
265
265
  PdfiumErrorCode[PdfiumErrorCode["XFALoad"] = 7] = "XFALoad";
266
266
  PdfiumErrorCode[PdfiumErrorCode["XFALayout"] = 8] = "XFALayout";
267
267
  })(PdfiumErrorCode || (PdfiumErrorCode = {}));
268
- const browserImageDataToBlobConverter = (pdfImageData) => {
268
+ const browserImageDataToBlobConverter = (pdfImageData, imageType = 'image/webp') => {
269
269
  // Check if we're in a browser environment
270
270
  if (typeof OffscreenCanvas === 'undefined') {
271
271
  throw new Error('OffscreenCanvas is not available in this environment. ' +
@@ -275,7 +275,7 @@ const browserImageDataToBlobConverter = (pdfImageData) => {
275
275
  const imageData = new ImageData(pdfImageData.data, pdfImageData.width, pdfImageData.height);
276
276
  const off = new OffscreenCanvas(imageData.width, imageData.height);
277
277
  off.getContext('2d').putImageData(imageData, 0, 0);
278
- return off.convertToBlob({ type: 'image/webp' });
278
+ return off.convertToBlob({ type: imageType });
279
279
  };
280
280
  /**
281
281
  * Pdf engine that is based on pdfium wasm
@@ -781,7 +781,7 @@ class PdfiumEngine {
781
781
  *
782
782
  * @public
783
783
  */
784
- renderPage(doc, page, scaleFactor = 1, rotation = Rotation.Degree0, dpr = 1, options = { withAnnotations: false }) {
784
+ renderPage(doc, page, scaleFactor = 1, rotation = Rotation.Degree0, dpr = 1, options = { withAnnotations: false }, imageType = 'image/webp') {
785
785
  const task = new Task();
786
786
  this.logger.debug(LOG_SOURCE$2, LOG_CATEGORY$2, 'renderPage', doc, page, scaleFactor, rotation, dpr, options);
787
787
  this.logger.perf(LOG_SOURCE$2, LOG_CATEGORY$2, `RenderPage`, 'Begin', `${doc.id}-${page.index}`);
@@ -798,7 +798,7 @@ class PdfiumEngine {
798
798
  size: page.size,
799
799
  }, scaleFactor, rotation, dpr, options);
800
800
  this.logger.perf(LOG_SOURCE$2, LOG_CATEGORY$2, `RenderPage`, 'End', `${doc.id}-${page.index}`);
801
- this.imageDataConverter(imageData).then((blob) => task.resolve(blob));
801
+ this.imageDataConverter(imageData, imageType).then((blob) => task.resolve(blob));
802
802
  return task;
803
803
  }
804
804
  /**
@@ -806,7 +806,7 @@ class PdfiumEngine {
806
806
  *
807
807
  * @public
808
808
  */
809
- renderPageRect(doc, page, scaleFactor, rotation, dpr, rect, options) {
809
+ renderPageRect(doc, page, scaleFactor, rotation, dpr, rect, options, imageType = 'image/webp') {
810
810
  const task = new Task();
811
811
  this.logger.debug(LOG_SOURCE$2, LOG_CATEGORY$2, 'renderPageRect', doc, page, scaleFactor, rotation, dpr, rect, options);
812
812
  this.logger.perf(LOG_SOURCE$2, LOG_CATEGORY$2, `RenderPageRect`, 'Begin', `${doc.id}-${page.index}`);
@@ -820,7 +820,7 @@ class PdfiumEngine {
820
820
  }
821
821
  const imageData = this.renderPageRectToImageData(ctx, page, rect, scaleFactor, rotation, dpr, options);
822
822
  this.logger.perf(LOG_SOURCE$2, LOG_CATEGORY$2, `RenderPageRect`, 'End', `${doc.id}-${page.index}`);
823
- this.imageDataConverter(imageData).then((blob) => task.resolve(blob));
823
+ this.imageDataConverter(imageData, imageType).then((blob) => task.resolve(blob));
824
824
  return task;
825
825
  }
826
826
  /**
@@ -1362,6 +1362,62 @@ class PdfiumEngine {
1362
1362
  this.logger.perf(LOG_SOURCE$2, LOG_CATEGORY$2, `ExtractText`, 'End', doc.id);
1363
1363
  return PdfTaskHelper.resolve(text);
1364
1364
  }
1365
+ /**
1366
+ * {@inheritDoc @embedpdf/models!PdfEngine.getTextSlices}
1367
+ *
1368
+ * @public
1369
+ */
1370
+ getTextSlices(doc, slices) {
1371
+ this.logger.debug(LOG_SOURCE$2, LOG_CATEGORY$2, 'getTextSlices', doc, slices);
1372
+ this.logger.perf(LOG_SOURCE$2, LOG_CATEGORY$2, 'GetTextSlices', 'Begin', doc.id);
1373
+ /* ⚠︎ 1 — trivial case */
1374
+ if (slices.length === 0) {
1375
+ this.logger.perf(LOG_SOURCE$2, LOG_CATEGORY$2, 'GetTextSlices', 'End', doc.id);
1376
+ return PdfTaskHelper.resolve([]);
1377
+ }
1378
+ /* ⚠︎ 2 — document must be open */
1379
+ const ctx = this.cache.getContext(doc.id);
1380
+ if (!ctx) {
1381
+ this.logger.perf(LOG_SOURCE$2, LOG_CATEGORY$2, 'GetTextSlices', 'End', doc.id);
1382
+ return PdfTaskHelper.reject({
1383
+ code: PdfErrorCode.DocNotOpen,
1384
+ message: 'document does not open',
1385
+ });
1386
+ }
1387
+ try {
1388
+ /* keep caller order */
1389
+ const out = new Array(slices.length);
1390
+ /* group → open each page once */
1391
+ const byPage = new Map();
1392
+ slices.forEach((s, i) => {
1393
+ (byPage.get(s.pageIndex) ?? byPage.set(s.pageIndex, []).get(s.pageIndex)).push({
1394
+ slice: s,
1395
+ pos: i,
1396
+ });
1397
+ });
1398
+ for (const [pageIdx, list] of byPage) {
1399
+ const pageCtx = ctx.acquirePage(pageIdx);
1400
+ const textPagePtr = pageCtx.getTextPage();
1401
+ for (const { slice, pos } of list) {
1402
+ const bufPtr = this.malloc(2 * (slice.charCount + 1)); // UTF-16 + NIL
1403
+ this.pdfiumModule.FPDFText_GetText(textPagePtr, slice.charIndex, slice.charCount, bufPtr);
1404
+ out[pos] = stripPdfUnwantedMarkers(this.pdfiumModule.pdfium.UTF16ToString(bufPtr));
1405
+ this.free(bufPtr);
1406
+ }
1407
+ pageCtx.release();
1408
+ }
1409
+ this.logger.perf(LOG_SOURCE$2, LOG_CATEGORY$2, 'GetTextSlices', 'End', doc.id);
1410
+ return PdfTaskHelper.resolve(out);
1411
+ }
1412
+ catch (e) {
1413
+ this.logger.error(LOG_SOURCE$2, LOG_CATEGORY$2, 'getTextSlices error', e);
1414
+ this.logger.perf(LOG_SOURCE$2, LOG_CATEGORY$2, 'GetTextSlices', 'End', doc.id);
1415
+ return PdfTaskHelper.reject({
1416
+ code: PdfErrorCode.Unknown,
1417
+ message: String(e),
1418
+ });
1419
+ }
1420
+ }
1365
1421
  /**
1366
1422
  * {@inheritDoc @embedpdf/models!PdfEngine.merge}
1367
1423
  *
@@ -1891,14 +1947,12 @@ class PdfiumEngine {
1891
1947
  const runs = [];
1892
1948
  let current = null;
1893
1949
  let curObjPtr = null;
1950
+ let bounds = null;
1894
1951
  /** ── main loop ──────────────────────────────────────────── */
1895
1952
  for (let i = 0; i < glyphs.length; i++) {
1896
1953
  const g = glyphs[i];
1897
1954
  /* 1 — find the CPDF_TextObject this glyph belongs to */
1898
1955
  const objPtr = this.pdfiumModule.FPDFText_GetTextObject(textPagePtr, i);
1899
- if (g.isEmpty) {
1900
- continue;
1901
- }
1902
1956
  /* 2 — start a new run when the text object changes */
1903
1957
  if (objPtr !== curObjPtr) {
1904
1958
  curObjPtr = objPtr;
@@ -1912,6 +1966,12 @@ class PdfiumEngine {
1912
1966
  charStart: i,
1913
1967
  glyphs: [],
1914
1968
  };
1969
+ bounds = {
1970
+ minX: g.origin.x,
1971
+ minY: g.origin.y,
1972
+ maxX: g.origin.x + g.size.width,
1973
+ maxY: g.origin.y + g.size.height,
1974
+ };
1915
1975
  runs.push(current);
1916
1976
  }
1917
1977
  /* 3 — append the slim glyph record */
@@ -1920,16 +1980,24 @@ class PdfiumEngine {
1920
1980
  y: g.origin.y,
1921
1981
  width: g.size.width,
1922
1982
  height: g.size.height,
1923
- flags: g.isSpace ? 1 : 0,
1983
+ flags: g.isEmpty ? 2 : g.isSpace ? 1 : 0,
1924
1984
  });
1925
1985
  /* 4 — expand the run's bounding rect */
1986
+ if (g.isEmpty) {
1987
+ continue;
1988
+ }
1926
1989
  const right = g.origin.x + g.size.width;
1927
1990
  const bottom = g.origin.y + g.size.height;
1928
- current.rect.width =
1929
- Math.max(current.rect.x + current.rect.width, right) - current.rect.x;
1930
- current.rect.y = Math.min(current.rect.y, g.origin.y);
1931
- current.rect.height =
1932
- Math.max(current.rect.y + current.rect.height, bottom) - current.rect.y;
1991
+ // Update bounds
1992
+ bounds.minX = Math.min(bounds.minX, g.origin.x);
1993
+ bounds.minY = Math.min(bounds.minY, g.origin.y);
1994
+ bounds.maxX = Math.max(bounds.maxX, right);
1995
+ bounds.maxY = Math.max(bounds.maxY, bottom);
1996
+ // Calculate final rect from bounds
1997
+ current.rect.x = bounds.minX;
1998
+ current.rect.y = bounds.minY;
1999
+ current.rect.width = bounds.maxX - bounds.minX;
2000
+ current.rect.height = bounds.maxY - bounds.minY;
1933
2001
  }
1934
2002
  return runs;
1935
2003
  }
@@ -4175,6 +4243,9 @@ class EngineRunner {
4175
4243
  case 'extractText':
4176
4244
  task = this.engine[name](...args);
4177
4245
  break;
4246
+ case 'getTextSlices':
4247
+ task = this.engine[name](...args);
4248
+ break;
4178
4249
  case 'getPageGlyphs':
4179
4250
  task = this.engine[name](...args);
4180
4251
  break;
@@ -4600,7 +4671,7 @@ class WebWorkerEngine {
4600
4671
  *
4601
4672
  * @public
4602
4673
  */
4603
- renderPage(doc, page, scaleFactor, rotation, dpr, options) {
4674
+ renderPage(doc, page, scaleFactor, rotation, dpr, options, imageType = 'image/webp') {
4604
4675
  this.logger.debug(LOG_SOURCE, LOG_CATEGORY, 'renderPage', doc, page, scaleFactor, rotation, dpr, options);
4605
4676
  const requestId = this.generateRequestId(doc.id);
4606
4677
  const task = new WorkerTask(this.worker, requestId);
@@ -4609,7 +4680,7 @@ class WebWorkerEngine {
4609
4680
  type: 'ExecuteRequest',
4610
4681
  data: {
4611
4682
  name: 'renderPage',
4612
- args: [doc, page, scaleFactor, rotation, dpr, options],
4683
+ args: [doc, page, scaleFactor, rotation, dpr, options, imageType],
4613
4684
  },
4614
4685
  };
4615
4686
  this.proxy(task, request);
@@ -4620,7 +4691,7 @@ class WebWorkerEngine {
4620
4691
  *
4621
4692
  * @public
4622
4693
  */
4623
- renderPageRect(doc, page, scaleFactor, rotation, dpr, rect, options) {
4694
+ renderPageRect(doc, page, scaleFactor, rotation, dpr, rect, options, imageType = 'image/webp') {
4624
4695
  this.logger.debug(LOG_SOURCE, LOG_CATEGORY, 'renderPageRect', doc, page, scaleFactor, rotation, dpr, rect, options);
4625
4696
  const requestId = this.generateRequestId(doc.id);
4626
4697
  const task = new WorkerTask(this.worker, requestId);
@@ -4629,7 +4700,7 @@ class WebWorkerEngine {
4629
4700
  type: 'ExecuteRequest',
4630
4701
  data: {
4631
4702
  name: 'renderPageRect',
4632
- args: [doc, page, scaleFactor, rotation, dpr, rect, options],
4703
+ args: [doc, page, scaleFactor, rotation, dpr, rect, options, imageType],
4633
4704
  },
4634
4705
  };
4635
4706
  this.proxy(task, request);
@@ -4935,6 +5006,26 @@ class WebWorkerEngine {
4935
5006
  this.proxy(task, request);
4936
5007
  return task;
4937
5008
  }
5009
+ /**
5010
+ * {@inheritDoc @embedpdf/models!PdfEngine.getTextSlices}
5011
+ *
5012
+ * @public
5013
+ */
5014
+ getTextSlices(doc, slices) {
5015
+ this.logger.debug(LOG_SOURCE, LOG_CATEGORY, 'getTextSlices', doc, slices);
5016
+ const requestId = this.generateRequestId(doc.id);
5017
+ const task = new WorkerTask(this.worker, requestId);
5018
+ const request = {
5019
+ id: requestId,
5020
+ type: 'ExecuteRequest',
5021
+ data: {
5022
+ name: 'getTextSlices',
5023
+ args: [doc, slices],
5024
+ },
5025
+ };
5026
+ this.proxy(task, request);
5027
+ return task;
5028
+ }
4938
5029
  /**
4939
5030
  * {@inheritDoc @embedpdf/models!PdfEngine.getPageGlyphs}
4940
5031
  *
@@ -5299,6 +5390,9 @@ function createMockPdfEngine(partialEngine) {
5299
5390
  extractText: (pdf, pageIndexes) => {
5300
5391
  return PdfTaskHelper.resolve('');
5301
5392
  },
5393
+ getTextSlices: (doc, slices) => {
5394
+ return PdfTaskHelper.resolve([]);
5395
+ },
5302
5396
  getPageGlyphs: (doc, page) => {
5303
5397
  return PdfTaskHelper.resolve([]);
5304
5398
  },