@embedpdf/engines 1.0.5 → 1.0.6

This diff shows the changes between two publicly available versions of this package, each of which has been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
@@ -1,4 +1,4 @@
1
- import { PdfEngine, Logger, Task, PdfErrorReason, PdfFileUrl, PdfUrlOptions, PdfDocumentObject, PdfFile, PdfFileLoader, PdfSignatureObject, PdfBookmarkObject, PdfPageObject, Rotation, PdfRenderOptions, ImageConversionTypes, PdfTask, Rect, PdfAnnotationObject, PdfAnnotationTransformation, PdfTextRectObject, PdfAttachmentObject, PdfWidgetAnnoObject, FormFieldValue, PdfPageFlattenFlag, PdfPageFlattenResult, PdfInkListObject, PdfStampAnnoObjectContents, Position, PdfPageGeometry, PdfGlyphObject, MatchFlag, SearchAllPagesResult, PdfImage } from '@embedpdf/models';
1
+ import { PdfEngine, Logger, Task, PdfErrorReason, PdfFileUrl, PdfUrlOptions, PdfDocumentObject, PdfFile, PdfFileLoader, PdfSignatureObject, PdfBookmarkObject, PdfPageObject, Rotation, PdfRenderOptions, ImageConversionTypes, PdfTask, Rect, PdfAnnotationObject, PdfAnnotationTransformation, PdfTextRectObject, PdfAttachmentObject, PdfWidgetAnnoObject, FormFieldValue, PdfPageFlattenFlag, PdfPageFlattenResult, PageTextSlice, PdfInkListObject, PdfStampAnnoObjectContents, Position, PdfPageGeometry, PdfGlyphObject, MatchFlag, SearchAllPagesResult, PdfImage } from '@embedpdf/models';
2
2
  import { WrappedPdfiumModule } from '@embedpdf/pdfium';
3
3
 
4
4
  /**
@@ -213,6 +213,12 @@ declare class PdfiumEngine<T = Blob> implements PdfEngine<T> {
213
213
  * @public
214
214
  */
215
215
  extractText(doc: PdfDocumentObject, pageIndexes: number[]): Task<any, PdfErrorReason> | Task<string, PdfErrorReason>;
216
+ /**
217
+ * {@inheritDoc @embedpdf/models!PdfEngine.getTextSlices}
218
+ *
219
+ * @public
220
+ */
221
+ getTextSlices(doc: PdfDocumentObject, slices: PageTextSlice[]): PdfTask<string[]>;
216
222
  /**
217
223
  * {@inheritDoc @embedpdf/models!PdfEngine.merge}
218
224
  *
@@ -986,6 +992,6 @@ declare class PdfiumEngine<T = Blob> implements PdfEngine<T> {
986
992
  private searchAllInPage;
987
993
  }
988
994
 
989
- declare function createPdfiumEngine(wasmUrl: string): Promise<PdfiumEngine>;
995
+ declare function createPdfiumEngine(wasmUrl: string, logger?: Logger): Promise<PdfiumEngine>;
990
996
 
991
997
  export { createPdfiumEngine };
@@ -1,5 +1,5 @@
1
- import { NoopLogger, PdfTaskHelper, PdfErrorCode, Task, Rotation, PdfAnnotationSubtype, PdfPageObjectType, PdfAnnotationObjectStatus, quadToRect, PDF_FORM_FIELD_TYPE, toIntRect, transformRect, toIntSize, transformSize, PdfActionType, PdfZoomMode, AppearanceMode, MatchFlag } from '@embedpdf/models';
2
1
  import { init } from '@embedpdf/pdfium';
2
+ import { NoopLogger, PdfTaskHelper, PdfErrorCode, Task, Rotation, PdfAnnotationSubtype, stripPdfUnwantedMarkers, PdfPageObjectType, PdfAnnotationObjectStatus, quadToRect, PDF_FORM_FIELD_TYPE, toIntRect, transformRect, toIntSize, transformSize, PdfActionType, PdfZoomMode, AppearanceMode, MatchFlag } from '@embedpdf/models';
3
3
 
4
4
  /**
5
5
  * Read string from WASM heap
@@ -1362,6 +1362,62 @@ class PdfiumEngine {
1362
1362
  this.logger.perf(LOG_SOURCE, LOG_CATEGORY, `ExtractText`, 'End', doc.id);
1363
1363
  return PdfTaskHelper.resolve(text);
1364
1364
  }
1365
+ /**
1366
+ * {@inheritDoc @embedpdf/models!PdfEngine.getTextSlices}
1367
+ *
1368
+ * @public
1369
+ */
1370
+ getTextSlices(doc, slices) {
1371
+ this.logger.debug(LOG_SOURCE, LOG_CATEGORY, 'getTextSlices', doc, slices);
1372
+ this.logger.perf(LOG_SOURCE, LOG_CATEGORY, 'GetTextSlices', 'Begin', doc.id);
1373
+ /* ⚠︎ 1 — trivial case */
1374
+ if (slices.length === 0) {
1375
+ this.logger.perf(LOG_SOURCE, LOG_CATEGORY, 'GetTextSlices', 'End', doc.id);
1376
+ return PdfTaskHelper.resolve([]);
1377
+ }
1378
+ /* ⚠︎ 2 — document must be open */
1379
+ const ctx = this.cache.getContext(doc.id);
1380
+ if (!ctx) {
1381
+ this.logger.perf(LOG_SOURCE, LOG_CATEGORY, 'GetTextSlices', 'End', doc.id);
1382
+ return PdfTaskHelper.reject({
1383
+ code: PdfErrorCode.DocNotOpen,
1384
+ message: 'document does not open',
1385
+ });
1386
+ }
1387
+ try {
1388
+ /* keep caller order */
1389
+ const out = new Array(slices.length);
1390
+ /* group → open each page once */
1391
+ const byPage = new Map();
1392
+ slices.forEach((s, i) => {
1393
+ (byPage.get(s.pageIndex) ?? byPage.set(s.pageIndex, []).get(s.pageIndex)).push({
1394
+ slice: s,
1395
+ pos: i,
1396
+ });
1397
+ });
1398
+ for (const [pageIdx, list] of byPage) {
1399
+ const pageCtx = ctx.acquirePage(pageIdx);
1400
+ const textPagePtr = pageCtx.getTextPage();
1401
+ for (const { slice, pos } of list) {
1402
+ const bufPtr = this.malloc(2 * (slice.charCount + 1)); // UTF-16 + NIL
1403
+ this.pdfiumModule.FPDFText_GetText(textPagePtr, slice.charIndex, slice.charCount, bufPtr);
1404
+ out[pos] = stripPdfUnwantedMarkers(this.pdfiumModule.pdfium.UTF16ToString(bufPtr));
1405
+ this.free(bufPtr);
1406
+ }
1407
+ pageCtx.release();
1408
+ }
1409
+ this.logger.perf(LOG_SOURCE, LOG_CATEGORY, 'GetTextSlices', 'End', doc.id);
1410
+ return PdfTaskHelper.resolve(out);
1411
+ }
1412
+ catch (e) {
1413
+ this.logger.error(LOG_SOURCE, LOG_CATEGORY, 'getTextSlices error', e);
1414
+ this.logger.perf(LOG_SOURCE, LOG_CATEGORY, 'GetTextSlices', 'End', doc.id);
1415
+ return PdfTaskHelper.reject({
1416
+ code: PdfErrorCode.Unknown,
1417
+ message: String(e),
1418
+ });
1419
+ }
1420
+ }
1365
1421
  /**
1366
1422
  * {@inheritDoc @embedpdf/models!PdfEngine.merge}
1367
1423
  *
@@ -1891,14 +1947,12 @@ class PdfiumEngine {
1891
1947
  const runs = [];
1892
1948
  let current = null;
1893
1949
  let curObjPtr = null;
1950
+ let bounds = null;
1894
1951
  /** ── main loop ──────────────────────────────────────────── */
1895
1952
  for (let i = 0; i < glyphs.length; i++) {
1896
1953
  const g = glyphs[i];
1897
1954
  /* 1 — find the CPDF_TextObject this glyph belongs to */
1898
1955
  const objPtr = this.pdfiumModule.FPDFText_GetTextObject(textPagePtr, i);
1899
- if (g.isEmpty) {
1900
- continue;
1901
- }
1902
1956
  /* 2 — start a new run when the text object changes */
1903
1957
  if (objPtr !== curObjPtr) {
1904
1958
  curObjPtr = objPtr;
@@ -1912,6 +1966,12 @@ class PdfiumEngine {
1912
1966
  charStart: i,
1913
1967
  glyphs: [],
1914
1968
  };
1969
+ bounds = {
1970
+ minX: g.origin.x,
1971
+ minY: g.origin.y,
1972
+ maxX: g.origin.x + g.size.width,
1973
+ maxY: g.origin.y + g.size.height,
1974
+ };
1915
1975
  runs.push(current);
1916
1976
  }
1917
1977
  /* 3 — append the slim glyph record */
@@ -1920,16 +1980,24 @@ class PdfiumEngine {
1920
1980
  y: g.origin.y,
1921
1981
  width: g.size.width,
1922
1982
  height: g.size.height,
1923
- flags: g.isSpace ? 1 : 0,
1983
+ flags: g.isEmpty ? 2 : g.isSpace ? 1 : 0,
1924
1984
  });
1925
1985
  /* 4 — expand the run's bounding rect */
1986
+ if (g.isEmpty) {
1987
+ continue;
1988
+ }
1926
1989
  const right = g.origin.x + g.size.width;
1927
1990
  const bottom = g.origin.y + g.size.height;
1928
- current.rect.width =
1929
- Math.max(current.rect.x + current.rect.width, right) - current.rect.x;
1930
- current.rect.y = Math.min(current.rect.y, g.origin.y);
1931
- current.rect.height =
1932
- Math.max(current.rect.y + current.rect.height, bottom) - current.rect.y;
1991
+ // Update bounds
1992
+ bounds.minX = Math.min(bounds.minX, g.origin.x);
1993
+ bounds.minY = Math.min(bounds.minY, g.origin.y);
1994
+ bounds.maxX = Math.max(bounds.maxX, right);
1995
+ bounds.maxY = Math.max(bounds.maxY, bottom);
1996
+ // Calculate final rect from bounds
1997
+ current.rect.x = bounds.minX;
1998
+ current.rect.y = bounds.minY;
1999
+ current.rect.width = bounds.maxX - bounds.minX;
2000
+ current.rect.height = bounds.maxY - bounds.minY;
1933
2001
  }
1934
2002
  return runs;
1935
2003
  }
@@ -4024,11 +4092,11 @@ class PdfiumEngine {
4024
4092
  }
4025
4093
  }
4026
4094
 
4027
- async function createPdfiumEngine(wasmUrl) {
4095
+ async function createPdfiumEngine(wasmUrl, logger) {
4028
4096
  const response = await fetch(wasmUrl);
4029
4097
  const wasmBinary = await response.arrayBuffer();
4030
4098
  const wasmModule = await init({ wasmBinary });
4031
- return new PdfiumEngine(wasmModule);
4099
+ return new PdfiumEngine(wasmModule, logger);
4032
4100
  }
4033
4101
 
4034
4102
  export { createPdfiumEngine };