pdfjs-reader-core 0.2.0 → 0.2.1

This diff shows the changes between two publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in their respective public registries.
package/dist/index.d.cts CHANGED
@@ -2431,12 +2431,24 @@ interface CharPosition {
2431
2431
  char: string;
2432
2432
  rect: HighlightRect;
2433
2433
  }
2434
+ /** Internal text item representation */
2435
+ interface TextItem {
2436
+ text: string;
2437
+ transform: number[];
2438
+ width: number;
2439
+ height: number;
2440
+ }
2434
2441
  /**
2435
- * Extract text content with character positions from a PDF page.
2442
+ * Extract text content with text items from a PDF page.
2443
+ * Uses proper text item tracking for accurate positioning.
2436
2444
  */
2437
2445
  declare function extractPageText(document: PDFDocumentProxy, pageNumber: number): Promise<{
2438
2446
  fullText: string;
2439
- charPositions: CharPosition[];
2447
+ textItems: TextItem[];
2448
+ viewport: {
2449
+ width: number;
2450
+ height: number;
2451
+ };
2440
2452
  }>;
2441
2453
  /**
2442
2454
  * Find all occurrences of text on a specific page.
package/dist/index.d.ts CHANGED
@@ -2431,12 +2431,24 @@ interface CharPosition {
2431
2431
  char: string;
2432
2432
  rect: HighlightRect;
2433
2433
  }
2434
+ /** Internal text item representation */
2435
+ interface TextItem {
2436
+ text: string;
2437
+ transform: number[];
2438
+ width: number;
2439
+ height: number;
2440
+ }
2434
2441
  /**
2435
- * Extract text content with character positions from a PDF page.
2442
+ * Extract text content with text items from a PDF page.
2443
+ * Uses proper text item tracking for accurate positioning.
2436
2444
  */
2437
2445
  declare function extractPageText(document: PDFDocumentProxy, pageNumber: number): Promise<{
2438
2446
  fullText: string;
2439
- charPositions: CharPosition[];
2447
+ textItems: TextItem[];
2448
+ viewport: {
2449
+ width: number;
2450
+ height: number;
2451
+ };
2440
2452
  }>;
2441
2453
  /**
2442
2454
  * Find all occurrences of text on a specific page.
package/dist/index.js CHANGED
@@ -1307,37 +1307,54 @@ async function extractPageText(document2, pageNumber) {
1307
1307
  const textContent = await page.getTextContent();
1308
1308
  const viewport = page.getViewport({ scale: 1 });
1309
1309
  let fullText = "";
1310
- const charPositions = [];
1310
+ const textItems = [];
1311
1311
  for (const item of textContent.items) {
1312
1312
  if ("str" in item && item.str) {
1313
- const tx = item.transform;
1314
- const x = tx[4];
1315
- const y = viewport.height - tx[5];
1316
- const width = item.width ?? 0;
1317
- const height = item.height ?? 12;
1318
- const charWidth = item.str.length > 0 ? width / item.str.length : width;
1319
- for (let i = 0; i < item.str.length; i++) {
1320
- charPositions.push({
1321
- char: item.str[i],
1322
- rect: {
1323
- x: x + i * charWidth,
1324
- y: y - height,
1325
- width: charWidth,
1326
- height
1327
- }
1328
- });
1329
- }
1313
+ textItems.push({
1314
+ text: item.str,
1315
+ transform: item.transform,
1316
+ width: item.width ?? 0,
1317
+ height: item.height ?? 12
1318
+ });
1330
1319
  fullText += item.str;
1331
1320
  }
1332
1321
  }
1333
- return { fullText, charPositions };
1322
+ return { fullText, textItems, viewport };
1323
+ }
1324
+ function calculateMatchRects(textItems, startOffset, length, viewport) {
1325
+ const rects = [];
1326
+ let currentOffset = 0;
1327
+ for (const item of textItems) {
1328
+ const itemStart = currentOffset;
1329
+ const itemEnd = currentOffset + item.text.length;
1330
+ if (itemEnd > startOffset && itemStart < startOffset + length) {
1331
+ const [, , c, d, tx, ty] = item.transform;
1332
+ const x = tx;
1333
+ const y = viewport.height - ty;
1334
+ const height = Math.sqrt(c * c + d * d);
1335
+ const matchStartInItem = Math.max(0, startOffset - itemStart);
1336
+ const matchEndInItem = Math.min(item.text.length, startOffset + length - itemStart);
1337
+ const charWidth = item.text.length > 0 ? item.width / item.text.length : item.width;
1338
+ const matchWidth = charWidth * (matchEndInItem - matchStartInItem);
1339
+ const matchX = x + charWidth * matchStartInItem;
1340
+ const yOffset = height * 0.15;
1341
+ rects.push({
1342
+ x: matchX,
1343
+ y: y - height + yOffset,
1344
+ width: matchWidth,
1345
+ height
1346
+ });
1347
+ }
1348
+ currentOffset = itemEnd;
1349
+ }
1350
+ return rects;
1334
1351
  }
1335
1352
  async function findTextOnPage(document2, pageNumber, query, options = {}) {
1336
1353
  const { caseSensitive = false, wholeWord = false } = options;
1337
1354
  if (!query || pageNumber < 1 || pageNumber > document2.numPages) {
1338
1355
  return [];
1339
1356
  }
1340
- const { fullText, charPositions } = await extractPageText(document2, pageNumber);
1357
+ const { fullText, textItems, viewport } = await extractPageText(document2, pageNumber);
1341
1358
  const matches = [];
1342
1359
  const searchText = caseSensitive ? query : query.toLowerCase();
1343
1360
  const textToSearch = caseSensitive ? fullText : fullText.toLowerCase();
@@ -1353,17 +1370,15 @@ async function findTextOnPage(document2, pageNumber, query, options = {}) {
1353
1370
  continue;
1354
1371
  }
1355
1372
  }
1356
- const matchRects = [];
1357
- for (let i = matchIndex; i < matchIndex + query.length && i < charPositions.length; i++) {
1358
- matchRects.push(charPositions[i].rect);
1373
+ const matchRects = calculateMatchRects(textItems, matchIndex, query.length, viewport);
1374
+ if (matchRects.length > 0) {
1375
+ matches.push({
1376
+ text: fullText.substring(matchIndex, matchIndex + query.length),
1377
+ rects: matchRects,
1378
+ pageNumber,
1379
+ startIndex: matchIndex
1380
+ });
1359
1381
  }
1360
- const mergedRects = mergeAdjacentRects(matchRects);
1361
- matches.push({
1362
- text: fullText.substring(matchIndex, matchIndex + query.length),
1363
- rects: mergedRects,
1364
- pageNumber,
1365
- startIndex: matchIndex
1366
- });
1367
1382
  startIndex = matchIndex + 1;
1368
1383
  }
1369
1384
  return matches;
@@ -1617,7 +1632,7 @@ function createSearchStore(initialOverrides = {}) {
1617
1632
  }
1618
1633
  }
1619
1634
  const matchText = pageText.substring(startIndex, startIndex + query.length);
1620
- const rects = calculateMatchRects(textItems, startIndex, query.length, viewport);
1635
+ const rects = calculateMatchRects2(textItems, startIndex, query.length, viewport);
1621
1636
  results.push({
1622
1637
  pageNumber: pageNum,
1623
1638
  matchIndex: matchIndex++,
@@ -1678,7 +1693,7 @@ function createSearchStore(initialOverrides = {}) {
1678
1693
  }
1679
1694
  }));
1680
1695
  }
1681
- function calculateMatchRects(textItems, startOffset, length, viewport) {
1696
+ function calculateMatchRects2(textItems, startOffset, length, viewport) {
1682
1697
  const rects = [];
1683
1698
  let currentOffset = 0;
1684
1699
  for (const item of textItems) {
@@ -1693,9 +1708,10 @@ function calculateMatchRects(textItems, startOffset, length, viewport) {
1693
1708
  const matchEndInItem = Math.min(item.text.length, startOffset + length - itemStart);
1694
1709
  const matchWidth = item.width / item.text.length * (matchEndInItem - matchStartInItem);
1695
1710
  const matchX = x + item.width / item.text.length * matchStartInItem;
1711
+ const yOffset = height * 0.15;
1696
1712
  rects.push({
1697
1713
  x: matchX,
1698
- y: y - height,
1714
+ y: y - height + yOffset,
1699
1715
  width: matchWidth,
1700
1716
  height
1701
1717
  });
@@ -9267,24 +9283,33 @@ function getSrcIdentifier(src) {
9267
9283
  const last = Array.from(data.slice(-4)).map((b) => b.toString(16).padStart(2, "0")).join("");
9268
9284
  return `binary:${len}:${first}:${last}`;
9269
9285
  }
9270
- function mergeRects2(rects) {
9271
- if (rects.length === 0) return [];
9272
- const sorted = [...rects].sort((a, b) => a.y - b.y || a.x - b.x);
9273
- const merged = [];
9274
- let current = { ...sorted[0] };
9275
- for (let i = 1; i < sorted.length; i++) {
9276
- const rect = sorted[i];
9277
- if (Math.abs(rect.y - current.y) < 2 && rect.x <= current.x + current.width + 2) {
9278
- const newRight = Math.max(current.x + current.width, rect.x + rect.width);
9279
- current.width = newRight - current.x;
9280
- current.height = Math.max(current.height, rect.height);
9281
- } else {
9282
- merged.push(current);
9283
- current = { ...rect };
9286
+ function calculateMatchRects3(textItems, startOffset, length, viewport) {
9287
+ const rects = [];
9288
+ let currentOffset = 0;
9289
+ for (const item of textItems) {
9290
+ const itemStart = currentOffset;
9291
+ const itemEnd = currentOffset + item.text.length;
9292
+ if (itemEnd > startOffset && itemStart < startOffset + length) {
9293
+ const [, , c, d, tx, ty] = item.transform;
9294
+ const x = tx;
9295
+ const y = viewport.height - ty;
9296
+ const height = Math.sqrt(c * c + d * d);
9297
+ const matchStartInItem = Math.max(0, startOffset - itemStart);
9298
+ const matchEndInItem = Math.min(item.text.length, startOffset + length - itemStart);
9299
+ const charWidth = item.text.length > 0 ? item.width / item.text.length : item.width;
9300
+ const matchWidth = charWidth * (matchEndInItem - matchStartInItem);
9301
+ const matchX = x + charWidth * matchStartInItem;
9302
+ const yOffset = height * 0.15;
9303
+ rects.push({
9304
+ x: matchX,
9305
+ y: y - height + yOffset,
9306
+ width: matchWidth,
9307
+ height
9308
+ });
9284
9309
  }
9310
+ currentOffset = itemEnd;
9285
9311
  }
9286
- merged.push(current);
9287
- return merged;
9312
+ return rects;
9288
9313
  }
9289
9314
  var PDFViewerInner, PDFViewerInnerWithRef, PDFViewerClient;
9290
9315
  var init_PDFViewerClient = __esm({
@@ -9381,26 +9406,15 @@ var init_PDFViewerClient = __esm({
9381
9406
  const textContent = await page.getTextContent();
9382
9407
  const viewport = page.getViewport({ scale: 1 });
9383
9408
  let fullText = "";
9384
- const charPositions = [];
9409
+ const textItems = [];
9385
9410
  for (const item of textContent.items) {
9386
9411
  if ("str" in item && item.str) {
9387
- const tx = item.transform;
9388
- const x = tx[4];
9389
- const y = viewport.height - tx[5];
9390
- const width = item.width ?? 0;
9391
- const height = item.height ?? 12;
9392
- const charWidth = item.str.length > 0 ? width / item.str.length : width;
9393
- for (let i = 0; i < item.str.length; i++) {
9394
- charPositions.push({
9395
- char: item.str[i],
9396
- rect: {
9397
- x: x + i * charWidth,
9398
- y: y - height,
9399
- width: charWidth,
9400
- height
9401
- }
9402
- });
9403
- }
9412
+ textItems.push({
9413
+ text: item.str,
9414
+ transform: item.transform,
9415
+ width: item.width ?? 0,
9416
+ height: item.height ?? 12
9417
+ });
9404
9418
  fullText += item.str;
9405
9419
  }
9406
9420
  }
@@ -9409,18 +9423,16 @@ var init_PDFViewerClient = __esm({
9409
9423
  while (true) {
9410
9424
  const matchIndex = textToSearch.indexOf(searchText, startIndex);
9411
9425
  if (matchIndex === -1) break;
9412
- const matchRects = [];
9413
- for (let i = matchIndex; i < matchIndex + text.length && i < charPositions.length; i++) {
9414
- matchRects.push(charPositions[i].rect);
9426
+ const matchRects = calculateMatchRects3(textItems, matchIndex, text.length, viewport);
9427
+ if (matchRects.length > 0) {
9428
+ const highlight = annotationStore.getState().addHighlight({
9429
+ pageNumber: pageNum,
9430
+ rects: matchRects,
9431
+ color,
9432
+ text: fullText.substring(matchIndex, matchIndex + text.length)
9433
+ });
9434
+ highlightIds.push(highlight.id);
9415
9435
  }
9416
- const mergedRects = mergeRects2(matchRects);
9417
- const highlight = annotationStore.getState().addHighlight({
9418
- pageNumber: pageNum,
9419
- rects: mergedRects,
9420
- color,
9421
- text: fullText.substring(matchIndex, matchIndex + text.length)
9422
- });
9423
- highlightIds.push(highlight.id);
9424
9436
  startIndex = matchIndex + 1;
9425
9437
  }
9426
9438
  } catch {
@@ -9598,33 +9610,22 @@ var init_PDFViewerClient = __esm({
9598
9610
  const textContent = await page.getTextContent();
9599
9611
  const viewport = page.getViewport({ scale: 1 });
9600
9612
  let fullText = "";
9601
- const charPositions = [];
9613
+ const textItems = [];
9602
9614
  for (const item of textContent.items) {
9603
9615
  if ("str" in item && item.str) {
9604
- const tx = item.transform;
9605
- const x = tx[4];
9606
- const y = viewport.height - tx[5];
9607
- const width = item.width ?? 0;
9608
- const height = item.height ?? 12;
9609
- const charWidth = item.str.length > 0 ? width / item.str.length : width;
9610
- for (let i = 0; i < item.str.length; i++) {
9611
- charPositions.push({
9612
- char: item.str[i],
9613
- rect: {
9614
- x: x + i * charWidth,
9615
- y: y - height,
9616
- width: charWidth,
9617
- height
9618
- }
9619
- });
9620
- }
9616
+ textItems.push({
9617
+ text: item.str,
9618
+ transform: item.transform,
9619
+ width: item.width ?? 0,
9620
+ height: item.height ?? 12
9621
+ });
9621
9622
  fullText += item.str;
9622
9623
  }
9623
9624
  }
9624
9625
  const textToSearch = caseSensitive ? fullText : fullText.toLowerCase();
9625
9626
  let startIndex = 0;
9626
9627
  while (true) {
9627
- let matchIndex = textToSearch.indexOf(searchText, startIndex);
9628
+ const matchIndex = textToSearch.indexOf(searchText, startIndex);
9628
9629
  if (matchIndex === -1) break;
9629
9630
  if (wholeWord) {
9630
9631
  const beforeChar = matchIndex > 0 ? textToSearch[matchIndex - 1] : " ";
@@ -9634,26 +9635,24 @@ var init_PDFViewerClient = __esm({
9634
9635
  continue;
9635
9636
  }
9636
9637
  }
9637
- const matchRects = [];
9638
- for (let i = matchIndex; i < matchIndex + query.length && i < charPositions.length; i++) {
9639
- matchRects.push(charPositions[i].rect);
9638
+ const matchRects = calculateMatchRects3(textItems, matchIndex, query.length, viewport);
9639
+ if (matchRects.length > 0) {
9640
+ const highlight = annotationStore.getState().addHighlight({
9641
+ pageNumber: pageNum,
9642
+ rects: matchRects,
9643
+ color,
9644
+ text: fullText.substring(matchIndex, matchIndex + query.length),
9645
+ source: "search"
9646
+ });
9647
+ result.matchCount++;
9648
+ result.highlightIds.push(highlight.id);
9649
+ result.matches.push({
9650
+ pageNumber: pageNum,
9651
+ text: fullText.substring(matchIndex, matchIndex + query.length),
9652
+ highlightId: highlight.id,
9653
+ rects: matchRects
9654
+ });
9640
9655
  }
9641
- const mergedRects = mergeRects2(matchRects);
9642
- const highlight = annotationStore.getState().addHighlight({
9643
- pageNumber: pageNum,
9644
- rects: mergedRects,
9645
- color,
9646
- text: fullText.substring(matchIndex, matchIndex + query.length),
9647
- source: "search"
9648
- });
9649
- result.matchCount++;
9650
- result.highlightIds.push(highlight.id);
9651
- result.matches.push({
9652
- pageNumber: pageNum,
9653
- text: fullText.substring(matchIndex, matchIndex + query.length),
9654
- highlightId: highlight.id,
9655
- rects: mergedRects
9656
- });
9657
9656
  startIndex = matchIndex + 1;
9658
9657
  }
9659
9658
  } catch {