pdfjs-reader-core 0.2.0 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +115 -116
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +14 -2
- package/dist/index.d.ts +14 -2
- package/dist/index.js +115 -116
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.d.cts
CHANGED
|
@@ -2431,12 +2431,24 @@ interface CharPosition {
|
|
|
2431
2431
|
char: string;
|
|
2432
2432
|
rect: HighlightRect;
|
|
2433
2433
|
}
|
|
2434
|
+
/** Internal text item representation */
|
|
2435
|
+
interface TextItem {
|
|
2436
|
+
text: string;
|
|
2437
|
+
transform: number[];
|
|
2438
|
+
width: number;
|
|
2439
|
+
height: number;
|
|
2440
|
+
}
|
|
2434
2441
|
/**
|
|
2435
|
-
* Extract text content with
|
|
2442
|
+
* Extract text content with text items from a PDF page.
|
|
2443
|
+
* Uses proper text item tracking for accurate positioning.
|
|
2436
2444
|
*/
|
|
2437
2445
|
declare function extractPageText(document: PDFDocumentProxy, pageNumber: number): Promise<{
|
|
2438
2446
|
fullText: string;
|
|
2439
|
-
|
|
2447
|
+
textItems: TextItem[];
|
|
2448
|
+
viewport: {
|
|
2449
|
+
width: number;
|
|
2450
|
+
height: number;
|
|
2451
|
+
};
|
|
2440
2452
|
}>;
|
|
2441
2453
|
/**
|
|
2442
2454
|
* Find all occurrences of text on a specific page.
|
package/dist/index.d.ts
CHANGED
|
@@ -2431,12 +2431,24 @@ interface CharPosition {
|
|
|
2431
2431
|
char: string;
|
|
2432
2432
|
rect: HighlightRect;
|
|
2433
2433
|
}
|
|
2434
|
+
/** Internal text item representation */
|
|
2435
|
+
interface TextItem {
|
|
2436
|
+
text: string;
|
|
2437
|
+
transform: number[];
|
|
2438
|
+
width: number;
|
|
2439
|
+
height: number;
|
|
2440
|
+
}
|
|
2434
2441
|
/**
|
|
2435
|
-
* Extract text content with
|
|
2442
|
+
* Extract text content with text items from a PDF page.
|
|
2443
|
+
* Uses proper text item tracking for accurate positioning.
|
|
2436
2444
|
*/
|
|
2437
2445
|
declare function extractPageText(document: PDFDocumentProxy, pageNumber: number): Promise<{
|
|
2438
2446
|
fullText: string;
|
|
2439
|
-
|
|
2447
|
+
textItems: TextItem[];
|
|
2448
|
+
viewport: {
|
|
2449
|
+
width: number;
|
|
2450
|
+
height: number;
|
|
2451
|
+
};
|
|
2440
2452
|
}>;
|
|
2441
2453
|
/**
|
|
2442
2454
|
* Find all occurrences of text on a specific page.
|
package/dist/index.js
CHANGED
|
@@ -1307,37 +1307,54 @@ async function extractPageText(document2, pageNumber) {
|
|
|
1307
1307
|
const textContent = await page.getTextContent();
|
|
1308
1308
|
const viewport = page.getViewport({ scale: 1 });
|
|
1309
1309
|
let fullText = "";
|
|
1310
|
-
const
|
|
1310
|
+
const textItems = [];
|
|
1311
1311
|
for (const item of textContent.items) {
|
|
1312
1312
|
if ("str" in item && item.str) {
|
|
1313
|
-
|
|
1314
|
-
|
|
1315
|
-
|
|
1316
|
-
|
|
1317
|
-
|
|
1318
|
-
|
|
1319
|
-
for (let i = 0; i < item.str.length; i++) {
|
|
1320
|
-
charPositions.push({
|
|
1321
|
-
char: item.str[i],
|
|
1322
|
-
rect: {
|
|
1323
|
-
x: x + i * charWidth,
|
|
1324
|
-
y: y - height,
|
|
1325
|
-
width: charWidth,
|
|
1326
|
-
height
|
|
1327
|
-
}
|
|
1328
|
-
});
|
|
1329
|
-
}
|
|
1313
|
+
textItems.push({
|
|
1314
|
+
text: item.str,
|
|
1315
|
+
transform: item.transform,
|
|
1316
|
+
width: item.width ?? 0,
|
|
1317
|
+
height: item.height ?? 12
|
|
1318
|
+
});
|
|
1330
1319
|
fullText += item.str;
|
|
1331
1320
|
}
|
|
1332
1321
|
}
|
|
1333
|
-
return { fullText,
|
|
1322
|
+
return { fullText, textItems, viewport };
|
|
1323
|
+
}
|
|
1324
|
+
function calculateMatchRects(textItems, startOffset, length, viewport) {
|
|
1325
|
+
const rects = [];
|
|
1326
|
+
let currentOffset = 0;
|
|
1327
|
+
for (const item of textItems) {
|
|
1328
|
+
const itemStart = currentOffset;
|
|
1329
|
+
const itemEnd = currentOffset + item.text.length;
|
|
1330
|
+
if (itemEnd > startOffset && itemStart < startOffset + length) {
|
|
1331
|
+
const [, , c, d, tx, ty] = item.transform;
|
|
1332
|
+
const x = tx;
|
|
1333
|
+
const y = viewport.height - ty;
|
|
1334
|
+
const height = Math.sqrt(c * c + d * d);
|
|
1335
|
+
const matchStartInItem = Math.max(0, startOffset - itemStart);
|
|
1336
|
+
const matchEndInItem = Math.min(item.text.length, startOffset + length - itemStart);
|
|
1337
|
+
const charWidth = item.text.length > 0 ? item.width / item.text.length : item.width;
|
|
1338
|
+
const matchWidth = charWidth * (matchEndInItem - matchStartInItem);
|
|
1339
|
+
const matchX = x + charWidth * matchStartInItem;
|
|
1340
|
+
const yOffset = height * 0.15;
|
|
1341
|
+
rects.push({
|
|
1342
|
+
x: matchX,
|
|
1343
|
+
y: y - height + yOffset,
|
|
1344
|
+
width: matchWidth,
|
|
1345
|
+
height
|
|
1346
|
+
});
|
|
1347
|
+
}
|
|
1348
|
+
currentOffset = itemEnd;
|
|
1349
|
+
}
|
|
1350
|
+
return rects;
|
|
1334
1351
|
}
|
|
1335
1352
|
async function findTextOnPage(document2, pageNumber, query, options = {}) {
|
|
1336
1353
|
const { caseSensitive = false, wholeWord = false } = options;
|
|
1337
1354
|
if (!query || pageNumber < 1 || pageNumber > document2.numPages) {
|
|
1338
1355
|
return [];
|
|
1339
1356
|
}
|
|
1340
|
-
const { fullText,
|
|
1357
|
+
const { fullText, textItems, viewport } = await extractPageText(document2, pageNumber);
|
|
1341
1358
|
const matches = [];
|
|
1342
1359
|
const searchText = caseSensitive ? query : query.toLowerCase();
|
|
1343
1360
|
const textToSearch = caseSensitive ? fullText : fullText.toLowerCase();
|
|
@@ -1353,17 +1370,15 @@ async function findTextOnPage(document2, pageNumber, query, options = {}) {
|
|
|
1353
1370
|
continue;
|
|
1354
1371
|
}
|
|
1355
1372
|
}
|
|
1356
|
-
const matchRects =
|
|
1357
|
-
|
|
1358
|
-
|
|
1373
|
+
const matchRects = calculateMatchRects(textItems, matchIndex, query.length, viewport);
|
|
1374
|
+
if (matchRects.length > 0) {
|
|
1375
|
+
matches.push({
|
|
1376
|
+
text: fullText.substring(matchIndex, matchIndex + query.length),
|
|
1377
|
+
rects: matchRects,
|
|
1378
|
+
pageNumber,
|
|
1379
|
+
startIndex: matchIndex
|
|
1380
|
+
});
|
|
1359
1381
|
}
|
|
1360
|
-
const mergedRects = mergeAdjacentRects(matchRects);
|
|
1361
|
-
matches.push({
|
|
1362
|
-
text: fullText.substring(matchIndex, matchIndex + query.length),
|
|
1363
|
-
rects: mergedRects,
|
|
1364
|
-
pageNumber,
|
|
1365
|
-
startIndex: matchIndex
|
|
1366
|
-
});
|
|
1367
1382
|
startIndex = matchIndex + 1;
|
|
1368
1383
|
}
|
|
1369
1384
|
return matches;
|
|
@@ -1617,7 +1632,7 @@ function createSearchStore(initialOverrides = {}) {
|
|
|
1617
1632
|
}
|
|
1618
1633
|
}
|
|
1619
1634
|
const matchText = pageText.substring(startIndex, startIndex + query.length);
|
|
1620
|
-
const rects =
|
|
1635
|
+
const rects = calculateMatchRects2(textItems, startIndex, query.length, viewport);
|
|
1621
1636
|
results.push({
|
|
1622
1637
|
pageNumber: pageNum,
|
|
1623
1638
|
matchIndex: matchIndex++,
|
|
@@ -1678,7 +1693,7 @@ function createSearchStore(initialOverrides = {}) {
|
|
|
1678
1693
|
}
|
|
1679
1694
|
}));
|
|
1680
1695
|
}
|
|
1681
|
-
function
|
|
1696
|
+
function calculateMatchRects2(textItems, startOffset, length, viewport) {
|
|
1682
1697
|
const rects = [];
|
|
1683
1698
|
let currentOffset = 0;
|
|
1684
1699
|
for (const item of textItems) {
|
|
@@ -1693,9 +1708,10 @@ function calculateMatchRects(textItems, startOffset, length, viewport) {
|
|
|
1693
1708
|
const matchEndInItem = Math.min(item.text.length, startOffset + length - itemStart);
|
|
1694
1709
|
const matchWidth = item.width / item.text.length * (matchEndInItem - matchStartInItem);
|
|
1695
1710
|
const matchX = x + item.width / item.text.length * matchStartInItem;
|
|
1711
|
+
const yOffset = height * 0.15;
|
|
1696
1712
|
rects.push({
|
|
1697
1713
|
x: matchX,
|
|
1698
|
-
y: y - height,
|
|
1714
|
+
y: y - height + yOffset,
|
|
1699
1715
|
width: matchWidth,
|
|
1700
1716
|
height
|
|
1701
1717
|
});
|
|
@@ -9267,24 +9283,33 @@ function getSrcIdentifier(src) {
|
|
|
9267
9283
|
const last = Array.from(data.slice(-4)).map((b) => b.toString(16).padStart(2, "0")).join("");
|
|
9268
9284
|
return `binary:${len}:${first}:${last}`;
|
|
9269
9285
|
}
|
|
9270
|
-
function
|
|
9271
|
-
|
|
9272
|
-
|
|
9273
|
-
const
|
|
9274
|
-
|
|
9275
|
-
|
|
9276
|
-
|
|
9277
|
-
|
|
9278
|
-
const
|
|
9279
|
-
|
|
9280
|
-
|
|
9281
|
-
|
|
9282
|
-
|
|
9283
|
-
|
|
9286
|
+
function calculateMatchRects3(textItems, startOffset, length, viewport) {
|
|
9287
|
+
const rects = [];
|
|
9288
|
+
let currentOffset = 0;
|
|
9289
|
+
for (const item of textItems) {
|
|
9290
|
+
const itemStart = currentOffset;
|
|
9291
|
+
const itemEnd = currentOffset + item.text.length;
|
|
9292
|
+
if (itemEnd > startOffset && itemStart < startOffset + length) {
|
|
9293
|
+
const [, , c, d, tx, ty] = item.transform;
|
|
9294
|
+
const x = tx;
|
|
9295
|
+
const y = viewport.height - ty;
|
|
9296
|
+
const height = Math.sqrt(c * c + d * d);
|
|
9297
|
+
const matchStartInItem = Math.max(0, startOffset - itemStart);
|
|
9298
|
+
const matchEndInItem = Math.min(item.text.length, startOffset + length - itemStart);
|
|
9299
|
+
const charWidth = item.text.length > 0 ? item.width / item.text.length : item.width;
|
|
9300
|
+
const matchWidth = charWidth * (matchEndInItem - matchStartInItem);
|
|
9301
|
+
const matchX = x + charWidth * matchStartInItem;
|
|
9302
|
+
const yOffset = height * 0.15;
|
|
9303
|
+
rects.push({
|
|
9304
|
+
x: matchX,
|
|
9305
|
+
y: y - height + yOffset,
|
|
9306
|
+
width: matchWidth,
|
|
9307
|
+
height
|
|
9308
|
+
});
|
|
9284
9309
|
}
|
|
9310
|
+
currentOffset = itemEnd;
|
|
9285
9311
|
}
|
|
9286
|
-
|
|
9287
|
-
return merged;
|
|
9312
|
+
return rects;
|
|
9288
9313
|
}
|
|
9289
9314
|
var PDFViewerInner, PDFViewerInnerWithRef, PDFViewerClient;
|
|
9290
9315
|
var init_PDFViewerClient = __esm({
|
|
@@ -9381,26 +9406,15 @@ var init_PDFViewerClient = __esm({
|
|
|
9381
9406
|
const textContent = await page.getTextContent();
|
|
9382
9407
|
const viewport = page.getViewport({ scale: 1 });
|
|
9383
9408
|
let fullText = "";
|
|
9384
|
-
const
|
|
9409
|
+
const textItems = [];
|
|
9385
9410
|
for (const item of textContent.items) {
|
|
9386
9411
|
if ("str" in item && item.str) {
|
|
9387
|
-
|
|
9388
|
-
|
|
9389
|
-
|
|
9390
|
-
|
|
9391
|
-
|
|
9392
|
-
|
|
9393
|
-
for (let i = 0; i < item.str.length; i++) {
|
|
9394
|
-
charPositions.push({
|
|
9395
|
-
char: item.str[i],
|
|
9396
|
-
rect: {
|
|
9397
|
-
x: x + i * charWidth,
|
|
9398
|
-
y: y - height,
|
|
9399
|
-
width: charWidth,
|
|
9400
|
-
height
|
|
9401
|
-
}
|
|
9402
|
-
});
|
|
9403
|
-
}
|
|
9412
|
+
textItems.push({
|
|
9413
|
+
text: item.str,
|
|
9414
|
+
transform: item.transform,
|
|
9415
|
+
width: item.width ?? 0,
|
|
9416
|
+
height: item.height ?? 12
|
|
9417
|
+
});
|
|
9404
9418
|
fullText += item.str;
|
|
9405
9419
|
}
|
|
9406
9420
|
}
|
|
@@ -9409,18 +9423,16 @@ var init_PDFViewerClient = __esm({
|
|
|
9409
9423
|
while (true) {
|
|
9410
9424
|
const matchIndex = textToSearch.indexOf(searchText, startIndex);
|
|
9411
9425
|
if (matchIndex === -1) break;
|
|
9412
|
-
const matchRects =
|
|
9413
|
-
|
|
9414
|
-
|
|
9426
|
+
const matchRects = calculateMatchRects3(textItems, matchIndex, text.length, viewport);
|
|
9427
|
+
if (matchRects.length > 0) {
|
|
9428
|
+
const highlight = annotationStore.getState().addHighlight({
|
|
9429
|
+
pageNumber: pageNum,
|
|
9430
|
+
rects: matchRects,
|
|
9431
|
+
color,
|
|
9432
|
+
text: fullText.substring(matchIndex, matchIndex + text.length)
|
|
9433
|
+
});
|
|
9434
|
+
highlightIds.push(highlight.id);
|
|
9415
9435
|
}
|
|
9416
|
-
const mergedRects = mergeRects2(matchRects);
|
|
9417
|
-
const highlight = annotationStore.getState().addHighlight({
|
|
9418
|
-
pageNumber: pageNum,
|
|
9419
|
-
rects: mergedRects,
|
|
9420
|
-
color,
|
|
9421
|
-
text: fullText.substring(matchIndex, matchIndex + text.length)
|
|
9422
|
-
});
|
|
9423
|
-
highlightIds.push(highlight.id);
|
|
9424
9436
|
startIndex = matchIndex + 1;
|
|
9425
9437
|
}
|
|
9426
9438
|
} catch {
|
|
@@ -9598,33 +9610,22 @@ var init_PDFViewerClient = __esm({
|
|
|
9598
9610
|
const textContent = await page.getTextContent();
|
|
9599
9611
|
const viewport = page.getViewport({ scale: 1 });
|
|
9600
9612
|
let fullText = "";
|
|
9601
|
-
const
|
|
9613
|
+
const textItems = [];
|
|
9602
9614
|
for (const item of textContent.items) {
|
|
9603
9615
|
if ("str" in item && item.str) {
|
|
9604
|
-
|
|
9605
|
-
|
|
9606
|
-
|
|
9607
|
-
|
|
9608
|
-
|
|
9609
|
-
|
|
9610
|
-
for (let i = 0; i < item.str.length; i++) {
|
|
9611
|
-
charPositions.push({
|
|
9612
|
-
char: item.str[i],
|
|
9613
|
-
rect: {
|
|
9614
|
-
x: x + i * charWidth,
|
|
9615
|
-
y: y - height,
|
|
9616
|
-
width: charWidth,
|
|
9617
|
-
height
|
|
9618
|
-
}
|
|
9619
|
-
});
|
|
9620
|
-
}
|
|
9616
|
+
textItems.push({
|
|
9617
|
+
text: item.str,
|
|
9618
|
+
transform: item.transform,
|
|
9619
|
+
width: item.width ?? 0,
|
|
9620
|
+
height: item.height ?? 12
|
|
9621
|
+
});
|
|
9621
9622
|
fullText += item.str;
|
|
9622
9623
|
}
|
|
9623
9624
|
}
|
|
9624
9625
|
const textToSearch = caseSensitive ? fullText : fullText.toLowerCase();
|
|
9625
9626
|
let startIndex = 0;
|
|
9626
9627
|
while (true) {
|
|
9627
|
-
|
|
9628
|
+
const matchIndex = textToSearch.indexOf(searchText, startIndex);
|
|
9628
9629
|
if (matchIndex === -1) break;
|
|
9629
9630
|
if (wholeWord) {
|
|
9630
9631
|
const beforeChar = matchIndex > 0 ? textToSearch[matchIndex - 1] : " ";
|
|
@@ -9634,26 +9635,24 @@ var init_PDFViewerClient = __esm({
|
|
|
9634
9635
|
continue;
|
|
9635
9636
|
}
|
|
9636
9637
|
}
|
|
9637
|
-
const matchRects =
|
|
9638
|
-
|
|
9639
|
-
|
|
9638
|
+
const matchRects = calculateMatchRects3(textItems, matchIndex, query.length, viewport);
|
|
9639
|
+
if (matchRects.length > 0) {
|
|
9640
|
+
const highlight = annotationStore.getState().addHighlight({
|
|
9641
|
+
pageNumber: pageNum,
|
|
9642
|
+
rects: matchRects,
|
|
9643
|
+
color,
|
|
9644
|
+
text: fullText.substring(matchIndex, matchIndex + query.length),
|
|
9645
|
+
source: "search"
|
|
9646
|
+
});
|
|
9647
|
+
result.matchCount++;
|
|
9648
|
+
result.highlightIds.push(highlight.id);
|
|
9649
|
+
result.matches.push({
|
|
9650
|
+
pageNumber: pageNum,
|
|
9651
|
+
text: fullText.substring(matchIndex, matchIndex + query.length),
|
|
9652
|
+
highlightId: highlight.id,
|
|
9653
|
+
rects: matchRects
|
|
9654
|
+
});
|
|
9640
9655
|
}
|
|
9641
|
-
const mergedRects = mergeRects2(matchRects);
|
|
9642
|
-
const highlight = annotationStore.getState().addHighlight({
|
|
9643
|
-
pageNumber: pageNum,
|
|
9644
|
-
rects: mergedRects,
|
|
9645
|
-
color,
|
|
9646
|
-
text: fullText.substring(matchIndex, matchIndex + query.length),
|
|
9647
|
-
source: "search"
|
|
9648
|
-
});
|
|
9649
|
-
result.matchCount++;
|
|
9650
|
-
result.highlightIds.push(highlight.id);
|
|
9651
|
-
result.matches.push({
|
|
9652
|
-
pageNumber: pageNum,
|
|
9653
|
-
text: fullText.substring(matchIndex, matchIndex + query.length),
|
|
9654
|
-
highlightId: highlight.id,
|
|
9655
|
-
rects: mergedRects
|
|
9656
|
-
});
|
|
9657
9656
|
startIndex = matchIndex + 1;
|
|
9658
9657
|
}
|
|
9659
9658
|
} catch {
|