pdf-diff-viewer 1.0.0 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +23 -0
- package/README.md +27 -0
- package/package.json +1 -1
- package/public/app.js +159 -7
- package/public/index.html +1 -1
- package/src/PDFDiffViewer.js +180 -20
package/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,28 @@
|
|
|
1
1
|
# Changelog
|
|
2
2
|
|
|
3
|
+
## [1.1.0] - 2026-01-20
|
|
4
|
+
|
|
5
|
+
### Added
|
|
6
|
+
- **Smart Alignment**: Text-based page matching for handling content reflow
|
|
7
|
+
- Support for comparing PDFs with different page counts
|
|
8
|
+
- Jaccard similarity algorithm for intelligent page matching
|
|
9
|
+
- `smartAlignment` option (default: true)
|
|
10
|
+
- `alignmentTolerance` option to control search range
|
|
11
|
+
- `similarityThreshold` option for minimum match quality
|
|
12
|
+
- Page mapping information in comparison results
|
|
13
|
+
- Similarity scores displayed in UI for matched pages
|
|
14
|
+
|
|
15
|
+
### Changed
|
|
16
|
+
- `compare()` method now handles different page counts gracefully
|
|
17
|
+
- Page comparison results now include `pageNumA` and `pageNumB` fields
|
|
18
|
+
- Enhanced error messages for page count mismatches
|
|
19
|
+
|
|
20
|
+
### Features
|
|
21
|
+
- Automatically finds best-matching pages based on text content
|
|
22
|
+
- Handles scenarios where text additions push content to new pages
|
|
23
|
+
- Displays page mappings (e.g., "Page 1 ↔ Page 2") in UI
|
|
24
|
+
- Shows content similarity percentage for cross-page comparisons
|
|
25
|
+
|
|
3
26
|
## [1.0.0] - 2026-01-19
|
|
4
27
|
|
|
5
28
|
### Added
|
package/README.md
CHANGED
|
@@ -142,6 +142,9 @@ new PDFDiffViewer(container, options)
|
|
|
142
142
|
- `showPageNumbers` (boolean) - Show page numbers, default: true
|
|
143
143
|
- `cropRegions` (Array) - Regions to crop: `[{ page: 1, x, y, width, height }]`
|
|
144
144
|
- `maskRegions` (Array) - Regions to mask/ignore: `[{ page: 1, x, y, width, height }]`
|
|
145
|
+
- **`smartAlignment` (boolean)** - Enable text-based page alignment for content reflow; it is applied only when the two PDFs have different page counts, default: true
|
|
146
|
+
- **`alignmentTolerance` (number)** - Search range for matching pages (+/- pages), default: 2
|
|
147
|
+
- **`similarityThreshold` (number)** - Minimum text similarity (0-1) for page matching, default: 0.3
|
|
145
148
|
|
|
146
149
|
### Methods
|
|
147
150
|
|
|
@@ -157,6 +160,7 @@ Compare two PDFs and render results.
|
|
|
157
160
|
- `totalPages` - Number of pages compared
|
|
158
161
|
- `totalDiffPixels` - Total different pixels across all pages
|
|
159
162
|
- `pageResults` - Array of per-page results
|
|
163
|
+
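- `pageMapping` - Array of `{ pageA, pageB, similarity }` entries describing which pages were compared (text-matched pairs when smart alignment runs, otherwise a direct 1-to-1 mapping)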
|
|
160
164
|
|
|
161
165
|
#### `getResults()`
|
|
162
166
|
|
|
@@ -244,6 +248,29 @@ const viewer = new PDFDiffViewer('#container', {
|
|
|
244
248
|
});
|
|
245
249
|
```
|
|
246
250
|
|
|
251
|
+
### With Smart Alignment (Content Reflow Handling)
|
|
252
|
+
|
|
253
|
+
```javascript
|
|
254
|
+
// Handles cases where content shifts across pages
|
|
255
|
+
// (e.g., adding text pushes content to next page)
|
|
256
|
+
const viewer = new PDFDiffViewer('#container', {
|
|
257
|
+
smartAlignment: true, // Enable intelligent page matching
|
|
258
|
+
alignmentTolerance: 2, // Search +/- 2 pages for matches
|
|
259
|
+
similarityThreshold: 0.3 // Require 30% content similarity
|
|
260
|
+
});
|
|
261
|
+
|
|
262
|
+
const results = await viewer.compare(pdfA, pdfB);
|
|
263
|
+
console.log('Page mappings:', results.pageMapping);
|
|
264
|
+
// Output: [{ pageA: 1, pageB: 1, similarity: 0.95 },
|
|
265
|
+
// { pageA: 2, pageB: 3, similarity: 0.87 }, ...]
|
|
266
|
+
```
|
|
267
|
+
|
|
268
|
+
**How it works:**
|
|
269
|
+
- Extracts text from all pages in both documents
|
|
270
|
+
- Uses Jaccard word-set similarity, blended with a text-length ratio, to find the best-matching page within the `alignmentTolerance` window
|
|
271
|
+
- Handles different page counts gracefully
|
|
272
|
+
- Shows similarity scores in the UI
|
|
273
|
+
|
|
247
274
|
### With Mask Regions (Ignore Dynamic Content)
|
|
248
275
|
|
|
249
276
|
```javascript
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "pdf-diff-viewer",
|
|
3
|
-
"version": "1.0.0",
|
|
3
|
+
"version": "1.1.0",
|
|
4
4
|
"description": "Browser-based PDF comparison tool with visual diff highlighting. Zero system dependencies, pure JavaScript, client-side processing.",
|
|
5
5
|
"main": "src/PDFDiffViewer.js",
|
|
6
6
|
"type": "module",
|
package/public/app.js
CHANGED
|
@@ -431,7 +431,64 @@ angular.module("pdfDiffApp", [])
|
|
|
431
431
|
return overlay;
|
|
432
432
|
}
|
|
433
433
|
|
|
434
|
+
// Store PDFs for cleanup
|
|
435
|
+
let currentPdfA = null;
|
|
436
|
+
let currentPdfB = null;
|
|
437
|
+
|
|
438
|
+
// Cleanup function to clear all resources
|
|
439
|
+
function cleanup() {
|
|
440
|
+
// Clear canvases
|
|
441
|
+
const canvasA = document.getElementById("canvasA");
|
|
442
|
+
const canvasB = document.getElementById("canvasB");
|
|
443
|
+
const canvasDiff = document.getElementById("canvasDiff");
|
|
444
|
+
|
|
445
|
+
if (canvasA) {
|
|
446
|
+
const ctxA = canvasA.getContext("2d");
|
|
447
|
+
ctxA.clearRect(0, 0, canvasA.width, canvasA.height);
|
|
448
|
+
canvasA.width = 0;
|
|
449
|
+
canvasA.height = 0;
|
|
450
|
+
}
|
|
451
|
+
|
|
452
|
+
if (canvasB) {
|
|
453
|
+
const ctxB = canvasB.getContext("2d");
|
|
454
|
+
ctxB.clearRect(0, 0, canvasB.width, canvasB.height);
|
|
455
|
+
canvasB.width = 0;
|
|
456
|
+
canvasB.height = 0;
|
|
457
|
+
}
|
|
458
|
+
|
|
459
|
+
if (canvasDiff) {
|
|
460
|
+
const ctxDiff = canvasDiff.getContext("2d");
|
|
461
|
+
ctxDiff.clearRect(0, 0, canvasDiff.width, canvasDiff.height);
|
|
462
|
+
canvasDiff.width = 0;
|
|
463
|
+
canvasDiff.height = 0;
|
|
464
|
+
}
|
|
465
|
+
|
|
466
|
+
// Clear results div
|
|
467
|
+
const resultsDiv = document.getElementById("results");
|
|
468
|
+
if (resultsDiv) {
|
|
469
|
+
resultsDiv.innerHTML = "";
|
|
470
|
+
}
|
|
471
|
+
|
|
472
|
+
// Destroy PDF documents to free memory
|
|
473
|
+
if (currentPdfA) {
|
|
474
|
+
currentPdfA.destroy();
|
|
475
|
+
currentPdfA = null;
|
|
476
|
+
}
|
|
477
|
+
if (currentPdfB) {
|
|
478
|
+
currentPdfB.destroy();
|
|
479
|
+
currentPdfB = null;
|
|
480
|
+
}
|
|
481
|
+
|
|
482
|
+
// Force garbage collection hint
|
|
483
|
+
if (window.gc) {
|
|
484
|
+
window.gc();
|
|
485
|
+
}
|
|
486
|
+
}
|
|
487
|
+
|
|
434
488
|
$scope.compare = async function () {
|
|
489
|
+
// Clear previous results and free memory
|
|
490
|
+
cleanup();
|
|
491
|
+
|
|
435
492
|
const LABEL_A = "Document A";
|
|
436
493
|
const LABEL_B = "Document B";
|
|
437
494
|
const fileA = document.getElementById("fileA").files[0];
|
|
@@ -457,12 +514,17 @@ angular.module("pdfDiffApp", [])
|
|
|
457
514
|
const arrayBufferA = await fileA.arrayBuffer();
|
|
458
515
|
const arrayBufferB = await fileB.arrayBuffer();
|
|
459
516
|
|
|
460
|
-
|
|
461
|
-
|
|
517
|
+
currentPdfA = await pdfjsLib.getDocument({ data: arrayBufferA }).promise;
|
|
518
|
+
currentPdfB = await pdfjsLib.getDocument({ data: arrayBufferB }).promise;
|
|
519
|
+
const pdfA = currentPdfA;
|
|
520
|
+
const pdfB = currentPdfB;
|
|
462
521
|
|
|
522
|
+
const maxPages = Math.max(pdfA.numPages, pdfB.numPages);
|
|
523
|
+
|
|
463
524
|
if (pdfA.numPages !== pdfB.numPages) {
|
|
464
|
-
|
|
465
|
-
|
|
525
|
+
const message = `Page count differs: ${LABEL_A} has ${pdfA.numPages} page(s), ${LABEL_B} has ${pdfB.numPages} page(s). Showing all ${maxPages} page(s).`;
|
|
526
|
+
console.warn(message);
|
|
527
|
+
// alert(message);
|
|
466
528
|
}
|
|
467
529
|
|
|
468
530
|
const canvasA = document.getElementById("canvasA");
|
|
@@ -470,14 +532,86 @@ angular.module("pdfDiffApp", [])
|
|
|
470
532
|
const canvasDiff = document.getElementById("canvasDiff");
|
|
471
533
|
|
|
472
534
|
const resultsDiv = document.getElementById("results");
|
|
473
|
-
resultsDiv.innerHTML = "";
|
|
474
|
-
// resultsDiv.innerHTML = "<h3>Diff on A vs Diff on B</h3>";
|
|
535
|
+
// resultsDiv already cleared in cleanup()
|
|
475
536
|
|
|
476
537
|
// const zip = new JSZip();
|
|
477
538
|
let totalDiffPixels = 0;
|
|
478
539
|
|
|
479
|
-
for (let i = 1; i <=
|
|
540
|
+
for (let i = 1; i <= maxPages; i++) {
|
|
541
|
+
|
|
542
|
+
const hasPageA = i <= pdfA.numPages;
|
|
543
|
+
const hasPageB = i <= pdfB.numPages;
|
|
544
|
+
|
|
545
|
+
// Handle pages that only exist in one document
|
|
546
|
+
if (!hasPageA || !hasPageB) {
|
|
547
|
+
const title = document.createElement("h4");
|
|
548
|
+
title.innerText = `Page ${i}`;
|
|
549
|
+
|
|
550
|
+
const row = document.createElement("div");
|
|
551
|
+
row.style.display = "grid";
|
|
552
|
+
row.style.gridTemplateColumns = "1fr 1fr";
|
|
553
|
+
row.style.gap = "15px";
|
|
554
|
+
row.style.marginBottom = "25px";
|
|
555
|
+
row.style.borderTop = "2px solid #ddd";
|
|
556
|
+
row.style.paddingTop = "15px";
|
|
557
|
+
|
|
558
|
+
function makeEmptyCol(labelText) {
|
|
559
|
+
const col = document.createElement("div");
|
|
560
|
+
const label = document.createElement("div");
|
|
561
|
+
label.innerHTML = `<b>${labelText}</b>`;
|
|
562
|
+
|
|
563
|
+
const placeholder = document.createElement("div");
|
|
564
|
+
placeholder.style.padding = "40px";
|
|
565
|
+
placeholder.style.textAlign = "center";
|
|
566
|
+
placeholder.style.backgroundColor = "#f5f5f5";
|
|
567
|
+
placeholder.style.border = "1px solid #ccc";
|
|
568
|
+
placeholder.style.color = "#999";
|
|
569
|
+
placeholder.innerText = "No corresponding page";
|
|
570
|
+
|
|
571
|
+
col.appendChild(label);
|
|
572
|
+
col.appendChild(placeholder);
|
|
573
|
+
return col;
|
|
574
|
+
}
|
|
575
|
+
|
|
576
|
+
function makePageCol(labelText, canvas) {
|
|
577
|
+
const col = document.createElement("div");
|
|
578
|
+
const label = document.createElement("div");
|
|
579
|
+
label.innerHTML = `<b>${labelText}</b>`;
|
|
580
|
+
|
|
581
|
+
const img = document.createElement("img");
|
|
582
|
+
img.src = canvas.toDataURL("image/png");
|
|
583
|
+
img.style.width = "100%";
|
|
584
|
+
img.style.border = "1px solid #ccc";
|
|
585
|
+
img.style.imageRendering = "crisp-edges";
|
|
586
|
+
img.style.backgroundColor = "#fff";
|
|
587
|
+
|
|
588
|
+
col.appendChild(label);
|
|
589
|
+
col.appendChild(img);
|
|
590
|
+
return col;
|
|
591
|
+
}
|
|
592
|
+
|
|
593
|
+
if (hasPageA && !hasPageB) {
|
|
594
|
+
// Only in Document A
|
|
595
|
+
await renderPageToCanvas(pdfA, i, canvasA);
|
|
596
|
+
const colA = makePageCol(`${LABEL_A} (only)`, canvasA);
|
|
597
|
+
const colB = makeEmptyCol(`${LABEL_B} (no page ${i})`);
|
|
598
|
+
row.appendChild(colA);
|
|
599
|
+
row.appendChild(colB);
|
|
600
|
+
} else if (!hasPageA && hasPageB) {
|
|
601
|
+
// Only in Document B
|
|
602
|
+
await renderPageToCanvas(pdfB, i, canvasB);
|
|
603
|
+
const colA = makeEmptyCol(`${LABEL_A} (no page ${i})`);
|
|
604
|
+
const colB = makePageCol(`${LABEL_B} (only)`, canvasB);
|
|
605
|
+
row.appendChild(colA);
|
|
606
|
+
row.appendChild(colB);
|
|
607
|
+
}
|
|
608
|
+
|
|
609
|
+
resultsDiv.appendChild(title);
|
|
610
|
+
resultsDiv.appendChild(row);
|
|
611
|
+
continue;
|
|
612
|
+
}
|
|
480
613
|
|
|
614
|
+
// Both pages exist - do normal comparison
|
|
481
615
|
const { words: wordsA } = await renderPageToCanvas(pdfA, i, canvasA);
|
|
482
616
|
const { words: wordsB } = await renderPageToCanvas(pdfB, i, canvasB);
|
|
483
617
|
|
|
@@ -569,6 +703,24 @@ angular.module("pdfDiffApp", [])
|
|
|
569
703
|
return col;
|
|
570
704
|
}
|
|
571
705
|
|
|
706
|
+
function makeEmptyCol(labelText) {
|
|
707
|
+
const col = document.createElement("div");
|
|
708
|
+
const label = document.createElement("div");
|
|
709
|
+
label.innerHTML = `<b>${labelText}</b>`;
|
|
710
|
+
|
|
711
|
+
const placeholder = document.createElement("div");
|
|
712
|
+
placeholder.style.padding = "40px";
|
|
713
|
+
placeholder.style.textAlign = "center";
|
|
714
|
+
placeholder.style.backgroundColor = "#f5f5f5";
|
|
715
|
+
placeholder.style.border = "1px solid #ccc";
|
|
716
|
+
placeholder.style.color = "#999";
|
|
717
|
+
placeholder.innerText = "No corresponding page";
|
|
718
|
+
|
|
719
|
+
col.appendChild(label);
|
|
720
|
+
col.appendChild(placeholder);
|
|
721
|
+
return col;
|
|
722
|
+
}
|
|
723
|
+
|
|
572
724
|
resultsDiv.appendChild(title);
|
|
573
725
|
row.appendChild(makeCol(LABEL_A, overlayOnA));
|
|
574
726
|
row.appendChild(makeCol(LABEL_B, overlayOnB));
|
package/public/index.html
CHANGED
package/src/PDFDiffViewer.js
CHANGED
|
@@ -20,7 +20,7 @@ class PDFDiffViewer {
|
|
|
20
20
|
scale: options.scale || 3.0,
|
|
21
21
|
maxShift: options.maxShift || 3,
|
|
22
22
|
dilationRadius: options.dilationRadius || 0,
|
|
23
|
-
colorTolerance: options.colorTolerance ||
|
|
23
|
+
colorTolerance: options.colorTolerance || 200,
|
|
24
24
|
minHighlightArea: options.minHighlightArea || 60,
|
|
25
25
|
minWordSize: options.minWordSize || 8,
|
|
26
26
|
highlightAlpha: options.highlightAlpha || 0.32,
|
|
@@ -29,7 +29,10 @@ class PDFDiffViewer {
|
|
|
29
29
|
workerSrc: options.workerSrc || 'https://cdnjs.cloudflare.com/ajax/libs/pdf.js/3.11.174/pdf.worker.min.js',
|
|
30
30
|
showPageNumbers: options.showPageNumbers !== false,
|
|
31
31
|
cropRegions: options.cropRegions || [],
|
|
32
|
-
maskRegions: options.maskRegions || []
|
|
32
|
+
maskRegions: options.maskRegions || [],
|
|
33
|
+
smartAlignment: options.smartAlignment !== false, // Enable text-based alignment
|
|
34
|
+
alignmentTolerance: options.alignmentTolerance || 2, // Search +/- 2 pages for matches
|
|
35
|
+
similarityThreshold: options.similarityThreshold || 0.3 // Minimum similarity score (0-1)
|
|
33
36
|
};
|
|
34
37
|
|
|
35
38
|
// Check if PDF.js is loaded
|
|
@@ -57,39 +60,56 @@ class PDFDiffViewer {
|
|
|
57
60
|
const docA = await pdfjsLib.getDocument({ data: bufferA }).promise;
|
|
58
61
|
const docB = await pdfjsLib.getDocument({ data: bufferB }).promise;
|
|
59
62
|
|
|
60
|
-
if (docA.numPages !== docB.numPages) {
|
|
61
|
-
throw new Error(`Page count mismatch: ${docA.numPages} vs ${docB.numPages}`);
|
|
62
|
-
}
|
|
63
|
-
|
|
64
63
|
// Clear previous results
|
|
65
64
|
this.container.innerHTML = '';
|
|
66
65
|
|
|
67
66
|
let totalDiffPixels = 0;
|
|
68
67
|
const pageResults = [];
|
|
68
|
+
let pageMapping = [];
|
|
69
69
|
|
|
70
70
|
// Create summary element
|
|
71
71
|
const summaryDiv = document.createElement('div');
|
|
72
72
|
summaryDiv.className = 'pdf-diff-summary';
|
|
73
73
|
this.container.appendChild(summaryDiv);
|
|
74
74
|
|
|
75
|
-
//
|
|
76
|
-
|
|
77
|
-
|
|
75
|
+
// Use smart alignment if enabled and page counts differ
|
|
76
|
+
if (this.options.smartAlignment && docA.numPages !== docB.numPages) {
|
|
77
|
+
summaryDiv.innerHTML = '<p>Analyzing document structure for smart alignment...</p>';
|
|
78
|
+
pageMapping = await this._findPageMappings(docA, docB);
|
|
79
|
+
summaryDiv.innerHTML = `<h3>Smart Alignment Active: Comparing ${pageMapping.length} matched page(s)</h3>`;
|
|
80
|
+
} else if (docA.numPages !== docB.numPages) {
|
|
81
|
+
throw new Error(`Page count mismatch: ${docA.numPages} vs ${docB.numPages}. Enable 'smartAlignment' option to handle different page counts.`);
|
|
82
|
+
} else {
|
|
83
|
+
// Direct 1-to-1 mapping
|
|
84
|
+
for (let i = 1; i <= docA.numPages; i++) {
|
|
85
|
+
pageMapping.push({ pageA: i, pageB: i, similarity: 1.0 });
|
|
86
|
+
}
|
|
87
|
+
}
|
|
88
|
+
|
|
89
|
+
// Process each mapped page pair
|
|
90
|
+
for (const mapping of pageMapping) {
|
|
91
|
+
const pageResult = await this._comparePagePair(docA, docB, mapping.pageA, mapping.pageB);
|
|
92
|
+
pageResult.similarity = mapping.similarity;
|
|
78
93
|
pageResults.push(pageResult);
|
|
79
94
|
totalDiffPixels += pageResult.diffPixels;
|
|
80
95
|
|
|
81
|
-
this._renderPageComparison(pageResult,
|
|
96
|
+
this._renderPageComparison(pageResult, mapping.pageA, mapping.pageB);
|
|
82
97
|
}
|
|
83
98
|
|
|
84
99
|
this.results = {
|
|
85
|
-
totalPages:
|
|
100
|
+
totalPages: pageMapping.length,
|
|
86
101
|
totalDiffPixels,
|
|
87
|
-
pageResults
|
|
102
|
+
pageResults,
|
|
103
|
+
pageMapping
|
|
88
104
|
};
|
|
89
105
|
|
|
90
106
|
// Update summary
|
|
91
107
|
if (this.options.showPageNumbers) {
|
|
92
|
-
|
|
108
|
+
if (docA.numPages === docB.numPages) {
|
|
109
|
+
summaryDiv.innerHTML = `<h3>Comparison Results: ${docA.numPages} page(s)</h3>`;
|
|
110
|
+
} else {
|
|
111
|
+
summaryDiv.innerHTML += `<p>Doc A: ${docA.numPages} pages | Doc B: ${docB.numPages} pages</p>`;
|
|
112
|
+
}
|
|
93
113
|
}
|
|
94
114
|
|
|
95
115
|
return this.results;
|
|
@@ -134,13 +154,17 @@ class PDFDiffViewer {
|
|
|
134
154
|
}
|
|
135
155
|
|
|
136
156
|
async _comparePage(docA, docB, pageNum) {
|
|
157
|
+
return await this._comparePagePair(docA, docB, pageNum, pageNum);
|
|
158
|
+
}
|
|
159
|
+
|
|
160
|
+
async _comparePagePair(docA, docB, pageNumA, pageNumB) {
|
|
137
161
|
const canvasA = document.createElement('canvas');
|
|
138
162
|
const canvasB = document.createElement('canvas');
|
|
139
163
|
|
|
140
|
-
const { words: wordsA } = await this._renderPageToCanvas(docA,
|
|
141
|
-
const { words: wordsB } = await this._renderPageToCanvas(docB,
|
|
164
|
+
const { words: wordsA } = await this._renderPageToCanvas(docA, pageNumA, canvasA);
|
|
165
|
+
const { words: wordsB } = await this._renderPageToCanvas(docB, pageNumB, canvasB);
|
|
142
166
|
|
|
143
|
-
const pageCrop = this.options.cropRegions.find(r => r.page ===
|
|
167
|
+
const pageCrop = this.options.cropRegions.find(r => r.page === pageNumA);
|
|
144
168
|
const croppedWordsA = this._offsetWordBoxes(wordsA, pageCrop);
|
|
145
169
|
const croppedWordsB = this._offsetWordBoxes(wordsB, pageCrop);
|
|
146
170
|
|
|
@@ -173,7 +197,7 @@ class PDFDiffViewer {
|
|
|
173
197
|
const diffPixels = this._buildDiffImage(imgA, shiftedB, diffImage, this.options.colorTolerance);
|
|
174
198
|
|
|
175
199
|
// Apply masks
|
|
176
|
-
const pageMasks = this.options.maskRegions.filter(r => r.page ===
|
|
200
|
+
const pageMasks = this.options.maskRegions.filter(r => r.page === pageNumA);
|
|
177
201
|
this._applyMasks(diffImage, pageMasks);
|
|
178
202
|
|
|
179
203
|
// Dilate diff mask
|
|
@@ -194,21 +218,33 @@ class PDFDiffViewer {
|
|
|
194
218
|
const overlayOnB = this._overlayDiff(paddedB, highlightCanvasB);
|
|
195
219
|
|
|
196
220
|
return {
|
|
197
|
-
|
|
221
|
+
pageNumA,
|
|
222
|
+
pageNumB,
|
|
198
223
|
diffPixels,
|
|
199
224
|
overlayA: overlayOnA.toDataURL('image/png'),
|
|
200
225
|
overlayB: overlayOnB.toDataURL('image/png'),
|
|
201
226
|
alignment: { dx: best.dx, dy: best.dy }
|
|
202
227
|
};
|
|
203
228
|
}
|
|
229
|
+
alignment: { dx: best.dx, dy: best.dy }
|
|
230
|
+
};
|
|
231
|
+
}
|
|
204
232
|
|
|
205
|
-
_renderPageComparison(pageResult,
|
|
233
|
+
_renderPageComparison(pageResult, pageNumA, pageNumB = null) {
|
|
206
234
|
const pageDiv = document.createElement('div');
|
|
207
235
|
pageDiv.className = 'pdf-diff-page';
|
|
208
236
|
|
|
209
237
|
if (this.options.showPageNumbers) {
|
|
210
238
|
const title = document.createElement('h4');
|
|
211
|
-
|
|
239
|
+
if (pageNumB !== null && pageNumA !== pageNumB) {
|
|
240
|
+
title.innerText = `Page ${pageNumA} ↔ Page ${pageNumB}`;
|
|
241
|
+
if (pageResult.similarity !== undefined) {
|
|
242
|
+
const simPercent = (pageResult.similarity * 100).toFixed(1);
|
|
243
|
+
title.innerText += ` (${simPercent}% content match)`;
|
|
244
|
+
}
|
|
245
|
+
} else {
|
|
246
|
+
title.innerText = `Page ${pageNumA}`;
|
|
247
|
+
}
|
|
212
248
|
title.style.marginTop = '20px';
|
|
213
249
|
pageDiv.appendChild(title);
|
|
214
250
|
}
|
|
@@ -627,6 +663,130 @@ class PDFDiffViewer {
|
|
|
627
663
|
|
|
628
664
|
return overlay;
|
|
629
665
|
}
|
|
666
|
+
|
|
667
|
+
// ===== SMART ALIGNMENT METHODS =====
|
|
668
|
+
|
|
669
|
+
/**
|
|
670
|
+
* Find optimal page mappings between two PDFs based on text content similarity
|
|
671
|
+
*/
|
|
672
|
+
async _findPageMappings(docA, docB) {
|
|
673
|
+
const mappings = [];
|
|
674
|
+
const usedPagesB = new Set();
|
|
675
|
+
const tolerance = this.options.alignmentTolerance;
|
|
676
|
+
|
|
677
|
+
// Extract text from all pages of both documents
|
|
678
|
+
const textsA = await this._extractAllPageTexts(docA);
|
|
679
|
+
const textsB = await this._extractAllPageTexts(docB);
|
|
680
|
+
|
|
681
|
+
// For each page in document A, find best matching page in document B
|
|
682
|
+
for (let pageA = 1; pageA <= docA.numPages; pageA++) {
|
|
683
|
+
const textA = textsA[pageA - 1];
|
|
684
|
+
|
|
685
|
+
let bestMatch = null;
|
|
686
|
+
let bestSimilarity = 0;
|
|
687
|
+
|
|
688
|
+
// Search within tolerance range
|
|
689
|
+
const startPage = Math.max(1, pageA - tolerance);
|
|
690
|
+
const endPage = Math.min(docB.numPages, pageA + tolerance);
|
|
691
|
+
|
|
692
|
+
for (let pageB = startPage; pageB <= endPage; pageB++) {
|
|
693
|
+
if (usedPagesB.has(pageB)) continue;
|
|
694
|
+
|
|
695
|
+
const textB = textsB[pageB - 1];
|
|
696
|
+
const similarity = this._calculateTextSimilarity(textA, textB);
|
|
697
|
+
|
|
698
|
+
if (similarity > bestSimilarity && similarity >= this.options.similarityThreshold) {
|
|
699
|
+
bestSimilarity = similarity;
|
|
700
|
+
bestMatch = pageB;
|
|
701
|
+
}
|
|
702
|
+
}
|
|
703
|
+
|
|
704
|
+
// If no good match found, try 1:1 mapping if that page exists and isn't used
|
|
705
|
+
if (!bestMatch && pageA <= docB.numPages && !usedPagesB.has(pageA)) {
|
|
706
|
+
bestMatch = pageA;
|
|
707
|
+
bestSimilarity = this._calculateTextSimilarity(textA, textsB[pageA - 1]);
|
|
708
|
+
}
|
|
709
|
+
|
|
710
|
+
if (bestMatch) {
|
|
711
|
+
usedPagesB.add(bestMatch);
|
|
712
|
+
mappings.push({
|
|
713
|
+
pageA,
|
|
714
|
+
pageB: bestMatch,
|
|
715
|
+
similarity: bestSimilarity
|
|
716
|
+
});
|
|
717
|
+
}
|
|
718
|
+
}
|
|
719
|
+
|
|
720
|
+
return mappings;
|
|
721
|
+
}
|
|
722
|
+
|
|
723
|
+
/**
|
|
724
|
+
* Extract text from all pages of a PDF document
|
|
725
|
+
*/
|
|
726
|
+
async _extractAllPageTexts(doc) {
|
|
727
|
+
const texts = [];
|
|
728
|
+
for (let i = 1; i <= doc.numPages; i++) {
|
|
729
|
+
const page = await doc.getPage(i);
|
|
730
|
+
const textContent = await page.getTextContent();
|
|
731
|
+
const pageText = textContent.items
|
|
732
|
+
.map(item => item.str || '')
|
|
733
|
+
.join(' ')
|
|
734
|
+
.toLowerCase()
|
|
735
|
+
.replace(/\s+/g, ' ')
|
|
736
|
+
.trim();
|
|
737
|
+
texts.push(pageText);
|
|
738
|
+
}
|
|
739
|
+
return texts;
|
|
740
|
+
}
|
|
741
|
+
|
|
742
|
+
/**
|
|
743
|
+
* Calculate text similarity as a weighted blend of the Jaccard coefficient over word sets (70%) and the text-length ratio (30%)
|
|
744
|
+
* Returns a value between 0 (no similarity) and 1 (identical)
|
|
745
|
+
*/
|
|
746
|
+
_calculateTextSimilarity(text1, text2) {
|
|
747
|
+
if (!text1 && !text2) return 1.0;
|
|
748
|
+
if (!text1 || !text2) return 0.0;
|
|
749
|
+
|
|
750
|
+
// Tokenize into words
|
|
751
|
+
const words1 = this._tokenize(text1);
|
|
752
|
+
const words2 = this._tokenize(text2);
|
|
753
|
+
|
|
754
|
+
// Create sets of words
|
|
755
|
+
const set1 = new Set(words1);
|
|
756
|
+
const set2 = new Set(words2);
|
|
757
|
+
|
|
758
|
+
// Calculate Jaccard similarity: |intersection| / |union|
|
|
759
|
+
const intersection = new Set([...set1].filter(x => set2.has(x)));
|
|
760
|
+
const union = new Set([...set1, ...set2]);
|
|
761
|
+
|
|
762
|
+
if (union.size === 0) return 0.0;
|
|
763
|
+
|
|
764
|
+
const jaccardSimilarity = intersection.size / union.size;
|
|
765
|
+
|
|
766
|
+
// Also consider length ratio for better accuracy
|
|
767
|
+
const lengthRatio = Math.min(text1.length, text2.length) / Math.max(text1.length, text2.length);
|
|
768
|
+
|
|
769
|
+
// Weighted combination
|
|
770
|
+
return jaccardSimilarity * 0.7 + lengthRatio * 0.3;
|
|
771
|
+
}
|
|
772
|
+
|
|
773
|
+
/**
|
|
774
|
+
* Tokenize text into words, removing common stopwords
|
|
775
|
+
*/
|
|
776
|
+
_tokenize(text) {
|
|
777
|
+
const stopwords = new Set([
|
|
778
|
+
'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
|
|
779
|
+
'of', 'with', 'by', 'from', 'as', 'is', 'was', 'are', 'were', 'be',
|
|
780
|
+
'been', 'being', 'have', 'has', 'had', 'do', 'does', 'did', 'will',
|
|
781
|
+
'would', 'should', 'could', 'may', 'might', 'can', 'shall'
|
|
782
|
+
]);
|
|
783
|
+
|
|
784
|
+
return text
|
|
785
|
+
.toLowerCase()
|
|
786
|
+
.replace(/[^\w\s]/g, ' ') // Remove punctuation
|
|
787
|
+
.split(/\s+/)
|
|
788
|
+
.filter(word => word.length > 2 && !stopwords.has(word));
|
|
789
|
+
}
|
|
630
790
|
}
|
|
631
791
|
|
|
632
792
|
// Export for different module systems
|