pdf-diff-viewer 1.0.0 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -1,5 +1,28 @@
1
1
  # Changelog
2
2
 
3
+ ## [1.1.0] - 2026-01-20
4
+
5
+ ### Added
6
+ - **Smart Alignment**: Text-based page matching for handling content reflow
7
+ - Support for comparing PDFs with different page counts
8
+ - Jaccard similarity algorithm for intelligent page matching
9
+ - `smartAlignment` option (default: true)
10
+ - `alignmentTolerance` option to control search range
11
+ - `similarityThreshold` option for minimum match quality
12
+ - Page mapping information in comparison results
13
+ - Similarity scores displayed in UI for matched pages
14
+
15
+ ### Changed
16
+ - `compare()` method now handles different page counts gracefully
17
+ - Page comparison results now include `pageNumA` and `pageNumB` fields
18
+ - Enhanced error messages for page count mismatches
19
+
20
+ ### Features
21
+ - Automatically finds best-matching pages based on text content
22
+ - Handles scenarios where text additions push content to new pages
23
+ - Displays page mappings (e.g., "Page 1 ↔ Page 2") in UI
24
+ - Shows content similarity percentage for cross-page comparisons
25
+
3
26
  ## [1.0.0] - 2026-01-19
4
27
 
5
28
  ### Added
package/README.md CHANGED
@@ -142,6 +142,9 @@ new PDFDiffViewer(container, options)
142
142
  - `showPageNumbers` (boolean) - Show page numbers, default: true
143
143
  - `cropRegions` (Array) - Regions to crop: `[{ page: 1, x, y, width, height }]`
144
144
  - `maskRegions` (Array) - Regions to mask/ignore: `[{ page: 1, x, y, width, height }]`
145
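+ - **`smartAlignment` (boolean)** - Enable text-based page alignment for content reflow (used only when the two PDFs have different page counts; if disabled, differing page counts throw an error), default: true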
146
+ - **`alignmentTolerance` (number)** - Search range for matching pages (+/- pages), default: 2
147
+ - **`similarityThreshold` (number)** - Minimum text similarity (0-1) for page matching, default: 0.3
145
148
 
146
149
  ### Methods
147
150
 
@@ -157,6 +160,7 @@ Compare two PDFs and render results.
157
160
  - `totalPages` - Number of pages compared
158
161
  - `totalDiffPixels` - Total different pixels across all pages
159
162
  - `pageResults` - Array of per-page results
163
+ - `pageMapping` - Array of `{ pageA, pageB, similarity }` page pairings (a direct 1-to-1 mapping when both PDFs have the same page count)
160
164
 
161
165
  #### `getResults()`
162
166
 
@@ -244,6 +248,29 @@ const viewer = new PDFDiffViewer('#container', {
244
248
  });
245
249
  ```
246
250
 
251
+ ### With Smart Alignment (Content Reflow Handling)
252
+
253
+ ```javascript
254
+ // Handles cases where content shifts across pages
255
+ // (e.g., adding text pushes content to next page)
256
+ const viewer = new PDFDiffViewer('#container', {
257
+ smartAlignment: true, // Enable intelligent page matching
258
+ alignmentTolerance: 2, // Search +/- 2 pages for matches
259
+ similarityThreshold: 0.3 // Require 30% content similarity
260
+ });
261
+
262
+ const results = await viewer.compare(pdfA, pdfB);
263
+ console.log('Page mappings:', results.pageMapping);
264
+ // Output: [{ pageA: 1, pageB: 1, similarity: 0.95 },
265
+ // { pageA: 2, pageB: 3, similarity: 0.87 }, ...]
266
+ ```
267
+
268
+ **How it works:**
269
+ - Extracts text from all pages in both documents
270
+ - Uses Jaccard word-set similarity, blended with a text-length ratio, to find best-matching pages
271
+ - Handles different page counts gracefully
272
+ - Shows similarity scores in the UI
273
+
247
274
  ### With Mask Regions (Ignore Dynamic Content)
248
275
 
249
276
  ```javascript
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "pdf-diff-viewer",
3
- "version": "1.0.0",
3
+ "version": "1.1.0",
4
4
  "description": "Browser-based PDF comparison tool with visual diff highlighting. Zero system dependencies, pure JavaScript, client-side processing.",
5
5
  "main": "src/PDFDiffViewer.js",
6
6
  "type": "module",
package/public/app.js CHANGED
@@ -431,7 +431,64 @@ angular.module("pdfDiffApp", [])
431
431
  return overlay;
432
432
  }
433
433
 
434
// PDF documents loaded by the most recent comparison, retained so that
// cleanup() can destroy them before the next run.
let currentPdfA = null;
let currentPdfB = null;

/**
 * Release what the previous comparison left behind: blank and zero-size the
 * three working canvases, empty the results container, and destroy the
 * retained PDF documents.
 *
 * Elements that are not in the DOM are skipped, so this can be called before
 * any comparison has run.
 */
function cleanup() {
  for (const canvasId of ["canvasA", "canvasB", "canvasDiff"]) {
    const canvas = document.getElementById(canvasId);
    if (!canvas) continue;

    // getContext can return null (e.g. the canvas is bound to another
    // context type); the resize below still applies in that case.
    const ctx = canvas.getContext("2d");
    if (ctx) {
      ctx.clearRect(0, 0, canvas.width, canvas.height);
    }
    // Shrink to 0x0 so the pixel buffer does not have to be kept around.
    canvas.width = 0;
    canvas.height = 0;
  }

  const resultsDiv = document.getElementById("results");
  if (resultsDiv) {
    resultsDiv.innerHTML = "";
  }

  // destroy() may hand back a promise; observe it so a failure is reported
  // instead of becoming an unhandled rejection.
  const releasePdf = (pdf) => {
    if (!pdf) return;
    Promise.resolve(pdf.destroy()).catch((err) => {
      console.warn("Failed to destroy PDF document:", err);
    });
  };
  releasePdf(currentPdfA);
  currentPdfA = null;
  releasePdf(currentPdfB);
  currentPdfB = null;

  // Garbage-collection hint; a no-op unless the environment exposes window.gc.
  if (window.gc) {
    window.gc();
  }
}
487
+
434
488
  $scope.compare = async function () {
489
+ // Clear previous results and free memory
490
+ cleanup();
491
+
435
492
  const LABEL_A = "Document A";
436
493
  const LABEL_B = "Document B";
437
494
  const fileA = document.getElementById("fileA").files[0];
@@ -457,12 +514,17 @@ angular.module("pdfDiffApp", [])
457
514
  const arrayBufferA = await fileA.arrayBuffer();
458
515
  const arrayBufferB = await fileB.arrayBuffer();
459
516
 
460
- const pdfA = await pdfjsLib.getDocument({ data: arrayBufferA }).promise;
461
- const pdfB = await pdfjsLib.getDocument({ data: arrayBufferB }).promise;
517
+ currentPdfA = await pdfjsLib.getDocument({ data: arrayBufferA }).promise;
518
+ currentPdfB = await pdfjsLib.getDocument({ data: arrayBufferB }).promise;
519
+ const pdfA = currentPdfA;
520
+ const pdfB = currentPdfB;
462
521
 
522
+ const maxPages = Math.max(pdfA.numPages, pdfB.numPages);
523
+
463
524
  if (pdfA.numPages !== pdfB.numPages) {
464
- alert(`Page mismatch: ${pdfA.numPages} vs ${pdfB.numPages}`);
465
- return;
525
+ const message = `Page count differs: ${LABEL_A} has ${pdfA.numPages} page(s), ${LABEL_B} has ${pdfB.numPages} page(s). Showing all ${maxPages} page(s).`;
526
+ console.warn(message);
527
+ // alert(message);
466
528
  }
467
529
 
468
530
  const canvasA = document.getElementById("canvasA");
@@ -470,14 +532,86 @@ angular.module("pdfDiffApp", [])
470
532
  const canvasDiff = document.getElementById("canvasDiff");
471
533
 
472
534
  const resultsDiv = document.getElementById("results");
473
- resultsDiv.innerHTML = "";
474
- // resultsDiv.innerHTML = "<h3>Diff on A vs Diff on B</h3>";
535
+ // resultsDiv already cleared in cleanup()
475
536
 
476
537
  // const zip = new JSZip();
477
538
  let totalDiffPixels = 0;
478
539
 
479
- for (let i = 1; i <= pdfA.numPages; i++) {
540
+ for (let i = 1; i <= maxPages; i++) {
541
+
542
+ const hasPageA = i <= pdfA.numPages;
543
+ const hasPageB = i <= pdfB.numPages;
544
+
545
+ // Handle pages that only exist in one document
546
+ if (!hasPageA || !hasPageB) {
547
+ const title = document.createElement("h4");
548
+ title.innerText = `Page ${i}`;
549
+
550
+ const row = document.createElement("div");
551
+ row.style.display = "grid";
552
+ row.style.gridTemplateColumns = "1fr 1fr";
553
+ row.style.gap = "15px";
554
+ row.style.marginBottom = "25px";
555
+ row.style.borderTop = "2px solid #ddd";
556
+ row.style.paddingTop = "15px";
557
+
558
/**
 * Build the column shown for a document that has no page at this position.
 *
 * @param {string} labelText - Caption rendered in bold above the placeholder;
 *   inserted as plain text, never parsed as markup.
 * @returns {HTMLDivElement} Column holding the caption and a grey
 *   "No corresponding page" box.
 */
function makeEmptyCol(labelText) {
  const col = document.createElement("div");

  // Assemble <div><b>text</b></div> from nodes rather than an HTML string so
  // characters such as "<" or "&" in the caption cannot inject markup.
  const label = document.createElement("div");
  const bold = document.createElement("b");
  bold.textContent = labelText;
  label.appendChild(bold);

  const placeholder = document.createElement("div");
  Object.assign(placeholder.style, {
    padding: "40px",
    textAlign: "center",
    backgroundColor: "#f5f5f5",
    border: "1px solid #ccc",
    color: "#999",
  });
  placeholder.innerText = "No corresponding page";

  col.appendChild(label);
  col.appendChild(placeholder);
  return col;
}
575
+
576
/**
 * Build a column that shows a rendered page as a PNG image under a bold caption.
 *
 * @param {string} labelText - Caption rendered in bold above the image;
 *   inserted as plain text, never parsed as markup.
 * @param {HTMLCanvasElement} canvas - Canvas holding the rendered page; it is
 *   snapshotted with toDataURL("image/png"), so later changes to the canvas
 *   do not affect the returned column.
 * @returns {HTMLDivElement} Column holding the caption and the image.
 */
function makePageCol(labelText, canvas) {
  const col = document.createElement("div");

  // Assemble <div><b>text</b></div> from nodes rather than an HTML string so
  // characters such as "<" or "&" in the caption cannot inject markup.
  const label = document.createElement("div");
  const bold = document.createElement("b");
  bold.textContent = labelText;
  label.appendChild(bold);

  const img = document.createElement("img");
  img.src = canvas.toDataURL("image/png");
  Object.assign(img.style, {
    width: "100%",
    border: "1px solid #ccc",
    imageRendering: "crisp-edges",
    backgroundColor: "#fff",
  });

  col.appendChild(label);
  col.appendChild(img);
  return col;
}
592
+
593
+ if (hasPageA && !hasPageB) {
594
+ // Only in Document A
595
+ await renderPageToCanvas(pdfA, i, canvasA);
596
+ const colA = makePageCol(`${LABEL_A} (only)`, canvasA);
597
+ const colB = makeEmptyCol(`${LABEL_B} (no page ${i})`);
598
+ row.appendChild(colA);
599
+ row.appendChild(colB);
600
+ } else if (!hasPageA && hasPageB) {
601
+ // Only in Document B
602
+ await renderPageToCanvas(pdfB, i, canvasB);
603
+ const colA = makeEmptyCol(`${LABEL_A} (no page ${i})`);
604
+ const colB = makePageCol(`${LABEL_B} (only)`, canvasB);
605
+ row.appendChild(colA);
606
+ row.appendChild(colB);
607
+ }
608
+
609
+ resultsDiv.appendChild(title);
610
+ resultsDiv.appendChild(row);
611
+ continue;
612
+ }
480
613
 
614
+ // Both pages exist - do normal comparison
481
615
  const { words: wordsA } = await renderPageToCanvas(pdfA, i, canvasA);
482
616
  const { words: wordsB } = await renderPageToCanvas(pdfB, i, canvasB);
483
617
 
@@ -569,6 +703,24 @@ angular.module("pdfDiffApp", [])
569
703
  return col;
570
704
  }
571
705
 
706
/**
 * Build the column shown for a document that has no page at this position.
 *
 * @param {string} labelText - Caption rendered in bold above the placeholder;
 *   inserted as plain text, never parsed as markup.
 * @returns {HTMLDivElement} Column holding the caption and a grey
 *   "No corresponding page" box.
 */
function makeEmptyCol(labelText) {
  const col = document.createElement("div");

  // Assemble <div><b>text</b></div> from nodes rather than an HTML string so
  // characters such as "<" or "&" in the caption cannot inject markup.
  const label = document.createElement("div");
  const bold = document.createElement("b");
  bold.textContent = labelText;
  label.appendChild(bold);

  const placeholder = document.createElement("div");
  Object.assign(placeholder.style, {
    padding: "40px",
    textAlign: "center",
    backgroundColor: "#f5f5f5",
    border: "1px solid #ccc",
    color: "#999",
  });
  placeholder.innerText = "No corresponding page";

  col.appendChild(label);
  col.appendChild(placeholder);
  return col;
}
723
+
572
724
  resultsDiv.appendChild(title);
573
725
  row.appendChild(makeCol(LABEL_A, overlayOnA));
574
726
  row.appendChild(makeCol(LABEL_B, overlayOnB));
package/public/index.html CHANGED
@@ -29,7 +29,7 @@
29
29
 
30
30
  <canvas id="canvasA" style="display:none"></canvas>
31
31
  <canvas id="canvasB" style="display:none"></canvas>
32
- <canvas id="canvasDiff"></canvas>
32
+ <canvas id="canvasDiff" style="display:none"></canvas>
33
33
 
34
34
  </body>
35
35
  </html>
@@ -20,7 +20,7 @@ class PDFDiffViewer {
20
20
  scale: options.scale || 3.0,
21
21
  maxShift: options.maxShift || 3,
22
22
  dilationRadius: options.dilationRadius || 0,
23
- colorTolerance: options.colorTolerance || 120,
23
+ colorTolerance: options.colorTolerance || 200,
24
24
  minHighlightArea: options.minHighlightArea || 60,
25
25
  minWordSize: options.minWordSize || 8,
26
26
  highlightAlpha: options.highlightAlpha || 0.32,
@@ -29,7 +29,10 @@ class PDFDiffViewer {
29
29
  workerSrc: options.workerSrc || 'https://cdnjs.cloudflare.com/ajax/libs/pdf.js/3.11.174/pdf.worker.min.js',
30
30
  showPageNumbers: options.showPageNumbers !== false,
31
31
  cropRegions: options.cropRegions || [],
32
- maskRegions: options.maskRegions || []
32
+ maskRegions: options.maskRegions || [],
33
+ smartAlignment: options.smartAlignment !== false, // Enable text-based alignment
34
+ alignmentTolerance: options.alignmentTolerance || 2, // Search +/- 2 pages for matches
35
+ similarityThreshold: options.similarityThreshold || 0.3 // Minimum similarity score (0-1)
33
36
  };
34
37
 
35
38
  // Check if PDF.js is loaded
@@ -57,39 +60,56 @@ class PDFDiffViewer {
57
60
  const docA = await pdfjsLib.getDocument({ data: bufferA }).promise;
58
61
  const docB = await pdfjsLib.getDocument({ data: bufferB }).promise;
59
62
 
60
- if (docA.numPages !== docB.numPages) {
61
- throw new Error(`Page count mismatch: ${docA.numPages} vs ${docB.numPages}`);
62
- }
63
-
64
63
  // Clear previous results
65
64
  this.container.innerHTML = '';
66
65
 
67
66
  let totalDiffPixels = 0;
68
67
  const pageResults = [];
68
+ let pageMapping = [];
69
69
 
70
70
  // Create summary element
71
71
  const summaryDiv = document.createElement('div');
72
72
  summaryDiv.className = 'pdf-diff-summary';
73
73
  this.container.appendChild(summaryDiv);
74
74
 
75
- // Process each page
76
- for (let i = 1; i <= docA.numPages; i++) {
77
- const pageResult = await this._comparePage(docA, docB, i);
75
+ // Use smart alignment if enabled and page counts differ
76
+ if (this.options.smartAlignment && docA.numPages !== docB.numPages) {
77
+ summaryDiv.innerHTML = '<p>Analyzing document structure for smart alignment...</p>';
78
+ pageMapping = await this._findPageMappings(docA, docB);
79
+ summaryDiv.innerHTML = `<h3>Smart Alignment Active: Comparing ${pageMapping.length} matched page(s)</h3>`;
80
+ } else if (docA.numPages !== docB.numPages) {
81
+ throw new Error(`Page count mismatch: ${docA.numPages} vs ${docB.numPages}. Enable 'smartAlignment' option to handle different page counts.`);
82
+ } else {
83
+ // Direct 1-to-1 mapping
84
+ for (let i = 1; i <= docA.numPages; i++) {
85
+ pageMapping.push({ pageA: i, pageB: i, similarity: 1.0 });
86
+ }
87
+ }
88
+
89
+ // Process each mapped page pair
90
+ for (const mapping of pageMapping) {
91
+ const pageResult = await this._comparePagePair(docA, docB, mapping.pageA, mapping.pageB);
92
+ pageResult.similarity = mapping.similarity;
78
93
  pageResults.push(pageResult);
79
94
  totalDiffPixels += pageResult.diffPixels;
80
95
 
81
- this._renderPageComparison(pageResult, i);
96
+ this._renderPageComparison(pageResult, mapping.pageA, mapping.pageB);
82
97
  }
83
98
 
84
99
  this.results = {
85
- totalPages: docA.numPages,
100
+ totalPages: pageMapping.length,
86
101
  totalDiffPixels,
87
- pageResults
102
+ pageResults,
103
+ pageMapping
88
104
  };
89
105
 
90
106
  // Update summary
91
107
  if (this.options.showPageNumbers) {
92
- summaryDiv.innerHTML = `<h3>Comparison Results: ${docA.numPages} page(s)</h3>`;
108
+ if (docA.numPages === docB.numPages) {
109
+ summaryDiv.innerHTML = `<h3>Comparison Results: ${docA.numPages} page(s)</h3>`;
110
+ } else {
111
+ summaryDiv.innerHTML += `<p>Doc A: ${docA.numPages} pages | Doc B: ${docB.numPages} pages</p>`;
112
+ }
93
113
  }
94
114
 
95
115
  return this.results;
@@ -134,13 +154,17 @@ class PDFDiffViewer {
134
154
  }
135
155
 
136
156
  async _comparePage(docA, docB, pageNum) {
157
+ return await this._comparePagePair(docA, docB, pageNum, pageNum);
158
+ }
159
+
160
+ async _comparePagePair(docA, docB, pageNumA, pageNumB) {
137
161
  const canvasA = document.createElement('canvas');
138
162
  const canvasB = document.createElement('canvas');
139
163
 
140
- const { words: wordsA } = await this._renderPageToCanvas(docA, pageNum, canvasA);
141
- const { words: wordsB } = await this._renderPageToCanvas(docB, pageNum, canvasB);
164
+ const { words: wordsA } = await this._renderPageToCanvas(docA, pageNumA, canvasA);
165
+ const { words: wordsB } = await this._renderPageToCanvas(docB, pageNumB, canvasB);
142
166
 
143
- const pageCrop = this.options.cropRegions.find(r => r.page === pageNum);
167
+ const pageCrop = this.options.cropRegions.find(r => r.page === pageNumA);
144
168
  const croppedWordsA = this._offsetWordBoxes(wordsA, pageCrop);
145
169
  const croppedWordsB = this._offsetWordBoxes(wordsB, pageCrop);
146
170
 
@@ -173,7 +197,7 @@ class PDFDiffViewer {
173
197
  const diffPixels = this._buildDiffImage(imgA, shiftedB, diffImage, this.options.colorTolerance);
174
198
 
175
199
  // Apply masks
176
- const pageMasks = this.options.maskRegions.filter(r => r.page === pageNum);
200
+ const pageMasks = this.options.maskRegions.filter(r => r.page === pageNumA);
177
201
  this._applyMasks(diffImage, pageMasks);
178
202
 
179
203
  // Dilate diff mask
@@ -194,21 +218,33 @@ class PDFDiffViewer {
194
218
  const overlayOnB = this._overlayDiff(paddedB, highlightCanvasB);
195
219
 
196
220
  return {
197
- pageNum,
221
+ pageNumA,
222
+ pageNumB,
198
223
  diffPixels,
199
224
  overlayA: overlayOnA.toDataURL('image/png'),
200
225
  overlayB: overlayOnB.toDataURL('image/png'),
201
226
  alignment: { dx: best.dx, dy: best.dy }
202
227
  };
203
228
  }
229
+ alignment: { dx: best.dx, dy: best.dy }
230
+ };
231
+ }
204
232
 
205
- _renderPageComparison(pageResult, pageNum) {
233
+ _renderPageComparison(pageResult, pageNumA, pageNumB = null) {
206
234
  const pageDiv = document.createElement('div');
207
235
  pageDiv.className = 'pdf-diff-page';
208
236
 
209
237
  if (this.options.showPageNumbers) {
210
238
  const title = document.createElement('h4');
211
- title.innerText = `Page ${pageNum}`;
239
+ if (pageNumB !== null && pageNumA !== pageNumB) {
240
+ title.innerText = `Page ${pageNumA} ↔ Page ${pageNumB}`;
241
+ if (pageResult.similarity !== undefined) {
242
+ const simPercent = (pageResult.similarity * 100).toFixed(1);
243
+ title.innerText += ` (${simPercent}% content match)`;
244
+ }
245
+ } else {
246
+ title.innerText = `Page ${pageNumA}`;
247
+ }
212
248
  title.style.marginTop = '20px';
213
249
  pageDiv.appendChild(title);
214
250
  }
@@ -627,6 +663,130 @@ class PDFDiffViewer {
627
663
 
628
664
  return overlay;
629
665
  }
666
+
667
+ // ===== SMART ALIGNMENT METHODS =====
668
+
669
+ /**
670
+ * Find optimal page mappings between two PDFs based on text content similarity
671
+ */
672
+ async _findPageMappings(docA, docB) {
673
+ const mappings = [];
674
+ const usedPagesB = new Set();
675
+ const tolerance = this.options.alignmentTolerance;
676
+
677
+ // Extract text from all pages of both documents
678
+ const textsA = await this._extractAllPageTexts(docA);
679
+ const textsB = await this._extractAllPageTexts(docB);
680
+
681
+ // For each page in document A, find best matching page in document B
682
+ for (let pageA = 1; pageA <= docA.numPages; pageA++) {
683
+ const textA = textsA[pageA - 1];
684
+
685
+ let bestMatch = null;
686
+ let bestSimilarity = 0;
687
+
688
+ // Search within tolerance range
689
+ const startPage = Math.max(1, pageA - tolerance);
690
+ const endPage = Math.min(docB.numPages, pageA + tolerance);
691
+
692
+ for (let pageB = startPage; pageB <= endPage; pageB++) {
693
+ if (usedPagesB.has(pageB)) continue;
694
+
695
+ const textB = textsB[pageB - 1];
696
+ const similarity = this._calculateTextSimilarity(textA, textB);
697
+
698
+ if (similarity > bestSimilarity && similarity >= this.options.similarityThreshold) {
699
+ bestSimilarity = similarity;
700
+ bestMatch = pageB;
701
+ }
702
+ }
703
+
704
+ // If no good match found, try 1:1 mapping if that page exists and isn't used
705
+ if (!bestMatch && pageA <= docB.numPages && !usedPagesB.has(pageA)) {
706
+ bestMatch = pageA;
707
+ bestSimilarity = this._calculateTextSimilarity(textA, textsB[pageA - 1]);
708
+ }
709
+
710
+ if (bestMatch) {
711
+ usedPagesB.add(bestMatch);
712
+ mappings.push({
713
+ pageA,
714
+ pageB: bestMatch,
715
+ similarity: bestSimilarity
716
+ });
717
+ }
718
+ }
719
+
720
+ return mappings;
721
+ }
722
+
723
+ /**
724
+ * Extract text from all pages of a PDF document
725
+ */
726
+ async _extractAllPageTexts(doc) {
727
+ const texts = [];
728
+ for (let i = 1; i <= doc.numPages; i++) {
729
+ const page = await doc.getPage(i);
730
+ const textContent = await page.getTextContent();
731
+ const pageText = textContent.items
732
+ .map(item => item.str || '')
733
+ .join(' ')
734
+ .toLowerCase()
735
+ .replace(/\s+/g, ' ')
736
+ .trim();
737
+ texts.push(pageText);
738
+ }
739
+ return texts;
740
+ }
741
+
742
+ /**
743
+ * Calculate text similarity using Jaccard similarity coefficient
744
+ * Returns a value between 0 (no similarity) and 1 (identical)
745
+ */
746
+ _calculateTextSimilarity(text1, text2) {
747
+ if (!text1 && !text2) return 1.0;
748
+ if (!text1 || !text2) return 0.0;
749
+
750
+ // Tokenize into words
751
+ const words1 = this._tokenize(text1);
752
+ const words2 = this._tokenize(text2);
753
+
754
+ // Create sets of words
755
+ const set1 = new Set(words1);
756
+ const set2 = new Set(words2);
757
+
758
+ // Calculate Jaccard similarity: |intersection| / |union|
759
+ const intersection = new Set([...set1].filter(x => set2.has(x)));
760
+ const union = new Set([...set1, ...set2]);
761
+
762
+ if (union.size === 0) return 0.0;
763
+
764
+ const jaccardSimilarity = intersection.size / union.size;
765
+
766
+ // Also consider length ratio for better accuracy
767
+ const lengthRatio = Math.min(text1.length, text2.length) / Math.max(text1.length, text2.length);
768
+
769
+ // Weighted combination
770
+ return jaccardSimilarity * 0.7 + lengthRatio * 0.3;
771
+ }
772
+
773
+ /**
774
+ * Tokenize text into words, removing common stopwords
775
+ */
776
+ _tokenize(text) {
777
+ const stopwords = new Set([
778
+ 'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
779
+ 'of', 'with', 'by', 'from', 'as', 'is', 'was', 'are', 'were', 'be',
780
+ 'been', 'being', 'have', 'has', 'had', 'do', 'does', 'did', 'will',
781
+ 'would', 'should', 'could', 'may', 'might', 'can', 'shall'
782
+ ]);
783
+
784
+ return text
785
+ .toLowerCase()
786
+ .replace(/[^\w\s]/g, ' ') // Remove punctuation
787
+ .split(/\s+/)
788
+ .filter(word => word.length > 2 && !stopwords.has(word));
789
+ }
630
790
  }
631
791
 
632
792
  // Export for different module systems