philoch_bib_sdk-0.3.9-cp313-cp313-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. philoch_bib_sdk/__init__.py +0 -0
  2. philoch_bib_sdk/_rust.cp313-win_amd64.pyd +0 -0
  3. philoch_bib_sdk/adapters/io/__init__.py +115 -0
  4. philoch_bib_sdk/adapters/io/csv/__init__.py +308 -0
  5. philoch_bib_sdk/adapters/io/ods/__init__.py +145 -0
  6. philoch_bib_sdk/adapters/plaintext/bibitem_reader.py +0 -0
  7. philoch_bib_sdk/adapters/tabular_data/read_journal_volume_number_index.py +58 -0
  8. philoch_bib_sdk/converters/latex.py +6 -0
  9. philoch_bib_sdk/converters/plaintext/author/formatter.py +34 -0
  10. philoch_bib_sdk/converters/plaintext/author/parser.py +83 -0
  11. philoch_bib_sdk/converters/plaintext/bib_string_formatter.py +8 -0
  12. philoch_bib_sdk/converters/plaintext/bibitem/bibkey_formatter.py +21 -0
  13. philoch_bib_sdk/converters/plaintext/bibitem/bibkey_parser.py +158 -0
  14. philoch_bib_sdk/converters/plaintext/bibitem/date_formatter.py +37 -0
  15. philoch_bib_sdk/converters/plaintext/bibitem/date_parser.py +62 -0
  16. philoch_bib_sdk/converters/plaintext/bibitem/formatter.py +182 -0
  17. philoch_bib_sdk/converters/plaintext/bibitem/pages_formatter.py +13 -0
  18. philoch_bib_sdk/converters/plaintext/bibitem/pages_parser.py +63 -0
  19. philoch_bib_sdk/converters/plaintext/bibitem/parser.py +415 -0
  20. philoch_bib_sdk/converters/plaintext/journal/formatter.py +25 -0
  21. philoch_bib_sdk/converters/plaintext/journal/parser.py +36 -0
  22. philoch_bib_sdk/converters/plaintext/shared/renderable_formatter.py +25 -0
  23. philoch_bib_sdk/interfaces/cli/__init__.py +3 -0
  24. philoch_bib_sdk/interfaces/cli/fuzzy_matching.py +135 -0
  25. philoch_bib_sdk/logic/__init__.py +39 -0
  26. philoch_bib_sdk/logic/default_models.py +315 -0
  27. philoch_bib_sdk/logic/functions/__init__.py +31 -0
  28. philoch_bib_sdk/logic/functions/comparator.py +414 -0
  29. philoch_bib_sdk/logic/functions/fuzzy_matcher.py +796 -0
  30. philoch_bib_sdk/logic/functions/journal_article_matcher.py +44 -0
  31. philoch_bib_sdk/logic/literals.py +98 -0
  32. philoch_bib_sdk/logic/models.py +366 -0
  33. philoch_bib_sdk/logic/models_staging.py +173 -0
  34. philoch_bib_sdk/procedures/fuzzy_matching.py +112 -0
  35. philoch_bib_sdk/py.typed +0 -0
  36. philoch_bib_sdk/rust_scorer/Cargo.lock +232 -0
  37. philoch_bib_sdk/rust_scorer/Cargo.toml +26 -0
  38. philoch_bib_sdk/rust_scorer/pyproject.toml +15 -0
  39. philoch_bib_sdk/rust_scorer/rust_scorer.pyi +65 -0
  40. philoch_bib_sdk/rust_scorer/src/lib.rs +362 -0
  41. philoch_bib_sdk-0.3.9.dist-info/METADATA +15 -0
  42. philoch_bib_sdk-0.3.9.dist-info/RECORD +44 -0
  43. philoch_bib_sdk-0.3.9.dist-info/WHEEL +4 -0
  44. philoch_bib_sdk-0.3.9.dist-info/licenses/LICENSE +21 -0
philoch_bib_sdk/logic/functions/fuzzy_matcher.py
@@ -0,0 +1,796 @@
+ """High-performance fuzzy matching for BibItems using blocking indexes.
+
+ This module provides efficient fuzzy matching against large bibliographies (100k+ items)
+ by using multi-index blocking to reduce the search space before applying detailed scoring.
+
+ When available, uses a Rust-based batch scorer (rust_scorer) for parallel processing,
+ providing 10-100x speedup on large batches.
+ """
+
+ import pickle
+ import time
+ from collections import defaultdict
+ from pathlib import Path
+ from typing import TYPE_CHECKING, Any, DefaultDict, FrozenSet, Iterator, Sequence, Tuple
+
+ from aletk.utils import remove_extra_whitespace
+ from cytoolz import topk
+
+ from philoch_bib_sdk.converters.plaintext.author.formatter import format_author
+ from philoch_bib_sdk.converters.plaintext.bibitem.bibkey_formatter import format_bibkey
+ from philoch_bib_sdk.logic.functions.comparator import compare_bibitems_detailed
+ from philoch_bib_sdk.logic.models import Author, BibItem, BibItemDateAttr, BibStringAttr, TBibString
+ from philoch_bib_sdk.logic.models_staging import (
+     BibItemStaged,
+     Match,
+     PartialScore,
+     ScoreComponent,
+     SearchMetadata,
+ )
+
+
+ if TYPE_CHECKING:
+     from rust_scorer import BibItemData
+
+ # Try to import the Rust scorer for batch processing
+ try:
+     import rust_scorer
+
+     _RUST_SCORER_AVAILABLE = True
+ except ImportError:
+     _RUST_SCORER_AVAILABLE = False
+
+
+ class BibItemBlockIndex:
+     """Multi-index structure for fast candidate retrieval.
+
+     Uses multiple overlapping indexes (DOI, title n-grams, author surnames, year decades,
+     journal names) to quickly find potential matches without excluding items due to dirty data.
+
+     Attributes:
+         doi_index: Exact DOI lookup for instant matches
+         title_trigrams: Title n-gram index for fuzzy title matching
+         author_surnames: Author surname index for author matching
+         year_decades: Years grouped by decade (with None for missing)
+         journals: Journal name index
+         all_items: Complete tuple of all items (fallback)
+     """
+
+     def __init__(
+         self,
+         doi_index: dict[str, BibItem],
+         title_trigrams: dict[str, FrozenSet[BibItem]],
+         author_surnames: dict[str, FrozenSet[BibItem]],
+         year_decades: dict[int | None, FrozenSet[BibItem]],
+         journals: dict[str, FrozenSet[BibItem]],
+         all_items: Tuple[BibItem, ...],
+     ) -> None:
+         self.doi_index = doi_index
+         self.title_trigrams = title_trigrams
+         self.author_surnames = author_surnames
+         self.year_decades = year_decades
+         self.journals = journals
+         self.all_items = all_items
+
+
+ def _extract_trigrams(text: str) -> FrozenSet[str]:
+     """Extract 3-character n-grams from text for fuzzy matching.
+
+     Args:
+         text: Input text to extract trigrams from
+
+     Returns:
+         Frozen set of trigrams (immutable for hashing)
+     """
+     normalized = remove_extra_whitespace(text).lower()
+     if len(normalized) < 3:
+         return frozenset()
+     return frozenset(normalized[i : i + 3] for i in range(len(normalized) - 2))
+
+
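As a quick illustration of the trigram window (an editorial sketch, not part of the released file; it assumes remove_extra_whitespace only collapses runs of whitespace):

# Editorial sketch -- not part of the diff.
# "On  Denoting" normalizes to "on denoting"; each 3-char window becomes a key.
assert _extract_trigrams("On  Denoting") == frozenset(
    {"on ", "n d", " de", "den", "eno", "not", "oti", "tin", "ing"}
)
assert _extract_trigrams("ab") == frozenset()  # too short to index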
+ def _extract_author_surnames(authors: Tuple["Author", ...]) -> FrozenSet[str]:
+     """Extract author surnames for indexing.
+
+     Args:
+         authors: Tuple of Author objects
+
+     Returns:
+         Frozen set of normalized surnames
+     """
+     if not authors:
+         return frozenset()
+
+     surnames: list[str] = []
+     for author in authors:
+         if isinstance(author, Author):
+             family_name_attr = author.family_name
+             if isinstance(family_name_attr, BibStringAttr) and family_name_attr.simplified:
+                 surnames.append(remove_extra_whitespace(family_name_attr.simplified).lower())
+
+     return frozenset(surnames)
+
+
+ def _get_decade(date: BibItemDateAttr | str) -> int | None:
+     """Get the decade from a date attribute.
+
+     Args:
+         date: BibItemDateAttr or "no date"
+
+     Returns:
+         Decade as an integer (e.g., 1990) or None if no date
+     """
+     if date == "no date":
+         return None
+     if isinstance(date, BibItemDateAttr):
+         return (date.year // 10) * 10
+     return None
+
+
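For the decade bucketing, integer floor division does the work (an editorial sketch, not part of the released file; the constructor call assumes a BibItemDateAttr can be built from a year alone):

# Editorial sketch -- not part of the diff.
assert (1987 // 10) * 10 == 1980  # the arithmetic _get_decade relies on
assert _get_decade("no date") is None
# e.g. _get_decade(BibItemDateAttr(year=1987)) == 1980, if year is the only required field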
+ def _prepare_items_for_rust(bibitems: Sequence[BibItem]) -> list[dict[str, Any]]:
+     """Extract the minimal data needed by the Rust build_index_rust.
+
+     Args:
+         bibitems: Sequence of BibItems to prepare
+
+     Returns:
+         List of dicts with minimal data for Rust
+     """
+     items_data = []
+     for i, item in enumerate(bibitems):
+         # Extract title string
+         title_attr = item.title
+         if isinstance(title_attr, BibStringAttr):
+             title = title_attr.simplified
+         else:
+             title = str(title_attr) if title_attr else ""
+
+         # Extract author surnames
+         author_surnames = list(_extract_author_surnames(item.author))
+
+         # Extract year, already bucketed to its decade (the Rust side builds decade_to_indices)
+         decade = _get_decade(item.date)
+         year = decade if decade is not None else None
+
+         # Extract journal name
+         journal_name = None
+         if item.journal:
+             journal_name_attr = item.journal.name
+             if isinstance(journal_name_attr, BibStringAttr):
+                 journal_name = remove_extra_whitespace(journal_name_attr.simplified).lower()
+
+         items_data.append(
+             {
+                 "item_index": i,
+                 "doi": item.doi if item.doi else None,
+                 "title": title,
+                 "author_surnames": author_surnames,
+                 "year": year,
+                 "journal_name": journal_name,
+             }
+         )
+
+     return items_data
+
+
+ def _reconstruct_index_from_rust(index_data: Any, items: Tuple[BibItem, ...]) -> BibItemBlockIndex:
+     """Reconstruct a BibItemBlockIndex from Rust IndexData.
+
+     Args:
+         index_data: IndexData object from Rust
+         items: Tuple of original BibItems
+
+     Returns:
+         BibItemBlockIndex with all indexes built
+     """
+     # Convert Rust index mappings back to Python objects using the original BibItems
+     doi_index = {doi: items[idx] for doi, idx in index_data.doi_to_index.items()}
+
+     title_trigrams = {
+         trigram: frozenset(items[idx] for idx in indices)
+         for trigram, indices in index_data.trigram_to_indices.items()
+     }
+
+     author_surnames = {
+         surname: frozenset(items[idx] for idx in indices)
+         for surname, indices in index_data.surname_to_indices.items()
+     }
+
+     year_decades = {
+         decade: frozenset(items[idx] for idx in indices)
+         for decade, indices in index_data.decade_to_indices.items()
+     }
+
+     journals = {
+         name: frozenset(items[idx] for idx in indices)
+         for name, indices in index_data.journal_to_indices.items()
+     }
+
+     return BibItemBlockIndex(
+         doi_index=doi_index,
+         title_trigrams=title_trigrams,
+         author_surnames=author_surnames,
+         year_decades=year_decades,
+         journals=journals,
+         all_items=items,
+     )
+
+
+ def _build_index_python(bibitems: Tuple[BibItem, ...]) -> BibItemBlockIndex:
+     """Pure-Python implementation of build_index (fallback).
+
+     Creates overlapping indexes to handle dirty data gracefully while maintaining
+     fast lookup performance. No pre-filtering means no data loss.
+
+     Optimized for performance:
+     - Single-pass indexing (one loop instead of 5)
+     - Deferred frozenset conversion (only at the end)
+     - Reduced memory allocations
+
+     Args:
+         bibitems: Tuple of BibItems to index
+
+     Returns:
+         BibItemBlockIndex with all indexes built
+     """
+     # Initialize all index structures
+     doi_index: dict[str, BibItem] = {}
+     title_trigram_map: DefaultDict[str, set[BibItem]] = defaultdict(set)
+     author_surname_map: DefaultDict[str, set[BibItem]] = defaultdict(set)
+     year_decade_map: DefaultDict[int | None, set[BibItem]] = defaultdict(set)
+     journal_map: DefaultDict[str, set[BibItem]] = defaultdict(set)
+
+     # Single pass over all items -- build all indexes at once
+     for item in bibitems:
+         # DOI index
+         if item.doi:
+             doi_index[item.doi] = item
+
+         # Title trigram index
+         title_attr = item.title
+         if isinstance(title_attr, BibStringAttr):
+             trigrams = _extract_trigrams(title_attr.simplified)
+             for trigram in trigrams:
+                 title_trigram_map[trigram].add(item)
+
+         # Author surname index
+         surnames = _extract_author_surnames(item.author)
+         for surname in surnames:
+             author_surname_map[surname].add(item)
+
+         # Year decade index
+         decade = _get_decade(item.date)
+         year_decade_map[decade].add(item)
+
+         # Journal index
+         if item.journal:
+             journal_name_attr = item.journal.name
+             if isinstance(journal_name_attr, BibStringAttr):
+                 journal_name = remove_extra_whitespace(journal_name_attr.simplified).lower()
+                 if journal_name:
+                     journal_map[journal_name].add(item)
+
+     # Convert sets to frozensets only at the end (single pass per index)
+     title_trigrams = {trigram: frozenset(items) for trigram, items in title_trigram_map.items()}
+     author_surnames = {surname: frozenset(items) for surname, items in author_surname_map.items()}
+     year_decades = {decade: frozenset(items) for decade, items in year_decade_map.items()}
+     journals = {name: frozenset(items) for name, items in journal_map.items()}
+
+     return BibItemBlockIndex(
+         doi_index=doi_index,
+         title_trigrams=title_trigrams,
+         author_surnames=author_surnames,
+         year_decades=year_decades,
+         journals=journals,
+         all_items=bibitems,
+     )
+
+
+ def build_index(bibitems: Sequence[BibItem]) -> BibItemBlockIndex:
+     """Build the multi-index structure for fast fuzzy matching.
+
+     Creates overlapping indexes to handle dirty data gracefully while maintaining
+     fast lookup performance. No pre-filtering means no data loss.
+
+     Uses the Rust implementation when available (100x faster); falls back to Python.
+
+     Args:
+         bibitems: Sequence of BibItems to index
+
+     Returns:
+         BibItemBlockIndex with all indexes built
+     """
+     # Try to use the Rust implementation
+     try:
+         from philoch_bib_sdk._rust import build_index_rust  # type: ignore[import-not-found]
+
+         use_rust = True
+     except ImportError:
+         use_rust = False
+
+     # Convert to a tuple for immutability
+     items_tuple = tuple(bibitems)
+
+     if use_rust:
+         # Fast path: use Rust
+         items_data = _prepare_items_for_rust(items_tuple)
+         index_data = build_index_rust(items_data)
+         return _reconstruct_index_from_rust(index_data, items_tuple)
+     else:
+         # Fallback: pure Python
+         return _build_index_python(items_tuple)
+
+
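A minimal driving sketch for this entry point (editorial, not part of the released file; load_bibitems is a hypothetical stand-in for whatever produces BibItem objects):

# Editorial sketch -- not part of the diff.
bibliography = load_bibitems("master.bib")  # hypothetical loader returning Sequence[BibItem]
index = build_index(bibliography)           # uses philoch_bib_sdk._rust when importable
print(f"{len(index.all_items)} items indexed, {len(index.doi_index)} with DOIs")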
+ # --- Rust Scorer Integration ---
+
+
+ def _prepare_bibitem_for_rust_scorer(item: BibItem, idx: int) -> "BibItemData":
+     """Prepare a BibItem for the Rust scorer.
+
+     Extracts the simplified string fields needed for fuzzy matching.
+
+     Args:
+         item: BibItem to prepare
+         idx: Index of the item in the source list (for result reconstruction)
+
+     Returns:
+         Dict with fields for the Rust BibItemData struct
+     """
+     # Title
+     if isinstance(item.title, BibStringAttr):
+         title = item.title.simplified
+     else:
+         title = str(item.title) if item.title else ""
+
+     # Author
+     author = format_author(item.author, "simplified")
+
+     # Year
+     year = None
+     if item.date != "no date" and isinstance(item.date, BibItemDateAttr):
+         year = item.date.year
+
+     # Journal
+     journal = None
+     if item.journal and isinstance(item.journal.name, BibStringAttr):
+         journal = item.journal.name.simplified
+
+     # Volume, number, pages (volume and number are on BibItem, not Journal)
+     volume = item.volume if item.volume else None
+     number = item.number if item.number else None
+     pages = None
+     if item.pages and len(item.pages) > 0:
+         # pages is a tuple of PageAttr objects; take the first one
+         first_page = item.pages[0]
+         if first_page.end:
+             pages = f"{first_page.start}--{first_page.end}"
+         else:
+             pages = first_page.start
+
+     # Publisher
+     publisher = None
+     if item.publisher and isinstance(item.publisher, BibStringAttr):
+         publisher = item.publisher.simplified
+
+     return {
+         "index": idx,
+         "title": title,
+         "author": author,
+         "year": year,
+         "doi": item.doi,
+         "journal": journal,
+         "volume": volume,
+         "number": number,
+         "pages": pages,
+         "publisher": publisher,
+     }
+
+
+ def _find_similar_batch_rust(
+     subjects: Sequence[BibItem],
+     candidates: Sequence[BibItem],
+     top_n: int,
+     min_score: float,
+ ) -> list[Tuple[Match, ...]]:
+     """Batch-find similar items using the Rust scorer.
+
+     Scores all subjects against all candidates in parallel using Rust.
+
+     Args:
+         subjects: Sequence of BibItems to find matches for
+         candidates: Sequence of candidate BibItems to match against
+         top_n: Number of top matches per subject
+         min_score: Minimum score threshold
+
+     Returns:
+         List of Match tuples, one per subject
+     """
+     if not _RUST_SCORER_AVAILABLE:
+         raise RuntimeError("Rust scorer not available")
+
+     # Prepare data for Rust
+     subjects_data = [_prepare_bibitem_for_rust_scorer(s, i) for i, s in enumerate(subjects)]
+     candidates_data = [_prepare_bibitem_for_rust_scorer(c, i) for i, c in enumerate(candidates)]
+
+     # Call the Rust batch scorer
+     results = rust_scorer.score_batch(subjects_data, candidates_data, top_n, min_score)
+
+     # Reconstruct Match objects
+     all_matches: list[Tuple[Match, ...]] = []
+
+     for result in results:
+         matches: list[Match] = []
+         # Handle both dict and object access patterns from Rust
+         result_matches = result.get("matches", []) if isinstance(result, dict) else result.matches
+         for rank, match_result in enumerate(result_matches, start=1):
+             # Handle both dict and object access patterns
+             if isinstance(match_result, dict):
+                 cand_idx = match_result["candidate_index"]
+                 title_score = match_result["title_score"]
+                 author_score = match_result["author_score"]
+                 date_score = match_result["date_score"]
+                 bonus_score = match_result["bonus_score"]
+                 total_score = match_result["total_score"]
+             else:
+                 cand_idx = match_result.candidate_index
+                 title_score = match_result.title_score
+                 author_score = match_result.author_score
+                 date_score = match_result.date_score
+                 bonus_score = match_result.bonus_score
+                 total_score = match_result.total_score
+
+             candidate = candidates[cand_idx]
+
+             # Create PartialScore objects from the Rust scores.
+             # Rust returns weighted scores, so the raw score is recovered by
+             # dividing by the component weight (weighted_score = score * weight).
+             partial_scores = (
+                 PartialScore(
+                     component=ScoreComponent.TITLE,
+                     score=int(title_score / 0.5) if title_score > 0 else 0,
+                     weight=0.5,
+                     weighted_score=title_score,
+                     details="[rust]",
+                 ),
+                 PartialScore(
+                     component=ScoreComponent.AUTHOR,
+                     score=int(author_score / 0.3) if author_score > 0 else 0,
+                     weight=0.3,
+                     weighted_score=author_score,
+                     details="[rust]",
+                 ),
+                 PartialScore(
+                     component=ScoreComponent.DATE,
+                     score=int(date_score / 0.1) if date_score > 0 else 0,
+                     weight=0.1,
+                     weighted_score=date_score,
+                     details="[rust]",
+                 ),
+                 PartialScore(
+                     component=ScoreComponent.PUBLISHER,  # used as a generic bonus component
+                     score=int(bonus_score / 0.1) if bonus_score > 0 else 0,
+                     weight=0.1,
+                     weighted_score=bonus_score,
+                     details="[rust]",
+                 ),
+             )
+
+             matches.append(
+                 Match(
+                     bibkey=format_bibkey(candidate.bibkey),
+                     matched_bibitem=candidate,
+                     total_score=total_score,
+                     partial_scores=partial_scores,
+                     rank=rank,
+                 )
+             )
+
+         all_matches.append(tuple(matches))
+
+     return all_matches
+
+
+ def _get_candidate_set(subject: BibItem, index: BibItemBlockIndex) -> FrozenSet[BibItem]:
+     """Get candidate items from the index using multiple lookup strategies.
+
+     Combines results from multiple indexes to create a candidate set that's
+     much smaller than the full bibliography but still comprehensive.
+
+     Args:
+         subject: BibItem to find candidates for
+         index: BibItemBlockIndex to search
+
+     Returns:
+         Frozen set of candidate BibItems (typically 0.5-2% of the total)
+     """
+     candidates: set[BibItem] = set()
+
+     # Check DOI first (instant exact match)
+     if subject.doi and subject.doi in index.doi_index:
+         return frozenset([index.doi_index[subject.doi]])
+
+     # Title trigrams
+     title_attr = subject.title
+     if isinstance(title_attr, BibStringAttr):
+         subject_trigrams = _extract_trigrams(title_attr.simplified)
+         for trigram in subject_trigrams:
+             if trigram in index.title_trigrams:
+                 candidates.update(index.title_trigrams[trigram])
+
+     # Author surnames
+     subject_surnames = _extract_author_surnames(subject.author)
+     for surname in subject_surnames:
+         if surname in index.author_surnames:
+             candidates.update(index.author_surnames[surname])
+
+     # Year decades (±5 decades = ±50 years, for safety)
+     subject_decade = _get_decade(subject.date)
+     if subject_decade is not None:
+         for offset in range(-5, 6):
+             decade = subject_decade + (offset * 10)
+             if decade in index.year_decades:
+                 candidates.update(index.year_decades[decade])
+     else:
+         # No date: include all items with no date
+         if None in index.year_decades:
+             candidates.update(index.year_decades[None])
+
+     # Journal
+     if subject.journal:
+         journal_name_attr = subject.journal.name
+         if isinstance(journal_name_attr, BibStringAttr):
+             journal_name = remove_extra_whitespace(journal_name_attr.simplified).lower()
+             if journal_name and journal_name in index.journals:
+                 candidates.update(index.journals[journal_name])
+
+     # Fallback: if no candidates were found, use all items (rare but safe)
+     if not candidates:
+         return frozenset(index.all_items)
+
+     return frozenset(candidates)
+
+
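Note the DOI short-circuit: one exact identifier trumps every fuzzy signal, so a matching DOI yields a single-item candidate set before any trigram, surname, decade, or journal lookup runs (editorial sketch, not part of the released file):

# Editorial sketch -- not part of the diff.
cands = _get_candidate_set(subject, index)
if subject.doi and subject.doi in index.doi_index:
    assert cands == frozenset([index.doi_index[subject.doi]])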
+ def find_similar_bibitems(
+     subject: BibItem,
+     index: BibItemBlockIndex,
+     top_n: int = 5,
+     min_score: float = 0.0,
+     bibstring_type: TBibString = "simplified",
+ ) -> Tuple[Match, ...]:
+     """Find the top N most similar BibItems using fuzzy matching.
+
+     Uses blocking indexes to reduce the search space, then applies detailed
+     fuzzy scoring to find the best matches.
+
+     Args:
+         subject: BibItem to find matches for
+         index: Pre-built BibItemBlockIndex
+         top_n: Number of top matches to return (default: 5)
+         min_score: Minimum score threshold (default: 0.0)
+         bibstring_type: Which bibstring variant to use (default: "simplified")
+
+     Returns:
+         Tuple of Match objects with detailed scoring, sorted by score (best first)
+     """
+     # Get the candidate set from the indexes
+     candidates = _get_candidate_set(subject, index)
+
+     # Score all candidates (generator for memory efficiency)
+     scored_items = (
+         (
+             candidate,
+             compare_bibitems_detailed(candidate, subject, bibstring_type),
+         )
+         for candidate in candidates
+     )
+
+     # Calculate total scores
+     with_totals = (
+         (candidate, partial_scores, sum(ps.weighted_score for ps in partial_scores))
+         for candidate, partial_scores in scored_items
+     )
+
+     # Filter by minimum score
+     filtered = (
+         (candidate, partial_scores, total_score)
+         for candidate, partial_scores, total_score in with_totals
+         if total_score >= min_score
+     )
+
+     # Take the top N using cytoolz (heap-based, efficient)
+     top_results = tuple(topk(top_n, filtered, key=lambda x: x[2]))
+
+     # Convert to Match objects
+     return tuple(
+         Match(
+             bibkey=format_bibkey(candidate.bibkey),
+             matched_bibitem=candidate,
+             total_score=total_score,
+             partial_scores=partial_scores,
+             rank=rank,
+         )
+         for rank, (candidate, partial_scores, total_score) in enumerate(top_results, start=1)
+     )
+
+
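Typical single-item usage (editorial sketch, not part of the released file):

# Editorial sketch -- not part of the diff.
matches = find_similar_bibitems(subject, index, top_n=3, min_score=0.4)
for m in matches:
    print(m.rank, m.bibkey, round(m.total_score, 3))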
+ def stage_bibitem(
+     bibitem: BibItem,
+     index: BibItemBlockIndex,
+     top_n: int = 5,
+     min_score: float = 0.0,
+ ) -> BibItemStaged:
+     """Stage a single BibItem with its top matches.
+
+     Args:
+         bibitem: BibItem to stage
+         index: Pre-built BibItemBlockIndex
+         top_n: Number of top matches to find (default: 5)
+         min_score: Minimum score threshold (default: 0.0)
+
+     Returns:
+         BibItemStaged with top matches and search metadata
+     """
+     start_time = time.perf_counter()
+     candidates = _get_candidate_set(bibitem, index)
+     top_matches = find_similar_bibitems(bibitem, index, top_n, min_score)
+     end_time = time.perf_counter()
+
+     search_metadata: SearchMetadata = {
+         "search_time_ms": int((end_time - start_time) * 1000),
+         "candidates_searched": len(candidates),
+     }
+
+     return BibItemStaged(
+         bibitem=bibitem,
+         top_matches=top_matches,
+         search_metadata=search_metadata,
+     )
+
+
+ def stage_bibitems_batch(
+     bibitems: Sequence[BibItem],
+     index: BibItemBlockIndex,
+     top_n: int = 5,
+     min_score: float = 0.0,
+     use_rust: bool | None = None,
+ ) -> Tuple[BibItemStaged, ...]:
+     """Stage multiple BibItems in batch.
+
+     When the Rust scorer is available, processes all items in parallel for a
+     significant speedup (10-100x on large batches).
+
+     Args:
+         bibitems: Sequence of BibItems to stage
+         index: Pre-built BibItemBlockIndex
+         top_n: Number of top matches per item (default: 5)
+         min_score: Minimum score threshold (default: 0.0)
+         use_rust: Force Rust (True), Python (False), or auto-detect (None)
+
+     Returns:
+         Tuple of BibItemStaged objects
+     """
+     # Determine whether to use Rust
+     if use_rust is None:
+         use_rust = _RUST_SCORER_AVAILABLE
+
+     if use_rust and not _RUST_SCORER_AVAILABLE:
+         raise RuntimeError("Rust scorer requested but not available")
+
+     if use_rust:
+         # Fast path: the Rust batch scorer scores against all indexed items
+         start_time = time.perf_counter()
+         all_matches = _find_similar_batch_rust(bibitems, index.all_items, top_n, min_score)
+         end_time = time.perf_counter()
+
+         # Create BibItemStaged objects; per-item time is the batch average
+         total_time_ms = int((end_time - start_time) * 1000)
+         time_per_item = total_time_ms // len(bibitems) if bibitems else 0
+
+         return tuple(
+             BibItemStaged(
+                 bibitem=bibitem,
+                 top_matches=matches,
+                 search_metadata={
+                     "search_time_ms": time_per_item,
+                     "candidates_searched": len(index.all_items),
+                     "scorer": "rust",
+                 },
+             )
+             for bibitem, matches in zip(bibitems, all_matches)
+         )
+     else:
+         # Fallback: Python sequential processing
+         return tuple(stage_bibitem(bibitem, index, top_n, min_score) for bibitem in bibitems)
+
+
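Worth noting: the Rust path scores each subject against index.all_items (blocking is skipped in favor of parallel brute force), while the Python fallback goes through the candidate-set machinery per item. A usage sketch (editorial, not part of the released file):

# Editorial sketch -- not part of the diff.
staged = stage_bibitems_batch(new_items, index)                     # auto-detect scorer
staged_py = stage_bibitems_batch(new_items, index, use_rust=False)  # force the Python path
print(staged[0].search_metadata)  # includes 'scorer': 'rust' on the fast path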
+ def stage_bibitems_streaming(
+     bibitems: Sequence[BibItem],
+     index: BibItemBlockIndex,
+     top_n: int = 5,
+     min_score: float = 0.0,
+ ) -> Iterator[BibItemStaged]:
+     """Stage multiple BibItems with streaming results.
+
+     Yields BibItemStaged objects one at a time as they're processed,
+     enabling real-time progress monitoring and immediate CSV output.
+
+     Args:
+         bibitems: Sequence of BibItems to stage
+         index: Pre-built BibItemBlockIndex
+         top_n: Number of top matches per item (default: 5)
+         min_score: Minimum score threshold (default: 0.0)
+
+     Yields:
+         BibItemStaged objects as they're processed
+     """
+     for bibitem in bibitems:
+         yield stage_bibitem(bibitem, index, top_n, min_score)
+
+
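For long runs this pairs naturally with progress reporting (editorial sketch, not part of the released file):

# Editorial sketch -- not part of the diff.
for i, staged in enumerate(stage_bibitems_streaming(new_items, index), start=1):
    print(f"{i}/{len(new_items)}: {len(staged.top_matches)} matches")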
+ # --- Index Caching ---
+
+
+ def save_index(index: BibItemBlockIndex, cache_path: Path) -> None:
+     """Save the index to a pickle file for later reuse.
+
+     Args:
+         index: BibItemBlockIndex to save
+         cache_path: Path to save the pickle file
+     """
+     cache_path.parent.mkdir(parents=True, exist_ok=True)
+     with open(cache_path, "wb") as f:
+         pickle.dump(index, f, protocol=pickle.HIGHEST_PROTOCOL)
+
+
+ def load_index(cache_path: Path) -> BibItemBlockIndex | None:
+     """Load the index from a pickle file if it exists and is valid.
+
+     Args:
+         cache_path: Path to the pickle file
+
+     Returns:
+         BibItemBlockIndex if successfully loaded, None otherwise
+     """
+     if not cache_path.exists():
+         return None
+     try:
+         with open(cache_path, "rb") as f:
+             loaded = pickle.load(f)
+         if not isinstance(loaded, BibItemBlockIndex):
+             raise TypeError(
+                 f"Cached index at {cache_path} contains {type(loaded).__name__}, expected BibItemBlockIndex"
+             )
+         return loaded
+     except TypeError:
+         raise
+     except Exception:
+         return None
+
+
+ def build_index_cached(
+     bibitems: Sequence[BibItem],
+     cache_path: Path | None = None,
+     force_rebuild: bool = False,
+ ) -> BibItemBlockIndex:
+     """Build the index with optional caching to avoid rebuilding.
+
+     If cache_path is provided and a valid cached index exists, it will be loaded
+     instead of rebuilding. Otherwise, builds the index and optionally saves it.
+
+     Args:
+         bibitems: Sequence of BibItems to index
+         cache_path: Optional path to cache the index (pickle file)
+         force_rebuild: If True, rebuild the index even if a cache exists
+
+     Returns:
+         BibItemBlockIndex (either from cache or freshly built)
+     """
+     # Try loading from the cache first
+     if cache_path and not force_rebuild:
+         cached = load_index(cache_path)
+         if cached is not None:
+             return cached
+
+     # Build a fresh index
+     index = build_index(bibitems)
+
+     # Save to the cache if a path was provided
+     if cache_path:
+         save_index(index, cache_path)
+
+     return index
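Taken together, a plausible end-to-end flow for the module (editorial sketch, not part of the released file; the loader and paths are hypothetical):

# Editorial sketch -- not part of the diff.
from pathlib import Path

bibliography = load_bibitems("master.bib")  # hypothetical loader
index = build_index_cached(bibliography, cache_path=Path(".cache/bib_index.pkl"))
staged = stage_bibitems_batch(new_items, index, top_n=5, min_score=0.3)  # new_items: items to dedupe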