philoch_bib_sdk-0.3.9-cp313-cp313-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- philoch_bib_sdk/__init__.py +0 -0
- philoch_bib_sdk/_rust.cp313-win_amd64.pyd +0 -0
- philoch_bib_sdk/adapters/io/__init__.py +115 -0
- philoch_bib_sdk/adapters/io/csv/__init__.py +308 -0
- philoch_bib_sdk/adapters/io/ods/__init__.py +145 -0
- philoch_bib_sdk/adapters/plaintext/bibitem_reader.py +0 -0
- philoch_bib_sdk/adapters/tabular_data/read_journal_volume_number_index.py +58 -0
- philoch_bib_sdk/converters/latex.py +6 -0
- philoch_bib_sdk/converters/plaintext/author/formatter.py +34 -0
- philoch_bib_sdk/converters/plaintext/author/parser.py +83 -0
- philoch_bib_sdk/converters/plaintext/bib_string_formatter.py +8 -0
- philoch_bib_sdk/converters/plaintext/bibitem/bibkey_formatter.py +21 -0
- philoch_bib_sdk/converters/plaintext/bibitem/bibkey_parser.py +158 -0
- philoch_bib_sdk/converters/plaintext/bibitem/date_formatter.py +37 -0
- philoch_bib_sdk/converters/plaintext/bibitem/date_parser.py +62 -0
- philoch_bib_sdk/converters/plaintext/bibitem/formatter.py +182 -0
- philoch_bib_sdk/converters/plaintext/bibitem/pages_formatter.py +13 -0
- philoch_bib_sdk/converters/plaintext/bibitem/pages_parser.py +63 -0
- philoch_bib_sdk/converters/plaintext/bibitem/parser.py +415 -0
- philoch_bib_sdk/converters/plaintext/journal/formatter.py +25 -0
- philoch_bib_sdk/converters/plaintext/journal/parser.py +36 -0
- philoch_bib_sdk/converters/plaintext/shared/renderable_formatter.py +25 -0
- philoch_bib_sdk/interfaces/cli/__init__.py +3 -0
- philoch_bib_sdk/interfaces/cli/fuzzy_matching.py +135 -0
- philoch_bib_sdk/logic/__init__.py +39 -0
- philoch_bib_sdk/logic/default_models.py +315 -0
- philoch_bib_sdk/logic/functions/__init__.py +31 -0
- philoch_bib_sdk/logic/functions/comparator.py +414 -0
- philoch_bib_sdk/logic/functions/fuzzy_matcher.py +796 -0
- philoch_bib_sdk/logic/functions/journal_article_matcher.py +44 -0
- philoch_bib_sdk/logic/literals.py +98 -0
- philoch_bib_sdk/logic/models.py +366 -0
- philoch_bib_sdk/logic/models_staging.py +173 -0
- philoch_bib_sdk/procedures/fuzzy_matching.py +112 -0
- philoch_bib_sdk/py.typed +0 -0
- philoch_bib_sdk/rust_scorer/Cargo.lock +232 -0
- philoch_bib_sdk/rust_scorer/Cargo.toml +26 -0
- philoch_bib_sdk/rust_scorer/pyproject.toml +15 -0
- philoch_bib_sdk/rust_scorer/rust_scorer.pyi +65 -0
- philoch_bib_sdk/rust_scorer/src/lib.rs +362 -0
- philoch_bib_sdk-0.3.9.dist-info/METADATA +15 -0
- philoch_bib_sdk-0.3.9.dist-info/RECORD +44 -0
- philoch_bib_sdk-0.3.9.dist-info/WHEEL +4 -0
- philoch_bib_sdk-0.3.9.dist-info/licenses/LICENSE +21 -0

philoch_bib_sdk/logic/functions/fuzzy_matcher.py (new file, 796 lines)

@@ -0,0 +1,796 @@

"""High-performance fuzzy matching for BibItems using blocking indexes.

This module provides efficient fuzzy matching against large bibliographies (100k+ items)
by using multi-index blocking to reduce the search space before applying detailed scoring.

When available, uses a Rust-based batch scorer (rust_scorer) for parallel processing,
providing 10-100x speedup on large batches.
"""

import pickle
import time
from collections import defaultdict
from pathlib import Path
from typing import TYPE_CHECKING, Any, DefaultDict, FrozenSet, Iterator, Sequence, Tuple

from aletk.utils import remove_extra_whitespace
from cytoolz import topk

from philoch_bib_sdk.converters.plaintext.author.formatter import format_author
from philoch_bib_sdk.converters.plaintext.bibitem.bibkey_formatter import format_bibkey
from philoch_bib_sdk.logic.functions.comparator import compare_bibitems_detailed
from philoch_bib_sdk.logic.models import Author, BibItem, BibItemDateAttr, BibStringAttr, TBibString
from philoch_bib_sdk.logic.models_staging import (
    BibItemStaged,
    Match,
    PartialScore,
    ScoreComponent,
    SearchMetadata,
)

if TYPE_CHECKING:
    from rust_scorer import BibItemData

# Try to import the Rust scorer for batch processing
try:
    import rust_scorer

    _RUST_SCORER_AVAILABLE = True
except ImportError:
    _RUST_SCORER_AVAILABLE = False


class BibItemBlockIndex:
    """Multi-index structure for fast candidate retrieval.

    Uses multiple overlapping indexes (DOI, title n-grams, author surnames, year decades,
    journal names) to quickly find potential matches without excluding items due to dirty data.

    Attributes:
        doi_index: Exact DOI lookup for instant matches
        title_trigrams: Title n-gram index for fuzzy title matching
        author_surnames: Author surname index for author matching
        year_decades: Years grouped by decade (with None for missing dates)
        journals: Journal name index
        all_items: Complete tuple of all items (fallback)
    """

    def __init__(
        self,
        doi_index: dict[str, BibItem],
        title_trigrams: dict[str, FrozenSet[BibItem]],
        author_surnames: dict[str, FrozenSet[BibItem]],
        year_decades: dict[int | None, FrozenSet[BibItem]],
        journals: dict[str, FrozenSet[BibItem]],
        all_items: Tuple[BibItem, ...],
    ) -> None:
        self.doi_index = doi_index
        self.title_trigrams = title_trigrams
        self.author_surnames = author_surnames
        self.year_decades = year_decades
        self.journals = journals
        self.all_items = all_items


def _extract_trigrams(text: str) -> FrozenSet[str]:
    """Extract 3-character n-grams from text for fuzzy matching.

    Args:
        text: Input text to extract trigrams from

    Returns:
        Frozen set of trigrams (immutable for hashing)
    """
    normalized = remove_extra_whitespace(text).lower()
    if len(normalized) < 3:
        return frozenset()
    return frozenset(normalized[i : i + 3] for i in range(len(normalized) - 2))
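
To make the trigram blocking concrete, here is a self-contained sketch of the same
windowing logic (it only assumes that remove_extra_whitespace collapses runs of
whitespace, so str.split/str.join stands in for it here):

    def extract_trigrams_sketch(text: str) -> frozenset[str]:
        normalized = " ".join(text.split()).lower()
        if len(normalized) < 3:
            return frozenset()
        return frozenset(normalized[i : i + 3] for i in range(len(normalized) - 2))

    print(sorted(extract_trigrams_sketch("On Denoting")))
    # [' de', 'den', 'eno', 'ing', 'n d', 'not', 'on ', 'oti', 'tin']

Two titles that differ by a single typo still share most of their trigrams, which is
why the trigram index can retrieve near-duplicates that an exact lookup would miss.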


def _extract_author_surnames(authors: Tuple["Author", ...]) -> FrozenSet[str]:
    """Extract author surnames for indexing.

    Args:
        authors: Tuple of Author objects

    Returns:
        Frozen set of normalized surnames
    """
    if not authors:
        return frozenset()

    surnames: list[str] = []
    for author in authors:
        if isinstance(author, Author):
            family_name_attr = author.family_name
            if isinstance(family_name_attr, BibStringAttr) and family_name_attr.simplified:
                surnames.append(remove_extra_whitespace(family_name_attr.simplified).lower())

    return frozenset(surnames)


def _get_decade(date: BibItemDateAttr | str) -> int | None:
    """Get the decade from a date attribute.

    Args:
        date: BibItemDateAttr or "no date"

    Returns:
        Decade as integer (e.g., 1990) or None if no date
    """
    if date == "no date":
        return None
    if isinstance(date, BibItemDateAttr):
        return (date.year // 10) * 10
    return None
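
For example, a decade bucket of 1990 covers publication years 1990-1999; combined
with the ±5-decade window used later in _get_candidate_set, an item is compared
against everything within roughly 50 years of its own decade. A quick sketch (the
year keyword is an assumption about the BibItemDateAttr constructor):

    _get_decade(BibItemDateAttr(year=1997))  # -> 1990
    _get_decade("no date")                   # -> None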


def _prepare_items_for_rust(bibitems: Sequence[BibItem]) -> list[dict[str, Any]]:
    """Extract minimal data needed by Rust build_index_rust.

    Args:
        bibitems: Sequence of BibItems to prepare

    Returns:
        List of dicts with minimal data for Rust
    """
    items_data = []
    for i, item in enumerate(bibitems):
        # Extract title string
        title_attr = item.title
        if isinstance(title_attr, BibStringAttr):
            title = title_attr.simplified
        else:
            title = str(title_attr) if title_attr else ""

        # Extract author surnames
        author_surnames = list(_extract_author_surnames(item.author))

        # Extract year, bucketed by decade (the Rust index blocks on decades)
        decade = _get_decade(item.date)
        year = decade if decade is not None else None

        # Extract journal name
        journal_name = None
        if item.journal:
            journal_name_attr = item.journal.name
            if isinstance(journal_name_attr, BibStringAttr):
                journal_name = remove_extra_whitespace(journal_name_attr.simplified).lower()

        items_data.append(
            {
                "item_index": i,
                "doi": item.doi if item.doi else None,
                "title": title,
                "author_surnames": author_surnames,
                "year": year,
                "journal_name": journal_name,
            }
        )

    return items_data


def _reconstruct_index_from_rust(index_data: Any, items: Tuple[BibItem, ...]) -> BibItemBlockIndex:
    """Reconstruct BibItemBlockIndex from Rust IndexData.

    Args:
        index_data: IndexData object from Rust
        items: Tuple of original BibItems

    Returns:
        BibItemBlockIndex with all indexes built
    """
    # Convert Rust index mappings back to Python objects using original BibItems
    doi_index = {doi: items[idx] for doi, idx in index_data.doi_to_index.items()}

    title_trigrams = {
        trigram: frozenset(items[idx] for idx in indices)
        for trigram, indices in index_data.trigram_to_indices.items()
    }

    author_surnames = {
        surname: frozenset(items[idx] for idx in indices)
        for surname, indices in index_data.surname_to_indices.items()
    }

    year_decades = {
        decade: frozenset(items[idx] for idx in indices)
        for decade, indices in index_data.decade_to_indices.items()
    }

    journals = {
        name: frozenset(items[idx] for idx in indices)
        for name, indices in index_data.journal_to_indices.items()
    }

    return BibItemBlockIndex(
        doi_index=doi_index,
        title_trigrams=title_trigrams,
        author_surnames=author_surnames,
        year_decades=year_decades,
        journals=journals,
        all_items=items,
    )


def _build_index_python(bibitems: Tuple[BibItem, ...]) -> BibItemBlockIndex:
    """Pure Python implementation of build_index (fallback).

    Creates overlapping indexes to handle dirty data gracefully while maintaining
    fast lookup performance. No pre-filtering means no data loss.

    Optimized for performance:
    - Single-pass indexing (one loop instead of 5)
    - Deferred frozenset conversion (only at the end)
    - Reduced memory allocations

    Args:
        bibitems: Tuple of BibItems to index

    Returns:
        BibItemBlockIndex with all indexes built
    """
    # Initialize all index structures
    doi_index: dict[str, BibItem] = {}
    title_trigram_map: DefaultDict[str, set[BibItem]] = defaultdict(set)
    author_surname_map: DefaultDict[str, set[BibItem]] = defaultdict(set)
    year_decade_map: DefaultDict[int | None, set[BibItem]] = defaultdict(set)
    journal_map: DefaultDict[str, set[BibItem]] = defaultdict(set)

    # Single pass over all items - build all indexes at once
    for item in bibitems:
        # DOI index
        if item.doi:
            doi_index[item.doi] = item

        # Title trigram index
        title_attr = item.title
        if isinstance(title_attr, BibStringAttr):
            trigrams = _extract_trigrams(title_attr.simplified)
            for trigram in trigrams:
                title_trigram_map[trigram].add(item)

        # Author surname index
        surnames = _extract_author_surnames(item.author)
        for surname in surnames:
            author_surname_map[surname].add(item)

        # Year decade index
        decade = _get_decade(item.date)
        year_decade_map[decade].add(item)

        # Journal index
        if item.journal:
            journal_name_attr = item.journal.name
            if isinstance(journal_name_attr, BibStringAttr):
                journal_name = remove_extra_whitespace(journal_name_attr.simplified).lower()
                if journal_name:
                    journal_map[journal_name].add(item)

    # Convert sets to frozensets only at the end (single pass per index)
    title_trigrams = {trigram: frozenset(items) for trigram, items in title_trigram_map.items()}
    author_surnames = {surname: frozenset(items) for surname, items in author_surname_map.items()}
    year_decades = {decade: frozenset(items) for decade, items in year_decade_map.items()}
    journals = {name: frozenset(items) for name, items in journal_map.items()}

    return BibItemBlockIndex(
        doi_index=doi_index,
        title_trigrams=title_trigrams,
        author_surnames=author_surnames,
        year_decades=year_decades,
        journals=journals,
        all_items=bibitems,
    )


def build_index(bibitems: Sequence[BibItem]) -> BibItemBlockIndex:
    """Build multi-index structure for fast fuzzy matching.

    Creates overlapping indexes to handle dirty data gracefully while maintaining
    fast lookup performance. No pre-filtering means no data loss.

    Uses the Rust implementation when available (roughly 100x faster), falling
    back to pure Python otherwise.

    Args:
        bibitems: Sequence of BibItems to index

    Returns:
        BibItemBlockIndex with all indexes built
    """
    # Try to use the Rust implementation
    try:
        from philoch_bib_sdk._rust import build_index_rust  # type: ignore[import-not-found]

        use_rust = True
    except ImportError:
        use_rust = False

    # Convert to a tuple for immutability
    items_tuple = tuple(bibitems)

    if use_rust:
        # Fast path: use Rust
        items_data = _prepare_items_for_rust(items_tuple)
        index_data = build_index_rust(items_data)
        return _reconstruct_index_from_rust(index_data, items_tuple)
    else:
        # Fallback: pure Python
        return _build_index_python(items_tuple)
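
From the caller's side the Rust/Python dispatch is invisible; building the index is
a single call. A hedged sketch (assumes bibliography_items is a Sequence[BibItem]
you obtained elsewhere, e.g. from one of the package's IO adapters):

    from philoch_bib_sdk.logic.functions.fuzzy_matcher import build_index

    index = build_index(bibliography_items)
    print(f"{len(index.all_items)} items, {len(index.title_trigrams)} distinct trigrams")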


# --- Rust Scorer Integration ---


def _prepare_bibitem_for_rust_scorer(item: BibItem, idx: int) -> "BibItemData":
    """Prepare a BibItem for the Rust scorer.

    Extracts the simplified string fields needed for fuzzy matching.

    Args:
        item: BibItem to prepare
        idx: Index of the item in the source list (for result reconstruction)

    Returns:
        Dict with fields matching the Rust BibItemData struct
    """
    # Title
    if isinstance(item.title, BibStringAttr):
        title = item.title.simplified
    else:
        title = str(item.title) if item.title else ""

    # Author
    author = format_author(item.author, "simplified")

    # Year
    year = None
    if item.date != "no date" and isinstance(item.date, BibItemDateAttr):
        year = item.date.year

    # Journal
    journal = None
    if item.journal and isinstance(item.journal.name, BibStringAttr):
        journal = item.journal.name.simplified

    # Volume, number, pages (volume and number live on BibItem, not Journal)
    volume = item.volume if item.volume else None
    number = item.number if item.number else None
    pages = None
    if item.pages and len(item.pages) > 0:
        # pages is a tuple of PageAttr objects; take the first one
        first_page = item.pages[0]
        if first_page.end:
            pages = f"{first_page.start}--{first_page.end}"
        else:
            pages = first_page.start

    # Publisher
    publisher = None
    if item.publisher and isinstance(item.publisher, BibStringAttr):
        publisher = item.publisher.simplified

    return {
        "index": idx,
        "title": title,
        "author": author,
        "year": year,
        "doi": item.doi,
        "journal": journal,
        "volume": volume,
        "number": number,
        "pages": pages,
        "publisher": publisher,
    }


def _find_similar_batch_rust(
    subjects: Sequence[BibItem],
    candidates: Sequence[BibItem],
    top_n: int,
    min_score: float,
) -> list[Tuple[Match, ...]]:
    """Batch find similar items using the Rust scorer.

    Scores all subjects against all candidates in parallel using Rust.

    Args:
        subjects: Sequence of BibItems to find matches for
        candidates: Sequence of candidate BibItems to match against
        top_n: Number of top matches per subject
        min_score: Minimum score threshold

    Returns:
        List of Match tuples, one per subject
    """
    if not _RUST_SCORER_AVAILABLE:
        raise RuntimeError("Rust scorer not available")

    # Prepare data for Rust
    subjects_data = [_prepare_bibitem_for_rust_scorer(s, i) for i, s in enumerate(subjects)]
    candidates_data = [_prepare_bibitem_for_rust_scorer(c, i) for i, c in enumerate(candidates)]

    # Call the Rust batch scorer
    results = rust_scorer.score_batch(subjects_data, candidates_data, top_n, min_score)

    # Reconstruct Match objects
    all_matches: list[Tuple[Match, ...]] = []

    for result in results:
        matches: list[Match] = []
        # Handle both dict and object access patterns from Rust
        result_matches = result.get("matches", []) if isinstance(result, dict) else result.matches
        for rank, match_result in enumerate(result_matches, start=1):
            # Handle both dict and object access patterns
            if isinstance(match_result, dict):
                cand_idx = match_result["candidate_index"]
                title_score = match_result["title_score"]
                author_score = match_result["author_score"]
                date_score = match_result["date_score"]
                bonus_score = match_result["bonus_score"]
                total_score = match_result["total_score"]
            else:
                cand_idx = match_result.candidate_index
                title_score = match_result.title_score
                author_score = match_result.author_score
                date_score = match_result.date_score
                bonus_score = match_result.bonus_score
                total_score = match_result.total_score

            candidate = candidates[cand_idx]

            # Create PartialScore objects from the Rust scores
            partial_scores = (
                PartialScore(
                    component=ScoreComponent.TITLE,
                    score=int(title_score / 0.5) if title_score > 0 else 0,
                    weight=0.5,
                    weighted_score=title_score,
                    details="[rust]",
                ),
                PartialScore(
                    component=ScoreComponent.AUTHOR,
                    score=int(author_score / 0.3) if author_score > 0 else 0,
                    weight=0.3,
                    weighted_score=author_score,
                    details="[rust]",
                ),
                PartialScore(
                    component=ScoreComponent.DATE,
                    score=int(date_score / 0.1) if date_score > 0 else 0,
                    weight=0.1,
                    weighted_score=date_score,
                    details="[rust]",
                ),
                PartialScore(
                    component=ScoreComponent.PUBLISHER,  # Using PUBLISHER as generic bonus component
                    score=int(bonus_score / 0.1) if bonus_score > 0 else 0,
                    weight=0.1,
                    weighted_score=bonus_score,
                    details="[rust]",
                ),
            )

            matches.append(
                Match(
                    bibkey=format_bibkey(candidate.bibkey),
                    matched_bibitem=candidate,
                    total_score=total_score,
                    partial_scores=partial_scores,
                    rank=rank,
                )
            )

        all_matches.append(tuple(matches))

    return all_matches
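
The decomposition above mirrors the Python scoring path: the four weights (0.5
title, 0.3 author, 0.1 date, 0.1 bonus) sum to 1.0, and the total is the sum of the
weighted components. A hedged arithmetic check with made-up numbers:

    weights = (0.5, 0.3, 0.1, 0.1)
    weighted = (0.45, 0.27, 0.10, 0.00)  # hypothetical per-component weighted scores
    assert abs(sum(weights) - 1.0) < 1e-9
    print(sum(weighted))  # 0.82 -- the total_score this match would carry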


def _get_candidate_set(subject: BibItem, index: BibItemBlockIndex) -> FrozenSet[BibItem]:
    """Get candidate items from the index using multiple lookup strategies.

    Combines results from multiple indexes to create a candidate set that's
    much smaller than the full bibliography but still comprehensive.

    Args:
        subject: BibItem to find candidates for
        index: BibItemBlockIndex to search

    Returns:
        Frozen set of candidate BibItems (typically 0.5-2% of the total)
    """
    candidates: set[BibItem] = set()

    # Check DOI first (instant exact match)
    if subject.doi and subject.doi in index.doi_index:
        return frozenset([index.doi_index[subject.doi]])

    # Title trigrams
    title_attr = subject.title
    if isinstance(title_attr, BibStringAttr):
        subject_trigrams = _extract_trigrams(title_attr.simplified)
        for trigram in subject_trigrams:
            if trigram in index.title_trigrams:
                candidates.update(index.title_trigrams[trigram])

    # Author surnames
    subject_surnames = _extract_author_surnames(subject.author)
    for surname in subject_surnames:
        if surname in index.author_surnames:
            candidates.update(index.author_surnames[surname])

    # Year decades (±5 decades = ±50 years for safety)
    subject_decade = _get_decade(subject.date)
    if subject_decade is not None:
        for offset in range(-5, 6):
            decade = subject_decade + (offset * 10)
            if decade in index.year_decades:
                candidates.update(index.year_decades[decade])
    else:
        # No date: include all items that also have no date
        if None in index.year_decades:
            candidates.update(index.year_decades[None])

    # Journal
    if subject.journal:
        journal_name_attr = subject.journal.name
        if isinstance(journal_name_attr, BibStringAttr):
            journal_name = remove_extra_whitespace(journal_name_attr.simplified).lower()
            if journal_name and journal_name in index.journals:
                candidates.update(index.journals[journal_name])

    # Fallback: if no candidates were found, use all items (rare but safe)
    if not candidates:
        return frozenset(index.all_items)

    return frozenset(candidates)
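
The candidate set is an ordinary frozenset, so its size can be inspected before any
detailed scoring happens; a hedged sketch (variable names are illustrative):

    candidates = _get_candidate_set(subject_item, index)
    print(f"scoring {len(candidates)} of {len(index.all_items)} items")

Note the asymmetry: a DOI hit short-circuits to a single candidate, while a subject
whose lookups all come up empty falls back to the entire bibliography.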


def find_similar_bibitems(
    subject: BibItem,
    index: BibItemBlockIndex,
    top_n: int = 5,
    min_score: float = 0.0,
    bibstring_type: TBibString = "simplified",
) -> Tuple[Match, ...]:
    """Find top N most similar BibItems using fuzzy matching.

    Uses blocking indexes to reduce the search space, then applies detailed
    fuzzy scoring to find the best matches.

    Args:
        subject: BibItem to find matches for
        index: Pre-built BibItemBlockIndex
        top_n: Number of top matches to return (default: 5)
        min_score: Minimum score threshold (default: 0.0)
        bibstring_type: Which bibstring variant to use (default: "simplified")

    Returns:
        Tuple of Match objects with detailed scoring, sorted by score (best first)
    """
    # Get the candidate set from the indexes
    candidates = _get_candidate_set(subject, index)

    # Score all candidates (generator for memory efficiency)
    scored_items = (
        (
            candidate,
            compare_bibitems_detailed(candidate, subject, bibstring_type),
        )
        for candidate in candidates
    )

    # Calculate total scores
    with_totals = (
        (candidate, partial_scores, sum(ps.weighted_score for ps in partial_scores))
        for candidate, partial_scores in scored_items
    )

    # Filter by minimum score
    filtered = (
        (candidate, partial_scores, total_score)
        for candidate, partial_scores, total_score in with_totals
        if total_score >= min_score
    )

    # Get top N using cytoolz (heap-based, efficient)
    top_results = tuple(topk(top_n, filtered, key=lambda x: x[2]))

    # Convert to Match objects
    return tuple(
        Match(
            bibkey=format_bibkey(candidate.bibkey),
            matched_bibitem=candidate,
            total_score=total_score,
            partial_scores=partial_scores,
            rank=rank,
        )
        for rank, (candidate, partial_scores, total_score) in enumerate(top_results, start=1)
    )
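
A hedged usage sketch (new_item and the index come from elsewhere; the attribute
names follow the Match and PartialScore constructors used above):

    for match in find_similar_bibitems(new_item, index, top_n=3, min_score=0.4):
        print(match.rank, match.bibkey, round(match.total_score, 3))
        for ps in match.partial_scores:
            print("  ", ps.component, "->", ps.weighted_score)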


def stage_bibitem(
    bibitem: BibItem,
    index: BibItemBlockIndex,
    top_n: int = 5,
    min_score: float = 0.0,
) -> BibItemStaged:
    """Stage a single BibItem with its top matches.

    Args:
        bibitem: BibItem to stage
        index: Pre-built BibItemBlockIndex
        top_n: Number of top matches to find (default: 5)
        min_score: Minimum score threshold (default: 0.0)

    Returns:
        BibItemStaged with top matches and search metadata
    """
    start_time = time.perf_counter()
    # Recomputed here (find_similar_bibitems also does this) only to report
    # the candidate count in the metadata
    candidates = _get_candidate_set(bibitem, index)
    top_matches = find_similar_bibitems(bibitem, index, top_n, min_score)
    end_time = time.perf_counter()

    search_metadata: SearchMetadata = {
        "search_time_ms": int((end_time - start_time) * 1000),
        "candidates_searched": len(candidates),
    }

    return BibItemStaged(
        bibitem=bibitem,
        top_matches=top_matches,
        search_metadata=search_metadata,
    )


def stage_bibitems_batch(
    bibitems: Sequence[BibItem],
    index: BibItemBlockIndex,
    top_n: int = 5,
    min_score: float = 0.0,
    use_rust: bool | None = None,
) -> Tuple[BibItemStaged, ...]:
    """Stage multiple BibItems in batch.

    When the Rust scorer is available, processes all items in parallel for a
    significant speedup (10-100x on large batches).

    Args:
        bibitems: Sequence of BibItems to stage
        index: Pre-built BibItemBlockIndex
        top_n: Number of top matches per item (default: 5)
        min_score: Minimum score threshold (default: 0.0)
        use_rust: Force Rust (True), Python (False), or auto-detect (None)

    Returns:
        Tuple of BibItemStaged objects
    """
    # Determine whether to use Rust
    if use_rust is None:
        use_rust = _RUST_SCORER_AVAILABLE

    if use_rust and not _RUST_SCORER_AVAILABLE:
        raise RuntimeError("Rust scorer requested but not available")

    if use_rust:
        # Fast path: Rust batch scorer (every subject is scored against all
        # items in parallel; per-subject blocking is skipped)
        start_time = time.perf_counter()
        all_matches = _find_similar_batch_rust(bibitems, index.all_items, top_n, min_score)
        end_time = time.perf_counter()

        # Create BibItemStaged objects; per-item time is the batch average
        total_time_ms = int((end_time - start_time) * 1000)
        time_per_item = total_time_ms // len(bibitems) if bibitems else 0

        return tuple(
            BibItemStaged(
                bibitem=bibitem,
                top_matches=matches,
                search_metadata={
                    "search_time_ms": time_per_item,
                    "candidates_searched": len(index.all_items),
                    "scorer": "rust",
                },
            )
            for bibitem, matches in zip(bibitems, all_matches)
        )
    else:
        # Fallback: Python sequential processing
        return tuple(stage_bibitem(bibitem, index, top_n, min_score) for bibitem in bibitems)
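
Note that the Rust fast path scores each subject against index.all_items rather
than a per-subject candidate block, trading blocking for raw parallel throughput. A
hedged batch sketch:

    staged = stage_bibitems_batch(incoming_items, index, top_n=5, min_score=0.3)
    for s in staged:
        best = s.top_matches[0] if s.top_matches else None
        print(s.search_metadata, "->", best.bibkey if best else "no match above threshold")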


def stage_bibitems_streaming(
    bibitems: Sequence[BibItem],
    index: BibItemBlockIndex,
    top_n: int = 5,
    min_score: float = 0.0,
) -> Iterator[BibItemStaged]:
    """Stage multiple BibItems with streaming results.

    Yields BibItemStaged objects one at a time as they're processed,
    enabling real-time progress monitoring and immediate CSV output.

    Args:
        bibitems: Sequence of BibItems to stage
        index: Pre-built BibItemBlockIndex
        top_n: Number of top matches per item (default: 5)
        min_score: Minimum score threshold (default: 0.0)

    Yields:
        BibItemStaged objects as they're processed
    """
    for bibitem in bibitems:
        yield stage_bibitem(bibitem, index, top_n, min_score)
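
Because this is a generator, callers can report progress or write each row out as
it arrives; a hedged sketch:

    for i, staged in enumerate(stage_bibitems_streaming(incoming_items, index), start=1):
        ms = staged.search_metadata["search_time_ms"]
        print(f"[{i}/{len(incoming_items)}] staged in {ms} ms")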


# --- Index Caching ---


def save_index(index: BibItemBlockIndex, cache_path: Path) -> None:
    """Save index to a pickle file for later reuse.

    Args:
        index: BibItemBlockIndex to save
        cache_path: Path to save the pickle file
    """
    cache_path.parent.mkdir(parents=True, exist_ok=True)
    with open(cache_path, "wb") as f:
        pickle.dump(index, f, protocol=pickle.HIGHEST_PROTOCOL)


def load_index(cache_path: Path) -> BibItemBlockIndex | None:
    """Load the index from a pickle file if it exists and is valid.

    Args:
        cache_path: Path to the pickle file

    Returns:
        BibItemBlockIndex if successfully loaded, None otherwise
    """
    if not cache_path.exists():
        return None
    try:
        with open(cache_path, "rb") as f:
            loaded = pickle.load(f)
        if not isinstance(loaded, BibItemBlockIndex):
            raise TypeError(
                f"Cached index at {cache_path} contains {type(loaded).__name__}, expected BibItemBlockIndex"
            )
        return loaded
    except TypeError:
        raise
    except Exception:
        return None
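
The error handling draws a deliberate line: an unreadable or corrupt cache returns
None (callers just rebuild), while a cache that unpickles to the wrong type raises,
since that points at a programming error rather than a stale file. A hedged sketch:

    from pathlib import Path

    bad = Path("not_an_index.pkl")
    bad.write_bytes(b"garbage")      # not a valid pickle stream
    assert load_index(bad) is None   # swallowed -> caller falls through to rebuild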


def build_index_cached(
    bibitems: Sequence[BibItem],
    cache_path: Path | None = None,
    force_rebuild: bool = False,
) -> BibItemBlockIndex:
    """Build the index with optional caching to avoid rebuilding.

    If cache_path is provided and a valid cached index exists, it will be loaded
    instead of rebuilding. Otherwise, builds the index and optionally saves it.

    Args:
        bibitems: Sequence of BibItems to index
        cache_path: Optional path to cache the index (pickle file)
        force_rebuild: If True, rebuild the index even if a cache exists

    Returns:
        BibItemBlockIndex (either from cache or freshly built)
    """
    # Try loading from cache first
    if cache_path and not force_rebuild:
        cached = load_index(cache_path)
        if cached is not None:
            return cached

    # Build a fresh index
    index = build_index(bibitems)

    # Save to cache if a path was provided
    if cache_path:
        save_index(index, cache_path)

    return index
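
Putting the pieces together, a typical end-to-end run looks like the following
hedged sketch (the cache location and the source of the BibItem sequences are
illustrative, not part of the module):

    from pathlib import Path

    from philoch_bib_sdk.logic.functions.fuzzy_matcher import (
        build_index_cached,
        stage_bibitems_batch,
    )

    index = build_index_cached(known_items, cache_path=Path(".cache/bib_index.pkl"))
    staged = stage_bibitems_batch(incoming_items, index, top_n=5, min_score=0.5)
    sure = [s for s in staged if s.top_matches and s.top_matches[0].total_score >= 0.9]

One caveat worth knowing: a pickled index is tied to the BibItem class definition,
so a cache written by one version of the package may fail to load under another; in
that case load_index returns None and build_index_cached simply rebuilds.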