philoch-bib-sdk 0.3.9__cp313-cp313-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- philoch_bib_sdk/__init__.py +0 -0
- philoch_bib_sdk/_rust.cp313-win_amd64.pyd +0 -0
- philoch_bib_sdk/adapters/io/__init__.py +115 -0
- philoch_bib_sdk/adapters/io/csv/__init__.py +308 -0
- philoch_bib_sdk/adapters/io/ods/__init__.py +145 -0
- philoch_bib_sdk/adapters/plaintext/bibitem_reader.py +0 -0
- philoch_bib_sdk/adapters/tabular_data/read_journal_volume_number_index.py +58 -0
- philoch_bib_sdk/converters/latex.py +6 -0
- philoch_bib_sdk/converters/plaintext/author/formatter.py +34 -0
- philoch_bib_sdk/converters/plaintext/author/parser.py +83 -0
- philoch_bib_sdk/converters/plaintext/bib_string_formatter.py +8 -0
- philoch_bib_sdk/converters/plaintext/bibitem/bibkey_formatter.py +21 -0
- philoch_bib_sdk/converters/plaintext/bibitem/bibkey_parser.py +158 -0
- philoch_bib_sdk/converters/plaintext/bibitem/date_formatter.py +37 -0
- philoch_bib_sdk/converters/plaintext/bibitem/date_parser.py +62 -0
- philoch_bib_sdk/converters/plaintext/bibitem/formatter.py +182 -0
- philoch_bib_sdk/converters/plaintext/bibitem/pages_formatter.py +13 -0
- philoch_bib_sdk/converters/plaintext/bibitem/pages_parser.py +63 -0
- philoch_bib_sdk/converters/plaintext/bibitem/parser.py +415 -0
- philoch_bib_sdk/converters/plaintext/journal/formatter.py +25 -0
- philoch_bib_sdk/converters/plaintext/journal/parser.py +36 -0
- philoch_bib_sdk/converters/plaintext/shared/renderable_formatter.py +25 -0
- philoch_bib_sdk/interfaces/cli/__init__.py +3 -0
- philoch_bib_sdk/interfaces/cli/fuzzy_matching.py +135 -0
- philoch_bib_sdk/logic/__init__.py +39 -0
- philoch_bib_sdk/logic/default_models.py +315 -0
- philoch_bib_sdk/logic/functions/__init__.py +31 -0
- philoch_bib_sdk/logic/functions/comparator.py +414 -0
- philoch_bib_sdk/logic/functions/fuzzy_matcher.py +796 -0
- philoch_bib_sdk/logic/functions/journal_article_matcher.py +44 -0
- philoch_bib_sdk/logic/literals.py +98 -0
- philoch_bib_sdk/logic/models.py +366 -0
- philoch_bib_sdk/logic/models_staging.py +173 -0
- philoch_bib_sdk/procedures/fuzzy_matching.py +112 -0
- philoch_bib_sdk/py.typed +0 -0
- philoch_bib_sdk/rust_scorer/Cargo.lock +232 -0
- philoch_bib_sdk/rust_scorer/Cargo.toml +26 -0
- philoch_bib_sdk/rust_scorer/pyproject.toml +15 -0
- philoch_bib_sdk/rust_scorer/rust_scorer.pyi +65 -0
- philoch_bib_sdk/rust_scorer/src/lib.rs +362 -0
- philoch_bib_sdk-0.3.9.dist-info/METADATA +15 -0
- philoch_bib_sdk-0.3.9.dist-info/RECORD +44 -0
- philoch_bib_sdk-0.3.9.dist-info/WHEEL +4 -0
- philoch_bib_sdk-0.3.9.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,112 @@
|
|
|
1
|
+
"""Fuzzy matching procedure for bibliographic items.
|
|
2
|
+
|
|
3
|
+
This module provides the main orchestration logic for fuzzy matching staged BibItems
|
|
4
|
+
against an existing bibliography. It uses dependency injection to remain agnostic
|
|
5
|
+
to specific IO formats (CSV, JSON, etc.).
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from typing import Callable, Dict, Tuple
|
|
9
|
+
|
|
10
|
+
from aletk.ResultMonad import Err, Ok, try_except_wrapper
|
|
11
|
+
from aletk.utils import get_logger
|
|
12
|
+
|
|
13
|
+
from philoch_bib_sdk.logic.functions.fuzzy_matcher import (
|
|
14
|
+
build_index,
|
|
15
|
+
stage_bibitems_batch,
|
|
16
|
+
)
|
|
17
|
+
from philoch_bib_sdk.logic.models import BibItem
|
|
18
|
+
from philoch_bib_sdk.logic.models_staging import BibItemStaged
|
|
19
|
+
|
|
20
|
+
logger = get_logger(__name__)

# Contracts for the collaborators injected into the fuzzy matching procedure.
# Every callable receives a file path and reports its outcome as Ok(...) or Err.

# In-memory bibliography: formatted bibkey -> BibItem.
Bibliography = Dict[str, BibItem]

# Loads the existing bibliography from the given path.
LoadBibliographyFn = Callable[[str], Ok[Bibliography] | Err]

# Loads the staged items (the ones to be matched) from the given path.
LoadStagedFn = Callable[[str], Ok[Tuple[BibItem, ...]] | Err]

# Writes the staged items, together with their matches, to the given output path.
WriteReportFn = Callable[[str, Tuple[BibItemStaged, ...]], Ok[None] | Err]
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
@try_except_wrapper(logger)
def fuzzy_match_procedure(
    bibliography_path: str,
    staged_path: str,
    output_path: str,
    load_bibliography: LoadBibliographyFn,
    load_staged: LoadStagedFn,
    write_report: WriteReportFn,
    top_n: int = 5,
    min_score: float = 0.0,
) -> None:
    """Run the load -> index -> match -> report pipeline for staged BibItems.

    All file access goes through the injected callables, so this function is
    independent of the on-disk format.

    Args:
        bibliography_path: Location of the existing bibliography; handed to
            ``load_bibliography`` unchanged.
        staged_path: Location of the staged items; handed to ``load_staged``
            unchanged.
        output_path: Destination of the report (without extension); handed to
            ``write_report`` unchanged.
        load_bibliography: Loader returning the bibliography as ``Ok`` or ``Err``.
        load_staged: Loader returning the staged items as ``Ok`` or ``Err``.
        write_report: Writer receiving the output path and the matched items.
        top_n: Number of top matches requested per staged item.
        min_score: Minimum score threshold forwarded to the matcher.

    Returns:
        None. When there are no staged items the function returns early,
        before building the index, and ``write_report`` is not called.

    Raises:
        RuntimeError: If any injected callable returns an ``Err``.
        NOTE(review): the function is decorated with ``try_except_wrapper``,
        which may turn raised exceptions into an ``Err`` result for the
        caller -- confirm against the aletk documentation.
    """
    logger.info("Starting fuzzy matching procedure")
    logger.info(f"Bibliography: {bibliography_path}")
    logger.info(f"Staged items: {staged_path}")
    logger.info(f"Output: {output_path}")
    logger.info(f"Parameters: top_n={top_n}, min_score={min_score}")

    # --- Load the reference bibliography -----------------------------------
    logger.info("Loading bibliography...")
    bibliography_result = load_bibliography(bibliography_path)
    if isinstance(bibliography_result, Err):
        raise RuntimeError(f"Failed to load bibliography: {bibliography_result.message}")
    bibliography = bibliography_result.out
    logger.info(f"Loaded {len(bibliography)} items from bibliography")

    # --- Load the staged items ---------------------------------------------
    logger.info("Loading staged items...")
    staged_result = load_staged(staged_path)
    if isinstance(staged_result, Err):
        raise RuntimeError(f"Failed to load staged items: {staged_result.message}")
    staged_items = staged_result.out
    logger.info(f"Loaded {len(staged_items)} staged items")

    # Nothing to match: stop before doing any indexing work.
    if not staged_items:
        logger.warning("No staged items to process")
        return

    # --- Index the bibliography --------------------------------------------
    logger.info("Building fuzzy matching index...")
    # build_index is given the BibItems only, as a tuple; the bibkeys are dropped.
    index = build_index(tuple(bibliography.values()))
    logger.info("Index built successfully")

    # --- Match every staged item against the index -------------------------
    logger.info("Processing staged items...")
    staged_with_matches = stage_bibitems_batch(
        staged_items,
        index,
        top_n=top_n,
        min_score=min_score,
    )
    logger.info(f"Processed {len(staged_with_matches)} items")

    # --- Summary statistics -------------------------------------------------
    match_counts = [len(staged.top_matches) for staged in staged_with_matches]
    total_matches = sum(match_counts)
    if staged_with_matches:
        avg_matches = total_matches / len(staged_with_matches)
    else:
        avg_matches = 0
    logger.info(f"Found {total_matches} total matches (avg {avg_matches:.2f} per item)")

    if staged_with_matches:
        elapsed_ms = 0
        for staged in staged_with_matches:
            elapsed_ms += staged.search_metadata["search_time_ms"]
        avg_time = elapsed_ms / len(staged_with_matches)
        logger.info(f"Average search time: {avg_time:.0f}ms per item")

    # --- Persist the report -------------------------------------------------
    logger.info("Writing report...")
    write_result = write_report(output_path, staged_with_matches)
    if isinstance(write_result, Err):
        raise RuntimeError(f"Failed to write report: {write_result.message}")

    logger.info("Fuzzy matching procedure completed successfully")
|
philoch_bib_sdk/py.typed
ADDED
|
File without changes
|
|
@@ -0,0 +1,232 @@
|
|
|
1
|
+
# This file is automatically @generated by Cargo.
|
|
2
|
+
# It is not intended for manual editing.
|
|
3
|
+
version = 4
|
|
4
|
+
|
|
5
|
+
[[package]]
|
|
6
|
+
name = "autocfg"
|
|
7
|
+
version = "1.5.0"
|
|
8
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
9
|
+
checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8"
|
|
10
|
+
|
|
11
|
+
[[package]]
|
|
12
|
+
name = "crossbeam-deque"
|
|
13
|
+
version = "0.8.6"
|
|
14
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
15
|
+
checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51"
|
|
16
|
+
dependencies = [
|
|
17
|
+
"crossbeam-epoch",
|
|
18
|
+
"crossbeam-utils",
|
|
19
|
+
]
|
|
20
|
+
|
|
21
|
+
[[package]]
|
|
22
|
+
name = "crossbeam-epoch"
|
|
23
|
+
version = "0.9.18"
|
|
24
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
25
|
+
checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e"
|
|
26
|
+
dependencies = [
|
|
27
|
+
"crossbeam-utils",
|
|
28
|
+
]
|
|
29
|
+
|
|
30
|
+
[[package]]
|
|
31
|
+
name = "crossbeam-utils"
|
|
32
|
+
version = "0.8.21"
|
|
33
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
34
|
+
checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28"
|
|
35
|
+
|
|
36
|
+
[[package]]
|
|
37
|
+
name = "either"
|
|
38
|
+
version = "1.15.0"
|
|
39
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
40
|
+
checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719"
|
|
41
|
+
|
|
42
|
+
[[package]]
|
|
43
|
+
name = "heck"
|
|
44
|
+
version = "0.5.0"
|
|
45
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
46
|
+
checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
|
|
47
|
+
|
|
48
|
+
[[package]]
|
|
49
|
+
name = "indoc"
|
|
50
|
+
version = "2.0.7"
|
|
51
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
52
|
+
checksum = "79cf5c93f93228cf8efb3ba362535fb11199ac548a09ce117c9b1adc3030d706"
|
|
53
|
+
dependencies = [
|
|
54
|
+
"rustversion",
|
|
55
|
+
]
|
|
56
|
+
|
|
57
|
+
[[package]]
|
|
58
|
+
name = "libc"
|
|
59
|
+
version = "0.2.180"
|
|
60
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
61
|
+
checksum = "bcc35a38544a891a5f7c865aca548a982ccb3b8650a5b06d0fd33a10283c56fc"
|
|
62
|
+
|
|
63
|
+
[[package]]
|
|
64
|
+
name = "memoffset"
|
|
65
|
+
version = "0.9.1"
|
|
66
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
67
|
+
checksum = "488016bfae457b036d996092f6cb448677611ce4449e970ceaf42695203f218a"
|
|
68
|
+
dependencies = [
|
|
69
|
+
"autocfg",
|
|
70
|
+
]
|
|
71
|
+
|
|
72
|
+
[[package]]
|
|
73
|
+
name = "once_cell"
|
|
74
|
+
version = "1.21.3"
|
|
75
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
76
|
+
checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d"
|
|
77
|
+
|
|
78
|
+
[[package]]
|
|
79
|
+
name = "portable-atomic"
|
|
80
|
+
version = "1.13.0"
|
|
81
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
82
|
+
checksum = "f89776e4d69bb58bc6993e99ffa1d11f228b839984854c7daeb5d37f87cbe950"
|
|
83
|
+
|
|
84
|
+
[[package]]
|
|
85
|
+
name = "proc-macro2"
|
|
86
|
+
version = "1.0.105"
|
|
87
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
88
|
+
checksum = "535d180e0ecab6268a3e718bb9fd44db66bbbc256257165fc699dadf70d16fe7"
|
|
89
|
+
dependencies = [
|
|
90
|
+
"unicode-ident",
|
|
91
|
+
]
|
|
92
|
+
|
|
93
|
+
[[package]]
|
|
94
|
+
name = "pyo3"
|
|
95
|
+
version = "0.25.1"
|
|
96
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
97
|
+
checksum = "8970a78afe0628a3e3430376fc5fd76b6b45c4d43360ffd6cdd40bdde72b682a"
|
|
98
|
+
dependencies = [
|
|
99
|
+
"indoc",
|
|
100
|
+
"libc",
|
|
101
|
+
"memoffset",
|
|
102
|
+
"once_cell",
|
|
103
|
+
"portable-atomic",
|
|
104
|
+
"pyo3-build-config",
|
|
105
|
+
"pyo3-ffi",
|
|
106
|
+
"pyo3-macros",
|
|
107
|
+
"unindent",
|
|
108
|
+
]
|
|
109
|
+
|
|
110
|
+
[[package]]
|
|
111
|
+
name = "pyo3-build-config"
|
|
112
|
+
version = "0.25.1"
|
|
113
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
114
|
+
checksum = "458eb0c55e7ece017adeba38f2248ff3ac615e53660d7c71a238d7d2a01c7598"
|
|
115
|
+
dependencies = [
|
|
116
|
+
"once_cell",
|
|
117
|
+
"target-lexicon",
|
|
118
|
+
]
|
|
119
|
+
|
|
120
|
+
[[package]]
|
|
121
|
+
name = "pyo3-ffi"
|
|
122
|
+
version = "0.25.1"
|
|
123
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
124
|
+
checksum = "7114fe5457c61b276ab77c5055f206295b812608083644a5c5b2640c3102565c"
|
|
125
|
+
dependencies = [
|
|
126
|
+
"libc",
|
|
127
|
+
"pyo3-build-config",
|
|
128
|
+
]
|
|
129
|
+
|
|
130
|
+
[[package]]
|
|
131
|
+
name = "pyo3-macros"
|
|
132
|
+
version = "0.25.1"
|
|
133
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
134
|
+
checksum = "a8725c0a622b374d6cb051d11a0983786448f7785336139c3c94f5aa6bef7e50"
|
|
135
|
+
dependencies = [
|
|
136
|
+
"proc-macro2",
|
|
137
|
+
"pyo3-macros-backend",
|
|
138
|
+
"quote",
|
|
139
|
+
"syn",
|
|
140
|
+
]
|
|
141
|
+
|
|
142
|
+
[[package]]
|
|
143
|
+
name = "pyo3-macros-backend"
|
|
144
|
+
version = "0.25.1"
|
|
145
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
146
|
+
checksum = "4109984c22491085343c05b0dbc54ddc405c3cf7b4374fc533f5c3313a572ccc"
|
|
147
|
+
dependencies = [
|
|
148
|
+
"heck",
|
|
149
|
+
"proc-macro2",
|
|
150
|
+
"pyo3-build-config",
|
|
151
|
+
"quote",
|
|
152
|
+
"syn",
|
|
153
|
+
]
|
|
154
|
+
|
|
155
|
+
[[package]]
|
|
156
|
+
name = "quote"
|
|
157
|
+
version = "1.0.43"
|
|
158
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
159
|
+
checksum = "dc74d9a594b72ae6656596548f56f667211f8a97b3d4c3d467150794690dc40a"
|
|
160
|
+
dependencies = [
|
|
161
|
+
"proc-macro2",
|
|
162
|
+
]
|
|
163
|
+
|
|
164
|
+
[[package]]
|
|
165
|
+
name = "rayon"
|
|
166
|
+
version = "1.11.0"
|
|
167
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
168
|
+
checksum = "368f01d005bf8fd9b1206fb6fa653e6c4a81ceb1466406b81792d87c5677a58f"
|
|
169
|
+
dependencies = [
|
|
170
|
+
"either",
|
|
171
|
+
"rayon-core",
|
|
172
|
+
]
|
|
173
|
+
|
|
174
|
+
[[package]]
|
|
175
|
+
name = "rayon-core"
|
|
176
|
+
version = "1.13.0"
|
|
177
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
178
|
+
checksum = "22e18b0f0062d30d4230b2e85ff77fdfe4326feb054b9783a3460d8435c8ab91"
|
|
179
|
+
dependencies = [
|
|
180
|
+
"crossbeam-deque",
|
|
181
|
+
"crossbeam-utils",
|
|
182
|
+
]
|
|
183
|
+
|
|
184
|
+
[[package]]
|
|
185
|
+
name = "rust_scorer"
|
|
186
|
+
version = "0.1.0"
|
|
187
|
+
dependencies = [
|
|
188
|
+
"pyo3",
|
|
189
|
+
"rayon",
|
|
190
|
+
"strsim",
|
|
191
|
+
]
|
|
192
|
+
|
|
193
|
+
[[package]]
|
|
194
|
+
name = "rustversion"
|
|
195
|
+
version = "1.0.22"
|
|
196
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
197
|
+
checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d"
|
|
198
|
+
|
|
199
|
+
[[package]]
|
|
200
|
+
name = "strsim"
|
|
201
|
+
version = "0.11.1"
|
|
202
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
203
|
+
checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"
|
|
204
|
+
|
|
205
|
+
[[package]]
|
|
206
|
+
name = "syn"
|
|
207
|
+
version = "2.0.114"
|
|
208
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
209
|
+
checksum = "d4d107df263a3013ef9b1879b0df87d706ff80f65a86ea879bd9c31f9b307c2a"
|
|
210
|
+
dependencies = [
|
|
211
|
+
"proc-macro2",
|
|
212
|
+
"quote",
|
|
213
|
+
"unicode-ident",
|
|
214
|
+
]
|
|
215
|
+
|
|
216
|
+
[[package]]
|
|
217
|
+
name = "target-lexicon"
|
|
218
|
+
version = "0.13.4"
|
|
219
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
220
|
+
checksum = "b1dd07eb858a2067e2f3c7155d54e929265c264e6f37efe3ee7a8d1b5a1dd0ba"
|
|
221
|
+
|
|
222
|
+
[[package]]
|
|
223
|
+
name = "unicode-ident"
|
|
224
|
+
version = "1.0.22"
|
|
225
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
226
|
+
checksum = "9312f7c4f6ff9069b165498234ce8be658059c6728633667c526e27dc2cf1df5"
|
|
227
|
+
|
|
228
|
+
[[package]]
|
|
229
|
+
name = "unindent"
|
|
230
|
+
version = "0.2.4"
|
|
231
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
232
|
+
checksum = "7264e107f553ccae879d21fbea1d6724ac785e8c3bfc762137959b5802826ef3"
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
[package]
|
|
2
|
+
name = "rust_scorer"
|
|
3
|
+
version = "0.3.9"
|
|
4
|
+
edition = "2021"
|
|
5
|
+
|
|
6
|
+
[lib]
|
|
7
|
+
name = "rust_scorer"
|
|
8
|
+
crate-type = ["cdylib"]
|
|
9
|
+
|
|
10
|
+
[dependencies]
|
|
11
|
+
pyo3 = "0.25.0"
|
|
12
|
+
rayon = "1.11.0"
|
|
13
|
+
strsim = "0.11.1"
|
|
14
|
+
|
|
15
|
+
[lints.clippy]
|
|
16
|
+
all = "warn"
|
|
17
|
+
# Type safety: prevent silent numeric conversions
|
|
18
|
+
cast_possible_truncation = "warn"
|
|
19
|
+
cast_sign_loss = "warn"
|
|
20
|
+
cast_possible_wrap = "warn"
|
|
21
|
+
cast_lossless = "warn"
|
|
22
|
+
# Efficiency
|
|
23
|
+
redundant_clone = "warn"
|
|
24
|
+
|
|
25
|
+
[lints.rust]
|
|
26
|
+
warnings = "deny"
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["maturin>=1.9,<2.0"]
|
|
3
|
+
build-backend = "maturin"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "rust_scorer"
|
|
7
|
+
requires-python = ">=3.8"
|
|
8
|
+
classifiers = [
|
|
9
|
+
"Programming Language :: Rust",
|
|
10
|
+
"Programming Language :: Python :: Implementation :: CPython",
|
|
11
|
+
"Programming Language :: Python :: Implementation :: PyPy",
|
|
12
|
+
]
|
|
13
|
+
dynamic = ["version"]
|
|
14
|
+
[tool.maturin]
|
|
15
|
+
features = ["pyo3/extension-module"]
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
"""Type stubs for rust_scorer - high-performance fuzzy matching for BibItems."""
|
|
2
|
+
|
|
3
|
+
from typing import TypedDict
|
|
4
|
+
|
|
5
|
+
class BibItemData(TypedDict):
|
|
6
|
+
"""Input data for a single BibItem."""
|
|
7
|
+
|
|
8
|
+
index: int
|
|
9
|
+
title: str
|
|
10
|
+
author: str
|
|
11
|
+
year: int | None
|
|
12
|
+
doi: str | None
|
|
13
|
+
journal: str | None
|
|
14
|
+
volume: str | None
|
|
15
|
+
number: str | None
|
|
16
|
+
pages: str | None
|
|
17
|
+
publisher: str | None
|
|
18
|
+
|
|
19
|
+
class MatchResult(TypedDict):
    """Scores obtained by one candidate when compared with a subject."""

    # Which candidate this result refers to.
    candidate_index: int

    # Overall score followed by the individual score components.
    total_score: float
    title_score: float
    author_score: float
    date_score: float
    bonus_score: float
|
|
28
|
+
|
|
29
|
+
class SubjectMatchResult(TypedDict):
    """Top matches found for one subject."""

    # Which subject this result refers to.
    subject_index: int
    # The subject's top matches, one MatchResult per retained candidate.
    matches: list[MatchResult]
    # Count of candidates searched for this subject.
    candidates_searched: int
|
|
35
|
+
|
|
36
|
+
def token_sort_ratio(s1: str, s2: str) -> float:
    """Compute the token sort ratio of two strings (Jaro-Winkler based).

    Args:
        s1: First string of the pair.
        s2: Second string of the pair.

    Returns:
        A similarity score between 0.0 and 100.0.
    """
    ...
|
|
47
|
+
|
|
48
|
+
def score_batch(
    subjects: list[BibItemData],
    candidates: list[BibItemData],
    top_n: int,
    min_score: float,
) -> list[SubjectMatchResult]:
    """Score many subjects against a pool of candidates, in parallel.

    Args:
        subjects: BibItems for which matches are wanted.
        candidates: BibItems the subjects are compared with.
        top_n: Upper bound on the number of matches returned per subject.
        min_score: Minimum score threshold for matches.

    Returns:
        One result per subject, each holding that subject's top matches.
    """
    ...
|