philoch-bib-sdk 0.3.9__tar.gz → 0.4.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.0}/Cargo.lock +72 -14
- {philoch_bib_sdk-0.3.9/philoch_bib_sdk/rust_scorer → philoch_bib_sdk-0.4.0}/Cargo.toml +10 -5
- {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.0}/PKG-INFO +1 -1
- philoch_bib_sdk-0.3.9/philoch_bib_sdk/rust_scorer/rust_scorer.pyi → philoch_bib_sdk-0.4.0/philoch_bib_sdk/_rust.pyi +49 -2
- {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.0}/philoch_bib_sdk/logic/functions/fuzzy_matcher.py +5 -5
- {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.0}/pyproject.toml +9 -1
- {philoch_bib_sdk-0.3.9/philoch_bib_sdk/rust_scorer → philoch_bib_sdk-0.4.0}/src/lib.rs +189 -4
- {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.0}/tests/logic/functions/test_fuzzy_matcher.py +3 -3
- philoch_bib_sdk-0.3.9/Cargo.toml +0 -18
- philoch_bib_sdk-0.3.9/philoch_bib_sdk/rust_scorer/Cargo.lock +0 -232
- philoch_bib_sdk-0.3.9/philoch_bib_sdk/rust_scorer/pyproject.toml +0 -15
- philoch_bib_sdk-0.3.9/src/lib.rs +0 -192
- {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.0}/LICENSE +0 -0
- {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.0}/README.md +0 -0
- {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.0}/docs/fuzzy-matching.md +0 -0
- {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.0}/docs/python-style-guide.md +0 -0
- {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.0}/docs/rust-implementation-summary.md +0 -0
- {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.0}/docs/rust-index-building-spec.md +0 -0
- {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.0}/docs/rust-scorer.md +0 -0
- {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.0}/docs/streaming-output.md +0 -0
- {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.0}/docs/todo/fuzzy-matching-enhanced-output.md +0 -0
- {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.0}/docs/todo/merge_fuzzy_results.py +0 -0
- {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.0}/docs/todo/rust-build-index-implementation-plan.md +0 -0
- {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.0}/philoch_bib_sdk/__init__.py +0 -0
- {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.0}/philoch_bib_sdk/adapters/io/__init__.py +0 -0
- {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.0}/philoch_bib_sdk/adapters/io/csv/__init__.py +0 -0
- {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.0}/philoch_bib_sdk/adapters/io/ods/__init__.py +0 -0
- {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.0}/philoch_bib_sdk/adapters/plaintext/bibitem_reader.py +0 -0
- {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.0}/philoch_bib_sdk/adapters/tabular_data/read_journal_volume_number_index.py +0 -0
- {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.0}/philoch_bib_sdk/converters/latex.py +0 -0
- {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.0}/philoch_bib_sdk/converters/plaintext/author/formatter.py +0 -0
- {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.0}/philoch_bib_sdk/converters/plaintext/author/parser.py +0 -0
- {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.0}/philoch_bib_sdk/converters/plaintext/bib_string_formatter.py +0 -0
- {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.0}/philoch_bib_sdk/converters/plaintext/bibitem/bibkey_formatter.py +0 -0
- {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.0}/philoch_bib_sdk/converters/plaintext/bibitem/bibkey_parser.py +0 -0
- {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.0}/philoch_bib_sdk/converters/plaintext/bibitem/date_formatter.py +0 -0
- {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.0}/philoch_bib_sdk/converters/plaintext/bibitem/date_parser.py +0 -0
- {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.0}/philoch_bib_sdk/converters/plaintext/bibitem/formatter.py +0 -0
- {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.0}/philoch_bib_sdk/converters/plaintext/bibitem/pages_formatter.py +0 -0
- {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.0}/philoch_bib_sdk/converters/plaintext/bibitem/pages_parser.py +0 -0
- {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.0}/philoch_bib_sdk/converters/plaintext/bibitem/parser.py +0 -0
- {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.0}/philoch_bib_sdk/converters/plaintext/journal/formatter.py +0 -0
- {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.0}/philoch_bib_sdk/converters/plaintext/journal/parser.py +0 -0
- {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.0}/philoch_bib_sdk/converters/plaintext/shared/renderable_formatter.py +0 -0
- {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.0}/philoch_bib_sdk/interfaces/cli/__init__.py +0 -0
- {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.0}/philoch_bib_sdk/interfaces/cli/fuzzy_matching.py +0 -0
- {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.0}/philoch_bib_sdk/logic/__init__.py +0 -0
- {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.0}/philoch_bib_sdk/logic/default_models.py +0 -0
- {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.0}/philoch_bib_sdk/logic/functions/__init__.py +0 -0
- {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.0}/philoch_bib_sdk/logic/functions/comparator.py +0 -0
- {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.0}/philoch_bib_sdk/logic/functions/journal_article_matcher.py +0 -0
- {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.0}/philoch_bib_sdk/logic/literals.py +0 -0
- {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.0}/philoch_bib_sdk/logic/models.py +0 -0
- {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.0}/philoch_bib_sdk/logic/models_staging.py +0 -0
- {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.0}/philoch_bib_sdk/procedures/fuzzy_matching.py +0 -0
- {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.0}/philoch_bib_sdk/py.typed +0 -0
- {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.0}/poetry.lock +0 -0
- {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.0}/run_fuzzy_matching.py +0 -0
- {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.0}/run_fuzzy_matching_streaming.py +0 -0
- {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.0}/scripts/format.py +0 -0
- {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.0}/tests/adapters/test_read_jvn_index_from_ods.py +0 -0
- {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.0}/tests/converters/plaintext/conftest.py +0 -0
- {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.0}/tests/converters/plaintext/test_author_formatter.py +0 -0
- {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.0}/tests/converters/plaintext/test_author_parser.py +0 -0
- {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.0}/tests/converters/plaintext/test_bibitem_formatter.py +0 -0
- {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.0}/tests/converters/plaintext/test_bibitem_parser.py +0 -0
- {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.0}/tests/converters/plaintext/test_bibkey_formatter.py +0 -0
- {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.0}/tests/converters/plaintext/test_bibkey_parser.py +0 -0
- {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.0}/tests/converters/plaintext/test_date_formatter.py +0 -0
- {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.0}/tests/converters/plaintext/test_date_parser.py +0 -0
- {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.0}/tests/converters/plaintext/test_journal_formatter.py +0 -0
- {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.0}/tests/converters/plaintext/test_journal_parser.py +0 -0
- {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.0}/tests/converters/plaintext/test_page_formatter.py +0 -0
- {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.0}/tests/converters/plaintext/test_page_parser.py +0 -0
- {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.0}/tests/logic/functions/test_comparator.py +0 -0
- {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.0}/tests/logic/functions/test_journal_article_matcher.py +0 -0
- {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.0}/tests/logic/test_default_models.py +0 -0
- {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.0}/tests/logic/test_models.py +0 -0
- {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.0}/tests/logic/test_setup.py +0 -0
- {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.0}/tests/processing/test_bulk_operation_styles.py +0 -0
- {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.0}/tests/shared.py +0 -0
- {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.0}/tests/test_tautology.py +0 -0
|
@@ -27,6 +27,37 @@ version = "1.0.4"
|
|
|
27
27
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
28
28
|
checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801"
|
|
29
29
|
|
|
30
|
+
[[package]]
|
|
31
|
+
name = "crossbeam-deque"
|
|
32
|
+
version = "0.8.6"
|
|
33
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
34
|
+
checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51"
|
|
35
|
+
dependencies = [
|
|
36
|
+
"crossbeam-epoch",
|
|
37
|
+
"crossbeam-utils",
|
|
38
|
+
]
|
|
39
|
+
|
|
40
|
+
[[package]]
|
|
41
|
+
name = "crossbeam-epoch"
|
|
42
|
+
version = "0.9.18"
|
|
43
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
44
|
+
checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e"
|
|
45
|
+
dependencies = [
|
|
46
|
+
"crossbeam-utils",
|
|
47
|
+
]
|
|
48
|
+
|
|
49
|
+
[[package]]
|
|
50
|
+
name = "crossbeam-utils"
|
|
51
|
+
version = "0.8.21"
|
|
52
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
53
|
+
checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28"
|
|
54
|
+
|
|
55
|
+
[[package]]
|
|
56
|
+
name = "either"
|
|
57
|
+
version = "1.15.0"
|
|
58
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
59
|
+
checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719"
|
|
60
|
+
|
|
30
61
|
[[package]]
|
|
31
62
|
name = "getrandom"
|
|
32
63
|
version = "0.3.4"
|
|
@@ -77,10 +108,12 @@ checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d"
|
|
|
77
108
|
|
|
78
109
|
[[package]]
|
|
79
110
|
name = "philoch_bib_sdk"
|
|
80
|
-
version = "0.
|
|
111
|
+
version = "0.4.0"
|
|
81
112
|
dependencies = [
|
|
82
113
|
"ahash",
|
|
83
114
|
"pyo3",
|
|
115
|
+
"rayon",
|
|
116
|
+
"strsim",
|
|
84
117
|
]
|
|
85
118
|
|
|
86
119
|
[[package]]
|
|
@@ -100,11 +133,10 @@ dependencies = [
|
|
|
100
133
|
|
|
101
134
|
[[package]]
|
|
102
135
|
name = "pyo3"
|
|
103
|
-
version = "0.
|
|
136
|
+
version = "0.25.1"
|
|
104
137
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
105
|
-
checksum = "
|
|
138
|
+
checksum = "8970a78afe0628a3e3430376fc5fd76b6b45c4d43360ffd6cdd40bdde72b682a"
|
|
106
139
|
dependencies = [
|
|
107
|
-
"cfg-if",
|
|
108
140
|
"indoc",
|
|
109
141
|
"libc",
|
|
110
142
|
"memoffset",
|
|
@@ -118,9 +150,9 @@ dependencies = [
|
|
|
118
150
|
|
|
119
151
|
[[package]]
|
|
120
152
|
name = "pyo3-build-config"
|
|
121
|
-
version = "0.
|
|
153
|
+
version = "0.25.1"
|
|
122
154
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
123
|
-
checksum = "
|
|
155
|
+
checksum = "458eb0c55e7ece017adeba38f2248ff3ac615e53660d7c71a238d7d2a01c7598"
|
|
124
156
|
dependencies = [
|
|
125
157
|
"once_cell",
|
|
126
158
|
"target-lexicon",
|
|
@@ -128,9 +160,9 @@ dependencies = [
|
|
|
128
160
|
|
|
129
161
|
[[package]]
|
|
130
162
|
name = "pyo3-ffi"
|
|
131
|
-
version = "0.
|
|
163
|
+
version = "0.25.1"
|
|
132
164
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
133
|
-
checksum = "
|
|
165
|
+
checksum = "7114fe5457c61b276ab77c5055f206295b812608083644a5c5b2640c3102565c"
|
|
134
166
|
dependencies = [
|
|
135
167
|
"libc",
|
|
136
168
|
"pyo3-build-config",
|
|
@@ -138,9 +170,9 @@ dependencies = [
|
|
|
138
170
|
|
|
139
171
|
[[package]]
|
|
140
172
|
name = "pyo3-macros"
|
|
141
|
-
version = "0.
|
|
173
|
+
version = "0.25.1"
|
|
142
174
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
143
|
-
checksum = "
|
|
175
|
+
checksum = "a8725c0a622b374d6cb051d11a0983786448f7785336139c3c94f5aa6bef7e50"
|
|
144
176
|
dependencies = [
|
|
145
177
|
"proc-macro2",
|
|
146
178
|
"pyo3-macros-backend",
|
|
@@ -150,9 +182,9 @@ dependencies = [
|
|
|
150
182
|
|
|
151
183
|
[[package]]
|
|
152
184
|
name = "pyo3-macros-backend"
|
|
153
|
-
version = "0.
|
|
185
|
+
version = "0.25.1"
|
|
154
186
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
155
|
-
checksum = "
|
|
187
|
+
checksum = "4109984c22491085343c05b0dbc54ddc405c3cf7b4374fc533f5c3313a572ccc"
|
|
156
188
|
dependencies = [
|
|
157
189
|
"heck",
|
|
158
190
|
"proc-macro2",
|
|
@@ -176,12 +208,38 @@ version = "5.3.0"
|
|
|
176
208
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
177
209
|
checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f"
|
|
178
210
|
|
|
211
|
+
[[package]]
|
|
212
|
+
name = "rayon"
|
|
213
|
+
version = "1.11.0"
|
|
214
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
215
|
+
checksum = "368f01d005bf8fd9b1206fb6fa653e6c4a81ceb1466406b81792d87c5677a58f"
|
|
216
|
+
dependencies = [
|
|
217
|
+
"either",
|
|
218
|
+
"rayon-core",
|
|
219
|
+
]
|
|
220
|
+
|
|
221
|
+
[[package]]
|
|
222
|
+
name = "rayon-core"
|
|
223
|
+
version = "1.13.0"
|
|
224
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
225
|
+
checksum = "22e18b0f0062d30d4230b2e85ff77fdfe4326feb054b9783a3460d8435c8ab91"
|
|
226
|
+
dependencies = [
|
|
227
|
+
"crossbeam-deque",
|
|
228
|
+
"crossbeam-utils",
|
|
229
|
+
]
|
|
230
|
+
|
|
179
231
|
[[package]]
|
|
180
232
|
name = "rustversion"
|
|
181
233
|
version = "1.0.22"
|
|
182
234
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
183
235
|
checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d"
|
|
184
236
|
|
|
237
|
+
[[package]]
|
|
238
|
+
name = "strsim"
|
|
239
|
+
version = "0.11.1"
|
|
240
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
241
|
+
checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"
|
|
242
|
+
|
|
185
243
|
[[package]]
|
|
186
244
|
name = "syn"
|
|
187
245
|
version = "2.0.108"
|
|
@@ -195,9 +253,9 @@ dependencies = [
|
|
|
195
253
|
|
|
196
254
|
[[package]]
|
|
197
255
|
name = "target-lexicon"
|
|
198
|
-
version = "0.
|
|
256
|
+
version = "0.13.4"
|
|
199
257
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
200
|
-
checksum = "
|
|
258
|
+
checksum = "b1dd07eb858a2067e2f3c7155d54e929265c264e6f37efe3ee7a8d1b5a1dd0ba"
|
|
201
259
|
|
|
202
260
|
[[package]]
|
|
203
261
|
name = "unicode-ident"
|
|
@@ -1,25 +1,30 @@
|
|
|
1
1
|
[package]
|
|
2
|
-
name = "
|
|
3
|
-
version = "0.
|
|
2
|
+
name = "philoch_bib_sdk"
|
|
3
|
+
version = "0.4.0"
|
|
4
4
|
edition = "2021"
|
|
5
|
+
readme = "README.md"
|
|
5
6
|
|
|
6
7
|
[lib]
|
|
7
|
-
name = "
|
|
8
|
+
name = "_rust"
|
|
8
9
|
crate-type = ["cdylib"]
|
|
9
10
|
|
|
10
11
|
[dependencies]
|
|
11
12
|
pyo3 = "0.25.0"
|
|
13
|
+
ahash = "0.8"
|
|
12
14
|
rayon = "1.11.0"
|
|
13
15
|
strsim = "0.11.1"
|
|
14
16
|
|
|
17
|
+
[profile.release]
|
|
18
|
+
opt-level = 3
|
|
19
|
+
lto = true
|
|
20
|
+
codegen-units = 1
|
|
21
|
+
|
|
15
22
|
[lints.clippy]
|
|
16
23
|
all = "warn"
|
|
17
|
-
# Type safety: prevent silent numeric conversions
|
|
18
24
|
cast_possible_truncation = "warn"
|
|
19
25
|
cast_sign_loss = "warn"
|
|
20
26
|
cast_possible_wrap = "warn"
|
|
21
27
|
cast_lossless = "warn"
|
|
22
|
-
# Efficiency
|
|
23
28
|
redundant_clone = "warn"
|
|
24
29
|
|
|
25
30
|
[lints.rust]
|
|
@@ -1,9 +1,56 @@
|
|
|
1
|
-
"""Type stubs for
|
|
1
|
+
"""Type stubs for philoch_bib_sdk._rust - Rust extension module.
|
|
2
|
+
|
|
3
|
+
This module provides high-performance Rust implementations for:
|
|
4
|
+
- Building search indexes for fuzzy matching
|
|
5
|
+
- Batch fuzzy scoring of bibliographic items
|
|
6
|
+
"""
|
|
2
7
|
|
|
3
8
|
from typing import TypedDict
|
|
4
9
|
|
|
10
|
+
# === Index Building Types ===
|
|
11
|
+
|
|
12
|
+
class ItemData(TypedDict):
|
|
13
|
+
"""Input data for a single bibliographic item (for index building)."""
|
|
14
|
+
|
|
15
|
+
item_index: int
|
|
16
|
+
doi: str | None
|
|
17
|
+
title: str
|
|
18
|
+
author_surnames: list[str]
|
|
19
|
+
year: int | None
|
|
20
|
+
journal_name: str | None
|
|
21
|
+
|
|
22
|
+
class IndexData:
|
|
23
|
+
"""Output index data structure from build_index_rust."""
|
|
24
|
+
|
|
25
|
+
doi_to_index: dict[str, int]
|
|
26
|
+
trigram_to_indices: dict[str, list[int]]
|
|
27
|
+
surname_to_indices: dict[str, list[int]]
|
|
28
|
+
decade_to_indices: dict[int | None, list[int]]
|
|
29
|
+
journal_to_indices: dict[str, list[int]]
|
|
30
|
+
|
|
31
|
+
def build_index_rust(items_data: list[ItemData]) -> IndexData:
|
|
32
|
+
"""Build index for fuzzy matching.
|
|
33
|
+
|
|
34
|
+
Args:
|
|
35
|
+
items_data: List of ItemData dicts with bibliographic info
|
|
36
|
+
|
|
37
|
+
Returns:
|
|
38
|
+
IndexData with all indexes built for fast lookup
|
|
39
|
+
"""
|
|
40
|
+
...
|
|
41
|
+
|
|
42
|
+
def hello_rust() -> str:
|
|
43
|
+
"""A simple test function to verify Rust integration works.
|
|
44
|
+
|
|
45
|
+
Returns:
|
|
46
|
+
A greeting string from Rust
|
|
47
|
+
"""
|
|
48
|
+
...
|
|
49
|
+
|
|
50
|
+
# === Scorer Types ===
|
|
51
|
+
|
|
5
52
|
class BibItemData(TypedDict):
|
|
6
|
-
"""Input data for a single BibItem."""
|
|
53
|
+
"""Input data for a single BibItem (for scoring)."""
|
|
7
54
|
|
|
8
55
|
index: int
|
|
9
56
|
title: str
|
{philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.0}/philoch_bib_sdk/logic/functions/fuzzy_matcher.py
RENAMED
|
@@ -27,11 +27,11 @@ from philoch_bib_sdk.logic.models_staging import PartialScore, ScoreComponent
|
|
|
27
27
|
|
|
28
28
|
|
|
29
29
|
if TYPE_CHECKING:
|
|
30
|
-
from
|
|
30
|
+
from philoch_bib_sdk._rust import BibItemData, ItemData
|
|
31
31
|
|
|
32
32
|
# Try to import Rust scorer for batch processing
|
|
33
33
|
try:
|
|
34
|
-
import rust_scorer
|
|
34
|
+
from philoch_bib_sdk import _rust as rust_scorer
|
|
35
35
|
|
|
36
36
|
_RUST_SCORER_AVAILABLE = True
|
|
37
37
|
except ImportError:
|
|
@@ -124,7 +124,7 @@ def _get_decade(date: BibItemDateAttr | str) -> int | None:
|
|
|
124
124
|
return None
|
|
125
125
|
|
|
126
126
|
|
|
127
|
-
def _prepare_items_for_rust(bibitems: Sequence[BibItem]) -> list[
|
|
127
|
+
def _prepare_items_for_rust(bibitems: Sequence[BibItem]) -> "list[ItemData]":
|
|
128
128
|
"""Extract minimal data needed by Rust build_index_rust.
|
|
129
129
|
|
|
130
130
|
Args:
|
|
@@ -134,7 +134,7 @@ def _prepare_items_for_rust(bibitems: Sequence[BibItem]) -> list[dict[str, Any]]
|
|
|
134
134
|
List of dicts with minimal data for Rust
|
|
135
135
|
"""
|
|
136
136
|
|
|
137
|
-
items_data = []
|
|
137
|
+
items_data: list[ItemData] = []
|
|
138
138
|
for i, item in enumerate(bibitems):
|
|
139
139
|
# Extract title string
|
|
140
140
|
title_attr = item.title
|
|
@@ -297,7 +297,7 @@ def build_index(bibitems: Sequence[BibItem]) -> BibItemBlockIndex:
|
|
|
297
297
|
"""
|
|
298
298
|
# Try to use Rust implementation
|
|
299
299
|
try:
|
|
300
|
-
from philoch_bib_sdk._rust import build_index_rust
|
|
300
|
+
from philoch_bib_sdk._rust import build_index_rust
|
|
301
301
|
|
|
302
302
|
use_rust = True
|
|
303
303
|
except ImportError:
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "philoch-bib-sdk"
|
|
3
|
-
version = "0.
|
|
3
|
+
version = "0.4.0"
|
|
4
4
|
description = "Standard development kit for the Philosophie Bibliography project"
|
|
5
5
|
authors = [
|
|
6
6
|
{name = "Luis Alejandro Bordo García", email = "luis.bordo@philosophie.ch"}
|
|
@@ -19,6 +19,14 @@ build-backend = "maturin"
|
|
|
19
19
|
# Note: For development, continue using Poetry commands (poetry install, poetry run, etc.)
|
|
20
20
|
# Poetry will work alongside maturin. Use 'maturin develop' to build the Rust extension.
|
|
21
21
|
# For PyPI releases, use 'maturin build --release'
|
|
22
|
+
|
|
23
|
+
dependencies = [
|
|
24
|
+
"aletk>=0.1.6",
|
|
25
|
+
"attrs>=25.3.0",
|
|
26
|
+
"polars>=1.32.3",
|
|
27
|
+
"pydantic>=2.11.9",
|
|
28
|
+
"cytoolz>=1.0.1",
|
|
29
|
+
]
|
|
22
30
|
|
|
23
31
|
[tool.poetry.dependencies]
|
|
24
32
|
aletk = "^0.1.6"
|
|
@@ -1,9 +1,155 @@
|
|
|
1
1
|
use pyo3::prelude::*;
|
|
2
|
+
use pyo3::types::{PyDict, PyList};
|
|
2
3
|
use rayon::prelude::*;
|
|
3
4
|
use std::cmp::Ordering;
|
|
4
|
-
use std::collections::BinaryHeap;
|
|
5
|
+
use std::collections::{BinaryHeap, HashSet};
|
|
5
6
|
use strsim::jaro_winkler;
|
|
6
7
|
|
|
8
|
+
/// Input data for a single bibliographic item
|
|
9
|
+
#[derive(Debug, FromPyObject)]
|
|
10
|
+
struct ItemData {
|
|
11
|
+
#[pyo3(item)]
|
|
12
|
+
item_index: usize,
|
|
13
|
+
#[pyo3(item)]
|
|
14
|
+
doi: Option<String>,
|
|
15
|
+
#[pyo3(item)]
|
|
16
|
+
title: String,
|
|
17
|
+
#[pyo3(item)]
|
|
18
|
+
author_surnames: Vec<String>,
|
|
19
|
+
#[pyo3(item)]
|
|
20
|
+
year: Option<i32>,
|
|
21
|
+
#[pyo3(item)]
|
|
22
|
+
journal_name: Option<String>,
|
|
23
|
+
}
|
|
24
|
+
|
|
25
|
+
/// Output index data structure
|
|
26
|
+
#[pyclass]
|
|
27
|
+
struct IndexData {
|
|
28
|
+
#[pyo3(get)]
|
|
29
|
+
doi_to_index: Py<PyDict>,
|
|
30
|
+
#[pyo3(get)]
|
|
31
|
+
trigram_to_indices: Py<PyDict>,
|
|
32
|
+
#[pyo3(get)]
|
|
33
|
+
surname_to_indices: Py<PyDict>,
|
|
34
|
+
#[pyo3(get)]
|
|
35
|
+
decade_to_indices: Py<PyDict>,
|
|
36
|
+
#[pyo3(get)]
|
|
37
|
+
journal_to_indices: Py<PyDict>,
|
|
38
|
+
}
|
|
39
|
+
|
|
40
|
+
/// Extract trigrams from text
|
|
41
|
+
fn extract_trigrams(text: &str) -> HashSet<String> {
|
|
42
|
+
let normalized = text.to_lowercase();
|
|
43
|
+
let chars: Vec<char> = normalized.chars().collect();
|
|
44
|
+
|
|
45
|
+
if chars.len() < 3 {
|
|
46
|
+
return HashSet::new();
|
|
47
|
+
}
|
|
48
|
+
|
|
49
|
+
let mut trigrams = HashSet::new();
|
|
50
|
+
for i in 0..=chars.len() - 3 {
|
|
51
|
+
let trigram: String = chars[i..i + 3].iter().collect();
|
|
52
|
+
trigrams.insert(trigram);
|
|
53
|
+
}
|
|
54
|
+
|
|
55
|
+
trigrams
|
|
56
|
+
}
|
|
57
|
+
|
|
58
|
+
/// Calculate decade from year
|
|
59
|
+
fn get_decade(year: Option<i32>) -> Option<i32> {
|
|
60
|
+
year.map(|y| (y / 10) * 10)
|
|
61
|
+
}
|
|
62
|
+
|
|
63
|
+
/// Build index for fuzzy matching
|
|
64
|
+
#[pyfunction]
|
|
65
|
+
fn build_index_rust(py: Python, items_data: Vec<ItemData>) -> PyResult<IndexData> {
|
|
66
|
+
use ahash::AHashMap;
|
|
67
|
+
|
|
68
|
+
// Pre-allocate with capacity hints
|
|
69
|
+
let capacity = items_data.len();
|
|
70
|
+
let mut doi_map: AHashMap<String, usize> = AHashMap::with_capacity(capacity);
|
|
71
|
+
let mut trigram_map: AHashMap<String, Vec<usize>> = AHashMap::new();
|
|
72
|
+
let mut surname_map: AHashMap<String, Vec<usize>> = AHashMap::new();
|
|
73
|
+
let mut decade_map: AHashMap<Option<i32>, Vec<usize>> = AHashMap::new();
|
|
74
|
+
let mut journal_map: AHashMap<String, Vec<usize>> = AHashMap::new();
|
|
75
|
+
|
|
76
|
+
// Single pass over all items
|
|
77
|
+
for item in items_data {
|
|
78
|
+
let idx = item.item_index;
|
|
79
|
+
|
|
80
|
+
// DOI index
|
|
81
|
+
if let Some(doi) = item.doi {
|
|
82
|
+
doi_map.insert(doi, idx);
|
|
83
|
+
}
|
|
84
|
+
|
|
85
|
+
// Title trigram index
|
|
86
|
+
let trigrams = extract_trigrams(&item.title);
|
|
87
|
+
for trigram in trigrams {
|
|
88
|
+
trigram_map.entry(trigram).or_default().push(idx);
|
|
89
|
+
}
|
|
90
|
+
|
|
91
|
+
// Author surname index
|
|
92
|
+
for surname in item.author_surnames {
|
|
93
|
+
let normalized = surname.to_lowercase().trim().to_string();
|
|
94
|
+
if !normalized.is_empty() {
|
|
95
|
+
surname_map.entry(normalized).or_default().push(idx);
|
|
96
|
+
}
|
|
97
|
+
}
|
|
98
|
+
|
|
99
|
+
// Year decade index
|
|
100
|
+
let decade = get_decade(item.year);
|
|
101
|
+
decade_map.entry(decade).or_default().push(idx);
|
|
102
|
+
|
|
103
|
+
// Journal index
|
|
104
|
+
if let Some(journal) = item.journal_name {
|
|
105
|
+
let normalized = journal.to_lowercase().trim().to_string();
|
|
106
|
+
if !normalized.is_empty() {
|
|
107
|
+
journal_map.entry(normalized).or_default().push(idx);
|
|
108
|
+
}
|
|
109
|
+
}
|
|
110
|
+
}
|
|
111
|
+
|
|
112
|
+
// Convert to Python dicts
|
|
113
|
+
let doi_dict = PyDict::new(py);
|
|
114
|
+
for (k, v) in doi_map {
|
|
115
|
+
doi_dict.set_item(k, v)?;
|
|
116
|
+
}
|
|
117
|
+
|
|
118
|
+
let trigram_dict = PyDict::new(py);
|
|
119
|
+
for (k, v) in trigram_map {
|
|
120
|
+
let py_list = PyList::new(py, &v)?;
|
|
121
|
+
trigram_dict.set_item(k, py_list)?;
|
|
122
|
+
}
|
|
123
|
+
|
|
124
|
+
let surname_dict = PyDict::new(py);
|
|
125
|
+
for (k, v) in surname_map {
|
|
126
|
+
let py_list = PyList::new(py, &v)?;
|
|
127
|
+
surname_dict.set_item(k, py_list)?;
|
|
128
|
+
}
|
|
129
|
+
|
|
130
|
+
let decade_dict = PyDict::new(py);
|
|
131
|
+
for (k, v) in decade_map {
|
|
132
|
+
let py_list = PyList::new(py, &v)?;
|
|
133
|
+
decade_dict.set_item(k, py_list)?;
|
|
134
|
+
}
|
|
135
|
+
|
|
136
|
+
let journal_dict = PyDict::new(py);
|
|
137
|
+
for (k, v) in journal_map {
|
|
138
|
+
let py_list = PyList::new(py, &v)?;
|
|
139
|
+
journal_dict.set_item(k, py_list)?;
|
|
140
|
+
}
|
|
141
|
+
|
|
142
|
+
Ok(IndexData {
|
|
143
|
+
doi_to_index: doi_dict.into(),
|
|
144
|
+
trigram_to_indices: trigram_dict.into(),
|
|
145
|
+
surname_to_indices: surname_dict.into(),
|
|
146
|
+
decade_to_indices: decade_dict.into(),
|
|
147
|
+
journal_to_indices: journal_dict.into(),
|
|
148
|
+
})
|
|
149
|
+
}
|
|
150
|
+
|
|
151
|
+
// === SCORER FUNCTIONALITY (merged from rust_scorer) ===
|
|
152
|
+
|
|
7
153
|
/// Normalize text: lowercase and collapse whitespace
|
|
8
154
|
fn normalize(s: &str) -> String {
|
|
9
155
|
s.to_lowercase()
|
|
@@ -41,7 +187,7 @@ fn token_sort_ratio(s1: &str, s2: &str) -> f64 {
|
|
|
41
187
|
token_sort_ratio_f64(s1, s2)
|
|
42
188
|
}
|
|
43
189
|
|
|
44
|
-
/// Input data for a single BibItem (
|
|
190
|
+
/// Input data for a single BibItem (for scoring)
|
|
45
191
|
#[derive(Clone, Debug, FromPyObject)]
|
|
46
192
|
#[pyo3(from_item_all)]
|
|
47
193
|
struct BibItemData {
|
|
@@ -306,9 +452,21 @@ fn score_batch(
|
|
|
306
452
|
.collect()
|
|
307
453
|
}
|
|
308
454
|
|
|
309
|
-
|
|
455
|
+
// === END SCORER FUNCTIONALITY ===
|
|
456
|
+
|
|
457
|
+
/// A simple test function to verify Rust integration works
|
|
458
|
+
#[pyfunction]
|
|
459
|
+
fn hello_rust() -> PyResult<String> {
|
|
460
|
+
Ok("Hello from Rust!".to_string())
|
|
461
|
+
}
|
|
462
|
+
|
|
463
|
+
/// A Python module implemented in Rust.
|
|
310
464
|
#[pymodule]
|
|
311
|
-
fn
|
|
465
|
+
fn _rust(m: &Bound<'_, PyModule>) -> PyResult<()> {
|
|
466
|
+
m.add_function(wrap_pyfunction!(hello_rust, m)?)?;
|
|
467
|
+
m.add_function(wrap_pyfunction!(build_index_rust, m)?)?;
|
|
468
|
+
m.add_class::<IndexData>()?;
|
|
469
|
+
// Scorer functions (merged from rust_scorer)
|
|
312
470
|
m.add_function(wrap_pyfunction!(token_sort_ratio, m)?)?;
|
|
313
471
|
m.add_function(wrap_pyfunction!(score_batch, m)?)?;
|
|
314
472
|
Ok(())
|
|
@@ -318,6 +476,33 @@ fn rust_scorer(m: &Bound<'_, PyModule>) -> PyResult<()> {
|
|
|
318
476
|
mod tests {
|
|
319
477
|
use super::*;
|
|
320
478
|
|
|
479
|
+
#[test]
|
|
480
|
+
fn test_extract_trigrams() {
|
|
481
|
+
let text = "hello";
|
|
482
|
+
let trigrams = extract_trigrams(text);
|
|
483
|
+
|
|
484
|
+
assert_eq!(trigrams.len(), 3);
|
|
485
|
+
assert!(trigrams.contains("hel"));
|
|
486
|
+
assert!(trigrams.contains("ell"));
|
|
487
|
+
assert!(trigrams.contains("llo"));
|
|
488
|
+
}
|
|
489
|
+
|
|
490
|
+
#[test]
|
|
491
|
+
fn test_extract_trigrams_short() {
|
|
492
|
+
let text = "hi";
|
|
493
|
+
let trigrams = extract_trigrams(text);
|
|
494
|
+
assert_eq!(trigrams.len(), 0);
|
|
495
|
+
}
|
|
496
|
+
|
|
497
|
+
#[test]
|
|
498
|
+
fn test_get_decade() {
|
|
499
|
+
assert_eq!(get_decade(Some(1995)), Some(1990));
|
|
500
|
+
assert_eq!(get_decade(Some(2000)), Some(2000));
|
|
501
|
+
assert_eq!(get_decade(Some(2025)), Some(2020));
|
|
502
|
+
assert_eq!(get_decade(None), None);
|
|
503
|
+
}
|
|
504
|
+
|
|
505
|
+
// Scorer tests (merged from rust_scorer)
|
|
321
506
|
#[test]
|
|
322
507
|
fn test_token_sort_ratio_identical() {
|
|
323
508
|
let score = token_sort_ratio("hello world", "hello world");
|
|
@@ -5,7 +5,7 @@ from typing import TYPE_CHECKING
|
|
|
5
5
|
from philoch_bib_sdk.logic.default_models import default_bib_item
|
|
6
6
|
|
|
7
7
|
if TYPE_CHECKING:
|
|
8
|
-
from
|
|
8
|
+
from philoch_bib_sdk._rust import BibItemData
|
|
9
9
|
from philoch_bib_sdk.logic.functions.fuzzy_matcher import (
|
|
10
10
|
BibItemBlockIndex,
|
|
11
11
|
build_index,
|
|
@@ -500,7 +500,7 @@ def test_rust_scorer_available() -> None:
|
|
|
500
500
|
|
|
501
501
|
pytest.skip("Rust scorer not available")
|
|
502
502
|
|
|
503
|
-
import rust_scorer
|
|
503
|
+
from philoch_bib_sdk import _rust as rust_scorer
|
|
504
504
|
|
|
505
505
|
# Test basic function exists and works
|
|
506
506
|
score = rust_scorer.token_sort_ratio("hello world", "world hello")
|
|
@@ -516,7 +516,7 @@ def test_rust_batch_scorer_basic() -> None:
|
|
|
516
516
|
|
|
517
517
|
pytest.skip("Rust scorer not available")
|
|
518
518
|
|
|
519
|
-
import rust_scorer
|
|
519
|
+
from philoch_bib_sdk import _rust as rust_scorer
|
|
520
520
|
|
|
521
521
|
subjects: list[BibItemData] = [
|
|
522
522
|
{
|
philoch_bib_sdk-0.3.9/Cargo.toml
DELETED
|
@@ -1,18 +0,0 @@
|
|
|
1
|
-
[package]
|
|
2
|
-
name = "philoch_bib_sdk"
|
|
3
|
-
version = "0.1.6"
|
|
4
|
-
edition = "2021"
|
|
5
|
-
readme = "README.md"
|
|
6
|
-
|
|
7
|
-
[lib]
|
|
8
|
-
name = "_rust"
|
|
9
|
-
crate-type = ["cdylib"]
|
|
10
|
-
|
|
11
|
-
[dependencies]
|
|
12
|
-
pyo3 = { version = "0.22", features = ["extension-module"] }
|
|
13
|
-
ahash = "0.8"
|
|
14
|
-
|
|
15
|
-
[profile.release]
|
|
16
|
-
opt-level = 3
|
|
17
|
-
lto = true
|
|
18
|
-
codegen-units = 1
|