philoch-bib-sdk 0.3.9__tar.gz → 0.4.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82) hide show
  1. {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.2}/Cargo.lock +72 -14
  2. {philoch_bib_sdk-0.3.9/philoch_bib_sdk/rust_scorer → philoch_bib_sdk-0.4.2}/Cargo.toml +10 -5
  3. {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.2}/PKG-INFO +6 -1
  4. philoch_bib_sdk-0.3.9/philoch_bib_sdk/rust_scorer/rust_scorer.pyi → philoch_bib_sdk-0.4.2/philoch_bib_sdk/_rust.pyi +49 -2
  5. {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.2}/philoch_bib_sdk/logic/functions/fuzzy_matcher.py +5 -5
  6. {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.2}/poetry.lock +3 -3
  7. {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.2}/pyproject.toml +8 -1
  8. {philoch_bib_sdk-0.3.9/philoch_bib_sdk/rust_scorer → philoch_bib_sdk-0.4.2}/src/lib.rs +189 -4
  9. {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.2}/tests/logic/functions/test_fuzzy_matcher.py +3 -3
  10. philoch_bib_sdk-0.3.9/Cargo.toml +0 -18
  11. philoch_bib_sdk-0.3.9/philoch_bib_sdk/rust_scorer/Cargo.lock +0 -232
  12. philoch_bib_sdk-0.3.9/philoch_bib_sdk/rust_scorer/pyproject.toml +0 -15
  13. philoch_bib_sdk-0.3.9/src/lib.rs +0 -192
  14. {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.2}/LICENSE +0 -0
  15. {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.2}/README.md +0 -0
  16. {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.2}/docs/fuzzy-matching.md +0 -0
  17. {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.2}/docs/python-style-guide.md +0 -0
  18. {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.2}/docs/rust-implementation-summary.md +0 -0
  19. {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.2}/docs/rust-index-building-spec.md +0 -0
  20. {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.2}/docs/rust-scorer.md +0 -0
  21. {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.2}/docs/streaming-output.md +0 -0
  22. {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.2}/docs/todo/fuzzy-matching-enhanced-output.md +0 -0
  23. {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.2}/docs/todo/merge_fuzzy_results.py +0 -0
  24. {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.2}/docs/todo/rust-build-index-implementation-plan.md +0 -0
  25. {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.2}/philoch_bib_sdk/__init__.py +0 -0
  26. {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.2}/philoch_bib_sdk/adapters/io/__init__.py +0 -0
  27. {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.2}/philoch_bib_sdk/adapters/io/csv/__init__.py +0 -0
  28. {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.2}/philoch_bib_sdk/adapters/io/ods/__init__.py +0 -0
  29. {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.2}/philoch_bib_sdk/adapters/plaintext/bibitem_reader.py +0 -0
  30. {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.2}/philoch_bib_sdk/adapters/tabular_data/read_journal_volume_number_index.py +0 -0
  31. {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.2}/philoch_bib_sdk/converters/latex.py +0 -0
  32. {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.2}/philoch_bib_sdk/converters/plaintext/author/formatter.py +0 -0
  33. {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.2}/philoch_bib_sdk/converters/plaintext/author/parser.py +0 -0
  34. {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.2}/philoch_bib_sdk/converters/plaintext/bib_string_formatter.py +0 -0
  35. {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.2}/philoch_bib_sdk/converters/plaintext/bibitem/bibkey_formatter.py +0 -0
  36. {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.2}/philoch_bib_sdk/converters/plaintext/bibitem/bibkey_parser.py +0 -0
  37. {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.2}/philoch_bib_sdk/converters/plaintext/bibitem/date_formatter.py +0 -0
  38. {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.2}/philoch_bib_sdk/converters/plaintext/bibitem/date_parser.py +0 -0
  39. {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.2}/philoch_bib_sdk/converters/plaintext/bibitem/formatter.py +0 -0
  40. {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.2}/philoch_bib_sdk/converters/plaintext/bibitem/pages_formatter.py +0 -0
  41. {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.2}/philoch_bib_sdk/converters/plaintext/bibitem/pages_parser.py +0 -0
  42. {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.2}/philoch_bib_sdk/converters/plaintext/bibitem/parser.py +0 -0
  43. {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.2}/philoch_bib_sdk/converters/plaintext/journal/formatter.py +0 -0
  44. {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.2}/philoch_bib_sdk/converters/plaintext/journal/parser.py +0 -0
  45. {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.2}/philoch_bib_sdk/converters/plaintext/shared/renderable_formatter.py +0 -0
  46. {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.2}/philoch_bib_sdk/interfaces/cli/__init__.py +0 -0
  47. {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.2}/philoch_bib_sdk/interfaces/cli/fuzzy_matching.py +0 -0
  48. {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.2}/philoch_bib_sdk/logic/__init__.py +0 -0
  49. {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.2}/philoch_bib_sdk/logic/default_models.py +0 -0
  50. {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.2}/philoch_bib_sdk/logic/functions/__init__.py +0 -0
  51. {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.2}/philoch_bib_sdk/logic/functions/comparator.py +0 -0
  52. {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.2}/philoch_bib_sdk/logic/functions/journal_article_matcher.py +0 -0
  53. {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.2}/philoch_bib_sdk/logic/literals.py +0 -0
  54. {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.2}/philoch_bib_sdk/logic/models.py +0 -0
  55. {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.2}/philoch_bib_sdk/logic/models_staging.py +0 -0
  56. {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.2}/philoch_bib_sdk/procedures/fuzzy_matching.py +0 -0
  57. {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.2}/philoch_bib_sdk/py.typed +0 -0
  58. {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.2}/run_fuzzy_matching.py +0 -0
  59. {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.2}/run_fuzzy_matching_streaming.py +0 -0
  60. {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.2}/scripts/format.py +0 -0
  61. {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.2}/tests/adapters/test_read_jvn_index_from_ods.py +0 -0
  62. {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.2}/tests/converters/plaintext/conftest.py +0 -0
  63. {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.2}/tests/converters/plaintext/test_author_formatter.py +0 -0
  64. {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.2}/tests/converters/plaintext/test_author_parser.py +0 -0
  65. {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.2}/tests/converters/plaintext/test_bibitem_formatter.py +0 -0
  66. {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.2}/tests/converters/plaintext/test_bibitem_parser.py +0 -0
  67. {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.2}/tests/converters/plaintext/test_bibkey_formatter.py +0 -0
  68. {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.2}/tests/converters/plaintext/test_bibkey_parser.py +0 -0
  69. {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.2}/tests/converters/plaintext/test_date_formatter.py +0 -0
  70. {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.2}/tests/converters/plaintext/test_date_parser.py +0 -0
  71. {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.2}/tests/converters/plaintext/test_journal_formatter.py +0 -0
  72. {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.2}/tests/converters/plaintext/test_journal_parser.py +0 -0
  73. {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.2}/tests/converters/plaintext/test_page_formatter.py +0 -0
  74. {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.2}/tests/converters/plaintext/test_page_parser.py +0 -0
  75. {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.2}/tests/logic/functions/test_comparator.py +0 -0
  76. {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.2}/tests/logic/functions/test_journal_article_matcher.py +0 -0
  77. {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.2}/tests/logic/test_default_models.py +0 -0
  78. {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.2}/tests/logic/test_models.py +0 -0
  79. {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.2}/tests/logic/test_setup.py +0 -0
  80. {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.2}/tests/processing/test_bulk_operation_styles.py +0 -0
  81. {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.2}/tests/shared.py +0 -0
  82. {philoch_bib_sdk-0.3.9 → philoch_bib_sdk-0.4.2}/tests/test_tautology.py +0 -0
@@ -27,6 +27,37 @@ version = "1.0.4"
27
27
  source = "registry+https://github.com/rust-lang/crates.io-index"
28
28
  checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801"
29
29
 
30
+ [[package]]
31
+ name = "crossbeam-deque"
32
+ version = "0.8.6"
33
+ source = "registry+https://github.com/rust-lang/crates.io-index"
34
+ checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51"
35
+ dependencies = [
36
+ "crossbeam-epoch",
37
+ "crossbeam-utils",
38
+ ]
39
+
40
+ [[package]]
41
+ name = "crossbeam-epoch"
42
+ version = "0.9.18"
43
+ source = "registry+https://github.com/rust-lang/crates.io-index"
44
+ checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e"
45
+ dependencies = [
46
+ "crossbeam-utils",
47
+ ]
48
+
49
+ [[package]]
50
+ name = "crossbeam-utils"
51
+ version = "0.8.21"
52
+ source = "registry+https://github.com/rust-lang/crates.io-index"
53
+ checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28"
54
+
55
+ [[package]]
56
+ name = "either"
57
+ version = "1.15.0"
58
+ source = "registry+https://github.com/rust-lang/crates.io-index"
59
+ checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719"
60
+
30
61
  [[package]]
31
62
  name = "getrandom"
32
63
  version = "0.3.4"
@@ -77,10 +108,12 @@ checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d"
77
108
 
78
109
  [[package]]
79
110
  name = "philoch_bib_sdk"
80
- version = "0.1.6"
111
+ version = "0.4.2"
81
112
  dependencies = [
82
113
  "ahash",
83
114
  "pyo3",
115
+ "rayon",
116
+ "strsim",
84
117
  ]
85
118
 
86
119
  [[package]]
@@ -100,11 +133,10 @@ dependencies = [
100
133
 
101
134
  [[package]]
102
135
  name = "pyo3"
103
- version = "0.22.6"
136
+ version = "0.25.1"
104
137
  source = "registry+https://github.com/rust-lang/crates.io-index"
105
- checksum = "f402062616ab18202ae8319da13fa4279883a2b8a9d9f83f20dbade813ce1884"
138
+ checksum = "8970a78afe0628a3e3430376fc5fd76b6b45c4d43360ffd6cdd40bdde72b682a"
106
139
  dependencies = [
107
- "cfg-if",
108
140
  "indoc",
109
141
  "libc",
110
142
  "memoffset",
@@ -118,9 +150,9 @@ dependencies = [
118
150
 
119
151
  [[package]]
120
152
  name = "pyo3-build-config"
121
- version = "0.22.6"
153
+ version = "0.25.1"
122
154
  source = "registry+https://github.com/rust-lang/crates.io-index"
123
- checksum = "b14b5775b5ff446dd1056212d778012cbe8a0fbffd368029fd9e25b514479c38"
155
+ checksum = "458eb0c55e7ece017adeba38f2248ff3ac615e53660d7c71a238d7d2a01c7598"
124
156
  dependencies = [
125
157
  "once_cell",
126
158
  "target-lexicon",
@@ -128,9 +160,9 @@ dependencies = [
128
160
 
129
161
  [[package]]
130
162
  name = "pyo3-ffi"
131
- version = "0.22.6"
163
+ version = "0.25.1"
132
164
  source = "registry+https://github.com/rust-lang/crates.io-index"
133
- checksum = "9ab5bcf04a2cdcbb50c7d6105de943f543f9ed92af55818fd17b660390fc8636"
165
+ checksum = "7114fe5457c61b276ab77c5055f206295b812608083644a5c5b2640c3102565c"
134
166
  dependencies = [
135
167
  "libc",
136
168
  "pyo3-build-config",
@@ -138,9 +170,9 @@ dependencies = [
138
170
 
139
171
  [[package]]
140
172
  name = "pyo3-macros"
141
- version = "0.22.6"
173
+ version = "0.25.1"
142
174
  source = "registry+https://github.com/rust-lang/crates.io-index"
143
- checksum = "0fd24d897903a9e6d80b968368a34e1525aeb719d568dba8b3d4bfa5dc67d453"
175
+ checksum = "a8725c0a622b374d6cb051d11a0983786448f7785336139c3c94f5aa6bef7e50"
144
176
  dependencies = [
145
177
  "proc-macro2",
146
178
  "pyo3-macros-backend",
@@ -150,9 +182,9 @@ dependencies = [
150
182
 
151
183
  [[package]]
152
184
  name = "pyo3-macros-backend"
153
- version = "0.22.6"
185
+ version = "0.25.1"
154
186
  source = "registry+https://github.com/rust-lang/crates.io-index"
155
- checksum = "36c011a03ba1e50152b4b394b479826cad97e7a21eb52df179cd91ac411cbfbe"
187
+ checksum = "4109984c22491085343c05b0dbc54ddc405c3cf7b4374fc533f5c3313a572ccc"
156
188
  dependencies = [
157
189
  "heck",
158
190
  "proc-macro2",
@@ -176,12 +208,38 @@ version = "5.3.0"
176
208
  source = "registry+https://github.com/rust-lang/crates.io-index"
177
209
  checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f"
178
210
 
211
+ [[package]]
212
+ name = "rayon"
213
+ version = "1.11.0"
214
+ source = "registry+https://github.com/rust-lang/crates.io-index"
215
+ checksum = "368f01d005bf8fd9b1206fb6fa653e6c4a81ceb1466406b81792d87c5677a58f"
216
+ dependencies = [
217
+ "either",
218
+ "rayon-core",
219
+ ]
220
+
221
+ [[package]]
222
+ name = "rayon-core"
223
+ version = "1.13.0"
224
+ source = "registry+https://github.com/rust-lang/crates.io-index"
225
+ checksum = "22e18b0f0062d30d4230b2e85ff77fdfe4326feb054b9783a3460d8435c8ab91"
226
+ dependencies = [
227
+ "crossbeam-deque",
228
+ "crossbeam-utils",
229
+ ]
230
+
179
231
  [[package]]
180
232
  name = "rustversion"
181
233
  version = "1.0.22"
182
234
  source = "registry+https://github.com/rust-lang/crates.io-index"
183
235
  checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d"
184
236
 
237
+ [[package]]
238
+ name = "strsim"
239
+ version = "0.11.1"
240
+ source = "registry+https://github.com/rust-lang/crates.io-index"
241
+ checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"
242
+
185
243
  [[package]]
186
244
  name = "syn"
187
245
  version = "2.0.108"
@@ -195,9 +253,9 @@ dependencies = [
195
253
 
196
254
  [[package]]
197
255
  name = "target-lexicon"
198
- version = "0.12.16"
256
+ version = "0.13.4"
199
257
  source = "registry+https://github.com/rust-lang/crates.io-index"
200
- checksum = "61c41af27dd6d1e27b1b16b489db798443478cef1f06a660c96db617ba5de3b1"
258
+ checksum = "b1dd07eb858a2067e2f3c7155d54e929265c264e6f37efe3ee7a8d1b5a1dd0ba"
201
259
 
202
260
  [[package]]
203
261
  name = "unicode-ident"
@@ -1,25 +1,30 @@
1
1
  [package]
2
- name = "rust_scorer"
3
- version = "0.3.9"
2
+ name = "philoch_bib_sdk"
3
+ version = "0.4.2"
4
4
  edition = "2021"
5
+ readme = "README.md"
5
6
 
6
7
  [lib]
7
- name = "rust_scorer"
8
+ name = "_rust"
8
9
  crate-type = ["cdylib"]
9
10
 
10
11
  [dependencies]
11
12
  pyo3 = "0.25.0"
13
+ ahash = "0.8"
12
14
  rayon = "1.11.0"
13
15
  strsim = "0.11.1"
14
16
 
17
+ [profile.release]
18
+ opt-level = 3
19
+ lto = true
20
+ codegen-units = 1
21
+
15
22
  [lints.clippy]
16
23
  all = "warn"
17
- # Type safety: prevent silent numeric conversions
18
24
  cast_possible_truncation = "warn"
19
25
  cast_sign_loss = "warn"
20
26
  cast_possible_wrap = "warn"
21
27
  cast_lossless = "warn"
22
- # Efficiency
23
28
  redundant_clone = "warn"
24
29
 
25
30
  [lints.rust]
@@ -1,6 +1,11 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: philoch-bib-sdk
3
- Version: 0.3.9
3
+ Version: 0.4.2
4
+ Requires-Dist: aletk>=0.1.6
5
+ Requires-Dist: attrs>=25.3.0
6
+ Requires-Dist: polars>=1.32.3
7
+ Requires-Dist: pydantic>=2.11.9
8
+ Requires-Dist: cytoolz>=1.0.1
4
9
  License-File: LICENSE
5
10
  Summary: Standard development kit for the Philosophie Bibliography project
6
11
  Author-email: Luis Alejandro Bordo García <luis.bordo@philosophie.ch>
@@ -1,9 +1,56 @@
1
- """Type stubs for rust_scorer - high-performance fuzzy matching for BibItems."""
1
+ """Type stubs for philoch_bib_sdk._rust - Rust extension module.
2
+
3
+ This module provides high-performance Rust implementations for:
4
+ - Building search indexes for fuzzy matching
5
+ - Batch fuzzy scoring of bibliographic items
6
+ """
2
7
 
3
8
  from typing import TypedDict
4
9
 
10
+ # === Index Building Types ===
11
+
12
+ class ItemData(TypedDict):
13
+ """Input data for a single bibliographic item (for index building)."""
14
+
15
+ item_index: int
16
+ doi: str | None
17
+ title: str
18
+ author_surnames: list[str]
19
+ year: int | None
20
+ journal_name: str | None
21
+
22
+ class IndexData:
23
+ """Output index data structure from build_index_rust."""
24
+
25
+ doi_to_index: dict[str, int]
26
+ trigram_to_indices: dict[str, list[int]]
27
+ surname_to_indices: dict[str, list[int]]
28
+ decade_to_indices: dict[int | None, list[int]]
29
+ journal_to_indices: dict[str, list[int]]
30
+
31
+ def build_index_rust(items_data: list[ItemData]) -> IndexData:
32
+ """Build index for fuzzy matching.
33
+
34
+ Args:
35
+ items_data: List of ItemData dicts with bibliographic info
36
+
37
+ Returns:
38
+ IndexData with all indexes built for fast lookup
39
+ """
40
+ ...
41
+
42
+ def hello_rust() -> str:
43
+ """A simple test function to verify Rust integration works.
44
+
45
+ Returns:
46
+ A greeting string from Rust
47
+ """
48
+ ...
49
+
50
+ # === Scorer Types ===
51
+
5
52
  class BibItemData(TypedDict):
6
- """Input data for a single BibItem."""
53
+ """Input data for a single BibItem (for scoring)."""
7
54
 
8
55
  index: int
9
56
  title: str
@@ -27,11 +27,11 @@ from philoch_bib_sdk.logic.models_staging import PartialScore, ScoreComponent
27
27
 
28
28
 
29
29
  if TYPE_CHECKING:
30
- from rust_scorer import BibItemData
30
+ from philoch_bib_sdk._rust import BibItemData, ItemData
31
31
 
32
32
  # Try to import Rust scorer for batch processing
33
33
  try:
34
- import rust_scorer
34
+ from philoch_bib_sdk import _rust as rust_scorer
35
35
 
36
36
  _RUST_SCORER_AVAILABLE = True
37
37
  except ImportError:
@@ -124,7 +124,7 @@ def _get_decade(date: BibItemDateAttr | str) -> int | None:
124
124
  return None
125
125
 
126
126
 
127
- def _prepare_items_for_rust(bibitems: Sequence[BibItem]) -> list[dict[str, Any]]:
127
+ def _prepare_items_for_rust(bibitems: Sequence[BibItem]) -> "list[ItemData]":
128
128
  """Extract minimal data needed by Rust build_index_rust.
129
129
 
130
130
  Args:
@@ -134,7 +134,7 @@ def _prepare_items_for_rust(bibitems: Sequence[BibItem]) -> list[dict[str, Any]]
134
134
  List of dicts with minimal data for Rust
135
135
  """
136
136
 
137
- items_data = []
137
+ items_data: list[ItemData] = []
138
138
  for i, item in enumerate(bibitems):
139
139
  # Extract title string
140
140
  title_attr = item.title
@@ -297,7 +297,7 @@ def build_index(bibitems: Sequence[BibItem]) -> BibItemBlockIndex:
297
297
  """
298
298
  # Try to use Rust implementation
299
299
  try:
300
- from philoch_bib_sdk._rust import build_index_rust # type: ignore[import-not-found]
300
+ from philoch_bib_sdk._rust import build_index_rust
301
301
 
302
302
  use_rust = True
303
303
  except ImportError:
@@ -1,4 +1,4 @@
1
- # This file is automatically @generated by Poetry 2.1.4 and should not be changed by hand.
1
+ # This file is automatically @generated by Poetry 2.3.0 and should not be changed by hand.
2
2
 
3
3
  [[package]]
4
4
  name = "aletk"
@@ -1125,7 +1125,7 @@ fqdn = {version = "*", optional = true, markers = "extra == \"format-nongpl\""}
1125
1125
  idna = {version = "*", optional = true, markers = "extra == \"format-nongpl\""}
1126
1126
  isoduration = {version = "*", optional = true, markers = "extra == \"format-nongpl\""}
1127
1127
  jsonpointer = {version = ">1.13", optional = true, markers = "extra == \"format-nongpl\""}
1128
- jsonschema-specifications = ">=2023.03.6"
1128
+ jsonschema-specifications = ">=2023.3.6"
1129
1129
  referencing = ">=0.28.4"
1130
1130
  rfc3339-validator = {version = "*", optional = true, markers = "extra == \"format-nongpl\""}
1131
1131
  rfc3986-validator = {version = ">0.1.0", optional = true, markers = "extra == \"format-nongpl\""}
@@ -3344,4 +3344,4 @@ files = [
3344
3344
  [metadata]
3345
3345
  lock-version = "2.1"
3346
3346
  python-versions = ">=3.13"
3347
- content-hash = "6495f61e85ccdea28bf3110c075adcfbcc75ce3d9b380569cb73b53ed8165b9c"
3347
+ content-hash = "ab873fe5c14fba4553b3606fd8e8971b77087cd9d370db1a01b1fee1f82c36d9"
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "philoch-bib-sdk"
3
- version = "0.3.9"
3
+ version = "0.4.2"
4
4
  description = "Standard development kit for the Philosophie Bibliography project"
5
5
  authors = [
6
6
  {name = "Luis Alejandro Bordo García", email = "luis.bordo@philosophie.ch"}
@@ -11,6 +11,13 @@ maintainers = [
11
11
  license = {text = "MIT"}
12
12
  readme = "README.md"
13
13
  requires-python = ">=3.13"
14
+ dependencies = [
15
+ "aletk>=0.1.6",
16
+ "attrs>=25.3.0",
17
+ "polars>=1.32.3",
18
+ "pydantic>=2.11.9",
19
+ "cytoolz>=1.0.1",
20
+ ]
14
21
 
15
22
  [build-system]
16
23
  requires = ["maturin>=1.0,<2.0"]
@@ -1,9 +1,155 @@
1
1
  use pyo3::prelude::*;
2
+ use pyo3::types::{PyDict, PyList};
2
3
  use rayon::prelude::*;
3
4
  use std::cmp::Ordering;
4
- use std::collections::BinaryHeap;
5
+ use std::collections::{BinaryHeap, HashSet};
5
6
  use strsim::jaro_winkler;
6
7
 
8
+ /// Input data for a single bibliographic item
9
+ #[derive(Debug, FromPyObject)]
10
+ struct ItemData {
11
+ #[pyo3(item)]
12
+ item_index: usize,
13
+ #[pyo3(item)]
14
+ doi: Option<String>,
15
+ #[pyo3(item)]
16
+ title: String,
17
+ #[pyo3(item)]
18
+ author_surnames: Vec<String>,
19
+ #[pyo3(item)]
20
+ year: Option<i32>,
21
+ #[pyo3(item)]
22
+ journal_name: Option<String>,
23
+ }
24
+
25
+ /// Output index data structure
26
+ #[pyclass]
27
+ struct IndexData {
28
+ #[pyo3(get)]
29
+ doi_to_index: Py<PyDict>,
30
+ #[pyo3(get)]
31
+ trigram_to_indices: Py<PyDict>,
32
+ #[pyo3(get)]
33
+ surname_to_indices: Py<PyDict>,
34
+ #[pyo3(get)]
35
+ decade_to_indices: Py<PyDict>,
36
+ #[pyo3(get)]
37
+ journal_to_indices: Py<PyDict>,
38
+ }
39
+
40
+ /// Extract trigrams from text
41
+ fn extract_trigrams(text: &str) -> HashSet<String> {
42
+ let normalized = text.to_lowercase();
43
+ let chars: Vec<char> = normalized.chars().collect();
44
+
45
+ if chars.len() < 3 {
46
+ return HashSet::new();
47
+ }
48
+
49
+ let mut trigrams = HashSet::new();
50
+ for i in 0..=chars.len() - 3 {
51
+ let trigram: String = chars[i..i + 3].iter().collect();
52
+ trigrams.insert(trigram);
53
+ }
54
+
55
+ trigrams
56
+ }
57
+
58
+ /// Calculate decade from year
59
+ fn get_decade(year: Option<i32>) -> Option<i32> {
60
+ year.map(|y| (y / 10) * 10)
61
+ }
62
+
63
+ /// Build index for fuzzy matching
64
+ #[pyfunction]
65
+ fn build_index_rust(py: Python, items_data: Vec<ItemData>) -> PyResult<IndexData> {
66
+ use ahash::AHashMap;
67
+
68
+ // Pre-allocate with capacity hints
69
+ let capacity = items_data.len();
70
+ let mut doi_map: AHashMap<String, usize> = AHashMap::with_capacity(capacity);
71
+ let mut trigram_map: AHashMap<String, Vec<usize>> = AHashMap::new();
72
+ let mut surname_map: AHashMap<String, Vec<usize>> = AHashMap::new();
73
+ let mut decade_map: AHashMap<Option<i32>, Vec<usize>> = AHashMap::new();
74
+ let mut journal_map: AHashMap<String, Vec<usize>> = AHashMap::new();
75
+
76
+ // Single pass over all items
77
+ for item in items_data {
78
+ let idx = item.item_index;
79
+
80
+ // DOI index
81
+ if let Some(doi) = item.doi {
82
+ doi_map.insert(doi, idx);
83
+ }
84
+
85
+ // Title trigram index
86
+ let trigrams = extract_trigrams(&item.title);
87
+ for trigram in trigrams {
88
+ trigram_map.entry(trigram).or_default().push(idx);
89
+ }
90
+
91
+ // Author surname index
92
+ for surname in item.author_surnames {
93
+ let normalized = surname.to_lowercase().trim().to_string();
94
+ if !normalized.is_empty() {
95
+ surname_map.entry(normalized).or_default().push(idx);
96
+ }
97
+ }
98
+
99
+ // Year decade index
100
+ let decade = get_decade(item.year);
101
+ decade_map.entry(decade).or_default().push(idx);
102
+
103
+ // Journal index
104
+ if let Some(journal) = item.journal_name {
105
+ let normalized = journal.to_lowercase().trim().to_string();
106
+ if !normalized.is_empty() {
107
+ journal_map.entry(normalized).or_default().push(idx);
108
+ }
109
+ }
110
+ }
111
+
112
+ // Convert to Python dicts
113
+ let doi_dict = PyDict::new(py);
114
+ for (k, v) in doi_map {
115
+ doi_dict.set_item(k, v)?;
116
+ }
117
+
118
+ let trigram_dict = PyDict::new(py);
119
+ for (k, v) in trigram_map {
120
+ let py_list = PyList::new(py, &v)?;
121
+ trigram_dict.set_item(k, py_list)?;
122
+ }
123
+
124
+ let surname_dict = PyDict::new(py);
125
+ for (k, v) in surname_map {
126
+ let py_list = PyList::new(py, &v)?;
127
+ surname_dict.set_item(k, py_list)?;
128
+ }
129
+
130
+ let decade_dict = PyDict::new(py);
131
+ for (k, v) in decade_map {
132
+ let py_list = PyList::new(py, &v)?;
133
+ decade_dict.set_item(k, py_list)?;
134
+ }
135
+
136
+ let journal_dict = PyDict::new(py);
137
+ for (k, v) in journal_map {
138
+ let py_list = PyList::new(py, &v)?;
139
+ journal_dict.set_item(k, py_list)?;
140
+ }
141
+
142
+ Ok(IndexData {
143
+ doi_to_index: doi_dict.into(),
144
+ trigram_to_indices: trigram_dict.into(),
145
+ surname_to_indices: surname_dict.into(),
146
+ decade_to_indices: decade_dict.into(),
147
+ journal_to_indices: journal_dict.into(),
148
+ })
149
+ }
150
+
151
+ // === SCORER FUNCTIONALITY (merged from rust_scorer) ===
152
+
7
153
  /// Normalize text: lowercase and collapse whitespace
8
154
  fn normalize(s: &str) -> String {
9
155
  s.to_lowercase()
@@ -41,7 +187,7 @@ fn token_sort_ratio(s1: &str, s2: &str) -> f64 {
41
187
  token_sort_ratio_f64(s1, s2)
42
188
  }
43
189
 
44
- /// Input data for a single BibItem (simplified for Rust processing)
190
+ /// Input data for a single BibItem (for scoring)
45
191
  #[derive(Clone, Debug, FromPyObject)]
46
192
  #[pyo3(from_item_all)]
47
193
  struct BibItemData {
@@ -306,9 +452,21 @@ fn score_batch(
306
452
  .collect()
307
453
  }
308
454
 
309
- /// A Python module implemented in Rust for fast fuzzy matching.
455
+ // === END SCORER FUNCTIONALITY ===
456
+
457
+ /// A simple test function to verify Rust integration works
458
+ #[pyfunction]
459
+ fn hello_rust() -> PyResult<String> {
460
+ Ok("Hello from Rust!".to_string())
461
+ }
462
+
463
+ /// A Python module implemented in Rust.
310
464
  #[pymodule]
311
- fn rust_scorer(m: &Bound<'_, PyModule>) -> PyResult<()> {
465
+ fn _rust(m: &Bound<'_, PyModule>) -> PyResult<()> {
466
+ m.add_function(wrap_pyfunction!(hello_rust, m)?)?;
467
+ m.add_function(wrap_pyfunction!(build_index_rust, m)?)?;
468
+ m.add_class::<IndexData>()?;
469
+ // Scorer functions (merged from rust_scorer)
312
470
  m.add_function(wrap_pyfunction!(token_sort_ratio, m)?)?;
313
471
  m.add_function(wrap_pyfunction!(score_batch, m)?)?;
314
472
  Ok(())
@@ -318,6 +476,33 @@ fn rust_scorer(m: &Bound<'_, PyModule>) -> PyResult<()> {
318
476
  mod tests {
319
477
  use super::*;
320
478
 
479
+ #[test]
480
+ fn test_extract_trigrams() {
481
+ let text = "hello";
482
+ let trigrams = extract_trigrams(text);
483
+
484
+ assert_eq!(trigrams.len(), 3);
485
+ assert!(trigrams.contains("hel"));
486
+ assert!(trigrams.contains("ell"));
487
+ assert!(trigrams.contains("llo"));
488
+ }
489
+
490
+ #[test]
491
+ fn test_extract_trigrams_short() {
492
+ let text = "hi";
493
+ let trigrams = extract_trigrams(text);
494
+ assert_eq!(trigrams.len(), 0);
495
+ }
496
+
497
+ #[test]
498
+ fn test_get_decade() {
499
+ assert_eq!(get_decade(Some(1995)), Some(1990));
500
+ assert_eq!(get_decade(Some(2000)), Some(2000));
501
+ assert_eq!(get_decade(Some(2025)), Some(2020));
502
+ assert_eq!(get_decade(None), None);
503
+ }
504
+
505
+ // Scorer tests (merged from rust_scorer)
321
506
  #[test]
322
507
  fn test_token_sort_ratio_identical() {
323
508
  let score = token_sort_ratio("hello world", "hello world");
@@ -5,7 +5,7 @@ from typing import TYPE_CHECKING
5
5
  from philoch_bib_sdk.logic.default_models import default_bib_item
6
6
 
7
7
  if TYPE_CHECKING:
8
- from rust_scorer import BibItemData
8
+ from philoch_bib_sdk._rust import BibItemData
9
9
  from philoch_bib_sdk.logic.functions.fuzzy_matcher import (
10
10
  BibItemBlockIndex,
11
11
  build_index,
@@ -500,7 +500,7 @@ def test_rust_scorer_available() -> None:
500
500
 
501
501
  pytest.skip("Rust scorer not available")
502
502
 
503
- import rust_scorer
503
+ from philoch_bib_sdk import _rust as rust_scorer
504
504
 
505
505
  # Test basic function exists and works
506
506
  score = rust_scorer.token_sort_ratio("hello world", "world hello")
@@ -516,7 +516,7 @@ def test_rust_batch_scorer_basic() -> None:
516
516
 
517
517
  pytest.skip("Rust scorer not available")
518
518
 
519
- import rust_scorer
519
+ from philoch_bib_sdk import _rust as rust_scorer
520
520
 
521
521
  subjects: list[BibItemData] = [
522
522
  {
@@ -1,18 +0,0 @@
1
- [package]
2
- name = "philoch_bib_sdk"
3
- version = "0.1.6"
4
- edition = "2021"
5
- readme = "README.md"
6
-
7
- [lib]
8
- name = "_rust"
9
- crate-type = ["cdylib"]
10
-
11
- [dependencies]
12
- pyo3 = { version = "0.22", features = ["extension-module"] }
13
- ahash = "0.8"
14
-
15
- [profile.release]
16
- opt-level = 3
17
- lto = true
18
- codegen-units = 1