philoch-bib-sdk 0.3.9__cp313-cp313-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. philoch_bib_sdk/__init__.py +0 -0
  2. philoch_bib_sdk/_rust.cp313-win_amd64.pyd +0 -0
  3. philoch_bib_sdk/adapters/io/__init__.py +115 -0
  4. philoch_bib_sdk/adapters/io/csv/__init__.py +308 -0
  5. philoch_bib_sdk/adapters/io/ods/__init__.py +145 -0
  6. philoch_bib_sdk/adapters/plaintext/bibitem_reader.py +0 -0
  7. philoch_bib_sdk/adapters/tabular_data/read_journal_volume_number_index.py +58 -0
  8. philoch_bib_sdk/converters/latex.py +6 -0
  9. philoch_bib_sdk/converters/plaintext/author/formatter.py +34 -0
  10. philoch_bib_sdk/converters/plaintext/author/parser.py +83 -0
  11. philoch_bib_sdk/converters/plaintext/bib_string_formatter.py +8 -0
  12. philoch_bib_sdk/converters/plaintext/bibitem/bibkey_formatter.py +21 -0
  13. philoch_bib_sdk/converters/plaintext/bibitem/bibkey_parser.py +158 -0
  14. philoch_bib_sdk/converters/plaintext/bibitem/date_formatter.py +37 -0
  15. philoch_bib_sdk/converters/plaintext/bibitem/date_parser.py +62 -0
  16. philoch_bib_sdk/converters/plaintext/bibitem/formatter.py +182 -0
  17. philoch_bib_sdk/converters/plaintext/bibitem/pages_formatter.py +13 -0
  18. philoch_bib_sdk/converters/plaintext/bibitem/pages_parser.py +63 -0
  19. philoch_bib_sdk/converters/plaintext/bibitem/parser.py +415 -0
  20. philoch_bib_sdk/converters/plaintext/journal/formatter.py +25 -0
  21. philoch_bib_sdk/converters/plaintext/journal/parser.py +36 -0
  22. philoch_bib_sdk/converters/plaintext/shared/renderable_formatter.py +25 -0
  23. philoch_bib_sdk/interfaces/cli/__init__.py +3 -0
  24. philoch_bib_sdk/interfaces/cli/fuzzy_matching.py +135 -0
  25. philoch_bib_sdk/logic/__init__.py +39 -0
  26. philoch_bib_sdk/logic/default_models.py +315 -0
  27. philoch_bib_sdk/logic/functions/__init__.py +31 -0
  28. philoch_bib_sdk/logic/functions/comparator.py +414 -0
  29. philoch_bib_sdk/logic/functions/fuzzy_matcher.py +796 -0
  30. philoch_bib_sdk/logic/functions/journal_article_matcher.py +44 -0
  31. philoch_bib_sdk/logic/literals.py +98 -0
  32. philoch_bib_sdk/logic/models.py +366 -0
  33. philoch_bib_sdk/logic/models_staging.py +173 -0
  34. philoch_bib_sdk/procedures/fuzzy_matching.py +112 -0
  35. philoch_bib_sdk/py.typed +0 -0
  36. philoch_bib_sdk/rust_scorer/Cargo.lock +232 -0
  37. philoch_bib_sdk/rust_scorer/Cargo.toml +26 -0
  38. philoch_bib_sdk/rust_scorer/pyproject.toml +15 -0
  39. philoch_bib_sdk/rust_scorer/rust_scorer.pyi +65 -0
  40. philoch_bib_sdk/rust_scorer/src/lib.rs +362 -0
  41. philoch_bib_sdk-0.3.9.dist-info/METADATA +15 -0
  42. philoch_bib_sdk-0.3.9.dist-info/RECORD +44 -0
  43. philoch_bib_sdk-0.3.9.dist-info/WHEEL +4 -0
  44. philoch_bib_sdk-0.3.9.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,112 @@
1
+ """Fuzzy matching procedure for bibliographic items.
2
+
3
+ This module provides the main orchestration logic for fuzzy matching staged BibItems
4
+ against an existing bibliography. It uses dependency injection to remain agnostic
5
+ to specific IO formats (CSV, JSON, etc.).
6
+ """
7
+
8
+ from typing import Callable, Dict, Tuple
9
+
10
+ from aletk.ResultMonad import Err, Ok, try_except_wrapper
11
+ from aletk.utils import get_logger
12
+
13
+ from philoch_bib_sdk.logic.functions.fuzzy_matcher import (
14
+ build_index,
15
+ stage_bibitems_batch,
16
+ )
17
+ from philoch_bib_sdk.logic.models import BibItem
18
+ from philoch_bib_sdk.logic.models_staging import BibItemStaged
19
+
20
logger = get_logger(__name__)

# Signatures of the IO callables injected into the procedure. Each one is
# handed a path and answers with an Ok/Err result, which keeps the procedure
# independent of the concrete file format.
Bibliography = Dict[str, BibItem]  # Key is formatted bibkey
LoadBibliographyFn = Callable[[str], Ok[Bibliography] | Err]  # path -> bibliography
LoadStagedFn = Callable[[str], Ok[Tuple[BibItem, ...]] | Err]  # path -> staged items
WriteReportFn = Callable[[str, Tuple[BibItemStaged, ...]], Ok[None] | Err]  # (path, staged results) -> nothing
27
+
28
+
29
@try_except_wrapper(logger)
def fuzzy_match_procedure(
    bibliography_path: str,
    staged_path: str,
    output_path: str,
    load_bibliography: LoadBibliographyFn,
    load_staged: LoadStagedFn,
    write_report: WriteReportFn,
    top_n: int = 5,
    min_score: float = 0.0,
) -> None:
    """Run the fuzzy matching workflow end to end using injected IO functions.

    The procedure loads the bibliography and the staged items, builds a fuzzy
    matching index over the bibliography, looks up matches for every staged
    item, logs summary statistics and finally hands the results to the report
    writer. When there are no staged items it stops early: no index is built
    and ``write_report`` is never called.

    Args:
        bibliography_path: Path passed to ``load_bibliography``.
        staged_path: Path passed to ``load_staged``.
        output_path: Path passed to ``write_report`` (without extension).
        load_bibliography: Callable that loads the bibliography from a path.
        load_staged: Callable that loads the staged items from a path.
        write_report: Callable that writes the matched results to a path.
        top_n: Number of top matches to find per staged item.
        min_score: Minimum score threshold for matches.

    Returns:
        None when the workflow completes.

    Raises:
        RuntimeError: If any injected callable returns an ``Err``.
            NOTE(review): the function is decorated with
            ``try_except_wrapper``, which presumably converts raised
            exceptions into a result value for the caller; confirm against
            the aletk documentation.
    """
    logger.info("Starting fuzzy matching procedure")
    logger.info(f"Bibliography: {bibliography_path}")
    logger.info(f"Staged items: {staged_path}")
    logger.info(f"Output: {output_path}")
    logger.info(f"Parameters: top_n={top_n}, min_score={min_score}")

    # Step a: Load bibliography
    logger.info("Loading bibliography...")
    loaded_bibliography = load_bibliography(bibliography_path)
    if isinstance(loaded_bibliography, Err):
        raise RuntimeError(f"Failed to load bibliography: {loaded_bibliography.message}")
    bibliography = loaded_bibliography.out
    logger.info(f"Loaded {len(bibliography)} items from bibliography")

    # Step b: Load staged items
    logger.info("Loading staged items...")
    loaded_staged = load_staged(staged_path)
    if isinstance(loaded_staged, Err):
        raise RuntimeError(f"Failed to load staged items: {loaded_staged.message}")
    staged_items = loaded_staged.out
    logger.info(f"Loaded {len(staged_items)} staged items")

    # Nothing to match: stop before building the index or writing a report.
    if not staged_items:
        logger.warning("No staged items to process")
        return

    # Step c: Build fuzzy matching index from bibliography
    logger.info("Building fuzzy matching index...")
    # The index is built from the bibliography's values as a tuple; the
    # bibkey keys of the mapping are not passed along.
    index = build_index(tuple(bibliography.values()))
    logger.info("Index built successfully")

    # Step d: Process each staged item to find matches
    logger.info("Processing staged items...")
    staged_with_matches = stage_bibitems_batch(staged_items, index, top_n=top_n, min_score=min_score)
    logger.info(f"Processed {len(staged_with_matches)} items")

    # Summary statistics; the averages guard against an empty result.
    match_total = sum(len(staged.top_matches) for staged in staged_with_matches)
    if staged_with_matches:
        mean_matches = match_total / len(staged_with_matches)
    else:
        mean_matches = 0
    logger.info(f"Found {match_total} total matches (avg {mean_matches:.2f} per item)")

    if staged_with_matches:
        elapsed_total = sum(staged.search_metadata["search_time_ms"] for staged in staged_with_matches)
        mean_elapsed = elapsed_total / len(staged_with_matches)
        logger.info(f"Average search time: {mean_elapsed:.0f}ms per item")

    # Step e: Write report
    logger.info("Writing report...")
    report_outcome = write_report(output_path, staged_with_matches)
    if isinstance(report_outcome, Err):
        raise RuntimeError(f"Failed to write report: {report_outcome.message}")

    logger.info("Fuzzy matching procedure completed successfully")
File without changes
@@ -0,0 +1,232 @@
1
+ # This file is automatically @generated by Cargo.
2
+ # It is not intended for manual editing.
3
+ version = 4
4
+
5
+ [[package]]
6
+ name = "autocfg"
7
+ version = "1.5.0"
8
+ source = "registry+https://github.com/rust-lang/crates.io-index"
9
+ checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8"
10
+
11
+ [[package]]
12
+ name = "crossbeam-deque"
13
+ version = "0.8.6"
14
+ source = "registry+https://github.com/rust-lang/crates.io-index"
15
+ checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51"
16
+ dependencies = [
17
+ "crossbeam-epoch",
18
+ "crossbeam-utils",
19
+ ]
20
+
21
+ [[package]]
22
+ name = "crossbeam-epoch"
23
+ version = "0.9.18"
24
+ source = "registry+https://github.com/rust-lang/crates.io-index"
25
+ checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e"
26
+ dependencies = [
27
+ "crossbeam-utils",
28
+ ]
29
+
30
+ [[package]]
31
+ name = "crossbeam-utils"
32
+ version = "0.8.21"
33
+ source = "registry+https://github.com/rust-lang/crates.io-index"
34
+ checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28"
35
+
36
+ [[package]]
37
+ name = "either"
38
+ version = "1.15.0"
39
+ source = "registry+https://github.com/rust-lang/crates.io-index"
40
+ checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719"
41
+
42
+ [[package]]
43
+ name = "heck"
44
+ version = "0.5.0"
45
+ source = "registry+https://github.com/rust-lang/crates.io-index"
46
+ checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
47
+
48
+ [[package]]
49
+ name = "indoc"
50
+ version = "2.0.7"
51
+ source = "registry+https://github.com/rust-lang/crates.io-index"
52
+ checksum = "79cf5c93f93228cf8efb3ba362535fb11199ac548a09ce117c9b1adc3030d706"
53
+ dependencies = [
54
+ "rustversion",
55
+ ]
56
+
57
+ [[package]]
58
+ name = "libc"
59
+ version = "0.2.180"
60
+ source = "registry+https://github.com/rust-lang/crates.io-index"
61
+ checksum = "bcc35a38544a891a5f7c865aca548a982ccb3b8650a5b06d0fd33a10283c56fc"
62
+
63
+ [[package]]
64
+ name = "memoffset"
65
+ version = "0.9.1"
66
+ source = "registry+https://github.com/rust-lang/crates.io-index"
67
+ checksum = "488016bfae457b036d996092f6cb448677611ce4449e970ceaf42695203f218a"
68
+ dependencies = [
69
+ "autocfg",
70
+ ]
71
+
72
+ [[package]]
73
+ name = "once_cell"
74
+ version = "1.21.3"
75
+ source = "registry+https://github.com/rust-lang/crates.io-index"
76
+ checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d"
77
+
78
+ [[package]]
79
+ name = "portable-atomic"
80
+ version = "1.13.0"
81
+ source = "registry+https://github.com/rust-lang/crates.io-index"
82
+ checksum = "f89776e4d69bb58bc6993e99ffa1d11f228b839984854c7daeb5d37f87cbe950"
83
+
84
+ [[package]]
85
+ name = "proc-macro2"
86
+ version = "1.0.105"
87
+ source = "registry+https://github.com/rust-lang/crates.io-index"
88
+ checksum = "535d180e0ecab6268a3e718bb9fd44db66bbbc256257165fc699dadf70d16fe7"
89
+ dependencies = [
90
+ "unicode-ident",
91
+ ]
92
+
93
+ [[package]]
94
+ name = "pyo3"
95
+ version = "0.25.1"
96
+ source = "registry+https://github.com/rust-lang/crates.io-index"
97
+ checksum = "8970a78afe0628a3e3430376fc5fd76b6b45c4d43360ffd6cdd40bdde72b682a"
98
+ dependencies = [
99
+ "indoc",
100
+ "libc",
101
+ "memoffset",
102
+ "once_cell",
103
+ "portable-atomic",
104
+ "pyo3-build-config",
105
+ "pyo3-ffi",
106
+ "pyo3-macros",
107
+ "unindent",
108
+ ]
109
+
110
+ [[package]]
111
+ name = "pyo3-build-config"
112
+ version = "0.25.1"
113
+ source = "registry+https://github.com/rust-lang/crates.io-index"
114
+ checksum = "458eb0c55e7ece017adeba38f2248ff3ac615e53660d7c71a238d7d2a01c7598"
115
+ dependencies = [
116
+ "once_cell",
117
+ "target-lexicon",
118
+ ]
119
+
120
+ [[package]]
121
+ name = "pyo3-ffi"
122
+ version = "0.25.1"
123
+ source = "registry+https://github.com/rust-lang/crates.io-index"
124
+ checksum = "7114fe5457c61b276ab77c5055f206295b812608083644a5c5b2640c3102565c"
125
+ dependencies = [
126
+ "libc",
127
+ "pyo3-build-config",
128
+ ]
129
+
130
+ [[package]]
131
+ name = "pyo3-macros"
132
+ version = "0.25.1"
133
+ source = "registry+https://github.com/rust-lang/crates.io-index"
134
+ checksum = "a8725c0a622b374d6cb051d11a0983786448f7785336139c3c94f5aa6bef7e50"
135
+ dependencies = [
136
+ "proc-macro2",
137
+ "pyo3-macros-backend",
138
+ "quote",
139
+ "syn",
140
+ ]
141
+
142
+ [[package]]
143
+ name = "pyo3-macros-backend"
144
+ version = "0.25.1"
145
+ source = "registry+https://github.com/rust-lang/crates.io-index"
146
+ checksum = "4109984c22491085343c05b0dbc54ddc405c3cf7b4374fc533f5c3313a572ccc"
147
+ dependencies = [
148
+ "heck",
149
+ "proc-macro2",
150
+ "pyo3-build-config",
151
+ "quote",
152
+ "syn",
153
+ ]
154
+
155
+ [[package]]
156
+ name = "quote"
157
+ version = "1.0.43"
158
+ source = "registry+https://github.com/rust-lang/crates.io-index"
159
+ checksum = "dc74d9a594b72ae6656596548f56f667211f8a97b3d4c3d467150794690dc40a"
160
+ dependencies = [
161
+ "proc-macro2",
162
+ ]
163
+
164
+ [[package]]
165
+ name = "rayon"
166
+ version = "1.11.0"
167
+ source = "registry+https://github.com/rust-lang/crates.io-index"
168
+ checksum = "368f01d005bf8fd9b1206fb6fa653e6c4a81ceb1466406b81792d87c5677a58f"
169
+ dependencies = [
170
+ "either",
171
+ "rayon-core",
172
+ ]
173
+
174
+ [[package]]
175
+ name = "rayon-core"
176
+ version = "1.13.0"
177
+ source = "registry+https://github.com/rust-lang/crates.io-index"
178
+ checksum = "22e18b0f0062d30d4230b2e85ff77fdfe4326feb054b9783a3460d8435c8ab91"
179
+ dependencies = [
180
+ "crossbeam-deque",
181
+ "crossbeam-utils",
182
+ ]
183
+
184
+ [[package]]
185
+ name = "rust_scorer"
186
+ version = "0.1.0"
187
+ dependencies = [
188
+ "pyo3",
189
+ "rayon",
190
+ "strsim",
191
+ ]
192
+
193
+ [[package]]
194
+ name = "rustversion"
195
+ version = "1.0.22"
196
+ source = "registry+https://github.com/rust-lang/crates.io-index"
197
+ checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d"
198
+
199
+ [[package]]
200
+ name = "strsim"
201
+ version = "0.11.1"
202
+ source = "registry+https://github.com/rust-lang/crates.io-index"
203
+ checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"
204
+
205
+ [[package]]
206
+ name = "syn"
207
+ version = "2.0.114"
208
+ source = "registry+https://github.com/rust-lang/crates.io-index"
209
+ checksum = "d4d107df263a3013ef9b1879b0df87d706ff80f65a86ea879bd9c31f9b307c2a"
210
+ dependencies = [
211
+ "proc-macro2",
212
+ "quote",
213
+ "unicode-ident",
214
+ ]
215
+
216
+ [[package]]
217
+ name = "target-lexicon"
218
+ version = "0.13.4"
219
+ source = "registry+https://github.com/rust-lang/crates.io-index"
220
+ checksum = "b1dd07eb858a2067e2f3c7155d54e929265c264e6f37efe3ee7a8d1b5a1dd0ba"
221
+
222
+ [[package]]
223
+ name = "unicode-ident"
224
+ version = "1.0.22"
225
+ source = "registry+https://github.com/rust-lang/crates.io-index"
226
+ checksum = "9312f7c4f6ff9069b165498234ce8be658059c6728633667c526e27dc2cf1df5"
227
+
228
+ [[package]]
229
+ name = "unindent"
230
+ version = "0.2.4"
231
+ source = "registry+https://github.com/rust-lang/crates.io-index"
232
+ checksum = "7264e107f553ccae879d21fbea1d6724ac785e8c3bfc762137959b5802826ef3"
@@ -0,0 +1,26 @@
1
+ [package]
2
+ name = "rust_scorer"
3
+ version = "0.3.9"
4
+ edition = "2021"
5
+
6
+ [lib]
7
+ name = "rust_scorer"
8
+ crate-type = ["cdylib"]
9
+
10
+ [dependencies]
11
+ pyo3 = "0.25.0"
12
+ rayon = "1.11.0"
13
+ strsim = "0.11.1"
14
+
15
+ [lints.clippy]
16
+ all = "warn"
17
+ # Type safety: prevent silent numeric conversions
18
+ cast_possible_truncation = "warn"
19
+ cast_sign_loss = "warn"
20
+ cast_possible_wrap = "warn"
21
+ cast_lossless = "warn"
22
+ # Efficiency
23
+ redundant_clone = "warn"
24
+
25
+ [lints.rust]
26
+ warnings = "deny"
@@ -0,0 +1,15 @@
1
+ [build-system]
2
+ requires = ["maturin>=1.9,<2.0"]
3
+ build-backend = "maturin"
4
+
5
+ [project]
6
+ name = "rust_scorer"
7
+ requires-python = ">=3.8"
8
+ classifiers = [
9
+ "Programming Language :: Rust",
10
+ "Programming Language :: Python :: Implementation :: CPython",
11
+ "Programming Language :: Python :: Implementation :: PyPy",
12
+ ]
13
+ dynamic = ["version"]
14
+ [tool.maturin]
15
+ features = ["pyo3/extension-module"]
@@ -0,0 +1,65 @@
1
+ """Type stubs for rust_scorer - high-performance fuzzy matching for BibItems."""
2
+
3
+ from typing import TypedDict
4
+
5
+ class BibItemData(TypedDict):
6
+ """Input data for a single BibItem."""
7
+
8
+ index: int
9
+ title: str
10
+ author: str
11
+ year: int | None
12
+ doi: str | None
13
+ journal: str | None
14
+ volume: str | None
15
+ number: str | None
16
+ pages: str | None
17
+ publisher: str | None
18
+
19
class MatchResult(TypedDict):
    """Score breakdown for one candidate compared against a subject."""

    # identifies the scored candidate; presumably the candidate's
    # BibItemData ``index`` (confirm against lib.rs)
    candidate_index: int
    # overall score; how the components below are weighted is not visible
    # from this stub
    total_score: float
    title_score: float
    author_score: float
    date_score: float
    bonus_score: float
28
+
29
class SubjectMatchResult(TypedDict):
    """Top matches found for one subject, plus search bookkeeping."""

    # identifies the subject; presumably the subject's BibItemData ``index``
    # (confirm against lib.rs)
    subject_index: int
    # the top matches retained for this subject
    matches: list[MatchResult]
    # count of candidates examined for this subject
    candidates_searched: int
35
+
36
def token_sort_ratio(s1: str, s2: str) -> float:
    """Compute the token sort ratio of two strings via Jaro-Winkler similarity.

    Args:
        s1: First string to compare.
        s2: Second string to compare.

    Returns:
        Similarity score from 0.0 to 100.0.
    """
    ...
47
+
48
def score_batch(
    subjects: list[BibItemData],
    candidates: list[BibItemData],
    top_n: int,
    min_score: float,
) -> list[SubjectMatchResult]:
    """Score many subjects against the candidates in parallel, in one batch.

    Args:
        subjects: BibItems to find matches for.
        candidates: BibItems to match against.
        top_n: Maximum number of matches to return per subject.
        min_score: Minimum score threshold for matches.

    Returns:
        One result per subject, each containing that subject's top matches.
    """
    ...