philoch-bib-sdk 0.3.9__cp313-cp313-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. philoch_bib_sdk/__init__.py +0 -0
  2. philoch_bib_sdk/_rust.cp313-win_amd64.pyd +0 -0
  3. philoch_bib_sdk/adapters/io/__init__.py +115 -0
  4. philoch_bib_sdk/adapters/io/csv/__init__.py +308 -0
  5. philoch_bib_sdk/adapters/io/ods/__init__.py +145 -0
  6. philoch_bib_sdk/adapters/plaintext/bibitem_reader.py +0 -0
  7. philoch_bib_sdk/adapters/tabular_data/read_journal_volume_number_index.py +58 -0
  8. philoch_bib_sdk/converters/latex.py +6 -0
  9. philoch_bib_sdk/converters/plaintext/author/formatter.py +34 -0
  10. philoch_bib_sdk/converters/plaintext/author/parser.py +83 -0
  11. philoch_bib_sdk/converters/plaintext/bib_string_formatter.py +8 -0
  12. philoch_bib_sdk/converters/plaintext/bibitem/bibkey_formatter.py +21 -0
  13. philoch_bib_sdk/converters/plaintext/bibitem/bibkey_parser.py +158 -0
  14. philoch_bib_sdk/converters/plaintext/bibitem/date_formatter.py +37 -0
  15. philoch_bib_sdk/converters/plaintext/bibitem/date_parser.py +62 -0
  16. philoch_bib_sdk/converters/plaintext/bibitem/formatter.py +182 -0
  17. philoch_bib_sdk/converters/plaintext/bibitem/pages_formatter.py +13 -0
  18. philoch_bib_sdk/converters/plaintext/bibitem/pages_parser.py +63 -0
  19. philoch_bib_sdk/converters/plaintext/bibitem/parser.py +415 -0
  20. philoch_bib_sdk/converters/plaintext/journal/formatter.py +25 -0
  21. philoch_bib_sdk/converters/plaintext/journal/parser.py +36 -0
  22. philoch_bib_sdk/converters/plaintext/shared/renderable_formatter.py +25 -0
  23. philoch_bib_sdk/interfaces/cli/__init__.py +3 -0
  24. philoch_bib_sdk/interfaces/cli/fuzzy_matching.py +135 -0
  25. philoch_bib_sdk/logic/__init__.py +39 -0
  26. philoch_bib_sdk/logic/default_models.py +315 -0
  27. philoch_bib_sdk/logic/functions/__init__.py +31 -0
  28. philoch_bib_sdk/logic/functions/comparator.py +414 -0
  29. philoch_bib_sdk/logic/functions/fuzzy_matcher.py +796 -0
  30. philoch_bib_sdk/logic/functions/journal_article_matcher.py +44 -0
  31. philoch_bib_sdk/logic/literals.py +98 -0
  32. philoch_bib_sdk/logic/models.py +366 -0
  33. philoch_bib_sdk/logic/models_staging.py +173 -0
  34. philoch_bib_sdk/procedures/fuzzy_matching.py +112 -0
  35. philoch_bib_sdk/py.typed +0 -0
  36. philoch_bib_sdk/rust_scorer/Cargo.lock +232 -0
  37. philoch_bib_sdk/rust_scorer/Cargo.toml +26 -0
  38. philoch_bib_sdk/rust_scorer/pyproject.toml +15 -0
  39. philoch_bib_sdk/rust_scorer/rust_scorer.pyi +65 -0
  40. philoch_bib_sdk/rust_scorer/src/lib.rs +362 -0
  41. philoch_bib_sdk-0.3.9.dist-info/METADATA +15 -0
  42. philoch_bib_sdk-0.3.9.dist-info/RECORD +44 -0
  43. philoch_bib_sdk-0.3.9.dist-info/WHEEL +4 -0
  44. philoch_bib_sdk-0.3.9.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,414 @@
1
+ from aletk.utils import get_logger, fuzzy_match_score, remove_extra_whitespace
2
+
3
+ from typing import Tuple, TypedDict
4
+ from philoch_bib_sdk.converters.plaintext.author.formatter import format_author
5
+ from philoch_bib_sdk.logic.models import BibItem, BibItemDateAttr, BibStringAttr, TBibString
6
+ from philoch_bib_sdk.logic.models_staging import PartialScore, ScoreComponent
7
+
8
+
9
# Module-level logger obtained from aletk's get_logger, keyed on this module's __name__.
logger = get_logger(__name__)
10
+
11
+
12
class BibItemScore(TypedDict):
    """Integer score breakdown for one reference/subject comparison."""

    # Overall score for the pair.
    score: int
    # Title component of the score.
    score_title: int
    # Author component of the score.
    score_author: int
    # Year component of the score.
    score_year: int
17
+
18
+
19
class ScoredBibItems(TypedDict):
    """A compared pair of bibliography items together with its score breakdown."""

    # The item that was compared against.
    reference: BibItem
    # The item that was compared to the reference.
    subject: BibItem
    # Score breakdown for the pair.
    score: BibItemScore
23
+
24
+
25
# Keywords that the title scorers penalise when they occur in only one of the
# two titles being compared (matched as lower-case substrings).
UNDESIRED_TITLE_KEYWORDS = ["errata", "review"]
26
+
27
+
28
def _score_title(title_1: str, title_2: str) -> int:
    """Return a fuzzy similarity score for two titles, adjusted by bonus and penalty.

    Both titles are whitespace-normalised and lower-cased before comparison.
    A bonus of 100 is added when the fuzzy score exceeds 85 or one title is
    contained in the other, provided no undesired keyword appears in only one
    of the titles. Each undesired keyword found in exactly one title costs 50.

    Args:
        title_1: First title to compare.
        title_2: Second title to compare.

    Returns:
        The adjusted title score.

    Raises:
        ValueError: If either title is empty after normalisation.
    """
    left = remove_extra_whitespace(title_1).lower()
    right = remove_extra_whitespace(title_2).lower()

    if not (left and right):
        raise ValueError("Titles cannot be empty for comparison")

    score = fuzzy_match_score(left, right)

    # Containment may catch the case where only one side carries the subtitle.
    contained = left in right or right in left

    # Keywords present in exactly one of the two titles.
    one_sided = {kw for kw in UNDESIRED_TITLE_KEYWORDS if (kw in left) != (kw in right)}

    if not one_sided and (score > 85 or contained):
        score += 100

    for _keyword in one_sided:
        score -= 50

    return score
60
+
61
+
62
def _score_author(author_1_full_name: str, author_2_full_name: str) -> int:
    """Return a fuzzy similarity score for two author strings.

    Extra whitespace is removed from both names before matching. A bonus of
    100 is added when the fuzzy score exceeds 85.

    Args:
        author_1_full_name: First formatted author string.
        author_2_full_name: Second formatted author string.

    Returns:
        The fuzzy score, plus 100 when it is above 85.

    Raises:
        ValueError: If either author string is empty after normalisation.
    """
    first = remove_extra_whitespace(author_1_full_name)
    second = remove_extra_whitespace(author_2_full_name)

    if not (first and second):
        raise ValueError("Authors cannot be empty for comparison")

    similarity = fuzzy_match_score(first, second)
    return similarity + 100 if similarity > 85 else similarity
78
+
79
+
80
+ def _score_year(year_1: int, year_2: int, range_offset: int = 1) -> int:
81
+
82
+ if not year_1 or not year_2:
83
+ raise ValueError("Years cannot be empty for comparison")
84
+
85
+ if not any(isinstance(year, int) for year in (year_1, year_2)):
86
+ if year_1 == year_2:
87
+ return 100
88
+ else:
89
+ return 0
90
+
91
+ range = [year_1 - range_offset, year_1, year_1 + range_offset]
92
+
93
+ if year_2 in range:
94
+ return 100
95
+ else:
96
+ return 0
97
+
98
+
99
def compare_bibitems(reference: BibItem, subject: BibItem, bibstring_type: TBibString) -> ScoredBibItems:
    """
    Score how closely `subject` matches `reference` on title, author and year.

    Title and author are compared with fuzzy matching on the `bibstring_type`
    variant of each field. The year component comes from `_score_year` (called
    with its default tolerance) and is 0 unless both dates are
    `BibItemDateAttr` instances. The overall score is the plain sum of the
    three components.

    Exceptions raised by the component scorers (e.g. `ValueError` for an empty
    title or author) propagate to the caller.
    """

    logger.debug(f"Scoring bibitems: {reference}, {subject}")

    title_score = _score_title(
        getattr(reference.title, bibstring_type),
        getattr(subject.title, bibstring_type),
    )

    author_score = _score_author(
        format_author(reference.author, bibstring_type),
        format_author(subject.author, bibstring_type),
    )

    # The year only contributes when both items carry a structured date.
    year_score = 0
    if isinstance(reference.date, BibItemDateAttr) and isinstance(subject.date, BibItemDateAttr):
        year_score = _score_year(reference.date.year, subject.date.year)

    breakdown: BibItemScore = {
        "score": title_score + author_score + year_score,
        "score_title": title_score,
        "score_author": author_score,
        "score_year": year_score,
    }

    return {
        "reference": reference,
        "subject": subject,
        "score": breakdown,
    }
136
+
137
+
138
+ # Enhanced scoring functions with detailed breakdown for fuzzy matching
139
+
140
+
141
def _score_title_detailed(title_1: str, title_2: str, weight: float = 0.5) -> PartialScore:
    """Score title similarity and explain how the score was reached.

    Titles are whitespace-normalised and lower-cased. The fuzzy score gets a
    +100 bonus when it exceeds 85 or one title contains the other, unless an
    undesired keyword occurs in only one title; in that case 50 is subtracted
    per such keyword instead.

    Args:
        title_1: First title to compare
        title_2: Second title to compare
        weight: Weight to apply to the score (default 0.5 = 50%)

    Returns:
        PartialScore for the TITLE component with the adjusted score, the
        weight, the weighted score and a "; "-joined explanation. Empty titles
        yield a zero score rather than an exception.
    """
    left = remove_extra_whitespace(title_1).lower()
    right = remove_extra_whitespace(title_2).lower()

    if not (left and right):
        return PartialScore(
            component=ScoreComponent.TITLE,
            score=0,
            weight=weight,
            weighted_score=0.0,
            details="Empty title(s)",
        )

    fuzzy = fuzzy_match_score(left, right)
    contained = left in right or right in left

    # Undesired keywords that appear in exactly one of the two titles.
    one_sided = frozenset(kw for kw in UNDESIRED_TITLE_KEYWORDS if (kw in left) != (kw in right))

    total = fuzzy
    notes = [f"Fuzzy: {fuzzy}"]

    # Bonus and penalty are mutually exclusive: any one-sided keyword blocks the bonus.
    if one_sided:
        penalty = len(one_sided) * 50
        total -= penalty
        notes.append(f"Undesired keyword mismatch: -{penalty}")
    elif fuzzy > 85 or contained:
        total += 100
        notes.append("High similarity bonus: +100")

    return PartialScore(
        component=ScoreComponent.TITLE,
        score=total,
        weight=weight,
        weighted_score=total * weight,
        details="; ".join(notes),
    )
192
+
193
+
194
def _score_author_detailed(author_1: str, author_2: str, weight: float = 0.3) -> PartialScore:
    """Score author similarity and explain how the score was reached.

    Extra whitespace is removed from both strings before fuzzy matching; a
    fuzzy score above 85 earns a +100 bonus.

    Args:
        author_1: First author string to compare
        author_2: Second author string to compare
        weight: Weight to apply to the score (default 0.3 = 30%)

    Returns:
        PartialScore for the AUTHOR component with the adjusted score, the
        weight, the weighted score and an explanation. Empty author strings
        yield a zero score rather than an exception.
    """
    first = remove_extra_whitespace(author_1)
    second = remove_extra_whitespace(author_2)

    if not (first and second):
        return PartialScore(
            component=ScoreComponent.AUTHOR,
            score=0,
            weight=weight,
            weighted_score=0.0,
            details="Empty author(s)",
        )

    fuzzy = fuzzy_match_score(first, second)
    notes = [f"Fuzzy: {fuzzy}"]
    total = fuzzy

    if fuzzy > 85:
        total += 100
        notes.append("High similarity bonus: +100")

    return PartialScore(
        component=ScoreComponent.AUTHOR,
        score=total,
        weight=weight,
        weighted_score=total * weight,
        details="; ".join(notes),
    )
233
+
234
+
235
def _score_date_detailed(
    date_1: BibItemDateAttr | str, date_2: BibItemDateAttr | str, weight: float = 0.1
) -> PartialScore:
    """Score date similarity by year and explain how the score was reached.

    Scoring ladder (first rule that applies wins):
      * either date equals "no date"            -> 0
      * either date is not a BibItemDateAttr    -> 0
      * identical years                         -> 100
      * years differ by 1, 2 or 3               -> 90, 80, 70
      * same decade                             -> 30
      * anything else                           -> 0

    Args:
        date_1: First date (BibItemDateAttr or "no date")
        date_2: Second date (BibItemDateAttr or "no date")
        weight: Weight to apply to the score (default 0.1 = 10%)

    Returns:
        PartialScore for the DATE component with score, weight, weighted score
        and an explanation.
    """

    def _partial(score: int, weighted_score: float, details: str) -> PartialScore:
        # Every outcome shares the component and weight; only these three vary.
        return PartialScore(
            component=ScoreComponent.DATE,
            score=score,
            weight=weight,
            weighted_score=weighted_score,
            details=details,
        )

    if date_1 == "no date" or date_2 == "no date":
        return _partial(0, 0.0, "Missing date(s)")

    if not (isinstance(date_1, BibItemDateAttr) and isinstance(date_2, BibItemDateAttr)):
        return _partial(0, 0.0, "Invalid date type")

    year_1, year_2 = date_1.year, date_2.year

    if year_1 == year_2:
        return _partial(100, 100.0 * weight, f"Exact year match: {year_1}")

    # Tolerate small gaps (up to 3 years), e.g. reprints or later editions.
    year_diff = abs(year_1 - year_2)
    if year_diff <= 3:
        score = 100 - (year_diff * 10)  # 90 / 80 / 70 for a gap of 1 / 2 / 3
        return _partial(score, score * weight, f"Close years: {year_1} vs {year_2} (diff: {year_diff})")

    # Further apart, but still within the same decade: partial credit.
    if year_1 // 10 == year_2 // 10:
        return _partial(30, 30.0 * weight, f"Same decade: {year_1} vs {year_2}")

    return _partial(0, 0.0, f"Different years: {year_1} vs {year_2}")
313
+
314
+
315
def _score_bonus_fields(reference: BibItem, subject: BibItem, weight: float = 0.1) -> PartialScore:
    """Score bonus fields (DOI, journal+volume+number, pages, publisher).

    Points are additive:
      * +100 identical, non-empty DOI
      * +50  same journal name (simplified, case-insensitive) with equal,
             non-empty volume and number
      * +20  identical sequence of page-range start values
      * +10  publisher names (simplified, lower-cased) with fuzzy score > 85

    Args:
        reference: Reference BibItem
        subject: Subject BibItem to compare
        weight: Weight to apply to the score (default 0.1 = 10%)

    Returns:
        PartialScore with the combined bonus score and details. It is reported
        under ScoreComponent.PUBLISHER, which serves as the generic bonus
        component.
    """
    awarded: list[tuple[int, str]] = []

    # DOI match (highest confidence)
    if reference.doi and subject.doi and reference.doi == subject.doi:
        awarded.append((100, "DOI exact match: +100"))

    # Journal + Volume + Number match
    if reference.journal and subject.journal:
        ref_journal = reference.journal.name.simplified.lower()
        subj_journal = subject.journal.name.simplified.lower()
        same_issue = (
            ref_journal == subj_journal
            and reference.volume
            and subject.volume
            and reference.number
            and subject.number
            and reference.volume == subject.volume
            and reference.number == subject.number
        )
        if same_issue:
            awarded.append((50, "Journal+Vol+Num match: +50"))

    # Pages: only the start of each page range is compared, and the full
    # sequence of starts must be identical.
    if reference.pages and subject.pages:
        ref_starts = " ".join(str(page.start) for page in reference.pages)
        subj_starts = " ".join(str(page.start) for page in subject.pages)
        if ref_starts and ref_starts == subj_starts:
            awarded.append((20, "Page match: +20"))

    # Publisher match
    if reference.publisher and subject.publisher:
        ref_pub = reference.publisher.simplified.lower()
        subj_pub = subject.publisher.simplified.lower()
        if ref_pub and subj_pub and fuzzy_match_score(ref_pub, subj_pub) > 85:
            awarded.append((10, "Publisher match: +10"))

    bonus_score = sum(points for points, _ in awarded)

    return PartialScore(
        component=ScoreComponent.PUBLISHER,  # Using PUBLISHER as generic bonus component
        score=bonus_score,
        weight=weight,
        weighted_score=bonus_score * weight,
        details="; ".join(label for _, label in awarded) if awarded else "No bonus matches",
    )
369
+
370
+
371
def compare_bibitems_detailed(
    reference: BibItem,
    subject: BibItem,
    bibstring_type: TBibString = "simplified",
    weights: tuple[float, float, float, float] = (0.5, 0.3, 0.1, 0.1),
) -> Tuple[PartialScore, ...]:
    """Compare two BibItems and return one PartialScore per scoring component.

    Args:
        reference: Reference BibItem to compare against
        subject: Subject BibItem to compare
        bibstring_type: Which bibstring variant to use (default: "simplified")
        weights: Tuple of weights (title, author, date, bonus). They are
            expected to sum to 1.0; this function does not validate that.

    Returns:
        Tuple of PartialScore objects in the order
        (title, author, date, bonus).
    """

    def _title_text(title: object) -> str:
        # Titles may be a BibStringAttr or a plain (possibly empty) value.
        if isinstance(title, BibStringAttr):
            return getattr(title, bibstring_type)
        return str(title) if title else ""

    weight_title, weight_author, weight_date, weight_bonus = weights

    return (
        _score_title_detailed(
            _title_text(reference.title),
            _title_text(subject.title),
            weight_title,
        ),
        _score_author_detailed(
            format_author(reference.author, bibstring_type),
            format_author(subject.author, bibstring_type),
            weight_author,
        ),
        _score_date_detailed(reference.date, subject.date, weight_date),
        _score_bonus_fields(reference, subject, weight_bonus),
    )