philoch-bib-sdk 0.1.2__py3-none-any.whl → 0.1.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (27) hide show
  1. philoch_bib_sdk/adapters/tabular_data/read_journal_volume_number_index.py +59 -0
  2. philoch_bib_sdk/converters/latex.py +6 -0
  3. philoch_bib_sdk/converters/plaintext/author/formatter.py +31 -0
  4. philoch_bib_sdk/converters/plaintext/author/parser.py +72 -0
  5. philoch_bib_sdk/converters/plaintext/bib_string_formatter.py +8 -0
  6. philoch_bib_sdk/converters/plaintext/bibitem/bibkey_formatter.py +21 -0
  7. philoch_bib_sdk/converters/plaintext/bibitem/bibkey_parser.py +144 -0
  8. philoch_bib_sdk/converters/plaintext/bibitem/date_formatter.py +37 -0
  9. philoch_bib_sdk/converters/plaintext/bibitem/date_parser.py +62 -0
  10. philoch_bib_sdk/converters/plaintext/bibitem/formatter.py +182 -0
  11. philoch_bib_sdk/converters/plaintext/bibitem/pages_formatter.py +13 -0
  12. philoch_bib_sdk/converters/plaintext/bibitem/pages_parser.py +63 -0
  13. philoch_bib_sdk/converters/plaintext/bibitem/parser.py +3 -0
  14. philoch_bib_sdk/converters/plaintext/journal/formatter.py +25 -0
  15. philoch_bib_sdk/converters/plaintext/journal/parser.py +36 -0
  16. philoch_bib_sdk/converters/plaintext/shared/renderable_formatter.py +25 -0
  17. philoch_bib_sdk/logic/default_models.py +315 -0
  18. philoch_bib_sdk/logic/functions/comparator.py +134 -0
  19. philoch_bib_sdk/logic/functions/journal_article_matcher.py +40 -0
  20. philoch_bib_sdk/logic/literals.py +7 -3
  21. philoch_bib_sdk/logic/models.py +226 -219
  22. {philoch_bib_sdk-0.1.2.dist-info → philoch_bib_sdk-0.1.4.dist-info}/METADATA +2 -1
  23. philoch_bib_sdk-0.1.4.dist-info/RECORD +28 -0
  24. {philoch_bib_sdk-0.1.2.dist-info → philoch_bib_sdk-0.1.4.dist-info}/WHEEL +1 -1
  25. philoch_bib_sdk-0.1.4.dist-info/entry_points.txt +3 -0
  26. philoch_bib_sdk-0.1.2.dist-info/RECORD +0 -8
  27. {philoch_bib_sdk-0.1.2.dist-info → philoch_bib_sdk-0.1.4.dist-info}/LICENSE +0 -0
@@ -0,0 +1,63 @@
1
+ import re
2
+ import traceback
3
+ from typing import Tuple
4
+ from aletk.utils import remove_extra_whitespace
5
+ from aletk.ResultMonad import Ok, Err
6
+
7
+ from philoch_bib_sdk.logic.models import PageAttr
8
+
9
+
10
def is_valid_roman(raw_str: str) -> bool:
    """
    Check whether a string is a well-formed Roman numeral (case-insensitive).

    TODO: TBD, decide if we want to control if the pages are in roman numbers.

    Args:
        raw_str: Candidate numeral, e.g. "xiv" or "MCMXCIV".

    Returns:
        True if `raw_str` is a non-empty Roman numeral in standard subtractive
        notation (I up to MMMCMXCIX), False otherwise.
    """
    # Every group of the pattern is optional, so the empty string would match;
    # reject it explicitly.
    if not raw_str:
        return False

    pattern = r"M{0,3}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})"
    # fullmatch (instead of match + "$") also rejects a trailing newline.
    return re.fullmatch(pattern, raw_str.upper()) is not None
17
+
18
+
19
def _parse_single_page_attr(
    text: str,
) -> PageAttr:
    """
    Turn one page specification into a PageAttr.

    Accepted forms are a single page ("<page>") or a range written with a
    double hyphen ("<start>--<end>"). A single page yields an empty `end`.

    Raises:
        ValueError: if the text contains a hyphen but no double hyphen, or if
            splitting on "--" does not give exactly two parts.
    """
    is_range = "--" in text

    # A lone "-" is not an accepted range separator.
    if not is_range and "-" in text:
        raise ValueError(f"Unexpected page format found in '{text}'. Expected either '<start>--<end>' or '<page>'.")

    if not is_range:
        return PageAttr(start=remove_extra_whitespace(text), end="")

    parts = remove_extra_whitespace(text).split("--")
    if len(parts) != 2:
        raise ValueError(f"Unexpected number of page parts found in '{text}': '{parts}'. Expected exactly 2.")

    first, last = (remove_extra_whitespace(part) for part in parts)
    return PageAttr(start=first, end=last)
40
+
41
+
42
def parse_pages(text: str) -> Ok[Tuple[PageAttr, ...]] | Err:
    """
    Parse a string of pages into a tuple of PageAttr objects.
    The input string is expected to be a comma-separated list of page attributes, with each attribute in the format "<start>--<end>" or "<page>".

    Returns:
        Ok with an empty tuple when `text` is empty or only whitespace,
        Ok with one PageAttr per comma-separated part otherwise,
        or Err (code -1, with exception class name and traceback) if anything raises while parsing.
    """
    try:
        # Whitespace-only input carries no page information; treat it like the
        # empty string instead of producing a PageAttr with a blank start page.
        if text.strip() == "":
            return Ok(())

        parts = (remove_extra_whitespace(part) for part in text.split(","))
        # Materialise inside the try block so that parsing errors from the
        # individual parts are converted into an Err below.
        page_attrs = tuple(_parse_single_page_attr(part) for part in parts)

        return Ok(page_attrs)

    except Exception as e:
        error_message = f"Error parsing pages from '{text}': {e}"
        return Err(
            error_message,
            code=-1,
            error_type=f"{e.__class__.__name__}",
            error_trace=traceback.format_exc(),
        )
@@ -0,0 +1,3 @@
1
+ from aletk.utils import get_logger
2
+
3
+ lgr = get_logger(__name__)
@@ -0,0 +1,25 @@
1
+ from aletk.utils import get_logger
2
+ from philoch_bib_sdk.logic.models import Journal, Maybe, TBibString
3
+
4
+ lgr = get_logger(__name__)
5
+
6
+
7
+ def format_journal(journal: Maybe[Journal], bibstring_type: TBibString) -> str:
8
+ """
9
+ Format a journal object into a string representation.
10
+ """
11
+
12
+ match journal:
13
+
14
+ case None:
15
+ return ""
16
+
17
+ case Journal(name, id):
18
+
19
+ if not name:
20
+ return ""
21
+
22
+ return f"{getattr(name, bibstring_type)}"
23
+
24
+ case _:
25
+ raise TypeError(f"Invalid type for journal: {type(journal)}. Dump: {journal!r}")
@@ -0,0 +1,36 @@
1
+ import traceback
2
+ from aletk.ResultMonad import Ok, Err
3
+ from aletk.utils import get_logger, remove_extra_whitespace
4
+ from philoch_bib_sdk.logic.models import Journal, BibStringAttr, TBibString
5
+
6
+ lgr = get_logger(__name__)
7
+
8
+
9
def parse_journal(text: str, bibstring_type: TBibString) -> Ok[Journal | None] | Err:
    """
    Parse a journal string into a Journal object.

    Args:
        text: Raw journal name.
        bibstring_type: Name of the BibStringAttr field that receives the normalized name.

    Returns:
        Ok(None) when `text` is empty or only whitespace,
        Ok(Journal) with the normalized name and empty ISSNs otherwise,
        or Err (code -1, with exception class name and traceback) if anything raises.
    """
    try:
        # Whitespace-only input has no journal name; treat it like the empty
        # string instead of building a Journal with a blank name.
        if text.strip() == "":
            lgr.debug("Empty journal string, returning None.")
            return Ok(None)

        # Normalize the text by removing extra whitespace
        normalized_text = remove_extra_whitespace(text)

        journal = Journal(
            name=BibStringAttr(**{str(bibstring_type): normalized_text}),
            issn_electronic="",
            issn_print="",
        )

        return Ok(journal)

    except Exception as e:
        error_message = f"Error parsing journal string '{text}': {e}"
        return Err(
            message=error_message,
            code=-1,
            error_type=f"{e.__class__.__name__}",
            error_trace=traceback.format_exc(),
        )
@@ -0,0 +1,25 @@
1
+ from philoch_bib_sdk.logic.models import BaseRenderable, BaseNamedRenderable, TBibString
2
+
3
+
4
+ def format_renderable(
5
+ renderable: BaseRenderable | BaseNamedRenderable,
6
+ bibstring_type: TBibString,
7
+ ) -> str:
8
+ """
9
+ Format a base renderable object into a string representation.
10
+ """
11
+
12
+ match renderable:
13
+
14
+ case BaseRenderable(text, id):
15
+ if not text:
16
+ return ""
17
+ return f"{getattr(text, bibstring_type)}"
18
+
19
+ case BaseNamedRenderable(name, id):
20
+ if not name:
21
+ return ""
22
+ return f"{getattr(name, bibstring_type)}"
23
+
24
+ case _:
25
+ raise TypeError("Invalid type for renderable")
@@ -0,0 +1,315 @@
1
+ from typing import Tuple, TypedDict, Unpack, Literal
2
+ from philoch_bib_sdk.logic.models import BibItem, PageAttr, KeywordsAttr, BibItemDateAttr, BibKeyAttr, Keyword
3
+
4
+ from philoch_bib_sdk.logic.literals import TBasicPubState, TBibTeXEntryType, TEpoch, TLanguageID, TPubState
5
+ from philoch_bib_sdk.logic.models import (
6
+ Author,
7
+ BaseNamedRenderable,
8
+ BaseRenderable,
9
+ BibItem,
10
+ BibStringAttr,
11
+ Journal,
12
+ Keyword,
13
+ )
14
+
15
+
16
class BibStringArgs(TypedDict, total=False):
    """
    Keyword arguments accepted by `default_bib_string`. Every key is optional (total=False).
    """

    latex: str
    unicode: str
    simplified: str
20
+
21
+
22
def default_bib_string(**kwargs: Unpack[BibStringArgs]) -> BibStringAttr:
    """
    Build a BibStringAttr from any subset of its attributes; every attribute that is not supplied becomes the empty string.
    """
    latex = kwargs.get("latex", "")
    unicode = kwargs.get("unicode", "")
    simplified = kwargs.get("simplified", "")
    return BibStringAttr(latex=latex, unicode=unicode, simplified=simplified)
31
+
32
+
33
+ ############
34
+ # Base Renderables
35
+ ############
36
+
37
+
38
class BaseRenderableArgs(TypedDict, total=False):
    """
    Keyword arguments accepted by `default_base_renderable`. Every key is optional (total=False).
    """

    text: BibStringArgs
    id: int | None
41
+
42
+
43
def default_base_renderable(**kwargs: Unpack[BaseRenderableArgs]) -> BaseRenderable:
    """
    Build a BaseRenderable from any subset of its attributes. A missing `text` becomes a BibStringAttr of empty strings and a missing `id` becomes None.
    """
    text = default_bib_string(**kwargs.get("text", {}))
    identifier = kwargs.get("id", None)
    return BaseRenderable(text=text, id=identifier)
51
+
52
+
53
class BaseNamedRenderableArgs(TypedDict, total=False):
    """
    Keyword arguments accepted by `default_base_named_renderable`. Every key is optional (total=False).
    """

    name: BibStringArgs
    id: int | None
56
+
57
+
58
def default_base_named_renderable(**kwargs: Unpack[BaseNamedRenderableArgs]) -> BaseNamedRenderable:
    """
    Build a BaseNamedRenderable from any subset of its attributes. A missing `name` becomes a BibStringAttr of empty strings and a missing `id` becomes None.
    """
    name = default_bib_string(**kwargs.get("name", {}))
    identifier = kwargs.get("id", None)
    return BaseNamedRenderable(name=name, id=identifier)
66
+
67
+
68
+ ############
69
+ # Author
70
+ ############
71
+
72
+
73
class AuthorArgs(TypedDict, total=False):
    """
    Keyword arguments accepted by `default_author`. Every key is optional (total=False).
    """

    given_name: BibStringArgs
    family_name: BibStringArgs
    mononym: BibStringArgs
    shorthand: BibStringArgs
    famous_name: BibStringArgs
    publications: Tuple[BibItem, ...]
    id: int | None
81
+
82
+
83
def default_author(**kwargs: Unpack[AuthorArgs]) -> Author:
    """
    Build an Author from any subset of its attributes. Missing name parts become BibStringAttr objects of empty strings, missing `publications` an empty tuple, and a missing `id` None.
    """
    given_name = default_bib_string(**kwargs.get("given_name", {}))
    family_name = default_bib_string(**kwargs.get("family_name", {}))
    mononym = default_bib_string(**kwargs.get("mononym", {}))
    shorthand = default_bib_string(**kwargs.get("shorthand", {}))
    famous_name = default_bib_string(**kwargs.get("famous_name", {}))

    return Author(
        given_name=given_name,
        family_name=family_name,
        mononym=mononym,
        shorthand=shorthand,
        famous_name=famous_name,
        publications=kwargs.get("publications", ()),
        id=kwargs.get("id", None),
    )
97
+
98
+
99
+ ############
100
+ # Journal
101
+ ############
102
+
103
+
104
class JournalArgs(TypedDict, total=False):
    """
    Keyword arguments accepted by `default_journal`. Every key is optional (total=False).
    """

    name: BibStringArgs
    issn_print: str
    issn_electronic: str
    id: int | None
109
+
110
+
111
def default_journal(**kwargs: Unpack[JournalArgs]) -> Journal | None:
    """
    Build a Journal from any subset of its attributes, or return None when no attribute at all is given. Missing attributes default to empty strings (`id` to None).
    """
    # No arguments at all means "no journal", not "journal with blank fields".
    if not kwargs:
        return None

    name = default_bib_string(**kwargs.get("name", {}))
    return Journal(
        name=name,
        issn_print=kwargs.get("issn_print", ""),
        issn_electronic=kwargs.get("issn_electronic", ""),
        id=kwargs.get("id", None),
    )
124
+
125
+
126
+ ############
127
+ # Support Args
128
+ ############
129
+
130
+
131
class PageArgs(TypedDict, total=False):
    """
    Keyword arguments accepted by `default_page`. Every key is optional (total=False).
    """

    start: str
    end: str
134
+
135
+
136
def default_page(**kwargs: Unpack[PageArgs]) -> PageAttr:
    """
    Build a PageAttr; `start` and `end` default to the empty string when not supplied.
    """
    start = kwargs.get("start", "")
    end = kwargs.get("end", "")
    return PageAttr(start=start, end=end)
141
+
142
+
143
class KeywordsArgs(TypedDict, total=False):
    """
    Keyword arguments accepted by `default_keywords`. Every key is optional (total=False).
    """

    level_1: str
    level_2: str
    level_3: str
147
+
148
+
149
def default_keywords(**kwargs: Unpack[KeywordsArgs]) -> KeywordsAttr:
    """
    Build a KeywordsAttr, wrapping each level's name in a Keyword; a level that is not supplied gets the empty string as its name.
    """
    first = Keyword(name=kwargs.get("level_1", ""))
    second = Keyword(name=kwargs.get("level_2", ""))
    third = Keyword(name=kwargs.get("level_3", ""))
    return KeywordsAttr(level_1=first, level_2=second, level_3=third)
155
+
156
+
157
class BibItemDateArgs(TypedDict, total=False):
    """
    Keyword arguments accepted by `default_bib_item_date`. Every key is optional (total=False).
    """

    year: int
    year_part_2_hyphen: int | None
    year_part_2_slash: int | None
    month: int | None
    day: int | None
163
+
164
+
165
def default_bib_item_date(**kwargs: Unpack[BibItemDateArgs]) -> BibItemDateAttr:
    """
    Build a BibItemDateAttr; `year` defaults to 0 and every other component to None when not supplied.
    """
    year = kwargs.get("year", 0)
    return BibItemDateAttr(
        year=year,
        year_part_2_hyphen=kwargs.get("year_part_2_hyphen", None),
        year_part_2_slash=kwargs.get("year_part_2_slash", None),
        month=kwargs.get("month", None),
        day=kwargs.get("day", None),
    )
173
+
174
+
175
def parse_date(date: BibItemDateArgs | Literal["no date"]) -> BibItemDateAttr | Literal["no date"]:
    """
    Convert date arguments into a BibItemDateAttr. Anything that is not a dict is reported as "no date".
    """
    if not isinstance(date, dict):
        return "no date"
    return default_bib_item_date(**date)
180
+
181
+
182
class BibKeyArgs(TypedDict, total=False):
    """
    Keyword arguments accepted by `default_bib_key`. Every key is optional (total=False).
    """

    first_author: str
    other_authors: str
    date: int | TBasicPubState
    date_suffix: str
187
+
188
+
189
def default_bib_key(**kwargs: Unpack[BibKeyArgs]) -> BibKeyAttr:
    """
    Build a BibKeyAttr; every component that is not supplied defaults to the empty string.
    """
    first_author = kwargs.get("first_author", "")
    other_authors = kwargs.get("other_authors", "")
    date = kwargs.get("date", "")
    date_suffix = kwargs.get("date_suffix", "")
    return BibKeyAttr(
        first_author=first_author,
        other_authors=other_authors,
        date=date,
        date_suffix=date_suffix,
    )
197
+
198
+
199
+ ############
200
+ # BibItem Args
201
+ ############
202
+
203
+
204
class BibItemArgs(TypedDict, total=False):
    """
    Keyword arguments accepted by `default_bib_item`. Every key is optional (total=False).

    Keys written with a leading underscore are passed by `default_bib_item` to the
    BibItem field of the same name without the underscore (e.g. `_kws` -> `kws`).
    Nested values are given as the argument dictionaries of the corresponding
    `default_*` functions rather than as model instances.
    """

    _to_do_general: str
    _change_request: str
    entry_type: TBibTeXEntryType
    bibkey: BibKeyArgs
    author: Tuple[AuthorArgs, ...]
    editor: Tuple[AuthorArgs, ...]
    options: Tuple[str, ...]
    date: BibItemDateArgs | Literal["no date"]
    pubstate: TPubState
    title: BibStringArgs
    booktitle: BibStringArgs
    # crossref: dict
    journal: JournalArgs
    volume: str
    number: str
    pages: Tuple[PageArgs, ...]
    eid: str
    series: BaseNamedRenderableArgs
    address: BibStringArgs
    institution: BibStringArgs
    school: BibStringArgs
    publisher: BibStringArgs
    type: BibStringArgs
    edition: int
    note: BibStringArgs
    issuetitle: BibStringArgs
    _guesteditor: Tuple[AuthorArgs, ...]
    _extra_note: BibStringArgs
    urn: str
    eprint: str
    doi: str
    url: str
    _kws: KeywordsArgs
    _epoch: TEpoch
    _person: AuthorArgs
    _comm_for_profile_bib: str
    _langid: TLanguageID
    _lang_der: str
    _further_refs: Tuple[BibKeyArgs, ...]
    _depends_on: Tuple[BibKeyArgs, ...]
    _dltc_num: int
    _spec_interest: str
    _note_perso: str
    _note_stock: str
    _note_status: str
    _num_inwork_coll: int
    _num_inwork: str
    _num_coll: int
    _dltc_copyediting_note: str
    _note_missing: str
    _num_sort: int
    id: int
    _bib_info_source: str
258
+
259
+
260
def default_bib_item(**kwargs: Unpack[BibItemArgs]) -> BibItem:
    """
    Build a BibItem from any subset of the keys of BibItemArgs.

    Nested argument dictionaries are converted with the matching `default_*`
    function. When a key is absent: plain strings default to "", tuples to (),
    `entry_type` to "UNKNOWN", `date` to "no date", `journal` to None, nested
    single-object fields (bibkey, title, series, ...) to "", and the numeric
    fields and `id` to None. `crossref` is always set to "".
    """
    return BibItem(
        to_do_general=kwargs.get("_to_do_general", ""),
        change_request=kwargs.get("_change_request", ""),
        entry_type=kwargs.get("entry_type", "UNKNOWN"),
        bibkey=default_bib_key(**kwargs["bibkey"]) if "bibkey" in kwargs else "",
        author=tuple(default_author(**author_args) for author_args in kwargs.get("author", ())),
        editor=tuple(default_author(**editor_args) for editor_args in kwargs.get("editor", ())),
        options=kwargs.get("options", ()),
        date=parse_date(kwargs.get("date", "no date")),
        pubstate=kwargs.get("pubstate", ""),
        title=default_bib_string(**kwargs["title"]) if "title" in kwargs else "",
        booktitle=default_bib_string(**kwargs["booktitle"]) if "booktitle" in kwargs else "",
        crossref="",  # Crossref is not defined in the provided context, so we leave it as an empty string
        journal=default_journal(**kwargs["journal"]) if "journal" in kwargs else None,
        volume=kwargs.get("volume", ""),
        number=kwargs.get("number", ""),
        pages=tuple(default_page(**page_args) for page_args in kwargs.get("pages", ())),
        eid=kwargs.get("eid", ""),
        series=default_base_named_renderable(**kwargs["series"]) if "series" in kwargs else "",
        address=default_bib_string(**kwargs["address"]) if "address" in kwargs else "",
        institution=default_bib_string(**kwargs["institution"]) if "institution" in kwargs else "",
        school=default_bib_string(**kwargs["school"]) if "school" in kwargs else "",
        publisher=default_bib_string(**kwargs["publisher"]) if "publisher" in kwargs else "",
        type=default_bib_string(**kwargs["type"]) if "type" in kwargs else "",
        edition=kwargs.get("edition", None),
        note=default_bib_string(**kwargs["note"]) if "note" in kwargs else "",
        issuetitle=default_bib_string(**kwargs["issuetitle"]) if "issuetitle" in kwargs else "",
        guesteditor=tuple(default_author(**guest_args) for guest_args in kwargs.get("_guesteditor", ())),
        extra_note=default_bib_string(**kwargs["_extra_note"]) if "_extra_note" in kwargs else "",
        urn=kwargs.get("urn", ""),
        eprint=kwargs.get("eprint", ""),
        doi=kwargs.get("doi", ""),
        url=kwargs.get("url", ""),
        kws=default_keywords(**kwargs["_kws"]) if "_kws" in kwargs else "",
        epoch=kwargs.get("_epoch", ""),
        person=default_author(**kwargs["_person"]) if "_person" in kwargs else "",
        comm_for_profile_bib=kwargs.get("_comm_for_profile_bib", ""),
        langid=kwargs.get("_langid", ""),
        lang_der=kwargs.get("_lang_der", ""),
        further_refs=tuple(default_bib_key(**key_args) for key_args in kwargs.get("_further_refs", ())),
        depends_on=tuple(default_bib_key(**key_args) for key_args in kwargs.get("_depends_on", ())),
        dltc_num=kwargs.get("_dltc_num", None),
        spec_interest=kwargs.get("_spec_interest", ""),
        note_perso=kwargs.get("_note_perso", ""),
        note_stock=kwargs.get("_note_stock", ""),
        note_status=kwargs.get("_note_status", ""),
        num_inwork_coll=kwargs.get("_num_inwork_coll", None),
        num_inwork=kwargs.get("_num_inwork", ""),
        num_coll=kwargs.get("_num_coll", None),
        dltc_copyediting_note=kwargs.get("_dltc_copyediting_note", ""),
        note_missing=kwargs.get("_note_missing", ""),
        num_sort=kwargs.get("_num_sort", None),
        id=kwargs.get("id", None),
        bib_info_source=kwargs.get("_bib_info_source", ""),
    )
@@ -0,0 +1,134 @@
1
+ from aletk.utils import get_logger, fuzzy_match_score, remove_extra_whitespace
2
+
3
+ from typing import TypedDict
4
+ from philoch_bib_sdk.converters.plaintext.author.formatter import format_author
5
+ from philoch_bib_sdk.logic.models import BibItem, BibItemDateAttr, TBibString
6
+
7
+
8
+ logger = get_logger(__name__)
9
+
10
+
11
class BibItemScore(TypedDict):
    """
    Score breakdown produced by `compare_bibitems`: the total and its title, author and year components.
    """

    score: int  # sum of the three component scores
    score_title: int
    score_author: int
    score_year: int
16
+
17
+
18
class ScoredBibItems(TypedDict):
    """
    Result of `compare_bibitems`: the two compared items together with their score breakdown.
    """

    reference: BibItem
    subject: BibItem
    score: BibItemScore
22
+
23
+
24
# Lowercase substrings looked for in normalized titles by `_score_title`; a keyword
# present in only one of the two titles lowers the title score.
UNDESIRED_TITLE_KEYWORDS = ["errata", "review"]
25
+
26
+
27
def _score_title(title_1: str, title_2: str) -> int:
    """
    Score how well two titles match.

    Both titles are whitespace-normalized and lowercased, then compared with
    `fuzzy_match_score`. A bonus of 100 is added when the fuzzy score exceeds 85
    or one title is contained in the other, provided no undesired keyword occurs
    in only one of them. Each such one-sided keyword costs 50 points.

    Raises:
        ValueError: if either title is empty after normalization.
    """
    first = remove_extra_whitespace(title_1).lower()
    second = remove_extra_whitespace(title_2).lower()

    if not (first and second):
        raise ValueError("Titles cannot be empty for comparison")

    score = fuzzy_match_score(
        first,
        second,
    )

    # Might catch cases in which one doesn't include the subtitle
    contained = first in second or second in first

    flagged_first = {kw for kw in UNDESIRED_TITLE_KEYWORDS if kw in first}
    flagged_second = {kw for kw in UNDESIRED_TITLE_KEYWORDS if kw in second}

    # Keywords that appear in exactly one of the two titles
    one_sided = flagged_first ^ flagged_second

    if not one_sided and (score > 85 or contained):
        score += 100

    score -= 50 * len(one_sided)

    return score
59
+
60
+
61
def _score_author(author_1_full_name: str, author_2_full_name: str) -> int:
    """
    Score how well two author strings match.

    Both strings are whitespace-normalized and compared with `fuzzy_match_score`;
    a fuzzy score above 85 earns a bonus of 100.

    Raises:
        ValueError: if either string is empty after normalization.
    """
    first = remove_extra_whitespace(author_1_full_name)
    second = remove_extra_whitespace(author_2_full_name)

    if not (first and second):
        raise ValueError("Authors cannot be empty for comparison")

    score = fuzzy_match_score(first, second)
    return score + 100 if score > 85 else score
77
+
78
+
79
+ def _score_year(year_1: int, year_2: int, range_offset: int = 1) -> int:
80
+
81
+ if not year_1 or not year_2:
82
+ raise ValueError("Years cannot be empty for comparison")
83
+
84
+ if not any(isinstance(year, int) for year in (year_1, year_2)):
85
+ if year_1 == year_2:
86
+ return 100
87
+ else:
88
+ return 0
89
+
90
+ range = [year_1 - range_offset, year_1, year_1 + range_offset]
91
+
92
+ if year_2 in range:
93
+ return 100
94
+ else:
95
+ return 0
96
+
97
+
98
def compare_bibitems(reference: BibItem, subject: BibItem, bibstring_type: TBibString) -> ScoredBibItems:
    """
    Calculate the score of two BibItems based on their title, author, and year.
    Title and author are scored with fuzzy matching; the year is scored with `_score_year`
    and contributes 0 unless both items carry a BibItemDateAttr date.
    The final score is the sum of the individual scores.
    """

    logger.debug(f"Scoring bibitems: {reference}, {subject}")

    title_score = _score_title(
        getattr(reference.title, bibstring_type),
        getattr(subject.title, bibstring_type),
    )

    author_score = _score_author(
        format_author(reference.author, bibstring_type),
        format_author(subject.author, bibstring_type),
    )

    if not isinstance(reference.date, BibItemDateAttr) or not isinstance(subject.date, BibItemDateAttr):
        year_score = 0
    else:
        year_score = _score_year(reference.date.year, subject.date.year)

    breakdown = BibItemScore(
        score=title_score + author_score + year_score,
        score_title=title_score,
        score_author=author_score,
        score_year=year_score,
    )

    return ScoredBibItems(reference=reference, subject=subject, score=breakdown)
@@ -0,0 +1,40 @@
1
+ from typing import Callable, Dict, Tuple
2
+ from philoch_bib_sdk.converters.plaintext.journal.formatter import format_journal
3
+ from philoch_bib_sdk.logic.models import BibItem
4
+
5
+
6
# String aliases naming the parts of an index entry.
type TJournalName = str

type TVolume = str

type TNumber = str

type TBibkey = str


# Lookup table from a (journal name, volume, number) triple to a bibkey.
type TJournalBibkeyIndex = Dict[Tuple[TJournalName, TVolume, TNumber], TBibkey]  # (journal, volume, number)  # bibkey
16
+
17
+
18
def get_bibkey_by_journal_volume_number(index: TJournalBibkeyIndex, subject: BibItem) -> TBibkey:
    """
    Look up the bibkey of a BibItem in an index keyed by (journal_name, volume, number).
    The journal name is taken in its "latex" representation.

    Raises:
        ValueError: if the journal name, the volume or the number is the empty string.
        KeyError: if the index has no entry for the combination.
    """
    journal = format_journal(subject.journal, bibstring_type="latex")
    volume = subject.volume
    number = subject.number

    if journal == "" or volume == "" or number == "":
        raise ValueError(
            f"Expected subject bibitem journal with non-empty journal, volume, and number. Found [[ journal: {journal}; volume: {volume}; number: {number} ]] instead."
        )

    lookup_key = (journal, volume, number)
    return index[lookup_key]
33
+
34
+
35
# Signature of a function that reads a journal/volume/number index from a file path.
type TReadIndex = Callable[
    [
        str,  # path to the index file
    ],
    TJournalBibkeyIndex,
]
@@ -1,7 +1,6 @@
1
1
  from typing import Literal
2
2
 
3
3
  type TBibTeXEntryType = Literal[
4
- "",
5
4
  "article",
6
5
  "book",
7
6
  "incollection",
@@ -12,15 +11,20 @@ type TBibTeXEntryType = Literal[
12
11
  "proceedings",
13
12
  "techreport",
14
13
  "unpublished",
14
+ "UNKNOWN",
15
15
  ]
16
16
 
17
- type TPubState = Literal[
17
+ type TBasicPubState = Literal[
18
18
  "",
19
19
  "unpub",
20
+ "forthcoming",
21
+ ]
22
+
23
+ type TPubState = Literal[
24
+ TBasicPubState,
20
25
  "inwork",
21
26
  "submitted",
22
27
  "published",
23
- "forthcoming",
24
28
  ]
25
29
 
26
30
  type TLanguageID = Literal[