philoch-bib-sdk 0.1.2__py3-none-any.whl → 0.1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (25):
  1. philoch_bib_sdk/converters/latex.py +6 -0
  2. philoch_bib_sdk/converters/plaintext/author/formatter.py +31 -0
  3. philoch_bib_sdk/converters/plaintext/author/parser.py +72 -0
  4. philoch_bib_sdk/converters/plaintext/bib_string_formatter.py +8 -0
  5. philoch_bib_sdk/converters/plaintext/bibitem/bibkey_formatter.py +21 -0
  6. philoch_bib_sdk/converters/plaintext/bibitem/bibkey_parser.py +144 -0
  7. philoch_bib_sdk/converters/plaintext/bibitem/date_formatter.py +37 -0
  8. philoch_bib_sdk/converters/plaintext/bibitem/date_parser.py +62 -0
  9. philoch_bib_sdk/converters/plaintext/bibitem/formatter.py +182 -0
  10. philoch_bib_sdk/converters/plaintext/bibitem/pages_formatter.py +13 -0
  11. philoch_bib_sdk/converters/plaintext/bibitem/pages_parser.py +63 -0
  12. philoch_bib_sdk/converters/plaintext/bibitem/parser.py +3 -0
  13. philoch_bib_sdk/converters/plaintext/journal/formatter.py +25 -0
  14. philoch_bib_sdk/converters/plaintext/journal/parser.py +36 -0
  15. philoch_bib_sdk/converters/plaintext/shared/renderable_formatter.py +25 -0
  16. philoch_bib_sdk/logic/default_models.py +120 -0
  17. philoch_bib_sdk/logic/functions/comparator.py +134 -0
  18. philoch_bib_sdk/logic/literals.py +7 -3
  19. philoch_bib_sdk/logic/models.py +226 -219
  20. {philoch_bib_sdk-0.1.2.dist-info → philoch_bib_sdk-0.1.3.dist-info}/METADATA +1 -1
  21. philoch_bib_sdk-0.1.3.dist-info/RECORD +26 -0
  22. {philoch_bib_sdk-0.1.2.dist-info → philoch_bib_sdk-0.1.3.dist-info}/WHEEL +1 -1
  23. philoch_bib_sdk-0.1.3.dist-info/entry_points.txt +3 -0
  24. philoch_bib_sdk-0.1.2.dist-info/RECORD +0 -8
  25. {philoch_bib_sdk-0.1.2.dist-info → philoch_bib_sdk-0.1.3.dist-info}/LICENSE +0 -0
@@ -0,0 +1,36 @@
1
+ import traceback
2
+ from aletk.ResultMonad import Ok, Err
3
+ from aletk.utils import get_logger, remove_extra_whitespace
4
+ from philoch_bib_sdk.logic.models import Journal, BibStringAttr, TBibString
5
+
6
+ lgr = get_logger(__name__)
7
+
8
+
9
def parse_journal(text: str, bibstring_type: TBibString) -> Ok[Journal | None] | Err:
    """
    Parse a journal string into a Journal object.

    The text is passed through `remove_extra_whitespace` and stored as the journal
    name under the attribute selected by `bibstring_type`; both ISSN fields are
    left as empty strings.

    Returns `Ok(None)` when the text is empty, either as given or after
    normalization, `Ok(Journal)` otherwise, and `Err` (with the traceback attached)
    if anything raises along the way.
    """
    try:
        if text == "":
            lgr.debug("Empty journal string, returning None.")
            return Ok(None)

        # Normalize the text by removing extra whitespace
        normalized_text = remove_extra_whitespace(text)

        # If normalization leaves nothing behind, the input carried no journal
        # name; return None rather than a Journal whose name is empty.
        if normalized_text == "":
            lgr.debug("Blank journal string, returning None.")
            return Ok(None)

        journal = Journal(
            name=BibStringAttr(**{str(bibstring_type): normalized_text}),
            issn_electronic="",
            issn_print="",
        )

        return Ok(journal)

    except Exception as e:
        error_message = f"Error parsing journal string '{text}': {e}"
        return Err(
            message=error_message,
            code=-1,
            error_type=f"{e.__class__.__name__}",
            error_trace=traceback.format_exc(),
        )
@@ -0,0 +1,25 @@
1
+ from philoch_bib_sdk.logic.models import BaseRenderable, BaseNamedRenderable, TBibString
2
+
3
+
4
+ def format_renderable(
5
+ renderable: BaseRenderable | BaseNamedRenderable,
6
+ bibstring_type: TBibString,
7
+ ) -> str:
8
+ """
9
+ Format a base renderable object into a string representation.
10
+ """
11
+
12
+ match renderable:
13
+
14
+ case BaseRenderable(text, id):
15
+ if not text:
16
+ return ""
17
+ return f"{getattr(text, bibstring_type)}"
18
+
19
+ case BaseNamedRenderable(name, id):
20
+ if not name:
21
+ return ""
22
+ return f"{getattr(name, bibstring_type)}"
23
+
24
+ case _:
25
+ raise TypeError("Invalid type for renderable")
@@ -0,0 +1,120 @@
1
+ from typing import Tuple, TypedDict, Unpack
2
+
3
+ from philoch_bib_sdk.logic.models import (
4
+ Author,
5
+ BaseNamedRenderable,
6
+ BaseRenderable,
7
+ BibItem,
8
+ BibStringAttr,
9
+ Journal,
10
+ )
11
+
12
+
13
class BibStringArgs(TypedDict, total=False):
    """
    Keyword arguments accepted by `default_bib_string`; every key is optional.
    """

    latex: str
    unicode: str
    simplified: str
17
+
18
+
19
def default_bib_string(**kwargs: Unpack[BibStringArgs]) -> BibStringAttr:
    """
    Build a BibStringAttr from whichever of `latex`, `unicode` and `simplified` are supplied.

    Any attribute that is not supplied is set to the empty string.
    """
    latex_text = kwargs.get("latex", "")
    unicode_text = kwargs.get("unicode", "")
    simplified_text = kwargs.get("simplified", "")

    return BibStringAttr(latex=latex_text, unicode=unicode_text, simplified=simplified_text)
28
+
29
+
30
+ ############
31
+ # Base Renderables
32
+ ############
33
+
34
+
35
class BaseRenderableArgs(TypedDict, total=False):
    """
    Keyword arguments accepted by `default_base_renderable`; every key is optional.
    """

    text: BibStringArgs
    id: int | None
38
+
39
+
40
def default_base_renderable(**kwargs: Unpack[BaseRenderableArgs]) -> BaseRenderable:
    """
    Build a BaseRenderable from whichever of `text` and `id` are supplied.

    A missing `text` becomes a BibStringAttr of empty strings (via `default_bib_string`);
    a missing `id` becomes None.
    """
    text_args = kwargs.get("text", {})
    text = default_bib_string(**text_args)

    return BaseRenderable(text=text, id=kwargs.get("id"))
48
+
49
+
50
class BaseNamedRenderableArgs(TypedDict, total=False):
    """
    Keyword arguments accepted by `default_base_named_renderable`; every key is optional.
    """

    name: BibStringArgs
    id: int | None
53
+
54
+
55
def default_base_named_renderable(**kwargs: Unpack[BaseNamedRenderableArgs]) -> BaseNamedRenderable:
    """
    Build a BaseNamedRenderable from whichever of `name` and `id` are supplied.

    A missing `name` becomes a BibStringAttr of empty strings (via `default_bib_string`);
    a missing `id` becomes None.
    """
    name_args = kwargs.get("name", {})
    name = default_bib_string(**name_args)

    return BaseNamedRenderable(name=name, id=kwargs.get("id"))
63
+
64
+
65
+ ############
66
+ # Author
67
+ ############
68
+
69
+
70
class AuthorArgs(TypedDict, total=False):
    """
    Keyword arguments accepted by `default_author`; every key is optional.
    """

    # Name parts, each given as the keyword arguments of `default_bib_string`
    given_name: BibStringArgs
    family_name: BibStringArgs
    mononym: BibStringArgs
    shorthand: BibStringArgs
    famous_name: BibStringArgs

    publications: Tuple[BibItem, ...]
    id: int | None
78
+
79
+
80
def default_author(**kwargs: Unpack[AuthorArgs]) -> Author:
    """
    Build an Author from whichever of its attributes are supplied.

    Each missing name part becomes a BibStringAttr of empty strings (via
    `default_bib_string`), missing `publications` becomes an empty tuple and a
    missing `id` becomes None.
    """
    given_name = default_bib_string(**kwargs.get("given_name", {}))
    family_name = default_bib_string(**kwargs.get("family_name", {}))
    mononym = default_bib_string(**kwargs.get("mononym", {}))
    shorthand = default_bib_string(**kwargs.get("shorthand", {}))
    famous_name = default_bib_string(**kwargs.get("famous_name", {}))

    return Author(
        given_name=given_name,
        family_name=family_name,
        mononym=mononym,
        shorthand=shorthand,
        famous_name=famous_name,
        publications=kwargs.get("publications", ()),
        id=kwargs.get("id"),
    )
94
+
95
+
96
+ ############
97
+ # Journal
98
+ ############
99
+
100
+
101
class JournalArgs(TypedDict, total=False):
    """
    Keyword arguments accepted by `default_journal`; every key is optional.
    """

    name: BibStringArgs
    issn_print: str
    issn_electronic: str
    id: int | None
106
+
107
+
108
def default_journal(**kwargs: Unpack[JournalArgs]) -> Journal | None:
    """
    Build a Journal from whichever of its attributes are supplied.

    Called with no arguments at all, returns None instead of a Journal. Otherwise
    a missing `name` becomes a BibStringAttr of empty strings (via `default_bib_string`),
    missing ISSNs become empty strings and a missing `id` becomes None.
    """
    if not kwargs:
        return None

    journal_name = default_bib_string(**kwargs.get("name", {}))
    issn_print = kwargs.get("issn_print", "")
    issn_electronic = kwargs.get("issn_electronic", "")

    return Journal(
        name=journal_name,
        issn_print=issn_print,
        issn_electronic=issn_electronic,
        id=kwargs.get("id"),
    )
@@ -0,0 +1,134 @@
1
+ from aletk.utils import get_logger, fuzzy_match_score, remove_extra_whitespace
2
+
3
+ from typing import TypedDict
4
+ from philoch_bib_sdk.converters.plaintext.author.formatter import format_author
5
+ from philoch_bib_sdk.logic.models import BibItem, BibItemDateAttr, TBibString
6
+
7
+
8
+ logger = get_logger(__name__)
9
+
10
+
11
class BibItemScore(TypedDict):
    """
    Score breakdown produced by `compare_bibitems`.
    """

    # Sum of the three partial scores below
    score: int
    score_title: int
    score_author: int
    score_year: int
16
+
17
+
18
class ScoredBibItems(TypedDict):
    """
    Result of `compare_bibitems`: the two compared items together with their score.
    """

    reference: BibItem
    subject: BibItem
    score: BibItemScore
22
+
23
+
24
# Keywords whose presence in only one of two compared titles is penalized by `_score_title`.
UNDESIRED_TITLE_KEYWORDS = [
    "errata",
    "review",
]
25
+
26
+
27
def _score_title(title_1: str, title_2: str) -> int:
    """
    Score how well two titles match.

    Both titles are whitespace-normalized and lower-cased, then compared with
    `fuzzy_match_score`. A bonus of 100 is added when the fuzzy score exceeds 85
    or one title contains the other, provided no undesired keyword appears in
    only one of them. Each such one-sided keyword costs 50 points.

    Raises ValueError if either title is empty after normalization.
    """
    first = remove_extra_whitespace(title_1).lower()
    second = remove_extra_whitespace(title_2).lower()

    if not (first and second):
        raise ValueError("Titles cannot be empty for comparison")

    score = fuzzy_match_score(first, second)

    # Might catch cases in which one doesn't include the subtitle
    contained = first in second or second in first

    flagged_in_first = {kw for kw in UNDESIRED_TITLE_KEYWORDS if kw in first}
    flagged_in_second = {kw for kw in UNDESIRED_TITLE_KEYWORDS if kw in second}

    # Keywords present in exactly one of the two titles
    one_sided = flagged_in_first ^ flagged_in_second

    if not one_sided and (score > 85 or contained):
        score += 100

    score -= 50 * len(one_sided)

    return score
59
+
60
+
61
def _score_author(author_1_full_name: str, author_2_full_name: str) -> int:
    """
    Score how well two author names match.

    Both names are whitespace-normalized and compared with `fuzzy_match_score`;
    a bonus of 100 is added when the fuzzy score exceeds 85.

    Raises ValueError if either name is empty after normalization.
    """
    first = remove_extra_whitespace(author_1_full_name)
    second = remove_extra_whitespace(author_2_full_name)

    if not (first and second):
        raise ValueError("Authors cannot be empty for comparison")

    score = fuzzy_match_score(first, second)

    return score + 100 if score > 85 else score
77
+
78
+
79
+ def _score_year(year_1: int, year_2: int, range_offset: int = 1) -> int:
80
+
81
+ if not year_1 or not year_2:
82
+ raise ValueError("Years cannot be empty for comparison")
83
+
84
+ if not any(isinstance(year, int) for year in (year_1, year_2)):
85
+ if year_1 == year_2:
86
+ return 100
87
+ else:
88
+ return 0
89
+
90
+ range = [year_1 - range_offset, year_1, year_1 + range_offset]
91
+
92
+ if year_2 in range:
93
+ return 100
94
+ else:
95
+ return 0
96
+
97
+
98
def compare_bibitems(reference: BibItem, subject: BibItem, bibstring_type: TBibString) -> ScoredBibItems:
    """
    Score how closely `subject` matches `reference` by title, author and year.

    Titles and formatted author names are read in the `bibstring_type` variant and
    scored with `_score_title` and `_score_author`. The year score comes from
    `_score_year` when both items carry a BibItemDateAttr date, and is 0 otherwise.
    The total is the sum of the three partial scores.
    """
    logger.debug(f"Scoring bibitems: {reference}, {subject}")

    title_score = _score_title(
        getattr(reference.title, bibstring_type),
        getattr(subject.title, bibstring_type),
    )

    author_score = _score_author(
        format_author(reference.author, bibstring_type),
        format_author(subject.author, bibstring_type),
    )

    # Years are only comparable when both dates are structured date attributes.
    year_score = 0
    if isinstance(reference.date, BibItemDateAttr) and isinstance(subject.date, BibItemDateAttr):
        year_score = _score_year(reference.date.year, subject.date.year)

    breakdown: BibItemScore = {
        "score": title_score + author_score + year_score,
        "score_title": title_score,
        "score_author": author_score,
        "score_year": year_score,
    }

    return {
        "reference": reference,
        "subject": subject,
        "score": breakdown,
    }
@@ -1,7 +1,6 @@
1
1
  from typing import Literal
2
2
 
3
3
  type TBibTeXEntryType = Literal[
4
- "",
5
4
  "article",
6
5
  "book",
7
6
  "incollection",
@@ -12,15 +11,20 @@ type TBibTeXEntryType = Literal[
12
11
  "proceedings",
13
12
  "techreport",
14
13
  "unpublished",
14
+ "UNKNOWN",
15
15
  ]
16
16
 
17
- type TPubState = Literal[
17
+ type TBasicPubState = Literal[
18
18
  "",
19
19
  "unpub",
20
+ "forthcoming",
21
+ ]
22
+
23
+ type TPubState = Literal[
24
+ TBasicPubState,
20
25
  "inwork",
21
26
  "submitted",
22
27
  "published",
23
- "forthcoming",
24
28
  ]
25
29
 
26
30
  type TLanguageID = Literal[