philoch-bib-sdk 0.1.2__py3-none-any.whl → 0.1.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- philoch_bib_sdk/converters/latex.py +6 -0
- philoch_bib_sdk/converters/plaintext/author/formatter.py +31 -0
- philoch_bib_sdk/converters/plaintext/author/parser.py +72 -0
- philoch_bib_sdk/converters/plaintext/bib_string_formatter.py +8 -0
- philoch_bib_sdk/converters/plaintext/bibitem/bibkey_formatter.py +21 -0
- philoch_bib_sdk/converters/plaintext/bibitem/bibkey_parser.py +144 -0
- philoch_bib_sdk/converters/plaintext/bibitem/date_formatter.py +37 -0
- philoch_bib_sdk/converters/plaintext/bibitem/date_parser.py +62 -0
- philoch_bib_sdk/converters/plaintext/bibitem/formatter.py +182 -0
- philoch_bib_sdk/converters/plaintext/bibitem/pages_formatter.py +13 -0
- philoch_bib_sdk/converters/plaintext/bibitem/pages_parser.py +63 -0
- philoch_bib_sdk/converters/plaintext/bibitem/parser.py +3 -0
- philoch_bib_sdk/converters/plaintext/journal/formatter.py +25 -0
- philoch_bib_sdk/converters/plaintext/journal/parser.py +36 -0
- philoch_bib_sdk/converters/plaintext/shared/renderable_formatter.py +25 -0
- philoch_bib_sdk/logic/default_models.py +120 -0
- philoch_bib_sdk/logic/functions/comparator.py +134 -0
- philoch_bib_sdk/logic/literals.py +7 -3
- philoch_bib_sdk/logic/models.py +226 -219
- {philoch_bib_sdk-0.1.2.dist-info → philoch_bib_sdk-0.1.3.dist-info}/METADATA +1 -1
- philoch_bib_sdk-0.1.3.dist-info/RECORD +26 -0
- {philoch_bib_sdk-0.1.2.dist-info → philoch_bib_sdk-0.1.3.dist-info}/WHEEL +1 -1
- philoch_bib_sdk-0.1.3.dist-info/entry_points.txt +3 -0
- philoch_bib_sdk-0.1.2.dist-info/RECORD +0 -8
- {philoch_bib_sdk-0.1.2.dist-info → philoch_bib_sdk-0.1.3.dist-info}/LICENSE +0 -0
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
import traceback
|
|
2
|
+
from aletk.ResultMonad import Ok, Err
|
|
3
|
+
from aletk.utils import get_logger, remove_extra_whitespace
|
|
4
|
+
from philoch_bib_sdk.logic.models import Journal, BibStringAttr, TBibString
|
|
5
|
+
|
|
6
|
+
# Module-level logger, obtained from aletk's get_logger with this module's name.
lgr = get_logger(__name__)
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def parse_journal(text: str, bibstring_type: TBibString) -> Ok[Journal | None] | Err:
    """
    Build a Journal from its plain-text name.

    An empty string yields `Ok(None)`. Otherwise the whitespace-normalized text is stored on the journal name under the attribute named by `bibstring_type`, and both ISSN fields are left empty. Any exception raised along the way is returned as an `Err` carrying the message, the exception class name and the traceback, instead of being propagated.
    """
    try:
        # NOTE(review): the emptiness check runs before normalization, so a
        # whitespace-only string is not treated as empty here -- confirm intended.
        if text == "":
            lgr.debug("Empty journal string, returning None.")
            return Ok(None)

        # The target attribute of BibStringAttr is chosen dynamically by bibstring_type.
        name_kwargs = {str(bibstring_type): remove_extra_whitespace(text)}

        return Ok(
            Journal(
                name=BibStringAttr(**name_kwargs),
                issn_electronic="",
                issn_print="",
            )
        )

    except Exception as exc:
        return Err(
            message=f"Error parsing journal string '{text}': {exc}",
            code=-1,
            error_type=exc.__class__.__name__,
            error_trace=traceback.format_exc(),
        )
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
from philoch_bib_sdk.logic.models import BaseRenderable, BaseNamedRenderable, TBibString
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def format_renderable(
|
|
5
|
+
renderable: BaseRenderable | BaseNamedRenderable,
|
|
6
|
+
bibstring_type: TBibString,
|
|
7
|
+
) -> str:
|
|
8
|
+
"""
|
|
9
|
+
Format a base renderable object into a string representation.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
match renderable:
|
|
13
|
+
|
|
14
|
+
case BaseRenderable(text, id):
|
|
15
|
+
if not text:
|
|
16
|
+
return ""
|
|
17
|
+
return f"{getattr(text, bibstring_type)}"
|
|
18
|
+
|
|
19
|
+
case BaseNamedRenderable(name, id):
|
|
20
|
+
if not name:
|
|
21
|
+
return ""
|
|
22
|
+
return f"{getattr(name, bibstring_type)}"
|
|
23
|
+
|
|
24
|
+
case _:
|
|
25
|
+
raise TypeError("Invalid type for renderable")
|
|
@@ -0,0 +1,120 @@
|
|
|
1
|
+
from typing import Tuple, TypedDict, Unpack
|
|
2
|
+
|
|
3
|
+
from philoch_bib_sdk.logic.models import (
|
|
4
|
+
Author,
|
|
5
|
+
BaseNamedRenderable,
|
|
6
|
+
BaseRenderable,
|
|
7
|
+
BibItem,
|
|
8
|
+
BibStringAttr,
|
|
9
|
+
Journal,
|
|
10
|
+
)
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class BibStringArgs(TypedDict, total=False):
    """
    Keyword arguments accepted by `default_bib_string`. Every key may be omitted (total=False).
    """

    latex: str
    unicode: str
    simplified: str
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def default_bib_string(**kwargs: Unpack[BibStringArgs]) -> BibStringAttr:
    """
    Build a BibStringAttr from any subset of its attributes. Attributes that are not supplied default to the empty string.
    """
    latex = kwargs.get("latex", "")
    unicode = kwargs.get("unicode", "")
    simplified = kwargs.get("simplified", "")

    return BibStringAttr(latex=latex, unicode=unicode, simplified=simplified)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
############
|
|
31
|
+
# Base Renderables
|
|
32
|
+
############
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class BaseRenderableArgs(TypedDict, total=False):
    """
    Keyword arguments accepted by `default_base_renderable`. Every key may be omitted (total=False).
    """

    text: BibStringArgs
    id: int | None
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def default_base_renderable(**kwargs: Unpack[BaseRenderableArgs]) -> BaseRenderable:
    """
    Build a BaseRenderable from any subset of its attributes. A missing `text` becomes a bib string of empty strings and a missing `id` becomes None.
    """
    text = default_bib_string(**kwargs.get("text", {}))
    renderable_id = kwargs.get("id", None)

    return BaseRenderable(text=text, id=renderable_id)
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
class BaseNamedRenderableArgs(TypedDict, total=False):
    """
    Keyword arguments accepted by `default_base_named_renderable`. Every key may be omitted (total=False).
    """

    name: BibStringArgs
    id: int | None
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def default_base_named_renderable(**kwargs: Unpack[BaseNamedRenderableArgs]) -> BaseNamedRenderable:
    """
    Build a BaseNamedRenderable from any subset of its attributes. A missing `name` becomes a bib string of empty strings and a missing `id` becomes None.
    """
    name = default_bib_string(**kwargs.get("name", {}))
    renderable_id = kwargs.get("id", None)

    return BaseNamedRenderable(name=name, id=renderable_id)
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
############
|
|
66
|
+
# Author
|
|
67
|
+
############
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
class AuthorArgs(TypedDict, total=False):
    """
    Keyword arguments accepted by `default_author`. Every key may be omitted (total=False).
    """

    given_name: BibStringArgs
    family_name: BibStringArgs
    mononym: BibStringArgs
    shorthand: BibStringArgs
    famous_name: BibStringArgs
    publications: Tuple[BibItem, ...]
    id: int | None
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def default_author(**kwargs: Unpack[AuthorArgs]) -> Author:
    """
    Build an Author from any subset of its attributes. Missing name parts become bib strings of empty strings, missing `publications` becomes an empty tuple and a missing `id` becomes None.
    """
    given_name = default_bib_string(**kwargs.get("given_name", {}))
    family_name = default_bib_string(**kwargs.get("family_name", {}))
    mononym = default_bib_string(**kwargs.get("mononym", {}))
    shorthand = default_bib_string(**kwargs.get("shorthand", {}))
    famous_name = default_bib_string(**kwargs.get("famous_name", {}))
    publications = kwargs.get("publications", ())
    author_id = kwargs.get("id", None)

    return Author(
        given_name=given_name,
        family_name=family_name,
        mononym=mononym,
        shorthand=shorthand,
        famous_name=famous_name,
        publications=publications,
        id=author_id,
    )
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
############
|
|
97
|
+
# Journal
|
|
98
|
+
############
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
class JournalArgs(TypedDict, total=False):
    """
    Keyword arguments accepted by `default_journal`. Every key may be omitted (total=False).
    """

    name: BibStringArgs
    issn_print: str
    issn_electronic: str
    id: int | None
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def default_journal(**kwargs: Unpack[JournalArgs]) -> Journal | None:
    """
    Build a Journal from any subset of its attributes, or return None when no attribute is given at all. A missing `name` becomes a bib string of empty strings, missing ISSNs become empty strings and a missing `id` becomes None.
    """
    # No arguments at all means "no journal", not "journal with empty fields".
    if not kwargs:
        return None

    name = default_bib_string(**kwargs.get("name", {}))
    issn_print = kwargs.get("issn_print", "")
    issn_electronic = kwargs.get("issn_electronic", "")
    journal_id = kwargs.get("id", None)

    return Journal(
        name=name,
        issn_print=issn_print,
        issn_electronic=issn_electronic,
        id=journal_id,
    )
|
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
from aletk.utils import get_logger, fuzzy_match_score, remove_extra_whitespace
|
|
2
|
+
|
|
3
|
+
from typing import TypedDict
|
|
4
|
+
from philoch_bib_sdk.converters.plaintext.author.formatter import format_author
|
|
5
|
+
from philoch_bib_sdk.logic.models import BibItem, BibItemDateAttr, TBibString
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
# Module-level logger, obtained from aletk's get_logger with this module's name.
logger = get_logger(__name__)
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class BibItemScore(TypedDict):
    """
    Score breakdown returned by `compare_bibitems`: the total plus the title, author and year parts it is summed from.
    """

    score: int
    score_title: int
    score_author: int
    score_year: int
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class ScoredBibItems(TypedDict):
    """
    Result of `compare_bibitems`: the two compared BibItems together with their score breakdown.
    """

    reference: BibItem
    subject: BibItem
    score: BibItemScore
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
# Keywords that lower the title score in _score_title when they occur in only one
# of the two (lower-cased) titles being compared.
UNDESIRED_TITLE_KEYWORDS = ["errata", "review"]
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def _score_title(title_1: str, title_2: str) -> int:
    """
    Score the similarity of two titles.

    Both titles are whitespace-normalized and lower-cased, then compared with `fuzzy_match_score`. A bonus of 100 is added when the fuzzy score exceeds 85 or one title contains the other, provided no undesired keyword appears in only one of them. Every keyword from UNDESIRED_TITLE_KEYWORDS found in exactly one of the titles costs 50 points.

    Raises ValueError if either title is empty after normalization.
    """
    first = remove_extra_whitespace(title_1).lower()
    second = remove_extra_whitespace(title_2).lower()

    if not (first and second):
        raise ValueError("Titles cannot be empty for comparison")

    score = fuzzy_match_score(first, second)

    # Containment might catch cases in which one title doesn't include the subtitle.
    is_contained = first in second or second in first

    # Keywords present in exactly one of the two titles.
    mismatched_kws = {kw for kw in UNDESIRED_TITLE_KEYWORDS if (kw in first) != (kw in second)}

    if not mismatched_kws and (score > 85 or is_contained):
        score += 100

    score -= 50 * len(mismatched_kws)

    return score
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def _score_author(author_1_full_name: str, author_2_full_name: str) -> int:
    """
    Score the similarity of two author full names.

    Both names are whitespace-normalized and compared with `fuzzy_match_score`; a bonus of 100 is added when the fuzzy score exceeds 85.

    Raises ValueError if either name is empty after normalization.
    """
    first = remove_extra_whitespace(author_1_full_name)
    second = remove_extra_whitespace(author_2_full_name)

    if not (first and second):
        raise ValueError("Authors cannot be empty for comparison")

    score = fuzzy_match_score(first, second)

    return score + 100 if score > 85 else score
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def _score_year(year_1: int, year_2: int, range_offset: int = 1) -> int:
|
|
80
|
+
|
|
81
|
+
if not year_1 or not year_2:
|
|
82
|
+
raise ValueError("Years cannot be empty for comparison")
|
|
83
|
+
|
|
84
|
+
if not any(isinstance(year, int) for year in (year_1, year_2)):
|
|
85
|
+
if year_1 == year_2:
|
|
86
|
+
return 100
|
|
87
|
+
else:
|
|
88
|
+
return 0
|
|
89
|
+
|
|
90
|
+
range = [year_1 - range_offset, year_1, year_1 + range_offset]
|
|
91
|
+
|
|
92
|
+
if year_2 in range:
|
|
93
|
+
return 100
|
|
94
|
+
else:
|
|
95
|
+
return 0
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def compare_bibitems(reference: BibItem, subject: BibItem, bibstring_type: TBibString) -> ScoredBibItems:
    """
    Score how closely `subject` matches `reference` by title, author and year.

    Titles and formatted author names are read in the `bibstring_type` representation and scored with `_score_title` and `_score_author`. The year score comes from `_score_year` (called with its default tolerance) and is only computed when both dates are `BibItemDateAttr` instances; otherwise it is 0.
    The total score is the sum of the three partial scores.
    """
    logger.debug(f"Scoring bibitems: {reference}, {subject}")

    title_score = _score_title(
        getattr(reference.title, bibstring_type),
        getattr(subject.title, bibstring_type),
    )

    author_score = _score_author(
        format_author(reference.author, bibstring_type),
        format_author(subject.author, bibstring_type),
    )

    # Years are only comparable when both items carry a structured date.
    year_score = 0
    if isinstance(reference.date, BibItemDateAttr) and isinstance(subject.date, BibItemDateAttr):
        year_score = _score_year(reference.date.year, subject.date.year)

    breakdown: BibItemScore = {
        "score": title_score + author_score + year_score,
        "score_title": title_score,
        "score_author": author_score,
        "score_year": year_score,
    }

    return {
        "reference": reference,
        "subject": subject,
        "score": breakdown,
    }
|
|
@@ -1,7 +1,6 @@
|
|
|
1
1
|
from typing import Literal
|
|
2
2
|
|
|
3
3
|
type TBibTeXEntryType = Literal[
|
|
4
|
-
"",
|
|
5
4
|
"article",
|
|
6
5
|
"book",
|
|
7
6
|
"incollection",
|
|
@@ -12,15 +11,20 @@ type TBibTeXEntryType = Literal[
|
|
|
12
11
|
"proceedings",
|
|
13
12
|
"techreport",
|
|
14
13
|
"unpublished",
|
|
14
|
+
"UNKNOWN",
|
|
15
15
|
]
|
|
16
16
|
|
|
17
|
-
type
|
|
17
|
+
type TBasicPubState = Literal[
|
|
18
18
|
"",
|
|
19
19
|
"unpub",
|
|
20
|
+
"forthcoming",
|
|
21
|
+
]
|
|
22
|
+
|
|
23
|
+
type TPubState = Literal[
|
|
24
|
+
TBasicPubState,
|
|
20
25
|
"inwork",
|
|
21
26
|
"submitted",
|
|
22
27
|
"published",
|
|
23
|
-
"forthcoming",
|
|
24
28
|
]
|
|
25
29
|
|
|
26
30
|
type TLanguageID = Literal[
|