philoch-bib-sdk 0.1.2__py3-none-any.whl → 0.1.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- philoch_bib_sdk/adapters/tabular_data/read_journal_volume_number_index.py +59 -0
- philoch_bib_sdk/converters/latex.py +6 -0
- philoch_bib_sdk/converters/plaintext/author/formatter.py +31 -0
- philoch_bib_sdk/converters/plaintext/author/parser.py +72 -0
- philoch_bib_sdk/converters/plaintext/bib_string_formatter.py +8 -0
- philoch_bib_sdk/converters/plaintext/bibitem/bibkey_formatter.py +21 -0
- philoch_bib_sdk/converters/plaintext/bibitem/bibkey_parser.py +144 -0
- philoch_bib_sdk/converters/plaintext/bibitem/date_formatter.py +37 -0
- philoch_bib_sdk/converters/plaintext/bibitem/date_parser.py +62 -0
- philoch_bib_sdk/converters/plaintext/bibitem/formatter.py +182 -0
- philoch_bib_sdk/converters/plaintext/bibitem/pages_formatter.py +13 -0
- philoch_bib_sdk/converters/plaintext/bibitem/pages_parser.py +63 -0
- philoch_bib_sdk/converters/plaintext/bibitem/parser.py +3 -0
- philoch_bib_sdk/converters/plaintext/journal/formatter.py +25 -0
- philoch_bib_sdk/converters/plaintext/journal/parser.py +36 -0
- philoch_bib_sdk/converters/plaintext/shared/renderable_formatter.py +25 -0
- philoch_bib_sdk/logic/default_models.py +315 -0
- philoch_bib_sdk/logic/functions/comparator.py +134 -0
- philoch_bib_sdk/logic/functions/journal_article_matcher.py +40 -0
- philoch_bib_sdk/logic/literals.py +7 -3
- philoch_bib_sdk/logic/models.py +226 -219
- {philoch_bib_sdk-0.1.2.dist-info → philoch_bib_sdk-0.1.4.dist-info}/METADATA +2 -1
- philoch_bib_sdk-0.1.4.dist-info/RECORD +28 -0
- {philoch_bib_sdk-0.1.2.dist-info → philoch_bib_sdk-0.1.4.dist-info}/WHEEL +1 -1
- philoch_bib_sdk-0.1.4.dist-info/entry_points.txt +3 -0
- philoch_bib_sdk-0.1.2.dist-info/RECORD +0 -8
- {philoch_bib_sdk-0.1.2.dist-info → philoch_bib_sdk-0.1.4.dist-info}/LICENSE +0 -0
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
import re
|
|
2
|
+
import traceback
|
|
3
|
+
from typing import Tuple
|
|
4
|
+
from aletk.utils import remove_extra_whitespace
|
|
5
|
+
from aletk.ResultMonad import Ok, Err
|
|
6
|
+
|
|
7
|
+
from philoch_bib_sdk.logic.models import PageAttr
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def is_valid_roman(raw_str: str) -> bool:
    """
    Tell whether a string is a well-formed roman numeral (I up to MMMCMXCIX).

    The check is case-insensitive. The empty string is rejected because it does not denote any number, and the whole input must match, so trailing characters such as a newline are rejected too.

    TODO: TBD, decide if we want to control if the pages are in roman numbers.
    """
    if not raw_str:
        return False

    pattern = r"M{0,3}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})"

    # fullmatch instead of match + "$": "$" also matches right before a trailing newline.
    return re.fullmatch(pattern, raw_str.upper()) is not None
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def _parse_single_page_attr(
    text: str,
) -> PageAttr:
    """
    Turn one page expression into a PageAttr.

    Accepted shapes are "<start>--<end>" (a range) and "<page>" (a single page, stored as start with an empty end).
    Raises ValueError when the text contains a hyphen but no "--", or when splitting on "--" does not give exactly two parts.
    """
    has_range_separator = "--" in text

    # A hyphen without the double-hyphen separator is not an accepted format.
    if not has_range_separator and "-" in text:
        raise ValueError(f"Unexpected page format found in '{text}'. Expected either '<start>--<end>' or '<page>'.")

    if not has_range_separator:
        return PageAttr(start=remove_extra_whitespace(text), end="")

    parts = remove_extra_whitespace(text).split("--")
    if len(parts) != 2:
        raise ValueError(f"Unexpected number of page parts found in '{text}': '{parts}'. Expected exactly 2.")

    first, last = parts
    return PageAttr(start=remove_extra_whitespace(first), end=remove_extra_whitespace(last))
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def parse_pages(text: str) -> Ok[Tuple[PageAttr, ...]] | Err:
    """
    Convert a comma-separated page specification into a tuple of PageAttr objects wrapped in Ok.

    Every comma-separated item must look like "<start>--<end>" or "<page>". An empty input gives Ok(()).
    Any exception raised while parsing is captured and returned as an Err carrying the message, exception class name and traceback.
    """
    try:
        if text == "":
            return Ok(())

        page_attrs = tuple(_parse_single_page_attr(remove_extra_whitespace(chunk)) for chunk in text.split(","))
        return Ok(page_attrs)

    except Exception as e:
        return Err(
            f"Error parsing pages from '{text}': {e}",
            code=-1,
            error_type=e.__class__.__name__,
            error_trace=traceback.format_exc(),
        )
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
from aletk.utils import get_logger
|
|
2
|
+
from philoch_bib_sdk.logic.models import Journal, Maybe, TBibString
|
|
3
|
+
|
|
4
|
+
lgr = get_logger(__name__)
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def format_journal(journal: Maybe[Journal], bibstring_type: TBibString) -> str:
|
|
8
|
+
"""
|
|
9
|
+
Format a journal object into a string representation.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
match journal:
|
|
13
|
+
|
|
14
|
+
case None:
|
|
15
|
+
return ""
|
|
16
|
+
|
|
17
|
+
case Journal(name, id):
|
|
18
|
+
|
|
19
|
+
if not name:
|
|
20
|
+
return ""
|
|
21
|
+
|
|
22
|
+
return f"{getattr(name, bibstring_type)}"
|
|
23
|
+
|
|
24
|
+
case _:
|
|
25
|
+
raise TypeError(f"Invalid type for journal: {type(journal)}. Dump: {journal!r}")
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
import traceback
|
|
2
|
+
from aletk.ResultMonad import Ok, Err
|
|
3
|
+
from aletk.utils import get_logger, remove_extra_whitespace
|
|
4
|
+
from philoch_bib_sdk.logic.models import Journal, BibStringAttr, TBibString
|
|
5
|
+
|
|
6
|
+
lgr = get_logger(__name__)
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def parse_journal(text: str, bibstring_type: TBibString) -> Ok[Journal | None] | Err:
    """
    Build a Journal from its name given as plain text.

    An empty string gives Ok(None). Otherwise the whitespace-normalized text is stored in the name attribute selected by `bibstring_type`, and both ISSN fields are left empty.
    Any exception is captured and returned as an Err carrying the message, exception class name and traceback.
    """
    try:
        if text == "":
            lgr.debug("Empty journal string, returning None.")
            return Ok(None)

        # Only the requested bibstring flavour is filled in.
        name_kwargs = {str(bibstring_type): remove_extra_whitespace(text)}

        return Ok(
            Journal(
                name=BibStringAttr(**name_kwargs),
                issn_electronic="",
                issn_print="",
            )
        )

    except Exception as e:
        return Err(
            message=f"Error parsing journal string '{text}': {e}",
            code=-1,
            error_type=e.__class__.__name__,
            error_trace=traceback.format_exc(),
        )
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
from philoch_bib_sdk.logic.models import BaseRenderable, BaseNamedRenderable, TBibString
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def format_renderable(
|
|
5
|
+
renderable: BaseRenderable | BaseNamedRenderable,
|
|
6
|
+
bibstring_type: TBibString,
|
|
7
|
+
) -> str:
|
|
8
|
+
"""
|
|
9
|
+
Format a base renderable object into a string representation.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
match renderable:
|
|
13
|
+
|
|
14
|
+
case BaseRenderable(text, id):
|
|
15
|
+
if not text:
|
|
16
|
+
return ""
|
|
17
|
+
return f"{getattr(text, bibstring_type)}"
|
|
18
|
+
|
|
19
|
+
case BaseNamedRenderable(name, id):
|
|
20
|
+
if not name:
|
|
21
|
+
return ""
|
|
22
|
+
return f"{getattr(name, bibstring_type)}"
|
|
23
|
+
|
|
24
|
+
case _:
|
|
25
|
+
raise TypeError("Invalid type for renderable")
|
|
@@ -0,0 +1,315 @@
|
|
|
1
|
+
from typing import Tuple, TypedDict, Unpack, Literal
|
|
2
|
+
from philoch_bib_sdk.logic.models import BibItem, PageAttr, KeywordsAttr, BibItemDateAttr, BibKeyAttr, Keyword
|
|
3
|
+
|
|
4
|
+
from philoch_bib_sdk.logic.literals import TBasicPubState, TBibTeXEntryType, TEpoch, TLanguageID, TPubState
|
|
5
|
+
from philoch_bib_sdk.logic.models import (
|
|
6
|
+
Author,
|
|
7
|
+
BaseNamedRenderable,
|
|
8
|
+
BaseRenderable,
|
|
9
|
+
BibItem,
|
|
10
|
+
BibStringAttr,
|
|
11
|
+
Journal,
|
|
12
|
+
Keyword,
|
|
13
|
+
)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class BibStringArgs(TypedDict, total=False):
    """
    Keyword arguments accepted by `default_bib_string`. Every key is optional (total=False).
    """

    # One entry per BibStringAttr field of the same name.
    latex: str
    unicode: str
    simplified: str
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def default_bib_string(**kwargs: Unpack[BibStringArgs]) -> BibStringAttr:
    """
    Build a BibStringAttr from any subset of its fields. Fields that are not given default to the empty string.
    """
    latex = kwargs.get("latex", "")
    unicode = kwargs.get("unicode", "")
    simplified = kwargs.get("simplified", "")

    return BibStringAttr(latex=latex, unicode=unicode, simplified=simplified)
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
############
|
|
34
|
+
# Base Renderables
|
|
35
|
+
############
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class BaseRenderableArgs(TypedDict, total=False):
    """
    Keyword arguments accepted by `default_base_renderable`. Every key is optional (total=False).
    """

    # Passed on to default_bib_string to build the `text` field.
    text: BibStringArgs
    id: int | None
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def default_base_renderable(**kwargs: Unpack[BaseRenderableArgs]) -> BaseRenderable:
    """
    Build a BaseRenderable from any subset of its fields.

    A missing "text" is built by `default_bib_string` with no arguments; a missing "id" becomes None.
    """
    text_args = kwargs.get("text", {})
    identifier = kwargs.get("id", None)

    return BaseRenderable(text=default_bib_string(**text_args), id=identifier)
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
class BaseNamedRenderableArgs(TypedDict, total=False):
    """
    Keyword arguments accepted by `default_base_named_renderable`. Every key is optional (total=False).
    """

    # Passed on to default_bib_string to build the `name` field.
    name: BibStringArgs
    id: int | None
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def default_base_named_renderable(**kwargs: Unpack[BaseNamedRenderableArgs]) -> BaseNamedRenderable:
    """
    Build a BaseNamedRenderable from any subset of its fields.

    A missing "name" is built by `default_bib_string` with no arguments; a missing "id" becomes None.
    """
    name_args = kwargs.get("name", {})
    identifier = kwargs.get("id", None)

    return BaseNamedRenderable(name=default_bib_string(**name_args), id=identifier)
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
############
|
|
69
|
+
# Author
|
|
70
|
+
############
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
class AuthorArgs(TypedDict, total=False):
    """
    Keyword arguments accepted by `default_author`. Every key is optional (total=False).
    """

    # Name parts; each one is passed on to default_bib_string.
    given_name: BibStringArgs
    family_name: BibStringArgs
    mononym: BibStringArgs
    shorthand: BibStringArgs
    famous_name: BibStringArgs
    # Already-built BibItem objects, handed to Author unchanged.
    publications: Tuple[BibItem, ...]
    id: int | None
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def default_author(**kwargs: Unpack[AuthorArgs]) -> Author:
    """
    Build an Author from any subset of its fields.

    Missing name parts are built by `default_bib_string` with no arguments, missing publications become an empty tuple and a missing id becomes None.
    """
    given_name = default_bib_string(**kwargs.get("given_name", {}))
    family_name = default_bib_string(**kwargs.get("family_name", {}))
    mononym = default_bib_string(**kwargs.get("mononym", {}))
    shorthand = default_bib_string(**kwargs.get("shorthand", {}))
    famous_name = default_bib_string(**kwargs.get("famous_name", {}))

    return Author(
        given_name=given_name,
        family_name=family_name,
        mononym=mononym,
        shorthand=shorthand,
        famous_name=famous_name,
        publications=kwargs.get("publications", ()),
        id=kwargs.get("id", None),
    )
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
############
|
|
100
|
+
# Journal
|
|
101
|
+
############
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
class JournalArgs(TypedDict, total=False):
    """
    Keyword arguments accepted by `default_journal`. Every key is optional (total=False).
    """

    # Passed on to default_bib_string to build the journal name.
    name: BibStringArgs
    issn_print: str
    issn_electronic: str
    id: int | None
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def default_journal(**kwargs: Unpack[JournalArgs]) -> Journal | None:
    """
    Build a Journal from any subset of its fields, or return None when called without any argument.

    A missing name is built by `default_bib_string` with no arguments, missing ISSNs become empty strings and a missing id becomes None.
    """
    # No arguments at all means "no journal", not "a journal with empty fields".
    if not kwargs:
        return None

    journal_name = default_bib_string(**kwargs.get("name", {}))

    return Journal(
        name=journal_name,
        issn_print=kwargs.get("issn_print", ""),
        issn_electronic=kwargs.get("issn_electronic", ""),
        id=kwargs.get("id", None),
    )
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
############
|
|
127
|
+
# Support Args
|
|
128
|
+
############
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
class PageArgs(TypedDict, total=False):
    """
    Keyword arguments accepted by `default_page`. Every key is optional (total=False).
    """

    start: str
    end: str
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def default_page(**kwargs: Unpack[PageArgs]) -> PageAttr:
    """
    Build a PageAttr from any subset of its fields. Fields that are not given default to the empty string.
    """
    first_page = kwargs.get("start", "")
    last_page = kwargs.get("end", "")

    return PageAttr(start=first_page, end=last_page)
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
class KeywordsArgs(TypedDict, total=False):
    """
    Keyword arguments accepted by `default_keywords`. Every key is optional (total=False).
    """

    # Each value becomes the `name` of a Keyword object.
    level_1: str
    level_2: str
    level_3: str
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
def default_keywords(**kwargs: Unpack[KeywordsArgs]) -> KeywordsAttr:
    """
    Build a KeywordsAttr whose three levels are Keyword objects named after the given strings. Levels that are not given get an empty name.
    """
    first = Keyword(name=kwargs.get("level_1", ""))
    second = Keyword(name=kwargs.get("level_2", ""))
    third = Keyword(name=kwargs.get("level_3", ""))

    return KeywordsAttr(level_1=first, level_2=second, level_3=third)
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
class BibItemDateArgs(TypedDict, total=False):
    """
    Keyword arguments accepted by `default_bib_item_date`. Every key is optional (total=False).
    """

    year: int
    # NOTE(review): presumably the second year of a "YYYY-YYYY" / "YYYY/YYYY" date - confirm against BibItemDateAttr.
    year_part_2_hyphen: int | None
    year_part_2_slash: int | None
    month: int | None
    day: int | None
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
def default_bib_item_date(**kwargs: Unpack[BibItemDateArgs]) -> BibItemDateAttr:
    """
    Build a BibItemDateAttr from any subset of its fields. A missing year defaults to 0; every other missing field defaults to None.
    """
    year = kwargs.get("year", 0)

    return BibItemDateAttr(
        year=year,
        year_part_2_hyphen=kwargs.get("year_part_2_hyphen", None),
        year_part_2_slash=kwargs.get("year_part_2_slash", None),
        month=kwargs.get("month", None),
        day=kwargs.get("day", None),
    )
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
def parse_date(date: BibItemDateArgs | Literal["no date"]) -> BibItemDateAttr | Literal["no date"]:
    """
    Convert date arguments into a BibItemDateAttr. Anything that is not a dict is answered with the literal "no date".
    """
    if not isinstance(date, dict):
        return "no date"

    return default_bib_item_date(**date)
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
class BibKeyArgs(TypedDict, total=False):
    """
    Keyword arguments accepted by `default_bib_key`. Every key is optional (total=False).
    """

    first_author: str
    other_authors: str
    # Either a year or one of the basic publication states.
    date: int | TBasicPubState
    date_suffix: str
|
|
187
|
+
|
|
188
|
+
|
|
189
|
+
def default_bib_key(**kwargs: Unpack[BibKeyArgs]) -> BibKeyAttr:
    """
    Build a BibKeyAttr from any subset of its fields. Fields that are not given default to the empty string.
    """
    first_author = kwargs.get("first_author", "")
    other_authors = kwargs.get("other_authors", "")
    date = kwargs.get("date", "")
    date_suffix = kwargs.get("date_suffix", "")

    return BibKeyAttr(
        first_author=first_author,
        other_authors=other_authors,
        date=date,
        date_suffix=date_suffix,
    )
|
|
197
|
+
|
|
198
|
+
|
|
199
|
+
############
|
|
200
|
+
# BibItem Args
|
|
201
|
+
############
|
|
202
|
+
|
|
203
|
+
|
|
204
|
+
class BibItemArgs(TypedDict, total=False):
    """
    Keyword arguments accepted by `default_bib_item`. Every key is optional (total=False).

    Keys with a leading underscore are mapped by `default_bib_item` onto the BibItem field of the same name without the underscore.
    """

    _to_do_general: str
    _change_request: str
    entry_type: TBibTeXEntryType
    bibkey: BibKeyArgs
    author: Tuple[AuthorArgs, ...]
    editor: Tuple[AuthorArgs, ...]
    options: Tuple[str, ...]
    # Either structured date arguments or the literal marker for an undated item.
    date: BibItemDateArgs | Literal["no date"]
    pubstate: TPubState
    title: BibStringArgs
    booktitle: BibStringArgs
    # crossref: dict
    journal: JournalArgs
    volume: str
    number: str
    pages: Tuple[PageArgs, ...]
    eid: str
    series: BaseNamedRenderableArgs
    address: BibStringArgs
    institution: BibStringArgs
    school: BibStringArgs
    publisher: BibStringArgs
    type: BibStringArgs
    edition: int
    note: BibStringArgs
    issuetitle: BibStringArgs
    _guesteditor: Tuple[AuthorArgs, ...]
    _extra_note: BibStringArgs
    urn: str
    eprint: str
    doi: str
    url: str
    _kws: KeywordsArgs
    _epoch: TEpoch
    _person: AuthorArgs
    _comm_for_profile_bib: str
    _langid: TLanguageID
    _lang_der: str
    _further_refs: Tuple[BibKeyArgs, ...]
    _depends_on: Tuple[BibKeyArgs, ...]
    _dltc_num: int
    _spec_interest: str
    _note_perso: str
    _note_stock: str
    _note_status: str
    _num_inwork_coll: int
    _num_inwork: str
    _num_coll: int
    _dltc_copyediting_note: str
    _note_missing: str
    _num_sort: int
    id: int
    _bib_info_source: str
|
|
258
|
+
|
|
259
|
+
|
|
260
|
+
def default_bib_item(**kwargs: Unpack[BibItemArgs]) -> BibItem:
    """
    Build a BibItem from any subset of the keys declared in BibItemArgs.

    Fallbacks used for keys that are not given:
    - plain string fields: ""
    - nested fields built by a default_* helper (bibkey, title, booktitle, series, address, institution, school, publisher, type, note, issuetitle, extra_note, kws, person): ""
    - journal: None
    - tuple fields (author, editor, options, pages, guesteditor, further_refs, depends_on): ()
    - entry_type: "UNKNOWN"; date: "no date"
    - edition, dltc_num, num_inwork_coll, num_coll, num_sort, id: None

    crossref cannot be supplied through BibItemArgs and is always set to "".
    """
    return BibItem(
        # Underscore-prefixed keys map onto the field of the same name without the underscore.
        to_do_general=kwargs.get("_to_do_general", ""),
        change_request=kwargs.get("_change_request", ""),
        entry_type=kwargs.get("entry_type", "UNKNOWN"),
        # Nested attributes are only built when their key is present; otherwise the field stays "".
        bibkey=default_bib_key(**kwargs.get("bibkey", {})) if "bibkey" in kwargs else "",
        author=tuple(default_author(**a) for a in kwargs.get("author", ())),
        editor=tuple(default_author(**e) for e in kwargs.get("editor", ())),
        options=kwargs.get("options", ()),
        date=parse_date(kwargs.get("date", "no date")),
        pubstate=kwargs.get("pubstate", ""),
        title=default_bib_string(**kwargs.get("title", {})) if "title" in kwargs else "",
        booktitle=default_bib_string(**kwargs.get("booktitle", {})) if "booktitle" in kwargs else "",
        crossref="",  # Crossref is not defined in the provided context, so we leave it as an empty string
        # default_journal itself returns None for an empty dict, so journal={} also ends up as None.
        journal=default_journal(**kwargs.get("journal", {})) if "journal" in kwargs else None,
        volume=kwargs.get("volume", ""),
        number=kwargs.get("number", ""),
        pages=tuple(default_page(**p) for p in kwargs.get("pages", ())),
        eid=kwargs.get("eid", ""),
        series=default_base_named_renderable(**kwargs.get("series", {})) if "series" in kwargs else "",
        address=default_bib_string(**kwargs.get("address", {})) if "address" in kwargs else "",
        institution=default_bib_string(**kwargs.get("institution", {})) if "institution" in kwargs else "",
        school=default_bib_string(**kwargs.get("school", {})) if "school" in kwargs else "",
        publisher=default_bib_string(**kwargs.get("publisher", {})) if "publisher" in kwargs else "",
        type=default_bib_string(**kwargs.get("type", {})) if "type" in kwargs else "",
        # No default given: a missing edition is passed on as None.
        edition=kwargs.get("edition"),
        note=default_bib_string(**kwargs.get("note", {})) if "note" in kwargs else "",
        issuetitle=default_bib_string(**kwargs.get("issuetitle", {})) if "issuetitle" in kwargs else "",
        guesteditor=tuple(default_author(**a) for a in kwargs.get("_guesteditor", ())),
        extra_note=default_bib_string(**kwargs.get("_extra_note", {})) if "_extra_note" in kwargs else "",
        urn=kwargs.get("urn", ""),
        eprint=kwargs.get("eprint", ""),
        doi=kwargs.get("doi", ""),
        url=kwargs.get("url", ""),
        kws=default_keywords(**kwargs.get("_kws", {})) if "_kws" in kwargs else "",
        epoch=kwargs.get("_epoch", ""),
        person=default_author(**kwargs.get("_person", {})) if "_person" in kwargs else "",
        comm_for_profile_bib=kwargs.get("_comm_for_profile_bib", ""),
        langid=kwargs.get("_langid", ""),
        lang_der=kwargs.get("_lang_der", ""),
        further_refs=tuple(default_bib_key(**b) for b in kwargs.get("_further_refs", ())),
        depends_on=tuple(default_bib_key(**b) for b in kwargs.get("_depends_on", ())),
        # The numeric fields below have no default either and fall back to None.
        dltc_num=kwargs.get("_dltc_num"),
        spec_interest=kwargs.get("_spec_interest", ""),
        note_perso=kwargs.get("_note_perso", ""),
        note_stock=kwargs.get("_note_stock", ""),
        note_status=kwargs.get("_note_status", ""),
        num_inwork_coll=kwargs.get("_num_inwork_coll"),
        num_inwork=kwargs.get("_num_inwork", ""),
        num_coll=kwargs.get("_num_coll"),
        dltc_copyediting_note=kwargs.get("_dltc_copyediting_note", ""),
        note_missing=kwargs.get("_note_missing", ""),
        num_sort=kwargs.get("_num_sort"),
        id=kwargs.get("id"),
        bib_info_source=kwargs.get("_bib_info_source", ""),
    )
|
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
from aletk.utils import get_logger, fuzzy_match_score, remove_extra_whitespace
|
|
2
|
+
|
|
3
|
+
from typing import TypedDict
|
|
4
|
+
from philoch_bib_sdk.converters.plaintext.author.formatter import format_author
|
|
5
|
+
from philoch_bib_sdk.logic.models import BibItem, BibItemDateAttr, TBibString
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
logger = get_logger(__name__)
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class BibItemScore(TypedDict):
    """
    Score breakdown produced by `compare_bibitems`.
    """

    # Sum of the three partial scores below.
    score: int
    score_title: int
    score_author: int
    score_year: int
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class ScoredBibItems(TypedDict):
    """
    Result of `compare_bibitems`: the two compared items together with their score breakdown.
    """

    reference: BibItem
    subject: BibItem
    score: BibItemScore
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
# Lower-case keywords checked by _score_title: a keyword found in only one of the two titles lowers the title score.
UNDESIRED_TITLE_KEYWORDS = ["errata", "review"]
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def _score_title(title_1: str, title_2: str) -> int:
    """
    Score how similar two titles are.

    Both titles are whitespace-normalized and lower-cased, then compared with `fuzzy_match_score`.
    - Each keyword of UNDESIRED_TITLE_KEYWORDS present in exactly one of the titles costs 50 points.
    - When no such keyword mismatch exists and either the fuzzy score is above 85 or one title contains the other, 100 points are added.

    Raises ValueError if either title is empty after normalization.
    """
    left, right = (remove_extra_whitespace(title).lower() for title in (title_1, title_2))

    if not (left and right):
        raise ValueError("Titles cannot be empty for comparison")

    score = fuzzy_match_score(left, right)

    # Keywords that appear in one title but not in the other.
    mismatched_kws = {kw for kw in UNDESIRED_TITLE_KEYWORDS if (kw in left) != (kw in right)}

    if mismatched_kws:
        # A mismatch rules out the bonus and is penalized once per keyword.
        return score - 50 * len(mismatched_kws)

    # Containment might catch cases in which one title doesn't include the subtitle.
    if score > 85 or left in right or right in left:
        score += 100

    return score
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def _score_author(author_1_full_name: str, author_2_full_name: str) -> int:
    """
    Score how similar two author names are.

    Both names are whitespace-normalized and compared with `fuzzy_match_score`; a fuzzy score above 85 earns 100 extra points.

    Raises ValueError if either name is empty after normalization.
    """
    first = remove_extra_whitespace(author_1_full_name)
    second = remove_extra_whitespace(author_2_full_name)

    if not (first and second):
        raise ValueError("Authors cannot be empty for comparison")

    similarity = fuzzy_match_score(first, second)

    return similarity + 100 if similarity > 85 else similarity
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def _score_year(year_1: int, year_2: int, range_offset: int = 1) -> int:
|
|
80
|
+
|
|
81
|
+
if not year_1 or not year_2:
|
|
82
|
+
raise ValueError("Years cannot be empty for comparison")
|
|
83
|
+
|
|
84
|
+
if not any(isinstance(year, int) for year in (year_1, year_2)):
|
|
85
|
+
if year_1 == year_2:
|
|
86
|
+
return 100
|
|
87
|
+
else:
|
|
88
|
+
return 0
|
|
89
|
+
|
|
90
|
+
range = [year_1 - range_offset, year_1, year_1 + range_offset]
|
|
91
|
+
|
|
92
|
+
if year_2 in range:
|
|
93
|
+
return 100
|
|
94
|
+
else:
|
|
95
|
+
return 0
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def compare_bibitems(reference: BibItem, subject: BibItem, bibstring_type: TBibString) -> ScoredBibItems:
    """
    Score how well `subject` matches `reference`, based on title, author and year.

    Title and author are read in the `bibstring_type` flavour and scored by `_score_title` and `_score_author`.
    The year is scored by `_score_year` only when both items carry a BibItemDateAttr date; otherwise the year score is 0.
    The total score is the sum of the three partial scores.
    """
    logger.debug(f"Scoring bibitems: {reference}, {subject}")

    title_score = _score_title(
        getattr(reference.title, bibstring_type),
        getattr(subject.title, bibstring_type),
    )

    author_score = _score_author(
        format_author(reference.author, bibstring_type),
        format_author(subject.author, bibstring_type),
    )

    both_dated = isinstance(reference.date, BibItemDateAttr) and isinstance(subject.date, BibItemDateAttr)
    year_score = _score_year(reference.date.year, subject.date.year) if both_dated else 0

    breakdown: BibItemScore = {
        "score": title_score + author_score + year_score,
        "score_title": title_score,
        "score_author": author_score,
        "score_year": year_score,
    }

    return {"reference": reference, "subject": subject, "score": breakdown}
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
from typing import Callable, Dict, Tuple
|
|
2
|
+
from philoch_bib_sdk.converters.plaintext.journal.formatter import format_journal
|
|
3
|
+
from philoch_bib_sdk.logic.models import BibItem
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
# Plain-string aliases naming the parts of an index entry.
type TJournalName = str

type TVolume = str

type TNumber = str

type TBibkey = str


# Index mapping a (journal name, volume, number) triple to a bibkey.
type TJournalBibkeyIndex = Dict[Tuple[TJournalName, TVolume, TNumber], TBibkey]  # (journal, volume, number)  # bibkey
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def get_bibkey_by_journal_volume_number(index: TJournalBibkeyIndex, subject: BibItem) -> TBibkey:
    """
    Look up the bibkey of a BibItem in an index keyed by (journal name, volume, number).

    The journal name is taken in its "latex" flavour.
    Raises ValueError if the journal name, the volume or the number is an empty string, and KeyError if the triple is not in the index.
    """
    journal = format_journal(subject.journal, bibstring_type="latex")
    volume, number = subject.volume, subject.number

    if journal == "" or volume == "" or number == "":
        raise ValueError(
            f"Expected subject bibitem journal with non-empty journal, volume, and number. Found [[ journal: {journal}; volume: {volume}; number: {number} ]] instead."
        )

    lookup_key = (journal, volume, number)
    return index[lookup_key]
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
# Signature of a function that takes a path and returns a journal/volume/number -> bibkey index.
type TReadIndex = Callable[
    [
        str,  # path to the index file
    ],
    TJournalBibkeyIndex,
]
|
|
@@ -1,7 +1,6 @@
|
|
|
1
1
|
from typing import Literal
|
|
2
2
|
|
|
3
3
|
type TBibTeXEntryType = Literal[
|
|
4
|
-
"",
|
|
5
4
|
"article",
|
|
6
5
|
"book",
|
|
7
6
|
"incollection",
|
|
@@ -12,15 +11,20 @@ type TBibTeXEntryType = Literal[
|
|
|
12
11
|
"proceedings",
|
|
13
12
|
"techreport",
|
|
14
13
|
"unpublished",
|
|
14
|
+
"UNKNOWN",
|
|
15
15
|
]
|
|
16
16
|
|
|
17
|
-
type
|
|
17
|
+
type TBasicPubState = Literal[
|
|
18
18
|
"",
|
|
19
19
|
"unpub",
|
|
20
|
+
"forthcoming",
|
|
21
|
+
]
|
|
22
|
+
|
|
23
|
+
type TPubState = Literal[
|
|
24
|
+
TBasicPubState,
|
|
20
25
|
"inwork",
|
|
21
26
|
"submitted",
|
|
22
27
|
"published",
|
|
23
|
-
"forthcoming",
|
|
24
28
|
]
|
|
25
29
|
|
|
26
30
|
type TLanguageID = Literal[
|