philoch-bib-sdk 0.3.9__cp313-cp313-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- philoch_bib_sdk/__init__.py +0 -0
- philoch_bib_sdk/_rust.cp313-win_amd64.pyd +0 -0
- philoch_bib_sdk/adapters/io/__init__.py +115 -0
- philoch_bib_sdk/adapters/io/csv/__init__.py +308 -0
- philoch_bib_sdk/adapters/io/ods/__init__.py +145 -0
- philoch_bib_sdk/adapters/plaintext/bibitem_reader.py +0 -0
- philoch_bib_sdk/adapters/tabular_data/read_journal_volume_number_index.py +58 -0
- philoch_bib_sdk/converters/latex.py +6 -0
- philoch_bib_sdk/converters/plaintext/author/formatter.py +34 -0
- philoch_bib_sdk/converters/plaintext/author/parser.py +83 -0
- philoch_bib_sdk/converters/plaintext/bib_string_formatter.py +8 -0
- philoch_bib_sdk/converters/plaintext/bibitem/bibkey_formatter.py +21 -0
- philoch_bib_sdk/converters/plaintext/bibitem/bibkey_parser.py +158 -0
- philoch_bib_sdk/converters/plaintext/bibitem/date_formatter.py +37 -0
- philoch_bib_sdk/converters/plaintext/bibitem/date_parser.py +62 -0
- philoch_bib_sdk/converters/plaintext/bibitem/formatter.py +182 -0
- philoch_bib_sdk/converters/plaintext/bibitem/pages_formatter.py +13 -0
- philoch_bib_sdk/converters/plaintext/bibitem/pages_parser.py +63 -0
- philoch_bib_sdk/converters/plaintext/bibitem/parser.py +415 -0
- philoch_bib_sdk/converters/plaintext/journal/formatter.py +25 -0
- philoch_bib_sdk/converters/plaintext/journal/parser.py +36 -0
- philoch_bib_sdk/converters/plaintext/shared/renderable_formatter.py +25 -0
- philoch_bib_sdk/interfaces/cli/__init__.py +3 -0
- philoch_bib_sdk/interfaces/cli/fuzzy_matching.py +135 -0
- philoch_bib_sdk/logic/__init__.py +39 -0
- philoch_bib_sdk/logic/default_models.py +315 -0
- philoch_bib_sdk/logic/functions/__init__.py +31 -0
- philoch_bib_sdk/logic/functions/comparator.py +414 -0
- philoch_bib_sdk/logic/functions/fuzzy_matcher.py +796 -0
- philoch_bib_sdk/logic/functions/journal_article_matcher.py +44 -0
- philoch_bib_sdk/logic/literals.py +98 -0
- philoch_bib_sdk/logic/models.py +366 -0
- philoch_bib_sdk/logic/models_staging.py +173 -0
- philoch_bib_sdk/procedures/fuzzy_matching.py +112 -0
- philoch_bib_sdk/py.typed +0 -0
- philoch_bib_sdk/rust_scorer/Cargo.lock +232 -0
- philoch_bib_sdk/rust_scorer/Cargo.toml +26 -0
- philoch_bib_sdk/rust_scorer/pyproject.toml +15 -0
- philoch_bib_sdk/rust_scorer/rust_scorer.pyi +65 -0
- philoch_bib_sdk/rust_scorer/src/lib.rs +362 -0
- philoch_bib_sdk-0.3.9.dist-info/METADATA +15 -0
- philoch_bib_sdk-0.3.9.dist-info/RECORD +44 -0
- philoch_bib_sdk-0.3.9.dist-info/WHEEL +4 -0
- philoch_bib_sdk-0.3.9.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
import traceback
|
|
2
|
+
from typing import Tuple
|
|
3
|
+
from aletk.ResultMonad import Ok, Err
|
|
4
|
+
from aletk.utils import get_logger, remove_extra_whitespace
|
|
5
|
+
from philoch_bib_sdk.logic.models import Author, BibStringAttr, TBibString
|
|
6
|
+
|
|
7
|
+
lgr = get_logger(__name__)
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def _parse_normalize(text: str) -> Tuple[str, str, str]:
|
|
11
|
+
"""
|
|
12
|
+
Return a tuple of (given_name, family_name, mononym).
|
|
13
|
+
|
|
14
|
+
Handles formats:
|
|
15
|
+
- "Mononym" -> ("", "", "Mononym")
|
|
16
|
+
- "Family, Given" -> ("Given", "Family", "")
|
|
17
|
+
- "Family, Suffix, Given" -> ("Given", "Family Suffix", "")
|
|
18
|
+
|
|
19
|
+
Suffixes like Jr., Sr., III, etc. are combined with the family name.
|
|
20
|
+
"""
|
|
21
|
+
parts = tuple(remove_extra_whitespace(part) for part in text.split(","))
|
|
22
|
+
|
|
23
|
+
if len(parts) == 0:
|
|
24
|
+
return ("", "", "")
|
|
25
|
+
|
|
26
|
+
elif len(parts) == 1:
|
|
27
|
+
# Mononym
|
|
28
|
+
return ("", "", parts[0])
|
|
29
|
+
|
|
30
|
+
elif len(parts) == 2:
|
|
31
|
+
# Full name: "Family, Given"
|
|
32
|
+
return (parts[1], parts[0], "")
|
|
33
|
+
|
|
34
|
+
elif len(parts) == 3:
|
|
35
|
+
# Full name with suffix: "Family, Suffix, Given"
|
|
36
|
+
# Combine family name and suffix (e.g., "Belnap Jr.")
|
|
37
|
+
family_with_suffix = f"{parts[0]} {parts[1]}"
|
|
38
|
+
return (parts[2], family_with_suffix, "")
|
|
39
|
+
|
|
40
|
+
else:
|
|
41
|
+
raise ValueError(f"Unexpected number of author parts found in '{text}': '{parts}'. Expected 3 or less.")
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def _parse_single(normalized_name_parts: Tuple[str, str, str], bib_string_type: TBibString) -> Author:
    """
    Build an Author from a normalized (given_name, family_name, mononym) triple.

    Each component is stored in a BibStringAttr under the field named by
    `bib_string_type`. Shorthand and famous name are created as argument-less
    BibStringAttr instances, and publications is an empty tuple.
    """
    given, family, mononym = normalized_name_parts

    def _as_attr(value: str) -> BibStringAttr:
        # The target field name is only known at runtime, hence the keyword expansion.
        return BibStringAttr(**{str(bib_string_type): value})

    return Author(
        given_name=_as_attr(given),
        family_name=_as_attr(family),
        mononym=_as_attr(mononym),
        shorthand=BibStringAttr(),
        famous_name=BibStringAttr(),
        publications=(),
    )
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def parse_author(text: str, bibstring_type: TBibString) -> Ok[Tuple[Author, ...]] | Err:
    """
    Return either a tuple of Author objects or an error.

    The input string is expected to be an ' and '-separated list of authors, with each
    author in the format "family_name, given_name", "family_name, suffix, given_name"
    or "mononym". An empty string yields an empty tuple.

    Any exception raised while parsing is caught and returned as an Err with
    error_type "ParsingError" and the formatted traceback.
    """
    import re  # local import: this module does not import `re` at file level

    try:
        if text == "":
            lgr.debug("Empty author string, returning empty tuple.")
            return Ok(())

        # Split only on the word "and" surrounded by whitespace. Splitting on the bare
        # substring would cut names that merely contain it ("Anderson", "Sandra", ...).
        raw_authors = re.split(r"\s+and\s+", text)

        authors = tuple(
            _parse_single(_parse_normalize(remove_extra_whitespace(raw_author)), bibstring_type)
            for raw_author in raw_authors
        )

        return Ok(authors)

    except Exception as e:
        return Err(
            message=f"Could not parse 'author' field with value [[ {text} ]]. {e.__class__.__name__}: {e}",
            code=-1,
            error_type="ParsingError",
            error_trace=f"{traceback.format_exc()}",
        )
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
from philoch_bib_sdk.logic.models import BibStringAttr, MaybeStr, TBibString
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def format_bib_string_attr(bib_string: MaybeStr[BibStringAttr], bibstring_type: TBibString) -> str:
    """
    Return the `bibstring_type` attribute of `bib_string` as a string.

    A falsy `bib_string` (such as the empty string) yields "", as does an object
    that has no attribute named `bibstring_type`.
    """
    if bib_string:
        return getattr(bib_string, bibstring_type, "")
    return ""
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
from philoch_bib_sdk.logic.models import BibKeyAttr, MaybeStr
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def format_bibkey(bibkey: MaybeStr[BibKeyAttr]) -> str:
    """
    Render a bibkey as "<authors>:<year>"; the empty string is returned unchanged.

    The author part is the first author, followed by "-<other_authors>" when other
    authors are present. For an integer date the suffix is appended directly
    ("2020a"); for any other date a non-empty suffix is appended after a hyphen.
    """
    if bibkey == "":
        return ""

    if bibkey.other_authors:
        names = (bibkey.first_author, bibkey.other_authors)
    else:
        names = (bibkey.first_author,)
    author_part = "-".join(names)

    date, suffix = bibkey.date, bibkey.date_suffix
    if isinstance(date, int):
        year_part = f"{date}{suffix}"
    elif suffix:
        year_part = f"{date}-{suffix}"
    else:
        year_part = date

    return f"{author_part}:{year_part}"
|
|
@@ -0,0 +1,158 @@
|
|
|
1
|
+
import traceback
|
|
2
|
+
from typing import Tuple
|
|
3
|
+
from aletk.ResultMonad import Ok, Err
|
|
4
|
+
from aletk.utils import get_logger
|
|
5
|
+
from philoch_bib_sdk.logic.literals import TBasicPubState
|
|
6
|
+
from philoch_bib_sdk.logic.models import BibKeyAttr
|
|
7
|
+
|
|
8
|
+
lgr = get_logger(__name__)
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def _parse_bibkey_author(text: str) -> Tuple[str, str]:
|
|
12
|
+
|
|
13
|
+
author_parts = text.split("-")
|
|
14
|
+
|
|
15
|
+
if len(author_parts) == 1:
|
|
16
|
+
first_author = author_parts[0]
|
|
17
|
+
other_authors = ""
|
|
18
|
+
elif len(author_parts) == 2:
|
|
19
|
+
first_author = author_parts[0]
|
|
20
|
+
other_authors = author_parts[1]
|
|
21
|
+
else:
|
|
22
|
+
raise ValueError(
|
|
23
|
+
f"Unexpected bibkey author parts in [[ {text} ]]. Found [[ {author_parts} ]]. Expected 1 author, or 2 authors separated by '-'."
|
|
24
|
+
)
|
|
25
|
+
|
|
26
|
+
return first_author, other_authors
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def _parse_bibkey_date_int_part(text: str) -> Tuple[int | None, int | None]:
|
|
30
|
+
|
|
31
|
+
char_index_type_d = {i: (char, char.isdigit()) for i, char in enumerate(text)}
|
|
32
|
+
|
|
33
|
+
year_l: list[str] = []
|
|
34
|
+
int_breakpoint = None
|
|
35
|
+
for i, (char, is_digit) in char_index_type_d.items():
|
|
36
|
+
if is_digit:
|
|
37
|
+
year_l.append(char)
|
|
38
|
+
int_breakpoint = i
|
|
39
|
+
else:
|
|
40
|
+
break
|
|
41
|
+
|
|
42
|
+
if year_l != []:
|
|
43
|
+
year_int = int(f"{''.join(year_l)}")
|
|
44
|
+
else:
|
|
45
|
+
year_int = None
|
|
46
|
+
|
|
47
|
+
if year_int and len(f"{year_int}") > 4:
|
|
48
|
+
raise ValueError(f"Unexpected year value in '{text}': is not a valid year or publication state")
|
|
49
|
+
|
|
50
|
+
return year_int, int_breakpoint
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def _parse_bibkey_date_suffix_part(
    date_parts: str, year_int: int | None, int_breakpoint: int | None
) -> Tuple[int | TBasicPubState, str]:
    """
    Resolve the date and the date suffix of a bibkey.

    Args:
        date_parts: the date portion of the bibkey (the text after ":").
        year_int: the integer read from the leading digits, or None if there are none.
        int_breakpoint: index of the last leading digit in `date_parts`, or None.

    Returns:
        (date, suffix). When the date starts with digits, `date` is `year_int` and
        `suffix` is everything after those digits. Otherwise `date` is the publication
        state "unpub" or "forthcoming" and `suffix` is the optional text after a single "-".

    Raises:
        ValueError: if `year_int` and `int_breakpoint` are inconsistent, or if the
            non-numeric date is not a valid publication state.
    """
    # Case 1. The first part of the year is a digit
    if int_breakpoint is not None:
        if year_int is None:
            raise ValueError(
                "Unexpected case! year_int is None but int_breakpoint is not None. This should not happen."
            )

        return year_int, date_parts[int_breakpoint + 1 :]

    if year_int is not None:
        # Mirror image of the check above: a year was parsed but no breakpoint was recorded.
        raise ValueError("Unexpected case! year_int is not None but int_breakpoint is None. This should not happen.")

    # Case 2. first characters are non-digits
    # has to start with either "unpub" or "forthcoming" then
    if not date_parts.startswith(("forthcoming", "unpub")):
        raise ValueError(f"Unexpected year value in '{date_parts}': it is not a valid publication state.")

    date_suffix_parts = date_parts.split("-")

    if len(date_suffix_parts) == 2:
        suffix = date_suffix_parts[1]
        if not suffix:
            raise ValueError(
                f"Unexpected year value in '{date_parts}': it is not a valid publication state. Expected a suffix after '-'."
            )
    elif len(date_suffix_parts) == 1:
        suffix = ""
    else:
        raise ValueError(f"Unexpected year value in '{date_parts}': it is not a valid publication state.")

    # The prefix test above also lets through longer words (e.g. "unpublished"),
    # so the state itself must match exactly.
    pubstate: TBasicPubState = ""
    if date_suffix_parts[0] == "unpub":
        pubstate = "unpub"
    elif date_suffix_parts[0] == "forthcoming":
        pubstate = "forthcoming"
    else:
        raise ValueError(f"Unexpected year value in '{date_parts}': it is not a valid publication state.")

    return pubstate, suffix
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def parse_bibkey(text: str) -> Ok[BibKeyAttr] | Err:
    """
    Parse a bibkey of the form "<authors>:<date>" into a BibKeyAttr.

    Returns either Ok wrapping the BibKeyAttr, or an Err (error_type "BibkeyError")
    whose message names the underlying exception and whose error_trace holds the
    formatted traceback.
    """

    try:
        bibkey_parts = text.split(":")
        if len(bibkey_parts) != 2:
            raise ValueError(
                f"Unexpected number of bibkey parts in [[ {text} ]]. Expected only two parts separated by ':'."
            )

        author_text, date_text = bibkey_parts

        # Parse the author part
        first_author, other_authors = _parse_bibkey_author(author_text)

        # Parse the date part: leading digits first, then the suffix / publication state
        year_int, int_breakpoint = _parse_bibkey_date_int_part(date_text)
        date, date_suffix = _parse_bibkey_date_suffix_part(date_text, year_int, int_breakpoint)

        return Ok(
            BibKeyAttr(
                first_author=first_author,
                other_authors=other_authors,
                date=date,
                date_suffix=date_suffix,
            )
        )

    except Exception as e:
        # Include the cause in the message: callers such as hard_parse_bibkey only
        # forward `message`, so without it the reason for the failure is lost.
        error_message = f"Could not parse bibkey for '{text}'. {e.__class__.__name__}: {e}"

        return Err(
            message=error_message,
            code=-1,
            error_type="BibkeyError",
            error_trace=f"{traceback.format_exc()}",
        )
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
def hard_parse_bibkey(text: str) -> BibKeyAttr:
    """
    Parse a bibkey and return the BibKeyAttr directly, raising instead of returning an Err.

    This is used for testing purposes only.

    Raises:
        ValueError: if `parse_bibkey` returns an Err; its message is included.
    """
    outcome = parse_bibkey(text)

    if not isinstance(outcome, Err):
        return outcome.out

    raise ValueError(f"Could not hard parse '{text}' as bibkey: {outcome.message}")
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
from typing import Literal
|
|
2
|
+
from philoch_bib_sdk.logic.models import VALID_DATE_FORMATS, BibItemDateAttr
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
def format_date(date: BibItemDateAttr | Literal["no date"]) -> str:
    """
    Render a date attribute as text.

    Supported shapes:
    - "no date"                      -> "no date"
    - year only                      -> "<year>"
    - year, month and day            -> "<year>-<MM>-<DD>" (month/day zero-padded to 2)
    - year and month                 -> "<year>-<MM>"
    - year and hyphenated second year -> "<year>-<year2>"
    - year and slashed second year    -> "<year>/<year2>"

    Raises:
        ValueError: for any other combination of fields.
    """
    if date == "no date":
        return "no date"

    if isinstance(date, BibItemDateAttr):
        year = date.year
        hyphen_year = date.year_part_2_hyphen
        slash_year = date.year_part_2_slash
        month = date.month
        day = date.day

        if hyphen_year is None and slash_year is None:
            # Plain calendar dates: year, year-month, or year-month-day.
            if month is None and day is None:
                return str(year)
            if month is not None and day is not None:
                return f"{year}-{str(month).zfill(2)}-{str(day).zfill(2)}"
            if month is not None:
                return f"{year}-{str(month).zfill(2)}"
            # A day without a month is not a valid shape; fall through to the error.
        elif month is None and day is None:
            # Year ranges: exactly one of the two second-year fields may be set.
            if slash_year is None:
                return f"{year}-{hyphen_year}"
            if hyphen_year is None:
                return f"{year}/{slash_year}"

    raise ValueError(f"Invalid date format. Expected one of {', '.join(VALID_DATE_FORMATS)}, but found '{date}'.")
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
from aletk.utils import remove_extra_whitespace, get_logger
|
|
2
|
+
from aletk.ResultMonad import Ok, Err
|
|
3
|
+
from typing import Literal
|
|
4
|
+
from philoch_bib_sdk.logic.models import VALID_DATE_FORMATS, BibItemDateAttr
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
lgr = get_logger(__name__)
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def _parse_date(text: str) -> BibItemDateAttr | Literal["no date"]:
|
|
11
|
+
"""
|
|
12
|
+
Parse a single date attribute from a string.
|
|
13
|
+
"""
|
|
14
|
+
text = remove_extra_whitespace(text)
|
|
15
|
+
|
|
16
|
+
if remove_extra_whitespace(text).lower() == "no date":
|
|
17
|
+
return "no date"
|
|
18
|
+
|
|
19
|
+
# Split by potential delimiters (hyphens or slashes)
|
|
20
|
+
parts = text.replace("-", "/").split("/")
|
|
21
|
+
|
|
22
|
+
# Handle the number of parts (could be year, year-year2, year/year_2, year-month-day)
|
|
23
|
+
if len(parts) == 1:
|
|
24
|
+
return BibItemDateAttr(
|
|
25
|
+
year=int(parts[0]), year_part_2_hyphen=None, year_part_2_slash=None, month=None, day=None
|
|
26
|
+
)
|
|
27
|
+
|
|
28
|
+
elif len(parts) == 2 and "-" in text:
|
|
29
|
+
return BibItemDateAttr(
|
|
30
|
+
year=int(parts[0]), year_part_2_hyphen=int(parts[1]), year_part_2_slash=None, month=None, day=None
|
|
31
|
+
)
|
|
32
|
+
|
|
33
|
+
elif len(parts) == 2 and "/" in text:
|
|
34
|
+
return BibItemDateAttr(
|
|
35
|
+
year=int(parts[0]), year_part_2_hyphen=None, year_part_2_slash=int(parts[1]), month=None, day=None
|
|
36
|
+
)
|
|
37
|
+
|
|
38
|
+
elif len(parts) == 3 and "-" in text and len(parts[1]) <= 2 and len(parts[2]) <= 2:
|
|
39
|
+
return BibItemDateAttr(
|
|
40
|
+
year=int(parts[0]), year_part_2_hyphen=None, year_part_2_slash=None, month=int(parts[1]), day=int(parts[2])
|
|
41
|
+
)
|
|
42
|
+
|
|
43
|
+
else:
|
|
44
|
+
raise ValueError(f"Invalid date format found in '{text}'. Expected one of {', '.join(VALID_DATE_FORMATS)}.")
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def parse_date(text: str) -> Ok[BibItemDateAttr | Literal["no date"]] | Err:
    """
    Parse a single date string into a BibItemDateAttr object (or "no date").

    The input is expected to be a single date in one of the formats handled by
    `_parse_date`: "no date", '<year>', '<year>-<year2>', '<year>/<year2>' or
    '<year>-<month>-<day>'.

    Returns Ok with the parsed value, or an Err whose error_type is the name of the
    exception class and whose error_trace holds the formatted traceback.
    """
    import traceback  # local import: this module does not import `traceback` at file level

    try:
        return Ok(_parse_date(text))

    except Exception as e:
        error_message = f"Error parsing date from '{text}': {e}"
        return Err(
            message=error_message,
            code=-1,
            error_type=f"{e.__class__.__name__}",
            # Record the traceback, as the other parsers in this package do.
            error_trace=traceback.format_exc(),
        )
|
|
@@ -0,0 +1,182 @@
|
|
|
1
|
+
from typing import TypedDict
|
|
2
|
+
from aletk.utils import get_logger
|
|
3
|
+
|
|
4
|
+
from philoch_bib_sdk.converters.plaintext.author.formatter import format_author
|
|
5
|
+
from philoch_bib_sdk.converters.plaintext.bib_string_formatter import format_bib_string_attr
|
|
6
|
+
from philoch_bib_sdk.converters.plaintext.bibitem.bibkey_formatter import format_bibkey
|
|
7
|
+
from philoch_bib_sdk.converters.plaintext.bibitem.date_formatter import format_date
|
|
8
|
+
from philoch_bib_sdk.converters.plaintext.bibitem.pages_formatter import format_pages
|
|
9
|
+
from philoch_bib_sdk.converters.plaintext.journal.formatter import format_journal
|
|
10
|
+
from philoch_bib_sdk.logic.literals import TBibTeXEntryType
|
|
11
|
+
from philoch_bib_sdk.logic.models import BibItem
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
lgr = get_logger(__name__)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def format_entry_type(entry_type: TBibTeXEntryType) -> str:
    """
    Format the entry type for the BibItem.

    "UNKNOWN" is returned as is, any other non-empty value is prefixed with "@",
    and an empty value yields "".
    """
    if entry_type == "UNKNOWN":
        return "UNKNOWN"

    return f"@{entry_type}" if entry_type else ""
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class FormattedBibItem(TypedDict):
    """
    Flat, string-only representation of a BibItem, as produced by `format_bibitem`.

    Every key is required (TypedDict's default totality) and every value is a string.
    The declaration order of the keys is kept as is.
    """

    # Workflow notes
    _to_do_general: str
    _change_request: str

    # Identity and contributors
    entry_type: str
    bibkey: str
    author: str
    _author_ids: str
    editor: str
    _editor_ids: str
    author_ids: str
    options: str
    shorthand: str

    # Date and publication state
    date: str
    pubstate: str

    # Titles and containers
    title: str
    _title_unicode: str
    booktitle: str
    crossref: str
    journal: str
    journal_id: str
    volume: str
    number: str
    pages: str
    eid: str
    series: str

    # Publishing details
    address: str
    institution: str
    school: str
    publisher: str
    publisher_id: str
    type: str
    edition: str
    note: str
    _issuetitle: str
    _guesteditor: str
    _extra_note: str

    # Identifiers and links
    urn: str
    eprint: str
    doi: str
    url: str

    # Keywords and classification
    _kw_level1: str
    _kw_level2: str
    _kw_level3: str
    _epoch: str
    _person: str
    _comm_for_profile_bib: str
    _langid: str
    _lang_der: str

    # Cross references
    _further_refs: str
    _depends_on: str

    # Internal bookkeeping
    _dltc_num: str
    _spec_interest: str
    _note_perso: str
    _note_stock: str
    _note_status: str
    _num_inwork_coll: str
    _num_inwork: str
    _num_coll: str
    _dltc_copyediting_note: str
    _note_missing: str
    _num_sort: str
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def format_bibitem(bibitem: BibItem) -> FormattedBibItem:
    """
    Flatten a BibItem into a FormattedBibItem (a dict of plain strings).

    Structured attributes are rendered with the dedicated formatters, using the
    "latex" variant of bib strings (plus the "unicode" variant for `_title_unicode`).
    Optional numeric fields become "" when they are None, and the id fields
    (`_author_ids`, `_editor_ids`, `author_ids`, `journal_id`, `publisher_id`)
    are always emitted as "".
    """

    bibkey = format_bibkey(bibitem.bibkey)

    author = format_author(bibitem.author, "latex")
    editor = format_author(bibitem.editor, "latex")
    person = format_author((bibitem._person,), "latex") if bibitem._person else ""

    # format_author's result is used directly as the string value for `author` and
    # `editor`, so it is used the same way here. Joining that string again with ", "
    # would insert the separator between every single character.
    guesteditor = format_author(tuple(bibitem._guesteditor), "latex")

    shorthand = ", ".join([author.mononym.latex for author in bibitem.author if author.mononym.latex])
    date = format_date(bibitem.date)

    pages = format_pages(bibitem.pages)

    journal = format_journal(bibitem.journal, "latex")

    crossref = format_bibkey(bibitem.crossref.bibkey) if bibitem.crossref else ""

    kw_level1, kw_level2, kw_level3 = (
        bibitem._kws.level_1.name if bibitem._kws else "",
        bibitem._kws.level_2.name if bibitem._kws else "",
        bibitem._kws.level_3.name if bibitem._kws else "",
    )

    further_refs = ", ".join([format_bibkey(ref) for ref in bibitem._further_refs])
    depends_on = ", ".join([format_bibkey(dep) for dep in bibitem._depends_on])

    formatted: FormattedBibItem = {
        "_to_do_general": bibitem._to_do_general,
        "_change_request": bibitem._change_request,
        "entry_type": format_entry_type(bibitem.entry_type),
        "bibkey": bibkey,
        "author": author,
        "_author_ids": "",
        "editor": editor,
        "_editor_ids": "",
        "author_ids": "",
        "options": ", ".join(bibitem.options),
        "shorthand": shorthand,
        "date": date,
        "pubstate": bibitem.pubstate,
        "title": format_bib_string_attr(bibitem.title, "latex"),
        "_title_unicode": format_bib_string_attr(bibitem.title, "unicode"),
        "booktitle": format_bib_string_attr(bibitem.booktitle, "latex"),
        "crossref": crossref,
        "journal": journal,
        "journal_id": "",
        "volume": bibitem.volume,
        "number": bibitem.number,
        "pages": pages,
        "eid": bibitem.eid,
        "series": format_bib_string_attr(bibitem.series.name, "latex") if bibitem.series else "",
        "address": format_bib_string_attr(bibitem.address, "latex"),
        "institution": format_bib_string_attr(bibitem.institution, "latex"),
        "school": format_bib_string_attr(bibitem.school, "latex"),
        "publisher": format_bib_string_attr(bibitem.publisher, "latex"),
        "publisher_id": "",
        "type": format_bib_string_attr(bibitem.type, "latex"),
        "edition": str(bibitem.edition) if bibitem.edition is not None else "",
        "note": format_bib_string_attr(bibitem.note, "latex"),
        "_issuetitle": format_bib_string_attr(bibitem.issuetitle, "latex") if bibitem.issuetitle else "",
        "_guesteditor": guesteditor,
        "_extra_note": format_bib_string_attr(bibitem._extra_note, "latex") if bibitem._extra_note else "",
        "urn": bibitem.urn,
        "eprint": bibitem.eprint,
        "doi": bibitem.doi,
        "url": bibitem.url,
        "_kw_level1": kw_level1,
        "_kw_level2": kw_level2,
        "_kw_level3": kw_level3,
        "_epoch": bibitem._epoch,
        "_person": person,
        "_comm_for_profile_bib": bibitem._comm_for_profile_bib,
        "_langid": bibitem._langid,
        "_lang_der": bibitem._lang_der,
        "_further_refs": further_refs,
        "_depends_on": depends_on,
        "_dltc_num": str(bibitem._dltc_num) if bibitem._dltc_num is not None else "",
        "_spec_interest": bibitem._spec_interest,
        "_note_perso": bibitem._note_perso,
        "_note_stock": bibitem._note_stock,
        "_note_status": bibitem._note_status,
        "_num_inwork_coll": str(bibitem._num_inwork_coll) if bibitem._num_inwork_coll is not None else "",
        "_num_inwork": bibitem._num_inwork,
        "_num_coll": str(bibitem._num_coll) if bibitem._num_coll is not None else "",
        "_dltc_copyediting_note": bibitem._dltc_copyediting_note,
        "_note_missing": bibitem._note_missing,
        "_num_sort": str(bibitem._num_sort) if bibitem._num_sort is not None else "",
    }

    return formatted
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
from typing import Tuple
|
|
2
|
+
from philoch_bib_sdk.logic.models import PageAttr
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
def _pages_single_str(page_pair: PageAttr) -> str:
    """Render one page entry as "<start>--<end>", or just "<start>" when `end` is empty."""
    if not page_pair.end:
        return page_pair.start

    return "--".join([page_pair.start, page_pair.end])
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def format_pages(pages: Tuple[PageAttr, ...]) -> str:
    """
    Render a tuple of page entries as a ", "-separated string.

    Each entry is rendered by `_pages_single_str`. An empty tuple yields "".
    """
    # Truthiness rather than identity: `pages is tuple()` only works because CPython
    # happens to share a single empty-tuple object.
    if not pages:
        return ""

    return ", ".join(_pages_single_str(page_pair) for page_pair in pages)
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
import re
|
|
2
|
+
import traceback
|
|
3
|
+
from typing import Tuple
|
|
4
|
+
from aletk.utils import remove_extra_whitespace
|
|
5
|
+
from aletk.ResultMonad import Ok, Err
|
|
6
|
+
|
|
7
|
+
from philoch_bib_sdk.logic.models import PageAttr
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def is_valid_roman(raw_str: str) -> bool:
    """
    Return True if `raw_str` is a well-formed Roman numeral (case-insensitive).

    The pattern accepts numerals built from up to three "M" followed by the standard
    hundreds, tens and units groups. The empty string is not a numeral and yields False.

    TODO: TBD, decide if we want to control if the pages are in roman numbers.
    """
    # Every group in the pattern is optional, so an empty string would match; reject it.
    if not raw_str:
        return False

    pattern = r"M{0,3}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})"

    # fullmatch requires the whole string to match; `$` with re.match would also
    # accept a trailing newline.
    return re.fullmatch(pattern, raw_str.upper()) is not None
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def _parse_single_page_attr(
    text: str,
) -> PageAttr:
    """
    Parse one page entry, either "<start>--<end>" or a single "<page>".

    A single page is returned with an empty `end`.

    Raises:
        ValueError: if the text contains "-" but no "--", or if splitting on "--"
            does not give exactly two parts.
    """
    if "--" in text:
        pieces = remove_extra_whitespace(text).split("--")

        if len(pieces) != 2:
            raise ValueError(f"Unexpected number of page parts found in '{text}': '{pieces}'. Expected exactly 2.")

        first_page, last_page = (remove_extra_whitespace(piece) for piece in pieces)
        return PageAttr(start=first_page, end=last_page)

    if "-" in text:
        # A lone hyphen is not an accepted range separator.
        raise ValueError(f"Unexpected page format found in '{text}'. Expected either '<start>--<end>' or '<page>'.")

    return PageAttr(start=remove_extra_whitespace(text), end="")
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def parse_pages(text: str) -> Ok[Tuple[PageAttr, ...]] | Err:
    """
    Parse a comma-separated list of page entries into a tuple of PageAttr objects.

    Each entry is expected to be "<start>--<end>" or "<page>". An empty string yields
    an empty tuple. Any exception is returned as an Err whose error_type is the name
    of the exception class and whose error_trace holds the formatted traceback.
    """
    try:
        if text == "":
            return Ok(())

        page_attrs = tuple(
            _parse_single_page_attr(remove_extra_whitespace(chunk)) for chunk in text.split(",")
        )

        return Ok(page_attrs)

    except Exception as e:
        return Err(
            f"Error parsing pages from '{text}': {e}",
            code=-1,
            error_type=f"{e.__class__.__name__}",
            error_trace=traceback.format_exc(),
        )
|