philoch-bib-sdk 0.3.9__cp313-cp313-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- philoch_bib_sdk/__init__.py +0 -0
- philoch_bib_sdk/_rust.cp313-win_amd64.pyd +0 -0
- philoch_bib_sdk/adapters/io/__init__.py +115 -0
- philoch_bib_sdk/adapters/io/csv/__init__.py +308 -0
- philoch_bib_sdk/adapters/io/ods/__init__.py +145 -0
- philoch_bib_sdk/adapters/plaintext/bibitem_reader.py +0 -0
- philoch_bib_sdk/adapters/tabular_data/read_journal_volume_number_index.py +58 -0
- philoch_bib_sdk/converters/latex.py +6 -0
- philoch_bib_sdk/converters/plaintext/author/formatter.py +34 -0
- philoch_bib_sdk/converters/plaintext/author/parser.py +83 -0
- philoch_bib_sdk/converters/plaintext/bib_string_formatter.py +8 -0
- philoch_bib_sdk/converters/plaintext/bibitem/bibkey_formatter.py +21 -0
- philoch_bib_sdk/converters/plaintext/bibitem/bibkey_parser.py +158 -0
- philoch_bib_sdk/converters/plaintext/bibitem/date_formatter.py +37 -0
- philoch_bib_sdk/converters/plaintext/bibitem/date_parser.py +62 -0
- philoch_bib_sdk/converters/plaintext/bibitem/formatter.py +182 -0
- philoch_bib_sdk/converters/plaintext/bibitem/pages_formatter.py +13 -0
- philoch_bib_sdk/converters/plaintext/bibitem/pages_parser.py +63 -0
- philoch_bib_sdk/converters/plaintext/bibitem/parser.py +415 -0
- philoch_bib_sdk/converters/plaintext/journal/formatter.py +25 -0
- philoch_bib_sdk/converters/plaintext/journal/parser.py +36 -0
- philoch_bib_sdk/converters/plaintext/shared/renderable_formatter.py +25 -0
- philoch_bib_sdk/interfaces/cli/__init__.py +3 -0
- philoch_bib_sdk/interfaces/cli/fuzzy_matching.py +135 -0
- philoch_bib_sdk/logic/__init__.py +39 -0
- philoch_bib_sdk/logic/default_models.py +315 -0
- philoch_bib_sdk/logic/functions/__init__.py +31 -0
- philoch_bib_sdk/logic/functions/comparator.py +414 -0
- philoch_bib_sdk/logic/functions/fuzzy_matcher.py +796 -0
- philoch_bib_sdk/logic/functions/journal_article_matcher.py +44 -0
- philoch_bib_sdk/logic/literals.py +98 -0
- philoch_bib_sdk/logic/models.py +366 -0
- philoch_bib_sdk/logic/models_staging.py +173 -0
- philoch_bib_sdk/procedures/fuzzy_matching.py +112 -0
- philoch_bib_sdk/py.typed +0 -0
- philoch_bib_sdk/rust_scorer/Cargo.lock +232 -0
- philoch_bib_sdk/rust_scorer/Cargo.toml +26 -0
- philoch_bib_sdk/rust_scorer/pyproject.toml +15 -0
- philoch_bib_sdk/rust_scorer/rust_scorer.pyi +65 -0
- philoch_bib_sdk/rust_scorer/src/lib.rs +362 -0
- philoch_bib_sdk-0.3.9.dist-info/METADATA +15 -0
- philoch_bib_sdk-0.3.9.dist-info/RECORD +44 -0
- philoch_bib_sdk-0.3.9.dist-info/WHEEL +4 -0
- philoch_bib_sdk-0.3.9.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,415 @@
|
|
|
1
|
+
import traceback
|
|
2
|
+
from typing import Tuple, Literal, TypedDict, TypeGuard, Any
|
|
3
|
+
from aletk.ResultMonad import Ok, Err
|
|
4
|
+
from aletk.utils import get_logger, remove_extra_whitespace
|
|
5
|
+
from philoch_bib_sdk.converters.plaintext.author.parser import parse_author
|
|
6
|
+
from philoch_bib_sdk.converters.plaintext.bibitem.bibkey_parser import parse_bibkey
|
|
7
|
+
from philoch_bib_sdk.converters.plaintext.bibitem.date_parser import parse_date
|
|
8
|
+
from philoch_bib_sdk.converters.plaintext.bibitem.pages_parser import parse_pages
|
|
9
|
+
from philoch_bib_sdk.converters.plaintext.journal.parser import parse_journal
|
|
10
|
+
from philoch_bib_sdk.logic.literals import (
|
|
11
|
+
TBibTeXEntryType,
|
|
12
|
+
TPubState,
|
|
13
|
+
TEpoch,
|
|
14
|
+
TLanguageID,
|
|
15
|
+
BIBTEX_ENTRY_TYPE_VALUES,
|
|
16
|
+
PUB_STATE_VALUES,
|
|
17
|
+
EPOCH_VALUES,
|
|
18
|
+
LANGUAGE_ID_VALUES,
|
|
19
|
+
)
|
|
20
|
+
from philoch_bib_sdk.logic.models import (
|
|
21
|
+
BibItem,
|
|
22
|
+
BibStringAttr,
|
|
23
|
+
BibKeyAttr,
|
|
24
|
+
Author,
|
|
25
|
+
PageAttr,
|
|
26
|
+
BibItemDateAttr,
|
|
27
|
+
BaseNamedRenderable,
|
|
28
|
+
KeywordsAttr,
|
|
29
|
+
Keyword,
|
|
30
|
+
TBibString,
|
|
31
|
+
)
|
|
32
|
+
|
|
33
|
+
lgr = get_logger(__name__)
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class ParsedBibItemData(TypedDict, total=False):
    """
    Raw, string-valued fields describing one bibliography entry.

    The dictionary is declared with ``total=False``, so every key is optional.
    All values are plain strings; ``parse_bibitem`` converts them into the
    typed attributes of a ``BibItem``. Some keys (for example ``crossref``)
    are accepted here but are not consumed by ``parse_bibitem`` in this module.
    """

    # Editorial workflow
    _to_do_general: str
    _change_request: str

    # Identity and contributors
    entry_type: str
    bibkey: str
    author: str
    _author_ids: str
    editor: str
    _editor_ids: str
    author_ids: str
    options: str
    shorthand: str

    # Date, state and titles
    date: str
    pubstate: str
    title: str
    _title_unicode: str
    booktitle: str
    crossref: str

    # Journal and location within it
    journal: str
    journal_id: str
    volume: str
    number: str
    pages: str
    eid: str

    # Publication details
    series: str
    address: str
    institution: str
    school: str
    publisher: str
    publisher_id: str
    type: str
    edition: str
    note: str
    _issuetitle: str
    _guesteditor: str
    _extra_note: str

    # External identifiers
    urn: str
    eprint: str
    doi: str
    url: str

    # Classification
    _kw_level1: str
    _kw_level2: str
    _kw_level3: str
    _epoch: str
    _person: str
    _comm_for_profile_bib: str
    _langid: str
    _lang_der: str

    # Relations to other entries (comma-separated bibkeys)
    _further_refs: str
    _depends_on: str

    # Internal bookkeeping and notes
    _dltc_num: str
    _spec_interest: str
    _note_perso: str
    _note_stock: str
    _note_status: str
    _num_inwork_coll: str
    _num_inwork: str
    _num_coll: str
    _dltc_copyediting_note: str
    _note_missing: str
    _num_sort: str
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def _is_valid_bibtex_entry_type(value: Any) -> TypeGuard[TBibTeXEntryType]:
    """
    Tell whether ``value`` is a string listed in ``BIBTEX_ENTRY_TYPE_VALUES``.

    Acts as a TypeGuard so callers can treat an accepted value as a
    ``TBibTeXEntryType``.
    """
    if not isinstance(value, str):
        return False
    return value in BIBTEX_ENTRY_TYPE_VALUES
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def parse_entry_type(text: str) -> TBibTeXEntryType:
    """
    Normalise a BibTeX entry type string.

    The text is stripped, lower-cased and freed of spaces, ``@``, ``{`` and
    ``}`` before validation. An empty string, the literal ``"UNKNOWN"``, or
    anything that is not a recognised entry type yields ``"UNKNOWN"``.
    """
    if text in ("", "UNKNOWN"):
        return "UNKNOWN"

    candidate = text.strip().replace(" ", "").lower()
    # Drop the decoration of a raw BibTeX header such as "@article{".
    for decoration in ("@", "{", "}"):
        candidate = candidate.replace(decoration, "")

    if not _is_valid_bibtex_entry_type(candidate):
        return "UNKNOWN"
    return candidate
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def parse_options(text: str) -> Tuple[str, ...]:
    """
    Split a comma-separated option string into a tuple of options.

    Segments that are empty or whitespace-only are skipped; every kept segment
    is passed through ``remove_extra_whitespace``. Empty input gives ``()``.
    """
    if not text:
        return ()

    options: list[str] = []
    for segment in text.split(","):
        if not segment.strip():
            continue
        options.append(remove_extra_whitespace(segment))
    return tuple(options)
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
def parse_bibkey_list(text: str) -> Tuple[BibKeyAttr, ...]:
    """
    Turn a comma-separated string of bibkeys into a tuple of ``BibKeyAttr``.

    Empty segments are ignored. Empty input gives ``()``.

    Raises:
        ValueError: if ``parse_bibkey`` does not return ``Ok`` for a segment.
    """
    if not text:
        return ()

    parsed_keys: list[BibKeyAttr] = []
    for segment in text.split(","):
        key_text = remove_extra_whitespace(segment)
        if not key_text:
            continue

        outcome = parse_bibkey(key_text)
        if not isinstance(outcome, Ok):
            raise ValueError(f"Failed to parse bibkey '{key_text}': {outcome.message}")
        parsed_keys.append(outcome.out)

    return tuple(parsed_keys)
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
def _clean_keyword(text: str) -> str:
    """
    Clean a keyword string: drop ``,`` and ``;`` and normalise whitespace.

    The separators are removed *before* the whitespace is normalised. Doing it
    the other way round leaves the gap that surrounded a separator behind
    (e.g. ``"a , b"`` or ``", a"``), so the result could still carry extra
    whitespace.
    """
    without_separators = text.replace(",", "").replace(";", "")
    # NOTE(review): relies on remove_extra_whitespace tidying the gaps left by
    # the removed separators - confirm against the aletk implementation.
    return remove_extra_whitespace(without_separators)
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
def parse_keywords(level1: str, level2: str, level3: str) -> KeywordsAttr | None:
    """
    Build a ``KeywordsAttr`` from the three keyword level strings.

    Returns ``None`` when all three levels are empty. Otherwise each level
    becomes a ``Keyword`` with ``id=None``; a non-empty level is cleaned with
    ``_clean_keyword`` and an empty level keeps the empty name.
    """
    if not (level1 or level2 or level3):
        return None

    def as_keyword(raw: str) -> Keyword:
        # Empty levels are kept as empty-named keywords rather than cleaned.
        return Keyword(name=_clean_keyword(raw) if raw else "", id=None)

    return KeywordsAttr(
        level_1=as_keyword(level1),
        level_2=as_keyword(level2),
        level_3=as_keyword(level3),
    )
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
def _is_valid_pubstate(value: Any) -> TypeGuard[TPubState]:
    """
    Tell whether ``value`` is a string listed in ``PUB_STATE_VALUES``.

    Acts as a TypeGuard so callers can treat an accepted value as a
    ``TPubState``.
    """
    if not isinstance(value, str):
        return False
    return value in PUB_STATE_VALUES
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
def parse_pubstate(text: str) -> TPubState:
    """
    Return ``text`` when it is a known publication state, otherwise ``""``.

    The text is compared as given; no trimming or case folding is applied.
    """
    if not _is_valid_pubstate(text):
        return ""
    return text
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
def _is_valid_epoch(value: Any) -> TypeGuard[TEpoch]:
    """
    Tell whether ``value`` is a string listed in ``EPOCH_VALUES``.

    Acts as a TypeGuard so callers can treat an accepted value as a ``TEpoch``.
    """
    if not isinstance(value, str):
        return False
    return value in EPOCH_VALUES
|
|
193
|
+
|
|
194
|
+
|
|
195
|
+
def parse_epoch(text: str) -> TEpoch:
    """
    Return ``text`` when it is a known epoch, otherwise ``""``.

    The text is compared as given; no trimming or case folding is applied.
    """
    if not _is_valid_epoch(text):
        return ""
    return text
|
|
203
|
+
|
|
204
|
+
|
|
205
|
+
def _is_valid_language_id(value: Any) -> TypeGuard[TLanguageID]:
    """
    Tell whether ``value`` is a string listed in ``LANGUAGE_ID_VALUES``.

    Acts as a TypeGuard so callers can treat an accepted value as a
    ``TLanguageID``.
    """
    if not isinstance(value, str):
        return False
    return value in LANGUAGE_ID_VALUES
|
|
210
|
+
|
|
211
|
+
|
|
212
|
+
def parse_language_id(text: str) -> TLanguageID:
    """
    Return ``text`` when it is a known language ID, otherwise ``""``.

    The text is compared as given; no trimming or case folding is applied.
    """
    if not _is_valid_language_id(text):
        return ""
    return text
|
|
220
|
+
|
|
221
|
+
|
|
222
|
+
def _create_bibstring_attr(value: str, bibstring_type: TBibString) -> BibStringAttr:
|
|
223
|
+
"""
|
|
224
|
+
Create a BibStringAttr with the correct field set based on bibstring_type.
|
|
225
|
+
"""
|
|
226
|
+
if bibstring_type == "latex":
|
|
227
|
+
return BibStringAttr(latex=value)
|
|
228
|
+
elif bibstring_type == "unicode":
|
|
229
|
+
return BibStringAttr(unicode=value)
|
|
230
|
+
else: # simplified
|
|
231
|
+
return BibStringAttr(simplified=value)
|
|
232
|
+
|
|
233
|
+
|
|
234
|
+
def _parse_optional_int(raw: str | None, field_name: str) -> int | None:
    """
    Convert an optional numeric text field into an ``int``.

    Returns ``None`` when ``raw`` is missing, empty or whitespace-only.

    Raises:
        ValueError: naming ``field_name`` when the text is not a valid integer.
    """
    stripped = raw.strip() if raw else ""
    if not stripped:
        return None
    try:
        return int(stripped)
    except ValueError as exc:
        # Name the offending field; the bare int() message does not.
        raise ValueError(f"Invalid integer for field '{field_name}': {raw!r}") from exc


def _optional_bibstring(value: str | None, bibstring_type: TBibString) -> BibStringAttr | Literal[""]:
    """
    Build a ``BibStringAttr`` for a non-empty ``value``; return ``""`` otherwise.
    """
    return _create_bibstring_attr(value, bibstring_type) if value else ""


def parse_bibitem(data: ParsedBibItemData, bibstring_type: TBibString = "latex") -> Ok[BibItem] | Err:
    """
    Parse a bibitem from a dictionary of string fields into a BibItem object.

    Args:
        data: Raw string fields; every key is optional.
        bibstring_type: Which ``BibStringAttr`` field receives the text values
            (also forwarded to the author and journal parsers).

    Returns:
        ``Ok`` wrapping the constructed ``BibItem``. If the bibkey, author,
        editor, guest editor, person, date, pages or journal parser returns an
        ``Err``, that ``Err`` is returned unchanged. Any exception raised along
        the way (for instance a non-numeric ``edition``) is converted into an
        ``Err`` with ``error_type="BibItemParsingError"``.
    """
    try:
        # Parse bibkey
        bibkey = None
        if data.get("bibkey"):
            bibkey_result = parse_bibkey(data["bibkey"])
            if isinstance(bibkey_result, Err):
                return bibkey_result
            bibkey = bibkey_result.out

        # Parse authors
        authors: tuple[Author, ...] = ()
        if data.get("author"):
            author_result = parse_author(data["author"], bibstring_type)
            if isinstance(author_result, Err):
                return author_result
            authors = author_result.out

        # Parse editors
        editors: tuple[Author, ...] = ()
        if data.get("editor"):
            editor_result = parse_author(data["editor"], bibstring_type)
            if isinstance(editor_result, Err):
                return editor_result
            editors = editor_result.out

        # Parse guest editors
        guesteditors: tuple[Author, ...] = ()
        if data.get("_guesteditor"):
            guesteditor_result = parse_author(data["_guesteditor"], bibstring_type)
            if isinstance(guesteditor_result, Err):
                return guesteditor_result
            guesteditors = guesteditor_result.out

        # Parse person: only the first parsed author is kept
        person = None
        if data.get("_person"):
            person_result = parse_author(data["_person"], bibstring_type)
            if isinstance(person_result, Err):
                return person_result
            if person_result.out:
                person = person_result.out[0]

        # Parse date; a missing date falls back to year 0
        date: BibItemDateAttr | Literal["no date"] = BibItemDateAttr(year=0)
        if data.get("date"):
            date_result = parse_date(data["date"])
            if isinstance(date_result, Err):
                return date_result
            date = date_result.out

        # Parse pages
        pages: tuple[PageAttr, ...] = ()
        if data.get("pages"):
            pages_result = parse_pages(data["pages"])
            if isinstance(pages_result, Err):
                return pages_result
            pages = pages_result.out

        # Parse journal
        journal = None
        if data.get("journal"):
            journal_result = parse_journal(data["journal"], bibstring_type)
            if isinstance(journal_result, Err):
                return journal_result
            journal = journal_result.out

        # Crossref parsing is not implemented; the field is always set to "".
        # TODO: Implement proper crossref parsing if needed

        # Parse further_refs and depends_on (raise ValueError on a bad bibkey)
        further_refs = parse_bibkey_list(data.get("_further_refs", ""))
        depends_on = parse_bibkey_list(data.get("_depends_on", ""))

        # Parse keywords
        keywords = parse_keywords(data.get("_kw_level1", ""), data.get("_kw_level2", ""), data.get("_kw_level3", ""))

        # Parse numeric fields
        edition = _parse_optional_int(data.get("edition"), "edition")
        dltc_num = _parse_optional_int(data.get("_dltc_num"), "_dltc_num")
        num_inwork_coll = _parse_optional_int(data.get("_num_inwork_coll"), "_num_inwork_coll")
        num_coll = _parse_optional_int(data.get("_num_coll"), "_num_coll")
        num_sort = _parse_optional_int(data.get("_num_sort"), "_num_sort")

        # Parse series
        series: BaseNamedRenderable | Literal[""] = ""
        if data.get("series"):
            series_attr = _create_bibstring_attr(data["series"], bibstring_type)
            series = BaseNamedRenderable(name=series_attr, id=None)

        # Create BibItem
        bibitem = BibItem(
            to_do_general=data.get("_to_do_general", ""),
            change_request=data.get("_change_request", ""),
            entry_type=parse_entry_type(data.get("entry_type", "")),
            bibkey=bibkey or "",
            author=authors,
            editor=editors,
            options=parse_options(data.get("options", "")),
            date=date,
            pubstate=parse_pubstate(data.get("pubstate", "")),
            title=_optional_bibstring(data.get("title"), bibstring_type),
            booktitle=_optional_bibstring(data.get("booktitle"), bibstring_type),
            crossref="",
            journal=journal,
            volume=data.get("volume", ""),
            number=data.get("number", ""),
            pages=pages,
            eid=data.get("eid", ""),
            series=series,
            address=_optional_bibstring(data.get("address"), bibstring_type),
            institution=_optional_bibstring(data.get("institution"), bibstring_type),
            school=_optional_bibstring(data.get("school"), bibstring_type),
            publisher=_optional_bibstring(data.get("publisher"), bibstring_type),
            type=_optional_bibstring(data.get("type"), bibstring_type),
            edition=edition,
            note=_optional_bibstring(data.get("note"), bibstring_type),
            issuetitle=_optional_bibstring(data.get("_issuetitle"), bibstring_type),
            guesteditor=guesteditors,
            extra_note=_optional_bibstring(data.get("_extra_note"), bibstring_type),
            urn=data.get("urn", ""),
            eprint=data.get("eprint", ""),
            doi=data.get("doi", ""),
            url=data.get("url", ""),
            kws=keywords or "",
            epoch=parse_epoch(data.get("_epoch", "")),
            person=person or "",
            comm_for_profile_bib=data.get("_comm_for_profile_bib", ""),
            langid=parse_language_id(data.get("_langid", "")),
            lang_der=data.get("_lang_der", ""),
            further_refs=further_refs,
            depends_on=depends_on,
            dltc_num=dltc_num,
            spec_interest=data.get("_spec_interest", ""),
            note_perso=data.get("_note_perso", ""),
            note_stock=data.get("_note_stock", ""),
            note_status=data.get("_note_status", ""),
            num_inwork_coll=num_inwork_coll,
            num_inwork=data.get("_num_inwork", ""),
            num_coll=num_coll,
            dltc_copyediting_note=data.get("_dltc_copyediting_note", ""),
            note_missing=data.get("_note_missing", ""),
            num_sort=num_sort,
        )

        return Ok(bibitem)

    except Exception as e:
        # Boundary of the parser: report every failure as an Err value.
        return Err(
            message=f"Failed to parse bibitem: {e.__class__.__name__}: {e}",
            code=-1,
            error_type="BibItemParsingError",
            error_trace=traceback.format_exc(),
        )
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
from aletk.utils import get_logger
|
|
2
|
+
from philoch_bib_sdk.logic.models import Journal, Maybe, TBibString
|
|
3
|
+
|
|
4
|
+
lgr = get_logger(__name__)
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def format_journal(journal: Maybe[Journal], bibstring_type: TBibString) -> str:
|
|
8
|
+
"""
|
|
9
|
+
Format a journal object into a string representation.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
match journal:
|
|
13
|
+
|
|
14
|
+
case None:
|
|
15
|
+
return ""
|
|
16
|
+
|
|
17
|
+
case Journal(name, id):
|
|
18
|
+
|
|
19
|
+
if not name:
|
|
20
|
+
return ""
|
|
21
|
+
|
|
22
|
+
return f"{getattr(name, bibstring_type)}"
|
|
23
|
+
|
|
24
|
+
case _:
|
|
25
|
+
raise TypeError(f"Invalid type for journal: {type(journal)}. Dump: {journal!r}")
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
import traceback
|
|
2
|
+
from aletk.ResultMonad import Ok, Err
|
|
3
|
+
from aletk.utils import get_logger, remove_extra_whitespace
|
|
4
|
+
from philoch_bib_sdk.logic.models import Journal, BibStringAttr, TBibString
|
|
5
|
+
|
|
6
|
+
lgr = get_logger(__name__)
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def parse_journal(text: str, bibstring_type: TBibString) -> Ok[Journal | None] | Err:
    """
    Parse a journal string into a Journal object.

    Args:
        text: Journal name as plain text.
        bibstring_type: Name of the ``BibStringAttr`` field that receives the name.

    Returns:
        ``Ok(None)`` for empty or whitespace-only text, ``Ok(Journal)`` with
        the whitespace-normalised name and empty ISSNs otherwise, or ``Err``
        if anything raises while building the journal.
    """
    try:
        # Whitespace-only input carries no name either; without this check it
        # would produce a Journal whose name is empty.
        if not text.strip():
            lgr.debug("Empty journal string, returning None.")
            return Ok(None)

        # Normalize the text by removing extra whitespace
        normalized_text = remove_extra_whitespace(text)

        journal = Journal(
            name=BibStringAttr(**{str(bibstring_type): normalized_text}),
            issn_electronic="",
            issn_print="",
        )

        return Ok(journal)

    except Exception as e:
        error_message = f"Error parsing journal string '{text}': {e}"
        return Err(
            message=error_message,
            code=-1,
            error_type=f"{e.__class__.__name__}",
            error_trace=traceback.format_exc(),
        )
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
from philoch_bib_sdk.logic.models import BaseRenderable, BaseNamedRenderable, TBibString
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def format_renderable(
|
|
5
|
+
renderable: BaseRenderable | BaseNamedRenderable,
|
|
6
|
+
bibstring_type: TBibString,
|
|
7
|
+
) -> str:
|
|
8
|
+
"""
|
|
9
|
+
Format a base renderable object into a string representation.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
match renderable:
|
|
13
|
+
|
|
14
|
+
case BaseRenderable(text, id):
|
|
15
|
+
if not text:
|
|
16
|
+
return ""
|
|
17
|
+
return f"{getattr(text, bibstring_type)}"
|
|
18
|
+
|
|
19
|
+
case BaseNamedRenderable(name, id):
|
|
20
|
+
if not name:
|
|
21
|
+
return ""
|
|
22
|
+
return f"{getattr(name, bibstring_type)}"
|
|
23
|
+
|
|
24
|
+
case _:
|
|
25
|
+
raise TypeError("Invalid type for renderable")
|
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
"""CLI interface for fuzzy matching bibliographic items.
|
|
2
|
+
|
|
3
|
+
This module provides a command-line interface for matching new bibliographic entries
|
|
4
|
+
against an existing bibliography using fuzzy matching.
|
|
5
|
+
|
|
6
|
+
Usage:
|
|
7
|
+
poetry run python -m philoch_bib_sdk.interfaces.cli.fuzzy_matching \\
|
|
8
|
+
--bibliography path/to/bibliography.csv \\
|
|
9
|
+
--input path/to/new_items.csv \\
|
|
10
|
+
--output path/to/report \\
|
|
11
|
+
[--format csv] \\
|
|
12
|
+
[--top-n 5] \\
|
|
13
|
+
[--min-score 100.0]
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
import argparse
|
|
17
|
+
import sys
|
|
18
|
+
from functools import partial
|
|
19
|
+
|
|
20
|
+
from aletk.ResultMonad import Err, main_try_except_wrapper
|
|
21
|
+
from aletk.utils import get_logger
|
|
22
|
+
|
|
23
|
+
from philoch_bib_sdk.adapters.io import load_bibliography, load_staged, write_report
|
|
24
|
+
from philoch_bib_sdk.procedures.fuzzy_matching import fuzzy_match_procedure
|
|
25
|
+
|
|
26
|
+
logger = get_logger(__name__)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def _build_argument_parser() -> argparse.ArgumentParser:
    """Create the argument parser for the fuzzy matching command."""
    parser = argparse.ArgumentParser(
        description="Fuzzy match bibliographic items against an existing bibliography.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
# Basic usage with CSV files
%(prog)s --bibliography refs.csv --input new_refs.csv --output matches

# With custom matching parameters
%(prog)s --bibliography refs.csv --input new_refs.csv --output matches \\
--top-n 10 --min-score 200.0

# Specify output format explicitly
%(prog)s --bibliography refs.csv --input new_refs.csv --output matches \\
--format csv
""",
    )

    # Required paths
    parser.add_argument(
        "--bibliography",
        required=True,
        help="Path to bibliography file (format auto-detected from extension)",
    )
    parser.add_argument(
        "--input",
        required=True,
        help="Path to input file with new items to match",
    )
    parser.add_argument(
        "--output",
        required=True,
        help="Path to output report file (without extension)",
    )

    # Optional tuning
    parser.add_argument(
        "--format",
        default="csv",
        choices=["csv"],
        help="Output format (default: csv)",
    )
    parser.add_argument(
        "--top-n",
        type=int,
        default=5,
        help="Number of top matches to return per item (default: 5)",
    )
    parser.add_argument(
        "--min-score",
        type=float,
        default=0.0,
        help="Minimum score threshold for matches (default: 0.0)",
    )
    return parser


@main_try_except_wrapper(logger)
def cli() -> None:
    """Parse the command line and run the fuzzy matching procedure.

    Raises:
        ValueError: if ``--top-n`` is below 1 or ``--min-score`` is negative.
        RuntimeError: if the procedure returns an ``Err``; its message is reused.

    NOTE(review): the decorator presumably turns these exceptions into an
    ``Err`` return value (``main`` checks for one) - confirm against aletk.
    """
    args = _build_argument_parser().parse_args()

    # Reject out-of-range tuning values before doing any work.
    if args.top_n < 1:
        raise ValueError("--top-n must be at least 1")
    if args.min_score < 0:
        raise ValueError("--min-score must be non-negative")

    logger.info("Starting fuzzy matching CLI")
    outcome = fuzzy_match_procedure(
        bibliography_path=args.bibliography,
        staged_path=args.input,
        output_path=args.output,
        load_bibliography=load_bibliography,
        load_staged=load_staged,
        # The report writer gets the chosen output format bound in advance.
        write_report=partial(write_report, output_format=args.format),
        top_n=args.top_n,
        min_score=args.min_score,
    )

    # A failed procedure is surfaced as an exception.
    if isinstance(outcome, Err):
        raise RuntimeError(outcome.message)

    print(f"Success! Report written to {args.output}.{args.format}")
    logger.info("Fuzzy matching completed successfully")
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
def main() -> None:
    """Run the CLI and exit the process with a matching status code.

    Exits with 0 on success. On an ``Err`` result the exit status is the
    error's code when that is positive, and 1 otherwise.
    """
    outcome = cli()
    exit_status = 0
    if isinstance(outcome, Err):
        exit_status = outcome.code if outcome.code > 0 else 1
    sys.exit(exit_status)


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
"""Logic layer for bibliography SDK."""
|
|
2
|
+
|
|
3
|
+
from philoch_bib_sdk.logic.literals import TBibTeXEntryType
|
|
4
|
+
from philoch_bib_sdk.logic.models import (
|
|
5
|
+
Author,
|
|
6
|
+
BibItem,
|
|
7
|
+
BibItemDateAttr,
|
|
8
|
+
BibKeyAttr,
|
|
9
|
+
BibStringAttr,
|
|
10
|
+
Journal,
|
|
11
|
+
Maybe,
|
|
12
|
+
PageAttr,
|
|
13
|
+
TBibString,
|
|
14
|
+
)
|
|
15
|
+
from philoch_bib_sdk.logic.models_staging import (
|
|
16
|
+
BibItemStaged,
|
|
17
|
+
Match,
|
|
18
|
+
PartialScore,
|
|
19
|
+
ScoreComponent,
|
|
20
|
+
)
|
|
21
|
+
|
|
22
|
+
__all__ = [
|
|
23
|
+
# Core models
|
|
24
|
+
"Author",
|
|
25
|
+
"BibItem",
|
|
26
|
+
"BibItemDateAttr",
|
|
27
|
+
"BibKeyAttr",
|
|
28
|
+
"BibStringAttr",
|
|
29
|
+
"Journal",
|
|
30
|
+
"Maybe",
|
|
31
|
+
"PageAttr",
|
|
32
|
+
"TBibString",
|
|
33
|
+
"TBibTeXEntryType",
|
|
34
|
+
# Staging models
|
|
35
|
+
"BibItemStaged",
|
|
36
|
+
"Match",
|
|
37
|
+
"PartialScore",
|
|
38
|
+
"ScoreComponent",
|
|
39
|
+
]
|