pyreslib 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pyreslib/__init__.py ADDED
@@ -0,0 +1,9 @@
"""
pyreslib (py-resounding-libraries)

Python package from Resounding Libraries research cluster
"""

# Package metadata.
__version__ = "0.1.0"
__author__ = "Nicholas Cornia"
__credits__ = "Orpheus Instituut"
pyreslib/bibtex.py ADDED
@@ -0,0 +1,690 @@
1
+ from pyreslib import koha
2
+ from pyreslib import utilities
3
+
4
+ import os, json, re
5
+ from bibtexparser.bwriter import BibTexWriter
6
+ from bibtexparser.bibdatabase import BibDatabase
7
+
8
# Load the mapping from Koha biblio types to BibTeX entry types once, at
# import time. The file is decoded as UTF-8 explicitly so that the result does
# not depend on the platform's default locale encoding.
# NOTE(review): the path is relative to the current working directory, so the
# import only succeeds when the process runs from a directory containing
# `data/` — confirm whether it should be resolved relative to the package.
with open(
    os.path.join("data", "mappings", "bibtex", "koha_entry_types.json"),
    "r",
    encoding="utf-8",
) as f:
    entry_type_mapping = json.load(f)
13
+
14
+
15
def convert_biblio_to_bibtex(
    biblio_id: int,
    bibtex_filepath: str,
    koha_session,
    base_url: str,
):
    """
    Converts MARC-in-JSON record from Koha API into BibTeX file.

    The record is fetched through `pyreslib.koha.get_biblio_marc`, its biblio
    type is translated to a BibTeX entry type through `entry_type_mapping`,
    and a single-entry `.bib` file is written to `bibtex_filepath`.

    Args:
        biblio_id (int): Unique identifier for the record in Koha catalogue.
        bibtex_filepath (str): String filepath indicating the location where the .bib output has to be stored.
        koha_session (oauth2): Oauth2 session provided by `pyreslib.koha.oauth2_session` method.
        base_url (str): Koha API url from credentials.

    Returns:
        `None`

    Raises:
        KeyError: If the record's biblio type has no entry in `entry_type_mapping`.
        ValueError: If the mapped entry type is not one of `article`, `book`,
            `incollection`, `misc` or `phdthesis`.
    """

    # get MARC-in-JSON record
    record = koha.get_biblio_marc(
        session=koha_session, biblio_id=biblio_id, base_url=base_url
    )

    # get marc biblio type
    entry_type = entry_type_mapping[koha.get_biblio_type(record)]

    # Fields that only some entry types carry; everything else is shared.
    # The getters are looked up when the function runs, so they may be
    # defined further down in the module.
    type_specific_getters = {
        "article": {
            "journal": get_journal,
            "volume": get_volume,
            "number": get_issue_number,
            "issn": get_issn,
            "pages": get_pages,
        },
        "book": {
            "publisher": get_publisher,
            "address": get_address,
            "isbn": get_isbn,
            "pages": get_pages,
        },
        "incollection": {
            "booktitle": get_booktitle,
            "publisher": get_publisher,
            "address": get_address,
            "isbn": get_isbn,
            "pages": get_pages,
        },
        "misc": {
            "publisher": get_publisher,
            "address": get_address,
        },
        "phdthesis": {
            "school": get_school,
            "address": get_address,
            "pages": get_pages,
        },
    }

    if entry_type not in type_specific_getters:
        raise ValueError(f"No entry_type found for biblio_id {biblio_id}")

    # Extract the values reused by the citation key only once.
    authors = get_authors(record)
    title = get_title(record)
    year = str(get_year(record))

    # Citation key: first token of the author string, first ten characters of
    # the title without spaces, and the year.
    entry_id = (
        authors.split(" ")[0].lower()
        + "_"
        + title.replace(" ", "").lower()[:10]
        + "_"
        + year
    )

    # generate BibTeX entry according to type
    entry = {
        "ENTRYTYPE": entry_type,
        "ID": entry_id,
        "author": authors,
        "title": title,
        "year": year,
        "url": get_url(record),
        "doi": get_doi(record),
        "keywords": get_keywords(record),
        "abstract": get_abstract(record),
        "note": get_note(record),
        "type": entry_type,
        "howpublished": get_howpublished(record),
    }
    for bibtex_field, getter in type_specific_getters[entry_type].items():
        entry[bibtex_field] = getter(record)

    # generate single bibtex file
    bibtex_db = BibDatabase()
    # initialize BibTeX writer
    bibtex_writer = BibTexWriter()

    # add entries to database
    bibtex_db.entries = [entry]

    # Write as UTF-8 so non-ASCII names and titles do not depend on (or fail
    # under) the platform's default encoding.
    with open(bibtex_filepath, "w", encoding="utf-8") as bibfile:
        bibfile.write(bibtex_writer.write(bibtex_db))
168
+
169
+
170
def latex_normalize(input_string):
    """Escape characters that have a special meaning in LaTeX.

    Every occurrence of ``& % $ # _ { } ~ ^`` and the backslash is replaced by
    its escaped LaTeX form in a single pass, so replacement text is never
    escaped a second time.

    Args:
        input_string (str): Text to escape.

    Returns:
        str: The escaped text.
    """
    escapes = {
        "&": r"\&",
        "%": r"\%",
        "$": r"\$",
        "#": r"\#",
        "_": r"\_",
        "{": r"\{",
        "}": r"\}",
        "~": r"\textasciitilde{}",
        "^": r"\^{}",
        "\\": r"\textbackslash{}",
    }

    def _escape(match):
        # The matched text is always exactly one key of the table.
        return escapes[match.group()]

    alternation = "|".join(re.escape(char) for char in escapes.keys())
    return re.compile(alternation).sub(_escape, input_string)
188
+
189
+
190
+ # CONVERT FUNCTIONS MARC2BIBTEX
191
+
192
+ ### I am assuming a record as marc-in-json format
193
+
194
+
195
def get_abstract(record_dict, field="520", subfield="a"):
    """Return the abstract stored in a MARC-in-JSON record.

    Only the first occurrence of ``field`` is inspected, and within it the
    first subfield keyed by ``subfield``.

    Args:
        record_dict (dict): Record whose ``"fields"`` entry is a list of dicts
            keyed by MARC tag, each holding a ``"subfields"`` list.
        field (str): MARC tag to read.
        subfield (str): Subfield code to read.

    Returns:
        str: The subfield value, or ``""`` when field or subfield is absent.
    """
    occurrences = [item for item in record_dict["fields"] if field in item.keys()]
    if not occurrences:
        return ""

    values = [
        part[subfield]
        for part in occurrences[0][field]["subfields"]
        if subfield in part.keys()
    ]
    return values[0] if values else ""
208
+
209
+
210
# Country rows read once at import time; each row is expected to expose a
# "code" and a "label" entry.
# NOTE(review): the CSV path is relative to the current working directory.
country_codes = utilities.csv2dict(
    os.path.join("data", "mappings", "bibtex", "country_codes.csv")
)

# code -> label lookup, used as the default mapping of get_address.
# A code that appears several times keeps the label of its last row.
country_acronyms = {}
for country in country_codes:
    country_acronyms.update({country["code"]: country["label"]})
218
+
219
+
220
def get_address(
    record_dict,
    field="260",
    subfield="a",
    country_field="044",
    country_subfield="a",
    acronyms_mapping=country_acronyms,
):
    """Return the place of publication of a MARC-in-JSON record.

    The first occurrence of ``field`` is read for ``subfield``. Only when the
    record has no ``field`` at all is the country code in
    ``country_field``/``country_subfield`` used instead, translated through
    ``acronyms_mapping``.

    Args:
        record_dict (dict): Record whose ``"fields"`` entry is a list of dicts
            keyed by MARC tag, each holding a ``"subfields"`` list.
        field (str): MARC tag holding the place of publication.
        subfield (str): Subfield code of the place of publication.
        country_field (str): MARC tag holding the country code.
        country_subfield (str): Subfield code of the country code.
        acronyms_mapping (dict): Country code -> label lookup.

    Returns:
        str: The address, or ``""`` when nothing was found.

    Raises:
        KeyError: If the country code is not a key of ``acronyms_mapping``.
    """
    address = ""

    publication_fields = [
        item for item in record_dict["fields"] if field in item.keys()
    ]

    if publication_fields:
        places = [
            part[subfield]
            for part in publication_fields[0][field]["subfields"]
            if subfield in part.keys()
        ]
        if places:
            address = places[0]

    else:  # case of country of publication field 044$a
        country_fields = [
            item for item in record_dict["fields"] if country_field in item.keys()
        ]

        if country_fields:
            # Select on country_subfield (not on the publication subfield), so
            # the lookup stays correct when the two codes differ.
            codes = [
                part[country_subfield]
                for part in country_fields[0][country_field]["subfields"]
                if country_subfield in part.keys()
            ]
            if codes:
                address = acronyms_mapping[codes[0]]

    return address
255
+
256
+
257
# Role rows read once at import time; each row is expected to expose a
# "code" and a "label" entry.
# NOTE(review): the CSV path is relative to the current working directory.
role_codes = utilities.csv2dict(
    os.path.join("data", "mappings", "bibtex", "role_codes.csv")
)

# code -> label lookup, used as the default mapping of get_authors.
# A code that appears several times keeps the label of its last row.
role_acronyms = {}
for role in role_codes:
    role_acronyms.update({role["code"]: role["label"]})
265
+
266
+
267
def _author_names(
    record_dict, field, subfield, role_subfield, acronyms_mapping, with_roles
):
    """Collect one name per occurrence of ``field``, optionally with its role label."""
    names = []
    for entry in record_dict["fields"]:
        if field not in entry.keys():
            continue

        subfields = entry[field]["subfields"]
        name_values = [part[subfield] for part in subfields if subfield in part.keys()]
        if not name_values:
            continue
        name = name_values[0]

        if with_roles:
            # The role code lives in its own subfield of the same field, so it
            # has to be searched in the subfield list rather than in the dict
            # that holds the name.
            role_codes_found = [
                part[role_subfield]
                for part in subfields
                if role_subfield in part.keys()
            ]
            if role_codes_found:
                name = f"{name} ({acronyms_mapping[role_codes_found[0]]})"

        names.append(name)
    return names


def get_authors(
    record_dict,
    main_field="100",
    main_subfield="a",
    alt_field="700",
    alt_subfield="a",
    role_subfield="4",
    acronyms_mapping=role_acronyms,
    roles=False,
):
    """Return the authors of a MARC-in-JSON record as one BibTeX author string.

    Names from every ``main_field`` occurrence come first, followed by names
    from every ``alt_field`` occurrence; they are joined with ``" and "``.

    Args:
        record_dict (dict): Record whose ``"fields"`` entry is a list of dicts
            keyed by MARC tag, each holding a ``"subfields"`` list.
        main_field (str): MARC tag of the main author.
        main_subfield (str): Subfield code of the main author's name.
        alt_field (str): MARC tag of additional authors.
        alt_subfield (str): Subfield code of an additional author's name.
        role_subfield (str): Subfield code holding the role code.
        acronyms_mapping (dict): Role code -> label lookup.
        roles (bool): When exactly ``True``, append ``" (<role label>)"`` to
            each name that has a role subfield; names without one are kept
            as they are.

    Returns:
        str: The joined names, or ``""`` when the record has no author.

    Raises:
        KeyError: If ``roles`` is ``True`` and a role code is not a key of
            ``acronyms_mapping``.
    """
    # Only the literal True enables roles, as in the original check.
    with_roles = roles is True

    author_list = []
    for tag, code in ((main_field, main_subfield), (alt_field, alt_subfield)):
        author_list.extend(
            _author_names(
                record_dict, tag, code, role_subfield, acronyms_mapping, with_roles
            )
        )

    # parse authors in one string
    return " and ".join(author_list)
350
+
351
+
352
def get_howpublished(
    record_dict,
    license_field="506",
    licence_subfield="a",
    referee_field="591",
    referee_subfield="a",
    acronyms_mapping={
        "0": "Not peer-reviewed",
        "1": "Peer-reviewed",
        "Editorial review": "Peer-reviewed",
    },
):
    """Return license and peer-review information as one string.

    The license statement (first ``license_field`` occurrence) and the label
    of the referee status (first ``referee_field`` occurrence, translated
    through ``acronyms_mapping``) are joined with ``", "``. A missing part is
    left out, so the result never starts with a separator.

    Args:
        record_dict (dict): Record whose ``"fields"`` entry is a list of dicts
            keyed by MARC tag, each holding a ``"subfields"`` list.
        license_field (str): MARC tag of the license statement.
        licence_subfield (str): Subfield code of the license statement.
        referee_field (str): MARC tag of the referee status.
        referee_subfield (str): Subfield code of the referee status.
        acronyms_mapping (dict): Referee status -> label lookup. It is only
            read, never modified.

    Returns:
        str: The combined information, or ``""`` when neither part exists.

    Raises:
        KeyError: If the referee status is not a key of ``acronyms_mapping``.
    """
    parts = []

    # License information
    license_fields = [
        item for item in record_dict["fields"] if license_field in item.keys()
    ]
    if license_fields:
        statements = [
            part[licence_subfield]
            for part in license_fields[0][license_field]["subfields"]
            if licence_subfield in part.keys()
        ]
        if statements and statements[0]:
            parts.append(statements[0])

    # Referee information
    referee_fields = [
        item for item in record_dict["fields"] if referee_field in item.keys()
    ]
    if referee_fields:
        statuses = [
            part[referee_subfield]
            for part in referee_fields[0][referee_field]["subfields"]
            if referee_subfield in part.keys()
        ]
        if statuses:
            parts.append(acronyms_mapping[statuses[0]])

    return ", ".join(parts)
398
+
399
+
400
def get_booktitle(record_dict, field="773", subfield="t"):
    """Return the title of the book that contains the record.

    Reads the first occurrence of ``field`` and, within it, the first subfield
    keyed by ``subfield``.

    Args:
        record_dict (dict): Record whose ``"fields"`` entry is a list of dicts
            keyed by MARC tag, each holding a ``"subfields"`` list.
        field (str): MARC tag to read.
        subfield (str): Subfield code to read.

    Returns:
        str: The book title, or ``""`` when field or subfield is absent.
    """
    tagged = [item for item in record_dict["fields"] if field in item.keys()]
    hits = (
        [part for part in tagged[0][field]["subfields"] if subfield in part.keys()]
        if tagged
        else []
    )
    return hits[0][subfield] if hits else ""
413
+
414
+
415
def get_doi(
    record_dict,
    field="856",
    subfield="u",
    control_field="856",
    control_subfield="3",
    control_value="DOI",
):
    """Return the DOI of a MARC-in-JSON record.

    Every occurrence of ``field`` is checked: when its first
    ``control_subfield`` equals ``control_value``, the first ``subfield`` value
    is LaTeX-escaped with ``latex_normalize`` and stripped of the
    ``https://doi.org/`` prefix. If several occurrences qualify, the last one
    wins.

    Args:
        record_dict (dict): Record whose ``"fields"`` entry is a list of dicts
            keyed by MARC tag, each holding a ``"subfields"`` list.
        field (str): MARC tag holding the link.
        subfield (str): Subfield code of the link.
        control_field (str): Tag read on each matching entry to find the label.
        control_subfield (str): Subfield code of the label.
        control_value (str): Label that marks a link as a DOI.

    Returns:
        str: The DOI, or ``""`` when no link is labelled as one.
    """
    doi = ""

    candidates = [item for item in record_dict["fields"] if field in item.keys()]
    for entry in candidates:
        labels = [
            part
            for part in entry[control_field]["subfields"]
            if control_subfield in part.keys()
        ]
        # Skip links that carry no label or a label other than the DOI one.
        if not labels or labels[0][control_subfield] != control_value:
            continue

        links = [part for part in entry[field]["subfields"] if subfield in part.keys()]
        if links:
            doi = latex_normalize(links[0][subfield]).replace("https://doi.org/", "")

    return doi
447
+
448
+
449
def get_institution(record_dict, field="610", subfield="a"):
    """Return the institution named in a MARC-in-JSON record.

    Reads the first occurrence of ``field`` and, within it, the first subfield
    keyed by ``subfield``.

    Args:
        record_dict (dict): Record whose ``"fields"`` entry is a list of dicts
            keyed by MARC tag, each holding a ``"subfields"`` list.
        field (str): MARC tag to read.
        subfield (str): Subfield code to read.

    Returns:
        str: The institution, or ``""`` when field or subfield is absent.
    """
    matching_fields = [
        item for item in record_dict["fields"] if field in item.keys()
    ]
    if not matching_fields:
        return ""

    names = [
        part[subfield]
        for part in matching_fields[0][field]["subfields"]
        if subfield in part.keys()
    ]
    if not names:
        return ""
    return names[0]
462
+
463
+
464
def get_issn(record_dict, field="773", subfield="x"):
    """Return the ISSN of a MARC-in-JSON record with its hyphens removed.

    Reads the first occurrence of ``field`` and, within it, the first subfield
    keyed by ``subfield``.

    Args:
        record_dict (dict): Record whose ``"fields"`` entry is a list of dicts
            keyed by MARC tag, each holding a ``"subfields"`` list.
        field (str): MARC tag to read.
        subfield (str): Subfield code to read.

    Returns:
        str: The ISSN without ``-``, or ``""`` when field or subfield is absent.
    """
    tagged = [item for item in record_dict["fields"] if field in item.keys()]
    if not tagged:
        return ""

    raw_values = [
        part[subfield]
        for part in tagged[0][field]["subfields"]
        if subfield in part.keys()
    ]
    return raw_values[0].replace("-", "") if raw_values else ""
477
+
478
+
479
def get_isbn(record_dict, fields=[["020", "a"], ["773", "z"]]):
    """Return the ISBN of a MARC-in-JSON record.

    Each ``[tag, subfield code]`` pair of ``fields`` is tried in order on the
    first occurrence of its tag. A value found for a later pair replaces the
    value found for an earlier one.

    Args:
        record_dict (dict): Record whose ``"fields"`` entry is a list of dicts
            keyed by MARC tag, each holding a ``"subfields"`` list.
        fields (list): ``[tag, subfield code]`` pairs to look at. The list is
            only read, never modified.

    Returns:
        str: The ISBN, or ``""`` when none of the pairs matched.
    """
    isbn = ""

    for pair in fields:
        tag = pair[0]
        tagged = [item for item in record_dict["fields"] if tag in item.keys()]
        if not tagged:
            continue

        code = pair[1]
        found = [
            part[code] for part in tagged[0][tag]["subfields"] if code in part.keys()
        ]
        if found:
            isbn = found[0]

    return isbn
498
+
499
+
500
def get_journal(record_dict, field="773", subfield="t"):
    """Return the title of the journal that contains the record.

    Reads the first occurrence of ``field`` and, within it, the first subfield
    keyed by ``subfield``.

    Args:
        record_dict (dict): Record whose ``"fields"`` entry is a list of dicts
            keyed by MARC tag, each holding a ``"subfields"`` list.
        field (str): MARC tag to read.
        subfield (str): Subfield code to read.

    Returns:
        str: The journal title, or ``""`` when field or subfield is absent.
    """
    hosts = [item for item in record_dict["fields"] if field in item.keys()]
    if hosts:
        titles = [
            part[subfield]
            for part in hosts[0][field]["subfields"]
            if subfield in part.keys()
        ]
        if titles:
            return titles[0]
    return ""
513
+
514
+
515
def get_keywords(record_dict, field="650", subfield="a"):
    """Return the keywords of a MARC-in-JSON record as one string.

    Every occurrence of ``field`` contributes the value of its first
    ``subfield``; occurrences without that subfield are skipped.

    Args:
        record_dict (dict): Record whose ``"fields"`` entry is a list of dicts
            keyed by MARC tag, each holding a ``"subfields"`` list.
        field (str): MARC tag to read.
        subfield (str): Subfield code to read.

    Returns:
        str: The keywords joined with ``", "``, or ``""`` when there are none.
    """
    subject_fields = [
        item for item in record_dict["fields"] if field in item.keys()
    ]

    terms = []
    for entry in subject_fields:
        matches = [
            part[subfield]
            for part in entry[field]["subfields"]
            if subfield in part.keys()
        ]
        # Keep at most the first matching subfield of each occurrence.
        terms.extend(matches[:1])

    # parse keywords in one string
    return ", ".join(terms)
530
+
531
+
532
def get_note(record_dict, field="500", subfield="a"):
    """Return the note of a MARC-in-JSON record.

    Reads the first occurrence of ``field`` and, within it, the first subfield
    keyed by ``subfield``.

    Args:
        record_dict (dict): Record whose ``"fields"`` entry is a list of dicts
            keyed by MARC tag, each holding a ``"subfields"`` list.
        field (str): MARC tag to read.
        subfield (str): Subfield code to read.

    Returns:
        str: The note, or ``""`` when field or subfield is absent.
    """
    note_fields = [item for item in record_dict["fields"] if field in item.keys()]
    texts = (
        [
            part[subfield]
            for part in note_fields[0][field]["subfields"]
            if subfield in part.keys()
        ]
        if note_fields
        else []
    )
    return texts[0] if texts else ""
545
+
546
+
547
def get_issue_number(record_dict, field="773", subfield="g"):
    """Return the issue number of a MARC-in-JSON record.

    The first ``subfield`` of the first ``field`` occurrence is split on
    commas. A value without commas is treated as holding only pages and yields
    nothing; otherwise the second component is returned without spaces.

    Args:
        record_dict (dict): Record whose ``"fields"`` entry is a list of dicts
            keyed by MARC tag, each holding a ``"subfields"`` list.
        field (str): MARC tag to read.
        subfield (str): Subfield code to read.

    Returns:
        str: The issue number, or ``""`` when it cannot be determined.
    """
    tagged = [item for item in record_dict["fields"] if field in item.keys()]
    if not tagged:
        return ""

    raw_values = [
        part[subfield]
        for part in tagged[0][field]["subfields"]
        if subfield in part.keys()
    ]
    if not raw_values:
        return ""

    # check if there are multiple informations, such as volume and issue.
    pieces = raw_values[0].split(",")
    if len(pieces) > 1:  # volume,issue,pages case
        return pieces[1].replace(" ", "")
    return ""  # only pages case
566
+
567
+
568
def get_pages(record_dict, fields=[["300", "a"], ["773", "g"]]):
    """Return the page information of a MARC-in-JSON record.

    Each ``[tag, subfield code]`` pair of ``fields`` is tried in order on the
    first occurrence of its tag. The value found has the word ``pages``
    removed and every ``-`` doubled; a value found for a later pair replaces
    the value found for an earlier one.

    Args:
        record_dict (dict): Record whose ``"fields"`` entry is a list of dicts
            keyed by MARC tag, each holding a ``"subfields"`` list.
        fields (list): ``[tag, subfield code]`` pairs to look at. The list is
            only read, never modified.

    Returns:
        str: The page information, or ``""`` when none of the pairs matched.
    """
    pages = ""

    for pair in fields:
        tag = pair[0]
        tagged = [item for item in record_dict["fields"] if tag in item.keys()]
        if not tagged:
            continue

        code = pair[1]
        found = [
            part[code] for part in tagged[0][tag]["subfields"] if code in part.keys()
        ]
        if found:
            without_label = found[0].replace("pages", "")
            pages = without_label.replace("-", "--")

    return pages
589
+
590
+
591
def get_publisher(record_dict, field="260", subfield="b"):
    """Return the publisher of a MARC-in-JSON record.

    Reads the first occurrence of ``field`` and, within it, the first subfield
    keyed by ``subfield``.

    Args:
        record_dict (dict): Record whose ``"fields"`` entry is a list of dicts
            keyed by MARC tag, each holding a ``"subfields"`` list.
        field (str): MARC tag to read.
        subfield (str): Subfield code to read.

    Returns:
        str: The publisher, or ``""`` when field or subfield is absent.
    """
    imprint_fields = [
        item for item in record_dict["fields"] if field in item.keys()
    ]
    if not imprint_fields:
        return ""

    publishers = [
        part[subfield]
        for part in imprint_fields[0][field]["subfields"]
        if subfield in part.keys()
    ]
    return publishers[0] if publishers else ""
604
+
605
+
606
def get_school(record_dict, field="260", subfield="b"):
    """Return the school of a thesis record in MARC-in-JSON format.

    Reads the first occurrence of ``field`` and, within it, the first subfield
    keyed by ``subfield``.

    Args:
        record_dict (dict): Record whose ``"fields"`` entry is a list of dicts
            keyed by MARC tag, each holding a ``"subfields"`` list.
        field (str): MARC tag to read.
        subfield (str): Subfield code to read.

    Returns:
        str: The school, or ``""`` when field or subfield is absent.
    """
    tagged = [item for item in record_dict["fields"] if field in item.keys()]
    if tagged:
        schools = [
            part for part in tagged[0][field]["subfields"] if subfield in part.keys()
        ]
        if schools:
            return schools[0][subfield]
    return ""
619
+
620
+
621
def get_title(record_dict, field="245", subfield="a"):
    """Return the title of a MARC-in-JSON record.

    Reads the first occurrence of ``field`` and, within it, the first subfield
    keyed by ``subfield``.

    Args:
        record_dict (dict): Record whose ``"fields"`` entry is a list of dicts
            keyed by MARC tag, each holding a ``"subfields"`` list.
        field (str): MARC tag to read.
        subfield (str): Subfield code to read.

    Returns:
        str: The title, or ``""`` when field or subfield is absent.
    """
    title_fields = [item for item in record_dict["fields"] if field in item.keys()]
    if not title_fields:
        return ""

    titles = [
        part[subfield]
        for part in title_fields[0][field]["subfields"]
        if subfield in part.keys()
    ]
    if titles:
        return titles[0]
    return ""
634
+
635
+
636
def get_url(record_dict, field="856", subfield="u"):
    """Return the URL of a MARC-in-JSON record.

    Only the first occurrence of ``field`` is considered, and within it the
    first subfield keyed by ``subfield``.

    Args:
        record_dict (dict): Record whose ``"fields"`` entry is a list of dicts
            keyed by MARC tag, each holding a ``"subfields"`` list.
        field (str): MARC tag to read.
        subfield (str): Subfield code to read.

    Returns:
        str: The URL, or ``""`` when field or subfield is absent.
    """
    link_fields = [item for item in record_dict["fields"] if field in item.keys()]
    if not link_fields:
        return ""

    # only considering first url value:
    links = [
        part[subfield]
        for part in link_fields[0][field]["subfields"]
        if subfield in part.keys()
    ]
    return links[0] if links else ""
649
+
650
+
651
def get_volume(record_dict, field="773", subfield="g"):
    """Return the volume of a MARC-in-JSON record.

    The first ``subfield`` of the first ``field`` occurrence is split on
    commas. A value without commas is treated as holding only pages and yields
    nothing; otherwise the first component is returned unchanged.

    Args:
        record_dict (dict): Record whose ``"fields"`` entry is a list of dicts
            keyed by MARC tag, each holding a ``"subfields"`` list.
        field (str): MARC tag to read.
        subfield (str): Subfield code to read.

    Returns:
        str: The volume, or ``""`` when it cannot be determined.
    """
    tagged = [item for item in record_dict["fields"] if field in item.keys()]
    if not tagged:
        return ""

    raw_values = [
        part[subfield]
        for part in tagged[0][field]["subfields"]
        if subfield in part.keys()
    ]
    if not raw_values:
        return ""

    # check if there are multiple informations, such as volume and issue.
    pieces = raw_values[0].split(",")
    if len(pieces) > 1:  # volume,issue,pages case
        return pieces[0]
    return ""  # only pages case
670
+
671
+
672
def get_year(record_dict, fields=[["502", "a"], ["366", "b"], ["260", "c"]]):
    """Return the publication year of a MARC-in-JSON record.

    Each ``[tag, subfield code]`` pair of ``fields`` is tried in order on the
    first occurrence of its tag. The value found is cut at its first ``-``
    (dates are expected as ``yyyy-mm-dd``); a value found for a later pair
    replaces the value found for an earlier one.

    Args:
        record_dict (dict): Record whose ``"fields"`` entry is a list of dicts
            keyed by MARC tag, each holding a ``"subfields"`` list.
        fields (list): ``[tag, subfield code]`` pairs to look at. The list is
            only read, never modified.

    Returns:
        str: The year, or ``""`` when none of the pairs matched.
    """
    year = ""

    for pair in fields:
        tag = pair[0]
        tagged = [item for item in record_dict["fields"] if tag in item.keys()]
        if not tagged:
            continue

        code = pair[1]
        dates = [
            part[code] for part in tagged[0][tag]["subfields"] if code in part.keys()
        ]
        if dates:
            # found date of publication in the format yyyy-mm-dd
            year = dates[0].split("-")[0]

    return year