rara-tools 0.2.0__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of rara-tools might be problematic. Click here for more details.

@@ -0,0 +1,204 @@
1
+ import requests
2
+ import json
3
+ from typing import List
4
+ from collections import defaultdict
5
+
6
+ import logging
7
+ logger = logging.getLogger(__name__)
8
+
9
+
10
class VIAFRecord:
    """ Wrapper around a single raw VIAF API record.

    Lazily extracts (and caches) commonly used fields from the raw
    response dict. Multi-valued fields (name variations, occupations,
    nationalities) are filtered so that only entries backed by at least
    one allowed VIAF source are kept.
    """

    # Default VIAF source codes whose entries are trusted.
    DEFAULT_ALLOWED_SOURCES = ("LC", "DNB", "LNB", "NLL", "ERRR", "J9U")

    def __init__(self,
                 record: dict,
                 allowed_sources: List[str] = None
                 ):
        """ Initializes VIAFRecord object.

        Parameters
        -----------
        record: dict
            Raw VIAF response: either a search result (payload under
            "queryResult") or a cluster record (payload under
            "recordData" -> "VIAFCluster").
        allowed_sources: List[str]
            VIAF source codes used for filtering multi-valued fields.
            Defaults to DEFAULT_ALLOWED_SOURCES. NB! A None default
            replaces the original mutable list default.
        """
        if allowed_sources is None:
            allowed_sources = list(self.DEFAULT_ALLOWED_SOURCES)
        self.__record: dict = record
        self.__record_data: dict = {}
        self.__allowed_sources: List[str] = allowed_sources

        # Lazily populated caches (filled on first property access).
        self.__viaf_id = None
        self.__name_variations: List[str] = []
        self.__birth_date = None
        self.__death_date = None
        self.__occupations: List[str] = []
        self.__all_fields: dict = {}
        self.__nationality: str = ""
        self.__author: str = ""
        self.__author_type = None
        # NB! The original initialized `__has_isni` twice (as bool, then as
        # str); the `has_isni` property derives its value directly from the
        # record, so no cache attribute is kept at all.
        # Dedicated caches for activity dates — the original reused the
        # birth/death date caches, which cross-contaminated the values.
        self.__activity_start = None
        self.__activity_end = None

    @property
    def author(self) -> str:
        """Main heading text (preferred display name of the entity)."""
        if not self.__author:
            self.__author = self.record_data.get(
                "mainHeading", {}).get("text", "")
        # BUGFIX: the original property had no return statement.
        return self.__author

    @property
    def author_type(self) -> str:
        """Type of name (personal, corporate, title, etc)."""
        if not self.__author_type:
            self.__author_type = self.record_data.get("nameType")
        # BUGFIX: the original property had no return statement.
        return self.__author_type

    @property
    def viaf_id(self) -> str:
        """VIAF identifier of the record (empty string when missing)."""
        if not self.__viaf_id:
            self.__viaf_id = self.record_data.get("viafID", "")
        return self.__viaf_id

    @property
    def has_isni(self) -> bool:
        """True, if the record carries a non-empty ISNI identifier."""
        return bool(self.record_data.get("isni", ""))

    def __get_data(self, field_name: str) -> List[str]:
        """Texts of `field_name` data entries backed by an allowed source."""
        entries = self.record_data.get(field_name, {}).get("data", [])
        allowed = set(self.__allowed_sources)  # hoisted out of the loop

        data = []
        for entry in entries:
            sources = entry.get("sources", {}).get("s", [])
            if allowed.intersection(sources):
                data.append(entry.get("text", ""))
        return data

    @property
    def record_data(self) -> dict:
        """Payload dict of the record, regardless of the response flavour."""
        if not self.__record_data:
            try:
                self.__record_data = self.__record["queryResult"]
            except KeyError:
                # Cluster-record responses nest the payload differently.
                # Falls back to {} for empty / failed fetches instead of
                # raising (the original used a bare `except` and crashed
                # with KeyError on empty dicts).
                self.__record_data = self.__record.get(
                    "recordData", {}).get("VIAFCluster", {})
        return self.__record_data

    @property
    def name_variations(self) -> List[str]:
        """All name variations backed by an allowed source."""
        if not self.__name_variations:
            self.__name_variations = self.__get_data("mainHeadings")
        return self.__name_variations

    @property
    def birth_date(self) -> str:
        """Birth date string as reported by VIAF, or None."""
        if not self.__birth_date:
            self.__birth_date = self.record_data.get("birthDate", None)
        return self.__birth_date

    @property
    def death_date(self) -> str:
        """Death date string as reported by VIAF, or None."""
        if not self.__death_date:
            self.__death_date = self.record_data.get("deathDate", None)
        return self.__death_date

    @property
    def occupations(self) -> List[str]:
        """Occupations backed by an allowed source."""
        if not self.__occupations:
            self.__occupations = self.__get_data("occupation")
        return self.__occupations

    @property
    def activity_start(self) -> str:
        """Activity start date as reported by VIAF, or None."""
        # BUGFIX: previously cached into `__birth_date`, so whichever of
        # birth_date / activity_start was read first was returned for both.
        if not self.__activity_start:
            self.__activity_start = self.record_data.get("activityStart", None)
        return self.__activity_start

    @property
    def activity_end(self) -> str:
        """Activity end date as reported by VIAF, or None."""
        # BUGFIX: see activity_start; previously cached into `__death_date`.
        if not self.__activity_end:
            self.__activity_end = self.record_data.get("activityEnd", None)
        return self.__activity_end

    @property
    def nationality(self) -> str:
        """Most frequent (lowercased) nationality among allowed sources."""
        if not self.__nationality:
            nationalities = self.__get_data("nationalityOfEntity")
            nationalities_dict = defaultdict(int)
            for n in nationalities:
                nationalities_dict[n.lower()] += 1
            if nationalities:
                # Highest count wins; ties resolved by sort stability.
                self.__nationality = sorted(
                    nationalities_dict.items(), key=lambda x: x[1], reverse=True)[0][0]
        return self.__nationality

    @property
    def all_fields(self) -> dict:
        """All extracted fields collected into a single (cached) dict."""
        if not self.__all_fields:
            self.__all_fields = {
                "viaf_id": self.viaf_id,
                "name_variations": self.name_variations,
                "birth_date": self.birth_date,
                "death_date": self.death_date,
                "occupations": self.occupations,
                "nationality": self.nationality,
                "activity_start": self.activity_start,
                "activity_end": self.activity_end,
                "has_isni": self.has_isni,
                "author": self.author
            }
        return self.__all_fields
139
+
140
+
141
+ class VIAFClient:
142
+ def __init__(self, viaf_api_url: str = "https://viaf.org/api"):
143
+ self.root_url = viaf_api_url.strip("/")
144
+ self.record_url = f"{self.root_url}/cluster-record"
145
+ self.search_url = f"{self.root_url}/search"
146
+ self.headers = {
147
+ "Accept": "application/json",
148
+ "Content-Type": "application/json"
149
+ }
150
+
151
+ def _send_request(self, url: str, data: dict) -> dict:
152
+ return requests.post(url, data=json.dumps(data), headers=self.headers)
153
+
154
+ def get_records_by_search_term(self,
155
+ search_term: str,
156
+ index: str = "viaf",
157
+ field: str = "local.names",
158
+ page_index: int = 0,
159
+ page_size: int = 50
160
+ ) -> dict:
161
+ data = {
162
+ "reqValues": {
163
+ "field": field,
164
+ "index": index,
165
+ "searchTerms": search_term
166
+ },
167
+ "meta": {
168
+ "env": "prod",
169
+ "pageIndex": page_index,
170
+ "pageSize": page_size
171
+ }
172
+ }
173
+ response = self._send_request(url=self.search_url, data=data)
174
+ return response
175
+
176
+ def get_records_by_viaf_id(self, record_id: str) -> dict:
177
+ data = {
178
+ "reqValues": {
179
+ "recordId": str(record_id)
180
+ }
181
+ }
182
+ response = self._send_request(url=self.record_url, data=data)
183
+
184
+ return response
185
+
186
+ def fetch_viaf_clusters(self, viaf_ids):
187
+
188
+ results = {}
189
+
190
+ for viaf_id in viaf_ids:
191
+ try:
192
+ response = self.get_records_by_viaf_id(viaf_id)
193
+ response.raise_for_status()
194
+ results[viaf_id] = response.json()
195
+ except requests.exceptions.RequestException as e:
196
+ logger.error(f"Error fetching VIAF record {viaf_id}: {e}")
197
+ results[viaf_id] = {}
198
+
199
+ return results
200
+
201
+ def get_normalized_data(self, record_ids: List[str]) -> List[VIAFRecord]:
202
+ """ Fetch data required for normalization from VIAF. """
203
+ response = self.fetch_viaf_clusters(record_ids)
204
+ return [VIAFRecord(response[record_id]) for record_id in record_ids]
@@ -0,0 +1,50 @@
1
+ from typing import List, NoReturn
2
+ from pymarc.record import Record
3
+ from pymarc import MARCReader
4
+ from abc import abstractmethod
5
+ from collections.abc import Iterator, Iterable
6
+ import jsonlines
7
+
8
+
9
class BaseMARCParser:
    """ Base class for MARC parsers.

    Subclasses implement `record_generator()` and yield the final,
    JSON-serializable payload of each parsed record.
    """
    def __init__(self,
                 marc_file_path: str,
                 add_variations: bool = True
                 ) -> None:
        """ Initializes BaseMARCParser object.

        Parameters
        -----------
        marc_file_path: str
            Full path to .mrc file containing EMS data.
        add_variations: bool
            If enabled, constructs an additional variations field, which
            combines the content of multiple fields + adds some generated
            variations. If the output is uploaded into Elastic and used
            via rara-norm-linker, it is necessary to enable this.
        """
        # NB! Return annotations fixed from `NoReturn` to `None` across the
        # class: NoReturn means "never returns" (always raises/loops),
        # which is not the case for any of these methods.
        self.add_variations = add_variations
        self.marc_file_path = marc_file_path

    def _write_line(self, line: dict, file_path: str) -> None:
        """Appends a single JSON line to `file_path`."""
        with jsonlines.open(file_path, "a") as f:
            f.write(line)

    def marc_record_generator(self) -> Iterator[Record]:
        """ Generates pymarc.record.Record objects.

        Records the reader could not parse (falsy entries) are skipped.
        """
        with open(self.marc_file_path, "rb") as fh:
            reader = MARCReader(fh)
            for record in reader:
                if record:
                    yield record

    @abstractmethod
    def record_generator(self) -> Iterator:
        """Yields fully parsed record payloads; implemented by subclasses."""
        pass

    def save_as_jl(self, jl_file_path: str) -> None:
        """ Serializes all parsed records into a JSON-lines file.
        """
        for record in self.record_generator():
            # BUGFIX: subclass generators already yield `<X>Record.full_record`
            # payloads, so accessing `.full_record` again raised
            # AttributeError. Write the yielded value directly.
            self._write_line(record, jl_file_path)
@@ -0,0 +1,49 @@
1
+ from typing import NoReturn
2
+ from collections.abc import Iterator
3
+ from rara_tools.parsers.marc_parsers.base_parser import BaseMARCParser
4
+ from rara_tools.parsers.marc_records.ems_record import EMSRecord
5
+ from rara_tools.constants.parsers import KeywordType, LOGGER
6
+
7
+
8
class EMSMARCParser(BaseMARCParser):
    """ MARC parser for EMS .mrc files.

    Yields topic, genre and time keywords; location keywords are skipped
    here and handled by a separate parser.
    """
    def __init__(self,
                 marc_file_path: str,
                 add_variations: bool = True
                 ) -> None:
        """ Initializes EMSMARCParser object.

        Parameters
        -----------
        marc_file_path: str
            Full path to .mrc file containing EMS data.
        add_variations: bool
            If enabled, constructs an additional variations field, which
            combines the content of multiple fields + adds some generated
            variations. If the output is uploaded into Elastic and used
            via rara-norm-linker, it is necessary to enable this.
        """
        super().__init__(
            marc_file_path=marc_file_path,
            add_variations=add_variations
        )

    def record_generator(self) -> Iterator:
        """ Generates records for topic, genre, and time keywords.
        Location keywords are ignored here and handled in a separate parser.

        NB! Yields `EMSRecord.full_record` payloads, not EMSRecord
        instances — the original `Iterator[EMSRecord]` annotation was
        misleading.
        """
        LOGGER.info(
            f"Generating EMS records (without location keywords) " \
            f"from MARC dump '{self.marc_file_path}'."
        )
        for record in self.marc_record_generator():
            ems_record = EMSRecord(
                record=record,
                add_variations=self.add_variations
            )
            # Skip location keywords; everything else is yielded
            # (flattened from the original `continue`/`else` pair).
            if ems_record.keyword_type != KeywordType.LOC:
                yield ems_record.full_record
@@ -0,0 +1,46 @@
1
+ from typing import NoReturn
2
+ from collections.abc import Iterator
3
+ from rara_tools.parsers.marc_parsers.base_parser import BaseMARCParser
4
+ from rara_tools.parsers.marc_records.ems_record import EMSRecord
5
+ from rara_tools.constants.parsers import KeywordType, LOGGER
6
+
7
+
8
class LocationMARCParser(BaseMARCParser):
    """ MARC parser extracting location keywords from EMS .mrc files.
    """
    def __init__(self,
                 marc_file_path: str,
                 add_variations: bool = True
                 ) -> None:
        """ Initializes LocationMARCParser object.

        Parameters
        -----------
        marc_file_path: str
            Full path to .mrc file containing EMS data.
            (Typo "marc_file_pasth" fixed in the docs.)
        add_variations: bool
            If enabled, constructs an additional variations field, which
            combines the content of multiple fields + adds some generated
            variations. If the output is uploaded into Elastic and used
            via rara-norm-linker, it is necessary to enable this.
        """

        super().__init__(
            marc_file_path=marc_file_path,
            add_variations=add_variations
        )

    def record_generator(self) -> Iterator:
        """ Generates records for location keywords only.

        NB! Yields `EMSRecord.full_record` payloads, not EMSRecord
        instances — the original `Iterator[EMSRecord]` annotation was
        misleading.
        """
        LOGGER.info(
            f"Generating EMS-based location records " \
            f"from MARC dump '{self.marc_file_path}'."
        )
        for record in self.marc_record_generator():
            ems_record = EMSRecord(
                record=record,
                add_variations=self.add_variations
            )
            if ems_record.keyword_type == KeywordType.LOC:
                yield ems_record.full_record
@@ -0,0 +1,44 @@
1
+ from typing import NoReturn
2
+ from collections.abc import Iterator
3
+ from rara_tools.parsers.marc_parsers.base_parser import BaseMARCParser
4
+ from rara_tools.parsers.marc_records.organization_record import OrganizationRecord
5
+ from rara_tools.constants.parsers import LOGGER
6
+
7
+
8
class OrganizationsMARCParser(BaseMARCParser):
    """ MARC parser for organizations' .mrc files.

    Wraps every raw MARC record into an OrganizationRecord and yields its
    final payload.
    """
    def __init__(self,
                 marc_file_path: str,
                 add_variations: bool = True
                 ) -> NoReturn:
        """ Initializes OrganizationsMARCParser object.

        Parameters
        -----------
        marc_file_path: str
            Full path to .mrc file containing organizations' data.
        add_variations: bool
            If enabled, constructs an additional variations field, which
            combines the content of multiple fields + adds some generated
            variations. If the output is uploaded into Elastic and used
            via rara-norm-linker, it is necessary to enable this.
        """
        super().__init__(
            marc_file_path=marc_file_path,
            add_variations=add_variations
        )

    def record_generator(self) -> Iterator[OrganizationRecord]:
        """ Generates OrganizationRecord payloads from the MARC dump.
        """
        LOGGER.info(
            f"Generating organization records from MARC dump "
            f"'{self.marc_file_path}'."
        )
        for raw_record in self.marc_record_generator():
            yield OrganizationRecord(
                record=raw_record,
                add_variations=self.add_variations
            ).full_record
@@ -0,0 +1,45 @@
1
+ from typing import NoReturn, List
2
+ from collections.abc import Iterator
3
+ from rara_tools.parsers.marc_parsers.base_parser import BaseMARCParser
4
+ from rara_tools.parsers.marc_records.person_record import PersonRecord
5
+ from rara_tools.constants.parsers import LOGGER
6
+
7
+
8
class PersonsMARCParser(BaseMARCParser):
    """ MARC parser for persons' .mrc files.

    Wraps every raw MARC record into a PersonRecord and yields its final
    payload.
    """
    def __init__(self,
                 marc_file_path: str,
                 add_variations: bool = True
                 ) -> NoReturn:
        """ Initializes PersonsMARCParser object.

        Parameters
        -----------
        marc_file_path: str
            Full path to .mrc file containing persons' data.
        add_variations: bool
            If enabled, constructs an additional variations field, which
            combines the content of multiple fields + adds some generated
            variations. If the output is uploaded into Elastic and used
            via rara-norm-linker, it is necessary to enable this.
        """

        super().__init__(
            marc_file_path=marc_file_path,
            add_variations=add_variations
        )

    def record_generator(self) -> Iterator[PersonRecord]:
        """ Generates PersonRecord payloads from the MARC dump.
        """
        LOGGER.info(
            f"Generating person records from MARC dump "
            f"'{self.marc_file_path}'."
        )
        for raw_record in self.marc_record_generator():
            yield PersonRecord(
                record=raw_record,
                add_variations=self.add_variations
            ).full_record
@@ -0,0 +1 @@
1
+ # Coming soon
@@ -0,0 +1,112 @@
1
+ from typing import List, NoReturn, Tuple
2
+ from abc import abstractmethod
3
+ from pymarc.record import Record
4
+ from rara_tools.constants.parsers import GeneralMarcIDs
5
+
6
+
7
class BaseRecord:
    """ Implements general logic of parsing MARC files.

    Wraps a pymarc Record and provides generic field/subfield extraction
    (`get_values`) plus lazily cached access to the record's identifier
    and identifier source.
    """
    def __init__(self, record: Record, add_variations: bool = False) -> NoReturn:
        """ Initializes BaseRecord object.

        Parameters
        -----------
        record: Record
            pymarc.record.Record object.
        add_variations: bool
            If enabled, constructs an additional variations field, which
            combines the content of multiple fields + adds some generated
            variations. If the output is uploaded into Elastic and used
            via rara-norm-linker, it is necessary to enable this.
        """
        self.add_variations: bool = add_variations
        self.__record_mrc: Record = record
        # MARC-in-JSON "fields" is a list of single-key dicts
        # ({field_id: content}), not a dict — annotation fixed accordingly.
        self.__record_dict: List[dict] = record.as_dict()["fields"]

        # MARC field IDs holding the record identifier and its source.
        self.__id_field_id: List[str] = GeneralMarcIDs.ID
        self.__id_source_field_id: List[str] = GeneralMarcIDs.ID_SOURCE

        # Lazily populated caches (filled on first property access).
        self.__identifier: str = ""
        self.__identifier_source: str = ""

    def get_values(self,
                   marc_ids: List[str],
                   subfield_id: str | List[str] = "",
                   ind1: str = " ",
                   ind2: str = " ",
                   subfield_restriction: Tuple[str, str] = (),
                   subfield_to_ignore: str | None = None
                   ) -> List[str] | List[dict]:
        """ Extracts values from fields whose ID is in `marc_ids`.

        Parameters
        -----------
        marc_ids: List[str]
            MARC field IDs to extract values from.
        subfield_id: str | List[str]
            If empty, whole field contents are returned. If a single
            subfield code (str), a flat list of matching subfield values
            is returned. If a list of codes, one {code: value} dict is
            returned per matching field.
        ind1: str
            Currently unused — see the TODO below.
        ind2: str
            Required value of the field's second indicator.
        subfield_restriction: Tuple[str, str]
            If given, a (code, value) pair that must be present among the
            field's subfields; otherwise the field is skipped.
        subfield_to_ignore: str | None
            If given, fields containing this subfield code are skipped.

        Returns
        -----------
        List[str] | List[dict]
            Flat values for a str `subfield_id`, dicts for a list one.
        """
        values = []

        for field in self.dict_record:
            # Each field is a single-key dict: {field_id: content}.
            field_id = list(field.keys())[0]
            if field_id in marc_ids:
                # TODO: ind1!
                if not subfield_id:
                    # No subfield filtering requested: return raw content.
                    values.append(field[field_id])
                else:
                    if field[field_id]["ind2"] == ind2:
                        subfields = field[field_id]["subfields"]
                        subfield_tuples = [list(subfield.items())[0] for subfield in subfields]
                        subfield_keys = [list(subfield.keys())[0] for subfield in subfields]
                        if subfield_restriction and subfield_restriction not in subfield_tuples:
                            continue
                        if subfield_to_ignore and subfield_to_ignore in subfield_keys:
                            continue
                        _value = {}
                        for subfield in subfields:
                            _subfield_id = list(subfield.keys())[0]
                            if isinstance(subfield_id, str):
                                # Single code: collect matching values flat.
                                if _subfield_id == subfield_id:
                                    value = subfield[_subfield_id]
                                    values.append(value)
                            elif isinstance(subfield_id, list):
                                # Multiple codes: collect into one dict
                                # per field. NOTE(review): a later
                                # duplicate code overwrites an earlier
                                # one within the same field — confirm
                                # this is intended.
                            if _subfield_id in subfield_id:
                                    value = subfield[_subfield_id]
                                    _value[_subfield_id] = value
                        if isinstance(subfield_id, list):
                            # NOTE(review): appended even when empty — the
                            # result list may contain {} entries; confirm
                            # callers expect this.
                            values.append(_value)

        return values

    def _clean_value(self, value: str) -> str:
        """Strips surrounding punctuation (dots, commas) and whitespace."""
        cleaned_value = value.strip("., ")
        return cleaned_value

    def _merge_and_clean(self, value: dict, keys: List[str]) -> str:
        """Joins the cleaned values of `keys` (in order) with spaces."""
        _merged = []
        for key in keys:
            _value = self._clean_value(value.get(key, ""))
            if _value:
                _merged.append(_value)
        merged = " ".join(_merged)
        return merged


    @property
    def identifier(self) -> str:
        """First value of the identifier field ("" when absent); cached."""
        if not self.__identifier:
            values = self.get_values(marc_ids=self.__id_field_id)
            self.__identifier = values[0] if values else ""
        return self.__identifier

    @property
    def identifier_source(self) -> str:
        """First value of the identifier-source field ("" when absent); cached."""
        if not self.__identifier_source:
            values = self.get_values(marc_ids=self.__id_source_field_id)
            self.__identifier_source = values[0] if values else ""
        return self.__identifier_source

    @property
    def marc_record(self) -> Record:
        """The wrapped pymarc Record object."""
        return self.__record_mrc

    @property
    def marc_json_record(self) -> dict:
        """The full record converted to a MARC-in-JSON dict."""
        return self.marc_record.as_dict()

    @property
    def dict_record(self) -> List[dict]:
        """The record's "fields" list (annotation fixed from `Record`)."""
        return self.__record_dict