rara-tools 0.6.17__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of rara-tools might be problematic. Click here for more details.

@@ -1,3 +1,6 @@
1
+ import logging
2
+ from rara_tools.constants.normalizers import EntityType, VIAF_ENTITY_MAP
3
+
1
4
  COMPONENT_KEY = "linker"
2
5
 
3
6
 
@@ -20,3 +23,114 @@ class Queue:
20
23
class StatusKeys:
    """String constants naming the status keys of the linker component."""

    VECTORIZE_CONTEXT = "vectorize_context"
    LINK_KEYWORDS = "link_keywords"
26
+
27
class URLSource:
    """Labels for the source a keyword URL originates from."""

    VIAF = "VIAF"
    # NOTE(review): spelled "Sierra" here but "SIERRA" in KeywordSource.
    SIERRA = "Sierra"
    EMS = "EMS"
31
+
32
class KeywordType:
    """Keyword type labels (Estonian display names) attached to keywords.

    The values are matched verbatim against the `entity_type` of incoming
    keyword dictionaries, so they must not be altered or translated.
    """

    LOC = "Kohamärksõnad"                     # location keywords
    TIME = "Ajamärksõnad"                     # time keywords
    TOPIC = "Teemamärksõnad"                  # topic keywords
    GENRE = "Vormimärksõnad"                  # genre / form keywords
    TITLE = "Teose pealkiri"                  # title of a work
    PER = "Isikunimi"                         # personal name
    ORG = "Kollektiivi nimi"                  # name of a corporate body
    EVENT = "Ajutine kollektiiv või sündmus"  # temporary collective or event
    CATEGORY = "Valdkonnamärksõnad"           # subject-area keywords
    UDC = "UDC Summary"
    UDK = "UDK Rahvusbibliograafia"
44
+
45
+
46
class KeywordMARC:
    """MARC field numbers assigned to the individual keyword types."""

    PER = 600
    ORG = 610
    TOPIC = 650
    GENRE = 655
    TIME = 648
    LOC = 651
    EVENT = 611
    TITLE = 630
    # Field used for a title that has a known author; same number as PER.
    TITLE_LINKED = 600
56
+
57
class KeywordSource:
    """Labels for the origin of a keyword."""

    EMS = "EMS"
    SIERRA = "SIERRA"
    VIAF = "VIAF"
    # Used for keywords that could not be attributed to any of the above.
    AI = "AI"
62
+
63
class Filters:
    """Names of the filters referenced by ALLOWED_FILTERS_MAP."""

    AUTHOR = "author"
    YEAR = "year"
66
+
67
+
68
# MARC field assigned to keywords that have no linked document.
UNLINKED_KEYWORD_MARC_FIELD = 693

# Filters permitted for each entity type (an empty list means none).
ALLOWED_FILTERS_MAP = {
    EntityType.PER: [Filters.YEAR],
    EntityType.ORG: [Filters.YEAR],
    EntityType.TITLE: [Filters.YEAR, Filters.AUTHOR],
    EntityType.KEYWORD: [],
    EntityType.LOC: [],
}

# Keyword type label -> MARC field number.
KEYWORD_MARC_MAP = {
    KeywordType.LOC: KeywordMARC.LOC,
    KeywordType.TIME: KeywordMARC.TIME,
    KeywordType.TOPIC: KeywordMARC.TOPIC,
    KeywordType.GENRE: KeywordMARC.GENRE,
    KeywordType.TITLE: KeywordMARC.TITLE,
    KeywordType.ORG: KeywordMARC.ORG,
    KeywordType.PER: KeywordMARC.PER,
    KeywordType.EVENT: KeywordMARC.EVENT,
}

# Entity type -> source the entity's URL is looked up from.
URL_SOURCE_MAP = {
    EntityType.PER: URLSource.VIAF,
    EntityType.ORG: URLSource.VIAF,
    EntityType.TITLE: URLSource.VIAF,
    EntityType.KEYWORD: URLSource.EMS,
    EntityType.LOC: URLSource.EMS,
}

# Ignore those "keyword types" while linking the
# rara-subject-indexer results
KEYWORD_TYPES_TO_IGNORE = [
    KeywordType.CATEGORY,
    KeywordType.UDC,
    KeywordType.UDK,
]

ALLOWED_ENTITY_TYPES = [
    EntityType.PER,
    EntityType.ORG,
    EntityType.KEYWORD,
    EntityType.LOC,
    EntityType.TITLE,
    EntityType.UNK,
]

# Keyword type label -> entity type used during linking.
KEYWORD_TYPE_MAP = {
    KeywordType.TIME: EntityType.KEYWORD,
    KeywordType.GENRE: EntityType.KEYWORD,
    KeywordType.LOC: EntityType.LOC,
    KeywordType.PER: EntityType.PER,
    KeywordType.ORG: EntityType.ORG,
    KeywordType.TOPIC: EntityType.KEYWORD,
    KeywordType.TITLE: EntityType.TITLE,
    KeywordType.EVENT: EntityType.ORG,
}

# Entity types grouped by the source they are associated with.
EMS_ENTITY_TYPES = [EntityType.KEYWORD, EntityType.LOC]
SIERRA_ENTITY_TYPES = [EntityType.PER, EntityType.ORG, EntityType.TITLE]
VIAF_ENTITY_TYPES = [EntityType.PER, EntityType.ORG, EntityType.TITLE]

# Params for filters
# NOTE(review): exact semantics are defined by the filter code, which is
# not part of this module - confirm before changing these values.
MIN_AUTHOR_SIMILARITY = 0.95
YEAR_EXCEPTION_VALUE = True

LOGGER_NAME = "rara-tools-norm-linker"
LOGGER = logging.getLogger(LOGGER_NAME)

MAIN_TAXONOMY_LANG = "et"
@@ -0,0 +1,86 @@
1
+ from typing import List, Tuple, Any
2
+ from rara_tools.core_formatters.formatted_keyword import FormattedKeyword
3
+ from rara_tools.core_formatters.formatted_meta import FormattedAuthor
4
+ from rara_tools.constants.linker import MAIN_TAXONOMY_LANG, KEYWORD_TYPES_TO_IGNORE, EntityType
5
+
6
def get_primary_author(authors: List[dict]) -> str:
    """ Returns the name of the primary author.

    Parameters
    -----------
    authors: List[dict]
        Author dictionaries; the keys `is_primary` and `name` are read.

    Returns
    ----------
    str:
        Name of the last author flagged with a truthy `is_primary`
        ("" if that author has no `name` key), or "" if no author
        is flagged as primary.
    """
    primary_names = [
        entry.get("name", "")
        for entry in authors
        if entry.get("is_primary", False)
    ]
    # The last flagged author wins when several are marked as primary.
    return primary_names[-1] if primary_names else ""
12
+
13
def format_authors(authors: List[dict]) -> List[dict]:
    """ Formats unlinked authors.

    Every author dictionary is wrapped into a FormattedAuthor without a
    linked document and converted back into a dictionary.

    Parameters
    -----------
    authors: List[dict]
        Author dictionaries; the optional key `type` holds the entity
        type and defaults to EntityType.UNK.

    Returns
    ----------
    List[dict]:
        One formatted dictionary per input author, in input order.
    """
    return [
        FormattedAuthor(
            object_dict=entry,
            linked_doc=None,
            entity_type=entry.get("type", EntityType.UNK)
        ).to_dict()
        for entry in authors
    ]
25
+
26
def format_sections(sections: List[dict]) -> List[dict]:
    """ Formats the authors and titles of every section in place.

    Parameters
    -----------
    sections: List[dict]
        Section dictionaries; the keys `authors` and `titles` are
        read and rewritten (both are created if missing).

    Returns
    ----------
    List[dict]:
        The same list object, with each section's `authors` replaced
        by their formatted form and each title annotated with
        `author_from_title` when the section has a primary author.
    """
    for section in sections:
        section_authors = section.pop("authors", [])
        section_titles = section.pop("titles", [])

        main_author = get_primary_author(section_authors)
        # Titles are only annotated when a primary author is known.
        for title in (section_titles if main_author else []):
            title["author_from_title"] = main_author

        section["titles"] = section_titles
        section["authors"] = format_authors(section_authors)

    return sections
40
+
41
def format_meta(meta: dict) -> dict:
    """ Formats unlinked meta for Kata CORE.

    Parameters
    -----------
    meta: dict
        Dictionary whose key `meta` holds the metadata to format.
        The nested keys `authors` and `sections` are rewritten in place.

    Returns
    ----------
    dict:
        The same `meta` object. Non-empty `authors` / `sections` are
        replaced by their formatted form; empty ones are removed.
        If `meta` has no (or an empty) `meta` entry, it is returned
        unchanged instead of failing on the missing dictionary.
    """
    meta_to_format = meta.get("meta")
    if not meta_to_format:
        # Nothing to format; previously a missing / None entry raised
        # AttributeError on the .pop() calls below.
        return meta

    authors = meta_to_format.pop("authors", [])
    sections = meta_to_format.pop("sections", [])

    # Formatting an empty list yields an empty list, so the helpers
    # are only invoked when there is something to format.
    formatted_authors = format_authors(authors) if authors else []
    formatted_sections = format_sections(sections) if sections else []

    if formatted_sections:
        meta_to_format["sections"] = formatted_sections
    if formatted_authors:
        meta_to_format["authors"] = formatted_authors

    meta["meta"] = meta_to_format

    return meta
61
+
62
+
63
def format_keywords(flat_keywords: List[dict]) -> List[dict]:
    """ Formats unlinked keywords for Kata CORE.

    Parameters
    -----------
    flat_keywords: List[dict]
        Keyword dictionaries; the key `entity_type` holds the
        keyword type.

    Returns
    ----------
    List[dict]:
        Formatted dictionaries of all keywords whose type is not listed
        in KEYWORD_TYPES_TO_IGNORE, in input order. Ignored keywords
        are dropped from the output.
    """
    return [
        FormattedKeyword(
            object_dict=keyword_dict,
            linked_doc=None,
            main_taxnomy_lang=MAIN_TAXONOMY_LANG
        ).to_dict()
        for keyword_dict in flat_keywords
        if keyword_dict.get("entity_type") not in KEYWORD_TYPES_TO_IGNORE
    ]
@@ -0,0 +1,229 @@
1
+ from rara_tools.constants.linker import (
2
+ LOGGER, URLSource, KeywordSource, EntityType, KeywordType, KeywordMARC,
3
+ KEYWORD_MARC_MAP, KEYWORD_TYPES_TO_IGNORE, KEYWORD_TYPE_MAP,
4
+ EMS_ENTITY_TYPES, SIERRA_ENTITY_TYPES, UNLINKED_KEYWORD_MARC_FIELD,
5
+ URL_SOURCE_MAP
6
+ )
7
+ from rara_tools.core_formatters.formatted_object import FormattedObject
8
+ from typing import List, Dict, NoReturn, Tuple, Any
9
+
10
class FormattedKeyword(FormattedObject):
    """ Keyword in the flat dictionary form produced for Kata CORE.

    Wraps one keyword dictionary together with an optional linked
    document. MARC-related values (field, indicators, source, URL) are
    computed on first access and cached on the instance.
    """

    def __init__(self, object_dict: dict, linked_doc: Any,
            main_taxnomy_lang: str, url_source_map: dict = URL_SOURCE_MAP
    ) -> None:
        """
        Parameters
        -----------
        object_dict: dict
            Raw keyword dictionary. The keys `keyword`, `score`, `count`,
            `method`, `model_arch`, `entity_type` and `language` are read.
        linked_doc: Any
            Linked document, or None for an unlinked keyword.
        main_taxnomy_lang: str
            Language reported for linked keywords.
        url_source_map: dict
            Mapping from entity type to URL source. The default mapping
            is shared between instances and must not be mutated.
        """
        super().__init__(
            object_dict=object_dict,
            linked_doc=linked_doc,
            original_entity_key="keyword"
        )

        self.main_taxnomy_lang: str = main_taxnomy_lang

        self.original_keyword: str = self.original_entity
        self.score: float = self.object_dict.get("score")
        self.count: int = self.object_dict.get("count")
        self.method: str = self.object_dict.get("method")
        # Falls back to the method name if no architecture is given.
        self.model_arch: str = self.object_dict.get("model_arch", self.method)
        self.keyword_type: str = self.object_dict.get("entity_type")

        # Unknown keyword types map to an empty entity type.
        self.entity_type: str = KEYWORD_TYPE_MAP.get(self.keyword_type, "")
        self.url_source_map: dict = url_source_map

        # Lazily computed caches; "" / None mean "not computed yet".
        self.__keyword_source: str = ""
        self.__indicator_1: str = ""
        self.__indicator_2: str = ""
        self.__url: str | None = None
        self.__url_source: str | None = None
        self.__marc_field: int | str = ""

        self.__language: str = ""
        self.__author: str | None = None

    @property
    def keyword(self) -> str:
        """ The (possibly linked) keyword; alias of `entity`."""
        return self.entity

    @property
    def keyword_source(self) -> str:
        """ Source of the keyword as a KeywordSource value.

        Unlinked keywords and keywords that match no other rule are
        attributed to KeywordSource.AI.
        """
        if not self.__keyword_source:
            source = KeywordSource.AI
            if self.is_linked:
                if self.entity_type in EMS_ENTITY_TYPES:
                    source = KeywordSource.EMS
                elif self.entity_type in SIERRA_ENTITY_TYPES:
                    # A non-empty `elastic` entry takes precedence
                    # over VIAF.
                    if self.linked_doc and self.linked_doc.elastic:
                        source = KeywordSource.SIERRA
                    elif self.linked_doc and self.linked_doc.viaf:
                        source = KeywordSource.VIAF
            self.__keyword_source = source
        return self.__keyword_source

    def _load_indicators(self) -> None:
        """ Computes both MARC indicators and caches them."""
        ind1, ind2 = self._get_indicators()
        self.__indicator_1 = ind1
        self.__indicator_2 = ind2

    @property
    def indicator1(self) -> str:
        """ MARC indicator 1 (see `_get_indicators`)."""
        if not self.__indicator_1:
            self._load_indicators()
        return self.__indicator_1

    @property
    def indicator2(self) -> str:
        """ MARC indicator 2 (see `_get_indicators`)."""
        if not self.__indicator_2:
            self._load_indicators()
        return self.__indicator_2

    def _load_url_info(self) -> None:
        """ Computes the URL and its source and caches them."""
        url_info = self._get_url_info()
        self.__url = url_info.get("url")
        self.__url_source = url_info.get("url_source")

    @property
    def url(self) -> str:
        """ URL identifier of the keyword ("" if there is none)."""
        if self.__url is None:
            self._load_url_info()
        return self.__url

    @property
    def url_source(self) -> str:
        """ Source of `url` ("" if there is no URL)."""
        if self.__url_source is None:
            self._load_url_info()
        return self.__url_source

    @property
    def marc_field(self) -> int | str:
        """ MARC field of the keyword.

        Linked keywords are looked up from KEYWORD_MARC_MAP ("" for
        unknown keyword types); unlinked keywords get
        UNLINKED_KEYWORD_MARC_FIELD. Titles use KeywordMARC.TITLE_LINKED
        when an author is known and KeywordMARC.TITLE otherwise.
        """
        if not self.__marc_field:
            # TODO: logic for works + persons!
            if self.is_linked:
                marc_field = KEYWORD_MARC_MAP.get(str(self.keyword_type), "")
            else:
                marc_field = UNLINKED_KEYWORD_MARC_FIELD

            # NOTE(review): this override also applies to unlinked
            # titles - confirm that is intended.
            if self.entity_type == EntityType.TITLE:
                if self.author:
                    marc_field = KeywordMARC.TITLE_LINKED
                else:
                    marc_field = KeywordMARC.TITLE
            self.__marc_field = marc_field
        return self.__marc_field

    @property
    def persons_title(self) -> str:
        """ Alias of `titles`."""
        return self.titles

    @property
    def language(self) -> str:
        """ Language of the keyword.

        Linked keywords report `main_taxnomy_lang`; unlinked keywords
        report the `language` of the raw dictionary ("" if missing).
        """
        if not self.__language:
            if self.is_linked:
                self.__language = self.main_taxnomy_lang
            else:
                self.__language = self.object_dict.get("language", "")
        return self.__language

    @property
    def author(self) -> str:
        """ Author name of a title keyword; "" for everything else."""
        # Only relevant for titles!
        if self.__author is None:
            self.__author = ""
            if self.entity_type == EntityType.TITLE and self.original_record:
                self.__author = self.original_record.author_name
            # NOTE: no author is derived from VIAF info yet.
        return self.__author

    def _get_url_info(self) -> dict:
        """ Finds URL identifier from the linked document based on
        the entity type of the keyword.

        Returns
        ----------
        dict:
            Dictionary with keys `url` - URL identifier and
            `url_source` - source of the URL (e.g. "EMS"). Both are
            "" if no URL was found.

        """
        url_source = self.url_source_map.get(self.entity_type, "")
        url = ""

        if self.linked_doc:
            if url_source == URLSource.EMS:
                # `elastic` may be empty for a linked document.
                url = (self.linked_doc.elastic or {}).get("ems_url", "")
            elif url_source == URLSource.VIAF:
                url = self.viaf_info.get("viaf_url", "")
        if not url:
            # Do not report a source for a URL that does not exist.
            url_source = ""

        url_info = {"url": url, "url_source": url_source}

        LOGGER.debug(
            f"Detected URL info: {url_info}. Used entity_type = {self.entity_type}. " \
            f"URL source map = {self.url_source_map}."
        )
        return url_info

    def _get_indicators(self) -> Tuple[str, str]:
        """ Find MARC indicators 1 and 2.

        Returns
        ----------
        Tuple[str, str]:
            Indicator 1 and indicator 2; " " denotes an undefined
            indicator.
        """
        ind1 = " "
        ind2 = " "
        if self.entity_type in SIERRA_ENTITY_TYPES:
            if self.entity_type == EntityType.PER:
                # A comma is taken as a sign of an inverted name.
                ind1 = "1" if "," in self.keyword else "0"
            elif self.entity_type == EntityType.ORG:
                # 1 - first element of the heading is a jurisdiction
                #     name, e.g. (a) Eesti (b) Riigikogu - hard to
                #     determine automatically
                # 2 - first element of the heading is a name in
                #     direct order
                ind1 = "2"
            else:
                ind1 = "0"

        # Unlinked keywords and EMS entities share indicator 2.
        if not self.is_linked or self.entity_type in EMS_ENTITY_TYPES:
            ind2 = "4"
        return (ind1, ind2)

    def to_dict(self) -> dict:
        """ Returns the keyword as a flat dictionary."""
        keyword_dict = {
            "count": self.count,
            "dates": self.dates,
            "entity_type": self.keyword_type,
            "indicator1": self.indicator1,
            "indicator2": self.indicator2,
            "is_linked": self.is_linked,
            "keyword": self.keyword,
            "keyword_source": self.keyword_source,
            "lang": self.language,
            "location": self.location,
            "marc_field": self.marc_field,
            "method": self.method,
            "model_arch": self.model_arch,
            "numeration": self.numeration,
            "organisation_sub_unit": self.organisation_sub_unit,
            "original_keyword": self.original_keyword,
            "persons_title": self.persons_title,
            "score": self.score,
            "url": self.url,
            "url_source": self.url_source,
            "author": self.author
        }
        return keyword_dict
@@ -0,0 +1,154 @@
1
+ from rara_tools.constants.linker import (
2
+ LOGGER, EntityType
3
+ )
4
+ from rara_tools.core_formatters.formatted_object import FormattedObject
5
+ from typing import List, Dict, NoReturn, Tuple, Any
6
+
7
+
8
class FormattedTitle(FormattedObject):
    """ Title read from the `name` key of the given dictionary."""
    # TODO: Is this class needed at all?
    def __init__(self, object_dict: dict, linked_doc: Any):
        """
        Parameters
        -----------
        object_dict: dict
            Raw title dictionary; the key `name` is read.
        linked_doc: Any
            Linked document, or None for an unlinked title.
        """
        super().__init__(
            object_dict=object_dict,
            linked_doc=linked_doc,
            original_entity_key="name"
        )
        # The inherited properties read `entity_type`; without it they
        # fail with AttributeError.
        self.entity_type: str = EntityType.TITLE
16
+
17
+
18
class FormattedAuthor(FormattedObject):
    """ Author in the flat dictionary form returned by `to_dict`.

    The author name is read from the `name` key of the raw dictionary;
    `role` and `is_primary` are read as well. Several output fields are
    placeholders that currently always yield "".
    """

    def __init__(self, object_dict: dict, linked_doc: Any, entity_type: str):
        super().__init__(
            object_dict=object_dict,
            linked_doc=linked_doc,
            original_entity_key="name"
        )
        self.entity_type: str = entity_type

        # NB! The next two values need a new database column.
        self.is_linked: bool = True if self.linked_doc else False
        self.original_name: str = self.original_entity
        self.author_role: str = self.object_dict.get("role")
        self.is_primary: bool = self.object_dict.get("is_primary")

        # None marks "not computed yet".
        self.__primary_author_type: str = None

        # Lazily filled caches / placeholders.
        self.__name_order_type: str = ""
        self.__event_sub_unit: str = ""
        self.__order_number: str = ""
        self.__sub_title: str = ""
        self.__additional_info: str = ""
        self.__publication_type: str = ""
        self.__publication_language: str = ""
        self.__viaf_id: str = ""

    @property
    def primary_author_type(self) -> str:
        """ Entity type of a primary author; "" for other authors.

        A primary author of unknown type is treated as a person.
        """
        if self.__primary_author_type is None:
            if not self.is_primary:
                resolved_type = ""
            elif self.entity_type != EntityType.UNK:
                resolved_type = self.entity_type
            else:
                resolved_type = EntityType.PER
            self.__primary_author_type = resolved_type
        return self.__primary_author_type

    @property
    def name(self) -> str:
        """ The (possibly linked) author name; alias of `entity`."""
        return self.entity

    @property
    def name_order(self) -> str:
        """ Name order type of the author ("0", "1" or "2")."""
        if not self.__name_order_type:
            if self.entity_type in (EntityType.PER, EntityType.UNK):
                # A comma is taken as a sign of an inverted name.
                order_type = "1" if "," in self.name else "0"
            elif self.entity_type == EntityType.ORG:
                # 1 - first element of the heading is a jurisdiction
                #     name, e.g. (a) Eesti (b) Riigikogu - hard to
                #     determine automatically
                # 2 - first element of the heading is a name in
                #     direct order
                # NOTE(review): the original author was unsure
                # whether "2" is always correct here.
                order_type = "2"
            else:
                order_type = "0"
            self.__name_order_type = order_type
        return self.__name_order_type

    @property
    def event_sub_unit(self) -> str:
        """ Placeholder; currently always ""."""
        self.__event_sub_unit = self.__event_sub_unit or ""
        return self.__event_sub_unit

    @property
    def order_number(self) -> str:
        """ Placeholder; currently always ""."""
        self.__order_number = self.__order_number or ""
        return self.__order_number

    @property
    def sub_title(self) -> str:
        """ Placeholder; currently always ""."""
        self.__sub_title = self.__sub_title or ""
        return self.__sub_title

    @property
    def additional_info(self) -> str:
        """ Placeholder; currently always ""."""
        self.__additional_info = self.__additional_info or ""
        return self.__additional_info

    @property
    def publication_type(self) -> str:
        """ Placeholder; currently always ""."""
        self.__publication_type = self.__publication_type or ""
        return self.__publication_type

    @property
    def publication_language(self) -> str:
        """ Placeholder; currently always ""."""
        self.__publication_language = self.__publication_language or ""
        return self.__publication_language

    @property
    def standardized_uri(self) -> str:
        """ Alias of `identifier`."""
        return self.identifier

    @property
    def viaf_id(self):
        """ The `viaf_url` of the VIAF info ("" if there is none)."""
        if not self.__viaf_id:
            self.__viaf_id = (
                self.viaf_info.get("viaf_url", "") if self.viaf_info else ""
            )
        return self.__viaf_id

    def to_dict(self):
        """ Returns the author as a flat dictionary."""
        return {
            "is_linked": self.is_linked,
            "original_name": self.original_name,
            "author_role": self.author_role,
            "is_primary": self.is_primary,
            "primary_author_type": self.primary_author_type,
            "name": self.name,
            "numeration": self.numeration,
            "organisation_sub_unit": self.organisation_sub_unit,
            "titles": self.titles,
            "location": self.location,
            "dates": self.dates,
            "name_order_type": self.name_order,
            "event_sub_unit": self.event_sub_unit,
            "order_number": self.order_number,
            "sub_title": self.sub_title,
            "additional_info": self.additional_info,
            "publication_type": self.publication_type,
            "publication_language": self.publication_language,
            "standardized_uri": self.standardized_uri,
            "viaf_id": self.viaf_id
        }
@@ -0,0 +1,137 @@
1
+ from rara_tools.constants.linker import (
2
+ LOGGER, URLSource, KeywordSource, EntityType, KeywordType,
3
+ KEYWORD_MARC_MAP, KEYWORD_TYPES_TO_IGNORE, KEYWORD_TYPE_MAP,
4
+ EMS_ENTITY_TYPES, SIERRA_ENTITY_TYPES, UNLINKED_KEYWORD_MARC_FIELD,
5
+ URL_SOURCE_MAP
6
+ )
7
+
8
+ from rara_tools.parsers.marc_records.person_record import PersonRecord
9
+ from rara_tools.parsers.marc_records.organization_record import OrganizationRecord
10
+ from rara_tools.parsers.marc_records.title_record import TitleRecord
11
+ from rara_tools.utils import format_date
12
+ from typing import List, Dict, NoReturn, Tuple, Any
13
+
14
+
15
class FormattedObject:
    """ Base class of the formatted keyword / author / title objects.

    Holds the raw dictionary and the optional linked document and
    exposes lazily computed, cached properties derived from the linked
    MARC record and the VIAF info.
    """

    # Read by most properties; subclasses assign an EntityType value.
    # The empty default keeps the properties usable for subclasses
    # that do not set it.
    entity_type: str = ""

    def __init__(self, object_dict: dict, linked_doc: Any, original_entity_key: str):
        """
        Parameters
        -----------
        object_dict: dict
            Raw dictionary of the object.
        linked_doc: Any
            Linked document, or None if the object is unlinked.
        original_entity_key: str
            Key of `object_dict` holding the original entity string.
        """
        self.object_dict: dict = object_dict
        self.linked_doc: Any = linked_doc
        # `viaf` may be empty for a linked document.
        self.viaf_info: dict = (
            (self.linked_doc.viaf or {}).get("parsed", {})
            if self.linked_doc else {}
        )
        self.original_entity: str = self.object_dict.get(original_entity_key)
        self.is_linked: bool = True if self.linked_doc else False

        # Lazily computed caches; None means "not computed yet".
        self.__original_record: PersonRecord | OrganizationRecord | TitleRecord | None = None
        self.__persons_title: str | None = None
        self.__dates: str | None = None
        self.__numeration: str | None = None
        self.__location: str | None = None
        self.__organization_sub_unit: str | None = None
        self.__entity: str | None = None
        self.__titles: str | None = None
        # Must start as None: an initial "" would stop `identifier`
        # from ever reading the linked record.
        self.__identifier: str | None = None

    @property
    def original_record(self) -> "PersonRecord | OrganizationRecord | TitleRecord | None":
        """ MARC record parsed from the JSON of the linked document.

        None if there is no linked document, no JSON, no record class
        for the entity type, or if parsing failed (the failure is
        logged).
        """
        if not self.__original_record and self.linked_doc and self.linked_doc.json:
            try:
                if self.entity_type == EntityType.PER:
                    original_record = PersonRecord(self.linked_doc.json)
                elif self.entity_type == EntityType.ORG:
                    original_record = OrganizationRecord(self.linked_doc.json)
                elif self.entity_type == EntityType.TITLE:
                    original_record = TitleRecord(self.linked_doc.json)
                else:
                    original_record = None
            except Exception as e:
                # Best effort: a broken record must not break formatting.
                LOGGER.exception(
                    f"Could not retrieve JSON from LinkedDoc instance. Exception: '{e}'."
                )
                original_record = None
            self.__original_record = original_record
        return self.__original_record

    @property
    def entity(self) -> str:
        """ The linked entity, or the original entity if unlinked.

        For linked organizations subfield `a` of the record's original
        name is used; an empty result falls back to the VIAF name.
        """
        if self.__entity is None:
            if self.linked_doc is not None:
                if self.entity_type == EntityType.ORG and self.original_record:
                    self.__entity = self.original_record.original_name.get("a", "")
                else:
                    self.__entity = self.linked_doc.linked_entity
                if not self.__entity and self.viaf_info:
                    self.__entity = self.viaf_info.get("name", self.original_entity)
            else:
                self.__entity = self.original_entity
        return self.__entity

    @property
    def dates(self) -> str:
        """ Dates of the entity ("" if unknown).

        The VIAF birth / death dates are preferred and rendered as
        "<birth>-<death>"; otherwise the dates of the linked record
        are used.
        """
        if self.__dates is None:
            self.__dates = ""
            if self.viaf_info:
                birth_date = format_date(self.viaf_info.get("birth_date", ""))
                death_date = format_date(self.viaf_info.get("death_date", ""))
                if not death_date:
                    death_date = ""

                if birth_date:
                    self.__dates = f"{birth_date}-{death_date}"

            if self.original_record and not self.__dates:
                if self.entity_type == EntityType.PER:
                    self.__dates = self.original_record.life_years
                elif self.entity_type == EntityType.ORG:
                    self.__dates = self.original_record.dates
                elif self.entity_type == EntityType.TITLE:
                    self.__dates = self.original_record.author_life_years

        return self.__dates

    @property
    def numeration(self) -> str:
        """ Numeration of a linked person or organization, else ""."""
        if self.__numeration is None:
            self.__numeration = ""
            if self.original_record:
                if self.entity_type == EntityType.PER:
                    self.__numeration = self.original_record.original_name.get("b", "")
                elif self.entity_type == EntityType.ORG:
                    self.__numeration = self.original_record.numeration
        return self.__numeration

    @property
    def location(self) -> str:
        """ Location of a linked organization, else ""."""
        if self.__location is None:
            self.__location = ""
            if self.entity_type == EntityType.ORG and self.original_record:
                self.__location = self.original_record.location
        return self.__location

    @property
    def organisation_sub_unit(self) -> str:
        """ Subfield `b` of a linked organization's name, else ""."""
        if self.__organization_sub_unit is None:
            self.__organization_sub_unit = ""
            if self.entity_type == EntityType.ORG and self.original_record:
                self.__organization_sub_unit = self.original_record.original_name.get("b", "")
        return self.__organization_sub_unit

    @property
    def titles(self) -> str:
        """ Name specification of a linked person, else ""."""
        if self.__titles is None:
            if self.entity_type == EntityType.PER and self.original_record:
                self.__titles = self.original_record.name_specification
            else:
                self.__titles = ""
        return self.__titles

    @property
    def identifier(self) -> str:
        """ Identifier of the linked record ("" if there is none)."""
        if self.__identifier is None:
            self.__identifier = ""
            if self.original_record:
                self.__identifier = self.original_record.identifier
        return self.__identifier
rara_tools/utils.py CHANGED
@@ -2,20 +2,20 @@ from iso639 import Lang
2
2
 
3
3
 
4
4
  def lang_to_iso639_1(lang: str, unk_code: str = "unk") -> str:
5
- """ Converts language into ISO-639-1 standard.
6
- Input can be any language code in a valid ISO-639
7
- standard or even a full name of the language,
5
+ """ Converts language into ISO-639-1 standard.
6
+ Input can be any language code in a valid ISO-639
7
+ standard or even a full name of the language,
8
8
  e.g. "Estonian".
9
-
9
+
10
10
  Parameters
11
11
  -----------
12
12
  lang: str
13
13
  Language code in any valid ISO-639 standard.
14
-
14
+
15
15
  unk_code: str
16
16
  Code to return incase of invalid/unsupported
17
17
  input language.
18
-
18
+
19
19
  Returns
20
20
  -------
21
21
  Language code in ISO-639-1 standard.
@@ -29,20 +29,20 @@ def lang_to_iso639_1(lang: str, unk_code: str = "unk") -> str:
29
29
 
30
30
 
31
31
  def lang_to_iso639_2(lang: str, unk_code: str = "unk") -> str:
32
- """ Converts language into ISO-639-2 standard.
33
- Input can be any language code in a valid ISO-639
34
- standard or even a full name of the language,
32
+ """ Converts language into ISO-639-2 standard.
33
+ Input can be any language code in a valid ISO-639
34
+ standard or even a full name of the language,
35
35
  e.g. "Estonian".
36
-
36
+
37
37
  Parameters
38
38
  -----------
39
39
  lang: str
40
40
  Language code in any valid ISO-639 standard.
41
-
41
+
42
42
  unk_code: str
43
43
  Code to return incase of invalid/unsupported
44
44
  input language.
45
-
45
+
46
46
  Returns
47
47
  -------
48
48
  Language code in ISO-639-2 standard.
@@ -59,20 +59,20 @@ def lang_to_iso639_2(lang: str, unk_code: str = "unk") -> str:
59
59
 
60
60
 
61
61
  def lang_to_iso639_3(lang: str, unk_code: str = "unk") -> str:
62
- """ Converts language into ISO-639-3 standard.
63
- Input can be any language code in a valid ISO-639
64
- standard or even a full name of the language,
62
+ """ Converts language into ISO-639-3 standard.
63
+ Input can be any language code in a valid ISO-639
64
+ standard or even a full name of the language,
65
65
  e.g. "Estonian".
66
-
66
+
67
67
  Parameters
68
68
  -----------
69
69
  lang: str
70
70
  Language code in any valid ISO-639 standard.
71
71
  unk_code: str
72
-
72
+
73
73
  Code to return incase of invalid/unsupported
74
74
  input language.
75
-
75
+
76
76
  Returns
77
77
  -------
78
78
  str
@@ -88,17 +88,39 @@ def lang_to_iso639_3(lang: str, unk_code: str = "unk") -> str:
88
88
 
89
89
  def ratio_to_percentage(ratio: float) -> str:
90
90
  """ Converts ratio to corresponding percentage.
91
-
91
+
92
92
  Parameters
93
93
  -----------
94
94
  ratio: float
95
95
  Float in range [0,1]
96
-
96
+
97
97
  Returns
98
98
  --------
99
99
  str
100
100
  Percentage corresponding to the float.
101
-
101
+
102
102
  """
103
103
  percentage = f"{int(ratio*100)}%"
104
104
  return percentage
105
+
106
def format_date(original_date: str) -> str:
    """ Converts date from format %Y-%m-%d into format %d.%m.%Y, e.g:
    2025-02-12 -> 12.02.2025.

    Parameters
    -----------
    original_date: str
        Original date in format %Y-%m-%d

    Returns
    ----------
    str:
        Date in format %d.%m.%Y. If `original_date` is not a string
        in format %Y-%m-%d, it is returned unchanged.
    """
    # Imported locally because the module does not import datetime;
    # the missing name used to be hidden by a bare `except`, so no
    # date was ever converted.
    from datetime import datetime

    try:
        date_obj = datetime.strptime(original_date, "%Y-%m-%d")
        new_date = date_obj.strftime("%d.%m.%Y")
    except (ValueError, TypeError):
        # ValueError: the string is in another format;
        # TypeError: the input is not a string (e.g. None).
        new_date = original_date
    return new_date
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: rara-tools
3
- Version: 0.6.17
3
+ Version: 0.7.0
4
4
  Summary: Tools to support Kata's work.
5
5
  Classifier: Programming Language :: Python :: 3
6
6
  Classifier: Programming Language :: Python :: 3.10
@@ -3,19 +3,22 @@ rara_tools/decorators.py,sha256=MjOyvZ5nTkwxwx2JLFEGpKKBysvecFw6EN6UDrSvZLU,2187
3
3
  rara_tools/digar_schema_converter.py,sha256=k95U2iRlEA3sh772-v6snhHW6fju6qSTMnvWJ6DpzZk,14254
4
4
  rara_tools/elastic.py,sha256=4D9yoyMy6AJIKwhSi2H1usffDHAh2A_IZfv5BtYnBKg,13992
5
5
  rara_tools/exceptions.py,sha256=YQyaueUbXeTkJYFDEuN6iWTXMI3eCv5l7PxGp87vg5I,550
6
- rara_tools/formatters.py,sha256=LTliadjIPZTO4s-44NsumaUdlQlEvqetvWz4bEvwf90,3418
7
6
  rara_tools/s3.py,sha256=9ziDXsLjBtFAvsjTPxFddhfvkpA8773rzPJqO7y1N5Q,6415
8
7
  rara_tools/task_reporter.py,sha256=WCcZts9dAUokPc4vbrG3-lNAFLnWaMgE3b3iaUB7mr8,3256
9
- rara_tools/utils.py,sha256=9vSbmuWYU5ydr4lXBKlUKa0xzDccFsaJv4T-XwgUfuY,2578
8
+ rara_tools/utils.py,sha256=1UrxOzo3cxe4juMkDlKWv1VKWMYay5v1pivGci1ajiM,3003
10
9
  rara_tools/constants/__init__.py,sha256=r78laM9vyRDAvzDhPvzDlhaX6qPwUUBBtwf1WosrW3o,27
11
10
  rara_tools/constants/digitizer.py,sha256=A7FfqqEB4hGJ9t3z8gTFK7hkzCxz44rCOSWx6Pzvwjs,548
12
11
  rara_tools/constants/general.py,sha256=jE1aIir_eKbka_q1iCJWRtmyz_xpnTPntbshiWo9eTA,1061
13
12
  rara_tools/constants/language_evaluator.py,sha256=3sCSaoS-zXQRY0vJ7UUMuZqbtYQD_quVVbdpgvJjE7I,124
14
- rara_tools/constants/linker.py,sha256=TQaigi7AUNOqmQPPz3hM8_xXgofrhoQ1taln79LhXQg,550
13
+ rara_tools/constants/linker.py,sha256=yBN9NpUhB3ENz8BapoIfpSHY_xNqwYdqutgQFdc_Cd8,3240
15
14
  rara_tools/constants/meta_extractor.py,sha256=adYH8cQqH0ZWYO7clGMiObclXRTGsxWgk3pC1oiHxHE,242
16
15
  rara_tools/constants/normalizers.py,sha256=Xs3anDwJHpHeniwx3xoIZyqdEXtO3eb7ouGLLr0CpHw,1344
17
16
  rara_tools/constants/parsers.py,sha256=L6nh1Itget9_9DMsliDkh6T25z78eMFPWVkbaU08DwU,5561
18
17
  rara_tools/constants/subject_indexer.py,sha256=E2D7pylH6Yey9h2TAvAWQiX5JtKKagsZx2E1Fz_afMI,1967
18
+ rara_tools/core_formatters/core_formatter.py,sha256=HJX7jOi9kaFie_zm0Wzjk0nKF8dRleJpVWbCplFFquo,2760
19
+ rara_tools/core_formatters/formatted_keyword.py,sha256=1-B9IQTycFt69pTy8WZNnfJ2WIMRow3kpEub6igyNQc,7865
20
+ rara_tools/core_formatters/formatted_meta.py,sha256=Zd0oQFLbn6m_wHaWtgxBsu9J7wGyWIpZxb2-8PrR3Wk,5240
21
+ rara_tools/core_formatters/formatted_object.py,sha256=7a499ZmcZXOqtlwxDi6FWHWF5a6HdCsduS22wV3uHIE,5656
19
22
  rara_tools/normalizers/__init__.py,sha256=_NqpS5w710DhaURytHq9JpEt8HgYpSPfRDcOtOymJgE,193
20
23
  rara_tools/normalizers/authorities.py,sha256=IDtcm0yNZNhv1f-WcdqWFSRzZk_CoKuBFsk6hEPddWM,4513
21
24
  rara_tools/normalizers/base.py,sha256=6tLfNdF6FZo8M6j_Q61lXoaF1HdIB1c0SKMatTc-Z64,12014
@@ -35,8 +38,8 @@ rara_tools/parsers/marc_records/title_record.py,sha256=XrtJ4gj7wzSaGxNaPtPuawmqq
35
38
  rara_tools/parsers/tools/entity_normalizers.py,sha256=VyCy_NowCLpOsL0luQ55IW-Qi-J5oBH0Ofzr7HRFBhM,8949
36
39
  rara_tools/parsers/tools/marc_converter.py,sha256=LgSHe-7n7aiDrw2bnsB53r3fXTRFjZXTwBYfTpL0pfs,415
37
40
  rara_tools/parsers/tools/russian_transliterator.py,sha256=5ZU66iTqAhr7pmfVqXPAI_cidF43VqqmuN4d7H4_JuA,9770
38
- rara_tools-0.6.17.dist-info/licenses/LICENSE.md,sha256=hkZVnIZll7e_KNEQzeY94Y9tlzVL8iVZBTMBvDykksU,35142
39
- rara_tools-0.6.17.dist-info/METADATA,sha256=_E9Ml7OSHn29YTsh5V4DytlzwRTSf-xeeQRZJNDzbos,4080
40
- rara_tools-0.6.17.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
41
- rara_tools-0.6.17.dist-info/top_level.txt,sha256=JwfB5b8BAtW5OFKRln2AQ_WElTRyIBM4nO0FKN1cupY,11
42
- rara_tools-0.6.17.dist-info/RECORD,,
41
+ rara_tools-0.7.0.dist-info/licenses/LICENSE.md,sha256=hkZVnIZll7e_KNEQzeY94Y9tlzVL8iVZBTMBvDykksU,35142
42
+ rara_tools-0.7.0.dist-info/METADATA,sha256=UmA4_431SdsYqP6IrEv8mr4yL9OoFVGHvpG-mQgs_g8,4079
43
+ rara_tools-0.7.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
44
+ rara_tools-0.7.0.dist-info/top_level.txt,sha256=JwfB5b8BAtW5OFKRln2AQ_WElTRyIBM4nO0FKN1cupY,11
45
+ rara_tools-0.7.0.dist-info/RECORD,,
rara_tools/formatters.py DELETED
@@ -1,106 +0,0 @@
1
- from typing import List, Tuple, Any
2
- from rara_tools.constants.subject_indexer import (
3
- EntityType, KeywordType, KeywordMARC, KeywordSource, URLSource,
4
- KEYWORD_TYPE_MAP, KEYWORD_MARC_MAP, KEYWORD_TYPES_TO_IGNORE,
5
- EMS_ENTITY_TYPES, SIERRA_ENTITY_TYPES, VIAF_ENTITY_TYPES
6
- )
7
-
8
- def _get_keyword_source(linked_doc: Any, entity_type: str, is_linked: bool
9
- ) -> str:
10
- """ Find keyword source.
11
- """
12
- if not is_linked:
13
- source = KeywordSource.AI
14
- elif entity_type in EMS_ENTITY_TYPES:
15
- source = KeywordSource.EMS
16
- elif entity_type in SIERRA_ENTITY_TYPES:
17
- if linked_doc and linked_doc.elastic:
18
- source = KeywordSource.SIERRA
19
- elif linked_doc and linked_doc.viaf:
20
- source = KeywordSource.VIAF
21
- else:
22
- source = KeywordSource.AI
23
- else:
24
- source = KeywordSource.AI
25
- return source
26
-
27
- def _find_indicators(entity_type: str, entity: str,
28
- is_linked: bool
29
- ) -> Tuple[str, str]:
30
- """ Find MARC indicators 1 and 2.
31
- """
32
- ind1 = " "
33
- ind2 = " "
34
- if entity_type in SIERRA_ENTITY_TYPES:
35
- if entity_type == EntityType.PER:
36
- if "," in entity:
37
- ind1 = "1"
38
- else:
39
- ind1 = "0"
40
- else:
41
- # 1 märksõna esimeseks elemendiks võimupiirkonna nimi, nt:
42
- # (a) Eesti (b) Riigikogu - raske automaatselt määrata
43
- # 2 märksõna esimeseks elemendiks nimi pärijärjestuses
44
- ind1 = "2"
45
- if not is_linked:
46
- ind2 = "4"
47
- elif entity_type in EMS_ENTITY_TYPES:
48
- ind2 = "4"
49
- return (ind1, ind2)
50
-
51
-
52
- def format_keywords(flat_keywords: List[dict]) -> dict:
53
- """ Formats unlinked keywords for Kata CORE.
54
- """
55
- ignored_keywords = []
56
- filtered_keywords = []
57
-
58
- for keyword_dict in flat_keywords:
59
- keyword_type = keyword_dict.get("entity_type")
60
- if keyword_type in KEYWORD_TYPES_TO_IGNORE:
61
- ignored_keywords.append(keyword_dict)
62
- else:
63
- filtered_keywords.append(keyword_dict)
64
-
65
- formatted_keywords = {
66
- "keywords": [],
67
- "other": ignored_keywords
68
- }
69
-
70
- for keyword_dict in filtered_keywords:
71
- original_keyword = keyword_dict.get("keyword")
72
- keyword_type = keyword_dict.get("entity_type")
73
- entity_type = KEYWORD_TYPE_MAP.get(keyword_type, "")
74
- marc_field = KEYWORD_MARC_MAP.get(str(keyword_type), "")
75
- lang = keyword_dict.get("language", "")
76
-
77
- ind1, ind2 = _find_indicators(
78
- entity_type=entity_type,
79
- entity=original_keyword,
80
- is_linked=False
81
- )
82
- keyword_source = _get_keyword_source(
83
- linked_doc=None,
84
- is_linked=False,
85
- entity_type=entity_type
86
- )
87
- new_keyword_dict = {
88
- "dates": "",
89
- "indicator1": ind1,
90
- "indicator2": ind2,
91
- "is_linked": False,
92
- "keyword_source": keyword_source,
93
- "lang": lang,
94
- "location": "",
95
- "marc_field": marc_field,
96
- "numeration": "",
97
- "organisation_sub_unit": "",
98
- "original_keyword": original_keyword,
99
- "persons_title": "",
100
- "url": "",
101
- "url_source": ""
102
- }
103
- new_keyword_dict.update(keyword_dict)
104
- formatted_keywords["keywords"].append(new_keyword_dict)
105
-
106
- return formatted_keywords