rara-tools 0.6.16__py3-none-any.whl → 0.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of rara-tools might be problematic. Click here for more details.
- rara_tools/constants/linker.py +114 -0
- rara_tools/core_formatters/core_formatter.py +86 -0
- rara_tools/core_formatters/formatted_keyword.py +229 -0
- rara_tools/core_formatters/formatted_meta.py +154 -0
- rara_tools/core_formatters/formatted_object.py +137 -0
- rara_tools/normalizers/viaf.py +1 -0
- rara_tools/utils.py +43 -21
- {rara_tools-0.6.16.dist-info → rara_tools-0.7.0.dist-info}/METADATA +1 -1
- {rara_tools-0.6.16.dist-info → rara_tools-0.7.0.dist-info}/RECORD +12 -9
- rara_tools/formatters.py +0 -106
- {rara_tools-0.6.16.dist-info → rara_tools-0.7.0.dist-info}/WHEEL +0 -0
- {rara_tools-0.6.16.dist-info → rara_tools-0.7.0.dist-info}/licenses/LICENSE.md +0 -0
- {rara_tools-0.6.16.dist-info → rara_tools-0.7.0.dist-info}/top_level.txt +0 -0
rara_tools/constants/linker.py
CHANGED
|
@@ -1,3 +1,6 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from rara_tools.constants.normalizers import EntityType, VIAF_ENTITY_MAP
|
|
3
|
+
|
|
1
4
|
COMPONENT_KEY = "linker"
|
|
2
5
|
|
|
3
6
|
|
|
@@ -20,3 +23,114 @@ class Queue:
|
|
|
20
23
|
class StatusKeys:
    """ String constants naming the linker's status keys.
    """
    VECTORIZE_CONTEXT = "vectorize_context"
    LINK_KEYWORDS = "link_keywords"
|
|
26
|
+
|
|
27
|
+
class URLSource:
    """ Labels naming the source a URL identifier comes from.
    """
    VIAF = "VIAF"
    SIERRA = "Sierra"
    EMS = "EMS"
|
|
31
|
+
|
|
32
|
+
class KeywordType:
    """ Keyword type labels as they appear in the `entity_type` field of
    keyword dicts. The values are Estonian display names and are compared
    verbatim, so they must not be translated or normalized.
    """
    LOC = "Kohamärksõnad"                     # place keywords
    TIME = "Ajamärksõnad"                     # time keywords
    TOPIC = "Teemamärksõnad"                  # topic keywords
    GENRE = "Vormimärksõnad"                  # form / genre keywords
    TITLE = "Teose pealkiri"                  # title of a work
    PER = "Isikunimi"                         # personal name
    ORG = "Kollektiivi nimi"                  # corporate name
    EVENT = "Ajutine kollektiiv või sündmus"  # temporary collective or event
    CATEGORY = "Valdkonnamärksõnad"           # subject-area keywords
    UDC = "UDC Summary"
    UDK = "UDK Rahvusbibliograafia"           # UDC, national bibliography
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
class KeywordMARC:
    """ MARC field tags (6XX subject access fields) used for keywords.
    """
    PER = 600    # personal name
    ORG = 610    # corporate name
    TOPIC = 650  # topical term
    GENRE = 655  # genre / form
    TIME = 648   # chronological term
    LOC = 651    # geographic name
    EVENT = 611  # meeting / event name
    TITLE = 630  # title without a known author
    # Same tag as PER: a title with a known author is emitted under the
    # author's personal-name field.
    TITLE_LINKED = 600
|
|
56
|
+
|
|
57
|
+
class KeywordSource:
    """ Labels naming where a keyword's final form comes from.
    `AI` is used for keywords that were not linked to any source.
    """
    EMS = "EMS"
    SIERRA = "SIERRA"
    VIAF = "VIAF"
    AI = "AI"
|
|
62
|
+
|
|
63
|
+
class Filters:
    """ Names of the filters that can be applied while linking.
    """
    AUTHOR = "author"
    YEAR = "year"
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
# MARC field tag used for keywords that could not be linked.
UNLINKED_KEYWORD_MARC_FIELD = 693

# Filters permitted for each entity type.
ALLOWED_FILTERS_MAP = {
    EntityType.PER: [Filters.YEAR],
    EntityType.ORG: [Filters.YEAR],
    EntityType.TITLE: [Filters.YEAR, Filters.AUTHOR],
    EntityType.KEYWORD: [],
    EntityType.LOC: []
}

# Keyword type label -> MARC field tag for linked keywords.
KEYWORD_MARC_MAP = {
    KeywordType.LOC: KeywordMARC.LOC,
    KeywordType.TIME: KeywordMARC.TIME,
    KeywordType.TOPIC: KeywordMARC.TOPIC,
    KeywordType.GENRE: KeywordMARC.GENRE,
    KeywordType.TITLE: KeywordMARC.TITLE,
    KeywordType.ORG: KeywordMARC.ORG,
    KeywordType.PER: KeywordMARC.PER,
    KeywordType.EVENT: KeywordMARC.EVENT
}

# Entity type -> source the URL identifier is taken from.
URL_SOURCE_MAP = {
    EntityType.PER: URLSource.VIAF,
    EntityType.ORG: URLSource.VIAF,
    EntityType.TITLE: URLSource.VIAF,
    EntityType.KEYWORD: URLSource.EMS,
    EntityType.LOC: URLSource.EMS
}

# Ignore those "keyword types" while linking the
# rara-subject-indexer results
KEYWORD_TYPES_TO_IGNORE = [
    KeywordType.CATEGORY,
    KeywordType.UDC,
    KeywordType.UDK
]

ALLOWED_ENTITY_TYPES = [
    EntityType.PER,
    EntityType.ORG,
    EntityType.KEYWORD,
    EntityType.LOC,
    EntityType.TITLE,
    EntityType.UNK,
]


# Keyword type label -> entity type used for linking.
# Events are handled as organizations.
KEYWORD_TYPE_MAP = {
    KeywordType.TIME: EntityType.KEYWORD,
    KeywordType.GENRE: EntityType.KEYWORD,
    KeywordType.LOC: EntityType.LOC,
    KeywordType.PER: EntityType.PER,
    KeywordType.ORG: EntityType.ORG,
    KeywordType.TOPIC: EntityType.KEYWORD,
    KeywordType.TITLE: EntityType.TITLE,
    KeywordType.EVENT: EntityType.ORG
}

# Entity types grouped by the source they can be linked against.
EMS_ENTITY_TYPES = [EntityType.KEYWORD, EntityType.LOC]
SIERRA_ENTITY_TYPES = [EntityType.PER, EntityType.ORG, EntityType.TITLE]
VIAF_ENTITY_TYPES = [EntityType.PER, EntityType.ORG, EntityType.TITLE]

# Params for filters
MIN_AUTHOR_SIMILARITY = 0.95
# NOTE(review): presumably the result returned when the year filter
# cannot be evaluated - confirm against the filter implementation.
YEAR_EXCEPTION_VALUE = True

LOGGER_NAME = "rara-tools-norm-linker"
LOGGER = logging.getLogger(LOGGER_NAME)

# Language code assigned to linked keywords ("et" = Estonian).
MAIN_TAXONOMY_LANG = "et"
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
from typing import List, Tuple, Any
|
|
2
|
+
from rara_tools.core_formatters.formatted_keyword import FormattedKeyword
|
|
3
|
+
from rara_tools.core_formatters.formatted_meta import FormattedAuthor
|
|
4
|
+
from rara_tools.constants.linker import MAIN_TAXONOMY_LANG, KEYWORD_TYPES_TO_IGNORE, EntityType
|
|
5
|
+
|
|
6
|
+
def get_primary_author(authors: List[dict]) -> str:
    """ Finds the name of the primary author.

    Parameters
    -----------
    authors: List[dict]
        Author dicts; the keys `is_primary` and `name` are read.

    Returns
    ----------
    str:
        Name of the author flagged with a truthy `is_primary`, or an
        empty string if there is none (or the flagged author has no
        `name`). If several authors are flagged, the last one wins.
    """
    primary_author = ""
    for author in authors:
        if author.get("is_primary", False):
            primary_author = author.get("name", "")
    return primary_author
|
|
12
|
+
|
|
13
|
+
def format_authors(authors: List[dict]) -> List[dict]:
    """ Formats unlinked authors.

    Parameters
    -----------
    authors: List[dict]
        Raw author dicts. The optional key `type` gives the entity type;
        it defaults to `EntityType.UNK`.

    Returns
    ----------
    List[dict]:
        One `FormattedAuthor.to_dict()` result per input author,
        in input order.
    """
    return [
        FormattedAuthor(
            object_dict=author,
            linked_doc=None,  # authors are formatted without a linked document here
            entity_type=author.get("type", EntityType.UNK)
        ).to_dict()
        for author in authors
    ]
|
|
25
|
+
|
|
26
|
+
def format_sections(sections: List[dict]) -> List[dict]:
    """ Formats the authors and titles of each section.

    The section dicts are modified in place: `authors` is replaced with
    the formatted authors, and every title receives the key
    `author_from_title` when the section has a primary author.

    Parameters
    -----------
    sections: List[dict]
        Section dicts with optional keys `authors` and `titles`.

    Returns
    ----------
    List[dict]:
        The same list object that was passed in.
    """
    for section in sections:
        authors = section.pop("authors", [])
        titles = section.pop("titles", [])

        primary_author = get_primary_author(authors)
        if primary_author:
            for title in titles:
                title["author_from_title"] = primary_author
        # Written back unconditionally so that titles are never dropped
        # from sections without a primary author.
        section["titles"] = titles

        section["authors"] = format_authors(authors)

    return sections
|
|
40
|
+
|
|
41
|
+
def format_meta(meta: dict) -> dict:
    """ Formats unlinked meta for Kata CORE.

    The input is modified in place: the `authors` and `sections` entries
    of `meta["meta"]` are replaced with their formatted versions. Empty
    `authors` / `sections` lists are removed from `meta["meta"]`.

    Parameters
    -----------
    meta: dict
        Dict holding the meta to format under the key `meta`.

    Returns
    ----------
    dict:
        The same `meta` dict. It is returned unchanged when it has no
        dict under the key `meta`.
    """
    meta_to_format = meta.get("meta")
    if not isinstance(meta_to_format, dict):
        # Nothing to format; avoids AttributeError on a missing "meta" key.
        return meta

    authors = meta_to_format.pop("authors", [])
    sections = meta_to_format.pop("sections", [])

    formatted_authors = format_authors(authors)
    formatted_sections = format_sections(sections)

    if sections and formatted_sections:
        meta_to_format["sections"] = formatted_sections
    if authors and formatted_authors:
        meta_to_format["authors"] = formatted_authors

    meta["meta"] = meta_to_format

    return meta
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def format_keywords(flat_keywords: List[dict]) -> List[dict]:
    """ Formats unlinked keywords for Kata CORE.

    Parameters
    -----------
    flat_keywords: List[dict]
        Keyword dicts; the key `entity_type` holds the keyword type.

    Returns
    ----------
    List[dict]:
        `FormattedKeyword.to_dict()` results, in input order, for every
        keyword whose type is not listed in `KEYWORD_TYPES_TO_IGNORE`.
    """
    return [
        FormattedKeyword(
            object_dict=keyword_dict,
            linked_doc=None,
            main_taxnomy_lang=MAIN_TAXONOMY_LANG
        ).to_dict()
        for keyword_dict in flat_keywords
        if keyword_dict.get("entity_type") not in KEYWORD_TYPES_TO_IGNORE
    ]
|
|
@@ -0,0 +1,229 @@
|
|
|
1
|
+
from rara_tools.constants.linker import (
|
|
2
|
+
LOGGER, URLSource, KeywordSource, EntityType, KeywordType, KeywordMARC,
|
|
3
|
+
KEYWORD_MARC_MAP, KEYWORD_TYPES_TO_IGNORE, KEYWORD_TYPE_MAP,
|
|
4
|
+
EMS_ENTITY_TYPES, SIERRA_ENTITY_TYPES, UNLINKED_KEYWORD_MARC_FIELD,
|
|
5
|
+
URL_SOURCE_MAP
|
|
6
|
+
)
|
|
7
|
+
from rara_tools.core_formatters.formatted_object import FormattedObject
|
|
8
|
+
from typing import List, Dict, NoReturn, Tuple, Any
|
|
9
|
+
|
|
10
|
+
class FormattedKeyword(FormattedObject):
    """ A keyword (linked or unlinked) formatted for output.

    Derived fields (keyword source, MARC field and indicators, URL info,
    language, author) are computed on first access and cached.
    """

    def __init__(self, object_dict: dict, linked_doc: Any,
                 main_taxnomy_lang: str, url_source_map: dict = URL_SOURCE_MAP
                 ) -> None:
        """ Initializes the keyword.

        Parameters
        -----------
        object_dict: dict
            Raw keyword dict. The keys `keyword`, `score`, `count`,
            `method`, `model_arch`, `entity_type` and `language` are read.
        linked_doc: Any
            Linked document instance or None for an unlinked keyword.
        main_taxnomy_lang: str
            Language code assigned to linked keywords.
        url_source_map: dict
            Maps an entity type to the source of its URL identifier.
        """
        super().__init__(
            object_dict=object_dict,
            linked_doc=linked_doc,
            original_entity_key="keyword"
        )

        self.main_taxnomy_lang: str = main_taxnomy_lang

        self.original_keyword: str = self.original_entity
        self.score: float = self.object_dict.get("score")
        self.count: int = self.object_dict.get("count")
        self.method: str = self.object_dict.get("method")
        self.model_arch: str = self.object_dict.get("model_arch", self.method)
        self.keyword_type: str = self.object_dict.get("entity_type")

        # Keyword types without a mapping get an empty entity type.
        self.entity_type: str = KEYWORD_TYPE_MAP.get(self.keyword_type, "")
        self.url_source_map: dict = url_source_map

        # Lazily computed values; see the corresponding properties.
        self.__keyword_source: str = ""
        self.__indicator_1: str = ""
        self.__indicator_2: str = ""
        self.__url: str | None = None
        self.__url_source: str | None = None
        self.__marc_field: int | str = ""

        self.__language: str = ""
        self.__author: str | None = None

    @property
    def keyword(self) -> str:
        """ Final form of the keyword (see `FormattedObject.entity`). """
        return self.entity

    @property
    def keyword_source(self) -> str:
        """ Source label of the keyword (one of `KeywordSource`). """
        if not self.__keyword_source:
            if not self.is_linked:
                source = KeywordSource.AI
            elif self.entity_type in EMS_ENTITY_TYPES:
                source = KeywordSource.EMS
            elif self.entity_type in SIERRA_ENTITY_TYPES:
                if self.linked_doc and self.linked_doc.elastic:
                    source = KeywordSource.SIERRA
                elif self.linked_doc and self.linked_doc.viaf:
                    source = KeywordSource.VIAF
                else:
                    source = KeywordSource.AI
            else:
                source = KeywordSource.AI
            self.__keyword_source = source
        return self.__keyword_source

    @property
    def indicator1(self) -> str:
        """ MARC indicator 1. """
        if not self.__indicator_1:
            ind1, ind2 = self._get_indicators()
            self.__indicator_1 = ind1
            self.__indicator_2 = ind2
        return self.__indicator_1

    @property
    def indicator2(self) -> str:
        """ MARC indicator 2. """
        if not self.__indicator_2:
            ind1, ind2 = self._get_indicators()
            self.__indicator_1 = ind1
            self.__indicator_2 = ind2
        return self.__indicator_2

    @property
    def url(self) -> str:
        """ URL identifier of the keyword; empty string if there is none. """
        if self.__url is None:
            url_info = self._get_url_info()
            self.__url = url_info.get("url")
            self.__url_source = url_info.get("url_source")
        return self.__url

    @property
    def url_source(self) -> str:
        """ Source of `url`; empty string if there is no URL. """
        if self.__url_source is None:
            url_info = self._get_url_info()
            self.__url = url_info.get("url")
            self.__url_source = url_info.get("url_source")
        return self.__url_source

    @property
    def marc_field(self) -> int:
        """ MARC field tag of the keyword.

        An empty string is returned for a linked keyword whose type has
        no entry in `KEYWORD_MARC_MAP`.
        """
        if not self.__marc_field:
            # TODO: logic for works (titles) + persons
            if self.is_linked:
                marc_field = KEYWORD_MARC_MAP.get(str(self.keyword_type), "")
            else:
                marc_field = UNLINKED_KEYWORD_MARC_FIELD

            # NOTE(review): nesting reconstructed from a flattened diff;
            # the title override is assumed to apply to linked and
            # unlinked titles alike - confirm against upstream.
            if self.entity_type == EntityType.TITLE:
                if self.author:
                    marc_field = KeywordMARC.TITLE_LINKED
                else:
                    marc_field = KeywordMARC.TITLE
            self.__marc_field = marc_field
        return self.__marc_field

    @property
    def persons_title(self) -> str:
        """ Alias of `FormattedObject.titles`. """
        return self.titles

    @property
    def language(self) -> str:
        """ Language of the keyword: the main taxonomy language for a
        linked keyword, otherwise `language` from the raw keyword dict.
        """
        if not self.__language:
            if self.is_linked:
                self.__language = self.main_taxnomy_lang
            else:
                self.__language = self.object_dict.get("language", "")
        return self.__language

    @property
    def author(self) -> str:
        """ Author name taken from the original record.
        Only relevant for titles; empty string otherwise.
        """
        if self.__author is None:
            self.__author = ""
            if self.entity_type == EntityType.TITLE:
                if self.original_record:
                    self.__author = self.original_record.author_name
                # TODO: retrieve the author from self.viaf_info when there
                # is no original record.
        return self.__author

    def _get_url_info(self) -> dict:
        """ Finds URL identifier from the linked document based on
        the entity type of the keyword.

        Returns
        ----------
        dict:
            Dictionary with keys `url` - URL identifier and
            `url_source` - source of the URL (e.g. "EMS").
            Both are empty strings if no URL was found.
        """
        url_source = self.url_source_map.get(self.entity_type, "")
        url = ""

        if self.linked_doc:
            if url_source == URLSource.EMS:
                url = self.linked_doc.elastic.get("ems_url", "")
            elif url_source == URLSource.VIAF:
                url = self.viaf_info.get("viaf_url", "")
        # NOTE(review): nesting reconstructed from a flattened diff; the
        # source is assumed to be cleared whenever no URL was found.
        if not url:
            url_source = ""

        url_info = {"url": url, "url_source": url_source}

        LOGGER.debug(
            f"Detected URL info: {url_info}. Used entity_type = {self.entity_type}. " \
            f"URL source map = {self.url_source_map}."
        )
        return url_info

    def _get_indicators(self) -> Tuple[str, str]:
        """ Find MARC indicators 1 and 2.

        Returns
        ----------
        Tuple[str, str]:
            (indicator 1, indicator 2); a single space stands for an
            undefined indicator.
        """
        ind1 = " "
        ind2 = " "
        if self.entity_type in SIERRA_ENTITY_TYPES:
            if self.entity_type == EntityType.PER:
                # A comma marks an inverted name ("Surname, Forename").
                if "," in self.keyword:
                    ind1 = "1"
                else:
                    ind1 = "0"
            elif self.entity_type == EntityType.ORG:
                # 1 = first element of the heading is a jurisdiction name,
                #     e.g. (a) Eesti (b) Riigikogu - hard to detect automatically
                # 2 = first element of the heading is a name in direct order
                ind1 = "2"
            else:
                ind1 = "0"

        # NOTE(review): nesting reconstructed from a flattened diff; the
        # second indicator is assumed to be decided independently of the
        # first - confirm against upstream.
        if not self.is_linked:
            ind2 = "4"
        elif self.entity_type in EMS_ENTITY_TYPES:
            ind2 = "4"
        return (ind1, ind2)

    def to_dict(self) -> dict:
        """ Returns the formatted keyword as a dict. """
        keyword_dict = {
            "count": self.count,
            "dates": self.dates,
            "entity_type": self.keyword_type,
            "indicator1": self.indicator1,
            "indicator2": self.indicator2,
            "is_linked": self.is_linked,
            "keyword": self.keyword,
            "keyword_source": self.keyword_source,
            "lang": self.language,
            "location": self.location,
            "marc_field": self.marc_field,
            "method": self.method,
            "model_arch": self.model_arch,
            "numeration": self.numeration,
            "organisation_sub_unit": self.organisation_sub_unit,
            "original_keyword": self.original_keyword,
            "persons_title": self.persons_title,
            "score": self.score,
            "url": self.url,
            "url_source": self.url_source,
            "author": self.author
        }
        return keyword_dict
|
|
@@ -0,0 +1,154 @@
|
|
|
1
|
+
from rara_tools.constants.linker import (
|
|
2
|
+
LOGGER, EntityType
|
|
3
|
+
)
|
|
4
|
+
from rara_tools.core_formatters.formatted_object import FormattedObject
|
|
5
|
+
from typing import List, Dict, NoReturn, Tuple, Any
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class FormattedTitle(FormattedObject):
    """ A title formatted for output.
    """
    # TODO: Is this class needed at all?
    def __init__(self, object_dict: dict, linked_doc: Any):
        """ Initializes the title.

        Parameters
        -----------
        object_dict: dict
            Raw title dict; the key `name` holds the original title.
        linked_doc: Any
            Linked document instance or None for an unlinked title.
        """
        super().__init__(
            object_dict=object_dict,
            linked_doc=linked_doc,
            original_entity_key="name"
        )
        # The base-class properties read `entity_type`; without it they
        # raise AttributeError.
        self.entity_type: str = EntityType.TITLE
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class FormattedAuthor(FormattedObject):
    """ An author (linked or unlinked) formatted for output.
    """

    def __init__(self, object_dict: dict, linked_doc: Any, entity_type: str):
        """ Initializes the author.

        Parameters
        -----------
        object_dict: dict
            Raw author dict. The keys `name`, `role` and `is_primary`
            are read.
        linked_doc: Any
            Linked document instance or None for an unlinked author.
        entity_type: str
            Entity type of the author.
        """
        super().__init__(
            object_dict=object_dict,
            linked_doc=linked_doc,
            original_entity_key="name"
        )
        self.entity_type: str = entity_type

        self.is_linked: bool = bool(self.linked_doc)  # NB! Add a new column to the database!
        self.original_name: str = self.original_entity  # NB! Add a new column to the database
        self.author_role: str = self.object_dict.get("role")
        self.is_primary: bool = self.object_dict.get("is_primary")

        self.__primary_author_type: str | None = None

        self.__name_order_type: str = ""
        # Placeholders: no logic fills these yet, so they stay empty.
        self.__event_sub_unit: str = ""
        self.__order_number: str = ""
        self.__sub_title: str = ""
        self.__additional_info: str = ""
        self.__publication_type: str = ""
        self.__publication_language: str = ""
        self.__viaf_id: str = ""

    @property
    def primary_author_type(self) -> str:
        """ Entity type of a primary author; empty string for
        non-primary authors. Primary authors of unknown type are
        treated as persons.
        """
        if self.__primary_author_type is None:
            if self.is_primary:
                if self.entity_type != EntityType.UNK:
                    self.__primary_author_type = self.entity_type
                else:
                    self.__primary_author_type = EntityType.PER
            else:
                self.__primary_author_type = ""
        return self.__primary_author_type

    @property
    def name(self) -> str:
        """ Final form of the author's name (see `FormattedObject.entity`). """
        return self.entity

    @property
    def name_order(self) -> str:
        """ Name order indicator of the author. """
        if not self.__name_order_type:
            if self.entity_type == EntityType.PER or self.entity_type == EntityType.UNK:
                # A comma marks an inverted name ("Surname, Forename").
                if "," in self.name:
                    ind1 = "1"
                else:
                    ind1 = "0"
            elif self.entity_type == EntityType.ORG:
                # 1 = first element of the heading is a jurisdiction name,
                #     e.g. (a) Eesti (b) Riigikogu - hard to detect automatically
                # 2 = first element of the heading is a name in direct order
                ind1 = "2"  # TODO: verify that "2" is the right default
            else:
                ind1 = "0"
            self.__name_order_type = ind1
        return self.__name_order_type

    @property
    def event_sub_unit(self) -> str:
        """ Placeholder; always an empty string. """
        return self.__event_sub_unit

    @property
    def order_number(self) -> str:
        """ Placeholder; always an empty string. """
        return self.__order_number

    @property
    def sub_title(self) -> str:
        """ Placeholder; always an empty string. """
        return self.__sub_title

    @property
    def additional_info(self) -> str:
        """ Placeholder; always an empty string. """
        return self.__additional_info

    @property
    def publication_type(self) -> str:
        """ Placeholder; always an empty string. """
        return self.__publication_type

    @property
    def publication_language(self) -> str:
        """ Placeholder; always an empty string. """
        return self.__publication_language

    @property
    def standardized_uri(self) -> str:
        """ Alias of `FormattedObject.identifier`. """
        return self.identifier

    @property
    def viaf_id(self) -> str:
        """ Value of `viaf_url` in the parsed VIAF info; empty string
        if there is no VIAF info.
        """
        # NOTE(review): returns the VIAF URL rather than a bare ID.
        if not self.__viaf_id:
            if self.viaf_info:
                self.__viaf_id = self.viaf_info.get("viaf_url", "")
            else:
                self.__viaf_id = ""
        return self.__viaf_id

    def to_dict(self) -> dict:
        """ Returns the formatted author as a dict. """
        author_dict = {
            "is_linked": self.is_linked,
            "original_name": self.original_name,
            "author_role": self.author_role,
            "is_primary": self.is_primary,
            "primary_author_type": self.primary_author_type,
            "name": self.name,
            "numeration": self.numeration,
            "organisation_sub_unit": self.organisation_sub_unit,
            "titles": self.titles,
            "location": self.location,
            "dates": self.dates,
            "name_order_type": self.name_order,
            "event_sub_unit": self.event_sub_unit,
            "order_number": self.order_number,
            "sub_title": self.sub_title,
            "additional_info": self.additional_info,
            "publication_type": self.publication_type,
            "publication_language": self.publication_language,
            "standardized_uri": self.standardized_uri,
            "viaf_id": self.viaf_id
        }
        return author_dict
|
|
@@ -0,0 +1,137 @@
|
|
|
1
|
+
from rara_tools.constants.linker import (
|
|
2
|
+
LOGGER, URLSource, KeywordSource, EntityType, KeywordType,
|
|
3
|
+
KEYWORD_MARC_MAP, KEYWORD_TYPES_TO_IGNORE, KEYWORD_TYPE_MAP,
|
|
4
|
+
EMS_ENTITY_TYPES, SIERRA_ENTITY_TYPES, UNLINKED_KEYWORD_MARC_FIELD,
|
|
5
|
+
URL_SOURCE_MAP
|
|
6
|
+
)
|
|
7
|
+
|
|
8
|
+
from rara_tools.parsers.marc_records.person_record import PersonRecord
|
|
9
|
+
from rara_tools.parsers.marc_records.organization_record import OrganizationRecord
|
|
10
|
+
from rara_tools.parsers.marc_records.title_record import TitleRecord
|
|
11
|
+
from rara_tools.utils import format_date
|
|
12
|
+
from typing import List, Dict, NoReturn, Tuple, Any
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class FormattedObject:
    """ Base class for formatted keywords, authors and titles.

    Subclasses must set `self.entity_type`; most properties read it.
    Properties are computed on first access and cached.
    """

    def __init__(self, object_dict: dict, linked_doc: Any, original_entity_key: str):
        """ Initializes the object.

        Parameters
        -----------
        object_dict: dict
            Raw dict of the object to format.
        linked_doc: Any
            Linked document instance or None for an unlinked object.
        original_entity_key: str
            Key in `object_dict` holding the original (unlinked) entity.
        """
        self.object_dict: dict = object_dict
        self.linked_doc: Any = linked_doc
        self.viaf_info: dict = self.linked_doc.viaf.get("parsed", {}) if self.linked_doc else {}
        self.original_entity: str = self.object_dict.get(original_entity_key)
        self.is_linked: bool = bool(self.linked_doc)

        # None marks "not computed yet" for the cached properties.
        self.__original_record: PersonRecord | OrganizationRecord | TitleRecord | None = None
        self.__persons_title: str | None = None
        self.__dates: str | None = None
        self.__numeration: str | None = None
        self.__location: str | None = None
        self.__organization_sub_unit: str | None = None
        self.__entity: str | None = None
        self.__titles: str | None = None
        # Was initialized to "", which made `identifier` always return "".
        self.__identifier: str | None = None

    @property
    def original_record(self) -> "PersonRecord | OrganizationRecord | TitleRecord | None":
        """ Record parsed from the linked document's JSON.

        None if there is no linked document or JSON, the entity type is
        not PER / ORG / TITLE, or parsing failed (the failure is logged).
        """
        if not self.__original_record and self.linked_doc and self.linked_doc.json:
            try:
                if self.entity_type == EntityType.PER:
                    original_record = PersonRecord(self.linked_doc.json)
                elif self.entity_type == EntityType.ORG:
                    original_record = OrganizationRecord(self.linked_doc.json)
                elif self.entity_type == EntityType.TITLE:
                    original_record = TitleRecord(self.linked_doc.json)
                else:
                    original_record = None
            except Exception as e:
                LOGGER.exception(
                    f"Could not retrieve JSON from LinkedDoc instance. Exception: '{e}'."
                )
                original_record = None
            self.__original_record = original_record
        return self.__original_record

    @property
    def entity(self) -> str:
        """ Final form of the entity: the linked form if a linked
        document exists, otherwise the original entity.
        """
        if self.__entity is None:
            if self.linked_doc is not None:
                if self.entity_type == EntityType.ORG and self.original_record:
                    self.__entity = self.original_record.original_name.get("a", "")
                else:
                    self.__entity = self.linked_doc.linked_entity
                # Fall back to the VIAF name when the linked form is empty.
                if not self.__entity and self.viaf_info:
                    self.__entity = self.viaf_info.get("name", self.original_entity)
            else:
                self.__entity = self.original_entity
        return self.__entity

    @property
    def dates(self) -> str:
        """ Dates of the entity: "<birth>-<death>" from the VIAF info if
        it has a birth date, otherwise the dates of the original record.
        """
        if self.__dates is None:
            self.__dates = ""
            if self.viaf_info:
                birth_date = format_date(self.viaf_info.get("birth_date", ""))
                death_date = format_date(self.viaf_info.get("death_date", ""))
                if not death_date:
                    death_date = ""

                if birth_date:
                    self.__dates = f"{birth_date}-{death_date}"

            if self.original_record and not self.__dates:
                if self.entity_type == EntityType.PER:
                    self.__dates = self.original_record.life_years
                elif self.entity_type == EntityType.ORG:
                    self.__dates = self.original_record.dates
                elif self.entity_type == EntityType.TITLE:
                    self.__dates = self.original_record.author_life_years

        return self.__dates

    @property
    def numeration(self) -> str:
        """ Numeration taken from the original record (persons and
        organizations only); empty string otherwise.
        """
        if self.__numeration is None:
            self.__numeration = ""
            if self.original_record:
                if self.entity_type == EntityType.PER:
                    self.__numeration = self.original_record.original_name.get("b", "")
                elif self.entity_type == EntityType.ORG:
                    self.__numeration = self.original_record.numeration
        return self.__numeration

    @property
    def location(self) -> str:
        """ Location of an organization; empty string otherwise. """
        if self.__location is None:
            self.__location = ""
            if self.entity_type == EntityType.ORG and self.original_record:
                self.__location = self.original_record.location
        return self.__location

    @property
    def organisation_sub_unit(self) -> str:
        """ Sub-unit of an organization (subfield "b" of the original
        name); empty string otherwise.
        """
        if self.__organization_sub_unit is None:
            self.__organization_sub_unit = ""
            if self.entity_type == EntityType.ORG and self.original_record:
                self.__organization_sub_unit = self.original_record.original_name.get("b", "")
        return self.__organization_sub_unit

    @property
    def titles(self) -> str:
        """ Name specification of a person; empty string otherwise. """
        if self.__titles is None:
            if self.entity_type == EntityType.PER and self.original_record:
                self.__titles = self.original_record.name_specification
            else:
                self.__titles = ""
        return self.__titles

    @property
    def identifier(self) -> str:
        """ Identifier of the original record; empty string if there is
        no original record.
        """
        if self.__identifier is None:
            self.__identifier = ""
            if self.original_record:
                self.__identifier = self.original_record.identifier
        return self.__identifier
|
rara_tools/normalizers/viaf.py
CHANGED
rara_tools/utils.py
CHANGED
|
@@ -2,20 +2,20 @@ from iso639 import Lang
|
|
|
2
2
|
|
|
3
3
|
|
|
4
4
|
def lang_to_iso639_1(lang: str, unk_code: str = "unk") -> str:
|
|
5
|
-
""" Converts language into ISO-639-1 standard.
|
|
6
|
-
Input can be any language code in a valid ISO-639
|
|
7
|
-
standard or even a full name of the language,
|
|
5
|
+
""" Converts language into ISO-639-1 standard.
|
|
6
|
+
Input can be any language code in a valid ISO-639
|
|
7
|
+
standard or even a full name of the language,
|
|
8
8
|
e.g. "Estonian".
|
|
9
|
-
|
|
9
|
+
|
|
10
10
|
Parameters
|
|
11
11
|
-----------
|
|
12
12
|
lang: str
|
|
13
13
|
Language code in any valid ISO-639 standard.
|
|
14
|
-
|
|
14
|
+
|
|
15
15
|
unk_code: str
|
|
16
16
|
Code to return incase of invalid/unsupported
|
|
17
17
|
input language.
|
|
18
|
-
|
|
18
|
+
|
|
19
19
|
Returns
|
|
20
20
|
-------
|
|
21
21
|
Language code in ISO-639-1 standard.
|
|
@@ -29,20 +29,20 @@ def lang_to_iso639_1(lang: str, unk_code: str = "unk") -> str:
|
|
|
29
29
|
|
|
30
30
|
|
|
31
31
|
def lang_to_iso639_2(lang: str, unk_code: str = "unk") -> str:
|
|
32
|
-
""" Converts language into ISO-639-2 standard.
|
|
33
|
-
Input can be any language code in a valid ISO-639
|
|
34
|
-
standard or even a full name of the language,
|
|
32
|
+
""" Converts language into ISO-639-2 standard.
|
|
33
|
+
Input can be any language code in a valid ISO-639
|
|
34
|
+
standard or even a full name of the language,
|
|
35
35
|
e.g. "Estonian".
|
|
36
|
-
|
|
36
|
+
|
|
37
37
|
Parameters
|
|
38
38
|
-----------
|
|
39
39
|
lang: str
|
|
40
40
|
Language code in any valid ISO-639 standard.
|
|
41
|
-
|
|
41
|
+
|
|
42
42
|
unk_code: str
|
|
43
43
|
Code to return incase of invalid/unsupported
|
|
44
44
|
input language.
|
|
45
|
-
|
|
45
|
+
|
|
46
46
|
Returns
|
|
47
47
|
-------
|
|
48
48
|
Language code in ISO-639-2 standard.
|
|
@@ -59,20 +59,20 @@ def lang_to_iso639_2(lang: str, unk_code: str = "unk") -> str:
|
|
|
59
59
|
|
|
60
60
|
|
|
61
61
|
def lang_to_iso639_3(lang: str, unk_code: str = "unk") -> str:
|
|
62
|
-
""" Converts language into ISO-639-3 standard.
|
|
63
|
-
Input can be any language code in a valid ISO-639
|
|
64
|
-
standard or even a full name of the language,
|
|
62
|
+
""" Converts language into ISO-639-3 standard.
|
|
63
|
+
Input can be any language code in a valid ISO-639
|
|
64
|
+
standard or even a full name of the language,
|
|
65
65
|
e.g. "Estonian".
|
|
66
|
-
|
|
66
|
+
|
|
67
67
|
Parameters
|
|
68
68
|
-----------
|
|
69
69
|
lang: str
|
|
70
70
|
Language code in any valid ISO-639 standard.
|
|
71
71
|
unk_code: str
|
|
72
|
-
|
|
72
|
+
|
|
73
73
|
Code to return incase of invalid/unsupported
|
|
74
74
|
input language.
|
|
75
|
-
|
|
75
|
+
|
|
76
76
|
Returns
|
|
77
77
|
-------
|
|
78
78
|
str
|
|
@@ -88,17 +88,39 @@ def lang_to_iso639_3(lang: str, unk_code: str = "unk") -> str:
|
|
|
88
88
|
|
|
89
89
|
def ratio_to_percentage(ratio: float) -> str:
    """ Converts ratio to corresponding percentage.

    Parameters
    -----------
    ratio: float
        Float in range [0,1]

    Returns
    --------
    str
        Percentage corresponding to the float, truncated to a whole
        number, e.g. 0.999 -> "99%".

    """
    # Rounding to 9 decimals removes binary floating point noise before
    # truncating: 0.29 * 100 == 28.999999999999996 must yield 29, not 28.
    percentage = f"{int(round(ratio * 100, 9))}%"
    return percentage
|
|
105
|
+
|
|
106
|
+
def format_date(original_date: str) -> str:
    """ Converts date from format %Y-%m-%d into format %d.%m.%Y, e.g:
    2025-02-12 -> 12.02.2025.

    Conversion is best-effort: if the input cannot be parsed as
    %Y-%m-%d (wrong format, impossible date, or not a string at all),
    it is returned unchanged instead of raising.

    Parameters
    -----------
    original_date: str
        Original date in format %Y-%m-%d

    Returns
    ----------
    str:
        Date in format %d.%m.%Y, or the unmodified input if it
        could not be parsed.
    """
    try:
        parsed = datetime.strptime(original_date, "%Y-%m-%d")
    except (ValueError, TypeError):
        # ValueError: string does not match the expected format.
        # TypeError: non-string input such as None.
        # Anything else (e.g. KeyboardInterrupt) must propagate.
        return original_date
    return parsed.strftime("%d.%m.%Y")
|
|
@@ -3,24 +3,27 @@ rara_tools/decorators.py,sha256=MjOyvZ5nTkwxwx2JLFEGpKKBysvecFw6EN6UDrSvZLU,2187
|
|
|
3
3
|
rara_tools/digar_schema_converter.py,sha256=k95U2iRlEA3sh772-v6snhHW6fju6qSTMnvWJ6DpzZk,14254
|
|
4
4
|
rara_tools/elastic.py,sha256=4D9yoyMy6AJIKwhSi2H1usffDHAh2A_IZfv5BtYnBKg,13992
|
|
5
5
|
rara_tools/exceptions.py,sha256=YQyaueUbXeTkJYFDEuN6iWTXMI3eCv5l7PxGp87vg5I,550
|
|
6
|
-
rara_tools/formatters.py,sha256=LTliadjIPZTO4s-44NsumaUdlQlEvqetvWz4bEvwf90,3418
|
|
7
6
|
rara_tools/s3.py,sha256=9ziDXsLjBtFAvsjTPxFddhfvkpA8773rzPJqO7y1N5Q,6415
|
|
8
7
|
rara_tools/task_reporter.py,sha256=WCcZts9dAUokPc4vbrG3-lNAFLnWaMgE3b3iaUB7mr8,3256
|
|
9
|
-
rara_tools/utils.py,sha256=
|
|
8
|
+
rara_tools/utils.py,sha256=1UrxOzo3cxe4juMkDlKWv1VKWMYay5v1pivGci1ajiM,3003
|
|
10
9
|
rara_tools/constants/__init__.py,sha256=r78laM9vyRDAvzDhPvzDlhaX6qPwUUBBtwf1WosrW3o,27
|
|
11
10
|
rara_tools/constants/digitizer.py,sha256=A7FfqqEB4hGJ9t3z8gTFK7hkzCxz44rCOSWx6Pzvwjs,548
|
|
12
11
|
rara_tools/constants/general.py,sha256=jE1aIir_eKbka_q1iCJWRtmyz_xpnTPntbshiWo9eTA,1061
|
|
13
12
|
rara_tools/constants/language_evaluator.py,sha256=3sCSaoS-zXQRY0vJ7UUMuZqbtYQD_quVVbdpgvJjE7I,124
|
|
14
|
-
rara_tools/constants/linker.py,sha256=
|
|
13
|
+
rara_tools/constants/linker.py,sha256=yBN9NpUhB3ENz8BapoIfpSHY_xNqwYdqutgQFdc_Cd8,3240
|
|
15
14
|
rara_tools/constants/meta_extractor.py,sha256=adYH8cQqH0ZWYO7clGMiObclXRTGsxWgk3pC1oiHxHE,242
|
|
16
15
|
rara_tools/constants/normalizers.py,sha256=Xs3anDwJHpHeniwx3xoIZyqdEXtO3eb7ouGLLr0CpHw,1344
|
|
17
16
|
rara_tools/constants/parsers.py,sha256=L6nh1Itget9_9DMsliDkh6T25z78eMFPWVkbaU08DwU,5561
|
|
18
17
|
rara_tools/constants/subject_indexer.py,sha256=E2D7pylH6Yey9h2TAvAWQiX5JtKKagsZx2E1Fz_afMI,1967
|
|
18
|
+
rara_tools/core_formatters/core_formatter.py,sha256=HJX7jOi9kaFie_zm0Wzjk0nKF8dRleJpVWbCplFFquo,2760
|
|
19
|
+
rara_tools/core_formatters/formatted_keyword.py,sha256=1-B9IQTycFt69pTy8WZNnfJ2WIMRow3kpEub6igyNQc,7865
|
|
20
|
+
rara_tools/core_formatters/formatted_meta.py,sha256=Zd0oQFLbn6m_wHaWtgxBsu9J7wGyWIpZxb2-8PrR3Wk,5240
|
|
21
|
+
rara_tools/core_formatters/formatted_object.py,sha256=7a499ZmcZXOqtlwxDi6FWHWF5a6HdCsduS22wV3uHIE,5656
|
|
19
22
|
rara_tools/normalizers/__init__.py,sha256=_NqpS5w710DhaURytHq9JpEt8HgYpSPfRDcOtOymJgE,193
|
|
20
23
|
rara_tools/normalizers/authorities.py,sha256=IDtcm0yNZNhv1f-WcdqWFSRzZk_CoKuBFsk6hEPddWM,4513
|
|
21
24
|
rara_tools/normalizers/base.py,sha256=6tLfNdF6FZo8M6j_Q61lXoaF1HdIB1c0SKMatTc-Z64,12014
|
|
22
25
|
rara_tools/normalizers/bibs.py,sha256=4DTS6k37z8qR5B3n7aiCXsT5Z49rLTvQ60lKKr5dyLs,2352
|
|
23
|
-
rara_tools/normalizers/viaf.py,sha256=
|
|
26
|
+
rara_tools/normalizers/viaf.py,sha256=LIeqbJoKtVt_0H1o7XMmhSE0BjF4l-jdAJgX_8Gg9Z4,24218
|
|
24
27
|
rara_tools/parsers/marc_parsers/base_parser.py,sha256=Kdw4aivJf2FkWgIK7pJtHtVXF_G1pjHVQ7IcFItSqy8,1649
|
|
25
28
|
rara_tools/parsers/marc_parsers/ems_parser.py,sha256=LFuhZcVwmHMcJknX9p4ZkO8RdjPdQZ4APGbw8KV6BIs,2024
|
|
26
29
|
rara_tools/parsers/marc_parsers/location_parser.py,sha256=dSU9dQoGV5z0ajhLI1bn3AAghkOr79qKIrX7sO0_4lA,1873
|
|
@@ -35,8 +38,8 @@ rara_tools/parsers/marc_records/title_record.py,sha256=XrtJ4gj7wzSaGxNaPtPuawmqq
|
|
|
35
38
|
rara_tools/parsers/tools/entity_normalizers.py,sha256=VyCy_NowCLpOsL0luQ55IW-Qi-J5oBH0Ofzr7HRFBhM,8949
|
|
36
39
|
rara_tools/parsers/tools/marc_converter.py,sha256=LgSHe-7n7aiDrw2bnsB53r3fXTRFjZXTwBYfTpL0pfs,415
|
|
37
40
|
rara_tools/parsers/tools/russian_transliterator.py,sha256=5ZU66iTqAhr7pmfVqXPAI_cidF43VqqmuN4d7H4_JuA,9770
|
|
38
|
-
rara_tools-0.
|
|
39
|
-
rara_tools-0.
|
|
40
|
-
rara_tools-0.
|
|
41
|
-
rara_tools-0.
|
|
42
|
-
rara_tools-0.
|
|
41
|
+
rara_tools-0.7.0.dist-info/licenses/LICENSE.md,sha256=hkZVnIZll7e_KNEQzeY94Y9tlzVL8iVZBTMBvDykksU,35142
|
|
42
|
+
rara_tools-0.7.0.dist-info/METADATA,sha256=UmA4_431SdsYqP6IrEv8mr4yL9OoFVGHvpG-mQgs_g8,4079
|
|
43
|
+
rara_tools-0.7.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
44
|
+
rara_tools-0.7.0.dist-info/top_level.txt,sha256=JwfB5b8BAtW5OFKRln2AQ_WElTRyIBM4nO0FKN1cupY,11
|
|
45
|
+
rara_tools-0.7.0.dist-info/RECORD,,
|
rara_tools/formatters.py
DELETED
|
@@ -1,106 +0,0 @@
|
|
|
1
|
-
from typing import List, Tuple, Any
|
|
2
|
-
from rara_tools.constants.subject_indexer import (
|
|
3
|
-
EntityType, KeywordType, KeywordMARC, KeywordSource, URLSource,
|
|
4
|
-
KEYWORD_TYPE_MAP, KEYWORD_MARC_MAP, KEYWORD_TYPES_TO_IGNORE,
|
|
5
|
-
EMS_ENTITY_TYPES, SIERRA_ENTITY_TYPES, VIAF_ENTITY_TYPES
|
|
6
|
-
)
|
|
7
|
-
|
|
8
|
-
def _get_keyword_source(linked_doc: Any, entity_type: str, is_linked: bool
                        ) -> str:
    """ Find keyword source.

    Parameters
    -----------
    linked_doc: Any
        Linking result for the keyword (may be None). Only its truthiness
        and its `elastic` / `viaf` attributes are inspected.
    entity_type: str
        Entity type, compared against EMS_ENTITY_TYPES and
        SIERRA_ENTITY_TYPES.
    is_linked: bool
        Whether the keyword was linked at all.

    Returns
    --------
    str
        One of the KeywordSource values. KeywordSource.AI is the fallback
        whenever no other source applies.
    """
    # Unlinked keywords are always attributed to AI.
    if not is_linked:
        return KeywordSource.AI

    if entity_type in EMS_ENTITY_TYPES:
        return KeywordSource.EMS

    # For Sierra entity types the source depends on which attribute of the
    # linked document is populated; `elastic` takes precedence over `viaf`.
    if entity_type in SIERRA_ENTITY_TYPES and linked_doc:
        if linked_doc.elastic:
            return KeywordSource.SIERRA
        if linked_doc.viaf:
            return KeywordSource.VIAF

    return KeywordSource.AI
|
|
26
|
-
|
|
27
|
-
def _find_indicators(entity_type: str, entity: str,
|
|
28
|
-
is_linked: bool
|
|
29
|
-
) -> Tuple[str, str]:
|
|
30
|
-
""" Find MARC indicators 1 and 2 for a keyword; returns the pair (ind1, ind2), each defaulting to a single space.
|
|
31
|
-
"""
|
|
32
|
-
ind1 = " "
|
|
33
|
-
ind2 = " "
|
|
34
|
-
if entity_type in SIERRA_ENTITY_TYPES:
|
|
35
|
-
if entity_type == EntityType.PER:
|
|
36
|
-
if "," in entity:
|
|
37
|
-
ind1 = "1"
|
|
38
|
-
else:
|
|
39
|
-
ind1 = "0"
|
|
40
|
-
else:
|
|
41
|
-
# 1: the first element of the keyword is a jurisdiction name, e.g.:
|
|
42
|
-
# (a) Eesti (b) Riigikogu - hard to determine automatically
|
|
43
|
-
# 2: the first element of the keyword is a name in direct order
|
|
44
|
-
ind1 = "2"
|
|
45
|
-
if not is_linked:
|
|
46
|
-
ind2 = "4"
|
|
47
|
-
elif entity_type in EMS_ENTITY_TYPES:
|
|
48
|
-
ind2 = "4"
|
|
49
|
-
return (ind1, ind2)
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
def format_keywords(flat_keywords: List[dict]) -> dict:
    """ Formats unlinked keywords for Kata CORE.

    Parameters
    -----------
    flat_keywords: List[dict]
        Keyword dicts; the keys "entity_type", "keyword" and "language"
        are read from each of them.

    Returns
    --------
    dict
        Dict with two keys:
        "keywords" - formatted dicts for every keyword whose "entity_type"
        is not in KEYWORD_TYPES_TO_IGNORE; values already present in the
        input dict override the generated defaults,
        "other" - the ignored input dicts, unmodified.
    """
    skipped = []
    kept = []
    for item in flat_keywords:
        is_ignored = item.get("entity_type") in KEYWORD_TYPES_TO_IGNORE
        (skipped if is_ignored else kept).append(item)

    def _with_defaults(item: dict) -> dict:
        # Build the default record for one unlinked keyword and overlay the
        # input dict on top of it.
        text = item.get("keyword")
        raw_type = item.get("entity_type")
        mapped_type = KEYWORD_TYPE_MAP.get(raw_type, "")
        field = KEYWORD_MARC_MAP.get(str(raw_type), "")
        language = item.get("language", "")

        first_ind, second_ind = _find_indicators(
            entity_type=mapped_type,
            entity=text,
            is_linked=False
        )
        source = _get_keyword_source(
            linked_doc=None,
            is_linked=False,
            entity_type=mapped_type
        )
        defaults = {
            "dates": "",
            "indicator1": first_ind,
            "indicator2": second_ind,
            "is_linked": False,
            "keyword_source": source,
            "lang": language,
            "location": "",
            "marc_field": field,
            "numeration": "",
            "organisation_sub_unit": "",
            "original_keyword": text,
            "persons_title": "",
            "url": "",
            "url_source": ""
        }
        # Input values win over defaults; extra input keys are appended.
        return {**defaults, **item}

    return {
        "keywords": [_with_defaults(item) for item in kept],
        "other": skipped
    }
|
|
File without changes
|
|
File without changes
|
|
File without changes
|