rara-tools 0.6.12__py3-none-any.whl → 0.6.14__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of rara-tools might be problematic. Click here for more details.
- rara_tools/constants/subject_indexer.py +72 -1
- rara_tools/formatters.py +106 -0
- rara_tools/parsers/marc_records/organization_record.py +20 -10
- {rara_tools-0.6.12.dist-info → rara_tools-0.6.14.dist-info}/METADATA +1 -1
- {rara_tools-0.6.12.dist-info → rara_tools-0.6.14.dist-info}/RECORD +8 -7
- {rara_tools-0.6.12.dist-info → rara_tools-0.6.14.dist-info}/WHEEL +0 -0
- {rara_tools-0.6.12.dist-info → rara_tools-0.6.14.dist-info}/licenses/LICENSE.md +0 -0
- {rara_tools-0.6.12.dist-info → rara_tools-0.6.14.dist-info}/top_level.txt +0 -0
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
from rara_tools.constants.normalizers import EntityType
|
|
2
|
+
|
|
1
3
|
COMPONENT_KEY = "subject_indexer"
|
|
2
4
|
|
|
3
5
|
|
|
@@ -11,4 +13,73 @@ class Queue:
|
|
|
11
13
|
|
|
12
14
|
|
|
13
15
|
class StatusKeys:
    """Status key names used by the subject indexer component."""

    # Key under which the keyword-extraction step is tracked.
    EXTRACT_KEYWORDS = "extract_keywords"
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class URLSource:
    """String labels identifying where a keyword URL originates."""

    VIAF = "VIAF"
    # NOTE(review): mixed-case here, while KeywordSource.SIERRA is upper-case;
    # confirm consumers expect the two spellings to differ.
    SIERRA = "Sierra"
    EMS = "EMS"
|
|
23
|
+
|
|
24
|
+
class KeywordType:
    """Keyword type labels (Estonian display names) used as lookup keys.

    The values are runtime strings and must stay exactly as written; the
    trailing comments only give an approximate English gloss.
    """

    LOC = "Kohamärksõnad"  # place keywords
    TIME = "Ajamärksõnad"  # time keywords
    TOPIC = "Teemamärksõnad"  # topic keywords
    GENRE = "Vormimärksõnad"  # form / genre keywords
    TITLE = "Teose pealkiri"  # title of a work
    PER = "Isikunimi"  # personal name
    ORG = "Kollektiivi nimi"  # corporate body name
    EVENT = "Ajutine kollektiiv või sündmus"  # temporary body or event
    CATEGORY = "Valdkonnamärksõnad"  # subject-area keywords
    UDC = "UDC Summary"
    UDK = "UDK Rahvusbibliograafia"  # UDC, national bibliography
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class KeywordMARC:
    """Numeric MARC field tags, one per keyword kind.

    These are the values KEYWORD_MARC_MAP associates with each KeywordType.
    """

    PER = 600
    ORG = 610
    TOPIC = 650
    GENRE = 655
    TIME = 648
    LOC = 651
    EVENT = 611
    TITLE = 630
|
|
47
|
+
|
|
48
|
+
class KeywordSource:
    """String labels naming the source a keyword is attributed to."""

    EMS = "EMS"
    SIERRA = "SIERRA"
    VIAF = "VIAF"
    # Used when the keyword is not linked to any of the sources above.
    AI = "AI"
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
# Keyword type label -> normalizer entity type. Time, genre and topic
# keywords all collapse to the generic KEYWORD entity type; events are
# treated as organisations.
KEYWORD_TYPE_MAP = {
    KeywordType.TIME: EntityType.KEYWORD,
    KeywordType.GENRE: EntityType.KEYWORD,
    KeywordType.LOC: EntityType.LOC,
    KeywordType.PER: EntityType.PER,
    KeywordType.ORG: EntityType.ORG,
    KeywordType.TOPIC: EntityType.KEYWORD,
    KeywordType.TITLE: EntityType.TITLE,
    KeywordType.EVENT: EntityType.ORG,
}

# Keyword type label -> MARC field tag.
KEYWORD_MARC_MAP = {
    KeywordType.LOC: KeywordMARC.LOC,
    KeywordType.TIME: KeywordMARC.TIME,
    KeywordType.TOPIC: KeywordMARC.TOPIC,
    KeywordType.GENRE: KeywordMARC.GENRE,
    KeywordType.TITLE: KeywordMARC.TITLE,
    KeywordType.ORG: KeywordMARC.ORG,
    KeywordType.PER: KeywordMARC.PER,
    KeywordType.EVENT: KeywordMARC.EVENT,
}

# Keyword types that have no entry in either map above.
KEYWORD_TYPES_TO_IGNORE = [
    KeywordType.CATEGORY,
    KeywordType.UDC,
    KeywordType.UDK,
]

# Entity types grouped by the source they are associated with.
EMS_ENTITY_TYPES = [EntityType.KEYWORD, EntityType.LOC]
SIERRA_ENTITY_TYPES = [EntityType.PER, EntityType.ORG, EntityType.TITLE]
VIAF_ENTITY_TYPES = [EntityType.PER, EntityType.ORG, EntityType.TITLE]
|
rara_tools/formatters.py
ADDED
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
from typing import List, Tuple, Any
|
|
2
|
+
from rara_tools.constants.subject_indexer import (
|
|
3
|
+
EntityType, KeywordType, KeywordMARC, KeywordSource, URLSource,
|
|
4
|
+
KEYWORD_TYPE_MAP, KEYWORD_MARC_MAP, KEYWORD_TYPES_TO_IGNORE,
|
|
5
|
+
EMS_ENTITY_TYPES, SIERRA_ENTITY_TYPES, VIAF_ENTITY_TYPES
|
|
6
|
+
)
|
|
7
|
+
|
|
8
|
+
def _get_keyword_source(linked_doc: Any, entity_type: str, is_linked: bool
) -> str:
    """Pick the KeywordSource label for a keyword.

    Args:
        linked_doc: Linked document, or None. Only its ``elastic`` and
            ``viaf`` attributes are read, and only for Sierra entity types.
        entity_type: Entity type of the keyword.
        is_linked: Whether the keyword was linked at all.

    Returns:
        ``KeywordSource.AI`` for unlinked keywords and as the fallback;
        ``KeywordSource.EMS`` for EMS entity types; for Sierra entity types
        ``KeywordSource.SIERRA`` when ``linked_doc.elastic`` is truthy, else
        ``KeywordSource.VIAF`` when ``linked_doc.viaf`` is truthy.
    """
    if not is_linked:
        return KeywordSource.AI

    if entity_type in EMS_ENTITY_TYPES:
        return KeywordSource.EMS

    if entity_type in SIERRA_ENTITY_TYPES and linked_doc:
        # ``elastic`` wins over ``viaf`` when both are set.
        if linked_doc.elastic:
            return KeywordSource.SIERRA
        if linked_doc.viaf:
            return KeywordSource.VIAF

    return KeywordSource.AI
|
|
26
|
+
|
|
27
|
+
def _find_indicators(entity_type: str, entity: str,
        is_linked: bool
) -> Tuple[str, str]:
    """Find MARC indicators 1 and 2 for a keyword.

    Both indicators default to a single space and are only changed for
    entity types listed in SIERRA_ENTITY_TYPES or EMS_ENTITY_TYPES.

    Args:
        entity_type: Entity type of the keyword.
        entity: Keyword text. ``None`` or an empty string is tolerated and
            treated as text without a comma.
        is_linked: Whether the keyword was linked.

    Returns:
        Tuple ``(ind1, ind2)`` of one-character strings.
    """
    ind1 = " "
    ind2 = " "
    if entity_type in SIERRA_ENTITY_TYPES:
        if entity_type == EntityType.PER:
            # A comma in the name selects "1", otherwise "0" (presumably
            # "Surname, Forename" vs. forename-only headings - confirm
            # against the MARC 600 first-indicator definition).
            # ``entity or ""`` keeps a missing keyword from raising
            # TypeError on the membership test.
            ind1 = "1" if "," in (entity or "") else "0"
        else:
            # 1 - the first element of the heading is a jurisdiction name,
            #     e.g. (a) Eesti (b) Riigikogu - hard to detect automatically
            # 2 - the first element of the heading is a name in direct order
            ind1 = "2"
        if not is_linked:
            ind2 = "4"
    elif entity_type in EMS_ENTITY_TYPES:
        ind2 = "4"
    return (ind1, ind2)
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def format_keywords(flat_keywords: List[dict]) -> dict:
    """Format unlinked keywords for Kata CORE.

    Args:
        flat_keywords: Keyword dicts; the keys ``entity_type``, ``keyword``
            and ``language`` are read when present.

    Returns:
        A dict with two lists:
        ``"keywords"`` - one record per kept keyword, built from a fixed set
        of default fields and then updated with the input dict, so input
        keys override the defaults;
        ``"other"`` - the input dicts whose ``entity_type`` is listed in
        KEYWORD_TYPES_TO_IGNORE, passed through unchanged.
    """
    skipped = []
    kept = []
    for item in flat_keywords:
        is_ignored = item.get("entity_type") in KEYWORD_TYPES_TO_IGNORE
        (skipped if is_ignored else kept).append(item)

    records = []
    for item in kept:
        raw_keyword = item.get("keyword")
        raw_type = item.get("entity_type")
        mapped_type = KEYWORD_TYPE_MAP.get(raw_type, "")
        field_tag = KEYWORD_MARC_MAP.get(str(raw_type), "")
        language = item.get("language", "")

        # Everything handled here is unlinked by definition.
        first_ind, second_ind = _find_indicators(
            entity_type=mapped_type,
            entity=raw_keyword,
            is_linked=False
        )
        origin = _get_keyword_source(
            linked_doc=None,
            is_linked=False,
            entity_type=mapped_type
        )

        record = {
            "dates": "",
            "indicator1": first_ind,
            "indicator2": second_ind,
            "is_linked": False,
            "keyword_source": origin,
            "lang": language,
            "location": "",
            "marc_field": field_tag,
            "numeration": "",
            "organisation_sub_unit": "",
            "original_keyword": raw_keyword,
            "persons_title": "",
            "url": "",
            "url_source": ""
        }
        # Input fields take precedence over the defaults above.
        record.update(item)
        records.append(record)

    return {"keywords": records, "other": skipped}
|
|
@@ -6,13 +6,8 @@ import regex as re
|
|
|
6
6
|
import json
|
|
7
7
|
|
|
8
8
|
# TODO: indicators (ind1) for fields 100 and 400?
|
|
9
|
-
"""
|
|
10
|
-
|c asutuse konverentsi toimumise koht (MK)
|
|
11
9
|
|
|
12
|
-
|d asutuse konverentsi toimumise aeg (K)
|
|
13
10
|
|
|
14
|
-
|n asutuse konverentsi järjenumber (K)
|
|
15
|
-
"""
|
|
16
11
|
class OrganizationRecord(BaseRecord):
|
|
17
12
|
""" Generates a simplified organization JSON record
|
|
18
13
|
from a pymarc MARC record.
|
|
@@ -45,6 +40,7 @@ class OrganizationRecord(BaseRecord):
|
|
|
45
40
|
self.__name_specification: str = ""
|
|
46
41
|
self.__dates: str = ""
|
|
47
42
|
self.__location: str = ""
|
|
43
|
+
self.__numeration: str = ""
|
|
48
44
|
self.__name_variations: List[str] = []
|
|
49
45
|
self.__source: str = ""
|
|
50
46
|
self.__description: str = ""
|
|
@@ -61,7 +57,10 @@ class OrganizationRecord(BaseRecord):
|
|
|
61
57
|
|
|
62
58
|
|
|
63
59
|
def _clean_value(self, value: str) -> str:
|
|
64
|
-
|
|
60
|
+
try:
|
|
61
|
+
cleaned_value = value.strip("., ")
|
|
62
|
+
except Exception as e:
|
|
63
|
+
cleaned_value = ""
|
|
65
64
|
return cleaned_value
|
|
66
65
|
|
|
67
66
|
def _merge_and_clean(self, value: dict, keys: List[str]) -> str:
|
|
@@ -103,10 +102,10 @@ class OrganizationRecord(BaseRecord):
|
|
|
103
102
|
if not self.__dates:
|
|
104
103
|
values = self.get_values(
|
|
105
104
|
marc_ids=self.__name_field_id,
|
|
106
|
-
subfield_id=
|
|
105
|
+
subfield_id="d"
|
|
107
106
|
)
|
|
108
107
|
if values:
|
|
109
|
-
self.__dates = self.
|
|
108
|
+
self.__dates = self._clean_value(values[0])
|
|
110
109
|
return self.__dates
|
|
111
110
|
|
|
112
111
|
@property
|
|
@@ -114,12 +113,23 @@ class OrganizationRecord(BaseRecord):
|
|
|
114
113
|
if not self.__location:
|
|
115
114
|
values = self.get_values(
|
|
116
115
|
marc_ids=self.__name_field_id,
|
|
117
|
-
subfield_id=
|
|
116
|
+
subfield_id="c"
|
|
118
117
|
)
|
|
119
118
|
if values:
|
|
120
|
-
self.__location = self.
|
|
119
|
+
self.__location = self._clean_value(values[0])
|
|
121
120
|
return self.__location
|
|
122
121
|
|
|
122
|
+
@property
def numeration(self) -> str:
    """Cleaned value of subfield ``n`` of the name field.

    The value is looked up on first access and cached once a non-empty
    result is found; while the cached value is empty, every access repeats
    the lookup. Returns an empty string when the subfield is absent.
    """
    if self.__numeration:
        return self.__numeration

    found = self.get_values(
        marc_ids=self.__name_field_id,
        subfield_id="n"
    )
    if found:
        # Only the first occurrence of the subfield is used.
        self.__numeration = self._clean_value(found[0])
    return self.__numeration
|
|
132
|
+
|
|
123
133
|
@property
|
|
124
134
|
def acronyms(self) -> List[str]:
|
|
125
135
|
if not self.__acronyms:
|
|
@@ -3,6 +3,7 @@ rara_tools/decorators.py,sha256=MjOyvZ5nTkwxwx2JLFEGpKKBysvecFw6EN6UDrSvZLU,2187
|
|
|
3
3
|
rara_tools/digar_schema_converter.py,sha256=k95U2iRlEA3sh772-v6snhHW6fju6qSTMnvWJ6DpzZk,14254
|
|
4
4
|
rara_tools/elastic.py,sha256=7HvDmFKpQbGnnzYyiCKOg0uvubnv2TpCASRrnPP8DcQ,13540
|
|
5
5
|
rara_tools/exceptions.py,sha256=YQyaueUbXeTkJYFDEuN6iWTXMI3eCv5l7PxGp87vg5I,550
|
|
6
|
+
rara_tools/formatters.py,sha256=LTliadjIPZTO4s-44NsumaUdlQlEvqetvWz4bEvwf90,3418
|
|
6
7
|
rara_tools/s3.py,sha256=9ziDXsLjBtFAvsjTPxFddhfvkpA8773rzPJqO7y1N5Q,6415
|
|
7
8
|
rara_tools/task_reporter.py,sha256=WCcZts9dAUokPc4vbrG3-lNAFLnWaMgE3b3iaUB7mr8,3256
|
|
8
9
|
rara_tools/utils.py,sha256=9vSbmuWYU5ydr4lXBKlUKa0xzDccFsaJv4T-XwgUfuY,2578
|
|
@@ -14,7 +15,7 @@ rara_tools/constants/linker.py,sha256=TQaigi7AUNOqmQPPz3hM8_xXgofrhoQ1taln79LhXQ
|
|
|
14
15
|
rara_tools/constants/meta_extractor.py,sha256=adYH8cQqH0ZWYO7clGMiObclXRTGsxWgk3pC1oiHxHE,242
|
|
15
16
|
rara_tools/constants/normalizers.py,sha256=Xs3anDwJHpHeniwx3xoIZyqdEXtO3eb7ouGLLr0CpHw,1344
|
|
16
17
|
rara_tools/constants/parsers.py,sha256=L6nh1Itget9_9DMsliDkh6T25z78eMFPWVkbaU08DwU,5561
|
|
17
|
-
rara_tools/constants/subject_indexer.py,sha256=
|
|
18
|
+
rara_tools/constants/subject_indexer.py,sha256=E2D7pylH6Yey9h2TAvAWQiX5JtKKagsZx2E1Fz_afMI,1967
|
|
18
19
|
rara_tools/normalizers/__init__.py,sha256=_NqpS5w710DhaURytHq9JpEt8HgYpSPfRDcOtOymJgE,193
|
|
19
20
|
rara_tools/normalizers/authorities.py,sha256=IDtcm0yNZNhv1f-WcdqWFSRzZk_CoKuBFsk6hEPddWM,4513
|
|
20
21
|
rara_tools/normalizers/base.py,sha256=LbS7Y7CEL-C-ynT-WPc-eCLkNeMO9BI9qtBm-W1skGM,11790
|
|
@@ -28,14 +29,14 @@ rara_tools/parsers/marc_parsers/person_parser.py,sha256=iMycHSlgfvgB0axE_rneB5sI
|
|
|
28
29
|
rara_tools/parsers/marc_parsers/title_parser.py,sha256=uZiYb_aZWzv_xLEBSZmFt2vN6UIauNSFRCkNG_ZKL10,1570
|
|
29
30
|
rara_tools/parsers/marc_records/base_record.py,sha256=yllX2ArjBm9PfUnH6dk3__Rb2LQuEGCYqZGVKBzqSl0,4673
|
|
30
31
|
rara_tools/parsers/marc_records/ems_record.py,sha256=B2YZLEeDd-GmmYqxhczbMsSEB7-x6ZLjB8OeDnzOxww,9376
|
|
31
|
-
rara_tools/parsers/marc_records/organization_record.py,sha256=
|
|
32
|
+
rara_tools/parsers/marc_records/organization_record.py,sha256=WFmmMBiZUhaIMh-j06ChH37JLT7yFG7ZDc_0keqjIYo,10355
|
|
32
33
|
rara_tools/parsers/marc_records/person_record.py,sha256=AtGESwFmN5YvrBES0BsfTgOZbroB4l0SuFRznumfmJA,7867
|
|
33
34
|
rara_tools/parsers/marc_records/title_record.py,sha256=XrtJ4gj7wzSaGxNaPtPuawmqqkXsVX5HAAKfXTSo4mA,6855
|
|
34
35
|
rara_tools/parsers/tools/entity_normalizers.py,sha256=VyCy_NowCLpOsL0luQ55IW-Qi-J5oBH0Ofzr7HRFBhM,8949
|
|
35
36
|
rara_tools/parsers/tools/marc_converter.py,sha256=LgSHe-7n7aiDrw2bnsB53r3fXTRFjZXTwBYfTpL0pfs,415
|
|
36
37
|
rara_tools/parsers/tools/russian_transliterator.py,sha256=5ZU66iTqAhr7pmfVqXPAI_cidF43VqqmuN4d7H4_JuA,9770
|
|
37
|
-
rara_tools-0.6.
|
|
38
|
-
rara_tools-0.6.
|
|
39
|
-
rara_tools-0.6.
|
|
40
|
-
rara_tools-0.6.
|
|
41
|
-
rara_tools-0.6.
|
|
38
|
+
rara_tools-0.6.14.dist-info/licenses/LICENSE.md,sha256=hkZVnIZll7e_KNEQzeY94Y9tlzVL8iVZBTMBvDykksU,35142
|
|
39
|
+
rara_tools-0.6.14.dist-info/METADATA,sha256=kGpk4MmwncYdzkqpFT64vu9ZgI_oQj4hOzAb6NzpyJs,4080
|
|
40
|
+
rara_tools-0.6.14.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
41
|
+
rara_tools-0.6.14.dist-info/top_level.txt,sha256=JwfB5b8BAtW5OFKRln2AQ_WElTRyIBM4nO0FKN1cupY,11
|
|
42
|
+
rara_tools-0.6.14.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|