rara-tools 0.6.13__py3-none-any.whl → 0.6.14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of rara-tools might be problematic. Click here for more details.

@@ -1,3 +1,5 @@
1
+ from rara_tools.constants.normalizers import EntityType
2
+
1
3
  COMPONENT_KEY = "subject_indexer"
2
4
 
3
5
 
@@ -11,4 +13,73 @@ class Queue:
11
13
 
12
14
 
13
15
  class StatusKeys:
14
- EXTRACT_KEYWORDS = "extract_keywords"
16
+ EXTRACT_KEYWORDS = "extract_keywords"
17
+
18
+
19
class URLSource:
    """String labels identifying URL sources (VIAF, Sierra, EMS)."""

    VIAF = "VIAF"
    # NOTE: mixed case here, whereas KeywordSource.SIERRA is upper-case.
    SIERRA = "Sierra"
    EMS = "EMS"
23
+
24
class KeywordType:
    """Keyword type labels.

    Most values are Estonian display names; the English gloss of each is
    given in the trailing comment. UDC and UDK are classification labels.
    """

    LOC = "Kohamärksõnad"  # place keywords
    TIME = "Ajamärksõnad"  # time keywords
    TOPIC = "Teemamärksõnad"  # topic keywords
    GENRE = "Vormimärksõnad"  # form/genre keywords
    TITLE = "Teose pealkiri"  # title of a work
    PER = "Isikunimi"  # personal name
    ORG = "Kollektiivi nimi"  # corporate (collective) name
    EVENT = "Ajutine kollektiiv või sündmus"  # temporary collective or event
    CATEGORY = "Valdkonnamärksõnad"  # subject-area keywords
    UDC = "UDC Summary"
    UDK = "UDK Rahvusbibliograafia"
36
+
37
+
38
class KeywordMARC:
    """MARC field tags (integers) assigned to each kind of keyword.

    The trailing comments give the MARC 21 name of each 6XX subject field.
    """

    PER = 600  # subject added entry - personal name
    ORG = 610  # subject added entry - corporate name
    TOPIC = 650  # subject added entry - topical term
    GENRE = 655  # index term - genre/form
    TIME = 648  # subject added entry - chronological term
    LOC = 651  # subject added entry - geographic name
    EVENT = 611  # subject added entry - meeting name
    TITLE = 630  # subject added entry - uniform title
47
+
48
class KeywordSource:
    """String labels for where a keyword comes from."""

    EMS = "EMS"
    # NOTE: upper-case here, whereas URLSource.SIERRA is "Sierra".
    SIERRA = "SIERRA"
    VIAF = "VIAF"
    AI = "AI"
53
+
54
+
55
# Keyword type label -> entity type.
# TIME, GENRE and TOPIC all map to the generic KEYWORD entity type, and
# EVENT is mapped to ORG.
KEYWORD_TYPE_MAP = {
    KeywordType.TIME: EntityType.KEYWORD,
    KeywordType.GENRE: EntityType.KEYWORD,
    KeywordType.LOC: EntityType.LOC,
    KeywordType.PER: EntityType.PER,
    KeywordType.ORG: EntityType.ORG,
    KeywordType.TOPIC: EntityType.KEYWORD,
    KeywordType.TITLE: EntityType.TITLE,
    KeywordType.EVENT: EntityType.ORG,
}

# Keyword type label -> MARC field tag.
KEYWORD_MARC_MAP = {
    KeywordType.LOC: KeywordMARC.LOC,
    KeywordType.TIME: KeywordMARC.TIME,
    KeywordType.TOPIC: KeywordMARC.TOPIC,
    KeywordType.GENRE: KeywordMARC.GENRE,
    KeywordType.TITLE: KeywordMARC.TITLE,
    KeywordType.ORG: KeywordMARC.ORG,
    KeywordType.PER: KeywordMARC.PER,
    KeywordType.EVENT: KeywordMARC.EVENT,
}

# Keyword types that have no entry in either map above.
KEYWORD_TYPES_TO_IGNORE = [
    KeywordType.CATEGORY,
    KeywordType.UDC,
    KeywordType.UDK,
]

# Entity types grouped under each source name. The SIERRA and VIAF lists
# currently contain the same entity types.
EMS_ENTITY_TYPES = [EntityType.KEYWORD, EntityType.LOC]
SIERRA_ENTITY_TYPES = [EntityType.PER, EntityType.ORG, EntityType.TITLE]
VIAF_ENTITY_TYPES = [EntityType.PER, EntityType.ORG, EntityType.TITLE]
@@ -0,0 +1,106 @@
1
+ from typing import List, Tuple, Any
2
+ from rara_tools.constants.subject_indexer import (
3
+ EntityType, KeywordType, KeywordMARC, KeywordSource, URLSource,
4
+ KEYWORD_TYPE_MAP, KEYWORD_MARC_MAP, KEYWORD_TYPES_TO_IGNORE,
5
+ EMS_ENTITY_TYPES, SIERRA_ENTITY_TYPES, VIAF_ENTITY_TYPES
6
+ )
7
+
8
def _get_keyword_source(linked_doc: Any, entity_type: str, is_linked: bool
) -> str:
    """Return the KeywordSource label for a keyword.

    Args:
        linked_doc: Linked document, or None. Only its truthiness and its
            ``elastic`` / ``viaf`` attributes are read.
        entity_type: Entity type of the keyword.
        is_linked: Whether the keyword has been linked.

    Returns:
        KeywordSource.AI when the keyword is not linked or no other rule
        matches; KeywordSource.EMS for entity types in EMS_ENTITY_TYPES;
        for entity types in SIERRA_ENTITY_TYPES, KeywordSource.SIERRA when
        ``linked_doc.elastic`` is truthy, otherwise KeywordSource.VIAF when
        ``linked_doc.viaf`` is truthy.
    """
    if not is_linked:
        return KeywordSource.AI
    if entity_type in EMS_ENTITY_TYPES:
        return KeywordSource.EMS
    if entity_type in SIERRA_ENTITY_TYPES and linked_doc:
        # `elastic` takes precedence over `viaf` when both are set.
        if linked_doc.elastic:
            return KeywordSource.SIERRA
        if linked_doc.viaf:
            return KeywordSource.VIAF
    return KeywordSource.AI
26
+
27
def _find_indicators(entity_type: str, entity: str,
        is_linked: bool
) -> Tuple[str, str]:
    """Find MARC indicators 1 and 2.

    Args:
        entity_type: Entity type of the keyword.
        entity: Keyword text. None or an empty value is treated as a name
            without a comma.
        is_linked: Whether the keyword has been linked.

    Returns:
        Tuple ``(ind1, ind2)``; each is a single-character string and
        defaults to a blank (" ").
    """
    ind1 = " "
    ind2 = " "
    if entity_type in SIERRA_ENTITY_TYPES:
        if entity_type == EntityType.PER:
            # A comma presumably marks the inverted "Surname, Forename"
            # form. `entity or ""` keeps a missing keyword from raising
            # TypeError on the membership test.
            ind1 = "1" if "," in (entity or "") else "0"
        else:
            # 1 = the first element of the keyword is a jurisdiction name,
            #     e.g. (a) Eesti (b) Riigikogu - hard to determine
            #     automatically
            # 2 = the first element of the keyword is a name in direct order
            ind1 = "2"
        # NOTE(review): nesting reconstructed from flattened text; this
        # check is assumed to apply to all SIERRA entity types - confirm.
        if not is_linked:
            ind2 = "4"
    elif entity_type in EMS_ENTITY_TYPES:
        ind2 = "4"
    return (ind1, ind2)
50
+
51
+
52
def format_keywords(flat_keywords: List[dict]) -> dict:
    """Format unlinked keywords for Kata CORE.

    Args:
        flat_keywords: Keyword dicts. The keys "entity_type", "keyword"
            and "language" are read; every key of the input dict is copied
            into the output record.

    Returns:
        Dict with two lists, both in input order:
            "keywords": one record per keyword whose "entity_type" is not
                in KEYWORD_TYPES_TO_IGNORE, built from default fields and
                then overlaid with the input dict.
            "other": the input dicts whose "entity_type" is in
                KEYWORD_TYPES_TO_IGNORE, unchanged.
    """
    ignored = []
    formatted = []

    for item in flat_keywords:
        keyword_type = item.get("entity_type")
        if keyword_type in KEYWORD_TYPES_TO_IGNORE:
            ignored.append(item)
            continue

        original_keyword = item.get("keyword")
        entity_type = KEYWORD_TYPE_MAP.get(keyword_type, "")

        ind1, ind2 = _find_indicators(
            entity_type=entity_type,
            entity=original_keyword,
            is_linked=False
        )
        keyword_source = _get_keyword_source(
            linked_doc=None,
            is_linked=False,
            entity_type=entity_type
        )

        record = {
            "dates": "",
            "indicator1": ind1,
            "indicator2": ind2,
            "is_linked": False,
            "keyword_source": keyword_source,
            "lang": item.get("language", ""),
            "location": "",
            "marc_field": KEYWORD_MARC_MAP.get(str(keyword_type), ""),
            "numeration": "",
            "organisation_sub_unit": "",
            "original_keyword": original_keyword,
            "persons_title": "",
            "url": "",
            "url_source": ""
        }
        # Input keys win over the defaults above when the names collide.
        record.update(item)
        formatted.append(record)

    return {
        "keywords": formatted,
        "other": ignored
    }
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: rara-tools
3
- Version: 0.6.13
3
+ Version: 0.6.14
4
4
  Summary: Tools to support Kata's work.
5
5
  Classifier: Programming Language :: Python :: 3
6
6
  Classifier: Programming Language :: Python :: 3.10
@@ -3,6 +3,7 @@ rara_tools/decorators.py,sha256=MjOyvZ5nTkwxwx2JLFEGpKKBysvecFw6EN6UDrSvZLU,2187
3
3
  rara_tools/digar_schema_converter.py,sha256=k95U2iRlEA3sh772-v6snhHW6fju6qSTMnvWJ6DpzZk,14254
4
4
  rara_tools/elastic.py,sha256=7HvDmFKpQbGnnzYyiCKOg0uvubnv2TpCASRrnPP8DcQ,13540
5
5
  rara_tools/exceptions.py,sha256=YQyaueUbXeTkJYFDEuN6iWTXMI3eCv5l7PxGp87vg5I,550
6
+ rara_tools/formatters.py,sha256=LTliadjIPZTO4s-44NsumaUdlQlEvqetvWz4bEvwf90,3418
6
7
  rara_tools/s3.py,sha256=9ziDXsLjBtFAvsjTPxFddhfvkpA8773rzPJqO7y1N5Q,6415
7
8
  rara_tools/task_reporter.py,sha256=WCcZts9dAUokPc4vbrG3-lNAFLnWaMgE3b3iaUB7mr8,3256
8
9
  rara_tools/utils.py,sha256=9vSbmuWYU5ydr4lXBKlUKa0xzDccFsaJv4T-XwgUfuY,2578
@@ -14,7 +15,7 @@ rara_tools/constants/linker.py,sha256=TQaigi7AUNOqmQPPz3hM8_xXgofrhoQ1taln79LhXQ
14
15
  rara_tools/constants/meta_extractor.py,sha256=adYH8cQqH0ZWYO7clGMiObclXRTGsxWgk3pC1oiHxHE,242
15
16
  rara_tools/constants/normalizers.py,sha256=Xs3anDwJHpHeniwx3xoIZyqdEXtO3eb7ouGLLr0CpHw,1344
16
17
  rara_tools/constants/parsers.py,sha256=L6nh1Itget9_9DMsliDkh6T25z78eMFPWVkbaU08DwU,5561
17
- rara_tools/constants/subject_indexer.py,sha256=C-Hi4fJ8YKyXB1L-hSKX0plw1ghMkpk61eDhFOqZw2c,250
18
+ rara_tools/constants/subject_indexer.py,sha256=E2D7pylH6Yey9h2TAvAWQiX5JtKKagsZx2E1Fz_afMI,1967
18
19
  rara_tools/normalizers/__init__.py,sha256=_NqpS5w710DhaURytHq9JpEt8HgYpSPfRDcOtOymJgE,193
19
20
  rara_tools/normalizers/authorities.py,sha256=IDtcm0yNZNhv1f-WcdqWFSRzZk_CoKuBFsk6hEPddWM,4513
20
21
  rara_tools/normalizers/base.py,sha256=LbS7Y7CEL-C-ynT-WPc-eCLkNeMO9BI9qtBm-W1skGM,11790
@@ -34,8 +35,8 @@ rara_tools/parsers/marc_records/title_record.py,sha256=XrtJ4gj7wzSaGxNaPtPuawmqq
34
35
  rara_tools/parsers/tools/entity_normalizers.py,sha256=VyCy_NowCLpOsL0luQ55IW-Qi-J5oBH0Ofzr7HRFBhM,8949
35
36
  rara_tools/parsers/tools/marc_converter.py,sha256=LgSHe-7n7aiDrw2bnsB53r3fXTRFjZXTwBYfTpL0pfs,415
36
37
  rara_tools/parsers/tools/russian_transliterator.py,sha256=5ZU66iTqAhr7pmfVqXPAI_cidF43VqqmuN4d7H4_JuA,9770
37
- rara_tools-0.6.13.dist-info/licenses/LICENSE.md,sha256=hkZVnIZll7e_KNEQzeY94Y9tlzVL8iVZBTMBvDykksU,35142
38
- rara_tools-0.6.13.dist-info/METADATA,sha256=P_y0fL650yYO1aN0Db2d_2pq93g99ytU8_pNq6kl0iY,4080
39
- rara_tools-0.6.13.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
40
- rara_tools-0.6.13.dist-info/top_level.txt,sha256=JwfB5b8BAtW5OFKRln2AQ_WElTRyIBM4nO0FKN1cupY,11
41
- rara_tools-0.6.13.dist-info/RECORD,,
38
+ rara_tools-0.6.14.dist-info/licenses/LICENSE.md,sha256=hkZVnIZll7e_KNEQzeY94Y9tlzVL8iVZBTMBvDykksU,35142
39
+ rara_tools-0.6.14.dist-info/METADATA,sha256=kGpk4MmwncYdzkqpFT64vu9ZgI_oQj4hOzAb6NzpyJs,4080
40
+ rara_tools-0.6.14.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
41
+ rara_tools-0.6.14.dist-info/top_level.txt,sha256=JwfB5b8BAtW5OFKRln2AQ_WElTRyIBM4nO0FKN1cupY,11
42
+ rara_tools-0.6.14.dist-info/RECORD,,