rara-tools 0.7.3__py3-none-any.whl → 0.7.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of rara-tools might be problematic. Click here for more details.

@@ -26,3 +26,15 @@ class Queue:
26
26
  class Tasks:
27
27
  START_DIGITIZER_PIPELINE = "start_digitizer_pipeline"
28
28
  PURGE_MODELS = "purge_unused_digitizer_models"
29
+
30
+
31
+ class Error:
32
+ NO_SPACE = "Disk out of space!"
33
+ COULDNT_DOWNLOAD = "Unknown error when downloading model!"
34
+ UNKNOWN = "Unknown system error!"
35
+ S3_CONNECTION = "Failed to connect to S3!"
36
+ UNSUPPORTED_FILETYPE = "Unsupported file type!"
37
+ COULDNT_UPLOAD = "Could not upload documents to Elasticsearch!"
38
+ FILE_IS_PROTECTED = "File is password protected or encrypted!"
39
+ UNKNOWN_OCR = "Unknown error when applying ocr!"
40
+ CUSTOM_MODEL_ERROR = "Couldn't download custom image classification model!"
@@ -1,5 +1,6 @@
1
1
  import logging
2
- from rara_tools.constants.normalizers import EntityType, VIAF_ENTITY_MAP
2
+
3
+ from rara_tools.constants.normalizers import EntityType
3
4
 
4
5
  COMPONENT_KEY = "linker"
5
6
 
@@ -24,11 +25,13 @@ class StatusKeys:
24
25
  VECTORIZE_CONTEXT = "vectorize_context"
25
26
  LINK_KEYWORDS = "link_keywords"
26
27
 
28
+
27
29
  class URLSource:
28
30
  VIAF = "VIAF"
29
31
  SIERRA = "Sierra"
30
32
  EMS = "EMS"
31
33
 
34
+
32
35
  class KeywordType:
33
36
  LOC = "Kohamärksõnad"
34
37
  TIME = "Ajamärksõnad"
@@ -54,17 +57,24 @@ class KeywordMARC:
54
57
  TITLE = 630
55
58
  TITLE_LINKED = 600
56
59
 
60
+
57
61
  class KeywordSource:
58
62
  EMS = "EMS"
59
63
  SIERRA = "SIERRA"
60
64
  VIAF = "VIAF"
61
65
  AI = "AI"
62
66
 
67
+
63
68
  class Filters:
64
69
  AUTHOR = "author"
65
70
  YEAR = "year"
66
71
 
67
72
 
73
+ class Error:
74
+ VECTORIZATION = "Failed to vectorize text!"
75
+ LINKING_KEYWORDS = "Failed to link keywords!"
76
+ LINKING_META = "Failed to link meta!"
77
+
68
78
  UNLINKED_KEYWORD_MARC_FIELD = 693
69
79
 
70
80
  ALLOWED_FILTERS_MAP = {
@@ -110,7 +120,6 @@ ALLOWED_ENTITY_TYPES = [
110
120
  EntityType.UNK,
111
121
  ]
112
122
 
113
-
114
123
  KEYWORD_TYPE_MAP = {
115
124
  KeywordType.TIME: EntityType.KEYWORD,
116
125
  KeywordType.GENRE: EntityType.KEYWORD,
@@ -1,3 +1,5 @@
1
+ from dataclasses import dataclass
2
+
1
3
  COMPONENT_KEY = "meta_extractor"
2
4
 
3
5
 
@@ -11,4 +13,32 @@ class Queue:
11
13
 
12
14
 
13
15
  class StatusKeys:
14
- EXTRACT_METADATA = "extract_metadata"
16
+ EXTRACT_METADATA = "extract_metadata"
17
+
18
+ class Error:
19
+ UNKNOWN = "Failed to extract meta information from digitizer output!"
20
+
21
+
22
+ @dataclass(frozen=True)
23
+ class TitleType:
24
+ AUTHOR_WITHOUT_TITLE: str = "pealkirjata autor"
25
+ NORMALIZED_TITLE: str = "normitud eelispealkiri"
26
+ TITLE: str = "väljaandes esitatud kujul põhipealkiri"
27
+ PARALLEL_TITLE: str = "rööppealkiri"
28
+ ADDITIONAL_TITLE: str = "alampealkiri"
29
+ METS_TITLE: str = "väljaandes esitatud kujul põhipealkiri"
30
+ ANON: str = "anonüümne väljaanne"
31
+
32
+
33
+ TITLE_TYPES_MAP = {
34
+ TitleType.AUTHOR_WITHOUT_TITLE: 130,
35
+ TitleType.NORMALIZED_TITLE: 240,
36
+ TitleType.TITLE: 245,
37
+ TitleType.PARALLEL_TITLE: 246,
38
+ TitleType.ADDITIONAL_TITLE: 245,
39
+ TitleType.METS_TITLE: 245,
40
+ TitleType.ANON: 130
41
+ }
42
+
43
+
44
+ PUBLISHER_KEY = "Väljaandja"
@@ -18,6 +18,10 @@ class StatusKeys:
18
18
  EXTRACT_KEYWORDS = "extract_keywords"
19
19
 
20
20
 
21
+ class Error:
22
+ UNKNOWN = "Could not extract keywords from text!"
23
+
24
+
21
25
  class URLSource:
22
26
  VIAF = "VIAF"
23
27
  SIERRA = "Sierra"
@@ -2,6 +2,10 @@ from typing import List, Tuple, Any
2
2
  from rara_tools.core_formatters.formatted_keyword import FormattedKeyword
3
3
  from rara_tools.core_formatters.formatted_meta import FormattedAuthor
4
4
  from rara_tools.constants.linker import MAIN_TAXONOMY_LANG, KEYWORD_TYPES_TO_IGNORE, EntityType
5
+ from rara_tools.constants.meta_extractor import TitleType, TITLE_TYPES_MAP, PUBLISHER_KEY
6
+ from rara_tools.constants.subject_indexer import KeywordType
7
+
8
+ import regex as re
5
9
 
6
10
  def get_primary_author(authors: List[dict]) -> str:
7
11
  primary_author = ""
@@ -10,8 +14,19 @@ def get_primary_author(authors: List[dict]) -> str:
10
14
  primary_author = author.get("name", "")
11
15
  return primary_author
12
16
 
13
- def format_authors(authors: List[dict]) -> List[dict]:
17
+ def is_valid_keyword(keyword: str) -> bool:
18
+ # If keywords contains ONLY punctuation
19
+ # characters, we assume it`s not valid
20
+ if re.search(r"^(\W|_)+$", keyword):
21
+ return False
22
+ return True
23
+
24
+ def format_series_info(series: str):
25
+ pass
26
+
27
+ def format_authors(authors: List[dict]) -> Tuple[List[dict], dict]:
14
28
  formatted_authors = []
29
+ publisher = {}
15
30
  for author in authors:
16
31
  entity_type = author.get("type", EntityType.UNK)
17
32
 
@@ -20,8 +35,14 @@ def format_authors(authors: List[dict]) -> List[dict]:
20
35
  linked_doc=None,
21
36
  entity_type=entity_type
22
37
  ).to_dict()
38
+
39
+ # If author role == publisher, do not add it as an author
40
+ if formatted_author.get("author_role", "") == PUBLISHER_KEY:
41
+ publisher = formatted_author
42
+ continue
43
+
23
44
  formatted_authors.append(formatted_author)
24
- return formatted_authors
45
+ return (formatted_authors, publisher)
25
46
 
26
47
  def format_sections(sections: List[dict]) -> List[dict]:
27
48
  for section in sections:
@@ -31,9 +52,16 @@ def format_sections(sections: List[dict]) -> List[dict]:
31
52
  if primary_author:
32
53
  for title in titles:
33
54
  title["author_from_title"] = primary_author
55
+ if not authors:
56
+ for title in titles:
57
+ title["title_type"] = TitleType.ANON
58
+ title["title_type_int"] = TITLE_TYPES_MAP.get(TitleType.ANON)
34
59
  section["titles"] = titles
35
60
 
36
- formatted_authors = format_authors(authors)
61
+ # Extract publisher, but do nothing with it
62
+ # as it is unlikely for the publishing info to be
63
+ # in a METS/ALTO section. Can update it, if proven otherwise
64
+ formatted_authors, publisher = format_authors(authors)
37
65
  section["authors"] = formatted_authors
38
66
 
39
67
  return sections
@@ -46,14 +74,25 @@ def format_meta(meta: dict) -> dict:
46
74
 
47
75
  authors = meta_to_format.pop("authors", [])
48
76
  sections = meta_to_format.pop("sections", [])
77
+ titles = meta_to_format.pop("titles", [])
49
78
 
50
- formatted_authors = format_authors(authors)
79
+ formatted_authors, publisher = format_authors(authors)
51
80
  formatted_sections = format_sections(sections)
52
81
 
53
82
  if sections and formatted_sections:
54
83
  meta_to_format["sections"] = formatted_sections
55
84
  if authors and formatted_authors:
56
85
  meta_to_format["authors"] = formatted_authors
86
+ if titles and not authors:
87
+ for title in titles:
88
+ title["title_type"] = TitleType.ANON
89
+ title["title_type_int"] = TITLE_TYPES_MAP.get(TitleType.ANON)
90
+ meta_to_format["titles"] = titles
91
+
92
+ if publisher:
93
+ # Not sure, if it would be better to add original name or
94
+ # linked value. Currently adding original for safety
95
+ meta_to_format["publisher"] = publisher.get("original_name")
57
96
 
58
97
  meta["meta"] = meta_to_format
59
98
 
@@ -81,6 +120,30 @@ def format_keywords(flat_keywords: List[dict]) -> List[dict]:
81
120
  linked_doc=None,
82
121
  main_taxnomy_lang=MAIN_TAXONOMY_LANG
83
122
  ).to_dict()
84
- formatted_keywords.append(formatted_keyword)
123
+ if is_valid_keyword(formatted_keyword.get("keyword")):
124
+ formatted_keywords.append(formatted_keyword)
85
125
 
86
126
  return formatted_keywords
127
+
128
+ def get_udk072(flat_keywords: List[dict]) -> List[str]:
129
+ """ Filters out UDK from flat subject indexer output.
130
+ """
131
+ # keyword type: UDK
132
+ udk072 = [
133
+ keyword.get("keyword")
134
+ for keyword in flat_keywords
135
+ if keyword.get("entity_type") == KeywordType.UDK
136
+ ]
137
+ return udk072
138
+
139
+
140
+ def get_udk080(flat_keywords: List[dict]) -> List[str]:
141
+ """ Filters out UDC from flat subject indexer output.
142
+ """
143
+ # keyword type: UDC
144
+ udk080 = [
145
+ keyword.get("keyword")
146
+ for keyword in flat_keywords
147
+ if keyword.get("entity_type") == KeywordType.UDC
148
+ ]
149
+ return udk080
@@ -11,7 +11,7 @@ class FormattedTitle(FormattedObject):
11
11
  super().__init__(
12
12
  object_dict=object_dict,
13
13
  linked_doc=linked_doc,
14
- original_entity_key="name"
14
+ original_entity_key="title"
15
15
  )
16
16
 
17
17
 
@@ -41,17 +41,16 @@ class FormattedAuthor(FormattedObject):
41
41
  #self.__standardized_uri: str = ""
42
42
  self.__viaf_id: str = ""
43
43
 
44
+ self._default_author_type: str = EntityType.PER
45
+
44
46
 
45
47
  @property
46
48
  def primary_author_type(self) -> str:
47
49
  if self.__primary_author_type == None:
48
- if self.is_primary:
49
- if self.entity_type != EntityType.UNK:
50
+ self.__primary_author_type = self._default_author_type
51
+ if self.entity_type != EntityType.UNK:
52
+ if self.entity_type in [EntityType.ORG, EntityType.PER]:
50
53
  self.__primary_author_type = self.entity_type
51
- else:
52
- self.__primary_author_type = EntityType.PER
53
- else:
54
- self.__primary_author_type = ""
55
54
  return self.__primary_author_type
56
55
 
57
56
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: rara-tools
3
- Version: 0.7.3
3
+ Version: 0.7.5
4
4
  Summary: Tools to support Kata's work.
5
5
  Classifier: Programming Language :: Python :: 3
6
6
  Classifier: Programming Language :: Python :: 3.10
@@ -7,17 +7,17 @@ rara_tools/s3.py,sha256=9ziDXsLjBtFAvsjTPxFddhfvkpA8773rzPJqO7y1N5Q,6415
7
7
  rara_tools/task_reporter.py,sha256=WCcZts9dAUokPc4vbrG3-lNAFLnWaMgE3b3iaUB7mr8,3256
8
8
  rara_tools/utils.py,sha256=1UrxOzo3cxe4juMkDlKWv1VKWMYay5v1pivGci1ajiM,3003
9
9
  rara_tools/constants/__init__.py,sha256=r78laM9vyRDAvzDhPvzDlhaX6qPwUUBBtwf1WosrW3o,27
10
- rara_tools/constants/digitizer.py,sha256=9aQkJj8C5a_HLgCayrz3PpGYJMLoO4Ph9_U28Q-C1T4,633
10
+ rara_tools/constants/digitizer.py,sha256=f7VQGIYXd-MoPMi-iMgENsLWR_AA6koy0_jZct4HzVc,1152
11
11
  rara_tools/constants/general.py,sha256=dLomRopLiHv_J_liSIGzK1A3XByydsKGIyVN8KuuN98,1191
12
12
  rara_tools/constants/language_evaluator.py,sha256=3sCSaoS-zXQRY0vJ7UUMuZqbtYQD_quVVbdpgvJjE7I,124
13
- rara_tools/constants/linker.py,sha256=yBN9NpUhB3ENz8BapoIfpSHY_xNqwYdqutgQFdc_Cd8,3240
14
- rara_tools/constants/meta_extractor.py,sha256=adYH8cQqH0ZWYO7clGMiObclXRTGsxWgk3pC1oiHxHE,242
13
+ rara_tools/constants/linker.py,sha256=WnOmJFTkoBMZUbBaW1uY45NTQB7FGG-dc9a_6qYTtwk,3381
14
+ rara_tools/constants/meta_extractor.py,sha256=iVyxycKScbrjFWLv50dRmdeHfTLOKbdyEhgUF3DyBrY,1053
15
15
  rara_tools/constants/normalizers.py,sha256=Xs3anDwJHpHeniwx3xoIZyqdEXtO3eb7ouGLLr0CpHw,1344
16
16
  rara_tools/constants/parsers.py,sha256=L6nh1Itget9_9DMsliDkh6T25z78eMFPWVkbaU08DwU,5561
17
- rara_tools/constants/subject_indexer.py,sha256=i0xRdqwasyb6d6WZZKPgyuEUd2JeO_qwWYoG6UeBo5U,2064
18
- rara_tools/core_formatters/core_formatter.py,sha256=HJX7jOi9kaFie_zm0Wzjk0nKF8dRleJpVWbCplFFquo,2760
17
+ rara_tools/constants/subject_indexer.py,sha256=0snyyB8IMCWXOYPXR_c0Kavq4nBiww559rdNOKjawx8,2133
18
+ rara_tools/core_formatters/core_formatter.py,sha256=u_Cdgv9qBcyF-XddjaRGUqAFik9OMAdSzAulXpYR7vE,4997
19
19
  rara_tools/core_formatters/formatted_keyword.py,sha256=1-B9IQTycFt69pTy8WZNnfJ2WIMRow3kpEub6igyNQc,7865
20
- rara_tools/core_formatters/formatted_meta.py,sha256=Zd0oQFLbn6m_wHaWtgxBsu9J7wGyWIpZxb2-8PrR3Wk,5240
20
+ rara_tools/core_formatters/formatted_meta.py,sha256=r0RPG4eM-REPIR1DrIJnvYPQtQrzkgdvX9tvhNWjQ0Y,5250
21
21
  rara_tools/core_formatters/formatted_object.py,sha256=7a499ZmcZXOqtlwxDi6FWHWF5a6HdCsduS22wV3uHIE,5656
22
22
  rara_tools/normalizers/__init__.py,sha256=_NqpS5w710DhaURytHq9JpEt8HgYpSPfRDcOtOymJgE,193
23
23
  rara_tools/normalizers/authorities.py,sha256=IDtcm0yNZNhv1f-WcdqWFSRzZk_CoKuBFsk6hEPddWM,4513
@@ -39,8 +39,8 @@ rara_tools/parsers/marc_records/title_record.py,sha256=XrtJ4gj7wzSaGxNaPtPuawmqq
39
39
  rara_tools/parsers/tools/entity_normalizers.py,sha256=VyCy_NowCLpOsL0luQ55IW-Qi-J5oBH0Ofzr7HRFBhM,8949
40
40
  rara_tools/parsers/tools/marc_converter.py,sha256=LgSHe-7n7aiDrw2bnsB53r3fXTRFjZXTwBYfTpL0pfs,415
41
41
  rara_tools/parsers/tools/russian_transliterator.py,sha256=5ZU66iTqAhr7pmfVqXPAI_cidF43VqqmuN4d7H4_JuA,9770
42
- rara_tools-0.7.3.dist-info/licenses/LICENSE.md,sha256=hkZVnIZll7e_KNEQzeY94Y9tlzVL8iVZBTMBvDykksU,35142
43
- rara_tools-0.7.3.dist-info/METADATA,sha256=D6RNdyg7JAga9gYCrL6hGP1TRN45OpqccD-UkprgVdI,4079
44
- rara_tools-0.7.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
45
- rara_tools-0.7.3.dist-info/top_level.txt,sha256=JwfB5b8BAtW5OFKRln2AQ_WElTRyIBM4nO0FKN1cupY,11
46
- rara_tools-0.7.3.dist-info/RECORD,,
42
+ rara_tools-0.7.5.dist-info/licenses/LICENSE.md,sha256=hkZVnIZll7e_KNEQzeY94Y9tlzVL8iVZBTMBvDykksU,35142
43
+ rara_tools-0.7.5.dist-info/METADATA,sha256=UsHxND7IUSmGYDbzmHWjihiPrbqxefcSHyGy8EvpnFY,4079
44
+ rara_tools-0.7.5.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
45
+ rara_tools-0.7.5.dist-info/top_level.txt,sha256=JwfB5b8BAtW5OFKRln2AQ_WElTRyIBM4nO0FKN1cupY,11
46
+ rara_tools-0.7.5.dist-info/RECORD,,