rara-tools 0.7.4__py3-none-any.whl → 0.7.6__py3-none-any.whl

This diff shows the changes between two publicly available versions of the package, each of which has been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between those package versions as they appear in their respective public registries.

Potentially problematic release.


This version of rara-tools might be problematic. Click here for more details.

@@ -1,3 +1,5 @@
1
+ from dataclasses import dataclass
2
+
1
3
  COMPONENT_KEY = "meta_extractor"
2
4
 
3
5
 
@@ -13,6 +15,30 @@ class Queue:
13
15
  class StatusKeys:
14
16
  EXTRACT_METADATA = "extract_metadata"
15
17
 
16
-
17
18
  class Error:
18
- UNKNOWN = "Failed to extract meta information from digitizer output!"
19
+ UNKNOWN = "Failed to extract meta information from digitizer output!"
20
+
21
+
22
+ @dataclass(frozen=True)
23
+ class TitleType:
24
+ AUTHOR_WITHOUT_TITLE: str = "pealkirjata autor"
25
+ NORMALIZED_TITLE: str = "normitud eelispealkiri"
26
+ TITLE: str = "väljaandes esitatud kujul põhipealkiri"
27
+ PARALLEL_TITLE: str = "rööppealkiri"
28
+ ADDITIONAL_TITLE: str = "alampealkiri"
29
+ METS_TITLE: str = "väljaandes esitatud kujul põhipealkiri"
30
+ ANON: str = "anonüümne väljaanne"
31
+
32
+
33
+ TITLE_TYPES_MAP = {
34
+ TitleType.AUTHOR_WITHOUT_TITLE: 130,
35
+ TitleType.NORMALIZED_TITLE: 240,
36
+ TitleType.TITLE: 245,
37
+ TitleType.PARALLEL_TITLE: 246,
38
+ TitleType.ADDITIONAL_TITLE: 245,
39
+ TitleType.METS_TITLE: 245,
40
+ TitleType.ANON: 130
41
+ }
42
+
43
+
44
+ PUBLISHER_KEY = "Väljaandja"
@@ -2,6 +2,10 @@ from typing import List, Tuple, Any
2
2
  from rara_tools.core_formatters.formatted_keyword import FormattedKeyword
3
3
  from rara_tools.core_formatters.formatted_meta import FormattedAuthor
4
4
  from rara_tools.constants.linker import MAIN_TAXONOMY_LANG, KEYWORD_TYPES_TO_IGNORE, EntityType
5
+ from rara_tools.constants.meta_extractor import TitleType, TITLE_TYPES_MAP, PUBLISHER_KEY
6
+ from rara_tools.constants.subject_indexer import KeywordType
7
+
8
+ import regex as re
5
9
 
6
10
  def get_primary_author(authors: List[dict]) -> str:
7
11
  primary_author = ""
@@ -10,8 +14,19 @@ def get_primary_author(authors: List[dict]) -> str:
10
14
  primary_author = author.get("name", "")
11
15
  return primary_author
12
16
 
13
- def format_authors(authors: List[dict]) -> List[dict]:
17
+ def is_valid_keyword(keyword: str) -> bool:
18
+ # If keywords contains ONLY punctuation
19
+ # characters, we assume it`s not valid
20
+ if re.search(r"^(\W|_)+$", keyword):
21
+ return False
22
+ return True
23
+
24
+ def format_series_info(series: str):
25
+ pass
26
+
27
+ def format_authors(authors: List[dict]) -> Tuple[List[dict], dict]:
14
28
  formatted_authors = []
29
+ publisher = {}
15
30
  for author in authors:
16
31
  entity_type = author.get("type", EntityType.UNK)
17
32
 
@@ -20,8 +35,14 @@ def format_authors(authors: List[dict]) -> List[dict]:
20
35
  linked_doc=None,
21
36
  entity_type=entity_type
22
37
  ).to_dict()
38
+
39
+ # If author role == publisher, do not add it as an author
40
+ if formatted_author.get("author_role", "") == PUBLISHER_KEY:
41
+ publisher = formatted_author
42
+ continue
43
+
23
44
  formatted_authors.append(formatted_author)
24
- return formatted_authors
45
+ return (formatted_authors, publisher)
25
46
 
26
47
  def format_sections(sections: List[dict]) -> List[dict]:
27
48
  for section in sections:
@@ -31,9 +52,16 @@ def format_sections(sections: List[dict]) -> List[dict]:
31
52
  if primary_author:
32
53
  for title in titles:
33
54
  title["author_from_title"] = primary_author
55
+ if not authors:
56
+ for title in titles:
57
+ title["title_type"] = TitleType.ANON
58
+ title["title_type_int"] = TITLE_TYPES_MAP.get(TitleType.ANON)
34
59
  section["titles"] = titles
35
60
 
36
- formatted_authors = format_authors(authors)
61
+ # Extract publisher, but do nothing with it
62
+ # as it is unlikely for the publishing info to be
63
+ # in a METS/ALTO section. Can update it, if proven otherwise
64
+ formatted_authors, publisher = format_authors(authors)
37
65
  section["authors"] = formatted_authors
38
66
 
39
67
  return sections
@@ -46,14 +74,25 @@ def format_meta(meta: dict) -> dict:
46
74
 
47
75
  authors = meta_to_format.pop("authors", [])
48
76
  sections = meta_to_format.pop("sections", [])
77
+ titles = meta_to_format.pop("titles", [])
49
78
 
50
- formatted_authors = format_authors(authors)
79
+ formatted_authors, publisher = format_authors(authors)
51
80
  formatted_sections = format_sections(sections)
52
81
 
53
82
  if sections and formatted_sections:
54
83
  meta_to_format["sections"] = formatted_sections
55
84
  if authors and formatted_authors:
56
85
  meta_to_format["authors"] = formatted_authors
86
+ if titles and not authors:
87
+ for title in titles:
88
+ title["title_type"] = TitleType.ANON
89
+ title["title_type_int"] = TITLE_TYPES_MAP.get(TitleType.ANON)
90
+ meta_to_format["titles"] = titles
91
+
92
+ if publisher:
93
+ # Not sure, if it would be better to add original name or
94
+ # linked value. Currently adding original for safety
95
+ meta_to_format["publisher"] = publisher.get("original_name")
57
96
 
58
97
  meta["meta"] = meta_to_format
59
98
 
@@ -81,6 +120,30 @@ def format_keywords(flat_keywords: List[dict]) -> List[dict]:
81
120
  linked_doc=None,
82
121
  main_taxnomy_lang=MAIN_TAXONOMY_LANG
83
122
  ).to_dict()
84
- formatted_keywords.append(formatted_keyword)
123
+ if is_valid_keyword(formatted_keyword.get("keyword")):
124
+ formatted_keywords.append(formatted_keyword)
85
125
 
86
126
  return formatted_keywords
127
+
128
+ def get_udk072(flat_keywords: List[dict]) -> List[str]:
129
+ """ Filters out UDK from flat subject indexer output.
130
+ """
131
+ # keyword type: UDK
132
+ udk072 = [
133
+ keyword.get("keyword")
134
+ for keyword in flat_keywords
135
+ if keyword.get("entity_type") == KeywordType.UDK
136
+ ]
137
+ return udk072
138
+
139
+
140
+ def get_udk080(flat_keywords: List[dict]) -> List[str]:
141
+ """ Filters out UDC from flat subject indexer output.
142
+ """
143
+ # keyword type: UDC
144
+ udk080 = [
145
+ keyword.get("keyword")
146
+ for keyword in flat_keywords
147
+ if keyword.get("entity_type") == KeywordType.UDC
148
+ ]
149
+ return udk080
@@ -25,6 +25,7 @@ class FormattedKeyword(FormattedObject):
25
25
  self.method: str = self.object_dict.get("method")
26
26
  self.model_arch: str = self.object_dict.get("model_arch", self.method)
27
27
  self.keyword_type: str = self.object_dict.get("entity_type")
28
+ self.article_id: str | None = self.object_dict.get("article_id", None)
28
29
 
29
30
  self.entity_type: str = KEYWORD_TYPE_MAP.get(self.keyword_type, "")
30
31
  self.url_source_map: dict = url_source_map
@@ -39,6 +40,7 @@ class FormattedKeyword(FormattedObject):
39
40
  self.__language: str = ""
40
41
  self.__author: str | None = None
41
42
 
43
+
42
44
  @property
43
45
  def keyword(self) -> str:
44
46
  return self.entity
@@ -139,6 +141,8 @@ class FormattedKeyword(FormattedObject):
139
141
  #self.__author = self.viaf_info.get
140
142
  return self.__author
141
143
 
144
+
145
+
142
146
  def _get_url_info(self) -> dict:
143
147
  """ Finds URL identifier from LinkedDoc based on
144
148
  given entity type.
@@ -224,6 +228,7 @@ class FormattedKeyword(FormattedObject):
224
228
  "score": self.score,
225
229
  "url": self.url,
226
230
  "url_source": self.url_source,
227
- "author": self.author
231
+ "author": self.author,
232
+ "article_id": self.article_id
228
233
  }
229
234
  return keyword_dict
@@ -11,7 +11,7 @@ class FormattedTitle(FormattedObject):
11
11
  super().__init__(
12
12
  object_dict=object_dict,
13
13
  linked_doc=linked_doc,
14
- original_entity_key="name"
14
+ original_entity_key="title"
15
15
  )
16
16
 
17
17
 
@@ -41,17 +41,16 @@ class FormattedAuthor(FormattedObject):
41
41
  #self.__standardized_uri: str = ""
42
42
  self.__viaf_id: str = ""
43
43
 
44
+ self._default_author_type: str = EntityType.PER
45
+
44
46
 
45
47
  @property
46
48
  def primary_author_type(self) -> str:
47
49
  if self.__primary_author_type == None:
48
- if self.is_primary:
49
- if self.entity_type != EntityType.UNK:
50
+ self.__primary_author_type = self._default_author_type
51
+ if self.entity_type != EntityType.UNK:
52
+ if self.entity_type in [EntityType.ORG, EntityType.PER]:
50
53
  self.__primary_author_type = self.entity_type
51
- else:
52
- self.__primary_author_type = EntityType.PER
53
- else:
54
- self.__primary_author_type = ""
55
54
  return self.__primary_author_type
56
55
 
57
56
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: rara-tools
3
- Version: 0.7.4
3
+ Version: 0.7.6
4
4
  Summary: Tools to support Kata's work.
5
5
  Classifier: Programming Language :: Python :: 3
6
6
  Classifier: Programming Language :: Python :: 3.10
@@ -11,13 +11,13 @@ rara_tools/constants/digitizer.py,sha256=f7VQGIYXd-MoPMi-iMgENsLWR_AA6koy0_jZct4
11
11
  rara_tools/constants/general.py,sha256=dLomRopLiHv_J_liSIGzK1A3XByydsKGIyVN8KuuN98,1191
12
12
  rara_tools/constants/language_evaluator.py,sha256=3sCSaoS-zXQRY0vJ7UUMuZqbtYQD_quVVbdpgvJjE7I,124
13
13
  rara_tools/constants/linker.py,sha256=WnOmJFTkoBMZUbBaW1uY45NTQB7FGG-dc9a_6qYTtwk,3381
14
- rara_tools/constants/meta_extractor.py,sha256=Z5GFsQmru_OUgj1cAe-v9jOsxwPYZ3JT0CjIDo8C6ro,331
14
+ rara_tools/constants/meta_extractor.py,sha256=iVyxycKScbrjFWLv50dRmdeHfTLOKbdyEhgUF3DyBrY,1053
15
15
  rara_tools/constants/normalizers.py,sha256=Xs3anDwJHpHeniwx3xoIZyqdEXtO3eb7ouGLLr0CpHw,1344
16
16
  rara_tools/constants/parsers.py,sha256=L6nh1Itget9_9DMsliDkh6T25z78eMFPWVkbaU08DwU,5561
17
17
  rara_tools/constants/subject_indexer.py,sha256=0snyyB8IMCWXOYPXR_c0Kavq4nBiww559rdNOKjawx8,2133
18
- rara_tools/core_formatters/core_formatter.py,sha256=HJX7jOi9kaFie_zm0Wzjk0nKF8dRleJpVWbCplFFquo,2760
19
- rara_tools/core_formatters/formatted_keyword.py,sha256=1-B9IQTycFt69pTy8WZNnfJ2WIMRow3kpEub6igyNQc,7865
20
- rara_tools/core_formatters/formatted_meta.py,sha256=Zd0oQFLbn6m_wHaWtgxBsu9J7wGyWIpZxb2-8PrR3Wk,5240
18
+ rara_tools/core_formatters/core_formatter.py,sha256=u_Cdgv9qBcyF-XddjaRGUqAFik9OMAdSzAulXpYR7vE,4997
19
+ rara_tools/core_formatters/formatted_keyword.py,sha256=hhi6wh4ErFionjBqYsEeKGbf1CACF7c5hu2XPaZDidc,7990
20
+ rara_tools/core_formatters/formatted_meta.py,sha256=r0RPG4eM-REPIR1DrIJnvYPQtQrzkgdvX9tvhNWjQ0Y,5250
21
21
  rara_tools/core_formatters/formatted_object.py,sha256=7a499ZmcZXOqtlwxDi6FWHWF5a6HdCsduS22wV3uHIE,5656
22
22
  rara_tools/normalizers/__init__.py,sha256=_NqpS5w710DhaURytHq9JpEt8HgYpSPfRDcOtOymJgE,193
23
23
  rara_tools/normalizers/authorities.py,sha256=IDtcm0yNZNhv1f-WcdqWFSRzZk_CoKuBFsk6hEPddWM,4513
@@ -39,8 +39,8 @@ rara_tools/parsers/marc_records/title_record.py,sha256=XrtJ4gj7wzSaGxNaPtPuawmqq
39
39
  rara_tools/parsers/tools/entity_normalizers.py,sha256=VyCy_NowCLpOsL0luQ55IW-Qi-J5oBH0Ofzr7HRFBhM,8949
40
40
  rara_tools/parsers/tools/marc_converter.py,sha256=LgSHe-7n7aiDrw2bnsB53r3fXTRFjZXTwBYfTpL0pfs,415
41
41
  rara_tools/parsers/tools/russian_transliterator.py,sha256=5ZU66iTqAhr7pmfVqXPAI_cidF43VqqmuN4d7H4_JuA,9770
42
- rara_tools-0.7.4.dist-info/licenses/LICENSE.md,sha256=hkZVnIZll7e_KNEQzeY94Y9tlzVL8iVZBTMBvDykksU,35142
43
- rara_tools-0.7.4.dist-info/METADATA,sha256=OZ5OnDVf_aBcLWWEu_gr2hcc_-86FetOcfEyFeobyJ8,4079
44
- rara_tools-0.7.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
45
- rara_tools-0.7.4.dist-info/top_level.txt,sha256=JwfB5b8BAtW5OFKRln2AQ_WElTRyIBM4nO0FKN1cupY,11
46
- rara_tools-0.7.4.dist-info/RECORD,,
42
+ rara_tools-0.7.6.dist-info/licenses/LICENSE.md,sha256=hkZVnIZll7e_KNEQzeY94Y9tlzVL8iVZBTMBvDykksU,35142
43
+ rara_tools-0.7.6.dist-info/METADATA,sha256=DFO10PC3Toj5n2u6Lpd1Bfuv9PytQMHEbdi1CrLfu1w,4079
44
+ rara_tools-0.7.6.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
45
+ rara_tools-0.7.6.dist-info/top_level.txt,sha256=JwfB5b8BAtW5OFKRln2AQ_WElTRyIBM4nO0FKN1cupY,11
46
+ rara_tools-0.7.6.dist-info/RECORD,,