rara-tools 0.7.4__py3-none-any.whl → 0.7.5__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of rara-tools might be problematic. Click here for more details.

@@ -1,3 +1,5 @@
1
+ from dataclasses import dataclass
2
+
1
3
  COMPONENT_KEY = "meta_extractor"
2
4
 
3
5
 
@@ -13,6 +15,30 @@ class Queue:
13
15
  class StatusKeys:
14
16
  EXTRACT_METADATA = "extract_metadata"
15
17
 
16
-
17
18
  class Error:
18
- UNKNOWN = "Failed to extract meta information from digitizer output!"
19
+ UNKNOWN = "Failed to extract meta information from digitizer output!"
20
+
21
+
22
+ @dataclass(frozen=True)
23
+ class TitleType:
24
+ AUTHOR_WITHOUT_TITLE: str = "pealkirjata autor"
25
+ NORMALIZED_TITLE: str = "normitud eelispealkiri"
26
+ TITLE: str = "väljaandes esitatud kujul põhipealkiri"
27
+ PARALLEL_TITLE: str = "rööppealkiri"
28
+ ADDITIONAL_TITLE: str = "alampealkiri"
29
+ METS_TITLE: str = "väljaandes esitatud kujul põhipealkiri"
30
+ ANON: str = "anonüümne väljaanne"
31
+
32
+
33
+ TITLE_TYPES_MAP = {
34
+ TitleType.AUTHOR_WITHOUT_TITLE: 130,
35
+ TitleType.NORMALIZED_TITLE: 240,
36
+ TitleType.TITLE: 245,
37
+ TitleType.PARALLEL_TITLE: 246,
38
+ TitleType.ADDITIONAL_TITLE: 245,
39
+ TitleType.METS_TITLE: 245,
40
+ TitleType.ANON: 130
41
+ }
42
+
43
+
44
+ PUBLISHER_KEY = "Väljaandja"
@@ -2,6 +2,10 @@ from typing import List, Tuple, Any
2
2
  from rara_tools.core_formatters.formatted_keyword import FormattedKeyword
3
3
  from rara_tools.core_formatters.formatted_meta import FormattedAuthor
4
4
  from rara_tools.constants.linker import MAIN_TAXONOMY_LANG, KEYWORD_TYPES_TO_IGNORE, EntityType
5
+ from rara_tools.constants.meta_extractor import TitleType, TITLE_TYPES_MAP, PUBLISHER_KEY
6
+ from rara_tools.constants.subject_indexer import KeywordType
7
+
8
+ import regex as re
5
9
 
6
10
  def get_primary_author(authors: List[dict]) -> str:
7
11
  primary_author = ""
@@ -10,8 +14,19 @@ def get_primary_author(authors: List[dict]) -> str:
10
14
  primary_author = author.get("name", "")
11
15
  return primary_author
12
16
 
13
- def format_authors(authors: List[dict]) -> List[dict]:
17
+ def is_valid_keyword(keyword: str) -> bool:
18
+ # If keywords contains ONLY punctuation
19
+ # characters, we assume it`s not valid
20
+ if re.search(r"^(\W|_)+$", keyword):
21
+ return False
22
+ return True
23
+
24
+ def format_series_info(series: str):
25
+ pass
26
+
27
+ def format_authors(authors: List[dict]) -> Tuple[List[dict], dict]:
14
28
  formatted_authors = []
29
+ publisher = {}
15
30
  for author in authors:
16
31
  entity_type = author.get("type", EntityType.UNK)
17
32
 
@@ -20,8 +35,14 @@ def format_authors(authors: List[dict]) -> List[dict]:
20
35
  linked_doc=None,
21
36
  entity_type=entity_type
22
37
  ).to_dict()
38
+
39
+ # If author role == publisher, do not add it as an author
40
+ if formatted_author.get("author_role", "") == PUBLISHER_KEY:
41
+ publisher = formatted_author
42
+ continue
43
+
23
44
  formatted_authors.append(formatted_author)
24
- return formatted_authors
45
+ return (formatted_authors, publisher)
25
46
 
26
47
  def format_sections(sections: List[dict]) -> List[dict]:
27
48
  for section in sections:
@@ -31,9 +52,16 @@ def format_sections(sections: List[dict]) -> List[dict]:
31
52
  if primary_author:
32
53
  for title in titles:
33
54
  title["author_from_title"] = primary_author
55
+ if not authors:
56
+ for title in titles:
57
+ title["title_type"] = TitleType.ANON
58
+ title["title_type_int"] = TITLE_TYPES_MAP.get(TitleType.ANON)
34
59
  section["titles"] = titles
35
60
 
36
- formatted_authors = format_authors(authors)
61
+ # Extract publisher, but do nothing with it
62
+ # as it is unlikely for the publishing info to be
63
+ # in a METS/ALTO section. Can update it, if proven otherwise
64
+ formatted_authors, publisher = format_authors(authors)
37
65
  section["authors"] = formatted_authors
38
66
 
39
67
  return sections
@@ -46,14 +74,25 @@ def format_meta(meta: dict) -> dict:
46
74
 
47
75
  authors = meta_to_format.pop("authors", [])
48
76
  sections = meta_to_format.pop("sections", [])
77
+ titles = meta_to_format.pop("titles", [])
49
78
 
50
- formatted_authors = format_authors(authors)
79
+ formatted_authors, publisher = format_authors(authors)
51
80
  formatted_sections = format_sections(sections)
52
81
 
53
82
  if sections and formatted_sections:
54
83
  meta_to_format["sections"] = formatted_sections
55
84
  if authors and formatted_authors:
56
85
  meta_to_format["authors"] = formatted_authors
86
+ if titles and not authors:
87
+ for title in titles:
88
+ title["title_type"] = TitleType.ANON
89
+ title["title_type_int"] = TITLE_TYPES_MAP.get(TitleType.ANON)
90
+ meta_to_format["titles"] = titles
91
+
92
+ if publisher:
93
+ # Not sure, if it would be better to add original name or
94
+ # linked value. Currently adding original for safety
95
+ meta_to_format["publisher"] = publisher.get("original_name")
57
96
 
58
97
  meta["meta"] = meta_to_format
59
98
 
@@ -81,6 +120,30 @@ def format_keywords(flat_keywords: List[dict]) -> List[dict]:
81
120
  linked_doc=None,
82
121
  main_taxnomy_lang=MAIN_TAXONOMY_LANG
83
122
  ).to_dict()
84
- formatted_keywords.append(formatted_keyword)
123
+ if is_valid_keyword(formatted_keyword.get("keyword")):
124
+ formatted_keywords.append(formatted_keyword)
85
125
 
86
126
  return formatted_keywords
127
+
128
+ def get_udk072(flat_keywords: List[dict]) -> List[str]:
129
+ """ Filters out UDK from flat subject indexer output.
130
+ """
131
+ # keyword type: UDK
132
+ udk072 = [
133
+ keyword.get("keyword")
134
+ for keyword in flat_keywords
135
+ if keyword.get("entity_type") == KeywordType.UDK
136
+ ]
137
+ return udk072
138
+
139
+
140
+ def get_udk080(flat_keywords: List[dict]) -> List[str]:
141
+ """ Filters out UDC from flat subject indexer output.
142
+ """
143
+ # keyword type: UDC
144
+ udk080 = [
145
+ keyword.get("keyword")
146
+ for keyword in flat_keywords
147
+ if keyword.get("entity_type") == KeywordType.UDC
148
+ ]
149
+ return udk080
@@ -11,7 +11,7 @@ class FormattedTitle(FormattedObject):
11
11
  super().__init__(
12
12
  object_dict=object_dict,
13
13
  linked_doc=linked_doc,
14
- original_entity_key="name"
14
+ original_entity_key="title"
15
15
  )
16
16
 
17
17
 
@@ -41,17 +41,16 @@ class FormattedAuthor(FormattedObject):
41
41
  #self.__standardized_uri: str = ""
42
42
  self.__viaf_id: str = ""
43
43
 
44
+ self._default_author_type: str = EntityType.PER
45
+
44
46
 
45
47
  @property
46
48
  def primary_author_type(self) -> str:
47
49
  if self.__primary_author_type == None:
48
- if self.is_primary:
49
- if self.entity_type != EntityType.UNK:
50
+ self.__primary_author_type = self._default_author_type
51
+ if self.entity_type != EntityType.UNK:
52
+ if self.entity_type in [EntityType.ORG, EntityType.PER]:
50
53
  self.__primary_author_type = self.entity_type
51
- else:
52
- self.__primary_author_type = EntityType.PER
53
- else:
54
- self.__primary_author_type = ""
55
54
  return self.__primary_author_type
56
55
 
57
56
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: rara-tools
3
- Version: 0.7.4
3
+ Version: 0.7.5
4
4
  Summary: Tools to support Kata's work.
5
5
  Classifier: Programming Language :: Python :: 3
6
6
  Classifier: Programming Language :: Python :: 3.10
@@ -11,13 +11,13 @@ rara_tools/constants/digitizer.py,sha256=f7VQGIYXd-MoPMi-iMgENsLWR_AA6koy0_jZct4
11
11
  rara_tools/constants/general.py,sha256=dLomRopLiHv_J_liSIGzK1A3XByydsKGIyVN8KuuN98,1191
12
12
  rara_tools/constants/language_evaluator.py,sha256=3sCSaoS-zXQRY0vJ7UUMuZqbtYQD_quVVbdpgvJjE7I,124
13
13
  rara_tools/constants/linker.py,sha256=WnOmJFTkoBMZUbBaW1uY45NTQB7FGG-dc9a_6qYTtwk,3381
14
- rara_tools/constants/meta_extractor.py,sha256=Z5GFsQmru_OUgj1cAe-v9jOsxwPYZ3JT0CjIDo8C6ro,331
14
+ rara_tools/constants/meta_extractor.py,sha256=iVyxycKScbrjFWLv50dRmdeHfTLOKbdyEhgUF3DyBrY,1053
15
15
  rara_tools/constants/normalizers.py,sha256=Xs3anDwJHpHeniwx3xoIZyqdEXtO3eb7ouGLLr0CpHw,1344
16
16
  rara_tools/constants/parsers.py,sha256=L6nh1Itget9_9DMsliDkh6T25z78eMFPWVkbaU08DwU,5561
17
17
  rara_tools/constants/subject_indexer.py,sha256=0snyyB8IMCWXOYPXR_c0Kavq4nBiww559rdNOKjawx8,2133
18
- rara_tools/core_formatters/core_formatter.py,sha256=HJX7jOi9kaFie_zm0Wzjk0nKF8dRleJpVWbCplFFquo,2760
18
+ rara_tools/core_formatters/core_formatter.py,sha256=u_Cdgv9qBcyF-XddjaRGUqAFik9OMAdSzAulXpYR7vE,4997
19
19
  rara_tools/core_formatters/formatted_keyword.py,sha256=1-B9IQTycFt69pTy8WZNnfJ2WIMRow3kpEub6igyNQc,7865
20
- rara_tools/core_formatters/formatted_meta.py,sha256=Zd0oQFLbn6m_wHaWtgxBsu9J7wGyWIpZxb2-8PrR3Wk,5240
20
+ rara_tools/core_formatters/formatted_meta.py,sha256=r0RPG4eM-REPIR1DrIJnvYPQtQrzkgdvX9tvhNWjQ0Y,5250
21
21
  rara_tools/core_formatters/formatted_object.py,sha256=7a499ZmcZXOqtlwxDi6FWHWF5a6HdCsduS22wV3uHIE,5656
22
22
  rara_tools/normalizers/__init__.py,sha256=_NqpS5w710DhaURytHq9JpEt8HgYpSPfRDcOtOymJgE,193
23
23
  rara_tools/normalizers/authorities.py,sha256=IDtcm0yNZNhv1f-WcdqWFSRzZk_CoKuBFsk6hEPddWM,4513
@@ -39,8 +39,8 @@ rara_tools/parsers/marc_records/title_record.py,sha256=XrtJ4gj7wzSaGxNaPtPuawmqq
39
39
  rara_tools/parsers/tools/entity_normalizers.py,sha256=VyCy_NowCLpOsL0luQ55IW-Qi-J5oBH0Ofzr7HRFBhM,8949
40
40
  rara_tools/parsers/tools/marc_converter.py,sha256=LgSHe-7n7aiDrw2bnsB53r3fXTRFjZXTwBYfTpL0pfs,415
41
41
  rara_tools/parsers/tools/russian_transliterator.py,sha256=5ZU66iTqAhr7pmfVqXPAI_cidF43VqqmuN4d7H4_JuA,9770
42
- rara_tools-0.7.4.dist-info/licenses/LICENSE.md,sha256=hkZVnIZll7e_KNEQzeY94Y9tlzVL8iVZBTMBvDykksU,35142
43
- rara_tools-0.7.4.dist-info/METADATA,sha256=OZ5OnDVf_aBcLWWEu_gr2hcc_-86FetOcfEyFeobyJ8,4079
44
- rara_tools-0.7.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
45
- rara_tools-0.7.4.dist-info/top_level.txt,sha256=JwfB5b8BAtW5OFKRln2AQ_WElTRyIBM4nO0FKN1cupY,11
46
- rara_tools-0.7.4.dist-info/RECORD,,
42
+ rara_tools-0.7.5.dist-info/licenses/LICENSE.md,sha256=hkZVnIZll7e_KNEQzeY94Y9tlzVL8iVZBTMBvDykksU,35142
43
+ rara_tools-0.7.5.dist-info/METADATA,sha256=UsHxND7IUSmGYDbzmHWjihiPrbqxefcSHyGy8EvpnFY,4079
44
+ rara_tools-0.7.5.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
45
+ rara_tools-0.7.5.dist-info/top_level.txt,sha256=JwfB5b8BAtW5OFKRln2AQ_WElTRyIBM4nO0FKN1cupY,11
46
+ rara_tools-0.7.5.dist-info/RECORD,,