PyPI - rara-tools - Versions diffs - 0.7.3__py3-none-any.whl → 0.7.5__py3-none-any.whl - Mend

rara-tools 0.7.3__py3-none-any.whl → 0.7.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of rara-tools might be problematic. Click here for more details.

Files changed (11) hide show

rara_tools/constants/digitizer.py CHANGED Viewed

@@ -26,3 +26,15 @@ class Queue:
 class Tasks:
     START_DIGITIZER_PIPELINE = "start_digitizer_pipeline"
     PURGE_MODELS = "purge_unused_digitizer_models"
+class Error:
+    NO_SPACE = "Disk out of space!"
+    COULDNT_DOWNLOAD = "Unknown error when downloading model!"
+    UNKNOWN = "Unknown system error!"
+    S3_CONNECTION = "Failed to connect to S3!"
+    UNSUPPORTED_FILETYPE = "Unsupported file type!"
+    COULDNT_UPLOAD = "Could not upload documents to Elasticsearch!"
+    FILE_IS_PROTECTED = "File is password protected or encrypted!"
+    UNKNOWN_OCR = "Unknown error when applying ocr!"
+    CUSTOM_MODEL_ERROR = "Couldn't download custom image classification model!"

rara_tools/constants/linker.py CHANGED Viewed

@@ -1,5 +1,6 @@
 import logging
-from rara_tools.constants.normalizers import EntityType, VIAF_ENTITY_MAP
+from rara_tools.constants.normalizers import EntityType
 COMPONENT_KEY = "linker"
@@ -24,11 +25,13 @@ class StatusKeys:
     VECTORIZE_CONTEXT = "vectorize_context"
     LINK_KEYWORDS = "link_keywords"
 class URLSource:
     VIAF = "VIAF"
     SIERRA = "Sierra"
     EMS = "EMS"
 class KeywordType:
     LOC = "Kohamärksõnad"
     TIME = "Ajamärksõnad"
@@ -54,17 +57,24 @@ class KeywordMARC:
     TITLE = 630
     TITLE_LINKED = 600
 class KeywordSource:
     EMS = "EMS"
     SIERRA = "SIERRA"
     VIAF = "VIAF"
     AI = "AI"
 class Filters:
     AUTHOR = "author"
     YEAR = "year"
+class Error:
+    VECTORIZATION = "Failed to vectorize text!"
+    LINKING_KEYWORDS = "Failed to link keywords!"
+    LINKING_META = "Failed to link meta!"
 UNLINKED_KEYWORD_MARC_FIELD = 693
 ALLOWED_FILTERS_MAP = {
@@ -110,7 +120,6 @@ ALLOWED_ENTITY_TYPES = [
     EntityType.UNK,
 ]
 KEYWORD_TYPE_MAP = {
     KeywordType.TIME: EntityType.KEYWORD,
     KeywordType.GENRE: EntityType.KEYWORD,

rara_tools/constants/meta_extractor.py CHANGED Viewed

@@ -1,3 +1,5 @@
+from dataclasses import dataclass
 COMPONENT_KEY = "meta_extractor"
@@ -11,4 +13,32 @@ class Queue:
 class StatusKeys:
-    EXTRACT_METADATA = "extract_metadata"
+    EXTRACT_METADATA = "extract_metadata"
+class Error:
+    UNKNOWN = "Failed to extract meta information from digitizer output!"
+@dataclass(frozen=True)
+class TitleType:
+    AUTHOR_WITHOUT_TITLE: str = "pealkirjata autor"
+    NORMALIZED_TITLE: str = "normitud eelispealkiri"
+    TITLE: str = "väljaandes esitatud kujul põhipealkiri"
+    PARALLEL_TITLE: str = "rööppealkiri"
+    ADDITIONAL_TITLE: str = "alampealkiri"
+    METS_TITLE: str = "väljaandes esitatud kujul põhipealkiri"
+    ANON: str = "anonüümne väljaanne"
+TITLE_TYPES_MAP = {
+    TitleType.AUTHOR_WITHOUT_TITLE: 130,
+    TitleType.NORMALIZED_TITLE: 240,
+    TitleType.TITLE: 245,
+    TitleType.PARALLEL_TITLE: 246,
+    TitleType.ADDITIONAL_TITLE: 245,
+    TitleType.METS_TITLE: 245,
+    TitleType.ANON: 130
+}
+PUBLISHER_KEY = "Väljaandja"

rara_tools/constants/subject_indexer.py CHANGED Viewed

@@ -18,6 +18,10 @@ class StatusKeys:
     EXTRACT_KEYWORDS = "extract_keywords"
+class Error:
+    UNKNOWN = "Could not extract keywords from text!"
 class URLSource:
     VIAF = "VIAF"
     SIERRA = "Sierra"

rara_tools/core_formatters/core_formatter.py CHANGED Viewed

@@ -2,6 +2,10 @@ from typing import List, Tuple, Any
 from rara_tools.core_formatters.formatted_keyword import FormattedKeyword
 from rara_tools.core_formatters.formatted_meta import FormattedAuthor
 from rara_tools.constants.linker import MAIN_TAXONOMY_LANG, KEYWORD_TYPES_TO_IGNORE, EntityType
+from rara_tools.constants.meta_extractor import TitleType, TITLE_TYPES_MAP, PUBLISHER_KEY
+from rara_tools.constants.subject_indexer import KeywordType
+import regex as re
 def get_primary_author(authors: List[dict]) -> str:
     primary_author = ""
@@ -10,8 +14,19 @@ def get_primary_author(authors: List[dict]) -> str:
             primary_author = author.get("name", "")
     return primary_author
-def format_authors(authors: List[dict]) -> List[dict]:
+def is_valid_keyword(keyword: str) -> bool:
+    # If keywords contains ONLY punctuation
+    # characters, we assume it`s not valid
+    if re.search(r"^(\W|_)+$", keyword):
+        return False
+    return True
+def format_series_info(series: str):
+    pass
+def format_authors(authors: List[dict]) -> Tuple[List[dict], dict]:
     formatted_authors = []
+    publisher = {}
     for author in authors:
         entity_type = author.get("type", EntityType.UNK)
@@ -20,8 +35,14 @@ def format_authors(authors: List[dict]) -> List[dict]:
             linked_doc=None,
             entity_type=entity_type
         ).to_dict()
+        # If author role == publisher, do not add it as an author
+        if formatted_author.get("author_role", "") == PUBLISHER_KEY:
+            publisher = formatted_author
+            continue
         formatted_authors.append(formatted_author)
-    return formatted_authors
+    return (formatted_authors, publisher)
 def format_sections(sections: List[dict]) -> List[dict]:
     for section in sections:
@@ -31,9 +52,16 @@ def format_sections(sections: List[dict]) -> List[dict]:
         if primary_author:
             for title in titles:
                 title["author_from_title"] = primary_author
+        if not authors:
+            for title in titles:
+                title["title_type"] = TitleType.ANON
+                title["title_type_int"] = TITLE_TYPES_MAP.get(TitleType.ANON)
         section["titles"] = titles
-        formatted_authors = format_authors(authors)
+        # Extract publisher, but do nothing with it
+        # as it is unlikely for the publishing info to be
+        # in a METS/ALTO section. Can update it, if proven otherwise
+        formatted_authors, publisher = format_authors(authors)
         section["authors"] = formatted_authors
     return sections
@@ -46,14 +74,25 @@ def format_meta(meta: dict) -> dict:
     authors = meta_to_format.pop("authors", [])
     sections = meta_to_format.pop("sections", [])
+    titles = meta_to_format.pop("titles", [])
-    formatted_authors = format_authors(authors)
+    formatted_authors, publisher = format_authors(authors)
     formatted_sections = format_sections(sections)
     if sections and formatted_sections:
         meta_to_format["sections"] = formatted_sections
     if authors and formatted_authors:
         meta_to_format["authors"] = formatted_authors
+    if titles and not authors:
+        for title in titles:
+            title["title_type"] = TitleType.ANON
+            title["title_type_int"] = TITLE_TYPES_MAP.get(TitleType.ANON)
+        meta_to_format["titles"] = titles
+    if publisher:
+        # Not sure, if it would be better to add original name or
+        # linked value. Currently adding original for safety
+        meta_to_format["publisher"] = publisher.get("original_name")
     meta["meta"] = meta_to_format
@@ -81,6 +120,30 @@ def format_keywords(flat_keywords: List[dict]) -> List[dict]:
             linked_doc=None,
             main_taxnomy_lang=MAIN_TAXONOMY_LANG
         ).to_dict()
-        formatted_keywords.append(formatted_keyword)
+        if is_valid_keyword(formatted_keyword.get("keyword")):
+            formatted_keywords.append(formatted_keyword)
     return formatted_keywords
+def get_udk072(flat_keywords: List[dict]) -> List[str]:
+    """ Filters out UDK from flat subject indexer output.
+    """
+    # keyword type: UDK
+    udk072 = [
+        keyword.get("keyword")
+        for keyword in flat_keywords
+        if keyword.get("entity_type") == KeywordType.UDK
+    ]
+    return udk072
+def get_udk080(flat_keywords: List[dict]) -> List[str]:
+    """ Filters out UDC from flat subject indexer output.
+    """
+    # keyword type: UDC
+    udk080 = [
+        keyword.get("keyword")
+        for keyword in flat_keywords
+        if keyword.get("entity_type") == KeywordType.UDC
+    ]
+    return udk080

rara_tools/core_formatters/formatted_meta.py CHANGED Viewed

@@ -11,7 +11,7 @@ class FormattedTitle(FormattedObject):
         super().__init__(
             object_dict=object_dict,
             linked_doc=linked_doc,
-            original_entity_key="name"
+            original_entity_key="title"
         )
@@ -41,17 +41,16 @@ class FormattedAuthor(FormattedObject):
         #self.__standardized_uri: str = ""
         self.__viaf_id: str = ""
+        self._default_author_type: str = EntityType.PER
     @property
     def primary_author_type(self) -> str:
         if self.__primary_author_type == None:
-            if self.is_primary:
-                if self.entity_type != EntityType.UNK:
+            self.__primary_author_type = self._default_author_type
+            if self.entity_type != EntityType.UNK:
+                if self.entity_type in [EntityType.ORG, EntityType.PER]:
                     self.__primary_author_type = self.entity_type
-                else:
-                    self.__primary_author_type = EntityType.PER
-            else:
-                self.__primary_author_type = ""
         return self.__primary_author_type

{rara_tools-0.7.3.dist-info → rara_tools-0.7.5.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: rara-tools
-Version: 0.7.3
+Version: 0.7.5
 Summary: Tools to support Kata's work.
 Classifier: Programming Language :: Python :: 3
 Classifier: Programming Language :: Python :: 3.10

{rara_tools-0.7.3.dist-info → rara_tools-0.7.5.dist-info}/RECORD RENAMED Viewed

@@ -7,17 +7,17 @@ rara_tools/s3.py,sha256=9ziDXsLjBtFAvsjTPxFddhfvkpA8773rzPJqO7y1N5Q,6415
 rara_tools/task_reporter.py,sha256=WCcZts9dAUokPc4vbrG3-lNAFLnWaMgE3b3iaUB7mr8,3256
 rara_tools/utils.py,sha256=1UrxOzo3cxe4juMkDlKWv1VKWMYay5v1pivGci1ajiM,3003
 rara_tools/constants/__init__.py,sha256=r78laM9vyRDAvzDhPvzDlhaX6qPwUUBBtwf1WosrW3o,27
-rara_tools/constants/digitizer.py,sha256=9aQkJj8C5a_HLgCayrz3PpGYJMLoO4Ph9_U28Q-C1T4,633
+rara_tools/constants/digitizer.py,sha256=f7VQGIYXd-MoPMi-iMgENsLWR_AA6koy0_jZct4HzVc,1152
 rara_tools/constants/general.py,sha256=dLomRopLiHv_J_liSIGzK1A3XByydsKGIyVN8KuuN98,1191
 rara_tools/constants/language_evaluator.py,sha256=3sCSaoS-zXQRY0vJ7UUMuZqbtYQD_quVVbdpgvJjE7I,124
-rara_tools/constants/linker.py,sha256=yBN9NpUhB3ENz8BapoIfpSHY_xNqwYdqutgQFdc_Cd8,3240
-rara_tools/constants/meta_extractor.py,sha256=adYH8cQqH0ZWYO7clGMiObclXRTGsxWgk3pC1oiHxHE,242
+rara_tools/constants/linker.py,sha256=WnOmJFTkoBMZUbBaW1uY45NTQB7FGG-dc9a_6qYTtwk,3381
+rara_tools/constants/meta_extractor.py,sha256=iVyxycKScbrjFWLv50dRmdeHfTLOKbdyEhgUF3DyBrY,1053
 rara_tools/constants/normalizers.py,sha256=Xs3anDwJHpHeniwx3xoIZyqdEXtO3eb7ouGLLr0CpHw,1344
 rara_tools/constants/parsers.py,sha256=L6nh1Itget9_9DMsliDkh6T25z78eMFPWVkbaU08DwU,5561
-rara_tools/constants/subject_indexer.py,sha256=i0xRdqwasyb6d6WZZKPgyuEUd2JeO_qwWYoG6UeBo5U,2064
-rara_tools/core_formatters/core_formatter.py,sha256=HJX7jOi9kaFie_zm0Wzjk0nKF8dRleJpVWbCplFFquo,2760
+rara_tools/constants/subject_indexer.py,sha256=0snyyB8IMCWXOYPXR_c0Kavq4nBiww559rdNOKjawx8,2133
+rara_tools/core_formatters/core_formatter.py,sha256=u_Cdgv9qBcyF-XddjaRGUqAFik9OMAdSzAulXpYR7vE,4997
 rara_tools/core_formatters/formatted_keyword.py,sha256=1-B9IQTycFt69pTy8WZNnfJ2WIMRow3kpEub6igyNQc,7865
-rara_tools/core_formatters/formatted_meta.py,sha256=Zd0oQFLbn6m_wHaWtgxBsu9J7wGyWIpZxb2-8PrR3Wk,5240
+rara_tools/core_formatters/formatted_meta.py,sha256=r0RPG4eM-REPIR1DrIJnvYPQtQrzkgdvX9tvhNWjQ0Y,5250
 rara_tools/core_formatters/formatted_object.py,sha256=7a499ZmcZXOqtlwxDi6FWHWF5a6HdCsduS22wV3uHIE,5656
 rara_tools/normalizers/__init__.py,sha256=_NqpS5w710DhaURytHq9JpEt8HgYpSPfRDcOtOymJgE,193
 rara_tools/normalizers/authorities.py,sha256=IDtcm0yNZNhv1f-WcdqWFSRzZk_CoKuBFsk6hEPddWM,4513
@@ -39,8 +39,8 @@ rara_tools/parsers/marc_records/title_record.py,sha256=XrtJ4gj7wzSaGxNaPtPuawmqq
 rara_tools/parsers/tools/entity_normalizers.py,sha256=VyCy_NowCLpOsL0luQ55IW-Qi-J5oBH0Ofzr7HRFBhM,8949
 rara_tools/parsers/tools/marc_converter.py,sha256=LgSHe-7n7aiDrw2bnsB53r3fXTRFjZXTwBYfTpL0pfs,415
 rara_tools/parsers/tools/russian_transliterator.py,sha256=5ZU66iTqAhr7pmfVqXPAI_cidF43VqqmuN4d7H4_JuA,9770
-rara_tools-0.7.3.dist-info/licenses/LICENSE.md,sha256=hkZVnIZll7e_KNEQzeY94Y9tlzVL8iVZBTMBvDykksU,35142
-rara_tools-0.7.3.dist-info/METADATA,sha256=D6RNdyg7JAga9gYCrL6hGP1TRN45OpqccD-UkprgVdI,4079
-rara_tools-0.7.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-rara_tools-0.7.3.dist-info/top_level.txt,sha256=JwfB5b8BAtW5OFKRln2AQ_WElTRyIBM4nO0FKN1cupY,11
-rara_tools-0.7.3.dist-info/RECORD,,
+rara_tools-0.7.5.dist-info/licenses/LICENSE.md,sha256=hkZVnIZll7e_KNEQzeY94Y9tlzVL8iVZBTMBvDykksU,35142
+rara_tools-0.7.5.dist-info/METADATA,sha256=UsHxND7IUSmGYDbzmHWjihiPrbqxefcSHyGy8EvpnFY,4079
+rara_tools-0.7.5.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+rara_tools-0.7.5.dist-info/top_level.txt,sha256=JwfB5b8BAtW5OFKRln2AQ_WElTRyIBM4nO0FKN1cupY,11
+rara_tools-0.7.5.dist-info/RECORD,,

{rara_tools-0.7.3.dist-info → rara_tools-0.7.5.dist-info}/WHEEL RENAMED Viewed

File without changes

{rara_tools-0.7.3.dist-info → rara_tools-0.7.5.dist-info}/licenses/LICENSE.md RENAMED Viewed

File without changes

{rara_tools-0.7.3.dist-info → rara_tools-0.7.5.dist-info}/top_level.txt RENAMED Viewed

File without changes

rara-tools 0.7.3__py3-none-any.whl → 0.7.5__py3-none-any.whl

Potentially problematic release.

rara-tools 0.7.3__py3-none-any.whl → 0.7.5__py3-none-any.whl