PyPI - rara-tools - Versions diffs - 0.6.16__tar.gz → 0.7.0__tar.gz - Mend

rara-tools 0.6.16__tar.gz → 0.7.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of rara-tools might be problematic. Click here for more details.

Files changed (69) hide show

{rara_tools-0.6.16/rara_tools.egg-info → rara_tools-0.7.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: rara-tools
-Version: 0.6.16
+Version: 0.7.0
 Summary: Tools to support Kata's work.
 Classifier: Programming Language :: Python :: 3
 Classifier: Programming Language :: Python :: 3.10

rara_tools-0.7.0/VERSION ADDED Viewed

@@ -0,0 +1 @@
+0.7.0

{rara_tools-0.6.16 → rara_tools-0.7.0}/pyproject.toml RENAMED Viewed

@@ -14,7 +14,8 @@ include = [
     "rara_tools.parsers",
     "rara_tools.parsers.marc_parsers",
     "rara_tools.parsers.marc_records",
-    "rara_tools.parsers.tools"
+    "rara_tools.parsers.tools",
+    "rara_tools.core_formatters"
 ]
 [project]

rara_tools-0.7.0/rara_tools/constants/linker.py ADDED Viewed

@@ -0,0 +1,136 @@
+import logging
+from rara_tools.constants.normalizers import EntityType, VIAF_ENTITY_MAP
+COMPONENT_KEY = "linker"  # key string naming the linker component; NOTE(review): its consumers are not visible in this file
+class Tasks:  # namespace of task-name strings; NOTE(review): presumably names registered with a task runner - confirm
+    BASE = "base_linker_task"
+    VECTORIZE = "vectorize_text"
+    VECTORIZE_WITH_CORE = "vectorize_text_with_core_logic"
+    PIPELINE = "link_keywords_with_core_logic"
+    LINK_AND_NORMALIZE = "core_linker_with_normalization"
+    VECTORIZE_AND_INDEX = "core_vectorize_and_index"
+    RECEIVE_LINK_AND_NORMALIZE = "receive_link_and_normalize"
+class Queue:  # namespace of queue-name strings; NOTE(review): which broker/queue system uses them is not shown here
+    LINKER = "linker"
+    VECTORIZER = "vectorizer"
+class StatusKeys:  # namespace of status-key strings; NOTE(review): presumably keys of a status record - confirm against callers
+    VECTORIZE_CONTEXT = "vectorize_context"
+    LINK_KEYWORDS = "link_keywords"
+class URLSource:  # names of the sources a URL identifier can come from; used as values of URL_SOURCE_MAP
+    VIAF = "VIAF"
+    SIERRA = "Sierra"
+    EMS = "EMS"
+class KeywordType:  # keyword type labels (Estonian strings); used as keys of KEYWORD_MARC_MAP and KEYWORD_TYPE_MAP
+    LOC = "Kohamärksõnad"  # place keywords
+    TIME = "Ajamärksõnad"  # time keywords
+    TOPIC = "Teemamärksõnad"  # topic keywords
+    GENRE = "Vormimärksõnad"  # form/genre keywords
+    TITLE = "Teose pealkiri"  # title of a work
+    PER = "Isikunimi"  # personal name
+    ORG = "Kollektiivi nimi"  # name of a corporate body
+    EVENT = "Ajutine kollektiiv või sündmus"  # temporary collective or event
+    CATEGORY = "Valdkonnamärksõnad"  # subject-area keywords
+    UDC = "UDC Summary"
+    UDK = "UDK Rahvusbibliograafia"  # "Rahvusbibliograafia" = national bibliography
+class KeywordMARC:  # MARC field numbers per keyword kind; used as values of KEYWORD_MARC_MAP
+    PER = 600  # MARC 21 600: subject added entry - personal name
+    ORG = 610  # MARC 21 610: subject added entry - corporate name
+    TOPIC = 650  # MARC 21 650: subject added entry - topical term
+    GENRE = 655  # MARC 21 655: index term - genre/form
+    TIME = 648  # MARC 21 648: subject added entry - chronological term
+    LOC = 651  # MARC 21 651: subject added entry - geographic name
+    EVENT = 611  # MARC 21 611: subject added entry - meeting name
+    TITLE = 630  # MARC 21 630: subject added entry - uniform title
+    TITLE_LINKED = 600  # same number as PER; selected for a title that has an author
+class KeywordSource:  # names of the sources a keyword can be attributed to; AI is the fallback value
+    EMS = "EMS"
+    SIERRA = "SIERRA"  # note: upper-case here, whereas URLSource.SIERRA is "Sierra"
+    VIAF = "VIAF"
+    AI = "AI"
+class Filters:  # filter names; used as the list items in ALLOWED_FILTERS_MAP
+    AUTHOR = "author"
+    YEAR = "year"
+UNLINKED_KEYWORD_MARC_FIELD = 693  # MARC field number for keywords that are not linked
+ALLOWED_FILTERS_MAP = {  # entity type -> list of filter names allowed for that type
+    EntityType.PER: [Filters.YEAR],
+    EntityType.ORG: [Filters.YEAR],
+    EntityType.TITLE: [Filters.YEAR, Filters.AUTHOR],
+    EntityType.KEYWORD: [],
+    EntityType.LOC: []
+}
+KEYWORD_MARC_MAP = {  # keyword type label -> MARC field number (CATEGORY, UDC and UDK have no entry)
+    KeywordType.LOC: KeywordMARC.LOC,
+    KeywordType.TIME: KeywordMARC.TIME,
+    KeywordType.TOPIC: KeywordMARC.TOPIC,
+    KeywordType.GENRE: KeywordMARC.GENRE,
+    KeywordType.TITLE: KeywordMARC.TITLE,
+    KeywordType.ORG: KeywordMARC.ORG,
+    KeywordType.PER: KeywordMARC.PER,
+    KeywordType.EVENT: KeywordMARC.EVENT
+}
+URL_SOURCE_MAP = {  # entity type -> source from which the URL identifier is taken
+    EntityType.PER: URLSource.VIAF,
+    EntityType.ORG: URLSource.VIAF,
+    EntityType.TITLE: URLSource.VIAF,
+    EntityType.KEYWORD: URLSource.EMS,
+    EntityType.LOC: URLSource.EMS
+}
+# Keyword types that are skipped when linking the
+# rara-subject-indexer results.
+KEYWORD_TYPES_TO_IGNORE = [
+    KeywordType.CATEGORY,
+    KeywordType.UDC,
+    KeywordType.UDK
+]
+ALLOWED_ENTITY_TYPES = [  # entity types accepted by the linker, including the unknown type UNK
+    EntityType.PER,
+    EntityType.ORG,
+    EntityType.KEYWORD,
+    EntityType.LOC,
+    EntityType.TITLE,
+    EntityType.UNK,
+]
+KEYWORD_TYPE_MAP = {  # keyword type label -> entity type; several labels share KEYWORD
+    KeywordType.TIME: EntityType.KEYWORD,
+    KeywordType.GENRE: EntityType.KEYWORD,
+    KeywordType.LOC: EntityType.LOC,
+    KeywordType.PER: EntityType.PER,
+    KeywordType.ORG: EntityType.ORG,
+    KeywordType.TOPIC: EntityType.KEYWORD,
+    KeywordType.TITLE: EntityType.TITLE,
+    KeywordType.EVENT: EntityType.ORG  # events are mapped to the organisation entity type
+}
+EMS_ENTITY_TYPES = [EntityType.KEYWORD, EntityType.LOC]  # entity types associated with EMS
+SIERRA_ENTITY_TYPES = [EntityType.PER, EntityType.ORG, EntityType.TITLE]  # entity types associated with Sierra
+VIAF_ENTITY_TYPES = [EntityType.PER, EntityType.ORG, EntityType.TITLE]  # entity types associated with VIAF (same list as Sierra)
+# Parameters for the filters
+MIN_AUTHOR_SIMILARITY = 0.95  # NOTE(review): presumably a minimum similarity score for the author filter - confirm scale and usage
+YEAR_EXCEPTION_VALUE = True  # NOTE(review): meaning is not evident from this file - confirm against the year filter
+LOGGER_NAME = "rara-tools-norm-linker"
+LOGGER = logging.getLogger(LOGGER_NAME)  # shared logger for the linker and formatter modules
+MAIN_TAXONOMY_LANG = "et"  # language code of the main taxonomy ("et" = Estonian)

rara_tools-0.7.0/rara_tools/core_formatters/core_formatter.py ADDED Viewed

@@ -0,0 +1,86 @@
+from typing import List, Tuple, Any
+from rara_tools.core_formatters.formatted_keyword import FormattedKeyword
+from rara_tools.core_formatters.formatted_meta import FormattedAuthor
+from rara_tools.constants.linker import MAIN_TAXONOMY_LANG, KEYWORD_TYPES_TO_IGNORE, EntityType
+def get_primary_author(authors: List[dict]) -> str:
+    primary_author = ""
+    for author in authors:
+        if author.get("is_primary", False):
+            primary_author = author.get("name", "")
+    return primary_author
+def format_authors(authors: List[dict]) -> List[dict]:
+    formatted_authors = []
+    for author in authors:
+        entity_type = author.get("type", EntityType.UNK)
+        formatted_author = FormattedAuthor(
+            object_dict=author,
+            linked_doc=None,
+            entity_type=entity_type
+        ).to_dict()
+        formatted_authors.append(formatted_author)
+    return formatted_authors
+def format_sections(sections: List[dict]) -> List[dict]:
+    for section in sections:
+        authors = section.pop("authors", [])
+        titles = section.pop("titles", [])
+        primary_author = get_primary_author(authors)
+        if primary_author:
+            for title in titles:
+                title["author_from_title"] = primary_author
+        section["titles"] = titles
+        formatted_authors = format_authors(authors)
+        section["authors"] = formatted_authors
+    return sections
+def format_meta(meta: dict) -> dict:
+    """ Formats unlinked meta for Kata CORE.
+    """
+    meta_to_format = meta.get("meta")
+    authors = meta_to_format.pop("authors", [])
+    sections = meta_to_format.pop("sections", [])
+    formatted_authors = format_authors(authors)
+    formatted_sections = format_sections(sections)
+    if sections and formatted_sections:
+        meta_to_format["sections"] = formatted_sections
+    if authors and formatted_authors:
+        meta_to_format["authors"] = formatted_authors
+    meta["meta"] = meta_to_format
+    return meta
+def format_keywords(flat_keywords: List[dict]) -> List[dict]:
+    """ Formats unlinked keywords for Kata CORE.
+    """
+    ignored_keywords = []
+    filtered_keywords = []
+    for keyword_dict in flat_keywords:
+        keyword_type = keyword_dict.get("entity_type")
+        if keyword_type in KEYWORD_TYPES_TO_IGNORE:
+            ignored_keywords.append(keyword_dict)
+        else:
+            filtered_keywords.append(keyword_dict)
+    formatted_keywords = []
+    for keyword_dict in filtered_keywords:
+        formatted_keyword = FormattedKeyword(
+            object_dict=keyword_dict,
+            linked_doc=None,
+            main_taxnomy_lang=MAIN_TAXONOMY_LANG
+        ).to_dict()
+        formatted_keywords.append(formatted_keyword)
+    return formatted_keywords

rara_tools-0.7.0/rara_tools/core_formatters/formatted_keyword.py ADDED Viewed

@@ -0,0 +1,229 @@
+from rara_tools.constants.linker import (
+    LOGGER, URLSource, KeywordSource, EntityType, KeywordType, KeywordMARC,
+    KEYWORD_MARC_MAP,  KEYWORD_TYPES_TO_IGNORE, KEYWORD_TYPE_MAP,
+    EMS_ENTITY_TYPES, SIERRA_ENTITY_TYPES, UNLINKED_KEYWORD_MARC_FIELD,
+    URL_SOURCE_MAP
+)
+from rara_tools.core_formatters.formatted_object import FormattedObject
+from typing import List, Dict, NoReturn, Tuple, Any
+class FormattedKeyword(FormattedObject):
+    def __init__(self, object_dict: dict, linked_doc: Any,
+            main_taxnomy_lang: str, url_source_map: str = URL_SOURCE_MAP
+    ) -> NoReturn:
+        super().__init__(
+            object_dict=object_dict,
+            linked_doc=linked_doc,
+            original_entity_key="keyword"
+        )
+        self.main_taxnomy_lang: str = main_taxnomy_lang
+        self.original_keyword: str = self.original_entity
+        self.score: float = self.object_dict.get("score")
+        self.count: int = self.object_dict.get("count")
+        self.method: str = self.object_dict.get("method")
+        self.model_arch: str = self.object_dict.get("model_arch", self.method)
+        self.keyword_type: str = self.object_dict.get("entity_type")
+        self.entity_type: str = KEYWORD_TYPE_MAP.get(self.keyword_type, "")
+        self.url_source_map: dict = url_source_map
+        self.__keyword_source: str = ""
+        self.__indicator_1: str = ""
+        self.__indicator_2: str = ""
+        self.__url: str | None = None
+        self.__url_source: str | None = None
+        self.__marc_field: str = ""
+        self.__language: str = ""
+        self.__author: str | None = None
+    @property
+    def keyword(self) -> str:
+        return self.entity
+    @property
+    def keyword_source(self) -> str:
+        if not self.__keyword_source:
+            if not self.is_linked:
+                source = KeywordSource.AI
+            elif self.entity_type in EMS_ENTITY_TYPES:
+                source = KeywordSource.EMS
+            elif self.entity_type in SIERRA_ENTITY_TYPES:
+                if self.linked_doc and self.linked_doc.elastic:
+                    source = KeywordSource.SIERRA
+                elif self.linked_doc and self.linked_doc.viaf:
+                    source = KeywordSource.VIAF
+                else:
+                    source = KeywordSource.AI
+            else:
+                source = KeywordSource.AI
+            self.__keyword_source = source
+        return self.__keyword_source
+    @property
+    def indicator1(self) -> str:
+        if not self.__indicator_1:
+            ind1, ind2 = self._get_indicators()
+            self.__indicator_1 = ind1
+            self.__indicator_2 = ind2
+        return self.__indicator_1
+    @property
+    def indicator2(self) -> str:
+        if not self.__indicator_2:
+            ind1, ind2 = self._get_indicators()
+            self.__indicator_1 = ind1
+            self.__indicator_2 = ind2
+        return self.__indicator_2
+    @property
+    def url(self) -> str:
+        if self.__url == None:
+            url_info = self._get_url_info()
+            self.__url = url_info.get("url")
+            self.__url_source = url_info.get("url_source")
+        return self.__url
+    @property
+    def url_source(self) -> str:
+        if self.__url_source == None:
+            url_info = self._get_url_info()
+            self.__url = url_info.get("url")
+            self.__url_source = url_info.get("url_source")
+        return self.__url_source
+    @property
+    def marc_field(self) -> int:
+        if not self.__marc_field:
+            # TODO: teoste + isikute loogika!!!!
+            if self.is_linked:
+                marc_field = KEYWORD_MARC_MAP.get(str(self.keyword_type), "")
+            else:
+                marc_field = UNLINKED_KEYWORD_MARC_FIELD
+            if self.entity_type == EntityType.TITLE:
+                if self.author:
+                    marc_field = KeywordMARC.TITLE_LINKED
+                else:
+                    marc_field = KeywordMARC.TITLE
+            self.__marc_field = marc_field
+        return self.__marc_field
+    @property
+    def persons_title(self) -> str:
+        return self.titles
+    @property
+    def language(self) -> str:
+        if not self.__language:
+            if self.is_linked:
+                self.__language = self.main_taxnomy_lang
+            else:
+                self.__language = self.object_dict.get("language", "")
+        return self.__language
+    @property
+    def author(self) -> str:
+        # Only relevant for titles!
+        if self.__author == None:
+            self.__author = ""
+            if self.entity_type == EntityType.TITLE:
+                if self.original_record:
+                    self.__author = self.original_record.author_name
+                elif self.viaf_info:
+                    pass
+                    #self.__author = self.viaf_info.get
+        return self.__author
+    def _get_url_info(self) -> dict:
+        """ Finds URL identifier from LinkedDoc based on
+        given entity type.
+        Parameters
+        -----------
+        linked_doc: LinkedDoc | None
+            A LinkedDoc class instance.
+        entity_type: str
+            Entity type for detecting correct URL source.
+        Returns
+        ----------
+        dict:
+            Dictionary with keys `url` - URL identifier and
+            `url_source` - source of the URL (e.g. "EMS").
+        """
+        url_source = self.url_source_map.get(self.entity_type, "")
+        url = ""
+        if self.linked_doc:
+            if url_source == URLSource.EMS:
+                url = self.linked_doc.elastic.get("ems_url", "")
+            elif url_source == URLSource.VIAF:
+                url = self.viaf_info.get("viaf_url", "")
+        if not url:
+            url_source = ""
+        url_info = {"url": url, "url_source": url_source}
+        LOGGER.debug(
+            f"Detected URL info: {url_info}. Used entity_type = {self.entity_type}. " \
+            f"URL source map = {self.url_source_map}."
+        )
+        return url_info
+    def _get_indicators(self) -> Tuple[str, str]:
+        """ Find MARC indicators 1 and 2.
+        """
+        ind1 = " "
+        ind2 = " "
+        if self.entity_type in SIERRA_ENTITY_TYPES:
+            if self.entity_type == EntityType.PER:
+                if "," in self.keyword:
+                    ind1 = "1"
+                else:
+                    ind1 = "0"
+            elif self.entity_type == EntityType.ORG:
+                # 1 märksõna esimeseks elemendiks võimupiirkonna nimi, nt:
+                #    (a) Eesti (b) Riigikogu - raske automaatselt määrata
+                # 2 märksõna esimeseks elemendiks nimi pärijärjestuses
+                ind1 = "2"
+            else:
+                ind1 = "0"
+            if not self.is_linked:
+                ind2 = "4"
+        elif self.entity_type in EMS_ENTITY_TYPES:
+            ind2 = "4"
+        return (ind1, ind2)
+    def to_dict(self) -> dict:
+        keyword_dict = {
+            "count": self.count,
+            "dates": self.dates,
+            "entity_type": self.keyword_type,
+            "indicator1": self.indicator1,
+            "indicator2": self.indicator2,
+            "is_linked": self.is_linked,
+            "keyword": self.keyword,
+            "keyword_source": self.keyword_source,
+            "lang": self.language,
+            "location": self.location,
+            "marc_field": self.marc_field,
+            "method": self.method,
+            "model_arch": self.model_arch,
+            "numeration": self.numeration,
+            "organisation_sub_unit": self.organisation_sub_unit,
+            "original_keyword": self.original_keyword,
+            "persons_title": self.persons_title,
+            "score": self.score,
+            "url": self.url,
+            "url_source": self.url_source,
+            "author": self.author
+        }
+        return keyword_dict

rara_tools-0.7.0/rara_tools/core_formatters/formatted_meta.py ADDED Viewed

@@ -0,0 +1,154 @@
+from rara_tools.constants.linker  import (
+    LOGGER, EntityType
+)
+from rara_tools.core_formatters.formatted_object import FormattedObject
+from typing import List, Dict, NoReturn, Tuple, Any
+class FormattedTitle(FormattedObject):  # thin FormattedObject subclass that adds no behavior of its own
+    # TODO: Is this class needed at all?
+    def __init__(self, object_dict: dict, linked_doc: Any):
+        super().__init__(
+            object_dict=object_dict,
+            linked_doc=linked_doc,
+            original_entity_key="name"  # "name" is handed to the base class as the original entity key
+        )
+class FormattedAuthor(FormattedObject):
+    def __init__(self, object_dict: dict, linked_doc: Any, entity_type: str):
+        super().__init__(
+            object_dict=object_dict,
+            linked_doc=linked_doc,
+            original_entity_key="name"
+        )
+        self.entity_type: str = entity_type
+        self.is_linked: bool = True if self.linked_doc else False # NB! Lisada andmebaasi uus veerg!
+        self.original_name: str = self.original_entity            # NB! Lisada andmebaasi uus veerg
+        self.author_role: str = self.object_dict.get("role")
+        self.is_primary: bool = self.object_dict.get("is_primary")
+        self.__primary_author_type: str = None
+        self.__name_order_type: str = ""
+        self.__event_sub_unit: str = ""
+        self.__order_number: str = ""
+        self.__sub_title: str = ""
+        self.__additional_info: str = ""
+        self.__publication_type: str = ""
+        self.__publication_language: str = ""
+        #self.__standardized_uri: str = ""
+        self.__viaf_id: str = ""
+    @property
+    def primary_author_type(self) -> str:
+        if self.__primary_author_type == None:
+            if self.is_primary:
+                if self.entity_type != EntityType.UNK:
+                    self.__primary_author_type = self.entity_type
+                else:
+                    self.__primary_author_type = EntityType.PER
+            else:
+                self.__primary_author_type = ""
+        return self.__primary_author_type
+    @property
+    def name(self) -> str:
+        return self.entity
+    @property
+    def name_order(self) -> str:
+        if not self.__name_order_type:
+            if self.entity_type == EntityType.PER or self.entity_type == EntityType.UNK:
+                if "," in self.name:
+                    ind1 = "1"
+                else:
+                    ind1 = "0"
+            elif self.entity_type == EntityType.ORG:
+                #LOGGER.debug(f"Entity type {self.entity_type} is not {EntityType.PER}.")
+                # 1 märksõna esimeseks elemendiks võimupiirkonna nimi, nt:
+                #    (a) Eesti (b) Riigikogu - raske automaatselt määrata
+                # 2 märksõna esimeseks elemendiks nimi pärijärjestuses
+                ind1 = "2" #????????
+            else:
+                ind1 = "0"
+            self.__name_order_type = ind1
+        return self.__name_order_type
+    @property
+    def event_sub_unit(self) -> str:
+        if not self.__event_sub_unit:
+            self.__event_sub_unit = ""
+        return self.__event_sub_unit
+    @property
+    def order_number(self) -> str:
+        if not self.__order_number:
+            self.__order_number = ""
+        return self.__order_number
+    @property
+    def sub_title(self) -> str:
+        if not self.__sub_title:
+            self.__sub_title = ""
+        return self.__sub_title
+    @property
+    def additional_info(self) -> str:
+        if not self.__additional_info:
+            self.__additional_info = ""
+        return self.__additional_info
+    @property
+    def publication_type(self) -> str:
+        if not self.__publication_type:
+            self.__publication_type = ""
+        return self.__publication_type
+    @property
+    def publication_language(self) -> str:
+        if not self.__publication_language:
+            self.__publication_language = ""
+        return self.__publication_language
+    @property
+    def standardized_uri(self) -> str:
+        return self.identifier
+    @property
+    def viaf_id(self):
+        if not self.__viaf_id:
+            if self.viaf_info:
+                self.__viaf_id = self.viaf_info.get("viaf_url", "")
+            else:
+                self.__viaf_id = ""
+        return self.__viaf_id
+    def to_dict(self):
+        author_dict = {
+            "is_linked": self.is_linked,
+            "original_name": self.original_name,
+            "author_role": self.author_role,
+            "is_primary": self.is_primary,
+            "primary_author_type": self.primary_author_type,
+            "name": self.name,
+            "numeration": self.numeration,
+            "organisation_sub_unit": self.organisation_sub_unit,
+            "titles": self.titles,
+            "location": self.location,
+            "dates": self.dates,
+            "name_order_type": self.name_order,
+            "event_sub_unit": self.event_sub_unit,
+            "order_number": self.order_number,
+            "sub_title": self.sub_title,
+            "additional_info": self.additional_info,
+            "publication_type": self.publication_type,
+            "publication_language": self.publication_language,
+            "standardized_uri": self.standardized_uri,
+            "viaf_id": self.viaf_id
+        }
+        return author_dict

rara-tools 0.6.16__tar.gz → 0.7.0__tar.gz

Potentially problematic release.

rara-tools 0.6.16__tar.gz → 0.7.0__tar.gz