PyPI - rara-tools - Versions diffs - 0.2.0__py3-none-any.whl → 0.4.0__py3-none-any.whl - Mend

rara-tools 0.2.0__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of rara-tools might be problematic. Click here for more details.

Files changed (27) hide show

rara_tools/constants/normalizers.py +0 -11
rara_tools/constants/parsers.py +152 -0
rara_tools/normalizers/__init__.py +4 -0
rara_tools/normalizers/authorities.py +120 -0
rara_tools/normalizers/base.py +290 -0
rara_tools/normalizers/bibs.py +76 -0
rara_tools/normalizers/viaf.py +204 -0
rara_tools/parsers/marc_parsers/base_parser.py +50 -0
rara_tools/parsers/marc_parsers/ems_parser.py +49 -0
rara_tools/parsers/marc_parsers/location_parser.py +46 -0
rara_tools/parsers/marc_parsers/organization_parser.py +44 -0
rara_tools/parsers/marc_parsers/person_parser.py +45 -0
rara_tools/parsers/marc_parsers/title_parser.py +1 -0
rara_tools/parsers/marc_records/base_record.py +112 -0
rara_tools/parsers/marc_records/ems_record.py +267 -0
rara_tools/parsers/marc_records/organization_record.py +245 -0
rara_tools/parsers/marc_records/person_record.py +217 -0
rara_tools/parsers/marc_records/title_record.py +1 -0
rara_tools/parsers/tools/entity_normalizers.py +256 -0
rara_tools/parsers/tools/marc_converter.py +15 -0
rara_tools/parsers/tools/russian_transliterator.py +248 -0
{rara_tools-0.2.0.dist-info → rara_tools-0.4.0.dist-info}/METADATA +5 -2
rara_tools-0.4.0.dist-info/RECORD +37 -0
rara_tools-0.2.0.dist-info/RECORD +0 -17
{rara_tools-0.2.0.dist-info → rara_tools-0.4.0.dist-info}/WHEEL +0 -0
{rara_tools-0.2.0.dist-info → rara_tools-0.4.0.dist-info}/licenses/LICENSE.md +0 -0
{rara_tools-0.2.0.dist-info → rara_tools-0.4.0.dist-info}/top_level.txt +0 -0

rara_tools/parsers/marc_records/person_record.py ADDED Viewed

@@ -0,0 +1,217 @@
+from typing import List, NoReturn
+from pymarc.record import Record
+from rara_tools.parsers.tools.entity_normalizers import PersonNormalizer
+from rara_tools.parsers.marc_records.base_record import BaseRecord
+from rara_tools.constants.parsers import PersonMarcIDs, LOGGER
+import regex as re
+import json
+import logging
+class PersonRecord(BaseRecord):
+    """ Generates a simplified organization JSON record
+    from a pymarc MARC record.
+    """
+    def __init__(self, record: Record, add_variations: bool = False) -> NoReturn:
+        """ Initializes PersonRecord object.
+        Parameters
+        -----------
+        record: Record
+            pymarc.record.Record object.
+        add_variations: bool
+            If enabled, constructs an additional variations field, which
+            combines the content of multiple fields + adds some generated
+            variations. If the output is uploaded into Elastic and used
+            via rara-norm-linker, it is necessary to enable this.
+        """
+        super().__init__(record=record, add_variations=add_variations)
+        self.__name_field_id: List[str] = PersonMarcIDs.NAME
+        self.__name_variations_field_id: List[str]= PersonMarcIDs.NAME_VARIATIONS
+        self.__source_field_id: List[str] = PersonMarcIDs.SOURCE
+        self.__description_field_id: List[str] = PersonMarcIDs.DESCRIPTION
+        self.__default_year: int | None = None
+        self.__name: str = ""
+        self.__original_name: dict = {}
+        self.__name_specification: str = ""
+        self.__life_years: str = ""
+        self.__birth_year: int = -1
+        self.__death_year: int = -1
+        self.__name_variations: List[str] = []
+        self.__source: str = ""
+        self.__description: str = ""
+        self.__full_record: dict = {}
+        self.__name_in_cyrillic: bool = None
+        self.__variations: List[str] = []
+        self.__person_normalizer: PersonNormalizer = PersonNormalizer(self.name)
+    def _parse_year(self, year: str) -> int:
+        year = year.strip()
+        _year = self.__default_year
+        if len(year) >= 4:
+            if year[:4].isnumeric():
+                _year = int(year[:4])
+        elif len(year) == 3 and year.isnumeric():
+            _year = int(year)
+        return _year
+    @property
+    def original_name(self) -> str:
+        if not self.__original_name:
+            values = self.get_values(
+                marc_ids=self.__name_field_id,
+                subfield_id=["a", "b"]
+            )
+            if values:
+                self.__original_name = {
+                    "a": self._clean_value(values[0].get("a", "")),
+                    "b": self._clean_value(values[0].get("b", ""))
+                }
+            else:
+                pass
+        return self.__original_name
+    @property
+    def name(self) -> str:
+        if not self.__name:
+            self.__name = self._merge_and_clean(self.original_name, ["a", "b"])
+        return self.__name
+    @property
+    def name_specification(self) -> str:
+        if not self.__name_specification:
+            values = self.get_values(
+                marc_ids=self.__name_field_id,
+                subfield_id="c"
+            )
+            self.__name_specification = self._clean_value(values[0]) if values else ""
+        return self.__name_specification
+    @property
+    def life_years(self) -> str:
+        if not self.__life_years:
+            values = self.get_values(
+                marc_ids = self.__name_field_id,
+                subfield_id="d"
+            )
+            self.__life_years = self._clean_value(values[0]) if values else ""
+        return self.__life_years
+    @property
+    def birth_year(self) -> int:
+        if self.__birth_year == -1:
+            try:
+                birth_year, death_year = self.life_years.split("-")
+                self.__birth_year = self._parse_year(birth_year)
+                self.__death_year = self._parse_year(death_year)
+            except Exception as e:
+                LOGGER.error(
+                    f"Failed extracting birth and/or death year " \
+                    f"from '{self.life_years}' with the following " \
+                    f"exception: '{e}'."
+                )
+        return self.__birth_year
+    @property
+    def death_year(self) -> int:
+        if self.__death_year == -1:
+            try:
+                birth_year, death_year = self.life_years.split("-")
+                self.__birth_year = self._parse_year(birth_year)
+                self.__death_year = self._parse_year(death_year)
+            except Exception as e:
+                LOGGER.error(
+                    f"Failed extracting birth and/or death year " \
+                    f"from '{self.life_years}' with the following " \
+                    f"exception: '{e}'."
+                )
+        return self.__death_year
+    @property
+    def name_variations(self) -> List[str]:
+        if not self.__name_variations:
+            values = self.get_values(
+                marc_ids=self.__name_variations_field_id,
+                subfield_id=["a", "b"]
+            )
+            if values:
+                raw_variations = [
+                    {
+                        "a": self._clean_value(value.get("a", "")),
+                        "b": self._clean_value(value.get("b", ""))
+                    }
+                    for value in values
+                ]
+                self.__name_variations = [
+                    self._merge_and_clean(value, ["a", "b"])
+                    for value in raw_variations
+                ]
+            else:
+                pass
+        return self.__name_variations
+    @property
+    def source(self) -> str:
+        if not self.__source:
+            values = self.get_values(
+                marc_ids=self.__source_field_id,
+                subfield_id="a"
+            )
+            self.__source = self._clean_value(values[0]) if values else ""
+        return self.__source
+    @property
+    def description(self) -> str:
+        if not self.__description:
+            values = self.get_values(
+                marc_ids=self.__description_field_id,
+                subfield_id="i"
+            )
+            self.__description = self._clean_value(values[0]) if values else ""
+        return self.__description
+    @property
+    def name_in_cyrillic(self) -> bool:
+        if self.__name_in_cyrillic == None:
+            self.__name_in_cyrillic = PersonNormalizer.has_cyrillic(self.name)
+        return self.__name_in_cyrillic
+    @property
+    def variations(self) -> List[str]:
+        if not self.__variations:
+            variations_ = self.__person_normalizer.variations
+            for name in self.name_variations:
+                variations_.extend(PersonNormalizer(name).variations)
+            self.__variations = [v.lower() for v in list(set(variations_))]
+        return self.__variations
+    @property
+    def full_record(self) -> dict:
+        if not self.__full_record:
+            self.__full_record = {
+                "name": self.name,
+                "life_year": self.life_years,
+                "source": self.source,
+                "birth_year": self.birth_year,
+                "death_year": self.death_year,
+                "identifier": self.identifier,
+                "identifier_source": self.identifier_source,
+                "name_variations": self.name_variations,
+                "name_specification": self.name_specification,
+                "description": self.description,
+                "name_in_cyrillic": self.name_in_cyrillic,
+                "full_record_marc": str(self.marc_record),
+                "full_record_json": json.dumps(self.marc_json_record)
+            }
+            if self.add_variations:
+                self.__full_record.update(
+                    {"link_variations": self.variations}
+                )
+        return self.__full_record

rara_tools/parsers/marc_records/title_record.py ADDED Viewed

	@@ -0,0 +1 @@
1	+ # Coming soon

rara_tools/parsers/tools/entity_normalizers.py ADDED Viewed

@@ -0,0 +1,256 @@
+import regex as re
+import estnltk
+import nltk
+import logging
+from rara_tools.parsers.tools.russian_transliterator import Transliterate
+from rara_tools.constants.parsers import KeywordType, LOGGER
+from typing import List, NoReturn
+from abc import abstractmethod
+nltk.download("punkt_tab")
+class PersonalName:
+    """ Wraps generating and accessing main name forms.
+    """
+    def __init__(self, name: str) -> NoReturn:
+        """ Initializes PersonName object.
+        Parameters
+        -----------
+        name: str
+            Personal name. Expects one of the following formats:
+            '<first name> <last name>' or '<last name>, <first_name>', e.g:
+            'Uku Tamm' or 'Tamm, Uku'.
+        """
+        self.__original_name: str = name
+        self.__name: dict = {}
+        self.__last_comma_first: str = ""
+        self.__first_last: str = ""
+    @property
+    def first_name(self) -> str:
+        return self.name.get("first_name")
+    @property
+    def last_name(self) -> str:
+        return self.name.get("last_name")
+    @property
+    def name(self) -> dict:
+        if not self.__name:
+            last_name = ""
+            first_name = ""
+            if "," in self.__original_name:
+                try:
+                    last_name, first_name = self.__original_name.split(",")
+                except Exception as e:
+                    LOGGER.error(
+                        f"Parsing personal name {self.__original_name} " \
+                        f"failed with error: {e}."
+                    )
+            else:
+                name_tokens = [
+                    t.strip()
+                    for t in self.__original_name.split()
+                    if t.strip()
+                ]
+                if len(name_tokens) > 1:
+                    last_name = name_tokens[-1]
+                    first_name = " ".join(name_tokens[:-1])
+            self.__name = {
+                "first_name": first_name.strip(),
+                "last_name": last_name.strip()
+            }
+        return self.__name
+    @property
+    def last_comma_first(self) -> str:
+        if not self.__last_comma_first:
+            if self.last_name or self.first_name:
+                self.__last_comma_first = f"{self.last_name}, {self.first_name}"
+        return self.__last_comma_first.strip()
+    @property
+    def first_last(self) -> str:
+        if not self.__first_last:
+            self.__first_last = f"{self.first_name} {self.last_name}"
+        return self.__first_last.strip()
+class Normalizer:
+    """ Class for handling general methods for string
+    normalizations and variations generation.
+    """
+    def __init__(self, entity: str) -> NoReturn:
+        """ Initializes Normalizer object.
+        Parameters
+        -----------
+        entity: str
+            Entity (keyword, person etc) to normalize.
+        """
+        self.__entity: str = entity
+        self.__lemmatized_entity: str = ""
+        self.__cleaned_entity: str = ""
+    @staticmethod
+    def has_cyrillic(entity: str) -> bool:
+        return bool(re.search("[а-яА-Я]", entity))
+    @staticmethod
+    def transliterate(entity: str) -> str:
+        transliterator = Transliterate()
+        transliteration = transliterator([entity])[0]
+        return transliteration
+    @staticmethod
+    def lemmatize(entity: str) -> str:
+        layer = estnltk.Text(entity).tag_layer()
+        lemma_list = [l[0] for l in list(layer.lemma)]
+        lemmatized_entity = " ".join(lemma_list)
+        return lemmatized_entity
+    @staticmethod
+    def remove_parenthesized_info(entity: str) -> str:
+        clean_entity = re.sub(r"[(][^)]+[)]", "", entity)
+        return clean_entity.strip()
+    @staticmethod
+    def clean_entity(entity: str) -> str:
+        clean_entity = Normalizer.remove_parenthesized_info(entity)
+        return clean_entity
+    @property
+    def lemmatized_entity(self) -> str:
+        if not self.__lemmatized_entity:
+            self.__lemmatized_entity = Normalizer.lemmatize(self.__entity)
+        return self.__lemmatized_entity
+    @property
+    def cleaned_entity(self) -> str:
+        if not self.__cleaned_entity:
+            self.__cleaned_entity = Normalizer.clean_entity(self.__entity)
+        return self.__cleaned_entity
+    @abstractmethod
+    def variations(self) -> List[str]:
+        pass
+class PersonNormalizer(Normalizer):
+    """ Class for handling person-specific methods for string
+    normalizations and variations generation.
+    """
+    def __init__(self, name: str) -> NoReturn:
+        """ Initializes PersonNormalizer object.
+        Parameters
+        -----------
+        name: str
+            Personal name to normalize / generate variations for.
+        """
+        super().__init__(entity=name)
+        self.__name: str = name
+        self.__name_object: PersonalName = PersonalName(name)
+        self.__variations: List[str] = []
+    @property
+    def variations(self) -> List[str]:
+        if not self.__variations:
+            LOGGER.debug(f"Generating variations for name {self.__name}.")
+            variations = []
+            variations.append(self.__name_object.last_comma_first)
+            variations.append(self.__name_object.first_last)
+            if Normalizer.has_cyrillic(self.__name):
+                LOGGER.debug(
+                    f"Detected cyrillic in the original name '{self.__name}'. " \
+                    f"Generating a transliterated latin version."
+                )
+                transliterations = [
+                    Normalizer.transliterate(name)
+                    for name in variations
+                ]
+                variations.extend(transliterations)
+            # Guarantee adding one-word names as well
+            if self.__name not in variations:
+                variations.append(self.__name)
+            _variations = [v.strip() for v in variations if v.strip()]
+            self.__variations = list(set(_variations))
+            LOGGER.debug(
+                f"Generated the following variations for name '{self.__name}': " \
+                f"{self.__variations}."
+            )
+        return self.__variations
+class KeywordNormalizer(Normalizer):
+    """ Class for handling keyword-specific methods for string
+    normalizations and variations generation.
+    """
+    def __init__(self, keyword: str, keyword_type: str = "") -> NoReturn:
+        """ Initializes KeywordNormalizer object.
+        Parameters
+        -----------
+        keyword: str
+            keyword to normalize / generate variations for.
+        keyword_type: str
+            Keyword type. Should be one of the types specified in
+            rara_tools.constants.parsers.KeywordType or "".
+        """
+        super().__init__(entity=keyword)
+        self.__keyword: str = keyword
+        self.__variations: List[str] = []
+        self.__keyword_type: str = keyword_type
+        self.__loc_substitutions_map: dict = {"v": "w", "V": "W"}
+    def _transform_v_into_w(self, entity: str) -> str:
+        for old_val, new_val in list(self.__loc_substitutions_map.items()):
+            entity = re.sub(old_val, new_val, entity)
+        return entity
+    @property
+    def loc_substitutions_as_str(self) -> str:
+        subs = [
+            f"'{old_val}' -> '{new_val}'"
+            for old_val, new_val in list(self.__loc_substitutions_map.items())
+        ]
+        return ", ".join(subs)
+    @property
+    def variations(self) -> List[str]:
+        if not self.__variations:
+            LOGGER.debug(f"Generating variations for keyword {self.__keyword}.")
+            variations = []
+            variations.append(self.__keyword)
+            variations.append(self.lemmatized_entity)
+            variations.append(self.cleaned_entity)
+            variations.append(Normalizer.lemmatize(self.cleaned_entity))
+            # If keyword_type = LOC, add variations containing
+            # v -> w replacements
+            if self.__keyword_type == KeywordType.LOC:
+                LOGGER.debug(
+                    f"Detected keyword type = '{KeywordType.LOC}' -> " \
+                    f"Adding variations with the following character " \
+                    f"replacements: {self.loc_substitutions_as_str}."
+                )
+                v_w_transformations = [
+                    self._transform_v_into_w(entity)
+                    for entity in variations
+                ]
+                variations.extend(v_w_transformations)
+            variations = list(set(variations))
+            self.__variations = variations
+            LOGGER.debug(
+                f"Generated the following variations for keyword '{self.__keyword}': " \
+                f"{self.__variations}."
+            )
+        return self.__variations

rara_tools/parsers/tools/marc_converter.py ADDED Viewed

@@ -0,0 +1,15 @@
+import pymarc
+from typing import NoReturn
+class MarcConveter:
+    def __init__(self):
+        pass
+    @staticmethod
+    def marc21xml_to_mrc(input_file: str, output_file: str) -> NoReturn:
+        """ Converts Marc21XML file into a MRC file.
+        """
+        with open(output_file, "wb") as f:
+            writer = pymarc.MARCWriter(f)
+            records = pymarc.marcxml.map_xml(writer.write, input_file)

rara-tools 0.2.0__py3-none-any.whl → 0.4.0__py3-none-any.whl

Potentially problematic release.

rara-tools 0.2.0__py3-none-any.whl → 0.4.0__py3-none-any.whl