rara-tools 0.2.0__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of rara-tools might be problematic. Click here for more details.

@@ -0,0 +1,204 @@
1
+ import requests
2
+ import json
3
+ from typing import List
4
+ from collections import defaultdict
5
+
6
+ import logging
7
+ logger = logging.getLogger(__name__)
8
+
9
+
10
class VIAFRecord:
    """ Wrapper around a single raw VIAF API record.

    Lazily extracts (and caches) commonly used fields from the raw
    response dict. Multi-valued fields (name variations, occupations,
    nationalities) are filtered so that only entries backed by at least
    one allowed VIAF source are kept.
    """

    # Default VIAF source codes whose entries are trusted.
    DEFAULT_ALLOWED_SOURCES = ("LC", "DNB", "LNB", "NLL", "ERRR", "J9U")

    def __init__(self,
                 record: dict,
                 allowed_sources: List[str] = None
                 ):
        """ Initializes VIAFRecord object.

        Parameters
        -----------
        record: dict
            Raw VIAF response: either a search result (payload under
            "queryResult") or a cluster record (payload under
            "recordData" -> "VIAFCluster").
        allowed_sources: List[str]
            VIAF source codes used for filtering multi-valued fields.
            Defaults to DEFAULT_ALLOWED_SOURCES. NB! A None default
            replaces the original mutable list default.
        """
        if allowed_sources is None:
            allowed_sources = list(self.DEFAULT_ALLOWED_SOURCES)
        self.__record: dict = record
        self.__record_data: dict = {}
        self.__allowed_sources: List[str] = allowed_sources

        # Lazily populated caches (filled on first property access).
        self.__viaf_id = None
        self.__name_variations: List[str] = []
        self.__birth_date = None
        self.__death_date = None
        self.__occupations: List[str] = []
        self.__all_fields: dict = {}
        self.__nationality: str = ""
        self.__author: str = ""
        self.__author_type = None
        # NB! The original initialized `__has_isni` twice (as bool, then as
        # str); the `has_isni` property derives its value directly from the
        # record, so no cache attribute is kept at all.
        # Dedicated caches for activity dates — the original reused the
        # birth/death date caches, which cross-contaminated the values.
        self.__activity_start = None
        self.__activity_end = None

    @property
    def author(self) -> str:
        """Main heading text (preferred display name of the entity)."""
        if not self.__author:
            self.__author = self.record_data.get(
                "mainHeading", {}).get("text", "")
        # BUGFIX: the original property had no return statement.
        return self.__author

    @property
    def author_type(self) -> str:
        """Type of name (personal, corporate, title, etc)."""
        if not self.__author_type:
            self.__author_type = self.record_data.get("nameType")
        # BUGFIX: the original property had no return statement.
        return self.__author_type

    @property
    def viaf_id(self) -> str:
        """VIAF identifier of the record (empty string when missing)."""
        if not self.__viaf_id:
            self.__viaf_id = self.record_data.get("viafID", "")
        return self.__viaf_id

    @property
    def has_isni(self) -> bool:
        """True, if the record carries a non-empty ISNI identifier."""
        return bool(self.record_data.get("isni", ""))

    def __get_data(self, field_name: str) -> List[str]:
        """Texts of `field_name` data entries backed by an allowed source."""
        entries = self.record_data.get(field_name, {}).get("data", [])
        allowed = set(self.__allowed_sources)  # hoisted out of the loop

        data = []
        for entry in entries:
            sources = entry.get("sources", {}).get("s", [])
            if allowed.intersection(sources):
                data.append(entry.get("text", ""))
        return data

    @property
    def record_data(self) -> dict:
        """Payload dict of the record, regardless of the response flavour."""
        if not self.__record_data:
            try:
                self.__record_data = self.__record["queryResult"]
            except KeyError:
                # Cluster-record responses nest the payload differently.
                # Falls back to {} for empty / failed fetches instead of
                # raising (the original used a bare `except` and crashed
                # with KeyError on empty dicts).
                self.__record_data = self.__record.get(
                    "recordData", {}).get("VIAFCluster", {})
        return self.__record_data

    @property
    def name_variations(self) -> List[str]:
        """All name variations backed by an allowed source."""
        if not self.__name_variations:
            self.__name_variations = self.__get_data("mainHeadings")
        return self.__name_variations

    @property
    def birth_date(self) -> str:
        """Birth date string as reported by VIAF, or None."""
        if not self.__birth_date:
            self.__birth_date = self.record_data.get("birthDate", None)
        return self.__birth_date

    @property
    def death_date(self) -> str:
        """Death date string as reported by VIAF, or None."""
        if not self.__death_date:
            self.__death_date = self.record_data.get("deathDate", None)
        return self.__death_date

    @property
    def occupations(self) -> List[str]:
        """Occupations backed by an allowed source."""
        if not self.__occupations:
            self.__occupations = self.__get_data("occupation")
        return self.__occupations

    @property
    def activity_start(self) -> str:
        """Activity start date as reported by VIAF, or None."""
        # BUGFIX: previously cached into `__birth_date`, so whichever of
        # birth_date / activity_start was read first was returned for both.
        if not self.__activity_start:
            self.__activity_start = self.record_data.get("activityStart", None)
        return self.__activity_start

    @property
    def activity_end(self) -> str:
        """Activity end date as reported by VIAF, or None."""
        # BUGFIX: see activity_start; previously cached into `__death_date`.
        if not self.__activity_end:
            self.__activity_end = self.record_data.get("activityEnd", None)
        return self.__activity_end

    @property
    def nationality(self) -> str:
        """Most frequent (lowercased) nationality among allowed sources."""
        if not self.__nationality:
            nationalities = self.__get_data("nationalityOfEntity")
            nationalities_dict = defaultdict(int)
            for n in nationalities:
                nationalities_dict[n.lower()] += 1
            if nationalities:
                # Highest count wins; ties resolved by sort stability.
                self.__nationality = sorted(
                    nationalities_dict.items(), key=lambda x: x[1], reverse=True)[0][0]
        return self.__nationality

    @property
    def all_fields(self) -> dict:
        """All extracted fields collected into a single (cached) dict."""
        if not self.__all_fields:
            self.__all_fields = {
                "viaf_id": self.viaf_id,
                "name_variations": self.name_variations,
                "birth_date": self.birth_date,
                "death_date": self.death_date,
                "occupations": self.occupations,
                "nationality": self.nationality,
                "activity_start": self.activity_start,
                "activity_end": self.activity_end,
                "has_isni": self.has_isni,
                "author": self.author
            }
        return self.__all_fields
139
+
140
+
141
+ class VIAFClient:
142
+ def __init__(self, viaf_api_url: str = "https://viaf.org/api"):
143
+ self.root_url = viaf_api_url.strip("/")
144
+ self.record_url = f"{self.root_url}/cluster-record"
145
+ self.search_url = f"{self.root_url}/search"
146
+ self.headers = {
147
+ "Accept": "application/json",
148
+ "Content-Type": "application/json"
149
+ }
150
+
151
+ def _send_request(self, url: str, data: dict) -> dict:
152
+ return requests.post(url, data=json.dumps(data), headers=self.headers)
153
+
154
+ def get_records_by_search_term(self,
155
+ search_term: str,
156
+ index: str = "viaf",
157
+ field: str = "local.names",
158
+ page_index: int = 0,
159
+ page_size: int = 50
160
+ ) -> dict:
161
+ data = {
162
+ "reqValues": {
163
+ "field": field,
164
+ "index": index,
165
+ "searchTerms": search_term
166
+ },
167
+ "meta": {
168
+ "env": "prod",
169
+ "pageIndex": page_index,
170
+ "pageSize": page_size
171
+ }
172
+ }
173
+ response = self._send_request(url=self.search_url, data=data)
174
+ return response
175
+
176
+ def get_records_by_viaf_id(self, record_id: str) -> dict:
177
+ data = {
178
+ "reqValues": {
179
+ "recordId": str(record_id)
180
+ }
181
+ }
182
+ response = self._send_request(url=self.record_url, data=data)
183
+
184
+ return response
185
+
186
+ def fetch_viaf_clusters(self, viaf_ids):
187
+
188
+ results = {}
189
+
190
+ for viaf_id in viaf_ids:
191
+ try:
192
+ response = self.get_records_by_viaf_id(viaf_id)
193
+ response.raise_for_status()
194
+ results[viaf_id] = response.json()
195
+ except requests.exceptions.RequestException as e:
196
+ logger.error(f"Error fetching VIAF record {viaf_id}: {e}")
197
+ results[viaf_id] = {}
198
+
199
+ return results
200
+
201
+ def get_normalized_data(self, record_ids: List[str]) -> List[VIAFRecord]:
202
+ """ Fetch data required for normalization from VIAF. """
203
+ response = self.fetch_viaf_clusters(record_ids)
204
+ return [VIAFRecord(response[record_id]) for record_id in record_ids]
@@ -0,0 +1,50 @@
1
+ from typing import List, NoReturn
2
+ from pymarc.record import Record
3
+ from pymarc import MARCReader
4
+ from abc import abstractmethod
5
+ from collections.abc import Iterator, Iterable
6
+ import jsonlines
7
+
8
+
9
class BaseMARCParser:
    """ Base class for MARC parsers.

    Subclasses implement `record_generator()` and yield the final,
    JSON-serializable payload of each parsed record.
    """
    def __init__(self,
                 marc_file_path: str,
                 add_variations: bool = True
                 ) -> None:
        """ Initializes BaseMARCParser object.

        Parameters
        -----------
        marc_file_path: str
            Full path to .mrc file containing EMS data.
        add_variations: bool
            If enabled, constructs an additional variations field, which
            combines the content of multiple fields + adds some generated
            variations. If the output is uploaded into Elastic and used
            via rara-norm-linker, it is necessary to enable this.
        """
        # NB! Return annotations fixed from `NoReturn` to `None` across the
        # class: NoReturn means "never returns" (always raises/loops),
        # which is not the case for any of these methods.
        self.add_variations = add_variations
        self.marc_file_path = marc_file_path

    def _write_line(self, line: dict, file_path: str) -> None:
        """Appends a single JSON line to `file_path`."""
        with jsonlines.open(file_path, "a") as f:
            f.write(line)

    def marc_record_generator(self) -> Iterator[Record]:
        """ Generates pymarc.record.Record objects.

        Records the reader could not parse (falsy entries) are skipped.
        """
        with open(self.marc_file_path, "rb") as fh:
            reader = MARCReader(fh)
            for record in reader:
                if record:
                    yield record

    @abstractmethod
    def record_generator(self) -> Iterator:
        """Yields fully parsed record payloads; implemented by subclasses."""
        pass

    def save_as_jl(self, jl_file_path: str) -> None:
        """ Serializes all parsed records into a JSON-lines file.
        """
        for record in self.record_generator():
            # BUGFIX: subclass generators already yield `<X>Record.full_record`
            # payloads, so accessing `.full_record` again raised
            # AttributeError. Write the yielded value directly.
            self._write_line(record, jl_file_path)
@@ -0,0 +1,49 @@
1
+ from typing import NoReturn
2
+ from collections.abc import Iterator
3
+ from rara_tools.parsers.marc_parsers.base_parser import BaseMARCParser
4
+ from rara_tools.parsers.marc_records.ems_record import EMSRecord
5
+ from rara_tools.constants.parsers import KeywordType, LOGGER
6
+
7
+
8
class EMSMARCParser(BaseMARCParser):
    """ MARC parser for EMS .mrc files.

    Yields topic, genre and time keywords; location keywords are skipped
    here and handled by a separate parser.
    """
    def __init__(self,
                 marc_file_path: str,
                 add_variations: bool = True
                 ) -> None:
        """ Initializes EMSMARCParser object.

        Parameters
        -----------
        marc_file_path: str
            Full path to .mrc file containing EMS data.
        add_variations: bool
            If enabled, constructs an additional variations field, which
            combines the content of multiple fields + adds some generated
            variations. If the output is uploaded into Elastic and used
            via rara-norm-linker, it is necessary to enable this.
        """
        super().__init__(
            marc_file_path=marc_file_path,
            add_variations=add_variations
        )

    def record_generator(self) -> Iterator:
        """ Generates records for topic, genre, and time keywords.
        Location keywords are ignored here and handled in a separate parser.

        NB! Yields `EMSRecord.full_record` payloads, not EMSRecord
        instances — the original `Iterator[EMSRecord]` annotation was
        misleading.
        """
        LOGGER.info(
            f"Generating EMS records (without location keywords) " \
            f"from MARC dump '{self.marc_file_path}'."
        )
        for record in self.marc_record_generator():
            ems_record = EMSRecord(
                record=record,
                add_variations=self.add_variations
            )
            # Skip location keywords; everything else is yielded
            # (flattened from the original `continue`/`else` pair).
            if ems_record.keyword_type != KeywordType.LOC:
                yield ems_record.full_record
@@ -0,0 +1,46 @@
1
+ from typing import NoReturn
2
+ from collections.abc import Iterator
3
+ from rara_tools.parsers.marc_parsers.base_parser import BaseMARCParser
4
+ from rara_tools.parsers.marc_records.ems_record import EMSRecord
5
+ from rara_tools.constants.parsers import KeywordType, LOGGER
6
+
7
+
8
class LocationMARCParser(BaseMARCParser):
    """ MARC parser extracting location keywords from EMS .mrc files.
    """
    def __init__(self,
                 marc_file_path: str,
                 add_variations: bool = True
                 ) -> None:
        """ Initializes LocationMARCParser object.

        Parameters
        -----------
        marc_file_path: str
            Full path to .mrc file containing EMS data.
            (Typo "marc_file_pasth" fixed in the docs.)
        add_variations: bool
            If enabled, constructs an additional variations field, which
            combines the content of multiple fields + adds some generated
            variations. If the output is uploaded into Elastic and used
            via rara-norm-linker, it is necessary to enable this.
        """

        super().__init__(
            marc_file_path=marc_file_path,
            add_variations=add_variations
        )

    def record_generator(self) -> Iterator:
        """ Generates records for location keywords only.

        NB! Yields `EMSRecord.full_record` payloads, not EMSRecord
        instances — the original `Iterator[EMSRecord]` annotation was
        misleading.
        """
        LOGGER.info(
            f"Generating EMS-based location records " \
            f"from MARC dump '{self.marc_file_path}'."
        )
        for record in self.marc_record_generator():
            ems_record = EMSRecord(
                record=record,
                add_variations=self.add_variations
            )
            if ems_record.keyword_type == KeywordType.LOC:
                yield ems_record.full_record
@@ -0,0 +1,44 @@
1
+ from typing import NoReturn
2
+ from collections.abc import Iterator
3
+ from rara_tools.parsers.marc_parsers.base_parser import BaseMARCParser
4
+ from rara_tools.parsers.marc_records.organization_record import OrganizationRecord
5
+ from rara_tools.constants.parsers import LOGGER
6
+
7
+
8
class OrganizationsMARCParser(BaseMARCParser):
    """ MARC parser for organizations' .mrc files.

    Wraps every raw MARC record into an OrganizationRecord and yields its
    final payload.
    """
    def __init__(self,
                 marc_file_path: str,
                 add_variations: bool = True
                 ) -> NoReturn:
        """ Initializes OrganizationsMARCParser object.

        Parameters
        -----------
        marc_file_path: str
            Full path to .mrc file containing organizations' data.
        add_variations: bool
            If enabled, constructs an additional variations field, which
            combines the content of multiple fields + adds some generated
            variations. If the output is uploaded into Elastic and used
            via rara-norm-linker, it is necessary to enable this.
        """
        super().__init__(
            marc_file_path=marc_file_path,
            add_variations=add_variations
        )

    def record_generator(self) -> Iterator[OrganizationRecord]:
        """ Generates OrganizationRecord payloads from the MARC dump.
        """
        LOGGER.info(
            f"Generating organization records from MARC dump "
            f"'{self.marc_file_path}'."
        )
        for raw_record in self.marc_record_generator():
            yield OrganizationRecord(
                record=raw_record,
                add_variations=self.add_variations
            ).full_record
@@ -0,0 +1,45 @@
1
+ from typing import NoReturn, List
2
+ from collections.abc import Iterator
3
+ from rara_tools.parsers.marc_parsers.base_parser import BaseMARCParser
4
+ from rara_tools.parsers.marc_records.person_record import PersonRecord
5
+ from rara_tools.constants.parsers import LOGGER
6
+
7
+
8
class PersonsMARCParser(BaseMARCParser):
    """ MARC parser for persons' .mrc files.

    Wraps every raw MARC record into a PersonRecord and yields its final
    payload.
    """
    def __init__(self,
                 marc_file_path: str,
                 add_variations: bool = True
                 ) -> NoReturn:
        """ Initializes PersonsMARCParser object.

        Parameters
        -----------
        marc_file_path: str
            Full path to .mrc file containing persons' data.
        add_variations: bool
            If enabled, constructs an additional variations field, which
            combines the content of multiple fields + adds some generated
            variations. If the output is uploaded into Elastic and used
            via rara-norm-linker, it is necessary to enable this.
        """

        super().__init__(
            marc_file_path=marc_file_path,
            add_variations=add_variations
        )

    def record_generator(self) -> Iterator[PersonRecord]:
        """ Generates PersonRecord payloads from the MARC dump.
        """
        LOGGER.info(
            f"Generating person records from MARC dump "
            f"'{self.marc_file_path}'."
        )
        for raw_record in self.marc_record_generator():
            yield PersonRecord(
                record=raw_record,
                add_variations=self.add_variations
            ).full_record
@@ -0,0 +1 @@
1
+ # Coming soon
@@ -0,0 +1,112 @@
1
+ from typing import List, NoReturn, Tuple
2
+ from abc import abstractmethod
3
+ from pymarc.record import Record
4
+ from rara_tools.constants.parsers import GeneralMarcIDs
5
+
6
+
7
class BaseRecord:
    """ Implements general logic of parsing MARC files.

    Wraps a pymarc Record and provides generic field/subfield extraction
    (`get_values`) plus lazily cached access to the record's identifier
    and identifier source.
    """
    def __init__(self, record: Record, add_variations: bool = False) -> NoReturn:
        """ Initializes BaseRecord object.

        Parameters
        -----------
        record: Record
            pymarc.record.Record object.
        add_variations: bool
            If enabled, constructs an additional variations field, which
            combines the content of multiple fields + adds some generated
            variations. If the output is uploaded into Elastic and used
            via rara-norm-linker, it is necessary to enable this.
        """
        self.add_variations: bool = add_variations
        self.__record_mrc: Record = record
        # MARC-in-JSON "fields" is a list of single-key dicts
        # ({field_id: content}), not a dict — annotation fixed accordingly.
        self.__record_dict: List[dict] = record.as_dict()["fields"]

        # MARC field IDs holding the record identifier and its source.
        self.__id_field_id: List[str] = GeneralMarcIDs.ID
        self.__id_source_field_id: List[str] = GeneralMarcIDs.ID_SOURCE

        # Lazily populated caches (filled on first property access).
        self.__identifier: str = ""
        self.__identifier_source: str = ""

    def get_values(self,
                   marc_ids: List[str],
                   subfield_id: str | List[str] = "",
                   ind1: str = " ",
                   ind2: str = " ",
                   subfield_restriction: Tuple[str, str] = (),
                   subfield_to_ignore: str | None = None
                   ) -> List[str] | List[dict]:
        """ Extracts values from fields whose ID is in `marc_ids`.

        Parameters
        -----------
        marc_ids: List[str]
            MARC field IDs to extract values from.
        subfield_id: str | List[str]
            If empty, whole field contents are returned. If a single
            subfield code (str), a flat list of matching subfield values
            is returned. If a list of codes, one {code: value} dict is
            returned per matching field.
        ind1: str
            Currently unused — see the TODO below.
        ind2: str
            Required value of the field's second indicator.
        subfield_restriction: Tuple[str, str]
            If given, a (code, value) pair that must be present among the
            field's subfields; otherwise the field is skipped.
        subfield_to_ignore: str | None
            If given, fields containing this subfield code are skipped.

        Returns
        -----------
        List[str] | List[dict]
            Flat values for a str `subfield_id`, dicts for a list one.
        """
        values = []

        for field in self.dict_record:
            # Each field is a single-key dict: {field_id: content}.
            field_id = list(field.keys())[0]
            if field_id in marc_ids:
                # TODO: ind1!
                if not subfield_id:
                    # No subfield filtering requested: return raw content.
                    values.append(field[field_id])
                else:
                    if field[field_id]["ind2"] == ind2:
                        subfields = field[field_id]["subfields"]
                        subfield_tuples = [list(subfield.items())[0] for subfield in subfields]
                        subfield_keys = [list(subfield.keys())[0] for subfield in subfields]
                        if subfield_restriction and subfield_restriction not in subfield_tuples:
                            continue
                        if subfield_to_ignore and subfield_to_ignore in subfield_keys:
                            continue
                        _value = {}
                        for subfield in subfields:
                            _subfield_id = list(subfield.keys())[0]
                            if isinstance(subfield_id, str):
                                # Single code: collect matching values flat.
                                if _subfield_id == subfield_id:
                                    value = subfield[_subfield_id]
                                    values.append(value)
                            elif isinstance(subfield_id, list):
                                # Multiple codes: collect into one dict
                                # per field. NOTE(review): a later
                                # duplicate code overwrites an earlier
                                # one within the same field — confirm
                                # this is intended.
                            if _subfield_id in subfield_id:
                                    value = subfield[_subfield_id]
                                    _value[_subfield_id] = value
                        if isinstance(subfield_id, list):
                            # NOTE(review): appended even when empty — the
                            # result list may contain {} entries; confirm
                            # callers expect this.
                            values.append(_value)

        return values

    def _clean_value(self, value: str) -> str:
        """Strips surrounding punctuation (dots, commas) and whitespace."""
        cleaned_value = value.strip("., ")
        return cleaned_value

    def _merge_and_clean(self, value: dict, keys: List[str]) -> str:
        """Joins the cleaned values of `keys` (in order) with spaces."""
        _merged = []
        for key in keys:
            _value = self._clean_value(value.get(key, ""))
            if _value:
                _merged.append(_value)
        merged = " ".join(_merged)
        return merged


    @property
    def identifier(self) -> str:
        """First value of the identifier field ("" when absent); cached."""
        if not self.__identifier:
            values = self.get_values(marc_ids=self.__id_field_id)
            self.__identifier = values[0] if values else ""
        return self.__identifier

    @property
    def identifier_source(self) -> str:
        """First value of the identifier-source field ("" when absent); cached."""
        if not self.__identifier_source:
            values = self.get_values(marc_ids=self.__id_source_field_id)
            self.__identifier_source = values[0] if values else ""
        return self.__identifier_source

    @property
    def marc_record(self) -> Record:
        """The wrapped pymarc Record object."""
        return self.__record_mrc

    @property
    def marc_json_record(self) -> dict:
        """The full record converted to a MARC-in-JSON dict."""
        return self.marc_record.as_dict()

    @property
    def dict_record(self) -> List[dict]:
        """The record's "fields" list (annotation fixed from `Record`)."""
        return self.__record_dict