PyPI - rara-tools - Versions diffs - 0.7.8__tar.gz → 0.7.10__tar.gz - Mend

rara-tools 0.7.8__tar.gz → 0.7.10__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of rara-tools might be problematic. Click here for more details.

Files changed (68) hide show

{rara_tools-0.7.8/rara_tools.egg-info → rara_tools-0.7.10}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: rara-tools
-Version: 0.7.8
+Version: 0.7.10
 Summary: Tools to support Kata's work.
 Classifier: Programming Language :: Python :: 3
 Classifier: Programming Language :: Python :: 3.10

rara_tools-0.7.10/VERSION ADDED Viewed

	@@ -0,0 +1 @@
1	+ 0.7.10

{rara_tools-0.7.8 → rara_tools-0.7.10}/rara_tools/constants/normalizers.py RENAMED Viewed

@@ -1,5 +1,7 @@
 from pymarc import Indicators
-import os
+YYMMDD_FORMAT = "%y%m%d"
+YY_DD_FORMAT = "%Y-%m"
 class EntityType:
     PER = "PER"

{rara_tools-0.7.8 → rara_tools-0.7.10}/rara_tools/normalizers/authorities.py RENAMED Viewed

@@ -11,13 +11,15 @@ class AuthoritiesRecordNormalizer(RecordNormalizer):
     """ Normalize authorities records """
     def __init__(self, linking_results: List[dict] = [], sierra_data: List[dict] = [],
-                 ALLOW_EDIT_FIELDS: List[str] = [
-                     "667", "925", "043"],
-                 REPEATABLE_FIELDS: List[str] = ["024", "035", "400", "670"]):
+                 ALLOW_EDIT_FIELDS: List[str] = ["008", "925"],
+                 REPEATABLE_FIELDS: List[str] = ["024", "035", "400", "670", "667"]):
         super().__init__(linking_results, sierra_data)
         self.ALLOW_EDIT_FIELDS = ALLOW_EDIT_FIELDS
         self.REPEATABLE_FIELDS = REPEATABLE_FIELDS
+        self.records_extra_data = []
+        self.sierra_data = sierra_data
+        self.records = self._setup_records(linking_results, sierra_data)
     def _normalize_sierra(self, record: Record, sierraID: str) -> None:
@@ -26,7 +28,6 @@ class AuthoritiesRecordNormalizer(RecordNormalizer):
         fields = [
             Field(
                 tag="008",
-                indicators=EMPTY_INDICATORS,
                 data=f"{self.current_timestamp()}{suffix_008}"
             ),
@@ -50,22 +51,24 @@ class AuthoritiesRecordNormalizer(RecordNormalizer):
         return record
     def _add_birth_and_death_dates(self, record: Record, viaf_record: VIAFRecord) -> None:
+        formatted_birth_date = self._format_date(viaf_record.birth_date)
+        formatted_death_date = self._format_date(viaf_record.death_date) if viaf_record.death_date != 0 else ""
+        birth_date = self.get_subfield(
+            record, "046", "f", formatted_birth_date)
+        death_date = self.get_subfield(
+            record, "046", "g", formatted_death_date)
         subfields_046 = [
-            Subfield("f", self.get_subfield(
-                record, "046", "f", viaf_record.birth_date)),
-            Subfield("g", self.get_subfield(
-                record, "046", "g", viaf_record.death_date)),
-            Subfield("s", self.get_subfield(
-                record, "046", "s", viaf_record.activity_start)),
-            Subfield("t", self.get_subfield(
-                record, "046", "t", viaf_record.activity_end)),
+            Subfield("f", birth_date),
+            Subfield("g", death_date),
         ]
         self._add_fields_to_record(
             record, [Field(tag="046", indicators=EMPTY_INDICATORS, subfields=subfields_046)])
-    def _add_viaf_url_and_isni(self, record: Record, viaf_record: VIAFRecord) -> None:
-        # TODO 024. will be used to store KRATT KATA ID. Just generate one?
+    def _add_viaf_url_or_isni(self, record: Record, viaf_record: VIAFRecord) -> None:
         viaf_url = f"https://viaf.org/viaf/{viaf_record.viaf_id}"
         subfields = [Subfield("0", self.get_subfield(
@@ -80,17 +83,20 @@ class AuthoritiesRecordNormalizer(RecordNormalizer):
         self._add_fields_to_record(record, [field])
     def _add_nationality(self, record: Record, viaf_record: VIAFRecord) -> None:
+        """ Non-repeatable field 043 - adds ee only if is estonian nationality and
+        the records does not have the field already."""
+        is_person_est = self._is_person_est_nationality(viaf_record)
+        if is_person_est:
+            fields = [
+                Field(
+                    tag="043",
+                    indicators=EMPTY_INDICATORS,
+                    subfields=[Subfield("c", "ee")])
+                ]
-        fields = [
-            Field(
-                tag="043",
-                indicators=EMPTY_INDICATORS,
-                subfields=[
-                    Subfield("c", "ee")
-                ] if self._is_person_est_nationality(viaf_record) else []
-            )]
-        self._add_fields_to_record(record, fields)
+            self._add_fields_to_record(record, fields)
     def _normalize_viaf(self, record: Record, viaf_record: VIAFRecord) -> None:
         """"
@@ -102,18 +108,17 @@ class AuthoritiesRecordNormalizer(RecordNormalizer):
         100, 110, 111 - non-repeatable field, attempts to add author type, if missing.
         """
-        # TODO: include KRATT KATA ID to 024 and remove on delete. Increment last elastic ID?
         if not viaf_record:
             return
         self._add_nationality(record, viaf_record)
-        self._add_viaf_url_and_isni(record, viaf_record)
+        self._add_viaf_url_or_isni(record, viaf_record)
         self._add_birth_and_death_dates(record, viaf_record)
         self._add_author(record, viaf_record)
     def _normalize_record(self, record: Record, sierraID: str,
                           viaf_record: VIAFRecord, is_editing_existing_record: bool) -> Record:
         self._normalize_sierra(record, sierraID)
         self._normalize_viaf(record, viaf_record)

{rara_tools-0.7.8 → rara_tools-0.7.10}/rara_tools/normalizers/base.py RENAMED Viewed

@@ -7,9 +7,12 @@ from rara_tools.normalizers.viaf import VIAFRecord, VIAFClient
 from rara_tools.constants.normalizers import (
     DEFAULT_VIAF_FIELD, ALLOWED_VIAF_FIELDS, ALLOWED_VIAF_WIKILINK_LANGS,
     VIAF_SIMILARITY_THRESHOLD, VERIFY_VIAF_RECORD, MAX_VIAF_RECORDS_TO_VERIFY,
-    EMPTY_INDICATORS
+    EMPTY_INDICATORS, YYMMDD_FORMAT, YY_DD_FORMAT
 )
 from glom import glom
+from dateutil import parser
+from datetime import date
 import logging
 import json
@@ -18,7 +21,7 @@ logger = logging.getLogger(__name__)
 class RecordNormalizer:
     """
-    Base class. For normalizing different record types corresponding classes have been created.
+    Base class for normalizing different record types corresponding classes have been created.
     By default existing record fields will not be changed, unless included in ALLOW_EDIT_FIELDS. If a field
     included in the normalization is not present, it will be added to the record. If under REPEATABLE_FIELDS.
     a new record field is added.
@@ -30,72 +33,15 @@ class RecordNormalizer:
     """
     def __init__(self, linking_results: List[dict] = [], sierra_data: List[dict] = [],
-                 ALLOW_EDIT_FIELDS: List[str] = ["667", "925"], REPEATABLE_FIELDS: List[str] = []):
+                 ALLOW_EDIT_FIELDS: List[str] = ["925"], REPEATABLE_FIELDS: List[str] = ["667"]):
         # Include, if will replace existing field
         self.ALLOW_EDIT_FIELDS = ALLOW_EDIT_FIELDS
         # include, if should be added alongside existing fields
         self.REPEATABLE_FIELDS = REPEATABLE_FIELDS
-        self.records_extra_data = []
-        self.records = self._setup_records(linking_results, sierra_data)
-        self.sierra_data = sierra_data
-    def _setup_records(self, linking_results: List[dict], sierra_data: List[dict]) -> JSONReader:
-        """Setup initial MARC records and data.
-        For linked entities:
-            1. Try to get single linked normalized record from KATA elastic. If more than one found, skip.
-            2. If 0 matches, search from VIAF and if 1 result found, create a new authority record from the data.
-            3. If none or more than one responses found, use only Classificator data (coming from Linker?).
-        for SIERRA records: normalize.
-        """
-        linked_records = []
-        for linked in linking_results:
-            entity = linked.get("original_entity")
-            try:
-                linked_info = linked.get("linked_info", [])
-                linked_num = len(linked_info)
-                if not linked_info:
-                    # new record will be created
-                    logger.info(
-                        f"No linked entities found for {entity}")
-                    continue
-                if linked_num == 1:
-                    linked = linked_info[0]
-                    linked_records.append(linked.get("json", {}))
-                    self.records_extra_data.append({
-                        "entity": entity,
-                        "viaf": linked.get("viaf", {}),
-                        "type": "linked",
-                        "edited": True
-                    })
-                else:
-                    # new record will be created
-                    logger.info(
-                        f"Multiple linked entities found for {entity}")
-            except Exception as e:
-                logger.error(f"Error processing entity {entity}: {e}")
-        self.records_extra_data.extend(
-            {
-                "sierraID": obj.get("sierraID"),
-                "type": "sierra",
-                "edited": True
-            }
-            for obj in (sierra_data or [])
-        )
-        all_records = linked_records + (sierra_data or [])
-        return SafeJSONReader(
-            json.dumps(all_records, ensure_ascii=False),
-        )
+        # leader applied to new records
+        self.DEFAULT_LEADER = "01682nz  a2200349n  4500" # must be 24 digits
     def _setup_records(self, linking_results: List[dict], sierra_data: List[dict]) -> JSONReader:
         """Setup initial MARC records and data.
@@ -114,9 +60,34 @@ class RecordNormalizer:
             linked_info = linked.get("linked_info", [])
             if not isinstance(linked_info, list) or not linked_info:
+                # No linked entities found, create new record
+                logger.info(
+                    f"No linked entities found for {entity}, Creating new record.")
+                linked_records.append({
+                    "leader": self.DEFAULT_LEADER,
+                    "fields": []
+                })
+                self.records_extra_data.append({
+                    "entity": entity,
+                    "edited": False
+                })
                 continue
+            if len(linked_info) > 1:
+                # Multiple linked entities found, create new record
+                logger.info(
+                    f"Multiple linked entities found for {entity}. Creating new record.")
+                linked_records.append({
+                    "leader": self.DEFAULT_LEADER,
+                    "fields": []
+                })
+                self.records_extra_data.append({
+                    "entity": entity,
+                    "edited": False
+                })
+                continue
-            if len(linked_info) == 1:
+            elif len(linked_info) == 1:
                 linked_item = linked_info[0]
                 if not isinstance(linked_item, dict):
                     continue
@@ -128,7 +99,8 @@ class RecordNormalizer:
                     "type": "linked",
                     "edited": True
                 })
+                continue
         self.records_extra_data.extend(
             {
                 "sierraID": obj.get("sierraID"),
@@ -138,25 +110,25 @@ class RecordNormalizer:
             for obj in (sierra_data or [])
             if isinstance(obj, dict)
         )
         all_records = linked_records + (sierra_data or [])
         return SafeJSONReader(json.dumps(all_records, ensure_ascii=False))
     @staticmethod
     def current_timestamp():
-        """6 digit timestamp."""
-        return datetime.now().strftime("%H%M%S")
+        """6 digit timestamp, format YYMMDD"""
+        return datetime.now().strftime(YYMMDD_FORMAT)
     @staticmethod
     def current_yyyy_dd():
         """format of 2025-03"""
-        return datetime.now().strftime("%Y-%m")
+        return datetime.now().strftime(YY_DD_FORMAT)
     @staticmethod
     def _is_person_est_nationality(viaf_record: VIAFRecord) -> bool:
-        return viaf_record.nationality == "ee"
+        return hasattr(viaf_record, 'nationality') and viaf_record.nationality == "ee"
     def _is_nxx(self, field: Field, n: str):
         """ Check if fields tag is in nxx range. """
         return field.tag.startswith(n)
@@ -173,6 +145,27 @@ class RecordNormalizer:
     def _filter_equivalent_field_not_in_record(self, record: Record, fields: List[Field]) -> bool:
         """ filter out fields, that do not have an equivalent in the record. """
         return filter(lambda field: not self._field_in_record(field, record), fields)
+    def _format_date(self, value: str) -> str:
+        if not value:
+            return ""
+        if isinstance(value, (datetime, date)):
+            return value.strftime("%Y%m%d")
+        val = str(value).strip()
+        try:
+            dt = parser.parse(val, fuzzy=False, default=datetime(1, 1, 1))
+        except Exception:
+            return ""
+        if len(val) == 4 and val.isdigit():
+            return dt.strftime("%Y")  # YYYY
+        if len(val) in (6, 7):  # YYYYMM or YYYY-MM
+            return dt.strftime("%Y%m")  # YYYYMM
+        return dt.strftime("%Y%m%d")  # YYYYMMDD
     def get_subfield(self, record: Record, tag: str, subfield: str, default: str) -> str:
         """ get record existing subfield value or assign a fallback value. """
@@ -220,7 +213,10 @@ class RecordNormalizer:
         )
     def _add_fields_to_record(self, record: Record, fields: List[Field]) -> Record:
+        # filter out subfields that are empty, or 0, as VIAF returns 0 for unknown dates
+        for field in fields:
+            field.subfields = [sub for sub in field.subfields if sub.value and sub.value not in ["0", 0]]
         self._handle_repeatable_fields(record, *fields)
         self._handle_editable_fields(record, *fields)
         self._handle_default_fields(record, *fields)
@@ -247,31 +243,63 @@ class RecordNormalizer:
                 indicators=EMPTY_INDICATORS,
                 subfields=[
                     Subfield("a", viaf_record.name),
-                    Subfield("b", viaf_record.name_type), # Is this correct??
-                    Subfield("c", viaf_record.name_type)  # Is this correct??
                 ]
             )
         ]
         self._add_fields_to_record(record, fields)
+    def _move680_fields_to_667(self, record: Record) -> None:
+        """ Move existing 680 fields to 667, if any. """
+        fields_680 = record.get_fields("680")
+        if not fields_680:
+            return
+        fields_667 = [
+            Field(
+                tag="667",
+                indicators=EMPTY_INDICATORS,
+                subfields=field.subfields
+            ) for field in fields_680
+        ]
-    def _normalize_common(self, record: Record, is_editing_existing_record: bool) -> None:
-        """Common logic for all normalizations. """
+        record.remove_fields("680")
+        self._add_fields_to_record(record, fields_667)
+    def _normalize_common(self, record: Record, is_editing_existing_record: bool) -> None:
+        """Common logic for all normalizations.
+        - Includes note about record being created/edited.
+        - include date note with a different subfield, depending on if record is new or edited.
+        - move existing 680 fields to 667
+        """
+        # before adding new notes
+        self._move680_fields_to_667(record)
         note = "Muudetud AI poolt" if is_editing_existing_record else "Loodud AI poolt"
         date_note = f"KRATT {self.current_yyyy_dd()}"
-        fields = [
-            Field(tag="667",
-                  indicators=EMPTY_INDICATORS,
-                  subfields=[Subfield("a", note)]),
-            Field(tag="925",
-                  indicators=EMPTY_INDICATORS,
-                  subfields=[Subfield("t", self.get_subfield(record, "925", "t", date_note))
-                             ] + ([Subfield("p", self.get_subfield(record, "925", "p", date_note))]
-                                  if is_editing_existing_record else []))
-        ]
+        field_667 = Field(tag="667",
+                          indicators=EMPTY_INDICATORS,
+                          subfields=[Subfield("a", note)])
+        fields = [field_667]
+        if is_editing_existing_record:
+            field_925 = Field(tag="925",
+                              indicators=EMPTY_INDICATORS,
+                              subfields=[
+                                  Subfield("p", self.get_subfield(record, "925", "p", date_note))
+                              ])
+            fields.append(field_925)
+        else:
+            field_925 = Field(tag="925",
+                              indicators=EMPTY_INDICATORS,
+                              subfields=[
+                                  Subfield("t", self.get_subfield(record, "925", "t", date_note))
+                              ])
+            fields.append(field_925)
         self._add_fields_to_record(record, fields)
         return record

{rara_tools-0.7.8 → rara_tools-0.7.10}/rara_tools/normalizers/bibs.py RENAMED Viewed

@@ -1,4 +1,4 @@
-from pymarc import (Field, Indicators, Subfield, Record)
+from pymarc import (Field, Subfield, Record)
 from rara_tools.constants import EMPTY_INDICATORS
 from rara_tools.normalizers.viaf import VIAFRecord
@@ -11,32 +11,25 @@ class BibRecordNormalizer(RecordNormalizer):
     """ Normalize bib records. """
     def __init__(self, linking_results: List[dict] = [], sierra_data: List[dict] = [],
-                 ALLOW_EDIT_FIELDS: List[str] = ["667", "925"],
-                 REPEATABLE_FIELDS: List[str] = []):
+                 ALLOW_EDIT_FIELDS: List[str] = ["008", "925"],
+                 REPEATABLE_FIELDS: List[str] = ["667"]):
         super().__init__(linking_results, sierra_data)
+        self.DEFAULT_LEADER = "00399nz  a2200145n  4500" # must be 24 digits
         self.ALLOW_EDIT_FIELDS = ALLOW_EDIT_FIELDS
         self.REPEATABLE_FIELDS = REPEATABLE_FIELDS
+        self.records_extra_data = []
+        self.sierra_data = sierra_data
+        self.records = self._setup_records(linking_results, sierra_data)
     def _normalize_sierra(self, record: Record) -> Record:
+        suffix_008 = "|||aznnnaabn          || |||      "
         fields = [
             Field(
                 tag="008",
-                indicators=EMPTY_INDICATORS,
-                data=f"{self.current_timestamp()} | | | aznnnaabn | | | | |"
-            ),
-            Field(
-                tag="046",
-                indicators=EMPTY_INDICATORS,
-                subfields=[
-                    Subfield("k", "Pub date")
-                ]
-            ),
-            Field(
-                tag="245",
-                indicators=Indicators("1", "0"),
-                subfields=[
-                    Subfield("a", "Title")
-                ]
+                data=f"{self.current_timestamp()}{suffix_008}"
             ),
         ]
@@ -55,14 +48,8 @@ class BibRecordNormalizer(RecordNormalizer):
                 subfields=[
                     Subfield("a", viaf_id)
                 ]
-            ),
-            Field(
-                tag="100",
-                indicators=EMPTY_INDICATORS,
-                subfields=[
-                    Subfield("a", "?")
-                ]
-            )]
+            )
+        ]
         self._add_fields_to_record(record, fields)
         self._add_author(record, viaf_record)

{rara_tools-0.7.8 → rara_tools-0.7.10}/rara_tools/normalizers/reader.py RENAMED Viewed

@@ -3,7 +3,7 @@ import logging
 logger = logging.getLogger(__name__)
-DEFAULT_LEADER = '01682nz a2200349n 4500'
+DEFAULT_LEADER = "01682nz  a2200349n  4500" # must be 24 digits
 class SafeJSONReader(JSONReader):

{rara_tools-0.7.8 → rara_tools-0.7.10}/rara_tools/normalizers/viaf.py RENAMED Viewed

@@ -10,6 +10,7 @@ from rara_tools.constants.normalizers import (
     DEFAULT_VIAF_FIELD, ALLOWED_VIAF_FIELDS, ALLOWED_VIAF_WIKILINK_LANGS,
     VIAF_SIMILARITY_THRESHOLD, VIAF_ALLOWED_SOURCES
 )
+from glom import glom
 import logging
 logger = logging.getLogger(__name__)
@@ -598,11 +599,13 @@ class VIAFClient:
         """
         logger.debug("Extracting VIAF IDs from VIAF search query results.")
         try:
-            records = search_query_response.json()["queryResult"]["records"]["record"]
+            res_json = search_query_response.json()
+            records = glom(res_json, "queryResult.records.record", default=[])
         except Exception as e:
             logger.error(
                 f"Parsing records from search query " \
-                f"response failed with error: {e}."
+                f"failed with error: {e}."
             )
             records = []
         viaf_ids = []

{rara_tools-0.7.8 → rara_tools-0.7.10/rara_tools.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: rara-tools
-Version: 0.7.8
+Version: 0.7.10
 Summary: Tools to support Kata's work.
 Classifier: Programming Language :: Python :: 3
 Classifier: Programming Language :: Python :: 3.10

rara-tools 0.7.8__tar.gz → 0.7.10__tar.gz

Potentially problematic release.

rara-tools 0.7.8__tar.gz → 0.7.10__tar.gz