PyPI - rara-tools - Versions diffs - 0.7.16__tar.gz → 0.7.18__tar.gz - Mend

rara-tools 0.7.16.tar.gz → 0.7.18.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of rara-tools might be problematic. Click here for more details.

Files changed (69)

{rara_tools-0.7.16/rara_tools.egg-info → rara_tools-0.7.18}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: rara-tools
-Version: 0.7.16
+Version: 0.7.18
 Summary: Tools to support Kata's work.
 Classifier: Programming Language :: Python :: 3
 Classifier: Programming Language :: Python :: 3.10

rara_tools-0.7.18/VERSION ADDED Viewed

	@@ -0,0 +1 @@
1	+ 0.7.18

{rara_tools-0.7.16 → rara_tools-0.7.18}/rara_tools/digar_schema_converter.py RENAMED Viewed

@@ -77,7 +77,7 @@ class PageSchema:
             self.__schema = {
                 "@type": "CreativeWork",  # CONSTANT for pages
                 "@id": self.page_id,
-                "hasPart": []
+                "dcterms:hasPart": []
             }
             text_schemas = [
                 TextPageSchema(page).schema
@@ -91,7 +91,7 @@ class PageSchema:
             page_schemas = text_schemas + image_schemas
             page_schemas_with_ids = self._add_segment_ids(page_schemas)
-            self.__schema["hasPart"].extend(page_schemas_with_ids)
+            self.__schema["dcterms:hasPart"].extend(page_schemas_with_ids)
         return self.__schema

{rara_tools-0.7.16 → rara_tools-0.7.18}/rara_tools/normalizers/base.py RENAMED Viewed

@@ -3,6 +3,8 @@ from pymarc import (Field, Subfield, JSONReader, Record)
 from typing import List, Optional, Iterator
 from rara_tools.normalizers.reader import SafeJSONReader
+from rara_tools.parsers.tools.validators import filter_names
 from rara_tools.normalizers.viaf import VIAFRecord, VIAFClient
 from rara_tools.constants.normalizers import (
     DEFAULT_VIAF_FIELD, ALLOWED_VIAF_FIELDS, ALLOWED_VIAF_WIKILINK_LANGS,
@@ -311,25 +313,34 @@ class RecordNormalizer:
         if viaf_record:
             self._include_name_variations(record, viaf_record)
-    def _include_name_variations(self, record: Record, viaf_record: VIAFRecord) -> None:
+    def _include_name_variations(self, record: Record, viaf_record: VIAFRecord, filter_variations=True) -> None:
         """ Include name variations from VIAF record as 400|t fields """
         if not viaf_record or not viaf_record.name_variations:
             return
         existing_name_variations = record.get_fields("400")
-        existing_variations = [sf.value for field in existing_name_variations for sf in field.get_subfields("t")]
+        existing_variations = [sf.value for field in existing_name_variations for sf in field.get_subfields("a")]
+        if filter_variations:
+            allowed_variations = filter_names(viaf_record.name_variations)
+            logger.debug(
+                f"filtered out {len(viaf_record.name_variations) - len(allowed_variations)} name variations for '{viaf_record.name}'"
+            )
+        else:
+            allowed_variations = viaf_record.name_variations
         fields = []
-        for variation in viaf_record.name_variations:
+        for variation in allowed_variations:
             if variation not in existing_variations:
                 fields.append(
                     Field(
                         tag="400",
                         indicators=EMPTY_INDICATORS,
                         subfields=[
-                            Subfield("t", variation)
+                            Subfield("a", variation)
                         ]
                     )
                 )

{rara_tools-0.7.16 → rara_tools-0.7.18}/rara_tools/normalizers/bibs.py RENAMED Viewed

@@ -73,26 +73,13 @@ class BibRecordNormalizer(RecordNormalizer):
     def _normalize_viaf(self, record: Record, viaf_record: VIAFRecord, original_entity: str) -> None:
         if not viaf_record:
             # viaf record not found, include original entity as 100|t
             self._add_author(record, viaf_record=None, original_entity=original_entity)
             return record
-        viaf_id = viaf_record.viaf_id
-        fields = [
-            Field(
-                tag="035",
-                indicators=EMPTY_INDICATORS,
-                subfields=[
-                    Subfield("a", viaf_id)
-                ]
-            )
-        ]
-        self._add_fields_to_record(record, fields)
         self._add_author(record, viaf_record, original_entity=original_entity)
     def _normalize_record(self, record: Record, sierraID: str,
                           viaf_record: VIAFRecord, is_editing_existing_record: bool, original_entity: str) -> Record:

rara_tools-0.7.18/rara_tools/parsers/tools/validators.py ADDED Viewed

@@ -0,0 +1,54 @@
+import regex as re
+from typing import List
+def has_valid_chars(entity: str, allow_cyrillic: bool = True) -> bool:
+    """ Checks if entity contains any valid characters in latin
+    or in cyrillic, if the latter is enabled
+    Parameters
+    ------------
+    entity: str
+        String to validate.
+    allow_cyrillic: bool
+        Allow strings in cyrillic?
+    Returns
+    ------------
+    bool
+        Boolean value indicating, if the string
+        contains any valid characters.
+    """
+    # Check for latin characters
+    is_valid = bool(re.search(r"[a-züõöäA-ZÜÕÖÄ]", entity))
+    if allow_cyrillic and not is_valid:
+        # If cyrillic characters are allowed,
+        # check for them as well
+        is_valid = bool(re.search(r"[а-яА-Я]", entity))
+    return is_valid
+def filter_names(names: List[str], allow_cyrillic: bool = True) -> List[str]:
+    """ Filters out names not in allowed encodings (latin / cyrillic).
+    Parameters
+    ------------
+    names: List[str]
+        Names to filters.
+    allow_cyrillic: bool
+        Allow strings in cyrillic?
+    Returns
+    ------------
+    List[str]
+        List of filtered names.
+    """
+    filtered_names = [
+        name for name in names
+        if has_valid_chars(entity=name, allow_cyrillic=allow_cyrillic)
+    ]
+    return filtered_names

{rara_tools-0.7.16 → rara_tools-0.7.18/rara_tools.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: rara-tools
-Version: 0.7.16
+Version: 0.7.18
 Summary: Tools to support Kata's work.
 Classifier: Programming Language :: Python :: 3
 Classifier: Programming Language :: Python :: 3.10

{rara_tools-0.7.16 → rara_tools-0.7.18}/rara_tools.egg-info/SOURCES.txt RENAMED Viewed

@@ -49,6 +49,7 @@ rara_tools/parsers/marc_records/title_record.py
 rara_tools/parsers/tools/entity_normalizers.py
 rara_tools/parsers/tools/marc_converter.py
 rara_tools/parsers/tools/russian_transliterator.py
+rara_tools/parsers/tools/validators.py
 tests/test_digar_schema_converter.py
 tests/test_elastic.py
 tests/test_elastic_vector_and_search_operations.py
@@ -61,4 +62,5 @@ tests/test_s3_file_operations.py
 tests/test_sierra_converters.py
 tests/test_task_reporter.py
 tests/test_utils.py
+tests/test_validators.py
 tests/test_viaf_client.py

{rara_tools-0.7.16 → rara_tools-0.7.18}/tests/test_digar_schema_converter.py RENAMED Viewed

@@ -64,7 +64,7 @@ def test_digar_schema_id_generation():
     #If permalink is given, this should be used as base ID
     digar_schema = converter.digar_schema
-    first_segment_id = digar_schema["dcterms:hasPart"][0]["hasPart"][0]["@id"]
+    first_segment_id = digar_schema["dcterms:hasPart"][0]["dcterms:hasPart"][0]["@id"]
     assert first_segment_id.startswith(TEST_PERMALINK)
@@ -76,7 +76,7 @@ def test_digar_schema_id_generation():
     #If permalink is NOT given, Sierra ID should be used as base ID
     digar_schema = converter.digar_schema
-    first_segment_id = digar_schema["dcterms:hasPart"][0]["hasPart"][0]["@id"]
+    first_segment_id = digar_schema["dcterms:hasPart"][0]["dcterms:hasPart"][0]["@id"]
     assert first_segment_id.startswith(TEST_SIERRA_ID)
@@ -87,7 +87,7 @@ def test_digar_schema_id_generation():
     #If neiter permalink nor Sierra ID is given, generated ID should be used as base ID
     digar_schema = converter.digar_schema
-    first_segment_id = digar_schema["dcterms:hasPart"][0]["hasPart"][0]["@id"]
+    first_segment_id = digar_schema["dcterms:hasPart"][0]["dcterms:hasPart"][0]["@id"]
     assert first_segment_id.startswith(TEST_GENERATED_ID)

{rara_tools-0.7.16 → rara_tools-0.7.18}/tests/test_normalization.py RENAMED Viewed

@@ -213,7 +213,7 @@ def test_missing_fields_created_bibrecord_normalization():
     for record in normalizer_entities_only:
         check_record_tags_have_values(
             record, ["008",  # Sierra related, always with bibs
-                     "035", "100",  # VIAf enriched
+                      "100",  # VIAf enriched
                      ] + REQUIRED_FIELDS
         )
         validate_bibrecord_normalized(record, has_viaf_data=True)
@@ -753,7 +753,7 @@ def test_classified_fields_added_to_linked_record():
                     }
                 ]
             }
-        }
+        }
     ]
     ]
     # Case 1 - no 670 exists, should be added to linked record
@@ -766,7 +766,7 @@ def test_classified_fields_added_to_linked_record():
         assert len(fields_670) == 1
         assert fields_670[0].get_subfields("a")[0] == "Päikesekiri, 2021"
-    # Case 1 - existing record with 670 should not update (same behavior for both normalizers)
+    # Case 2 - existing record with 670 should not update (same behavior for both normalizers)
     linker_res = get_linker_res_example(
         "oneFound.json")
     linking_results = [linker_res]
@@ -780,7 +780,54 @@ def test_classified_fields_added_to_linked_record():
         fields_670 = record.get_fields("670")
         assert len(fields_670) == 1
         assert fields_670[0].get_subfields("a")[0] == "Eesti kirjarahva leksikon, 1995."
+    def get_046_field(year: str) -> dict:
+        return {
+            "046": {
+                "ind1": " ",
+                "ind2": " ",
+                "subfields": [
+                    {"k": year }
+                ]
+            }
+        }
+    # Case 3 - 046 $k - publication date Passed for bib
+    classified_fields = [
+        [get_046_field("2021")],
+        [get_046_field("1999")],
+        [get_046_field("2022")]
+    ]
+    mock_046_exists = MOCK_LINKER_ONE_FOUND.copy()
+    mock_046_exists["linked_info"][0]["json"]["fields"].append(get_046_field("2000"))
+    # for new record should get included
+    linking_results = [MOCK_LINKER_NOT_FOUND, # new record
+                       MOCK_LINKER_ONE_FOUND, # new record
+                       MOCK_LINKER_NOT_FOUND] # editing existing record
+    normalizer = BibRecordNormalizer(linking_results=linking_results, classified_fields=classified_fields)
+    # for i, record in enumerate(normalizer):
+        # first two should have 046 from classified data
+    record1 = normalizer.get_record(0)
+    fields_046 = record1.get_fields("046")
+    assert len(fields_046) == 1
+    assert fields_046[0].get_subfields("k")[0] == "2021"
+    record2 = normalizer.get_record(1)
+    fields_046 = record2.get_fields("046")
+    assert len(fields_046) == 1
+    # should be unchanged, aka 2000
+    assert fields_046[0].get_subfields("k")[0] == "2000"
+    record3 = normalizer.get_record(2)
+    fields_046 = record3.get_fields("046")
+    assert len(fields_046) == 1
+    assert fields_046[0].get_subfields("k")[0] == "2022"
 def test_classified_data_with_multiple_records():
     """ Test classified data with multiple records - should match by sierraID """
@@ -862,7 +909,11 @@ def test_viaf_name_variations():
     normalizer._add_author(record, viaf_record)
     fields_4xx = record.get_fields("400") + record.get_fields("410") + record.get_fields("430")
+    unfiltered_name_variations = viaf_record.name_variations
     assert len(fields_4xx) > 0
+    assert len(fields_4xx) < len(unfiltered_name_variations)
 def test_existing_record_linked_to_viaf_record():
     """ Test existing record linked to VIAF record - should enrich with VIAF data """
@@ -897,4 +948,4 @@ def test_existing_record_linked_to_viaf_record():
     assert get_viaf_url(normalizer.get_record(1)) == f"{viaf_base_url}/22458146/"
     assert get_viaf_url(normalizer.get_record(2)) == f"{viaf_base_url}/116796842/"

rara_tools-0.7.18/tests/test_validators.py ADDED Viewed

@@ -0,0 +1,55 @@
+from rara_tools.parsers.tools.validators import filter_names
+import pytest
+are_equal = lambda x, y: not bool(set(x).difference(set(y)))
+names_to_validate = [
+    "ליסט, פראנץ",
+    "Liszt, Franz",
+    "Lißt, Franz",
+    "ליסט, פרנץ",
+    "Liszt, Ferencz",
+    "Лист, Франц",
+    "Listz",
+    "Lißzt, Franz",
+    "Lists, Francis",
+    "List, Ferenc",
+    "List, Frants리스",
+    "List, Ferents",
+    "李斯特，弗朗西斯庫斯",
+    "ᓕᔅᑦ, ᕗᕌᓐᓯᔅᑲᔅ",
+    "리스트, 프란치스코"
+]
+valid_names_1 = [
+    "Liszt, Franz",
+    "Lißt, Franz",
+    "Liszt, Ferencz",
+    "Лист, Франц",
+    "Listz",
+    "Lißzt, Franz",
+    "Lists, Francis",
+    "List, Ferenc",
+    "List, Frants리스",
+    "List, Ferents"
+]
+valid_names_2 = [
+    "Liszt, Franz",
+    "Lißt, Franz",
+    "Liszt, Ferencz",
+    "Listz",
+    "Lißzt, Franz",
+    "Lists, Francis",
+    "List, Ferenc",
+    "List, Frants리스",
+    "List, Ferents"
+]
+def test_filtering_latin_cyrillic():
+    filtered_names = filter_names(names_to_validate, allow_cyrillic=True)
+    assert are_equal(filtered_names, valid_names_1)
+def test_filtering_latin():
+    filtered_names = filter_names(names_to_validate, allow_cyrillic=False)
+    assert are_equal(filtered_names, valid_names_2)