rara-tools 0.7.16__tar.gz → 0.7.18__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of rara-tools might be problematic. Click here for more details.

Files changed (69) hide show
  1. {rara_tools-0.7.16/rara_tools.egg-info → rara_tools-0.7.18}/PKG-INFO +1 -1
  2. rara_tools-0.7.18/VERSION +1 -0
  3. {rara_tools-0.7.16 → rara_tools-0.7.18}/rara_tools/digar_schema_converter.py +2 -2
  4. {rara_tools-0.7.16 → rara_tools-0.7.18}/rara_tools/normalizers/base.py +15 -4
  5. {rara_tools-0.7.16 → rara_tools-0.7.18}/rara_tools/normalizers/bibs.py +2 -15
  6. rara_tools-0.7.18/rara_tools/parsers/tools/validators.py +54 -0
  7. {rara_tools-0.7.16 → rara_tools-0.7.18/rara_tools.egg-info}/PKG-INFO +1 -1
  8. {rara_tools-0.7.16 → rara_tools-0.7.18}/rara_tools.egg-info/SOURCES.txt +2 -0
  9. {rara_tools-0.7.16 → rara_tools-0.7.18}/tests/test_digar_schema_converter.py +3 -3
  10. {rara_tools-0.7.16 → rara_tools-0.7.18}/tests/test_normalization.py +55 -4
  11. rara_tools-0.7.18/tests/test_validators.py +55 -0
  12. rara_tools-0.7.16/VERSION +0 -1
  13. {rara_tools-0.7.16 → rara_tools-0.7.18}/LICENSE.md +0 -0
  14. {rara_tools-0.7.16 → rara_tools-0.7.18}/README.md +0 -0
  15. {rara_tools-0.7.16 → rara_tools-0.7.18}/pyproject.toml +0 -0
  16. {rara_tools-0.7.16 → rara_tools-0.7.18}/rara_tools/constants/__init__.py +0 -0
  17. {rara_tools-0.7.16 → rara_tools-0.7.18}/rara_tools/constants/digitizer.py +0 -0
  18. {rara_tools-0.7.16 → rara_tools-0.7.18}/rara_tools/constants/general.py +0 -0
  19. {rara_tools-0.7.16 → rara_tools-0.7.18}/rara_tools/constants/language_evaluator.py +0 -0
  20. {rara_tools-0.7.16 → rara_tools-0.7.18}/rara_tools/constants/linker.py +0 -0
  21. {rara_tools-0.7.16 → rara_tools-0.7.18}/rara_tools/constants/meta_extractor.py +0 -0
  22. {rara_tools-0.7.16 → rara_tools-0.7.18}/rara_tools/constants/normalizers.py +0 -0
  23. {rara_tools-0.7.16 → rara_tools-0.7.18}/rara_tools/constants/parsers.py +0 -0
  24. {rara_tools-0.7.16 → rara_tools-0.7.18}/rara_tools/constants/subject_indexer.py +0 -0
  25. {rara_tools-0.7.16 → rara_tools-0.7.18}/rara_tools/converters.py +0 -0
  26. {rara_tools-0.7.16 → rara_tools-0.7.18}/rara_tools/core_formatters/core_formatter.py +0 -0
  27. {rara_tools-0.7.16 → rara_tools-0.7.18}/rara_tools/core_formatters/formatted_keyword.py +0 -0
  28. {rara_tools-0.7.16 → rara_tools-0.7.18}/rara_tools/core_formatters/formatted_meta.py +0 -0
  29. {rara_tools-0.7.16 → rara_tools-0.7.18}/rara_tools/core_formatters/formatted_object.py +0 -0
  30. {rara_tools-0.7.16 → rara_tools-0.7.18}/rara_tools/decorators.py +0 -0
  31. {rara_tools-0.7.16 → rara_tools-0.7.18}/rara_tools/elastic.py +0 -0
  32. {rara_tools-0.7.16 → rara_tools-0.7.18}/rara_tools/exceptions.py +0 -0
  33. {rara_tools-0.7.16 → rara_tools-0.7.18}/rara_tools/normalizers/__init__.py +0 -0
  34. {rara_tools-0.7.16 → rara_tools-0.7.18}/rara_tools/normalizers/authorities.py +0 -0
  35. {rara_tools-0.7.16 → rara_tools-0.7.18}/rara_tools/normalizers/reader.py +0 -0
  36. {rara_tools-0.7.16 → rara_tools-0.7.18}/rara_tools/normalizers/viaf.py +0 -0
  37. {rara_tools-0.7.16 → rara_tools-0.7.18}/rara_tools/parsers/marc_parsers/base_parser.py +0 -0
  38. {rara_tools-0.7.16 → rara_tools-0.7.18}/rara_tools/parsers/marc_parsers/ems_parser.py +0 -0
  39. {rara_tools-0.7.16 → rara_tools-0.7.18}/rara_tools/parsers/marc_parsers/location_parser.py +0 -0
  40. {rara_tools-0.7.16 → rara_tools-0.7.18}/rara_tools/parsers/marc_parsers/organization_parser.py +0 -0
  41. {rara_tools-0.7.16 → rara_tools-0.7.18}/rara_tools/parsers/marc_parsers/person_parser.py +0 -0
  42. {rara_tools-0.7.16 → rara_tools-0.7.18}/rara_tools/parsers/marc_parsers/title_parser.py +0 -0
  43. {rara_tools-0.7.16 → rara_tools-0.7.18}/rara_tools/parsers/marc_records/base_record.py +0 -0
  44. {rara_tools-0.7.16 → rara_tools-0.7.18}/rara_tools/parsers/marc_records/ems_record.py +0 -0
  45. {rara_tools-0.7.16 → rara_tools-0.7.18}/rara_tools/parsers/marc_records/organization_record.py +0 -0
  46. {rara_tools-0.7.16 → rara_tools-0.7.18}/rara_tools/parsers/marc_records/person_record.py +0 -0
  47. {rara_tools-0.7.16 → rara_tools-0.7.18}/rara_tools/parsers/marc_records/title_record.py +0 -0
  48. {rara_tools-0.7.16 → rara_tools-0.7.18}/rara_tools/parsers/tools/entity_normalizers.py +0 -0
  49. {rara_tools-0.7.16 → rara_tools-0.7.18}/rara_tools/parsers/tools/marc_converter.py +0 -0
  50. {rara_tools-0.7.16 → rara_tools-0.7.18}/rara_tools/parsers/tools/russian_transliterator.py +0 -0
  51. {rara_tools-0.7.16 → rara_tools-0.7.18}/rara_tools/s3.py +0 -0
  52. {rara_tools-0.7.16 → rara_tools-0.7.18}/rara_tools/task_reporter.py +0 -0
  53. {rara_tools-0.7.16 → rara_tools-0.7.18}/rara_tools/utils.py +0 -0
  54. {rara_tools-0.7.16 → rara_tools-0.7.18}/rara_tools.egg-info/dependency_links.txt +0 -0
  55. {rara_tools-0.7.16 → rara_tools-0.7.18}/rara_tools.egg-info/requires.txt +0 -0
  56. {rara_tools-0.7.16 → rara_tools-0.7.18}/rara_tools.egg-info/top_level.txt +0 -0
  57. {rara_tools-0.7.16 → rara_tools-0.7.18}/requirements.txt +0 -0
  58. {rara_tools-0.7.16 → rara_tools-0.7.18}/setup.cfg +0 -0
  59. {rara_tools-0.7.16 → rara_tools-0.7.18}/tests/test_elastic.py +0 -0
  60. {rara_tools-0.7.16 → rara_tools-0.7.18}/tests/test_elastic_vector_and_search_operations.py +0 -0
  61. {rara_tools-0.7.16 → rara_tools-0.7.18}/tests/test_entity_normalizers.py +0 -0
  62. {rara_tools-0.7.16 → rara_tools-0.7.18}/tests/test_formatters.py +0 -0
  63. {rara_tools-0.7.16 → rara_tools-0.7.18}/tests/test_marc_parsers.py +0 -0
  64. {rara_tools-0.7.16 → rara_tools-0.7.18}/tests/test_s3_exceptions.py +0 -0
  65. {rara_tools-0.7.16 → rara_tools-0.7.18}/tests/test_s3_file_operations.py +0 -0
  66. {rara_tools-0.7.16 → rara_tools-0.7.18}/tests/test_sierra_converters.py +0 -0
  67. {rara_tools-0.7.16 → rara_tools-0.7.18}/tests/test_task_reporter.py +0 -0
  68. {rara_tools-0.7.16 → rara_tools-0.7.18}/tests/test_utils.py +0 -0
  69. {rara_tools-0.7.16 → rara_tools-0.7.18}/tests/test_viaf_client.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: rara-tools
3
- Version: 0.7.16
3
+ Version: 0.7.18
4
4
  Summary: Tools to support Kata's work.
5
5
  Classifier: Programming Language :: Python :: 3
6
6
  Classifier: Programming Language :: Python :: 3.10
@@ -0,0 +1 @@
1
+ 0.7.18
@@ -77,7 +77,7 @@ class PageSchema:
77
77
  self.__schema = {
78
78
  "@type": "CreativeWork", # CONSTANT for pages
79
79
  "@id": self.page_id,
80
- "hasPart": []
80
+ "dcterms:hasPart": []
81
81
  }
82
82
  text_schemas = [
83
83
  TextPageSchema(page).schema
@@ -91,7 +91,7 @@ class PageSchema:
91
91
  page_schemas = text_schemas + image_schemas
92
92
  page_schemas_with_ids = self._add_segment_ids(page_schemas)
93
93
 
94
- self.__schema["hasPart"].extend(page_schemas_with_ids)
94
+ self.__schema["dcterms:hasPart"].extend(page_schemas_with_ids)
95
95
 
96
96
  return self.__schema
97
97
 
@@ -3,6 +3,8 @@ from pymarc import (Field, Subfield, JSONReader, Record)
3
3
  from typing import List, Optional, Iterator
4
4
  from rara_tools.normalizers.reader import SafeJSONReader
5
5
 
6
+ from rara_tools.parsers.tools.validators import filter_names
7
+
6
8
  from rara_tools.normalizers.viaf import VIAFRecord, VIAFClient
7
9
  from rara_tools.constants.normalizers import (
8
10
  DEFAULT_VIAF_FIELD, ALLOWED_VIAF_FIELDS, ALLOWED_VIAF_WIKILINK_LANGS,
@@ -311,25 +313,34 @@ class RecordNormalizer:
311
313
  if viaf_record:
312
314
  self._include_name_variations(record, viaf_record)
313
315
 
314
- def _include_name_variations(self, record: Record, viaf_record: VIAFRecord) -> None:
316
+ def _include_name_variations(self, record: Record, viaf_record: VIAFRecord, filter_variations=True) -> None:
315
317
  """ Include name variations from VIAF record as 400|t fields """
316
318
 
317
319
  if not viaf_record or not viaf_record.name_variations:
318
320
  return
319
321
 
320
322
  existing_name_variations = record.get_fields("400")
321
- existing_variations = [sf.value for field in existing_name_variations for sf in field.get_subfields("t")]
323
+ existing_variations = [sf.value for field in existing_name_variations for sf in field.get_subfields("a")]
324
+
325
+ if filter_variations:
326
+ allowed_variations = filter_names(viaf_record.name_variations)
327
+ logger.debug(
328
+ f"filtered out {len(viaf_record.name_variations) - len(allowed_variations)} name variations for '{viaf_record.name}'"
329
+ )
330
+
331
+ else:
332
+ allowed_variations = viaf_record.name_variations
322
333
 
323
334
  fields = []
324
335
 
325
- for variation in viaf_record.name_variations:
336
+ for variation in allowed_variations:
326
337
  if variation not in existing_variations:
327
338
  fields.append(
328
339
  Field(
329
340
  tag="400",
330
341
  indicators=EMPTY_INDICATORS,
331
342
  subfields=[
332
- Subfield("t", variation)
343
+ Subfield("a", variation)
333
344
  ]
334
345
  )
335
346
  )
@@ -73,26 +73,13 @@ class BibRecordNormalizer(RecordNormalizer):
73
73
 
74
74
 
75
75
  def _normalize_viaf(self, record: Record, viaf_record: VIAFRecord, original_entity: str) -> None:
76
-
77
76
  if not viaf_record:
78
77
  # viaf record not found, include original entity as 100|t
79
78
  self._add_author(record, viaf_record=None, original_entity=original_entity)
80
79
  return record
81
-
82
- viaf_id = viaf_record.viaf_id
83
- fields = [
84
- Field(
85
- tag="035",
86
- indicators=EMPTY_INDICATORS,
87
- subfields=[
88
- Subfield("a", viaf_id)
89
- ]
90
- )
91
- ]
92
-
93
- self._add_fields_to_record(record, fields)
80
+
94
81
  self._add_author(record, viaf_record, original_entity=original_entity)
95
-
82
+
96
83
  def _normalize_record(self, record: Record, sierraID: str,
97
84
  viaf_record: VIAFRecord, is_editing_existing_record: bool, original_entity: str) -> Record:
98
85
 
@@ -0,0 +1,54 @@
1
+ import regex as re
2
+ from typing import List
3
+
4
+ def has_valid_chars(entity: str, allow_cyrillic: bool = True) -> bool:
5
+ """ Checks if entity contains any valid characters in latin
6
+ or in cyrillic, if the latter is enabled
7
+
8
+ Parameters
9
+ ------------
10
+ entity: str
11
+ String to validate.
12
+ allow_cyrillic: bool
13
+ Allow strings in cyrillic?
14
+
15
+ Returns
16
+ ------------
17
+ bool
18
+ Boolean value indicating, if the string
19
+ contains any valid characters.
20
+
21
+ """
22
+ # Check for latin characters
23
+ is_valid = bool(re.search(r"[a-züõöäA-ZÜÕÖÄ]", entity))
24
+
25
+ if allow_cyrillic and not is_valid:
26
+ # If cyrillic characters are allowed,
27
+ # check for them as well
28
+ is_valid = bool(re.search(r"[а-яА-Я]", entity))
29
+
30
+ return is_valid
31
+
32
+
33
+ def filter_names(names: List[str], allow_cyrillic: bool = True) -> List[str]:
34
+ """ Filters out names not in allowed encodings (latin / cyrillic).
35
+
36
+ Parameters
37
+ ------------
38
+ names: List[str]
39
+ Names to filter.
40
+ allow_cyrillic: bool
41
+ Allow strings in cyrillic?
42
+
43
+ Returns
44
+ ------------
45
+ List[str]
46
+ List of filtered names.
47
+
48
+ """
49
+ filtered_names = [
50
+ name for name in names
51
+ if has_valid_chars(entity=name, allow_cyrillic=allow_cyrillic)
52
+ ]
53
+ return filtered_names
54
+
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: rara-tools
3
- Version: 0.7.16
3
+ Version: 0.7.18
4
4
  Summary: Tools to support Kata's work.
5
5
  Classifier: Programming Language :: Python :: 3
6
6
  Classifier: Programming Language :: Python :: 3.10
@@ -49,6 +49,7 @@ rara_tools/parsers/marc_records/title_record.py
49
49
  rara_tools/parsers/tools/entity_normalizers.py
50
50
  rara_tools/parsers/tools/marc_converter.py
51
51
  rara_tools/parsers/tools/russian_transliterator.py
52
+ rara_tools/parsers/tools/validators.py
52
53
  tests/test_digar_schema_converter.py
53
54
  tests/test_elastic.py
54
55
  tests/test_elastic_vector_and_search_operations.py
@@ -61,4 +62,5 @@ tests/test_s3_file_operations.py
61
62
  tests/test_sierra_converters.py
62
63
  tests/test_task_reporter.py
63
64
  tests/test_utils.py
65
+ tests/test_validators.py
64
66
  tests/test_viaf_client.py
@@ -64,7 +64,7 @@ def test_digar_schema_id_generation():
64
64
 
65
65
  #If permalink is given, this should be used as base ID
66
66
  digar_schema = converter.digar_schema
67
- first_segment_id = digar_schema["dcterms:hasPart"][0]["hasPart"][0]["@id"]
67
+ first_segment_id = digar_schema["dcterms:hasPart"][0]["dcterms:hasPart"][0]["@id"]
68
68
 
69
69
  assert first_segment_id.startswith(TEST_PERMALINK)
70
70
 
@@ -76,7 +76,7 @@ def test_digar_schema_id_generation():
76
76
 
77
77
  #If permalink is NOT given, Sierra ID should be used as base ID
78
78
  digar_schema = converter.digar_schema
79
- first_segment_id = digar_schema["dcterms:hasPart"][0]["hasPart"][0]["@id"]
79
+ first_segment_id = digar_schema["dcterms:hasPart"][0]["dcterms:hasPart"][0]["@id"]
80
80
  assert first_segment_id.startswith(TEST_SIERRA_ID)
81
81
 
82
82
 
@@ -87,7 +87,7 @@ def test_digar_schema_id_generation():
87
87
 
88
88
  #If neither permalink nor Sierra ID is given, generated ID should be used as base ID
89
89
  digar_schema = converter.digar_schema
90
- first_segment_id = digar_schema["dcterms:hasPart"][0]["hasPart"][0]["@id"]
90
+ first_segment_id = digar_schema["dcterms:hasPart"][0]["dcterms:hasPart"][0]["@id"]
91
91
  assert first_segment_id.startswith(TEST_GENERATED_ID)
92
92
 
93
93
 
@@ -213,7 +213,7 @@ def test_missing_fields_created_bibrecord_normalization():
213
213
  for record in normalizer_entities_only:
214
214
  check_record_tags_have_values(
215
215
  record, ["008", # Sierra related, always with bibs
216
- "035", "100", # VIAf enriched
216
+ "100", # VIAf enriched
217
217
  ] + REQUIRED_FIELDS
218
218
  )
219
219
  validate_bibrecord_normalized(record, has_viaf_data=True)
@@ -753,7 +753,7 @@ def test_classified_fields_added_to_linked_record():
753
753
  }
754
754
  ]
755
755
  }
756
- }
756
+ }
757
757
  ]
758
758
  ]
759
759
  # Case 1 - no 670 exists, should be added to linked record
@@ -766,7 +766,7 @@ def test_classified_fields_added_to_linked_record():
766
766
  assert len(fields_670) == 1
767
767
  assert fields_670[0].get_subfields("a")[0] == "Päikesekiri, 2021"
768
768
 
769
- # Case 1 - existing record with 670 should not update (same behavior for both normalizers)
769
+ # Case 2 - existing record with 670 should not update (same behavior for both normalizers)
770
770
  linker_res = get_linker_res_example(
771
771
  "oneFound.json")
772
772
  linking_results = [linker_res]
@@ -780,7 +780,54 @@ def test_classified_fields_added_to_linked_record():
780
780
  fields_670 = record.get_fields("670")
781
781
  assert len(fields_670) == 1
782
782
  assert fields_670[0].get_subfields("a")[0] == "Eesti kirjarahva leksikon, 1995."
783
+
784
+ def get_046_field(year: str) -> dict:
785
+ return {
786
+ "046": {
787
+ "ind1": " ",
788
+ "ind2": " ",
789
+ "subfields": [
790
+ {"k": year }
791
+ ]
792
+ }
793
+ }
783
794
 
795
+ # Case 3 - 046 $k - publication date Passed for bib
796
+ classified_fields = [
797
+ [get_046_field("2021")],
798
+ [get_046_field("1999")],
799
+ [get_046_field("2022")]
800
+ ]
801
+
802
+
803
+ mock_046_exists = MOCK_LINKER_ONE_FOUND.copy()
804
+ mock_046_exists["linked_info"][0]["json"]["fields"].append(get_046_field("2000"))
805
+
806
+ # for new record should get included
807
+ linking_results = [MOCK_LINKER_NOT_FOUND, # new record
808
+ MOCK_LINKER_ONE_FOUND, # new record
809
+ MOCK_LINKER_NOT_FOUND] # editing existing record
810
+
811
+ normalizer = BibRecordNormalizer(linking_results=linking_results, classified_fields=classified_fields)
812
+
813
+ # for i, record in enumerate(normalizer):
814
+ # first two should have 046 from classified data
815
+ record1 = normalizer.get_record(0)
816
+ fields_046 = record1.get_fields("046")
817
+ assert len(fields_046) == 1
818
+ assert fields_046[0].get_subfields("k")[0] == "2021"
819
+
820
+ record2 = normalizer.get_record(1)
821
+ fields_046 = record2.get_fields("046")
822
+ assert len(fields_046) == 1
823
+ # should be unchanged, aka 2000
824
+ assert fields_046[0].get_subfields("k")[0] == "2000"
825
+
826
+ record3 = normalizer.get_record(2)
827
+ fields_046 = record3.get_fields("046")
828
+ assert len(fields_046) == 1
829
+ assert fields_046[0].get_subfields("k")[0] == "2022"
830
+
784
831
  def test_classified_data_with_multiple_records():
785
832
  """ Test classified data with multiple records - should match by sierraID """
786
833
 
@@ -862,7 +909,11 @@ def test_viaf_name_variations():
862
909
  normalizer._add_author(record, viaf_record)
863
910
 
864
911
  fields_4xx = record.get_fields("400") + record.get_fields("410") + record.get_fields("430")
912
+
913
+ unfiltered_name_variations = viaf_record.name_variations
914
+
865
915
  assert len(fields_4xx) > 0
916
+ assert len(fields_4xx) < len(unfiltered_name_variations)
866
917
 
867
918
  def test_existing_record_linked_to_viaf_record():
868
919
  """ Test existing record linked to VIAF record - should enrich with VIAF data """
@@ -897,4 +948,4 @@ def test_existing_record_linked_to_viaf_record():
897
948
  assert get_viaf_url(normalizer.get_record(1)) == f"{viaf_base_url}/22458146/"
898
949
  assert get_viaf_url(normalizer.get_record(2)) == f"{viaf_base_url}/116796842/"
899
950
 
900
-
951
+
@@ -0,0 +1,55 @@
1
+ from rara_tools.parsers.tools.validators import filter_names
2
+ import pytest
3
+
4
+ are_equal = lambda x, y: not bool(set(x).difference(set(y)))
5
+
6
+ names_to_validate = [
7
+ "ליסט, פראנץ",
8
+ "Liszt, Franz",
9
+ "Lißt, Franz",
10
+ "ליסט, פרנץ",
11
+ "Liszt, Ferencz",
12
+ "Лист, Франц",
13
+ "Listz",
14
+ "Lißzt, Franz",
15
+ "Lists, Francis",
16
+ "List, Ferenc",
17
+ "List, Frants리스",
18
+ "List, Ferents",
19
+ "李斯特,弗朗西斯庫斯",
20
+ "ᓕᔅᑦ, ᕗᕌᓐᓯᔅᑲᔅ",
21
+ "리스트, 프란치스코"
22
+ ]
23
+
24
+ valid_names_1 = [
25
+ "Liszt, Franz",
26
+ "Lißt, Franz",
27
+ "Liszt, Ferencz",
28
+ "Лист, Франц",
29
+ "Listz",
30
+ "Lißzt, Franz",
31
+ "Lists, Francis",
32
+ "List, Ferenc",
33
+ "List, Frants리스",
34
+ "List, Ferents"
35
+ ]
36
+
37
+ valid_names_2 = [
38
+ "Liszt, Franz",
39
+ "Lißt, Franz",
40
+ "Liszt, Ferencz",
41
+ "Listz",
42
+ "Lißzt, Franz",
43
+ "Lists, Francis",
44
+ "List, Ferenc",
45
+ "List, Frants리스",
46
+ "List, Ferents"
47
+ ]
48
+
49
+ def test_filtering_latin_cyrillic():
50
+ filtered_names = filter_names(names_to_validate, allow_cyrillic=True)
51
+ assert are_equal(filtered_names, valid_names_1)
52
+
53
+ def test_filtering_latin():
54
+ filtered_names = filter_names(names_to_validate, allow_cyrillic=False)
55
+ assert are_equal(filtered_names, valid_names_2)
rara_tools-0.7.16/VERSION DELETED
@@ -1 +0,0 @@
1
- 0.7.16
File without changes
File without changes
File without changes
File without changes