rara-tools 0.0.13__tar.gz → 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of rara-tools might be problematic. Click here for more details.

Files changed (37) hide show
  1. {rara_tools-0.0.13/rara_tools.egg-info → rara_tools-0.2.0}/PKG-INFO +6 -2
  2. rara_tools-0.2.0/VERSION +1 -0
  3. rara_tools-0.2.0/rara_tools/constants/__init__.py +1 -0
  4. {rara_tools-0.0.13 → rara_tools-0.2.0}/rara_tools/constants/digitizer.py +12 -0
  5. {rara_tools-0.0.13 → rara_tools-0.2.0}/rara_tools/constants/general.py +1 -0
  6. rara_tools-0.2.0/rara_tools/constants/normalizers.py +17 -0
  7. {rara_tools-0.0.13 → rara_tools-0.2.0}/rara_tools/converters.py +42 -33
  8. {rara_tools-0.0.13 → rara_tools-0.2.0}/rara_tools/exceptions.py +4 -0
  9. {rara_tools-0.0.13 → rara_tools-0.2.0}/rara_tools/s3.py +53 -4
  10. {rara_tools-0.0.13 → rara_tools-0.2.0/rara_tools.egg-info}/PKG-INFO +6 -2
  11. {rara_tools-0.0.13 → rara_tools-0.2.0}/rara_tools.egg-info/SOURCES.txt +6 -2
  12. {rara_tools-0.0.13 → rara_tools-0.2.0}/rara_tools.egg-info/requires.txt +3 -0
  13. {rara_tools-0.0.13 → rara_tools-0.2.0}/requirements.txt +3 -0
  14. rara_tools-0.2.0/tests/test_normalization.py +315 -0
  15. rara_tools-0.2.0/tests/test_sierra_converters.py +101 -0
  16. rara_tools-0.2.0/tests/test_utils.py +77 -0
  17. rara_tools-0.2.0/tests/test_viaf_client.py +19 -0
  18. rara_tools-0.0.13/VERSION +0 -1
  19. rara_tools-0.0.13/rara_tools/constants/__init__.py +0 -0
  20. rara_tools-0.0.13/tests/test_converters.py +0 -127
  21. {rara_tools-0.0.13 → rara_tools-0.2.0}/LICENSE.md +0 -0
  22. {rara_tools-0.0.13 → rara_tools-0.2.0}/README.md +0 -0
  23. {rara_tools-0.0.13 → rara_tools-0.2.0}/pyproject.toml +0 -0
  24. {rara_tools-0.0.13 → rara_tools-0.2.0}/rara_tools/decorators.py +0 -0
  25. {rara_tools-0.0.13 → rara_tools-0.2.0}/rara_tools/digar_schema_converter.py +0 -0
  26. {rara_tools-0.0.13 → rara_tools-0.2.0}/rara_tools/elastic.py +0 -0
  27. {rara_tools-0.0.13 → rara_tools-0.2.0}/rara_tools/task_reporter.py +0 -0
  28. {rara_tools-0.0.13 → rara_tools-0.2.0}/rara_tools/utils.py +0 -0
  29. {rara_tools-0.0.13 → rara_tools-0.2.0}/rara_tools.egg-info/dependency_links.txt +0 -0
  30. {rara_tools-0.0.13 → rara_tools-0.2.0}/rara_tools.egg-info/top_level.txt +0 -0
  31. {rara_tools-0.0.13 → rara_tools-0.2.0}/setup.cfg +0 -0
  32. {rara_tools-0.0.13 → rara_tools-0.2.0}/tests/test_digar_schema_converter.py +0 -0
  33. {rara_tools-0.0.13 → rara_tools-0.2.0}/tests/test_elastic.py +0 -0
  34. {rara_tools-0.0.13 → rara_tools-0.2.0}/tests/test_elastic_vector_and_search_operations.py +0 -0
  35. {rara_tools-0.0.13 → rara_tools-0.2.0}/tests/test_s3_exceptions.py +0 -0
  36. {rara_tools-0.0.13 → rara_tools-0.2.0}/tests/test_s3_file_operations.py +0 -0
  37. {rara_tools-0.0.13 → rara_tools-0.2.0}/tests/test_task_reporter.py +0 -0
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.2
1
+ Metadata-Version: 2.4
2
2
  Name: rara-tools
3
- Version: 0.0.13
3
+ Version: 0.2.0
4
4
  Summary: Tools to support Kata's work.
5
5
  Classifier: Programming Language :: Python :: 3
6
6
  Classifier: Programming Language :: Python :: 3.10
@@ -13,11 +13,15 @@ License-File: LICENSE.md
13
13
  Requires-Dist: elasticsearch==8.*
14
14
  Requires-Dist: elasticsearch_dsl==8.*
15
15
  Requires-Dist: minio==7.*
16
+ Requires-Dist: rara-norm-linker==1.*
16
17
  Requires-Dist: requests
17
18
  Requires-Dist: iso639-lang
19
+ Requires-Dist: pymarc
20
+ Requires-Dist: glom
18
21
  Provides-Extra: testing
19
22
  Requires-Dist: pytest>=8.0; extra == "testing"
20
23
  Requires-Dist: pytest-order; extra == "testing"
24
+ Dynamic: license-file
21
25
 
22
26
  # RaRa Tools
23
27
 
@@ -0,0 +1 @@
1
+ 0.2.0
@@ -0,0 +1 @@
1
+ from .normalizers import *
@@ -1,4 +1,12 @@
1
+ COMPONENT_KEY = "digitizer"
2
+
3
+
4
+ class ModelTypes:
5
+ IMAGE_PROCESSOR = "image_processor"
6
+
7
+
1
8
  class StatusKeys:
9
+ DOWNLOAD_MODELS = "digitizer_download_models"
2
10
  CLEAN_UP = "digitizer_clean_up"
3
11
  ELASTICSEARCH_UPLOAD = "digitizer_elasticsearch_upload"
4
12
  UPLOAD = "s3_upload"
@@ -11,3 +19,7 @@ class Queue:
11
19
  DOWNLOAD = "download"
12
20
  FINISH = "finish"
13
21
  OCR = "ocr"
22
+
23
+
24
+ class Tasks:
25
+ MODEL_UPDATE = "component_model_update"
@@ -1,4 +1,5 @@
1
1
  class Status:
2
+ SKIPPED = "SKIPPED"
2
3
  FAILED = "FAILED"
3
4
  PENDING = "PENDING"
4
5
  RUNNING = "RUNNING"
@@ -0,0 +1,17 @@
1
+ from pymarc import Indicators
2
+ import os
3
+
4
+ EMPTY_INDICATORS = Indicators(" ", " ")
5
+ VIAF_ALLOWED_SOURCES = ["LC", "DNB", "LNB", "NLL",
6
+ "ERRR", "J9U"]
7
+
8
+ ES_HOST = os.getenv("ELASTIC_TEST_URL", "http://localhost:9200")
9
+
10
+ LINKER_CONFIG = {
11
+ "add_viaf_info": True,
12
+ "vectorizer_data_path": "./vectorizer_data",
13
+ "per_config": {"es_host": ES_HOST},
14
+ "org_config": {"es_host": ES_HOST},
15
+ "loc_config": {"es_host": ES_HOST},
16
+ "ems_config": {"es_host": ES_HOST},
17
+ }
@@ -1,19 +1,22 @@
1
- from .exceptions import SierraResponseConverterException
1
+ from rara_tools.exceptions import SierraResponseConverterException
2
2
 
3
3
 
4
4
  class SierraResponseConverter:
5
5
  """Converts a JSON response from the Sierra API to MARC-in-JSON format."""
6
-
6
+
7
7
  def __init__(self, response: dict):
8
8
  if not isinstance(response, dict):
9
- raise SierraResponseConverterException("Please provide a valid JSON response.")
9
+ raise SierraResponseConverterException(
10
+ "Please provide a valid JSON response.")
10
11
  self.response = response
11
-
12
- def _map_control_fields(self, field: dict) -> dict:
13
- # for tags < 010, no subfields, instead one str value in "value"
12
+
13
+ @staticmethod
14
+ def _map_control_fields(field: dict) -> dict:
15
+ # for tags < 010, no subfields, instead one str value in "value"
14
16
  return {field["tag"]: field["value"]}
15
-
16
- def _map_data_fields(self, field: dict) -> dict:
17
+
18
+ @staticmethod
19
+ def _map_data_fields(field: dict) -> dict:
17
20
  """ Maps marc fields > 010.
18
21
 
19
22
  Args:
@@ -22,60 +25,66 @@ class SierraResponseConverter:
22
25
  Returns:
23
26
  dict: standardised marc-in-json format.
24
27
  """
25
-
28
+
26
29
  data = field["data"]
27
-
30
+
28
31
  # Order matters ind1, in2, subfields
29
32
  field_data = {
30
33
  "ind1": data.get("ind1", " "),
31
34
  "ind2": data.get("ind2", " "),
32
35
  "subfields": data.get("subfields", [])
33
36
  }
34
-
37
+
35
38
  return {field["tag"]: field_data}
36
-
37
- def _is_marc21structured(self, field: dict) -> bool:
39
+
40
+ @staticmethod
41
+ def _is_marc21structured(field: dict) -> bool:
38
42
  """Checks if the field is already structured according to MARC21 in JSON"""
39
43
  return any(key.isdigit() for key in field.keys())
40
-
41
-
44
+
42
45
  def _handle_field_type(self, field: dict) -> dict:
43
-
46
+
44
47
  if self._is_marc21structured(field):
45
48
  return field
46
-
49
+
47
50
  if field.get("data"):
48
51
  return self._map_data_fields(field)
49
-
52
+
50
53
  tag = field.get("tag")
51
-
54
+
52
55
  if not tag:
53
- raise SierraResponseConverterException("Field is missing MARC21 tag.")
54
-
56
+ raise SierraResponseConverterException(
57
+ "Field is missing MARC21 tag.")
58
+
55
59
  if tag < "010":
56
60
  return self._map_control_fields(field)
57
61
  else:
58
62
  return self._map_data_fields(field)
59
-
63
+
60
64
  def _convert_response(self) -> list:
61
65
  entries = self.response.get("entries")
62
66
  if not entries:
63
- raise SierraResponseConverterException("No entries found in the response.")
64
-
67
+ raise SierraResponseConverterException(
68
+ "No entries found in the response.")
69
+
65
70
  try:
66
- return {"fields": [
67
- {e["id"]: [
68
- self._handle_field_type(f) for f in e["marc"]["fields"]
71
+ return [
72
+ {
73
+ "sierraID": str(e["id"]),
74
+ "leader": e["marc"]["leader"],
75
+ "fields": [
76
+ self._handle_field_type(f) for f in e["marc"]["fields"]
69
77
  ]}
70
78
  for e in entries
71
- ]}
72
-
79
+ ]
80
+
73
81
  except KeyError as e:
74
- raise SierraResponseConverterException(f"Malformed response: missing key {e}")
75
-
76
-
82
+ raise SierraResponseConverterException(
83
+ f"Malformed response: missing key {e}")
84
+
77
85
  def convert(self) -> list:
78
86
  try:
79
87
  return self._convert_response()
80
88
  except Exception as e:
81
- raise SierraResponseConverterException(f"An unexpected error occurred: {e}")
89
+ raise SierraResponseConverterException(
90
+ f"An unexpected error occurred: {e}")
@@ -7,6 +7,10 @@ class S3InitException(Exception):
7
7
  class S3ConnectionException(Exception):
8
8
  """Raised S3 Bucket/Connection Error."""
9
9
 
10
+ class S3DownloadException(Exception):
11
+ """Raised S3 Download Error."""
12
+
13
+
10
14
  class ElasticsearchException(Exception):
11
15
  """Raised Elasticsearch Error."""
12
16
 
@@ -1,11 +1,20 @@
1
+ import logging
1
2
  import os
3
+ import pathlib
4
+ import time
2
5
  import uuid
3
6
  from typing import Any, Generator, List, Optional
4
7
 
5
- from minio import Minio
8
+ from minio import Minio, S3Error
6
9
 
7
- from .exceptions import (S3ConnectionException, S3InitException,
8
- S3InputException)
10
+ from .exceptions import (
11
+ S3ConnectionException,
12
+ S3InitException,
13
+ S3InputException,
14
+ S3DownloadException
15
+ )
16
+
17
+ logger = logging.getLogger("tools.s3")
9
18
 
10
19
 
11
20
  class S3Files:
@@ -76,9 +85,49 @@ class S3Files:
76
85
  list_of_objects = list(self.minio_client.list_objects(self.bucket, prefix=path, recursive=True))
77
86
  for minio_object in list_of_objects:
78
87
  full_path = os.path.join(download_dir, minio_object.object_name)
79
- self.minio_client.fget_object(self.bucket, minio_object.object_name, full_path)
88
+ self._download_file(minio_object.object_name, full_path)
80
89
  yield full_path
81
90
 
91
+ def _download_file(self, path, download_dir=".", max_retries=3) -> str:
92
+ """Download a single file with retry and resume support."""
93
+ attempts = 0
94
+
95
+ while attempts < max_retries:
96
+ try:
97
+ stat = self.minio_client.stat_object(self.bucket, path)
98
+ file_size = stat.size
99
+ temp_path = download_dir + ".part"
100
+ pathlib.Path(temp_path).parent.mkdir(parents=True, exist_ok=True)
101
+
102
+ # Check if a partial file exists
103
+ downloaded_size = os.path.getsize(temp_path) if os.path.exists(temp_path) else 0
104
+
105
+ if downloaded_size >= file_size:
106
+ os.rename(temp_path, download_dir) # Rename to final filename
107
+ logger.info(f"Completed: {path}")
108
+ return str(pathlib.Path(download_dir) / path)
109
+
110
+ logger.info(f"Downloading {path} ({downloaded_size}/{file_size} bytes)...")
111
+
112
+ # Open file in append mode to resume download
113
+ with open(temp_path, "ab") as f:
114
+ response = self.minio_client.get_object(self.bucket, path, offset=downloaded_size)
115
+ for data in response.stream(32 * 1024): # 32KB chunks
116
+ f.write(data)
117
+ response.close()
118
+ response.release_conn()
119
+
120
+ os.rename(temp_path, download_dir) # Rename temp to final
121
+ logger.info(f"Downloaded: {path}")
122
+ return str(pathlib.Path(download_dir) / path)
123
+
124
+ except S3Error as e:
125
+ logger.info(f"Error downloading {path}, attempt {attempts + 1}: {e}")
126
+ attempts += 1
127
+ time.sleep(2 ** attempts) # Exponential backoff
128
+
129
+ raise S3DownloadException(f"Failed to download {path} after {max_retries} attempts.")
130
+
82
131
  def upload(self, path: str, prefix: Optional[str] = "") -> str:
83
132
  """Uploads file or folder to S3 bucket.
84
133
  :param: path str: Path to the file to upload in local file system.
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.2
1
+ Metadata-Version: 2.4
2
2
  Name: rara-tools
3
- Version: 0.0.13
3
+ Version: 0.2.0
4
4
  Summary: Tools to support Kata's work.
5
5
  Classifier: Programming Language :: Python :: 3
6
6
  Classifier: Programming Language :: Python :: 3.10
@@ -13,11 +13,15 @@ License-File: LICENSE.md
13
13
  Requires-Dist: elasticsearch==8.*
14
14
  Requires-Dist: elasticsearch_dsl==8.*
15
15
  Requires-Dist: minio==7.*
16
+ Requires-Dist: rara-norm-linker==1.*
16
17
  Requires-Dist: requests
17
18
  Requires-Dist: iso639-lang
19
+ Requires-Dist: pymarc
20
+ Requires-Dist: glom
18
21
  Provides-Extra: testing
19
22
  Requires-Dist: pytest>=8.0; extra == "testing"
20
23
  Requires-Dist: pytest-order; extra == "testing"
24
+ Dynamic: license-file
21
25
 
22
26
  # RaRa Tools
23
27
 
@@ -19,10 +19,14 @@ rara_tools.egg-info/top_level.txt
19
19
  rara_tools/constants/__init__.py
20
20
  rara_tools/constants/digitizer.py
21
21
  rara_tools/constants/general.py
22
- tests/test_converters.py
22
+ rara_tools/constants/normalizers.py
23
23
  tests/test_digar_schema_converter.py
24
24
  tests/test_elastic.py
25
25
  tests/test_elastic_vector_and_search_operations.py
26
+ tests/test_normalization.py
26
27
  tests/test_s3_exceptions.py
27
28
  tests/test_s3_file_operations.py
28
- tests/test_task_reporter.py
29
+ tests/test_sierra_converters.py
30
+ tests/test_task_reporter.py
31
+ tests/test_utils.py
32
+ tests/test_viaf_client.py
@@ -1,8 +1,11 @@
1
1
  elasticsearch==8.*
2
2
  elasticsearch_dsl==8.*
3
3
  minio==7.*
4
+ rara-norm-linker==1.*
4
5
  requests
5
6
  iso639-lang
7
+ pymarc
8
+ glom
6
9
 
7
10
  [testing]
8
11
  pytest>=8.0
@@ -1,5 +1,8 @@
1
1
  elasticsearch==8.*
2
2
  elasticsearch_dsl==8.*
3
3
  minio==7.*
4
+ rara-norm-linker==1.*
4
5
  requests
5
6
  iso639-lang
7
+ pymarc
8
+ glom
@@ -0,0 +1,315 @@
1
+ from rara_tools.normalizers import BibRecordNormalizer, AuthoritiesRecordNormalizer
2
+ from tests.test_utils import (get_formatted_sierra_response,
3
+ check_record_tags_sorted, check_no_dupe_tag_values, check_record_tags_have_values)
4
+
5
+ from pymarc import Record
6
+
7
+ import pytest
8
+
9
+ import os
10
+
11
+ TEST_LEVEL = os.getenv("TEST_LEVEL", "unit")
12
+
13
+ EMPTY_SIERRA_RECORDS = [
14
+ {
15
+ "sierraID": "1",
16
+ "leader": "00000nz a2200000n 4500",
17
+ "fields": []
18
+ },
19
+ ]
20
+
21
+ REQUIRED_FIELDS = ["667", "925"] # always included after normalization
22
+ REASON = "Skipped because TEST_LEVEL is set to 'ci'"
23
+
24
+
25
+ @pytest.mark.skipif(TEST_LEVEL == "ci", reason=REASON)
26
+ def test_normalizers_OK():
27
+ """ Test field editing logic & internals"""
28
+
29
+ entities = [
30
+ "Paul Keres", # will find multiple entities
31
+ "Anton Hansen Tammsaare",
32
+ "GIBBBERRISH",
33
+ ]
34
+
35
+ test_sierra_data = get_formatted_sierra_response("authorities.json")
36
+
37
+ normalizer = AuthoritiesRecordNormalizer(
38
+ entities=entities,
39
+ sierra_data=test_sierra_data,
40
+ )
41
+ assert len(normalizer.records_extra_data) == len(normalizer.data)
42
+
43
+ normalizer = BibRecordNormalizer(
44
+ entities=entities,
45
+ sierra_data=test_sierra_data,
46
+ )
47
+ assert len(normalizer.records_extra_data) == len(normalizer.data)
48
+
49
+ data = [
50
+ {
51
+ "sierraID": "1",
52
+ "leader": "00000nz a2200000n 4500",
53
+ "fields": [
54
+ {
55
+ "667": {
56
+ "ind1": " ",
57
+ "ind2": " ",
58
+ "subfields": [
59
+ {
60
+ "a": "Val"
61
+ }
62
+ ]
63
+ }
64
+ },
65
+ ]
66
+ },
67
+ ]
68
+
69
+ # default behavior - added if not in record &
70
+ normalizer = AuthoritiesRecordNormalizer(
71
+ sierra_data=data,
72
+ ALLOW_EDIT_FIELDS=[],
73
+ REPEATABLE_FIELDS=[],
74
+ )
75
+ for r in normalizer:
76
+ assert r.get_fields("667")[0].get_subfields("a")[0] == "Val"
77
+
78
+ # not edited if exists
79
+ normalizer = AuthoritiesRecordNormalizer(
80
+ sierra_data=data,
81
+ ALLOW_EDIT_FIELDS=[],
82
+ REPEATABLE_FIELDS=[]
83
+ )
84
+ for r in normalizer:
85
+ assert r.get_fields("667")[0].get_subfields("a")[0] == "Val"
86
+
87
+ # allow repeatable, new field will be added
88
+ normalizer = AuthoritiesRecordNormalizer(
89
+ sierra_data=data,
90
+ ALLOW_EDIT_FIELDS=[],
91
+ REPEATABLE_FIELDS=["667"]
92
+ )
93
+ for r in normalizer:
94
+ fields_667 = r.get_fields("667")
95
+ assert len(fields_667) == 2
96
+ assert fields_667[0].get_subfields("a")[0] == "Val"
97
+ assert fields_667[1].get_subfields("a")[0] == "Muudetud AI poolt"
98
+
99
+ # allow editing, field will be edited
100
+ normalizer = AuthoritiesRecordNormalizer(
101
+ sierra_data=data,
102
+ ALLOW_EDIT_FIELDS=["667"],
103
+ REPEATABLE_FIELDS=[]
104
+ )
105
+ for r in normalizer:
106
+ fields_667 = r.get_fields("667")
107
+ assert len(fields_667) == 1
108
+ assert fields_667[0].get_subfields("a")[0] == "Muudetud AI poolt"
109
+
110
+
111
+ def validate_bibrecord_normalized(record: Record, has_viaf_data=False):
112
+ # source notes
113
+ assert record.get_fields("667")[0].get_subfields("a")[
114
+ 0] == "Muudetud AI poolt"
115
+
116
+
117
+ def validate_authorities_record_normalized(record: Record, has_viaf_data=False):
118
+
119
+ field_667 = record.get_fields("667")[0].get_subfields("a")[0]
120
+ assert field_667 == "Muudetud AI poolt" or field_667 == "Loodud AI poolt"
121
+
122
+ field_040_subfields = record.get_fields("040")[0]
123
+
124
+ # check that a, b & c subfields have values (can have default or unique)
125
+ assert len(field_040_subfields.get_subfields("a")) > 0
126
+ assert len(field_040_subfields.get_subfields("b")) > 0
127
+ assert len(field_040_subfields.get_subfields("c")) > 0
128
+
129
+ # check that 008 field has a value of length 40
130
+ field_008 = record.get_fields("008")[0].data
131
+ assert len(field_008) == 40
132
+
133
+ if has_viaf_data:
134
+ field_043 = record.get_fields("043")[0].get_subfields(
135
+ "c")[0] # check that 043 has subfield c with value "ee"
136
+ assert field_043 == "ee"
137
+
138
+ field_024 = record.get_fields("024")
139
+ for f in field_024:
140
+ assert len(f.get_subfields("0")) > 0 # VIAF url
141
+
142
+ field_046 = record.get_fields("046")[0]
143
+ assert len(field_046.get_subfields("f")) > 0 # birth date
144
+ assert len(field_046.get_subfields("g")) > 0 # death date
145
+ # assert len(field_046.get_subfields("s")) > 0 # activity start
146
+ # assert len(field_046.get_subfields("t")) > 0 # activity end
147
+
148
+
149
+ @pytest.mark.skipif(TEST_LEVEL == "ci", reason=REASON)
150
+ def test_missing_fields_created_bibrecord_normalization():
151
+
152
+ normalizer_entities_only = BibRecordNormalizer(
153
+ entities=["Eduard Vilde", "Linda Vilde"], # find one match
154
+ )
155
+
156
+ normalizer_sierra_data_only = BibRecordNormalizer(
157
+ sierra_data=EMPTY_SIERRA_RECORDS,
158
+ )
159
+
160
+ for record in normalizer_entities_only:
161
+ check_record_tags_have_values(
162
+ record, ["008", "046", "245", # Sierra related, always with bibs
163
+ "035", "100", # VIAf enriched
164
+ ] + REQUIRED_FIELDS
165
+ )
166
+ validate_bibrecord_normalized(record, has_viaf_data=True)
167
+
168
+ for record in normalizer_sierra_data_only:
169
+ check_record_tags_have_values(
170
+ record, ["008", "046", "245", # Sierra related, always with bibs
171
+ ] + REQUIRED_FIELDS)
172
+ validate_bibrecord_normalized(record)
173
+
174
+
175
+ @pytest.mark.skipif(TEST_LEVEL == "ci", reason=REASON)
176
+ def test_missing_fields_created_authorities_normalization():
177
+
178
+ normalizer_entities_only = AuthoritiesRecordNormalizer(
179
+ entities=["Eduard Vilde"], # find one match
180
+ )
181
+
182
+ normalizer_sierra_data_only = AuthoritiesRecordNormalizer(
183
+ sierra_data=EMPTY_SIERRA_RECORDS,
184
+ )
185
+
186
+ for r in normalizer_entities_only:
187
+ check_record_tags_have_values(r, ["008", "040", # SIERRA related
188
+ "024", "043", "046" # VIAF enriched
189
+ ] + REQUIRED_FIELDS)
190
+ validate_authorities_record_normalized(r, True)
191
+
192
+ for r in normalizer_sierra_data_only:
193
+ check_record_tags_have_values(
194
+ r, ["040"] + REQUIRED_FIELDS)
195
+ validate_authorities_record_normalized(r)
196
+
197
+
198
+ @pytest.mark.skipif(TEST_LEVEL == "ci", reason=REASON)
199
+ def test_normalized_fields_sorted():
200
+
201
+ unsorted_bibdata = [
202
+ {
203
+ "sierraID": "1",
204
+ "leader": "00000nz a2200000n 4500",
205
+ "fields": [
206
+ {
207
+ "035": {
208
+ "ind1": " ",
209
+ "ind2": " ",
210
+ "subfields": [
211
+ {
212
+ "a": "(ErESTER)<1>"
213
+ }
214
+ ]
215
+ }
216
+ },
217
+ {
218
+ "008": "220805|||aznnnaabn || ||| nz n "
219
+ },
220
+ {
221
+ "046": {
222
+ "ind1": " ",
223
+ "ind2": " ",
224
+ "subfields": [
225
+ {
226
+ "k": "1912"
227
+ }
228
+
229
+ ]
230
+ }
231
+ },
232
+ ]
233
+ }
234
+ ]
235
+
236
+ normalizers = (BibRecordNormalizer, AuthoritiesRecordNormalizer)
237
+
238
+ for normalizer in normalizers:
239
+ normalizer = normalizer(
240
+ entities=[],
241
+ sierra_data=unsorted_bibdata
242
+ )
243
+
244
+ for r in normalizer:
245
+ check_no_dupe_tag_values(r)
246
+ check_record_tags_sorted(r)
247
+
248
+
249
+ @pytest.mark.skipif(TEST_LEVEL == "ci", reason=REASON)
250
+ def test_authority_normrecord_found_in_es_and_normalized():
251
+ """ KATA elastic normkirjete seast leitakse 1 vaste & normaliseerija täiendab leitud normkirjet VIAF infoga.
252
+ - valideeri normaliseerimise mapping, mis autori tabelis. Täiendatud väljad ja VIAFist info
253
+ - Valideeri märge lisatud (TODO) """
254
+ # Presume, author name identified and sent to linker
255
+ name = "Jaan Kross"
256
+
257
+ normalizer = AuthoritiesRecordNormalizer(
258
+ entities=[name]
259
+ )
260
+
261
+ data = normalizer.data
262
+
263
+ assert len(data) == 1
264
+
265
+ for r in normalizer:
266
+ check_record_tags_have_values(r, ["040"] + REQUIRED_FIELDS)
267
+ validate_authorities_record_normalized(r, has_viaf_data=True)
268
+
269
+
270
+ @pytest.mark.skipif(TEST_LEVEL == "ci", reason=REASON)
271
+ def test_authority_normrecord_not_found_in_es_and_viaf():
272
+ """KATA elastic normkirjete seast vastet ei leitud & linkija sooritab VIAFisse otsingu
273
+ - Üks vaste leiti - luuakse uus normkirje
274
+ - Ei leitud ühtegi vastet, või on leitud vasteid mitu - AI tuvastatud info põhjal uue kirje loomine(TODO)
275
+ """
276
+
277
+ # 1 result found
278
+ normalizer = AuthoritiesRecordNormalizer(entities=["Karl Ristikivi"])
279
+
280
+ data = normalizer.data
281
+
282
+ assert len(data) == 1 # should create new normalized record
283
+
284
+ # Entities not found, es & VIAF
285
+ normalizer = AuthoritiesRecordNormalizer(entities=["asdasd#@2"])
286
+ data = normalizer.data
287
+ assert len(data) == 0 # should create new normalized record
288
+
289
+ # multiple entities found, skipped
290
+ normalizer = AuthoritiesRecordNormalizer(entities=["Paul Keres"])
291
+ data = normalizer.data
292
+ assert len(data) == 0 # should not create anything atm
293
+
294
+
295
+ @pytest.mark.skipif(TEST_LEVEL == "ci", reason=REASON)
296
+ def test_matching_sierra_record_viaf_id_found():
297
+ """normkirjelt leitakse VIAF ID, vajadusel normi asukoht, kus see ID sisaldub."""
298
+ pass
299
+
300
+
301
+ @pytest.mark.skipif(TEST_LEVEL == "ci", reason=REASON)
302
+ def test_matching_sierra_record_viaf_id_not_found():
303
+ """kirjelt VIAF IDd ei leitud, soorita otsing VIAFi pihta, et leida _vastutav isik_?. Loo uus vastavalt otsingu tulemusele."""
304
+ pass
305
+
306
+
307
+ @pytest.mark.skipif(TEST_LEVEL == "ci", reason=REASON)
308
+ def test_authorities_normalizer_checks():
309
+ """
310
+ - kontrolli kas tuvastatud nimi on SIERRAst leitud vaste 1XX, 4XX väljadel. Kui pole, siis lisa 4XX väljale.
311
+ - kontrolli, kas VIAF andmete nimekujud on normkandes olemas. Kui pole, lisa need 4XX väljale.
312
+ - Kontrolli, kas VIAF kandes on sünni ja surma daatumid ja kas need klapivad normkandes olevaga. Kui pole, siis liiguta normkandest kogu 1XX väli 4XX väljale. Seejärel loo uute daatumitega 1XX väli.
313
+ - Kontrolli, et väljal 046 olevad daatumid klapiksid just 1xx väljale lisatuga. Kui andmeid muudeti, siis märgi, et baasis on normkanne muutunud
314
+ """
315
+ pass
@@ -0,0 +1,101 @@
1
+ import os
2
+
3
+ import pytest
4
+ from rara_tools.converters import SierraResponseConverter
5
+ from rara_tools.exceptions import SierraResponseConverterException
6
+
7
+ from tests.const import SIERRA_OUTPUT_DIR
8
+ from tests.test_utils import (read_json_file, get_formatted_sierra_response, compare_results)
9
+
10
+
11
+ example_res = {
12
+ "total": 100,
13
+ "start": 50000,
14
+ "entries": [
15
+ {
16
+ "id": 1126963,
17
+ "updatedDate": "2016-02-09T08:42:52Z",
18
+ "createdDate": "2014-05-17T17:22:00Z",
19
+ "deleted": False,
20
+ "suppressed": False,
21
+ "marc": {
22
+ "leader": "00000nz a2200145n 4500",
23
+ "fields": [
24
+ {
25
+ # "tag": "100",
26
+ "data": {
27
+ "ind1": "1",
28
+ "ind2": " ",
29
+ "subfields": [
30
+ {
31
+ "code": "a",
32
+ "data": "Viggor, Signe,"
33
+ },
34
+ {
35
+ "code": "d",
36
+ "data": "1975-"
37
+ }
38
+ ]
39
+ }
40
+ },
41
+ ]}}]}
42
+
43
+
44
+ def test_convert_bibs_response():
45
+
46
+ data = get_formatted_sierra_response("bibs.json")
47
+
48
+ expected = read_json_file(os.path.join(SIERRA_OUTPUT_DIR, "bibs.json"))
49
+
50
+ assert compare_results(expected, data)
51
+
52
+
53
+ def test_convert_keywords_response():
54
+
55
+ data = get_formatted_sierra_response("keywords.json")
56
+
57
+ expected = read_json_file(os.path.join(SIERRA_OUTPUT_DIR, "keywords.json"))
58
+
59
+ assert compare_results(expected, data)
60
+
61
+
62
+ def test_convert_authorities_response():
63
+
64
+ data = get_formatted_sierra_response("authorities.json")
65
+
66
+ expected = read_json_file(os.path.join(
67
+ SIERRA_OUTPUT_DIR, "authorities.json"))
68
+
69
+ assert compare_results(expected, data)
70
+
71
+
72
+ def test_converter_handles_marc_in_json_response():
73
+ """ Gracefully handle entries already in MARC-in-JSON format """
74
+ data = get_formatted_sierra_response("bibsmarc.json")
75
+
76
+ expected = read_json_file(os.path.join(SIERRA_OUTPUT_DIR, "bibsmarc.json"))
77
+
78
+ assert compare_results(expected, data)
79
+
80
+
81
+ def test_convert_with_wrong_format():
82
+ with pytest.raises(SierraResponseConverterException):
83
+ SierraResponseConverter("$")
84
+
85
+
86
+ def test_convert_missing_tag():
87
+ with pytest.raises(SierraResponseConverterException):
88
+ response = example_res.copy()
89
+ response["entries"][0]["marc"]["fields"][0].pop("tag", None)
90
+
91
+ converter = SierraResponseConverter(response)
92
+ converter.convert()
93
+
94
+
95
+ def test_no_entries_in_response():
96
+ with pytest.raises(SierraResponseConverterException):
97
+ response = example_res.copy()
98
+ response.pop("entries", [])
99
+
100
+ converter = SierraResponseConverter(response)
101
+ converter.convert()
@@ -0,0 +1,77 @@
1
+ from tests.const import SIERRA_INPUT_DIR, NORMALIZED_DIR, VIAF_TEST_DATA_DIR
2
+ from rara_tools.constants import VIAF_ALLOWED_SOURCES
3
+
4
+ from rara_tools.converters import SierraResponseConverter
5
+ from rara_tools.normalizers.viaf import VIAFRecord, VIAFClient
6
+
7
+ from rara_linker.linkers.linker import Linker
8
+
9
+ from pymarc import Record
10
+ from typing import List
11
+
12
+ import json
13
+ import os
14
+
15
+
16
+ def read_json_file(path: str):
17
+ with open(path, "r") as f:
18
+ data = f.read()
19
+ return json.loads(data)
20
+
21
+
22
+ def check_record_tags_sorted(record: Record):
23
+ record_tags = [field.tag for field in record.get_fields()]
24
+ assert record_tags == sorted(record_tags)
25
+
26
+
27
+ def check_no_dupe_tag_values(record: Record):
28
+ repetable_tags = ["024", "035", "400", "670"]
29
+ record_tags = [field.tag for field in record.get_fields() if field.tag not in repetable_tags]
30
+ assert len(record_tags) == len(set(record_tags))
31
+
32
+
33
+ def check_record_tags_have_values(record: Record, tags: List[str]):
34
+ for tag in tags:
35
+ assert record[tag] is not None
36
+
37
+
38
+ def get_record_field_value(record: Record, tag: str):
39
+ """ handle control & variable fields """
40
+ return record.get_fields(tag)[0].value()
41
+
42
+
43
+ def compare_results(expected: dict, results: dict):
44
+ return json.dumps(expected) == json.dumps(results)
45
+
46
+
47
+ def get_formatted_sierra_response(fname: str):
48
+ """ Reads a mock Sierra response file and converts it to MARC in json."""
49
+
50
+ response = read_json_file(os.path.join(SIERRA_INPUT_DIR, fname))
51
+
52
+ converter = SierraResponseConverter(response)
53
+ return converter.convert()
54
+
55
+
56
+ def get_viaf_record(id: str, allowed_sources: list):
57
+ """ Fetches VIAF record by ID and returns a VIAFRecord object """
58
+
59
+ client = VIAFClient() # should use Linker instead? not ViafLinker directly
60
+ response = client.get_records_by_viaf_id(id)
61
+
62
+ viaf_record = VIAFRecord(
63
+ response, allowed_sources=allowed_sources)
64
+ return viaf_record
65
+
66
+
67
+ def search_viaf_record(search_term: str, allowed_sources: list):
68
+ """ Fetches VIAF record by name and returns a VIAFRecord object """
69
+ client = VIAFClient()
70
+ response = client.get_records_by_search_term(search_term)
71
+
72
+ return VIAFRecord(response, allowed_sources=allowed_sources)
73
+
74
+ def get_normalized_example(fname: str):
75
+ with open(os.path.join(NORMALIZED_DIR, fname), "r") as f:
76
+ data = f.read()
77
+ return json.loads(data)
@@ -0,0 +1,19 @@
1
+ from rara_tools.normalizers.viaf import VIAFRecord, VIAFClient
2
+
3
+
4
+ def test_fetch_clusters_by_id_list():
5
+ viaf_ids = ["7432247", "456"]
6
+ client = VIAFClient()
7
+
8
+ results = client.fetch_viaf_clusters(viaf_ids)
9
+ assert len(results) == 2
10
+ assert results["456"] == {}
11
+ assert len(results["7432247"]) > 0
12
+
13
+
14
+ def test_fetch_viaf_results_for_normalizer():
15
+ viaf_ids = ["7432247", "456"]
16
+ client = VIAFClient()
17
+
18
+ results = client.get_normalized_data(viaf_ids)
19
+ assert len(results) == 2
rara_tools-0.0.13/VERSION DELETED
@@ -1 +0,0 @@
1
- 0.0.13
File without changes
@@ -1,127 +0,0 @@
1
- import json
2
- import os
3
-
4
- import pytest
5
- from rara_tools.converters import SierraResponseConverter
6
- from rara_tools.exceptions import SierraResponseConverterException
7
-
8
- import json
9
-
10
- root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
11
-
12
- SIERRA_TEST_DATA_DIR = os.path.join(root, "tests", "test_data", "sierra")
13
- INPUT_DIR = os.path.join(SIERRA_TEST_DATA_DIR, "input")
14
- OUTPUT_DIR = os.path.join(SIERRA_TEST_DATA_DIR, "output")
15
-
16
- def compare_results(expected, converted):
17
- return json.dumps(expected) == json.dumps(converted)
18
-
19
- def read_json_file(file_path):
20
- with open(file_path, "r") as f:
21
- data = f.read()
22
- return json.loads(data)
23
-
24
- example_res = {
25
- "total": 100,
26
- "start": 50000,
27
- "entries": [
28
- {
29
- "id": 1126963,
30
- "updatedDate": "2016-02-09T08:42:52Z",
31
- "createdDate": "2014-05-17T17:22:00Z",
32
- "deleted": False,
33
- "suppressed": False,
34
- "marc": {
35
- "leader": "00000nz a2200145n 4500",
36
- "fields": [
37
- {
38
- # "tag": "100",
39
- "data": {
40
- "ind1": "1",
41
- "ind2": " ",
42
- "subfields": [
43
- {
44
- "code": "a",
45
- "data": "Viggor, Signe,"
46
- },
47
- {
48
- "code": "d",
49
- "data": "1975-"
50
- }
51
- ]
52
- }
53
- },
54
- ]}}]}
55
-
56
-
57
-
58
-
59
- def test_convert_bibs_response():
60
- response = read_json_file(os.path.join(INPUT_DIR, "bibs.json"))
61
-
62
- converter = SierraResponseConverter(response)
63
- data = converter.convert()
64
-
65
- expected = read_json_file(os.path.join(OUTPUT_DIR, "bibs.json"))
66
-
67
- assert compare_results(expected, data)
68
-
69
-
70
- def test_convert_keywords_response():
71
- with open(os.path.join(INPUT_DIR, "keywords.json"), "r") as f:
72
- response = f.read()
73
- response = json.loads(response)
74
-
75
- converter = SierraResponseConverter(response)
76
- data = converter.convert()
77
-
78
-
79
- expected = read_json_file(os.path.join(OUTPUT_DIR, "keywords.json"))
80
-
81
- assert compare_results(expected, data)
82
-
83
-
84
- def test_convert_authorities_response():
85
- with open(os.path.join(INPUT_DIR, "authorities.json"), "r") as f:
86
- response = f.read()
87
- response = json.loads(response)
88
-
89
- converter = SierraResponseConverter(response)
90
- data = converter.convert()
91
-
92
- expected = read_json_file(os.path.join(OUTPUT_DIR, "authorities.json"))
93
-
94
- assert compare_results(expected, data)
95
-
96
- def test_converter_handles_marc_in_json_response():
97
- """ Gracefully handle entries already in MARC-in-JSON format """
98
- with open(os.path.join(INPUT_DIR, "bibsmarc.json"), "r") as f:
99
- response = f.read()
100
- response = json.loads(response)
101
-
102
- converter = SierraResponseConverter(response)
103
- data = converter.convert()
104
-
105
- expected = read_json_file(os.path.join(OUTPUT_DIR, "bibsmarc.json"))
106
-
107
- assert compare_results(expected, data)
108
-
109
- def test_convert_with_wrong_format():
110
- with pytest.raises(SierraResponseConverterException):
111
- SierraResponseConverter("$")
112
-
113
- def test_convert_missing_tag():
114
- with pytest.raises(SierraResponseConverterException):
115
- response = example_res.copy()
116
- response["entries"][0]["marc"]["fields"][0].pop("tag", None)
117
-
118
- converter = SierraResponseConverter(response)
119
- converter.convert()
120
-
121
- def test_no_entries_in_response():
122
- with pytest.raises(SierraResponseConverterException):
123
- response = example_res.copy()
124
- response.pop("entries", [])
125
-
126
- converter = SierraResponseConverter(response)
127
- converter.convert()
File without changes
File without changes
File without changes
File without changes