rara-tools 0.7.0__tar.gz → 0.7.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of rara-tools might be problematic. Click here for more details.

Files changed (67)
  1. {rara_tools-0.7.0/rara_tools.egg-info → rara_tools-0.7.1}/PKG-INFO +1 -1
  2. rara_tools-0.7.1/VERSION +1 -0
  3. {rara_tools-0.7.0 → rara_tools-0.7.1}/rara_tools/normalizers/base.py +77 -20
  4. rara_tools-0.7.1/rara_tools/normalizers/reader.py +45 -0
  5. {rara_tools-0.7.0 → rara_tools-0.7.1/rara_tools.egg-info}/PKG-INFO +1 -1
  6. {rara_tools-0.7.0 → rara_tools-0.7.1}/rara_tools.egg-info/SOURCES.txt +1 -0
  7. {rara_tools-0.7.0 → rara_tools-0.7.1}/tests/test_normalization.py +28 -1
  8. rara_tools-0.7.0/VERSION +0 -1
  9. {rara_tools-0.7.0 → rara_tools-0.7.1}/LICENSE.md +0 -0
  10. {rara_tools-0.7.0 → rara_tools-0.7.1}/README.md +0 -0
  11. {rara_tools-0.7.0 → rara_tools-0.7.1}/pyproject.toml +0 -0
  12. {rara_tools-0.7.0 → rara_tools-0.7.1}/rara_tools/constants/__init__.py +0 -0
  13. {rara_tools-0.7.0 → rara_tools-0.7.1}/rara_tools/constants/digitizer.py +0 -0
  14. {rara_tools-0.7.0 → rara_tools-0.7.1}/rara_tools/constants/general.py +0 -0
  15. {rara_tools-0.7.0 → rara_tools-0.7.1}/rara_tools/constants/language_evaluator.py +0 -0
  16. {rara_tools-0.7.0 → rara_tools-0.7.1}/rara_tools/constants/linker.py +0 -0
  17. {rara_tools-0.7.0 → rara_tools-0.7.1}/rara_tools/constants/meta_extractor.py +0 -0
  18. {rara_tools-0.7.0 → rara_tools-0.7.1}/rara_tools/constants/normalizers.py +0 -0
  19. {rara_tools-0.7.0 → rara_tools-0.7.1}/rara_tools/constants/parsers.py +0 -0
  20. {rara_tools-0.7.0 → rara_tools-0.7.1}/rara_tools/constants/subject_indexer.py +0 -0
  21. {rara_tools-0.7.0 → rara_tools-0.7.1}/rara_tools/converters.py +0 -0
  22. {rara_tools-0.7.0 → rara_tools-0.7.1}/rara_tools/core_formatters/core_formatter.py +0 -0
  23. {rara_tools-0.7.0 → rara_tools-0.7.1}/rara_tools/core_formatters/formatted_keyword.py +0 -0
  24. {rara_tools-0.7.0 → rara_tools-0.7.1}/rara_tools/core_formatters/formatted_meta.py +0 -0
  25. {rara_tools-0.7.0 → rara_tools-0.7.1}/rara_tools/core_formatters/formatted_object.py +0 -0
  26. {rara_tools-0.7.0 → rara_tools-0.7.1}/rara_tools/decorators.py +0 -0
  27. {rara_tools-0.7.0 → rara_tools-0.7.1}/rara_tools/digar_schema_converter.py +0 -0
  28. {rara_tools-0.7.0 → rara_tools-0.7.1}/rara_tools/elastic.py +0 -0
  29. {rara_tools-0.7.0 → rara_tools-0.7.1}/rara_tools/exceptions.py +0 -0
  30. {rara_tools-0.7.0 → rara_tools-0.7.1}/rara_tools/normalizers/__init__.py +0 -0
  31. {rara_tools-0.7.0 → rara_tools-0.7.1}/rara_tools/normalizers/authorities.py +0 -0
  32. {rara_tools-0.7.0 → rara_tools-0.7.1}/rara_tools/normalizers/bibs.py +0 -0
  33. {rara_tools-0.7.0 → rara_tools-0.7.1}/rara_tools/normalizers/viaf.py +0 -0
  34. {rara_tools-0.7.0 → rara_tools-0.7.1}/rara_tools/parsers/marc_parsers/base_parser.py +0 -0
  35. {rara_tools-0.7.0 → rara_tools-0.7.1}/rara_tools/parsers/marc_parsers/ems_parser.py +0 -0
  36. {rara_tools-0.7.0 → rara_tools-0.7.1}/rara_tools/parsers/marc_parsers/location_parser.py +0 -0
  37. {rara_tools-0.7.0 → rara_tools-0.7.1}/rara_tools/parsers/marc_parsers/organization_parser.py +0 -0
  38. {rara_tools-0.7.0 → rara_tools-0.7.1}/rara_tools/parsers/marc_parsers/person_parser.py +0 -0
  39. {rara_tools-0.7.0 → rara_tools-0.7.1}/rara_tools/parsers/marc_parsers/title_parser.py +0 -0
  40. {rara_tools-0.7.0 → rara_tools-0.7.1}/rara_tools/parsers/marc_records/base_record.py +0 -0
  41. {rara_tools-0.7.0 → rara_tools-0.7.1}/rara_tools/parsers/marc_records/ems_record.py +0 -0
  42. {rara_tools-0.7.0 → rara_tools-0.7.1}/rara_tools/parsers/marc_records/organization_record.py +0 -0
  43. {rara_tools-0.7.0 → rara_tools-0.7.1}/rara_tools/parsers/marc_records/person_record.py +0 -0
  44. {rara_tools-0.7.0 → rara_tools-0.7.1}/rara_tools/parsers/marc_records/title_record.py +0 -0
  45. {rara_tools-0.7.0 → rara_tools-0.7.1}/rara_tools/parsers/tools/entity_normalizers.py +0 -0
  46. {rara_tools-0.7.0 → rara_tools-0.7.1}/rara_tools/parsers/tools/marc_converter.py +0 -0
  47. {rara_tools-0.7.0 → rara_tools-0.7.1}/rara_tools/parsers/tools/russian_transliterator.py +0 -0
  48. {rara_tools-0.7.0 → rara_tools-0.7.1}/rara_tools/s3.py +0 -0
  49. {rara_tools-0.7.0 → rara_tools-0.7.1}/rara_tools/task_reporter.py +0 -0
  50. {rara_tools-0.7.0 → rara_tools-0.7.1}/rara_tools/utils.py +0 -0
  51. {rara_tools-0.7.0 → rara_tools-0.7.1}/rara_tools.egg-info/dependency_links.txt +0 -0
  52. {rara_tools-0.7.0 → rara_tools-0.7.1}/rara_tools.egg-info/requires.txt +0 -0
  53. {rara_tools-0.7.0 → rara_tools-0.7.1}/rara_tools.egg-info/top_level.txt +0 -0
  54. {rara_tools-0.7.0 → rara_tools-0.7.1}/requirements.txt +0 -0
  55. {rara_tools-0.7.0 → rara_tools-0.7.1}/setup.cfg +0 -0
  56. {rara_tools-0.7.0 → rara_tools-0.7.1}/tests/test_digar_schema_converter.py +0 -0
  57. {rara_tools-0.7.0 → rara_tools-0.7.1}/tests/test_elastic.py +0 -0
  58. {rara_tools-0.7.0 → rara_tools-0.7.1}/tests/test_elastic_vector_and_search_operations.py +0 -0
  59. {rara_tools-0.7.0 → rara_tools-0.7.1}/tests/test_entity_normalizers.py +0 -0
  60. {rara_tools-0.7.0 → rara_tools-0.7.1}/tests/test_formatters.py +0 -0
  61. {rara_tools-0.7.0 → rara_tools-0.7.1}/tests/test_marc_parsers.py +0 -0
  62. {rara_tools-0.7.0 → rara_tools-0.7.1}/tests/test_s3_exceptions.py +0 -0
  63. {rara_tools-0.7.0 → rara_tools-0.7.1}/tests/test_s3_file_operations.py +0 -0
  64. {rara_tools-0.7.0 → rara_tools-0.7.1}/tests/test_sierra_converters.py +0 -0
  65. {rara_tools-0.7.0 → rara_tools-0.7.1}/tests/test_task_reporter.py +0 -0
  66. {rara_tools-0.7.0 → rara_tools-0.7.1}/tests/test_utils.py +0 -0
  67. {rara_tools-0.7.0 → rara_tools-0.7.1}/tests/test_viaf_client.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: rara-tools
3
- Version: 0.7.0
3
+ Version: 0.7.1
4
4
  Summary: Tools to support Kata's work.
5
5
  Classifier: Programming Language :: Python :: 3
6
6
  Classifier: Programming Language :: Python :: 3.10
@@ -0,0 +1 @@
1
+ 0.7.1
@@ -1,6 +1,7 @@
1
1
  from datetime import datetime
2
2
  from pymarc import (Field, Subfield, JSONReader, Record)
3
3
  from typing import List, Optional, Iterator
4
+ from rara_tools.normalizers.reader import SafeJSONReader
4
5
 
5
6
  from rara_tools.normalizers.viaf import VIAFRecord, VIAFClient
6
7
  from rara_tools.constants.normalizers import (
@@ -35,7 +36,6 @@ class RecordNormalizer:
35
36
  self.ALLOW_EDIT_FIELDS = ALLOW_EDIT_FIELDS
36
37
  # include, if should be added alongside existing fields
37
38
  self.REPEATABLE_FIELDS = REPEATABLE_FIELDS
38
-
39
39
  self.records_extra_data = []
40
40
  self.records = self._setup_records(linking_results, sierra_data)
41
41
  self.sierra_data = sierra_data
@@ -92,8 +92,56 @@ class RecordNormalizer:
92
92
 
93
93
  all_records = linked_records + (sierra_data or [])
94
94
 
95
- return JSONReader(json.dumps(all_records,
96
- ensure_ascii=False), stream=False)
95
+ return SafeJSONReader(
96
+ json.dumps(all_records, ensure_ascii=False),
97
+ )
98
+
99
+ def _setup_records(self, linking_results: List[dict], sierra_data: List[dict]) -> JSONReader:
100
+ """Setup initial MARC records and data.
101
+
102
+ For linked entities:
103
+ 1. Try to get single linked normalized record from KATA elastic. If more than one found, skip.
104
+ 2. If 0 matches, search from VIAF and if 1 result found, create a new authority record from the data.
105
+ 3. If none or more than one responses found, use only Classificator data (coming from Linker?).
106
+ """
107
+ linked_records = []
108
+
109
+ for linked in linking_results or []:
110
+ if not isinstance(linked, dict):
111
+ continue
112
+
113
+ entity = linked.get("original_entity")
114
+ linked_info = linked.get("linked_info", [])
115
+
116
+ if not isinstance(linked_info, list) or not linked_info:
117
+ continue
118
+
119
+ if len(linked_info) == 1:
120
+ linked_item = linked_info[0]
121
+ if not isinstance(linked_item, dict):
122
+ continue
123
+
124
+ linked_records.append(linked_item.get("json", {}))
125
+ self.records_extra_data.append({
126
+ "entity": entity,
127
+ "viaf": linked_item.get("viaf", {}),
128
+ "type": "linked",
129
+ "edited": True
130
+ })
131
+
132
+ self.records_extra_data.extend(
133
+ {
134
+ "sierraID": obj.get("sierraID"),
135
+ "type": "sierra",
136
+ "edited": True
137
+ }
138
+ for obj in (sierra_data or [])
139
+ if isinstance(obj, dict)
140
+ )
141
+
142
+ all_records = linked_records + (sierra_data or [])
143
+
144
+ return SafeJSONReader(json.dumps(all_records, ensure_ascii=False))
97
145
 
98
146
  @staticmethod
99
147
  def current_timestamp():
@@ -230,11 +278,15 @@ class RecordNormalizer:
230
278
 
231
279
  def _get_viaf_search_term(self, record: Record, entity: Optional[str]) -> Optional[str]:
232
280
  """ prioritize entity name, if not available, use author name. """
233
-
234
281
  if entity:
235
282
  return entity
236
- else:
237
- return record.name
283
+
284
+ author_field = record.get("100") or record.get("110") or record.get("111")
285
+ if author_field:
286
+ return author_field.get_subfields("a")[0] if author_field.get_subfields("a") else None
287
+
288
+ logger.warning(
289
+ "No entity or author name found for VIAF search. Skipping VIAF enrichment.")
238
290
 
239
291
  def _get_viaf_record(self, record: Record, viaf_id: Optional[int] = None,
240
292
  entity: Optional[str] = None, viaf_field: str = DEFAULT_VIAF_FIELD,
@@ -252,21 +304,26 @@ class RecordNormalizer:
252
304
  viaf_record = viaf_records[0]
253
305
  else:
254
306
  search_term = self._get_viaf_search_term(record, entity)
255
- if not verify:
256
- logger.warning(
257
- f"Record verification is turned off. If multiple records are " \
258
- f"detected for search term '{search_term}', the first " \
259
- f"result is automatically returned. This might lead to " \
260
- f"some inaccuracies!"
261
- )
262
307
 
263
- viaf_record = viaf_client.get_normalized_data_by_search_term(
264
- search_term=search_term,
265
- field=viaf_field,
266
- max_records=max_records,
267
- verify=verify,
268
- threshold=threshold
269
- )
308
+ if search_term:
309
+ logger.info(
310
+ f"Searching for VIAF record with search term: {search_term}")
311
+
312
+ if not verify:
313
+ logger.warning(
314
+ f"Record verification is turned off. If multiple records are " \
315
+ f"detected for search term '{search_term}', the first " \
316
+ f"result is automatically returned. This might lead to " \
317
+ f"some inaccuracies!"
318
+ )
319
+
320
+ viaf_record = viaf_client.get_normalized_data_by_search_term(
321
+ search_term=search_term,
322
+ field=viaf_field,
323
+ max_records=max_records,
324
+ verify=verify,
325
+ threshold=threshold
326
+ )
270
327
 
271
328
  except Exception as e:
272
329
  logger.error(
@@ -0,0 +1,45 @@
1
+ from pymarc import Record, Field, Subfield, Leader, JSONReader
2
+ import logging
3
+
4
+ logger = logging.getLogger(__name__)
5
+
6
+ DEFAULT_LEADER = '01682nz a2200349n 4500'
7
+
8
+ class SafeJSONReader(JSONReader):
9
+
10
+ def __next__(self):
11
+ while True:
12
+ try:
13
+ jobj = next(self.iter)
14
+ rec = Record()
15
+
16
+ # Use custom default leader if missing
17
+ leader_str = jobj.get("leader")
18
+ if leader_str:
19
+ rec.leader = Leader(leader_str)
20
+ else:
21
+ logger.warning("Missing leader in record. Using DEFAULT_LEADER.")
22
+ rec.leader = Leader(DEFAULT_LEADER)
23
+
24
+ for field in jobj["fields"]:
25
+ k, v = list(field.items())[0]
26
+
27
+ if isinstance(v, dict) and "subfields" in v:
28
+ subfields = []
29
+ for sub in v["subfields"]:
30
+ for code, value in sub.items():
31
+ subfields.append(Subfield(code, value))
32
+ ind1 = v.get("ind1", " ")
33
+ ind2 = v.get("ind2", " ")
34
+ fld = Field(tag=k, indicators=[ind1, ind2], subfields=subfields)
35
+ else:
36
+ fld = Field(tag=k, data=v)
37
+ rec.add_field(fld)
38
+
39
+ return rec
40
+
41
+ except StopIteration:
42
+ raise
43
+ except Exception as e:
44
+ logger.error(f"Skipping invalid record: {e}")
45
+ continue
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: rara-tools
3
- Version: 0.7.0
3
+ Version: 0.7.1
4
4
  Summary: Tools to support Kata's work.
5
5
  Classifier: Programming Language :: Python :: 3
6
6
  Classifier: Programming Language :: Python :: 3.10
@@ -33,6 +33,7 @@ rara_tools/normalizers/__init__.py
33
33
  rara_tools/normalizers/authorities.py
34
34
  rara_tools/normalizers/base.py
35
35
  rara_tools/normalizers/bibs.py
36
+ rara_tools/normalizers/reader.py
36
37
  rara_tools/normalizers/viaf.py
37
38
  rara_tools/parsers/marc_parsers/base_parser.py
38
39
  rara_tools/parsers/marc_parsers/ems_parser.py
@@ -1,3 +1,4 @@
1
+ from rara_tools.constants import linker
1
2
  from rara_tools.normalizers import BibRecordNormalizer, AuthoritiesRecordNormalizer
2
3
  from tests.test_utils import (get_linker_res_example, get_formatted_sierra_response,
3
4
  check_record_tags_sorted, check_no_dupe_tag_values, check_record_tags_have_values)
@@ -281,7 +282,8 @@ def test_authority_normrecord_not_found_in_es_and_viaf():
281
282
  linking_results = [linker_res]
282
283
 
283
284
  normalizer = AuthoritiesRecordNormalizer(
284
- linking_results=linking_results)
285
+ linking_results=linking_results
286
+ )
285
287
 
286
288
  data = normalizer.data
287
289
 
@@ -302,6 +304,31 @@ def test_authority_normrecord_not_found_in_es_and_viaf():
302
304
  # should create new normalized record in the future, none for now
303
305
  assert len(data) == 0
304
306
 
307
+ def _run_normalizer(linked_data):
308
+ normalizer = AuthoritiesRecordNormalizer(
309
+ linking_results=linked_data
310
+ )
311
+ return normalizer.data
312
+
313
+ def test_normalizer_handles_bad_inputs():
314
+ linker_res = get_linker_res_example(
315
+ "oneFound.json")
316
+
317
+ # pop the leader field to simulate record without leader
318
+ linker_res["linked_info"][0]["json"].pop("leader", None)
319
+ _run_normalizer([linker_res])
320
+
321
+ # make fields empty to simulate a record with no fields
322
+ linker_res["linked_info"][0]["json"]["fields"] = []
323
+ _run_normalizer([linker_res])
324
+
325
+ # pop the fields to simulate a record with no fields
326
+ linker_res["linked_info"][0]["json"].pop("fields", None)
327
+ _run_normalizer([linker_res])
328
+
329
+ inputs = ["", None, [], {}, 123]
330
+
331
+ _run_normalizer(inputs)
305
332
 
306
333
  def test_matching_sierra_record_viaf_id_found():
307
334
  """normkirjelt leitakse VIAF ID, vajadusel normi asukoht, kus see ID sisaldub."""
rara_tools-0.7.0/VERSION DELETED
@@ -1 +0,0 @@
1
- 0.7.0
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes