rara-tools 0.5.2__tar.gz → 0.5.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of rara-tools might be problematic. Click here for more details.

Files changed (64):
  1. {rara_tools-0.5.2/rara_tools.egg-info → rara_tools-0.5.3}/PKG-INFO +2 -1
  2. rara_tools-0.5.3/VERSION +1 -0
  3. rara_tools-0.5.3/rara_tools/constants/normalizers.py +44 -0
  4. {rara_tools-0.5.2 → rara_tools-0.5.3}/rara_tools/normalizers/base.py +40 -24
  5. rara_tools-0.5.3/rara_tools/normalizers/viaf.py +653 -0
  6. {rara_tools-0.5.2 → rara_tools-0.5.3/rara_tools.egg-info}/PKG-INFO +2 -1
  7. {rara_tools-0.5.2 → rara_tools-0.5.3}/rara_tools.egg-info/requires.txt +1 -0
  8. {rara_tools-0.5.2 → rara_tools-0.5.3}/requirements.txt +1 -0
  9. rara_tools-0.5.3/tests/test_viaf_client.py +71 -0
  10. rara_tools-0.5.2/VERSION +0 -1
  11. rara_tools-0.5.2/rara_tools/constants/normalizers.py +0 -6
  12. rara_tools-0.5.2/rara_tools/normalizers/viaf.py +0 -204
  13. rara_tools-0.5.2/tests/test_viaf_client.py +0 -19
  14. {rara_tools-0.5.2 → rara_tools-0.5.3}/LICENSE.md +0 -0
  15. {rara_tools-0.5.2 → rara_tools-0.5.3}/README.md +0 -0
  16. {rara_tools-0.5.2 → rara_tools-0.5.3}/pyproject.toml +0 -0
  17. {rara_tools-0.5.2 → rara_tools-0.5.3}/rara_tools/constants/__init__.py +0 -0
  18. {rara_tools-0.5.2 → rara_tools-0.5.3}/rara_tools/constants/digitizer.py +0 -0
  19. {rara_tools-0.5.2 → rara_tools-0.5.3}/rara_tools/constants/general.py +0 -0
  20. {rara_tools-0.5.2 → rara_tools-0.5.3}/rara_tools/constants/language_evaluator.py +0 -0
  21. {rara_tools-0.5.2 → rara_tools-0.5.3}/rara_tools/constants/linker.py +0 -0
  22. {rara_tools-0.5.2 → rara_tools-0.5.3}/rara_tools/constants/meta_extractor.py +0 -0
  23. {rara_tools-0.5.2 → rara_tools-0.5.3}/rara_tools/constants/parsers.py +0 -0
  24. {rara_tools-0.5.2 → rara_tools-0.5.3}/rara_tools/constants/subject_indexer.py +0 -0
  25. {rara_tools-0.5.2 → rara_tools-0.5.3}/rara_tools/converters.py +0 -0
  26. {rara_tools-0.5.2 → rara_tools-0.5.3}/rara_tools/decorators.py +0 -0
  27. {rara_tools-0.5.2 → rara_tools-0.5.3}/rara_tools/digar_schema_converter.py +0 -0
  28. {rara_tools-0.5.2 → rara_tools-0.5.3}/rara_tools/elastic.py +0 -0
  29. {rara_tools-0.5.2 → rara_tools-0.5.3}/rara_tools/exceptions.py +0 -0
  30. {rara_tools-0.5.2 → rara_tools-0.5.3}/rara_tools/normalizers/__init__.py +0 -0
  31. {rara_tools-0.5.2 → rara_tools-0.5.3}/rara_tools/normalizers/authorities.py +0 -0
  32. {rara_tools-0.5.2 → rara_tools-0.5.3}/rara_tools/normalizers/bibs.py +0 -0
  33. {rara_tools-0.5.2 → rara_tools-0.5.3}/rara_tools/parsers/marc_parsers/base_parser.py +0 -0
  34. {rara_tools-0.5.2 → rara_tools-0.5.3}/rara_tools/parsers/marc_parsers/ems_parser.py +0 -0
  35. {rara_tools-0.5.2 → rara_tools-0.5.3}/rara_tools/parsers/marc_parsers/location_parser.py +0 -0
  36. {rara_tools-0.5.2 → rara_tools-0.5.3}/rara_tools/parsers/marc_parsers/organization_parser.py +0 -0
  37. {rara_tools-0.5.2 → rara_tools-0.5.3}/rara_tools/parsers/marc_parsers/person_parser.py +0 -0
  38. {rara_tools-0.5.2 → rara_tools-0.5.3}/rara_tools/parsers/marc_parsers/title_parser.py +0 -0
  39. {rara_tools-0.5.2 → rara_tools-0.5.3}/rara_tools/parsers/marc_records/base_record.py +0 -0
  40. {rara_tools-0.5.2 → rara_tools-0.5.3}/rara_tools/parsers/marc_records/ems_record.py +0 -0
  41. {rara_tools-0.5.2 → rara_tools-0.5.3}/rara_tools/parsers/marc_records/organization_record.py +0 -0
  42. {rara_tools-0.5.2 → rara_tools-0.5.3}/rara_tools/parsers/marc_records/person_record.py +0 -0
  43. {rara_tools-0.5.2 → rara_tools-0.5.3}/rara_tools/parsers/marc_records/title_record.py +0 -0
  44. {rara_tools-0.5.2 → rara_tools-0.5.3}/rara_tools/parsers/tools/entity_normalizers.py +0 -0
  45. {rara_tools-0.5.2 → rara_tools-0.5.3}/rara_tools/parsers/tools/marc_converter.py +0 -0
  46. {rara_tools-0.5.2 → rara_tools-0.5.3}/rara_tools/parsers/tools/russian_transliterator.py +0 -0
  47. {rara_tools-0.5.2 → rara_tools-0.5.3}/rara_tools/s3.py +0 -0
  48. {rara_tools-0.5.2 → rara_tools-0.5.3}/rara_tools/task_reporter.py +0 -0
  49. {rara_tools-0.5.2 → rara_tools-0.5.3}/rara_tools/utils.py +0 -0
  50. {rara_tools-0.5.2 → rara_tools-0.5.3}/rara_tools.egg-info/SOURCES.txt +0 -0
  51. {rara_tools-0.5.2 → rara_tools-0.5.3}/rara_tools.egg-info/dependency_links.txt +0 -0
  52. {rara_tools-0.5.2 → rara_tools-0.5.3}/rara_tools.egg-info/top_level.txt +0 -0
  53. {rara_tools-0.5.2 → rara_tools-0.5.3}/setup.cfg +0 -0
  54. {rara_tools-0.5.2 → rara_tools-0.5.3}/tests/test_digar_schema_converter.py +0 -0
  55. {rara_tools-0.5.2 → rara_tools-0.5.3}/tests/test_elastic.py +0 -0
  56. {rara_tools-0.5.2 → rara_tools-0.5.3}/tests/test_elastic_vector_and_search_operations.py +0 -0
  57. {rara_tools-0.5.2 → rara_tools-0.5.3}/tests/test_entity_normalizers.py +0 -0
  58. {rara_tools-0.5.2 → rara_tools-0.5.3}/tests/test_marc_parsers.py +0 -0
  59. {rara_tools-0.5.2 → rara_tools-0.5.3}/tests/test_normalization.py +0 -0
  60. {rara_tools-0.5.2 → rara_tools-0.5.3}/tests/test_s3_exceptions.py +0 -0
  61. {rara_tools-0.5.2 → rara_tools-0.5.3}/tests/test_s3_file_operations.py +0 -0
  62. {rara_tools-0.5.2 → rara_tools-0.5.3}/tests/test_sierra_converters.py +0 -0
  63. {rara_tools-0.5.2 → rara_tools-0.5.3}/tests/test_task_reporter.py +0 -0
  64. {rara_tools-0.5.2 → rara_tools-0.5.3}/tests/test_utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: rara-tools
3
- Version: 0.5.2
3
+ Version: 0.5.3
4
4
  Summary: Tools to support Kata's work.
5
5
  Classifier: Programming Language :: Python :: 3
6
6
  Classifier: Programming Language :: Python :: 3.10
@@ -18,6 +18,7 @@ Requires-Dist: nltk
18
18
  Requires-Dist: jsonlines
19
19
  Requires-Dist: requests
20
20
  Requires-Dist: iso639-lang
21
+ Requires-Dist: jellyfish
21
22
  Requires-Dist: pymarc
22
23
  Requires-Dist: regex
23
24
  Requires-Dist: glom
@@ -0,0 +1 @@
1
+ 0.5.3
@@ -0,0 +1,44 @@
1
+ from pymarc import Indicators
2
+ import os
3
+
4
+ class EntityType:
5
+ PER = "PER"
6
+ ORG = "ORG"
7
+ KEYWORD = "EMS_KEYWORD"
8
+ LOC = "LOC"
9
+ TITLE = "TITLE"
10
+ UNK = "UNKNOWN"
11
+
12
+
13
+ EMPTY_INDICATORS = Indicators(" ", " ")
14
+ VIAF_ALLOWED_SOURCES = ["LC", "DNB", "LNB", "NLL",
15
+ "ERRR", "J9U"]
16
+
17
+ DEFAULT_VIAF_FIELD = "local.names"
18
+
19
+ ALLOWED_VIAF_FIELDS = [
20
+ "cql.any", # All fields
21
+ "local.names", # All headings
22
+ "local.personalNames", # Personal names
23
+ "local.corporateNames", # Corporate names
24
+ "local.geographicNames", # Geographic names
25
+ "local.uniformTitleWorks", # Works
26
+ "local.uniformTitleExpressions", # Expressions
27
+ "local.mainHeadingEl", # Preferred headings
28
+ "Xlocal.names", # Exact headings
29
+ "local.title" # Bibliographic titles
30
+ ]
31
+
32
+ # For mapping rara-linker's entity types to the corresponding VIAF fields
33
+ VIAF_ENTITY_MAP = {
34
+ EntityType.PER: "local.personalNames",
35
+ EntityType.ORG: "local.corporateNames",
36
+ EntityType.LOC: "loca.geographicNames",
37
+ EntityType.TITLE: "local.uniformTitleWorks"
38
+
39
+
40
+ }
41
+ ALLOWED_VIAF_WIKILINK_LANGS = ["en", "et"]
42
+ VIAF_SIMILARITY_THRESHOLD = 0.92
43
+ VERIFY_VIAF_RECORD = True
44
+ MAX_VIAF_RECORDS_TO_VERIFY = 10
@@ -4,7 +4,10 @@ from typing import List, Optional, Iterator
4
4
 
5
5
  from rara_tools.constants import EMPTY_INDICATORS
6
6
  from rara_tools.normalizers.viaf import VIAFRecord, VIAFClient
7
-
7
+ from rara_tools.constants.normalizers import (
8
+ DEFAULT_VIAF_FIELD, ALLOWED_VIAF_FIELDS, ALLOWED_VIAF_WIKILINK_LANGS,
9
+ VIAF_SIMILARITY_THRESHOLD, VERIFY_VIAF_RECORD, MAX_VIAF_RECORDS_TO_VERIFY
10
+ )
8
11
  from glom import glom
9
12
  import logging
10
13
  import json
@@ -187,7 +190,7 @@ class RecordNormalizer:
187
190
  "Collective": "111"
188
191
  }
189
192
 
190
- author_type = viaf_record.author_type
193
+ author_type = viaf_record.name_type
191
194
  tag = type_map.get(author_type, "100")
192
195
 
193
196
  fields = [
@@ -195,9 +198,9 @@ class RecordNormalizer:
195
198
  tag=tag,
196
199
  indicators=EMPTY_INDICATORS,
197
200
  subfields=[
198
- Subfield("a", viaf_record.author),
199
- Subfield("b", viaf_record.author_type),
200
- Subfield("c", viaf_record.author_type)
201
+ Subfield("a", viaf_record.name),
202
+ Subfield("b", viaf_record.name_type), # Is this correct??
203
+ Subfield("c", viaf_record.name_type) # Is this correct??
201
204
  ]
202
205
  )
203
206
  ]
@@ -231,32 +234,45 @@ class RecordNormalizer:
231
234
  if entity:
232
235
  return entity
233
236
  else:
234
- return record.author
237
+ return record.name
238
+
239
+ def _get_viaf_record(self, record: Record, viaf_id: Optional[int] = None,
240
+ entity: Optional[str] = None, viaf_field: str = DEFAULT_VIAF_FIELD,
241
+ threshold: float = VIAF_SIMILARITY_THRESHOLD, verify: bool = VERIFY_VIAF_RECORD,
242
+ max_records: int = MAX_VIAF_RECORDS_TO_VERIFY
243
+ ) -> Optional[VIAFRecord]:
244
+ viaf_record = None
235
245
 
236
- def _get_viaf_record(self, record: Record, viaf_id: Optional[int] = None, entity: Optional[str] = None) -> Optional[VIAFRecord]:
237
246
  try:
238
247
  viaf_client = VIAFClient()
239
248
 
240
249
  if viaf_id:
241
- viaf_info = viaf_client.get_records_by_viaf_id(viaf_id).json()
242
- return VIAFRecord(viaf_info)
243
-
244
- search_term = self._get_viaf_search_term(record, entity)
245
-
246
- results = viaf_client.get_records_by_search_term(
247
- search_term).json()
248
-
249
- num_records = glom(
250
- results, "queryResult.numberOfRecords.value", default=0)
251
-
252
- if num_records == 1:
253
- return VIAFRecord(results)
254
-
255
- logger.warning(
256
- f"Multiple VIAF records found for {search_term}: {num_records}. Skipping.")
250
+ viaf_records = viaf_client.get_normalized_data_by_ids([viaf_id])
251
+ if viaf_records:
252
+ viaf_record = viaf_records[0]
253
+ else:
254
+ search_term = self._get_viaf_search_term(record, entity)
255
+ if not verify:
256
+ logger.warning(
257
+ f"Record verification is turned off. If multiple records are " \
258
+ f"detected for search term '{search_term}', the first " \
259
+ f"result is automatically returned. This might lead to " \
260
+ f"some inaccuracies!"
261
+ )
262
+
263
+ viaf_record = viaf_client.get_normalized_data_by_search_term(
264
+ search_term=search_term,
265
+ field=viaf_field,
266
+ max_records=max_records,
267
+ verify=verify,
268
+ threshold=threshold
269
+ )
257
270
 
258
271
  except Exception as e:
259
- logger.error(f"Error fetching VIAF record: {e}")
272
+ logger.error(
273
+ f"Error fetching VIAF record with ID={viaf_id} / entity='{entity}': {e}"
274
+ )
275
+ return viaf_record
260
276
 
261
277
  def _normalize_record(self, record: Record, sierraID: str,
262
278
  viaf_record: VIAFRecord, is_editing_existing_record: bool) -> Record: