rara-tools 0.5.3__tar.gz → 0.6.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of rara-tools might be problematic. Click here for more details.

Files changed (63) hide show
  1. {rara_tools-0.5.3/rara_tools.egg-info → rara_tools-0.6.1}/PKG-INFO +1 -1
  2. rara_tools-0.6.1/VERSION +1 -0
  3. {rara_tools-0.5.3 → rara_tools-0.6.1}/rara_tools/constants/digitizer.py +1 -1
  4. {rara_tools-0.5.3 → rara_tools-0.6.1}/rara_tools/constants/general.py +4 -1
  5. rara_tools-0.6.1/rara_tools/constants/language_evaluator.py +9 -0
  6. rara_tools-0.6.1/rara_tools/constants/subject_indexer.py +9 -0
  7. {rara_tools-0.5.3 → rara_tools-0.6.1}/rara_tools/normalizers/base.py +2 -2
  8. {rara_tools-0.5.3 → rara_tools-0.6.1}/rara_tools/normalizers/viaf.py +67 -28
  9. {rara_tools-0.5.3 → rara_tools-0.6.1/rara_tools.egg-info}/PKG-INFO +1 -1
  10. {rara_tools-0.5.3 → rara_tools-0.6.1}/tests/test_viaf_client.py +17 -0
  11. rara_tools-0.5.3/VERSION +0 -1
  12. rara_tools-0.5.3/rara_tools/constants/language_evaluator.py +0 -1
  13. rara_tools-0.5.3/rara_tools/constants/subject_indexer.py +0 -1
  14. {rara_tools-0.5.3 → rara_tools-0.6.1}/LICENSE.md +0 -0
  15. {rara_tools-0.5.3 → rara_tools-0.6.1}/README.md +0 -0
  16. {rara_tools-0.5.3 → rara_tools-0.6.1}/pyproject.toml +0 -0
  17. {rara_tools-0.5.3 → rara_tools-0.6.1}/rara_tools/constants/__init__.py +0 -0
  18. {rara_tools-0.5.3 → rara_tools-0.6.1}/rara_tools/constants/linker.py +0 -0
  19. {rara_tools-0.5.3 → rara_tools-0.6.1}/rara_tools/constants/meta_extractor.py +0 -0
  20. {rara_tools-0.5.3 → rara_tools-0.6.1}/rara_tools/constants/normalizers.py +0 -0
  21. {rara_tools-0.5.3 → rara_tools-0.6.1}/rara_tools/constants/parsers.py +0 -0
  22. {rara_tools-0.5.3 → rara_tools-0.6.1}/rara_tools/converters.py +0 -0
  23. {rara_tools-0.5.3 → rara_tools-0.6.1}/rara_tools/decorators.py +0 -0
  24. {rara_tools-0.5.3 → rara_tools-0.6.1}/rara_tools/digar_schema_converter.py +0 -0
  25. {rara_tools-0.5.3 → rara_tools-0.6.1}/rara_tools/elastic.py +0 -0
  26. {rara_tools-0.5.3 → rara_tools-0.6.1}/rara_tools/exceptions.py +0 -0
  27. {rara_tools-0.5.3 → rara_tools-0.6.1}/rara_tools/normalizers/__init__.py +0 -0
  28. {rara_tools-0.5.3 → rara_tools-0.6.1}/rara_tools/normalizers/authorities.py +0 -0
  29. {rara_tools-0.5.3 → rara_tools-0.6.1}/rara_tools/normalizers/bibs.py +0 -0
  30. {rara_tools-0.5.3 → rara_tools-0.6.1}/rara_tools/parsers/marc_parsers/base_parser.py +0 -0
  31. {rara_tools-0.5.3 → rara_tools-0.6.1}/rara_tools/parsers/marc_parsers/ems_parser.py +0 -0
  32. {rara_tools-0.5.3 → rara_tools-0.6.1}/rara_tools/parsers/marc_parsers/location_parser.py +0 -0
  33. {rara_tools-0.5.3 → rara_tools-0.6.1}/rara_tools/parsers/marc_parsers/organization_parser.py +0 -0
  34. {rara_tools-0.5.3 → rara_tools-0.6.1}/rara_tools/parsers/marc_parsers/person_parser.py +0 -0
  35. {rara_tools-0.5.3 → rara_tools-0.6.1}/rara_tools/parsers/marc_parsers/title_parser.py +0 -0
  36. {rara_tools-0.5.3 → rara_tools-0.6.1}/rara_tools/parsers/marc_records/base_record.py +0 -0
  37. {rara_tools-0.5.3 → rara_tools-0.6.1}/rara_tools/parsers/marc_records/ems_record.py +0 -0
  38. {rara_tools-0.5.3 → rara_tools-0.6.1}/rara_tools/parsers/marc_records/organization_record.py +0 -0
  39. {rara_tools-0.5.3 → rara_tools-0.6.1}/rara_tools/parsers/marc_records/person_record.py +0 -0
  40. {rara_tools-0.5.3 → rara_tools-0.6.1}/rara_tools/parsers/marc_records/title_record.py +0 -0
  41. {rara_tools-0.5.3 → rara_tools-0.6.1}/rara_tools/parsers/tools/entity_normalizers.py +0 -0
  42. {rara_tools-0.5.3 → rara_tools-0.6.1}/rara_tools/parsers/tools/marc_converter.py +0 -0
  43. {rara_tools-0.5.3 → rara_tools-0.6.1}/rara_tools/parsers/tools/russian_transliterator.py +0 -0
  44. {rara_tools-0.5.3 → rara_tools-0.6.1}/rara_tools/s3.py +0 -0
  45. {rara_tools-0.5.3 → rara_tools-0.6.1}/rara_tools/task_reporter.py +0 -0
  46. {rara_tools-0.5.3 → rara_tools-0.6.1}/rara_tools/utils.py +0 -0
  47. {rara_tools-0.5.3 → rara_tools-0.6.1}/rara_tools.egg-info/SOURCES.txt +0 -0
  48. {rara_tools-0.5.3 → rara_tools-0.6.1}/rara_tools.egg-info/dependency_links.txt +0 -0
  49. {rara_tools-0.5.3 → rara_tools-0.6.1}/rara_tools.egg-info/requires.txt +0 -0
  50. {rara_tools-0.5.3 → rara_tools-0.6.1}/rara_tools.egg-info/top_level.txt +0 -0
  51. {rara_tools-0.5.3 → rara_tools-0.6.1}/requirements.txt +0 -0
  52. {rara_tools-0.5.3 → rara_tools-0.6.1}/setup.cfg +0 -0
  53. {rara_tools-0.5.3 → rara_tools-0.6.1}/tests/test_digar_schema_converter.py +0 -0
  54. {rara_tools-0.5.3 → rara_tools-0.6.1}/tests/test_elastic.py +0 -0
  55. {rara_tools-0.5.3 → rara_tools-0.6.1}/tests/test_elastic_vector_and_search_operations.py +0 -0
  56. {rara_tools-0.5.3 → rara_tools-0.6.1}/tests/test_entity_normalizers.py +0 -0
  57. {rara_tools-0.5.3 → rara_tools-0.6.1}/tests/test_marc_parsers.py +0 -0
  58. {rara_tools-0.5.3 → rara_tools-0.6.1}/tests/test_normalization.py +0 -0
  59. {rara_tools-0.5.3 → rara_tools-0.6.1}/tests/test_s3_exceptions.py +0 -0
  60. {rara_tools-0.5.3 → rara_tools-0.6.1}/tests/test_s3_file_operations.py +0 -0
  61. {rara_tools-0.5.3 → rara_tools-0.6.1}/tests/test_sierra_converters.py +0 -0
  62. {rara_tools-0.5.3 → rara_tools-0.6.1}/tests/test_task_reporter.py +0 -0
  63. {rara_tools-0.5.3 → rara_tools-0.6.1}/tests/test_utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: rara-tools
3
- Version: 0.5.3
3
+ Version: 0.6.1
4
4
  Summary: Tools to support Kata's work.
5
5
  Classifier: Programming Language :: Python :: 3
6
6
  Classifier: Programming Language :: Python :: 3.10
@@ -0,0 +1 @@
1
+ 0.6.1
@@ -22,4 +22,4 @@ class Queue:
22
22
 
23
23
 
24
24
  class Tasks:
25
- MODEL_UPDATE = "component_model_update"
25
+ START_DIGITIZER_PIPELINE = "start_digitizer_pipeline"
@@ -11,5 +11,8 @@ class Queue:
11
11
  CORE = "core"
12
12
 
13
13
 
14
- class Task:
14
+ class Tasks:
15
15
  SEND_VERSION = "send_version_to_core"
16
+ UPDATE_TASK_STATUS = "update_task_status"
17
+ UPDATE_TASK_VALUES = "update_task_values"
18
+ MODEL_UPDATE = "component_model_update"
@@ -0,0 +1,9 @@
1
+ COMPONENT_KEY = "language_evaluator"
2
+
3
+
4
+ class Tasks:
5
+ EVALUATE = "text_evaluator"
6
+
7
+
8
+ class Queue:
9
+ EVALUATE = "text_evaluator"
@@ -0,0 +1,9 @@
1
+ COMPONENT_KEY = "subject_indexer"
2
+
3
+
4
+ class Tasks:
5
+ PIPELINE = "run_subject_indexer_pipeline"
6
+
7
+
8
+ class Queue:
9
+ MAIN = "subject-indexer"
@@ -2,11 +2,11 @@ from datetime import datetime
2
2
  from pymarc import (Field, Subfield, JSONReader, Record)
3
3
  from typing import List, Optional, Iterator
4
4
 
5
- from rara_tools.constants import EMPTY_INDICATORS
6
5
  from rara_tools.normalizers.viaf import VIAFRecord, VIAFClient
7
6
  from rara_tools.constants.normalizers import (
8
7
  DEFAULT_VIAF_FIELD, ALLOWED_VIAF_FIELDS, ALLOWED_VIAF_WIKILINK_LANGS,
9
- VIAF_SIMILARITY_THRESHOLD, VERIFY_VIAF_RECORD, MAX_VIAF_RECORDS_TO_VERIFY
8
+ VIAF_SIMILARITY_THRESHOLD, VERIFY_VIAF_RECORD, MAX_VIAF_RECORDS_TO_VERIFY,
9
+ EMPTY_INDICATORS
10
10
  )
11
11
  from glom import glom
12
12
  import logging
@@ -8,7 +8,7 @@ from requests.models import Response
8
8
  from rara_tools.parsers.tools.entity_normalizers import PersonalName, Normalizer
9
9
  from rara_tools.constants.normalizers import (
10
10
  DEFAULT_VIAF_FIELD, ALLOWED_VIAF_FIELDS, ALLOWED_VIAF_WIKILINK_LANGS,
11
- VIAF_SIMILARITY_THRESHOLD
11
+ VIAF_SIMILARITY_THRESHOLD, VIAF_ALLOWED_SOURCES
12
12
  )
13
13
 
14
14
  import logging
@@ -20,9 +20,7 @@ class VIAFRecord:
20
20
  """
21
21
  def __init__(self,
22
22
  record: dict,
23
- allowed_sources: List[str] = [
24
- "LC", "DNB", "LNB", "NLL", "ERRR", "J9U"
25
- ]
23
+ allowed_sources: List[str] = VIAF_ALLOWED_SOURCES
26
24
  ):
27
25
  """ Initializes VIAFRecord class.
28
26
 
@@ -108,14 +106,36 @@ class VIAFRecord:
108
106
  wikilink_lang = match.group()
109
107
  return wikilink_lang
110
108
 
111
- def _get_marc_field(self, marc_dict: dict, subfield: str = "a") -> str:
109
+ def _get_marc_field(self, marc_dict: dict, subfield: str = "a",
110
+ strict_subfield: bool = True
111
+ ) -> str:
112
+ """ Retrieve value from a MARC dict
113
+
114
+ Parameters
115
+ -----------
116
+ marc_dict: dict
117
+ MARC dictionary.
118
+ subfield: str
119
+ Subfield to extract
120
+ strict_subfield: bool
121
+ If set to True, data is extracted ONLY from
122
+ the subfield set with param `subfield`. If set to False,
123
+ data can be extracted from other subfields as well, as long as
124
+ there is only one subfield in the dict. This might be necessary
125
+ for uniformTitleWorks as sometimes the title is present in
126
+ subfield (t) while subfield (a) contains the author. However,
127
+ there are instances where the title is present in subfield (a)
128
+ with no author.
129
+ """
112
130
  value = ""
113
131
  if marc_dict.get("dtype", "") == "MARC21":
114
132
  subfields = marc_dict.get("subfield", [])
115
133
  for _subfield in subfields:
116
- if _subfield.get("code", "") == subfield:
134
+ if len(subfields) > 1 and _subfield.get("code", "") == subfield:
117
135
  value = _subfield.get("value", "")
118
136
  break
137
+ elif len(subfields) == 1 and not strict_subfield:
138
+ value = _subfield.get("value", "")
119
139
  return value
120
140
 
121
141
  def _get_marc_tag(self, marc_dict: dict) -> str:
@@ -127,7 +147,11 @@ class VIAFRecord:
127
147
  def _get_names(self, marc_dicts: List[dict]) -> List[str]:
128
148
  names_d = defaultdict(int)
129
149
  for marc_dict in marc_dicts:
130
- name = self._get_marc_field(marc_dict, self.subfield_indicator)
150
+ name = self._get_marc_field(
151
+ marc_dict=marc_dict,
152
+ subfield=self.subfield_indicator,
153
+ strict_subfield=False
154
+ )
131
155
  names_d[name]+=1
132
156
  name_list = sorted(
133
157
  list(names_d.items()),
@@ -154,8 +178,8 @@ class VIAFRecord:
154
178
  # Strip "." only if the last token is not an initial,
155
179
  # e.g: "Meri, Lennart." -> Strip
156
180
  # "Meri, L." -> Do not strip.
157
- ent_tokens = entity.split()
158
- if len(ent_tokens[-1]) > 2:
181
+ ent_tokens = [t.strip() for t in entity.split() if t.strip()]
182
+ if ent_tokens and len(ent_tokens[-1]) > 2:
159
183
  entity = entity.strip(".")
160
184
  return entity
161
185
 
@@ -166,6 +190,14 @@ class VIAFRecord:
166
190
  _entity = re.sub(r"[(][^)][)]", "", entity)
167
191
  return _entity.strip()
168
192
 
193
+ @property
194
+ def record(self) -> dict:
195
+ return self.__record
196
+
197
+ @property
198
+ def record_data(self) -> dict:
199
+ return self.__record_data
200
+
169
201
  @property
170
202
  def subfield_indicator(self) -> str:
171
203
  if not self.__subfield_indicator:
@@ -239,7 +271,6 @@ class VIAFRecord:
239
271
  vars_3 = [Normalizer.clean_entity(v) for v in _vars]
240
272
 
241
273
  vars = _vars + vars_3
242
- #print(vars)
243
274
  self.__name_variations = list(set(vars))
244
275
  return self.__name_variations
245
276
 
@@ -369,7 +400,10 @@ class VIAFRecord:
369
400
 
370
401
 
371
402
  class VIAFClient:
372
- def __init__(self, viaf_api_url: str = "https://viaf.org/api"):
403
+ def __init__(self,
404
+ viaf_api_url: str = "https://viaf.org/api",
405
+ allowed_viaf_sources: List[str] = VIAF_ALLOWED_SOURCES
406
+ ):
373
407
  self.root_url: str = viaf_api_url.strip("/")
374
408
  self.record_url: str = f"{self.root_url}/cluster-record"
375
409
  self.search_url: str = f"{self.root_url}/search"
@@ -377,6 +411,7 @@ class VIAFClient:
377
411
  "Accept": "application/json",
378
412
  "Content-Type": "application/json"
379
413
  }
414
+ self.allowed_viaf_sources: List[str] = allowed_viaf_sources
380
415
 
381
416
  def check_search_term_query(self) -> bool:
382
417
  """ Function for checking, if VIAF search term
@@ -448,6 +483,9 @@ class VIAFClient:
448
483
  score: float
449
484
  Similarity score of the most similar record.
450
485
  """
486
+ logger.debug(
487
+ f"Verifying if '{viaf_record.name}' is sufficiently similar to '{entity}'."
488
+ )
451
489
  # might not always be personal name, but shouldn't break anything
452
490
  if len(entity.split()) > 1:
453
491
  pn = PersonalName(entity)
@@ -464,7 +502,7 @@ class VIAFClient:
464
502
  max_similarity = score
465
503
  most_similar_record = var
466
504
  if score >= threshold:
467
- logger.info(
505
+ logger.debug(
468
506
  f"Verification successful! '{name_form}' sufficiently " \
469
507
  f"similar to '{var}'! Score = {score}."
470
508
  )
@@ -486,6 +524,10 @@ class VIAFClient:
486
524
  """ Takes in n VIAFRecords found while searching the term `search_term`.
487
525
  Returns the most similar VIAFRecord.
488
526
  """
527
+ logger.debug(
528
+ f"Retrieving a single verified record from VIAF search results. " \
529
+ f"search term = '{search_term}'."
530
+ )
489
531
  verified_record = None
490
532
  max_score = 0
491
533
  most_similar_record = ""
@@ -517,6 +559,7 @@ class VIAFClient:
517
559
  ) -> Response:
518
560
  """ Query VIAF records by search term.
519
561
  """
562
+ logger.debug(f"Retriecing VIAF records for search term '{search_term}'.")
520
563
  if field and field not in ALLOWED_VIAF_FIELDS:
521
564
  logger.error(
522
565
  f"Field '{field}' is not allowed. Defaulting to '{DEFAULT_VIAF_FIELD}'. " \
@@ -541,6 +584,7 @@ class VIAFClient:
541
584
  def get_records_by_viaf_id(self, record_id: str) -> Response:
542
585
  """ Query VIAF records by ID.
543
586
  """
587
+ logger.debug(f"Retrieving VIAF records for ID {record_id}.")
544
588
  data = {
545
589
  "reqValues": {
546
590
  "recordId": str(record_id)
@@ -552,6 +596,7 @@ class VIAFClient:
552
596
  def extract_viaf_ids(self, search_query_response: Response) -> List[str]:
553
597
  """ Parse VIAF ID-s from search query response.
554
598
  """
599
+ logger.debug("Extracting VIAF IDs from VIAF search query results.")
555
600
  try:
556
601
  records = search_query_response.json()["queryResult"]["records"]["record"]
557
602
  except Exception as e:
@@ -591,7 +636,6 @@ class VIAFClient:
591
636
 
592
637
  def fetch_viaf_clusters(self, viaf_ids: List[str]) -> Dict[str, dict]:
593
638
  results = {}
594
-
595
639
  for viaf_id in viaf_ids:
596
640
  try:
597
641
  response = self.get_records_by_viaf_id(viaf_id)
@@ -605,9 +649,13 @@ class VIAFClient:
605
649
 
606
650
  def get_normalized_data_by_ids(self, record_ids: List[str]) -> List[VIAFRecord]:
607
651
  """ Fetch data required for normalization from VIAF. """
652
+ logger.debug(f"Fetching VIAFRecords for the following IDs: {record_ids}.")
608
653
  response = self.fetch_viaf_clusters(record_ids)
609
654
  viaf_records = [
610
- VIAFRecord(response[record_id])
655
+ VIAFRecord(
656
+ record=response[record_id],
657
+ allowed_sources=self.allowed_viaf_sources
658
+ )
611
659
  for record_id in record_ids
612
660
  ]
613
661
  return viaf_records
@@ -618,6 +666,11 @@ class VIAFClient:
618
666
  viaf_index: str = "VIAF"
619
667
  ) -> VIAFRecord | None:
620
668
  """ Fetch data required for normalization from VIAF. """
669
+ logger.debug(
670
+ f"Finding VIAFRecords with search term '{search_term}' " \
671
+ f"using VIAF field='{field}', verify={verify}, threshold={threshold}. " \
672
+ f"Allowed VIAF sources are: {self.allowed_viaf_sources}."
673
+ )
621
674
  viaf_record = None
622
675
  viaf_ids = self.get_viaf_ids_by_search_terms(
623
676
  search_term=search_term,
@@ -637,17 +690,3 @@ class VIAFClient:
637
690
  records = self.get_normalized_data_by_ids(viaf_ids[:1])
638
691
  verified_record = records[0] if records else None
639
692
  return verified_record
640
-
641
-
642
-
643
- if __name__ == "__main__":
644
- from pprint import pprint
645
- vc = VIAFClient()
646
- entity="Kevade"
647
- record = vc.get_normalized_data_by_search_term(entity, field="local.uniformTitleWorks", max_records=5, verify=True)
648
- #pprint(record.record_data)
649
- if record:
650
- pprint(record.all_fields)
651
- #pprint(record.record_data)
652
- else:
653
- print(f"Couldn't detect a verified record for entity '{entity}' :(.")
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: rara-tools
3
- Version: 0.5.3
3
+ Version: 0.6.1
4
4
  Summary: Tools to support Kata's work.
5
5
  Classifier: Programming Language :: Python :: 3
6
6
  Classifier: Programming Language :: Python :: 3.10
@@ -69,3 +69,20 @@ def test_subfield_based_main_field_extraction():
69
69
  verify=True
70
70
  )
71
71
  assert record.name == "Luts, Oskar"
72
+
73
+ def test_changing_allowed_sources():
74
+ client = VIAFClient(allowed_viaf_sources=["PLWABN"])
75
+ record = client.get_normalized_data_by_search_term(
76
+ search_term="Anora",
77
+ field="local.uniformTitleWorks",
78
+ verify=False
79
+ )
80
+ assert record.name == "Anora (film)"
81
+
82
+ client = VIAFClient(allowed_viaf_sources=["LC"])
83
+ record = client.get_normalized_data_by_search_term(
84
+ search_term="Anora",
85
+ field="local.uniformTitleWorks",
86
+ verify=False
87
+ )
88
+ assert record.name == "Anora (Motion picture)"
rara_tools-0.5.3/VERSION DELETED
@@ -1 +0,0 @@
1
- 0.5.3
@@ -1 +0,0 @@
1
- COMPONENT_KEY = "language_evaluator"
@@ -1 +0,0 @@
1
- COMPONENT_KEY = "subject_indexer"
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes