rara-tools 0.2.0__tar.gz → 0.4.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of rara-tools might be problematic.

Files changed (59)
  1. {rara_tools-0.2.0/rara_tools.egg-info → rara_tools-0.4.0}/PKG-INFO +5 -2
  2. rara_tools-0.4.0/VERSION +1 -0
  3. {rara_tools-0.2.0 → rara_tools-0.4.0}/pyproject.toml +6 -1
  4. rara_tools-0.4.0/rara_tools/constants/normalizers.py +6 -0
  5. rara_tools-0.4.0/rara_tools/constants/parsers.py +152 -0
  6. rara_tools-0.4.0/rara_tools/normalizers/__init__.py +4 -0
  7. rara_tools-0.4.0/rara_tools/normalizers/authorities.py +120 -0
  8. rara_tools-0.4.0/rara_tools/normalizers/base.py +290 -0
  9. rara_tools-0.4.0/rara_tools/normalizers/bibs.py +76 -0
  10. rara_tools-0.4.0/rara_tools/normalizers/viaf.py +204 -0
  11. rara_tools-0.4.0/rara_tools/parsers/marc_parsers/base_parser.py +50 -0
  12. rara_tools-0.4.0/rara_tools/parsers/marc_parsers/ems_parser.py +49 -0
  13. rara_tools-0.4.0/rara_tools/parsers/marc_parsers/location_parser.py +46 -0
  14. rara_tools-0.4.0/rara_tools/parsers/marc_parsers/organization_parser.py +44 -0
  15. rara_tools-0.4.0/rara_tools/parsers/marc_parsers/person_parser.py +45 -0
  16. rara_tools-0.4.0/rara_tools/parsers/marc_parsers/title_parser.py +1 -0
  17. rara_tools-0.4.0/rara_tools/parsers/marc_records/base_record.py +112 -0
  18. rara_tools-0.4.0/rara_tools/parsers/marc_records/ems_record.py +267 -0
  19. rara_tools-0.4.0/rara_tools/parsers/marc_records/organization_record.py +245 -0
  20. rara_tools-0.4.0/rara_tools/parsers/marc_records/person_record.py +217 -0
  21. rara_tools-0.4.0/rara_tools/parsers/marc_records/title_record.py +1 -0
  22. rara_tools-0.4.0/rara_tools/parsers/tools/entity_normalizers.py +256 -0
  23. rara_tools-0.4.0/rara_tools/parsers/tools/marc_converter.py +15 -0
  24. rara_tools-0.4.0/rara_tools/parsers/tools/russian_transliterator.py +248 -0
  25. {rara_tools-0.2.0 → rara_tools-0.4.0/rara_tools.egg-info}/PKG-INFO +5 -2
  26. rara_tools-0.4.0/rara_tools.egg-info/SOURCES.txt +54 -0
  27. {rara_tools-0.2.0 → rara_tools-0.4.0}/rara_tools.egg-info/requires.txt +4 -1
  28. {rara_tools-0.2.0 → rara_tools-0.4.0}/requirements.txt +5 -2
  29. rara_tools-0.4.0/tests/test_entity_normalizers.py +64 -0
  30. rara_tools-0.4.0/tests/test_marc_parsers.py +49 -0
  31. {rara_tools-0.2.0 → rara_tools-0.4.0}/tests/test_normalization.py +41 -33
  32. {rara_tools-0.2.0 → rara_tools-0.4.0}/tests/test_utils.py +8 -9
  33. rara_tools-0.2.0/VERSION +0 -1
  34. rara_tools-0.2.0/rara_tools/constants/normalizers.py +0 -17
  35. rara_tools-0.2.0/rara_tools.egg-info/SOURCES.txt +0 -32
  36. {rara_tools-0.2.0 → rara_tools-0.4.0}/LICENSE.md +0 -0
  37. {rara_tools-0.2.0 → rara_tools-0.4.0}/README.md +0 -0
  38. {rara_tools-0.2.0 → rara_tools-0.4.0}/rara_tools/constants/__init__.py +0 -0
  39. {rara_tools-0.2.0 → rara_tools-0.4.0}/rara_tools/constants/digitizer.py +0 -0
  40. {rara_tools-0.2.0 → rara_tools-0.4.0}/rara_tools/constants/general.py +0 -0
  41. {rara_tools-0.2.0 → rara_tools-0.4.0}/rara_tools/converters.py +0 -0
  42. {rara_tools-0.2.0 → rara_tools-0.4.0}/rara_tools/decorators.py +0 -0
  43. {rara_tools-0.2.0 → rara_tools-0.4.0}/rara_tools/digar_schema_converter.py +0 -0
  44. {rara_tools-0.2.0 → rara_tools-0.4.0}/rara_tools/elastic.py +0 -0
  45. {rara_tools-0.2.0 → rara_tools-0.4.0}/rara_tools/exceptions.py +0 -0
  46. {rara_tools-0.2.0 → rara_tools-0.4.0}/rara_tools/s3.py +0 -0
  47. {rara_tools-0.2.0 → rara_tools-0.4.0}/rara_tools/task_reporter.py +0 -0
  48. {rara_tools-0.2.0 → rara_tools-0.4.0}/rara_tools/utils.py +0 -0
  49. {rara_tools-0.2.0 → rara_tools-0.4.0}/rara_tools.egg-info/dependency_links.txt +0 -0
  50. {rara_tools-0.2.0 → rara_tools-0.4.0}/rara_tools.egg-info/top_level.txt +0 -0
  51. {rara_tools-0.2.0 → rara_tools-0.4.0}/setup.cfg +0 -0
  52. {rara_tools-0.2.0 → rara_tools-0.4.0}/tests/test_digar_schema_converter.py +0 -0
  53. {rara_tools-0.2.0 → rara_tools-0.4.0}/tests/test_elastic.py +0 -0
  54. {rara_tools-0.2.0 → rara_tools-0.4.0}/tests/test_elastic_vector_and_search_operations.py +0 -0
  55. {rara_tools-0.2.0 → rara_tools-0.4.0}/tests/test_s3_exceptions.py +0 -0
  56. {rara_tools-0.2.0 → rara_tools-0.4.0}/tests/test_s3_file_operations.py +0 -0
  57. {rara_tools-0.2.0 → rara_tools-0.4.0}/tests/test_sierra_converters.py +0 -0
  58. {rara_tools-0.2.0 → rara_tools-0.4.0}/tests/test_task_reporter.py +0 -0
  59. {rara_tools-0.2.0 → rara_tools-0.4.0}/tests/test_viaf_client.py +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: rara-tools
- Version: 0.2.0
+ Version: 0.4.0
  Summary: Tools to support Kata's work.
  Classifier: Programming Language :: Python :: 3
  Classifier: Programming Language :: Python :: 3.10
@@ -13,10 +13,13 @@ License-File: LICENSE.md
  Requires-Dist: elasticsearch==8.*
  Requires-Dist: elasticsearch_dsl==8.*
  Requires-Dist: minio==7.*
- Requires-Dist: rara-norm-linker==1.*
+ Requires-Dist: estnltk==1.7.3
+ Requires-Dist: nltk
+ Requires-Dist: jsonlines
  Requires-Dist: requests
  Requires-Dist: iso639-lang
  Requires-Dist: pymarc
+ Requires-Dist: regex
  Requires-Dist: glom
  Provides-Extra: testing
  Requires-Dist: pytest>=8.0; extra == "testing"
@@ -0,0 +1 @@
+ 0.4.0
@@ -9,7 +9,12 @@ dependencies = { file = ["requirements.txt"] }
  [tool.setuptools.packages.find]
  include = [
  "rara_tools",
- "rara_tools.constants"
+ "rara_tools.constants",
+ "rara_tools.normalizers",
+ "rara_tools.parsers",
+ "rara_tools.parsers.marc_parsers",
+ "rara_tools.parsers.marc_records",
+ "rara_tools.parsers.tools"
  ]

  [project]
@@ -0,0 +1,6 @@
+ from pymarc import Indicators
+ import os
+
+ EMPTY_INDICATORS = Indicators(" ", " ")
+ VIAF_ALLOWED_SOURCES = ["LC", "DNB", "LNB", "NLL",
+                         "ERRR", "J9U"]
@@ -0,0 +1,152 @@
+ import logging
+
+ LOGGER = logging.getLogger("rara-tools-marc-parser")
+
+
+ class EMSMarcIDs:
+     SYNONYMS = ["448", "450", "451", "455"]
+     RELATED = ["548", "550", "551", "555"]
+     CATEGORY = ["072"]
+     NOTES = ["680"]
+     URL = ["024"]
+     TIME_KEYWORD = "148"
+     TOPIC_KEYWORD = "150"
+     LOC_KEYWORD = "151"
+     GENRE_KEYWORD = "155"
+
+
+ class GeneralMarcIDs:
+     ID = ["001"]
+     ID_SOURCE = ["003"]
+
+
+ class OrganizationMarcIDs:
+     NAME = ["110", "111"]
+     NAME_VARIATIONS = ["410", "411"]
+     RELATED_NAMES = ["510", "511"]
+     SOURCE = ["667"]
+     DESCRIPTION = ["680"]
+     AREA_CODE = ["043"]
+
+
+ class PersonMarcIDs:
+     NAME = ["100"]
+     NAME_VARIATIONS = ["400"]
+     SOURCE = ["670"]
+     DESCRIPTION = ["680"]
+
+ class KeywordType:
+     LOC = "Kohamärksõnad"
+     TIME = "Ajamärksõnad"
+     TOPIC = "Teemamärksõnad"
+     GENRE = "Vormimärksõnad"
+
+ class EntityType:
+     PER = "PER"
+     ORG = "ORG"
+     KEYWORD = "EMS_KEYWORD"
+     LOC = "LOC"
+     UNK = "UNKNOWN"
+
+
+ EN_SUBJECT_FIELDS = {
+     "00": "GENERAL CONCEPTS",
+     "01": "PHILOSOPHY. ETHICS. SEMIOTICS",
+     "02": "RELIGION. THEOLOGY. ESOTERICISM",
+     "03": "SCIENCE AND TECHNOLOGY",
+     "04": "EDUCATION. PEDAGOGY",
+     "06": "PSYCHOLOGY",
+     "07": "SOCIOLOGY. SOCIAL PSYCHOLOGY",
+     "08": "EMPLOYMENT. WORKING CONDITIONS. OCCUPATIONS",
+     "10": "POLITOLOGY. POLITICS",
+     "11": "GOVERNMENT. ADMINISTRATION. CIVIL DEFENCE. SECURITY SERVICE",
+     "12": "SOCIAL POLICY",
+     "13": "LAW. LEGISLATION",
+     "15": "MILITARY AFFAIRS. ARMAMENT. MILITARY EQUIPMENT",
+     "17": "DEMOGRAPHY. STATISTICS",
+     "18": "JOURNALISM. COMMUNICATION. MEDIA. ADVERTISING",
+     "19": "NATURE PROTECTION. ENVIRONMENT PROTECTION. ENVIRONMENT TECHY",
+     "20": "GEOLOGY. HYDROLOGY. CLIMATOLOGY",
+     "21": "MATHEMATICS",
+     "22": "ASTRONOMY. ASTROPHYSICS. SPACE EXPLORATION",
+     "23": "PHYSICS",
+     "24": "CHEMISTRY. CHEMICAL INDUSTRY",
+     "25": "BIOLOGY. MICROBIOLOGY. GENETICS. ANTHROPOLOGY",
+     "26": "BOTANY",
+     "27": "ZOOLOGY",
+     "28": "MEDICINE. BIOCHEMISTRY. ANATOMY. PHYSIOLOGY. COSMETICS",
+     "29": "ECONOMY. ECONOMICS. FINANCE. TRADE",
+     "30": "AGRICULTURE. HORTICULTURE. ANIMAL BREEDING. VETERINARY",
+     "31": "FORESTRY. HUNTING. FISHERY. FOREST AND WOODWORK INDUSTRY. PULP AND PAPER INDUSTRY",
+     "34": "CATERING. DOMESTIC ECONOMY",
+     "35": "ELECTRICAL TECHNOLOGY. ELECTRONICS. ENERGETICS",
+     "36": "INFORMATICS. INFORMATION TECHNOLOGY. AUTOMATICS",
+     "39": "MECHANICAL ENGINEERING. METAL INDUSTRY. METALLURGY. MINING",
+     "43": "LIGHT INDUSTRY. TEXTILE INDUSTRY. LEATHER INDUSTRY",
+     "44": "BUILDING. CONSTRUCTION. SANITARY ENGINEERING",
+     "45": "TRAFFIC. TRANSPORT. COMMUNICATION",
+     "47": "ART. ARCHITECTURE",
+     "48": "PHOTOGRAPHY. CINEMA",
+     "49": "MUSIC",
+     "50": "THEATRE. DANCE",
+     "51": "SPORTS. PHYSICAL CULTURE",
+     "52": "HOBBIES. LEISURE ACTIVITIES",
+     "53": "INFORMATION SCIENCE. LIBRARIANSHIP. MEMORY INSTITUTIONS. PUBLISHING. TYPOGRAPHY",
+     "54": "LINGUISTICS. LANGUAGES",
+     "55": "LITERARY SCIENCE. LITERATURE. FOLKLORE",
+     "56": "HISTORY. ARCHAEOLOGY",
+     "58": "ETHNOLOGY. CULTURAL ANTHROPOLOGY",
+     "59": "GEOGRAPHY. GEODESY. CARTOGRAPHY",
+     "60": "GEOGRAPHICAL NAMES"
+ }
+
+ ET_SUBJECT_FIELDS = {
+     "00": "ÜLDMÕISTED",
+     "01": "FILOSOOFIA. EETIKA. SEMIOOTIKA",
+     "02": "RELIGIOON. TEOLOOGIA. ESOTEERIKA",
+     "03": "TEADUS JA TEHNIKA. TEADUSKORRALDUS. TEADUSMETODOLOOGIA. KULTUUR",
+     "04": "HARIDUS. PEDAGOOGIKA",
+     "06": "PSÜHHOLOOGIA",
+     "07": "SOTSIOLOOGIA. SOTSIAALPSÜHHOLOOGIA",
+     "08": "TÖÖHÕIVE. TÖÖTINGIMUSED. AMETID",
+     "10": "RIIGIÕPETUS. POLIITIKA",
+     "11": "VALITSUS. HALDUS. KODANIKUKAITSE. TURVATEENISTUS",
+     "12": "SOTSIAALPOLIITIKA",
+     "13": "ÕIGUS",
+     "15": "SÕJANDUS. RELVAJÕUD. SÕJATEHNIKA",
+     "17": "DEMOGRAAFIA. STATISTIKA",
+     "18": "AJAKIRJANDUS. KOMMUNIKATSIOON. MEEDIA. REKLAAM",
+     "19": "LOODUSKAITSE. KESKKONNAKAITSE. KESKKONNATEHNIKA",
+     "20": "GEOLOOGIA. HÜDROLOOGIA. KLIMATOLOOGIA",
+     "21": "MATEMAATIKA",
+     "22": "ASTRONOOMIA. ASTROFÜÜSIKA. KOSMOSEUURIMINE",
+     "23": "FÜÜSIKA",
+     "24": "KEEMIA. KEEMIATÖÖSTUS",
+     "25": "BIOLOOGIA. MIKROBIOLOOGIA. GENEETIKA. ANTROPOLOOGIA",
+     "26": "BOTAANIKA",
+     "27": "ZOOLOOGIA",
+     "28": "MEDITSIIN. BIOKEEMIA. ANATOOMIA. FÜSIOLOOGIA. FARMAKOLOOGIA. KOSMEETIKA",
+     "29": "MAJANDUS. MAJANDUSTEADUS. RAHANDUS. KAUBANDUS",
+     "30": "PÕLLUMAJANDUS. AIANDUS. LOOMAKASVATUS. VETERINAARIA",
+     "31": "METSANDUS. JAHINDUS. KALANDUS. METSA- JA PUIDUTÖÖSTUS. TSELLULOOSI- JA PABERITÖÖSTUS",
+     "34": "KODUMAJANDUS. TOIDUAINETETÖÖSTUS. TOITLUSTUS. OLME",
+     "35": "ELEKTROTEHNIKA. ELEKTROONIKA. ENERGEETIKA",
+     "36": "INFORMAATIKA. INFOTEHNOLOOGIA. AUTOMAATIKA",
+     "39": "MASINAEHITUS. METALLITÖÖSTUS. METALLURGIA. MÄENDUS",
+     "43": "KERGETÖÖSTUS. TEKSTIILITÖÖSTUS. NAHA- JA JALATSITÖÖSTUS",
+     "44": "EHITUS. SANITAARTEHNIKA",
+     "45": "LIIKLUS. TRANSPORT. SIDE",
+     "47": "KUNST. ARHITEKTUUR",
+     "48": "FOTOGRAAFIA. FILM. KINO",
+     "49": "MUUSIKA",
+     "50": "TEATER. TANTS",
+     "51": "SPORT. KEHAKULTUUR",
+     "52": "HARRASTUSED. VABA AEG",
+     "53": "INFOTEADUS. RAAMATUKOGUNDUS. MÄLUASUTUSED. KIRJASTAMINE. TRÜKINDUS",
+     "54": "KEELETEADUS. KEELED",
+     "55": "KIRJANDUSTEADUS. ILUKIRJANDUS. RAHVALUULE",
+     "56": "AJALUGU. ARHEOLOOGIA",
+     "58": "ETNOLOOGIA. KULTUURIANTROPOLOOGIA",
+     "59": "GEOGRAAFIA. GEODEESIA. KARTOGRAAFIA",
+     "60": "KOHANIMED"
+ }
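The two dictionaries map the two-digit EMS category codes carried in MARC field 072 (see EMSMarcIDs.CATEGORY) to Estonian and English subject headings. A small hedged lookup sketch; the helper function below is illustrative and not part of the package:

from rara_tools.constants.parsers import EN_SUBJECT_FIELDS, ET_SUBJECT_FIELDS

def subject_labels(category_code: str) -> tuple:
    """Return the (Estonian, English) subject heading pair for an EMS category code."""
    return (
        ET_SUBJECT_FIELDS.get(category_code, ""),
        EN_SUBJECT_FIELDS.get(category_code, ""),
    )

print(subject_labels("21"))  # ('MATEMAATIKA', 'MATHEMATICS')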
@@ -0,0 +1,4 @@
+
+ from rara_tools.normalizers.base import RecordNormalizer
+ from rara_tools.normalizers.authorities import AuthoritiesRecordNormalizer
+ from rara_tools.normalizers.bibs import BibRecordNormalizer
@@ -0,0 +1,120 @@
+ from rara_tools.constants import EMPTY_INDICATORS
+ from rara_tools.normalizers.viaf import VIAFRecord
+
+ from rara_tools.normalizers import RecordNormalizer
+
+ from pymarc import Field, Subfield, Record
+ from typing import List
+
+
+ class AuthoritiesRecordNormalizer(RecordNormalizer):
+     """ Normalize authorities records """
+
+     def __init__(self, linking_results: List[dict] = [], sierra_data: List[dict] = [],
+                  ALLOW_EDIT_FIELDS: List[str] = [
+                      "667", "925", "043"],
+                  REPEATABLE_FIELDS: List[str] = ["024", "035", "400", "670"]):
+
+         super().__init__(linking_results, sierra_data)
+         self.ALLOW_EDIT_FIELDS = ALLOW_EDIT_FIELDS
+         self.REPEATABLE_FIELDS = REPEATABLE_FIELDS
+
+     def _normalize_sierra(self, record: Record, sierraID: str) -> None:
+
+         suffix_008 = "|n|adnnnaabn || |a| "
+
+         fields = [
+             Field(
+                 tag="008",
+                 indicators=EMPTY_INDICATORS,
+                 data=f"{self.current_timestamp()}{suffix_008}"
+             ),
+
+             Field(
+                 tag="040",
+                 indicators=EMPTY_INDICATORS,
+                 subfields=[
+                     # if record subfield exists already, use that value. if not, use hardcoded value
+                     Subfield("a", self.get_subfield(
+                         record, "040", "a", "ErESTER")),
+                     Subfield("b", self.get_subfield(
+                         record, "040", "b", "est")),
+                     Subfield("c", self.get_subfield(
+                         record, "040", "c", "ErEster")),
+                 ]
+             ),
+         ]
+
+         self._add_fields_to_record(record, fields)
+
+         return record
+
+     def _add_birth_and_death_dates(self, record: Record, viaf_record: VIAFRecord) -> None:
+         subfields_046 = [
+             Subfield("f", self.get_subfield(
+                 record, "046", "f", viaf_record.birth_date)),
+             Subfield("g", self.get_subfield(
+                 record, "046", "g", viaf_record.death_date)),
+             Subfield("s", self.get_subfield(
+                 record, "046", "s", viaf_record.activity_start)),
+             Subfield("t", self.get_subfield(
+                 record, "046", "t", viaf_record.activity_end)),
+         ]
+
+         self._add_fields_to_record(
+             record, [Field(tag="046", indicators=EMPTY_INDICATORS, subfields=subfields_046)])
+
+     def _add_viaf_url_and_isni(self, record: Record, viaf_record: VIAFRecord) -> None:
+         # TODO 024. will be used to store KRATT KATA ID. Just generate one?
+         viaf_url = f"https://viaf.org/viaf/{viaf_record.viaf_id}"
+
+         subfields = [Subfield("0", self.get_subfield(
+             record, "024", "0", viaf_url))]
+
+         if viaf_record.has_isni:
+             subfields.append(Subfield("2", "isni"))
+
+         field = Field(tag="024", indicators=EMPTY_INDICATORS,
+                       subfields=subfields)
+
+         self._add_fields_to_record(record, [field])
+
+     def _add_nationality(self, record: Record, viaf_record: VIAFRecord) -> None:
+
+         fields = [
+             Field(
+                 tag="043",
+                 indicators=EMPTY_INDICATORS,
+                 subfields=[
+                     Subfield("c", "ee")
+                 ] if self._is_person_est_nationality(viaf_record) else []
+             )]
+
+         self._add_fields_to_record(record, fields)
+
+     def _normalize_viaf(self, record: Record, viaf_record: VIAFRecord) -> None:
+         """"
+         Attempts to enrich the record with VIAF data.
+
+         024 - repeatable field, add VIAF URL to subfield 0. If ISNI found, add to subfield 2
+         043 - repeatable field. Add "ee" if found to be estonian nationality
+         046 - non-repeatable field, add birth and death dates
+         100, 110, 111 - non-repeatable field, attempts to add author type, if missing.
+
+         """
+         # TODO: include KRATT KATA ID to 024 and remove on delete. Increment last elastic ID?
+         if not viaf_record:
+             return
+
+         self._add_nationality(record, viaf_record)
+         self._add_viaf_url_and_isni(record, viaf_record)
+         self._add_birth_and_death_dates(record, viaf_record)
+         self._add_author(record, viaf_record)
+
+     def _normalize_record(self, record: Record, sierraID: str,
+                           viaf_record: VIAFRecord, is_editing_existing_record: bool) -> Record:
+
+         self._normalize_sierra(record, sierraID)
+         self._normalize_viaf(record, viaf_record)
+
+         return record
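A hedged usage sketch of the authorities normalizer, based on the constructor above and the RecordNormalizer base class below. The shape of linking_results comes from the entity linker and is only approximated here (the "json" value must be a full MARC-in-JSON authority record), and iterating triggers live VIAF lookups:

from rara_tools.normalizers import AuthoritiesRecordNormalizer

# Illustrative input only: each item pairs the original entity string with
# zero or more linked authority records from the KATA Elasticsearch index.
linking_results = [
    {
        "original_entity": "Tammsaare, A. H.",
        "linked_info": [
            {
                "json": {},    # full MARC-in-JSON authority record goes here
                "viaf": {},    # optional cached VIAF search response
            }
        ],
    }
]

normalizer = AuthoritiesRecordNormalizer(
    linking_results=linking_results,
    sierra_data=[],
)

for record in normalizer:   # yields normalized pymarc Record objects
    print(record)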
@@ -0,0 +1,290 @@
+ from datetime import datetime
+ from pymarc import (Field, Subfield, JSONReader, Record)
+ from typing import List, Optional, Iterator
+
+ from rara_tools.constants import EMPTY_INDICATORS
+ from rara_tools.normalizers.viaf import VIAFRecord, VIAFClient
+
+ from glom import glom
+ import logging
+ import json
+
+ logger = logging.getLogger(__name__)
+
+
+ class RecordNormalizer:
+     """
+     Base class. For normalizing different record types corresponding classes have been created.
+     By default existing record fields will not be changed, unless included in ALLOW_EDIT_FIELDS. If a field
+     included in the normalization is not present, it will be added to the record. If under REPEATABLE_FIELDS.
+     a new record field is added.
+
+     Args:
+         sierra_data: Optionally, can normalize records from SIERRA. Must be in specific format,
+             e.g converted with SierraResponseConverter. examples at: tests/sierra/output
+         entities: List of Full names (str). If included, will use NormLinker to match with normalized records on KATA elastic.
+     """
+
+     def __init__(self, linking_results: List[dict] = [], sierra_data: List[dict] = [],
+                  ALLOW_EDIT_FIELDS: List[str] = ["667", "925"], REPEATABLE_FIELDS: List[str] = []):
+
+         # Include, if will replace existing field
+         self.ALLOW_EDIT_FIELDS = ALLOW_EDIT_FIELDS
+         # include, if should be added alongside existing fields
+         self.REPEATABLE_FIELDS = REPEATABLE_FIELDS
+
+         self.records_extra_data = []
+         self.records = self._setup_records(linking_results, sierra_data)
+         self.sierra_data = sierra_data
+
+     def _setup_records(self, linking_results: List[dict], sierra_data: List[dict]) -> JSONReader:
+         """Setup initial MARC records and data.
+
+         For linked entities:
+         1. Try to get single linked normalized record from KATA elastic. If more than one found, skip.
+         2. If 0 matches, search from VIAF and if 1 result found, create a new authority record from the data.
+         3. If none or more than one responses found, use only Classificator data (coming from Linker?).
+
+         for SIERRA records: normalize.
+         """
+         linked_records = []
+
+         for linked in linking_results:
+             entity = linked.get("original_entity")
+             try:
+                 linked_info = linked.get("linked_info", [])
+                 linked_num = len(linked_info)
+
+                 if not linked_info:
+                     # new record will be created
+                     logger.info(
+                         f"No linked entities found for {entity}")
+                     continue
+
+                 if linked_num == 1:
+                     linked = linked_info[0]
+                     linked_records.append(linked.get("json", {}))
+                     self.records_extra_data.append({
+                         "entity": entity,
+                         "viaf": linked.get("viaf", {}),
+                         "type": "linked",
+                         "edited": True
+                     })
+                 else:
+                     # new record will be created
+                     logger.info(
+                         f"Multiple linked entities found for {entity}")
+
+             except Exception as e:
+                 logger.error(f"Error processing entity {entity}: {e}")
+
+         self.records_extra_data.extend(
+             {
+                 "sierraID": obj.get("sierraID"),
+                 "type": "sierra",
+                 "edited": True
+             }
+             for obj in (sierra_data or [])
+         )
+
+         all_records = linked_records + (sierra_data or [])
+
+         return JSONReader(json.dumps(all_records,
+                                      ensure_ascii=False), stream=False)
+
+     @staticmethod
+     def current_timestamp():
+         """6 digit timestamp."""
+         return datetime.now().strftime("%H%M%S")
+
+     @staticmethod
+     def current_yyyy_dd():
+         """format of 2025-03"""
+         return datetime.now().strftime("%Y-%m")
+
+     @staticmethod
+     def _is_person_est_nationality(viaf_record: VIAFRecord) -> bool:
+         return viaf_record.nationality == "ee"
+
+     def _is_nxx(self, field: Field, n: str):
+         """ Check if fields tag is in nxx range. """
+         return field.tag.startswith(n)
+
+     def get_record_field_or_none(self, record: Record, tag: str) -> Optional[Field]:
+         return record.get_fields(tag)[0] if record.get_fields(tag) else None
+
+     def _field_in_record(self, field: Field, record: Record) -> bool:
+         """ Check if field exists in record. """
+         existing_fields = record.get_fields(field.tag)
+         return any(
+             field.data == existing_field.data for existing_field in existing_fields)
+
+     def _filter_equivalent_field_not_in_record(self, record: Record, fields: List[Field]) -> bool:
+         """ filter out fields, that do not have an equivalent in the record. """
+         return filter(lambda field: not self._field_in_record(field, record), fields)
+
+     def get_subfield(self, record: Record, tag: str, subfield: str, default: str) -> str:
+         """ get record existing subfield value or assign a fallback value. """
+
+         field = self.get_record_field_or_none(record, tag)
+
+         if field is None:
+             return default
+
+         subfields = field.get_subfields(subfield)
+         return subfields[0] if subfields else default
+
+     def _handle_default_fields(self, record: Record, *fields: List[Field]) -> Record:
+         """ add field to record iff not present already """
+         record.add_field(
+             *filter(lambda field: field.tag not in [
+                 f.tag for f in record.get_fields()], fields)
+         )
+
+     def _handle_editable_fields(self, record: Record, *fields: List[Field]) -> Record:
+         """ replace existing field with a new field. """
+
+         editable_fields = filter(
+             lambda field: field.tag in self.ALLOW_EDIT_FIELDS, fields)
+
+         tags = [f.tag for f in editable_fields]
+
+         record.remove_fields(
+             *tags
+         )
+         record.add_field(
+             *editable_fields
+         )
+
+     def _handle_repeatable_fields(self, record: Record, *fields: List[Field]) -> Record:
+         """ add field to the record & don't replace existing field."""
+
+         repeatable_fields = [
+             field for field in fields if field.tag in self.REPEATABLE_FIELDS]
+
+         record.add_field(
+             *repeatable_fields
+             # *self._filter_equivalent_field_not_in_record(
+             #     record, repeatable_fields)
+         )
+
+     def _add_fields_to_record(self, record: Record, fields: List[Field]) -> Record:
+
+         self._handle_repeatable_fields(record, *fields)
+         self._handle_editable_fields(record, *fields)
+         self._handle_default_fields(record, *fields)
+
+     def _add_author(self, record: Record, viaf_record: VIAFRecord) -> Optional[Field]:
+
+         existing_author: Optional[Field] = record.get(
+             "100") or record.get("110") or record.get("111")
+         if existing_author:
+             return record
+
+         type_map = {
+             "Personal": "100",
+             "Corporate": "110",
+             "Collective": "111"
+         }
+
+         author_type = viaf_record.author_type
+         tag = type_map.get(author_type, "100")
+
+         fields = [
+             Field(
+                 tag=tag,
+                 indicators=EMPTY_INDICATORS,
+                 subfields=[
+                     Subfield("a", viaf_record.author),
+                     Subfield("b", viaf_record.author_type),
+                     Subfield("c", viaf_record.author_type)
+                 ]
+             )
+         ]
+
+         self._add_fields_to_record(record, fields)
+
+     def _normalize_common(self, record: Record, is_editing_existing_record: bool) -> None:
+         """Common logic for all normalizations. """
+
+         note = "Muudetud AI poolt" if is_editing_existing_record else "Loodud AI poolt"
+         date_note = f"KRATT {self.current_yyyy_dd()}"
+
+         fields = [
+             Field(tag="667",
+                   indicators=EMPTY_INDICATORS,
+                   subfields=[Subfield("a", note)]),
+             Field(tag="925",
+                   indicators=EMPTY_INDICATORS,
+                   subfields=[Subfield("t", self.get_subfield(record, "925", "t", date_note))
+                              ] + ([Subfield("p", self.get_subfield(record, "925", "p", date_note))]
+                                   if is_editing_existing_record else []))
+         ]
+
+         self._add_fields_to_record(record, fields)
+
+         return record
+
+     def _get_viaf_search_term(self, record: Record, entity: Optional[str]) -> Optional[str]:
+         """ prioritize entity name, if not available, use author name. """
+
+         if entity:
+             return entity
+         else:
+             return record.author
+
+     def _get_viaf_record(self, record: Record, viaf_id: Optional[int] = None, entity: Optional[str] = None) -> Optional[VIAFRecord]:
+         try:
+             viaf_client = VIAFClient()
+
+             if viaf_id:
+                 viaf_info = viaf_client.get_records_by_viaf_id(viaf_id).json()
+                 return VIAFRecord(viaf_info)
+
+             search_term = self._get_viaf_search_term(record, entity)
+
+             results = viaf_client.get_records_by_search_term(
+                 search_term).json()
+
+             num_records = glom(
+                 results, "queryResult.numberOfRecords.value", default=0)
+
+             if num_records == 1:
+                 return VIAFRecord(results)
+
+             logger.warning(
+                 f"Multiple VIAF records found for {search_term}: {num_records}. Skipping.")
+
+         except Exception as e:
+             logger.error(f"Error fetching VIAF record: {e}")
+
+     def _normalize_record(self, record: Record, sierraID: str,
+                           viaf_record: VIAFRecord, is_editing_existing_record: bool) -> Record:
+         return record
+
+     @property
+     def data(self) -> List[dict]:
+         """ Shorthand to get all normalized records as dict. """
+         return [record.as_dict() for record in self]
+
+     def __iter__(self) -> Iterator:
+         viaf_id_path = "viaf.queryResult.records.record.0.recordData.VIAFCluster.viafID"
+         sierra_id_path = "sierraID"
+
+         for record, extra_data in zip(self.records, self.records_extra_data):
+
+             sierra_id = glom(extra_data, sierra_id_path, default="")
+             viaf_id = glom(extra_data, viaf_id_path, default=None)
+
+             entity = extra_data.get("entity")
+             is_editing_existing_record = extra_data.get("edited") == True
+
+             viaf_record = self._get_viaf_record(record, viaf_id, entity)
+             record = self._normalize_common(record, is_editing_existing_record)
+
+             normalized_record = self._normalize_record(
+                 record, sierra_id, viaf_record, is_editing_existing_record)
+
+             normalized_record.fields.sort(key=lambda field: field.tag)
+
+             yield normalized_record
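RecordNormalizer is meant to be subclassed: __iter__ drives the VIAF lookup, the shared 667/925 notes, and field sorting, while _normalize_record is the per-record-type hook (as AuthoritiesRecordNormalizer does above). A minimal hedged sketch of a custom subclass; the 370 field and its value are purely illustrative and not part of the package:

from typing import Optional

from pymarc import Field, Record, Subfield

from rara_tools.constants import EMPTY_INDICATORS
from rara_tools.normalizers import RecordNormalizer
from rara_tools.normalizers.viaf import VIAFRecord


class PlaceNoteNormalizer(RecordNormalizer):
    """Illustrative subclass: tags Estonian entities with an associated-place field."""

    def _normalize_record(self, record: Record, sierraID: str,
                          viaf_record: Optional[VIAFRecord],
                          is_editing_existing_record: bool) -> Record:
        # Only enrich when a VIAF record was found and it reports Estonian nationality.
        if viaf_record and self._is_person_est_nationality(viaf_record):
            self._add_fields_to_record(record, [
                Field(tag="370", indicators=EMPTY_INDICATORS,
                      subfields=[Subfield("c", "Eesti")]),
            ])
        return record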