rara-tools 0.4.4__tar.gz → 0.5.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of rara-tools might be problematic. Click here for more details.

Files changed (63) hide show
  1. {rara_tools-0.4.4/rara_tools.egg-info → rara_tools-0.5.1}/PKG-INFO +1 -1
  2. rara_tools-0.5.1/VERSION +1 -0
  3. rara_tools-0.5.1/rara_tools/constants/language_evaluator.py +1 -0
  4. rara_tools-0.5.1/rara_tools/constants/meta_extractor.py +1 -0
  5. {rara_tools-0.4.4 → rara_tools-0.5.1}/rara_tools/constants/parsers.py +6 -0
  6. rara_tools-0.5.1/rara_tools/constants/subject_indexer.py +1 -0
  7. rara_tools-0.5.1/rara_tools/parsers/marc_parsers/title_parser.py +44 -0
  8. {rara_tools-0.4.4 → rara_tools-0.5.1}/rara_tools/parsers/marc_records/base_record.py +1 -1
  9. rara_tools-0.5.1/rara_tools/parsers/marc_records/title_record.py +196 -0
  10. {rara_tools-0.4.4 → rara_tools-0.5.1/rara_tools.egg-info}/PKG-INFO +1 -1
  11. {rara_tools-0.4.4 → rara_tools-0.5.1}/rara_tools.egg-info/SOURCES.txt +3 -0
  12. {rara_tools-0.4.4 → rara_tools-0.5.1}/tests/test_marc_parsers.py +9 -0
  13. rara_tools-0.4.4/VERSION +0 -1
  14. rara_tools-0.4.4/rara_tools/parsers/marc_parsers/title_parser.py +0 -1
  15. rara_tools-0.4.4/rara_tools/parsers/marc_records/title_record.py +0 -1
  16. {rara_tools-0.4.4 → rara_tools-0.5.1}/LICENSE.md +0 -0
  17. {rara_tools-0.4.4 → rara_tools-0.5.1}/README.md +0 -0
  18. {rara_tools-0.4.4 → rara_tools-0.5.1}/pyproject.toml +0 -0
  19. {rara_tools-0.4.4 → rara_tools-0.5.1}/rara_tools/constants/__init__.py +0 -0
  20. {rara_tools-0.4.4 → rara_tools-0.5.1}/rara_tools/constants/digitizer.py +0 -0
  21. {rara_tools-0.4.4 → rara_tools-0.5.1}/rara_tools/constants/general.py +0 -0
  22. {rara_tools-0.4.4 → rara_tools-0.5.1}/rara_tools/constants/linker.py +0 -0
  23. {rara_tools-0.4.4 → rara_tools-0.5.1}/rara_tools/constants/normalizers.py +0 -0
  24. {rara_tools-0.4.4 → rara_tools-0.5.1}/rara_tools/converters.py +0 -0
  25. {rara_tools-0.4.4 → rara_tools-0.5.1}/rara_tools/decorators.py +0 -0
  26. {rara_tools-0.4.4 → rara_tools-0.5.1}/rara_tools/digar_schema_converter.py +0 -0
  27. {rara_tools-0.4.4 → rara_tools-0.5.1}/rara_tools/elastic.py +0 -0
  28. {rara_tools-0.4.4 → rara_tools-0.5.1}/rara_tools/exceptions.py +0 -0
  29. {rara_tools-0.4.4 → rara_tools-0.5.1}/rara_tools/normalizers/__init__.py +0 -0
  30. {rara_tools-0.4.4 → rara_tools-0.5.1}/rara_tools/normalizers/authorities.py +0 -0
  31. {rara_tools-0.4.4 → rara_tools-0.5.1}/rara_tools/normalizers/base.py +0 -0
  32. {rara_tools-0.4.4 → rara_tools-0.5.1}/rara_tools/normalizers/bibs.py +0 -0
  33. {rara_tools-0.4.4 → rara_tools-0.5.1}/rara_tools/normalizers/viaf.py +0 -0
  34. {rara_tools-0.4.4 → rara_tools-0.5.1}/rara_tools/parsers/marc_parsers/base_parser.py +0 -0
  35. {rara_tools-0.4.4 → rara_tools-0.5.1}/rara_tools/parsers/marc_parsers/ems_parser.py +0 -0
  36. {rara_tools-0.4.4 → rara_tools-0.5.1}/rara_tools/parsers/marc_parsers/location_parser.py +0 -0
  37. {rara_tools-0.4.4 → rara_tools-0.5.1}/rara_tools/parsers/marc_parsers/organization_parser.py +0 -0
  38. {rara_tools-0.4.4 → rara_tools-0.5.1}/rara_tools/parsers/marc_parsers/person_parser.py +0 -0
  39. {rara_tools-0.4.4 → rara_tools-0.5.1}/rara_tools/parsers/marc_records/ems_record.py +0 -0
  40. {rara_tools-0.4.4 → rara_tools-0.5.1}/rara_tools/parsers/marc_records/organization_record.py +0 -0
  41. {rara_tools-0.4.4 → rara_tools-0.5.1}/rara_tools/parsers/marc_records/person_record.py +0 -0
  42. {rara_tools-0.4.4 → rara_tools-0.5.1}/rara_tools/parsers/tools/entity_normalizers.py +0 -0
  43. {rara_tools-0.4.4 → rara_tools-0.5.1}/rara_tools/parsers/tools/marc_converter.py +0 -0
  44. {rara_tools-0.4.4 → rara_tools-0.5.1}/rara_tools/parsers/tools/russian_transliterator.py +0 -0
  45. {rara_tools-0.4.4 → rara_tools-0.5.1}/rara_tools/s3.py +0 -0
  46. {rara_tools-0.4.4 → rara_tools-0.5.1}/rara_tools/task_reporter.py +0 -0
  47. {rara_tools-0.4.4 → rara_tools-0.5.1}/rara_tools/utils.py +0 -0
  48. {rara_tools-0.4.4 → rara_tools-0.5.1}/rara_tools.egg-info/dependency_links.txt +0 -0
  49. {rara_tools-0.4.4 → rara_tools-0.5.1}/rara_tools.egg-info/requires.txt +0 -0
  50. {rara_tools-0.4.4 → rara_tools-0.5.1}/rara_tools.egg-info/top_level.txt +0 -0
  51. {rara_tools-0.4.4 → rara_tools-0.5.1}/requirements.txt +0 -0
  52. {rara_tools-0.4.4 → rara_tools-0.5.1}/setup.cfg +0 -0
  53. {rara_tools-0.4.4 → rara_tools-0.5.1}/tests/test_digar_schema_converter.py +0 -0
  54. {rara_tools-0.4.4 → rara_tools-0.5.1}/tests/test_elastic.py +0 -0
  55. {rara_tools-0.4.4 → rara_tools-0.5.1}/tests/test_elastic_vector_and_search_operations.py +0 -0
  56. {rara_tools-0.4.4 → rara_tools-0.5.1}/tests/test_entity_normalizers.py +0 -0
  57. {rara_tools-0.4.4 → rara_tools-0.5.1}/tests/test_normalization.py +0 -0
  58. {rara_tools-0.4.4 → rara_tools-0.5.1}/tests/test_s3_exceptions.py +0 -0
  59. {rara_tools-0.4.4 → rara_tools-0.5.1}/tests/test_s3_file_operations.py +0 -0
  60. {rara_tools-0.4.4 → rara_tools-0.5.1}/tests/test_sierra_converters.py +0 -0
  61. {rara_tools-0.4.4 → rara_tools-0.5.1}/tests/test_task_reporter.py +0 -0
  62. {rara_tools-0.4.4 → rara_tools-0.5.1}/tests/test_utils.py +0 -0
  63. {rara_tools-0.4.4 → rara_tools-0.5.1}/tests/test_viaf_client.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: rara-tools
3
- Version: 0.4.4
3
+ Version: 0.5.1
4
4
  Summary: Tools to support Kata's work.
5
5
  Classifier: Programming Language :: Python :: 3
6
6
  Classifier: Programming Language :: Python :: 3.10
@@ -0,0 +1 @@
1
+ 0.5.1
@@ -0,0 +1 @@
1
+ COMPONENT_KEY = "language_evaluator"
@@ -0,0 +1 @@
1
+ COMPONENT_KEY = "meta_extractor"
@@ -35,6 +35,12 @@ class PersonMarcIDs:
35
35
  SOURCE = ["670"]
36
36
  DESCRIPTION = ["680"]
37
37
 
38
class TitleMarcIDs:
    """ MARC field IDs used for extracting title data.

    Each constant is a list of MARC tags, following the same layout
    as the other *MarcIDs constant classes in this module.
    """
    # Field holding the title itself (and the author's name parts).
    NAME = ["100"]
    # Field holding alternative forms of the title.
    NAME_VARIATIONS = ["400"]
    # Field holding the year information.
    YEAR = ["046"]
    # Field holding the type of the entity.
    TYPE = ["075"]
43
+
38
44
  class KeywordType:
39
45
  LOC = "Kohamärksõnad"
40
46
  TIME = "Ajamärksõnad"
@@ -0,0 +1 @@
1
+ COMPONENT_KEY = "subject_indexer"
@@ -0,0 +1,44 @@
1
+ from typing import NoReturn
2
+ from collections.abc import Iterator
3
+ from rara_tools.parsers.marc_parsers.base_parser import BaseMARCParser
4
+ from rara_tools.parsers.marc_records.title_record import TitleRecord
5
+ from rara_tools.constants.parsers import LOGGER
6
+
7
+
8
class TitlesMARCParser(BaseMARCParser):
    """ MARC parser for titles' .mrc files.
    """
    def __init__(self,
            marc_file_path: str,
            add_variations: bool = True
    ) -> None:
        """ Initializes TitlesMARCParser object.

        Parameters
        -----------
        marc_file_path: str
            Full path to .mrc file containing titles' data.
        add_variations: bool
            If enabled, constructs an additional variations field, which
            combines the content of multiple fields + adds some generated
            variations. If the output is uploaded into Elastic and used
            via rara-norm-linker, it is necessary to enable this.
        """
        super().__init__(
            marc_file_path=marc_file_path,
            add_variations=add_variations
        )

    def record_generator(self) -> Iterator[dict]:
        """ Generates simplified title records.

        Every MARC record produced by `marc_record_generator()` is wrapped
        into a TitleRecord and its `full_record` dict is yielded, i.e.
        the consumer receives plain dicts, not TitleRecord objects.

        Yields
        -------
        dict
            `TitleRecord.full_record` of the next MARC record.
        """
        LOGGER.info(
            f"Generating title records from " \
            f"MARC dump '{self.marc_file_path}'."
        )
        for record in self.marc_record_generator():
            title_record = TitleRecord(
                record=record,
                add_variations=self.add_variations
            )
            yield title_record.full_record
@@ -61,7 +61,7 @@ class BaseRecord:
61
61
  if isinstance(subfield_id, str):
62
62
  if _subfield_id == subfield_id:
63
63
  value = subfield[_subfield_id]
64
- values.append(value)
64
+ values.append(value.strip())
65
65
  elif isinstance(subfield_id, list):
66
66
  if _subfield_id in subfield_id:
67
67
  value = subfield[_subfield_id]
@@ -0,0 +1,196 @@
1
+ from typing import List, NoReturn
2
+ from pymarc.record import Record
3
+ from rara_tools.parsers.marc_records.base_record import BaseRecord
4
+ from rara_tools.constants.parsers import TitleMarcIDs, LOGGER
5
+ import regex as re
6
+ import json
7
+ import logging
8
+
9
+
10
class TitleRecord(BaseRecord):
    """ Generates a simplified title JSON record
    from a pymarc MARC record.

    All values are extracted lazily on first property access and
    cached on the instance afterwards.
    """
    def __init__(self, record: Record, add_variations: bool = False) -> None:
        """ Initializes TitleRecord object.

        Parameters
        -----------
        record: Record
            pymarc.record.Record object.
        add_variations: bool
            If enabled, constructs an additional variations field, which
            combines the content of multiple fields + adds some generated
            variations. If the output is uploaded into Elastic and used
            via rara-norm-linker, it is necessary to enable this.
        """
        super().__init__(record=record, add_variations=add_variations)

        self.__name_field_id: List[str] = TitleMarcIDs.NAME
        self.__name_variations_field_id: List[str] = TitleMarcIDs.NAME_VARIATIONS
        self.__year_field_id: List[str] = TitleMarcIDs.YEAR
        self.__type_field_id: List[str] = TitleMarcIDs.TYPE

        # Value returned by _parse_year() when no year can be parsed.
        self.__default_year: int | None = None

        self.__name: str = ""
        self.__author_original_name: dict = {}
        self.__author_name: str = ""
        # -1 means "not extracted yet"; None means "extracted, but missing".
        self.__year: str | int | None = -1
        self.__type: str = ""

        self.__author_life_years: str = ""
        # -1 means "no birth / death year could be extracted".
        self.__author_birth_year: int | None = -1
        self.__author_death_year: int | None = -1
        # Ensures the life years are parsed (and failures logged) only once.
        self.__life_years_parsed: bool = False
        self.__name_variations: List[str] = []
        self.__full_record: dict = {}

    def _parse_year(self, year: str) -> int | None:
        """ Parses a year from the beginning of a string.

        Parameters
        -----------
        year: str
            Raw year string; surrounding whitespace is ignored.

        Returns
        -----------
        int | None
            The first four characters as an integer if the string has
            at least four characters and those four are numeric; the
            whole string as an integer if it consists of exactly three
            numeric characters; otherwise the default year (None).
        """
        year = year.strip()
        if len(year) >= 4:
            if year[:4].isnumeric():
                return int(year[:4])
        elif len(year) == 3 and year.isnumeric():
            return int(year)
        return self.__default_year

    def _parse_life_years(self) -> None:
        """ Extracts author's birth and death year from the life years.

        Runs at most once per record. Missing life years are a normal
        condition and are skipped silently; malformed values are logged
        and leave the not-yet-assigned year(s) at -1.
        """
        if self.__life_years_parsed:
            return
        self.__life_years_parsed = True

        life_years = self.author_life_years
        if not life_years:
            # Nothing to parse - keep the -1 sentinels without logging.
            return
        try:
            # Exactly one "-" is expected, anything else fails unpacking.
            birth_year, death_year = life_years.split("-")
            self.__author_birth_year = self._parse_year(birth_year)
            self.__author_death_year = self._parse_year(death_year)
        except Exception as e:
            # Deliberately best-effort: a malformed value must not
            # break the generation of the whole record.
            LOGGER.error(
                f"Failed extracting birth and/or death year " \
                f"from '{life_years}' with the following " \
                f"exception: '{e}'."
            )

    @property
    def name(self) -> str:
        """ Cleaned first "t" subfield value of the name field
        or an empty string, if there is none.
        """
        if not self.__name:
            values = self.get_values(
                marc_ids=self.__name_field_id,
                subfield_id="t"
            )
            if values:
                self.__name = self._clean_value(values[0])
        return self.__name

    @property
    def year(self) -> str | None:
        """ Cleaned first "k" subfield value of the year field
        or None, if there is none. The value is not converted
        into an integer here.
        """
        if self.__year == -1:
            values = self.get_values(
                marc_ids=self.__year_field_id,
                subfield_id="k"
            )
            self.__year = self._clean_value(values[0]) if values else None
        return self.__year

    @property
    def type(self) -> str:
        """ Cleaned first "a" subfield value of the type field
        or an empty string, if there is none.
        """
        if not self.__type:
            values = self.get_values(
                marc_ids=self.__type_field_id,
                subfield_id="a"
            )
            self.__type = self._clean_value(values[0]) if values else ""
        return self.__type

    @property
    def author_original_name(self) -> dict:
        """ Cleaned "a" and "b" subfield values of the name field as
        a dict with keys "a" and "b" or an empty dict, if the field
        has no such subfields.
        """
        if not self.__author_original_name:
            values = self.get_values(
                marc_ids=self.__name_field_id,
                subfield_id=["a", "b"]
            )
            if values:
                # NOTE(review): assumes get_values() returns dict-like
                # items when subfield_id is a list - confirm in BaseRecord.
                self.__author_original_name = {
                    "a": self._clean_value(values[0].get("a", "")),
                    "b": self._clean_value(values[0].get("b", ""))
                }
        return self.__author_original_name

    @property
    def author_name(self) -> str:
        """ Author's name merged from the "a" and "b" parts of
        `author_original_name`.
        """
        if not self.__author_name:
            self.__author_name = self._merge_and_clean(self.author_original_name, ["a", "b"])
        return self.__author_name

    @property
    def author_life_years(self) -> str:
        """ Cleaned first "d" subfield value of the name field
        or an empty string, if there is none.
        """
        if not self.__author_life_years:
            values = self.get_values(
                marc_ids=self.__name_field_id,
                subfield_id="d"
            )
            self.__author_life_years = self._clean_value(values[0]) if values else ""
        return self.__author_life_years

    @property
    def author_birth_year(self) -> int | None:
        """ Author's birth year parsed from `author_life_years`.

        Returns -1, if the life years are missing or malformed and
        None, if the birth part does not start with a parsable year.
        """
        self._parse_life_years()
        return self.__author_birth_year

    @property
    def author_death_year(self) -> int | None:
        """ Author's death year parsed from `author_life_years`.

        Returns -1, if the life years are missing or malformed and
        None, if the death part does not start with a parsable year.
        """
        self._parse_life_years()
        return self.__author_death_year

    @property
    def name_variations(self) -> List[str]:
        """ Lowercased, de-duplicated title variations: the main name
        followed by the cleaned "t" subfield values of the name
        variations field.
        """
        if not self.__name_variations:
            values = self.get_values(
                marc_ids=self.__name_variations_field_id,
                subfield_id="t"
            )
            variations = [self.name]
            if values:
                variations.extend(
                    self._clean_value(value) for value in values
                )
            # dict.fromkeys() de-duplicates like set(), but keeps the
            # order stable between runs (main name first).
            self.__name_variations = list(
                dict.fromkeys(v.lower() for v in variations)
            )
        return self.__name_variations

    @property
    def full_record(self) -> dict:
        """ The simplified record as a dict, including the original
        MARC record as a string and as a JSON string. The field
        "link_variations" is added only if `add_variations` is enabled.
        """
        if not self.__full_record:
            self.__full_record = {
                "name": self.name,
                "author_name": self.author_name,
                "year": self.year,
                "type": self.type,
                "author_life_years": self.author_life_years,
                "author_birth_year": self.author_birth_year,
                # The misspelled key is kept for consumers already
                # relying on it; the correctly spelled one is additive.
                "auhtor_death_year": self.author_death_year,
                "author_death_year": self.author_death_year,
                "name_variations": self.name_variations,
                "full_record_marc": str(self.marc_record),
                "full_record_json": json.dumps(self.marc_json_record)
            }
            if self.add_variations:
                self.__full_record.update(
                    {"link_variations": self.name_variations}
                )

        return self.__full_record
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: rara-tools
3
- Version: 0.4.4
3
+ Version: 0.5.1
4
4
  Summary: Tools to support Kata's work.
5
5
  Classifier: Programming Language :: Python :: 3
6
6
  Classifier: Programming Language :: Python :: 3.10
@@ -19,9 +19,12 @@ rara_tools.egg-info/top_level.txt
19
19
  rara_tools/constants/__init__.py
20
20
  rara_tools/constants/digitizer.py
21
21
  rara_tools/constants/general.py
22
+ rara_tools/constants/language_evaluator.py
22
23
  rara_tools/constants/linker.py
24
+ rara_tools/constants/meta_extractor.py
23
25
  rara_tools/constants/normalizers.py
24
26
  rara_tools/constants/parsers.py
27
+ rara_tools/constants/subject_indexer.py
25
28
  rara_tools/normalizers/__init__.py
26
29
  rara_tools/normalizers/authorities.py
27
30
  rara_tools/normalizers/base.py
@@ -4,12 +4,14 @@ from rara_tools.parsers.marc_parsers.ems_parser import EMSMARCParser
4
4
  from rara_tools.parsers.marc_parsers.person_parser import PersonsMARCParser
5
5
  from rara_tools.parsers.marc_parsers.organization_parser import OrganizationsMARCParser
6
6
  from rara_tools.parsers.marc_parsers.location_parser import LocationMARCParser
7
+ from rara_tools.parsers.marc_parsers.title_parser import TitlesMARCParser
7
8
 
8
9
 
9
10
  MARC_ROOT_DIR = os.path.join("tests", "test_data", "marc_records")
10
11
  EMS_TEST_FILE = os.path.join(MARC_ROOT_DIR, "ems_test_subset.mrc")
11
12
  PER_TEST_FILE = os.path.join(MARC_ROOT_DIR, "per_test_subset.mrc")
12
13
  ORG_TEST_FILE = os.path.join(MARC_ROOT_DIR, "org_test_subset.mrc")
14
+ TITLE_TEST_FILE = os.path.join(MARC_ROOT_DIR, "title_test_subset.mrc")
13
15
 
14
16
  def test_ems_parser_without_variations():
15
17
  ems_marc_parser = EMSMARCParser(EMS_TEST_FILE, add_variations=False)
@@ -47,3 +49,10 @@ def test_organizations_parser_without_variations():
47
49
  for record in org_marc_parser.record_generator():
48
50
  assert "name" in record
49
51
  assert "link_variations" not in record
52
+
53
def test_title_parser_with_variations():
    """ Every generated title record must contain a name and
    a non-empty list of link variations, when variations are enabled.
    """
    parser = TitlesMARCParser(TITLE_TEST_FILE, add_variations=True)
    for title in parser.record_generator():
        assert "name" in title
        assert "link_variations" in title
        link_variations = title["link_variations"]
        assert len(link_variations) > 0
rara_tools-0.4.4/VERSION DELETED
@@ -1 +0,0 @@
1
- 0.4.4
@@ -1 +0,0 @@
1
- # Coming soon
@@ -1 +0,0 @@
1
- # Coming soon
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes