rara-tools 0.7.7__tar.gz → 0.7.8__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of rara-tools might be problematic. Click here for more details.
- {rara_tools-0.7.7/rara_tools.egg-info → rara_tools-0.7.8}/PKG-INFO +1 -1
- rara_tools-0.7.8/VERSION +1 -0
- {rara_tools-0.7.7 → rara_tools-0.7.8}/rara_tools/digar_schema_converter.py +29 -4
- {rara_tools-0.7.7 → rara_tools-0.7.8/rara_tools.egg-info}/PKG-INFO +1 -1
- {rara_tools-0.7.7 → rara_tools-0.7.8}/tests/test_digar_schema_converter.py +56 -25
- rara_tools-0.7.7/VERSION +0 -1
- {rara_tools-0.7.7 → rara_tools-0.7.8}/LICENSE.md +0 -0
- {rara_tools-0.7.7 → rara_tools-0.7.8}/README.md +0 -0
- {rara_tools-0.7.7 → rara_tools-0.7.8}/pyproject.toml +0 -0
- {rara_tools-0.7.7 → rara_tools-0.7.8}/rara_tools/constants/__init__.py +0 -0
- {rara_tools-0.7.7 → rara_tools-0.7.8}/rara_tools/constants/digitizer.py +0 -0
- {rara_tools-0.7.7 → rara_tools-0.7.8}/rara_tools/constants/general.py +0 -0
- {rara_tools-0.7.7 → rara_tools-0.7.8}/rara_tools/constants/language_evaluator.py +0 -0
- {rara_tools-0.7.7 → rara_tools-0.7.8}/rara_tools/constants/linker.py +0 -0
- {rara_tools-0.7.7 → rara_tools-0.7.8}/rara_tools/constants/meta_extractor.py +0 -0
- {rara_tools-0.7.7 → rara_tools-0.7.8}/rara_tools/constants/normalizers.py +0 -0
- {rara_tools-0.7.7 → rara_tools-0.7.8}/rara_tools/constants/parsers.py +0 -0
- {rara_tools-0.7.7 → rara_tools-0.7.8}/rara_tools/constants/subject_indexer.py +0 -0
- {rara_tools-0.7.7 → rara_tools-0.7.8}/rara_tools/converters.py +0 -0
- {rara_tools-0.7.7 → rara_tools-0.7.8}/rara_tools/core_formatters/core_formatter.py +0 -0
- {rara_tools-0.7.7 → rara_tools-0.7.8}/rara_tools/core_formatters/formatted_keyword.py +0 -0
- {rara_tools-0.7.7 → rara_tools-0.7.8}/rara_tools/core_formatters/formatted_meta.py +0 -0
- {rara_tools-0.7.7 → rara_tools-0.7.8}/rara_tools/core_formatters/formatted_object.py +0 -0
- {rara_tools-0.7.7 → rara_tools-0.7.8}/rara_tools/decorators.py +0 -0
- {rara_tools-0.7.7 → rara_tools-0.7.8}/rara_tools/elastic.py +0 -0
- {rara_tools-0.7.7 → rara_tools-0.7.8}/rara_tools/exceptions.py +0 -0
- {rara_tools-0.7.7 → rara_tools-0.7.8}/rara_tools/normalizers/__init__.py +0 -0
- {rara_tools-0.7.7 → rara_tools-0.7.8}/rara_tools/normalizers/authorities.py +0 -0
- {rara_tools-0.7.7 → rara_tools-0.7.8}/rara_tools/normalizers/base.py +0 -0
- {rara_tools-0.7.7 → rara_tools-0.7.8}/rara_tools/normalizers/bibs.py +0 -0
- {rara_tools-0.7.7 → rara_tools-0.7.8}/rara_tools/normalizers/reader.py +0 -0
- {rara_tools-0.7.7 → rara_tools-0.7.8}/rara_tools/normalizers/viaf.py +0 -0
- {rara_tools-0.7.7 → rara_tools-0.7.8}/rara_tools/parsers/marc_parsers/base_parser.py +0 -0
- {rara_tools-0.7.7 → rara_tools-0.7.8}/rara_tools/parsers/marc_parsers/ems_parser.py +0 -0
- {rara_tools-0.7.7 → rara_tools-0.7.8}/rara_tools/parsers/marc_parsers/location_parser.py +0 -0
- {rara_tools-0.7.7 → rara_tools-0.7.8}/rara_tools/parsers/marc_parsers/organization_parser.py +0 -0
- {rara_tools-0.7.7 → rara_tools-0.7.8}/rara_tools/parsers/marc_parsers/person_parser.py +0 -0
- {rara_tools-0.7.7 → rara_tools-0.7.8}/rara_tools/parsers/marc_parsers/title_parser.py +0 -0
- {rara_tools-0.7.7 → rara_tools-0.7.8}/rara_tools/parsers/marc_records/base_record.py +0 -0
- {rara_tools-0.7.7 → rara_tools-0.7.8}/rara_tools/parsers/marc_records/ems_record.py +0 -0
- {rara_tools-0.7.7 → rara_tools-0.7.8}/rara_tools/parsers/marc_records/organization_record.py +0 -0
- {rara_tools-0.7.7 → rara_tools-0.7.8}/rara_tools/parsers/marc_records/person_record.py +0 -0
- {rara_tools-0.7.7 → rara_tools-0.7.8}/rara_tools/parsers/marc_records/title_record.py +0 -0
- {rara_tools-0.7.7 → rara_tools-0.7.8}/rara_tools/parsers/tools/entity_normalizers.py +0 -0
- {rara_tools-0.7.7 → rara_tools-0.7.8}/rara_tools/parsers/tools/marc_converter.py +0 -0
- {rara_tools-0.7.7 → rara_tools-0.7.8}/rara_tools/parsers/tools/russian_transliterator.py +0 -0
- {rara_tools-0.7.7 → rara_tools-0.7.8}/rara_tools/s3.py +0 -0
- {rara_tools-0.7.7 → rara_tools-0.7.8}/rara_tools/task_reporter.py +0 -0
- {rara_tools-0.7.7 → rara_tools-0.7.8}/rara_tools/utils.py +0 -0
- {rara_tools-0.7.7 → rara_tools-0.7.8}/rara_tools.egg-info/SOURCES.txt +0 -0
- {rara_tools-0.7.7 → rara_tools-0.7.8}/rara_tools.egg-info/dependency_links.txt +0 -0
- {rara_tools-0.7.7 → rara_tools-0.7.8}/rara_tools.egg-info/requires.txt +0 -0
- {rara_tools-0.7.7 → rara_tools-0.7.8}/rara_tools.egg-info/top_level.txt +0 -0
- {rara_tools-0.7.7 → rara_tools-0.7.8}/requirements.txt +0 -0
- {rara_tools-0.7.7 → rara_tools-0.7.8}/setup.cfg +0 -0
- {rara_tools-0.7.7 → rara_tools-0.7.8}/tests/test_elastic.py +0 -0
- {rara_tools-0.7.7 → rara_tools-0.7.8}/tests/test_elastic_vector_and_search_operations.py +0 -0
- {rara_tools-0.7.7 → rara_tools-0.7.8}/tests/test_entity_normalizers.py +0 -0
- {rara_tools-0.7.7 → rara_tools-0.7.8}/tests/test_formatters.py +0 -0
- {rara_tools-0.7.7 → rara_tools-0.7.8}/tests/test_marc_parsers.py +0 -0
- {rara_tools-0.7.7 → rara_tools-0.7.8}/tests/test_normalization.py +0 -0
- {rara_tools-0.7.7 → rara_tools-0.7.8}/tests/test_s3_exceptions.py +0 -0
- {rara_tools-0.7.7 → rara_tools-0.7.8}/tests/test_s3_file_operations.py +0 -0
- {rara_tools-0.7.7 → rara_tools-0.7.8}/tests/test_sierra_converters.py +0 -0
- {rara_tools-0.7.7 → rara_tools-0.7.8}/tests/test_task_reporter.py +0 -0
- {rara_tools-0.7.7 → rara_tools-0.7.8}/tests/test_utils.py +0 -0
- {rara_tools-0.7.7 → rara_tools-0.7.8}/tests/test_viaf_client.py +0 -0
rara_tools-0.7.8/VERSION
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
0.7.8
|
|
@@ -241,8 +241,8 @@ class DIGARSchemaConverter:
|
|
|
241
241
|
min_language_ratio: float = 0.2,
|
|
242
242
|
convert_ratio: bool = False
|
|
243
243
|
) -> NoReturn:
|
|
244
|
-
""" Initialize DIGARSchemaConverter object.
|
|
245
|
-
|
|
244
|
+
""" Initialize DIGARSchemaConverter object.
|
|
245
|
+
|
|
246
246
|
Parameters
|
|
247
247
|
----------
|
|
248
248
|
digitizer_output: dict
|
|
@@ -261,7 +261,7 @@ class DIGARSchemaConverter:
|
|
|
261
261
|
be added to the final output.
|
|
262
262
|
convert_ratio: bool
|
|
263
263
|
If enabled, all ratios are converted into percentages.
|
|
264
|
-
|
|
264
|
+
|
|
265
265
|
"""
|
|
266
266
|
self.__digitizer_output: dict = digitizer_output
|
|
267
267
|
self.__min_language_ratio: float = min_language_ratio
|
|
@@ -280,6 +280,7 @@ class DIGARSchemaConverter:
|
|
|
280
280
|
self.__dc_origin: dict = {}
|
|
281
281
|
self.__dc_identifier: List[dict] = []
|
|
282
282
|
self.__doc_id: str = ""
|
|
283
|
+
self.__page_count: int = None
|
|
283
284
|
|
|
284
285
|
self.__doc_schemas = DocSchemas(
|
|
285
286
|
doc_meta=self.doc_meta,
|
|
@@ -303,6 +304,28 @@ class DIGARSchemaConverter:
|
|
|
303
304
|
page_number = _first_segment.get("page")
|
|
304
305
|
return page_number
|
|
305
306
|
|
|
307
|
+
def _add_dummy_pages(self, docs: List[dict]):
|
|
308
|
+
for doc in docs:
|
|
309
|
+
if not doc.get("page"):
|
|
310
|
+
doc["page"] = self.dummy_page
|
|
311
|
+
return docs
|
|
312
|
+
|
|
313
|
+
@property
|
|
314
|
+
def dummy_page(self) -> int:
|
|
315
|
+
""" Get page number to add for images,
|
|
316
|
+
if actual page is missing. Currently returns
|
|
317
|
+
a new (non-existing) final page.
|
|
318
|
+
"""
|
|
319
|
+
return self.page_count+1
|
|
320
|
+
|
|
321
|
+
@property
|
|
322
|
+
def page_count(self) -> int:
|
|
323
|
+
""" Returns total page count of the document.
|
|
324
|
+
"""
|
|
325
|
+
if not self.__page_count:
|
|
326
|
+
self.__page_count = self.__digitizer_output.get("doc_meta", {}).get("pages", {}).get("count", 0)
|
|
327
|
+
return self.__page_count
|
|
328
|
+
|
|
306
329
|
@property
|
|
307
330
|
def doc_id(self) -> str:
|
|
308
331
|
""" Retrieves document ID to use for generating
|
|
@@ -327,7 +350,8 @@ class DIGARSchemaConverter:
|
|
|
327
350
|
@property
|
|
328
351
|
def images(self) -> List[dict]:
|
|
329
352
|
if not self.__images:
|
|
330
|
-
|
|
353
|
+
images = self.__digitizer_output.get("images")
|
|
354
|
+
self.__images = self._add_dummy_pages(images)
|
|
331
355
|
return self.__images
|
|
332
356
|
|
|
333
357
|
@property
|
|
@@ -344,6 +368,7 @@ class DIGARSchemaConverter:
|
|
|
344
368
|
mapped[text["start_page"]]["texts"].append(text)
|
|
345
369
|
for img in self.images:
|
|
346
370
|
mapped[img["page"]]["images"].append(img)
|
|
371
|
+
#print(mapped.items())
|
|
347
372
|
|
|
348
373
|
self.__page_mappings = [
|
|
349
374
|
v for k, v in sorted(list(mapped.items()), key=lambda x: x[0])
|
|
@@ -12,11 +12,19 @@ def load_json(file_path: str):
|
|
|
12
12
|
|
|
13
13
|
|
|
14
14
|
TEST_DIGITIZER_OUTPUT_FILE = os.path.join(".", "tests", "test_data", "b1267058_test_digitizer_output.json")
|
|
15
|
+
TEST_DIGITIZER_OUTPUT_FILE_2 = os.path.join(".", "tests", "test_data", "b5493797_test_digitizer_output_shortened_empty_image_pages.json")
|
|
16
|
+
|
|
15
17
|
TEST_DIGITIZER_OUTPUT = load_json(TEST_DIGITIZER_OUTPUT_FILE)
|
|
18
|
+
TEST_DIGITIZER_OUTPUT_2 = load_json(TEST_DIGITIZER_OUTPUT_FILE_2)
|
|
19
|
+
|
|
16
20
|
TEST_SIERRA_ID = "b1267058"
|
|
17
21
|
TEST_GENERATED_ID = "hsasaHSAHHGDhb"
|
|
18
22
|
TEST_PERMALINK = "https://www.digar.ee/b1267058"
|
|
19
23
|
|
|
24
|
+
TEST_SIERRA_ID_2 = "b5493797"
|
|
25
|
+
TEST_GENERATED_ID_2 = "mzmudhju38hd3ndlk"
|
|
26
|
+
TEST_PERMALINK_2 = "https://www.digar.ee/b5493797"
|
|
27
|
+
|
|
20
28
|
def test_digar_schema_converstion_default():
|
|
21
29
|
converter = DIGARSchemaConverter(
|
|
22
30
|
digitizer_output=TEST_DIGITIZER_OUTPUT,
|
|
@@ -24,25 +32,25 @@ def test_digar_schema_converstion_default():
|
|
|
24
32
|
generated_id=TEST_GENERATED_ID
|
|
25
33
|
)
|
|
26
34
|
digar_schema = converter.digar_schema
|
|
27
|
-
|
|
35
|
+
|
|
28
36
|
# check that all necessary fields are present
|
|
29
37
|
assert "dc:language" in digar_schema
|
|
30
38
|
assert "dcterms:provenance" in digar_schema
|
|
31
39
|
assert "dc:identifier" in digar_schema
|
|
32
40
|
assert "dcterms:hasPart" in digar_schema
|
|
33
41
|
assert "dcterms:conformsTo" in digar_schema
|
|
34
|
-
|
|
42
|
+
|
|
35
43
|
languages = [lang.get("value") for lang in digar_schema.get("dc:language")]
|
|
36
44
|
# check that languages are converted into ISO 639-2
|
|
37
45
|
for lang in languages:
|
|
38
46
|
assert len(lang) == 3
|
|
39
|
-
|
|
40
|
-
|
|
47
|
+
|
|
48
|
+
|
|
41
49
|
# check that ratio is converted into percentage
|
|
42
50
|
text_quality = digar_schema.get("dcterms:conformsTo")[0].get("value")
|
|
43
51
|
assert isinstance(text_quality, str)
|
|
44
|
-
|
|
45
|
-
|
|
52
|
+
|
|
53
|
+
|
|
46
54
|
def test_digar_schema_id_generation():
|
|
47
55
|
""" Tests ID generation logic.
|
|
48
56
|
"""
|
|
@@ -51,38 +59,38 @@ def test_digar_schema_id_generation():
|
|
|
51
59
|
sierra_id=TEST_SIERRA_ID,
|
|
52
60
|
generated_id=TEST_GENERATED_ID,
|
|
53
61
|
permalink=TEST_PERMALINK
|
|
54
|
-
|
|
62
|
+
|
|
55
63
|
)
|
|
56
|
-
|
|
64
|
+
|
|
57
65
|
#If permalink is given, this should be used as base ID
|
|
58
66
|
digar_schema = converter.digar_schema
|
|
59
67
|
first_segment_id = digar_schema["dcterms:hasPart"][0]["hasPart"][0]["@id"]
|
|
60
|
-
|
|
68
|
+
|
|
61
69
|
assert first_segment_id.startswith(TEST_PERMALINK)
|
|
62
|
-
|
|
70
|
+
|
|
63
71
|
converter = DIGARSchemaConverter(
|
|
64
72
|
digitizer_output=TEST_DIGITIZER_OUTPUT,
|
|
65
73
|
sierra_id=TEST_SIERRA_ID,
|
|
66
74
|
generated_id=TEST_GENERATED_ID
|
|
67
75
|
)
|
|
68
|
-
|
|
76
|
+
|
|
69
77
|
#If permalink is NOT given, Sierra ID should be used as base ID
|
|
70
78
|
digar_schema = converter.digar_schema
|
|
71
79
|
first_segment_id = digar_schema["dcterms:hasPart"][0]["hasPart"][0]["@id"]
|
|
72
80
|
assert first_segment_id.startswith(TEST_SIERRA_ID)
|
|
73
|
-
|
|
74
|
-
|
|
81
|
+
|
|
82
|
+
|
|
75
83
|
converter = DIGARSchemaConverter(
|
|
76
84
|
digitizer_output=TEST_DIGITIZER_OUTPUT,
|
|
77
85
|
generated_id=TEST_GENERATED_ID
|
|
78
86
|
)
|
|
79
|
-
|
|
87
|
+
|
|
80
88
|
#If neither permalink nor Sierra ID is given, generated ID should be used as base ID
|
|
81
89
|
digar_schema = converter.digar_schema
|
|
82
90
|
first_segment_id = digar_schema["dcterms:hasPart"][0]["hasPart"][0]["@id"]
|
|
83
91
|
assert first_segment_id.startswith(TEST_GENERATED_ID)
|
|
84
|
-
|
|
85
|
-
|
|
92
|
+
|
|
93
|
+
|
|
86
94
|
def test_restricting_languages_with_ratio():
|
|
87
95
|
""" Checks that param `min_language_ratio` influences
|
|
88
96
|
the number of output languages.
|
|
@@ -93,41 +101,64 @@ def test_restricting_languages_with_ratio():
|
|
|
93
101
|
generated_id=TEST_GENERATED_ID,
|
|
94
102
|
permalink=TEST_PERMALINK,
|
|
95
103
|
min_language_ratio=0
|
|
96
|
-
|
|
104
|
+
|
|
97
105
|
)
|
|
98
|
-
|
|
106
|
+
|
|
99
107
|
#With min_language_ratio=0, all 7 languages of the test document should be returned
|
|
100
108
|
digar_schema = converter.digar_schema
|
|
101
109
|
languages = [lang.get("value") for lang in digar_schema.get("dc:language")]
|
|
102
110
|
assert len(languages) == 7
|
|
103
|
-
|
|
111
|
+
|
|
104
112
|
converter = DIGARSchemaConverter(
|
|
105
113
|
digitizer_output=TEST_DIGITIZER_OUTPUT,
|
|
106
114
|
sierra_id=TEST_SIERRA_ID,
|
|
107
115
|
generated_id=TEST_GENERATED_ID,
|
|
108
116
|
permalink=TEST_PERMALINK,
|
|
109
117
|
min_language_ratio=0.02
|
|
110
|
-
|
|
118
|
+
|
|
111
119
|
)
|
|
112
|
-
|
|
120
|
+
|
|
113
121
|
#With min_language_ratio=0.02, only 2 languages should be returned
|
|
114
122
|
digar_schema = converter.digar_schema
|
|
115
123
|
languages = [lang.get("value") for lang in digar_schema.get("dc:language")]
|
|
116
124
|
assert len(languages) == 2
|
|
117
|
-
|
|
125
|
+
|
|
118
126
|
converter = DIGARSchemaConverter(
|
|
119
127
|
digitizer_output=TEST_DIGITIZER_OUTPUT,
|
|
120
128
|
sierra_id=TEST_SIERRA_ID,
|
|
121
129
|
generated_id=TEST_GENERATED_ID,
|
|
122
130
|
permalink=TEST_PERMALINK,
|
|
123
131
|
min_language_ratio=0.5
|
|
124
|
-
|
|
132
|
+
|
|
125
133
|
)
|
|
126
|
-
|
|
134
|
+
|
|
127
135
|
#With min_language_ratio=0.5, only 1 language should be returned
|
|
128
136
|
digar_schema = converter.digar_schema
|
|
129
137
|
languages = [lang.get("value") for lang in digar_schema.get("dc:language")]
|
|
130
138
|
assert len(languages) == 1
|
|
131
|
-
|
|
132
139
|
|
|
133
140
|
|
|
141
|
+
def test_digar_schema_converstion_with_missing_image_pages():
|
|
142
|
+
converter = DIGARSchemaConverter(
|
|
143
|
+
digitizer_output=TEST_DIGITIZER_OUTPUT_2,
|
|
144
|
+
sierra_id=TEST_SIERRA_ID_2,
|
|
145
|
+
generated_id=TEST_GENERATED_ID_2
|
|
146
|
+
)
|
|
147
|
+
digar_schema = converter.digar_schema
|
|
148
|
+
|
|
149
|
+
# check that all necessary fields are present
|
|
150
|
+
assert "dc:language" in digar_schema
|
|
151
|
+
assert "dcterms:provenance" in digar_schema
|
|
152
|
+
assert "dc:identifier" in digar_schema
|
|
153
|
+
assert "dcterms:hasPart" in digar_schema
|
|
154
|
+
assert "dcterms:conformsTo" in digar_schema
|
|
155
|
+
|
|
156
|
+
languages = [lang.get("value") for lang in digar_schema.get("dc:language")]
|
|
157
|
+
# check that languages are converted into ISO 639-2
|
|
158
|
+
for lang in languages:
|
|
159
|
+
assert len(lang) == 3
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
# check that ratio is converted into percentage
|
|
163
|
+
text_quality = digar_schema.get("dcterms:conformsTo")[0].get("value")
|
|
164
|
+
assert isinstance(text_quality, str)
|
rara_tools-0.7.7/VERSION
DELETED
|
@@ -1 +0,0 @@
|
|
|
1
|
-
0.7.7
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{rara_tools-0.7.7 → rara_tools-0.7.8}/rara_tools/parsers/marc_parsers/organization_parser.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{rara_tools-0.7.7 → rara_tools-0.7.8}/rara_tools/parsers/marc_records/organization_record.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|