rara-tools 0.7.3__tar.gz → 0.7.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of rara-tools might be problematic. Click here for more details.
- {rara_tools-0.7.3/rara_tools.egg-info → rara_tools-0.7.5}/PKG-INFO +1 -1
- rara_tools-0.7.5/VERSION +1 -0
- {rara_tools-0.7.3 → rara_tools-0.7.5}/rara_tools/constants/digitizer.py +12 -0
- {rara_tools-0.7.3 → rara_tools-0.7.5}/rara_tools/constants/linker.py +11 -2
- rara_tools-0.7.5/rara_tools/constants/meta_extractor.py +44 -0
- {rara_tools-0.7.3 → rara_tools-0.7.5}/rara_tools/constants/subject_indexer.py +4 -0
- {rara_tools-0.7.3 → rara_tools-0.7.5}/rara_tools/core_formatters/core_formatter.py +68 -5
- {rara_tools-0.7.3 → rara_tools-0.7.5}/rara_tools/core_formatters/formatted_meta.py +6 -7
- {rara_tools-0.7.3 → rara_tools-0.7.5/rara_tools.egg-info}/PKG-INFO +1 -1
- rara_tools-0.7.5/tests/test_formatters.py +85 -0
- rara_tools-0.7.3/VERSION +0 -1
- rara_tools-0.7.3/rara_tools/constants/meta_extractor.py +0 -14
- rara_tools-0.7.3/tests/test_formatters.py +0 -41
- {rara_tools-0.7.3 → rara_tools-0.7.5}/LICENSE.md +0 -0
- {rara_tools-0.7.3 → rara_tools-0.7.5}/README.md +0 -0
- {rara_tools-0.7.3 → rara_tools-0.7.5}/pyproject.toml +0 -0
- {rara_tools-0.7.3 → rara_tools-0.7.5}/rara_tools/constants/__init__.py +0 -0
- {rara_tools-0.7.3 → rara_tools-0.7.5}/rara_tools/constants/general.py +0 -0
- {rara_tools-0.7.3 → rara_tools-0.7.5}/rara_tools/constants/language_evaluator.py +0 -0
- {rara_tools-0.7.3 → rara_tools-0.7.5}/rara_tools/constants/normalizers.py +0 -0
- {rara_tools-0.7.3 → rara_tools-0.7.5}/rara_tools/constants/parsers.py +0 -0
- {rara_tools-0.7.3 → rara_tools-0.7.5}/rara_tools/converters.py +0 -0
- {rara_tools-0.7.3 → rara_tools-0.7.5}/rara_tools/core_formatters/formatted_keyword.py +0 -0
- {rara_tools-0.7.3 → rara_tools-0.7.5}/rara_tools/core_formatters/formatted_object.py +0 -0
- {rara_tools-0.7.3 → rara_tools-0.7.5}/rara_tools/decorators.py +0 -0
- {rara_tools-0.7.3 → rara_tools-0.7.5}/rara_tools/digar_schema_converter.py +0 -0
- {rara_tools-0.7.3 → rara_tools-0.7.5}/rara_tools/elastic.py +0 -0
- {rara_tools-0.7.3 → rara_tools-0.7.5}/rara_tools/exceptions.py +0 -0
- {rara_tools-0.7.3 → rara_tools-0.7.5}/rara_tools/normalizers/__init__.py +0 -0
- {rara_tools-0.7.3 → rara_tools-0.7.5}/rara_tools/normalizers/authorities.py +0 -0
- {rara_tools-0.7.3 → rara_tools-0.7.5}/rara_tools/normalizers/base.py +0 -0
- {rara_tools-0.7.3 → rara_tools-0.7.5}/rara_tools/normalizers/bibs.py +0 -0
- {rara_tools-0.7.3 → rara_tools-0.7.5}/rara_tools/normalizers/reader.py +0 -0
- {rara_tools-0.7.3 → rara_tools-0.7.5}/rara_tools/normalizers/viaf.py +0 -0
- {rara_tools-0.7.3 → rara_tools-0.7.5}/rara_tools/parsers/marc_parsers/base_parser.py +0 -0
- {rara_tools-0.7.3 → rara_tools-0.7.5}/rara_tools/parsers/marc_parsers/ems_parser.py +0 -0
- {rara_tools-0.7.3 → rara_tools-0.7.5}/rara_tools/parsers/marc_parsers/location_parser.py +0 -0
- {rara_tools-0.7.3 → rara_tools-0.7.5}/rara_tools/parsers/marc_parsers/organization_parser.py +0 -0
- {rara_tools-0.7.3 → rara_tools-0.7.5}/rara_tools/parsers/marc_parsers/person_parser.py +0 -0
- {rara_tools-0.7.3 → rara_tools-0.7.5}/rara_tools/parsers/marc_parsers/title_parser.py +0 -0
- {rara_tools-0.7.3 → rara_tools-0.7.5}/rara_tools/parsers/marc_records/base_record.py +0 -0
- {rara_tools-0.7.3 → rara_tools-0.7.5}/rara_tools/parsers/marc_records/ems_record.py +0 -0
- {rara_tools-0.7.3 → rara_tools-0.7.5}/rara_tools/parsers/marc_records/organization_record.py +0 -0
- {rara_tools-0.7.3 → rara_tools-0.7.5}/rara_tools/parsers/marc_records/person_record.py +0 -0
- {rara_tools-0.7.3 → rara_tools-0.7.5}/rara_tools/parsers/marc_records/title_record.py +0 -0
- {rara_tools-0.7.3 → rara_tools-0.7.5}/rara_tools/parsers/tools/entity_normalizers.py +0 -0
- {rara_tools-0.7.3 → rara_tools-0.7.5}/rara_tools/parsers/tools/marc_converter.py +0 -0
- {rara_tools-0.7.3 → rara_tools-0.7.5}/rara_tools/parsers/tools/russian_transliterator.py +0 -0
- {rara_tools-0.7.3 → rara_tools-0.7.5}/rara_tools/s3.py +0 -0
- {rara_tools-0.7.3 → rara_tools-0.7.5}/rara_tools/task_reporter.py +0 -0
- {rara_tools-0.7.3 → rara_tools-0.7.5}/rara_tools/utils.py +0 -0
- {rara_tools-0.7.3 → rara_tools-0.7.5}/rara_tools.egg-info/SOURCES.txt +0 -0
- {rara_tools-0.7.3 → rara_tools-0.7.5}/rara_tools.egg-info/dependency_links.txt +0 -0
- {rara_tools-0.7.3 → rara_tools-0.7.5}/rara_tools.egg-info/requires.txt +0 -0
- {rara_tools-0.7.3 → rara_tools-0.7.5}/rara_tools.egg-info/top_level.txt +0 -0
- {rara_tools-0.7.3 → rara_tools-0.7.5}/requirements.txt +0 -0
- {rara_tools-0.7.3 → rara_tools-0.7.5}/setup.cfg +0 -0
- {rara_tools-0.7.3 → rara_tools-0.7.5}/tests/test_digar_schema_converter.py +0 -0
- {rara_tools-0.7.3 → rara_tools-0.7.5}/tests/test_elastic.py +0 -0
- {rara_tools-0.7.3 → rara_tools-0.7.5}/tests/test_elastic_vector_and_search_operations.py +0 -0
- {rara_tools-0.7.3 → rara_tools-0.7.5}/tests/test_entity_normalizers.py +0 -0
- {rara_tools-0.7.3 → rara_tools-0.7.5}/tests/test_marc_parsers.py +0 -0
- {rara_tools-0.7.3 → rara_tools-0.7.5}/tests/test_normalization.py +0 -0
- {rara_tools-0.7.3 → rara_tools-0.7.5}/tests/test_s3_exceptions.py +0 -0
- {rara_tools-0.7.3 → rara_tools-0.7.5}/tests/test_s3_file_operations.py +0 -0
- {rara_tools-0.7.3 → rara_tools-0.7.5}/tests/test_sierra_converters.py +0 -0
- {rara_tools-0.7.3 → rara_tools-0.7.5}/tests/test_task_reporter.py +0 -0
- {rara_tools-0.7.3 → rara_tools-0.7.5}/tests/test_utils.py +0 -0
- {rara_tools-0.7.3 → rara_tools-0.7.5}/tests/test_viaf_client.py +0 -0
rara_tools-0.7.5/VERSION
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
0.7.5
|
|
@@ -26,3 +26,15 @@ class Queue:
|
|
|
26
26
|
class Tasks:
|
|
27
27
|
START_DIGITIZER_PIPELINE = "start_digitizer_pipeline"
|
|
28
28
|
PURGE_MODELS = "purge_unused_digitizer_models"
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class Error:
|
|
32
|
+
NO_SPACE = "Disk out of space!"
|
|
33
|
+
COULDNT_DOWNLOAD = "Unknown error when downloading model!"
|
|
34
|
+
UNKNOWN = "Unknown system error!"
|
|
35
|
+
S3_CONNECTION = "Failed to connect to S3!"
|
|
36
|
+
UNSUPPORTED_FILETYPE = "Unsupported file type!"
|
|
37
|
+
COULDNT_UPLOAD = "Could not upload documents to Elasticsearch!"
|
|
38
|
+
FILE_IS_PROTECTED = "File is password protected or encrypted!"
|
|
39
|
+
UNKNOWN_OCR = "Unknown error when applying ocr!"
|
|
40
|
+
CUSTOM_MODEL_ERROR = "Couldn't download custom image classification model!"
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import logging
|
|
2
|
-
|
|
2
|
+
|
|
3
|
+
from rara_tools.constants.normalizers import EntityType
|
|
3
4
|
|
|
4
5
|
COMPONENT_KEY = "linker"
|
|
5
6
|
|
|
@@ -24,11 +25,13 @@ class StatusKeys:
|
|
|
24
25
|
VECTORIZE_CONTEXT = "vectorize_context"
|
|
25
26
|
LINK_KEYWORDS = "link_keywords"
|
|
26
27
|
|
|
28
|
+
|
|
27
29
|
class URLSource:
|
|
28
30
|
VIAF = "VIAF"
|
|
29
31
|
SIERRA = "Sierra"
|
|
30
32
|
EMS = "EMS"
|
|
31
33
|
|
|
34
|
+
|
|
32
35
|
class KeywordType:
|
|
33
36
|
LOC = "Kohamärksõnad"
|
|
34
37
|
TIME = "Ajamärksõnad"
|
|
@@ -54,17 +57,24 @@ class KeywordMARC:
|
|
|
54
57
|
TITLE = 630
|
|
55
58
|
TITLE_LINKED = 600
|
|
56
59
|
|
|
60
|
+
|
|
57
61
|
class KeywordSource:
|
|
58
62
|
EMS = "EMS"
|
|
59
63
|
SIERRA = "SIERRA"
|
|
60
64
|
VIAF = "VIAF"
|
|
61
65
|
AI = "AI"
|
|
62
66
|
|
|
67
|
+
|
|
63
68
|
class Filters:
|
|
64
69
|
AUTHOR = "author"
|
|
65
70
|
YEAR = "year"
|
|
66
71
|
|
|
67
72
|
|
|
73
|
+
class Error:
|
|
74
|
+
VECTORIZATION = "Failed to vectorize text!"
|
|
75
|
+
LINKING_KEYWORDS = "Failed to link keywords!"
|
|
76
|
+
LINKING_META = "Failed to link meta!"
|
|
77
|
+
|
|
68
78
|
UNLINKED_KEYWORD_MARC_FIELD = 693
|
|
69
79
|
|
|
70
80
|
ALLOWED_FILTERS_MAP = {
|
|
@@ -110,7 +120,6 @@ ALLOWED_ENTITY_TYPES = [
|
|
|
110
120
|
EntityType.UNK,
|
|
111
121
|
]
|
|
112
122
|
|
|
113
|
-
|
|
114
123
|
KEYWORD_TYPE_MAP = {
|
|
115
124
|
KeywordType.TIME: EntityType.KEYWORD,
|
|
116
125
|
KeywordType.GENRE: EntityType.KEYWORD,
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
from dataclasses import dataclass
|
|
2
|
+
|
|
3
|
+
COMPONENT_KEY = "meta_extractor"
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class Tasks:
|
|
7
|
+
SINGLE = "extract_meta_from_text"
|
|
8
|
+
PIPELINE = "run_meta_extractor_with_core_logic"
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class Queue:
|
|
12
|
+
MAIN = "meta_extractor"
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class StatusKeys:
|
|
16
|
+
EXTRACT_METADATA = "extract_metadata"
|
|
17
|
+
|
|
18
|
+
class Error:
|
|
19
|
+
UNKNOWN = "Failed to extract meta information from digitizer output!"
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
@dataclass(frozen=True)
|
|
23
|
+
class TitleType:
|
|
24
|
+
AUTHOR_WITHOUT_TITLE: str = "pealkirjata autor"
|
|
25
|
+
NORMALIZED_TITLE: str = "normitud eelispealkiri"
|
|
26
|
+
TITLE: str = "väljaandes esitatud kujul põhipealkiri"
|
|
27
|
+
PARALLEL_TITLE: str = "rööppealkiri"
|
|
28
|
+
ADDITIONAL_TITLE: str = "alampealkiri"
|
|
29
|
+
METS_TITLE: str = "väljaandes esitatud kujul põhipealkiri"
|
|
30
|
+
ANON: str = "anonüümne väljaanne"
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
TITLE_TYPES_MAP = {
|
|
34
|
+
TitleType.AUTHOR_WITHOUT_TITLE: 130,
|
|
35
|
+
TitleType.NORMALIZED_TITLE: 240,
|
|
36
|
+
TitleType.TITLE: 245,
|
|
37
|
+
TitleType.PARALLEL_TITLE: 246,
|
|
38
|
+
TitleType.ADDITIONAL_TITLE: 245,
|
|
39
|
+
TitleType.METS_TITLE: 245,
|
|
40
|
+
TitleType.ANON: 130
|
|
41
|
+
}
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
PUBLISHER_KEY = "Väljaandja"
|
|
@@ -2,6 +2,10 @@ from typing import List, Tuple, Any
|
|
|
2
2
|
from rara_tools.core_formatters.formatted_keyword import FormattedKeyword
|
|
3
3
|
from rara_tools.core_formatters.formatted_meta import FormattedAuthor
|
|
4
4
|
from rara_tools.constants.linker import MAIN_TAXONOMY_LANG, KEYWORD_TYPES_TO_IGNORE, EntityType
|
|
5
|
+
from rara_tools.constants.meta_extractor import TitleType, TITLE_TYPES_MAP, PUBLISHER_KEY
|
|
6
|
+
from rara_tools.constants.subject_indexer import KeywordType
|
|
7
|
+
|
|
8
|
+
import regex as re
|
|
5
9
|
|
|
6
10
|
def get_primary_author(authors: List[dict]) -> str:
|
|
7
11
|
primary_author = ""
|
|
@@ -10,8 +14,19 @@ def get_primary_author(authors: List[dict]) -> str:
|
|
|
10
14
|
primary_author = author.get("name", "")
|
|
11
15
|
return primary_author
|
|
12
16
|
|
|
13
|
-
def format_authors(authors: List[dict]) -> List[dict]:
|
|
17
|
+
def is_valid_keyword(keyword: str) -> bool:
|
|
18
|
+
# If keywords contains ONLY punctuation
|
|
19
|
+
# characters, we assume it`s not valid
|
|
20
|
+
if re.search(r"^(\W|_)+$", keyword):
|
|
21
|
+
return False
|
|
22
|
+
return True
|
|
23
|
+
|
|
24
|
+
def format_series_info(series: str):
|
|
25
|
+
pass
|
|
26
|
+
|
|
27
|
+
def format_authors(authors: List[dict]) -> Tuple[List[dict], dict]:
|
|
14
28
|
formatted_authors = []
|
|
29
|
+
publisher = {}
|
|
15
30
|
for author in authors:
|
|
16
31
|
entity_type = author.get("type", EntityType.UNK)
|
|
17
32
|
|
|
@@ -20,8 +35,14 @@ def format_authors(authors: List[dict]) -> List[dict]:
|
|
|
20
35
|
linked_doc=None,
|
|
21
36
|
entity_type=entity_type
|
|
22
37
|
).to_dict()
|
|
38
|
+
|
|
39
|
+
# If author role == publisher, do not add it as an author
|
|
40
|
+
if formatted_author.get("author_role", "") == PUBLISHER_KEY:
|
|
41
|
+
publisher = formatted_author
|
|
42
|
+
continue
|
|
43
|
+
|
|
23
44
|
formatted_authors.append(formatted_author)
|
|
24
|
-
return formatted_authors
|
|
45
|
+
return (formatted_authors, publisher)
|
|
25
46
|
|
|
26
47
|
def format_sections(sections: List[dict]) -> List[dict]:
|
|
27
48
|
for section in sections:
|
|
@@ -31,9 +52,16 @@ def format_sections(sections: List[dict]) -> List[dict]:
|
|
|
31
52
|
if primary_author:
|
|
32
53
|
for title in titles:
|
|
33
54
|
title["author_from_title"] = primary_author
|
|
55
|
+
if not authors:
|
|
56
|
+
for title in titles:
|
|
57
|
+
title["title_type"] = TitleType.ANON
|
|
58
|
+
title["title_type_int"] = TITLE_TYPES_MAP.get(TitleType.ANON)
|
|
34
59
|
section["titles"] = titles
|
|
35
60
|
|
|
36
|
-
|
|
61
|
+
# Extract publisher, but do nothing with it
|
|
62
|
+
# as it is unlikely for the publishing info to be
|
|
63
|
+
# in a METS/ALTO section. Can update it, if proven otherwise
|
|
64
|
+
formatted_authors, publisher = format_authors(authors)
|
|
37
65
|
section["authors"] = formatted_authors
|
|
38
66
|
|
|
39
67
|
return sections
|
|
@@ -46,14 +74,25 @@ def format_meta(meta: dict) -> dict:
|
|
|
46
74
|
|
|
47
75
|
authors = meta_to_format.pop("authors", [])
|
|
48
76
|
sections = meta_to_format.pop("sections", [])
|
|
77
|
+
titles = meta_to_format.pop("titles", [])
|
|
49
78
|
|
|
50
|
-
formatted_authors = format_authors(authors)
|
|
79
|
+
formatted_authors, publisher = format_authors(authors)
|
|
51
80
|
formatted_sections = format_sections(sections)
|
|
52
81
|
|
|
53
82
|
if sections and formatted_sections:
|
|
54
83
|
meta_to_format["sections"] = formatted_sections
|
|
55
84
|
if authors and formatted_authors:
|
|
56
85
|
meta_to_format["authors"] = formatted_authors
|
|
86
|
+
if titles and not authors:
|
|
87
|
+
for title in titles:
|
|
88
|
+
title["title_type"] = TitleType.ANON
|
|
89
|
+
title["title_type_int"] = TITLE_TYPES_MAP.get(TitleType.ANON)
|
|
90
|
+
meta_to_format["titles"] = titles
|
|
91
|
+
|
|
92
|
+
if publisher:
|
|
93
|
+
# Not sure, if it would be better to add original name or
|
|
94
|
+
# linked value. Currently adding original for safety
|
|
95
|
+
meta_to_format["publisher"] = publisher.get("original_name")
|
|
57
96
|
|
|
58
97
|
meta["meta"] = meta_to_format
|
|
59
98
|
|
|
@@ -81,6 +120,30 @@ def format_keywords(flat_keywords: List[dict]) -> List[dict]:
|
|
|
81
120
|
linked_doc=None,
|
|
82
121
|
main_taxnomy_lang=MAIN_TAXONOMY_LANG
|
|
83
122
|
).to_dict()
|
|
84
|
-
|
|
123
|
+
if is_valid_keyword(formatted_keyword.get("keyword")):
|
|
124
|
+
formatted_keywords.append(formatted_keyword)
|
|
85
125
|
|
|
86
126
|
return formatted_keywords
|
|
127
|
+
|
|
128
|
+
def get_udk072(flat_keywords: List[dict]) -> List[str]:
|
|
129
|
+
""" Filters out UDK from flat subject indexer output.
|
|
130
|
+
"""
|
|
131
|
+
# keyword type: UDK
|
|
132
|
+
udk072 = [
|
|
133
|
+
keyword.get("keyword")
|
|
134
|
+
for keyword in flat_keywords
|
|
135
|
+
if keyword.get("entity_type") == KeywordType.UDK
|
|
136
|
+
]
|
|
137
|
+
return udk072
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
def get_udk080(flat_keywords: List[dict]) -> List[str]:
|
|
141
|
+
""" Filters out UDC from flat subject indexer output.
|
|
142
|
+
"""
|
|
143
|
+
# keyword type: UDC
|
|
144
|
+
udk080 = [
|
|
145
|
+
keyword.get("keyword")
|
|
146
|
+
for keyword in flat_keywords
|
|
147
|
+
if keyword.get("entity_type") == KeywordType.UDC
|
|
148
|
+
]
|
|
149
|
+
return udk080
|
|
@@ -11,7 +11,7 @@ class FormattedTitle(FormattedObject):
|
|
|
11
11
|
super().__init__(
|
|
12
12
|
object_dict=object_dict,
|
|
13
13
|
linked_doc=linked_doc,
|
|
14
|
-
original_entity_key="
|
|
14
|
+
original_entity_key="title"
|
|
15
15
|
)
|
|
16
16
|
|
|
17
17
|
|
|
@@ -41,17 +41,16 @@ class FormattedAuthor(FormattedObject):
|
|
|
41
41
|
#self.__standardized_uri: str = ""
|
|
42
42
|
self.__viaf_id: str = ""
|
|
43
43
|
|
|
44
|
+
self._default_author_type: str = EntityType.PER
|
|
45
|
+
|
|
44
46
|
|
|
45
47
|
@property
|
|
46
48
|
def primary_author_type(self) -> str:
|
|
47
49
|
if self.__primary_author_type == None:
|
|
48
|
-
|
|
49
|
-
|
|
50
|
+
self.__primary_author_type = self._default_author_type
|
|
51
|
+
if self.entity_type != EntityType.UNK:
|
|
52
|
+
if self.entity_type in [EntityType.ORG, EntityType.PER]:
|
|
50
53
|
self.__primary_author_type = self.entity_type
|
|
51
|
-
else:
|
|
52
|
-
self.__primary_author_type = EntityType.PER
|
|
53
|
-
else:
|
|
54
|
-
self.__primary_author_type = ""
|
|
55
54
|
return self.__primary_author_type
|
|
56
55
|
|
|
57
56
|
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
import pytest
|
|
2
|
+
import os
|
|
3
|
+
from pprint import pprint
|
|
4
|
+
from rara_tools.core_formatters.core_formatter import format_keywords, format_meta, get_udk072, get_udk080
|
|
5
|
+
from rara_tools.constants.meta_extractor import PUBLISHER_KEY
|
|
6
|
+
from tests.test_utils import read_json_file
|
|
7
|
+
|
|
8
|
+
ROOT_DIR = os.path.join("tests", "test_data", "formatter")
|
|
9
|
+
INPUT_KEYWORDS_FILE_PATHS = [
|
|
10
|
+
os.path.join(ROOT_DIR, "keywords_1.json"),
|
|
11
|
+
os.path.join(ROOT_DIR, "keywords_2.json"),
|
|
12
|
+
os.path.join(ROOT_DIR, "keywords_3.json")
|
|
13
|
+
]
|
|
14
|
+
INPUT_META_FILE_PATHS = [
|
|
15
|
+
os.path.join(ROOT_DIR, "epub_meta.json"),
|
|
16
|
+
os.path.join(ROOT_DIR, "mets_alto_meta.json"),
|
|
17
|
+
os.path.join(ROOT_DIR, "pdf_meta_2.json"),
|
|
18
|
+
os.path.join(ROOT_DIR, "pdf_meta.json")
|
|
19
|
+
]
|
|
20
|
+
|
|
21
|
+
INPUT_KEYWORDS = [
|
|
22
|
+
read_json_file(keyword_file_path)
|
|
23
|
+
for keyword_file_path in INPUT_KEYWORDS_FILE_PATHS
|
|
24
|
+
]
|
|
25
|
+
|
|
26
|
+
INPUT_META_DICTS = [
|
|
27
|
+
read_json_file(meta_file_path)
|
|
28
|
+
for meta_file_path in INPUT_META_FILE_PATHS
|
|
29
|
+
]
|
|
30
|
+
|
|
31
|
+
def test_formatting_keywords_for_core():
|
|
32
|
+
for keyword_dict_list in INPUT_KEYWORDS:
|
|
33
|
+
formatted_keywords = format_keywords(keyword_dict_list)
|
|
34
|
+
#pprint(formatted_keywords)
|
|
35
|
+
assert formatted_keywords
|
|
36
|
+
assert isinstance(formatted_keywords, list)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def test_formatting_meta_for_core():
|
|
40
|
+
for meta_dict in INPUT_META_DICTS:
|
|
41
|
+
formatted_meta = format_meta(meta_dict)
|
|
42
|
+
#pprint(formatted_meta)
|
|
43
|
+
assert formatted_meta
|
|
44
|
+
assert isinstance(formatted_meta, dict)
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def test_validating_keywords():
|
|
48
|
+
keyword_dict_list = INPUT_KEYWORDS[-1]
|
|
49
|
+
assert len(keyword_dict_list) == 6
|
|
50
|
+
formatted_keywords = format_keywords(keyword_dict_list)
|
|
51
|
+
assert len(formatted_keywords) == 2
|
|
52
|
+
|
|
53
|
+
def test_removing_publisher_from_authors():
|
|
54
|
+
meta_dict = INPUT_META_DICTS[-1]
|
|
55
|
+
formatted_meta = format_meta(meta_dict)
|
|
56
|
+
assert formatted_meta["meta"]["publisher"] == "s.n"
|
|
57
|
+
for author in formatted_meta["meta"]["authors"]:
|
|
58
|
+
assert author.get("author_role") != PUBLISHER_KEY
|
|
59
|
+
assert isinstance(formatted_meta, dict)
|
|
60
|
+
|
|
61
|
+
def test_title_key_without_authors():
|
|
62
|
+
meta_dict = INPUT_META_DICTS[2]
|
|
63
|
+
formatted_meta = format_meta(meta_dict)
|
|
64
|
+
for title in formatted_meta["meta"]["titles"]:
|
|
65
|
+
assert title.get("title_type_int") == 130
|
|
66
|
+
assert isinstance(formatted_meta, dict)
|
|
67
|
+
|
|
68
|
+
def test_all_authors_have_types():
|
|
69
|
+
for meta_dict in INPUT_META_DICTS:
|
|
70
|
+
formatted_meta = format_meta(meta_dict)
|
|
71
|
+
authors = formatted_meta.get("authors", [])
|
|
72
|
+
for author in authors:
|
|
73
|
+
assert author.get("primary_author_type")
|
|
74
|
+
|
|
75
|
+
def test_getting_udk072():
|
|
76
|
+
keyword_dict_list = INPUT_KEYWORDS[-1]
|
|
77
|
+
assert len(keyword_dict_list) == 6
|
|
78
|
+
udk072_list = get_udk072(keyword_dict_list)
|
|
79
|
+
assert len(udk072_list) == 1
|
|
80
|
+
|
|
81
|
+
def test_getting_udk080():
|
|
82
|
+
keyword_dict_list = INPUT_KEYWORDS[-1]
|
|
83
|
+
assert len(keyword_dict_list) == 6
|
|
84
|
+
udk080_list = get_udk080(keyword_dict_list)
|
|
85
|
+
assert len(udk080_list) == 1
|
rara_tools-0.7.3/VERSION
DELETED
|
@@ -1 +0,0 @@
|
|
|
1
|
-
0.7.3
|
|
@@ -1,41 +0,0 @@
|
|
|
1
|
-
import pytest
|
|
2
|
-
import os
|
|
3
|
-
from pprint import pprint
|
|
4
|
-
from rara_tools.core_formatters.core_formatter import format_keywords, format_meta
|
|
5
|
-
from tests.test_utils import read_json_file
|
|
6
|
-
|
|
7
|
-
ROOT_DIR = os.path.join("tests", "test_data", "formatter")
|
|
8
|
-
INPUT_KEYWORDS_FILE_PATHS = [
|
|
9
|
-
os.path.join(ROOT_DIR, "keywords_1.json"),
|
|
10
|
-
os.path.join(ROOT_DIR, "keywords_2.json")
|
|
11
|
-
]
|
|
12
|
-
INPUT_META_FILE_PATHS = [
|
|
13
|
-
os.path.join(ROOT_DIR, "epub_meta.json"),
|
|
14
|
-
os.path.join(ROOT_DIR, "mets_alto_meta.json"),
|
|
15
|
-
os.path.join(ROOT_DIR, "pdf_meta.json")
|
|
16
|
-
]
|
|
17
|
-
|
|
18
|
-
INPUT_KEYWORDS = [
|
|
19
|
-
read_json_file(keyword_file_path)
|
|
20
|
-
for keyword_file_path in INPUT_KEYWORDS_FILE_PATHS
|
|
21
|
-
]
|
|
22
|
-
|
|
23
|
-
INPUT_META_DICTS = [
|
|
24
|
-
read_json_file(meta_file_path)
|
|
25
|
-
for meta_file_path in INPUT_META_FILE_PATHS
|
|
26
|
-
]
|
|
27
|
-
|
|
28
|
-
def test_formatting_keywords_for_core():
|
|
29
|
-
for keyword_dict in INPUT_KEYWORDS:
|
|
30
|
-
formatted_keywords = format_keywords(keyword_dict)
|
|
31
|
-
#pprint(formatted_keywords)
|
|
32
|
-
assert formatted_keywords
|
|
33
|
-
assert isinstance(formatted_keywords, list)
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
def test_formatting_meta_for_core():
|
|
37
|
-
for meta_dict in INPUT_META_DICTS:
|
|
38
|
-
formatted_meta = format_meta(meta_dict)
|
|
39
|
-
#pprint(formatted_meta)
|
|
40
|
-
assert formatted_meta
|
|
41
|
-
assert isinstance(formatted_meta, dict)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{rara_tools-0.7.3 → rara_tools-0.7.5}/rara_tools/parsers/marc_parsers/organization_parser.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{rara_tools-0.7.3 → rara_tools-0.7.5}/rara_tools/parsers/marc_records/organization_record.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|