rara-tools 0.6.12__tar.gz → 0.6.14__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of rara-tools might be problematic. More details are linked from the original release page.

Files changed (64)
  1. {rara_tools-0.6.12/rara_tools.egg-info → rara_tools-0.6.14}/PKG-INFO +1 -1
  2. rara_tools-0.6.14/VERSION +1 -0
  3. rara_tools-0.6.14/rara_tools/constants/subject_indexer.py +85 -0
  4. rara_tools-0.6.14/rara_tools/formatters.py +106 -0
  5. {rara_tools-0.6.12 → rara_tools-0.6.14}/rara_tools/parsers/marc_records/organization_record.py +20 -10
  6. {rara_tools-0.6.12 → rara_tools-0.6.14/rara_tools.egg-info}/PKG-INFO +1 -1
  7. {rara_tools-0.6.12 → rara_tools-0.6.14}/rara_tools.egg-info/SOURCES.txt +2 -0
  8. rara_tools-0.6.14/tests/test_formatters.py +15 -0
  9. {rara_tools-0.6.12 → rara_tools-0.6.14}/tests/test_marc_parsers.py +15 -3
  10. rara_tools-0.6.12/VERSION +0 -1
  11. rara_tools-0.6.12/rara_tools/constants/subject_indexer.py +0 -14
  12. {rara_tools-0.6.12 → rara_tools-0.6.14}/LICENSE.md +0 -0
  13. {rara_tools-0.6.12 → rara_tools-0.6.14}/README.md +0 -0
  14. {rara_tools-0.6.12 → rara_tools-0.6.14}/pyproject.toml +0 -0
  15. {rara_tools-0.6.12 → rara_tools-0.6.14}/rara_tools/constants/__init__.py +0 -0
  16. {rara_tools-0.6.12 → rara_tools-0.6.14}/rara_tools/constants/digitizer.py +0 -0
  17. {rara_tools-0.6.12 → rara_tools-0.6.14}/rara_tools/constants/general.py +0 -0
  18. {rara_tools-0.6.12 → rara_tools-0.6.14}/rara_tools/constants/language_evaluator.py +0 -0
  19. {rara_tools-0.6.12 → rara_tools-0.6.14}/rara_tools/constants/linker.py +0 -0
  20. {rara_tools-0.6.12 → rara_tools-0.6.14}/rara_tools/constants/meta_extractor.py +0 -0
  21. {rara_tools-0.6.12 → rara_tools-0.6.14}/rara_tools/constants/normalizers.py +0 -0
  22. {rara_tools-0.6.12 → rara_tools-0.6.14}/rara_tools/constants/parsers.py +0 -0
  23. {rara_tools-0.6.12 → rara_tools-0.6.14}/rara_tools/converters.py +0 -0
  24. {rara_tools-0.6.12 → rara_tools-0.6.14}/rara_tools/decorators.py +0 -0
  25. {rara_tools-0.6.12 → rara_tools-0.6.14}/rara_tools/digar_schema_converter.py +0 -0
  26. {rara_tools-0.6.12 → rara_tools-0.6.14}/rara_tools/elastic.py +0 -0
  27. {rara_tools-0.6.12 → rara_tools-0.6.14}/rara_tools/exceptions.py +0 -0
  28. {rara_tools-0.6.12 → rara_tools-0.6.14}/rara_tools/normalizers/__init__.py +0 -0
  29. {rara_tools-0.6.12 → rara_tools-0.6.14}/rara_tools/normalizers/authorities.py +0 -0
  30. {rara_tools-0.6.12 → rara_tools-0.6.14}/rara_tools/normalizers/base.py +0 -0
  31. {rara_tools-0.6.12 → rara_tools-0.6.14}/rara_tools/normalizers/bibs.py +0 -0
  32. {rara_tools-0.6.12 → rara_tools-0.6.14}/rara_tools/normalizers/viaf.py +0 -0
  33. {rara_tools-0.6.12 → rara_tools-0.6.14}/rara_tools/parsers/marc_parsers/base_parser.py +0 -0
  34. {rara_tools-0.6.12 → rara_tools-0.6.14}/rara_tools/parsers/marc_parsers/ems_parser.py +0 -0
  35. {rara_tools-0.6.12 → rara_tools-0.6.14}/rara_tools/parsers/marc_parsers/location_parser.py +0 -0
  36. {rara_tools-0.6.12 → rara_tools-0.6.14}/rara_tools/parsers/marc_parsers/organization_parser.py +0 -0
  37. {rara_tools-0.6.12 → rara_tools-0.6.14}/rara_tools/parsers/marc_parsers/person_parser.py +0 -0
  38. {rara_tools-0.6.12 → rara_tools-0.6.14}/rara_tools/parsers/marc_parsers/title_parser.py +0 -0
  39. {rara_tools-0.6.12 → rara_tools-0.6.14}/rara_tools/parsers/marc_records/base_record.py +0 -0
  40. {rara_tools-0.6.12 → rara_tools-0.6.14}/rara_tools/parsers/marc_records/ems_record.py +0 -0
  41. {rara_tools-0.6.12 → rara_tools-0.6.14}/rara_tools/parsers/marc_records/person_record.py +0 -0
  42. {rara_tools-0.6.12 → rara_tools-0.6.14}/rara_tools/parsers/marc_records/title_record.py +0 -0
  43. {rara_tools-0.6.12 → rara_tools-0.6.14}/rara_tools/parsers/tools/entity_normalizers.py +0 -0
  44. {rara_tools-0.6.12 → rara_tools-0.6.14}/rara_tools/parsers/tools/marc_converter.py +0 -0
  45. {rara_tools-0.6.12 → rara_tools-0.6.14}/rara_tools/parsers/tools/russian_transliterator.py +0 -0
  46. {rara_tools-0.6.12 → rara_tools-0.6.14}/rara_tools/s3.py +0 -0
  47. {rara_tools-0.6.12 → rara_tools-0.6.14}/rara_tools/task_reporter.py +0 -0
  48. {rara_tools-0.6.12 → rara_tools-0.6.14}/rara_tools/utils.py +0 -0
  49. {rara_tools-0.6.12 → rara_tools-0.6.14}/rara_tools.egg-info/dependency_links.txt +0 -0
  50. {rara_tools-0.6.12 → rara_tools-0.6.14}/rara_tools.egg-info/requires.txt +0 -0
  51. {rara_tools-0.6.12 → rara_tools-0.6.14}/rara_tools.egg-info/top_level.txt +0 -0
  52. {rara_tools-0.6.12 → rara_tools-0.6.14}/requirements.txt +0 -0
  53. {rara_tools-0.6.12 → rara_tools-0.6.14}/setup.cfg +0 -0
  54. {rara_tools-0.6.12 → rara_tools-0.6.14}/tests/test_digar_schema_converter.py +0 -0
  55. {rara_tools-0.6.12 → rara_tools-0.6.14}/tests/test_elastic.py +0 -0
  56. {rara_tools-0.6.12 → rara_tools-0.6.14}/tests/test_elastic_vector_and_search_operations.py +0 -0
  57. {rara_tools-0.6.12 → rara_tools-0.6.14}/tests/test_entity_normalizers.py +0 -0
  58. {rara_tools-0.6.12 → rara_tools-0.6.14}/tests/test_normalization.py +0 -0
  59. {rara_tools-0.6.12 → rara_tools-0.6.14}/tests/test_s3_exceptions.py +0 -0
  60. {rara_tools-0.6.12 → rara_tools-0.6.14}/tests/test_s3_file_operations.py +0 -0
  61. {rara_tools-0.6.12 → rara_tools-0.6.14}/tests/test_sierra_converters.py +0 -0
  62. {rara_tools-0.6.12 → rara_tools-0.6.14}/tests/test_task_reporter.py +0 -0
  63. {rara_tools-0.6.12 → rara_tools-0.6.14}/tests/test_utils.py +0 -0
  64. {rara_tools-0.6.12 → rara_tools-0.6.14}/tests/test_viaf_client.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: rara-tools
3
- Version: 0.6.12
3
+ Version: 0.6.14
4
4
  Summary: Tools to support Kata's work.
5
5
  Classifier: Programming Language :: Python :: 3
6
6
  Classifier: Programming Language :: Python :: 3.10
rara_tools-0.6.14/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.6.14
rara_tools-0.6.14/rara_tools/constants/subject_indexer.py ADDED
@@ -0,0 +1,85 @@
1
+ from rara_tools.constants.normalizers import EntityType
2
+
3
+ COMPONENT_KEY = "subject_indexer"
4
+
5
+
6
+ class Tasks:
7
+ SINGLE = "run_subject_indexer_process"
8
+ PIPELINE = "run_subject_indexer_with_core_logic"
9
+
10
+
11
+ class Queue:
12
+ MAIN = "subject-indexer"
13
+
14
+
15
+ class StatusKeys:
16
+ EXTRACT_KEYWORDS = "extract_keywords"
17
+
18
+
19
+ class URLSource:
20
+ VIAF = "VIAF"
21
+ SIERRA = "Sierra"
22
+ EMS = "EMS"
23
+
24
+ class KeywordType:
25
+ LOC = "Kohamärksõnad"
26
+ TIME = "Ajamärksõnad"
27
+ TOPIC = "Teemamärksõnad"
28
+ GENRE = "Vormimärksõnad"
29
+ TITLE = "Teose pealkiri"
30
+ PER = "Isikunimi"
31
+ ORG = "Kollektiivi nimi"
32
+ EVENT = "Ajutine kollektiiv või sündmus"
33
+ CATEGORY = "Valdkonnamärksõnad"
34
+ UDC = "UDC Summary"
35
+ UDK = "UDK Rahvusbibliograafia"
36
+
37
+
38
+ class KeywordMARC:
39
+ PER = 600
40
+ ORG = 610
41
+ TOPIC = 650
42
+ GENRE = 655
43
+ TIME = 648
44
+ LOC = 651
45
+ EVENT = 611
46
+ TITLE = 630
47
+
48
+ class KeywordSource:
49
+ EMS = "EMS"
50
+ SIERRA = "SIERRA"
51
+ VIAF = "VIAF"
52
+ AI = "AI"
53
+
54
+
55
+ KEYWORD_TYPE_MAP = {
56
+ KeywordType.TIME: EntityType.KEYWORD,
57
+ KeywordType.GENRE: EntityType.KEYWORD,
58
+ KeywordType.LOC: EntityType.LOC,
59
+ KeywordType.PER: EntityType.PER,
60
+ KeywordType.ORG: EntityType.ORG,
61
+ KeywordType.TOPIC: EntityType.KEYWORD,
62
+ KeywordType.TITLE: EntityType.TITLE,
63
+ KeywordType.EVENT: EntityType.ORG
64
+ }
65
+
66
+ KEYWORD_MARC_MAP = {
67
+ KeywordType.LOC: KeywordMARC.LOC,
68
+ KeywordType.TIME: KeywordMARC.TIME,
69
+ KeywordType.TOPIC: KeywordMARC.TOPIC,
70
+ KeywordType.GENRE: KeywordMARC.GENRE,
71
+ KeywordType.TITLE: KeywordMARC.TITLE,
72
+ KeywordType.ORG: KeywordMARC.ORG,
73
+ KeywordType.PER: KeywordMARC.PER,
74
+ KeywordType.EVENT: KeywordMARC.EVENT
75
+ }
76
+
77
+ KEYWORD_TYPES_TO_IGNORE = [
78
+ KeywordType.CATEGORY,
79
+ KeywordType.UDC,
80
+ KeywordType.UDK
81
+ ]
82
+
83
+ EMS_ENTITY_TYPES = [EntityType.KEYWORD, EntityType.LOC]
84
+ SIERRA_ENTITY_TYPES = [EntityType.PER, EntityType.ORG, EntityType.TITLE]
85
+ VIAF_ENTITY_TYPES = [EntityType.PER, EntityType.ORG, EntityType.TITLE]
rara_tools-0.6.14/rara_tools/formatters.py ADDED
@@ -0,0 +1,106 @@
1
+ from typing import List, Tuple, Any
2
+ from rara_tools.constants.subject_indexer import (
3
+ EntityType, KeywordType, KeywordMARC, KeywordSource, URLSource,
4
+ KEYWORD_TYPE_MAP, KEYWORD_MARC_MAP, KEYWORD_TYPES_TO_IGNORE,
5
+ EMS_ENTITY_TYPES, SIERRA_ENTITY_TYPES, VIAF_ENTITY_TYPES
6
+ )
7
+
8
+ def _get_keyword_source(linked_doc: Any, entity_type: str, is_linked: bool
9
+ ) -> str:
10
+ """ Find keyword source.
11
+ """
12
+ if not is_linked:
13
+ source = KeywordSource.AI
14
+ elif entity_type in EMS_ENTITY_TYPES:
15
+ source = KeywordSource.EMS
16
+ elif entity_type in SIERRA_ENTITY_TYPES:
17
+ if linked_doc and linked_doc.elastic:
18
+ source = KeywordSource.SIERRA
19
+ elif linked_doc and linked_doc.viaf:
20
+ source = KeywordSource.VIAF
21
+ else:
22
+ source = KeywordSource.AI
23
+ else:
24
+ source = KeywordSource.AI
25
+ return source
26
+
27
+ def _find_indicators(entity_type: str, entity: str,
28
+ is_linked: bool
29
+ ) -> Tuple[str, str]:
30
+ """ Find MARC indicators 1 and 2.
31
+ """
32
+ ind1 = " "
33
+ ind2 = " "
34
+ if entity_type in SIERRA_ENTITY_TYPES:
35
+ if entity_type == EntityType.PER:
36
+ if "," in entity:
37
+ ind1 = "1"
38
+ else:
39
+ ind1 = "0"
40
+ else:
41
+ # 1 märksõna esimeseks elemendiks võimupiirkonna nimi, nt:
42
+ # (a) Eesti (b) Riigikogu - raske automaatselt määrata
43
+ # 2 märksõna esimeseks elemendiks nimi pärijärjestuses
44
+ ind1 = "2"
45
+ if not is_linked:
46
+ ind2 = "4"
47
+ elif entity_type in EMS_ENTITY_TYPES:
48
+ ind2 = "4"
49
+ return (ind1, ind2)
50
+
51
+
52
+ def format_keywords(flat_keywords: List[dict]) -> dict:
53
+ """ Formats unlinked keywords for Kata CORE.
54
+ """
55
+ ignored_keywords = []
56
+ filtered_keywords = []
57
+
58
+ for keyword_dict in flat_keywords:
59
+ keyword_type = keyword_dict.get("entity_type")
60
+ if keyword_type in KEYWORD_TYPES_TO_IGNORE:
61
+ ignored_keywords.append(keyword_dict)
62
+ else:
63
+ filtered_keywords.append(keyword_dict)
64
+
65
+ formatted_keywords = {
66
+ "keywords": [],
67
+ "other": ignored_keywords
68
+ }
69
+
70
+ for keyword_dict in filtered_keywords:
71
+ original_keyword = keyword_dict.get("keyword")
72
+ keyword_type = keyword_dict.get("entity_type")
73
+ entity_type = KEYWORD_TYPE_MAP.get(keyword_type, "")
74
+ marc_field = KEYWORD_MARC_MAP.get(str(keyword_type), "")
75
+ lang = keyword_dict.get("language", "")
76
+
77
+ ind1, ind2 = _find_indicators(
78
+ entity_type=entity_type,
79
+ entity=original_keyword,
80
+ is_linked=False
81
+ )
82
+ keyword_source = _get_keyword_source(
83
+ linked_doc=None,
84
+ is_linked=False,
85
+ entity_type=entity_type
86
+ )
87
+ new_keyword_dict = {
88
+ "dates": "",
89
+ "indicator1": ind1,
90
+ "indicator2": ind2,
91
+ "is_linked": False,
92
+ "keyword_source": keyword_source,
93
+ "lang": lang,
94
+ "location": "",
95
+ "marc_field": marc_field,
96
+ "numeration": "",
97
+ "organisation_sub_unit": "",
98
+ "original_keyword": original_keyword,
99
+ "persons_title": "",
100
+ "url": "",
101
+ "url_source": ""
102
+ }
103
+ new_keyword_dict.update(keyword_dict)
104
+ formatted_keywords["keywords"].append(new_keyword_dict)
105
+
106
+ return formatted_keywords
@@ -6,13 +6,8 @@ import regex as re
6
6
  import json
7
7
 
8
8
  # TODO: indikaatorid ind1 väljadel 100 ja 400?
9
- """
10
- |c asutuse konverentsi toimumise koht (MK)
11
9
 
12
- |d asutuse konverentsi toimumise aeg (K)
13
10
 
14
- |n asutuse konverentsi järjenumber (K)
15
- """
16
11
  class OrganizationRecord(BaseRecord):
17
12
  """ Generates a simplified organization JSON record
18
13
  from a pymarc MARC record.
@@ -45,6 +40,7 @@ class OrganizationRecord(BaseRecord):
45
40
  self.__name_specification: str = ""
46
41
  self.__dates: str = ""
47
42
  self.__location: str = ""
43
+ self.__numeration: str = ""
48
44
  self.__name_variations: List[str] = []
49
45
  self.__source: str = ""
50
46
  self.__description: str = ""
@@ -61,7 +57,10 @@ class OrganizationRecord(BaseRecord):
61
57
 
62
58
 
63
59
  def _clean_value(self, value: str) -> str:
64
- cleaned_value = value.strip("., ")
60
+ try:
61
+ cleaned_value = value.strip("., ")
62
+ except Exception as e:
63
+ cleaned_value = ""
65
64
  return cleaned_value
66
65
 
67
66
  def _merge_and_clean(self, value: dict, keys: List[str]) -> str:
@@ -103,10 +102,10 @@ class OrganizationRecord(BaseRecord):
103
102
  if not self.__dates:
104
103
  values = self.get_values(
105
104
  marc_ids=self.__name_field_id,
106
- subfield_id=["d"]
105
+ subfield_id="d"
107
106
  )
108
107
  if values:
109
- self.__dates = self.__clean_value(values[0])
108
+ self.__dates = self._clean_value(values[0])
110
109
  return self.__dates
111
110
 
112
111
  @property
@@ -114,12 +113,23 @@ class OrganizationRecord(BaseRecord):
114
113
  if not self.__location:
115
114
  values = self.get_values(
116
115
  marc_ids=self.__name_field_id,
117
- subfield_id=["c"]
116
+ subfield_id="c"
118
117
  )
119
118
  if values:
120
- self.__location = self.__clean_value(values[0])
119
+ self.__location = self._clean_value(values[0])
121
120
  return self.__location
122
121
 
122
+ @property
123
+ def numeration(self) -> str:
124
+ if not self.__numeration:
125
+ values = self.get_values(
126
+ marc_ids=self.__name_field_id,
127
+ subfield_id="n"
128
+ )
129
+ if values:
130
+ self.__numeration = self._clean_value(values[0])
131
+ return self.__numeration
132
+
123
133
  @property
124
134
  def acronyms(self) -> List[str]:
125
135
  if not self.__acronyms:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: rara-tools
3
- Version: 0.6.12
3
+ Version: 0.6.14
4
4
  Summary: Tools to support Kata's work.
5
5
  Classifier: Programming Language :: Python :: 3
6
6
  Classifier: Programming Language :: Python :: 3.10
@@ -8,6 +8,7 @@ rara_tools/decorators.py
8
8
  rara_tools/digar_schema_converter.py
9
9
  rara_tools/elastic.py
10
10
  rara_tools/exceptions.py
11
+ rara_tools/formatters.py
11
12
  rara_tools/s3.py
12
13
  rara_tools/task_reporter.py
13
14
  rara_tools/utils.py
@@ -48,6 +49,7 @@ tests/test_digar_schema_converter.py
48
49
  tests/test_elastic.py
49
50
  tests/test_elastic_vector_and_search_operations.py
50
51
  tests/test_entity_normalizers.py
52
+ tests/test_formatters.py
51
53
  tests/test_marc_parsers.py
52
54
  tests/test_normalization.py
53
55
  tests/test_s3_exceptions.py
rara_tools-0.6.14/tests/test_formatters.py ADDED
@@ -0,0 +1,15 @@
1
+ import pytest
2
+ import os
3
+ from pprint import pprint
4
+ from rara_tools.formatters import format_keywords
5
+ from tests.test_utils import read_json_file
6
+
7
+ ROOT_DIR = os.path.join("tests", "test_data", "formatter")
8
+ INPUT_KEYWORDS_FILE_PATH = os.path.join(ROOT_DIR, "keywords.json")
9
+
10
+ INPUT_KEYWORDS = read_json_file(INPUT_KEYWORDS_FILE_PATH)
11
+
12
+ def test_formatting_keywords_for_core():
13
+ formatted_keywords = format_keywords(INPUT_KEYWORDS)
14
+ assert formatted_keywords
15
+ assert isinstance(formatted_keywords, dict)
@@ -6,6 +6,7 @@ from rara_tools.parsers.marc_parsers.organization_parser import OrganizationsMAR
6
6
  from rara_tools.parsers.marc_parsers.location_parser import LocationMARCParser
7
7
  from rara_tools.parsers.marc_parsers.title_parser import TitlesMARCParser
8
8
  from rara_tools.parsers.marc_records.person_record import PersonRecord
9
+ from rara_tools.parsers.marc_records.organization_record import OrganizationRecord
9
10
  from tests.test_utils import read_json_file
10
11
 
11
12
  ROOT_DIR = os.path.join("tests", "test_data", "marc_records")
@@ -15,7 +16,8 @@ EMS_TEST_FILE = os.path.join(MARC_ROOT_DIR, "ems_test_subset.mrc")
15
16
  PER_TEST_FILE = os.path.join(MARC_ROOT_DIR, "per_test_subset.mrc")
16
17
  ORG_TEST_FILE = os.path.join(MARC_ROOT_DIR, "org_test_subset.mrc")
17
18
  TITLE_TEST_FILE = os.path.join(MARC_ROOT_DIR, "title_test_subset.mrc")
18
- JSON_TEST_FILE = os.path.join(JSON_ROOT_DIR, "marc_json_record.json")
19
+ PER_JSON_TEST_FILE = os.path.join(JSON_ROOT_DIR, "per_marc_json_record.json")
20
+ ORG_JSON_TEST_FILE = os.path.join(JSON_ROOT_DIR, "org_marc_json_record.json")
19
21
 
20
22
  def test_ems_parser_without_variations():
21
23
  ems_marc_parser = EMSMARCParser(EMS_TEST_FILE, add_variations=False)
@@ -61,7 +63,17 @@ def test_title_parser_with_variations():
61
63
  assert "link_variations" in record
62
64
  assert len(record["link_variations"]) > 0
63
65
 
64
- def test_creating_marc_record_with_json_input():
65
- json_data = read_json_file(JSON_TEST_FILE)
66
+ def test_creating_per_marc_record_with_json_input():
67
+ json_data = read_json_file(PER_JSON_TEST_FILE)
66
68
  record = PersonRecord(json_data)
67
69
  assert record.name == "Koidula, Lydia"
70
+
71
+
72
+ def test_creating_org_marc_record_with_json_input():
73
+ json_data = read_json_file(ORG_JSON_TEST_FILE)
74
+ record = OrganizationRecord(json_data)
75
+ assert record.original_name.get("a") == "Eesti"
76
+ assert record.original_name.get("b") == "Riigikogu"
77
+ assert not record.location
78
+ assert not record.dates
79
+ assert not record.numeration
rara_tools-0.6.12/VERSION DELETED
@@ -1 +0,0 @@
1
- 0.6.12
rara_tools-0.6.12/rara_tools/constants/subject_indexer.py DELETED
@@ -1,14 +0,0 @@
1
- COMPONENT_KEY = "subject_indexer"
2
-
3
-
4
- class Tasks:
5
- SINGLE = "run_subject_indexer_process"
6
- PIPELINE = "run_subject_indexer_with_core_logic"
7
-
8
-
9
- class Queue:
10
- MAIN = "subject-indexer"
11
-
12
-
13
- class StatusKeys:
14
- EXTRACT_KEYWORDS = "extract_keywords"
File without changes
File without changes
File without changes
File without changes