rara-tools 0.7.7__tar.gz → 0.7.8__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of rara-tools might be problematic. Click here for more details.

Files changed (67)
  1. {rara_tools-0.7.7/rara_tools.egg-info → rara_tools-0.7.8}/PKG-INFO +1 -1
  2. rara_tools-0.7.8/VERSION +1 -0
  3. {rara_tools-0.7.7 → rara_tools-0.7.8}/rara_tools/digar_schema_converter.py +29 -4
  4. {rara_tools-0.7.7 → rara_tools-0.7.8/rara_tools.egg-info}/PKG-INFO +1 -1
  5. {rara_tools-0.7.7 → rara_tools-0.7.8}/tests/test_digar_schema_converter.py +56 -25
  6. rara_tools-0.7.7/VERSION +0 -1
  7. {rara_tools-0.7.7 → rara_tools-0.7.8}/LICENSE.md +0 -0
  8. {rara_tools-0.7.7 → rara_tools-0.7.8}/README.md +0 -0
  9. {rara_tools-0.7.7 → rara_tools-0.7.8}/pyproject.toml +0 -0
  10. {rara_tools-0.7.7 → rara_tools-0.7.8}/rara_tools/constants/__init__.py +0 -0
  11. {rara_tools-0.7.7 → rara_tools-0.7.8}/rara_tools/constants/digitizer.py +0 -0
  12. {rara_tools-0.7.7 → rara_tools-0.7.8}/rara_tools/constants/general.py +0 -0
  13. {rara_tools-0.7.7 → rara_tools-0.7.8}/rara_tools/constants/language_evaluator.py +0 -0
  14. {rara_tools-0.7.7 → rara_tools-0.7.8}/rara_tools/constants/linker.py +0 -0
  15. {rara_tools-0.7.7 → rara_tools-0.7.8}/rara_tools/constants/meta_extractor.py +0 -0
  16. {rara_tools-0.7.7 → rara_tools-0.7.8}/rara_tools/constants/normalizers.py +0 -0
  17. {rara_tools-0.7.7 → rara_tools-0.7.8}/rara_tools/constants/parsers.py +0 -0
  18. {rara_tools-0.7.7 → rara_tools-0.7.8}/rara_tools/constants/subject_indexer.py +0 -0
  19. {rara_tools-0.7.7 → rara_tools-0.7.8}/rara_tools/converters.py +0 -0
  20. {rara_tools-0.7.7 → rara_tools-0.7.8}/rara_tools/core_formatters/core_formatter.py +0 -0
  21. {rara_tools-0.7.7 → rara_tools-0.7.8}/rara_tools/core_formatters/formatted_keyword.py +0 -0
  22. {rara_tools-0.7.7 → rara_tools-0.7.8}/rara_tools/core_formatters/formatted_meta.py +0 -0
  23. {rara_tools-0.7.7 → rara_tools-0.7.8}/rara_tools/core_formatters/formatted_object.py +0 -0
  24. {rara_tools-0.7.7 → rara_tools-0.7.8}/rara_tools/decorators.py +0 -0
  25. {rara_tools-0.7.7 → rara_tools-0.7.8}/rara_tools/elastic.py +0 -0
  26. {rara_tools-0.7.7 → rara_tools-0.7.8}/rara_tools/exceptions.py +0 -0
  27. {rara_tools-0.7.7 → rara_tools-0.7.8}/rara_tools/normalizers/__init__.py +0 -0
  28. {rara_tools-0.7.7 → rara_tools-0.7.8}/rara_tools/normalizers/authorities.py +0 -0
  29. {rara_tools-0.7.7 → rara_tools-0.7.8}/rara_tools/normalizers/base.py +0 -0
  30. {rara_tools-0.7.7 → rara_tools-0.7.8}/rara_tools/normalizers/bibs.py +0 -0
  31. {rara_tools-0.7.7 → rara_tools-0.7.8}/rara_tools/normalizers/reader.py +0 -0
  32. {rara_tools-0.7.7 → rara_tools-0.7.8}/rara_tools/normalizers/viaf.py +0 -0
  33. {rara_tools-0.7.7 → rara_tools-0.7.8}/rara_tools/parsers/marc_parsers/base_parser.py +0 -0
  34. {rara_tools-0.7.7 → rara_tools-0.7.8}/rara_tools/parsers/marc_parsers/ems_parser.py +0 -0
  35. {rara_tools-0.7.7 → rara_tools-0.7.8}/rara_tools/parsers/marc_parsers/location_parser.py +0 -0
  36. {rara_tools-0.7.7 → rara_tools-0.7.8}/rara_tools/parsers/marc_parsers/organization_parser.py +0 -0
  37. {rara_tools-0.7.7 → rara_tools-0.7.8}/rara_tools/parsers/marc_parsers/person_parser.py +0 -0
  38. {rara_tools-0.7.7 → rara_tools-0.7.8}/rara_tools/parsers/marc_parsers/title_parser.py +0 -0
  39. {rara_tools-0.7.7 → rara_tools-0.7.8}/rara_tools/parsers/marc_records/base_record.py +0 -0
  40. {rara_tools-0.7.7 → rara_tools-0.7.8}/rara_tools/parsers/marc_records/ems_record.py +0 -0
  41. {rara_tools-0.7.7 → rara_tools-0.7.8}/rara_tools/parsers/marc_records/organization_record.py +0 -0
  42. {rara_tools-0.7.7 → rara_tools-0.7.8}/rara_tools/parsers/marc_records/person_record.py +0 -0
  43. {rara_tools-0.7.7 → rara_tools-0.7.8}/rara_tools/parsers/marc_records/title_record.py +0 -0
  44. {rara_tools-0.7.7 → rara_tools-0.7.8}/rara_tools/parsers/tools/entity_normalizers.py +0 -0
  45. {rara_tools-0.7.7 → rara_tools-0.7.8}/rara_tools/parsers/tools/marc_converter.py +0 -0
  46. {rara_tools-0.7.7 → rara_tools-0.7.8}/rara_tools/parsers/tools/russian_transliterator.py +0 -0
  47. {rara_tools-0.7.7 → rara_tools-0.7.8}/rara_tools/s3.py +0 -0
  48. {rara_tools-0.7.7 → rara_tools-0.7.8}/rara_tools/task_reporter.py +0 -0
  49. {rara_tools-0.7.7 → rara_tools-0.7.8}/rara_tools/utils.py +0 -0
  50. {rara_tools-0.7.7 → rara_tools-0.7.8}/rara_tools.egg-info/SOURCES.txt +0 -0
  51. {rara_tools-0.7.7 → rara_tools-0.7.8}/rara_tools.egg-info/dependency_links.txt +0 -0
  52. {rara_tools-0.7.7 → rara_tools-0.7.8}/rara_tools.egg-info/requires.txt +0 -0
  53. {rara_tools-0.7.7 → rara_tools-0.7.8}/rara_tools.egg-info/top_level.txt +0 -0
  54. {rara_tools-0.7.7 → rara_tools-0.7.8}/requirements.txt +0 -0
  55. {rara_tools-0.7.7 → rara_tools-0.7.8}/setup.cfg +0 -0
  56. {rara_tools-0.7.7 → rara_tools-0.7.8}/tests/test_elastic.py +0 -0
  57. {rara_tools-0.7.7 → rara_tools-0.7.8}/tests/test_elastic_vector_and_search_operations.py +0 -0
  58. {rara_tools-0.7.7 → rara_tools-0.7.8}/tests/test_entity_normalizers.py +0 -0
  59. {rara_tools-0.7.7 → rara_tools-0.7.8}/tests/test_formatters.py +0 -0
  60. {rara_tools-0.7.7 → rara_tools-0.7.8}/tests/test_marc_parsers.py +0 -0
  61. {rara_tools-0.7.7 → rara_tools-0.7.8}/tests/test_normalization.py +0 -0
  62. {rara_tools-0.7.7 → rara_tools-0.7.8}/tests/test_s3_exceptions.py +0 -0
  63. {rara_tools-0.7.7 → rara_tools-0.7.8}/tests/test_s3_file_operations.py +0 -0
  64. {rara_tools-0.7.7 → rara_tools-0.7.8}/tests/test_sierra_converters.py +0 -0
  65. {rara_tools-0.7.7 → rara_tools-0.7.8}/tests/test_task_reporter.py +0 -0
  66. {rara_tools-0.7.7 → rara_tools-0.7.8}/tests/test_utils.py +0 -0
  67. {rara_tools-0.7.7 → rara_tools-0.7.8}/tests/test_viaf_client.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: rara-tools
3
- Version: 0.7.7
3
+ Version: 0.7.8
4
4
  Summary: Tools to support Kata's work.
5
5
  Classifier: Programming Language :: Python :: 3
6
6
  Classifier: Programming Language :: Python :: 3.10
@@ -0,0 +1 @@
1
+ 0.7.8
@@ -241,8 +241,8 @@ class DIGARSchemaConverter:
241
241
  min_language_ratio: float = 0.2,
242
242
  convert_ratio: bool = False
243
243
  ) -> NoReturn:
244
- """ Initialize DIGARSchemaConverter object.
245
-
244
+ """ Initialize DIGARSchemaConverter object.
245
+
246
246
  Parameters
247
247
  ----------
248
248
  digitizer_output: dict
@@ -261,7 +261,7 @@ class DIGARSchemaConverter:
261
261
  be added to the final output.
262
262
  convert_ratio: bool
263
263
  If enabled, all ratios are converted into percentages.
264
-
264
+
265
265
  """
266
266
  self.__digitizer_output: dict = digitizer_output
267
267
  self.__min_language_ratio: float = min_language_ratio
@@ -280,6 +280,7 @@ class DIGARSchemaConverter:
280
280
  self.__dc_origin: dict = {}
281
281
  self.__dc_identifier: List[dict] = []
282
282
  self.__doc_id: str = ""
283
+ self.__page_count: int = None
283
284
 
284
285
  self.__doc_schemas = DocSchemas(
285
286
  doc_meta=self.doc_meta,
@@ -303,6 +304,28 @@ class DIGARSchemaConverter:
303
304
  page_number = _first_segment.get("page")
304
305
  return page_number
305
306
 
307
+ def _add_dummy_pages(self, docs: List[dict]):
308
+ for doc in docs:
309
+ if not doc.get("page"):
310
+ doc["page"] = self.dummy_page
311
+ return docs
312
+
313
+ @property
314
+ def dummy_page(self) -> int:
315
+ """ Get page number to add for images,
316
+ if actual page is missing. Currently returns
317
+ a new (non-existing) final page.
318
+ """
319
+ return self.page_count+1
320
+
321
+ @property
322
+ def page_count(self) -> int:
323
+ """ Returns total page count of the document.
324
+ """
325
+ if not self.__page_count:
326
+ self.__page_count = self.__digitizer_output.get("doc_meta", {}).get("pages", {}).get("count", 0)
327
+ return self.__page_count
328
+
306
329
  @property
307
330
  def doc_id(self) -> str:
308
331
  """ Retrieves document ID to use for generating
@@ -327,7 +350,8 @@ class DIGARSchemaConverter:
327
350
  @property
328
351
  def images(self) -> List[dict]:
329
352
  if not self.__images:
330
- self.__images = self.__digitizer_output.get("images")
353
+ images = self.__digitizer_output.get("images")
354
+ self.__images = self._add_dummy_pages(images)
331
355
  return self.__images
332
356
 
333
357
  @property
@@ -344,6 +368,7 @@ class DIGARSchemaConverter:
344
368
  mapped[text["start_page"]]["texts"].append(text)
345
369
  for img in self.images:
346
370
  mapped[img["page"]]["images"].append(img)
371
+ #print(mapped.items())
347
372
 
348
373
  self.__page_mappings = [
349
374
  v for k, v in sorted(list(mapped.items()), key=lambda x: x[0])
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: rara-tools
3
- Version: 0.7.7
3
+ Version: 0.7.8
4
4
  Summary: Tools to support Kata's work.
5
5
  Classifier: Programming Language :: Python :: 3
6
6
  Classifier: Programming Language :: Python :: 3.10
@@ -12,11 +12,19 @@ def load_json(file_path: str):
12
12
 
13
13
 
14
14
  TEST_DIGITIZER_OUTPUT_FILE = os.path.join(".", "tests", "test_data", "b1267058_test_digitizer_output.json")
15
+ TEST_DIGITIZER_OUTPUT_FILE_2 = os.path.join(".", "tests", "test_data", "b5493797_test_digitizer_output_shortened_empty_image_pages.json")
16
+
15
17
  TEST_DIGITIZER_OUTPUT = load_json(TEST_DIGITIZER_OUTPUT_FILE)
18
+ TEST_DIGITIZER_OUTPUT_2 = load_json(TEST_DIGITIZER_OUTPUT_FILE_2)
19
+
16
20
  TEST_SIERRA_ID = "b1267058"
17
21
  TEST_GENERATED_ID = "hsasaHSAHHGDhb"
18
22
  TEST_PERMALINK = "https://www.digar.ee/b1267058"
19
23
 
24
+ TEST_SIERRA_ID_2 = "b5493797"
25
+ TEST_GENERATED_ID_2 = "mzmudhju38hd3ndlk"
26
+ TEST_PERMALINK_2 = "https://www.digar.ee/b5493797"
27
+
20
28
  def test_digar_schema_converstion_default():
21
29
  converter = DIGARSchemaConverter(
22
30
  digitizer_output=TEST_DIGITIZER_OUTPUT,
@@ -24,25 +32,25 @@ def test_digar_schema_converstion_default():
24
32
  generated_id=TEST_GENERATED_ID
25
33
  )
26
34
  digar_schema = converter.digar_schema
27
-
35
+
28
36
  # check that all necessary fields are present
29
37
  assert "dc:language" in digar_schema
30
38
  assert "dcterms:provenance" in digar_schema
31
39
  assert "dc:identifier" in digar_schema
32
40
  assert "dcterms:hasPart" in digar_schema
33
41
  assert "dcterms:conformsTo" in digar_schema
34
-
42
+
35
43
  languages = [lang.get("value") for lang in digar_schema.get("dc:language")]
36
44
  # check that languages are converted into ISO 639-2
37
45
  for lang in languages:
38
46
  assert len(lang) == 3
39
-
40
-
47
+
48
+
41
49
  # check that ratio is converted into percentage
42
50
  text_quality = digar_schema.get("dcterms:conformsTo")[0].get("value")
43
51
  assert isinstance(text_quality, str)
44
-
45
-
52
+
53
+
46
54
  def test_digar_schema_id_generation():
47
55
  """ Tests ID generation logic.
48
56
  """
@@ -51,38 +59,38 @@ def test_digar_schema_id_generation():
51
59
  sierra_id=TEST_SIERRA_ID,
52
60
  generated_id=TEST_GENERATED_ID,
53
61
  permalink=TEST_PERMALINK
54
-
62
+
55
63
  )
56
-
64
+
57
65
  #If permalink is given, this should be used as base ID
58
66
  digar_schema = converter.digar_schema
59
67
  first_segment_id = digar_schema["dcterms:hasPart"][0]["hasPart"][0]["@id"]
60
-
68
+
61
69
  assert first_segment_id.startswith(TEST_PERMALINK)
62
-
70
+
63
71
  converter = DIGARSchemaConverter(
64
72
  digitizer_output=TEST_DIGITIZER_OUTPUT,
65
73
  sierra_id=TEST_SIERRA_ID,
66
74
  generated_id=TEST_GENERATED_ID
67
75
  )
68
-
76
+
69
77
  #If permalink is NOT given, Sierra ID should be used as base ID
70
78
  digar_schema = converter.digar_schema
71
79
  first_segment_id = digar_schema["dcterms:hasPart"][0]["hasPart"][0]["@id"]
72
80
  assert first_segment_id.startswith(TEST_SIERRA_ID)
73
-
74
-
81
+
82
+
75
83
  converter = DIGARSchemaConverter(
76
84
  digitizer_output=TEST_DIGITIZER_OUTPUT,
77
85
  generated_id=TEST_GENERATED_ID
78
86
  )
79
-
87
+
80
88
  #If neither permalink nor Sierra ID is given, generated ID should be used as base ID
81
89
  digar_schema = converter.digar_schema
82
90
  first_segment_id = digar_schema["dcterms:hasPart"][0]["hasPart"][0]["@id"]
83
91
  assert first_segment_id.startswith(TEST_GENERATED_ID)
84
-
85
-
92
+
93
+
86
94
  def test_restricting_languages_with_ratio():
87
95
  """ Checks that param `min_language_ratio` influences
88
96
  the number of output languages.
@@ -93,41 +101,64 @@ def test_restricting_languages_with_ratio():
93
101
  generated_id=TEST_GENERATED_ID,
94
102
  permalink=TEST_PERMALINK,
95
103
  min_language_ratio=0
96
-
104
+
97
105
  )
98
-
106
+
99
107
  #If permalink is given, this should be used as base ID
100
108
  digar_schema = converter.digar_schema
101
109
  languages = [lang.get("value") for lang in digar_schema.get("dc:language")]
102
110
  assert len(languages) == 7
103
-
111
+
104
112
  converter = DIGARSchemaConverter(
105
113
  digitizer_output=TEST_DIGITIZER_OUTPUT,
106
114
  sierra_id=TEST_SIERRA_ID,
107
115
  generated_id=TEST_GENERATED_ID,
108
116
  permalink=TEST_PERMALINK,
109
117
  min_language_ratio=0.02
110
-
118
+
111
119
  )
112
-
120
+
113
121
  #If permalink is given, this should be used as base ID
114
122
  digar_schema = converter.digar_schema
115
123
  languages = [lang.get("value") for lang in digar_schema.get("dc:language")]
116
124
  assert len(languages) == 2
117
-
125
+
118
126
  converter = DIGARSchemaConverter(
119
127
  digitizer_output=TEST_DIGITIZER_OUTPUT,
120
128
  sierra_id=TEST_SIERRA_ID,
121
129
  generated_id=TEST_GENERATED_ID,
122
130
  permalink=TEST_PERMALINK,
123
131
  min_language_ratio=0.5
124
-
132
+
125
133
  )
126
-
134
+
127
135
  #If permalink is given, this should be used as base ID
128
136
  digar_schema = converter.digar_schema
129
137
  languages = [lang.get("value") for lang in digar_schema.get("dc:language")]
130
138
  assert len(languages) == 1
131
-
132
139
 
133
140
 
141
+ def test_digar_schema_converstion_with_missing_image_pages():
142
+ converter = DIGARSchemaConverter(
143
+ digitizer_output=TEST_DIGITIZER_OUTPUT_2,
144
+ sierra_id=TEST_SIERRA_ID_2,
145
+ generated_id=TEST_GENERATED_ID_2
146
+ )
147
+ digar_schema = converter.digar_schema
148
+
149
+ # check that all necessary fields are present
150
+ assert "dc:language" in digar_schema
151
+ assert "dcterms:provenance" in digar_schema
152
+ assert "dc:identifier" in digar_schema
153
+ assert "dcterms:hasPart" in digar_schema
154
+ assert "dcterms:conformsTo" in digar_schema
155
+
156
+ languages = [lang.get("value") for lang in digar_schema.get("dc:language")]
157
+ # check that languages are converted into ISO 639-2
158
+ for lang in languages:
159
+ assert len(lang) == 3
160
+
161
+
162
+ # check that ratio is converted into percentage
163
+ text_quality = digar_schema.get("dcterms:conformsTo")[0].get("value")
164
+ assert isinstance(text_quality, str)
rara_tools-0.7.7/VERSION DELETED
@@ -1 +0,0 @@
1
- 0.7.7
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes