rara-tools 0.7.0__py3-none-any.whl → 0.7.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of rara-tools might be problematic. Click here for more details.

@@ -20,7 +20,9 @@ class Queue:
20
20
  DOWNLOAD = "download"
21
21
  FINISH = "finish"
22
22
  OCR = "ocr"
23
+ UTILITY = "digitizer-utility"
23
24
 
24
25
 
25
26
  class Tasks:
26
27
  START_DIGITIZER_PIPELINE = "start_digitizer_pipeline"
28
+ PURGE_MODELS = "purge_unused_digitizer_models"
@@ -20,6 +20,7 @@ class Tasks:
20
20
  UPDATE_TASK_VALUES = "update_task_values"
21
21
  MODEL_UPDATE = "component_model_update"
22
22
  RUN_POST_TASK_COMPLETION_TASKS = "run_post_task_completion_tasks"
23
+ PURGE_MODELS = "purge_unused_models"
23
24
 
24
25
 
25
26
  class Models:
@@ -6,10 +6,12 @@ COMPONENT_KEY = "subject_indexer"
6
6
  class Tasks:
7
7
  SINGLE = "run_subject_indexer_process"
8
8
  PIPELINE = "run_subject_indexer_with_core_logic"
9
+ PURGE_MODELS = "purge_unused_subjectindexer_models"
9
10
 
10
11
 
11
12
  class Queue:
12
13
  MAIN = "subject-indexer"
14
+ UTILITY = "subjectindexer-utility"
13
15
 
14
16
 
15
17
  class StatusKeys:
@@ -21,6 +23,7 @@ class URLSource:
21
23
  SIERRA = "Sierra"
22
24
  EMS = "EMS"
23
25
 
26
+
24
27
  class KeywordType:
25
28
  LOC = "Kohamärksõnad"
26
29
  TIME = "Ajamärksõnad"
@@ -45,6 +48,7 @@ class KeywordMARC:
45
48
  EVENT = 611
46
49
  TITLE = 630
47
50
 
51
+
48
52
  class KeywordSource:
49
53
  EMS = "EMS"
50
54
  SIERRA = "SIERRA"
@@ -1,6 +1,7 @@
1
1
  from datetime import datetime
2
2
  from pymarc import (Field, Subfield, JSONReader, Record)
3
3
  from typing import List, Optional, Iterator
4
+ from rara_tools.normalizers.reader import SafeJSONReader
4
5
 
5
6
  from rara_tools.normalizers.viaf import VIAFRecord, VIAFClient
6
7
  from rara_tools.constants.normalizers import (
@@ -35,7 +36,6 @@ class RecordNormalizer:
35
36
  self.ALLOW_EDIT_FIELDS = ALLOW_EDIT_FIELDS
36
37
  # include, if should be added alongside existing fields
37
38
  self.REPEATABLE_FIELDS = REPEATABLE_FIELDS
38
-
39
39
  self.records_extra_data = []
40
40
  self.records = self._setup_records(linking_results, sierra_data)
41
41
  self.sierra_data = sierra_data
@@ -92,8 +92,56 @@ class RecordNormalizer:
92
92
 
93
93
  all_records = linked_records + (sierra_data or [])
94
94
 
95
- return JSONReader(json.dumps(all_records,
96
- ensure_ascii=False), stream=False)
95
+ return SafeJSONReader(
96
+ json.dumps(all_records, ensure_ascii=False),
97
+ )
98
+
99
+ def _setup_records(self, linking_results: List[dict], sierra_data: List[dict]) -> JSONReader:
100
+ """Setup initial MARC records and data.
101
+
102
+ For linked entities:
103
+ 1. Try to get single linked normalized record from KATA elastic. If more than one found, skip.
104
+ 2. If 0 matches, search from VIAF and if 1 result found, create a new authority record from the data.
105
+ 3. If none or more than one responses found, use only Classificator data (coming from Linker?).
106
+ """
107
+ linked_records = []
108
+
109
+ for linked in linking_results or []:
110
+ if not isinstance(linked, dict):
111
+ continue
112
+
113
+ entity = linked.get("original_entity")
114
+ linked_info = linked.get("linked_info", [])
115
+
116
+ if not isinstance(linked_info, list) or not linked_info:
117
+ continue
118
+
119
+ if len(linked_info) == 1:
120
+ linked_item = linked_info[0]
121
+ if not isinstance(linked_item, dict):
122
+ continue
123
+
124
+ linked_records.append(linked_item.get("json", {}))
125
+ self.records_extra_data.append({
126
+ "entity": entity,
127
+ "viaf": linked_item.get("viaf", {}),
128
+ "type": "linked",
129
+ "edited": True
130
+ })
131
+
132
+ self.records_extra_data.extend(
133
+ {
134
+ "sierraID": obj.get("sierraID"),
135
+ "type": "sierra",
136
+ "edited": True
137
+ }
138
+ for obj in (sierra_data or [])
139
+ if isinstance(obj, dict)
140
+ )
141
+
142
+ all_records = linked_records + (sierra_data or [])
143
+
144
+ return SafeJSONReader(json.dumps(all_records, ensure_ascii=False))
97
145
 
98
146
  @staticmethod
99
147
  def current_timestamp():
@@ -230,11 +278,15 @@ class RecordNormalizer:
230
278
 
231
279
  def _get_viaf_search_term(self, record: Record, entity: Optional[str]) -> Optional[str]:
232
280
  """ prioritize entity name, if not available, use author name. """
233
-
234
281
  if entity:
235
282
  return entity
236
- else:
237
- return record.name
283
+
284
+ author_field = record.get("100") or record.get("110") or record.get("111")
285
+ if author_field:
286
+ return author_field.get_subfields("a")[0] if author_field.get_subfields("a") else None
287
+
288
+ logger.warning(
289
+ "No entity or author name found for VIAF search. Skipping VIAF enrichment.")
238
290
 
239
291
  def _get_viaf_record(self, record: Record, viaf_id: Optional[int] = None,
240
292
  entity: Optional[str] = None, viaf_field: str = DEFAULT_VIAF_FIELD,
@@ -252,21 +304,26 @@ class RecordNormalizer:
252
304
  viaf_record = viaf_records[0]
253
305
  else:
254
306
  search_term = self._get_viaf_search_term(record, entity)
255
- if not verify:
256
- logger.warning(
257
- f"Record verification is turned off. If multiple records are " \
258
- f"detected for search term '{search_term}', the first " \
259
- f"result is automatically returned. This might lead to " \
260
- f"some inaccuracies!"
261
- )
262
307
 
263
- viaf_record = viaf_client.get_normalized_data_by_search_term(
264
- search_term=search_term,
265
- field=viaf_field,
266
- max_records=max_records,
267
- verify=verify,
268
- threshold=threshold
269
- )
308
+ if search_term:
309
+ logger.info(
310
+ f"Searching for VIAF record with search term: {search_term}")
311
+
312
+ if not verify:
313
+ logger.warning(
314
+ f"Record verification is turned off. If multiple records are " \
315
+ f"detected for search term '{search_term}', the first " \
316
+ f"result is automatically returned. This might lead to " \
317
+ f"some inaccuracies!"
318
+ )
319
+
320
+ viaf_record = viaf_client.get_normalized_data_by_search_term(
321
+ search_term=search_term,
322
+ field=viaf_field,
323
+ max_records=max_records,
324
+ verify=verify,
325
+ threshold=threshold
326
+ )
270
327
 
271
328
  except Exception as e:
272
329
  logger.error(
@@ -0,0 +1,45 @@
1
+ from pymarc import Record, Field, Subfield, Leader, JSONReader
2
+ import logging
3
+
4
+ logger = logging.getLogger(__name__)
5
+
6
+ DEFAULT_LEADER = '01682nz a2200349n 4500'
7
+
8
+ class SafeJSONReader(JSONReader):
9
+
10
+ def __next__(self):
11
+ while True:
12
+ try:
13
+ jobj = next(self.iter)
14
+ rec = Record()
15
+
16
+ # Use custom default leader if missing
17
+ leader_str = jobj.get("leader")
18
+ if leader_str:
19
+ rec.leader = Leader(leader_str)
20
+ else:
21
+ logger.warning("Missing leader in record. Using DEFAULT_LEADER.")
22
+ rec.leader = Leader(DEFAULT_LEADER)
23
+
24
+ for field in jobj["fields"]:
25
+ k, v = list(field.items())[0]
26
+
27
+ if isinstance(v, dict) and "subfields" in v:
28
+ subfields = []
29
+ for sub in v["subfields"]:
30
+ for code, value in sub.items():
31
+ subfields.append(Subfield(code, value))
32
+ ind1 = v.get("ind1", " ")
33
+ ind2 = v.get("ind2", " ")
34
+ fld = Field(tag=k, indicators=[ind1, ind2], subfields=subfields)
35
+ else:
36
+ fld = Field(tag=k, data=v)
37
+ rec.add_field(fld)
38
+
39
+ return rec
40
+
41
+ except StopIteration:
42
+ raise
43
+ except Exception as e:
44
+ logger.error(f"Skipping invalid record: {e}")
45
+ continue
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: rara-tools
3
- Version: 0.7.0
3
+ Version: 0.7.2
4
4
  Summary: Tools to support Kata's work.
5
5
  Classifier: Programming Language :: Python :: 3
6
6
  Classifier: Programming Language :: Python :: 3.10
@@ -7,22 +7,23 @@ rara_tools/s3.py,sha256=9ziDXsLjBtFAvsjTPxFddhfvkpA8773rzPJqO7y1N5Q,6415
7
7
  rara_tools/task_reporter.py,sha256=WCcZts9dAUokPc4vbrG3-lNAFLnWaMgE3b3iaUB7mr8,3256
8
8
  rara_tools/utils.py,sha256=1UrxOzo3cxe4juMkDlKWv1VKWMYay5v1pivGci1ajiM,3003
9
9
  rara_tools/constants/__init__.py,sha256=r78laM9vyRDAvzDhPvzDlhaX6qPwUUBBtwf1WosrW3o,27
10
- rara_tools/constants/digitizer.py,sha256=A7FfqqEB4hGJ9t3z8gTFK7hkzCxz44rCOSWx6Pzvwjs,548
11
- rara_tools/constants/general.py,sha256=jE1aIir_eKbka_q1iCJWRtmyz_xpnTPntbshiWo9eTA,1061
10
+ rara_tools/constants/digitizer.py,sha256=9aQkJj8C5a_HLgCayrz3PpGYJMLoO4Ph9_U28Q-C1T4,633
11
+ rara_tools/constants/general.py,sha256=uCSIfE0o-drRDDqnaJALMGHecUOQIRhdr2g5M5NQqMY,1102
12
12
  rara_tools/constants/language_evaluator.py,sha256=3sCSaoS-zXQRY0vJ7UUMuZqbtYQD_quVVbdpgvJjE7I,124
13
13
  rara_tools/constants/linker.py,sha256=yBN9NpUhB3ENz8BapoIfpSHY_xNqwYdqutgQFdc_Cd8,3240
14
14
  rara_tools/constants/meta_extractor.py,sha256=adYH8cQqH0ZWYO7clGMiObclXRTGsxWgk3pC1oiHxHE,242
15
15
  rara_tools/constants/normalizers.py,sha256=Xs3anDwJHpHeniwx3xoIZyqdEXtO3eb7ouGLLr0CpHw,1344
16
16
  rara_tools/constants/parsers.py,sha256=L6nh1Itget9_9DMsliDkh6T25z78eMFPWVkbaU08DwU,5561
17
- rara_tools/constants/subject_indexer.py,sha256=E2D7pylH6Yey9h2TAvAWQiX5JtKKagsZx2E1Fz_afMI,1967
17
+ rara_tools/constants/subject_indexer.py,sha256=i0xRdqwasyb6d6WZZKPgyuEUd2JeO_qwWYoG6UeBo5U,2064
18
18
  rara_tools/core_formatters/core_formatter.py,sha256=HJX7jOi9kaFie_zm0Wzjk0nKF8dRleJpVWbCplFFquo,2760
19
19
  rara_tools/core_formatters/formatted_keyword.py,sha256=1-B9IQTycFt69pTy8WZNnfJ2WIMRow3kpEub6igyNQc,7865
20
20
  rara_tools/core_formatters/formatted_meta.py,sha256=Zd0oQFLbn6m_wHaWtgxBsu9J7wGyWIpZxb2-8PrR3Wk,5240
21
21
  rara_tools/core_formatters/formatted_object.py,sha256=7a499ZmcZXOqtlwxDi6FWHWF5a6HdCsduS22wV3uHIE,5656
22
22
  rara_tools/normalizers/__init__.py,sha256=_NqpS5w710DhaURytHq9JpEt8HgYpSPfRDcOtOymJgE,193
23
23
  rara_tools/normalizers/authorities.py,sha256=IDtcm0yNZNhv1f-WcdqWFSRzZk_CoKuBFsk6hEPddWM,4513
24
- rara_tools/normalizers/base.py,sha256=6tLfNdF6FZo8M6j_Q61lXoaF1HdIB1c0SKMatTc-Z64,12014
24
+ rara_tools/normalizers/base.py,sha256=e2Ql_JNrpRiVDeb9Iqf6mug3BHNjKasjwmiIh5oWgWc,14276
25
25
  rara_tools/normalizers/bibs.py,sha256=4DTS6k37z8qR5B3n7aiCXsT5Z49rLTvQ60lKKr5dyLs,2352
26
+ rara_tools/normalizers/reader.py,sha256=_usCY2jDC_1KvEJEdthPEldVz-v4-2-S8deNJCzQVoI,1573
26
27
  rara_tools/normalizers/viaf.py,sha256=LIeqbJoKtVt_0H1o7XMmhSE0BjF4l-jdAJgX_8Gg9Z4,24218
27
28
  rara_tools/parsers/marc_parsers/base_parser.py,sha256=Kdw4aivJf2FkWgIK7pJtHtVXF_G1pjHVQ7IcFItSqy8,1649
28
29
  rara_tools/parsers/marc_parsers/ems_parser.py,sha256=LFuhZcVwmHMcJknX9p4ZkO8RdjPdQZ4APGbw8KV6BIs,2024
@@ -38,8 +39,8 @@ rara_tools/parsers/marc_records/title_record.py,sha256=XrtJ4gj7wzSaGxNaPtPuawmqq
38
39
  rara_tools/parsers/tools/entity_normalizers.py,sha256=VyCy_NowCLpOsL0luQ55IW-Qi-J5oBH0Ofzr7HRFBhM,8949
39
40
  rara_tools/parsers/tools/marc_converter.py,sha256=LgSHe-7n7aiDrw2bnsB53r3fXTRFjZXTwBYfTpL0pfs,415
40
41
  rara_tools/parsers/tools/russian_transliterator.py,sha256=5ZU66iTqAhr7pmfVqXPAI_cidF43VqqmuN4d7H4_JuA,9770
41
- rara_tools-0.7.0.dist-info/licenses/LICENSE.md,sha256=hkZVnIZll7e_KNEQzeY94Y9tlzVL8iVZBTMBvDykksU,35142
42
- rara_tools-0.7.0.dist-info/METADATA,sha256=UmA4_431SdsYqP6IrEv8mr4yL9OoFVGHvpG-mQgs_g8,4079
43
- rara_tools-0.7.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
44
- rara_tools-0.7.0.dist-info/top_level.txt,sha256=JwfB5b8BAtW5OFKRln2AQ_WElTRyIBM4nO0FKN1cupY,11
45
- rara_tools-0.7.0.dist-info/RECORD,,
42
+ rara_tools-0.7.2.dist-info/licenses/LICENSE.md,sha256=hkZVnIZll7e_KNEQzeY94Y9tlzVL8iVZBTMBvDykksU,35142
43
+ rara_tools-0.7.2.dist-info/METADATA,sha256=x8GYrcQQCpMdhCUlfLr0Nxd1a2fNH3Fg5sUq2nK5x6Q,4079
44
+ rara_tools-0.7.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
45
+ rara_tools-0.7.2.dist-info/top_level.txt,sha256=JwfB5b8BAtW5OFKRln2AQ_WElTRyIBM4nO0FKN1cupY,11
46
+ rara_tools-0.7.2.dist-info/RECORD,,