pyreslib 0.2.0__tar.gz → 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,12 +1,12 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: pyreslib
3
- Version: 0.2.0
3
+ Version: 0.3.0
4
4
  Summary: Python package for digital libraries from the Resounding Libraries cluster at Orpheus Instituut
5
5
  Author-email: Nicholas Cornia <nicholas.cornia@orpheusinstituut.be>
6
6
  License-Expression: MIT
7
7
  Project-URL: Homepage, https://nicholascorniaorpheus.github.io/py-resounding-libraries/
8
8
  Project-URL: Repository, https://github.com/NicholasCorniaOrpheus/py-resounding-libraries
9
- Keywords: koha,wikidata,digital-libraries,transkribus,omekas,bibtex,marc,library-software
9
+ Keywords: koha,wikidata,digital-libraries,transkribus,omekas,bibtex,marc,library-software,kraken,resourcespace,ocr,wikibase
10
10
  Classifier: Development Status :: 4 - Beta
11
11
  Classifier: Programming Language :: Python :: 3
12
12
  Classifier: Programming Language :: Python :: 3.10
@@ -16,6 +16,7 @@ Requires-Python: >=3.10
16
16
  Description-Content-Type: text/markdown
17
17
  License-File: LICENSE
18
18
  Requires-Dist: bibtexparser>=1.4.4
19
+ Requires-Dist: kraken>=7.0.2
19
20
  Requires-Dist: omeka-s-tools>=0.3.0
20
21
  Requires-Dist: pandas>=2.3.3
21
22
  Requires-Dist: pymarc>=5.3.1
@@ -33,6 +34,8 @@ Provides-Extra: dev
33
34
  Requires-Dist: pytest>=7.0; extra == "dev"
34
35
  Requires-Dist: black>=22.0; extra == "dev"
35
36
  Requires-Dist: ruff>=0.1; extra == "dev"
37
+ Provides-Extra: ocr
38
+ Requires-Dist: kraken>=7.0; extra == "ocr"
36
39
  Dynamic: license-file
37
40
 
38
41
  # py-resounding-libraries
@@ -10,13 +10,14 @@ classifiers = [
10
10
  "Programming Language :: Python :: 3.11",
11
11
  "Programming Language :: Python :: 3.12",
12
12
  ]
13
- version = "0.2.0"
13
+ version = "0.3.0"
14
14
  description = "Python package for digital libraries from the Resounding Libraries cluster at Orpheus Instituut"
15
- keywords = ["koha", "wikidata", "digital-libraries", "transkribus","omekas","bibtex","marc","library-software"]
15
+ keywords = ["koha", "wikidata", "digital-libraries", "transkribus","omekas","bibtex","marc","library-software","kraken","resourcespace","ocr","wikibase"]
16
16
  readme = "README.md"
17
17
  requires-python = ">=3.10"
18
18
  dependencies = [
19
19
  "bibtexparser>=1.4.4",
20
+ "kraken>=7.0.2",
20
21
  "omeka-s-tools>=0.3.0",
21
22
  "pandas>=2.3.3",
22
23
  "pymarc>=5.3.1",
@@ -40,16 +41,28 @@ dev = [
40
41
  "ruff>=0.1",
41
42
  ]
42
43
 
44
+ ocr = [
45
+ "kraken>=7.0"
46
+ ]
47
+
43
48
  [tool.setuptools]
44
49
  packages = ["pyreslib"]
45
50
 
46
51
  [tool.setuptools.package-data]
47
52
  pyreslib = ["data/**/*"]
48
53
 
54
+ [tool.uv.sources]
55
+ pyreslib = { workspace = true }
56
+
49
57
  [build-system]
50
58
  requires = ["setuptools>=68.0", "wheel"]
51
59
  build-backend = "setuptools.build_meta"
52
60
 
61
+ [dependency-groups]
62
+ dev = [
63
+ "pyreslib",
64
+ ]
65
+
53
66
  [project.urls]
54
67
  Homepage = "https://nicholascorniaorpheus.github.io/py-resounding-libraries/"
55
- Repository = "https://github.com/NicholasCorniaOrpheus/py-resounding-libraries"
68
+ Repository = "https://github.com/NicholasCorniaOrpheus/py-resounding-libraries"
@@ -0,0 +1,308 @@
1
+ from PIL import Image
2
+ import os
3
+ from kraken import binarization
4
+ from kraken import serialization
5
+ from kraken.containers import Segmentation
6
+ from kraken.tasks import SegmentationTaskModel, RecognitionTaskModel
7
+ from kraken.configs import SegmentationInferenceConfig, RecognitionInferenceConfig
8
+ import xml.etree.ElementTree as ET
9
+
10
+ from pyreslib import transkribus
11
+
12
+
13
def transcribe_directory(
    dir_path: str = "./data/kraken/transcriptions",
    segmentation_model_path: str | None = None,
    recognition_model_path: str = "./data/kraken/models/catmus-large/catmus-print-fondue-large.mlmodel",
    export_dir: str = "./data/kraken/transcriptions",
    xml_namespace: str = "http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15",
):
    """
    Transcribe image files within directory (no subdirectories allowed) using Kraken.

    Both models are loaded once and reused for every image. Each image is
    binarized, segmented and recognized, and the result is handed to
    `serialization_prediction`. A failure on one image is printed and the
    loop moves on to the next file.

    Args:
        dir_path(str): Directory path where images are stored. Default is `./data/kraken/transcriptions`
        segmentation_model_path (str): Path to segmentation model. Default is `None`, but you can install dfine_kraken plugin and upload your custom model in `"./data/kraken/models/`. If loading raises `ValueError`, the default model is loaded instead.
        recognition_model_path(str): Path to recognition model. Default is the [CATMUS Print large](https://zenodo.org/records/10592716) "./data/kraken/models/catmus-large/catmus-print-fondue-large.mlmodel"
        export_dir(str): Export directory for the PAGEXML files. Default is `./data/kraken/transcriptions`
        xml_namespace(str): PAGEXML namespace used by Kraken 7: `http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15`.

    Returns:
        `None`
    """
    # Load models once, outside the per-image loop.
    print("Loading segmentation model...")
    try:
        seg_model = SegmentationTaskModel.load_model(segmentation_model_path)
    except ValueError as e:
        print(f"Error loading segmentation model: {e}")
        print("Falling back to default BLLA segmentation model...")
        seg_model = SegmentationTaskModel.load_model()  # Load default model

    print("Loading recognition model...")
    rec_model = RecognitionTaskModel.load_model(recognition_model_path)

    seg_config = SegmentationInferenceConfig()
    rec_config = RecognitionInferenceConfig()

    # Suffixes are compared lower-cased so "SCAN.JPG" or "page.TIFF" are picked up too.
    image_suffixes = (".png", ".jpg", ".jpeg", ".tif", ".tiff", ".gif")

    # Materialize the listing so the directory handle is released before the
    # (potentially long) OCR work starts.
    with os.scandir(dir_path) as entries:
        image_files = [
            entry
            for entry in entries
            if entry.is_file() and entry.name.lower().endswith(image_suffixes)
        ]

    for file in image_files:
        print(f"Processing {file.name}...")

        try:
            # The context manager closes the underlying file handle even when
            # a later step raises.
            with Image.open(file.path) as img:
                # Binarize image
                bw_img = binarization.nlbin(img)

                # Perform segmentation
                segmentation = seg_model.predict(bw_img, seg_config)

                # Perform recognition
                records = list(rec_model.predict(bw_img, segmentation, rec_config))

                # Build segmentation with recognition results
                recognized_segmentation = Segmentation(
                    lines=records,
                    imagename=file.path,
                    type=segmentation.type,
                    text_direction=segmentation.text_direction,
                    script_detection=segmentation.script_detection,
                    regions=segmentation.regions,
                )

                # Serialize and save results
                serialization_prediction(
                    segmentation=recognized_segmentation,
                    file_path=file.path,
                    image=bw_img,
                    export_dir=export_dir,
                    xml_namespace=xml_namespace,
                )
        except Exception as e:
            # Best-effort batch: report the failure and keep going.
            print(f"Error processing {file.name}: {e}")
            continue
84
+
85
def transcribe_image(
    image_filepath: str,
    export_dir: str,
    segmentation_model_path: str | None = None,
    recognition_model_path: str = "./data/kraken/models/catmus-large/catmus-print-fondue-large.mlmodel",
    xml_namespace: str = "http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15",
):
    """
    Transcribe a single image file using Kraken, writing PAGEXML and TXT transcriptions.

    The image is binarized, segmented and recognized, and the result is
    handed to `serialization_prediction`, which writes the output files into
    `export_dir`. Any exception raised while processing the image is printed
    and not re-raised.

    Args:
        image_filepath(str): Path of the image to transcribe.
        export_dir(str): Export directory for the generated files.
        segmentation_model_path (str): Path to segmentation model. Default is `None`, but you can install dfine_kraken plugin and upload your custom model in `"./data/kraken/models/`. If loading raises `ValueError`, the default model is loaded instead.
        recognition_model_path(str): Path to recognition model. Default is the [CATMUS Print large](https://zenodo.org/records/10592716) "./data/kraken/models/catmus-large/catmus-print-fondue-large.mlmodel"
        xml_namespace(str): PAGEXML namespace used by Kraken 7: `http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15`.

    Returns:
        `None`
    """
    # Models are loaded on every call; use `transcribe_directory` to reuse
    # them across many images.
    print("Loading segmentation model...")
    try:
        seg_model = SegmentationTaskModel.load_model(segmentation_model_path)
    except ValueError as e:
        print(f"Error loading segmentation model: {e}")
        print("Falling back to default BLLA segmentation model...")
        seg_model = SegmentationTaskModel.load_model()  # Load default model

    print("Loading recognition model...")
    rec_model = RecognitionTaskModel.load_model(recognition_model_path)

    seg_config = SegmentationInferenceConfig()
    rec_config = RecognitionInferenceConfig()

    print(f"Processing {image_filepath}...")

    try:
        # The context manager closes the underlying file handle even when a
        # later step raises.
        with Image.open(image_filepath) as img:
            # Binarize image
            bw_img = binarization.nlbin(img)

            # Perform segmentation
            segmentation = seg_model.predict(bw_img, seg_config)

            # Perform recognition
            records = list(rec_model.predict(bw_img, segmentation, rec_config))

            # Build segmentation with recognition results
            recognized_segmentation = Segmentation(
                lines=records,
                imagename=image_filepath,
                type=segmentation.type,
                text_direction=segmentation.text_direction,
                script_detection=segmentation.script_detection,
                regions=segmentation.regions,
            )

            # Serialize and save results
            serialization_prediction(
                segmentation=recognized_segmentation,
                file_path=image_filepath,
                image=bw_img,
                export_dir=export_dir,
                xml_namespace=xml_namespace,
            )
    except Exception as e:
        # Best-effort: report the failure instead of propagating it.
        print(f"Error processing {image_filepath}: {e}")
152
+
153
def serialization_prediction(
    segmentation, file_path: str, image, export_dir: str, xml_namespace: str
):
    """
    Serializes recognition results to PageXML and plain text files.

    Two files are written into `export_dir`, both named after the image's
    base filename: `<name>.xml` (PageXML) and `<name>.txt` (one line of text
    per `TextLine` element). Both are written as UTF-8 so the output does not
    depend on the platform's default encoding.

    Args:
        segmentation: Segmentation object with recognition results
        file_path (str): Path of the original image.
        image: PIL.Image object; only its `size` is used.
        export_dir (str): Export directory. Created if it does not exist.
        xml_namespace (str): XML namespace for PageXML output. Currently
            unused: elements are matched on their tag suffix instead.

    Returns:
        None
    """
    # Serialize to PageXML
    page_xml = serialization.serialize(
        segmentation,
        image_size=image.size,
        template="pagexml",
        sub_line_segmentation=False,
    )

    # Get base filename without extension
    base_filename = os.path.splitext(os.path.basename(file_path))[0]

    # An empty export_dir means "current directory"; makedirs("") would raise.
    if export_dir:
        os.makedirs(export_dir, exist_ok=True)

    # Save XML to file
    xml_output_path = os.path.join(export_dir, f"{base_filename}.xml")
    print(f"Serializing image into {xml_output_path}...")
    with open(xml_output_path, "w", encoding="utf-8") as f:
        f.write(page_xml)

    # Parse PageXML and extract plain text
    root = ET.fromstring(page_xml)

    # Tags carry a "{namespace}" prefix, so match on the local-name suffix.
    # Every TextLine yields exactly one output line, even when it has no text.
    text_lines = []
    for line in root.iter():
        if not line.tag.endswith("TextLine"):
            continue
        text_lines.append(
            "".join(
                child.text
                for child in line.iter()
                if child.tag.endswith("Unicode") and child.text
            )
        )
    plain_text = "".join(f"{text}\n" for text in text_lines)

    # Save TXT to file
    txt_output_path = os.path.join(export_dir, f"{base_filename}.txt")
    print(f"Saving text to {txt_output_path}...")
    with open(txt_output_path, "w", encoding="utf-8") as f:
        f.write(plain_text)
203
+
204
def update_kraken_XML_to_transkribus(
    session,
    collection_id: int,
    document_id: int,
    page_number: int,
    page_xml_filepath: str,
):
    """
    Converts and updates PAGEXML generated by Kraken back to Transkribus via API.

    The file is parsed, the root `xsi:schemaLocation` attribute is dropped,
    every `TextRegion` and `TextLine` gets a `custom` attribute carrying its
    reading-order index, and the Kraken (2019-07-15) namespace URL is replaced
    by the Transkribus (2013-07-15) one in the serialized bytes before upload.

    Args:
        session: Transkribus session from `pyreslib.transkribus.api_login()` method.
        collection_id (int): Collection ID identifier from Transkribus.
        document_id (int): Document ID identifier from Transkribus.
        page_number (int): Internal page number identifier from Transkribus.
        page_xml_filepath (str): Filepath for Kraken generated PAGEXML file.

    Returns:
        `None`

    """
    # convert PAGEXML kraken file for Transkribus
    TRANSKRIBUS_NS_URL = "http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15"
    KRAKEN_NS_URL = "http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"
    # ElementTree exposes namespaced attributes in Clark notation
    # ("{uri}local"), never under their prefixed "xsi:..." spelling.
    XSI_SCHEMA_LOCATION = "{http://www.w3.org/2001/XMLSchema-instance}schemaLocation"

    tree = ET.parse(page_xml_filepath)
    root = tree.getroot()
    # PAGE XML namespace map for searching elements
    ns = {'xmlns': KRAKEN_NS_URL}

    # Drop the schema location if present; pop() with a default is a no-op
    # when the attribute is absent.
    root.attrib.pop(XSI_SCHEMA_LOCATION, None)

    # Patch TextRegions: add readingOrder and a structural type.
    for idx, region in enumerate(root.findall('.//xmlns:TextRegion', ns)):
        # Transkribus reads layouts via 'structure {type:heading/paragraph/etc}'
        region.set('custom', f"readingOrder {{index:{idx};}} structure {{type:paragraph;}}")

    # Patch TextLines: add readingOrder (index runs over the whole document,
    # not per region).
    for idx, line in enumerate(root.findall('.//xmlns:TextLine', ns)):
        line.set('custom', f"readingOrder {{index:{idx};}}")

    # namespaces
    # NOTE(review): the registry changed here is process-global, and at
    # serialization time the elements still live in the Kraken namespace, so
    # this default-prefix registration presumably does not apply to them --
    # confirm the intended prefixing of the uploaded XML.
    ET.register_namespace("", TRANSKRIBUS_NS_URL)  # use Transkribus version

    # Convert back to bytes.
    # Transkribus explicitly requires the XML declaration at the top
    xml_patched_bytes = ET.tostring(root, encoding="utf-8", xml_declaration=True)
    # replace Kraken namespace with transkribus one
    xml_patched_bytes = xml_patched_bytes.replace(
        KRAKEN_NS_URL.encode('utf-8'), TRANSKRIBUS_NS_URL.encode('utf-8')
    )

    # post file back to Transkribus via API
    print(f"Updating transcription from {page_xml_filepath} back to Transkribus")
    transkribus.post_page_xml(
        session=session,
        page_xml=xml_patched_bytes,
        collection_id=collection_id,
        document_id=document_id,
        page_number=page_number,
        filepath=False,
    )
254
+
255
+
256
+
257
def transcription_dir_to_transkribus(
    transcription_dir: str, session, collection_id: int, document_id: int
):
    """
    Transcribes a whole Transkribus document by:

    1. Getting all images from API using [pyreslib.transkribus.get_jpg_image][]
    2. Transcribing images using kraken [pyreslib.kraken.transcribe_image][]
    3. Updating transcriptions back to Transkribus via [pyreslib.kraken.update_kraken_XML_to_transkribus][].

    Images that already have an XML file with the same base name in
    `transcription_dir` are not transcribed again. XML files that do not
    correspond to any page of the document are reported and skipped.

    Args:
        transcription_dir (str): Working directory for the downloaded images and the generated transcriptions. Created if it does not exist.
        session: Transkribus session from `pyreslib.transkribus.api_login()` method.
        collection_id (int): Collection ID identifier from Transkribus.
        document_id (int): Document ID identifier from Transkribus.

    Returns:
        `None`
    """
    # 1. images

    # document_metadata
    print("Importing document metadata...")
    document_metadata = transkribus.get_document_metadata(
        session=session, collection_id=collection_id, document_id=document_id
    )
    pages = document_metadata["pageList"]["pages"]

    # An empty path means "current directory"; makedirs("") would raise.
    if transcription_dir:
        os.makedirs(transcription_dir, exist_ok=True)

    print("Getting images from Transkribus API...")
    for page in pages:
        # import images in directory
        # Single quotes inside the f-string: reusing the outer quote is a
        # SyntaxError before Python 3.12.
        print(f"Current page: {page['pageNr']}")
        transkribus.get_jpg_image(
            session=session,
            collection_id=collection_id,
            document_id=document_id,
            page_number=page["pageNr"],
            output_filepath=os.path.join(transcription_dir, page["imgFileName"]),
        )

    # 2. Kraken
    print("Transcribing images using Kraken, by default it skips the image if an XML file is present.")
    print("It might take a while...")

    with os.scandir(transcription_dir) as entries:
        files_before = list(entries)

    existing_xml_basenames = {
        os.path.splitext(f.name)[0]
        for f in files_before
        if f.name.endswith(".xml")
    }

    for f in files_before:
        if not f.name.endswith(".jpg"):
            continue
        if os.path.splitext(f.name)[0] in existing_xml_basenames:
            print("Skipping image. Transcription is already present.")
        else:
            transcribe_image(image_filepath=f.path, export_dir=transcription_dir)

    # 3. Transkribus
    print("Converting and updating transcriptions back to Transkribus")

    # Image file name -> page number; the first page wins on duplicate names.
    page_number_by_image = {}
    for page in pages:
        page_number_by_image.setdefault(page["imgFileName"], page["pageNr"])

    # List the directory again: the XML files written in step 2 are not part
    # of the listing taken before transcription.
    with os.scandir(transcription_dir) as entries:
        files_after = list(entries)

    for f in files_after:
        if not f.name.endswith(".xml"):
            continue
        # retrieve page number from filename
        image_name = f"{os.path.splitext(f.name)[0]}.jpg"
        xml_page_number = page_number_by_image.get(image_name)
        if xml_page_number is None:
            print(f"Skipping {f.name}: no page with image {image_name} in document metadata.")
            continue
        update_kraken_XML_to_transkribus(
            session=session,
            collection_id=collection_id,
            document_id=document_id,
            page_number=xml_page_number,
            page_xml_filepath=f.path,
        )
305
+
306
+
307
+
308
+
@@ -1,6 +1,6 @@
1
1
  from omeka_s_tools.api import OmekaAPIClient
2
2
  import requests
3
- from pyreslib import koha
3
+ from pyreslib import koha, utilities
4
4
  import os
5
5
  import json, csv
6
6
 
@@ -30,13 +30,6 @@ def omekas_session(api_url: str, key_identity: str, key_credential: str):
30
30
  return omekas_session
31
31
 
32
32
 
33
- def csv2dict(csv_filename): # imports a CSV file as dictionary
34
- f = open(csv_filename, "r")
35
- reader = csv.DictReader(f)
36
- d = {"items": []}
37
- for row in reader:
38
- d["items"].append(row)
39
- return d["items"]
40
33
 
41
34
 
42
35
  def generate_omekas_mapping(mappings_directory: str) -> dict:
@@ -50,33 +43,33 @@ def generate_omekas_mapping(mappings_directory: str) -> dict:
50
43
  omekas_mapping = {}
51
44
 
52
45
  # get authorities mapping
53
- omekas_mapping["auth"] = csv2dict(
46
+ omekas_mapping["auth"] = utilities.csv2dict(
54
47
  os.path.join(mappings_directory, "koha-omekas_mapping - auth.csv")
55
48
  )
56
49
  # biblio mapping
57
- omekas_mapping["biblio"] = csv2dict(
50
+ omekas_mapping["biblio"] = utilities.csv2dict(
58
51
  os.path.join(mappings_directory, "koha-omekas_mapping - biblio.csv")
59
52
  )
60
53
  # get media mapping
61
- omekas_mapping["media"] = csv2dict(
54
+ omekas_mapping["media"] = utilities.csv2dict(
62
55
  os.path.join(mappings_directory, "koha-omekas_mapping - media.csv")
63
56
  )
64
57
  # get locations
65
- omekas_mapping["locations"] = csv2dict(
58
+ omekas_mapping["locations"] = utilities.csv2dict(
66
59
  os.path.join(mappings_directory, "koha-omekas_mapping - locations.csv")
67
60
  )
68
61
  # get research groups
69
- omekas_mapping["research_groups"] = csv2dict(
62
+ omekas_mapping["research_groups"] = utilities.csv2dict(
70
63
  os.path.join(mappings_directory, "koha-omekas_mapping - research_groups.csv")
71
64
  )
72
65
 
73
66
  # get researchers
74
- omekas_mapping["researchers"] = csv2dict(
67
+ omekas_mapping["researchers"] = utilities.csv2dict(
75
68
  os.path.join(mappings_directory, "koha-omekas_mapping - researchers.csv")
76
69
  )
77
70
 
78
71
  # get projects
79
- omekas_mapping["projects"] = csv2dict(
72
+ omekas_mapping["projects"] = utilities.csv2dict(
80
73
  os.path.join(mappings_directory, "koha-omekas_mapping - projects.csv")
81
74
  )
82
75